author      Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
committer   Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
commit      26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree        f435a8308119effd964b339f76abb83a57c29483  /third_party/dav1d/src
parent      Initial commit. (diff)
Adding upstream version 124.0.1. (upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src')
-rw-r--r--  third_party/dav1d/src/arm/32/cdef.S  540
-rw-r--r--  third_party/dav1d/src/arm/32/cdef16.S  233
-rw-r--r--  third_party/dav1d/src/arm/32/cdef_tmpl.S  515
-rw-r--r--  third_party/dav1d/src/arm/32/filmgrain.S  2039
-rw-r--r--  third_party/dav1d/src/arm/32/filmgrain16.S  2137
-rw-r--r--  third_party/dav1d/src/arm/32/ipred.S  2958
-rw-r--r--  third_party/dav1d/src/arm/32/ipred16.S  3276
-rw-r--r--  third_party/dav1d/src/arm/32/itx.S  3343
-rw-r--r--  third_party/dav1d/src/arm/32/itx16.S  3625
-rw-r--r--  third_party/dav1d/src/arm/32/loopfilter.S  868
-rw-r--r--  third_party/dav1d/src/arm/32/loopfilter16.S  859
-rw-r--r--  third_party/dav1d/src/arm/32/looprestoration.S  791
-rw-r--r--  third_party/dav1d/src/arm/32/looprestoration16.S  801
-rw-r--r--  third_party/dav1d/src/arm/32/looprestoration_common.S  453
-rw-r--r--  third_party/dav1d/src/arm/32/looprestoration_tmpl.S  600
-rw-r--r--  third_party/dav1d/src/arm/32/mc.S  3340
-rw-r--r--  third_party/dav1d/src/arm/32/mc16.S  3658
-rw-r--r--  third_party/dav1d/src/arm/32/msac.S  575
-rw-r--r--  third_party/dav1d/src/arm/32/refmvs.S  303
-rw-r--r--  third_party/dav1d/src/arm/32/util.S  184
-rw-r--r--  third_party/dav1d/src/arm/64/cdef.S  520
-rw-r--r--  third_party/dav1d/src/arm/64/cdef16.S  229
-rw-r--r--  third_party/dav1d/src/arm/64/cdef_tmpl.S  511
-rw-r--r--  third_party/dav1d/src/arm/64/filmgrain.S  2010
-rw-r--r--  third_party/dav1d/src/arm/64/filmgrain16.S  1997
-rw-r--r--  third_party/dav1d/src/arm/64/ipred.S  5294
-rw-r--r--  third_party/dav1d/src/arm/64/ipred16.S  5674
-rw-r--r--  third_party/dav1d/src/arm/64/itx.S  3270
-rw-r--r--  third_party/dav1d/src/arm/64/itx16.S  3648
-rw-r--r--  third_party/dav1d/src/arm/64/loopfilter.S  1129
-rw-r--r--  third_party/dav1d/src/arm/64/loopfilter16.S  925
-rw-r--r--  third_party/dav1d/src/arm/64/looprestoration.S  1303
-rw-r--r--  third_party/dav1d/src/arm/64/looprestoration16.S  1388
-rw-r--r--  third_party/dav1d/src/arm/64/looprestoration_common.S  272
-rw-r--r--  third_party/dav1d/src/arm/64/looprestoration_tmpl.S  751
-rw-r--r--  third_party/dav1d/src/arm/64/mc.S  3310
-rw-r--r--  third_party/dav1d/src/arm/64/mc16.S  3611
-rw-r--r--  third_party/dav1d/src/arm/64/msac.S  480
-rw-r--r--  third_party/dav1d/src/arm/64/refmvs.S  292
-rw-r--r--  third_party/dav1d/src/arm/64/util.S  229
-rw-r--r--  third_party/dav1d/src/arm/asm-offsets.h  43
-rw-r--r--  third_party/dav1d/src/arm/asm.S  291
-rw-r--r--  third_party/dav1d/src/arm/cdef.h  88
-rw-r--r--  third_party/dav1d/src/arm/cpu.c  99
-rw-r--r--  third_party/dav1d/src/arm/cpu.h  37
-rw-r--r--  third_party/dav1d/src/arm/filmgrain.h  204
-rw-r--r--  third_party/dav1d/src/arm/ipred.h  326
-rw-r--r--  third_party/dav1d/src/arm/itx.h  141
-rw-r--r--  third_party/dav1d/src/arm/loopfilter.h  45
-rw-r--r--  third_party/dav1d/src/arm/looprestoration.h  1113
-rw-r--r--  third_party/dav1d/src/arm/mc.h  114
-rw-r--r--  third_party/dav1d/src/arm/msac.h  52
-rw-r--r--  third_party/dav1d/src/arm/refmvs.h  41
-rw-r--r--  third_party/dav1d/src/cdef.h  71
-rw-r--r--  third_party/dav1d/src/cdef_apply.h  39
-rw-r--r--  third_party/dav1d/src/cdef_apply_tmpl.c  309
-rw-r--r--  third_party/dav1d/src/cdef_tmpl.c  331
-rw-r--r--  third_party/dav1d/src/cdf.c  4123
-rw-r--r--  third_party/dav1d/src/cdf.h  150
-rw-r--r--  third_party/dav1d/src/cpu.c  105
-rw-r--r--  third_party/dav1d/src/cpu.h  110
-rw-r--r--  third_party/dav1d/src/ctx.h  91
-rw-r--r--  third_party/dav1d/src/data.c  149
-rw-r--r--  third_party/dav1d/src/data.h  56
-rw-r--r--  third_party/dav1d/src/dav1d.rc.in  32
-rw-r--r--  third_party/dav1d/src/decode.c  3760
-rw-r--r--  third_party/dav1d/src/decode.h  35
-rw-r--r--  third_party/dav1d/src/dequant_tables.c  229
-rw-r--r--  third_party/dav1d/src/dequant_tables.h  37
-rw-r--r--  third_party/dav1d/src/env.h  521
-rw-r--r--  third_party/dav1d/src/ext/x86/x86inc.asm  1902
-rw-r--r--  third_party/dav1d/src/fg_apply.h  58
-rw-r--r--  third_party/dav1d/src/fg_apply_tmpl.c  241
-rw-r--r--  third_party/dav1d/src/filmgrain.h  84
-rw-r--r--  third_party/dav1d/src/filmgrain_tmpl.c  441
-rw-r--r--  third_party/dav1d/src/getbits.c  164
-rw-r--r--  third_party/dav1d/src/getbits.h  71
-rw-r--r--  third_party/dav1d/src/internal.h  468
-rw-r--r--  third_party/dav1d/src/intra_edge.c  148
-rw-r--r--  third_party/dav1d/src/intra_edge.h  73
-rw-r--r--  third_party/dav1d/src/ipred.h  94
-rw-r--r--  third_party/dav1d/src/ipred_prepare.h  108
-rw-r--r--  third_party/dav1d/src/ipred_prepare_tmpl.c  204
-rw-r--r--  third_party/dav1d/src/ipred_tmpl.c  774
-rw-r--r--  third_party/dav1d/src/itx.h  48
-rw-r--r--  third_party/dav1d/src/itx_1d.c  1034
-rw-r--r--  third_party/dav1d/src/itx_1d.h  59
-rw-r--r--  third_party/dav1d/src/itx_tmpl.c  274
-rw-r--r--  third_party/dav1d/src/levels.h  289
-rw-r--r--  third_party/dav1d/src/lf_apply.h  48
-rw-r--r--  third_party/dav1d/src/lf_apply_tmpl.c  466
-rw-r--r--  third_party/dav1d/src/lf_mask.c  495
-rw-r--r--  third_party/dav1d/src/lf_mask.h  83
-rw-r--r--  third_party/dav1d/src/lib.c  761
-rw-r--r--  third_party/dav1d/src/log.c  57
-rw-r--r--  third_party/dav1d/src/log.h  47
-rw-r--r--  third_party/dav1d/src/loongarch/cpu.c  47
-rw-r--r--  third_party/dav1d/src/loongarch/cpu.h  37
-rw-r--r--  third_party/dav1d/src/loongarch/itx.S  8104
-rw-r--r--  third_party/dav1d/src/loongarch/itx.h  195
-rw-r--r--  third_party/dav1d/src/loongarch/loongson_asm.S  776
-rw-r--r--  third_party/dav1d/src/loongarch/loopfilter.S  1108
-rw-r--r--  third_party/dav1d/src/loongarch/loopfilter.h  52
-rw-r--r--  third_party/dav1d/src/loongarch/looprestoration.S  1407
-rw-r--r--  third_party/dav1d/src/loongarch/looprestoration.h  78
-rw-r--r--  third_party/dav1d/src/loongarch/looprestoration_tmpl.c  274
-rw-r--r--  third_party/dav1d/src/loongarch/mc.S  4758
-rw-r--r--  third_party/dav1d/src/loongarch/mc.h  118
-rw-r--r--  third_party/dav1d/src/loongarch/msac.S  368
-rw-r--r--  third_party/dav1d/src/loongarch/msac.h  46
-rw-r--r--  third_party/dav1d/src/loongarch/refmvs.S  152
-rw-r--r--  third_party/dav1d/src/loongarch/refmvs.h  44
-rw-r--r--  third_party/dav1d/src/loopfilter.h  57
-rw-r--r--  third_party/dav1d/src/loopfilter_tmpl.c  272
-rw-r--r--  third_party/dav1d/src/looprestoration.h  79
-rw-r--r--  third_party/dav1d/src/looprestoration_tmpl.c  558
-rw-r--r--  third_party/dav1d/src/lr_apply.h  47
-rw-r--r--  third_party/dav1d/src/lr_apply_tmpl.c  202
-rw-r--r--  third_party/dav1d/src/mc.h  136
-rw-r--r--  third_party/dav1d/src/mc_tmpl.c  957
-rw-r--r--  third_party/dav1d/src/mem.c  328
-rw-r--r--  third_party/dav1d/src/mem.h  137
-rw-r--r--  third_party/dav1d/src/meson.build  377
-rw-r--r--  third_party/dav1d/src/msac.c  208
-rw-r--r--  third_party/dav1d/src/msac.h  110
-rw-r--r--  third_party/dav1d/src/obu.c  1738
-rw-r--r--  third_party/dav1d/src/obu.h  36
-rw-r--r--  third_party/dav1d/src/pal.c  77
-rw-r--r--  third_party/dav1d/src/pal.h  43
-rw-r--r--  third_party/dav1d/src/picture.c  336
-rw-r--r--  third_party/dav1d/src/picture.h  122
-rw-r--r--  third_party/dav1d/src/ppc/cdef.h  61
-rw-r--r--  third_party/dav1d/src/ppc/cdef_tmpl.c  487
-rw-r--r--  third_party/dav1d/src/ppc/cpu.c  51
-rw-r--r--  third_party/dav1d/src/ppc/cpu.h  37
-rw-r--r--  third_party/dav1d/src/ppc/dav1d_types.h  54
-rw-r--r--  third_party/dav1d/src/ppc/looprestoration.h  48
-rw-r--r--  third_party/dav1d/src/ppc/looprestoration_tmpl.c  321
-rw-r--r--  third_party/dav1d/src/qm.c  1693
-rw-r--r--  third_party/dav1d/src/qm.h  37
-rw-r--r--  third_party/dav1d/src/recon.h  106
-rw-r--r--  third_party/dav1d/src/recon_tmpl.c  2361
-rw-r--r--  third_party/dav1d/src/ref.c  86
-rw-r--r--  third_party/dav1d/src/ref.h  77
-rw-r--r--  third_party/dav1d/src/refmvs.c  944
-rw-r--r--  third_party/dav1d/src/refmvs.h  177
-rw-r--r--  third_party/dav1d/src/riscv/64/itx.S  662
-rw-r--r--  third_party/dav1d/src/riscv/asm.S  126
-rw-r--r--  third_party/dav1d/src/riscv/cpu.c  49
-rw-r--r--  third_party/dav1d/src/riscv/cpu.h  37
-rw-r--r--  third_party/dav1d/src/riscv/itx.h  109
-rw-r--r--  third_party/dav1d/src/scan.c  299
-rw-r--r--  third_party/dav1d/src/scan.h  37
-rw-r--r--  third_party/dav1d/src/tables.c  1013
-rw-r--r--  third_party/dav1d/src/tables.h  125
-rw-r--r--  third_party/dav1d/src/thread.h  189
-rw-r--r--  third_party/dav1d/src/thread_data.h  40
-rw-r--r--  third_party/dav1d/src/thread_task.c  936
-rw-r--r--  third_party/dav1d/src/thread_task.h  53
-rw-r--r--  third_party/dav1d/src/warpmv.c  209
-rw-r--r--  third_party/dav1d/src/warpmv.h  39
-rw-r--r--  third_party/dav1d/src/wedge.c  299
-rw-r--r--  third_party/dav1d/src/wedge.h  96
-rw-r--r--  third_party/dav1d/src/win32/thread.c  99
-rw-r--r--  third_party/dav1d/src/x86/cdef.h  87
-rw-r--r--  third_party/dav1d/src/x86/cdef16_avx2.asm  877
-rw-r--r--  third_party/dav1d/src/x86/cdef16_avx512.asm  622
-rw-r--r--  third_party/dav1d/src/x86/cdef16_sse.asm  1033
-rw-r--r--  third_party/dav1d/src/x86/cdef_avx2.asm  1772
-rw-r--r--  third_party/dav1d/src/x86/cdef_avx512.asm  860
-rw-r--r--  third_party/dav1d/src/x86/cdef_sse.asm  1357
-rw-r--r--  third_party/dav1d/src/x86/cpu.c  97
-rw-r--r--  third_party/dav1d/src/x86/cpu.h  44
-rw-r--r--  third_party/dav1d/src/x86/cpuid.asm  55
-rw-r--r--  third_party/dav1d/src/x86/filmgrain.h  83
-rw-r--r--  third_party/dav1d/src/x86/filmgrain16_avx2.asm  2248
-rw-r--r--  third_party/dav1d/src/x86/filmgrain16_avx512.asm  930
-rw-r--r--  third_party/dav1d/src/x86/filmgrain16_sse.asm  3421
-rw-r--r--  third_party/dav1d/src/x86/filmgrain_avx2.asm  2107
-rw-r--r--  third_party/dav1d/src/x86/filmgrain_avx512.asm  813
-rw-r--r--  third_party/dav1d/src/x86/filmgrain_common.asm  46
-rw-r--r--  third_party/dav1d/src/x86/filmgrain_sse.asm  3233
-rw-r--r--  third_party/dav1d/src/x86/ipred.h  152
-rw-r--r--  third_party/dav1d/src/x86/ipred16_avx2.asm  5005
-rw-r--r--  third_party/dav1d/src/x86/ipred16_avx512.asm  2049
-rw-r--r--  third_party/dav1d/src/x86/ipred16_sse.asm  4103
-rw-r--r--  third_party/dav1d/src/x86/ipred_avx2.asm  5393
-rw-r--r--  third_party/dav1d/src/x86/ipred_avx512.asm  3143
-rw-r--r--  third_party/dav1d/src/x86/ipred_sse.asm  5408
-rw-r--r--  third_party/dav1d/src/x86/itx.h  367
-rw-r--r--  third_party/dav1d/src/x86/itx16_avx2.asm  8599
-rw-r--r--  third_party/dav1d/src/x86/itx16_avx512.asm  6056
-rw-r--r--  third_party/dav1d/src/x86/itx16_sse.asm  8135
-rw-r--r--  third_party/dav1d/src/x86/itx_avx2.asm  5542
-rw-r--r--  third_party/dav1d/src/x86/itx_avx512.asm  7507
-rw-r--r--  third_party/dav1d/src/x86/itx_sse.asm  6533
-rw-r--r--  third_party/dav1d/src/x86/loopfilter.h  69
-rw-r--r--  third_party/dav1d/src/x86/loopfilter16_avx2.asm  1161
-rw-r--r--  third_party/dav1d/src/x86/loopfilter16_avx512.asm  912
-rw-r--r--  third_party/dav1d/src/x86/loopfilter16_sse.asm  1793
-rw-r--r--  third_party/dav1d/src/x86/loopfilter_avx2.asm  1569
-rw-r--r--  third_party/dav1d/src/x86/loopfilter_avx512.asm  1529
-rw-r--r--  third_party/dav1d/src/x86/loopfilter_sse.asm  2348
-rw-r--r--  third_party/dav1d/src/x86/looprestoration.h  94
-rw-r--r--  third_party/dav1d/src/x86/looprestoration16_avx2.asm  2540
-rw-r--r--  third_party/dav1d/src/x86/looprestoration16_avx512.asm  2524
-rw-r--r--  third_party/dav1d/src/x86/looprestoration16_sse.asm  3723
-rw-r--r--  third_party/dav1d/src/x86/looprestoration_avx2.asm  2238
-rw-r--r--  third_party/dav1d/src/x86/looprestoration_avx512.asm  2122
-rw-r--r--  third_party/dav1d/src/x86/looprestoration_sse.asm  3681
-rw-r--r--  third_party/dav1d/src/x86/mc.h  302
-rw-r--r--  third_party/dav1d/src/x86/mc16_avx2.asm  5879
-rw-r--r--  third_party/dav1d/src/x86/mc16_avx512.asm  4858
-rw-r--r--  third_party/dav1d/src/x86/mc16_sse.asm  8731
-rw-r--r--  third_party/dav1d/src/x86/mc_avx2.asm  5669
-rw-r--r--  third_party/dav1d/src/x86/mc_avx512.asm  4538
-rw-r--r--  third_party/dav1d/src/x86/mc_sse.asm  9599
-rw-r--r--  third_party/dav1d/src/x86/msac.asm  667
-rw-r--r--  third_party/dav1d/src/x86/msac.h  75
-rw-r--r--  third_party/dav1d/src/x86/pal.asm  641
-rw-r--r--  third_party/dav1d/src/x86/pal.h  50
-rw-r--r--  third_party/dav1d/src/x86/refmvs.asm  912
-rw-r--r--  third_party/dav1d/src/x86/refmvs.h  66
223 files changed, 279482 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/32/cdef.S b/third_party/dav1d/src/arm/32/cdef.S
new file mode 100644
index 0000000000..4a0df6eac8
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/cdef.S
@@ -0,0 +1,540 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
+// n1 = s0/d0
+// w1 = d0/q0
+// n2 = s4/d2
+// w2 = d2/q1
+.macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret
+ tst r7, #1 // CDEF_HAVE_LEFT
+ beq 2f
+ // CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ ldrh r12, [\s1, #-2]
+ vldr \n1, [\s1]
+ vdup.16 d4, r12
+ ldrh r12, [\s1, #\w]
+ vmov.16 d4[1], r12
+ ldrh r12, [\s2, #-2]
+ vldr \n2, [\s2]
+ vmov.16 d4[2], r12
+ ldrh r12, [\s2, #\w]
+ vmovl.u8 q0, d0
+ vmov.16 d4[3], r12
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vstr s8, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s9, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s10, [r0, #-4]
+ vst1.16 {\w2}, [r0, :\align]
+ vstr s11, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ldrh r12, [\s1, #-2]
+ vldr \n1, [\s1]
+ vdup.16 d4, r12
+ ldrh r12, [\s2, #-2]
+ vldr \n2, [\s2]
+ vmovl.u8 q0, d0
+ vmov.16 d4[1], r12
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vstr s8, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s9, [r0, #-4]
+ vst1.16 {\w2}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+2:
+ // !CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ vldr \n1, [\s1]
+ ldrh r12, [\s1, #\w]
+ vldr \n2, [\s2]
+ vdup.16 d4, r12
+ ldrh r12, [\s2, #\w]
+ vmovl.u8 q0, d0
+ vmov.16 d4[1], r12
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vstr s12, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s8, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s12, [r0, #-4]
+ vst1.16 {\w2}, [r0, :\align]
+ vstr s9, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vldr \n1, [\s1]
+ vldr \n2, [\s2]
+ vmovl.u8 q0, d0
+ vmovl.u8 q1, d2
+ vstr s12, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s12, [r0, #-4]
+ vst1.16 {\w2}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+.endif
+3:
+.endm
+
+.macro load_n_incr dst, src, incr, w
+.if \w == 4
+ vld1.32 {\dst\()[0]}, [\src, :32], \incr
+.else
+ vld1.8 {\dst\()}, [\src, :64], \incr
+.endif
+.endm
+
+// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
+
+// n1 = s0/d0
+// w1 = d0/q0
+// n2 = s4/d2
+// w2 = d2/q1
+.macro padding_func w, stride, n1, w1, n2, w2, align
+function cdef_padding\w\()_8bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #32]
+ cmp r7, #0xf // fully edged
+ beq cdef_padding\w\()_edged_8bpc_neon
+ vmov.i16 q3, #0x8000
+ tst r7, #4 // CDEF_HAVE_TOP
+ bne 1f
+ // !CDEF_HAVE_TOP
+ sub r12, r0, #2*(2*\stride+2)
+ vmov.i16 q2, #0x8000
+ vst1.16 {q2,q3}, [r12]!
+.if \w == 8
+ vst1.16 {q2,q3}, [r12]!
+.endif
+ b 3f
+1:
+ // CDEF_HAVE_TOP
+ add r8, r4, r2
+ sub r0, r0, #2*(2*\stride)
+ pad_top_bottom r4, r8, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
+
+ // Middle section
+3:
+ tst r7, #1 // CDEF_HAVE_LEFT
+ beq 2f
+ // CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ vld1.16 {d2[]}, [r3, :16]!
+ ldrh r12, [r1, #\w]
+ load_n_incr d0, r1, r2, \w
+ subs r6, r6, #1
+ vmov.16 d2[1], r12
+ vmovl.u8 q0, d0
+ vmovl.u8 q1, d2
+ vstr s4, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s5, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 0b
+ b 3f
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vld1.16 {d2[]}, [r3, :16]!
+ load_n_incr d0, r1, r2, \w
+ subs r6, r6, #1
+ vmovl.u8 q0, d0
+ vmovl.u8 q1, d2
+ vstr s4, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 1b
+ b 3f
+2:
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ ldrh r12, [r1, #\w]
+ load_n_incr d0, r1, r2, \w
+ vdup.16 d2, r12
+ subs r6, r6, #1
+ vmovl.u8 q0, d0
+ vmovl.u8 q1, d2
+ vstr s12, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s4, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 0b
+ b 3f
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ load_n_incr d0, r1, r2, \w
+ subs r6, r6, #1
+ vmovl.u8 q0, d0
+ vstr s12, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 1b
+
+3:
+ tst r7, #8 // CDEF_HAVE_BOTTOM
+ bne 1f
+ // !CDEF_HAVE_BOTTOM
+ sub r12, r0, #4
+ vmov.i16 q2, #0x8000
+ vst1.16 {q2,q3}, [r12]!
+.if \w == 8
+ vst1.16 {q2,q3}, [r12]!
+.endif
+ pop {r4-r8,pc}
+1:
+ // CDEF_HAVE_BOTTOM
+ add r8, r5, r2
+ pad_top_bottom r5, r8, \w, \stride, \n1, \w1, \n2, \w2, \align, 1
+endfunc
+.endm
+
+padding_func 8, 16, d0, q0, d2, q1, 128
+padding_func 4, 8, s0, d0, s4, d2, 64
+
+// void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
+
+.macro padding_func_edged w, stride, reg, align
+function cdef_padding\w\()_edged_8bpc_neon
+ sub r0, r0, #(2*\stride)
+
+ ldrh r12, [r4, #-2]
+ vldr \reg, [r4]
+ add r8, r4, r2
+ strh r12, [r0, #-2]
+ ldrh r12, [r4, #\w]
+ vstr \reg, [r0]
+ strh r12, [r0, #\w]
+
+ ldrh r12, [r8, #-2]
+ vldr \reg, [r8]
+ strh r12, [r0, #\stride-2]
+ ldrh r12, [r8, #\w]
+ vstr \reg, [r0, #\stride]
+ strh r12, [r0, #\stride+\w]
+ add r0, r0, #2*\stride
+
+0:
+ ldrh r12, [r3], #2
+ vldr \reg, [r1]
+ str r12, [r0, #-2]
+ ldrh r12, [r1, #\w]
+ add r1, r1, r2
+ subs r6, r6, #1
+ vstr \reg, [r0]
+ str r12, [r0, #\w]
+ add r0, r0, #\stride
+ bgt 0b
+
+ ldrh r12, [r5, #-2]
+ vldr \reg, [r5]
+ add r8, r5, r2
+ strh r12, [r0, #-2]
+ ldrh r12, [r5, #\w]
+ vstr \reg, [r0]
+ strh r12, [r0, #\w]
+
+ ldrh r12, [r8, #-2]
+ vldr \reg, [r8]
+ strh r12, [r0, #\stride-2]
+ ldrh r12, [r8, #\w]
+ vstr \reg, [r0, #\stride]
+ strh r12, [r0, #\stride+\w]
+
+ pop {r4-r8,pc}
+endfunc
+.endm
+
+padding_func_edged 8, 16, d0, 64
+padding_func_edged 4, 8, s0, 32
+
+tables
+
+filter 8, 8
+filter 4, 8
+
+find_dir 8
+
+.macro load_px_8 d11, d12, d21, d22, w
+.if \w == 8
+ add r6, r2, r9 // x + off
+ sub r9, r2, r9 // x - off
+ vld1.8 {\d11}, [r6] // p0
+ add r6, r6, #16 // += stride
+ vld1.8 {\d21}, [r9] // p1
+ add r9, r9, #16 // += stride
+ vld1.8 {\d12}, [r6] // p0
+ vld1.8 {\d22}, [r9] // p1
+.else
+ add r6, r2, r9 // x + off
+ sub r9, r2, r9 // x - off
+ vld1.32 {\d11[0]}, [r6] // p0
+ add r6, r6, #8 // += stride
+ vld1.32 {\d21[0]}, [r9] // p1
+ add r9, r9, #8 // += stride
+ vld1.32 {\d11[1]}, [r6] // p0
+ add r6, r6, #8 // += stride
+ vld1.32 {\d21[1]}, [r9] // p1
+ add r9, r9, #8 // += stride
+ vld1.32 {\d12[0]}, [r6] // p0
+ add r6, r6, #8 // += stride
+ vld1.32 {\d22[0]}, [r9] // p1
+ add r9, r9, #8 // += stride
+ vld1.32 {\d12[1]}, [r6] // p0
+ vld1.32 {\d22[1]}, [r9] // p1
+.endif
+.endm
+.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
+.if \min
+ vmin.u8 q3, q3, \s1
+ vmax.u8 q4, q4, \s1
+ vmin.u8 q3, q3, \s2
+ vmax.u8 q4, q4, \s2
+.endif
+ vabd.u8 q8, q0, \s1 // abs(diff)
+ vabd.u8 q11, q0, \s2 // abs(diff)
+ vshl.u8 q9, q8, \shift // abs(diff) >> shift
+ vshl.u8 q12, q11, \shift // abs(diff) >> shift
+ vqsub.u8 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
+ vqsub.u8 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
+ vcgt.u8 q10, q0, \s1 // px > p0
+ vcgt.u8 q13, q0, \s2 // px > p1
+ vmin.u8 q9, q9, q8 // imin(abs(diff), clip)
+ vmin.u8 q12, q12, q11 // imin(abs(diff), clip)
+ vneg.s8 q8, q9 // -imin()
+ vneg.s8 q11, q12 // -imin()
+ vbsl q10, q8, q9 // constrain() = imax(imin(diff, clip), -clip)
+ vdup.8 d18, \tap // taps[k]
+ vbsl q13, q11, q12 // constrain() = imax(imin(diff, clip), -clip)
+ vmlal.s8 q1, d20, d18 // sum += taps[k] * constrain()
+ vmlal.s8 q1, d26, d18 // sum += taps[k] * constrain()
+ vmlal.s8 q2, d21, d18 // sum += taps[k] * constrain()
+ vmlal.s8 q2, d27, d18 // sum += taps[k] * constrain()
+.endm
+
+// void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride,
+// const uint16_t *tmp, int pri_strength,
+// int sec_strength, int dir, int damping,
+// int h, size_t edges);
+.macro filter_func_8 w, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_edged_neon
+.if \pri
+ movrel_local r8, pri_taps
+ and r9, r3, #1
+ add r8, r8, r9, lsl #1
+.endif
+ movrel_local r9, directions\w
+ add r5, r9, r5, lsl #1
+ vmov.u8 d17, #7
+ vdup.8 d16, r6 // damping
+
+ vmov.8 d8[0], r3
+ vmov.8 d8[1], r4
+ vclz.i8 d8, d8 // clz(threshold)
+ vsub.i8 d8, d17, d8 // ulog2(threshold)
+ vqsub.u8 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
+ vneg.s8 d8, d8 // -shift
+.if \sec
+ vdup.8 q6, d8[1]
+.endif
+.if \pri
+ vdup.8 q5, d8[0]
+.endif
+
+1:
+.if \w == 8
+ add r12, r2, #16
+ vld1.8 {d0}, [r2, :64] // px
+ vld1.8 {d1}, [r12, :64] // px
+.else
+ add r12, r2, #8
+ vld1.32 {d0[0]}, [r2, :32] // px
+ add r9, r2, #2*8
+ vld1.32 {d0[1]}, [r12, :32] // px
+ add r12, r12, #2*8
+ vld1.32 {d1[0]}, [r9, :32] // px
+ vld1.32 {d1[1]}, [r12, :32] // px
+.endif
+
+ vmov.u8 q1, #0 // sum
+ vmov.u8 q2, #0 // sum
+.if \min
+ vmov.u16 q3, q0 // min
+ vmov.u16 q4, q0 // max
+.endif
+
+ // Instead of loading sec_taps 2, 1 from memory, just set it
+ // to 2 initially and decrease for the second round.
+ // This is also used as loop counter.
+ mov lr, #2 // sec_taps[0]
+
+2:
+.if \pri
+ ldrsb r9, [r5] // off1
+
+ load_px_8 d28, d29, d30, d31, \w
+.endif
+
+.if \sec
+ add r5, r5, #4 // +2*2
+ ldrsb r9, [r5] // off2
+.endif
+
+.if \pri
+ ldrb r12, [r8] // *pri_taps
+ vdup.8 q7, r3 // threshold
+
+ handle_pixel_8 q14, q15, q7, q5, r12, \min
+.endif
+
+.if \sec
+ load_px_8 d28, d29, d30, d31, \w
+
+ add r5, r5, #8 // +2*4
+ ldrsb r9, [r5] // off3
+
+ vdup.8 q7, r4 // threshold
+
+ handle_pixel_8 q14, q15, q7, q6, lr, \min
+
+ load_px_8 d28, d29, d30, d31, \w
+
+ handle_pixel_8 q14, q15, q7, q6, lr, \min
+
+ sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
+.else
+ add r5, r5, #1 // r5 += 1
+.endif
+ subs lr, lr, #1 // sec_tap-- (value)
+.if \pri
+ add r8, r8, #1 // pri_taps++ (pointer)
+.endif
+ bne 2b
+
+ vshr.s16 q14, q1, #15 // -(sum < 0)
+ vshr.s16 q15, q2, #15 // -(sum < 0)
+ vadd.i16 q1, q1, q14 // sum - (sum < 0)
+ vadd.i16 q2, q2, q15 // sum - (sum < 0)
+ vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
+ vrshr.s16 q2, q2, #4 // (8 + sum - (sum < 0)) >> 4
+ vaddw.u8 q1, q1, d0 // px + (8 + sum ...) >> 4
+ vaddw.u8 q2, q2, d1 // px + (8 + sum ...) >> 4
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+.if \min
+ vmin.u8 q0, q0, q4
+ vmax.u8 q0, q0, q3 // iclip(px + .., min, max)
+.endif
+.if \w == 8
+ vst1.8 {d0}, [r0, :64], r1
+ add r2, r2, #2*16 // tmp += 2*tmp_stride
+ subs r7, r7, #2 // h -= 2
+ vst1.8 {d1}, [r0, :64], r1
+.else
+ vst1.32 {d0[0]}, [r0, :32], r1
+ add r2, r2, #4*8 // tmp += 4*tmp_stride
+ vst1.32 {d0[1]}, [r0, :32], r1
+ subs r7, r7, #4 // h -= 4
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d1[1]}, [r0, :32], r1
+.endif
+
+ // Reset pri_taps and directions back to the original point
+ sub r5, r5, #2
+.if \pri
+ sub r8, r8, #2
+.endif
+
+ bgt 1b
+ vpop {q4-q7}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+.macro filter_8 w
+filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
+filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
+filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
+.endm
+
+filter_8 8
+filter_8 4
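
Note: the handle_pixel_8 macro and the filter epilogue above are a vectorized form of CDEF's per-pixel constrain-and-accumulate step, with shift = imax(0, damping - ulog2(strength)) computed via vclz/vqsub. A rough scalar C sketch of that math follows (function names are illustrative, not dav1d's internal API):

#include <stdlib.h>

/* clip = imax(0, threshold - (abs(diff) >> shift));
 * constrain() = imax(imin(diff, clip), -clip) -- as in the comments above. */
static int constrain(int diff, int threshold, int shift) {
    int clip = threshold - (abs(diff) >> shift);
    if (clip < 0) clip = 0;
    return diff > clip ? clip : (diff < -clip ? -clip : diff);
}

/* After sum += taps[k] * constrain(p - px, strength, shift) over the primary
 * and secondary taps, the output is px + ((8 + sum - (sum < 0)) >> 4), clamped
 * to the local min/max only in the pri+sec variant (the \min==1 case). */
static int cdef_apply(int px, int sum, int vmin, int vmax) {
    int v = px + ((8 + sum - (sum < 0)) >> 4);
    return v < vmin ? vmin : v > vmax ? vmax : v;
}
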
diff --git a/third_party/dav1d/src/arm/32/cdef16.S b/third_party/dav1d/src/arm/32/cdef16.S
new file mode 100644
index 0000000000..d14525d720
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/cdef16.S
@@ -0,0 +1,233 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
+// r1 = d0/q0
+// r2 = d2/q1
+.macro pad_top_bot_16 s1, s2, w, stride, r1, r2, align, ret
+ tst r7, #1 // CDEF_HAVE_LEFT
+ beq 2f
+ // CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ vldr s8, [\s1, #-4]
+ vld1.16 {\r1}, [\s1, :\align]
+ vldr s9, [\s1, #2*\w]
+ vldr s10, [\s2, #-4]
+ vld1.16 {\r2}, [\s2, :\align]
+ vldr s11, [\s2, #2*\w]
+ vstr s8, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s9, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s10, [r0, #-4]
+ vst1.16 {\r2}, [r0, :\align]
+ vstr s11, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vldr s8, [\s1, #-4]
+ vld1.16 {\r1}, [\s1, :\align]
+ vldr s9, [\s2, #-4]
+ vld1.16 {\r2}, [\s2, :\align]
+ vstr s8, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s9, [r0, #-4]
+ vst1.16 {\r2}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+2:
+ // !CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ vld1.16 {\r1}, [\s1, :\align]
+ vldr s8, [\s1, #2*\w]
+ vld1.16 {\r2}, [\s2, :\align]
+ vldr s9, [\s2, #2*\w]
+ vstr s12, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s8, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s12, [r0, #-4]
+ vst1.16 {\r2}, [r0, :\align]
+ vstr s9, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vld1.16 {\r1}, [\s1, :\align]
+ vld1.16 {\r2}, [\s2, :\align]
+ vstr s12, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s12, [r0, #-4]
+ vst1.16 {\r2}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+.endif
+3:
+.endm
+
+// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
+
+// r1 = d0/q0
+// r2 = d2/q1
+.macro padding_func_16 w, stride, r1, r2, align
+function cdef_padding\w\()_16bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #32]
+ vmov.i16 q3, #0x8000
+ tst r7, #4 // CDEF_HAVE_TOP
+ bne 1f
+ // !CDEF_HAVE_TOP
+ sub r12, r0, #2*(2*\stride+2)
+ vmov.i16 q2, #0x8000
+ vst1.16 {q2,q3}, [r12]!
+.if \w == 8
+ vst1.16 {q2,q3}, [r12]!
+.endif
+ b 3f
+1:
+ // CDEF_HAVE_TOP
+ add r8, r4, r2
+ sub r0, r0, #2*(2*\stride)
+ pad_top_bot_16 r4, r8, \w, \stride, \r1, \r2, \align, 0
+
+ // Middle section
+3:
+ tst r7, #1 // CDEF_HAVE_LEFT
+ beq 2f
+ // CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ vld1.32 {d2[]}, [r3, :32]!
+ vldr s5, [r1, #2*\w]
+ vld1.16 {\r1}, [r1, :\align], r2
+ subs r6, r6, #1
+ vstr s4, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s5, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 0b
+ b 3f
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vld1.32 {d2[]}, [r3, :32]!
+ vld1.16 {\r1}, [r1, :\align], r2
+ subs r6, r6, #1
+ vstr s4, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 1b
+ b 3f
+2:
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ vldr s4, [r1, #2*\w]
+ vld1.16 {\r1}, [r1, :\align], r2
+ subs r6, r6, #1
+ vstr s12, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s4, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 0b
+ b 3f
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vld1.16 {\r1}, [r1, :\align], r2
+ subs r6, r6, #1
+ vstr s12, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 1b
+
+3:
+ tst r7, #8 // CDEF_HAVE_BOTTOM
+ bne 1f
+ // !CDEF_HAVE_BOTTOM
+ sub r12, r0, #4
+ vmov.i16 q2, #0x8000
+ vst1.16 {q2,q3}, [r12]!
+.if \w == 8
+ vst1.16 {q2,q3}, [r12]!
+.endif
+ pop {r4-r8,pc}
+1:
+ // CDEF_HAVE_BOTTOM
+ add r8, r5, r2
+ pad_top_bot_16 r5, r8, \w, \stride, \r1, \r2, \align, 1
+endfunc
+.endm
+
+padding_func_16 8, 16, q0, q1, 128
+padding_func_16 4, 8, d0, d2, 64
+
+tables
+
+filter 8, 16
+filter 4, 16
+
+find_dir 16
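
Note: the 8 bpc and 16 bpc padding functions above build the same scratch layout: the block plus a two-pixel border on every side, with unavailable neighbours filled with the 0x8000 sentinel. A minimal scalar sketch of one middle row, assuming the CdefEdgeFlags bit values implied by the tst #1/#2/#4/#8 tests (the enum is copied here for self-containment; other names are illustrative):

#include <stdint.h>

enum { CDEF_HAVE_LEFT = 1, CDEF_HAVE_RIGHT = 2, CDEF_HAVE_TOP = 4, CDEF_HAVE_BOTTOM = 8 };

/* tmp points at the first in-block column of one scratch row. */
static void pad_row_8bpc(uint16_t *tmp, const uint8_t *src,
                         const uint8_t left[2], int w, int edges) {
    for (int x = 0; x < w; x++)
        tmp[x] = src[x];                 /* widen the w block pixels */
    if (edges & CDEF_HAVE_LEFT) {        /* two columns from the left[] buffer */
        tmp[-2] = left[0];
        tmp[-1] = left[1];
    } else {
        tmp[-2] = tmp[-1] = 0x8000;      /* matches the vmov.i16 #0x8000 fill */
    }
    if (edges & CDEF_HAVE_RIGHT) {       /* two columns read past the block */
        tmp[w]     = src[w];
        tmp[w + 1] = src[w + 1];
    } else {
        tmp[w] = tmp[w + 1] = 0x8000;
    }
}
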
diff --git a/third_party/dav1d/src/arm/32/cdef_tmpl.S b/third_party/dav1d/src/arm/32/cdef_tmpl.S
new file mode 100644
index 0000000000..33ff9e5816
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/cdef_tmpl.S
@@ -0,0 +1,515 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+.macro dir_table w, stride
+const directions\w
+ .byte -1 * \stride + 1, -2 * \stride + 2
+ .byte 0 * \stride + 1, -1 * \stride + 2
+ .byte 0 * \stride + 1, 0 * \stride + 2
+ .byte 0 * \stride + 1, 1 * \stride + 2
+ .byte 1 * \stride + 1, 2 * \stride + 2
+ .byte 1 * \stride + 0, 2 * \stride + 1
+ .byte 1 * \stride + 0, 2 * \stride + 0
+ .byte 1 * \stride + 0, 2 * \stride - 1
+// Repeated, to avoid & 7
+ .byte -1 * \stride + 1, -2 * \stride + 2
+ .byte 0 * \stride + 1, -1 * \stride + 2
+ .byte 0 * \stride + 1, 0 * \stride + 2
+ .byte 0 * \stride + 1, 1 * \stride + 2
+ .byte 1 * \stride + 1, 2 * \stride + 2
+ .byte 1 * \stride + 0, 2 * \stride + 1
+endconst
+.endm
+
+.macro tables
+dir_table 8, 16
+dir_table 4, 8
+
+const pri_taps
+ .byte 4, 2, 3, 3
+endconst
+.endm
+
+.macro load_px d11, d12, d21, d22, w
+.if \w == 8
+ add r6, r2, r9, lsl #1 // x + off
+ sub r9, r2, r9, lsl #1 // x - off
+ vld1.16 {\d11,\d12}, [r6] // p0
+ vld1.16 {\d21,\d22}, [r9] // p1
+.else
+ add r6, r2, r9, lsl #1 // x + off
+ sub r9, r2, r9, lsl #1 // x - off
+ vld1.16 {\d11}, [r6] // p0
+ add r6, r6, #2*8 // += stride
+ vld1.16 {\d21}, [r9] // p1
+ add r9, r9, #2*8 // += stride
+ vld1.16 {\d12}, [r6] // p0
+ vld1.16 {\d22}, [r9] // p1
+.endif
+.endm
+.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
+.if \min
+ vmin.u16 q2, q2, \s1
+ vmax.s16 q3, q3, \s1
+ vmin.u16 q2, q2, \s2
+ vmax.s16 q3, q3, \s2
+.endif
+ vabd.u16 q8, q0, \s1 // abs(diff)
+ vabd.u16 q11, q0, \s2 // abs(diff)
+ vshl.u16 q9, q8, \shift // abs(diff) >> shift
+ vshl.u16 q12, q11, \shift // abs(diff) >> shift
+ vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
+ vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
+ vsub.i16 q10, \s1, q0 // diff = p0 - px
+ vsub.i16 q13, \s2, q0 // diff = p1 - px
+ vneg.s16 q8, q9 // -clip
+ vneg.s16 q11, q12 // -clip
+ vmin.s16 q10, q10, q9 // imin(diff, clip)
+ vmin.s16 q13, q13, q12 // imin(diff, clip)
+ vdup.16 q9, \tap // taps[k]
+ vmax.s16 q10, q10, q8 // constrain() = imax(imin(diff, clip), -clip)
+ vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip)
+ vmla.i16 q1, q10, q9 // sum += taps[k] * constrain()
+ vmla.i16 q1, q13, q9 // sum += taps[k] * constrain()
+.endm
+
+// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
+// const uint16_t *tmp, int pri_strength,
+// int sec_strength, int dir, int damping,
+// int h, size_t edges);
+.macro filter_func w, bpc, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_\bpc\()bpc_neon
+.if \bpc == 8
+ cmp r8, #0xf
+ beq cdef_filter\w\suffix\()_edged_neon
+.endif
+.if \pri
+.if \bpc == 16
+ clz r9, r9
+ sub r9, r9, #24 // -bitdepth_min_8
+ neg r9, r9 // bitdepth_min_8
+.endif
+ movrel_local r8, pri_taps
+.if \bpc == 16
+ lsr r9, r3, r9 // pri_strength >> bitdepth_min_8
+ and r9, r9, #1 // (pri_strength >> bitdepth_min_8) & 1
+.else
+ and r9, r3, #1
+.endif
+ add r8, r8, r9, lsl #1
+.endif
+ movrel_local r9, directions\w
+ add r5, r9, r5, lsl #1
+ vmov.u16 d17, #15
+ vdup.16 d16, r6 // damping
+
+.if \pri
+ vdup.16 q5, r3 // threshold
+.endif
+.if \sec
+ vdup.16 q7, r4 // threshold
+.endif
+ vmov.16 d8[0], r3
+ vmov.16 d8[1], r4
+ vclz.i16 d8, d8 // clz(threshold)
+ vsub.i16 d8, d17, d8 // ulog2(threshold)
+ vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
+ vneg.s16 d8, d8 // -shift
+.if \sec
+ vdup.16 q6, d8[1]
+.endif
+.if \pri
+ vdup.16 q4, d8[0]
+.endif
+
+1:
+.if \w == 8
+ vld1.16 {q0}, [r2, :128] // px
+.else
+ add r12, r2, #2*8
+ vld1.16 {d0}, [r2, :64] // px
+ vld1.16 {d1}, [r12, :64] // px
+.endif
+
+ vmov.u16 q1, #0 // sum
+.if \min
+ vmov.u16 q2, q0 // min
+ vmov.u16 q3, q0 // max
+.endif
+
+ // Instead of loading sec_taps 2, 1 from memory, just set it
+ // to 2 initially and decrease for the second round.
+ // This is also used as loop counter.
+ mov lr, #2 // sec_taps[0]
+
+2:
+.if \pri
+ ldrsb r9, [r5] // off1
+
+ load_px d28, d29, d30, d31, \w
+.endif
+
+.if \sec
+ add r5, r5, #4 // +2*2
+ ldrsb r9, [r5] // off2
+.endif
+
+.if \pri
+ ldrb r12, [r8] // *pri_taps
+
+ handle_pixel q14, q15, q5, q4, r12, \min
+.endif
+
+.if \sec
+ load_px d28, d29, d30, d31, \w
+
+ add r5, r5, #8 // +2*4
+ ldrsb r9, [r5] // off3
+
+ handle_pixel q14, q15, q7, q6, lr, \min
+
+ load_px d28, d29, d30, d31, \w
+
+ handle_pixel q14, q15, q7, q6, lr, \min
+
+ sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
+.else
+ add r5, r5, #1 // r5 += 1
+.endif
+ subs lr, lr, #1 // sec_tap-- (value)
+.if \pri
+ add r8, r8, #1 // pri_taps++ (pointer)
+.endif
+ bne 2b
+
+ vshr.s16 q14, q1, #15 // -(sum < 0)
+ vadd.i16 q1, q1, q14 // sum - (sum < 0)
+ vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
+ vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4
+.if \min
+ vmin.s16 q0, q0, q3
+ vmax.s16 q0, q0, q2 // iclip(px + .., min, max)
+.endif
+.if \bpc == 8
+ vmovn.u16 d0, q0
+.endif
+.if \w == 8
+ add r2, r2, #2*16 // tmp += tmp_stride
+ subs r7, r7, #1 // h--
+.if \bpc == 8
+ vst1.8 {d0}, [r0, :64], r1
+.else
+ vst1.16 {q0}, [r0, :128], r1
+.endif
+.else
+.if \bpc == 8
+ vst1.32 {d0[0]}, [r0, :32], r1
+.else
+ vst1.16 {d0}, [r0, :64], r1
+.endif
+ add r2, r2, #2*16 // tmp += 2*tmp_stride
+ subs r7, r7, #2 // h -= 2
+.if \bpc == 8
+ vst1.32 {d0[1]}, [r0, :32], r1
+.else
+ vst1.16 {d1}, [r0, :64], r1
+.endif
+.endif
+
+ // Reset pri_taps and directions back to the original point
+ sub r5, r5, #2
+.if \pri
+ sub r8, r8, #2
+.endif
+
+ bgt 1b
+ vpop {q4-q7}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+.macro filter w, bpc
+filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
+filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
+filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
+
+function cdef_filter\w\()_\bpc\()bpc_neon, export=1
+ push {r4-r9,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #92]
+ ldrd r6, r7, [sp, #100]
+.if \bpc == 16
+ ldrd r8, r9, [sp, #108]
+.else
+ ldr r8, [sp, #108]
+.endif
+ cmp r3, #0 // pri_strength
+ bne 1f
+ b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
+1:
+ cmp r4, #0 // sec_strength
+ bne 1f
+ b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
+1:
+ b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
+endfunc
+.endm
+
+const div_table, align=4
+ .short 840, 420, 280, 210, 168, 140, 120, 105
+endconst
+
+const alt_fact, align=4
+ .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
+endconst
+
+.macro cost_alt dest, s1, s2, s3, s4, s5, s6
+ vmull.s16 q1, \s1, \s1 // sum_alt[n]*sum_alt[n]
+ vmull.s16 q2, \s2, \s2
+ vmull.s16 q3, \s3, \s3
+ vmull.s16 q5, \s4, \s4 // sum_alt[n]*sum_alt[n]
+ vmull.s16 q12, \s5, \s5
+ vmull.s16 q6, \s6, \s6 // q6 overlaps the first \s1-\s2 here
+ vmul.i32 q1, q1, q13 // sum_alt[n]^2*fact
+ vmla.i32 q1, q2, q14
+ vmla.i32 q1, q3, q15
+ vmul.i32 q5, q5, q13 // sum_alt[n]^2*fact
+ vmla.i32 q5, q12, q14
+ vmla.i32 q5, q6, q15
+ vadd.i32 d2, d2, d3
+ vadd.i32 d3, d10, d11
+ vpadd.i32 \dest, d2, d3 // *cost_ptr
+.endm
+
+.macro find_best s1, s2, s3
+.ifnb \s2
+ vmov.32 lr, \s2
+.endif
+ cmp r12, r1 // cost[n] > best_cost
+ itt gt
+ movgt r0, r3 // best_dir = n
+ movgt r1, r12 // best_cost = cost[n]
+.ifnb \s2
+ add r3, r3, #1 // n++
+ cmp lr, r1 // cost[n] > best_cost
+ vmov.32 r12, \s3
+ itt gt
+ movgt r0, r3 // best_dir = n
+ movgt r1, lr // best_cost = cost[n]
+ add r3, r3, #1 // n++
+.endif
+.endm
+
+// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
+// unsigned *const var)
+.macro find_dir bpc
+function cdef_find_dir_\bpc\()bpc_neon, export=1
+ push {lr}
+ vpush {q4-q7}
+.if \bpc == 16
+ clz r3, r3 // clz(bitdepth_max)
+ sub lr, r3, #24 // -bitdepth_min_8
+.endif
+ sub sp, sp, #32 // cost
+ mov r3, #8
+ vmov.u16 q1, #0 // q0-q1 sum_diag[0]
+ vmov.u16 q3, #0 // q2-q3 sum_diag[1]
+ vmov.u16 q5, #0 // q4-q5 sum_hv[0-1]
+ vmov.u16 q8, #0 // q6,d16 sum_alt[0]
+ // q7,d17 sum_alt[1]
+ vmov.u16 q9, #0 // q9,d22 sum_alt[2]
+ vmov.u16 q11, #0
+ vmov.u16 q10, #0 // q10,d23 sum_alt[3]
+
+
+.irpc i, 01234567
+.if \bpc == 8
+ vld1.8 {d30}, [r0, :64], r1
+ vmov.u8 d31, #128
+ vsubl.u8 q15, d30, d31 // img[x] - 128
+.else
+ vld1.16 {q15}, [r0, :128], r1
+ vdup.16 q14, lr // -bitdepth_min_8
+ vshl.u16 q15, q15, q14
+ vmov.u16 q14, #128
+ vsub.i16 q15, q15, q14 // img[x] - 128
+.endif
+ vmov.u16 q14, #0
+
+.if \i == 0
+ vmov q0, q15 // sum_diag[0]
+.else
+ vext.8 q12, q14, q15, #(16-2*\i)
+ vext.8 q13, q15, q14, #(16-2*\i)
+ vadd.i16 q0, q0, q12 // sum_diag[0]
+ vadd.i16 q1, q1, q13 // sum_diag[0]
+.endif
+ vrev64.16 q13, q15
+ vswp d26, d27 // [-x]
+.if \i == 0
+ vmov q2, q13 // sum_diag[1]
+.else
+ vext.8 q12, q14, q13, #(16-2*\i)
+ vext.8 q13, q13, q14, #(16-2*\i)
+ vadd.i16 q2, q2, q12 // sum_diag[1]
+ vadd.i16 q3, q3, q13 // sum_diag[1]
+.endif
+
+ vpadd.u16 d26, d30, d31 // [(x >> 1)]
+ vmov.u16 d27, #0
+ vpadd.u16 d24, d26, d28
+ vpadd.u16 d24, d24, d28 // [y]
+ vmov.u16 r12, d24[0]
+ vadd.i16 q5, q5, q15 // sum_hv[1]
+.if \i < 4
+ vmov.16 d8[\i], r12 // sum_hv[0]
+.else
+ vmov.16 d9[\i-4], r12 // sum_hv[0]
+.endif
+
+.if \i == 0
+ vmov.u16 q6, q13 // sum_alt[0]
+.else
+ vext.8 q12, q14, q13, #(16-2*\i)
+ vext.8 q14, q13, q14, #(16-2*\i)
+ vadd.i16 q6, q6, q12 // sum_alt[0]
+ vadd.i16 d16, d16, d28 // sum_alt[0]
+.endif
+ vrev64.16 d26, d26 // [-(x >> 1)]
+ vmov.u16 q14, #0
+.if \i == 0
+ vmov q7, q13 // sum_alt[1]
+.else
+ vext.8 q12, q14, q13, #(16-2*\i)
+ vext.8 q13, q13, q14, #(16-2*\i)
+ vadd.i16 q7, q7, q12 // sum_alt[1]
+ vadd.i16 d17, d17, d26 // sum_alt[1]
+.endif
+
+.if \i < 6
+ vext.8 q12, q14, q15, #(16-2*(3-(\i/2)))
+ vext.8 q13, q15, q14, #(16-2*(3-(\i/2)))
+ vadd.i16 q9, q9, q12 // sum_alt[2]
+ vadd.i16 d22, d22, d26 // sum_alt[2]
+.else
+ vadd.i16 q9, q9, q15 // sum_alt[2]
+.endif
+.if \i == 0
+ vmov q10, q15 // sum_alt[3]
+.elseif \i == 1
+ vadd.i16 q10, q10, q15 // sum_alt[3]
+.else
+ vext.8 q12, q14, q15, #(16-2*(\i/2))
+ vext.8 q13, q15, q14, #(16-2*(\i/2))
+ vadd.i16 q10, q10, q12 // sum_alt[3]
+ vadd.i16 d23, d23, d26 // sum_alt[3]
+.endif
+.endr
+
+ vmov.u32 q15, #105
+
+ vmull.s16 q12, d8, d8 // sum_hv[0]*sum_hv[0]
+ vmlal.s16 q12, d9, d9
+ vmull.s16 q13, d10, d10 // sum_hv[1]*sum_hv[1]
+ vmlal.s16 q13, d11, d11
+ vadd.s32 d8, d24, d25
+ vadd.s32 d9, d26, d27
+ vpadd.s32 d8, d8, d9 // cost[2,6] (s16, s17)
+ vmul.i32 d8, d8, d30 // cost[2,6] *= 105
+
+ vrev64.16 q1, q1
+ vrev64.16 q3, q3
+ vext.8 q1, q1, q1, #10 // sum_diag[0][14-n]
+ vext.8 q3, q3, q3, #10 // sum_diag[1][14-n]
+
+ vstr s16, [sp, #2*4] // cost[2]
+ vstr s17, [sp, #6*4] // cost[6]
+
+ movrel_local r12, div_table
+ vld1.16 {q14}, [r12, :128]
+
+ vmull.s16 q5, d0, d0 // sum_diag[0]*sum_diag[0]
+ vmull.s16 q12, d1, d1
+ vmlal.s16 q5, d2, d2
+ vmlal.s16 q12, d3, d3
+ vmull.s16 q0, d4, d4 // sum_diag[1]*sum_diag[1]
+ vmull.s16 q1, d5, d5
+ vmlal.s16 q0, d6, d6
+ vmlal.s16 q1, d7, d7
+ vmovl.u16 q13, d28 // div_table
+ vmovl.u16 q14, d29
+ vmul.i32 q5, q5, q13 // cost[0]
+ vmla.i32 q5, q12, q14
+ vmul.i32 q0, q0, q13 // cost[4]
+ vmla.i32 q0, q1, q14
+ vadd.i32 d10, d10, d11
+ vadd.i32 d0, d0, d1
+ vpadd.i32 d0, d10, d0 // cost[0,4] = s0,s1
+
+ movrel_local r12, alt_fact
+ vld1.16 {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105
+
+ vstr s0, [sp, #0*4] // cost[0]
+ vstr s1, [sp, #4*4] // cost[4]
+
+ vmovl.u16 q13, d29 // div_table[2*m+1] + 105
+ vmovl.u16 q14, d30
+ vmovl.u16 q15, d31
+
+ cost_alt d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3]
+ cost_alt d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7]
+ vstr s28, [sp, #1*4] // cost[1]
+ vstr s29, [sp, #3*4] // cost[3]
+
+ mov r0, #0 // best_dir
+ vmov.32 r1, d0[0] // best_cost
+ mov r3, #1 // n
+
+ vstr s30, [sp, #5*4] // cost[5]
+ vstr s31, [sp, #7*4] // cost[7]
+
+ vmov.32 r12, d14[0]
+
+ find_best d14[0], d8[0], d14[1]
+ find_best d14[1], d0[1], d15[0]
+ find_best d15[0], d8[1], d15[1]
+ find_best d15[1]
+
+ eor r3, r0, #4 // best_dir ^4
+ ldr r12, [sp, r3, lsl #2]
+ sub r1, r1, r12 // best_cost - cost[best_dir ^ 4]
+ lsr r1, r1, #10
+ str r1, [r2] // *var
+
+ add sp, sp, #32
+ vpop {q4-q7}
+ pop {pc}
+endfunc
+.endm
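
Note: cdef_find_dir above accumulates directional sums, turns them into eight costs (scaled by div_table/alt_fact), then picks the largest; the reported variance compares it against the orthogonal direction. A scalar sketch of that final selection step (cost computation omitted; the function name is illustrative):

#include <stdint.h>

static int cdef_pick_dir(const uint32_t cost[8], unsigned *var) {
    int best_dir = 0;
    uint32_t best_cost = cost[0];
    for (int n = 1; n < 8; n++) {
        if (cost[n] > best_cost) {       /* "cost[n] > best_cost" as above */
            best_cost = cost[n];
            best_dir = n;
        }
    }
    /* best_cost - cost[best_dir ^ 4], then >> 10, as in the epilogue above */
    *var = (best_cost - cost[best_dir ^ 4]) >> 10;
    return best_dir;
}
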
diff --git a/third_party/dav1d/src/arm/32/filmgrain.S b/third_party/dav1d/src/arm/32/filmgrain.S
new file mode 100644
index 0000000000..9d59d5d5ed
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/filmgrain.S
@@ -0,0 +1,2039 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+.macro increment_seed steps, shift=1
+ lsr r11, r2, #3
+ lsr r12, r2, #12
+ lsr lr, r2, #1
+ eor r11, r2, r11 // (r >> 0) ^ (r >> 3)
+ eor r12, r12, lr // (r >> 12) ^ (r >> 1)
+ eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr r2, r2, #\steps
+.endif
+ and r11, r11, #((1 << \steps) - 1) // bit
+.if \shift
+ orr r2, r2, r11, lsl #(16 - \steps) // *state
+.else
+ orr r2, r2, r11, lsl #16 // *state
+.endif
+.endm
+
+.macro read_rand dest, bits, age
+ ubfx \dest, r2, #16 - \bits - \age, #\bits
+.endm
+
+.macro read_shift_rand dest, bits
+ ubfx \dest, r2, #17 - \bits, #\bits
+ lsr r2, r2, #1
+.endm
+
+// special calling convention:
+// r2 holds seed
+// r3 holds dav1d_gaussian_sequence
+// clobbers r11-r12
+// returns in d0-d1
+function get_gaussian_neon
+ push {r5-r6,lr}
+ increment_seed 4
+ read_rand r5, 11, 3
+ read_rand r6, 11, 2
+ add r5, r3, r5, lsl #1
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[0]}, [r5]
+ read_rand r5, 11, 1
+ vld1.16 {d0[1]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 0
+ increment_seed 4
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[2]}, [r5]
+ read_rand r5, 11, 3
+ vld1.16 {d0[3]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 2
+ vld1.16 {d1[0]}, [r5]
+ add r6, r3, r6, lsl #1
+ read_rand r5, 11, 1
+ vld1.16 {d1[1]}, [r6]
+ read_rand r6, 11, 0
+ add r5, r3, r5, lsl #1
+ add r6, r3, r6, lsl #1
+ vld1.16 {d1[2]}, [r5]
+ vld1.16 {d1[3]}, [r6]
+ pop {r5-r6,pc}
+endfunc
+
+.macro get_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r0, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r1, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r2, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r3, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r4, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r5, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r6, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r7, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r8, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r9, q0
+ increment_seed 2
+ read_rand r11, 11, 1
+ read_rand r12, 11, 0
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ vld1.16 {d0[1]}, [r12]
+ vrshl.s16 d0, d0, d30
+ vmovn.i16 \r10, q0
+.endm
+
+.macro store_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
+ vst1.16 {\r0, \r1, \r2, \r3}, [r0]!
+ vst1.16 {\r4, \r5, \r6, \r7}, [r0]!
+ vst1.16 {\r8, \r9}, [r0]!
+ vst1.16 {\r10[0]}, [r0]!
+.endm
+
+.macro get_grain_row_44 r0, r1, r2, r3, r4, r5
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r0, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r1, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r2, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r3, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r4, q0
+ increment_seed 4
+ read_rand r11, 11, 3
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[]}, [r11]
+ read_rand r11, 11, 1
+ vld1.16 {d0[1]}, [r12]
+ add r11, r3, r11, lsl #1
+ read_rand r12, 11, 0
+ vld1.16 {d0[2]}, [r11]
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[3]}, [r12]
+ vrshl.s16 d0, d0, d30
+ vmovn.i16 \r5, q0
+.endm
+
+.macro store_grain_row_44 r0, r1, r2, r3, r4, r5
+ vst1.16 {\r0, \r1, \r2, \r3}, [r0]!
+ vst1.16 {\r4, \r5}, [r0]
+ add r0, r0, #GRAIN_WIDTH-32
+.endm
+
+function get_grain_2_neon
+ push {r11,lr}
+ increment_seed 2
+ read_rand r11, 11, 1
+ read_rand r12, 11, 0
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ vld1.16 {d0[1]}, [r12]
+ vrshl.s16 d0, d0, d30
+ vmovn.i16 d0, q0
+ pop {r11,pc}
+endfunc
+
+.macro get_grain_2 dst
+ bl get_grain_2_neon
+.ifnc \dst, d0
+ vmov \dst, d0
+.endif
+.endm
+
+// r1 holds the number of entries to produce
+// r6, r8 and r10 hold the previous output entries
+// q0 holds the vector of produced entries
+// q1 holds the input vector of sums from above
+.macro output_lag n
+function output_lag\n\()_neon
+ push {r0, lr}
+.if \n == 1
+ mov lr, #-128
+.else
+ mov r0, #1
+ mov lr, #1
+ sub r7, r7, #1
+ sub r9, r9, #1
+ lsl r0, r0, r7
+ lsl lr, lr, r9
+ add r7, r7, #1
+ add r9, r9, #1
+.endif
+1:
+ read_shift_rand r12, 11
+ vmov.32 r11, d2[0]
+ lsl r12, r12, #1
+ vext.8 q0, q0, q0, #1
+ ldrsh r12, [r3, r12]
+.if \n == 1
+ mla r11, r6, r4, r11 // sum (above) + *coeff * prev output
+ add r6, r11, r8 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, r10
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 + grain_scale_shift)
+ add r6, r6, r12
+ cmp r6, r5
+.elseif \n == 2
+ mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r6, r10, r11 // += *coeff * prev output 2
+ mov r8, r6
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr}
+ cmp r6, r5
+ mov lr, #-128
+.else
+ push {r1-r3}
+ sbfx r1, r4, #0, #8
+ sbfx r2, r4, #8, #8
+ sbfx r3, r4, #16, #8
+ mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r8, r2, r11 // sum (above) + *coeff * prev output 2
+ mla r11, r6, r3, r11 // += *coeff * prev output 3
+ pop {r1-r3}
+ mov r10, r8
+ mov r8, r6
+
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr}
+ cmp r6, r5
+ mov lr, #-128
+.endif
+ it gt
+ movgt r6, r5
+ cmp r6, lr
+ it lt
+ movlt r6, lr
+.if \n >= 2
+ pop {lr}
+.endif
+ subs r1, r1, #1
+ vext.8 q1, q1, q1, #4
+ vmov.8 d1[7], r6
+ bgt 1b
+ pop {r0, pc}
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
+
+function sum_lag1_above_neon
+ vmull.s8 q2, d6, d28
+ vmull.s8 q3, d7, d28
+ vmull.s8 q4, d0, d27
+ vmull.s8 q5, d1, d27
+
+ vaddl.s16 q0, d4, d8
+ vaddl.s16 q2, d5, d9
+ vaddl.s16 q4, d6, d10
+ vaddl.s16 q5, d7, d11
+
+ vmull.s8 q3, d3, d29
+ vmull.s8 q1, d2, d29
+
+ vaddw.s16 q4, q4, d6
+ vaddw.s16 q5, q5, d7
+ vaddw.s16 q3, q2, d3
+ vaddw.s16 q2, q0, d2
+ bx lr
+endfunc
+
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
+.ifc \lag\()_\edge, lag3_left
+ bl sum_lag3_left_above_neon
+.else
+ bl sum_\lag\()_above_neon
+.endif
+.ifc \type, uv_420
+ vpush {q6-q7}
+ add r12, r11, #GRAIN_WIDTH
+ vld1.16 {q0, q1}, [r11]!
+ vld1.16 {q6, q7}, [r12]!
+ vpaddl.s8 q0, q0
+ vpaddl.s8 q1, q1
+ vpaddl.s8 q6, q6
+ vpaddl.s8 q7, q7
+ vadd.i16 q0, q0, q6
+ vadd.i16 q1, q1, q7
+ vpop {q6-q7}
+ vrshrn.s16 d0, q0, #2
+ vrshrn.s16 d1, q1, #2
+.endif
+.ifc \type, uv_422
+ vld1.8 {q0, q1}, [r11]!
+ vpaddl.s8 q0, q0
+ vpaddl.s8 q1, q1
+ vrshrn.s16 d0, q0, #1
+ vrshrn.s16 d1, q1, #1
+.endif
+.ifc \type, uv_444
+ vld1.8 {q0}, [r11]!
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ vdup.8 d13, \uv_coeff
+.endif
+ vmull.s8 q1, d0, d13
+ vmull.s8 q0, d1, d13
+ vaddw.s16 q2, q2, d2
+ vaddw.s16 q3, q3, d3
+ vaddw.s16 q4, q4, d0
+ vaddw.s16 q5, q5, d1
+.endif
+.if \uv_layout && \elems == 16
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 15
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 9
+ b sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+ push {r11}
+.ifc \edge, left
+ increment_seed 4
+ read_rand r11, 11, 3
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d1[1]}, [r11]
+ read_rand r11, 11, 1
+ vld1.16 {d1[2]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d1[3]}, [r11]
+ lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ vrshl.s16 d1, d1, d30
+ vmovn.i16 d1, q0
+ vext.8 q2, q2, q2, #12
+.ifc \lag, lag3
+ vmov.s8 r10, d1[5]
+.endif
+.ifnc \lag, lag1
+ vmov.s8 r8, d1[6]
+.endif
+ vmov.s8 r6, d1[7]
+
+ vmov q1, q2
+ mov r1, #1
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0
+ vmov q1, q2
+ mov r1, #4
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ vmov q1, q3
+ mov r1, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ vmov q1, q4
+.if \elems == 9
+ mov r1, #1
+ bl output_\lag\()_neon
+ lsr r2, r2, #3
+
+ read_rand r11, 11, 2
+ read_rand r12, 11, 1
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d2[0]}, [r11]
+ read_rand r11, 11, 0
+ vld1.16 {d2[1]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d2[2]}, [r11]
+ vrshl.s16 d2, d2, d30
+ vmovn.i16 d2, q1
+ vext.8 q0, q0, q1, #7
+.else
+ mov r1, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ vmov q1, q5
+
+.ifc \edge, right
+ mov r1, #3
+ bl output_\lag\()_neon
+ read_shift_rand r11, 11
+ add r11, r3, r11, lsl #1
+ vld1.16 {d2[0]}, [r11]
+ vrshl.s16 d2, d2, d30
+ vext.8 q0, q0, q1, #1
+.else
+ mov r1, #4
+ bl output_\lag\()_neon
+.endif
+.endif
+.if \store
+ vst1.8 {q0}, [r0]!
+.endif
+ pop {r11}
+ pop {r1, pc}
+.endif
+.endm
+
+.macro sum_lag1_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag1_\edge\()_neon
+ push {r1, lr}
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0
+endfunc
+.endm
+
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 15
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 15
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 9
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 9
+
+.macro sum_lag1 type, dst, left, mid, right, edge=mid
+ vmov q3, \mid
+ vext.8 q0, \left, \mid, #15
+ vext.8 q1, \mid, \right, #1
+ bl sum_\type\()_lag1_\edge\()_neon
+ vmov \dst, q0
+.endm
+
+.macro sum_y_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 y, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_444, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_422, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_420, \dst, \left, \mid, \right, \edge
+.endm
+
+
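+// Weighted sum of the two rows above for lag 2, 16 pixels at a time,
+// accumulated as 32 bit sums in q2-q5. q8-q9 and q11-q12 cache those
+// rows and are slid along at the end.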
+function sum_lag2_above_neon
+ push {lr}
+ sub r12, r0, #2*GRAIN_WIDTH - 16
+ sub lr, r0, #1*GRAIN_WIDTH - 16
+ vld1.8 {q10}, [r12] // load top right
+ vld1.8 {q13}, [lr]
+
+ vext.8 q6, q8, q9, #14 // top left, top mid
+ vdup.8 d14, d28[0]
+ vext.8 q8, q8, q9, #15
+ vdup.8 d15, d28[1]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q2, d0, d12
+ vaddl.s16 q3, d1, d13
+ vaddl.s16 q4, d2, d16
+ vaddl.s16 q5, d3, d17
+
+ vext.8 q6, q9, q10, #1 // top mid, top right
+ vdup.8 d14, d28[3]
+ vext.8 q8, q9, q10, #2
+ vdup.8 d15, d28[4]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ vext.8 q6, q11, q12, #14 // top left, top mid
+ vdup.8 d14, d28[5]
+ vext.8 q8, q11, q12, #15
+ vdup.8 d15, d28[6]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ vext.8 q6, q12, q13, #1 // top mid, top right
+ vdup.8 d14, d29[0]
+ vext.8 q8, q12, q13, #2
+ vdup.8 d15, d29[1]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ vdup.8 d14, d28[2]
+ vdup.8 d15, d28[7]
+
+ vmull.s8 q0, d18, d14
+ vmull.s8 q1, d19, d14
+ vmull.s8 q6, d24, d15
+ vmull.s8 q8, d25, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vmov q8, q9
+ vmov q9, q10
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ vmov q11, q12
+ vmov q12, q13
+
+ pop {pc}
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag2_\edge\()_neon
+ push {r1, lr}
+.ifc \edge, left
+ sub r12, r0, #2*GRAIN_WIDTH
+ sub lr, r0, #1*GRAIN_WIDTH
+ vld1.8 {q9}, [r12] // load the previous block right above
+ vld1.8 {q12}, [lr]
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[4]
+endfunc
+.endm
+
+sum_lag2_func y, 0, left
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 15
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 15
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 9
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 9
+
+
+function sum_lag3_left_above_neon
+ // A separate codepath for the left edge, to avoid reading outside
+ // the edge of the buffer.
+ sub r12, r0, #3*GRAIN_WIDTH
+ vld1.8 {q11, q12}, [r12]
+ vext.8 q12, q11, q12, #13
+ vext.8 q11, q11, q11, #13
+ b sum_lag3_above_start
+endfunc
+
+function sum_lag3_above_neon
+ sub r12, r0, #3*GRAIN_WIDTH + 3
+ vld1.8 {q11, q12}, [r12]
+
+sum_lag3_above_start:
+ vdup.8 d20, d26[0]
+ vext.8 q9, q11, q12, #1
+ vdup.8 d21, d26[1]
+
+ vmull.s8 q0, d22, d20
+ vmull.s8 q1, d23, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vext.8 q8, q11, q12, #2
+ vdup.8 d20, d26[2]
+ vext.8 q9, q11, q12, #3
+ vdup.8 d21, d26[3]
+
+ vaddl.s16 q2, d0, d12
+ vaddl.s16 q3, d1, d13
+ vaddl.s16 q4, d2, d14
+ vaddl.s16 q5, d3, d15
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #4
+ vdup.8 d20, d26[4]
+ vext.8 q7, q11, q12, #5
+ vdup.8 d21, d26[5]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ sub r12, r0, #2*GRAIN_WIDTH + 3
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vext.8 q8, q11, q12, #6
+ vld1.8 {q11, q12}, [r12]
+ vdup.8 d20, d26[6]
+ vdup.8 d21, d26[7]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d22, d21
+ vmull.s8 q7, d23, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #1
+ vdup.8 d20, d27[0]
+ vext.8 q7, q11, q12, #2
+ vdup.8 d21, d27[1]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vext.8 q8, q11, q12, #3
+ vdup.8 d20, d27[2]
+ vext.8 q9, q11, q12, #4
+ vdup.8 d21, d27[3]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ sub r12, r0, #1*GRAIN_WIDTH + 3
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #5
+ vdup.8 d20, d27[4]
+ vext.8 q7, q11, q12, #6
+ vdup.8 d21, d27[5]
+
+ vld1.8 {q11, q12}, [r12]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vdup.8 d20, d27[6]
+ vext.8 q9, q11, q12, #1
+ vdup.8 d21, d27[7]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d22, d20
+ vmull.s8 q1, d23, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #2
+ vdup.8 d20, d28[0]
+ vext.8 q7, q11, q12, #3
+ vdup.8 d21, d28[1]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vext.8 q8, q11, q12, #4
+ vdup.8 d20, d28[2]
+ vext.8 q9, q11, q12, #5
+ vdup.8 d21, d28[3]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #6
+ vdup.8 d20, d28[4]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+
+ vaddw.s16 q2, q2, d0
+ vaddw.s16 q3, q3, d1
+ vaddw.s16 q4, q4, d2
+ vaddw.s16 q5, q5, d3
+
+ bx lr
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag3_\edge\()_neon
+ push {r1, lr}
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[0]
+endfunc
+.endm
+
+sum_lag3_func y, 0, left
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 15
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 15
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 9
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 9
+
+function generate_grain_rows_neon
+ push {r11,lr}
+1:
+ get_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
+ subs r1, r1, #1
+ store_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
+ bgt 1b
+ pop {r11,pc}
+endfunc
+
+function generate_grain_rows_44_neon
+ push {r11,lr}
+1:
+ get_grain_row_44 d16, d17, d18, d19, d20, d21
+ subs r1, r1, #1
+ store_grain_row_44 d16, d17, d18, d19, d20, d21
+ bgt 1b
+ pop {r11,pc}
+endfunc
+
+function gen_grain_uv_444_lag0_neon
+ vld1.8 {q3}, [r11]!
+ push {r11,lr}
+ bl get_gaussian_neon
+ vrshl.s16 q8, q0, q15
+ bl get_gaussian_neon
+ vrshl.s16 q9, q0, q15
+ vqmovn.s16 d0, q8
+ vqmovn.s16 d1, q9
+
+ vand q3, q3, q1
+ vmull.s8 q2, d6, d22
+ vmull.s8 q3, d7, d22
+ vrshl.s16 q2, q2, q12
+ vrshl.s16 q3, q3, q12
+ vaddw.s8 q2, q2, d0
+ vaddw.s8 q3, q3, d1
+ vqmovn.s16 d4, q2
+ vqmovn.s16 d5, q3
+ vst1.8 {q2}, [r0]!
+ pop {r11,pc}
+endfunc
+
+function get_grain_row_44_neon
+ push {r11,lr}
+ get_grain_row_44 d16, d17, d18, d19, d20, d21
+ pop {r11,pc}
+endfunc
+
+function add_uv_420_coeff_lag0_neon
+ vld1.16 {q2, q3}, [r11]!
+ vld1.16 {q4, q5}, [r12]!
+ vpaddl.s8 q2, q2
+ vpaddl.s8 q3, q3
+ vpaddl.s8 q4, q4
+ vpaddl.s8 q5, q5
+ vadd.i16 q2, q2, q4
+ vadd.i16 q3, q3, q5
+ vrshrn.s16 d4, q2, #2
+ vrshrn.s16 d5, q3, #2
+ b add_coeff_lag0_start
+endfunc
+
+function add_uv_422_coeff_lag0_neon
+ vld1.16 {q2, q3}, [r11]!
+ vpaddl.s8 q2, q2
+ vpaddl.s8 q3, q3
+ vrshrn.s16 d4, q2, #1
+ vrshrn.s16 d5, q3, #1
+
+add_coeff_lag0_start:
+ vand q3, q2, q1
+ vmull.s8 q2, d6, d22
+ vmull.s8 q3, d7, d22
+ vrshl.s16 q2, q2, q12
+ vrshl.s16 q3, q3, q12
+ vaddw.s8 q2, q2, d0
+ vaddw.s8 q3, q3, d1
+ vqmovn.s16 d4, q2
+ vqmovn.s16 d5, q3
+ bx lr
+endfunc
+
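+// Generate the full 82x73 (GRAIN_WIDTH x GRAIN_HEIGHT) grain template for
+// luma (y) or non-subsampled chroma (uv_444), dispatching on the
+// autoregression lag (0-3) via the jump table below.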
+.macro gen_grain_82 type
+function generate_grain_\type\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+
+.ifc \type, uv_444
+ mov r12, r3
+ mov lr, #28
+ add r11, r1, #3*GRAIN_WIDTH
+ mov r1, r2
+ mul r12, r12, lr
+.endif
+ movrel r3, X(gaussian_sequence)
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add r4, r1, #FGD_AR_COEFFS_Y
+.else
+ add r4, r1, #FGD_AR_COEFFS_UV
+.endif
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2]
+ vdup.16 q15, r9 // 4 + data->grain_scale_shift
+ add r5, r5, r6
+ vneg.s16 q15, q15
+
+.ifc \type, uv_444
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid ARMv8-deprecated IT instruction forms.
+ it eq
+ moveq r10, lr
+ add r4, r4, r12 // Add offset to ar_coeffs_uv[1]
+ eor r2, r2, r10
+.endif
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+
+ bx r5
+
+ .align 2
+L(gen_grain_\type\()_tbl):
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, y
+ mov r1, #GRAIN_HEIGHT
+ bl generate_grain_rows_neon
+.else
+
+ mov r1, #3
+ bl generate_grain_rows_neon
+ mov r1, #GRAIN_HEIGHT-3
+
+ vdup.16 q12, r7
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vext.8 q13, q0, q1, #13
+ vext.8 q14, q1, q0, #1
+ vneg.s16 q12, q12
+
+1:
+ vmov q1, q13
+ bl gen_grain_uv_444_lag0_neon // 16
+ vmov.i8 q1, #255
+ bl gen_grain_uv_444_lag0_neon // 32
+ bl gen_grain_uv_444_lag0_neon // 48
+ bl gen_grain_uv_444_lag0_neon // 64
+ vmov q1, q14
+ bl gen_grain_uv_444_lag0_neon // 80
+ get_grain_2 d16
+ subs r1, r1, #1
+ add r11, r11, #2
+ vst1.16 {d16[0]}, [r0]!
+ bgt 1b
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #127
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_y[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb r4, [r4, #1] // ar_coeffs_y[3]
+.else
+ add r4, r4, #2
+.endif
+
+ mov r1, #3
+.ifc \type, uv_444
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ sum_\type\()_lag1 q7, q8, q8, q9, left
+ sum_\type\()_lag1 q8, q8, q9, q10
+ sum_\type\()_lag1 q9, q9, q10, q11
+ sum_\type\()_lag1 q10, q10, q11, q12
+ sum_\type\()_lag1 q12, q11, q12, q13, right
+ get_grain_2 d26
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #2
+.endif
+ store_grain_row d14, d15, d16, d17, d18, d19, d20, d21, d24, d25, d26
+ vmov q11, q10
+ vmov q10, q9
+ vmov q9, q8
+ vmov q8, q7
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #127
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ vmov.s8 r4, d29[2]
+ vmov.s8 r10, d29[3]
+
+ mov r1, #3
+ bl generate_grain_rows_neon
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag2_left_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #2
+.endif
+ vst1.16 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #127
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+ vmov.u8 r4, d28[5]
+ vmov.u8 r10, d28[6]
+ vmov.u8 r12, d28[7]
+
+ orr r4, r4, r10, lsl #8
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3
+ vpush {d26}
+ bl generate_grain_rows_neon
+ vpop {d26}
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag3_left_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #2
+.endif
+ vst1.16 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type
+.ifc \type, uv_420
+ mov \dst, #SUB_GRAIN_HEIGHT-3
+.else
+ mov \dst, #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type
+.ifc \type, uv_420
+ add \reg, \reg, #2*GRAIN_WIDTH-(3*32)
+.else
+ sub \reg, \reg, #3*32-GRAIN_WIDTH
+.endif
+.endm
+
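+// Generate the 44 pixel wide chroma grain templates used with subsampled
+// chroma: 44x38 rows for 4:2:0 and 44x73 rows for 4:2:2 (see set_height).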
+.macro gen_grain_44 type
+function generate_grain_\type\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+
+ mov r12, r3
+ mov lr, #28
+ add r11, r1, #3*GRAIN_WIDTH-3
+ mov r1, r2
+ mul r12, r12, lr
+
+ movrel r3, X(gaussian_sequence)
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+ add r4, r1, #FGD_AR_COEFFS_UV
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2]
+ vdup.16 q15, r9 // 4 + data->grain_scale_shift
+ add r5, r5, r6
+ vneg.s16 q15, q15
+
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid ARMv8-deprecated IT instruction forms.
+ it eq
+ moveq r10, lr
+ add r4, r4, r12 // Add offset to ar_coeffs_uv[1]
+ eor r2, r2, r10
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ bx r5
+
+ .align 2
+L(gen_grain_\type\()_tbl):
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, uv_420
+ vpush {q4-q5}
+.endif
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+ set_height r1, \type
+
+ vdup.16 q12, r7
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vext.8 q13, q0, q1, #13
+ vext.8 q14, q1, q0, #7
+ vneg.s16 q12, q12
+
+1:
+ bl get_grain_row_44_neon
+.ifc \type, uv_420
+ add r12, r11, #GRAIN_WIDTH
+.endif
+ vmov q1, q13
+ vmov q0, q8
+ bl add_\type\()_coeff_lag0_neon
+ vmov.i8 q1, #255
+ vmov q0, q9
+ vmov q8, q2
+ bl add_\type\()_coeff_lag0_neon
+ vmov.i8 q1, q14
+ vmov q0, q10
+ vmov q9, q2
+ bl add_\type\()_coeff_lag0_neon
+ vmov q10, q2
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ store_grain_row_44 d16, d17, d18, d19, d20, d21
+ bgt 1b
+
+.ifc \type, uv_420
+ vpop {q4-q5}
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #127
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_uv[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2]
+ add r4, r4, #2
+
+ mov r1, #3
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1:
+ sum_\type\()_lag1 q7, q8, q8, q9, left
+ sum_\type\()_lag1 q8, q8, q9, q10
+ sum_\type\()_lag1 q10, q9, q10, q11, right
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ store_grain_row_44 d14, d15, d16, d17, d20, d21
+ vmov q9, q8
+ vmov q8, q7
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #127
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12]
+
+ vmov.s8 r4, d29[2]
+ vmov.s8 r10, d29[3]
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag2_left_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH-48
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #127
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+ vmov.u8 r4, d28[5]
+ vmov.u8 r10, d28[6]
+ vmov.u8 r12, d28[7]
+
+ orr r4, r4, r10, lsl #8
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag3_left_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH-48
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
+.macro gather_interleaved dst1, dst2, src1, src2, off
+ vmov.u8 r11, \src1[0+\off]
+ vmov.u8 r12, \src2[0+\off]
+ add r11, r11, r3
+ vmov.u8 lr, \src1[2+\off]
+ add r12, r12, r3
+ vld1.8 {\dst1[0+\off]}, [r11]
+ vmov.u8 r11, \src2[2+\off]
+ add lr, lr, r3
+ vld1.8 {\dst2[0+\off]}, [r12]
+ vmov.u8 r12, \src1[4+\off]
+ add r11, r11, r3
+ vld1.8 {\dst1[2+\off]}, [lr]
+ vmov.u8 lr, \src2[4+\off]
+ add r12, r12, r3
+ vld1.8 {\dst2[2+\off]}, [r11]
+ vmov.u8 r11, \src1[6+\off]
+ add lr, lr, r3
+ vld1.8 {\dst1[4+\off]}, [r12]
+ vmov.u8 r12, \src2[6+\off]
+ add r11, r11, r3
+ vld1.8 {\dst2[4+\off]}, [lr]
+ add r12, r12, r3
+ vld1.8 {\dst1[6+\off]}, [r11]
+ vld1.8 {\dst2[6+\off]}, [r12]
+.endm
+
+.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4
+ gather_interleaved \dst1, \dst3, \src1, \src3, 0
+ gather_interleaved \dst1, \dst3, \src1, \src3, 1
+ gather_interleaved \dst2, \dst4, \src2, \src4, 0
+ gather_interleaved \dst2, \dst4, \src2, \src4, 1
+.endm
+
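+// Look up scaling[] for every byte of d0-d3 (gather32) or d0-d1 (gather16),
+// with r3 pointing at the scaling LUT; the results are returned in d8-d11
+// (resp. d8-d9). r11, r12 and lr are used as scratch but saved and restored.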
+function gather32_neon
+ push {r11-r12,lr}
+ gather d8, d9, d10, d11, d0, d1, d2, d3
+ pop {r11-r12,pc}
+endfunc
+
+function gather16_neon
+ push {r11-r12,lr}
+ gather_interleaved d8, d9, d0, d1, 0
+ gather_interleaved d8, d9, d0, d1, 1
+ pop {r11-r12,pc}
+endfunc
+
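+// Block overlap blending weights: the first row multiplies the old (left or
+// top) grain and the second row the new grain, followed by a rounding >> 5.
+// Columns outside the 2 pixel (overlap_coeffs_0) or 1 pixel (overlap_coeffs_1)
+// overlap use 0/32, which leaves the new grain unchanged.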
+const overlap_coeffs_0, align=4
+ .byte 27, 17, 0, 0, 0, 0, 0, 0
+ .byte 17, 27, 32, 32, 32, 32, 32, 32
+endconst
+
+const overlap_coeffs_1, align=4
+ .byte 23, 0, 0, 0, 0, 0, 0, 0
+ .byte 22, 32, 32, 32, 32, 32, 32, 32
+endconst
+
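+// Split a pseudo-random value into x/y offsets into the grain template;
+// in an unsubsampled dimension (sx/sy == 0) the offset is doubled.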
+.macro calc_offset offx, offy, src, sx, sy
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride
+ mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx // grain_lut += offx
+.endm
+
+// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type);
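+// Apply film grain to a 32 pixel wide luma block: for each pixel,
+// noise = round2(scaling[src] * grain, scaling_shift) and
+// dst = clip(src + noise), after optionally blending the grain with the
+// neighbouring blocks' grain along the left and top edges.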
+function fgy_32x32_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut
+ ldrd r6, r7, [sp, #108] // offsets, h
+ ldr r8, [sp, #116] // clip
+ mov r9, #GRAIN_WIDTH // grain_lut stride
+
+ neg r4, r4
+ vdup.16 q13, r4 // -scaling_shift
+ cmp r8, #0
+
+ movrel_local r12, overlap_coeffs_0
+
+ beq 1f
+ // clip
+ vmov.i8 q14, #16
+ vmov.i8 q15, #235
+ b 2f
+1:
+ // no clip
+ vmov.i8 q14, #0
+ vmov.i8 q15, #255
+2:
+
+ vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ add r5, r5, #9 // grain_lut += 9
+ add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r9 // grain_lut += grain_stride
+
+ ldr r10, [r6, #8] // offsets[1][0]
+ calc_offset r10, r4, r10, 0, 0
+ add_offset r4, r10, r4, r5, r9
+ ldr r10, [r6, #4] // offsets[0][1]
+ calc_offset r10, r11, r10, 0, 0
+ add_offset r11, r10, r11, r5, r9
+ ldr r10, [r6, #12] // offsets[1][1]
+ calc_offset r10, r8, r10, 0, 0
+ add_offset r8, r10, r8, r5, r9
+ ldr r6, [r6] // offsets[0][0]
+ calc_offset r6, lr, r6, 0, 0
+ add_offset r5, r6, lr, r5, r9
+
+ add r4, r4, #32 // grain_lut += FG_BLOCK_SIZE * bx
+ add r6, r11, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+
+ ldr r10, [sp, #120] // type
+ adr r11, L(fgy_loop_tbl)
+
+ tst r10, #1
+ ldr r10, [r11, r10, lsl #2]
+
+ add r8, r8, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add r8, r8, #32 // grain_lut += FG_BLOCK_SIZE * bx
+
+ add r11, r11, r10
+
+ beq 1f
+ // y overlap
+ vdup.8 d14, d24[0]
+ vdup.8 d15, d24[1]
+ mov r10, r7 // backup actual h
+ mov r7, #2
+1:
+ bx r11
+endfunc
+
+function fgy_loop_neon
+L(fgy_loop_tbl):
+ .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
+
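+// One loop body per overlap combination: ox/oy select whether the grain is
+// blended with the neighbouring block's grain on the left (ox) and/or above
+// (oy). With oy set, only the first two rows are blended vertically; the
+// remaining rows branch to the corresponding loop without y overlap.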
+.macro fgy ox, oy
+L(loop_\ox\oy):
+1:
+.if \ox
+ vld1.8 {d8}, [r4], r9 // grain_lut old
+.endif
+.if \oy
+ vld1.8 {q2, q3}, [r6], r9 // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.8 {d10}, [r8], r9 // grain_lut top old
+.endif
+ vld1.8 {q0, q1}, [r1, :128], r2 // src
+ vld1.8 {q10, q11}, [r5], r9 // grain_lut
+
+.if \ox
+ vmull.s8 q4, d8, d24
+ vmlal.s8 q4, d20, d25
+.endif
+
+.if \oy
+.if \ox
+ vmull.s8 q5, d10, d24
+ vmlal.s8 q5, d4, d25
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d4, q5, #5
+.endif
+
+ vmull.s8 q4, d20, d15
+ vmull.s8 q5, d21, d15
+ vmull.s8 q8, d22, d15
+ vmull.s8 q9, d23, d15
+ vmlal.s8 q4, d4, d14
+ vmlal.s8 q5, d5, d14
+ vmlal.s8 q8, d6, d14
+ vmlal.s8 q9, d7, d14
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d21, q5, #5
+ vqrshrn.s16 d22, q8, #5
+ vqrshrn.s16 d23, q9, #5
+.elseif \ox
+ vqrshrn.s16 d20, q4, #5
+.endif
+
+ bl gather32_neon
+
+ vmovl.s8 q8, d20 // grain
+ vmovl.s8 q9, d21
+ vmovl.s8 q10, d22
+ vmovl.s8 q11, d23
+
+ vmovl.u8 q2, d8 // scaling
+ vmovl.u8 q3, d9
+ vmovl.u8 q4, d10
+ vmovl.u8 q5, d11
+
+ vmul.i16 q8, q8, q2 // scaling * grain
+ vmul.i16 q9, q9, q3
+ vmul.i16 q10, q10, q4
+ vmul.i16 q11, q11, q5
+
+ vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
+ vrshl.s16 q9, q9, q13
+ vrshl.s16 q10, q10, q13
+ vrshl.s16 q11, q11, q13
+
+ vaddw.u8 q8, q8, d0 // *src + noise
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+
+ vmax.u8 q0, q0, q14
+ vmax.u8 q1, q1, q14
+ vmin.u8 q0, q0, q15
+ vmin.u8 q1, q1, q15
+
+ subs r7, r7, #1
+.if \oy
+ vdup.8 d14, d25[0]
+ vdup.8 d15, d25[1]
+.endif
+ vst1.8 {q0, q1}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r10, #2
+ sub r7, r10, #2 // restore actual remaining h
+ bgt L(loop_\ox\()0)
+.endif
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+endfunc
+
+// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type);
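+// Chroma variant: the scaling lookup is indexed by the collocated luma
+// (averaged horizontally when sx == 1), or, when chroma scaling from luma
+// is disabled (csfl == 0), by
+// clip(((uv_luma_mult * luma + uv_mult * src) >> 6) + uv_offset).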
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100] // data, grain_lut
+ ldrd r6, r7, [sp, #108] // luma_row, luma_stride
+ ldrd r8, r9, [sp, #116] // offsets, h
+ ldrd r10, r11, [sp, #124] // uv, is_id
+
+ // !csfl
+ add r10, r4, r10, lsl #2 // + 4*uv
+ add r12, r10, #FGD_UV_LUMA_MULT
+ add lr, r10, #FGD_UV_MULT
+ add r10, r10, #FGD_UV_OFFSET
+ vld1.16 {d4[]}, [r12] // uv_luma_mult
+ vld1.16 {d4[2]}, [r10] // uv_offset
+ vld1.16 {d4[1]}, [lr] // uv_mult
+
+ ldr lr, [r4, #FGD_SCALING_SHIFT]
+ ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ neg lr, lr // -scaling_shift
+
+ cmp r12, #0
+ vdup.16 q13, lr // -scaling_shift
+
+ beq 1f
+ // clip
+ cmp r11, #0
+ vmov.i8 q14, #16
+ vmov.i8 q15, #240
+ beq 2f
+ // is_id
+ vmov.i8 q15, #235
+ b 2f
+1:
+ // no clip
+ vmov.i8 q14, #0
+ vmov.i8 q15, #255
+2:
+
+ mov r10, #GRAIN_WIDTH // grain_lut stride
+
+ add r5, r5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
+.if \sy
+ add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride
+ add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r10 // grain_lut += grain_stride
+.endif
+
+ ldr r12, [r8, #8] // offsets[1][0]
+ calc_offset r12, r4, r12, \sx, \sy
+ add_offset r4, r12, r4, r5, r10
+
+ ldr r12, [r8, #4] // offsets[0][1]
+ calc_offset r12, lr, r12, \sx, \sy
+ add_offset lr, r12, lr, r5, r10
+
+ ldr r12, [r8, #12] // offsets[1][1]
+ calc_offset r12, r11, r12, \sx, \sy
+ add_offset r11, r12, r11, r5, r10
+
+ ldr r8, [r8] // offsets[0][0]
+ calc_offset r8, r12, r8, \sx, \sy
+ add_offset r5, r8, r12, r5, r10
+
+ add r4, r4, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
+ add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add r11, r11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
+
+ movrel_local r12, overlap_coeffs_\sx
+ ldr lr, [sp, #132] // type
+
+ vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ movrel_local r12, L(fguv_loop_sx\sx\()_tbl)
+#if CONFIG_THUMB
+ // This uses movrel_local instead of the adr used above, because the
+ // target can be out of range for adr. But movrel_local leaves the
+ // Thumb bit set on COFF (though probably not when building for Thumb
+ // on ELF), so clear the bit here for robustness.
+ bic r12, r12, #1
+#endif
+
+ tst lr, #1
+ ldr lr, [r12, lr, lsl #2]
+
+ add r12, r12, lr
+
+ beq 1f
+ // y overlap
+ sub lr, r9, #(2 >> \sy) // backup remaining h
+ mov r9, #(2 >> \sy)
+
+1:
+
+.if \sy
+ vmov.i8 d6, #23
+ vmov.i8 d7, #22
+.else
+ vmov.i8 d6, #27
+ vmov.i8 d7, #17
+.endif
+
+.if \sy
+ add r7, r7, r7 // luma_stride *= 2
+.endif
+
+ bx r12
+endfunc
+.endm
+
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon
+L(fguv_loop_sx0_tbl):
+ .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+.if \oy
+ mov r12, lr
+.endif
+1:
+.if \ox
+ vld1.8 {d8}, [r4], r10 // grain_lut old
+.endif
+.if \oy
+ vld1.8 {q8, q9}, [r8], r10 // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.8 {d10}, [r11], r10 // grain_lut top old
+.endif
+ vld1.8 {q0, q1}, [r6, :128], r7 // luma
+ vld1.8 {q10, q11}, [r5], r10 // grain_lut
+
+.if \ox
+ vmull.s8 q4, d8, d24
+ vmlal.s8 q4, d20, d25
+.endif
+
+.if \oy
+.if \ox
+ vmull.s8 q5, d10, d24
+ vmlal.s8 q5, d16, d25
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d16, q5, #5
+.endif
+
+ vmull.s8 q4, d20, d7
+ vmull.s8 q5, d21, d7
+ vmull.s8 q6, d22, d7
+ vmull.s8 q7, d23, d7
+ vmlal.s8 q4, d16, d6
+ vmlal.s8 q5, d17, d6
+ vmlal.s8 q6, d18, d6
+ vmlal.s8 q7, d19, d6
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d21, q5, #5
+ vqrshrn.s16 d22, q6, #5
+ vqrshrn.s16 d23, q7, #5
+.elseif \ox
+ vqrshrn.s16 d20, q4, #5
+.endif
+.if !\csfl
+ vld1.8 {q8, q9}, [r1, :128] // src
+ vmovl.u8 q4, d0
+ vmovl.u8 q5, d1
+ vmovl.u8 q6, d2
+ vmovl.u8 q7, d3
+ vmovl.u8 q0, d16
+ vmovl.u8 q1, d17
+ vmovl.u8 q8, d18
+ vmovl.u8 q9, d19
+ vmul.i16 q4, q4, d4[0]
+ vmul.i16 q5, q5, d4[0]
+ vmul.i16 q6, q6, d4[0]
+ vmul.i16 q7, q7, d4[0]
+ vmul.i16 q0, q0, d4[1]
+ vmul.i16 q1, q1, d4[1]
+ vmul.i16 q8, q8, d4[1]
+ vmul.i16 q9, q9, d4[1]
+ vqadd.s16 q4, q4, q0
+ vqadd.s16 q5, q5, q1
+ vqadd.s16 q6, q6, q8
+ vqadd.s16 q7, q7, q9
+ vdup.16 q0, d4[2]
+ vshr.s16 q4, q4, #6
+ vshr.s16 q5, q5, #6
+ vshr.s16 q6, q6, #6
+ vshr.s16 q7, q7, #6
+ vadd.i16 q4, q4, q0
+ vadd.i16 q5, q5, q0
+ vadd.i16 q6, q6, q0
+ vadd.i16 q7, q7, q0
+ vqmovun.s16 d0, q4
+ vqmovun.s16 d1, q5
+ vqmovun.s16 d2, q6
+ vqmovun.s16 d3, q7
+.endif
+
+ bl gather32_neon
+
+ vld1.8 {q0, q1}, [r1, :128], r2 // src
+
+ vmovl.s8 q8, d20 // grain
+ vmovl.s8 q9, d21
+ vmovl.s8 q10, d22
+ vmovl.s8 q11, d23
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+ vmovl.u8 q4, d10
+ vmovl.u8 q5, d11
+
+ vmul.i16 q8, q8, q6 // scaling * grain
+ vmul.i16 q9, q9, q7
+ vmul.i16 q10, q10, q4
+ vmul.i16 q11, q11, q5
+
+ vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
+ vrshl.s16 q9, q9, q13
+ vrshl.s16 q10, q10, q13
+ vrshl.s16 q11, q11, q13
+
+ vaddw.u8 q8, q8, d0 // *src + noise
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+
+ vmax.u8 q0, q0, q14
+ vmax.u8 q1, q1, q14
+ vmin.u8 q0, q0, q15
+ vmin.u8 q1, q1, q15
+
+ subs r9, r9, #1
+.if \oy
+ vdup.8 d6, d25[0]
+ vdup.8 d7, d25[1]
+.endif
+
+ vst1.8 {q0, q1}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r12, #0
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
+.endif
+ b 9f
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+
+9:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function fguv_loop_sx1_neon
+L(fguv_loop_sx1_tbl):
+ .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+.if \oy
+ mov r12, lr
+.endif
+1:
+.if \ox
+ vld1.8 {d8}, [r4], r10 // grain_lut old
+.endif
+.if \oy
+ vld1.8 {q8}, [r8], r10 // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.8 {d10}, [r11], r10 // grain_lut top old
+.endif
+ vld1.8 {q0, q1}, [r6, :128], r7 // luma
+ vld1.8 {q10}, [r5], r10 // grain_lut
+ vld1.8 {q11}, [r1, :128], r2 // src
+
+.if \ox
+ vmull.s8 q4, d8, d24
+ vmlal.s8 q4, d20, d25
+.endif
+
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+.if \oy
+.if \ox
+ vmull.s8 q5, d10, d24
+ vmlal.s8 q5, d16, d25
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d16, q5, #5
+.endif
+
+ vmull.s8 q4, d20, d7
+ vmull.s8 q5, d21, d7
+ vmlal.s8 q4, d16, d6
+ vmlal.s8 q5, d17, d6
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d21, q5, #5
+.elseif \ox
+ vqrshrn.s16 d20, q4, #5
+.endif
+.if \csfl
+ vrshrn.u16 d0, q0, #1
+ vrshrn.u16 d1, q1, #1
+.else
+ vrshr.u16 q4, q0, #1
+ vrshr.u16 q5, q1, #1
+ vmovl.u8 q0, d22
+ vmovl.u8 q1, d23
+ vmul.i16 q4, q4, d4[0]
+ vmul.i16 q5, q5, d4[0]
+ vmul.i16 q0, q0, d4[1]
+ vmul.i16 q1, q1, d4[1]
+ vqadd.s16 q4, q4, q0
+ vqadd.s16 q5, q5, q1
+ vdup.16 q0, d4[2]
+ vshr.s16 q4, q4, #6
+ vshr.s16 q5, q5, #6
+ vadd.i16 q4, q4, q0
+ vadd.i16 q5, q5, q0
+ vqmovun.s16 d0, q4
+ vqmovun.s16 d1, q5
+.endif
+
+ bl gather16_neon
+
+ vmovl.s8 q8, d20 // grain
+ vmovl.s8 q9, d21
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+
+ vmul.i16 q8, q8, q6 // scaling * grain
+ vmul.i16 q9, q9, q7
+
+ vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
+ vrshl.s16 q9, q9, q13
+
+ vaddw.u8 q8, q8, d22 // *src + noise
+ vaddw.u8 q9, q9, d23
+
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+
+ vmax.u8 q0, q0, q14
+ vmin.u8 q0, q0, q15
+
+ subs r9, r9, #1
+.if \oy
+ vswp d6, d7
+.endif
+ vst1.8 {q0}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r12, #0
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
+.endif
+
+ b 9f
+.endm
+ fguv_loop_sx1 0, 0, 0
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+
+9:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/filmgrain16.S b/third_party/dav1d/src/arm/32/filmgrain16.S
new file mode 100644
index 0000000000..d10bffff2f
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/filmgrain16.S
@@ -0,0 +1,2137 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
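+// Advance the 16 bit LFSR grain seed in r2 by \steps bits: the feedback bits
+// ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) are inserted at the top of the
+// state, and read_rand/read_shift_rand extract random values from it.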
+.macro increment_seed steps, shift=1
+ lsr r11, r2, #3
+ lsr r12, r2, #12
+ lsr lr, r2, #1
+ eor r11, r2, r11 // (r >> 0) ^ (r >> 3)
+ eor r12, r12, lr // (r >> 12) ^ (r >> 1)
+ eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr r2, r2, #\steps
+.endif
+ and r11, r11, #((1 << \steps) - 1) // bit
+.if \shift
+ orr r2, r2, r11, lsl #(16 - \steps) // *state
+.else
+ orr r2, r2, r11, lsl #16 // *state
+.endif
+.endm
+
+.macro read_rand dest, bits, age
+ ubfx \dest, r2, #16 - \bits - \age, #\bits
+.endm
+
+.macro read_shift_rand dest, bits
+ ubfx \dest, r2, #17 - \bits, #\bits
+ lsr r2, r2, #1
+.endm
+
+// special calling convention:
+// r2 holds seed
+// r3 holds dav1d_gaussian_sequence
+// clobbers r11-r12
+// returns in d0-d1
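+// Produces 8 grain values in d0-d1 by looking up 8 pseudo-random 11 bit
+// indices in the gaussian_sequence table.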
+function get_gaussian_neon
+ push {r5-r6,lr}
+ increment_seed 4
+ read_rand r5, 11, 3
+ read_rand r6, 11, 2
+ add r5, r3, r5, lsl #1
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[0]}, [r5]
+ read_rand r5, 11, 1
+ vld1.16 {d0[1]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 0
+ increment_seed 4
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[2]}, [r5]
+ read_rand r5, 11, 3
+ vld1.16 {d0[3]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 2
+ vld1.16 {d1[0]}, [r5]
+ add r6, r3, r6, lsl #1
+ read_rand r5, 11, 1
+ vld1.16 {d1[1]}, [r6]
+ read_rand r6, 11, 0
+ add r5, r3, r5, lsl #1
+ add r6, r3, r6, lsl #1
+ vld1.16 {d1[2]}, [r5]
+ vld1.16 {d1[3]}, [r6]
+ pop {r5-r6,pc}
+endfunc
+
+function get_grain_2_neon
+ push {r11,lr}
+ increment_seed 2
+ read_rand r11, 11, 1
+ read_rand r12, 11, 0
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ vld1.16 {d0[1]}, [r12]
+ vrshl.s16 d0, d0, d30
+ pop {r11,pc}
+endfunc
+
+.macro get_grain_2 dst
+ bl get_grain_2_neon
+.ifnc \dst, d0
+ vmov \dst, d0
+.endif
+.endm
+
+function get_grain_4_neon
+ push {r11,lr}
+ increment_seed 4
+ read_rand r11, 11, 3
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ read_rand r11, 11, 1
+ vld1.16 {d0[1]}, [r12]
+ read_rand r12, 11, 0
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[2]}, [r11]
+ vld1.16 {d0[3]}, [r12]
+ vrshl.s16 d0, d0, d30
+ pop {r11,pc}
+endfunc
+
+.macro get_grain_4 dst
+ bl get_grain_4_neon
+.ifnc \dst, d0
+ vmov \dst, d0
+.endif
+.endm
+
+// r1 holds the number of entries to produce
+// r6, r8 and r10 hold the previous output entries
+// q0 holds the vector of produced entries
+// q1 holds the input vector of sums from above
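+// Serially computes each new grain value as
+// clip(round2(sum_above + sum(coeff * prev outputs), ar_coeff_shift)
+// + round2(gaussian, 4 - bitdepth_min_8 + grain_scale_shift),
+// grain_min, grain_max); each output depends on the previous ones in the row.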
+.macro output_lag n
+function output_lag\n\()_neon
+ push {r0, lr}
+.if \n == 1
+ mvn lr, r5 // grain_min = ~grain_max
+.else
+ mov r0, #1
+ mov lr, #1
+ sub r7, r7, #1
+ sub r9, r9, #1
+ lsl r0, r0, r7
+ lsl lr, lr, r9
+ add r7, r7, #1
+ add r9, r9, #1
+.endif
+1:
+ read_shift_rand r12, 11
+ vmov.32 r11, d2[0]
+ lsl r12, r12, #1
+ vext.8 q0, q0, q0, #2
+ ldrsh r12, [r3, r12]
+.if \n == 1
+ mla r11, r6, r4, r11 // sum (above) + *coeff * prev output
+ add r6, r11, r8 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, r10
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift)
+ add r6, r6, r12
+ cmp r6, r5
+.elseif \n == 2
+ mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r6, r10, r11 // += *coeff * prev output 2
+ mov r8, r6
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr}
+ cmp r6, r5
+ mvn lr, r5 // grain_min = ~grain_max
+.else
+ push {r1-r3}
+ sbfx r1, r4, #0, #8
+ sbfx r2, r4, #8, #8
+ sbfx r3, r4, #16, #8
+ mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r8, r2, r11 // += *coeff * prev output 2
+ mla r11, r6, r3, r11 // += *coeff * prev output 3
+ pop {r1-r3}
+ mov r10, r8
+ mov r8, r6
+
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr}
+ cmp r6, r5
+ mvn lr, r5 // grain_min = ~grain_max
+.endif
+ it gt
+ movgt r6, r5
+ cmp r6, lr
+ it lt
+ movlt r6, lr
+.if \n >= 2
+ pop {lr}
+.endif
+ subs r1, r1, #1
+ vext.8 q1, q1, q1, #4
+ vmov.16 d1[3], r6
+ bgt 1b
+ pop {r0, pc}
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
+
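+// Weighted sum of the three neighbours in the row above for lag 1, 8 pixels
+// at a time: left-above * coeff[0] + above * coeff[1] + right-above * coeff[2],
+// accumulated as 32 bit sums in q2-q3. q8/q9 cache the above row and are
+// slid along.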
+function sum_lag1_above_neon
+ sub r12, r0, #1*GRAIN_WIDTH*2 - 16
+ vld1.16 {q10}, [r12] // load top right
+
+ vext.8 q0, q8, q9, #14 // top left, top mid
+ vext.8 q1, q9, q10, #2 // top left, top mid
+
+ vmull.s16 q2, d18, d28
+ vmlal.s16 q2, d0, d27
+ vmlal.s16 q2, d2, d29
+ vmull.s16 q3, d19, d28
+ vmlal.s16 q3, d1, d27
+ vmlal.s16 q3, d3, d29
+
+ vmov q8, q9
+ vmov q9, q10
+
+ bx lr
+endfunc
+
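+// Common body for one 8 pixel wide strip of autoregressive grain: fetch the
+// above-row sums from the corresponding sum_*_above helper, add the
+// (downsampled for 422/420) luma contribution for the chroma planes, then
+// let output_lag produce the new values one by one and store the row.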
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
+.ifc \lag\()_\edge, lag3_left
+ bl sum_lag3_left_above_neon
+.else
+ bl sum_\lag\()_above_neon
+.endif
+.ifc \type, uv_420
+ vpush {q6-q7}
+ add r12, r11, #GRAIN_WIDTH*2
+ vld1.16 {q0, q1}, [r11]!
+ vld1.16 {q6, q7}, [r12]!
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d12, d12, d13
+ vpadd.i16 d13, d14, d15
+ vadd.i16 q0, q0, q6
+ vpop {q6-q7}
+ vrshr.s16 q0, q0, #2
+.endif
+.ifc \type, uv_422
+ vld1.16 {q0, q1}, [r11]!
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vrshr.s16 q0, q0, #1
+.endif
+.ifc \type, uv_444
+ vld1.16 {q0}, [r11]!
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ vdup.8 d13, \uv_coeff
+ vmovl.s8 q6, d13
+.endif
+ vmlal.s16 q2, d0, d13
+ vmlal.s16 q3, d1, d13
+.endif
+.if \uv_layout && \elems == 8
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 7
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 1
+ b sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+ push {r11}
+.if \elems > 4
+.ifc \edge, left
+ increment_seed 4
+ read_rand r11, 11, 3
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d1[1]}, [r11]
+ read_rand r11, 11, 1
+ vld1.16 {d1[2]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d1[3]}, [r11]
+ lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ vrshl.s16 d1, d1, d30
+ vext.8 q2, q2, q2, #12
+.ifc \lag, lag3
+ vmov.s16 r10, d1[1]
+.endif
+.ifnc \lag, lag1
+ vmov.s16 r8, d1[2]
+.endif
+ vmov.s16 r6, d1[3]
+
+ vmov q1, q2
+ mov r1, #1
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0
+ vmov q1, q2
+ mov r1, #4
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ vmov q1, q3
+.ifc \edge, right
+ mov r1, #3
+ bl output_\lag\()_neon
+ read_shift_rand r12, 11
+ add r12, r3, r12, lsl #1
+ vld1.16 {d2[0]}, [r12]
+ vrshl.s16 d2, d2, d30
+ vext.8 q0, q0, q1, #2
+.else
+ mov r1, #4
+ bl output_\lag\()_neon
+.endif
+.else
+ // elems == 1
+ increment_seed 4, shift=0
+ vmov q1, q2
+ mov r1, #1
+ bl output_\lag\()_neon
+ lsr r2, r2, #3
+
+ read_rand r11, 11, 2
+ read_rand r12, 11, 1
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d2[0]}, [r11]
+ read_rand r11, 11, 0
+ vld1.16 {d2[1]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d2[2]}, [r11]
+ vrshl.s16 d2, d2, d30
+ vext.8 q0, q0, q1, #14
+.endif
+ vst1.16 {q0}, [r0]!
+ pop {r11}
+ pop {r1, pc}
+.endif
+.endm
+
+.macro sum_lag1_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag1_\edge\()_neon
+ push {r1, lr}
+.ifc \edge, left
+ sub r12, r0, #1*GRAIN_WIDTH*2
+ vld1.8 {q9}, [r12] // load the previous block right above
+.endif
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems
+endfunc
+.endm
+
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 7
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 7
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 1
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 1
+
+
+function sum_lag2_above_neon
+ push {lr}
+ sub r12, r0, #2*GRAIN_WIDTH*2 - 16
+ sub lr, r0, #1*GRAIN_WIDTH*2 - 16
+ vld1.16 {q10}, [r12] // load top right
+ vld1.16 {q13}, [lr]
+
+ vdup.8 d10, d28[0]
+ vext.8 q0, q8, q9, #12 // top left, top mid
+ vdup.8 d12, d28[1]
+ vext.8 q1, q8, q9, #14
+ vdup.8 d14, d28[3]
+ vext.8 q4, q9, q10, #2 // top mid, top right
+ vmovl.s8 q5, d10
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+
+ vmull.s16 q2, d0, d10
+ vmlal.s16 q2, d2, d12
+ vmlal.s16 q2, d8, d14
+ vmull.s16 q3, d1, d10
+ vmlal.s16 q3, d3, d12
+ vmlal.s16 q3, d9, d14
+
+ vdup.8 d10, d28[4]
+ vext.8 q0, q9, q10, #4 // top mid, top right
+ vdup.8 d12, d28[5]
+ vext.8 q1, q11, q12, #12 // top left, top mid
+ vdup.8 d14, d28[6]
+ vext.8 q4, q11, q12, #14
+ vmovl.s8 q5, d10
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+
+ vmlal.s16 q2, d0, d10
+ vmlal.s16 q2, d2, d12
+ vmlal.s16 q2, d8, d14
+ vmlal.s16 q3, d1, d10
+ vmlal.s16 q3, d3, d12
+ vmlal.s16 q3, d9, d14
+
+ vdup.8 d10, d29[0]
+ vext.8 q0, q12, q13, #2 // top mid, top right
+ vdup.8 d12, d29[1]
+ vext.8 q1, q12, q13, #4
+
+ vdup.8 d14, d28[2]
+ vdup.8 d8, d28[7]
+
+ vmovl.s8 q5, d10
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q4, d8
+
+ vmlal.s16 q2, d0, d10
+ vmlal.s16 q2, d2, d12
+ vmlal.s16 q2, d18, d14
+ vmlal.s16 q2, d24, d8
+ vmlal.s16 q3, d1, d10
+ vmlal.s16 q3, d3, d12
+ vmlal.s16 q3, d19, d14
+ vmlal.s16 q3, d25, d8
+
+ vmov q8, q9
+ vmov q9, q10
+
+ vmov q11, q12
+ vmov q12, q13
+
+ pop {pc}
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag2_\edge\()_neon
+ push {r1, lr}
+.ifc \edge, left
+ sub r12, r0, #2*GRAIN_WIDTH*2
+ sub lr, r0, #1*GRAIN_WIDTH*2
+ vld1.16 {q9}, [r12] // load the previous block right above
+ vld1.16 {q12}, [lr]
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, uv_coeff=d29[4]
+endfunc
+.endm
+
+sum_lag2_func y, 0, left
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 7
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 7
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 1
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 1
+
+
+function sum_lag3_left_above_neon
+ // A separate codepath for the left edge, to avoid reading outside
+ // the edge of the buffer.
+ sub r12, r0, #3*GRAIN_WIDTH*2
+ vld1.8 {q11, q12}, [r12]
+ vext.8 q12, q11, q12, #10
+ vext.8 q11, q11, q11, #10
+ b sum_lag3_above_start
+endfunc
+
+function sum_lag3_above_neon
+ movw r12, #(3*GRAIN_WIDTH + 3)*2
+ sub r12, r0, r12
+ vld1.8 {q11, q12}, [r12]
+
+sum_lag3_above_start:
+ vdup.8 d12, d26[0]
+ vext.8 q1, q11, q12, #2
+ vdup.8 d14, d26[1]
+ vext.8 q4, q11, q12, #4
+ vdup.8 d16, d26[2]
+ vext.8 q5, q11, q12, #6
+ vdup.8 d18, d26[3]
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ movw r12, #(2*GRAIN_WIDTH + 3)*2
+ sub r12, r0, r12
+
+ vmull.s16 q2, d22, d12
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d8, d16
+ vmlal.s16 q2, d10, d18
+ vmull.s16 q3, d23, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d9, d16
+ vmlal.s16 q3, d11, d18
+
+ vdup.8 d12, d26[4]
+ vext.8 q0, q11, q12, #8
+ vdup.8 d14, d26[5]
+ vext.8 q1, q11, q12, #10
+ vdup.8 d16, d26[6]
+ vext.8 q4, q11, q12, #12
+ vld1.8 {q11, q12}, [r12]
+ vdup.8 d18, d26[7]
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ vmlal.s16 q2, d0, d12
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d8, d16
+ vmlal.s16 q2, d22, d18
+ vmlal.s16 q3, d1, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d9, d16
+ vmlal.s16 q3, d23, d18
+
+ vdup.8 d12, d27[0]
+ vext.8 q0, q11, q12, #2
+ vdup.8 d14, d27[1]
+ vext.8 q1, q11, q12, #4
+ vdup.8 d16, d27[2]
+ vext.8 q4, q11, q12, #6
+ vdup.8 d18, d27[3]
+ vext.8 q5, q11, q12, #8
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ sub r12, r0, #(1*GRAIN_WIDTH + 3)*2
+
+ vmlal.s16 q2, d0, d12
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d8, d16
+ vmlal.s16 q2, d10, d18
+ vmlal.s16 q3, d1, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d9, d16
+ vmlal.s16 q3, d11, d18
+
+ vdup.8 d12, d27[4]
+ vext.8 q0, q11, q12, #10
+ vdup.8 d14, d27[5]
+ vext.8 q1, q11, q12, #12
+ vld1.8 {q11, q12}, [r12]
+ vdup.8 d16, d27[6]
+ vdup.8 d18, d27[7]
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vext.8 q5, q11, q12, #2
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ vmlal.s16 q2, d0, d12
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d22, d16
+ vmlal.s16 q2, d10, d18
+ vmlal.s16 q3, d1, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d23, d16
+ vmlal.s16 q3, d11, d18
+
+ vdup.8 d12, d28[0]
+ vext.8 q0, q11, q12, #4
+ vdup.8 d14, d28[1]
+ vext.8 q1, q11, q12, #6
+ vdup.8 d16, d28[2]
+ vext.8 q4, q11, q12, #8
+ vdup.8 d18, d28[3]
+ vext.8 q5, q11, q12, #10
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ vmlal.s16 q2, d0, d12
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d8, d16
+ vmlal.s16 q2, d10, d18
+ vmlal.s16 q3, d1, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d9, d16
+ vmlal.s16 q3, d11, d18
+
+ vdup.8 d12, d28[4]
+ vext.8 q0, q11, q12, #12
+ vmovl.s8 q6, d12
+
+ vmlal.s16 q2, d0, d12
+ vmlal.s16 q3, d1, d12
+
+ bx lr
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag3_\edge\()_neon
+ push {r1, lr}
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, uv_coeff=d29[0]
+endfunc
+.endm
+
+sum_lag3_func y, 0, left
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 7
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 7
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 1
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 1
+
+function generate_grain_rows_neon
+ push {r10-r11,lr}
+1:
+ mov r10, #80
+2:
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ subs r10, r10, #8
+ vst1.16 {q0}, [r0]!
+ bgt 2b
+ get_grain_2 d0
+ subs r1, r1, #1
+ vst1.32 {d0[0]}, [r0]!
+ bgt 1b
+ pop {r10-r11,pc}
+endfunc
+
+function generate_grain_rows_44_neon
+ push {r10-r11,lr}
+1:
+ mov r10, #40
+2:
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ subs r10, r10, #8
+ vst1.16 {q0}, [r0]!
+ bgt 2b
+ get_grain_4 d0
+ subs r1, r1, #1
+ vst1.16 {d0}, [r0]
+ add r0, r0, #GRAIN_WIDTH*2-80
+ bgt 1b
+ pop {r10-r11,pc}
+endfunc
+
+function gen_grain_uv_444_lag0_neon
+ vld1.16 {q3}, [r11]!
+gen_grain_uv_lag0_8_start:
+ push {r11,lr}
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+gen_grain_uv_lag0_8_add:
+ vand q3, q3, q1
+ vmull.s16 q2, d6, d22
+ vmull.s16 q3, d7, d22
+ vrshl.s32 q2, q2, q12
+ vrshl.s32 q3, q3, q12
+ vqmovn.s32 d4, q2
+ vqmovn.s32 d5, q3
+ vqadd.s16 q2, q2, q0
+ vmin.s16 q2, q2, q9
+ vmax.s16 q2, q2, q10
+ vst1.16 {q2}, [r0]!
+ pop {r11,pc}
+endfunc
+
+function gen_grain_uv_420_lag0_8_neon
+ add r12, r11, #GRAIN_WIDTH*2
+ vld1.16 {q2,q3}, [r11]!
+ vld1.16 {q4,q5}, [r12]
+ vpadd.i16 d4, d4, d5
+ vpadd.i16 d5, d6, d7
+ vpadd.i16 d8, d8, d9
+ vpadd.i16 d9, d10, d11
+ vadd.i16 q2, q2, q4
+ vrshr.s16 q3, q2, #2
+ b gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_422_lag0_8_neon
+ vld1.16 {q2,q3}, [r11]!
+ vpadd.i16 d4, d4, d5
+ vpadd.i16 d5, d6, d7
+ vrshr.s16 q3, q2, #1
+ b gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_420_lag0_4_neon
+ add r12, r11, #GRAIN_WIDTH*2
+ vld1.16 {q2}, [r11]
+ vld1.16 {q0}, [r12]
+ add r11, r11, #32
+ vpadd.i16 d4, d4, d5
+ vpadd.i16 d0, d0, d1
+ vadd.i16 d4, d4, d0
+ vrshr.s16 d6, d4, #2
+ push {r11,lr}
+ get_grain_4 d0
+ b gen_grain_uv_lag0_8_add
+endfunc
+
+function gen_grain_uv_422_lag0_4_neon
+ vld1.16 {q2}, [r11]
+ add r11, r11, #32
+ vpadd.i16 d4, d4, d5
+ vrshr.s16 d6, d4, #1
+ push {r11,lr}
+ get_grain_4 d0
+ b gen_grain_uv_lag0_8_add
+endfunc
+
+.macro gen_grain_82 type
+function generate_grain_\type\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+
+.ifc \type, uv_444
+ ldr r4, [sp, #36]
+ mov r12, r3
+ mov lr, #28
+ add r11, r1, #3*GRAIN_WIDTH*2
+ mov r1, r2
+ mul r12, r12, lr
+ clz lr, r4
+.else
+ clz lr, r2
+.endif
+ movrel r3, X(gaussian_sequence)
+ sub lr, lr, #24 // -bitdepth_min_8
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add r4, r1, #FGD_AR_COEFFS_Y
+.else
+ add r4, r1, #FGD_AR_COEFFS_UV
+.endif
+ add r9, r9, lr // grain_scale_shift - bitdepth_min_8
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2]
+ vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift
+ add r5, r5, r6
+ vneg.s16 q15, q15
+
+.ifc \type, uv_444
+ push {lr}
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid ARMv8-deprecated IT instruction forms.
+ it eq
+ moveq r10, lr
+ add r4, r4, r12 // Add offset to ar_coeffs_uv[1]
+ eor r2, r2, r10
+ pop {lr}
+.endif
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ neg lr, lr // bitdepth_min_8
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+
+ bx r5
+
+ .align 2
+L(gen_grain_\type\()_tbl):
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, y
+ mov r1, #GRAIN_HEIGHT
+ bl generate_grain_rows_neon
+.else
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ mvn r6, r5 // grain_min = ~grain_max
+
+ mov r1, #3
+ bl generate_grain_rows_neon
+ mov r1, #GRAIN_HEIGHT-3
+
+ vdup.32 q12, r7
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vdup.16 q9, r5
+ vdup.16 q10, r6
+ vext.8 q13, q0, q1, #10
+ vext.8 q14, q1, q0, #2
+ vneg.s32 q12, q12
+ vmovl.s8 q11, d22
+
+1:
+ vmov q1, q13
+ bl gen_grain_uv_444_lag0_neon // 8
+ vmov.i8 q1, #255
+ bl gen_grain_uv_444_lag0_neon // 16
+ bl gen_grain_uv_444_lag0_neon // 24
+ bl gen_grain_uv_444_lag0_neon // 32
+ bl gen_grain_uv_444_lag0_neon // 40
+ bl gen_grain_uv_444_lag0_neon // 48
+ bl gen_grain_uv_444_lag0_neon // 56
+ bl gen_grain_uv_444_lag0_neon // 64
+ bl gen_grain_uv_444_lag0_neon // 72
+ vmov q1, q14
+ bl gen_grain_uv_444_lag0_neon // 80
+ get_grain_2 d16
+ subs r1, r1, #1
+ add r11, r11, #4
+ vst1.32 {d16[0]}, [r0]!
+ bgt 1b
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_y[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb r4, [r4, #1] // ar_coeffs_y[3]
+.else
+ add r4, r4, #2
+.endif
+
+ mov r1, #3
+.ifc \type, uv_444
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon
+ vmovl.s8 q13, d27
+ vmovl.s8 q12, d29
+ vmovl.s8 q14, d28
+ vmov d29, d24
+.ifc \type, uv_444
+ vmovl.s8 q6, d13
+.endif
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag1_left_neon // 8
+ bl sum_\type\()_lag1_mid_neon // 16
+ bl sum_\type\()_lag1_mid_neon // 24
+ bl sum_\type\()_lag1_mid_neon // 32
+ bl sum_\type\()_lag1_mid_neon // 40
+ bl sum_\type\()_lag1_mid_neon // 48
+ bl sum_\type\()_lag1_mid_neon // 56
+ bl sum_\type\()_lag1_mid_neon // 64
+ bl sum_\type\()_lag1_mid_neon // 72
+ bl sum_\type\()_lag1_right_neon // 80
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #4
+.endif
+ vst1.32 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ vmov.s8 r4, d29[2]
+ vmov.s8 r10, d29[3]
+
+ mov r1, #3
+ bl generate_grain_rows_neon
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag2_left_neon // 8
+ bl sum_\type\()_lag2_mid_neon // 16
+ bl sum_\type\()_lag2_mid_neon // 24
+ bl sum_\type\()_lag2_mid_neon // 32
+ bl sum_\type\()_lag2_mid_neon // 40
+ bl sum_\type\()_lag2_mid_neon // 48
+ bl sum_\type\()_lag2_mid_neon // 56
+ bl sum_\type\()_lag2_mid_neon // 64
+ bl sum_\type\()_lag2_mid_neon // 72
+ bl sum_\type\()_lag2_right_neon // 80
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #4
+.endif
+ vst1.32 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+ vmov.u8 r4, d28[5]
+ vmov.u8 r10, d28[6]
+ vmov.u8 r12, d28[7]
+
+ orr r4, r4, r10, lsl #8
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3
+ vpush {d26}
+ bl generate_grain_rows_neon
+ vpop {d26}
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag3_left_neon // 8
+ bl sum_\type\()_lag3_mid_neon // 16
+ bl sum_\type\()_lag3_mid_neon // 24
+ bl sum_\type\()_lag3_mid_neon // 32
+ bl sum_\type\()_lag3_mid_neon // 40
+ bl sum_\type\()_lag3_mid_neon // 48
+ bl sum_\type\()_lag3_mid_neon // 56
+ bl sum_\type\()_lag3_mid_neon // 64
+ bl sum_\type\()_lag3_mid_neon // 72
+ bl sum_\type\()_lag3_right_neon // 80
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #4
+.endif
+ vst1.32 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
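+// For reference, a rough C-style sketch of what one output sample of the
+// lag-N paths above amounts to (round2(), clip() and get_random() are
+// illustrative helper names; the row setup and edge handling of the real
+// code are omitted):
+//
+//   grain_max =  (128 << bitdepth_min_8) - 1;
+//   grain_min = ~grain_max;
+//   noise = round2(gaussian_sequence[get_random(&seed)],
+//                  4 - bitdepth_min_8 + data->grain_scale_shift);
+//   sum = 0;
+//   for (dy = -lag; dy <= 0; dy++)
+//       for (dx = -lag; dx <= lag; dx++) {
+//           if (!dy && !dx) break;         // only already generated samples
+//           sum += ar_coeffs[pos++] * grain[y + dy][x + dx];
+//       }
+//   grain[y][x] = clip(noise + round2(sum, data->ar_coeff_shift),
+//                      grain_min, grain_max);
+//
+// (For the uv_444 lag0 case the single coefficient ar_coeffs_uv[0] is applied
+// to the co-located luma grain in the same way.)
+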
+.macro set_height dst, type
+.ifc \type, uv_420
+ mov \dst, #SUB_GRAIN_HEIGHT-3
+.else
+ mov \dst, #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type
+.ifc \type, uv_420
+ add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32)
+.else
+ sub \reg, \reg, #6*32-GRAIN_WIDTH*2
+.endif
+.endm
+
+.macro gen_grain_44 type
+function generate_grain_\type\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+
+ ldr r4, [sp, #36]
+ mov r12, r3
+ movw r11, #(3*GRAIN_WIDTH-3)*2
+ mov lr, #28
+ add r11, r1, r11
+ mov r1, r2
+ mul r12, r12, lr
+ clz lr, r4
+
+ movrel r3, X(gaussian_sequence)
+ sub lr, lr, #24 // -bitdepth_min_8
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+ add r4, r1, #FGD_AR_COEFFS_UV
+ add r9, r9, lr // grain_scale_shift - bitdepth_min_8
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2]
+ vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift
+ add r5, r5, r6
+ vneg.s16 q15, q15
+
+ push {lr}
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid the IT instruction forms deprecated in armv8.
+ it eq
+ moveq r10, lr
+ add r4, r4, r12 // Add offset to ar_coeffs_uv[1]
+ eor r2, r2, r10
+ pop {lr}
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ neg lr, lr
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ bx r5
+
+ .align 2
+L(gen_grain_\type\()_tbl):
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, uv_420
+ vpush {q4-q5}
+.endif
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ mvn r6, r5 // grain_min = ~grain_max
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+ set_height r1, \type
+
+ vdup.32 q12, r7
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vdup.16 q9, r5
+ vdup.16 q10, r6
+ vext.8 q13, q0, q1, #10
+ vext.8 q14, q1, q0, #14
+ vneg.s32 q12, q12
+ vmovl.s8 q11, d22
+
+1:
+ vmov q1, q13
+ bl gen_grain_\type\()_lag0_8_neon // 8
+ vmov.i8 q1, #255
+ bl gen_grain_\type\()_lag0_8_neon // 16
+ bl gen_grain_\type\()_lag0_8_neon // 24
+ bl gen_grain_\type\()_lag0_8_neon // 32
+ bl gen_grain_\type\()_lag0_8_neon // 40
+ vmov q1, q14
+ bl gen_grain_\type\()_lag0_4_neon // 44
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH*2-6*16
+ bgt 1b
+
+.ifc \type, uv_420
+ vpop {q4-q5}
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_uv[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2]
+ add r4, r4, #2
+
+ mov r1, #3
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon
+ vmovl.s8 q13, d27
+ vmovl.s8 q12, d29
+ vmovl.s8 q14, d28
+ vmov d29, d24
+ vmovl.s8 q6, d13
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag1_left_neon // 8
+ bl sum_\type\()_lag1_mid_neon // 16
+ bl sum_\type\()_lag1_mid_neon // 24
+ bl sum_\type\()_lag1_mid_neon // 32
+ bl sum_\type\()_lag1_mid_neon // 40
+ bl sum_\type\()_lag1_right_neon // 44
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH*2-6*16
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12]
+
+ vmov.s8 r4, d29[2]
+ vmov.s8 r10, d29[3]
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag2_left_neon // 8
+ bl sum_\type\()_lag2_mid_neon // 16
+ bl sum_\type\()_lag2_mid_neon // 24
+ bl sum_\type\()_lag2_mid_neon // 32
+ bl sum_\type\()_lag2_mid_neon // 40
+ bl sum_\type\()_lag2_right_neon // 44
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH*2-6*16
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+ vmov.u8 r4, d28[5]
+ vmov.u8 r10, d28[6]
+ vmov.u8 r12, d28[7]
+
+ orr r4, r4, r10, lsl #8
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag3_left_neon // 8
+ bl sum_\type\()_lag3_mid_neon // 16
+ bl sum_\type\()_lag3_mid_neon // 24
+ bl sum_\type\()_lag3_mid_neon // 32
+ bl sum_\type\()_lag3_mid_neon // 40
+ bl sum_\type\()_lag3_right_neon // 44
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH*2-6*16
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
+.macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off
+ vmov.u16 r11, \src1[0+\off]
+ vmov.u16 r12, \src3[0+\off]
+ add r11, r11, r3
+ vmov.u16 lr, \src1[2+\off]
+ add r12, r12, r3
+ vld1.8 {\dst1[0+\off]}, [r11]
+ vmov.u16 r11, \src3[2+\off]
+ add lr, lr, r3
+ vld1.8 {\dst2[0+\off]}, [r12]
+ vmov.u16 r12, \src2[0+\off]
+ add r11, r11, r3
+ vld1.8 {\dst1[2+\off]}, [lr]
+ vmov.u16 lr, \src4[0+\off]
+ add r12, r12, r3
+ vld1.8 {\dst2[2+\off]}, [r11]
+ vmov.u16 r11, \src2[2+\off]
+ add lr, lr, r3
+ vld1.8 {\dst1[4+\off]}, [r12]
+ vmov.u16 r12, \src4[2+\off]
+ add r11, r11, r3
+ vld1.8 {\dst2[4+\off]}, [lr]
+ add r12, r12, r3
+ vld1.8 {\dst1[6+\off]}, [r11]
+ vld1.8 {\dst2[6+\off]}, [r12]
+.endm
+
+.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8
+ gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0
+ gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1
+ gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0
+ gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1
+.endm
+
+function gather32_neon
+ push {r11-r12,lr}
+ gather d8, d9, d10, d11, d0, d1, d2, d3, d4, d5, d6, d7
+ pop {r11-r12,pc}
+endfunc
+
+function gather16_neon
+ push {r11-r12,lr}
+ gather_interleaved d8, d9, d0, d1, d2, d3, 0
+ gather_interleaved d8, d9, d0, d1, d2, d3, 1
+ pop {r11-r12,pc}
+endfunc
+
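+// gather16/gather32 above are in essence a vectorized table lookup: each of
+// the 16 or 32 pixel values held in q0-q1 (gather16) or q0-q3 (gather32) is
+// used as an index into the scaling[] table pointed to by r3. A minimal C
+// sketch of the same operation:
+//
+//   for (i = 0; i < n; i++)        // n = 16 or 32
+//       scaled[i] = scaling[px[i]];
+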
+const overlap_coeffs_0, align=4
+ .short 27, 17, 0, 0
+ .short 17, 27, 32, 32
+endconst
+
+const overlap_coeffs_1, align=4
+ .short 23, 0, 0, 0
+ .short 22, 32, 32, 32
+endconst
+
+.macro calc_offset offx, offy, src, sx, sy
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride
+ mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx, lsl #1 // grain_lut += offx
+.endm
+
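+// A hedged C sketch of what calc_offset/add_offset compute for one block,
+// with randval being the per-block random value taken from offsets[] and
+// grain_stride counted in 16-bit entries (hence the "lsl #1" byte scaling
+// above):
+//
+//   offy = randval & 0xF;
+//   offx = randval >> 4;
+//   if (!sy) offy *= 2;                  // not vertically subsampled
+//   if (!sx) offx *= 2;                  // not horizontally subsampled
+//   grain = grain_lut + offy * grain_stride + offx;
+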
+// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type,
+// const int bitdepth_max);
+function fgy_32x32_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut
+ ldrd r6, r7, [sp, #108] // offsets, h
+ ldr r8, [sp, #116] // clip
+ mov r9, #GRAIN_WIDTH*2 // grain_lut stride
+ ldr r10, [sp, #124] // bitdepth_max
+
+ eor r4, r4, #15 // 15 - scaling_shift
+ vdup.16 q6, r10 // bitdepth_max
+ clz r10, r10
+ vdup.16 q13, r4 // 15 - scaling_shift
+ rsb r10, r10, #24 // bitdepth_min_8
+ cmp r8, #0
+ vdup.16 q12, r10 // bitdepth_min_8
+
+ movrel_local r12, overlap_coeffs_0
+
+ beq 1f
+ // clip
+ vmov.i16 q14, #16
+ vmov.i16 q15, #235
+ vshl.s16 q14, q14, q12
+ vshl.s16 q15, q15, q12
+ b 2f
+1:
+ // no clip
+ vmov.i16 q14, #0
+ vmov q15, q6
+2:
+ vshr.u16 q6, q6, #1 // grain_max
+
+ vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ add r5, r5, #18 // grain_lut += 9
+ add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r9 // grain_lut += grain_stride
+
+ ldr r10, [r6, #8] // offsets[1][0]
+ calc_offset r10, r4, r10, 0, 0
+ add_offset r4, r10, r4, r5, r9
+ ldr r10, [r6, #4] // offsets[0][1]
+ calc_offset r10, r11, r10, 0, 0
+ add_offset r11, r10, r11, r5, r9
+ ldr r10, [r6, #12] // offsets[1][1]
+ calc_offset r10, r8, r10, 0, 0
+ add_offset r8, r10, r8, r5, r9
+ ldr r6, [r6] // offsets[0][0]
+ calc_offset r6, lr, r6, 0, 0
+ add_offset r5, r6, lr, r5, r9
+
+ add r4, r4, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
+ add r6, r11, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+
+ ldr r10, [sp, #120] // type
+ adr r11, L(fgy_loop_tbl)
+
+ tst r10, #1
+ ldr r10, [r11, r10, lsl #2]
+
+ add r8, r8, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add r8, r8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
+
+ add r11, r11, r10
+
+ beq 1f
+ // y overlap
+ vdup.16 d14, d24[0]
+ vdup.16 d15, d24[1]
+ mov r10, r7 // backup actual h
+ mov r7, #2
+1:
+ sub r2, r2, #32 // src_stride -= 32
+ sub r9, r9, #32 // grain_stride -= 32
+ bx r11
+endfunc
+
+function fgy_loop_neon
+L(fgy_loop_tbl):
+ .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
+
+.macro fgy ox, oy
+L(loop_\ox\oy):
+1:
+.if \ox
+ vld1.16 {d0}, [r4], r9 // grain_lut old
+.endif
+.if \oy
+ vld1.16 {q2, q3}, [r6]! // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.16 {d2}, [r8], r9 // grain_lut top old
+.endif
+.if \oy
+ vld1.16 {q4, q5}, [r6], r9 // grain_lut top
+.endif
+.if !\ox && !\oy
+ vld1.16 {q0, q1}, [r1, :128]! // src
+.endif
+ vld1.16 {q8, q9}, [r5]! // grain_lut
+.if !\ox && !\oy
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+.endif
+.if !\oy
+ vmvn.i16 q5, #0xf000 // 0x0fff
+.endif
+ vld1.16 {q10, q11}, [r5], r9 // grain_lut
+
+.if \ox
+ add r4, r4, #32
+ vmull.s16 q0, d0, d24
+ vmlal.s16 q0, d16, d25
+.endif
+
+.if \oy
+.if \ox
+ add r8, r8, #32
+ vmull.s16 q1, d2, d24
+ vmlal.s16 q1, d4, d25
+ vqrshrn.s32 d16, q0, #5
+ vmvn d0, d12 // grain_min
+ vqrshrn.s32 d4, q1, #5
+ vmin.s16 d16, d16, d12
+ vmin.s16 d4, d4, d12
+ vmax.s16 d16, d16, d0
+ vmax.s16 d4, d4, d0
+.endif
+
+ vmull.s16 q0, d4, d14
+ vmull.s16 q1, d5, d14
+ vmull.s16 q2, d6, d14
+ vmull.s16 q3, d7, d14
+ vmlal.s16 q0, d16, d15
+ vmlal.s16 q1, d17, d15
+ vmlal.s16 q2, d18, d15
+ vmlal.s16 q3, d19, d15
+ vmull.s16 q8, d20, d15
+ vmull.s16 q9, d21, d15
+ vmull.s16 q10, d22, d15
+ vmull.s16 q11, d23, d15
+ vmlal.s16 q8, d8, d14
+ vmlal.s16 q9, d9, d14
+ vmlal.s16 q10, d10, d14
+ vmlal.s16 q11, d11, d14
+ vmvn q4, q6 // grain_min
+ vqrshrn.s32 d0, q0, #5
+ vqrshrn.s32 d1, q1, #5
+ vqrshrn.s32 d2, q2, #5
+ vqrshrn.s32 d3, q3, #5
+ vqrshrn.s32 d4, q8, #5
+ vqrshrn.s32 d5, q9, #5
+ vqrshrn.s32 d6, q10, #5
+ vqrshrn.s32 d7, q11, #5
+ vmin.s16 q8, q0, q6
+ vmin.s16 q9, q1, q6
+ vld1.16 {q0, q1}, [r1, :128]! // src
+ vmin.s16 q10, q2, q6
+ vmin.s16 q11, q3, q6
+ vmax.s16 q8, q8, q4
+ vmax.s16 q9, q9, q4
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+ vmvn.i16 q5, #0xf000 // 0x0fff
+ vmax.s16 q10, q10, q4
+ vmax.s16 q11, q11, q4
+.elseif \ox
+ vmvn d4, d12 // grain_min
+ vqrshrn.s32 d16, q0, #5
+ vld1.16 {q0, q1}, [r1, :128]! // src
+ vmin.s16 d16, d16, d12
+ vmax.s16 d16, d16, d4
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+.endif
+
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ vand q0, q0, q5
+ vand q1, q1, q5
+ vand q2, q2, q5
+ vand q3, q3, q5
+
+ bl gather32_neon
+
+.if \ox || \oy
+ vpush {q6-q7}
+.endif
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+ vmovl.u8 q4, d10
+ vmovl.u8 q5, d11
+
+ vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
+ vshl.u16 q7, q7, q13
+ vshl.u16 q4, q4, q13
+ vshl.u16 q5, q5, q13
+
+ vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ vqrdmulh.s16 q9, q9, q7
+ vqrdmulh.s16 q10, q10, q4
+ vqrdmulh.s16 q11, q11, q5
+
+.if \ox || \oy
+ vpop {q6-q7}
+.endif
+
+ vqadd.s16 q0, q0, q8 // *src + noise
+ vqadd.s16 q1, q1, q9
+ vqadd.s16 q2, q2, q10
+ vqadd.s16 q3, q3, q11
+
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q0, q0, q15
+ vmin.s16 q1, q1, q15
+ vmin.s16 q2, q2, q15
+ vmin.s16 q3, q3, q15
+
+ vst1.16 {q0, q1}, [r0, :128]! // dst
+ subs r7, r7, #1
+.if \oy
+ vdup.16 d14, d25[0]
+ vdup.16 d15, d25[1]
+.endif
+ vst1.16 {q2, q3}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r10, #2
+ sub r7, r10, #2 // restore actual remaining h
+ bgt L(loop_\ox\()0)
+.endif
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+endfunc
+
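+// The vqrdmulh above is what makes the scaling cheap: vqrdmulh(a, b) is
+// round2(a * b, 15), so with a = scaling << (15 - scaling_shift) the result
+// round2((scaling << (15 - scaling_shift)) * grain, 15) equals
+// round2(scaling * grain, scaling_shift). A hedged C sketch of one pixel of
+// the luma loop (helper names are illustrative):
+//
+//   noise  = round2(scaling[src[x]] * grain, scaling_shift);
+//   dst[x] = clip(src[x] + noise, clip_min, clip_max);
+//
+// where clip_min/clip_max are 16 << bitdepth_min_8 and 235 << bitdepth_min_8
+// when clipping to the restricted range, and 0 / bitdepth_max otherwise.
+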
+// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type,
+// const int bitdepth_max);
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100] // data, grain_lut
+ ldrd r10, r11, [sp, #124] // uv, is_id
+ ldr r6, [sp, #136] // bitdepth_max
+
+ clz r7, r6
+ rsb r7, r7, #24 // bitdepth_min_8
+
+ // !csfl
+ add r10, r4, r10, lsl #2 // + 4*uv
+ add r12, r10, #FGD_UV_LUMA_MULT
+ add lr, r10, #FGD_UV_MULT
+ ldrh r10, [r10, #FGD_UV_OFFSET] // uv_offset
+ vld1.16 {d30[]}, [r12] // uv_luma_mult
+ lsl r10, r10, r7 // uv_offset << bitdepth_min_8
+ vld1.16 {d30[1]}, [lr] // uv_mult
+
+ ldr lr, [r4, #FGD_SCALING_SHIFT]
+ ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ eor lr, lr, #15 // 15 - scaling_shift
+
+ vmov.16 d30[2], r10 // uv_offset << bitdepth_min_8
+
+ cmp r12, #0
+ vdup.16 q13, lr // 15 - scaling_shift
+
+ beq 1f
+ // clip
+ cmp r11, #0
+ mov r8, #16
+ mov r9, #240
+ lsl r8, r8, r7
+ lsl r9, r9, r7
+ beq 2f
+ // is_id
+ mov r9, #235
+ lsl r9, r9, r7
+ b 2f
+1:
+ // no clip
+ mov r8, #0
+ mov r9, r6 // bitdepth_max
+2:
+ vmov.16 d30[3], r6 // bitdepth_max
+ vdup.16 d31, r8 // clip_min
+
+ mov r10, #GRAIN_WIDTH*2 // grain_lut stride
+
+.if \sy
+ mov r6, #23
+ mov r7, #22
+.else
+ mov r6, #27
+ mov r7, #17
+.endif
+ vmov.16 d31[1], r9 // clip_max
+
+ ldrd r8, r9, [sp, #116] // offsets, h
+
+ add r5, r5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
+.if \sy
+ add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride
+ add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r10 // grain_lut += grain_stride
+.endif
+ vmov.16 d31[2], r6 // overlap y [0]
+
+ ldr r12, [r8, #8] // offsets[1][0]
+ calc_offset r12, r4, r12, \sx, \sy
+ add_offset r4, r12, r4, r5, r10
+
+ ldr r12, [r8, #4] // offsets[0][1]
+ calc_offset r12, lr, r12, \sx, \sy
+ add_offset lr, r12, lr, r5, r10
+
+ ldr r12, [r8, #12] // offsets[1][1]
+ calc_offset r12, r11, r12, \sx, \sy
+ add_offset r11, r12, r11, r5, r10
+
+ ldr r8, [r8] // offsets[0][0]
+ calc_offset r8, r12, r8, \sx, \sy
+ add_offset r5, r8, r12, r5, r10
+
+ vmov.16 d31[3], r7 // overlap y [1]
+
+ add r4, r4, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
+ add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add r11, r11, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
+
+ movrel_local r12, overlap_coeffs_\sx
+ ldr lr, [sp, #132] // type
+ ldrd r6, r7, [sp, #108] // luma_row, luma_stride
+
+ vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ movrel_local r12, L(fguv_loop_sx\sx\()_tbl)
+#if CONFIG_THUMB
+ // This uses movrel_local instead of adr above, because the target
+ // can be out of range for adr. But movrel_local leaves the thumb bit
+ // set on COFF (but probably wouldn't if building for thumb on ELF),
+ // thus try to clear the bit for robustness.
+ bic r12, r12, #1
+#endif
+
+ tst lr, #1
+ ldr lr, [r12, lr, lsl #2]
+
+ add r12, r12, lr
+
+ beq 1f
+ // y overlap
+ sub lr, r9, #(2 >> \sy) // backup remaining h
+ mov r9, #(2 >> \sy)
+
+1:
+.if \sy
+ add r7, r7, r7 // luma_stride *= 2
+.endif
+ sub r7, r7, #32 // luma_stride -= 32
+
+ bx r12
+endfunc
+.endm
+
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon
+L(fguv_loop_sx0_tbl):
+ .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+ sub r2, r2, #32 // src_stride -= 32
+ sub r10, r10, #32 // grain_stride -= 32
+.if \oy
+ mov r12, lr
+.endif
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart):
+1:
+.if \ox
+ vld1.16 {d0}, [r4], r10 // grain_lut old
+.endif
+.if \oy
+ vld1.16 {q2, q3}, [r8]! // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.16 {d2}, [r11], r10 // grain_lut top old
+.endif
+.if !\ox && !\oy
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+.endif
+ vld1.16 {q8, q9}, [r5]! // grain_lut
+.if \oy
+ vld1.16 {q4, q5}, [r8], r10 // grain_lut top
+.endif
+.if !\ox && !\oy
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+.endif
+.if \oy
+ vdup.16 d28, d31[2] // overlap y coeff
+ vdup.16 d29, d31[3] // overlap y coeff
+.endif
+ vld1.16 {q10, q11}, [r5], r10 // grain_lut
+
+.if \ox
+ vdup.16 q7, d30[3] // bitdepth_max
+ add r4, r4, #32
+ vmull.s16 q0, d0, d24
+ vshr.u16 q7, q7, #1 // grain_max
+ vmlal.s16 q0, d16, d25
+ vmvn q6, q7 // grain_min
+.endif
+
+.if \oy
+.if \ox
+ add r11, r11, #32
+ vmull.s16 q1, d2, d24
+ vmlal.s16 q1, d4, d25
+ vqrshrn.s32 d16, q0, #5
+ vqrshrn.s32 d4, q1, #5
+ vmin.s16 d4, d4, d14
+ vmin.s16 d16, d16, d14
+ vmax.s16 d4, d4, d12
+ vmax.s16 d16, d16, d12
+.endif
+
+ vmull.s16 q0, d4, d28
+ vmull.s16 q1, d5, d28
+ vmull.s16 q2, d6, d28
+ vmull.s16 q3, d7, d28
+.if !\ox
+ vdup.16 q7, d30[3] // bitdepth_max
+.endif
+ vmlal.s16 q0, d16, d29
+ vmlal.s16 q1, d17, d29
+ vmlal.s16 q2, d18, d29
+ vmlal.s16 q3, d19, d29
+.if !\ox
+ vshr.u16 q7, q7, #1 // grain_max
+.endif
+ vmull.s16 q8, d20, d29
+ vmull.s16 q9, d21, d29
+ vmull.s16 q10, d22, d29
+ vmull.s16 q11, d23, d29
+.if !\ox
+ vmvn q6, q7 // grain_min
+.endif
+ vmlal.s16 q8, d8, d28
+ vmlal.s16 q9, d9, d28
+ vmlal.s16 q10, d10, d28
+ vmlal.s16 q11, d11, d28
+ vqrshrn.s32 d0, q0, #5
+ vqrshrn.s32 d1, q1, #5
+ vqrshrn.s32 d2, q2, #5
+ vqrshrn.s32 d3, q3, #5
+ vqrshrn.s32 d4, q8, #5
+ vqrshrn.s32 d5, q9, #5
+ vqrshrn.s32 d6, q10, #5
+ vqrshrn.s32 d7, q11, #5
+ vmin.s16 q8, q0, q7
+ vmin.s16 q9, q1, q7
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+ vmin.s16 q10, q2, q7
+ vmin.s16 q11, q3, q7
+ vmax.s16 q8, q8, q6
+ vmax.s16 q9, q9, q6
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+ vmax.s16 q10, q10, q6
+ vmax.s16 q11, q11, q6
+.elseif \ox
+ vqrshrn.s32 d16, q0, #5
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+ vmin.s16 d16, d16, d14
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+ vmax.s16 d16, d16, d12
+.endif
+
+.if !\csfl
+ vdup.16 d28, d30[0] // uv_luma_mult
+ vld1.16 {q4, q5}, [r1, :128]! // src
+ vdup.16 d29, d30[1] // uv_mult
+ vmull.s16 q6, d0, d28
+ vmull.s16 q7, d1, d28
+ vmull.s16 q0, d2, d28
+ vmull.s16 q1, d3, d28
+ vmlal.s16 q6, d8, d29
+ vmlal.s16 q7, d9, d29
+ vmlal.s16 q0, d10, d29
+ vmlal.s16 q1, d11, d29
+ vld1.16 {q4, q5}, [r1, :128] // src
+ sub r1, r1, #32
+ vshrn.s32 d12, q6, #6
+ vshrn.s32 d13, q7, #6
+ vshrn.s32 d14, q0, #6
+ vshrn.s32 d15, q1, #6
+ vmull.s16 q0, d4, d28
+ vmull.s16 q1, d5, d28
+ vmull.s16 q2, d6, d28
+ vmull.s16 q3, d7, d28
+ vmlal.s16 q0, d8, d29
+ vmlal.s16 q1, d9, d29
+ vmlal.s16 q2, d10, d29
+ vmlal.s16 q3, d11, d29
+ vdup.16 q14, d30[2] // uv_offset
+ vshrn.s32 d0, q0, #6
+ vshrn.s32 d1, q1, #6
+ vshrn.s32 d2, q2, #6
+ vshrn.s32 d3, q3, #6
+ vdup.16 q4, d30[3] // bitdepth_max
+ vmov.i16 q5, #0
+ vadd.i16 q6, q6, q14
+ vadd.i16 q7, q7, q14
+ vadd.i16 q2, q0, q14
+ vadd.i16 q3, q1, q14
+ vmin.s16 q0, q6, q4
+ vmin.s16 q1, q7, q4
+ vmin.s16 q2, q2, q4
+ vmin.s16 q3, q3, q4
+ vmax.s16 q0, q0, q5
+ vmax.s16 q1, q1, q5
+ vmax.s16 q2, q2, q5
+ vmax.s16 q3, q3, q5
+.else
+ vdup.16 q14, d30[3] // bitdepth_max
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ vand q0, q0, q14
+ vand q1, q1, q14
+ vand q2, q2, q14
+ vand q3, q3, q14
+.endif
+
+ bl gather32_neon
+
+ vld1.16 {q0, q1}, [r1, :128]! // src
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+ vmovl.u8 q4, d10
+ vmovl.u8 q5, d11
+
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+
+ vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
+ vshl.u16 q7, q7, q13
+ vshl.u16 q4, q4, q13
+ vshl.u16 q5, q5, q13
+
+ vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ vqrdmulh.s16 q9, q9, q7
+ vqrdmulh.s16 q10, q10, q4
+ vqrdmulh.s16 q11, q11, q5
+
+ vdup.16 q4, d31[0] // clip_min
+ vdup.16 q5, d31[1] // clip_max
+
+ vqadd.s16 q0, q0, q8 // *src + noise
+ vqadd.s16 q1, q1, q9
+ vqadd.s16 q2, q2, q10
+ vqadd.s16 q3, q3, q11
+
+.if \oy
+ vmov.32 lr, d25[0] // first two 16-bit coeffs from overlap x
+.endif
+
+ vmax.s16 q0, q0, q4
+ vmax.s16 q1, q1, q4
+ vmax.s16 q2, q2, q4
+ vmax.s16 q3, q3, q4
+ vmin.s16 q0, q0, q5
+ vmin.s16 q1, q1, q5
+ vmin.s16 q2, q2, q5
+ vmin.s16 q3, q3, q5
+
+ vst1.16 {q0, q1}, [r0, :128]! // dst
+
+ subs r9, r9, #1
+.if \oy
+ vmov.32 d31[1], lr // new coeffs for overlap y
+.endif
+
+ vst1.16 {q2, q3}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r12, #0
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart)
+.endif
+ b 9f
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+
+9:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function fguv_loop_sx1_neon
+L(fguv_loop_sx1_tbl):
+ .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+.if \oy
+ mov r12, lr
+.endif
+1:
+.if \ox
+ vld1.16 {d0}, [r4], r10 // grain_lut old
+.endif
+.if \ox && \oy
+ vld1.16 {d2}, [r11], r10 // grain_lut top old
+.endif
+.if \oy
+ vld1.16 {q2, q3}, [r8], r10 // grain_lut top
+.endif
+.if !\ox && !\oy
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+.endif
+ vld1.16 {q8, q9}, [r5], r10 // grain_lut
+.if \oy
+ vdup.16 d28, d31[2] // overlap y coeff
+ vdup.16 d29, d31[3] // overlap y coeff
+.endif
+.if !\ox && !\oy
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+.endif
+
+.if \ox
+ vdup.16 q7, d30[3] // bitdepth_max
+ vmull.s16 q0, d0, d24
+ vshr.u16 q7, q7, #1 // grain_max
+ vmlal.s16 q0, d16, d25
+ vmvn q6, q7 // grain_min
+.endif
+
+.if \oy
+.if \ox
+ vmull.s16 q1, d2, d24
+ vmlal.s16 q1, d4, d25
+ vqrshrn.s32 d16, q0, #5
+ vqrshrn.s32 d4, q1, #5
+ vmin.s16 d4, d4, d14
+ vmin.s16 d16, d16, d14
+ vmax.s16 d4, d4, d12
+ vmax.s16 d16, d16, d12
+.endif
+
+ vmull.s16 q0, d4, d28
+ vmull.s16 q1, d5, d28
+ vmull.s16 q2, d6, d28
+ vmull.s16 q3, d7, d28
+.if !\ox
+ vdup.16 q7, d30[3] // bitdepth_max
+.endif
+ vmlal.s16 q0, d16, d29
+ vmlal.s16 q1, d17, d29
+ vmlal.s16 q2, d18, d29
+ vmlal.s16 q3, d19, d29
+.if !\ox
+ vshr.u16 q7, q7, #1 // grain_max
+.endif
+ vqrshrn.s32 d16, q0, #5
+ vqrshrn.s32 d17, q1, #5
+ vqrshrn.s32 d18, q2, #5
+ vqrshrn.s32 d19, q3, #5
+.if !\ox
+ vmvn q6, q7 // grain_min
+.endif
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+ vmin.s16 q8, q8, q7
+ vmin.s16 q9, q9, q7
+ vmax.s16 q8, q8, q6
+ vmax.s16 q9, q9, q6
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+.elseif \ox
+ vqrshrn.s32 d16, q0, #5
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+ vmin.s16 d16, d16, d14
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+ vmax.s16 d16, d16, d12
+.endif
+
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d4, d5
+ vpadd.i16 d3, d6, d7
+ vrshr.u16 q0, q0, #1
+ vrshr.u16 q1, q1, #1
+.if !\csfl
+ vdup.16 d28, d30[0] // uv_luma_mult
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+ vdup.16 d29, d30[1] // uv_mult
+ vmull.s16 q6, d0, d28
+ vmull.s16 q7, d1, d28
+ vmull.s16 q0, d2, d28
+ vmull.s16 q1, d3, d28
+ vmlal.s16 q6, d4, d29
+ vmlal.s16 q7, d5, d29
+ vmlal.s16 q0, d6, d29
+ vmlal.s16 q1, d7, d29
+ vshrn.s32 d12, q6, #6
+ vshrn.s32 d13, q7, #6
+ vshrn.s32 d14, q0, #6
+ vshrn.s32 d15, q1, #6
+ vdup.16 q14, d30[2] // uv_offset
+ vdup.16 q4, d30[3] // bitdepth_max
+ vmov.i16 q5, #0
+ vadd.i16 q6, q6, q14
+ vadd.i16 q7, q7, q14
+ vmin.s16 q0, q6, q4
+ vmin.s16 q1, q7, q4
+ vmax.s16 q0, q0, q5
+ vmax.s16 q1, q1, q5
+.else
+ vdup.16 q14, d30[3] // bitdepth_max
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ vand q0, q0, q14
+ vand q1, q1, q14
+.endif
+
+ bl gather16_neon
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+
+ vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
+ vshl.u16 q7, q7, q13
+
+ vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ vqrdmulh.s16 q9, q9, q7
+
+ vdup.16 q4, d31[0] // clip_min
+ vdup.16 q5, d31[1] // clip_max
+
+ vqadd.s16 q0, q2, q8 // *src + noise
+ vqadd.s16 q1, q3, q9
+
+.if \oy
+ // Swap the last two coefficients of d31, placing them first in d28
+ vrev64.16 d28, d31
+.endif
+
+ vmax.s16 q0, q0, q4
+ vmax.s16 q1, q1, q4
+ vmin.s16 q0, q0, q5
+ vmin.s16 q1, q1, q5
+
+ subs r9, r9, #1
+.if \oy
+ // Take the first two 16-bit coefficients of d28 and place them at the
+ // end of d31
+ vtrn.32 d31, d28
+.endif
+
+ vst1.16 {q0, q1}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r12, #0
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
+.endif
+
+ b 9f
+.endm
+ fguv_loop_sx1 0, 0, 0
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+
+9:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
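+
+// For the chroma loops above, the value used to index scaling[] is derived
+// from the (possibly horizontally averaged) luma row plus, unless chroma is
+// scaled directly from luma (csfl), the chroma source itself. A hedged C
+// sketch per pixel (helper names are illustrative):
+//
+//   avg = sx ? (luma[2 * x] + luma[2 * x + 1] + 1) >> 1 : luma[x];
+//   if (csfl)
+//       val = avg;
+//   else
+//       val = clip(((avg * uv_luma_mult + src[x] * uv_mult) >> 6) +
+//                  (uv_offset << bitdepth_min_8), 0, bitdepth_max);
+//   noise  = round2(scaling[val] * grain, scaling_shift);
+//   dst[x] = clip(src[x] + noise, clip_min, clip_max);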
diff --git a/third_party/dav1d/src/arm/32/ipred.S b/third_party/dav1d/src/arm/32/ipred.S
new file mode 100644
index 0000000000..8c6d539a47
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/ipred.S
@@ -0,0 +1,2958 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * Copyright © 2019, B Krishnan Iyer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_128_8bpc_neon, export=1
+ push {r4, lr}
+ ldr r4, [sp, #8]
+ clz r3, r3
+ adr r2, L(ipred_dc_128_tbl)
+ sub r3, r3, #25
+ ldr r3, [r2, r3, lsl #2]
+ vmov.i8 q0, #128
+ add r2, r2, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r2
+
+ .align 2
+L(ipred_dc_128_tbl):
+ .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 16f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+4:
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 4b
+ pop {r4, pc}
+8:
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 8b
+ pop {r4, pc}
+16:
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt 16b
+ pop {r4, pc}
+320:
+ vmov.i8 q1, #128
+32:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4, pc}
+640:
+ vmov.i8 q1, #128
+ sub r1, r1, #32
+64:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 64b
+ pop {r4, pc}
+endfunc
+
+// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_v_8bpc_neon, export=1
+ push {r4, lr}
+ ldr lr, [sp, #8]
+ clz r3, r3
+ adr r4, L(ipred_v_tbl)
+ sub r3, r3, #25
+ ldr r3, [r4, r3, lsl #2]
+ add r2, r2, #1
+ add r4, r4, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r4
+
+ .align 2
+L(ipred_v_tbl):
+ .word 640f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_v_tbl) + CONFIG_THUMB
+40:
+ vld1.32 {d0[]}, [r2]
+4:
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs lr, lr, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 4b
+ pop {r4, pc}
+80:
+ vld1.8 {d0}, [r2]
+8:
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs lr, lr, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 8b
+ pop {r4, pc}
+160:
+ vld1.8 {q0}, [r2]
+16:
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt 16b
+ pop {r4, pc}
+320:
+ vld1.8 {q0, q1}, [r2]
+32:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4, pc}
+640:
+ vld1.8 {q0, q1}, [r2]!
+ sub r1, r1, #32
+ vld1.8 {q2, q3}, [r2]
+64:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+ bgt 64b
+ pop {r4, pc}
+endfunc
+
+// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_h_8bpc_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ clz r3, r3
+ adr r5, L(ipred_h_tbl)
+ sub r3, r3, #25
+ ldr r3, [r5, r3, lsl #2]
+ sub r2, r2, #4
+ mov lr, #-4
+ add r5, r5, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_h_tbl):
+ .word 640f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_h_tbl) + CONFIG_THUMB
+4:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr
+ vst1.32 {d3[0]}, [r0, :32], r1
+ vst1.32 {d2[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r5, pc}
+8:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr
+ vst1.8 {d3}, [r0, :64], r1
+ vst1.8 {d2}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d1}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160:
+ add r2, r2, #3
+ mov lr, #-1
+16:
+ vld1.8 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.8 {d2[], d3[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128], r1
+ vld1.8 {d4[], d5[]}, [r2], lr
+ vst1.8 {q1}, [r12, :128], r1
+ vld1.8 {d6[], d7[]}, [r2], lr
+ vst1.8 {q2}, [r0, :128], r1
+ vst1.8 {q3}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320:
+ add r2, r2, #3
+ mov lr, #-1
+ sub r1, r1, #16
+32:
+ vld1.8 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.8 {d2[], d3[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128]!
+ vld1.8 {d4[], d5[]}, [r2], lr
+ vst1.8 {q1}, [r12, :128]!
+ vld1.8 {d6[], d7[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128], r1
+ vst1.8 {q1}, [r12, :128], r1
+ vst1.8 {q2}, [r0, :128]!
+ vst1.8 {q3}, [r12, :128]!
+ vst1.8 {q2}, [r0, :128], r1
+ vst1.8 {q3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640:
+ add r2, r2, #3
+ mov lr, #-1
+ sub r1, r1, #48
+64:
+ vld1.8 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.8 {d2[], d3[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128]!
+ vld1.8 {d4[], d5[]}, [r2], lr
+ vst1.8 {q1}, [r12, :128]!
+ vld1.8 {d6[], d7[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128]!
+ vst1.8 {q1}, [r12, :128]!
+ vst1.8 {q0}, [r0, :128]!
+ vst1.8 {q1}, [r12, :128]!
+ vst1.8 {q0}, [r0, :128], r1
+ vst1.8 {q1}, [r12, :128], r1
+ vst1.8 {q2}, [r0, :128]!
+ vst1.8 {q3}, [r12, :128]!
+ vst1.8 {q2}, [r0, :128]!
+ vst1.8 {q3}, [r12, :128]!
+ vst1.8 {q2}, [r0, :128]!
+ vst1.8 {q3}, [r12, :128]!
+ vst1.8 {q2}, [r0, :128], r1
+ vst1.8 {q3}, [r12, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_top_8bpc_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ clz r3, r3
+ adr r5, L(ipred_dc_top_tbl)
+ sub r3, r3, #25
+ ldr r3, [r5, r3, lsl #2]
+ add r2, r2, #1
+ add r5, r5, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_dc_top_tbl):
+ .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+40:
+ vld1.32 {d0[]}, [r2]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #2
+ vdup.8 d0, d0[0]
+4:
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r5, pc}
+80:
+ vld1.8 {d0}, [r2]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #3
+ vdup.8 d0, d0[0]
+8:
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160:
+ vld1.8 {d0, d1}, [r2]
+ vaddl.u8 q0, d0, d1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #4
+ vdup.8 q0, d0[0]
+16:
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320:
+ vld1.8 {d0, d1, d2, d3}, [r2]
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d4, q0, #5
+ vdup.8 q0, d4[0]
+ vdup.8 q1, d4[0]
+32:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640:
+ vld1.8 {d0, d1, d2, d3}, [r2]!
+ vaddl.u8 q0, d0, d1
+ vld1.8 {d4, d5, d6, d7}, [r2]
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q0, q1
+ vadd.u16 q1, q2, q3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d18, q0, #6
+ vdup.8 q0, d18[0]
+ vdup.8 q1, d18[0]
+ sub r1, r1, #32
+64:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
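+// A hedged C sketch of the dc_top predictor implemented above (a rounded
+// average of the top row; the vrshrn shift amount is log2(width)):
+//
+//   dc = width >> 1;                     // rounding bias
+//   for (x = 0; x < width; x++)
+//       dc += topleft[1 + x];
+//   dc >>= ctz(width);                   // width is a power of two
+//   for (y = 0; y < height; y++)
+//       memset(&dst[y * stride], dc, width);
+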
+// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_left_8bpc_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ sub r2, r2, r4
+ clz r3, r3
+ clz lr, r4
+ sub lr, lr, #25
+ adr r5, L(ipred_dc_left_tbl)
+ sub r3, r3, #20
+ ldr r3, [r5, r3, lsl #2]
+ ldr lr, [r5, lr, lsl #2]
+ add r3, r5, r3
+ add r5, r5, lr
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_dc_left_tbl):
+ .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+
+L(ipred_dc_left_h4):
+ vld1.32 {d0[]}, [r2, :32]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #2
+ vdup.8 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w4):
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt L(ipred_dc_left_w4)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h8):
+ vld1.8 {d0}, [r2, :64]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #3
+ vdup.8 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w8):
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt L(ipred_dc_left_w8)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h16):
+ vld1.8 {d0, d1}, [r2, :128]
+ vaddl.u8 q0, d0, d1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #4
+ vdup.8 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w16):
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt L(ipred_dc_left_w16)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h32):
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #5
+ vdup.8 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w32):
+ vmov.8 q1, q0
+1:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+L(ipred_dc_left_h64):
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.8 {d4, d5, d6, d7}, [r2, :128]
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q0, q1
+ vadd.u16 q1, q2, q3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #6
+ vdup.8 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w64):
+ vmov.8 q1, q0
+ sub r1, r1, #32
+1:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_8bpc_neon, export=1
+ push {r4-r6, lr}
+ ldr r4, [sp, #16]
+ sub r2, r2, r4
+ add lr, r3, r4 // width + height
+ clz r3, r3
+ clz r12, r4
+ vdup.16 q15, lr // width + height
+ adr r5, L(ipred_dc_tbl)
+ rbit lr, lr // rbit(width + height)
+ sub r3, r3, #20 // 25 leading bits, minus table offset 5
+ sub r12, r12, #25
+ clz lr, lr // ctz(width + height)
+ ldr r3, [r5, r3, lsl #2]
+ ldr r12, [r5, r12, lsl #2]
+ neg lr, lr // -ctz(width + height)
+ add r3, r5, r3
+ add r5, r5, r12
+ vshr.u16 q15, q15, #1 // (width + height) >> 1
+ vdup.16 q14, lr // -ctz(width + height)
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_dc_tbl):
+ .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB
+
+L(ipred_dc_h4):
+ vld1.32 {d0[]}, [r2, :32]!
+ vpaddl.u8 d0, d0
+ add r2, r2, #1
+ vpadd.u16 d0, d0
+ bx r3
+L(ipred_dc_w4):
+ vld1.32 {d1[]}, [r2]
+ vadd.s16 d0, d0, d30
+ vpaddl.u8 d1, d1
+ vpadd.u16 d1, d1
+ cmp r4, #4
+ vadd.s16 d0, d0, d1
+ vshl.u16 d0, d0, d28
+ beq 1f
+ // h = 8/16
+ movw lr, #(0x3334/2)
+ movw r5, #(0x5556/2)
+ cmp r4, #16
+ it ne
+ movne lr, r5
+ vdup.16 d30, lr
+ vqdmulh.s16 d0, d0, d30
+1:
+ vdup.8 d0, d0[0]
+2:
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h8):
+ vld1.8 {d0}, [r2, :64]!
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ add r2, r2, #1
+ vpadd.u16 d0, d0
+ bx r3
+L(ipred_dc_w8):
+ vld1.8 {d2}, [r2]
+ vadd.s16 d0, d0, d30
+ vpaddl.u8 d2, d2
+ vpadd.u16 d2, d2
+ vpadd.u16 d2, d2
+ cmp r4, #8
+ vadd.s16 d0, d0, d2
+ vshl.u16 d0, d0, d28
+ beq 1f
+ // h = 4/16/32
+ cmp r4, #32
+ movw lr, #(0x3334/2)
+ movw r5, #(0x5556/2)
+ it ne
+ movne lr, r5
+ vdup.16 d24, lr
+ vqdmulh.s16 d0, d0, d24
+1:
+ vdup.8 d0, d0[0]
+2:
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h16):
+ vld1.8 {d0, d1}, [r2, :128]!
+ vaddl.u8 q0, d0, d1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ add r2, r2, #1
+ vpadd.u16 d0, d0
+ bx r3
+L(ipred_dc_w16):
+ vld1.8 {d2, d3}, [r2]
+ vadd.s16 d0, d0, d30
+ vaddl.u8 q1, d2, d3
+ vadd.u16 d2, d2, d3
+ vpadd.u16 d2, d2
+ vpadd.u16 d2, d2
+ cmp r4, #16
+ vadd.s16 d0, d0, d2
+ vshl.u16 d0, d0, d28
+ beq 1f
+ // h = 4/8/32/64
+ tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
+ movw lr, #(0x3334/2)
+ movw r5, #(0x5556/2)
+ it ne
+ movne lr, r5
+ vdup.16 d24, lr
+ vqdmulh.s16 d0, d0, d24
+1:
+ vdup.8 q0, d0[0]
+2:
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h32):
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]!
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ add r2, r2, #1
+ vpadd.u16 d0, d0
+ bx r3
+L(ipred_dc_w32):
+ vld1.8 {d2, d3, d4, d5}, [r2]
+ vadd.s16 d0, d0, d30
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vadd.u16 q1, q1, q2
+ vadd.u16 d2, d2, d3
+ vpadd.u16 d2, d2
+ vpadd.u16 d2, d2
+ cmp r4, #32
+ vadd.s16 d0, d0, d2
+ vshl.u16 d4, d0, d28
+ beq 1f
+ // h = 8/16/64
+ cmp r4, #8
+ movw lr, #(0x3334/2)
+ movw r5, #(0x5556/2)
+ it ne
+ movne lr, r5
+ vdup.16 d24, lr
+ vqdmulh.s16 d4, d4, d24
+1:
+ vdup.8 q0, d4[0]
+ vdup.8 q1, d4[0]
+2:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h64):
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]!
+ vaddl.u8 q0, d0, d1
+ vld1.8 {d4, d5, d6, d7}, [r2, :128]!
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q0, q1
+ vadd.u16 q1, q2, q3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ add r2, r2, #1
+ vpadd.u16 d0, d0
+ bx r3
+L(ipred_dc_w64):
+ vld1.8 {d2, d3, d4, d5}, [r2]!
+ vadd.s16 d0, d0, d30
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q1, d2, d3
+ vadd.u16 d4, d4, d5
+ vadd.u16 d2, d2, d3
+ vld1.8 {d16, d17, d18, d19}, [r2]
+ vpadd.u16 d4, d4
+ vpadd.u16 d2, d2
+ vpadd.u16 d4, d4
+ vpadd.u16 d2, d2
+ vaddl.u8 q8, d16, d17
+ vaddl.u8 q9, d18, d19
+ vadd.u16 d16, d16, d17
+ vadd.u16 d18, d18, d19
+ vpadd.u16 d16, d16
+ vpadd.u16 d18, d18
+ vpadd.u16 d16, d16
+ vpadd.u16 d18, d18
+ vadd.u16 d2, d2, d4
+ vadd.u16 d3, d16, d18
+ cmp r4, #64
+ vadd.s16 d0, d0, d2
+ vadd.s16 d0, d0, d3
+ vshl.u16 d18, d0, d28
+ beq 1f
+ // h = 16/32
+ movw lr, #(0x5556/2)
+ movt lr, #(0x3334/2)
+ and r5, r4, #31
+ lsr lr, lr, r5
+ vdup.16 d30, lr
+ vqdmulh.s16 d18, d18, d30
+1:
+ sub r1, r1, #32
+ vdup.8 q0, d18[0]
+ vdup.8 q1, d18[0]
+2:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+endfunc
+
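+// The rectangular cases above first shift the sum of width + height pixels by
+// ctz(width + height), which only divides by the power-of-two factor; the
+// remaining 1/3 or 1/5 is applied with vqdmulh, since vqdmulh(a, b) is
+// (2 * a * b) >> 16:
+//   0x5556/2: a * 0x5556 >> 16 ~= a / 3   (2:1 and 1:2 blocks)
+//   0x3334/2: a * 0x3334 >> 16 ~= a / 5   (4:1 and 1:4 blocks)
+// which is intended to match the exact division over the sums that can occur
+// here. A hedged C sketch of the whole predictor:
+//
+//   sum = (width + height) >> 1;          // rounding bias
+//   for (x = 0; x < width;  x++) sum += topleft[1 + x];
+//   for (y = 0; y < height; y++) sum += topleft[-(1 + y)];
+//   dc = sum / (width + height);
+//   for (y = 0; y < height; y++)
+//       memset(&dst[y * stride], dc, width);
+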
+// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_paeth_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldr r4, [sp, #24]
+ clz lr, r3
+ adr r5, L(ipred_paeth_tbl)
+ sub lr, lr, #25
+ ldr lr, [r5, lr, lsl #2]
+ vld1.8 {d4[], d5[]}, [r2]
+ add r8, r2, #1
+ sub r2, r2, #4
+ add r5, r5, lr
+ mov r7, #-4
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_paeth_tbl):
+ .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d6[], d7[]}, [r8]
+ vsubl.u8 q8, d6, d4 // top - topleft
+4:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7
+ vzip.32 d0, d1
+ vzip.32 d2, d3
+ vaddw.u8 q9, q8, d0
+ vaddw.u8 q10, q8, d2
+ vqmovun.s16 d18, q9 // base
+ vqmovun.s16 d19, q10
+ vmov d1, d2
+ vabd.u8 q10, q3, q9 // tdiff
+ vabd.u8 q11, q2, q9 // tldiff
+ vabd.u8 q9, q0, q9 // ldiff
+ vmin.u8 q12, q10, q11 // min(tdiff, tldiff)
+ vcge.u8 q10, q11, q10 // tldiff >= tdiff
+ vcge.u8 q9, q12, q9 // min(tdiff, tldiff) >= ldiff
+ vbsl q10, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbit q10, q0, q9 // ldiff <= min ? left : ...
+ vst1.32 {d21[1]}, [r0, :32], r1
+ vst1.32 {d21[0]}, [r6, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d20[1]}, [r0, :32], r1
+ vst1.32 {d20[0]}, [r6, :32], r1
+ bgt 4b
+ pop {r4-r8, pc}
+80:
+ vld1.8 {d6}, [r8]
+ vsubl.u8 q8, d6, d4 // top - topleft
+ vmov d7, d6
+8:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7
+ vaddw.u8 q9, q8, d0
+ vaddw.u8 q10, q8, d1
+ vaddw.u8 q11, q8, d2
+ vaddw.u8 q12, q8, d3
+ vqmovun.s16 d18, q9 // base
+ vqmovun.s16 d19, q10
+ vqmovun.s16 d20, q11
+ vqmovun.s16 d21, q12
+ vabd.u8 q11, q3, q9 // tdiff
+ vabd.u8 q12, q3, q10
+ vabd.u8 q13, q2, q9 // tldiff
+ vabd.u8 q14, q2, q10
+ vabd.u8 q10, q1, q10 // ldiff
+ vabd.u8 q9, q0, q9
+ vmin.u8 q15, q12, q14 // min(tdiff, tldiff)
+ vcge.u8 q12, q14, q12 // tldiff >= tdiff
+ vmin.u8 q14, q11, q13 // min(tdiff, tldiff)
+ vcge.u8 q11, q13, q11 // tldiff >= tdiff
+ vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff
+ vcge.u8 q9, q14, q9
+ vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbsl q11, q3, q2
+ vbit q12, q1, q10 // ldiff <= min ? left : ...
+ vbit q11, q0, q9
+ vst1.8 {d25}, [r0, :64], r1
+ vst1.8 {d24}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d23}, [r0, :64], r1
+ vst1.8 {d22}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r8, pc}
+160:
+320:
+640:
+ vld1.8 {d6}, [r8]!
+ mov r12, r3
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1
+ add lr, r6, r1
+ lsl r1, r1, #1
+ sub r1, r1, r3
+1:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7
+2:
+ vsubl.u8 q8, d6, d4 // top - topleft
+ vmov d7, d6
+ vaddw.u8 q9, q8, d0
+ vaddw.u8 q10, q8, d1
+ vaddw.u8 q11, q8, d2
+ vaddw.u8 q12, q8, d3
+ vqmovun.s16 d18, q9 // base
+ vqmovun.s16 d19, q10
+ vqmovun.s16 d20, q11
+ vqmovun.s16 d21, q12
+ vabd.u8 q11, q3, q9 // tdiff
+ vabd.u8 q12, q3, q10
+ vabd.u8 q13, q2, q9 // tldiff
+ vabd.u8 q14, q2, q10
+ vabd.u8 q10, q1, q10 // ldiff
+ vabd.u8 q9, q0, q9
+ vmin.u8 q15, q12, q14 // min(tdiff, tldiff)
+ vcge.u8 q12, q14, q12 // tldiff >= tdiff
+ vmin.u8 q14, q11, q13 // min(tdiff, tldiff)
+ vcge.u8 q11, q13, q11 // tldiff >= tdiff
+ vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff
+ vcge.u8 q9, q14, q9
+ vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbsl q11, q3, q2
+ vbit q12, q1, q10 // ldiff <= min ? left : ...
+ vbit q11, q0, q9
+ subs r3, r3, #8
+ vst1.8 {d25}, [r0, :64]!
+ vst1.8 {d24}, [r6, :64]!
+ vst1.8 {d23}, [r5, :64]!
+ vst1.8 {d22}, [lr, :64]!
+ ble 8f
+ vld1.8 {d6}, [r8]!
+ b 2b
+8:
+ subs r4, r4, #4
+ ble 9f
+ // End of horizontal loop, move pointers to next four rows
+ sub r8, r8, r12
+ add r0, r0, r1
+ add r6, r6, r1
+ vld1.8 {d6}, [r8]!
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12
+ b 1b
+9:
+ pop {r4-r8, pc}
+endfunc
+
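+// A hedged C sketch of the Paeth predictor that the ldiff/tdiff/tldiff
+// comparisons above implement, per output pixel:
+//
+//   base   = left + top - topleft;
+//   ldiff  = abs(left    - base);
+//   tdiff  = abs(top     - base);
+//   tldiff = abs(topleft - base);
+//   // prefer left, then top, then topleft
+//   dst[x] = ldiff <= tdiff && ldiff <= tldiff ? left :
+//            tdiff <= tldiff                   ? top  : topleft;
+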
+// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_8bpc_neon, export=1
+ push {r4-r10, lr}
+ ldr r4, [sp, #32]
+ movrel r10, X(sm_weights)
+ add r12, r10, r4
+ add r10, r10, r3
+ clz r9, r3
+ adr r5, L(ipred_smooth_tbl)
+ sub lr, r2, r4
+ sub r9, r9, #25
+ ldr r9, [r5, r9, lsl #2]
+ vld1.8 {d4[]}, [lr] // bottom
+ add r8, r2, #1
+ add r5, r5, r9
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_smooth_tbl):
+ .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d16[]}, [r8] // top
+ vld1.32 {d18[]}, [r10, :32] // weights_hor
+ sub r2, r2, #4
+ mov r7, #-4
+ vdup.8 q3, d16[3] // right
+ vsubl.u8 q8, d16, d4 // top-bottom
+ vmovl.u8 q9, d18 // weights_hor
+4:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left
+ vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! // weights_ver
+ vshll.i8 q12, d6, #8 // right*256
+ vshll.i8 q13, d6, #8
+ vzip.32 d1, d0 // left, flipped
+ vzip.32 d3, d2
+ vzip.32 d20, d21 // weights_ver
+ vzip.32 d22, d23
+ vshll.i8 q14, d4, #8 // bottom*256
+ vshll.i8 q15, d4, #8
+ vsubl.u8 q0, d1, d6 // left-right
+ vsubl.u8 q1, d3, d6
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+ vmla.i16 q12, q1, q9 // right*256 + (left-right)*weights_hor
+ vmla.i16 q13, q0, q9 // (left flipped)
+ vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q15, q8, q11
+ vhadd.u16 q12, q12, q14
+ vhadd.u16 q13, q13, q15
+ vrshrn.i16 d24, q12, #8
+ vrshrn.i16 d25, q13, #8
+ vst1.32 {d24[0]}, [r0, :32], r1
+ vst1.32 {d24[1]}, [r6, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d25[0]}, [r0, :32], r1
+ vst1.32 {d25[1]}, [r6, :32], r1
+ bgt 4b
+ pop {r4-r10, pc}
+80:
+ vld1.8 {d16}, [r8] // top
+ vld1.8 {d18}, [r10, :64] // weights_hor
+ sub r2, r2, #2
+ mov r7, #-2
+ vdup.8 q3, d16[7] // right
+ vsubl.u8 q8, d16, d4 // top-bottom
+ vmovl.u8 q9, d18 // weights_hor
+8:
+ vld2.8 {d0[], d1[]}, [r2, :16], r7 // left
+ vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver
+ vshll.i8 q12, d6, #8 // right*256
+ vshll.i8 q13, d6, #8
+ vshll.i8 q14, d4, #8 // bottom*256
+ vshll.i8 q15, d4, #8
+ vsubl.u8 q1, d0, d6 // left-right (left flipped)
+ vsubl.u8 q0, d1, d6
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+ vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor
+ vmla.i16 q13, q1, q9
+ vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q15, q8, q11
+ vhadd.u16 q12, q12, q14
+ vhadd.u16 q13, q13, q15
+ vrshrn.i16 d24, q12, #8
+ vrshrn.i16 d25, q13, #8
+ subs r4, r4, #2
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d25}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r10, pc}
+160:
+320:
+640:
+ add lr, r2, r3
+ sub r2, r2, #2
+ mov r7, #-2
+ vld1.8 {d6[], d7[]}, [lr] // right
+ sub r1, r1, r3
+ mov r9, r3
+
+1:
+ vld2.8 {d0[], d1[]}, [r2, :16], r7 // left
+ vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver
+ vsubl.u8 q1, d0, d6 // left-right (left flipped)
+ vsubl.u8 q0, d1, d6
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+2:
+ vld1.8 {d16}, [r8]! // top
+ vld1.8 {d18}, [r10, :64]! // weights_hor
+ vshll.i8 q12, d6, #8 // right*256
+ vshll.i8 q13, d6, #8
+ vmovl.u8 q9, d18 // weights_hor
+ vshll.i8 q14, d4, #8 // bottom*256
+ vshll.i8 q15, d4, #8
+ vsubl.u8 q8, d16, d4 // top-bottom
+ vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor
+ vmla.i16 q13, q1, q9
+ vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q15, q8, q11
+ vhadd.u16 q12, q12, q14
+ vhadd.u16 q13, q13, q15
+ vrshrn.i16 d24, q12, #8
+ vrshrn.i16 d25, q13, #8
+ subs r3, r3, #8
+ vst1.8 {d24}, [r0, :64]!
+ vst1.8 {d25}, [r6, :64]!
+ bgt 2b
+ subs r4, r4, #2
+ ble 9f
+ sub r8, r8, r9
+ sub r10, r10, r9
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, r9
+ b 1b
+9:
+ pop {r4-r10, pc}
+endfunc
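For readers following the weight math in the comments above, here is a minimal scalar C sketch of the SMOOTH blend those vectors compute (the _v and _h variants that follow keep only the vertical or horizontal term and round with >> 8 instead of >> 9). This is illustrative only: the function name is made up, and only dav1d_sm_weights (the table loaded via movrel/X(sm_weights) above) is a real dav1d symbol.

    #include <stddef.h>
    #include <stdint.h>

    extern const uint8_t dav1d_sm_weights[];   // X(sm_weights) above

    static void smooth_blend_sketch(uint8_t *dst, const ptrdiff_t stride,
                                    const uint8_t *const topleft,
                                    const int w, const int h)
    {
        const uint8_t *const top   = &topleft[1];
        const uint8_t *const w_hor = &dav1d_sm_weights[w];
        const uint8_t *const w_ver = &dav1d_sm_weights[h];
        const int right  = top[w - 1];     // vdup.8 ...      // right
        const int bottom = topleft[-h];    // vld1.8 {d4[]}   // bottom

        for (int y = 0; y < h; y++, dst += stride) {
            const int left = topleft[-1 - y];
            for (int x = 0; x < w; x++) {
                // right*256 + (left-right)*weights_hor
                const int hor = right  * 256 + (left   - right)  * w_hor[x];
                // bottom*256 + (top-bottom)*weights_ver
                const int ver = bottom * 256 + (top[x] - bottom) * w_ver[y];
                // vhadd.u16 followed by vrshrn.i16 #8 matches this rounding
                dst[x] = (uint8_t)((hor + ver + 256) >> 9);
            }
        }
    }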
+
+// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_v_8bpc_neon, export=1
+ push {r4-r7, lr}
+ ldr r4, [sp, #20]
+ movrel r7, X(sm_weights)
+ add r7, r7, r4
+ clz lr, r3
+ adr r5, L(ipred_smooth_v_tbl)
+ sub r12, r2, r4
+ sub lr, lr, #25
+ ldr lr, [r5, lr, lsl #2]
+ vld1.8 {d4[]}, [r12] // bottom
+ add r2, r2, #1
+ add r5, r5, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_smooth_v_tbl):
+ .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d6[]}, [r2] // top
+ vsubl.u8 q3, d6, d4 // top-bottom
+4:
+ vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! // weights_ver
+ vshll.i8 q10, d4, #8 // bottom*256
+ vshll.i8 q11, d4, #8
+ vzip.32 d16, d17 // weights_ver
+ vzip.32 d18, d19
+ vmovl.u8 q8, d16 // weights_ver
+ vmovl.u8 q9, d18
+ subs r4, r4, #4
+ vmla.i16 q10, q3, q8 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q11, q3, q9
+ vrshrn.i16 d20, q10, #8
+ vrshrn.i16 d21, q11, #8
+ vst1.32 {d20[0]}, [r0, :32], r1
+ vst1.32 {d20[1]}, [r6, :32], r1
+ vst1.32 {d21[0]}, [r0, :32], r1
+ vst1.32 {d21[1]}, [r6, :32], r1
+ bgt 4b
+ pop {r4-r7, pc}
+80:
+ vld1.8 {d6}, [r2] // top
+ vsubl.u8 q3, d6, d4 // top-bottom
+8:
+ vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver
+ vshll.i8 q12, d4, #8 // bottom*256
+ vshll.i8 q13, d4, #8
+ vshll.i8 q14, d4, #8
+ vshll.i8 q15, d4, #8
+ vmovl.u8 q8, d16 // weights_ver
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+ vmla.i16 q12, q3, q8 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q13, q3, q9
+ vmla.i16 q14, q3, q10
+ vmla.i16 q15, q3, q11
+ vrshrn.i16 d24, q12, #8
+ vrshrn.i16 d25, q13, #8
+ vrshrn.i16 d26, q14, #8
+ vrshrn.i16 d27, q15, #8
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d25}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d26}, [r0, :64], r1
+ vst1.8 {d27}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r7, pc}
+160:
+320:
+640:
+ vpush {q4-q7}
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1
+ add lr, r6, r1
+ lsl r1, r1, #1
+ sub r1, r1, r3
+ mov r12, r3
+
+1:
+ vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver
+ vmovl.u8 q4, d8 // weights_ver
+ vmovl.u8 q5, d10
+ vmovl.u8 q6, d12
+ vmovl.u8 q7, d14
+2:
+ vld1.8 {q3}, [r2]! // top
+ vshll.i8 q8, d4, #8 // bottom*256
+ vshll.i8 q9, d4, #8
+ vshll.i8 q10, d4, #8
+ vshll.i8 q11, d4, #8
+ vsubl.u8 q0, d6, d4 // top-bottom
+ vsubl.u8 q1, d7, d4
+ vshll.i8 q12, d4, #8
+ vshll.i8 q13, d4, #8
+ vshll.i8 q14, d4, #8
+ vshll.i8 q15, d4, #8
+ vmla.i16 q8, q0, q4 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q9, q1, q4
+ vmla.i16 q10, q0, q5
+ vmla.i16 q11, q1, q5
+ vmla.i16 q12, q0, q6 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q13, q1, q6
+ vmla.i16 q14, q0, q7
+ vmla.i16 q15, q1, q7
+ vrshrn.i16 d16, q8, #8
+ vrshrn.i16 d17, q9, #8
+ vrshrn.i16 d18, q10, #8
+ vrshrn.i16 d19, q11, #8
+ vrshrn.i16 d20, q12, #8
+ vrshrn.i16 d21, q13, #8
+ vrshrn.i16 d22, q14, #8
+ vrshrn.i16 d23, q15, #8
+ subs r3, r3, #16
+ vst1.8 {q8}, [r0, :128]!
+ vst1.8 {q9}, [r6, :128]!
+ vst1.8 {q10}, [r5, :128]!
+ vst1.8 {q11}, [lr, :128]!
+ bgt 2b
+ subs r4, r4, #4
+ ble 9f
+ sub r2, r2, r12
+ add r0, r0, r1
+ add r6, r6, r1
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r7, pc}
+endfunc
+
+// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_h_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldr r4, [sp, #24]
+ movrel r8, X(sm_weights)
+ add r8, r8, r3
+ clz lr, r3
+ adr r5, L(ipred_smooth_h_tbl)
+ add r12, r2, r3
+ sub lr, lr, #25
+ ldr lr, [r5, lr, lsl #2]
+ vld1.8 {d4[]}, [r12] // right
+ add r5, r5, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_smooth_h_tbl):
+ .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d6[]}, [r8, :32] // weights_hor
+ sub r2, r2, #4
+ mov r7, #-4
+ vmovl.u8 q3, d6 // weights_hor
+4:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left
+ vshll.i8 q8, d4, #8 // right*256
+ vshll.i8 q9, d4, #8
+ vzip.32 d3, d2 // left, flipped
+ vzip.32 d1, d0
+ vsubl.u8 q1, d3, d4 // left-right
+ vsubl.u8 q0, d1, d4
+ subs r4, r4, #4
+ vmla.i16 q8, q1, q3 // right*256 + (left-right)*weights_hor
+ vmla.i16 q9, q0, q3
+ vrshrn.i16 d16, q8, #8
+ vrshrn.i16 d17, q9, #8
+ vst1.32 {d16[0]}, [r0, :32], r1
+ vst1.32 {d16[1]}, [r6, :32], r1
+ vst1.32 {d17[0]}, [r0, :32], r1
+ vst1.32 {d17[1]}, [r6, :32], r1
+ bgt 4b
+ pop {r4-r8, pc}
+80:
+ vld1.8 {d6}, [r8, :64] // weights_hor
+ sub r2, r2, #4
+ mov r7, #-4
+ vmovl.u8 q3, d6 // weights_hor
+8:
+ vld4.8 {d16[], d18[], d20[], d22[]}, [r2, :32], r7 // left
+ vshll.i8 q12, d4, #8 // right*256
+ vshll.i8 q13, d4, #8
+ vshll.i8 q14, d4, #8
+ vshll.i8 q15, d4, #8
+ vsubl.u8 q11, d22, d4 // left-right
+ vsubl.u8 q10, d20, d4
+ vsubl.u8 q9, d18, d4
+ vsubl.u8 q8, d16, d4
+ vmla.i16 q12, q11, q3 // right*256 + (left-right)*weights_hor
+ vmla.i16 q13, q10, q3 // (left flipped)
+ vmla.i16 q14, q9, q3
+ vmla.i16 q15, q8, q3
+ vrshrn.i16 d24, q12, #8
+ vrshrn.i16 d25, q13, #8
+ vrshrn.i16 d26, q14, #8
+ vrshrn.i16 d27, q15, #8
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d25}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d26}, [r0, :64], r1
+ vst1.8 {d27}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r8, pc}
+160:
+320:
+640:
+ vpush {q4-q7}
+ sub r2, r2, #4
+ mov r7, #-4
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1
+ add lr, r6, r1
+ lsl r1, r1, #1
+ sub r1, r1, r3
+ mov r12, r3
+
+1:
+ vld4.8 {d8[], d10[], d12[], d14[]}, [r2, :32], r7 // left
+ vsubl.u8 q4, d8, d4 // left-right
+ vsubl.u8 q5, d10, d4
+ vsubl.u8 q6, d12, d4
+ vsubl.u8 q7, d14, d4
+2:
+ vld1.8 {q1}, [r8, :128]! // weights_hor
+ vshll.i8 q8, d4, #8 // right*256
+ vshll.i8 q9, d4, #8
+ vshll.i8 q10, d4, #8
+ vshll.i8 q11, d4, #8
+ vmovl.u8 q0, d2 // weights_hor
+ vmovl.u8 q1, d3
+ vshll.i8 q12, d4, #8
+ vshll.i8 q13, d4, #8
+ vshll.i8 q14, d4, #8
+ vshll.i8 q15, d4, #8
+ vmla.i16 q8, q7, q0 // right*256 + (left-right)*weights_hor
+ vmla.i16 q9, q7, q1 // (left flipped)
+ vmla.i16 q10, q6, q0
+ vmla.i16 q11, q6, q1
+ vmla.i16 q12, q5, q0
+ vmla.i16 q13, q5, q1
+ vmla.i16 q14, q4, q0
+ vmla.i16 q15, q4, q1
+ vrshrn.i16 d16, q8, #8
+ vrshrn.i16 d17, q9, #8
+ vrshrn.i16 d18, q10, #8
+ vrshrn.i16 d19, q11, #8
+ vrshrn.i16 d20, q12, #8
+ vrshrn.i16 d21, q13, #8
+ vrshrn.i16 d22, q14, #8
+ vrshrn.i16 d23, q15, #8
+ subs r3, r3, #16
+ vst1.8 {q8}, [r0, :128]!
+ vst1.8 {q9}, [r6, :128]!
+ vst1.8 {q10}, [r5, :128]!
+ vst1.8 {q11}, [lr, :128]!
+ bgt 2b
+ subs r4, r4, #4
+ ble 9f
+ sub r8, r8, r12
+ add r0, r0, r1
+ add r6, r6, r1
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+endfunc
+
+// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int filt_idx,
+// const int max_width, const int max_height);
+function ipred_filter_8bpc_neon, export=1
+ push {r4-r8, lr}
+ movw r12, #511
+ ldrd r4, r5, [sp, #24]
+ and r5, r5, r12 // 511
+ movrel r6, X(filter_intra_taps)
+ lsl r5, r5, #6
+ add r6, r6, r5
+ vld1.8 {d20, d21, d22, d23}, [r6, :128]!
+ clz lr, r3
+ adr r5, L(ipred_filter_tbl)
+ vld1.8 {d27, d28, d29}, [r6, :64]
+ sub lr, lr, #26
+ ldr lr, [r5, lr, lsl #2]
+ vmovl.s8 q8, d20
+ vmovl.s8 q9, d21
+ add r5, r5, lr
+ vmovl.s8 q10, d22
+ vmovl.s8 q11, d23
+ add r6, r0, r1
+ lsl r1, r1, #1
+ vmovl.s8 q12, d27
+ vmovl.s8 q13, d28
+ vmovl.s8 q14, d29
+ add r8, r2, #1
+ sub r2, r2, #2
+ mov r7, #-2
+ bx r5
+
+ .align 2
+L(ipred_filter_tbl):
+ .word 320f - L(ipred_filter_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_filter_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_filter_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_filter_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d0[]}, [r8] // top (0-3)
+ vmovl.u8 q0, d0 // top (0-3)
+4:
+ vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2)
+ vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1)
+ vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2)
+ vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3)
+ vmovl.u8 q1, d2 // left (0-1) + topleft (2)
+ vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4)
+ vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0)
+ vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5)
+ vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6)
+ vqrshrun.s16 d4, q2, #4
+ subs r4, r4, #2
+ vst1.32 {d4[0]}, [r0, :32], r1
+ vmovl.u8 q0, d4
+ vst1.32 {d4[1]}, [r6, :32], r1
+ vmov d0, d1 // move top from [4-7] to [0-3]
+ bgt 4b
+ pop {r4-r8, pc}
+80:
+ vld1.8 {d0}, [r8] // top (0-7)
+ vmovl.u8 q0, d0 // top (0-7)
+8:
+ vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2)
+ vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1)
+ vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2)
+ vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3)
+ vmovl.u8 q1, d2 // left (0-1) + topleft (2)
+ vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4)
+ vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0)
+ vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5)
+ vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6)
+ vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1)
+ vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2)
+ vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3)
+ vqrshrun.s16 d4, q2, #4
+ vmovl.u8 q1, d4 // first block, in 16 bit
+ vmla.i16 q3, q12, d1[3] // p4(top[3]) * filter(4)
+ vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0)
+ vmla.i16 q3, q13, d2[3] // p5(left[0]) * filter(5)
+ vmla.i16 q3, q14, d3[3] // p6(left[1]) * filter(6)
+ vqrshrun.s16 d5, q3, #4
+ vzip.32 d4, d5
+ subs r4, r4, #2
+ vst1.8 {d4}, [r0, :64], r1
+ vmovl.u8 q0, d5
+ vst1.8 {d5}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r8, pc}
+160:
+320:
+ vpush {q4-q5}
+ sub r1, r1, r3
+ mov lr, r3
+
+1:
+ vld1.32 {d0[]}, [r2], r7 // left (0-1) + topleft (2)
+ vmovl.u8 q0, d0 // left (0-1) + topleft (2)
+2:
+ vld1.8 {q2}, [r8]! // top(0-15)
+ vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0)
+ vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5)
+ vmovl.u8 q1, d4 // top(0-7)
+ vmovl.u8 q2, d5 // top(8-15)
+ vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6)
+ vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1)
+ vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2)
+ vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3)
+ vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4)
+
+ vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1)
+ vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2)
+ vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3)
+ vqrshrun.s16 d6, q3, #4
+ vmovl.u8 q0, d6 // first block, in 16 bit
+ vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4)
+ vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0)
+ vmla.i16 q4, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q4, q14, d1[3] // p6(left[1]) * filter(6)
+
+ vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1)
+ vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2)
+ vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3)
+ vqrshrun.s16 d7, q4, #4
+ vmovl.u8 q0, d7 // second block, in 16 bit
+ vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4)
+ vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0)
+ vmla.i16 q5, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6)
+
+ vmul.i16 q15, q9, d5[0] // p1(top[0]) * filter(1)
+ vmla.i16 q15, q10, d5[1] // p2(top[1]) * filter(2)
+ vmla.i16 q15, q11, d5[2] // p3(top[2]) * filter(3)
+ vqrshrun.s16 d8, q5, #4
+ vmovl.u8 q0, d8 // third block, in 16 bit
+ vmov.u8 r12, d5[6]
+ vmla.i16 q15, q12, d5[3] // p4(top[3]) * filter(4)
+ vmla.i16 q15, q8, d4[3] // p0(topleft) * filter(0)
+ vmla.i16 q15, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q15, q14, d1[3] // p6(left[1]) * filter(6)
+ vmov.8 d0[4], r12
+
+ subs r3, r3, #16
+ vqrshrun.s16 d9, q15, #4
+
+ vst4.32 {d6[0], d7[0], d8[0], d9[0]}, [r0, :128]!
+ vst4.32 {d6[1], d7[1], d8[1], d9[1]}, [r6, :128]!
+ ble 8f
+ vmov.u8 r12, d9[7]
+ vmov.8 d0[0], r12
+ vmov.u8 r12, d9[3]
+ vmov.8 d0[2], r12
+ b 2b
+8:
+ subs r4, r4, #2
+
+ ble 9f
+ sub r8, r6, lr
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, lr
+ b 1b
+9:
+ vpop {q4-q5}
+ pop {r4-r8, pc}
+endfunc
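A rough scalar model of one 4x2 FILTER_INTRA tile as evaluated above. The tap layout (7 groups of 8 int8 taps per filt_idx, one group per input pixel) is inferred from the 56 bytes loaded into q8-q14; the function name is hypothetical.

    #include <stdint.h>

    // One 4x2 tile: p[0] = topleft, p[1..4] = the 4 pixels above the tile,
    // p[5..6] = the 2 pixels to its left. taps points at the 56 int8 values
    // selected by filt_idx (the data widened into q8-q14 above).
    static void filter_intra_tile_sketch(uint8_t out[2][4],
                                         const int8_t *const taps,
                                         const uint8_t p[7])
    {
        for (int i = 0; i < 8; i++) {        // 8 outputs: rows 0 and 1
            int acc = 0;
            for (int j = 0; j < 7; j++)
                acc += taps[8 * j + i] * p[j];
            const int px = (acc + 8) >> 4;   // vqrshrun.s16 #4
            out[i >> 2][i & 3] = px < 0 ? 0 : px > 255 ? 255 : (uint8_t)px;
        }
    }

The register shuffling between iterations feeds each tile's bottom row and right column into the next tile's top and left inputs, which is why the widened outputs (q0/q1) are reused as p1..p6 for the following tile.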
+
+// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const pal, const uint8_t *idx,
+// const int w, const int h);
+function pal_pred_8bpc_neon, export=1
+ push {r4-r5, lr}
+ ldrd r4, r5, [sp, #12]
+ vld1.8 {d0}, [r2, :64]
+ clz lr, r4
+ adr r12, L(pal_pred_tbl)
+ sub lr, lr, #25
+ vmov.i8 q15, #7
+ ldr lr, [r12, lr, lsl #2]
+ add r12, r12, lr
+ add r2, r0, r1
+ bx r12
+
+ .align 2
+L(pal_pred_tbl):
+ .word 640f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 320f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 160f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 80f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 40f - L(pal_pred_tbl) + CONFIG_THUMB
+
+40:
+ lsl r1, r1, #1
+4:
+ vld1.8 {d2}, [r3, :64]!
+ subs r5, r5, #4
+ vshr.u8 d3, d2, #4
+ vand.u8 d2, d2, d30
+ vzip.8 d2, d3
+ vtbl.8 d2, {d0}, d2
+ vtbl.8 d3, {d0}, d3
+ vst1.32 {d2[0]}, [r0, :32], r1
+ vst1.32 {d2[1]}, [r2, :32], r1
+ vst1.32 {d3[0]}, [r0, :32], r1
+ vst1.32 {d3[1]}, [r2, :32], r1
+ bgt 4b
+ pop {r4-r5, pc}
+80:
+ lsl r1, r1, #1
+8:
+ vld1.8 {q1}, [r3, :64]!
+ subs r5, r5, #4
+ vshr.u8 q2, q1, #4
+ vand.u8 q1, q1, q15
+ vzip.8 q1, q2
+ vtbl.8 d2, {d0}, d2
+ vtbl.8 d3, {d0}, d3
+ vst1.8 {d2}, [r0, :64], r1
+ vtbl.8 d4, {d0}, d4
+ vst1.8 {d3}, [r2, :64], r1
+ vtbl.8 d5, {d0}, d5
+ vst1.8 {d4}, [r0, :64], r1
+ vst1.8 {d5}, [r2, :64], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160:
+ lsl r1, r1, #1
+16:
+ vld1.8 {q10, q11}, [r3, :64]!
+ subs r5, r5, #4
+ vand.u8 q8, q10, q15
+ vshr.u8 q9, q10, #4
+ vand.u8 q10, q11, q15
+ vshr.u8 q11, q11, #4
+ vzip.8 q8, q9
+ vzip.8 q10, q11
+ vtbl.8 d16, {d0}, d16
+ vtbl.8 d17, {d0}, d17
+ vtbl.8 d18, {d0}, d18
+ vtbl.8 d19, {d0}, d19
+ vtbl.8 d20, {d0}, d20
+ vtbl.8 d21, {d0}, d21
+ vst1.8 {q8}, [r0, :128], r1
+ vtbl.8 d22, {d0}, d22
+ vst1.8 {q9}, [r2, :128], r1
+ vtbl.8 d23, {d0}, d23
+ vst1.8 {q10}, [r0, :128], r1
+ vst1.8 {q11}, [r2, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320:
+ lsl r1, r1, #1
+32:
+ vld1.8 {q10, q11}, [r3, :64]!
+ subs r5, r5, #2
+ vand.u8 q8, q10, q15
+ vshr.u8 q9, q10, #4
+ vand.u8 q10, q11, q15
+ vshr.u8 q11, q11, #4
+ vzip.8 q8, q9
+ vzip.8 q10, q11
+ vtbl.8 d16, {d0}, d16
+ vtbl.8 d17, {d0}, d17
+ vtbl.8 d18, {d0}, d18
+ vtbl.8 d19, {d0}, d19
+ vtbl.8 d20, {d0}, d20
+ vtbl.8 d21, {d0}, d21
+ vst1.8 {q8, q9}, [r0, :128], r1
+ vtbl.8 d22, {d0}, d22
+ vtbl.8 d23, {d0}, d23
+ vst1.8 {q10, q11}, [r2, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640:
+ sub r1, r1, #32
+64:
+ vld1.8 {q10, q11}, [r3, :64]!
+ subs r5, r5, #1
+ vand.u8 q8, q10, q15
+ vshr.u8 q9, q10, #4
+ vand.u8 q10, q11, q15
+ vshr.u8 q11, q11, #4
+ vzip.8 q8, q9
+ vzip.8 q10, q11
+ vtbl.8 d16, {d0}, d16
+ vtbl.8 d17, {d0}, d17
+ vtbl.8 d18, {d0}, d18
+ vtbl.8 d19, {d0}, d19
+ vtbl.8 d20, {d0}, d20
+ vtbl.8 d21, {d0}, d21
+ vst1.8 {q8, q9}, [r0, :128]!
+ vtbl.8 d22, {d0}, d22
+ vtbl.8 d23, {d0}, d23
+ vst1.8 {q10, q11}, [r0, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
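A scalar sketch of the palette expansion above, assuming the packed-index format the masks imply: two palette indices per byte (3-bit values in the low and high nibbles, low nibble first). The function name is hypothetical.

    #include <stddef.h>
    #include <stdint.h>

    static void pal_pred_sketch(uint8_t *dst, const ptrdiff_t stride,
                                const uint8_t pal[8], const uint8_t *idx,
                                const int w, const int h)
    {
        for (int y = 0; y < h; y++, dst += stride) {
            for (int x = 0; x < w; x += 2) {
                const int b = *idx++;
                dst[x + 0] = pal[b & 7];    // vand with q15 = #7
                dst[x + 1] = pal[b >> 4];   // vshr.u8 #4
            }
        }
    }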
+
+// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_128_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz lr, r3
+ adr r12, L(ipred_cfl_128_tbl)
+ sub lr, lr, #26
+ ldr lr, [r12, lr, lsl #2]
+ vmov.i16 q0, #128 // dc
+ vdup.i16 q1, r6 // alpha
+ add r12, r12, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r12
+
+ .align 2
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+ .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_splat_w4):
+ vld1.16 {q2, q3}, [r5, :128]!
+ vmul.i16 q2, q2, q1 // diff = ac * alpha
+ vmul.i16 q3, q3, q1
+ vshr.s16 q8, q2, #15 // sign = diff >> 15
+ vshr.s16 q9, q3, #15
+ vadd.i16 q2, q2, q8 // diff + sign
+ vadd.i16 q3, q3, q9
+ vrshr.s16 q2, q2, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshr.s16 q3, q3, #6
+ vadd.i16 q2, q2, q0 // dc + apply_sign()
+ vadd.i16 q3, q3, q0
+ vqmovun.s16 d4, q2 // iclip_pixel(dc + apply_sign())
+ vqmovun.s16 d5, q3
+ vst1.32 {d4[0]}, [r0, :32], r1
+ vst1.32 {d4[1]}, [r6, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d5[0]}, [r0, :32], r1
+ vst1.32 {d5[1]}, [r6, :32], r1
+ bgt L(ipred_cfl_splat_w4)
+ pop {r4-r8, pc}
+L(ipred_cfl_splat_w8):
+ vld1.16 {q8, q9}, [r5, :128]!
+ vld1.16 {q10, q11}, [r5, :128]!
+ vmul.i16 q8, q8, q1 // diff = ac * alpha
+ vmul.i16 q9, q9, q1
+ vmul.i16 q10, q10, q1
+ vmul.i16 q11, q11, q1
+ vshr.s16 q12, q8, #15 // sign = diff >> 15
+ vshr.s16 q13, q9, #15
+ vshr.s16 q14, q10, #15
+ vshr.s16 q15, q11, #15
+ vadd.i16 q8, q8, q12 // diff + sign
+ vadd.i16 q9, q9, q13
+ vadd.i16 q10, q10, q14
+ vadd.i16 q11, q11, q15
+ vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshr.s16 q9, q9, #6
+ vrshr.s16 q10, q10, #6
+ vrshr.s16 q11, q11, #6
+ vadd.i16 q8, q8, q0 // dc + apply_sign()
+ vadd.i16 q9, q9, q0
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q0
+ vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign())
+ vqmovun.s16 d17, q9
+ vqmovun.s16 d18, q10
+ vqmovun.s16 d19, q11
+ vst1.8 {d16}, [r0, :64], r1
+ vst1.8 {d17}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d18}, [r0, :64], r1
+ vst1.8 {d19}, [r6, :64], r1
+ bgt L(ipred_cfl_splat_w8)
+ pop {r4-r8, pc}
+L(ipred_cfl_splat_w16):
+ add r12, r5, r3, lsl #1
+ sub r1, r1, r3
+ mov lr, r3
+1:
+ vld1.16 {q8, q9}, [r5, :128]!
+ vmul.i16 q8, q8, q1 // diff = ac * alpha
+ vld1.16 {q10, q11}, [r12, :128]!
+ vmul.i16 q9, q9, q1
+ vmul.i16 q10, q10, q1
+ vmul.i16 q11, q11, q1
+ vshr.s16 q12, q8, #15 // sign = diff >> 15
+ vshr.s16 q13, q9, #15
+ vshr.s16 q14, q10, #15
+ vshr.s16 q15, q11, #15
+ vadd.i16 q8, q8, q12 // diff + sign
+ vadd.i16 q9, q9, q13
+ vadd.i16 q10, q10, q14
+ vadd.i16 q11, q11, q15
+ vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshr.s16 q9, q9, #6
+ vrshr.s16 q10, q10, #6
+ vrshr.s16 q11, q11, #6
+ vadd.i16 q8, q8, q0 // dc + apply_sign()
+ vadd.i16 q9, q9, q0
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q0
+ vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign())
+ vqmovun.s16 d17, q9
+ vqmovun.s16 d18, q10
+ vqmovun.s16 d19, q11
+ subs r3, r3, #16
+ vst1.16 {q8}, [r0, :128]!
+ vst1.16 {q9}, [r6, :128]!
+ bgt 1b
+ subs r4, r4, #2
+ add r5, r5, lr, lsl #1
+ add r12, r12, lr, lsl #1
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, lr
+ bgt 1b
+ pop {r4-r8, pc}
+endfunc
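A scalar sketch of the per-pixel CfL reconstruction performed by the splat loops above; the 128/top/left/dc entry points differ only in how dc is derived before falling through to these loops. The helper name is hypothetical.

    #include <stdint.h>

    static inline uint8_t cfl_apply_sketch(const int dc, const int ac,
                                           const int alpha)
    {
        const int diff = ac * alpha;                 // vmul.i16
        const int sign = diff < 0 ? -1 : 0;          // vshr.s16 #15
        const int scaled = (diff + sign + 32) >> 6;  // vrshr.s16 #6 on diff+sign
        const int px = dc + scaled;                  // vadd.i16
        return px < 0 ? 0 : px > 255 ? 255 : (uint8_t)px;  // vqmovun.s16
    }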
+
+// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_top_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz lr, r3
+ adr r12, L(ipred_cfl_top_tbl)
+ sub lr, lr, #26
+ ldr lr, [r12, lr, lsl #2]
+ vdup.16 q1, r6 // alpha
+ add r2, r2, #1
+ add r12, r12, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r12
+
+ .align 2
+L(ipred_cfl_top_tbl):
+ .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+
+4:
+ vld1.32 {d0[]}, [r2]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #2
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w4)
+8:
+ vld1.8 {d0}, [r2]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #3
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w8)
+16:
+ vld1.8 {q0}, [r2]
+ vaddl.u8 q0, d0, d1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #4
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+32:
+ vld1.8 {q2, q3}, [r2]
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q2, q3
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #5
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+endfunc
+
+// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_left_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ sub r2, r2, r4
+ clz lr, r3
+ clz r8, r4
+ adr r12, L(ipred_cfl_splat_tbl)
+ adr r7, L(ipred_cfl_left_tbl)
+ sub lr, lr, #26
+ sub r8, r8, #26
+ ldr lr, [r12, lr, lsl #2]
+ ldr r8, [r7, r8, lsl #2]
+ vdup.16 q1, r6 // alpha
+ add r12, r12, lr
+ add r7, r7, r8
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r7
+
+ .align 2
+L(ipred_cfl_left_tbl):
+ .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_left_h4):
+ vld1.32 {d0[]}, [r2, :32]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #2
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h8):
+ vld1.8 {d0}, [r2, :64]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #3
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h16):
+ vld1.8 {q0}, [r2, :128]
+ vaddl.u8 q0, d0, d1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #4
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h32):
+ vld1.8 {q2, q3}, [r2, :128]
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q2, q3
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #5
+ vdup.16 q0, d0[0]
+ bx r12
+endfunc
+
+// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ sub r2, r2, r4
+ add r8, r3, r4 // width + height
+ vdup.16 q1, r6 // alpha
+ clz lr, r3
+ clz r6, r4
+ vdup.16 d16, r8 // width + height
+ adr r7, L(ipred_cfl_tbl)
+ rbit r8, r8 // rbit(width + height)
+ sub lr, lr, #22 // 26 leading bits, minus table offset 4
+ sub r6, r6, #26
+ clz r8, r8 // ctz(width + height)
+ ldr lr, [r7, lr, lsl #2]
+ ldr r6, [r7, r6, lsl #2]
+ neg r8, r8 // -ctz(width + height)
+ add r12, r7, lr
+ add r7, r7, r6
+ vshr.u16 d16, d16, #1 // (width + height) >> 1
+ vdup.16 d17, r8 // -ctz(width + height)
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r7
+
+ .align 2
+L(ipred_cfl_tbl):
+ .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_h4):
+ vld1.32 {d0[]}, [r2, :32]!
+ vpaddl.u8 d0, d0
+ add r2, r2, #1
+ vpadd.i16 d0, d0
+ bx r12
+L(ipred_cfl_w4):
+ vld1.32 {d1[]}, [r2]
+ vadd.i16 d0, d0, d16
+ vpaddl.u8 d1, d1
+ vpadd.u16 d1, d1
+ cmp r4, #4
+ vadd.i16 d0, d0, d1
+ vshl.u16 d0, d0, d17
+ beq 1f
+ // h = 8/16
+ movw lr, #(0x3334/2)
+ movw r8, #(0x5556/2)
+ cmp r4, #16
+ it ne
+ movne lr, r8
+ vdup.16 d18, lr
+ vqdmulh.s16 d0, d0, d18
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w4)
+
+L(ipred_cfl_h8):
+ vld1.8 {d0}, [r2, :64]!
+ vpaddl.u8 d0, d0
+ vpadd.i16 d0, d0
+ add r2, r2, #1
+ vpadd.i16 d0, d0
+ bx r12
+L(ipred_cfl_w8):
+ vld1.8 {d1}, [r2]
+ vadd.i16 d0, d0, d16
+ vpaddl.u8 d1, d1
+ vpadd.i16 d1, d1
+ vpadd.i16 d1, d1
+ cmp r4, #8
+ vadd.i16 d0, d0, d1
+ vshl.u16 d0, d0, d17
+ beq 1f
+ // h = 4/16/32
+ cmp r4, #32
+ movw lr, #(0x3334/2)
+ movw r8, #(0x5556/2)
+ it ne
+ movne lr, r8
+ vdup.16 d18, lr
+ vqdmulh.s16 d0, d0, d18
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+ vld1.8 {q0}, [r2, :128]!
+ vaddl.u8 q0, d0, d1
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0
+ add r2, r2, #1
+ vpadd.i16 d0, d0
+ bx r12
+L(ipred_cfl_w16):
+ vld1.8 {q2}, [r2]
+ vadd.i16 d0, d0, d16
+ vaddl.u8 q2, d4, d5
+ vadd.i16 d4, d4, d5
+ vpadd.i16 d4, d4
+ vpadd.i16 d4, d4
+ cmp r4, #16
+ vadd.i16 d0, d0, d4
+ vshl.u16 d0, d0, d17
+ beq 1f
+ // h = 4/8/32/64
+ tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
+ movw lr, #(0x3334/2)
+ movw r8, #(0x5556/2)
+ it ne
+ movne lr, r8
+ vdup.16 d18, lr
+ vqdmulh.s16 d0, d0, d18
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+ vld1.8 {q2, q3}, [r2, :128]!
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.i16 q0, q2, q3
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0
+ add r2, r2, #1
+ vpadd.i16 d0, d0
+ bx r12
+L(ipred_cfl_w32):
+ vld1.8 {q2, q3}, [r2]
+ vadd.i16 d0, d0, d16
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.i16 q2, q2, q3
+ vadd.i16 d4, d4, d5
+ vpadd.i16 d4, d4
+ vpadd.i16 d4, d4
+ cmp r4, #32
+ vadd.i16 d0, d0, d4
+ vshl.u16 d0, d0, d17
+ beq 1f
+ // h = 8/16/64
+ cmp r4, #8
+ movw lr, #(0x3334/2)
+ movw r8, #(0x5556/2)
+ it ne
+ movne lr, r8
+ vdup.16 d18, lr
+ vqdmulh.s16 d0, d0, d18
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+endfunc
+
+// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_420_8bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz r8, r5
+ lsl r4, r4, #2
+ adr r7, L(ipred_cfl_ac_420_tbl)
+ sub r8, r8, #27
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2
+ vdup.32 d31, lr
+ lsl r2, r2, #1
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_420_tbl):
+ .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_420_w4):
+1: // Copy and subsample input
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d2}, [r12, :64], r2
+ vld1.8 {d1}, [r1, :64], r2
+ vld1.8 {d3}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vadd.i16 q0, q0, q1
+ vshl.i16 q0, q0, #1
+ subs r8, r8, #2
+ vst1.16 {q0}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ bgt 1b
+ cmp r4, #0
+ vmov d0, d1
+ vmov d2, d1
+ vmov d3, d1
+L(ipred_cfl_ac_420_w4_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q8, q8, q1
+ bgt 2b
+3:
+L(ipred_cfl_ac_420_w4_calc_subtract_dc):
+ // Aggregate the sums
+ vadd.i16 q0, q8, q9
+ vadd.i16 q1, q10, q11
+ vpaddl.u16 q0, q0
+ vpaddl.u16 q1, q1
+ vadd.i32 q0, q1
+ vadd.i32 d0, d0, d1
+ vpadd.i32 d0, d0, d0 // sum
+ sub r0, r0, r6, lsl #3
+ vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz
+ vdup.16 q8, d16[0]
+L(ipred_cfl_ac_420_w4_subtract_dc):
+6: // Subtract dc from ac
+ vld1.16 {q0, q1}, [r0, :128]
+ subs r6, r6, #4
+ vsub.i16 q0, q0, q8
+ vsub.i16 q1, q1, q8
+ vst1.16 {q0, q1}, [r0, :128]!
+ bgt 6b
+ pop {r4-r8, pc}
+
+L(ipred_cfl_ac_420_w8):
+ cmp r3, #0
+ bne L(ipred_cfl_ac_420_w8_wpad)
+1: // Copy and subsample input, without padding
+ vld1.8 {q0}, [r1, :128], r2
+ vld1.8 {q1}, [r12, :128], r2
+ vld1.8 {q2}, [r1, :128], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {q3}, [r12, :128], r2
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vshl.i16 q0, q0, #1
+ vshl.i16 q1, q2, #1
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q1
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1: // Copy and subsample input, padding 4
+ vld1.16 {d0}, [r1, :64], r2
+ vld1.16 {d2}, [r12, :64], r2
+ vld1.16 {d1}, [r1, :64], r2
+ vld1.16 {d3}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vadd.i16 q0, q0, q1
+ vshl.i16 q0, q0, #1
+ vdup.16 d3, d1[3]
+ vmov d2, d1
+ vdup.16 d1, d0[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q1
+
+L(ipred_cfl_ac_420_w8_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 2b
+3:
+
+ // Double the height and reuse the w4 summing/subtracting
+ lsl r6, r6, #1
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_w16):
+ adr r7, L(ipred_cfl_ac_420_w16_tbl)
+ ldr r3, [r7, r3, lsl #2]
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_420_w16_tbl):
+ .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_420_w16_wpad0):
+1: // Copy and subsample input, without padding
+ vld1.8 {q0, q1}, [r1, :128], r2
+ vld1.8 {q2, q3}, [r12, :128], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {q12, q13}, [r1, :128], r2
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vadd.i16 q0, q0, q2
+ vadd.i16 q1, q1, q3
+ vld1.8 {q2, q3}, [r12, :128], r2
+ vpaddl.u8 q12, q12
+ vpaddl.u8 q13, q13
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vadd.i16 q12, q12, q2
+ vadd.i16 q13, q13, q3
+ vshl.i16 q0, q0, #1
+ vshl.i16 q1, q1, #1
+ vshl.i16 q2, q12, #1
+ vshl.i16 q3, q13, #1
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+1: // Copy and subsample input, padding 4
+ vldr d2, [r1, #16]
+ vld1.8 {q0}, [r1, :128], r2
+ vldr d6, [r12, #16]
+ vld1.8 {q2}, [r12, :128], r2
+ vpaddl.u8 d2, d2
+ vldr d26, [r1, #16]
+ vpaddl.u8 q0, q0
+ vld1.8 {q12}, [r1, :128], r2
+ vpaddl.u8 d6, d6
+ vldr d30, [r12, #16]
+ vpaddl.u8 q2, q2
+ vld1.8 {q14}, [r12, :128], r2
+ vpaddl.u8 d26, d26
+ vpaddl.u8 q12, q12
+ vpaddl.u8 d30, d30
+ vpaddl.u8 q14, q14
+ vadd.i16 d2, d2, d6
+ vadd.i16 q0, q0, q2
+ vadd.i16 d26, d26, d30
+ vadd.i16 q12, q12, q14
+ vshl.i16 d2, d2, #1
+ vshl.i16 q0, q0, #1
+ vshl.i16 d6, d26, #1
+ vshl.i16 q2, q12, #1
+ vdup.16 d3, d2[3]
+ vdup.16 d7, d6[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+1: // Copy and subsample input, padding 8
+ vld1.8 {q0}, [r1, :128], r2
+ vld1.8 {q1}, [r12, :128], r2
+ vld1.8 {q2}, [r1, :128], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {q3}, [r12, :128], r2
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vshl.i16 q0, q0, #1
+ vshl.i16 q2, q2, #1
+ vdup.16 q1, d1[3]
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+1: // Copy and subsample input, padding 12
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d1}, [r12, :64], r2
+ vld1.8 {d4}, [r1, :64], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {d5}, [r12, :64], r2
+ vpaddl.u8 q2, q2
+ vadd.i16 d0, d0, d1
+ vadd.i16 d4, d4, d5
+ vshl.i16 d0, d0, #1
+ vshl.i16 d4, d4, #1
+ vdup.16 q1, d0[3]
+ vdup.16 q3, d4[3]
+ vdup.16 d1, d0[3]
+ vdup.16 d5, d4[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 2b
+3:
+
+ // Quadruple the height and reuse the w4 summing/subtracting
+ lsl r6, r6, #2
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+endfunc
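A scalar sketch of what the 4:2:0 AC path above computes, ignoring the w_pad/h_pad edge replication that the asm performs before summing; names are hypothetical and __builtin_ctz stands in for the rbit+clz sequence above. Each output is the sum of a 2x2 luma block scaled by 2; the 4:2:2 and 4:4:4 paths below scale by 4 and 8 instead, so all three layouts end up on the same scale before the block average is subtracted.

    #include <stddef.h>
    #include <stdint.h>

    static void cfl_ac_420_sketch(int16_t *const ac, const uint8_t *ypx,
                                  const ptrdiff_t stride,
                                  const int cw, const int ch)
    {
        int sum = 0;
        for (int y = 0; y < ch; y++, ypx += 2 * stride)
            for (int x = 0; x < cw; x++) {
                const int v = (ypx[2 * x]          + ypx[2 * x + 1] +
                               ypx[2 * x + stride] + ypx[2 * x + 1 + stride]) << 1;
                ac[y * cw + x] = (int16_t)v;   // vpaddl.u8 + vadd.i16 + vshl.i16 #1
                sum += v;
            }
        // vrshl.u32 by -log2sz: rounded average over the whole block
        const int log2sz = __builtin_ctz(cw) + __builtin_ctz(ch);
        const int avg = (sum + (1 << (log2sz - 1))) >> log2sz;
        for (int i = 0; i < cw * ch; i++)      // "Subtract dc from ac"
            ac[i] -= (int16_t)avg;
    }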
+
+// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_422_8bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz r8, r5
+ lsl r4, r4, #2
+ adr r7, L(ipred_cfl_ac_422_tbl)
+ sub r8, r8, #27
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2
+ vdup.32 d31, lr
+ lsl r2, r2, #1
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_422_tbl):
+ .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_422_w4):
+1: // Copy and subsample input
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d1}, [r12, :64], r2
+ vld1.8 {d2}, [r1, :64], r2
+ vld1.8 {d3}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ bgt 1b
+ cmp r4, #0
+ vmov d0, d3
+ vmov d1, d3
+ vmov d2, d3
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_422_w8):
+ cmp r3, #0
+ bne L(ipred_cfl_ac_422_w8_wpad)
+1: // Copy and subsample input, without padding
+ vld1.8 {q0}, [r1, :128], r2
+ vld1.8 {q1}, [r12, :128], r2
+ vld1.8 {q2}, [r1, :128], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {q3}, [r12, :128], r2
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ vshl.i16 q2, q2, #2
+ vshl.i16 q3, q3, #2
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q3
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1: // Copy and subsample input, padding 4
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d1}, [r12, :64], r2
+ vld1.8 {d2}, [r1, :64], r2
+ vld1.8 {d3}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ vdup.16 d7, d3[3]
+ vmov d6, d3
+ vdup.16 d5, d2[3]
+ vmov d4, d2
+ vdup.16 d3, d1[3]
+ vmov d2, d1
+ vdup.16 d1, d0[3]
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q3
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+ adr r7, L(ipred_cfl_ac_422_w16_tbl)
+ ldr r3, [r7, r3, lsl #2]
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_422_w16_tbl):
+ .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_422_w16_wpad0):
+1: // Copy and subsample input, without padding
+ vld1.8 {q0, q1}, [r1, :128], r2
+ vld1.8 {q2, q3}, [r12, :128], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ vshl.i16 q2, q2, #2
+ vshl.i16 q3, q3, #2
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+1: // Copy and subsample input, padding 4
+ vldr d2, [r1, #16]
+ vld1.8 {q0}, [r1, :128], r2
+ vldr d6, [r12, #16]
+ vld1.8 {q2}, [r12, :128], r2
+ vpaddl.u8 d2, d2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 d6, d6
+ vpaddl.u8 q2, q2
+ vshl.i16 d2, d2, #2
+ vshl.i16 q0, q0, #2
+ vshl.i16 d6, d6, #2
+ vshl.i16 q2, q2, #2
+ vdup.16 d3, d2[3]
+ vdup.16 d7, d6[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+1: // Copy and subsample input, padding 8
+ vld1.8 {q0}, [r1, :128], r2
+ vld1.8 {q2}, [r12, :128], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q2, q2
+ vshl.i16 q0, q0, #2
+ vshl.i16 q2, q2, #2
+ vdup.16 q1, d1[3]
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+1: // Copy and subsample input, padding 12
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d1}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vshl.i16 q0, q0, #2
+ vdup.16 q3, d1[3]
+ vdup.16 q1, d0[3]
+ vdup.16 d5, d1[3]
+ vmov d4, d1
+ vdup.16 d1, d0[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+endfunc
+
+// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_444_8bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz r8, r5
+ lsl r4, r4, #2
+ adr r7, L(ipred_cfl_ac_444_tbl)
+ sub r8, r8, #26
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2
+ vdup.32 d31, lr
+ lsl r2, r2, #1
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_444_tbl):
+ .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_444_w4):
+1: // Copy and expand input
+ vld1.32 {d0[]}, [r1, :32], r2
+ vld1.32 {d0[1]}, [r12, :32], r2
+ vld1.32 {d2[]}, [r1, :32], r2
+ vld1.32 {d2[1]}, [r12, :32], r2
+ vshll.u8 q0, d0, #3
+ vshll.u8 q1, d2, #3
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ bgt 1b
+ cmp r4, #0
+ vmov d0, d3
+ vmov d1, d3
+ vmov d2, d3
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_444_w8):
+1: // Copy and expand input
+ vld1.16 {d0}, [r1, :64], r2
+ vld1.16 {d2}, [r12, :64], r2
+ vld1.16 {d4}, [r1, :64], r2
+ vshll.u8 q0, d0, #3
+ vld1.16 {d6}, [r12, :64], r2
+ vshll.u8 q1, d2, #3
+ vshll.u8 q2, d4, #3
+ vshll.u8 q3, d6, #3
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q3
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+ cmp r3, #0
+ bne L(ipred_cfl_ac_444_w16_wpad)
+1: // Copy and expand input, without padding
+ vld1.8 {q1}, [r1, :128], r2
+ vld1.8 {q3}, [r12, :128], r2
+ vshll.u8 q0, d2, #3
+ vshll.u8 q1, d3, #3
+ vshll.u8 q2, d6, #3
+ vshll.u8 q3, d7, #3
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1: // Copy and expand input, padding 8
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d4}, [r12, :64], r2
+ vshll.u8 q0, d0, #3
+ vshll.u8 q2, d4, #3
+ vdup.16 q1, d1[3]
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+ adr r7, L(ipred_cfl_ac_444_w32_tbl)
+ ldr r3, [r7, r3, lsl #1] // (w3>>1) << 2
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_444_w32_tbl):
+ .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_444_w32_wpad0):
+1: // Copy and expand input, without padding
+ vld1.8 {q2, q3}, [r1, :128], r2
+ vld1.8 {q13, q14}, [r12, :128], r2
+ vshll.u8 q0, d4, #3
+ vshll.u8 q1, d5, #3
+ vshll.u8 q2, d6, #3
+ vshll.u8 q3, d7, #3
+ vshll.u8 q12, d26, #3
+ vshll.u8 q13, d27, #3
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vshll.u8 q0, d28, #3
+ vshll.u8 q1, d29, #3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+1: // Copy and expand input, padding 8
+ vldr d4, [r1, #16]
+ vld1.8 {q1}, [r1, :128], r2
+ vldr d28, [r12, #16]
+ vld1.8 {q13}, [r12, :128], r2
+ vshll.u8 q2, d4, #3
+ vshll.u8 q0, d2, #3
+ vshll.u8 q1, d3, #3
+ vshll.u8 q12, d26, #3
+ vshll.u8 q13, d27, #3
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vshll.u8 q0, d28, #3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ vdup.16 q1, d1[3]
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+1: // Copy and expand input, padding 16
+ vld1.8 {q1}, [r1, :128], r2
+ vld1.8 {q13}, [r12, :128], r2
+ vshll.u8 q0, d2, #3
+ vshll.u8 q1, d3, #3
+ vshll.u8 q12, d26, #3
+ vshll.u8 q13, d27, #3
+ vdup.16 q2, d3[3]
+ vdup.16 q3, d3[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vdup.16 q0, d27[3]
+ vdup.16 q1, d27[3]
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+1: // Copy and expand input, padding 24
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d24}, [r12, :64], r2
+ vshll.u8 q0, d0, #3
+ vshll.u8 q12, d24, #3
+ subs r8, r8, #2
+ vdup.16 q1, d1[3]
+ vdup.16 q2, d1[3]
+ vdup.16 q3, d1[3]
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vdup.16 q13, d25[3]
+ vdup.16 q0, d25[3]
+ vdup.16 q1, d25[3]
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 1b
+ cmp r4, #0
+
+L(ipred_cfl_ac_444_w32_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #1
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 2b
+3:
+
+ // Multiply the height by eight and reuse the w4 subtracting
+ lsl r6, r6, #3
+ // Aggregate the sums, with wider intermediates earlier than in
+ // ipred_cfl_ac_420_w4_calc_subtract_dc.
+ vpaddl.u16 q0, q8
+ vpaddl.u16 q1, q9
+ vpaddl.u16 q2, q10
+ vpaddl.u16 q3, q11
+ vadd.i32 q0, q0, q1
+ vadd.i32 q2, q2, q3
+ vadd.i32 q0, q0, q2
+ vadd.i32 d0, d0, d1
+ vpadd.i32 d0, d0, d0 // sum
+ sub r0, r0, r6, lsl #3
+ vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz
+ vdup.16 q8, d16[0]
+ b L(ipred_cfl_ac_420_w4_subtract_dc)
+endfunc
diff --git a/third_party/dav1d/src/arm/32/ipred16.S b/third_party/dav1d/src/arm/32/ipred16.S
new file mode 100644
index 0000000000..fa78049768
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/ipred16.S
@@ -0,0 +1,3276 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, B Krishnan Iyer
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height,
+// const int bitdepth_max);
+function ipred_dc_128_16bpc_neon, export=1
+ push {r4, lr}
+ ldr r4, [sp, #8]
+ ldr r12, [sp, #24]
+ clz r3, r3
+ adr r2, L(ipred_dc_128_tbl)
+ sub r3, r3, #25
+ vdup.16 q0, r12
+ ldr r3, [r2, r3, lsl #2]
+ add r12, r0, r1
+ vrshr.u16 q0, q0, #1
+ add r2, r2, r3
+ lsl r1, r1, #1
+ bx r2
+
+ .align 2
+L(ipred_dc_128_tbl):
+ .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+4:
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt 4b
+ pop {r4, pc}
+8:
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ bgt 8b
+ pop {r4, pc}
+160:
+ vmov q1, q0
+16:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 16b
+ pop {r4, pc}
+320:
+ vmov q1, q0
+ sub r1, r1, #32
+32:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4, pc}
+640:
+ vmov q1, q0
+ sub r1, r1, #96
+64:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ subs r4, r4, #2
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 64b
+ pop {r4, pc}
+endfunc
+
+// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_v_16bpc_neon, export=1
+ push {r4, lr}
+ ldr lr, [sp, #8]
+ clz r3, r3
+ adr r4, L(ipred_v_tbl)
+ sub r3, r3, #25
+ ldr r3, [r4, r3, lsl #2]
+ add r2, r2, #2
+ add r4, r4, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r4
+
+ .align 2
+L(ipred_v_tbl):
+ .word 640f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_v_tbl) + CONFIG_THUMB
+
+40:
+ vld1.16 {d0}, [r2]
+4:
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ subs lr, lr, #4
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt 4b
+ pop {r4, pc}
+80:
+ vld1.16 {q0}, [r2]
+8:
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ bgt 8b
+ pop {r4, pc}
+160:
+ vld1.16 {q0, q1}, [r2]
+16:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 16b
+ pop {r4, pc}
+320:
+ vld1.16 {q0, q1}, [r2]!
+ sub r1, r1, #32
+ vld1.16 {q2, q3}, [r2]
+32:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.16 {d4, d5, d6, d7}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.16 {d4, d5, d6, d7}, [r12, :128], r1
+ bgt 32b
+ pop {r4, pc}
+640:
+ vld1.16 {q0, q1}, [r2]!
+ sub r1, r1, #96
+ vld1.16 {q2, q3}, [r2]!
+ vld1.16 {q8, q9}, [r2]!
+ vld1.16 {q10, q11}, [r2]!
+64:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d4, d5, d6, d7}, [r0, :128]!
+ vst1.16 {d4, d5, d6, d7}, [r12, :128]!
+ subs lr, lr, #2
+ vst1.16 {d16, d17, d18, d19}, [r0, :128]!
+ vst1.16 {d16, d17, d18, d19}, [r12, :128]!
+ vst1.16 {d20, d21, d22, d23}, [r0, :128], r1
+ vst1.16 {d20, d21, d22, d23}, [r12, :128], r1
+ bgt 64b
+ pop {r4, pc}
+endfunc
+
+// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_h_16bpc_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ clz r3, r3
+ adr r5, L(ipred_h_tbl)
+ sub r3, r3, #25
+ ldr r3, [r5, r3, lsl #2]
+ sub r2, r2, #2
+ mov lr, #-2
+ add r5, r5, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_h_tbl):
+ .word 640f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_h_tbl) + CONFIG_THUMB
+40:
+ sub r2, r2, #6
+ mov lr, #-8
+4:
+ vld4.16 {d0[], d1[], d2[], d3[]}, [r2], lr
+ vst1.16 {d3}, [r0, :64], r1
+ vst1.16 {d2}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d1}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt 4b
+ pop {r4-r5, pc}
+8:
+ vld1.16 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.16 {d2[], d3[]}, [r2], lr
+ vst1.16 {q0}, [r0, :128], r1
+ vld1.16 {d4[], d5[]}, [r2], lr
+ vst1.16 {q1}, [r12, :128], r1
+ vld1.16 {d6[], d7[]}, [r2], lr
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r12, :128], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160:
+ sub r1, r1, #16
+16:
+ vld1.16 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.16 {d2[], d3[]}, [r2], lr
+ vst1.16 {q0}, [r0, :128]!
+ vld1.16 {d4[], d5[]}, [r2], lr
+ vst1.16 {q1}, [r12, :128]!
+ vld1.16 {d6[], d7[]}, [r2], lr
+ vst1.16 {q0}, [r0, :128], r1
+ vst1.16 {q1}, [r12, :128], r1
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q3}, [r12, :128]!
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320:
+ sub r1, r1, #48
+32:
+ vld1.16 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.16 {d2[], d3[]}, [r2], lr
+ vst1.16 {q0}, [r0, :128]!
+ vld1.16 {d4[], d5[]}, [r2], lr
+ vst1.16 {q1}, [r12, :128]!
+ vld1.16 {d6[], d7[]}, [r2], lr
+ vst1.16 {q0}, [r0, :128]!
+ vst1.16 {q1}, [r12, :128]!
+ vst1.16 {q0}, [r0, :128]!
+ vst1.16 {q1}, [r12, :128]!
+ vst1.16 {q0}, [r0, :128], r1
+ vst1.16 {q1}, [r12, :128], r1
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q3}, [r12, :128]!
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q3}, [r12, :128]!
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q3}, [r12, :128]!
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640:
+ sub r1, r1, #96
+64:
+ vld1.16 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #2
+ vld1.16 {d4[], d5[]}, [r2], lr
+ vmov q1, q0
+ vmov q3, q2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vst1.16 {q2, q3}, [r12, :128]!
+ vst1.16 {q0, q1}, [r0, :128]!
+ vst1.16 {q2, q3}, [r12, :128]!
+ vst1.16 {q0, q1}, [r0, :128]!
+ vst1.16 {q2, q3}, [r12, :128]!
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vst1.16 {q2, q3}, [r12, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_top_16bpc_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ clz r3, r3
+ adr r5, L(ipred_dc_top_tbl)
+ sub r3, r3, #25
+ ldr r3, [r5, r3, lsl #2]
+ add r2, r2, #2
+ add r5, r5, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_dc_top_tbl):
+ .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+
+40:
+ vld1.16 {d0}, [r2]
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #2
+ vdup.16 d0, d0[0]
+4:
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt 4b
+ pop {r4-r5, pc}
+80:
+ vld1.16 {d0, d1}, [r2]
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #3
+ vdup.16 q0, d0[0]
+8:
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160:
+ vld1.16 {d0, d1, d2, d3}, [r2]
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d4, d0, #4
+ vdup.16 q0, d4[0]
+ vdup.16 q1, d4[0]
+16:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320:
+ vld1.16 {d0, d1, d2, d3}, [r2]!
+ vld1.16 {d4, d5, d6, d7}, [r2]
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vadd.i16 q0, q0, q2
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpaddl.u16 d0, d0
+ vrshrn.i32 d18, q0, #5
+ vdup.16 q0, d18[0]
+ vdup.16 q1, d18[0]
+ sub r1, r1, #32
+32:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640:
+ vld1.16 {d0, d1, d2, d3}, [r2]!
+ vld1.16 {d4, d5, d6, d7}, [r2]!
+ vadd.i16 q0, q0, q1
+ vld1.16 {d16, d17, d18, d19}, [r2]!
+ vadd.i16 q2, q2, q3
+ vld1.16 {d20, d21, d22, d23}, [r2]
+ vadd.i16 q8, q8, q9
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q0, q2
+ vadd.i16 q8, q8, q10
+ vadd.i16 q0, q0, q8
+ vadd.i16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ vpadd.i32 d0, d0, d0
+ vrshrn.i32 d18, q0, #6
+ vdup.16 q0, d18[0]
+ vdup.16 q1, d18[0]
+ sub r1, r1, #96
+64:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ subs r4, r4, #2
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
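+
+// Hedged sketch of the dc_top predictor above (not dav1d's C implementation
+// itself; pixel assumed uint16_t, stride in bytes). The top row lives at
+// topleft[1..width]; the DC value is its rounded average, which the code
+// computes with pairwise adds and a rounding shift by log2(width):
+//
+//     static void dc_top_sketch(pixel *dst, ptrdiff_t stride,
+//                               const pixel *topleft, int width, int height)
+//     {
+//         unsigned sum = width >> 1;              // rounding bias
+//         for (int x = 0; x < width; x++)
+//             sum += topleft[1 + x];
+//         const pixel dc = sum / width;           // width is a power of two
+//         for (int y = 0; y < height; y++) {
+//             for (int x = 0; x < width; x++)
+//                 dst[x] = dc;
+//             dst = (pixel *)((char *)dst + stride);
+//         }
+//     }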
+
+// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_left_16bpc_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ sub r2, r2, r4, lsl #1
+ clz r3, r3
+ clz lr, r4
+ sub lr, lr, #25
+ adr r5, L(ipred_dc_left_tbl)
+ sub r3, r3, #20
+ ldr r3, [r5, r3, lsl #2]
+ ldr lr, [r5, lr, lsl #2]
+ add r3, r5, r3
+ add r5, r5, lr
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_dc_left_tbl):
+ .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+
+L(ipred_dc_left_h4):
+ vld1.16 {d0}, [r2, :64]
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #2
+ vdup.16 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w4):
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt L(ipred_dc_left_w4)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h8):
+ vld1.16 {d0, d1}, [r2, :128]
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #3
+ vdup.16 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w8):
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ bgt L(ipred_dc_left_w8)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h16):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #4
+ vdup.16 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w16):
+ vmov q1, q0
+1:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+L(ipred_dc_left_h32):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.16 {d4, d5, d6, d7}, [r2, :128]
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vadd.i16 q0, q0, q2
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpaddl.u16 d0, d0
+ vrshrn.i32 d0, q0, #5
+ vdup.16 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w32):
+ sub r1, r1, #32
+ vmov q1, q0
+1:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+L(ipred_dc_left_h64):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.16 {d4, d5, d6, d7}, [r2, :128]!
+ vadd.i16 q0, q0, q1
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]!
+ vadd.i16 q2, q2, q3
+ vld1.16 {d20, d21, d22, d23}, [r2, :128]
+ vadd.i16 q8, q8, q9
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q0, q2
+ vadd.i16 q8, q8, q10
+ vadd.i16 q0, q0, q8
+ vadd.i16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ vpadd.i32 d0, d0, d0
+ vrshrn.i32 d0, q0, #6
+ vdup.16 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w64):
+ sub r1, r1, #96
+ vmov q1, q0
+1:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ subs r4, r4, #2
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_16bpc_neon, export=1
+ push {r4-r6, lr}
+ ldr r4, [sp, #16]
+ sub r2, r2, r4, lsl #1
+ add lr, r3, r4 // width + height
+ clz r3, r3
+ clz r12, r4
+ vdup.32 q15, lr // width + height
+ adr r5, L(ipred_dc_tbl)
+ rbit lr, lr // rbit(width + height)
+ sub r3, r3, #20 // 25 leading bits, minus table offset 5
+ sub r12, r12, #25
+ clz lr, lr // ctz(width + height)
+ ldr r3, [r5, r3, lsl #2]
+ ldr r12, [r5, r12, lsl #2]
+ neg lr, lr // -ctz(width + height)
+ add r3, r5, r3
+ add r5, r5, r12
+ vshr.u32 q15, q15, #1 // (width + height) >> 1
+ vdup.32 q14, lr // -ctz(width + height)
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_dc_tbl):
+ .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB
+
+L(ipred_dc_h4):
+ vld1.16 {d0}, [r2, :64]!
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2
+ vpaddl.u16 d0, d0
+ bx r3
+L(ipred_dc_w4):
+ vld1.16 {d2}, [r2]
+ vadd.i32 d0, d0, d30
+ vpadd.i16 d2, d2, d2
+ vpaddl.u16 d2, d2
+ cmp r4, #4
+ vadd.i32 d0, d0, d2
+ vshl.u32 d0, d0, d28
+ beq 1f
+ // h = 8/16
+ cmp r4, #16
+ movw lr, #0x6667
+ movw r5, #0xAAAB
+ it ne
+ movne lr, r5
+ vdup.32 d24, lr
+ vmul.i32 d0, d0, d24
+ vshr.u32 d0, d0, #17
+1:
+ vdup.16 d0, d0[0]
+2:
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h8):
+ vld1.16 {d0, d1}, [r2, :128]!
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2
+ vpaddl.u16 d0, d0
+ bx r3
+L(ipred_dc_w8):
+ vld1.16 {d2, d3}, [r2]
+ vadd.i32 d0, d0, d30
+ vadd.i16 d2, d2, d3
+ vpadd.i16 d2, d2, d2
+ vpaddl.u16 d2, d2
+ cmp r4, #8
+ vadd.i32 d0, d0, d2
+ vshl.u32 d0, d0, d28
+ beq 1f
+ // h = 4/16/32
+ cmp r4, #32
+ movw lr, #0x6667
+ movw r5, #0xAAAB
+ it ne
+ movne lr, r5
+ vdup.32 d24, lr
+ vmul.i32 d0, d0, d24
+ vshr.u32 d0, d0, #17
+1:
+ vdup.16 q0, d0[0]
+2:
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h16):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]!
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2
+ vpaddl.u16 d0, d0
+ bx r3
+L(ipred_dc_w16):
+ vld1.16 {d2, d3, d4, d5}, [r2]
+ vadd.i32 d0, d0, d30
+ vadd.i16 q1, q1, q2
+ vadd.i16 d2, d2, d3
+ vpadd.i16 d2, d2, d1
+ vpaddl.u16 d2, d2
+ cmp r4, #16
+ vadd.i32 d0, d0, d2
+ vshl.u32 d4, d0, d28
+ beq 1f
+ // h = 4/8/32/64
+ tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
+ movw lr, #0x6667
+ movw r5, #0xAAAB
+ it ne
+ movne lr, r5
+ vdup.32 d24, lr
+ vmul.i32 d4, d4, d24
+ vshr.u32 d4, d4, #17
+1:
+ vdup.16 q0, d4[0]
+ vdup.16 q1, d4[0]
+2:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h32):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.16 {d4, d5, d6, d7}, [r2, :128]!
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vadd.i16 q0, q0, q2
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2
+ vpaddl.u16 d0, d0
+ bx r3
+L(ipred_dc_w32):
+ vld1.16 {d2, d3, d4, d5}, [r2]!
+ vadd.i32 d0, d0, d30
+ vld1.16 {d16, d17, d18, d19}, [r2]
+ vadd.i16 q1, q1, q2
+ vadd.i16 q8, q8, q9
+ vadd.i16 q1, q1, q8
+ vadd.i16 d2, d2, d3
+ vpadd.i16 d2, d2, d2
+ vpaddl.u16 d2, d2
+ cmp r4, #32
+ vadd.i32 d0, d0, d2
+ vshl.u32 d4, d0, d28
+ beq 1f
+ // h = 8/16/64
+ cmp r4, #8
+ movw lr, #0x6667
+ movw r5, #0xAAAB
+ it ne
+ movne lr, r5
+ vdup.32 d24, lr
+ vmul.i32 d4, d4, d24
+ vshr.u32 d4, d4, #17
+1:
+ sub r1, r1, #32
+ vdup.16 q0, d4[0]
+ vdup.16 q1, d4[0]
+2:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+L(ipred_dc_h64):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.16 {d4, d5, d6, d7}, [r2, :128]!
+ vadd.i16 q0, q0, q1
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]!
+ vadd.i16 q2, q2, q3
+ vld1.16 {d20, d21, d22, d23}, [r2, :128]!
+ vadd.i16 q8, q8, q9
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q0, q2
+ vadd.i16 q8, q8, q10
+ vadd.i16 q0, q0, q8
+ vadd.i16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ add r2, r2, #2
+ vpadd.i32 d0, d0, d0
+ bx r3
+L(ipred_dc_w64):
+ vld1.16 {d2, d3, d4, d5}, [r2]!
+ vadd.i32 d0, d0, d30
+ vld1.16 {d16, d17, d18, d19}, [r2]!
+ vadd.i16 q1, q1, q2
+ vld1.16 {d20, d21, d22, d23}, [r2]!
+ vadd.i16 q8, q8, q9
+ vld1.16 {d24, d25, d26, d27}, [r2]!
+ vadd.i16 q10, q10, q11
+ vadd.i16 q12, q12, q13
+ vadd.i16 q1, q1, q8
+ vadd.i16 q10, q10, q12
+ vadd.i16 q1, q1, q10
+ vadd.i16 d2, d2, d3
+ vpaddl.u16 d2, d2
+ vpadd.i32 d2, d2, d2
+ cmp r4, #64
+ vadd.i32 d0, d0, d2
+ vshl.u32 d4, d0, d28
+ beq 1f
+ // h = 16/32
+ cmp r4, #16
+ movw lr, #0x6667
+ movw r5, #0xAAAB
+ it ne
+ movne lr, r5
+ vdup.32 d24, lr
+ vmul.i32 d4, d4, d24
+ vshr.u32 d4, d4, #17
+1:
+ sub r1, r1, #96
+ vdup.16 q0, d4[0]
+ vdup.16 q1, d4[0]
+2:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ subs r4, r4, #2
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+endfunc
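+
+// Hedged scalar sketch of the full DC predictor above (pixel assumed
+// uint16_t); it averages the top row and the left column together:
+//
+//     unsigned sum = (width + height) >> 1;            // rounding bias
+//     for (int x = 0; x < width; x++)  sum += topleft[1 + x];
+//     for (int y = 0; y < height; y++) sum += topleft[-(1 + y)];
+//     const pixel dc = sum / (width + height);
+//     // ... then fill the width x height block with dc, as in dc_top above.
+//
+// When width != height, width + height is not a power of two, so the code
+// replaces the division with a rounding shift by ctz(width+height) followed
+// by a fixed-point multiply: 0xAAAB/2^17 is roughly 1/3 and 0x6667/2^17 is
+// roughly 1/5, which covers the remaining factor of 3 or 5.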
+
+// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_paeth_16bpc_neon, export=1
+ push {r4-r6, lr}
+ vpush {q4}
+ ldr r4, [sp, #32]
+ clz lr, r3
+ adr r12, L(ipred_paeth_tbl)
+ sub lr, lr, #25
+ ldr lr, [r12, lr, lsl #2]
+ vld1.16 {d4[], d5[]}, [r2]
+ add r6, r2, #2
+ sub r2, r2, #4
+ add r12, r12, lr
+ mov r5, #-4
+ add lr, r0, r1
+ lsl r1, r1, #1
+ bx r12
+
+ .align 2
+L(ipred_paeth_tbl):
+ .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB
+
+40:
+ sub r2, r2, #4
+ mov r5, #-8
+ vld1.16 {d6}, [r6]
+ vsub.i16 d16, d6, d4 // top - topleft
+ vmov d7, d6
+ vmov d17, d16
+4:
+ vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r5
+ vadd.i16 q9, q8, q0 // base
+ vadd.i16 q10, q8, q1
+ vabd.s16 q11, q3, q9 // tdiff
+ vabd.s16 q12, q3, q10
+ vabd.s16 q13, q2, q9 // tldiff
+ vabd.s16 q14, q2, q10
+ vabd.s16 q9, q0, q9 // ldiff
+ vabd.s16 q10, q1, q10
+ vmin.u16 q15, q11, q13 // min(tdiff, tldiff)
+ vmin.u16 q4, q12, q14
+ vcge.u16 q11, q13, q11 // tldiff >= tdiff
+ vcge.u16 q12, q14, q12
+ vcge.u16 q9, q15, q9 // min(tdiff, tldiff) >= ldiff
+ vcge.u16 q10, q4, q10
+ vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbsl q11, q3, q2
+ vbit q12, q1, q10 // ldiff <= min ? left : ...
+ vbit q11, q0, q9
+ vst1.16 {d25}, [r0, :64], r1
+ vst1.16 {d24}, [lr, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d23}, [r0, :64], r1
+ vst1.16 {d22}, [lr, :64], r1
+ bgt 4b
+ vpop {q4}
+ pop {r4-r6, pc}
+80:
+160:
+320:
+640:
+ vld1.16 {q3}, [r6]!
+ mov r12, r3
+ sub r1, r1, r3, lsl #1
+1:
+ vld2.16 {d0[], d2[]}, [r2, :32], r5
+ vmov d1, d0
+ vmov d3, d2
+2:
+ vsub.i16 q8, q3, q2 // top - topleft
+ vadd.i16 q9, q8, q0 // base
+ vadd.i16 q10, q8, q1
+ vabd.s16 q11, q3, q9 // tdiff
+ vabd.s16 q12, q3, q10
+ vabd.s16 q13, q2, q9 // tldiff
+ vabd.s16 q14, q2, q10
+ vabd.s16 q9, q0, q9 // ldiff
+ vabd.s16 q10, q1, q10
+ vmin.u16 q15, q11, q13 // min(tdiff, tldiff)
+ vmin.u16 q4, q12, q14
+ vcge.u16 q11, q13, q11 // tldiff >= tdiff
+ vcge.u16 q12, q14, q12
+ vcge.u16 q9, q15, q9 // min(tdiff, tldiff) >= ldiff
+ vcge.u16 q10, q4, q10
+ vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbsl q11, q3, q2
+ vbit q12, q1, q10 // ldiff <= min ? left : ...
+ vbit q11, q0, q9
+ subs r3, r3, #8
+ vst1.16 {q12}, [r0, :128]!
+ vst1.16 {q11}, [lr, :128]!
+ ble 8f
+ vld1.16 {q3}, [r6]!
+ b 2b
+8:
+ subs r4, r4, #2
+ ble 9f
+ // End of horizontal loop, move pointers to next two rows
+ sub r6, r6, r12, lsl #1
+ add r0, r0, r1
+ add lr, lr, r1
+ vld1.16 {q3}, [r6]!
+ mov r3, r12
+ b 1b
+9:
+ vpop {q4}
+ pop {r4-r6, pc}
+endfunc
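+
+// Hedged scalar sketch of the Paeth predictor above. For each pixel,
+// base = left + top - topleft, and the neighbour closest to base is chosen,
+// preferring left, then top, then topleft; this is the ldiff/tdiff/tldiff
+// comparison chain in the NEON code:
+//
+//     const pixel tl = topleft[0];
+//     for (int y = 0; y < height; y++) {
+//         const pixel left = topleft[-(1 + y)];
+//         for (int x = 0; x < width; x++) {
+//             const pixel top  = topleft[1 + x];
+//             const int base   = left + top - tl;
+//             const int ldiff  = abs(base - left);
+//             const int tdiff  = abs(base - top);
+//             const int tldiff = abs(base - tl);
+//             dst[x] = ldiff <= tdiff && ldiff <= tldiff ? left :
+//                      tdiff <= tldiff                   ? top  : tl;
+//         }
+//         dst = (pixel *)((char *)dst + stride);
+//     }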
+
+// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_16bpc_neon, export=1
+ push {r4-r10, lr}
+ ldr r4, [sp, #32]
+ movrel r10, X(sm_weights)
+ add r12, r10, r4
+ add r10, r10, r3
+ clz r9, r3
+ adr r5, L(ipred_smooth_tbl)
+ sub lr, r2, r4, lsl #1
+ sub r9, r9, #25
+ ldr r9, [r5, r9, lsl #2]
+ vld1.16 {d4[], d5[]}, [lr] // bottom
+ add r8, r2, #2
+ add r5, r5, r9
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_smooth_tbl):
+ .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB
+
+40:
+ vld1.16 {d16}, [r8] // top
+ vld1.32 {d18[]}, [r10, :32] // weights_hor
+ sub r2, r2, #8
+ mov r7, #-8
+ vdup.16 q3, d16[3] // right
+ vsub.i16 q8, q8, q2 // top-bottom
+ vmovl.u8 q9, d18 // weights_hor
+ vadd.i16 d19, d4, d6 // bottom+right
+4:
+ vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r7 // left
+ vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! // weights_ver
+ vshll.u16 q12, d19, #8 // (bottom+right)*256
+ vshll.u16 q13, d19, #8
+ vshll.u16 q14, d19, #8
+ vshll.u16 q15, d19, #8
+ vzip.32 d20, d21 // weights_ver
+ vzip.32 d22, d23
+ vsub.i16 q1, q1, q3 // left-right
+ vsub.i16 q0, q0, q3
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+ vmlal.s16 q12, d3, d18 // += (left-right)*weights_hor
+ vmlal.s16 q13, d2, d18 // (left flipped)
+ vmlal.s16 q14, d1, d18
+ vmlal.s16 q15, d0, d18
+ vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver
+ vmlal.s16 q13, d16, d21
+ vmlal.s16 q14, d16, d22
+ vmlal.s16 q15, d16, d23
+ vrshrn.i32 d24, q12, #9
+ vrshrn.i32 d25, q13, #9
+ vrshrn.i32 d26, q14, #9
+ vrshrn.i32 d27, q15, #9
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d25}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d26}, [r0, :64], r1
+ vst1.16 {d27}, [r6, :64], r1
+ bgt 4b
+ pop {r4-r10, pc}
+80:
+ vld1.16 {q8}, [r8] // top
+ vld1.8 {d18}, [r10, :64] // weights_hor
+ sub r2, r2, #4
+ mov r7, #-4
+ vdup.16 q3, d17[3] // right
+ vsub.i16 q8, q8, q2 // top-bottom
+ vmovl.u8 q9, d18 // weights_hor
+ vadd.i16 d3, d4, d6 // bottom+right
+8:
+ vld2.16 {d0[], d1[]}, [r2, :32], r7 // left
+ vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver
+ vshll.u16 q12, d3, #8 // (bottom+right)*256
+ vshll.u16 q13, d3, #8
+ vshll.u16 q14, d3, #8
+ vshll.u16 q15, d3, #8
+ vsub.i16 q0, q0, q3 // left-right
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+ vmlal.s16 q12, d1, d18 // += (left-right)*weights_hor
+ vmlal.s16 q13, d1, d19 // (left flipped)
+ vmlal.s16 q14, d0, d18
+ vmlal.s16 q15, d0, d19
+ vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver
+ vmlal.s16 q13, d17, d20
+ vmlal.s16 q14, d16, d22
+ vmlal.s16 q15, d17, d22
+ vrshrn.i32 d24, q12, #9
+ vrshrn.i32 d25, q13, #9
+ vrshrn.i32 d26, q14, #9
+ vrshrn.i32 d27, q15, #9
+ subs r4, r4, #2
+ vst1.16 {q12}, [r0, :128], r1
+ vst1.16 {q13}, [r6, :128], r1
+ bgt 8b
+ pop {r4-r10, pc}
+160:
+320:
+640:
+ add lr, r2, r3, lsl #1
+ sub r2, r2, #4
+ mov r7, #-4
+ vld1.16 {d6[], d7[]}, [lr] // right
+ sub r1, r1, r3, lsl #1
+ mov r9, r3
+ vadd.i16 d3, d4, d6 // bottom+right
+
+1:
+ vld2.16 {d0[], d1[]}, [r2, :32], r7 // left
+ vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver
+ vsub.i16 q0, q0, q3 // left-right
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+2:
+ vld1.8 {d18}, [r10, :64]! // weights_hor
+ vld1.16 {q8}, [r8]! // top
+ vshll.u16 q12, d3, #8 // (bottom+right)*256
+ vshll.u16 q13, d3, #8
+ vmovl.u8 q9, d18 // weights_hor
+ vshll.u16 q14, d3, #8
+ vshll.u16 q15, d3, #8
+ vsub.i16 q8, q8, q2 // top-bottom
+ vmlal.s16 q12, d1, d18 // += (left-right)*weights_hor
+ vmlal.s16 q13, d1, d19 // (left flipped)
+ vmlal.s16 q14, d0, d18
+ vmlal.s16 q15, d0, d19
+ vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver
+ vmlal.s16 q13, d17, d20
+ vmlal.s16 q14, d16, d22
+ vmlal.s16 q15, d17, d22
+ vrshrn.i32 d24, q12, #9
+ vrshrn.i32 d25, q13, #9
+ vrshrn.i32 d26, q14, #9
+ vrshrn.i32 d27, q15, #9
+ subs r3, r3, #8
+ vst1.16 {q12}, [r0, :128]!
+ vst1.16 {q13}, [r6, :128]!
+ bgt 2b
+ subs r4, r4, #2
+ ble 9f
+ sub r8, r8, r9, lsl #1
+ sub r10, r10, r9
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, r9
+ b 1b
+9:
+ pop {r4-r10, pc}
+endfunc
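+
+// Hedged scalar sketch of the smooth predictor above; here bottom is
+// topleft[-height], right is topleft[width], and weights_hor/weights_ver are
+// the per-size vectors from the sm_weights table (the code indexes the table
+// by width and height respectively). The NEON code evaluates the same
+// expression rearranged as (bottom+right)*256 + (left-right)*w_hor +
+// (top-bottom)*w_ver, with a rounding shift by 9:
+//
+//     for (int y = 0; y < height; y++) {
+//         for (int x = 0; x < width; x++) {
+//             const int pred = weights_ver[y]         * topleft[1 + x]
+//                            + (256 - weights_ver[y]) * bottom
+//                            + weights_hor[x]         * topleft[-(1 + y)]
+//                            + (256 - weights_hor[x]) * right;
+//             dst[x] = (pred + 256) >> 9;
+//         }
+//         dst = (pixel *)((char *)dst + stride);
+//     }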
+
+// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_v_16bpc_neon, export=1
+ push {r4-r7, lr}
+ ldr r4, [sp, #20]
+ movrel r7, X(sm_weights)
+ add r7, r7, r4
+ clz lr, r3
+ adr r5, L(ipred_smooth_v_tbl)
+ sub r12, r2, r4, lsl #1
+ sub lr, lr, #25
+ ldr lr, [r5, lr, lsl #2]
+ vld1.16 {d4[], d5[]}, [r12] // bottom
+ add r2, r2, #2
+ add r5, r5, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_smooth_v_tbl):
+ .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+
+40:
+ vld1.16 {d6}, [r2] // top
+ vsub.i16 d6, d6, d4 // top-bottom
+ vmov d7, d6
+4:
+ vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! // weights_ver
+ vzip.32 d16, d17 // weights_ver
+ vzip.32 d18, d19
+ vshll.u8 q8, d16, #7 // weights_ver << 7
+ vshll.u8 q9, d18, #7
+ vqrdmulh.s16 q10, q3, q8 // ((top-bottom)*weights_ver + 128) >> 8
+ vqrdmulh.s16 q11, q3, q9
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q2
+ vst1.16 {d20}, [r0, :64], r1
+ vst1.16 {d21}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d22}, [r0, :64], r1
+ vst1.16 {d23}, [r6, :64], r1
+ bgt 4b
+ pop {r4-r7, pc}
+80:
+ vld1.16 {q3}, [r2] // top
+ vsub.i16 q3, q3, q2 // top-bottom
+8:
+ vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver
+ vshll.u8 q8, d16, #7 // weights_ver << 7
+ vshll.u8 q9, d18, #7
+ vshll.u8 q10, d20, #7
+ vshll.u8 q11, d22, #7
+ vqrdmulh.s16 q8, q3, q8 // ((top-bottom)*weights_ver + 128) >> 8
+ vqrdmulh.s16 q9, q3, q9
+ vqrdmulh.s16 q10, q3, q10
+ vqrdmulh.s16 q11, q3, q11
+ vadd.i16 q8, q8, q2
+ vadd.i16 q9, q9, q2
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q2
+ vst1.16 {q8}, [r0, :128], r1
+ vst1.16 {q9}, [r6, :128], r1
+ subs r4, r4, #4
+ vst1.16 {q10}, [r0, :128], r1
+ vst1.16 {q11}, [r6, :128], r1
+ bgt 8b
+ pop {r4-r7, pc}
+160:
+320:
+640:
+ vpush {q4-q7}
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1
+ add lr, r6, r1
+ lsl r1, r1, #1
+ sub r1, r1, r3, lsl #1
+ mov r12, r3
+
+1:
+ vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver
+ vshll.u8 q4, d8, #7 // weights_ver << 7
+ vshll.u8 q5, d10, #7
+ vshll.u8 q6, d12, #7
+ vshll.u8 q7, d14, #7
+2:
+ vld1.16 {q0, q1}, [r2]! // top
+ vsub.i16 q0, q0, q2 // top-bottom
+ vsub.i16 q1, q1, q2
+ vqrdmulh.s16 q8, q0, q4 // ((top-bottom)*weights_ver + 128) >> 8
+ vqrdmulh.s16 q9, q1, q4
+ vqrdmulh.s16 q10, q0, q5
+ vqrdmulh.s16 q11, q1, q5
+ vqrdmulh.s16 q12, q0, q6
+ vqrdmulh.s16 q13, q1, q6
+ vqrdmulh.s16 q14, q0, q7
+ vqrdmulh.s16 q15, q1, q7
+ vadd.i16 q8, q8, q2
+ vadd.i16 q9, q9, q2
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q2
+ vadd.i16 q12, q12, q2
+ vadd.i16 q13, q13, q2
+ vadd.i16 q14, q14, q2
+ vadd.i16 q15, q15, q2
+ subs r3, r3, #16
+ vst1.16 {q8, q9}, [r0, :128]!
+ vst1.16 {q10, q11}, [r6, :128]!
+ vst1.16 {q12, q13}, [r5, :128]!
+ vst1.16 {q14, q15}, [lr, :128]!
+ bgt 2b
+ subs r4, r4, #4
+ ble 9f
+ sub r2, r2, r12, lsl #1
+ add r0, r0, r1
+ add r6, r6, r1
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r7, pc}
+endfunc
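+
+// The vertical-only variant blends each top pixel with the fixed bottom
+// sample. A hedged sketch of the per-pixel math; the vqrdmulh on
+// (weight << 7) above produces the same ((top-bottom)*weight + 128) >> 8
+// rounding as the plain expression, and the _h variant below mirrors this
+// with the left/right neighbours:
+//
+//     for (int y = 0; y < height; y++) {
+//         const int w = weights_ver[y];            // sm_weights entry
+//         for (int x = 0; x < width; x++) {
+//             const int top = topleft[1 + x];
+//             dst[x] = bottom + (((top - bottom) * w + 128) >> 8);
+//         }
+//         dst = (pixel *)((char *)dst + stride);
+//     }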
+
+// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_h_16bpc_neon, export=1
+ push {r4-r8, lr}
+ ldr r4, [sp, #24]
+ movrel r8, X(sm_weights)
+ add r8, r8, r3
+ clz lr, r3
+ adr r5, L(ipred_smooth_h_tbl)
+ add r12, r2, r3, lsl #1
+ sub lr, lr, #25
+ ldr lr, [r5, lr, lsl #2]
+ vld1.16 {d4[], d5[]}, [r12] // right
+ add r5, r5, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_smooth_h_tbl):
+ .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d6[]}, [r8, :32] // weights_hor
+ sub r2, r2, #8
+ mov r7, #-8
+ vshll.u8 q3, d6, #7 // weights_hor << 7
+4:
+ vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r7 // left
+ vsub.i16 q0, q0, q2 // left-right
+ vsub.i16 q1, q1, q2
+ subs r4, r4, #4
+ vqrdmulh.s16 q8, q1, q3 // ((left-right)*weights_hor + 128) >> 8
+ vqrdmulh.s16 q9, q0, q3 // (left flipped)
+ vadd.i16 q8, q8, q2
+ vadd.i16 q9, q9, q2
+ vst1.16 {d17}, [r0, :64], r1
+ vst1.16 {d16}, [r6, :64], r1
+ vst1.16 {d19}, [r0, :64], r1
+ vst1.16 {d18}, [r6, :64], r1
+ bgt 4b
+ pop {r4-r8, pc}
+80:
+ vld1.8 {d6}, [r8, :64] // weights_hor
+ sub r2, r2, #8
+ mov r7, #-8
+ vshll.u8 q3, d6, #7 // weights_hor << 7
+8:
+ vld1.16 {d23}, [r2, :64], r7 // left
+ subs r4, r4, #4
+ vsub.i16 d23, d23, d4 // left-right
+ vdup.16 q8, d23[3] // flip left
+ vdup.16 q9, d23[2]
+ vdup.16 q10, d23[1]
+ vdup.16 q11, d23[0]
+ vqrdmulh.s16 q8, q8, q3 // ((left-right)*weights_hor + 128) >> 8
+ vqrdmulh.s16 q9, q9, q3
+ vqrdmulh.s16 q10, q10, q3
+ vqrdmulh.s16 q11, q11, q3
+ vadd.i16 q8, q8, q2
+ vadd.i16 q9, q9, q2
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q2
+ vst1.16 {q8}, [r0, :128], r1
+ vst1.16 {q9}, [r6, :128], r1
+ vst1.16 {q10}, [r0, :128], r1
+ vst1.16 {q11}, [r6, :128], r1
+ bgt 8b
+ pop {r4-r8, pc}
+160:
+320:
+640:
+ vpush {q4-q7}
+ sub r2, r2, #8
+ mov r7, #-8
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1
+ add lr, r6, r1
+ lsl r1, r1, #1
+ sub r1, r1, r3, lsl #1
+ mov r12, r3
+
+1:
+ vld1.16 {d15}, [r2, :64], r7 // left
+ vsub.i16 d15, d15, d4 // left-right
+ vdup.16 q4, d15[3] // flip left
+ vdup.16 q5, d15[2]
+ vdup.16 q6, d15[1]
+ vdup.16 q7, d15[0]
+2:
+ vld1.8 {q1}, [r8, :128]! // weights_hor
+ subs r3, r3, #16
+ vshll.u8 q0, d2, #7 // weights_hor << 7
+ vshll.u8 q1, d3, #7
+ vqrdmulh.s16 q8, q0, q4 // ((left-right)*weights_hor + 128) >> 8
+ vqrdmulh.s16 q9, q1, q4
+ vqrdmulh.s16 q10, q0, q5
+ vqrdmulh.s16 q11, q1, q5
+ vqrdmulh.s16 q12, q0, q6
+ vqrdmulh.s16 q13, q1, q6
+ vqrdmulh.s16 q14, q0, q7
+ vqrdmulh.s16 q15, q1, q7
+ vadd.i16 q8, q8, q2
+ vadd.i16 q9, q9, q2
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q2
+ vadd.i16 q12, q12, q2
+ vadd.i16 q13, q13, q2
+ vadd.i16 q14, q14, q2
+ vadd.i16 q15, q15, q2
+ vst1.16 {q8, q9}, [r0, :128]!
+ vst1.16 {q10, q11}, [r6, :128]!
+ vst1.16 {q12, q13}, [r5, :128]!
+ vst1.16 {q14, q15}, [lr, :128]!
+ bgt 2b
+ subs r4, r4, #4
+ ble 9f
+ sub r8, r8, r12
+ add r0, r0, r1
+ add r6, r6, r1
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+endfunc
+
+// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int filt_idx,
+// const int max_width, const int max_height,
+// const int bitdepth_max);
+.macro filter_fn bpc
+function ipred_filter_\bpc\()bpc_neon, export=1
+ movw r12, #511
+ ldrd r4, r5, [sp, #88]
+ and r5, r5, r12 // 511
+ movrel r6, X(filter_intra_taps)
+ lsl r5, r5, #6
+ add r6, r6, r5
+ vld1.8 {d20, d21, d22, d23}, [r6, :128]!
+ clz lr, r3
+ adr r5, L(ipred_filter\bpc\()_tbl)
+ vld1.8 {d27, d28, d29}, [r6, :64]
+ sub lr, lr, #26
+ ldr lr, [r5, lr, lsl #2]
+ vmovl.s8 q8, d20
+ vmovl.s8 q9, d21
+ add r5, r5, lr
+ vmovl.s8 q10, d22
+ vmovl.s8 q11, d23
+ add r6, r0, r1
+ lsl r1, r1, #1
+ vmovl.s8 q12, d27
+ vmovl.s8 q13, d28
+ vmovl.s8 q14, d29
+ mov r7, #-4
+ vdup.16 q15, r8
+ add r8, r2, #2
+ sub r2, r2, #4
+.if \bpc == 10
+ vmov.i16 q7, #0
+.endif
+ bx r5
+
+ .align 2
+L(ipred_filter\bpc\()_tbl):
+ .word 320f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB
+
+40:
+ vld1.16 {d0}, [r8] // top (0-3)
+4:
+ vld1.16 {d2}, [r2], r7 // left (0-1) + topleft (2)
+.if \bpc == 10
+ vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1)
+ vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2)
+ vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3)
+ vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4)
+ vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0)
+ vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5)
+ vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6)
+ vrshr.s16 q2, q2, #4
+ vmax.s16 q2, q2, q7
+.else
+ vmull.s16 q2, d18, d0[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q2, d20, d0[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q2, d22, d0[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q2, d24, d0[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q2, d16, d2[2] // p0(topleft) * filter(0)
+ vmlal.s16 q2, d26, d2[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q2, d28, d2[0] // p6(left[1]) * filter(6)
+ vmull.s16 q3, d19, d0[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q3, d21, d0[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q3, d23, d0[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q3, d25, d0[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q3, d17, d2[2] // p0(topleft) * filter(0)
+ vmlal.s16 q3, d27, d2[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q3, d29, d2[0] // p6(left[1]) * filter(6)
+ vqrshrun.s32 d4, q2, #4
+ vqrshrun.s32 d5, q3, #4
+.endif
+ vmin.s16 q2, q2, q15
+ subs r4, r4, #2
+ vst1.16 {d4}, [r0, :64], r1
+ vst1.16 {d5}, [r6, :64], r1
+ vmov d0, d5 // move top from [4-7] to [0-3]
+ bgt 4b
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+80:
+ vld1.16 {q0}, [r8] // top (0-7)
+8:
+ vld1.16 {d2}, [r2], r7 // left (0-1) + topleft (2)
+.if \bpc == 10
+ vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1)
+ vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2)
+ vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3)
+ vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4)
+ vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0)
+ vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5)
+ vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6)
+ vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1)
+ vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2)
+ vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3)
+ vrshr.s16 q2, q2, #4
+ vmax.s16 q2, q2, q7
+ vmin.s16 q2, q2, q15
+ vmla.i16 q3, q12, d1[3] // p4(top[3]) * filter(4)
+ vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0)
+ vmla.i16 q3, q13, d4[3] // p5(left[0]) * filter(5)
+ vmla.i16 q3, q14, d5[3] // p6(left[1]) * filter(6)
+ vrshr.s16 q3, q3, #4
+ vmax.s16 q3, q3, q7
+.else
+ vmull.s16 q2, d18, d0[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q2, d20, d0[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q2, d22, d0[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q2, d24, d0[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q2, d16, d2[2] // p0(topleft) * filter(0)
+ vmlal.s16 q2, d26, d2[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q2, d28, d2[0] // p6(left[1]) * filter(6)
+ vmull.s16 q3, d19, d0[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q3, d21, d0[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q3, d23, d0[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q3, d25, d0[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q3, d17, d2[2] // p0(topleft) * filter(0)
+ vmlal.s16 q3, d27, d2[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q3, d29, d2[0] // p6(left[1]) * filter(6)
+ vqrshrun.s32 d4, q2, #4
+ vmull.s16 q4, d18, d1[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q4, d20, d1[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q4, d22, d1[2] // p3(top[2]) * filter(3)
+ vqrshrun.s32 d5, q3, #4
+ vmin.s16 q2, q2, q15
+ vmlal.s16 q4, d24, d1[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q4, d16, d0[3] // p0(topleft) * filter(0)
+ vmlal.s16 q4, d26, d4[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q4, d28, d5[3] // p6(left[1]) * filter(6)
+ vmull.s16 q5, d19, d1[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q5, d21, d1[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q5, d23, d1[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q5, d25, d1[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q5, d17, d0[3] // p0(topleft) * filter(0)
+ vmlal.s16 q5, d27, d4[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q5, d29, d5[3] // p6(left[1]) * filter(6)
+ vqrshrun.s32 d6, q4, #4
+ vqrshrun.s32 d7, q5, #4
+.endif
+ vmin.s16 q3, q3, q15
+ vswp d5, d6
+ subs r4, r4, #2
+ vst1.16 {q2}, [r0, :128], r1
+ vmov q0, q3
+ vst1.16 {q3}, [r6, :128], r1
+ bgt 8b
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+160:
+320:
+ sub r1, r1, r3, lsl #1
+ mov lr, r3
+
+1:
+ vld1.16 {d0}, [r2], r7 // left (0-1) + topleft (2)
+2:
+ vld1.16 {q1, q2}, [r8]! // top(0-15)
+.if \bpc == 10
+ vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0)
+ vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5)
+ vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6)
+ vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1)
+ vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2)
+ vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3)
+ vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4)
+
+ vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1)
+ vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2)
+ vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3)
+ vrshr.s16 q3, q3, #4
+ vmax.s16 q3, q3, q7
+ vmin.s16 q3, q3, q15
+ vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4)
+ vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0)
+ vmla.i16 q4, q13, d6[3] // p5(left[0]) * filter(5)
+ vmla.i16 q4, q14, d7[3] // p6(left[1]) * filter(6)
+
+ vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1)
+ vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2)
+ vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3)
+ vrshr.s16 q4, q4, #4
+ vmax.s16 q4, q4, q7
+ vmin.s16 q4, q4, q15
+ vmov q0, q4
+ vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4)
+ vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0)
+ vmla.i16 q5, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6)
+
+ vmul.i16 q6, q9, d5[0] // p1(top[0]) * filter(1)
+ vmla.i16 q6, q10, d5[1] // p2(top[1]) * filter(2)
+ vmla.i16 q6, q11, d5[2] // p3(top[2]) * filter(3)
+ vrshr.s16 q5, q5, #4
+ vmax.s16 q5, q5, q7
+ vmin.s16 q5, q5, q15
+ vmov q0, q5
+ vmov.u16 r12, d5[3]
+ vmla.i16 q6, q12, d5[3] // p4(top[3]) * filter(4)
+ vmla.i16 q6, q8, d4[3] // p0(topleft) * filter(0)
+ vmla.i16 q6, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q6, q14, d1[3] // p6(left[1]) * filter(6)
+ vmov.16 d0[2], r12
+ subs r3, r3, #16
+ vrshr.s16 q6, q6, #4
+.else
+ vmull.s16 q3, d16, d0[2] // p0(topleft) * filter(0)
+ vmlal.s16 q3, d26, d0[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q3, d28, d0[0] // p6(left[1]) * filter(6)
+ vmlal.s16 q3, d18, d2[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q3, d20, d2[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q3, d22, d2[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q3, d24, d2[3] // p4(top[3]) * filter(4)
+ vmull.s16 q4, d17, d0[2] // p0(topleft) * filter(0)
+ vmlal.s16 q4, d27, d0[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q4, d29, d0[0] // p6(left[1]) * filter(6)
+ vmlal.s16 q4, d19, d2[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q4, d21, d2[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q4, d23, d2[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q4, d25, d2[3] // p4(top[3]) * filter(4)
+ vqrshrun.s32 d6, q3, #4
+ vmull.s16 q5, d18, d3[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q5, d20, d3[1] // p2(top[1]) * filter(2)
+ vqrshrun.s32 d7, q4, #4
+ vmin.s16 q3, q3, q15
+ vmlal.s16 q5, d22, d3[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q5, d24, d3[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q5, d16, d2[3] // p0(topleft) * filter(0)
+ vmlal.s16 q5, d26, d6[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q5, d28, d7[3] // p6(left[1]) * filter(6)
+ vmull.s16 q6, d19, d3[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q6, d21, d3[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q6, d23, d3[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q6, d25, d3[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q6, d17, d2[3] // p0(topleft) * filter(0)
+ vmlal.s16 q6, d27, d6[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q6, d29, d7[3] // p6(left[1]) * filter(6)
+ vqrshrun.s32 d8, q5, #4
+ vmull.s16 q7, d18, d4[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q7, d20, d4[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q7, d22, d4[2] // p3(top[2]) * filter(3)
+ vqrshrun.s32 d9, q6, #4
+ vmin.s16 q0, q4, q15
+ vmlal.s16 q7, d24, d4[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q7, d16, d3[3] // p0(topleft) * filter(0)
+ vmlal.s16 q7, d26, d0[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q7, d28, d1[3] // p6(left[1]) * filter(6)
+ vmin.s16 q4, q4, q15
+ vmull.s16 q6, d19, d4[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q6, d21, d4[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q6, d23, d4[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q6, d25, d4[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q6, d17, d3[3] // p0(topleft) * filter(0)
+ vmlal.s16 q6, d27, d0[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q6, d29, d1[3] // p6(left[1]) * filter(6)
+ vqrshrun.s32 d10, q7, #4
+ vmull.s16 q1, d18, d5[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q1, d20, d5[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q1, d22, d5[2] // p3(top[2]) * filter(3)
+ vqrshrun.s32 d11, q6, #4
+ vmin.s16 q0, q5, q15
+ vmlal.s16 q1, d24, d5[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q1, d16, d4[3] // p0(topleft) * filter(0)
+ vmlal.s16 q1, d26, d0[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q1, d28, d1[3] // p6(left[1]) * filter(6)
+ vmin.s16 q5, q5, q15
+ vmov.u16 r12, d5[3]
+ vmull.s16 q7, d19, d5[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q7, d21, d5[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q7, d23, d5[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q7, d25, d5[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q7, d17, d4[3] // p0(topleft) * filter(0)
+ vmlal.s16 q7, d27, d0[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q7, d29, d1[3] // p6(left[1]) * filter(6)
+ vmov.16 d0[2], r12
+ vqrshrun.s32 d12, q1, #4
+ subs r3, r3, #16
+ vqrshrun.s32 d13, q7, #4
+.endif
+ vswp q4, q5
+.if \bpc == 10
+ vmax.s16 q6, q6, q7
+.endif
+ vswp d7, d10
+ vmin.s16 q6, q6, q15
+
+ vswp d9, d12
+
+ vst1.16 {q3, q4}, [r0, :128]!
+ vst1.16 {q5, q6}, [r6, :128]!
+ ble 8f
+ vmov.u16 r12, d13[3]
+ vmov.16 d0[0], r12
+ vmov.u16 r12, d9[3]
+ vmov.16 d0[1], r12
+ b 2b
+8:
+ subs r4, r4, #2
+
+ ble 9f
+ sub r8, r6, lr, lsl #1
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, lr
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+endfunc
+.endm
+
+filter_fn 10
+filter_fn 12
+
+function ipred_filter_16bpc_neon, export=1
+ push {r4-r8, lr}
+ vpush {q4-q7}
+ movw r12, #0x3ff
+ ldr r8, [sp, #104]
+ cmp r8, r12
+ ble ipred_filter_10bpc_neon
+ b ipred_filter_12bpc_neon
+endfunc
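+
+// Hedged sketch of the filter-intra kernel implemented by the macro above;
+// the tap/source mapping follows the p0..p6 and filter(0)..filter(6)
+// comments in the code, and the exact filter_intra_taps layout is not
+// restated here. The block is processed in 4x2 sub-blocks: p0 is the
+// sub-block's top-left neighbour, p1..p4 the four pixels above it, p5..p6
+// the two pixels to its left, and each of the 8 outputs has its own 7 taps:
+//
+//     for (int i = 0; i < 8; i++) {        // 8 outputs, raster order in 4x2
+//         int acc = 0;
+//         for (int j = 0; j < 7; j++)
+//             acc += tap[j][i] * p[j];     // tap[][] indexing is an assumption
+//         const int v = (acc + 8) >> 4;    // rounding shift by 4
+//         out[i] = v < 0 ? 0 : v > bitdepth_max ? bitdepth_max : v;
+//     }
+//
+// Each finished sub-block then supplies the left/topleft inputs for the
+// sub-blocks below and to its right, which is why the code carries lanes
+// such as d4[3]/d5[3] forward between iterations.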
+
+// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const pal, const uint8_t *idx,
+// const int w, const int h);
+function pal_pred_16bpc_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #16]
+ vld1.16 {q14}, [r2, :128]
+ clz lr, r4
+ adr r12, L(pal_pred_tbl)
+ sub lr, lr, #25
+ vmov.i8 q13, #7
+ ldr lr, [r12, lr, lsl #2]
+ vmov.i16 q15, #0x100
+ add r12, r12, lr
+ add r2, r0, r1
+ bx r12
+
+ .align 2
+L(pal_pred_tbl):
+ .word 640f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 320f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 160f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 80f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 40f - L(pal_pred_tbl) + CONFIG_THUMB
+
+40:
+ lsl r1, r1, #1
+4:
+ vld1.8 {d2}, [r3, :64]!
+ subs r5, r5, #4
+ vshr.u8 d3, d2, #4
+ vand.u8 d2, d2, d26
+ vzip.8 d2, d3
+ // Restructure q1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
+ vadd.i8 q0, q1, q1
+ vadd.i8 q1, q1, q1
+ vzip.8 q0, q1
+ vadd.i16 q0, q0, q15
+ vadd.i16 q1, q1, q15
+ vtbl.8 d0, {q14}, d0
+ vtbl.8 d1, {q14}, d1
+ vst1.16 {d0}, [r0, :64], r1
+ vtbl.8 d2, {q14}, d2
+ vst1.16 {d1}, [r2, :64], r1
+ vtbl.8 d3, {q14}, d3
+ vst1.16 {d2}, [r0, :64], r1
+ vst1.16 {d3}, [r2, :64], r1
+ bgt 4b
+ pop {r4-r5, pc}
+80:
+ lsl r1, r1, #1
+8:
+ vld1.8 {q1}, [r3, :64]!
+ subs r5, r5, #4
+ vshr.u8 q2, q1, #4
+ vand.u8 q1, q1, q13
+ vzip.8 q1, q2
+ // Prefer doing the adds twice, instead of chaining a vmov after
+ // the add.
+ vadd.i8 q0, q1, q1
+ vadd.i8 q1, q1, q1
+ vadd.i8 q3, q2, q2
+ vadd.i8 q2, q2, q2
+ vzip.8 q0, q1
+ vzip.8 q2, q3
+ vadd.i16 q0, q0, q15
+ vadd.i16 q1, q1, q15
+ vtbl.8 d0, {q14}, d0
+ vadd.i16 q2, q2, q15
+ vtbl.8 d1, {q14}, d1
+ vadd.i16 q3, q3, q15
+ vtbl.8 d2, {q14}, d2
+ vtbl.8 d3, {q14}, d3
+ vtbl.8 d4, {q14}, d4
+ vtbl.8 d5, {q14}, d5
+ vst1.16 {q0}, [r0, :128], r1
+ vtbl.8 d6, {q14}, d6
+ vst1.16 {q1}, [r2, :128], r1
+ vtbl.8 d7, {q14}, d7
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r2, :128], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160:
+ lsl r1, r1, #1
+16:
+ vld1.8 {q10, q11}, [r3, :64]!
+ subs r5, r5, #4
+ vand.u8 q2, q10, q13
+ vshr.u8 q3, q10, #4
+ vand.u8 q10, q11, q13
+ vshr.u8 q11, q11, #4
+ vzip.8 q2, q3
+ vzip.8 q10, q11
+ vadd.i8 q0, q2, q2
+ vadd.i8 q1, q2, q2
+ vadd.i8 q2, q3, q3
+ vadd.i8 q3, q3, q3
+ vadd.i8 q8, q10, q10
+ vadd.i8 q9, q10, q10
+ vadd.i8 q10, q11, q11
+ vzip.8 q0, q1
+ vadd.i8 q11, q11, q11
+ vzip.8 q2, q3
+ vzip.8 q8, q9
+ vadd.i16 q0, q0, q15
+ vzip.8 q10, q11
+ vadd.i16 q1, q1, q15
+ vadd.i16 q2, q2, q15
+ vadd.i16 q3, q3, q15
+ vadd.i16 q8, q8, q15
+ vadd.i16 q9, q9, q15
+ vadd.i16 q10, q10, q15
+ vtbl.8 d0, {q14}, d0
+ vadd.i16 q11, q11, q15
+ vtbl.8 d1, {q14}, d1
+ vtbl.8 d2, {q14}, d2
+ vtbl.8 d3, {q14}, d3
+ vtbl.8 d4, {q14}, d4
+ vtbl.8 d5, {q14}, d5
+ vtbl.8 d6, {q14}, d6
+ vtbl.8 d7, {q14}, d7
+ vtbl.8 d16, {q14}, d16
+ vtbl.8 d17, {q14}, d17
+ vtbl.8 d18, {q14}, d18
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vtbl.8 d19, {q14}, d19
+ vtbl.8 d20, {q14}, d20
+ vst1.16 {q2, q3}, [r2, :128], r1
+ vtbl.8 d21, {q14}, d21
+ vtbl.8 d22, {q14}, d22
+ vst1.16 {q8, q9}, [r0, :128], r1
+ vtbl.8 d23, {q14}, d23
+ vst1.16 {q10, q11}, [r2, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320:
+ lsl r1, r1, #1
+ sub r1, r1, #32
+32:
+ vld1.8 {q10, q11}, [r3, :64]!
+ subs r5, r5, #2
+ vand.u8 q2, q10, q13
+ vshr.u8 q3, q10, #4
+ vand.u8 q10, q11, q13
+ vshr.u8 q11, q11, #4
+ vzip.8 q2, q3
+ vzip.8 q10, q11
+ vadd.i8 q0, q2, q2
+ vadd.i8 q1, q2, q2
+ vadd.i8 q2, q3, q3
+ vadd.i8 q3, q3, q3
+ vadd.i8 q8, q10, q10
+ vadd.i8 q9, q10, q10
+ vadd.i8 q10, q11, q11
+ vzip.8 q0, q1
+ vadd.i8 q11, q11, q11
+ vzip.8 q2, q3
+ vzip.8 q8, q9
+ vadd.i16 q0, q0, q15
+ vzip.8 q10, q11
+ vadd.i16 q1, q1, q15
+ vadd.i16 q2, q2, q15
+ vadd.i16 q3, q3, q15
+ vadd.i16 q8, q8, q15
+ vadd.i16 q9, q9, q15
+ vadd.i16 q10, q10, q15
+ vtbl.8 d0, {q14}, d0
+ vadd.i16 q11, q11, q15
+ vtbl.8 d1, {q14}, d1
+ vtbl.8 d2, {q14}, d2
+ vtbl.8 d3, {q14}, d3
+ vtbl.8 d4, {q14}, d4
+ vtbl.8 d5, {q14}, d5
+ vtbl.8 d6, {q14}, d6
+ vtbl.8 d7, {q14}, d7
+ vtbl.8 d16, {q14}, d16
+ vtbl.8 d17, {q14}, d17
+ vtbl.8 d18, {q14}, d18
+ vst1.16 {q0, q1}, [r0, :128]!
+ vtbl.8 d19, {q14}, d19
+ vtbl.8 d20, {q14}, d20
+ vst1.16 {q2, q3}, [r0, :128], r1
+ vtbl.8 d21, {q14}, d21
+ vtbl.8 d22, {q14}, d22
+ vst1.16 {q8, q9}, [r2, :128]!
+ vtbl.8 d23, {q14}, d23
+ vst1.16 {q10, q11}, [r2, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640:
+ sub r1, r1, #96
+64:
+ vld1.8 {q10, q11}, [r3, :64]!
+ subs r5, r5, #1
+ vand.u8 q2, q10, q13
+ vshr.u8 q3, q10, #4
+ vand.u8 q10, q11, q13
+ vshr.u8 q11, q11, #4
+ vzip.8 q2, q3
+ vzip.8 q10, q11
+ vadd.i8 q0, q2, q2
+ vadd.i8 q1, q2, q2
+ vadd.i8 q2, q3, q3
+ vadd.i8 q3, q3, q3
+ vadd.i8 q8, q10, q10
+ vadd.i8 q9, q10, q10
+ vadd.i8 q10, q11, q11
+ vzip.8 q0, q1
+ vadd.i8 q11, q11, q11
+ vzip.8 q2, q3
+ vzip.8 q8, q9
+ vadd.i16 q0, q0, q15
+ vzip.8 q10, q11
+ vadd.i16 q1, q1, q15
+ vadd.i16 q2, q2, q15
+ vadd.i16 q3, q3, q15
+ vadd.i16 q8, q8, q15
+ vadd.i16 q9, q9, q15
+ vadd.i16 q10, q10, q15
+ vtbl.8 d0, {q14}, d0
+ vadd.i16 q11, q11, q15
+ vtbl.8 d1, {q14}, d1
+ vtbl.8 d2, {q14}, d2
+ vtbl.8 d3, {q14}, d3
+ vtbl.8 d4, {q14}, d4
+ vtbl.8 d5, {q14}, d5
+ vtbl.8 d6, {q14}, d6
+ vtbl.8 d7, {q14}, d7
+ vtbl.8 d16, {q14}, d16
+ vtbl.8 d17, {q14}, d17
+ vtbl.8 d18, {q14}, d18
+ vst1.16 {q0, q1}, [r0, :128]!
+ vtbl.8 d19, {q14}, d19
+ vtbl.8 d20, {q14}, d20
+ vst1.16 {q2, q3}, [r0, :128]!
+ vtbl.8 d21, {q14}, d21
+ vtbl.8 d22, {q14}, d22
+ vst1.16 {q8, q9}, [r0, :128]!
+ vtbl.8 d23, {q14}, d23
+ vst1.16 {q10, q11}, [r0, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
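+
+// Hedged scalar sketch of pal_pred above; idx packs two palette indices per
+// byte (values 0..7, low nibble first) and pixel is assumed uint16_t. The
+// NEON code does the lookup with vtbl by turning each index into the two
+// byte offsets 2*i and 2*i+1 of the 16-bit palette entry:
+//
+//     static void pal_pred_sketch(pixel *dst, ptrdiff_t stride,
+//                                 const pixel *pal, const uint8_t *idx,
+//                                 int w, int h)
+//     {
+//         for (int y = 0; y < h; y++) {
+//             for (int x = 0; x < w; x += 2) {
+//                 const uint8_t b = *idx++;
+//                 dst[x + 0] = pal[b & 7];
+//                 dst[x + 1] = pal[b >> 4];
+//             }
+//             dst = (pixel *)((char *)dst + stride);
+//         }
+//     }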
+
+// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+function ipred_cfl_128_16bpc_neon, export=1
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #32]
+ clz lr, r3
+ vdup.16 q15, r7 // bitdepth_max
+ adr r12, L(ipred_cfl_128_tbl)
+ sub lr, lr, #26
+ ldr lr, [r12, lr, lsl #2]
+ vrshr.u16 q0, q15, #1
+ vdup.16 q1, r6 // alpha
+ add r12, r12, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ vmov.i16 q14, #0
+ bx r12
+
+ .align 2
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+ .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_splat_w4):
+ vld1.16 {q8, q9}, [r5, :128]!
+ vmull.s16 q2, d16, d2 // diff = ac * alpha
+ vmull.s16 q3, d17, d3
+ vmull.s16 q8, d18, d2
+ vmull.s16 q9, d19, d3
+ vshr.s32 q10, q2, #31 // sign = diff >> 31
+ vshr.s32 q11, q3, #31
+ vshr.s32 q12, q8, #31
+ vshr.s32 q13, q9, #31
+ vadd.i32 q2, q2, q10 // diff + sign
+ vadd.i32 q3, q3, q11
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q8, #6
+ vrshrn.i32 d7, q9, #6
+ vadd.i16 q2, q2, q0 // dc + apply_sign()
+ vadd.i16 q3, q3, q0
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q2, q2, q15
+ vmin.s16 q3, q3, q15
+ vst1.16 {d4}, [r0, :64], r1
+ vst1.16 {d5}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d6}, [r0, :64], r1
+ vst1.16 {d7}, [r6, :64], r1
+ bgt L(ipred_cfl_splat_w4)
+ pop {r4-r8, pc}
+L(ipred_cfl_splat_w8):
+ vld1.16 {q8, q9}, [r5, :128]!
+ subs r4, r4, #2
+ vmull.s16 q2, d16, d2 // diff = ac * alpha
+ vmull.s16 q3, d17, d3
+ vmull.s16 q8, d18, d2
+ vmull.s16 q9, d19, d3
+ vshr.s32 q10, q2, #31 // sign = diff >> 31
+ vshr.s32 q11, q3, #31
+ vshr.s32 q12, q8, #31
+ vshr.s32 q13, q9, #31
+ vadd.i32 q2, q2, q10 // diff + sign
+ vadd.i32 q3, q3, q11
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q8, #6
+ vrshrn.i32 d7, q9, #6
+ vadd.i16 q2, q2, q0 // dc + apply_sign()
+ vadd.i16 q3, q3, q0
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q2, q2, q15
+ vmin.s16 q3, q3, q15
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r6, :128], r1
+ bgt L(ipred_cfl_splat_w8)
+ pop {r4-r8, pc}
+L(ipred_cfl_splat_w16):
+ vpush {q4-q7}
+ add r12, r5, r3, lsl #1
+ sub r1, r1, r3, lsl #1
+ mov lr, r3
+1:
+ vld1.16 {q6, q7}, [r5, :128]!
+ vmull.s16 q2, d12, d2 // diff = ac * alpha
+ vld1.16 {q8, q9}, [r12, :128]!
+ vmull.s16 q3, d13, d3
+ vmull.s16 q4, d14, d2
+ vmull.s16 q5, d15, d3
+ vmull.s16 q6, d16, d2
+ vmull.s16 q7, d17, d3
+ vmull.s16 q8, d18, d2
+ vmull.s16 q9, d19, d3
+ vshr.s32 q10, q2, #31 // sign = diff >> 31
+ vshr.s32 q11, q3, #31
+ vshr.s32 q12, q4, #31
+ vshr.s32 q13, q5, #31
+ vadd.i32 q2, q2, q10 // diff + sign
+ vshr.s32 q10, q6, #31
+ vadd.i32 q3, q3, q11
+ vshr.s32 q11, q7, #31
+ vadd.i32 q4, q4, q12
+ vshr.s32 q12, q8, #31
+ vadd.i32 q5, q5, q13
+ vshr.s32 q13, q9, #31
+ vadd.i32 q6, q6, q10
+ vadd.i32 q7, q7, q11
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q4, #6
+ vrshrn.i32 d7, q5, #6
+ vadd.i16 q2, q2, q0 // dc + apply_sign()
+ vrshrn.i32 d8, q6, #6
+ vrshrn.i32 d9, q7, #6
+ vadd.i16 q3, q3, q0
+ vrshrn.i32 d10, q8, #6
+ vrshrn.i32 d11, q9, #6
+ vadd.i16 q4, q4, q0
+ vadd.i16 q5, q5, q0
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmax.s16 q4, q4, q14
+ vmax.s16 q5, q5, q14
+ vmin.s16 q2, q2, q15
+ vmin.s16 q3, q3, q15
+ vmin.s16 q4, q4, q15
+ vmin.s16 q5, q5, q15
+ subs r3, r3, #16
+ vst1.16 {q2, q3}, [r0, :128]!
+ vst1.16 {q4, q5}, [r6, :128]!
+ bgt 1b
+ subs r4, r4, #2
+ add r5, r5, lr, lsl #1
+ add r12, r12, lr, lsl #1
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, lr
+ bgt 1b
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+endfunc
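+
+// The cfl_splat_* labels above implement the chroma-from-luma step shared by
+// all ipred_cfl_* entry points. A hedged scalar sketch (for the _128 variant,
+// dc is half the pixel range, i.e. (bitdepth_max + 1) >> 1):
+//
+//     for (int y = 0; y < height; y++) {
+//         for (int x = 0; x < width; x++) {
+//             const int diff = alpha * ac[x];
+//             const int m    = (abs(diff) + 32) >> 6;
+//             const int v    = dc + (diff < 0 ? -m : m);
+//             dst[x] = v < 0 ? 0 : v > bitdepth_max ? bitdepth_max : v;
+//         }
+//         ac  += width;
+//         dst  = (pixel *)((char *)dst + stride);
+//     }
+//
+// The NEON code reaches the same value branchlessly: sign = diff >> 31, then
+// a rounding narrowing shift of diff + sign by 6.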
+
+// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+function ipred_cfl_top_16bpc_neon, export=1
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #32]
+ clz lr, r3
+ vdup.16 q15, r7 // bitdepth_max
+ adr r12, L(ipred_cfl_top_tbl)
+ sub lr, lr, #26
+ ldr lr, [r12, lr, lsl #2]
+ vdup.16 q1, r6 // alpha
+ add r2, r2, #2
+ add r12, r12, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ vmov.i16 q14, #0
+ bx r12
+
+ .align 2
+L(ipred_cfl_top_tbl):
+ .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+
+4:
+ vld1.16 {d0}, [r2]
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #2
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w4)
+8:
+ vld1.16 {q0}, [r2]
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #3
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w8)
+16:
+ vld1.16 {q2, q3}, [r2]
+ vadd.i16 q0, q2, q3
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #4
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+32:
+ vld1.16 {q8, q9}, [r2]!
+ vld1.16 {q10, q11}, [r2]
+ vadd.i16 q8, q8, q9
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q8, q10
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpaddl.u16 d0, d0
+ vrshrn.i32 d0, q0, #5
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+endfunc
+
+// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+function ipred_cfl_left_16bpc_neon, export=1
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #32]
+ sub r2, r2, r4, lsl #1
+ clz lr, r3
+ clz r8, r4
+ vdup.16 q15, r7 // bitdepth_max
+ adr r12, L(ipred_cfl_splat_tbl)
+ adr r7, L(ipred_cfl_left_tbl)
+ sub lr, lr, #26
+ sub r8, r8, #26
+ ldr lr, [r12, lr, lsl #2]
+ ldr r8, [r7, r8, lsl #2]
+ vdup.16 q1, r6 // alpha
+ add r12, r12, lr
+ add r7, r7, r8
+ add r6, r0, r1
+ lsl r1, r1, #1
+ vmov.i16 q14, #0
+ bx r7
+
+ .align 2
+L(ipred_cfl_left_tbl):
+ .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_left_h4):
+ vld1.16 {d0}, [r2, :64]
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #2
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h8):
+ vld1.16 {q0}, [r2, :128]
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #3
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h16):
+ vld1.16 {q2, q3}, [r2, :128]
+ vadd.i16 q0, q2, q3
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #4
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h32):
+ vld1.16 {q8, q9}, [r2, :128]!
+ vld1.16 {q10, q11}, [r2, :128]
+ vadd.i16 q8, q8, q9
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q8, q10
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpaddl.u16 d0, d0
+ vrshrn.i32 d0, q0, #5
+ vdup.16 q0, d0[0]
+ bx r12
+endfunc
+
+// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+function ipred_cfl_16bpc_neon, export=1
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #32]
+ sub r2, r2, r4, lsl #1
+ add r8, r3, r4 // width + height
+ vdup.16 q1, r6 // alpha
+ clz lr, r3
+ clz r6, r4
+ vdup.32 d16, r8 // width + height
+ vdup.16 q15, r7 // bitdepth_max
+ adr r7, L(ipred_cfl_tbl)
+ rbit r8, r8 // rbit(width + height)
+ sub lr, lr, #22 // 26 leading bits, minus table offset 4
+ sub r6, r6, #26
+ clz r8, r8 // ctz(width + height)
+ ldr lr, [r7, lr, lsl #2]
+ ldr r6, [r7, r6, lsl #2]
+ neg r8, r8 // -ctz(width + height)
+ add r12, r7, lr
+ add r7, r7, r6
+ vshr.u32 d16, d16, #1 // (width + height) >> 1
+ vdup.32 d17, r8 // -ctz(width + height)
+ add r6, r0, r1
+ lsl r1, r1, #1
+ vmov.i16 q14, #0
+ bx r7
+
+ .align 2
+L(ipred_cfl_tbl):
+ .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_h4):
+ vld1.16 {d0}, [r2, :64]!
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2
+ vpaddl.u16 d0, d0
+ bx r12
+L(ipred_cfl_w4):
+ vld1.16 {d1}, [r2]
+ vadd.i32 d0, d0, d16
+ vpadd.i16 d1, d1, d1
+ vpaddl.u16 d1, d1
+ cmp r4, #4
+ vadd.i32 d0, d0, d1
+ vshl.u32 d0, d0, d17
+ beq 1f
+ // h = 8/16
+ cmp r4, #16
+ movw lr, #0x6667
+ movw r8, #0xAAAB
+ it ne
+ movne lr, r8
+ vdup.32 d18, lr
+ vmul.i32 d0, d0, d18
+ vshr.u32 d0, d0, #17
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w4)
+
+L(ipred_cfl_h8):
+ vld1.16 {q0}, [r2, :128]!
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2
+ vpaddl.u16 d0, d0
+ bx r12
+L(ipred_cfl_w8):
+ vld1.16 {q2}, [r2]
+ vadd.i32 d0, d0, d16
+ vadd.i16 d1, d4, d5
+ vpadd.i16 d1, d1, d1
+ vpaddl.u16 d1, d1
+ cmp r4, #8
+ vadd.i32 d0, d0, d1
+ vshl.u32 d0, d0, d17
+ beq 1f
+ // h = 4/16/32
+ cmp r4, #32
+ movw lr, #0x6667
+ movw r8, #0xAAAB
+ it ne
+ movne lr, r8
+ vdup.32 d18, lr
+ vmul.i32 d0, d0, d18
+ vshr.u32 d0, d0, #17
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+ vld1.16 {q2, q3}, [r2, :128]!
+ vadd.i16 q0, q2, q3
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2
+ vpaddl.u16 d0, d0
+ bx r12
+L(ipred_cfl_w16):
+ vld1.16 {q2, q3}, [r2]
+ vadd.i32 d0, d0, d16
+ vadd.i16 q2, q2, q3
+ vadd.i16 d1, d4, d5
+ vpadd.i16 d1, d1, d1
+ vpaddl.u16 d1, d1
+ cmp r4, #16
+ vadd.i32 d0, d0, d1
+ vshl.u32 d0, d0, d17
+ beq 1f
+ // h = 4/8/32/64
+ tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
+ movw lr, #0x6667
+ movw r8, #0xAAAB
+ it ne
+ movne lr, r8
+ vdup.32 d18, lr
+ vmul.i32 d0, d0, d18
+ vshr.u32 d0, d0, #17
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+ vld1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {q10, q11}, [r2, :128]!
+ vadd.i16 q2, q2, q3
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q2, q10
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2
+ vpaddl.u16 d0, d0
+ bx r12
+L(ipred_cfl_w32):
+ vld1.16 {q2, q3}, [r2]!
+ vadd.i32 d0, d0, d16
+ vld1.16 {q10, q11}, [r2]!
+ vadd.i16 q2, q2, q3
+ vadd.i16 q10, q10, q11
+ vadd.i16 q2, q2, q10
+ vadd.i16 d1, d4, d5
+ vpadd.i16 d1, d1, d1
+ vpaddl.u16 d1, d1
+ cmp r4, #32
+ vadd.i32 d0, d0, d1
+ vshl.u32 d0, d0, d17
+ beq 1f
+ // h = 8/16/64
+ cmp r4, #8
+ movw lr, #0x6667
+ movw r8, #0xAAAB
+ it ne
+ movne lr, r8
+ vdup.32 d18, lr
+ vmul.i32 d0, d0, d18
+ vshr.u32 d0, d0, #17
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+endfunc
+
+// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
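+//
+// A hedged sketch of the 4:2:0 AC path that starts here (shape of the
+// computation only, shown without the padded region; not dav1d's C template):
+// each AC sample is the sum of a 2x2 luma block scaled by 2, the right/bottom
+// edges are padded by replication when w_pad/h_pad are nonzero, and the
+// rounded average of the whole cw*ch block is subtracted so the AC values
+// end up zero-mean:
+//
+//     int sum = 0;
+//     for (int y = 0; y < ch; y++) {
+//         const pixel *row0 = ypx;
+//         const pixel *row1 = (const pixel *)((const char *)ypx + stride);
+//         for (int x = 0; x < cw; x++)
+//             sum += ac[y * cw + x] = (row0[2 * x] + row0[2 * x + 1] +
+//                                      row1[2 * x] + row1[2 * x + 1]) << 1;
+//         ypx = (const pixel *)((const char *)ypx + 2 * stride);
+//     }
+//     const int log2sz = ctz(cw) + ctz(ch);
+//     const int dc = (sum + (1 << (log2sz - 1))) >> log2sz;
+//     for (int i = 0; i < cw * ch; i++)
+//         ac[i] -= dc;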
+function ipred_cfl_ac_420_16bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz r8, r5
+ lsl r4, r4, #2
+ adr r7, L(ipred_cfl_ac_420_tbl)
+ sub r8, r8, #27
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i32 q8, #0
+ vmov.i32 q9, #0
+ vmov.i32 q10, #0
+ vmov.i32 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2
+ vdup.32 d31, lr
+ lsl r2, r2, #1
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_420_tbl):
+ .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_420_w4):
+1: // Copy and subsample input
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q1}, [r12, :128], r2
+ vld1.16 {q2}, [r1, :128], r2
+ vld1.16 {q3}, [r12, :128], r2
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d4, d5
+ vshl.i16 q0, q0, #1
+ subs r8, r8, #2
+ vst1.16 {q0}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ bgt 1b
+ cmp r4, #0
+ vmov d0, d1
+ vmov d2, d1
+ vmov d3, d1
+L(ipred_cfl_ac_420_w4_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 2b
+3:
+L(ipred_cfl_ac_420_w4_calc_subtract_dc):
+ // Aggregate the sums
+ vadd.i32 q8, q8, q9
+ vadd.i32 q10, q10, q11
+ vadd.i32 q0, q8, q10
+ vadd.i32 d0, d0, d1
+ vpadd.i32 d0, d0, d0 // sum
+ sub r0, r0, r6, lsl #3
+ vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz
+ vdup.16 q8, d16[0]
+6: // Subtract dc from ac
+ vld1.16 {q0, q1}, [r0, :128]
+ subs r6, r6, #4
+ vsub.i16 q0, q0, q8
+ vsub.i16 q1, q1, q8
+ vst1.16 {q0, q1}, [r0, :128]!
+ bgt 6b
+ pop {r4-r8, pc}
+
+L(ipred_cfl_ac_420_w8):
+ cmp r3, #0
+ bne L(ipred_cfl_ac_420_w8_wpad)
+1: // Copy and subsample input, without padding
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vld1.16 {q12, q13}, [r1, :128], r2
+ vadd.i16 q0, q0, q2
+ vadd.i16 q1, q1, q3
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vadd.i16 q12, q12, q2
+ vadd.i16 q13, q13, q3
+ vpadd.i16 d2, d24, d25
+ vpadd.i16 d3, d26, d27
+ vshl.i16 q0, q0, #1
+ vshl.i16 q1, q1, #1
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q1
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1: // Copy and subsample input, padding 4
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q1}, [r12, :128], r2
+ vld1.16 {q2}, [r1, :128], r2
+ vld1.16 {q3}, [r12, :128], r2
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d4, d5
+ vshl.i16 q0, q0, #1
+ vdup.16 d3, d1[3]
+ vmov d2, d1
+ vdup.16 d1, d0[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q1
+
+L(ipred_cfl_ac_420_w8_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 2b
+3:
+
+ // Double the height and reuse the w4 summing/subtracting
+ lsl r6, r6, #1
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_w16):
+ adr r7, L(ipred_cfl_ac_420_w16_tbl)
+ ldr r3, [r7, r3, lsl #2]
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_420_w16_tbl):
+ .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_420_w16_wpad0):
+ sub r2, r2, #32
+1: // Copy and subsample input, without padding
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q12, q13}, [r12, :128]!
+ vld1.16 {q2, q3}, [r1, :128], r2
+ vadd.i16 q0, q0, q12
+ vadd.i16 q1, q1, q13
+ vld1.16 {q12, q13}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vadd.i16 q2, q2, q12
+ vadd.i16 q3, q3, q13
+ vpadd.i16 d2, d4, d5
+ vpadd.i16 d3, d6, d7
+ vshl.i16 q0, q0, #1
+ vshl.i16 q1, q1, #1
+ subs r8, r8, #1
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+ sub r2, r2, #32
+1: // Copy and subsample input, padding 4
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q12, q13}, [r12, :128]!
+ vld1.16 {q2}, [r1, :128], r2
+ vadd.i16 q0, q0, q12
+ vadd.i16 q1, q1, q13
+ vld1.16 {q12}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1
+ vadd.i16 q2, q2, q12
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d4, d5
+ vshl.i16 q0, q0, #1
+ vshl.i16 d2, d2, #1
+ subs r8, r8, #1
+ vdup.16 d3, d2[3]
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+1: // Copy and subsample input, padding 8
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vld1.16 {q12, q13}, [r12, :128], r2
+ vadd.i16 q0, q0, q12
+ vadd.i16 q1, q1, q13
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vshl.i16 q0, q0, #1
+ subs r8, r8, #1
+ vdup.16 q1, d1[3]
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+1: // Copy and subsample input, padding 12
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q12}, [r12, :128], r2
+ vadd.i16 q0, q0, q12
+ vpadd.i16 d0, d0, d1
+ vshl.i16 d0, d0, #1
+ subs r8, r8, #1
+ vdup.16 q1, d0[3]
+ vdup.16 d1, d0[3]
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 2b
+3:
+
+ // Quadruple the height and reuse the w4 summing/subtracting
+ lsl r6, r6, #2
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+endfunc
+
+// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_422_16bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz r8, r5
+ lsl r4, r4, #2
+ adr r7, L(ipred_cfl_ac_422_tbl)
+ sub r8, r8, #27
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2
+ vdup.32 d31, lr
+ lsl r2, r2, #1
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_422_tbl):
+ .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_422_w4):
+1: // Copy and subsample input
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q1}, [r12, :128], r2
+ vld1.16 {q2}, [r1, :128], r2
+ vld1.16 {q3}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d4, d5
+ vpadd.i16 d3, d6, d7
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0
+ vmov d0, d3
+ vmov d1, d3
+ vmov d2, d3
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_422_w8):
+ cmp r3, #0
+ bne L(ipred_cfl_ac_422_w8_wpad)
+1: // Copy and subsample input, without padding
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vld1.16 {q12, q13}, [r1, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d4, d5
+ vpadd.i16 d3, d6, d7
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vpadd.i16 d24, d24, d25
+ vpadd.i16 d25, d26, d27
+ vpadd.i16 d26, d4, d5
+ vpadd.i16 d27, d6, d7
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ vshl.i16 q2, q12, #2
+ vshl.i16 q3, q13, #2
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q3
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1: // Copy and subsample input, padding 4
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q2}, [r12, :128], r2
+ vld1.16 {q12}, [r1, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d4, d5
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vpadd.i16 d24, d24, d25
+ vpadd.i16 d25, d4, d5
+ vshl.i16 q0, q0, #2
+ vshl.i16 q12, q12, #2
+ vdup.16 d7, d25[3]
+ vmov d6, d25
+ vdup.16 d5, d24[3]
+ vmov d4, d24
+ vdup.16 d3, d1[3]
+ vmov d2, d1
+ vdup.16 d1, d0[3]
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q3
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+ adr r7, L(ipred_cfl_ac_422_w16_tbl)
+ ldr r3, [r7, r3, lsl #2]
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_422_w16_tbl):
+ .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_422_w16_wpad0):
+ sub r2, r2, #32
+1: // Copy and subsample input, without padding
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q2, q3}, [r12, :128]!
+ vld1.16 {q12, q13}, [r1, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d24, d25
+ vpadd.i16 d3, d26, d27
+ vld1.16 {q12, q13}, [r12, :128], r2
+ vpadd.i16 d4, d4, d5
+ vpadd.i16 d5, d6, d7
+ vpadd.i16 d6, d24, d25
+ vpadd.i16 d7, d26, d27
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ vshl.i16 q2, q2, #2
+ vshl.i16 q3, q3, #2
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+ sub r2, r2, #32
+1: // Copy and subsample input, padding 4
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q2, q3}, [r12, :128]!
+ vld1.16 {q12}, [r1, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d24, d25
+ vld1.16 {q12}, [r12, :128], r2
+ vpadd.i16 d4, d4, d5
+ vpadd.i16 d5, d6, d7
+ vpadd.i16 d6, d24, d25
+ vshl.i16 q0, q0, #2
+ vshl.i16 d2, d2, #2
+ vshl.i16 q2, q2, #2
+ vshl.i16 d6, d6, #2
+ vdup.16 d3, d2[3]
+ vdup.16 d7, d6[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+1: // Copy and subsample input, padding 8
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d4, d4, d5
+ vpadd.i16 d5, d6, d7
+ vshl.i16 q0, q0, #2
+ vshl.i16 q2, q2, #2
+ vdup.16 q1, d1[3]
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+1: // Copy and subsample input, padding 12
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q2}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d4, d5
+ vshl.i16 q0, q0, #2
+ vdup.16 q3, d1[3]
+ vdup.16 q1, d0[3]
+ vdup.16 d5, d1[3]
+ vmov d4, d1
+ vdup.16 d1, d0[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+endfunc
+
+// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_444_16bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz r8, r5
+ lsl r4, r4, #2
+ adr r7, L(ipred_cfl_ac_444_tbl)
+ sub r8, r8, #26
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2
+ vdup.32 d31, lr
+ lsl r2, r2, #1
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_444_tbl):
+ .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_444_w4):
+1: // Copy and expand input
+ vld1.16 {d0}, [r1, :64], r2
+ vld1.16 {d1}, [r12, :64], r2
+ vld1.16 {d2}, [r1, :64], r2
+ vld1.16 {d3}, [r12, :64], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0
+ vmov d0, d3
+ vmov d1, d3
+ vmov d2, d3
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_444_w8):
+1: // Copy and expand input
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q1}, [r12, :128], r2
+ vld1.16 {q2}, [r1, :128], r2
+ vld1.16 {q3}, [r12, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ vshl.i16 q2, q2, #3
+ vshl.i16 q3, q3, #3
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q3
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+ cmp r3, #0
+ bne L(ipred_cfl_ac_444_w16_wpad)
+1: // Copy and expand input, without padding
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ vshl.i16 q2, q2, #3
+ vshl.i16 q3, q3, #3
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1: // Copy and expand input, padding 8
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q2}, [r12, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q2, q2, #3
+ vdup.16 q1, d1[3]
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+ adr r7, L(ipred_cfl_ac_444_w32_tbl)
+ ldr r3, [r7, r3, lsl #1] // (w_pad >> 1) << 2
+ asr r2, r2, #1
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_444_w32_tbl):
+ .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_444_w32_wpad0):
+ sub r2, r2, #32
+1: // Copy and expand input, without padding
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q2, q3}, [r1, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ vshl.i16 q2, q2, #3
+ vshl.i16 q3, q3, #3
+ subs r8, r8, #1
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+ sub r2, r2, #32
+1: // Copy and expand input, padding 8
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q2}, [r1, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ vshl.i16 q2, q2, #3
+ subs r8, r8, #1
+ vst1.16 {q0, q1}, [r0, :128]!
+ vdup.16 q3, d5[3]
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+1: // Copy and expand input, padding 16
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ subs r8, r8, #1
+ vst1.16 {q0, q1}, [r0, :128]!
+ vdup.16 q2, d3[3]
+ vdup.16 q3, d3[3]
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+1: // Copy and expand input, padding 24
+ vld1.16 {q0}, [r1, :128], r2
+ vshl.i16 q0, q0, #3
+ subs r8, r8, #1
+ vdup.16 q1, d1[3]
+ vst1.16 {q0, q1}, [r0, :128]!
+ vdup.16 q2, d1[3]
+ vdup.16 q3, d1[3]
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+
+L(ipred_cfl_ac_444_w32_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #1
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 2b
+3:
+
+ // Multiply the height by eight and reuse the w4 subtracting
+ lsl r6, r6, #3
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+endfunc
diff --git a/third_party/dav1d/src/arm/32/itx.S b/third_party/dav1d/src/arm/32/itx.S
new file mode 100644
index 0000000000..ceea025e45
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/itx.S
@@ -0,0 +1,3343 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have got the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);
+
+// Most of the functions use the following register layout:
+// r0-r3 external parameters
+// r4 function pointer to first transform
+// r5 function pointer to second transform
+// r6 output parameter for helper function
+// r7 input parameter for helper function
+// r8 input stride for helper function
+// r9 scratch variable for helper functions
+// r10-r11 pointer to list of eob thresholds, eob threshold value,
+// scratch variables within helper functions (backed up)
+
+// The SIMD registers most often use the following layout:
+// d0-d3 multiplication coefficients
+// d4-d7 scratch registers
+// d8-d15 unused in some transforms, used for scratch registers in others
+// d16-d31 inputs/outputs of transforms
+
+// Potential further optimizations that are left unimplemented for now:
+// - Trying to keep multiplication coefficients in registers across multiple
+// transform functions. (The register layout is designed to potentially
+// allow this.)
+// - Use a simplified version of the transforms themselves for cases where
+// we know a significant number of inputs are zero. E.g. if the eob value
+// indicates only a quarter of input values are set, for idct16 and up,
+// a significant amount of calculation can be skipped, at the cost of more
+// code duplication and special casing.
+
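+// 12-bit fixed-point trig constants used by the inverse transforms
+// (e.g. 2896 ~ 4096/sqrt(2)); values written as N*8 are pre-scaled for
+// vqrdmulh.s16, whose rounding shift is 15 bits rather than 12.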
+const idct_coeffs, align=4
+ // idct4
+ .short 2896, 2896*8, 1567, 3784
+ // idct8
+ .short 799, 4017, 3406, 2276
+ // idct16
+ .short 401, 4076, 3166, 2598
+ .short 1931, 3612, 3920, 1189
+ // idct32
+ .short 201, 4091, 3035, 2751
+ .short 1751, 3703, 3857, 1380
+ .short 995, 3973, 3513, 2106
+ .short 2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4
+ .short 101*8, 4095*8, 2967*8, -2824*8
+ .short 1660*8, 3745*8, 3822*8, -1474*8
+ .short 4076, 401, 4017, 799
+
+ .short 4036*8, -700*8, 2359*8, 3349*8
+ .short 3461*8, -2191*8, 897*8, 3996*8
+ .short -3166, -2598, -799, -4017
+
+ .short 501*8, 4065*8, 3229*8, -2520*8
+ .short 2019*8, 3564*8, 3948*8, -1092*8
+ .short 3612, 1931, 2276, 3406
+
+ .short 4085*8, -301*8, 2675*8, 3102*8
+ .short 3659*8, -1842*8, 1285*8, 3889*8
+ .short -3920, -1189, -3406, -2276
+endconst
+
+const iadst4_coeffs, align=4
+ // .h[4-5] can be interpreted as .s[2]
+ .short 1321, 3803, 2482, 3344, 3344, 0
+endconst
+
+const iadst8_coeffs, align=4
+ .short 4076, 401, 3612, 1931
+ .short 2598, 3166, 1189, 3920
+ // idct_coeffs
+ .short 2896, 0, 1567, 3784, 0, 0, 0, 0
+endconst
+
+const iadst16_coeffs, align=4
+ .short 4091, 201, 3973, 995
+ .short 3703, 1751, 3290, 2440
+ .short 2751, 3035, 2106, 3513
+ .short 1380, 3857, 601, 4052
+endconst
+
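+// Widening multiply-accumulate helpers: \d0 = \s0*\c0 +/- \s1*\c1 with 32-bit
+// intermediates; the _8h variants apply the same to an 8-lane (two-register)
+// input, and vqrshrn_8h narrows a pair of 32-bit accumulators back to 16 bits.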
+.macro vmull_vmlal d0, s0, s1, c0, c1
+ vmull.s16 \d0, \s0, \c0
+ vmlal.s16 \d0, \s1, \c1
+.endm
+
+.macro vmull_vmlal_8h d0, d1, s0, s1, s2, s3, c0, c1
+ vmull.s16 \d0, \s0, \c0
+ vmlal.s16 \d0, \s2, \c1
+ vmull.s16 \d1, \s1, \c0
+ vmlal.s16 \d1, \s3, \c1
+.endm
+
+.macro vmull_vmlsl d0, s0, s1, c0, c1
+ vmull.s16 \d0, \s0, \c0
+ vmlsl.s16 \d0, \s1, \c1
+.endm
+
+.macro vmull_vmlsl_8h d0, d1, s0, s1, s2, s3, c0, c1
+ vmull.s16 \d0, \s0, \c0
+ vmlsl.s16 \d0, \s2, \c1
+ vmull.s16 \d1, \s1, \c0
+ vmlsl.s16 \d1, \s3, \c1
+.endm
+
+.macro vqrshrn_8h d0, d1, s0, s1, shift
+ vqrshrn.s32 \d0, \s0, \shift
+ vqrshrn.s32 \d1, \s1, \shift
+.endm
+
+.macro scale_input c, r0, r1, r2, r3, r4, r5, r6, r7
+ vqrdmulh.s16 \r0, \r0, \c
+ vqrdmulh.s16 \r1, \r1, \c
+.ifnb \r2
+ vqrdmulh.s16 \r2, \r2, \c
+ vqrdmulh.s16 \r3, \r3, \c
+.endif
+.ifnb \r4
+ vqrdmulh.s16 \r4, \r4, \c
+ vqrdmulh.s16 \r5, \r5, \c
+ vqrdmulh.s16 \r6, \r6, \c
+ vqrdmulh.s16 \r7, \r7, \c
+.endif
+.endm
+
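+// Software-pipelined add-to-destination helper: each invocation advances
+// different rows through one stage each of load, rounding shift, widening add,
+// narrow and store, presumably so consecutive invocations hide each step's
+// latency.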
+.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4
+.ifnb \load
+ vld1.8 {\load}, [\src, :64], r1
+.endif
+.ifnb \shift
+ vrshr.s16 \shift, \shift, #\shiftbits
+.endif
+.ifnb \addsrc
+ vaddw.u8 \adddst, \adddst, \addsrc
+.endif
+.ifnb \narrowsrc
+ vqmovun.s16 \narrowdst, \narrowsrc
+.endif
+.ifnb \store
+ vst1.8 {\store}, [\dst, :64], r1
+.endif
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4
+ mov \src, \dst
+ load_add_store d2, q8, , , , , , \dst, \src, \shiftbits
+ load_add_store d3, q9, , , , , , \dst, \src, \shiftbits
+ load_add_store d4, q10, d2, q8, , , , \dst, \src, \shiftbits
+ load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src, \shiftbits
+ load_add_store d6, q12, d4, q10, q9, d3, d2, \dst, \src, \shiftbits
+ load_add_store d7, q13, d5, q11, q10, d4, d3, \dst, \src, \shiftbits
+ load_add_store d2, q14, d6, q12, q11, d5, d4, \dst, \src, \shiftbits
+ load_add_store d3, q15, d7, q13, q12, d6, d5, \dst, \src, \shiftbits
+ load_add_store , , d2, q14, q13, d7, d6, \dst, \src, \shiftbits
+ load_add_store , , d3, q15, q14, d2, d7, \dst, \src, \shiftbits
+ load_add_store , , , , q15, d3, d2, \dst, \src, \shiftbits
+ load_add_store , , , , , , d3, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src
+ mov \src, \dst
+ load_add_store d2, q8, , , , , , \dst, \src
+ load_add_store d3, q9, , , , , , \dst, \src
+ load_add_store d4, q10, d2, q8, , , , \dst, \src
+ load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src
+ load_add_store , , d4, q10, q9, d3, d2, \dst, \src
+ load_add_store , , d5, q11, q10, d4, d3, \dst, \src
+ load_add_store , , , , q11, d5, d4, \dst, \src
+ load_add_store , , , , , , d5, \dst, \src
+.endm
+.macro load_add_store4 load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src
+.ifnb \load
+ vld1.32 {\load[0]}, [\src, :32], r1
+.endif
+.ifnb \shift
+ vrshr.s16 \shift, \shift, #4
+.endif
+.ifnb \load
+ vld1.32 {\load[1]}, [\src, :32], r1
+.endif
+.ifnb \addsrc
+ vaddw.u8 \adddst, \adddst, \addsrc
+.endif
+.ifnb \store
+ vst1.32 {\store[0]}, [\dst, :32], r1
+.endif
+.ifnb \narrowsrc
+ vqmovun.s16 \narrowdst, \narrowsrc
+.endif
+.ifnb \store
+ vst1.32 {\store[1]}, [\dst, :32], r1
+.endif
+.endm
+.macro load_add_store_4x16 dst, src
+ mov \src, \dst
+ load_add_store4 d0, , , , , , , \dst, \src
+ load_add_store4 d1, q8, , , , , , \dst, \src
+ load_add_store4 d2, q9, d0, q8, , , , \dst, \src
+ load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src
+ load_add_store4 d4, q11, d2, q10, q9, d1, d0, \dst, \src
+ load_add_store4 d5, q12, d3, q11, q10, d2, d1, \dst, \src
+ load_add_store4 d6, q13, d4, q12, q11, d3, d2, \dst, \src
+ load_add_store4 d7, q14, d5, q13, q12, d4, d3, \dst, \src
+ load_add_store4 , q15, d6, q14, q13, d5, d4, \dst, \src
+ load_add_store4 , , d7, q15, q14, d6, d5, \dst, \src
+ load_add_store4 , , , , q15, d7, d6, \dst, \src
+ load_add_store4 , , , , , , d7, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src
+ mov \src, \dst
+ load_add_store4 d0, , , , , , , \dst, \src
+ load_add_store4 d1, q8, , , , , , \dst, \src
+ load_add_store4 d2, q9, d0, q8, , , , \dst, \src
+ load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src
+ load_add_store4 , q11, d2, q10, q9, d1, d0, \dst, \src
+ load_add_store4 , , d3, q11, q10, d2, d1, \dst, \src
+ load_add_store4 , , , , q11, d3, d2, \dst, \src
+ load_add_store4 , , , , , , d3, \dst, \src
+.endm
+
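+// DC-only fast path, used for dct_dct when eob == 0: only the top-left
+// coefficient is set, so both passes reduce to scalar scaling of that value;
+// the resulting constant is then added to the whole w x h destination block.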
+.macro idct_dc w, h, shift
+ cmp r3, #0
+ bne 1f
+ vmov.i16 d30, #0
+ movw r12, #2896*8
+ vld1.16 {d16[]}, [r2, :16]
+ vdup.16 d0, r12
+ vqrdmulh.s16 d16, d16, d0[0]
+ vst1.16 {d30[0]}, [r2, :16]
+.if (\w == 2*\h) || (2*\w == \h)
+ vqrdmulh.s16 d16, d16, d0[0]
+.endif
+.if \shift > 0
+ vrshr.s16 d16, d16, #\shift
+.endif
+ vqrdmulh.s16 d20, d16, d0[0]
+ mov r3, #\h
+ vrshr.s16 d16, d20, #4
+ vrshr.s16 d17, d20, #4
+ b idct_dc_w\w\()_neon
+1:
+.endm
+
+function idct_dc_w4_neon
+1:
+ vld1.32 {d0[0]}, [r0, :32], r1
+ vld1.32 {d0[1]}, [r0, :32], r1
+ vld1.32 {d1[0]}, [r0, :32], r1
+ vld1.32 {d1[1]}, [r0, :32], r1
+ subs r3, r3, #4
+ sub r0, r0, r1, lsl #2
+ vaddw.u8 q10, q8, d0
+ vqmovun.s16 d0, q10
+ vaddw.u8 q11, q8, d1
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vqmovun.s16 d1, q11
+ vst1.32 {d0[1]}, [r0, :32], r1
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d1[1]}, [r0, :32], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w8_neon
+1:
+ vld1.8 {d0}, [r0, :64], r1
+ vld1.8 {d1}, [r0, :64], r1
+ vld1.8 {d2}, [r0, :64], r1
+ vaddw.u8 q10, q8, d0
+ vld1.8 {d3}, [r0, :64], r1
+ sub r0, r0, r1, lsl #2
+ subs r3, r3, #4
+ vaddw.u8 q11, q8, d1
+ vqmovun.s16 d0, q10
+ vaddw.u8 q12, q8, d2
+ vqmovun.s16 d1, q11
+ vaddw.u8 q13, q8, d3
+ vst1.8 {d0}, [r0, :64], r1
+ vqmovun.s16 d2, q12
+ vst1.8 {d1}, [r0, :64], r1
+ vqmovun.s16 d3, q13
+ vst1.8 {d2}, [r0, :64], r1
+ vst1.8 {d3}, [r0, :64], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w16_neon
+1:
+ vld1.8 {q0}, [r0, :128], r1
+ vld1.8 {q1}, [r0, :128], r1
+ vld1.8 {q2}, [r0, :128], r1
+ subs r3, r3, #4
+ vaddw.u8 q10, q8, d0
+ vaddw.u8 q11, q8, d1
+ vld1.8 {q3}, [r0, :128], r1
+ vaddw.u8 q12, q8, d2
+ vaddw.u8 q13, q8, d3
+ sub r0, r0, r1, lsl #2
+ vaddw.u8 q14, q8, d4
+ vaddw.u8 q15, q8, d5
+ vqmovun.s16 d0, q10
+ vqmovun.s16 d1, q11
+ vaddw.u8 q10, q8, d6
+ vaddw.u8 q11, q8, d7
+ vqmovun.s16 d2, q12
+ vqmovun.s16 d3, q13
+ vqmovun.s16 d4, q14
+ vqmovun.s16 d5, q15
+ vst1.8 {q0}, [r0, :128], r1
+ vqmovun.s16 d6, q10
+ vqmovun.s16 d7, q11
+ vst1.8 {q1}, [r0, :128], r1
+ vst1.8 {q2}, [r0, :128], r1
+ vst1.8 {q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w32_neon
+1:
+ vld1.8 {q0, q1}, [r0, :128], r1
+ subs r3, r3, #2
+ vld1.8 {q2, q3}, [r0, :128], r1
+ vaddw.u8 q10, q8, d0
+ vaddw.u8 q11, q8, d1
+ vaddw.u8 q12, q8, d2
+ vaddw.u8 q13, q8, d3
+ sub r0, r0, r1, lsl #1
+ vaddw.u8 q14, q8, d4
+ vaddw.u8 q15, q8, d5
+ vqmovun.s16 d0, q10
+ vqmovun.s16 d1, q11
+ vaddw.u8 q10, q8, d6
+ vaddw.u8 q11, q8, d7
+ vqmovun.s16 d2, q12
+ vqmovun.s16 d3, q13
+ vqmovun.s16 d4, q14
+ vqmovun.s16 d5, q15
+ vst1.8 {q0, q1}, [r0, :128], r1
+ vqmovun.s16 d6, q10
+ vqmovun.s16 d7, q11
+ vst1.8 {q2, q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w64_neon
+ sub r1, r1, #32
+1:
+ vld1.8 {q0, q1}, [r0, :128]!
+ subs r3, r3, #1
+ vld1.8 {q2, q3}, [r0, :128]
+ vaddw.u8 q10, q8, d0
+ vaddw.u8 q11, q8, d1
+ vaddw.u8 q12, q8, d2
+ vaddw.u8 q13, q8, d3
+ sub r0, r0, #32
+ vaddw.u8 q14, q8, d4
+ vaddw.u8 q15, q8, d5
+ vqmovun.s16 d0, q10
+ vqmovun.s16 d1, q11
+ vaddw.u8 q10, q8, d6
+ vaddw.u8 q11, q8, d7
+ vqmovun.s16 d2, q12
+ vqmovun.s16 d3, q13
+ vqmovun.s16 d4, q14
+ vqmovun.s16 d5, q15
+ vst1.8 {q0, q1}, [r0, :128]!
+ vqmovun.s16 d6, q10
+ vqmovun.s16 d7, q11
+ vst1.8 {q2, q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
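+// 4-point inverse Walsh-Hadamard transform (the lossless wht_wht case),
+// operating in-place on d16-d19 with d20/d21 as scratch.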
+.macro iwht4
+ vadd.i16 d16, d16, d17
+ vsub.i16 d21, d18, d19
+ vsub.i16 d20, d16, d21
+ vshr.s16 d20, d20, #1
+ vsub.i16 d18, d20, d17
+ vsub.i16 d17, d20, d19
+ vadd.i16 d19, d21, d18
+ vsub.i16 d16, d16, d17
+.endm
+
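+// 4-point inverse DCT on 4 16-bit lanes: the even half is a butterfly scaled
+// by 2896/4096 ~ 1/sqrt(2) (d0[0]), the odd half a rotation by (1567, 3784)
+// (d0[2], d0[3]); products are rounded back to 16 bits with vqrshrn #12.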
+.macro idct_4h_x4 r0, r1, r2, r3
+ vmull_vmlal q3, \r1, \r3, d0[3], d0[2]
+ vmull_vmlsl q2, \r1, \r3, d0[2], d0[3]
+ vmull_vmlal q1, \r0, \r2, d0[0], d0[0]
+ vqrshrn.s32 d6, q3, #12
+ vqrshrn.s32 d7, q2, #12
+ vmull_vmlsl q2, \r0, \r2, d0[0], d0[0]
+ vqrshrn.s32 d2, q1, #12
+ vqrshrn.s32 d3, q2, #12
+ vqadd.s16 \r0, d2, d6
+ vqsub.s16 \r3, d2, d6
+ vqadd.s16 \r1, d3, d7
+ vqsub.s16 \r2, d3, d7
+.endm
+
+.macro idct_8h_x4 q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
+ vmull_vmlal_8h q6, q7, \r2, \r3, \r6, \r7, d0[3], d0[2]
+ vmull_vmlsl_8h q4, q5, \r2, \r3, \r6, \r7, d0[2], d0[3]
+ vmull_vmlal_8h q2, q3, \r0, \r1, \r4, \r5, d0[0], d0[0]
+ vqrshrn_8h d12, d13, q6, q7, #12
+ vqrshrn_8h d14, d15, q4, q5, #12
+ vmull_vmlsl_8h q4, q5, \r0, \r1, \r4, \r5, d0[0], d0[0]
+ vqrshrn_8h d4, d5, q2, q3, #12
+ vqrshrn_8h d6, d7, q4, q5, #12
+ vqadd.s16 \q0, q2, q6
+ vqsub.s16 \q3, q2, q6
+ vqadd.s16 \q1, q3, q7
+ vqsub.s16 \q2, q3, q7
+.endm
+
+function inv_dct_4h_x4_neon, export=1
+ movrel_local r12, idct_coeffs
+ vld1.16 {d0}, [r12, :64]
+ idct_4h_x4 d16, d17, d18, d19
+ bx lr
+endfunc
+
+function inv_dct_8h_x4_neon, export=1
+ movrel_local r12, idct_coeffs
+ vld1.16 {d0}, [r12, :64]
+ idct_8h_x4 q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23
+ bx lr
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3
+ movrel_local r12, iadst4_coeffs
+ vld1.16 {d0, d1}, [r12, :128]
+
+ vsubl.s16 q1, d16, d18
+ vmull.s16 q2, d16, d0[0]
+ vmlal.s16 q2, d18, d0[1]
+ vmlal.s16 q2, d19, d0[2]
+ vmull.s16 q10, d17, d0[3]
+ vaddw.s16 q1, q1, d19
+ vmull.s16 q3, d16, d0[2]
+ vmlsl.s16 q3, d18, d0[0]
+ vmlsl.s16 q3, d19, d0[1]
+
+ vadd.s32 q11, q2, q3
+ vmul.s32 q1, q1, d1[0]
+ vadd.s32 q2, q2, q10
+ vadd.s32 q3, q3, q10
+ vsub.s32 q11, q11, q10
+
+ vqrshrn.s32 \o0, q2, #12
+ vqrshrn.s32 \o2, q1, #12
+ vqrshrn.s32 \o1, q3, #12
+ vqrshrn.s32 \o3, q11, #12
+.endm
+
+function inv_adst_4h_x4_neon, export=1
+ iadst_4x4 d16, d17, d18, d19
+ bx lr
+endfunc
+
+function inv_flipadst_4h_x4_neon, export=1
+ iadst_4x4 d19, d18, d17, d16
+ bx lr
+endfunc
+
+.macro iadst_8x4 o0, o1, o2, o3, o4, o5, o6, o7
+ movrel_local r12, iadst4_coeffs
+ vld1.16 {d0, d1}, [r12, :128]
+
+ vsubl.s16 q2, d16, d20
+ vsubl.s16 q3, d17, d21
+ vmull.s16 q4, d16, d0[0]
+ vmlal.s16 q4, d20, d0[1]
+ vmlal.s16 q4, d22, d0[2]
+ vmull.s16 q5, d17, d0[0]
+ vmlal.s16 q5, d21, d0[1]
+ vmlal.s16 q5, d23, d0[2]
+ vaddw.s16 q2, q2, d22
+ vaddw.s16 q3, q3, d23
+ vmull.s16 q6, d16, d0[2]
+ vmlsl.s16 q6, d20, d0[0]
+ vmlsl.s16 q6, d22, d0[1]
+ vmull.s16 q7, d17, d0[2]
+ vmlsl.s16 q7, d21, d0[0]
+ vmlsl.s16 q7, d23, d0[1]
+
+ vmul.s32 q10, q2, d1[0]
+ vmul.s32 q11, q3, d1[0]
+
+ vmull.s16 q2, d18, d0[3]
+ vmull.s16 q3, d19, d0[3]
+
+ vadd.s32 q8, q4, q2 // out0
+ vadd.s32 q9, q5, q3
+
+ vadd.s32 q4, q4, q6 // out3
+ vadd.s32 q5, q5, q7
+
+ vadd.s32 q6, q6, q2 // out1
+ vadd.s32 q7, q7, q3
+
+ vsub.s32 q4, q4, q2 // out3
+ vsub.s32 q5, q5, q3
+
+ vqrshrn.s32 d20, q10, #12
+ vqrshrn.s32 d21, q11, #12
+
+ vqrshrn.s32 \o0, q8, #12
+ vqrshrn.s32 \o1, q9, #12
+
+.ifc \o4, d18
+ vmov q9, q10
+.endif
+
+ vqrshrn.s32 \o2, q6, #12
+ vqrshrn.s32 \o3, q7, #12
+
+ vqrshrn.s32 \o6, q4, #12
+ vqrshrn.s32 \o7, q5, #12
+.endm
+
+function inv_adst_8h_x4_neon, export=1
+ iadst_8x4 d16, d17, d18, d19, d20, d21, d22, d23
+ bx lr
+endfunc
+
+function inv_flipadst_8h_x4_neon, export=1
+ iadst_8x4 d22, d23, d20, d21, d18, d19, d16, d17
+ bx lr
+endfunc
+
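+// The 4-point identity transforms scale by sqrt(2) ~ 5793/4096. That factor
+// does not fit vqrdmulh's [-1, 1) coefficient range, so it is applied as
+// x + x*(5793-4096)/4096, with the constant pre-multiplied by 8 to compensate
+// for vqrdmulh's 15-bit shift.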
+function inv_identity_4h_x4_neon, export=1
+ movw r12, #(5793-4096)*8
+ vdup.16 d0, r12
+ vqrdmulh.s16 q2, q8, d0[0]
+ vqrdmulh.s16 q3, q9, d0[0]
+ vqadd.s16 q8, q8, q2
+ vqadd.s16 q9, q9, q3
+ bx lr
+endfunc
+
+function inv_identity_8h_x4_neon, export=1
+ movw r12, #(5793-4096)*8
+ vdup.16 d0, r12
+ vqrdmulh.s16 q1, q8, d0[0]
+ vqrdmulh.s16 q2, q9, d0[0]
+ vqrdmulh.s16 q3, q10, d0[0]
+ vqadd.s16 q8, q8, q1
+ vqrdmulh.s16 q1, q11, d0[0]
+ vqadd.s16 q9, q9, q2
+ vqadd.s16 q10, q10, q3
+ vqadd.s16 q11, q11, q1
+ bx lr
+endfunc
+
+.macro identity_8x4_shift1 r0, r1, r2, r3, c
+.irp i, \r0, \r1, \r2, \r3
+ vqrdmulh.s16 q1, \i, \c
+ vrhadd.s16 \i, \i, q1
+.endr
+.endm
+
+function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1
+ push {r4-r5,lr}
+ vmov.i16 q15, #0
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]
+ vst1.16 {q15}, [r2, :128]!
+
+ vshr.s16 q8, q8, #2
+ vshr.s16 q9, q9, #2
+
+ iwht4
+
+ vst1.16 {q15}, [r2, :128]!
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ iwht4
+
+ vld1.32 {d0[]}, [r0, :32], r1
+ vld1.32 {d0[1]}, [r0, :32], r1
+ vld1.32 {d1[]}, [r0, :32], r1
+ vld1.32 {d1[1]}, [r0, :32], r1
+
+ b L(itx_4x4_end)
+endfunc
+
+function inv_txfm_add_4x4_neon
+ vmov.i16 q15, #0
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]
+ vst1.16 {q15}, [r2, :128]!
+
+ blx r4
+
+ vst1.16 {q15}, [r2, :128]!
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ blx r5
+
+ vld1.32 {d0[]}, [r0, :32], r1
+ vld1.32 {d0[1]}, [r0, :32], r1
+ vld1.32 {d1[]}, [r0, :32], r1
+ vld1.32 {d1[1]}, [r0, :32], r1
+ vrshr.s16 q8, q8, #4
+ vrshr.s16 q9, q9, #4
+
+L(itx_4x4_end):
+ sub r0, r0, r1, lsl #2
+ vaddw.u8 q8, q8, d0
+ vqmovun.s16 d0, q8
+ vaddw.u8 q9, q9, d1
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vqmovun.s16 d1, q9
+ vst1.32 {d0[1]}, [r0, :32], r1
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d1[1]}, [r0, :32], r1
+
+ pop {r4-r5,pc}
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1
+ push {r4-r5,lr}
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ cmp r3, #0
+ bne 1f
+ vmov.i16 d30, #0
+ movw r12, #2896*8
+ vld1.16 {d16[]}, [r2, :16]
+ vdup.16 d4, r12
+ vst1.16 {d30[0]}, [r2, :16]
+ vqrdmulh.s16 d16, d16, d4[0]
+ vld1.32 {d0[0]}, [r0, :32], r1
+ vqrdmulh.s16 d20, d16, d4[0]
+ vld1.32 {d0[1]}, [r0, :32], r1
+ vrshr.s16 d16, d20, #4
+ vrshr.s16 d17, d20, #4
+ vld1.32 {d1[0]}, [r0, :32], r1
+ vmov q9, q8
+ vld1.32 {d1[1]}, [r0, :32], r1
+ b L(itx_4x4_end)
+1:
+.endif
+ movrel_local r4, inv_\txfm1\()_4h_x4_neon
+ movrel_local r5, inv_\txfm2\()_4h_x4_neon
+ b inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
+.macro idct_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+ idct_8h_x4 \q0, \q2, \q4, \q6, \r0, \r1, \r4, \r5, \r8, \r9, \r12, \r13
+
+ vmull_vmlsl_8h q2, q3, \r2, \r3, \r14, \r15, d1[0], d1[1] // -> t4a
+ vmull_vmlal_8h q4, q5, \r2, \r3, \r14, \r15, d1[1], d1[0] // -> t7a
+ vmull_vmlsl_8h q6, q7, \r10, \r11, \r6, \r7, d1[2], d1[3] // -> t5a
+ vqrshrn_8h \r2, \r3, q2, q3, #12 // t4a
+ vqrshrn_8h \r14, \r15, q4, q5, #12 // t7a
+ vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a
+ vqrshrn_8h \r6, \r7, q6, q7, #12 // t5a
+ vqrshrn_8h \r10, \r11, q2, q3, #12 // t6a
+
+ vqadd.s16 q2, \q1, \q3 // t4
+ vqsub.s16 \q1, \q1, \q3 // t5a
+ vqadd.s16 q3, \q7, \q5 // t7
+ vqsub.s16 \q3, \q7, \q5 // t6a
+
+ vmull_vmlsl_8h q4, q5, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t5
+ vmull_vmlal_8h q6, q7, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t6
+ vqrshrn_8h d8, d9, q4, q5, #12 // t5
+ vqrshrn_8h d10, d11, q6, q7, #12 // t6
+
+ vqsub.s16 \q7, \q0, q3 // out7
+ vqadd.s16 \q0, \q0, q3 // out0
+ vqadd.s16 \q1, \q2, q5 // out1
+ vqsub.s16 q6, \q2, q5 // out6
+ vqadd.s16 \q2, \q4, q4 // out2
+ vqsub.s16 \q5, \q4, q4 // out5
+ vqadd.s16 \q3, \q6, q2 // out3
+ vqsub.s16 \q4, \q6, q2 // out4
+ vmov \q6, q6 // out6
+.endm
+
+.macro idct_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7
+ idct_4h_x4 \r0, \r2, \r4, \r6
+
+ vmull_vmlsl q1, \r1, \r7, d1[0], d1[1] // -> t4a
+ vmull_vmlal q2, \r1, \r7, d1[1], d1[0] // -> t7a
+ vmull_vmlsl q3, \r5, \r3, d1[2], d1[3] // -> t5a
+ vqrshrn.s32 \r1, q1, #12 // t4a
+ vmull_vmlal q1, \r5, \r3, d1[3], d1[2] // -> t6a
+ vqrshrn.s32 \r7, q2, #12 // t7a
+ vqrshrn.s32 \r3, q3, #12 // t5a
+ vqrshrn.s32 \r5, q1, #12 // t6a
+
+ vqadd.s16 d2, \r1, \r3 // t4
+ vqsub.s16 \r1, \r1, \r3 // t5a
+ vqadd.s16 d3, \r7, \r5 // t7
+ vqsub.s16 \r3, \r7, \r5 // t6a
+
+ vmull_vmlsl q2, \r3, \r1, d0[0], d0[0] // -> t5
+ vmull_vmlal q3, \r3, \r1, d0[0], d0[0] // -> t6
+ vqrshrn.s32 d4, q2, #12 // t5
+ vqrshrn.s32 d5, q3, #12 // t6
+
+ vqsub.s16 \r7, \r0, d3 // out7
+ vqadd.s16 \r0, \r0, d3 // out0
+ vqadd.s16 \r1, \r2, d5 // out1
+ vqsub.s16 d6, \r2, d5 // out6
+ vqadd.s16 \r2, \r4, d4 // out2
+ vqsub.s16 \r5, \r4, d4 // out5
+ vqadd.s16 \r3, \r6, d2 // out3
+ vqsub.s16 \r4, \r6, d2 // out4
+ vmov \r6, d6 // out6
+.endm
+
+function inv_dct_8h_x8_neon, export=1
+ movrel_local r12, idct_coeffs
+ vld1.16 {q0}, [r12, :128]
+ idct_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ bx lr
+endfunc
+
+function inv_dct_4h_x8_neon, export=1
+ movrel_local r12, idct_coeffs
+ vld1.16 {q0}, [r12, :128]
+ idct_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23
+ bx lr
+endfunc
+
+.macro iadst_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+ movrel_local r12, iadst8_coeffs
+ vld1.16 {d0, d1, d2}, [r12, :64]
+
+ vmull_vmlal_8h q2, q3, d30, d31, d16, d17, d0[0], d0[1]
+ vmull_vmlsl_8h q4, q5, d30, d31, d16, d17, d0[1], d0[0]
+ vmull_vmlal_8h q6, q7, d26, d27, d20, d21, d0[2], d0[3]
+ vqrshrn_8h d16, d17, q2, q3, #12 // t0a
+ vqrshrn_8h d30, d31, q4, q5, #12 // t1a
+ vmull_vmlsl_8h q2, q3, d26, d27, d20, d21, d0[3], d0[2]
+ vmull_vmlal_8h q4, q5, d22, d23, d24, d25, d1[0], d1[1]
+ vqrshrn_8h d20, d21, q6, q7, #12 // t2a
+ vqrshrn_8h d26, d27, q2, q3, #12 // t3a
+ vmull_vmlsl_8h q6, q7, d22, d23, d24, d25, d1[1], d1[0]
+ vmull_vmlal_8h q2, q3, d18, d19, d28, d29, d1[2], d1[3]
+ vqrshrn_8h d24, d25, q4, q5, #12 // t4a
+ vqrshrn_8h d22, d23, q6, q7, #12 // t5a
+ vmull_vmlsl_8h q4, q5, d18, d19, d28, d29, d1[3], d1[2]
+ vqrshrn_8h d28, d29, q2, q3, #12 // t6a
+ vqrshrn_8h d18, d19, q4, q5, #12 // t7a
+
+ vqadd.s16 q2, q8, q12 // t0
+ vqsub.s16 q3, q8, q12 // t4
+ vqadd.s16 q4, q15, q11 // t1
+ vqsub.s16 q5, q15, q11 // t5
+ vqadd.s16 q6, q10, q14 // t2
+ vqsub.s16 q7, q10, q14 // t6
+ vqadd.s16 q10, q13, q9 // t3
+ vqsub.s16 q11, q13, q9 // t7
+
+ vmull_vmlal_8h q8, q9, d6, d7, d10, d11, d2[3], d2[2]
+ vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[2], d2[3]
+ vmull_vmlsl_8h q14, q15, d22, d23, d14, d15, d2[3], d2[2]
+
+ vqrshrn_8h d6, d7, q8, q9, #12 // t4a
+ vqrshrn_8h d10, d11, q12, q13, #12 // t5a
+
+ vmull_vmlal_8h q8, q9, d22, d23, d14, d15, d2[2], d2[3]
+
+ vqrshrn_8h d14, d15, q14, q15, #12 // t6a
+ vqrshrn_8h d22, d23, q8, q9, #12 // t7a
+
+ vqadd.s16 \q0, q2, q6 // out0
+ vqsub.s16 q2, q2, q6 // t2
+ vqadd.s16 \q7, q4, q10 // out7
+ vqsub.s16 q4, q4, q10 // t3
+ vqneg.s16 \q7, \q7 // out7
+
+ vqadd.s16 \q1, q3, q7 // out1
+ vqsub.s16 q3, q3, q7 // t6
+ vqadd.s16 \q6, q5, q11 // out6
+ vqsub.s16 q5, q5, q11 // t7
+ vqneg.s16 \q1, \q1 // out1
+
+ vmull_vmlal_8h q10, q11, d4, d5, d8, d9, d2[0], d2[0] // -> out3 (q11 or q12)
+ vmull_vmlsl_8h q6, q7, d4, d5, d8, d9, d2[0], d2[0] // -> out4 (q12 or q11)
+ vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[0], d2[0] // -> out5 (q13 or q10)
+ vqrshrn_8h d4, d5, q10, q11, #12 // out3
+ vmull_vmlal_8h q10, q11, d6, d7, d10, d11, d2[0], d2[0] // -> out2 (q10 or q13)
+ vqrshrn_8h d6, d7, q12, q13, #12 // out5
+ vqrshrn_8h \r4, \r5, q10, q11, #12 // out2 (q10 or q13)
+ vqrshrn_8h \r8, \r9, q6, q7, #12 // out4 (q12 or q11)
+
+ vqneg.s16 \q3, q2 // out3
+ vqneg.s16 \q5, q3 // out5
+.endm
+
+.macro iadst_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7
+ movrel_local r12, iadst8_coeffs
+ vld1.16 {d0, d1, d2}, [r12, :64]
+
+ vmull_vmlal q2, d23, d16, d0[0], d0[1]
+ vmull_vmlsl q3, d23, d16, d0[1], d0[0]
+ vmull_vmlal q4, d21, d18, d0[2], d0[3]
+ vqrshrn.s32 d16, q2, #12 // t0a
+ vqrshrn.s32 d23, q3, #12 // t1a
+ vmull_vmlsl q5, d21, d18, d0[3], d0[2]
+ vmull_vmlal q6, d19, d20, d1[0], d1[1]
+ vqrshrn.s32 d18, q4, #12 // t2a
+ vqrshrn.s32 d21, q5, #12 // t3a
+ vmull_vmlsl q7, d19, d20, d1[1], d1[0]
+ vmull_vmlal q2, d17, d22, d1[2], d1[3]
+ vqrshrn.s32 d20, q6, #12 // t4a
+ vqrshrn.s32 d19, q7, #12 // t5a
+ vmull_vmlsl q3, d17, d22, d1[3], d1[2]
+ vqrshrn.s32 d22, q2, #12 // t6a
+ vqrshrn.s32 d17, q3, #12 // t7a
+
+ vqadd.s16 d4, d16, d20 // t0
+ vqsub.s16 d5, d16, d20 // t4
+ vqadd.s16 d6, d23, d19 // t1
+ vqsub.s16 d7, d23, d19 // t5
+ vqadd.s16 d8, d18, d22 // t2
+ vqsub.s16 d9, d18, d22 // t6
+ vqadd.s16 d18, d21, d17 // t3
+ vqsub.s16 d19, d21, d17 // t7
+
+ vmull_vmlal q8, d5, d7, d2[3], d2[2]
+ vmull_vmlsl q10, d5, d7, d2[2], d2[3]
+ vmull_vmlsl q11, d19, d9, d2[3], d2[2]
+
+ vqrshrn.s32 d5, q8, #12 // t4a
+ vqrshrn.s32 d7, q10, #12 // t5a
+
+ vmull_vmlal q8, d19, d9, d2[2], d2[3]
+
+ vqrshrn.s32 d9, q11, #12 // t6a
+ vqrshrn.s32 d19, q8, #12 // t7a
+
+ vqadd.s16 \r0, d4, d8 // out0
+ vqsub.s16 d4, d4, d8 // t2
+ vqadd.s16 \r7, d6, d18 // out7
+ vqsub.s16 d6, d6, d18 // t3
+ vqneg.s16 \r7, \r7 // out7
+
+ vqadd.s16 \r1, d5, d9 // out1
+ vqsub.s16 d5, d5, d9 // t6
+ vqadd.s16 \r6, d7, d19 // out6
+ vqsub.s16 d7, d7, d19 // t7
+ vqneg.s16 \r1, \r1 // out1
+
+ vmull_vmlal q9, d4, d6, d2[0], d2[0] // -> out3 (d19 or d20)
+ vmull_vmlsl q4, d4, d6, d2[0], d2[0] // -> out4 (d20 or d19)
+ vmull_vmlsl q10, d5, d7, d2[0], d2[0] // -> out5 (d21 or d18)
+ vqrshrn.s32 d4, q9, #12 // out3
+ vmull_vmlal q9, d5, d7, d2[0], d2[0] // -> out2 (d18 or d21)
+ vqrshrn.s32 d5, q10, #12 // out5
+ vqrshrn.s32 \r2, q9, #12 // out2 (d18 or d21)
+ vqrshrn.s32 \r4, q4, #12 // out4 (d20 or d19)
+
+ vqneg.s16 \r3, d4 // out3
+ vqneg.s16 \r5, d5 // out5
+.endm
+
+function inv_adst_8h_x8_neon, export=1
+ iadst_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ bx lr
+endfunc
+
+function inv_flipadst_8h_x8_neon, export=1
+ iadst_8h_x8 q15, q14, q13, q12, q11, q10, q9, q8, d30, d31, d28, d29, d26, d27, d24, d25, d22, d23, d20, d21, d18, d19, d16, d17
+ bx lr
+endfunc
+
+function inv_adst_4h_x8_neon, export=1
+ iadst_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23
+ bx lr
+endfunc
+
+function inv_flipadst_4h_x8_neon, export=1
+ iadst_4h_x8 d23, d22, d21, d20, d19, d18, d17, d16
+ bx lr
+endfunc
+
+function inv_identity_8h_x8_neon, export=1
+ vqshl.s16 q8, q8, #1
+ vqshl.s16 q9, q9, #1
+ vqshl.s16 q10, q10, #1
+ vqshl.s16 q11, q11, #1
+ vqshl.s16 q12, q12, #1
+ vqshl.s16 q13, q13, #1
+ vqshl.s16 q14, q14, #1
+ vqshl.s16 q15, q15, #1
+ bx lr
+endfunc
+
+function inv_identity_4h_x8_neon, export=1
+ vqshl.s16 q8, q8, #1
+ vqshl.s16 q9, q9, #1
+ vqshl.s16 q10, q10, #1
+ vqshl.s16 q11, q11, #1
+ bx lr
+endfunc
+
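+// Shared 8x8 add helpers: apply the first transform via r4 (skipped in the
+// identity variant, where its <<1 cancels against the intermediate >>1),
+// downshift by 1, transpose, apply the second transform via r5, then
+// accumulate the result into the destination.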
+.macro def_fn_8x8_base variant
+function inv_txfm_\variant\()add_8x8_neon
+ vmov.i16 q0, #0
+ vmov.i16 q1, #0
+ vld1.16 {q8, q9}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q10, q11}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q12, q13}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q14, q15}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]
+
+.ifc \variant, identity_
+ // The identity vqshl #1 and the downshift vrshr #1 cancel out
+.else
+ blx r4
+
+ vrshr.s16 q8, q8, #1
+ vrshr.s16 q9, q9, #1
+ vrshr.s16 q10, q10, #1
+ vrshr.s16 q11, q11, #1
+ vrshr.s16 q12, q12, #1
+ vrshr.s16 q13, q13, #1
+ vrshr.s16 q14, q14, #1
+ vrshr.s16 q15, q15, #1
+.endif
+
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+ blx r5
+
+ load_add_store_8x8 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,pc}
+endfunc
+.endm
+
+def_fn_8x8_base
+def_fn_8x8_base identity_
+
+.macro def_fn_8x8 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 8, 8, 1
+.endif
+ push {r4-r5,r7,lr}
+ vpush {q4-q7}
+ movrel_local r5, inv_\txfm2\()_8h_x8_neon
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_8x8_neon
+.else
+ movrel_local r4, inv_\txfm1\()_8h_x8_neon
+ b inv_txfm_add_8x8_neon
+.endif
+endfunc
+.endm
+
+def_fn_8x8 dct, dct
+def_fn_8x8 identity, identity
+def_fn_8x8 dct, adst
+def_fn_8x8 dct, flipadst
+def_fn_8x8 dct, identity
+def_fn_8x8 adst, dct
+def_fn_8x8 adst, adst
+def_fn_8x8 adst, flipadst
+def_fn_8x8 flipadst, dct
+def_fn_8x8 flipadst, adst
+def_fn_8x8 flipadst, flipadst
+def_fn_8x8 identity, dct
+def_fn_8x8 adst, identity
+def_fn_8x8 flipadst, identity
+def_fn_8x8 identity, adst
+def_fn_8x8 identity, flipadst
+
+function inv_txfm_add_8x4_neon
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+ movw r12, #2896*8
+ vdup.16 d0, r12
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]!
+ vld1.16 {d20, d21, d22, d23}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]
+
+ scale_input d0[0], q8, q9, q10, q11
+
+ blx r4
+
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ vswp d17, d20
+ vswp d19, d21
+ vswp d18, d20
+ vswp d21, d22
+
+ blx r5
+
+ load_add_store_8x4 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,pc}
+endfunc
+
+function inv_txfm_add_4x8_neon
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+ movw r12, #2896*8
+ vdup.16 d0, r12
+ vld1.16 {q8, q9}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]!
+ vld1.16 {q10, q11}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]
+
+ scale_input d0[0], q8, q9, q10, q11
+
+ blx r4
+
+ transpose_4x8h q8, q9, q10, q11
+ vswp d17, d20
+ vswp d19, d21
+ vswp d17, d18
+ vswp d19, d22
+
+ blx r5
+
+ load_add_store_4x8 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,pc}
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 0
+.endif
+ push {r4-r5,r7,lr}
+ vpush {q4-q7}
+ movrel_local r4, inv_\txfm1\()_\h\()h_x\w\()_neon
+ movrel_local r5, inv_\txfm2\()_\w\()h_x\h\()_neon
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct
+def_fn_48 \w, \h, identity, identity
+def_fn_48 \w, \h, dct, adst
+def_fn_48 \w, \h, dct, flipadst
+def_fn_48 \w, \h, dct, identity
+def_fn_48 \w, \h, adst, dct
+def_fn_48 \w, \h, adst, adst
+def_fn_48 \w, \h, adst, flipadst
+def_fn_48 \w, \h, flipadst, dct
+def_fn_48 \w, \h, flipadst, adst
+def_fn_48 \w, \h, flipadst, flipadst
+def_fn_48 \w, \h, identity, dct
+def_fn_48 \w, \h, adst, identity
+def_fn_48 \w, \h, flipadst, identity
+def_fn_48 \w, \h, identity, adst
+def_fn_48 \w, \h, identity, flipadst
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+function inv_dct_4h_x16_neon, export=1
+ movrel_local r12, idct_coeffs
+ vld1.16 {q0, q1}, [r12, :128]
+
+ vmull_vmlsl q2, d17, d31, d2[0], d2[1] // -> t8a
+ vmull_vmlal q3, d17, d31, d2[1], d2[0] // -> t15a
+ vmull_vmlsl q4, d25, d23, d2[2], d2[3] // -> t9a
+ vqrshrn.s32 d17, q2, #12 // t8a
+ vqrshrn.s32 d31, q3, #12 // t15a
+ vmull_vmlal q2, d25, d23, d2[3], d2[2] // -> t14a
+ vmull_vmlsl q3, d21, d27, d3[0], d3[1] // -> t10a
+ vqrshrn.s32 d23, q4, #12 // t9a
+ vqrshrn.s32 d25, q2, #12 // t14a
+ vmull_vmlal q4, d21, d27, d3[1], d3[0] // -> t13a
+ vmull_vmlsl q2, d29, d19, d3[2], d3[3] // -> t11a
+ vqrshrn.s32 d21, q3, #12 // t10a
+ vqrshrn.s32 d27, q4, #12 // t13a
+ vmull_vmlal q3, d29, d19, d3[3], d3[2] // -> t12a
+ vqrshrn.s32 d19, q2, #12 // t11a
+ vqrshrn.s32 d29, q3, #12 // t12a
+
+ idct_4h_x8 d16, d18, d20, d22, d24, d26, d28, d30
+
+ vqsub.s16 d4, d17, d23 // t9
+ vqadd.s16 d17, d17, d23 // t8
+ vqsub.s16 d5, d31, d25 // t14
+ vqadd.s16 d31, d31, d25 // t15
+ vqsub.s16 d23, d19, d21 // t10
+ vqadd.s16 d19, d19, d21 // t11
+ vqadd.s16 d25, d29, d27 // t12
+ vqsub.s16 d29, d29, d27 // t13
+
+ vmull_vmlsl q3, d5, d4, d0[2], d0[3] // -> t9a
+ vmull_vmlal q4, d5, d4, d0[3], d0[2] // -> t14a
+ vqrshrn.s32 d21, q3, #12 // t9a
+ vqrshrn.s32 d27, q4, #12 // t14a
+
+ vmull_vmlsl q3, d29, d23, d0[2], d0[3] // -> t13a
+ vmull_vmlal q4, d29, d23, d0[3], d0[2] // -> t10a
+ vqrshrn.s32 d29, q3, #12 // t13a
+ vneg.s32 q4, q4
+ vqrshrn.s32 d23, q4, #12 // t10a
+
+ vqsub.s16 d4, d17, d19 // t11a
+ vqadd.s16 d17, d17, d19 // t8a
+ vqsub.s16 d5, d31, d25 // t12a
+ vqadd.s16 d31, d31, d25 // t15a
+ vqadd.s16 d19, d21, d23 // t9
+ vqsub.s16 d21, d21, d23 // t10
+ vqsub.s16 d25, d27, d29 // t13
+ vqadd.s16 d27, d27, d29 // t14
+
+ vmull_vmlsl q3, d5, d4, d0[0], d0[0] // -> t11
+ vmull_vmlal q4, d5, d4, d0[0], d0[0] // -> t12
+ vmull_vmlsl q2, d25, d21, d0[0], d0[0] // -> t10a
+
+ vqrshrn.s32 d6, q3, #12 // t11
+ vqrshrn.s32 d7, q4, #12 // t12
+ vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t13a
+ vqrshrn.s32 d4, q2, #12 // t10a
+ vqrshrn.s32 d5, q4, #12 // t13a
+
+ vqadd.s16 d8, d16, d31 // out0
+ vqsub.s16 d31, d16, d31 // out15
+ vmov d16, d8
+ vqadd.s16 d23, d30, d17 // out7
+ vqsub.s16 d9, d30, d17 // out8
+ vqadd.s16 d17, d18, d27 // out1
+ vqsub.s16 d30, d18, d27 // out14
+ vqadd.s16 d18, d20, d5 // out2
+ vqsub.s16 d29, d20, d5 // out13
+ vqadd.s16 d5, d28, d19 // out6
+ vqsub.s16 d25, d28, d19 // out9
+ vqadd.s16 d19, d22, d7 // out3
+ vqsub.s16 d28, d22, d7 // out12
+ vqadd.s16 d20, d24, d6 // out4
+ vqsub.s16 d27, d24, d6 // out11
+ vqadd.s16 d21, d26, d4 // out5
+ vqsub.s16 d26, d26, d4 // out10
+ vmov d24, d9
+ vmov d22, d5
+
+ bx lr
+endfunc
+
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
+ movrel_local r12, iadst16_coeffs
+ vld1.16 {q0, q1}, [r12, :128]
+ movrel_local r12, idct_coeffs
+
+ vmull_vmlal q2, d31, d16, d0[0], d0[1] // -> t0
+ vmull_vmlsl q3, d31, d16, d0[1], d0[0] // -> t1
+ vmull_vmlal q4, d29, d18, d0[2], d0[3] // -> t2
+ vqrshrn.s32 d16, q2, #12 // t0
+ vqrshrn.s32 d31, q3, #12 // t1
+ vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t3
+ vmull_vmlal q3, d27, d20, d1[0], d1[1] // -> t4
+ vqrshrn.s32 d18, q4, #12 // t2
+ vqrshrn.s32 d29, q2, #12 // t3
+ vmull_vmlsl q4, d27, d20, d1[1], d1[0] // -> t5
+ vmull_vmlal q2, d25, d22, d1[2], d1[3] // -> t6
+ vqrshrn.s32 d20, q3, #12 // t4
+ vqrshrn.s32 d27, q4, #12 // t5
+ vmull_vmlsl q3, d25, d22, d1[3], d1[2] // -> t7
+ vmull_vmlal q4, d23, d24, d2[0], d2[1] // -> t8
+ vqrshrn.s32 d22, q2, #12 // t6
+ vqrshrn.s32 d25, q3, #12 // t7
+ vmull_vmlsl q2, d23, d24, d2[1], d2[0] // -> t9
+ vmull_vmlal q3, d21, d26, d2[2], d2[3] // -> t10
+ vqrshrn.s32 d23, q4, #12 // t8
+ vqrshrn.s32 d24, q2, #12 // t9
+ vmull_vmlsl q4, d21, d26, d2[3], d2[2] // -> t11
+ vmull_vmlal q2, d19, d28, d3[0], d3[1] // -> t12
+ vqrshrn.s32 d21, q3, #12 // t10
+ vqrshrn.s32 d26, q4, #12 // t11
+ vmull_vmlsl q3, d19, d28, d3[1], d3[0] // -> t13
+ vmull_vmlal q4, d17, d30, d3[2], d3[3] // -> t14
+ vqrshrn.s32 d19, q2, #12 // t12
+ vqrshrn.s32 d28, q3, #12 // t13
+ vmull_vmlsl q2, d17, d30, d3[3], d3[2] // -> t15
+ vqrshrn.s32 d17, q4, #12 // t14
+ vqrshrn.s32 d30, q2, #12 // t15
+
+ vld1.16 {q0}, [r12, :128]
+
+ vqsub.s16 d2, d16, d23 // t8a
+ vqadd.s16 d16, d16, d23 // t0a
+ vqsub.s16 d3, d31, d24 // t9a
+ vqadd.s16 d31, d31, d24 // t1a
+ vqadd.s16 d23, d18, d21 // t2a
+ vqsub.s16 d18, d18, d21 // t10a
+ vqadd.s16 d24, d29, d26 // t3a
+ vqsub.s16 d29, d29, d26 // t11a
+ vqadd.s16 d21, d20, d19 // t4a
+ vqsub.s16 d20, d20, d19 // t12a
+ vqadd.s16 d26, d27, d28 // t5a
+ vqsub.s16 d27, d27, d28 // t13a
+ vqadd.s16 d19, d22, d17 // t6a
+ vqsub.s16 d22, d22, d17 // t14a
+ vqadd.s16 d28, d25, d30 // t7a
+ vqsub.s16 d25, d25, d30 // t15a
+
+ vmull_vmlal q2, d2, d3, d1[1], d1[0] // -> t8
+ vmull_vmlsl q3, d2, d3, d1[0], d1[1] // -> t9
+ vmull_vmlal q4, d18, d29, d1[3], d1[2] // -> t10
+ vqrshrn.s32 d17, q2, #12 // t8
+ vqrshrn.s32 d30, q3, #12 // t9
+ vmull_vmlsl q2, d18, d29, d1[2], d1[3] // -> t11
+ vmull_vmlsl q3, d27, d20, d1[1], d1[0] // -> t12
+ vqrshrn.s32 d18, q4, #12 // t10
+ vqrshrn.s32 d29, q2, #12 // t11
+ vmull_vmlal q4, d27, d20, d1[0], d1[1] // -> t13
+ vmull_vmlsl q2, d25, d22, d1[3], d1[2] // -> t14
+ vqrshrn.s32 d27, q3, #12 // t12
+ vqrshrn.s32 d20, q4, #12 // t13
+ vmull_vmlal q3, d25, d22, d1[2], d1[3] // -> t15
+ vqrshrn.s32 d25, q2, #12 // t14
+ vqrshrn.s32 d22, q3, #12 // t15
+
+ vqsub.s16 d2, d16, d21 // t4
+ vqadd.s16 d16, d16, d21 // t0
+ vqsub.s16 d3, d31, d26 // t5
+ vqadd.s16 d31, d31, d26 // t1
+ vqadd.s16 d21, d23, d19 // t2
+ vqsub.s16 d23, d23, d19 // t6
+ vqadd.s16 d26, d24, d28 // t3
+ vqsub.s16 d24, d24, d28 // t7
+ vqadd.s16 d19, d17, d27 // t8a
+ vqsub.s16 d17, d17, d27 // t12a
+ vqadd.s16 d28, d30, d20 // t9a
+ vqsub.s16 d30, d30, d20 // t13a
+ vqadd.s16 d27, d18, d25 // t10a
+ vqsub.s16 d18, d18, d25 // t14a
+ vqadd.s16 d20, d29, d22 // t11a
+ vqsub.s16 d29, d29, d22 // t15a
+
+ vmull_vmlal q2, d2, d3, d0[3], d0[2] // -> t4a
+ vmull_vmlsl q3, d2, d3, d0[2], d0[3] // -> t5a
+ vmull_vmlsl q4, d24, d23, d0[3], d0[2] // -> t6a
+ vqrshrn.s32 d22, q2, #12 // t4a
+ vqrshrn.s32 d25, q3, #12 // t5a
+ vmull_vmlal q2, d24, d23, d0[2], d0[3] // -> t7a
+ vmull_vmlal q3, d17, d30, d0[3], d0[2] // -> t12
+ vqrshrn.s32 d24, q4, #12 // t6a
+ vqrshrn.s32 d23, q2, #12 // t7a
+ vmull_vmlsl q4, d17, d30, d0[2], d0[3] // -> t13
+ vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t14
+ vqrshrn.s32 d17, q3, #12 // t12
+ vmull_vmlal q3, d29, d18, d0[2], d0[3] // -> t15
+ vqrshrn.s32 d29, q4, #12 // t13
+ vqrshrn.s32 d30, q2, #12 // t14
+ vqrshrn.s32 d18, q3, #12 // t15
+
+ vqsub.s16 d2, d16, d21 // t2a
+.ifc \o0, d16
+ vqadd.s16 \o0, d16, d21 // out0
+ vqsub.s16 d21, d31, d26 // t3a
+ vqadd.s16 \o15,d31, d26 // out15
+.else
+ vqadd.s16 d4, d16, d21 // out0
+ vqsub.s16 d21, d31, d26 // t3a
+ vqadd.s16 \o15,d31, d26 // out15
+ vmov \o0, d4
+.endif
+ vqneg.s16 \o15, \o15 // out15
+
+ vqsub.s16 d3, d29, d18 // t15a
+ vqadd.s16 \o13,d29, d18 // out13
+ vqadd.s16 \o2, d17, d30 // out2
+ vqsub.s16 d26, d17, d30 // t14a
+ vqneg.s16 \o13,\o13 // out13
+
+ vqadd.s16 \o1, d19, d27 // out1
+ vqsub.s16 d27, d19, d27 // t10
+ vqadd.s16 \o14,d28, d20 // out14
+ vqsub.s16 d20, d28, d20 // t11
+ vqneg.s16 \o1, \o1 // out1
+
+ vqadd.s16 \o3, d22, d24 // out3
+ vqsub.s16 d22, d22, d24 // t6
+ vqadd.s16 \o12,d25, d23 // out12
+ vqsub.s16 d23, d25, d23 // t7
+ vqneg.s16 \o3, \o3 // out3
+
+ vmull_vmlsl q12, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23)
+ vmull_vmlal q2, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24)
+ vmull_vmlal q3, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26)
+
+ vqrshrn.s32 d24, q12, #12 // out8
+ vqrshrn.s32 d4, q2, #12 // out7
+ vqrshrn.s32 d5, q3, #12 // out5
+ vmull_vmlsl q4, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21)
+ vmull_vmlal q1, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27)
+ vqrshrn.s32 d26, q4, #12 // out10
+
+ vmull_vmlsl q4, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20)
+ vmull_vmlal q11, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25)
+ vmull_vmlsl q3, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22)
+
+ vqrshrn.s32 \o4, q1, #12 // out4
+ vqrshrn.s32 d7, q3, #12 // out9
+ vqrshrn.s32 d6, q4, #12 // out11
+ vqrshrn.s32 \o6, q11, #12 // out6
+
+.ifc \o8, d23
+ vmov \o8, d24
+ vmov \o10,d26
+.endif
+
+ vqneg.s16 \o7, d4 // out7
+ vqneg.s16 \o5, d5 // out5
+ vqneg.s16 \o11,d6 // out11
+ vqneg.s16 \o9, d7 // out9
+.endm
+
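+// The adst and flipadst 16-point transforms below share the iadst_16 macro
+// body; flipadst simply passes the destination registers in reverse order,
+// so its outputs come out flipped without any extra instructions.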
+function inv_adst_4h_x16_neon, export=1
+ iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ bx lr
+endfunc
+
+function inv_flipadst_4h_x16_neon, export=1
+ iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
+ bx lr
+endfunc
+
+function inv_identity_4h_x16_neon, export=1
+ movw r12, #2*(5793-4096)*8
+ vdup.16 d0, r12
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s16 q1, \i, d0[0]
+ vqadd.s16 \i, \i, \i
+ vqadd.s16 \i, \i, q1
+.endr
+ bx lr
+endfunc
+
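+// Note on the identity constants used above and below: vqrdmulh.s16 returns
+// round(x*c/32768), so the constant 2*(5793-4096)*8 (with 5793/4096 being
+// roughly sqrt(2)) encodes the fractional part of the identity16 scale factor
+// 2*sqrt(2), while the saturating additions supply the integer factor of 2.
+// The _shift2/_shift1 variants below fold the final >>2 or >>1 downshift into
+// the same multiply, avoiding a separate vrshr pass.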
+.macro identity_4x16_shift2 c
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s16 q2, \i, \c
+ vshr.s16 q2, q2, #1
+ vrhadd.s16 \i, \i, q2
+.endr
+.endm
+
+.macro identity_4x16_shift1 c
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s16 q2, \i, \c
+ vrshr.s16 q2, q2, #1
+ vqadd.s16 \i, \i, q2
+.endr
+.endm
+
+.macro identity_8x8_shift1 c
+ identity_4x16_shift1 \c
+.endm
+
+.macro identity_8x8 c
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s16 q2, \i, \c
+ vqadd.s16 \i, \i, \i
+ vqadd.s16 \i, \i, q2
+.endr
+.endm
+
+.macro def_horz_16 scale=0, identity=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x4_neon
+ push {lr}
+ vmov.i16 d7, #0
+.if \identity
+ movw r12, #2*(5793-4096)*8
+ vdup.16 d0, r12
+.endif
+.if \scale
+ movw r12, #2896*8
+ vdup.16 d1, r12
+.endif
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64]
+ vst1.16 {d7}, [r7, :64], r8
+.endr
+.if \scale
+ scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+.if \identity
+.if \shift == -2
+ identity_4x16_shift2 d0[0]
+.else
+ identity_4x16_shift1 d0[0]
+.endif
+.else
+ blx r4
+.endif
+.if \shift > 0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vrshr.s16 \i, \i, #\shift
+.endr
+.endif
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ transpose_4x4h q14, q15, d28, d29, d30, d31
+
+.irp i, d16, d20, d24, d28, d17, d21, d25, d29, d18, d22, d26, d30, d19, d23, d27, d31
+ vst1.16 {\i}, [r6, :64]!
+.endr
+
+ pop {pc}
+endfunc
+.endm
+
+def_horz_16 scale=0, identity=0, shift=2
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
+def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity
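+
+// Shift convention for def_horz_16: a positive shift is applied with vrshr
+// after the transform, while the negative values select the identity variants
+// (identity_4x16_shift2/_shift1) that fold the equivalent downshift into
+// their multiply.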
+
+function inv_txfm_add_vert_4x16_neon
+ push {lr}
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ blx r5
+ load_add_store_4x16 r6, r7
+ pop {pc}
+endfunc
+
+function inv_txfm_add_16x16_neon
+ sub_sp_align 512
+ ldrh r11, [r10], #2
+.irp i, 0, 4, 8, 12
+ add r6, sp, #(\i*16*2)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 12
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #16*2
+ blx r9
+.endr
+ b 3f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #4
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+3:
+.irp i, 0, 4, 8, 12
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #32
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 512
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+const eob_16x16
+ .short 10, 36, 78, 256
+endconst
+
+const eob_16x16_identity
+ .short 4, 8, 12, 256
+endconst
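+
+// The tables above give an eob threshold for each successive chunk of the
+// first transform pass: if the block's eob is below the threshold, the
+// remaining chunks are known to hold only zero coefficients and are merely
+// zero-filled instead of transformed (see the cmp r3, r11 checks in
+// inv_txfm_add_16x16_neon). The identity combinations use their own, smaller
+// thresholds.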
+
+.macro def_fn_16x16 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 16, 16, 2
+.endif
+ push {r4-r11,lr}
+ vpush {q4}
+.ifc \txfm1, identity
+ movrel_local r9, inv_txfm_horz_identity_16x4_neon
+.else
+ movrel_local r9, inv_txfm_horz_16x4_neon
+ movrel_local r4, inv_\txfm1\()_4h_x16_neon
+.endif
+ movrel_local r5, inv_\txfm2\()_4h_x16_neon
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel_local r10, eob_16x16
+.else
+ movrel_local r10, eob_16x16_identity
+.endif
+.else
+.ifc \txfm2, identity
+ movrel_local r10, eob_16x16_identity
+.else
+ movrel_local r10, eob_16x16
+.endif
+.endif
+ b inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct
+def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
+def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
+def_fn_16x16 identity, dct
+
+.macro def_fn_416_base variant
+function inv_txfm_\variant\()add_16x4_neon
+
+.ifc \variant, identity_
+ vmov.i16 d4, #0
+.irp i, d16, d18, d20, d22
+ vld1.16 {\i}, [r2, :64]
+ vst1.16 {d4}, [r2, :64]!
+.endr
+.irp i, d17, d19, d21, d23
+ vld1.16 {\i}, [r2, :64]
+ vst1.16 {d4}, [r2, :64]!
+.endr
+ movw r12, #2*(5793-4096)*8
+ vdup.16 d0, r12
+.irp i, d24, d26, d28, d30
+ vld1.16 {\i}, [r2, :64]
+ vst1.16 {d4}, [r2, :64]!
+.endr
+.irp i, d25, d27, d29, d31
+ vld1.16 {\i}, [r2, :64]
+ vst1.16 {d4}, [r2, :64]!
+.endr
+
+ identity_4x16_shift1 d0[0]
+.else
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]
+ vst1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {d20, d21, d22, d23}, [r2, :128]
+ vst1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {d24, d25, d26, d27}, [r2, :128]
+ vst1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {d28, d29, d30, d31}, [r2, :128]
+ vst1.16 {q2, q3}, [r2, :128]!
+
+ blx r4
+
+ vswp d17, d20
+ vswp d19, d22
+ vswp d18, d20
+ vswp d19, d21
+.irp i, q8, q9, q10, q11
+ vrshr.s16 \i, \i, #1
+.endr
+.endif
+ transpose_4x8h q8, q9, q10, q11
+ blx r5
+ mov r6, r0
+ load_add_store_8x4 r6, r7
+
+.ifc \variant, identity_
+ vmov q8, q12
+ vmov q9, q13
+ vmov q10, q14
+ vmov q11, q15
+.else
+ vswp d25, d28
+ vswp d27, d30
+ vswp d26, d28
+ vswp d27, d29
+ vrshr.s16 q8, q12, #1
+ vrshr.s16 q9, q13, #1
+ vrshr.s16 q10, q14, #1
+ vrshr.s16 q11, q15, #1
+.endif
+ transpose_4x8h q8, q9, q10, q11
+ blx r5
+ add r6, r0, #8
+ load_add_store_8x4 r6, r7
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_\variant\()add_4x16_neon
+ vmov.i16 q2, #0
+
+ mov r11, #32
+ cmp r3, r10
+ blt 1f
+
+ add r6, r2, #16
+.ifc \variant, identity_
+.irp i, q12, q13, q14, q15
+ vld1.16 {\i}, [r6, :128]
+ vst1.16 {q2}, [r6, :128], r11
+.endr
+ movw r12, #(5793-4096)*8
+ vdup.16 d0, r12
+ identity_8x4_shift1 q12, q13, q14, q15, d0[0]
+.else
+.irp i, q8, q9, q10, q11
+ vld1.16 {\i}, [r6, :128]
+ vst1.16 {q2}, [r6, :128], r11
+.endr
+ blx r4
+ vrshr.s16 q12, q8, #1
+ vrshr.s16 q13, q9, #1
+ vrshr.s16 q14, q10, #1
+ vrshr.s16 q15, q11, #1
+.endif
+ transpose_4x8h q12, q13, q14, q15
+ vswp d27, d29
+ vswp d26, d28
+ vswp d27, d30
+ vswp d25, d28
+
+ b 2f
+1:
+.irp i, q12, q13, q14, q15
+ vmov.i16 \i, #0
+.endr
+2:
+ vmov.i16 q2, #0
+.irp i, q8, q9, q10, q11
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q2}, [r2, :128], r11
+.endr
+.ifc \variant, identity_
+ movw r12, #(5793-4096)*8
+ vdup.16 d0, r12
+ identity_8x4_shift1 q8, q9, q10, q11, d0[0]
+.else
+ blx r4
+.irp i, q8, q9, q10, q11
+ vrshr.s16 \i, \i, #1
+.endr
+.endif
+ transpose_4x8h q8, q9, q10, q11
+ vswp d19, d21
+ vswp d18, d20
+ vswp d19, d22
+ vswp d17, d20
+
+ blx r5
+
+ load_add_store_4x16 r0, r6
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+def_fn_416_base
+def_fn_416_base identity_
+
+.macro def_fn_416 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ push {r4-r11,lr}
+ vpush {q4-q7}
+.if \w == 4
+ movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon
+ movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon
+ mov r10, #\eob_half
+.else
+ movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon
+ movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon
+.endif
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+ b inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct, 29
+def_fn_416 \w, \h, identity, identity, 29
+def_fn_416 \w, \h, dct, adst, 29
+def_fn_416 \w, \h, dct, flipadst, 29
+def_fn_416 \w, \h, dct, identity, 8
+def_fn_416 \w, \h, adst, dct, 29
+def_fn_416 \w, \h, adst, adst, 29
+def_fn_416 \w, \h, adst, flipadst, 29
+def_fn_416 \w, \h, flipadst, dct, 29
+def_fn_416 \w, \h, flipadst, adst, 29
+def_fn_416 \w, \h, flipadst, flipadst, 29
+def_fn_416 \w, \h, identity, dct, 32
+def_fn_416 \w, \h, adst, identity, 8
+def_fn_416 \w, \h, flipadst, identity, 8
+def_fn_416 \w, \h, identity, adst, 32
+def_fn_416 \w, \h, identity, flipadst, 32
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+.macro def_fn_816_base variant
+function inv_txfm_\variant\()add_16x8_neon
+ sub_sp_align 256
+
+.irp i, 0, 4
+ add r6, sp, #(\i*16*2)
+.if \i > 0
+ cmp r3, r10
+ blt 1f
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #8*2
+ blx r9
+.endr
+ b 2f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+2:
+
+.irp i, 0, 8
+ add r7, sp, #(\i*2)
+ mov r8, #32
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\j}, [r7, :128], r8
+.endr
+ blx r5
+
+ add r6, r0, #(\i)
+ load_add_store_8x8 r6, r7
+.endr
+
+ add_sp_align 256
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_\variant\()add_8x16_neon
+ sub_sp_align 256
+
+.irp i, 0, 8
+ add r6, sp, #(\i*8*2)
+.if \i > 0
+ cmp r3, r10
+ blt 1f
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #16*2
+
+ vmov.i16 q2, #0
+ movw r12, #2896*8
+ vdup.16 d0, r12
+
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\j}, [r7, :128]
+ vst1.16 {q2}, [r7, :128], r8
+.endr
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+.ifc \variant, identity_
+ // The identity shl #1 and downshift vrshr #1 cancel out
+.else
+ blx r4
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vrshr.s16 \j, \j, #1
+.endr
+.endif
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+ vst1.16 {q8, q9}, [r6, :128]!
+ vst1.16 {q10, q11}, [r6, :128]!
+ vst1.16 {q12, q13}, [r6, :128]!
+ vst1.16 {q14, q15}, [r6, :128]!
+.endr
+ b 2f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+2:
+
+.irp i, 0, 4
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #16
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 256
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+def_fn_816_base
+def_fn_816_base identity_
+
+.macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ push {r4-r11,lr}
+ vpush {q4-q7}
+.if \w == 8
+ movrel_local r4, inv_\txfm1\()_8h_x8_neon
+ movrel_local r5, inv_\txfm2\()_4h_x16_neon
+.else
+.ifc \txfm1, identity
+ movrel_local r9, inv_txfm_horz_scale_identity_16x4_neon
+.else
+ movrel_local r4, inv_\txfm1\()_4h_x16_neon
+ movrel_local r9, inv_txfm_horz_scale_16x4_neon
+.endif
+ movrel_local r5, inv_\txfm2\()_8h_x8_neon
+.endif
+.if \w == 8
+ mov r10, #\eob_8x8
+.else
+ mov r10, #\eob_4x4
+.endif
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+ b inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct, 43, 10
+def_fn_816 \w, \h, identity, identity, 43, 10
+def_fn_816 \w, \h, dct, adst, 43, 10
+def_fn_816 \w, \h, dct, flipadst, 43, 10
+def_fn_816 \w, \h, dct, identity, 8, 4
+def_fn_816 \w, \h, adst, dct, 43, 10
+def_fn_816 \w, \h, adst, adst, 43, 10
+def_fn_816 \w, \h, adst, flipadst, 43, 10
+def_fn_816 \w, \h, flipadst, dct, 43, 10
+def_fn_816 \w, \h, flipadst, adst, 43, 10
+def_fn_816 \w, \h, flipadst, flipadst, 43, 10
+def_fn_816 \w, \h, identity, dct, 64, 4
+def_fn_816 \w, \h, adst, identity, 8, 4
+def_fn_816 \w, \h, flipadst, identity, 8, 4
+def_fn_816 \w, \h, identity, adst, 64, 4
+def_fn_816 \w, \h, identity, flipadst, 64, 4
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
+function inv_dct32_odd_4h_x16_neon, export=1
+ movrel_local r12, idct_coeffs, 2*16
+ vld1.16 {q0, q1}, [r12, :128]
+ sub r12, r12, #2*16
+
+ vmull_vmlsl q2, d16, d31, d0[0], d0[1] // -> t16a
+ vmull_vmlal q3, d16, d31, d0[1], d0[0] // -> t31a
+ vmull_vmlsl q4, d24, d23, d0[2], d0[3] // -> t17a
+ vqrshrn.s32 d16, q2, #12 // t16a
+ vqrshrn.s32 d31, q3, #12 // t31a
+ vmull_vmlal q2, d24, d23, d0[3], d0[2] // -> t30a
+ vmull_vmlsl q3, d20, d27, d1[0], d1[1] // -> t18a
+ vqrshrn.s32 d24, q4, #12 // t17a
+ vqrshrn.s32 d23, q2, #12 // t30a
+ vmull_vmlal q4, d20, d27, d1[1], d1[0] // -> t29a
+ vmull_vmlsl q2, d28, d19, d1[2], d1[3] // -> t19a
+ vqrshrn.s32 d20, q3, #12 // t18a
+ vqrshrn.s32 d27, q4, #12 // t29a
+ vmull_vmlal q3, d28, d19, d1[3], d1[2] // -> t28a
+ vmull_vmlsl q4, d18, d29, d2[0], d2[1] // -> t20a
+ vqrshrn.s32 d28, q2, #12 // t19a
+ vqrshrn.s32 d19, q3, #12 // t28a
+ vmull_vmlal q2, d18, d29, d2[1], d2[0] // -> t27a
+ vmull_vmlsl q3, d26, d21, d2[2], d2[3] // -> t21a
+ vqrshrn.s32 d18, q4, #12 // t20a
+ vqrshrn.s32 d29, q2, #12 // t27a
+ vmull_vmlal q4, d26, d21, d2[3], d2[2] // -> t26a
+ vmull_vmlsl q2, d22, d25, d3[0], d3[1] // -> t22a
+ vqrshrn.s32 d26, q3, #12 // t21a
+ vqrshrn.s32 d21, q4, #12 // t26a
+ vmull_vmlal q3, d22, d25, d3[1], d3[0] // -> t25a
+ vmull_vmlsl q4, d30, d17, d3[2], d3[3] // -> t23a
+ vqrshrn.s32 d22, q2, #12 // t22a
+ vqrshrn.s32 d25, q3, #12 // t25a
+ vmull_vmlal q2, d30, d17, d3[3], d3[2] // -> t24a
+ vqrshrn.s32 d30, q4, #12 // t23a
+ vqrshrn.s32 d17, q2, #12 // t24a
+
+ vld1.16 {q0}, [r12, :128]
+
+ vqsub.s16 d2, d16, d24 // t17
+ vqadd.s16 d16, d16, d24 // t16
+ vqsub.s16 d3, d31, d23 // t30
+ vqadd.s16 d31, d31, d23 // t31
+ vqsub.s16 d24, d28, d20 // t18
+ vqadd.s16 d28, d28, d20 // t19
+ vqadd.s16 d23, d18, d26 // t20
+ vqsub.s16 d18, d18, d26 // t21
+ vqsub.s16 d20, d30, d22 // t22
+ vqadd.s16 d30, d30, d22 // t23
+ vqadd.s16 d26, d17, d25 // t24
+ vqsub.s16 d17, d17, d25 // t25
+ vqsub.s16 d22, d29, d21 // t26
+ vqadd.s16 d29, d29, d21 // t27
+ vqadd.s16 d25, d19, d27 // t28
+ vqsub.s16 d19, d19, d27 // t29
+
+ vmull_vmlsl q2, d3, d2, d1[0], d1[1] // -> t17a
+ vmull_vmlal q3, d3, d2, d1[1], d1[0] // -> t30a
+ vmull_vmlal q4, d19, d24, d1[1], d1[0] // -> t18a
+ vqrshrn.s32 d21, q2, #12 // t17a
+ vqrshrn.s32 d27, q3, #12 // t30a
+ vneg.s32 q4, q4 // -> t18a
+ vmull_vmlsl q1, d19, d24, d1[0], d1[1] // -> t29a
+ vmull_vmlsl q2, d22, d18, d1[2], d1[3] // -> t21a
+ vqrshrn.s32 d19, q4, #12 // t18a
+ vqrshrn.s32 d24, q1, #12 // t29a
+ vmull_vmlal q3, d22, d18, d1[3], d1[2] // -> t26a
+ vmull_vmlal q4, d17, d20, d1[3], d1[2] // -> t22a
+ vqrshrn.s32 d22, q2, #12 // t21a
+ vqrshrn.s32 d18, q3, #12 // t26a
+ vneg.s32 q4, q4 // -> t22a
+ vmull_vmlsl q1, d17, d20, d1[2], d1[3] // -> t25a
+ vqrshrn.s32 d17, q4, #12 // t22a
+ vqrshrn.s32 d20, q1, #12 // t25a
+
+ vqsub.s16 d2, d27, d24 // t29
+ vqadd.s16 d27, d27, d24 // t30
+ vqsub.s16 d3, d21, d19 // t18
+ vqadd.s16 d21, d21, d19 // t17
+ vqsub.s16 d24, d16, d28 // t19a
+ vqadd.s16 d16, d16, d28 // t16a
+ vqsub.s16 d19, d30, d23 // t20a
+ vqadd.s16 d30, d30, d23 // t23a
+ vqsub.s16 d28, d17, d22 // t21
+ vqadd.s16 d17, d17, d22 // t22
+ vqadd.s16 d23, d26, d29 // t24a
+ vqsub.s16 d26, d26, d29 // t27a
+ vqadd.s16 d22, d20, d18 // t25
+ vqsub.s16 d20, d20, d18 // t26
+ vqsub.s16 d29, d31, d25 // t28a
+ vqadd.s16 d31, d31, d25 // t31a
+
+ vmull_vmlsl q2, d2, d3, d0[2], d0[3] // -> t18a
+ vmull_vmlal q3, d2, d3, d0[3], d0[2] // -> t29a
+ vmull_vmlsl q4, d29, d24, d0[2], d0[3] // -> t19
+ vqrshrn.s32 d18, q2, #12 // t18a
+ vqrshrn.s32 d25, q3, #12 // t29a
+ vmull_vmlal q1, d29, d24, d0[3], d0[2] // -> t28
+ vmull_vmlal q2, d26, d19, d0[3], d0[2] // -> t20
+ vqrshrn.s32 d29, q4, #12 // t19
+ vqrshrn.s32 d24, q1, #12 // t28
+ vneg.s32 q2, q2 // -> t20
+ vmull_vmlsl q3, d26, d19, d0[2], d0[3] // -> t27
+ vmull_vmlal q4, d20, d28, d0[3], d0[2] // -> t21a
+ vqrshrn.s32 d26, q2, #12 // t20
+ vqrshrn.s32 d19, q3, #12 // t27
+ vneg.s32 q4, q4 // -> t21a
+ vmull_vmlsl q1, d20, d28, d0[2], d0[3] // -> t26a
+ vqrshrn.s32 d20, q4, #12 // t21a
+ vqrshrn.s32 d28, q1, #12 // t26a
+
+ vqsub.s16 d2, d16, d30 // t23
+ vqadd.s16 d16, d16, d30 // t16 = out16
+ vqsub.s16 d3, d31, d23 // t24
+ vqadd.s16 d31, d31, d23 // t31 = out31
+ vqsub.s16 d23, d21, d17 // t22a
+ vqadd.s16 d17, d21, d17 // t17a = out17
+ vqadd.s16 d30, d27, d22 // t30a = out30
+ vqsub.s16 d21, d27, d22 // t25a
+ vqsub.s16 d27, d18, d20 // t21
+ vqadd.s16 d18, d18, d20 // t18 = out18
+ vqadd.s16 d4, d29, d26 // t19a = out19
+ vqsub.s16 d26, d29, d26 // t20a
+ vqadd.s16 d29, d25, d28 // t29 = out29
+ vqsub.s16 d25, d25, d28 // t26
+ vqadd.s16 d28, d24, d19 // t28a = out28
+ vqsub.s16 d24, d24, d19 // t27a
+ vmov d19, d4 // out19
+
+ vmull_vmlsl q2, d24, d26, d0[0], d0[0] // -> t20
+ vmull_vmlal q3, d24, d26, d0[0], d0[0] // -> t27
+ vqrshrn.s32 d20, q2, #12 // t20
+ vqrshrn.s32 d22, q3, #12 // t27
+
+ vmull_vmlal q2, d25, d27, d0[0], d0[0] // -> t26a
+ vmull_vmlsl q3, d25, d27, d0[0], d0[0] // -> t21a
+ vmov d27, d22 // t27
+ vqrshrn.s32 d26, q2, #12 // t26a
+
+ vmull_vmlsl q12, d21, d23, d0[0], d0[0] // -> t22
+ vmull_vmlal q2, d21, d23, d0[0], d0[0] // -> t25
+ vqrshrn.s32 d21, q3, #12 // t21a
+ vqrshrn.s32 d22, q12, #12 // t22
+ vqrshrn.s32 d25, q2, #12 // t25
+
+ vmull_vmlsl q2, d3, d2, d0[0], d0[0] // -> t23a
+ vmull_vmlal q3, d3, d2, d0[0], d0[0] // -> t24a
+ vqrshrn.s32 d23, q2, #12 // t23a
+ vqrshrn.s32 d24, q3, #12 // t24a
+
+ bx lr
+endfunc
+
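+// A 32-point DCT column is computed as a 16-point DCT of the even inputs
+// (inv_dct_4h_x16_neon) plus the odd half above (inv_dct32_odd_4h_x16_neon);
+// the horizontal and vertical helpers below run both and combine them with
+// add/sub butterflies, reversing the odd outputs for the second half of the
+// results.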
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x4_neon
+ push {lr}
+ vmov.i16 d7, #0
+ lsl r8, r8, #1
+.if \scale
+ movw r12, #2896*8
+ vdup.16 d0, r12
+.endif
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64]
+ vst1.16 {d7}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ add r7, r7, r8, lsr #1
+.if \scale
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ bl inv_dct_4h_x16_neon
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ transpose_4x4h q14, q15, d28, d29, d30, d31
+
+.macro store1 r0, r1, r2, r3
+ vst1.16 {\r0}, [r6, :64]!
+ vst1.16 {\r1}, [r6, :64]!
+ vst1.16 {\r2}, [r6, :64]!
+ vst1.16 {\r3}, [r6, :64]!
+ add r6, r6, #32
+.endm
+ store1 d16, d20, d24, d28
+ store1 d17, d21, d25, d29
+ store1 d18, d22, d26, d30
+ store1 d19, d23, d27, d31
+.purgem store1
+ sub r6, r6, #64*4
+
+ vmov.i16 d7, #0
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64]
+ vst1.16 {d7}, [r7, :64], r8
+.endr
+.if \scale
+ // This relies on the fact that the idct also leaves the right coeff in d0[1]
+ scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ bl inv_dct32_odd_4h_x16_neon
+ transpose_4x4h q15, q14, d31, d30, d29, d28
+ transpose_4x4h q13, q12, d27, d26, d25, d24
+ transpose_4x4h q11, q10, d23, d22, d21, d20
+ transpose_4x4h q9, q8, d19, d18, d17, d16
+.macro store2 r0, r1, r2, r3, shift
+ vld1.16 {q0, q1}, [r6, :128]
+ vqsub.s16 d7, d0, \r0
+ vqadd.s16 d0, d0, \r0
+ vqsub.s16 d6, d1, \r1
+ vqadd.s16 d1, d1, \r1
+ vqsub.s16 d5, d2, \r2
+ vqadd.s16 d2, d2, \r2
+ vqsub.s16 d4, d3, \r3
+ vqadd.s16 d3, d3, \r3
+ vrev64.16 q2, q2
+ vrev64.16 q3, q3
+ vrshr.s16 q0, q0, #\shift
+ vrshr.s16 q1, q1, #\shift
+ vrshr.s16 q2, q2, #\shift
+ vrshr.s16 q3, q3, #\shift
+ vst1.16 {q0, q1}, [r6, :128]!
+ vst1.16 {q2, q3}, [r6, :128]!
+.endm
+
+ store2 d31, d27, d23, d19, \shift
+ store2 d30, d26, d22, d18, \shift
+ store2 d29, d25, d21, d17, \shift
+ store2 d28, d24, d20, d16, \shift
+.purgem store2
+ pop {pc}
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
+function inv_txfm_add_vert_dct_4x32_neon
+ push {r10-r11,lr}
+ lsl r8, r8, #1
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+
+ bl inv_dct_4h_x16_neon
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vst1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ add r7, r7, r8, lsr #1
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ sub r7, r7, r8, lsr #1
+ bl inv_dct32_odd_4h_x16_neon
+
+ neg r9, r8
+ mov r10, r6
+.macro combine r0, r1, r2, r3, op, stride
+ vld1.16 {d4}, [r7, :64], \stride
+ vld1.32 {d2[0]}, [r10, :32], r1
+ vld1.16 {d5}, [r7, :64], \stride
+ vld1.32 {d2[1]}, [r10, :32], r1
+ \op\().s16 d4, d4, \r0
+ vld1.16 {d6}, [r7, :64], \stride
+ vld1.32 {d3[0]}, [r10, :32], r1
+ \op\().s16 d5, d5, \r1
+ vld1.32 {d3[1]}, [r10, :32], r1
+ vrshr.s16 q2, q2, #4
+ \op\().s16 d6, d6, \r2
+ vld1.16 {d7}, [r7, :64], \stride
+ vaddw.u8 q2, q2, d2
+ \op\().s16 d7, d7, \r3
+ vqmovun.s16 d2, q2
+ vrshr.s16 q3, q3, #4
+ vst1.32 {d2[0]}, [r6, :32], r1
+ vaddw.u8 q3, q3, d3
+ vst1.32 {d2[1]}, [r6, :32], r1
+ vqmovun.s16 d3, q3
+ vst1.32 {d3[0]}, [r6, :32], r1
+ vst1.32 {d3[1]}, [r6, :32], r1
+.endm
+ combine d31, d30, d29, d28, vqadd, r8
+ combine d27, d26, d25, d24, vqadd, r8
+ combine d23, d22, d21, d20, vqadd, r8
+ combine d19, d18, d17, d16, vqadd, r8
+ sub r7, r7, r8
+ combine d16, d17, d18, d19, vqsub, r9
+ combine d20, d21, d22, d23, vqsub, r9
+ combine d24, d25, d26, d27, vqsub, r9
+ combine d28, d29, d30, d31, vqsub, r9
+.purgem combine
+
+ pop {r10-r11,pc}
+endfunc
+
+const eob_32x32
+ .short 10, 36, 78, 136, 210, 300, 406, 1024
+endconst
+
+const eob_16x32
+ .short 10, 36, 78, 151, 215, 279, 343, 512
+endconst
+
+const eob_16x32_shortside
+ .short 10, 36, 78, 512
+endconst
+
+const eob_8x32
+ // Contrary to the others, this one is only ever used in increments of 8x8
+ .short 43, 107, 171, 256
+endconst
+
+function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1
+ push {r4-r7,lr}
+ vmov.i16 q0, #0
+ movrel_local r5, eob_32x32, 2
+
+ mov r6, #2*32
+1:
+ mov r12, #0
+ movrel_local r4, eob_32x32, 2
+2:
+ add r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q0}, [r2, :128], r6
+.endr
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+ load_add_store_8x8 r0, r7, shiftbits=2
+ ldrh lr, [r4], #4
+ sub r0, r0, r1, lsl #3
+ cmp r3, lr
+ add r0, r0, #8
+ bge 2b
+
+ ldrh lr, [r5], #4
+ cmp r3, lr
+ blt 9f
+
+ sub r0, r0, r12
+ add r0, r0, r1, lsl #3
+ mls r2, r6, r12, r2
+ add r2, r2, #2*8
+ b 1b
+9:
+ pop {r4-r7,pc}
+endfunc
+
+.macro shift_8_regs op, shift
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ \op \i, \i, #\shift
+.endr
+.endm
+
+.macro def_identity_1632 w, h, wshort, hshort
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+ push {r4-r7,lr}
+ movw r6, #2896*8
+ movw r7, #2*(5793-4096)*8
+ vdup.i16 d0, r6
+ movrel_local r5, eob_16x32\hshort, 2
+ vmov.16 d0[1], r7
+
+ mov r6, #2*\h
+1:
+ mov r12, #0
+ movrel_local r4, eob_16x32\wshort, 2
+2:
+ vmov.i16 q1, #0
+ add r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q1}, [r2, :128], r6
+.endr
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+
+.if \w == 16
+ // 16x32
+ identity_8x8_shift1 d0[1]
+.else
+ // 32x16
+ shift_8_regs vqshl.s16, 1
+ identity_8x8 d0[1]
+.endif
+
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+.if \w == 16
+ load_add_store_8x8 r0, r7, shiftbits=2
+.else
+ load_add_store_8x8 r0, r7, shiftbits=4
+.endif
+ ldrh lr, [r4], #4
+ sub r0, r0, r1, lsl #3
+ cmp r3, lr
+ add r0, r0, #8
+ bge 2b
+
+ ldrh lr, [r5], #4
+ cmp r3, lr
+ blt 9f
+
+ sub r0, r0, r12
+ add r0, r0, r1, lsl #3
+ mls r2, r6, r12, r2
+ add r2, r2, #2*8
+ b 1b
+9:
+ pop {r4-r7,pc}
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+.macro def_identity_832 w, h
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+ push {r4-r5,lr}
+ vmov.i16 q0, #0
+ movrel_local r4, eob_8x32
+
+ mov r12, #2*\h
+1:
+ ldrh lr, [r4], #2
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q0}, [r2, :128], r12
+.endr
+
+.if \w == 8
+ // 8x32
+ shift_8_regs vrshr.s16, 1
+.endif
+
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+ cmp r3, lr
+.if \w == 8
+ load_add_store_8x8 r0, r5, shiftbits=2
+.else
+ load_add_store_8x8 r0, r5, shiftbits=3
+.endif
+
+ blt 9f
+.if \w == 8
+ sub r2, r2, r12, lsl #3
+ add r2, r2, #2*8
+.else
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #8
+.endif
+ b 1b
+
+9:
+ pop {r4-r5,pc}
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1
+ idct_dc 32, 32, 2
+
+ push {r4-r11,lr}
+ vpush {q4}
+ sub_sp_align 2048
+ movrel_local r10, eob_32x32
+ ldrh r11, [r10], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, sp, #(\i*32*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 2048
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1
+ idct_dc 16, 32, 1
+
+ push {r4-r11,lr}
+ vpush {q4}
+ sub_sp_align 1024
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2
+ movrel_local r4, inv_dct_4h_x16_neon
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, sp, #(\i*16*2)
+ add r7, r2, #(\i*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endif
+ mov r8, #2*32
+ bl inv_txfm_horz_scale_16x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #4
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #16*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 1024
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1
+ idct_dc 32, 16, 1
+
+ push {r4-r11,lr}
+ vpush {q4}
+ sub_sp_align 1024
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2
+ movrel_local r5, inv_dct_4h_x16_neon
+
+.irp i, 0, 4, 8, 12
+ add r6, sp, #(\i*32*2)
+ add r7, r2, #(\i*2)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 12
+ ldrh r11, [r10], #2
+.endif
+.endif
+ mov r8, #2*16
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 1024
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1
+ idct_dc 8, 32, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 512
+
+ movrel_local r10, eob_8x32
+
+ mov r8, #2*32
+ mov r9, #32
+ mov r6, sp
+1:
+ vmov.i16 q0, #0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q0}, [r2, :128], r8
+.endr
+ ldrh r11, [r10], #2
+ sub r2, r2, r8, lsl #3
+ sub r9, r9, #8
+ add r2, r2, #2*8
+
+ bl inv_dct_8h_x8_neon
+
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vrshr.s16 \i, \i, #2
+.endr
+
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+ vst1.16 {q8, q9}, [r6, :128]!
+ cmp r3, r11
+ vst1.16 {q10, q11}, [r6, :128]!
+ vst1.16 {q12, q13}, [r6, :128]!
+ vst1.16 {q14, q15}, [r6, :128]!
+
+ bge 1b
+ cmp r9, #0
+ beq 3f
+
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r9, r9, #8
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #8*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 512
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1
+ idct_dc 32, 8, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 512
+
+.irp i, 0, 4
+ add r6, sp, #(\i*32*2)
+ add r7, r2, #(\i*2)
+.if \i > 0
+ cmp r3, #10
+ blt 1f
+.endif
+ mov r8, #8*2
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 2f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+
+2:
+ mov r8, #2*32
+ mov r9, #0
+1:
+ add r6, r0, r9
+ add r7, sp, r9, lsl #1 // #(\i*2)
+
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r7, :128], r8
+.endr
+ add r9, r9, #8
+
+ bl inv_dct_8h_x8_neon
+
+ cmp r9, #32
+
+ load_add_store_8x8 r6, r7
+
+ blt 1b
+
+ add_sp_align 512
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_dct64_step1_neon
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+ vld1.16 {d0, d1, d2}, [r12, :64]!
+
+ vqrdmulh.s16 d23, d16, d0[1] // t63a
+ vqrdmulh.s16 d16, d16, d0[0] // t32a
+ vqrdmulh.s16 d22, d17, d0[2] // t62a
+ vqrdmulh.s16 d17, d17, d0[3] // t33a
+ vqrdmulh.s16 d21, d18, d1[1] // t61a
+ vqrdmulh.s16 d18, d18, d1[0] // t34a
+ vqrdmulh.s16 d20, d19, d1[2] // t60a
+ vqrdmulh.s16 d19, d19, d1[3] // t35a
+
+ vqadd.s16 d24, d16, d17 // t32
+ vqsub.s16 d25, d16, d17 // t33
+ vqsub.s16 d26, d19, d18 // t34
+ vqadd.s16 d27, d19, d18 // t35
+ vqadd.s16 d28, d20, d21 // t60
+ vqsub.s16 d29, d20, d21 // t61
+ vqsub.s16 d30, d23, d22 // t62
+ vqadd.s16 d31, d23, d22 // t63
+
+ vmull_vmlal q2, d29, d26, d2[0], d2[1] // -> t34a
+ vmull_vmlsl q3, d29, d26, d2[1], d2[0] // -> t61a
+ vneg.s32 q2, q2 // t34a
+ vmull_vmlsl q4, d30, d25, d2[1], d2[0] // -> t33a
+ vqrshrn.s32 d26, q2, #12 // t34a
+ vmull_vmlal q2, d30, d25, d2[0], d2[1] // -> t62a
+ vqrshrn.s32 d29, q3, #12 // t61a
+ vqrshrn.s32 d25, q4, #12 // t33a
+ vqrshrn.s32 d30, q2, #12 // t62a
+
+ vqadd.s16 d16, d24, d27 // t32a
+ vqsub.s16 d19, d24, d27 // t35a
+ vqadd.s16 d17, d25, d26 // t33
+ vqsub.s16 d18, d25, d26 // t34
+ vqsub.s16 d20, d31, d28 // t60a
+ vqadd.s16 d23, d31, d28 // t63a
+ vqsub.s16 d21, d30, d29 // t61
+ vqadd.s16 d22, d30, d29 // t62
+
+ vmull_vmlal q2, d21, d18, d2[2], d2[3] // -> t61a
+ vmull_vmlsl q3, d21, d18, d2[3], d2[2] // -> t34a
+ vmull_vmlal q4, d20, d19, d2[2], d2[3] // -> t60
+ vqrshrn.s32 d21, q2, #12 // t61a
+ vqrshrn.s32 d18, q3, #12 // t34a
+ vmull_vmlsl q2, d20, d19, d2[3], d2[2] // -> t35
+ vqrshrn.s32 d20, q4, #12 // t60
+ vqrshrn.s32 d19, q2, #12 // t35
+
+ vst1.16 {d16, d17, d18, d19}, [r6, :128]!
+ vst1.16 {d20, d21, d22, d23}, [r6, :128]!
+
+ bx lr
+endfunc
+
+function inv_dct64_step2_neon
+ movrel_local r12, idct_coeffs
+ vld1.16 {d0}, [r12, :64]
+1:
+ // t32a/33/34a/35/60/61a/62/63a
+ // t56a/57/58a/59/36/37a/38/39a
+ // t40a/41/42a/43/52/53a/54/55a
+ // t48a/49/50a/51/44/45a/46/47a
+ vldr d16, [r6, #2*4*0] // t32a
+ vldr d17, [r9, #2*4*8] // t39a
+ vldr d18, [r9, #2*4*0] // t63a
+ vldr d19, [r6, #2*4*8] // t56a
+ vldr d20, [r6, #2*4*16] // t40a
+ vldr d21, [r9, #2*4*24] // t47a
+ vldr d22, [r9, #2*4*16] // t55a
+ vldr d23, [r6, #2*4*24] // t48a
+
+ vqadd.s16 d24, d16, d17 // t32
+ vqsub.s16 d25, d16, d17 // t39
+ vqadd.s16 d26, d18, d19 // t63
+ vqsub.s16 d27, d18, d19 // t56
+ vqsub.s16 d28, d21, d20 // t40
+ vqadd.s16 d29, d21, d20 // t47
+ vqadd.s16 d30, d23, d22 // t48
+ vqsub.s16 d31, d23, d22 // t55
+
+ vmull_vmlal q2, d27, d25, d0[3], d0[2] // -> t56a
+ vmull_vmlsl q3, d27, d25, d0[2], d0[3] // -> t39a
+ vmull_vmlal q4, d31, d28, d0[3], d0[2] // -> t40a
+ vqrshrn.s32 d25, q2, #12 // t56a
+ vqrshrn.s32 d27, q3, #12 // t39a
+ vneg.s32 q4, q4 // t40a
+ vmull_vmlsl q2, d31, d28, d0[2], d0[3] // -> t55a
+ vqrshrn.s32 d31, q4, #12 // t40a
+ vqrshrn.s32 d28, q2, #12 // t55a
+
+ vqadd.s16 d16, d24, d29 // t32a
+ vqsub.s16 d19, d24, d29 // t47a
+ vqadd.s16 d17, d27, d31 // t39
+ vqsub.s16 d18, d27, d31 // t40
+ vqsub.s16 d20, d26, d30 // t48a
+ vqadd.s16 d23, d26, d30 // t63a
+ vqsub.s16 d21, d25, d28 // t55
+ vqadd.s16 d22, d25, d28 // t56
+
+ vmull_vmlsl q2, d21, d18, d0[0], d0[0] // -> t40a
+ vmull_vmlal q3, d21, d18, d0[0], d0[0] // -> t55a
+ vmull_vmlsl q4, d20, d19, d0[0], d0[0] // -> t47
+ vqrshrn.s32 d18, q2, #12 // t40a
+ vqrshrn.s32 d21, q3, #12 // t55a
+ vmull_vmlal q2, d20, d19, d0[0], d0[0] // -> t48
+ vqrshrn.s32 d19, q4, #12 // t47
+ vqrshrn.s32 d20, q2, #12 // t48
+
+ vstr d16, [r6, #2*4*0] // t32a
+ vstr d17, [r9, #2*4*0] // t39
+ vstr d18, [r6, #2*4*8] // t40a
+ vstr d19, [r9, #2*4*8] // t47
+ vstr d20, [r6, #2*4*16] // t48
+ vstr d21, [r9, #2*4*16] // t55a
+ vstr d22, [r6, #2*4*24] // t56
+ vstr d23, [r9, #2*4*24] // t63a
+
+ add r6, r6, #2*4
+ sub r9, r9, #2*4
+ cmp r6, r9
+ blt 1b
+ bx lr
+endfunc
+
+.macro load8 src, strd, zero, clear
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23
+.if \clear
+ vld1.16 {\i}, [\src, :64]
+ vst1.16 {\zero}, [\src, :64], \strd
+.else
+ vld1.16 {\i}, [\src, :64], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst
+ vst1.16 {q8, q9}, [\dst, :128]!
+ vst1.16 {q10, q11}, [\dst, :128]!
+ vst1.16 {q12, q13}, [\dst, :128]!
+ vst1.16 {q14, q15}, [\dst, :128]!
+.endm
+
+.macro clear_upper8
+.irp i, q12, q13, q14, q15
+ vmov.i16 \i, #0
+.endr
+.endm
+
+.macro vmov_if reg, val, cond
+.if \cond
+ vmov.i16 \reg, \val
+.endif
+.endm
+
+.macro movdup_if reg, gpr, val, cond
+.if \cond
+ movw \gpr, \val
+ vdup.16 \reg, \gpr
+.endif
+.endm
+
+.macro vst1_if regs, dst, dstalign, cond
+.if \cond
+ vst1.16 \regs, \dst, \dstalign
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
+.if \cond
+ scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
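+// The vmov_if/movdup_if/vst1_if/scale_if helpers above emit their instruction
+// only when the condition argument is set; they let def_dct64_func below
+// build the plain, _clear and _clear_scale variants of the 4h x64 DCT from a
+// single body.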
+.macro def_dct64_func suffix, clear=0, scale=0
+function inv_txfm_dct\suffix\()_4h_x64_neon, export=1
+ mov r6, sp
+
+ push {r10-r11,lr}
+
+ lsl r8, r8, #2
+
+ movdup_if d0, r12, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ load8 r7, r8, d7, \clear
+ clear_upper8
+ sub r7, r7, r8, lsl #3
+ add r7, r7, r8, lsr #1
+ scale_if \scale, d0[0], q8, q9, q10, q11
+
+ bl inv_dct_4h_x16_neon
+
+ store16 r6
+
+ movdup_if d0, r12, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ load8 r7, r8, d7, \clear
+ clear_upper8
+ sub r7, r7, r8, lsl #3
+ lsr r8, r8, #1
+ sub r7, r7, r8, lsr #1
+ scale_if \scale, d0[0], q8, q9, q10, q11
+
+ bl inv_dct32_odd_4h_x16_neon
+
+ add r10, r6, #8*15
+ sub r6, r6, #8*16
+
+ mov r9, #-8
+
+.macro store_addsub r0, r1, r2, r3
+ vld1.16 {d2}, [r6, :64]!
+ vld1.16 {d3}, [r6, :64]!
+ vqadd.s16 d6, d2, \r0
+ vqsub.s16 \r0, d2, \r0
+ vld1.16 {d4}, [r6, :64]!
+ vqadd.s16 d7, d3, \r1
+ vqsub.s16 \r1, d3, \r1
+ vld1.16 {d5}, [r6, :64]!
+ vqadd.s16 d2, d4, \r2
+ sub r6, r6, #8*4
+ vqsub.s16 \r2, d4, \r2
+ vst1.16 {d6}, [r6, :64]!
+ vst1.16 {\r0}, [r10, :64], r9
+ vqadd.s16 d3, d5, \r3
+ vqsub.s16 \r3, d5, \r3
+ vst1.16 {d7}, [r6, :64]!
+ vst1.16 {\r1}, [r10, :64], r9
+ vst1.16 {d2}, [r6, :64]!
+ vst1.16 {\r2}, [r10, :64], r9
+ vst1.16 {d3}, [r6, :64]!
+ vst1.16 {\r3}, [r10, :64], r9
+.endm
+ store_addsub d31, d30, d29, d28
+ store_addsub d27, d26, d25, d24
+ store_addsub d23, d22, d21, d20
+ store_addsub d19, d18, d17, d16
+.purgem store_addsub
+
+ add r6, r6, #2*4*16
+
+ movrel_local r12, idct64_coeffs
+ movdup_if d0, lr, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ add r9, r7, r8, lsl #4 // offset 16
+ add r10, r7, r8, lsl #3 // offset 8
+ sub r9, r9, r8 // offset 15
+ sub r11, r10, r8 // offset 7
+ vld1.16 {d16}, [r7, :64] // in1 (offset 0)
+ vld1.16 {d17}, [r9, :64] // in31 (offset 15)
+ vld1.16 {d18}, [r10, :64] // in17 (offset 8)
+ vld1.16 {d19}, [r11, :64] // in15 (offset 7)
+ vst1_if {d7}, [r7, :64], \clear
+ vst1_if {d7}, [r9, :64], \clear
+ vst1_if {d7}, [r10, :64], \clear
+ vst1_if {d7}, [r11, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ add r7, r7, r8, lsl #2 // offset 4
+ sub r9, r9, r8, lsl #2 // offset 11
+ sub r10, r7, r8 // offset 3
+ add r11, r9, r8 // offset 12
+ vld1.16 {d16}, [r10, :64] // in7 (offset 3)
+ vld1.16 {d17}, [r11, :64] // in25 (offset 12)
+ vld1.16 {d18}, [r9, :64] // in23 (offset 11)
+ vld1.16 {d19}, [r7, :64] // in9 (offset 4)
+ vst1_if {d7}, [r7, :64], \clear
+ vst1_if {d7}, [r9, :64], \clear
+ vst1_if {d7}, [r10, :64], \clear
+ vst1_if {d7}, [r11, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ sub r10, r10, r8, lsl #1 // offset 1
+ sub r9, r9, r8, lsl #1 // offset 9
+ add r10, r10, r8 // offset 2
+ add r9, r9, r8 // offset 10
+ add r7, r7, r8 // offset 5
+ add r11, r11, r8 // offset 13
+ vld1.16 d16, [r10, :64] // in5 (offset 2)
+ vld1.16 d17, [r11, :64] // in27 (offset 13)
+ vld1.16 d18, [r9, :64] // in21 (offset 10)
+ vld1.16 d19, [r7, :64] // in11 (offset 5)
+ vst1_if d7, [r10, :64], \clear
+ vst1_if d7, [r11, :64], \clear
+ vst1_if d7, [r9, :64], \clear
+ vst1_if d7, [r7, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ sub r10, r10, r8 // offset 1
+ sub r9, r9, r8 // offset 9
+ add r11, r11, r8 // offset 14
+ add r7, r7, r8 // offset 6
+ vld1.16 d16, [r10, :64] // in3 (offset 1)
+ vld1.16 d17, [r11, :64] // in29 (offset 14)
+ vld1.16 d18, [r9, :64] // in19 (offset 9)
+ vld1.16 d19, [r7, :64] // in13 (offset 6)
+ vst1_if d7, [r10, :64], \clear
+ vst1_if d7, [r11, :64], \clear
+ vst1_if d7, [r9, :64], \clear
+ vst1_if d7, [r7, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+
+ sub r6, r6, #2*4*32
+ add r9, r6, #2*4*7
+
+ bl inv_dct64_step2_neon
+
+ pop {r10-r11,pc}
+endfunc
+.endm
+
+def_dct64_func
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
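+
+// The 64-point DCT is built hierarchically: a 16-point DCT plus the 32-point
+// odd half cover the even input positions, and inv_dct64_step1/step2 handle
+// the odd ones. Only the first 32 inputs per line are ever read, since AV1
+// codes no coefficients beyond the first 32 in either dimension of a 64-point
+// transform, which is also why the callers reuse the 32-based eob tables.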
+
+function inv_txfm_horz_dct_64x4_neon
+ vdup.16 q3, r9
+
+ mov r7, sp
+ add r8, sp, #2*4*(64 - 4)
+ add r9, r6, #2*56
+
+ push {r10-r11,lr}
+
+ mov r10, #2*64
+ mov r11, #-2*4*4
+
+1:
+ vld1.16 {d16, d17, d18, d19}, [r7, :128]!
+ vld1.16 {d28, d29, d30, d31}, [r8, :128], r11
+ vld1.16 {d20, d21, d22, d23}, [r7, :128]!
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q15, q14, d31, d30, d29, d28
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q13, q12, d27, d26, d25, d24
+
+.macro store_addsub src0, src1, src2, src3
+ vqsub.s16 d3, \src0, \src1
+ vqsub.s16 d2, \src2, \src3
+ vqadd.s16 d0, \src0, \src1
+ vqadd.s16 d1, \src2, \src3
+ vrshl.s16 q1, q1, q3
+ vrshl.s16 q0, q0, q3
+ vrev64.16 q1, q1
+ vst1.16 {q0}, [r6, :128], r10
+ vst1.16 {q1}, [r9, :128], r10
+.endm
+ store_addsub d16, d31, d20, d27
+ store_addsub d17, d30, d21, d26
+ store_addsub d18, d29, d22, d25
+ store_addsub d19, d28, d23, d24
+.purgem store_addsub
+ sub r6, r6, r10, lsl #2
+ sub r9, r9, r10, lsl #2
+ add r6, r6, #16
+ sub r9, r9, #16
+
+ cmp r7, r8
+ blt 1b
+ pop {r10-r11,pc}
+endfunc
+
+function inv_txfm_add_vert_dct_4x64_neon
+ lsl r8, r8, #1
+
+ mov r7, sp
+ add r8, sp, #2*4*(64 - 4)
+ add r9, r6, r1, lsl #6
+ sub r9, r9, r1
+
+ push {r10-r11,lr}
+
+ neg r10, r1
+ mov r11, #-2*4*4
+
+1:
+ vld1.16 {d16, d17, d18, d19}, [r7, :128]!
+ vld1.16 {d28, d29, d30, d31}, [r8, :128], r11
+ vld1.16 {d20, d21, d22, d23}, [r7, :128]!
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11
+
+.macro add_dest_addsub src0, src1, src2, src3
+ vld1.32 {d0[0]}, [r6, :32], r1
+ vld1.32 {d1[0]}, [r9, :32], r10
+ vqadd.s16 d4, \src0, \src1
+ vld1.32 {d0[1]}, [r6, :32]
+ vqadd.s16 d5, \src2, \src3
+ vld1.32 {d1[1]}, [r9, :32]
+ vqsub.s16 d6, \src0, \src1
+ vqsub.s16 d7, \src2, \src3
+ sub r6, r6, r1
+ sub r9, r9, r10
+ vrshr.s16 q2, q2, #4
+ vrshr.s16 q3, q3, #4
+ vaddw.u8 q2, q2, d0
+ vaddw.u8 q3, q3, d1
+ vqmovun.s16 d0, q2
+ vqmovun.s16 d1, q3
+ vst1.32 {d0[0]}, [r6, :32], r1
+ vst1.32 {d1[0]}, [r9, :32], r10
+ vst1.32 {d0[1]}, [r6, :32], r1
+ vst1.32 {d1[1]}, [r9, :32], r10
+.endm
+ add_dest_addsub d16, d31, d17, d30
+ add_dest_addsub d18, d29, d19, d28
+ add_dest_addsub d20, d27, d21, d26
+ add_dest_addsub d22, d25, d23, d24
+.purgem add_dest_addsub
+ cmp r7, r8
+ blt 1b
+
+ pop {r10-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
+ idct_dc 64, 64, 2
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 64*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r5, #(\i*64*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_dct_clear_4h_x64_neon
+ add r6, r5, #(\i*64*2)
+ mov r9, #-2 // shift
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r7, r5, #(\i*2)
+ mov r8, #64*2
+ bl inv_txfm_dct_4h_x64_neon
+ add r6, r0, #(\i)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 64*32*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1
+ idct_dc 64, 32, 1
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 64*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r5, #(\i*64*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_dct_clear_scale_4h_x64_neon
+ add r6, r5, #(\i*64*2)
+ mov r9, #-1 // shift
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r6, r0, #(\i)
+ add r7, r5, #(\i*2)
+ mov r8, #64*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 64*32*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
+ idct_dc 32, 64, 1
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 32*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_32x32
+ ldrh r11, [r10], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r5, #(\i*32*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r7, r5, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_dct_4h_x64_neon
+ add r6, r0, #(\i)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 32*32*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
+ idct_dc 64, 16, 2
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 64*16*2+64*4*2
+ add r4, sp, #64*4*2
+
+ movrel_local r10, eob_16x32
+
+.irp i, 0, 4, 8, 12
+ add r6, r4, #(\i*64*2)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #16*2
+ bl inv_txfm_dct_clear_4h_x64_neon
+ add r6, r4, #(\i*64*2)
+ mov r9, #-2 // shift
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 12
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+ movrel_local r5, inv_dct_4h_x16_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r6, r0, #(\i)
+ add r7, r4, #(\i*2)
+ mov r8, #64*2
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 64*16*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
+ idct_dc 16, 64, 2
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 16*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2
+
+ movrel_local r4, inv_dct_4h_x16_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r5, #(\i*16*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_horz_16x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #4
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12
+ add r7, r5, #(\i*2)
+ mov r8, #16*2
+ bl inv_txfm_dct_4h_x64_neon
+ add r6, r0, #(\i)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 16*32*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/itx16.S b/third_party/dav1d/src/arm/32/itx16.S
new file mode 100644
index 0000000000..aa6c272e71
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/itx16.S
@@ -0,0 +1,3625 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);
+
+// Most of the functions use the following register layout:
+// r0-r3 external parameters
+// r4 function pointer to first transform
+// r5 function pointer to second transform
+// r6 output parameter for helper function
+// r7 input parameter for helper function
+// r8 input stride for helper function
+// r9 scratch variable for helper functions
+// r10-r11 pointer to list of eob thresholds, eob threshold value,
+// scratch variables within helper functions (backed up)
+
+// The SIMD registers most often use the following layout:
+// d0-d3 multiplication coefficients
+// d4-d7 scratch registers
+// d8-d15 unused in some transforms, used for scratch registers in others
+// d16-d31 inputs/outputs of transforms
+
+// Potential further optimizations that are left unimplemented for now:
+// - Trying to keep multiplication coefficients in registers across multiple
+// transform functions. (The register layout is designed to potentially
+// allow this.)
+// - Use a simplified version of the transforms themselves for cases where
+// we know a significant number of inputs are zero. E.g. if the eob value
+// indicates only a quarter of input values are set, for idct16 and up,
+// a significant amount of calculation can be skipped, at the cost of more
+// code duplication and special casing.
+
+// A macro for cases where a thumb mov can express the constant in one
+// instruction, while arm mode requires a separate movw+movt pair.
+.macro mov_const reg, val
+#if CONFIG_THUMB
+ mov.w \reg, #\val
+#else
+ movw \reg, #((\val) & 0xffff)
+ movt \reg, #(((\val) >> 16) & 0xffff)
+#endif
+.endm
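+
+// For example, "mov_const r12, 2896*8*(1<<16)" (as used by the idct_dc macro
+// below) assembles to a single mov.w in Thumb mode, but needs the movw+movt
+// pair in ARM mode.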
+
+const idct_coeffs, align=4
+ // idct4
+ .int 2896, 2896*8*(1<<16), 1567, 3784
+ // idct8
+ .int 799, 4017, 3406, 2276
+ // idct16
+ .int 401, 4076, 3166, 2598
+ .int 1931, 3612, 3920, 1189
+ // idct32
+ .int 201, 4091, 3035, 2751
+ .int 1751, 3703, 3857, 1380
+ .int 995, 3973, 3513, 2106
+ .int 2440, 3290, 4052, 601
+endconst
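+
+// These appear to be the standard AV1 cosine constants, i.e. cos(i*pi/128) in
+// 12-bit fixed point (2896 is roughly sqrt(2)/2 * 4096). The entries scaled
+// by 8*(1<<16) are pre-shifted so that vqrdmulh.s32, which returns
+// round(x*c/2^31), yields an effective multiply by c/4096.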
+
+const idct64_coeffs, align=4
+ .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16)
+ .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16)
+ .int 4076, 401, 4017, 799
+
+ .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16)
+ .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16)
+ .int -3166, -2598, -799, -4017
+
+ .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16)
+ .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16)
+ .int 3612, 1931, 2276, 3406
+
+ .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16)
+ .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16)
+ .int -3920, -1189, -3406, -2276
+endconst
+
+const iadst4_coeffs, align=4
+ .int 1321, 3803, 2482, 3344
+endconst
+
+const iadst8_coeffs, align=4
+ .int 4076, 401, 3612, 1931
+ .int 2598, 3166, 1189, 3920
+ // idct_coeffs
+ .int 2896, 0, 1567, 3784
+endconst
+
+const iadst16_coeffs, align=4
+ .int 4091, 201, 3973, 995
+ .int 3703, 1751, 3290, 2440
+ .int 2751, 3035, 2106, 3513
+ .int 1380, 3857, 601, 4052
+endconst
+
+.macro vmul_vmla d0, s0, s1, c0, c1
+ vmul.i32 \d0, \s0, \c0
+ vmla.i32 \d0, \s1, \c1
+.endm
+
+.macro vmul_vmls d0, s0, s1, c0, c1
+ vmul.i32 \d0, \s0, \c0
+ vmls.i32 \d0, \s1, \c1
+.endm
+
+.macro scale_input c, r0, r1, r2, r3, r4, r5, r6, r7
+ vqrdmulh.s32 \r0, \r0, \c
+ vqrdmulh.s32 \r1, \r1, \c
+.ifnb \r2
+ vqrdmulh.s32 \r2, \r2, \c
+ vqrdmulh.s32 \r3, \r3, \c
+.endif
+.ifnb \r4
+ vqrdmulh.s32 \r4, \r4, \c
+ vqrdmulh.s32 \r5, \r5, \c
+ vqrdmulh.s32 \r6, \r6, \c
+ vqrdmulh.s32 \r7, \r7, \c
+.endif
+.endm
+
+.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4
+.ifnb \load
+ vld1.16 {\load}, [\src, :128], r1
+.endif
+.ifnb \shift
+ vrshr.s16 \shift, \shift, #\shiftbits
+.endif
+.ifnb \addsrc
+ vqadd.s16 \adddst, \adddst, \addsrc
+.endif
+.ifnb \max
+ vmax.s16 \max, \max, q6
+.endif
+.ifnb \min
+ vmin.s16 \min, \min, q7
+.endif
+.ifnb \store
+ vst1.16 {\store}, [\dst, :128], r1
+.endif
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4
+ mov \src, \dst
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+ load_add_store q0, q8, , , , , , \dst, \src, \shiftbits
+ load_add_store q1, q9, , , , , , \dst, \src, \shiftbits
+ load_add_store q2, q10, q0, q8, , , , \dst, \src, \shiftbits
+ load_add_store q3, q11, q1, q9, q8, , , \dst, \src, \shiftbits
+ load_add_store q4, q12, q2, q10, q9, q8, , \dst, \src, \shiftbits
+ load_add_store q5, q13, q3, q11, q10, q9, q8, \dst, \src, \shiftbits
+ load_add_store q0, q14, q4, q12, q11, q10, q9, \dst, \src, \shiftbits
+ load_add_store q1, q15, q5, q13, q12, q11, q10, \dst, \src, \shiftbits
+ load_add_store , , q0, q14, q13, q12, q11, \dst, \src, \shiftbits
+ load_add_store , , q1, q15, q14, q13, q12, \dst, \src, \shiftbits
+ load_add_store , , , , q15, q14, q13, \dst, \src, \shiftbits
+ load_add_store , , , , , q15, q14, \dst, \src, \shiftbits
+ load_add_store , , , , , , q15, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src, shiftbits=4
+ mov \src, \dst
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+ load_add_store q0, q8, , , , , , \dst, \src, \shiftbits
+ load_add_store q1, q9, , , , , , \dst, \src, \shiftbits
+ load_add_store q2, q10, q0, q8, , , , \dst, \src, \shiftbits
+ load_add_store q3, q11, q1, q9, q8, , , \dst, \src, \shiftbits
+ load_add_store , , q2, q10, q9, q8, , \dst, \src, \shiftbits
+ load_add_store , , q3, q11, q10, q9, q8, \dst, \src, \shiftbits
+ load_add_store , , , , q11, q10, q9, \dst, \src, \shiftbits
+ load_add_store , , , , , q11, q10, \dst, \src, \shiftbits
+ load_add_store , , , , , , q11, \dst, \src, \shiftbits
+.endm
+.macro load_add_store4 load1, load2, shift, addsrc, adddst, max, min, store1, store2, dst, src, shiftbits=4
+.ifnb \load1
+ vld1.16 {\load1}, [\src, :64], r1
+.endif
+.ifnb \shift
+ vrshr.s16 \shift, \shift, #\shiftbits
+.endif
+.ifnb \load2
+ vld1.16 {\load2}, [\src, :64], r1
+.endif
+.ifnb \addsrc
+ vqadd.s16 \adddst, \adddst, \addsrc
+.endif
+.ifnb \max
+ vmax.s16 \max, \max, q6
+.endif
+.ifnb \store1
+ vst1.16 {\store1}, [\dst, :64], r1
+.endif
+.ifnb \min
+ vmin.s16 \min, \min, q7
+.endif
+.ifnb \store2
+ vst1.16 {\store2}, [\dst, :64], r1
+.endif
+.endm
+.macro load_add_store_4x16 dst, src
+ mov \src, \dst
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+ mov \src, \dst
+ load_add_store4 d0, d1, q8, , , , , , , \dst, \src
+ load_add_store4 d2, d3, q9, , , , , , , \dst, \src
+ load_add_store4 d4, d5, q10, q0, q8, , , , , \dst, \src
+ load_add_store4 d6, d7, q11, q1, q9, q8, , , , \dst, \src
+ load_add_store4 d8, d9, q12, q2, q10, q9, q8, , , \dst, \src
+ load_add_store4 d10, d11, q13, q3, q11, q10, q9, d16, d17, \dst, \src
+ load_add_store4 d0, d1, q14, q4, q12, q11, q10, d18, d19, \dst, \src
+ load_add_store4 d2, d3, q15, q5, q13, q12, q11, d20, d21, \dst, \src
+ load_add_store4 , , , q0, q14, q13, q12, d22, d23, \dst, \src
+ load_add_store4 , , , q1, q15, q14, q13, d24, d25, \dst, \src
+ load_add_store4 , , , , , q15, q14, d26, d27, \dst, \src
+ load_add_store4 , , , , , , q15, d28, d29, \dst, \src
+ load_add_store4 , , , , , , , d30, d31, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src, shiftbits=4
+ mov \src, \dst
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+ mov \src, \dst
+ load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits
+ load_add_store4 d2, d3, q9, , , , , , , \dst, \src, \shiftbits
+ load_add_store4 d4, d5, q10, q0, q8, , , , , \dst, \src, \shiftbits
+ load_add_store4 d6, d7, q11, q1, q9, q8, , , , \dst, \src, \shiftbits
+ load_add_store4 , , , q2, q10, q9, q8, , , \dst, \src, \shiftbits
+ load_add_store4 , , , q3, q11, q10, q9, d16, d17, \dst, \src, \shiftbits
+ load_add_store4 , , , , , q11, q10, d18, d19, \dst, \src, \shiftbits
+ load_add_store4 , , , , , , q11, d20, d21, \dst, \src, \shiftbits
+ load_add_store4 , , , , , , , d22, d23, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_4x4 dst, src, shiftbits=4
+ mov \src, \dst
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+ mov \src, \dst
+ load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits
+ load_add_store4 d2, d3, q9, q0, q8, , , , , \dst, \src, \shiftbits
+ load_add_store4 , , , q1, q9, q8, , , , \dst, \src, \shiftbits
+ load_add_store4 , , , , , q9, q8, , , \dst, \src, \shiftbits
+ load_add_store4 , , , , , , q9, d16, d17, \dst, \src, \shiftbits
+ load_add_store4 , , , , , , , d18, d19, \dst, \src, \shiftbits
+.endm
+
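+// DC-only fast path: if eob (r3) is 0, only the DC coefficient is non-zero.
+// Compute the rounded DC offset once (with an extra 1/sqrt(2) scale for
+// rectangular sizes), clear the coefficient, set r3 to the row count and
+// branch to the width-specific routine below that adds the offset to every
+// destination row with clamping.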
+.macro idct_dc w, h, shift
+ cmp r3, #0
+ bne 1f
+ vmov.i16 q14, #0
+ mov_const r12, 2896*8*(1<<16)
+ vld1.32 {d24[], d25[]}, [r2, :32]
+ vdup.32 d0, r12
+ vqrdmulh.s32 q13, q12, d0[0]
+ vst1.32 {d28[0]}, [r2, :32]
+.if (\w == 2*\h) || (2*\w == \h)
+ vqrdmulh.s32 q13, q13, d0[0]
+.endif
+.if \shift > 0
+ vqrshrn.s32 d24, q13, #\shift
+ vqrshrn.s32 d25, q13, #\shift
+.else
+ vqmovn.s32 d24, q13
+ vqmovn.s32 d25, q13
+.endif
+ vqrdmulh.s16 q12, q12, d0[1]
+ mov r3, #\h
+ vrshr.s16 q12, q12, #4
+ b idct_dc_w\w\()_neon
+1:
+.endm
+
+function idct_dc_w4_neon
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+1:
+ vld1.16 {d0}, [r0, :64], r1
+ vld1.16 {d1}, [r0, :64], r1
+ vld1.16 {d2}, [r0, :64], r1
+ vld1.16 {d3}, [r0, :64], r1
+ subs r3, r3, #4
+ vqadd.s16 q0, q0, q12
+ sub r0, r0, r1, lsl #2
+ vqadd.s16 q1, q1, q12
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmin.s16 q0, q0, q15
+ vst1.16 {d0}, [r0, :64], r1
+ vmin.s16 q1, q1, q15
+ vst1.16 {d1}, [r0, :64], r1
+ vst1.16 {d2}, [r0, :64], r1
+ vst1.16 {d3}, [r0, :64], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w8_neon
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+1:
+ vld1.16 {q0}, [r0, :128], r1
+ subs r3, r3, #4
+ vld1.16 {q1}, [r0, :128], r1
+ vqadd.s16 q0, q0, q12
+ vld1.16 {q2}, [r0, :128], r1
+ vqadd.s16 q1, q1, q12
+ vld1.16 {q3}, [r0, :128], r1
+ vqadd.s16 q2, q2, q12
+ vqadd.s16 q3, q3, q12
+ sub r0, r0, r1, lsl #2
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q0, q0, q15
+ vmin.s16 q1, q1, q15
+ vst1.16 {q0}, [r0, :128], r1
+ vmin.s16 q2, q2, q15
+ vst1.16 {q1}, [r0, :128], r1
+ vmin.s16 q3, q3, q15
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w16_neon
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+1:
+ vld1.16 {q0, q1}, [r0, :128], r1
+ subs r3, r3, #2
+ vld1.16 {q2, q3}, [r0, :128], r1
+ vqadd.s16 q0, q0, q12
+ vqadd.s16 q1, q1, q12
+ vqadd.s16 q2, q2, q12
+ vqadd.s16 q3, q3, q12
+ sub r0, r0, r1, lsl #1
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q0, q0, q15
+ vmin.s16 q1, q1, q15
+ vmin.s16 q2, q2, q15
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vmin.s16 q3, q3, q15
+ vst1.16 {q2, q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w32_neon
+ sub r1, r1, #32
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+1:
+ vld1.16 {q0, q1}, [r0, :128]!
+ subs r3, r3, #1
+ vld1.16 {q2, q3}, [r0, :128]
+ vqadd.s16 q0, q0, q12
+ vqadd.s16 q1, q1, q12
+ vqadd.s16 q2, q2, q12
+ vqadd.s16 q3, q3, q12
+ sub r0, r0, #32
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q0, q0, q15
+ vmin.s16 q1, q1, q15
+ vmin.s16 q2, q2, q15
+ vst1.16 {q0, q1}, [r0, :128]!
+ vmin.s16 q3, q3, q15
+ vst1.16 {q2, q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w64_neon
+ sub r1, r1, #96
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+1:
+ vld1.16 {q0, q1}, [r0, :128]!
+ subs r3, r3, #1
+ vld1.16 {q2, q3}, [r0, :128]!
+ vqadd.s16 q0, q0, q12
+ vld1.16 {q8, q9}, [r0, :128]!
+ vqadd.s16 q1, q1, q12
+ vld1.16 {q10, q11}, [r0, :128]
+ vqadd.s16 q2, q2, q12
+ vqadd.s16 q3, q3, q12
+ vqadd.s16 q8, q8, q12
+ vqadd.s16 q9, q9, q12
+ vqadd.s16 q10, q10, q12
+ vqadd.s16 q11, q11, q12
+ sub r0, r0, #96
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmax.s16 q8, q8, q14
+ vmax.s16 q9, q9, q14
+ vmax.s16 q10, q10, q14
+ vmax.s16 q11, q11, q14
+ vmin.s16 q0, q0, q15
+ vmin.s16 q1, q1, q15
+ vmin.s16 q2, q2, q15
+ vmin.s16 q3, q3, q15
+ vmin.s16 q8, q8, q15
+ vst1.16 {q0, q1}, [r0, :128]!
+ vmin.s16 q9, q9, q15
+ vst1.16 {q2, q3}, [r0, :128]!
+ vmin.s16 q10, q10, q15
+ vst1.16 {q8, q9}, [r0, :128]!
+ vmin.s16 q11, q11, q15
+ vst1.16 {q10, q11}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
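+// 4-point inverse Walsh-Hadamard transform (for lossless WHT_WHT blocks),
+// operating on 32-bit lanes in q8-q11.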
+.macro iwht4
+ vadd.i32 q8, q8, q9
+ vsub.i32 q13, q10, q11
+ vsub.i32 q12, q8, q13
+ vshr.s32 q12, q12, #1
+ vsub.i32 q10, q12, q9
+ vsub.i32 q9, q12, q11
+ vadd.i32 q11, q13, q10
+ vsub.i32 q8, q8, q9
+.endm
+
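+// 4-point inverse DCT; the _4s_ variant works on four 32-bit lanes per row
+// (q registers), the _2s_ variant below on two (d registers). The idct_coeffs
+// constants must already be loaded into q0.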
+.macro idct_4s_x4 r0, r1, r2, r3
+ vmul_vmla q4, \r1, \r3, d1[1], d1[0]
+ vmul_vmla q2, \r0, \r2, d0[0], d0[0]
+ vmul_vmls q3, \r1, \r3, d1[0], d1[1]
+ vmul_vmls q5, \r0, \r2, d0[0], d0[0]
+ vrshr.s32 q4, q4, #12
+ vrshr.s32 q2, q2, #12
+ vrshr.s32 q3, q3, #12
+ vrshr.s32 q5, q5, #12
+ vqadd.s32 \r0, q2, q4
+ vqsub.s32 \r3, q2, q4
+ vqadd.s32 \r1, q5, q3
+ vqsub.s32 \r2, q5, q3
+.endm
+
+.macro idct_2s_x4 r0, r1, r2, r3
+ vmul_vmla d6, \r1, \r3, d1[1], d1[0]
+ vmul_vmla d4, \r0, \r2, d0[0], d0[0]
+ vmul_vmls d5, \r1, \r3, d1[0], d1[1]
+ vmul_vmls d7, \r0, \r2, d0[0], d0[0]
+ vrshr.s32 d6, d6, #12
+ vrshr.s32 d4, d4, #12
+ vrshr.s32 d5, d5, #12
+ vrshr.s32 d7, d7, #12
+ vqadd.s32 \r0, d4, d6
+ vqsub.s32 \r3, d4, d6
+ vqadd.s32 \r1, d7, d5
+ vqsub.s32 \r2, d7, d5
+.endm
+
+function inv_dct_4s_x4_neon
+ movrel_local r12, idct_coeffs
+ vld1.32 {d0, d1}, [r12, :128]
+ idct_4s_x4 q8, q9, q10, q11
+ bx lr
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3
+ movrel_local r12, iadst4_coeffs
+ vld1.32 {d0, d1}, [r12, :128]
+
+ vsub.i32 q1, q8, q10
+ vmul.i32 q2, q8, d0[0]
+ vmla.i32 q2, q10, d0[1]
+ vmla.i32 q2, q11, d1[0]
+ vmul.i32 q4, q9, d1[1]
+ vadd.i32 q1, q1, q11
+ vmul.i32 q3, q8, d1[0]
+ vmls.i32 q3, q10, d0[0]
+ vmls.i32 q3, q11, d0[1]
+
+ vadd.i32 \o3, q2, q3
+ vmul.i32 \o2, q1, d1[1]
+ vadd.i32 \o0, q2, q4
+ vadd.i32 \o1, q3, q4
+ vsub.i32 \o3, \o3, q4
+
+ vrshr.s32 \o0, \o0, #12
+ vrshr.s32 \o2, \o2, #12
+ vrshr.s32 \o1, \o1, #12
+ vrshr.s32 \o3, \o3, #12
+.endm
+
+function inv_adst_4s_x4_neon
+ iadst_4x4 q8, q9, q10, q11
+ bx lr
+endfunc
+
+function inv_flipadst_4s_x4_neon
+ iadst_4x4 q11, q10, q9, q8
+ bx lr
+endfunc
+
+function inv_identity_4s_x4_neon
+ mov r12, #0
+ movt r12, #(5793-4096)*8
+ vdup.32 d0, r12
+ vqrdmulh.s32 q1, q8, d0[0]
+ vqrdmulh.s32 q2, q9, d0[0]
+ vqrdmulh.s32 q3, q10, d0[0]
+ vqrdmulh.s32 q4, q11, d0[0]
+ vqadd.s32 q8, q8, q1
+ vqadd.s32 q9, q9, q2
+ vqadd.s32 q10, q10, q3
+ vqadd.s32 q11, q11, q4
+ bx lr
+endfunc
+
+function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
+ push {r4-r5,lr}
+ vpush {q4-q5}
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+ vld1.32 {q8, q9}, [r2, :128]
+ vst1.32 {q14, q15}, [r2, :128]!
+ vshr.s16 q8, q8, #2
+ vld1.32 {q10, q11}, [r2, :128]
+ vshr.s16 q9, q9, #2
+ vshr.s16 q10, q10, #2
+ vshr.s16 q11, q11, #2
+
+ iwht4
+
+ vst1.32 {q14, q15}, [r2, :128]
+ transpose_4x4s q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23
+
+ iwht4
+
+ vld1.16 {d0}, [r0, :64], r1
+ vqmovn.s32 d16, q8
+ vld1.16 {d1}, [r0, :64], r1
+ vqmovn.s32 d17, q9
+ vld1.16 {d2}, [r0, :64], r1
+ vqmovn.s32 d18, q10
+ vld1.16 {d3}, [r0, :64], r1
+ vqmovn.s32 d19, q11
+
+ b L(itx_4x4_end)
+endfunc
+
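+// Shared 4x4 driver: run the 32-bit row transform in r4, narrow and transpose
+// to 16 bit, run the 16-bit column transform in r5, then round, add to the
+// destination and clamp at L(itx_4x4_end).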
+function inv_txfm_add_4x4_neon
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+ vld1.32 {q8, q9}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]!
+ vld1.32 {q10, q11}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]
+
+ blx r4
+
+ vqmovn.s32 d16, q8
+ vqmovn.s32 d17, q9
+ vqmovn.s32 d18, q10
+ vqmovn.s32 d19, q11
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ blx r5
+
+ vld1.16 {d0}, [r0, :64], r1
+ vld1.16 {d1}, [r0, :64], r1
+ vrshr.s16 q8, q8, #4
+ vld1.16 {d2}, [r0, :64], r1
+ vrshr.s16 q9, q9, #4
+ vld1.16 {d3}, [r0, :64], r1
+
+L(itx_4x4_end):
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+ sub r0, r0, r1, lsl #2
+ vqadd.s16 q8, q8, q0
+ vqadd.s16 q9, q9, q1
+ vmax.s16 q8, q8, q14
+ vmax.s16 q9, q9, q14
+ vmin.s16 q8, q8, q15
+ vmin.s16 q9, q9, q15
+ vst1.16 {d16}, [r0, :64], r1
+ vst1.16 {d17}, [r0, :64], r1
+ vst1.16 {d18}, [r0, :64], r1
+ vst1.16 {d19}, [r0, :64], r1
+
+ vpop {q4-q5}
+ pop {r4-r5,pc}
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
+ push {r4-r5,lr}
+ vpush {q4-q5}
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ cmp r3, #0
+ bne 1f
+ vmov.i16 q14, #0
+ mov_const r12, 2896*8*(1<<16)
+ vld1.32 {d16[], d17[]}, [r2, :32]
+ vdup.32 d4, r12
+ vst1.32 {d28[0]}, [r2, :32]
+ vqrdmulh.s32 q8, q8, d4[0]
+ vld1.16 {d0}, [r0, :64], r1
+ vqmovn.s32 d20, q8
+ vqmovn.s32 d21, q8
+ vld1.16 {d1}, [r0, :64], r1
+ vqrdmulh.s16 q10, q10, d4[1]
+ vld1.16 {d2}, [r0, :64], r1
+ vrshr.s16 q8, q10, #4
+ vld1.16 {d3}, [r0, :64], r1
+ vrshr.s16 q9, q10, #4
+ b L(itx_4x4_end)
+1:
+.endif
+ movrel_local r4, inv_\txfm1\()_4s_x4_neon
+ movrel r5, X(inv_\txfm2\()_4h_x4_neon)
+ b inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
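+// 8-point inverse DCT: even half via idct_4s_x4, odd half via further
+// rotations and butterflies. Intermediates are clamped to the row clip range
+// [-(1 << 17), (1 << 17) - 1] (the 0x1ffff constants) between stages.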
+.macro idct_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7
+ idct_4s_x4 \r0, \r2, \r4, \r6
+
+ vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmls q2, \r1, \r7, d2[0], d2[1] // -> t4a
+ vmul_vmla q3, \r1, \r7, d2[1], d2[0] // -> t7a
+ vmul_vmls q6, \r5, \r3, d3[0], d3[1] // -> t5a
+ vmul_vmla q7, \r5, \r3, d3[1], d3[0] // -> t6a
+ vrshr.s32 \r1, q2, #12 // t4a
+ vrshr.s32 \r7, q3, #12 // t7a
+ vrshr.s32 \r3, q6, #12 // t5a
+ vrshr.s32 \r5, q7, #12 // t6a
+
+ vqadd.s32 q2, \r1, \r3 // t4
+ vqsub.s32 \r1, \r1, \r3 // t5a
+ vqadd.s32 q3, \r7, \r5 // t7
+ vqsub.s32 \r3, \r7, \r5 // t6a
+
+.irp r, q2, \r1, q3, \r3
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q2, \r1, q3, \r3
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmls q7, \r3, \r1, d0[0], d0[0] // -> t5
+ vmul_vmla q6, \r3, \r1, d0[0], d0[0] // -> t6
+ vrshr.s32 q7, q7, #12 // t5
+ vrshr.s32 q5, q6, #12 // t6
+
+ vqsub.s32 \r7, \r0, q3 // out7
+ vqadd.s32 \r0, \r0, q3 // out0
+ vqadd.s32 \r1, \r2, q5 // out1
+ vqsub.s32 q6, \r2, q5 // out6
+ vqadd.s32 \r2, \r4, q7 // out2
+ vqsub.s32 \r5, \r4, q7 // out5
+ vqadd.s32 \r3, \r6, q2 // out3
+ vqsub.s32 \r4, \r6, q2 // out4
+ vmov \r6, q6 // out6
+.endm
+
+.macro idct_2s_x8 r0, r1, r2, r3, r4, r5, r6, r7
+ idct_2s_x4 \r0, \r2, \r4, \r6
+
+ vmov.i32 d9, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d8, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ vmax.s32 \r, \r, d8
+.endr
+
+ vmul_vmls d4, \r1, \r7, d2[0], d2[1] // -> t4a
+ vmul_vmla d5, \r1, \r7, d2[1], d2[0] // -> t7a
+ vmul_vmls d6, \r5, \r3, d3[0], d3[1] // -> t5a
+ vmul_vmla d7, \r5, \r3, d3[1], d3[0] // -> t6a
+ vrshr.s32 \r1, d4, #12 // t4a
+ vrshr.s32 \r7, d5, #12 // t7a
+ vrshr.s32 \r3, d6, #12 // t5a
+ vrshr.s32 \r5, d7, #12 // t6a
+
+ vqadd.s32 d4, \r1, \r3 // t4
+ vqsub.s32 \r1, \r1, \r3 // t5a
+ vqadd.s32 d5, \r7, \r5 // t7
+ vqsub.s32 \r3, \r7, \r5 // t6a
+
+.irp r, d4, \r1, d5, \r3
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, \r1, d5, \r3
+ vmax.s32 \r, \r, d8
+.endr
+
+ vmul_vmls d6, \r3, \r1, d0[0], d0[0] // -> t5
+ vmul_vmla d7, \r3, \r1, d0[0], d0[0] // -> t6
+ vrshr.s32 d6, d6, #12 // t5
+ vrshr.s32 d7, d7, #12 // t6
+
+ vqsub.s32 \r7, \r0, d5 // out7
+ vqadd.s32 \r0, \r0, d5 // out0
+ vqadd.s32 \r1, \r2, d7 // out1
+ vqsub.s32 d7, \r2, d7 // out6
+ vqadd.s32 \r2, \r4, d6 // out2
+ vqsub.s32 \r5, \r4, d6 // out5
+ vqadd.s32 \r3, \r6, d4 // out3
+ vqsub.s32 \r4, \r6, d4 // out4
+ vmov \r6, d7 // out6
+.endm
+
+function inv_dct_4s_x8_neon
+ movrel_local r12, idct_coeffs
+ vld1.32 {q0, q1}, [r12, :128]
+ idct_4s_x8 q8, q9, q10, q11, q12, q13, q14, q15
+ bx lr
+endfunc
+
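+// 8-point inverse ADST on four 32-bit lanes per register; the output register
+// order \r0-\r7 is reversed by inv_flipadst_4s_x8_neon to get flipadst.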
+.macro iadst_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7
+ movrel_local r12, iadst8_coeffs
+ vld1.32 {q0, q1}, [r12, :128]!
+
+ vmul_vmla q2, q15, q8, d0[0], d0[1]
+ vmul_vmls q3, q15, q8, d0[1], d0[0]
+ vmul_vmla q4, q13, q10, d1[0], d1[1]
+ vrshr.s32 q8, q2, #12 // t0a
+ vrshr.s32 q15, q3, #12 // t1a
+ vmul_vmls q5, q13, q10, d1[1], d1[0]
+ vmul_vmla q6, q11, q12, d2[0], d2[1]
+ vrshr.s32 q10, q4, #12 // t2a
+ vrshr.s32 q13, q5, #12 // t3a
+ vmul_vmls q7, q11, q12, d2[1], d2[0]
+ vmul_vmla q2, q9, q14, d3[0], d3[1]
+ vrshr.s32 q12, q6, #12 // t4a
+ vrshr.s32 q11, q7, #12 // t5a
+ vmul_vmls q3, q9, q14, d3[1], d3[0]
+ vrshr.s32 q14, q2, #12 // t6a
+ vrshr.s32 q9, q3, #12 // t7a
+
+ vld1.32 {q0}, [r12]
+
+ vqadd.s32 q2, q8, q12 // t0
+ vqsub.s32 q3, q8, q12 // t4
+ vmov.i32 q12, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vqadd.s32 q4, q15, q11 // t1
+ vqsub.s32 q5, q15, q11 // t5
+ vqadd.s32 q6, q10, q14 // t2
+ vqsub.s32 q7, q10, q14 // t6
+ vmvn.i32 q14, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+ vqadd.s32 q10, q13, q9 // t3
+ vqsub.s32 q11, q13, q9 // t7
+
+.irp r, q2, q3, q4, q5, q6, q7, q10, q11
+ vmin.s32 \r, \r, q12
+.endr
+.irp r, q2, q3, q4, q5, q6, q7, q10, q11
+ vmax.s32 \r, \r, q14
+.endr
+
+ vmul_vmla q8, q3, q5, d1[1], d1[0]
+ vmul_vmls q13, q3, q5, d1[0], d1[1]
+ vmul_vmls q14, q11, q7, d1[1], d1[0]
+
+ vrshr.s32 q3, q8, #12 // t4a
+ vrshr.s32 q5, q13, #12 // t5a
+
+ vmul_vmla q8, q11, q7, d1[0], d1[1]
+
+ vrshr.s32 q7, q14, #12 // t6a
+ vrshr.s32 q11, q8, #12 // t7a
+
+ vqadd.s32 \r0, q2, q6 // out0
+ vqsub.s32 q2, q2, q6 // t2
+ vqadd.s32 \r7, q4, q10 // out7
+ vqsub.s32 q4, q4, q10 // t3
+
+ vmvn.i32 q10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+ vqadd.s32 \r1, q3, q7 // out1
+ vqsub.s32 q3, q3, q7 // t6
+ vqadd.s32 \r6, q5, q11 // out6
+ vqsub.s32 q5, q5, q11 // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, q2, q4, q3, q5
+ vmin.s32 \r, \r, q12
+.endr
+.irp r, q2, q4, q3, q5
+ vmax.s32 \r, \r, q10
+.endr
+
+ vqneg.s32 \r7, \r7 // out7
+ vqneg.s32 \r1, \r1 // out1
+
+ vmul_vmla q10, q2, q4, d0[0], d0[0] // -> out3 (q11 or q12)
+ vmul_vmls q6, q2, q4, d0[0], d0[0] // -> out4 (q12 or q11)
+ vmul_vmls q12, q3, q5, d0[0], d0[0] // -> out5 (q13 or q10)
+ vrshr.s32 q2, q10, #12 // out3
+ vmul_vmla q10, q3, q5, d0[0], d0[0] // -> out2 (q10 or q13)
+ vrshr.s32 q3, q12, #12 // out5
+ vrshr.s32 \r2, q10, #12 // out2 (q10 or q13)
+ vrshr.s32 \r4, q6, #12 // out4 (q12 or q11)
+
+ vqneg.s32 \r3, q2 // out3
+ vqneg.s32 \r5, q3 // out5
+.endm
+
+function inv_adst_4s_x8_neon
+ iadst_4s_x8 q8, q9, q10, q11, q12, q13, q14, q15
+ bx lr
+endfunc
+
+function inv_flipadst_4s_x8_neon
+ iadst_4s_x8 q15, q14, q13, q12, q11, q10, q9, q8
+ bx lr
+endfunc
+
+function inv_identity_4s_x8_neon
+ vqshl.s32 q8, q8, #1
+ vqshl.s32 q9, q9, #1
+ vqshl.s32 q10, q10, #1
+ vqshl.s32 q11, q11, #1
+ vqshl.s32 q12, q12, #1
+ vqshl.s32 q13, q13, #1
+ vqshl.s32 q14, q14, #1
+ vqshl.s32 q15, q15, #1
+ bx lr
+endfunc
+
+function inv_txfm_add_8x8_neon
+ vmov.i32 q0, #0
+ mov r7, #8*4
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r7
+.endr
+
+ blx r4
+
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q12, #1
+ vqrshrn.s32 d18, q9, #1
+ vqrshrn.s32 d19, q13, #1
+ vqrshrn.s32 d20, q10, #1
+ vqrshrn.s32 d21, q14, #1
+ vqrshrn.s32 d22, q11, #1
+ vqrshrn.s32 d23, q15, #1
+
+ cmp r3, r10
+ transpose_4x8h q8, q9, q10, q11
+
+ blt 1f
+
+ sub r2, r2, r7, lsl #3
+ vpush {q8-q11}
+
+ add r2, r2, #16
+ vmov.i32 q0, #0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r7
+.endr
+
+ blx r4
+
+ vqrshrn.s32 d31, q15, #1
+ vqrshrn.s32 d30, q11, #1
+ vqrshrn.s32 d29, q14, #1
+ vqrshrn.s32 d28, q10, #1
+ vqrshrn.s32 d27, q13, #1
+ vqrshrn.s32 d26, q9, #1
+ vqrshrn.s32 d25, q12, #1
+ vqrshrn.s32 d24, q8, #1
+ vpop {q8-q11}
+
+ transpose_4x8h q12, q13, q14, q15
+
+ b 2f
+
+1:
+ vmov.i16 q12, #0
+ vmov.i16 q13, #0
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+
+2:
+ blx r5
+
+ load_add_store_8x8 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,r10,pc}
+endfunc
+
+.macro def_fn_8x8 txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 8, 8, 1
+.endif
+ push {r4-r5,r7,r10,lr}
+ vpush {q4-q7}
+ mov r10, #\eob_half
+ movrel_local r4, inv_\txfm1\()_4s_x8_neon
+ movrel r5, X(inv_\txfm2\()_8h_x8_neon)
+ b inv_txfm_add_8x8_neon
+endfunc
+.endm
+
+def_fn_8x8 dct, dct, 10
+def_fn_8x8 identity, identity, 10
+def_fn_8x8 dct, adst, 10
+def_fn_8x8 dct, flipadst, 10
+def_fn_8x8 dct, identity, 4
+def_fn_8x8 adst, dct, 10
+def_fn_8x8 adst, adst, 10
+def_fn_8x8 adst, flipadst, 10
+def_fn_8x8 flipadst, dct, 10
+def_fn_8x8 flipadst, adst, 10
+def_fn_8x8 flipadst, flipadst, 10
+def_fn_8x8 identity, dct, 4
+def_fn_8x8 adst, identity, 4
+def_fn_8x8 flipadst, identity, 4
+def_fn_8x8 identity, adst, 4
+def_fn_8x8 identity, flipadst, 4
+
+function inv_txfm_add_8x4_neon
+ mov_const r12, 2896*8*(1<<16)
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+ vld1.16 {q8, q9}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vdup.32 d4, r12
+ vld1.16 {q10, q11}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q12, q13}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q14, q15}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+
+ scale_input d4[0], q8, q9, q10, q11, q12, q13, q14, q15
+
+ blx r4
+
+ vqmovn.s32 d16, q8
+ vqmovn.s32 d17, q9
+ vqmovn.s32 d18, q10
+ vqmovn.s32 d19, q11
+ vqmovn.s32 d20, q12
+ vqmovn.s32 d21, q13
+ vqmovn.s32 d22, q14
+ vqmovn.s32 d23, q15
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ vswp d17, d20
+ vswp d19, d21
+ vswp d18, d20
+ vswp d21, d22
+
+ blx r5
+
+ load_add_store_8x4 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,r10,pc}
+endfunc
+
+function inv_txfm_add_4x8_neon
+ mov_const r12, 2896*8*(1<<16)
+ vmov.i32 q0, #0
+ cmp r3, r10
+ mov r7, #32
+ blt 1f
+
+ add r2, r2, #16
+ vdup.32 d2, r12
+.irp i, q8, q9, q10, q11
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r7
+.endr
+
+ scale_input d2[0], q8, q9, q10, q11
+ sub r2, r2, r7, lsl #2
+
+ blx r4
+
+ sub r2, r2, #16
+
+ vqmovn.s32 d24, q8
+ vqmovn.s32 d25, q9
+ vqmovn.s32 d26, q10
+ vqmovn.s32 d27, q11
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+
+ b 2f
+
+1:
+ vmov.i16 q12, #0
+ vmov.i16 q13, #0
+
+2:
+ mov_const r12, 2896*8*(1<<16)
+ vmov.i32 q0, #0
+ vdup.32 d2, r12
+.irp i, q8, q9, q10, q11
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r7
+.endr
+ scale_input d2[0], q8, q9, q10, q11
+ blx r4
+
+ vqmovn.s32 d16, q8
+ vqmovn.s32 d17, q9
+ vqmovn.s32 d18, q10
+ vqmovn.s32 d19, q11
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ vmov q10, q12
+ vmov q11, q13
+
+ blx r5
+
+ load_add_store_4x8 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,r10,pc}
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 0
+.endif
+ push {r4-r5,r7,r10,lr}
+ vpush {q4-q7}
+ movrel_local r4, inv_\txfm1\()_4s_x\w\()_neon
+.if \w == 4
+ mov r10, #\eob_half
+.endif
+ movrel r5, X(inv_\txfm2\()_\w\()h_x\h\()_neon)
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct, 13
+def_fn_48 \w, \h, identity, identity, 13
+def_fn_48 \w, \h, dct, adst, 13
+def_fn_48 \w, \h, dct, flipadst, 13
+def_fn_48 \w, \h, dct, identity, 4
+def_fn_48 \w, \h, adst, dct, 13
+def_fn_48 \w, \h, adst, adst, 13
+def_fn_48 \w, \h, adst, flipadst, 13
+def_fn_48 \w, \h, flipadst, dct, 13
+def_fn_48 \w, \h, flipadst, adst, 13
+def_fn_48 \w, \h, flipadst, flipadst, 13
+def_fn_48 \w, \h, identity, dct, 16
+def_fn_48 \w, \h, adst, identity, 4
+def_fn_48 \w, \h, flipadst, identity, 4
+def_fn_48 \w, \h, identity, adst, 16
+def_fn_48 \w, \h, identity, flipadst, 16
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
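+// 16-point inverse DCT, two 32-bit lanes at a time, with inputs and outputs
+// in d16-d31.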
+function inv_dct_2s_x16_neon
+ movrel_local r12, idct_coeffs
+ vld1.32 {q0, q1}, [r12, :128]!
+
+ idct_2s_x8 d16, d18, d20, d22, d24, d26, d28, d30
+
+ // idct_8 leaves the row_clip_max/min constants in d9 and d8
+.irp r, d16, d18, d20, d22, d24, d26, d28, d30
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d16, d18, d20, d22, d24, d26, d28, d30
+ vmax.s32 \r, \r, d8
+.endr
+
+ vld1.32 {q0, q1}, [r12, :128]
+ sub r12, r12, #32
+
+ vmul_vmls d4, d17, d31, d0[0], d0[1] // -> t8a
+ vmul_vmla d5, d17, d31, d0[1], d0[0] // -> t15a
+ vmul_vmls d6, d25, d23, d1[0], d1[1] // -> t9a
+ vrshr.s32 d17, d4, #12 // t8a
+ vrshr.s32 d31, d5, #12 // t15a
+ vmul_vmla d4, d25, d23, d1[1], d1[0] // -> t14a
+ vmul_vmls d5, d21, d27, d2[0], d2[1] // -> t10a
+ vrshr.s32 d23, d6, #12 // t9a
+ vrshr.s32 d25, d4, #12 // t14a
+ vmul_vmla d6, d21, d27, d2[1], d2[0] // -> t13a
+ vmul_vmls d4, d29, d19, d3[0], d3[1] // -> t11a
+ vrshr.s32 d21, d5, #12 // t10a
+ vrshr.s32 d27, d6, #12 // t13a
+ vmul_vmla d5, d29, d19, d3[1], d3[0] // -> t12a
+ vrshr.s32 d19, d4, #12 // t11a
+ vrshr.s32 d29, d5, #12 // t12a
+
+ vld1.32 {q0}, [r12, :128]
+
+ vqsub.s32 d4, d17, d23 // t9
+ vqadd.s32 d17, d17, d23 // t8
+ vqsub.s32 d5, d31, d25 // t14
+ vqadd.s32 d31, d31, d25 // t15
+ vqsub.s32 d23, d19, d21 // t10
+ vqadd.s32 d19, d19, d21 // t11
+ vqadd.s32 d25, d29, d27 // t12
+ vqsub.s32 d29, d29, d27 // t13
+
+.irp r, d4, d17, d5, d31, d23, d19, d25, d29
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, d17, d5, d31, d23, d19, d25, d29
+ vmax.s32 \r, \r, d8
+.endr
+
+ vmul_vmls d6, d5, d4, d1[0], d1[1] // -> t9a
+ vmul_vmla d7, d5, d4, d1[1], d1[0] // -> t14a
+ vrshr.s32 d21, d6, #12 // t9a
+ vrshr.s32 d27, d7, #12 // t14a
+
+ vmul_vmls d6, d29, d23, d1[0], d1[1] // -> t13a
+ vmul_vmla d7, d29, d23, d1[1], d1[0] // -> t10a
+ vrshr.s32 d29, d6, #12 // t13a
+ vneg.s32 d7, d7
+ vrshr.s32 d23, d7, #12 // t10a
+
+ vqsub.s32 d4, d17, d19 // t11a
+ vqadd.s32 d17, d17, d19 // t8a
+ vqsub.s32 d5, d31, d25 // t12a
+ vqadd.s32 d31, d31, d25 // t15a
+ vqadd.s32 d19, d21, d23 // t9
+ vqsub.s32 d21, d21, d23 // t10
+ vqsub.s32 d25, d27, d29 // t13
+ vqadd.s32 d27, d27, d29 // t14
+
+.irp r, d4, d17, d5, d31, d19, d21, d25, d27
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, d17, d5, d31, d19, d21, d25, d27
+ vmax.s32 \r, \r, d8
+.endr
+
+ vmul_vmls d6, d5, d4, d0[0], d0[0] // -> t11
+ vmul_vmla d7, d5, d4, d0[0], d0[0] // -> t12
+ vmul_vmls d4, d25, d21, d0[0], d0[0] // -> t10a
+
+ vrshr.s32 d6, d6, #12 // t11
+ vrshr.s32 d7, d7, #12 // t12
+ vmul_vmla d5, d25, d21, d0[0], d0[0] // -> t13a
+ vrshr.s32 d4, d4, #12 // t10a
+ vrshr.s32 d5, d5, #12 // t13a
+
+ vqadd.s32 d8, d16, d31 // out0
+ vqsub.s32 d31, d16, d31 // out15
+ vmov d16, d8
+ vqadd.s32 d23, d30, d17 // out7
+ vqsub.s32 d9, d30, d17 // out8
+ vqadd.s32 d17, d18, d27 // out1
+ vqsub.s32 d30, d18, d27 // out14
+ vqadd.s32 d18, d20, d5 // out2
+ vqsub.s32 d29, d20, d5 // out13
+ vqadd.s32 d5, d28, d19 // out6
+ vqsub.s32 d25, d28, d19 // out9
+ vqadd.s32 d19, d22, d7 // out3
+ vqsub.s32 d28, d22, d7 // out12
+ vqadd.s32 d20, d24, d6 // out4
+ vqsub.s32 d27, d24, d6 // out11
+ vqadd.s32 d21, d26, d4 // out5
+ vqsub.s32 d26, d26, d4 // out10
+ vmov d24, d9
+ vmov d22, d5
+
+ bx lr
+endfunc
+
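+// 16-point inverse ADST, two 32-bit lanes at a time; \o0-\o15 select the
+// output register order so the same macro implements both adst and flipadst.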
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
+ movrel_local r12, iadst16_coeffs
+ vld1.32 {q0, q1}, [r12, :128]!
+
+ vmul_vmla d4, d31, d16, d0[0], d0[1] // -> t0
+ vmul_vmls d6, d31, d16, d0[1], d0[0] // -> t1
+ vmul_vmla d8, d29, d18, d1[0], d1[1] // -> t2
+ vrshr.s32 d16, d4, #12 // t0
+ vrshr.s32 d31, d6, #12 // t1
+ vmul_vmls d4, d29, d18, d1[1], d1[0] // -> t3
+ vmul_vmla d6, d27, d20, d2[0], d2[1] // -> t4
+ vrshr.s32 d18, d8, #12 // t2
+ vrshr.s32 d29, d4, #12 // t3
+ vmul_vmls d8, d27, d20, d2[1], d2[0] // -> t5
+ vmul_vmla d4, d25, d22, d3[0], d3[1] // -> t6
+ vrshr.s32 d20, d6, #12 // t4
+ vrshr.s32 d27, d8, #12 // t5
+ vmul_vmls d6, d25, d22, d3[1], d3[0] // -> t7
+ vld1.32 {q0, q1}, [r12, :128]
+ movrel_local r12, idct_coeffs
+ vmul_vmla d8, d23, d24, d0[0], d0[1] // -> t8
+ vrshr.s32 d22, d4, #12 // t6
+ vrshr.s32 d25, d6, #12 // t7
+ vmul_vmls d4, d23, d24, d0[1], d0[0] // -> t9
+ vmul_vmla d6, d21, d26, d1[0], d1[1] // -> t10
+ vrshr.s32 d23, d8, #12 // t8
+ vrshr.s32 d24, d4, #12 // t9
+ vmul_vmls d8, d21, d26, d1[1], d1[0] // -> t11
+ vmul_vmla d4, d19, d28, d2[0], d2[1] // -> t12
+ vrshr.s32 d21, d6, #12 // t10
+ vrshr.s32 d26, d8, #12 // t11
+ vmul_vmls d6, d19, d28, d2[1], d2[0] // -> t13
+ vmul_vmla d8, d17, d30, d3[0], d3[1] // -> t14
+ vrshr.s32 d19, d4, #12 // t12
+ vrshr.s32 d28, d6, #12 // t13
+ vmul_vmls d4, d17, d30, d3[1], d3[0] // -> t15
+ vrshr.s32 d17, d8, #12 // t14
+ vrshr.s32 d30, d4, #12 // t15
+
+ vld1.32 {q0, q1}, [r12, :128]
+
+ vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+ vqsub.s32 d5, d16, d23 // t8a
+ vqadd.s32 d16, d16, d23 // t0a
+ vqsub.s32 d7, d31, d24 // t9a
+ vqadd.s32 d31, d31, d24 // t1a
+ vqadd.s32 d23, d18, d21 // t2a
+ vqsub.s32 d18, d18, d21 // t10a
+ vqadd.s32 d24, d29, d26 // t3a
+ vqsub.s32 d29, d29, d26 // t11a
+ vqadd.s32 d21, d20, d19 // t4a
+ vqsub.s32 d20, d20, d19 // t12a
+ vqadd.s32 d26, d27, d28 // t5a
+ vqsub.s32 d27, d27, d28 // t13a
+ vqadd.s32 d19, d22, d17 // t6a
+ vqsub.s32 d22, d22, d17 // t14a
+ vqadd.s32 d28, d25, d30 // t7a
+ vqsub.s32 d25, d25, d30 // t15a
+
+.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
+ vmax.s32 \r, \r, d10
+.endr
+
+ vmul_vmla d4, d5, d7, d2[1], d2[0] // -> t8
+ vmul_vmls d6, d5, d7, d2[0], d2[1] // -> t9
+ vmul_vmla d8, d18, d29, d3[1], d3[0] // -> t10
+ vrshr.s32 d17, d4, #12 // t8
+ vrshr.s32 d30, d6, #12 // t9
+ vmul_vmls d4, d18, d29, d3[0], d3[1] // -> t11
+ vmul_vmls d6, d27, d20, d2[1], d2[0] // -> t12
+ vrshr.s32 d18, d8, #12 // t10
+ vrshr.s32 d29, d4, #12 // t11
+ vmul_vmla d8, d27, d20, d2[0], d2[1] // -> t13
+ vmul_vmls d4, d25, d22, d3[1], d3[0] // -> t14
+ vrshr.s32 d27, d6, #12 // t12
+ vrshr.s32 d20, d8, #12 // t13
+ vmul_vmla d6, d25, d22, d3[0], d3[1] // -> t15
+ vrshr.s32 d25, d4, #12 // t14
+ vrshr.s32 d22, d6, #12 // t15
+
+ vqsub.s32 d2, d16, d21 // t4
+ vqadd.s32 d16, d16, d21 // t0
+ vqsub.s32 d3, d31, d26 // t5
+ vqadd.s32 d31, d31, d26 // t1
+ vqadd.s32 d21, d23, d19 // t2
+ vqsub.s32 d23, d23, d19 // t6
+ vqadd.s32 d26, d24, d28 // t3
+ vqsub.s32 d24, d24, d28 // t7
+ vqadd.s32 d19, d17, d27 // t8a
+ vqsub.s32 d17, d17, d27 // t12a
+ vqadd.s32 d28, d30, d20 // t9a
+ vqsub.s32 d30, d30, d20 // t13a
+ vqadd.s32 d27, d18, d25 // t10a
+ vqsub.s32 d18, d18, d25 // t14a
+ vqadd.s32 d20, d29, d22 // t11a
+ vqsub.s32 d29, d29, d22 // t15a
+
+.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
+ vmax.s32 \r, \r, d10
+.endr
+
+ vmul_vmla d4, d2, d3, d1[1], d1[0] // -> t4a
+ vmul_vmls d6, d2, d3, d1[0], d1[1] // -> t5a
+ vmul_vmls d8, d24, d23, d1[1], d1[0] // -> t6a
+ vrshr.s32 d22, d4, #12 // t4a
+ vrshr.s32 d25, d6, #12 // t5a
+ vmul_vmla d4, d24, d23, d1[0], d1[1] // -> t7a
+ vmul_vmla d6, d17, d30, d1[1], d1[0] // -> t12
+ vrshr.s32 d24, d8, #12 // t6a
+ vrshr.s32 d23, d4, #12 // t7a
+ vmul_vmls d8, d17, d30, d1[0], d1[1] // -> t13
+ vmul_vmls d4, d29, d18, d1[1], d1[0] // -> t14
+ vrshr.s32 d17, d6, #12 // t12
+ vmul_vmla d6, d29, d18, d1[0], d1[1] // -> t15
+ vrshr.s32 d29, d8, #12 // t13
+ vrshr.s32 d30, d4, #12 // t14
+ vrshr.s32 d18, d6, #12 // t15
+
+ vqsub.s32 d2, d16, d21 // t2a
+.ifc \o0, d16
+ vqadd.s32 \o0, d16, d21 // out0
+ vqsub.s32 d21, d31, d26 // t3a
+ vqadd.s32 \o15,d31, d26 // out15
+.else
+ vqadd.s32 d4, d16, d21 // out0
+ vqsub.s32 d21, d31, d26 // t3a
+ vqadd.s32 \o15,d31, d26 // out15
+ vmov \o0, d4
+.endif
+
+ vqsub.s32 d3, d29, d18 // t15a
+ vqadd.s32 \o13,d29, d18 // out13
+ vqadd.s32 \o2, d17, d30 // out2
+ vqsub.s32 d26, d17, d30 // t14a
+
+ vqadd.s32 \o1, d19, d27 // out1
+ vqsub.s32 d27, d19, d27 // t10
+ vqadd.s32 \o14,d28, d20 // out14
+ vqsub.s32 d20, d28, d20 // t11
+
+ vqadd.s32 \o3, d22, d24 // out3
+ vqsub.s32 d22, d22, d24 // t6
+ vqadd.s32 \o12,d25, d23 // out12
+ vqsub.s32 d23, d25, d23 // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, d2, d21, d3, d26, d27, d20, d22, d23
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d21, d3, d26, d27, d20, d22, d23
+ vmax.s32 \r, \r, d10
+.endr
+
+ vqneg.s32 \o15, \o15 // out15
+ vqneg.s32 \o13,\o13 // out13
+ vqneg.s32 \o1, \o1 // out1
+ vqneg.s32 \o3, \o3 // out3
+
+ vmul_vmls d24, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23)
+ vmul_vmla d4, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24)
+ vmul_vmla d6, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26)
+
+ vrshr.s32 d24, d24, #12 // out8
+ vrshr.s32 d4, d4, #12 // out7
+ vrshr.s32 d5, d6, #12 // out5
+ vmul_vmls d8, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21)
+ vmul_vmla d2, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27)
+ vrshr.s32 d26, d8, #12 // out10
+
+ vmul_vmls d8, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20)
+ vmul_vmla d22, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25)
+ vmul_vmls d6, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22)
+
+ vrshr.s32 \o4, d2, #12 // out4
+ vrshr.s32 d7, d6, #12 // out9
+ vrshr.s32 d6, d8, #12 // out11
+ vrshr.s32 \o6, d22, #12 // out6
+
+.ifc \o8, d23
+ vmov \o8, d24
+ vmov \o10,d26
+.endif
+
+ vqneg.s32 \o7, d4 // out7
+ vqneg.s32 \o5, d5 // out5
+ vqneg.s32 \o11,d6 // out11
+ vqneg.s32 \o9, d7 // out9
+.endm
+
+function inv_adst_2s_x16_neon
+ iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ bx lr
+endfunc
+
+function inv_flipadst_2s_x16_neon
+ iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
+ bx lr
+endfunc
+
+function inv_identity_2s_x16_neon
+ mov r12, #0
+ movt r12, #2*(5793-4096)*8
+ vdup.32 d0, r12
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s32 q1, \i, d0[0]
+ vqadd.s32 \i, \i, \i
+ vqadd.s32 \i, \i, q1
+.endr
+ bx lr
+endfunc
+
+.macro identity_8x4_shift1 c
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s32 q2, \i, \c
+ vrshr.s32 q2, q2, #1
+ vqadd.s32 \i, \i, q2
+.endr
+.endm
+
+.macro identity_8x4 c
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s32 q2, \i, \c
+ vqadd.s32 \i, \i, \i
+ vqadd.s32 \i, \i, q2
+.endr
+.endm
+
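+// First (row) pass helper for 16-wide transforms: load and clear two rows of
+// 32-bit coefficients at r7 (stride r8), optionally pre-scale by 1/sqrt(2),
+// call the row transform in r4, round-narrow to 16 bit by \shift and store
+// the de-interleaved rows to the intermediate buffer at r6.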
+.macro def_horz_16 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x2_neon
+ push {lr}
+ vmov.i32 d7, #0
+.if \scale
+ mov_const r12, 2896*8*(1<<16)
+ vdup.32 d1, r12
+.endif
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.32 {\i}, [r7, :64]
+ vst1.32 {d7}, [r7, :64], r8
+.endr
+.if \scale
+ scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ blx r4
+ vqrshrn.s32 d16, q8, #\shift
+ vqrshrn.s32 d17, q9, #\shift
+ vqrshrn.s32 d18, q10, #\shift
+ vqrshrn.s32 d19, q11, #\shift
+ vqrshrn.s32 d20, q12, #\shift
+ vqrshrn.s32 d21, q13, #\shift
+ vqrshrn.s32 d22, q14, #\shift
+ vqrshrn.s32 d23, q15, #\shift
+ vuzp.16 q8, q9
+ vuzp.16 q10, q11
+
+.irp i, q8, q10, q9, q11
+ vst1.16 {\i}, [r6, :128]!
+.endr
+
+ pop {pc}
+endfunc
+.endm
+
+def_horz_16 scale=0, shift=2
+def_horz_16 scale=1, shift=1, suffix=_scale
+
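+// Second (column) pass for a 4-column strip of 16 rows: load the 16-bit
+// intermediates at r7 (stride r8), call the column transform in r5, then add
+// the result to the destination at r6 via load_add_store_4x16.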
+function inv_txfm_add_vert_4x16_neon
+ push {lr}
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ blx r5
+ load_add_store_4x16 r6, r7
+ pop {pc}
+endfunc
+
+function inv_txfm_add_16x16_neon
+ sub_sp_align 512
+ ldrh r11, [r10], #2
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+ add r6, sp, #(\i*16*2)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 14
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #16*4
+ bl inv_txfm_horz_16x2_neon
+.endr
+ b 3f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+3:
+.irp i, 0, 4, 8, 12
+ add r6, r0, #(\i*2)
+ add r7, sp, #(\i*2)
+ mov r8, #32
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 512
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
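+// Per-strip eob thresholds for the first pass: a strip of rows is transformed
+// while eob (r3) is at least the next threshold; once eob falls below it, the
+// remaining strips contain no coefficients and the corresponding part of the
+// intermediate buffer is simply zero-filled.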
+const eob_16x16
+ .short 3, 10, 21, 36, 55, 78, 105, 256
+endconst
+
+const eob_16x16_identity
+ .short 2, 4, 6, 8, 10, 12, 14, 256
+endconst
+
+.macro def_fn_16x16 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 16, 16, 2
+.endif
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ movrel_local r4, inv_\txfm1\()_2s_x16_neon
+ movrel r5, X(inv_\txfm2\()_4h_x16_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel_local r10, eob_16x16
+.else
+ movrel_local r10, eob_16x16_identity
+.endif
+.else
+.ifc \txfm2, identity
+ movrel_local r10, eob_16x16_identity
+.else
+ movrel_local r10, eob_16x16
+.endif
+.endif
+ b inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct
+def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
+def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
+def_fn_16x16 identity, dct
+
+function inv_txfm_add_16x4_neon
+ cmp r3, r10
+ mov r11, #16
+ blt 1f
+
+ add r6, r2, #8
+ vmov.i32 d4, #0
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.32 {\i}, [r6, :64]
+ vst1.32 {d4}, [r6, :64], r11
+.endr
+ blx r4
+
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q9, #1
+ vqrshrn.s32 d18, q10, #1
+ vqrshrn.s32 d19, q11, #1
+ vqrshrn.s32 d20, q12, #1
+ vqrshrn.s32 d21, q13, #1
+ vqrshrn.s32 d22, q14, #1
+ vqrshrn.s32 d23, q15, #1
+ vuzp.16 q8, q9
+ mov r6, sp
+ vuzp.16 q10, q11
+ vpush {q8-q11}
+
+ b 2f
+
+1:
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ mov r6, sp
+ vpush {q8-q9}
+ vpush {q8-q9}
+
+2:
+ vmov.i32 d4, #0
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.32 {\i}, [r2, :64]
+ vst1.32 {d4}, [r2, :64], r11
+.endr
+
+ blx r4
+
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q9, #1
+ vqrshrn.s32 d18, q10, #1
+ vqrshrn.s32 d19, q11, #1
+ vqrshrn.s32 d20, q12, #1
+ vqrshrn.s32 d21, q13, #1
+ vqrshrn.s32 d22, q14, #1
+ vqrshrn.s32 d23, q15, #1
+ vuzp.16 q8, q9
+ mov r6, sp
+ vuzp.16 q10, q11
+
+ vmov q12, q10
+ vmov q13, q11
+
+ vpop {q10-q11}
+ blx r5
+ mov r6, r0
+ load_add_store_8x4 r6, r7
+
+ vpop {q10-q11}
+ vmov q8, q12
+ vmov q9, q13
+ blx r5
+ add r6, r0, #16
+ load_add_store_8x4 r6, r7
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_4x16_neon
+ ldrh r9, [r10, #4]
+
+ mov r11, #64
+ cmp r3, r9
+ ldrh r9, [r10, #2]
+ blt 1f
+
+ add r6, r2, #48
+ vmov.i32 q2, #0
+.irp i, q8, q9, q10, q11
+ vld1.32 {\i}, [r6, :128]
+ vst1.32 {q2}, [r6, :128], r11
+.endr
+ blx r4
+ vqrshrn.s32 d28, q8, #1
+ vqrshrn.s32 d29, q9, #1
+ vqrshrn.s32 d30, q10, #1
+ vqrshrn.s32 d31, q11, #1
+ transpose_4x4h q14, q15, d28, d29, d30, d31
+
+ b 2f
+1:
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+2:
+ cmp r3, r9
+ ldrh r9, [r10]
+ blt 1f
+
+ add r6, r2, #32
+ vmov.i32 q2, #0
+.irp i, q8, q9, q10, q11
+ vld1.32 {\i}, [r6, :128]
+ vst1.32 {q2}, [r6, :128], r11
+.endr
+ blx r4
+ vqrshrn.s32 d24, q8, #1
+ vqrshrn.s32 d25, q9, #1
+ vqrshrn.s32 d26, q10, #1
+ vqrshrn.s32 d27, q11, #1
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+
+ b 2f
+1:
+ vmov.i16 q12, #0
+ vmov.i16 q13, #0
+2:
+ cmp r3, r9
+ blt 1f
+
+ add r6, r2, #16
+ vmov.i32 q2, #0
+.irp i, q8, q9, q10, q11
+ vld1.32 {\i}, [r6, :128]
+ vst1.32 {q2}, [r6, :128], r11
+.endr
+ blx r4
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q9, #1
+ vqrshrn.s32 d18, q10, #1
+ vqrshrn.s32 d19, q11, #1
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ b 2f
+1:
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+2:
+ vmov.i16 q2, #0
+ vpush {q8-q9}
+.irp i, q8, q9, q10, q11
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q2}, [r2, :128], r11
+.endr
+ blx r4
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q9, #1
+ vqrshrn.s32 d18, q10, #1
+ vqrshrn.s32 d19, q11, #1
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ vpop {q10-q11}
+
+ blx r5
+
+ load_add_store_4x16 r0, r6
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+const eob_4x16
+ .short 13, 29, 45, 64
+endconst
+
+const eob_4x16_identity1
+ .short 16, 32, 48, 64
+endconst
+
+const eob_4x16_identity2
+ .short 4, 8, 12, 64
+endconst
+
+.macro def_fn_416 w, h, txfm1, txfm2, eob_16x4
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ push {r4-r11,lr}
+ vpush {q4-q7}
+.if \w == 4
+ movrel_local r4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel r5, X(inv_\txfm2\()_4h_x\h\()_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel_local r10, eob_4x16
+.else
+ movrel_local r10, eob_4x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+ movrel_local r10, eob_4x16_identity2
+.else
+ movrel_local r10, eob_4x16
+.endif
+.endif
+.else
+ mov r10, #\eob_16x4
+ movrel_local r4, inv_\txfm1\()_2s_x\w\()_neon
+ movrel r5, X(inv_\txfm2\()_8h_x\h\()_neon)
+.endif
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct, 3
+def_fn_416 \w, \h, identity, identity, 3
+def_fn_416 \w, \h, dct, adst, 3
+def_fn_416 \w, \h, dct, flipadst, 3
+def_fn_416 \w, \h, dct, identity, 2
+def_fn_416 \w, \h, adst, dct, 3
+def_fn_416 \w, \h, adst, adst, 3
+def_fn_416 \w, \h, adst, flipadst, 3
+def_fn_416 \w, \h, flipadst, dct, 3
+def_fn_416 \w, \h, flipadst, adst, 3
+def_fn_416 \w, \h, flipadst, flipadst, 3
+def_fn_416 \w, \h, identity, dct, 2
+def_fn_416 \w, \h, adst, identity, 2
+def_fn_416 \w, \h, flipadst, identity, 2
+def_fn_416 \w, \h, identity, adst, 2
+def_fn_416 \w, \h, identity, flipadst, 2
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+function inv_txfm_add_16x8_neon
+ sub_sp_align 256
+ ldrh r11, [r10], #2
+
+.irp i, 0, 2, 4, 6
+ add r6, sp, #(\i*16*2)
+.if \i > 0
+ mov r8, #(8 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 6
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #8*4
+ bl inv_txfm_horz_scale_16x2_neon
+.endr
+ b 3f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+3:
+
+.irp i, 0, 8
+ add r7, sp, #(\i*2)
+ mov r8, #32
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\j}, [r7, :128], r8
+.endr
+ blx r5
+
+ add r6, r0, #(\i*2)
+ load_add_store_8x8 r6, r7
+.endr
+
+ add_sp_align 256
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_8x16_neon
+ add r10, r10, #2
+ sub_sp_align 256
+ ldrh r11, [r10], #4
+
+.irp i, 0, 4, 8, 12
+ add r6, sp, #(\i*8*2)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 12
+ ldrh r11, [r10], #4
+.endif
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #16*4
+
+ mov_const r12, 2896*8*(1<<16)
+ vmov.i32 q2, #0
+ vdup.32 d0, r12
+
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\j}, [r7, :128]
+ vst1.32 {q2}, [r7, :128], r8
+.endr
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+ blx r4
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q9, #1
+ vqrshrn.s32 d18, q10, #1
+ vqrshrn.s32 d19, q11, #1
+ vqrshrn.s32 d20, q12, #1
+ vqrshrn.s32 d21, q13, #1
+ vqrshrn.s32 d22, q14, #1
+ vqrshrn.s32 d23, q15, #1
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+.irp j, d16, d20, d17, d21, d18, d22, d19, d23
+ vst1.16 {\j}, [r6, :64]!
+.endr
+.endr
+ b 3f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #4
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+3:
+
+.irp i, 0, 4
+ add r6, r0, #(\i*2)
+ add r7, sp, #(\i*2)
+ mov r8, #16
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 256
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+const eob_8x16
+ .short 3, 10, 21, 43, 59, 75, 91, 128
+endconst
+
+const eob_8x16_identity1
+ .short 2, 4, 6, 64, 80, 96, 112, 128
+endconst
+
+const eob_8x16_identity2
+ .short 2, 4, 6, 8, 10, 12, 14, 128
+endconst
+
+.macro def_fn_816 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ push {r4-r11,lr}
+ vpush {q4-q7}
+.if \w == 8
+ movrel_local r4, inv_\txfm1\()_4s_x8_neon
+ movrel r5, X(inv_\txfm2\()_4h_x16_neon)
+.else
+ movrel_local r4, inv_\txfm1\()_2s_x16_neon
+ movrel r5, X(inv_\txfm2\()_8h_x8_neon)
+.endif
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel_local r10, eob_8x16
+.else
+ movrel_local r10, eob_8x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+ movrel_local r10, eob_8x16_identity2
+.else
+ movrel_local r10, eob_8x16
+.endif
+.endif
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct
+def_fn_816 \w, \h, identity, identity
+def_fn_816 \w, \h, dct, adst
+def_fn_816 \w, \h, dct, flipadst
+def_fn_816 \w, \h, dct, identity
+def_fn_816 \w, \h, adst, dct
+def_fn_816 \w, \h, adst, adst
+def_fn_816 \w, \h, adst, flipadst
+def_fn_816 \w, \h, flipadst, dct
+def_fn_816 \w, \h, flipadst, adst
+def_fn_816 \w, \h, flipadst, flipadst
+def_fn_816 \w, \h, identity, dct
+def_fn_816 \w, \h, adst, identity
+def_fn_816 \w, \h, flipadst, identity
+def_fn_816 \w, \h, identity, adst
+def_fn_816 \w, \h, identity, flipadst
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
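+// Odd half of the 32-point inverse DCT, two 32-bit lanes at a time: takes the
+// 16 odd-indexed coefficients in d16-d31 and produces the odd-half values
+// that the caller butterflies against the even half's outputs.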
+function inv_dct32_odd_2s_x16_neon
+ movrel_local r12, idct_coeffs, 4*16
+ vld1.32 {q0, q1}, [r12, :128]!
+
+ vmul_vmls d4, d16, d31, d0[0], d0[1] // -> t16a
+ vmul_vmla d6, d16, d31, d0[1], d0[0] // -> t31a
+ vmul_vmls d8, d24, d23, d1[0], d1[1] // -> t17a
+ vrshr.s32 d16, d4, #12 // t16a
+ vrshr.s32 d31, d6, #12 // t31a
+ vmul_vmla d4, d24, d23, d1[1], d1[0] // -> t30a
+ vmul_vmls d6, d20, d27, d2[0], d2[1] // -> t18a
+ vrshr.s32 d24, d8, #12 // t17a
+ vrshr.s32 d23, d4, #12 // t30a
+ vmul_vmla d8, d20, d27, d2[1], d2[0] // -> t29a
+ vmul_vmls d4, d28, d19, d3[0], d3[1] // -> t19a
+ vrshr.s32 d20, d6, #12 // t18a
+ vrshr.s32 d27, d8, #12 // t29a
+ vmul_vmla d6, d28, d19, d3[1], d3[0] // -> t28a
+ vld1.32 {q0, q1}, [r12, :128]
+ sub r12, r12, #4*24
+ vmul_vmls d8, d18, d29, d0[0], d0[1] // -> t20a
+ vrshr.s32 d28, d4, #12 // t19a
+ vrshr.s32 d19, d6, #12 // t28a
+ vmul_vmla d4, d18, d29, d0[1], d0[0] // -> t27a
+ vmul_vmls d6, d26, d21, d1[0], d1[1] // -> t21a
+ vrshr.s32 d18, d8, #12 // t20a
+ vrshr.s32 d29, d4, #12 // t27a
+ vmul_vmla d8, d26, d21, d1[1], d1[0] // -> t26a
+ vmul_vmls d4, d22, d25, d2[0], d2[1] // -> t22a
+ vrshr.s32 d26, d6, #12 // t21a
+ vrshr.s32 d21, d8, #12 // t26a
+ vmul_vmla d6, d22, d25, d2[1], d2[0] // -> t25a
+ vmul_vmls d8, d30, d17, d3[0], d3[1] // -> t23a
+ vrshr.s32 d22, d4, #12 // t22a
+ vrshr.s32 d25, d6, #12 // t25a
+ vmul_vmla d4, d30, d17, d3[1], d3[0] // -> t24a
+ vrshr.s32 d30, d8, #12 // t23a
+ vrshr.s32 d17, d4, #12 // t24a
+
+ vld1.32 {q0, q1}, [r12, :128]
+
+ vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+ vqsub.s32 d5, d16, d24 // t17
+ vqadd.s32 d16, d16, d24 // t16
+ vqsub.s32 d7, d31, d23 // t30
+ vqadd.s32 d31, d31, d23 // t31
+ vqsub.s32 d24, d28, d20 // t18
+ vqadd.s32 d28, d28, d20 // t19
+ vqadd.s32 d23, d18, d26 // t20
+ vqsub.s32 d18, d18, d26 // t21
+ vqsub.s32 d20, d30, d22 // t22
+ vqadd.s32 d30, d30, d22 // t23
+ vqadd.s32 d26, d17, d25 // t24
+ vqsub.s32 d17, d17, d25 // t25
+ vqsub.s32 d22, d29, d21 // t26
+ vqadd.s32 d29, d29, d21 // t27
+ vqadd.s32 d25, d19, d27 // t28
+ vqsub.s32 d19, d19, d27 // t29
+
+.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
+ vmax.s32 \r, \r, d10
+.endr
+
+ vmul_vmls d4, d7, d5, d2[0], d2[1] // -> t17a
+ vmul_vmla d6, d7, d5, d2[1], d2[0] // -> t30a
+ vmul_vmla d8, d19, d24, d2[1], d2[0] // -> t18a
+ vrshr.s32 d21, d4, #12 // t17a
+ vrshr.s32 d27, d6, #12 // t30a
+ vneg.s32 d8, d8 // -> t18a
+ vmul_vmls d5, d19, d24, d2[0], d2[1] // -> t29a
+ vmul_vmls d4, d22, d18, d3[0], d3[1] // -> t21a
+ vrshr.s32 d19, d8, #12 // t18a
+ vrshr.s32 d24, d5, #12 // t29a
+ vmul_vmla d6, d22, d18, d3[1], d3[0] // -> t26a
+ vmul_vmla d8, d17, d20, d3[1], d3[0] // -> t22a
+ vrshr.s32 d22, d4, #12 // t21a
+ vrshr.s32 d18, d6, #12 // t26a
+ vneg.s32 d8, d8 // -> t22a
+ vmul_vmls d5, d17, d20, d3[0], d3[1] // -> t25a
+ vrshr.s32 d17, d8, #12 // t22a
+ vrshr.s32 d20, d5, #12 // t25a
+
+ vqsub.s32 d2, d27, d24 // t29
+ vqadd.s32 d27, d27, d24 // t30
+ vqsub.s32 d3, d21, d19 // t18
+ vqadd.s32 d21, d21, d19 // t17
+ vqsub.s32 d24, d16, d28 // t19a
+ vqadd.s32 d16, d16, d28 // t16a
+ vqsub.s32 d19, d30, d23 // t20a
+ vqadd.s32 d30, d30, d23 // t23a
+ vqsub.s32 d28, d17, d22 // t21
+ vqadd.s32 d17, d17, d22 // t22
+ vqadd.s32 d23, d26, d29 // t24a
+ vqsub.s32 d26, d26, d29 // t27a
+ vqadd.s32 d22, d20, d18 // t25
+ vqsub.s32 d20, d20, d18 // t26
+ vqsub.s32 d29, d31, d25 // t28a
+ vqadd.s32 d31, d31, d25 // t31a
+
+.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
+ vmax.s32 \r, \r, d10
+.endr
+
+ vmul_vmls d4, d2, d3, d1[0], d1[1] // -> t18a
+ vmul_vmla d6, d2, d3, d1[1], d1[0] // -> t29a
+ vmul_vmls d8, d29, d24, d1[0], d1[1] // -> t19
+ vrshr.s32 d18, d4, #12 // t18a
+ vrshr.s32 d25, d6, #12 // t29a
+ vmul_vmla d5, d29, d24, d1[1], d1[0] // -> t28
+ vmul_vmla d4, d26, d19, d1[1], d1[0] // -> t20
+ vrshr.s32 d29, d8, #12 // t19
+ vrshr.s32 d24, d5, #12 // t28
+ vneg.s32 d4, d4 // -> t20
+ vmul_vmls d6, d26, d19, d1[0], d1[1] // -> t27
+ vmul_vmla d8, d20, d28, d1[1], d1[0] // -> t21a
+ vrshr.s32 d26, d4, #12 // t20
+ vrshr.s32 d19, d6, #12 // t27
+ vneg.s32 d8, d8 // -> t21a
+ vmul_vmls d5, d20, d28, d1[0], d1[1] // -> t26a
+ vrshr.s32 d20, d8, #12 // t21a
+ vrshr.s32 d28, d5, #12 // t26a
+
+ vqsub.s32 d2, d16, d30 // t23
+ vqadd.s32 d16, d16, d30 // t16 = out16
+ vqsub.s32 d3, d31, d23 // t24
+ vqadd.s32 d31, d31, d23 // t31 = out31
+ vqsub.s32 d23, d21, d17 // t22a
+ vqadd.s32 d17, d21, d17 // t17a = out17
+ vqadd.s32 d30, d27, d22 // t30a = out30
+ vqsub.s32 d21, d27, d22 // t25a
+ vqsub.s32 d27, d18, d20 // t21
+ vqadd.s32 d18, d18, d20 // t18 = out18
+ vqadd.s32 d4, d29, d26 // t19a = out19
+ vqsub.s32 d26, d29, d26 // t20a
+ vqadd.s32 d29, d25, d28 // t29 = out29
+ vqsub.s32 d25, d25, d28 // t26
+ vqadd.s32 d28, d24, d19 // t28a = out28
+ vqsub.s32 d24, d24, d19 // t27a
+ vmov d19, d4 // out19
+
+.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
+ vmax.s32 \r, \r, d10
+.endr
+
+ vmul_vmls d4, d24, d26, d0[0], d0[0] // -> t20
+ vmul_vmla d6, d24, d26, d0[0], d0[0] // -> t27
+ vrshr.s32 d20, d4, #12 // t20
+ vrshr.s32 d22, d6, #12 // t27
+
+ vmul_vmla d4, d25, d27, d0[0], d0[0] // -> t26a
+ vmul_vmls d6, d25, d27, d0[0], d0[0] // -> t21a
+ vmov d27, d22 // t27
+ vrshr.s32 d26, d4, #12 // t26a
+
+ vmul_vmls d24, d21, d23, d0[0], d0[0] // -> t22
+ vmul_vmla d4, d21, d23, d0[0], d0[0] // -> t25
+ vrshr.s32 d21, d6, #12 // t21a
+ vrshr.s32 d22, d24, #12 // t22
+ vrshr.s32 d25, d4, #12 // t25
+
+ vmul_vmls d4, d3, d2, d0[0], d0[0] // -> t23a
+ vmul_vmla d6, d3, d2, d0[0], d0[0] // -> t24a
+ vrshr.s32 d23, d4, #12 // t23a
+ vrshr.s32 d24, d6, #12 // t24a
+
+ bx lr
+endfunc
+
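+// First (row) pass helper for the 32-point DCT: even half via
+// inv_dct_2s_x16_neon, odd half via inv_dct32_odd_2s_x16_neon, then butterfly
+// the two halves, round-narrow by \shift and store two rows to the
+// intermediate buffer at r6.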
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x2_neon
+ push {lr}
+ vmov.i32 d7, #0
+ lsl r8, r8, #1
+.if \scale
+ mov_const r12, 2896*8*(1<<16)
+ vdup.32 d0, r12
+.endif
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.32 {\i}, [r7, :64]
+ vst1.32 {d7}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ add r7, r7, r8, lsr #1
+.if \scale
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ bl inv_dct_2s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in d9 and d8,
+ // but here we want to use full q registers for clipping.
+ vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmin.s32 \r, \r, q3
+.endr
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmax.s32 \r, \r, q2
+.endr
+
+ vtrn.32 d16, d17
+ vtrn.32 d18, d19
+ vtrn.32 d20, d21
+ vtrn.32 d22, d23
+ vtrn.32 d24, d25
+ vtrn.32 d26, d27
+ vtrn.32 d28, d29
+ vtrn.32 d30, d31
+
+.macro store1 r0, r1, r2, r3
+ vst1.16 {\r0}, [r6, :64]!
+ vst1.16 {\r1}, [r6, :64]!
+ vst1.16 {\r2}, [r6, :64]!
+ vst1.16 {\r3}, [r6, :64]!
+.endm
+ store1 d16, d18, d20, d22
+ store1 d24, d26, d28, d30
+ store1 d17, d19, d21, d23
+ store1 d25, d27, d29, d31
+.purgem store1
+ sub r6, r6, #64*2
+
+ vmov.i32 d7, #0
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.32 {\i}, [r7, :64]
+ vst1.32 {d7}, [r7, :64], r8
+.endr
+.if \scale
+ // This relies on the fact that the idct also leaves the right coeff in d0[1]
+ scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ bl inv_dct32_odd_2s_x16_neon
+ vtrn.32 d31, d30
+ vtrn.32 d29, d28
+ vtrn.32 d27, d26
+ vtrn.32 d25, d24
+ vtrn.32 d23, d22
+ vtrn.32 d21, d20
+ vtrn.32 d19, d18
+ vtrn.32 d17, d16
+.macro store2 r0, r1, r2, r3, r4, r5, r6, r7, shift
+ vld1.32 {q0, q1}, [r6, :128]!
+ vld1.32 {q2, q3}, [r6, :128]
+ sub r6, r6, #32
+ vqsub.s32 d15, d0, \r0
+ vqadd.s32 d0, d0, \r0
+ vqsub.s32 d14, d1, \r1
+ vqadd.s32 d1, d1, \r1
+ vqsub.s32 d13, d2, \r2
+ vqadd.s32 d2, d2, \r2
+ vqsub.s32 d12, d3, \r3
+ vqadd.s32 d3, d3, \r3
+ vqsub.s32 d11, d4, \r4
+ vqadd.s32 d4, d4, \r4
+ vqsub.s32 d10, d5, \r5
+ vqadd.s32 d5, d5, \r5
+ vqsub.s32 d9, d6, \r6
+ vqadd.s32 d6, d6, \r6
+ vqsub.s32 d8, d7, \r7
+ vqadd.s32 d7, d7, \r7
+ vqrshrn.s32 d0, q0, #\shift
+ vqrshrn.s32 d1, q1, #\shift
+ vqrshrn.s32 d2, q2, #\shift
+ vqrshrn.s32 d3, q3, #\shift
+ vqrshrn.s32 d4, q4, #\shift
+ vqrshrn.s32 d5, q5, #\shift
+ vqrshrn.s32 d6, q6, #\shift
+ vqrshrn.s32 d7, q7, #\shift
+ vrev32.16 q2, q2
+ vrev32.16 q3, q3
+ vst1.16 {q0, q1}, [r6, :128]!
+ vst1.16 {q2, q3}, [r6, :128]!
+.endm
+
+ store2 d31, d29, d27, d25, d23, d21, d19, d17, \shift
+ store2 d30, d28, d26, d24, d22, d20, d18, d16, \shift
+.purgem store2
+ pop {pc}
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
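+// Second (column) pass for a 4-column strip of the 32-point DCT: even and odd
+// 16-point halves in 16-bit precision, then the combine step butterflies them
+// and adds the result to the destination, clamping to [0, 0x3ff].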
+function inv_txfm_add_vert_dct_4x32_neon
+ push {r10-r11,lr}
+ lsl r8, r8, #1
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+
+ bl X(inv_dct_4h_x16_neon)
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vst1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ add r7, r7, r8, lsr #1
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ sub r7, r7, r8, lsr #1
+ bl X(inv_dct32_odd_4h_x16_neon)
+
+ neg r9, r8
+ mov r10, r6
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+.macro combine r0, r1, r2, r3, op, stride
+ vld1.16 {d4}, [r7, :64], \stride
+ vld1.16 {d0}, [r10, :64], r1
+ vld1.16 {d5}, [r7, :64], \stride
+ vld1.16 {d1}, [r10, :64], r1
+ \op\().s16 d4, d4, \r0
+ vld1.16 {d6}, [r7, :64], \stride
+ vld1.16 {d2}, [r10, :64], r1
+ \op\().s16 d5, d5, \r1
+ vld1.16 {d3}, [r10, :64], r1
+ vrshr.s16 q2, q2, #4
+ \op\().s16 d6, d6, \r2
+ vld1.16 {d7}, [r7, :64], \stride
+ vqadd.s16 q0, q0, q2
+ \op\().s16 d7, d7, \r3
+ vmax.s16 q0, q0, q6
+ vrshr.s16 q3, q3, #4
+ vmin.s16 q0, q0, q7
+ vqadd.s16 q1, q1, q3
+ vst1.16 {d0}, [r6, :64], r1
+ vmax.s16 q1, q1, q6
+ vst1.16 {d1}, [r6, :64], r1
+ vmin.s16 q1, q1, q7
+ vst1.16 {d2}, [r6, :64], r1
+ vst1.16 {d3}, [r6, :64], r1
+.endm
+ combine d31, d30, d29, d28, vqadd, r8
+ combine d27, d26, d25, d24, vqadd, r8
+ combine d23, d22, d21, d20, vqadd, r8
+ combine d19, d18, d17, d16, vqadd, r8
+ sub r7, r7, r8
+ combine d16, d17, d18, d19, vqsub, r9
+ combine d20, d21, d22, d23, vqsub, r9
+ combine d24, d25, d26, d27, vqsub, r9
+ combine d28, d29, d30, d31, vqsub, r9
+.purgem combine
+
+ pop {r10-r11,pc}
+endfunc
+
+const eob_32x32
+ .short 3, 10, 21, 36, 55, 78, 105, 136, 171, 210, 253, 300, 351, 406, 465, 1024
+endconst
+
+const eob_16x32
+ .short 3, 10, 21, 36, 55, 78, 105, 151, 183, 215, 247, 279, 311, 343, 375, 512
+endconst
+
+const eob_16x32_shortside
+ .short 3, 10, 21, 36, 55, 78, 105, 512
+endconst
+
+const eob_8x32
+ .short 3, 10, 21, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 256
+endconst
+
+function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1
+ push {r4-r7,lr}
+ vpush {q6-q7}
+ movrel_local r5, eob_32x32, 2
+
+ mov r6, #4*32
+1:
+ mov r12, #0
+ movrel_local r4, eob_32x32, 6
+2:
+ vmov.i32 q0, #0
+ add r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r6
+.endr
+ vqmovn.s32 d16, q8
+ vqmovn.s32 d17, q12
+ vqmovn.s32 d18, q9
+ vqmovn.s32 d19, q13
+ vqmovn.s32 d20, q10
+ vqmovn.s32 d21, q14
+ vqmovn.s32 d22, q11
+ vqmovn.s32 d23, q15
+ transpose_4x8h q8, q9, q10, q11
+
+ load_add_store_8x4 r0, r7, shiftbits=2
+ ldrh lr, [r4], #8
+ sub r0, r0, r1, lsl #2
+ cmp r3, lr
+ add r0, r0, #2*8
+ bge 2b
+
+ ldrh lr, [r5], #4
+ cmp r3, lr
+ blt 9f
+
+ sub r0, r0, r12, lsl #1
+ add r0, r0, r1, lsl #2
+ mls r2, r6, r12, r2
+ add r2, r2, #4*4
+ b 1b
+9:
+ vpop {q6-q7}
+ pop {r4-r7,pc}
+endfunc
+
+.macro shift_8_regs op, shift
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ \op \i, \i, #\shift
+.endr
+.endm
+
+.macro def_identity_1632 w, h, wshort, hshort
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+ push {r4-r9,lr}
+ vpush {q6-q7}
+ mov r9, #0
+ mov_const r8, 2896*8*(1<<16)
+ movt r9, #2*(5793-4096)*8
+ movrel_local r5, eob_16x32\hshort, 2
+
+ mov r6, #4*\h
+1:
+ mov r12, #0
+ movrel_local r4, eob_16x32\wshort, 6
+2:
+ vdup.i32 d0, r8
+ vmov.i32 q1, #0
+ vmov.32 d0[1], r9
+ add r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q1}, [r2, :128], r6
+.endr
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+
+.if \w == 16
+ // 16x32
+ identity_8x4_shift1 d0[1]
+.else
+ // 32x16
+ shift_8_regs vqshl.s32, 1
+ identity_8x4 d0[1]
+.endif
+ vqmovn.s32 d16, q8
+ vqmovn.s32 d17, q12
+ vqmovn.s32 d18, q9
+ vqmovn.s32 d19, q13
+ vqmovn.s32 d20, q10
+ vqmovn.s32 d21, q14
+ vqmovn.s32 d22, q11
+ vqmovn.s32 d23, q15
+ transpose_4x8h q8, q9, q10, q11
+
+.if \w == 16
+ load_add_store_8x4 r0, r7, shiftbits=2
+.else
+ load_add_store_8x4 r0, r7, shiftbits=4
+.endif
+ ldrh lr, [r4], #8
+ sub r0, r0, r1, lsl #2
+ cmp r3, lr
+ add r0, r0, #2*8
+ bge 2b
+
+ ldrh lr, [r5], #4
+ cmp r3, lr
+ blt 9f
+
+ sub r0, r0, r12, lsl #1
+ add r0, r0, r1, lsl #2
+ mls r2, r6, r12, r2
+ add r2, r2, #4*4
+ b 1b
+9:
+ vpop {q6-q7}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+.macro def_identity_832 w, h
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+ push {r4-r5,lr}
+ vpush {q6-q7}
+ movrel_local r4, eob_8x32, 2
+
+ mov r12, #4*\h
+1:
+ ldrh lr, [r4], #4
+.if \w == 8
+ vmov.i32 q0, #0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r12
+.endr
+
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q12, #1
+ vqrshrn.s32 d18, q9, #1
+ vqrshrn.s32 d19, q13, #1
+ vqrshrn.s32 d20, q10, #1
+ vqrshrn.s32 d21, q14, #1
+ vqrshrn.s32 d22, q11, #1
+ vqrshrn.s32 d23, q15, #1
+
+ transpose_4x8h q8, q9, q10, q11
+
+ cmp r3, lr
+ load_add_store_8x4 r0, r5, shiftbits=2
+ blt 9f
+ sub r2, r2, r12, lsl #3
+ add r2, r2, #4*4
+.else
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+ vld1.32 {q8, q9}, [r2, :128]
+ vst1.32 {q0, q1}, [r2, :128], r12
+ vld1.32 {q10, q11}, [r2, :128]
+ vst1.32 {q0, q1}, [r2, :128], r12
+ vld1.32 {q12, q13}, [r2, :128]
+ vst1.32 {q0, q1}, [r2, :128], r12
+ vld1.32 {q14, q15}, [r2, :128]
+ vst1.32 {q0, q1}, [r2, :128], r12
+ vqmovn.s32 d16, q8
+ vqmovn.s32 d17, q10
+ vqmovn.s32 d20, q9
+ vqmovn.s32 d21, q11
+ vqmovn.s32 d18, q12
+ vqmovn.s32 d19, q14
+ vqmovn.s32 d22, q13
+ vqmovn.s32 d23, q15
+
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+
+ cmp r3, lr
+ load_add_store_4x8 r0, r5, shiftbits=3
+ blt 9f
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #2*4
+.endif
+ b 1b
+
+9:
+ vpop {q6-q7}
+ pop {r4-r5,pc}
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1
+ idct_dc 32, 32, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 2048
+ movrel_local r10, eob_32x32
+ ldrh r11, [r10], #2
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, sp, #(\i*32*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 30
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #32*4
+ bl inv_txfm_horz_dct_32x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r0, #(\i*2)
+ add r7, sp, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 2048
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1
+ idct_dc 16, 32, 1
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 1024
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2
+ movrel_local r4, inv_dct_2s_x16_neon
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, sp, #(\i*16*2)
+ add r7, r2, #(\i*4)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 30
+ ldrh r11, [r10], #2
+.endif
+.endif
+ mov r8, #4*32
+ bl inv_txfm_horz_scale_16x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12
+ add r6, r0, #(\i*2)
+ add r7, sp, #(\i*2)
+ mov r8, #16*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 1024
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
+ idct_dc 32, 16, 1
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 1024
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2
+ movrel r5, X(inv_dct_4h_x16_neon)
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+ add r6, sp, #(\i*32*2)
+ add r7, r2, #(\i*4)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 14
+ ldrh r11, [r10], #2
+.endif
+.endif
+ mov r8, #4*16
+ bl inv_txfm_horz_scale_dct_32x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r0, #(\i*2)
+ add r7, sp, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 1024
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
+ idct_dc 8, 32, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 512
+
+ movrel_local r10, eob_8x32, 2
+
+ mov r8, #4*32
+ mov r9, #32
+ mov r6, sp
+1:
+ vmov.i32 q0, #0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r8
+.endr
+ ldrh r11, [r10], #4
+ sub r2, r2, r8, lsl #3
+ sub r9, r9, #4
+ add r2, r2, #4*4
+
+ bl inv_dct_4s_x8_neon
+
+ vqrshrn.s32 d16, q8, #2
+ vqrshrn.s32 d18, q9, #2
+ vqrshrn.s32 d20, q10, #2
+ vqrshrn.s32 d22, q11, #2
+ vqrshrn.s32 d17, q12, #2
+ vqrshrn.s32 d19, q13, #2
+ vqrshrn.s32 d21, q14, #2
+ vqrshrn.s32 d23, q15, #2
+
+ transpose_4x8h q8, q9, q10, q11
+
+ vst1.16 {q8, q9}, [r6, :128]!
+ cmp r3, r11
+ vst1.16 {q10, q11}, [r6, :128]!
+
+ bge 1b
+ cmp r9, #0
+ beq 3f
+
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r9, r9, #4
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4
+ add r6, r0, #(\i*2)
+ add r7, sp, #(\i*2)
+ mov r8, #8*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 512
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
+ idct_dc 32, 8, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ movrel_local r10, eob_8x32
+ sub_sp_align 512
+ ldrh r11, [r10], #2
+
+.irp i, 0, 2, 4, 6
+ add r6, sp, #(\i*32*2)
+ add r7, r2, #(\i*4)
+.if \i > 0
+ cmp r3, r11
+ mov r8, #(8 - \i)
+ blt 1f
+.if \i < 6
+ ldrh r11, [r10], #2
+.endif
+.endif
+ mov r8, #8*4
+ bl inv_txfm_horz_dct_32x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+ mov r8, #2*32
+ mov r9, #0
+1:
+ add r6, r0, r9, lsl #1
+ add r7, sp, r9, lsl #1 // #(\i*2)
+
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r7, :128], r8
+.endr
+ add r9, r9, #8
+
+ bl X(inv_dct_8h_x8_neon)
+
+ cmp r9, #32
+
+ load_add_store_8x8 r6, r7
+
+ blt 1b
+
+ add_sp_align 512
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
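+// First pass of the 64-point DCT: expands one group of four odd inputs
+// (d16-d19, scaled by the constants loaded from r12) into eight t-values
+// in d16-d23, clamping intermediates against q4/q5, and stores them to r6.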
+function inv_dct64_step1_neon
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+ vld1.32 {q0, q1}, [r12, :128]!
+
+ vqrdmulh.s32 d23, d16, d0[1] // t63a
+ vqrdmulh.s32 d16, d16, d0[0] // t32a
+ vqrdmulh.s32 d22, d17, d1[0] // t62a
+ vqrdmulh.s32 d17, d17, d1[1] // t33a
+ vqrdmulh.s32 d21, d18, d2[1] // t61a
+ vqrdmulh.s32 d18, d18, d2[0] // t34a
+ vqrdmulh.s32 d20, d19, d3[0] // t60a
+ vqrdmulh.s32 d19, d19, d3[1] // t35a
+
+ vld1.32 {q0}, [r12, :128]!
+
+ vqadd.s32 d24, d16, d17 // t32
+ vqsub.s32 d25, d16, d17 // t33
+ vqsub.s32 d26, d19, d18 // t34
+ vqadd.s32 d27, d19, d18 // t35
+ vqadd.s32 d28, d20, d21 // t60
+ vqsub.s32 d29, d20, d21 // t61
+ vqsub.s32 d30, d23, d22 // t62
+ vqadd.s32 d31, d23, d22 // t63
+
+.irp r, q12, q13, q14, q15
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q12, q13, q14, q15
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmla d4, d29, d26, d0[0], d0[1] // -> t34a
+ vmul_vmls d6, d29, d26, d0[1], d0[0] // -> t61a
+ vneg.s32 d4, d4 // t34a
+ vmul_vmls d7, d30, d25, d0[1], d0[0] // -> t33a
+ vrshr.s32 d26, d4, #12 // t34a
+ vmul_vmla d4, d30, d25, d0[0], d0[1] // -> t62a
+ vrshr.s32 d29, d6, #12 // t61a
+ vrshr.s32 d25, d7, #12 // t33a
+ vrshr.s32 d30, d4, #12 // t62a
+
+ vqadd.s32 d16, d24, d27 // t32a
+ vqsub.s32 d19, d24, d27 // t35a
+ vqadd.s32 d17, d25, d26 // t33
+ vqsub.s32 d18, d25, d26 // t34
+ vqsub.s32 d20, d31, d28 // t60a
+ vqadd.s32 d23, d31, d28 // t63a
+ vqsub.s32 d21, d30, d29 // t61
+ vqadd.s32 d22, d30, d29 // t62
+
+.irp r, q8, q9, q10, q11
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q8, q9, q10, q11
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmla d4, d21, d18, d1[0], d1[1] // -> t61a
+ vmul_vmls d6, d21, d18, d1[1], d1[0] // -> t34a
+ vmul_vmla d7, d20, d19, d1[0], d1[1] // -> t60
+ vrshr.s32 d21, d4, #12 // t61a
+ vrshr.s32 d18, d6, #12 // t34a
+ vmul_vmls d4, d20, d19, d1[1], d1[0] // -> t35
+ vrshr.s32 d20, d7, #12 // t60
+ vrshr.s32 d19, d4, #12 // t35
+
+ vst1.32 {d16, d17, d18, d19}, [r6, :128]!
+ vst1.32 {d20, d21, d22, d23}, [r6, :128]!
+
+ bx lr
+endfunc
+
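+// Second pass of the 64-point DCT: combines the t-values written by
+// inv_dct64_step1_neon, walking r6 forwards and r9 backwards through the
+// scratch buffer until the two pointers meet, clamping against q4/q5.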
+function inv_dct64_step2_neon
+ movrel_local r12, idct_coeffs
+ vld1.32 {q0}, [r12, :128]
+1:
+ // t32a/33/34a/35/60/61a/62/63a
+ // t56a/57/58a/59/36/37a/38/39a
+ // t40a/41/42a/43/52/53a/54/55a
+ // t48a/49/50a/51/44/45a/46/47a
+ vldr d16, [r6, #4*2*0] // t32a
+ vldr d17, [r9, #4*2*8] // t39a
+ vldr d18, [r9, #4*2*0] // t63a
+ vldr d19, [r6, #4*2*8] // t56a
+ vldr d20, [r6, #4*2*16] // t40a
+ vldr d21, [r9, #4*2*24] // t47a
+ vldr d22, [r9, #4*2*16] // t55a
+ vldr d23, [r6, #4*2*24] // t48a
+
+ vqadd.s32 d24, d16, d17 // t32
+ vqsub.s32 d25, d16, d17 // t39
+ vqadd.s32 d26, d18, d19 // t63
+ vqsub.s32 d27, d18, d19 // t56
+ vqsub.s32 d28, d21, d20 // t40
+ vqadd.s32 d29, d21, d20 // t47
+ vqadd.s32 d30, d23, d22 // t48
+ vqsub.s32 d31, d23, d22 // t55
+
+.irp r, q12, q13, q14, q15
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q12, q13, q14, q15
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmla d4, d27, d25, d1[1], d1[0] // -> t56a
+ vmul_vmls d6, d27, d25, d1[0], d1[1] // -> t39a
+ vmul_vmla d7, d31, d28, d1[1], d1[0] // -> t40a
+ vrshr.s32 d25, d4, #12 // t56a
+ vrshr.s32 d27, d6, #12 // t39a
+ vneg.s32 d7, d7 // t40a
+ vmul_vmls d4, d31, d28, d1[0], d1[1] // -> t55a
+ vrshr.s32 d31, d7, #12 // t40a
+ vrshr.s32 d28, d4, #12 // t55a
+
+ vqadd.s32 d16, d24, d29 // t32a
+ vqsub.s32 d19, d24, d29 // t47a
+ vqadd.s32 d17, d27, d31 // t39
+ vqsub.s32 d18, d27, d31 // t40
+ vqsub.s32 d20, d26, d30 // t48a
+ vqadd.s32 d23, d26, d30 // t63a
+ vqsub.s32 d21, d25, d28 // t55
+ vqadd.s32 d22, d25, d28 // t56
+
+.irp r, q8, q9, q10, q11
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q8, q9, q10, q11
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmls d4, d21, d18, d0[0], d0[0] // -> t40a
+ vmul_vmla d6, d21, d18, d0[0], d0[0] // -> t55a
+ vmul_vmls d7, d20, d19, d0[0], d0[0] // -> t47
+ vrshr.s32 d18, d4, #12 // t40a
+ vrshr.s32 d21, d6, #12 // t55a
+ vmul_vmla d4, d20, d19, d0[0], d0[0] // -> t48
+ vrshr.s32 d19, d7, #12 // t47
+ vrshr.s32 d20, d4, #12 // t48
+
+ vstr d16, [r6, #4*2*0] // t32a
+ vstr d17, [r9, #4*2*0] // t39
+ vstr d18, [r6, #4*2*8] // t40a
+ vstr d19, [r9, #4*2*8] // t47
+ vstr d20, [r6, #4*2*16] // t48
+ vstr d21, [r9, #4*2*16] // t55a
+ vstr d22, [r6, #4*2*24] // t56
+ vstr d23, [r9, #4*2*24] // t63a
+
+ add r6, r6, #4*2
+ sub r9, r9, #4*2
+ cmp r6, r9
+ blt 1b
+ bx lr
+endfunc
+
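+// Helpers for the dct64 functions below: load8 loads d16-d23 from \src with
+// stride \strd (optionally clearing the coefficients as they are read),
+// store16 stores q8-q15 to \dst, clear_upper8 zeroes q12-q15, and the
+// *_if/scale_if macros only expand when their condition argument is set.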
+.macro load8 src, strd, zero, clear
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23
+.if \clear
+ vld1.32 {\i}, [\src, :64]
+ vst1.32 {\zero}, [\src, :64], \strd
+.else
+ vld1.32 {\i}, [\src, :64], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst
+ vst1.32 {q8, q9}, [\dst, :128]!
+ vst1.32 {q10, q11}, [\dst, :128]!
+ vst1.32 {q12, q13}, [\dst, :128]!
+ vst1.32 {q14, q15}, [\dst, :128]!
+.endm
+
+.macro clear_upper8
+.irp i, q12, q13, q14, q15
+ vmov.i32 \i, #0
+.endr
+.endm
+
+.macro vmov_if reg, val, cond
+.if \cond
+ vmov.i32 \reg, \val
+.endif
+.endm
+
+.macro movdup_if reg, gpr, val, cond
+.if \cond
+ mov_const \gpr, \val
+ vdup.32 \reg, \gpr
+.endif
+.endm
+
+.macro vst1_if regs, dst, dstalign, cond
+.if \cond
+ vst1.32 \regs, \dst, \dstalign
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
+.if \cond
+ scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
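+// Produces inv_txfm_dct\suffix\()_2s_x64_neon: a 64-point inverse DCT over
+// two columns, built from inv_dct_2s_x16_neon, inv_dct32_odd_2s_x16_neon and
+// four inv_dct64_step1_neon passes followed by inv_dct64_step2_neon.
+// \clear zeroes the input coefficients as they are read; \scale prescales
+// the input via scale_input.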
+.macro def_dct64_func suffix, clear=0, scale=0
+function inv_txfm_dct\suffix\()_2s_x64_neon
+ mov r6, sp
+
+ push {r10-r11,lr}
+
+ lsl r8, r8, #2
+
+ movdup_if d0, r12, 2896*8*(1<<16), \scale
+ vmov_if d7, #0, \clear
+ load8 r7, r8, d7, \clear
+ clear_upper8
+ sub r7, r7, r8, lsl #3
+ add r7, r7, r8, lsr #1
+ scale_if \scale, d0[0], q8, q9, q10, q11
+
+ bl inv_dct_2s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in d9 and d8,
+ // but here we want to use full q registers for clipping.
+ vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmin.s32 \r, \r, q3
+.endr
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmax.s32 \r, \r, q2
+.endr
+
+ store16 r6
+
+ movdup_if d0, r12, 2896*8*(1<<16), \scale
+ vmov_if d7, #0, \clear
+ load8 r7, r8, d7, \clear
+ clear_upper8
+ sub r7, r7, r8, lsl #3
+ lsr r8, r8, #1
+ sub r7, r7, r8, lsr #1
+ scale_if \scale, d0[0], q8, q9, q10, q11
+
+ bl inv_dct32_odd_2s_x16_neon
+
+ add r10, r6, #8*15
+ sub r6, r6, #8*16
+
+ mov r9, #-8
+
+ vmov.i32 d1, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d0, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.macro store_addsub r0, r1, r2, r3
+ vld1.32 {d2}, [r6, :64]!
+ vld1.32 {d3}, [r6, :64]!
+ vqadd.s32 d6, d2, \r0
+ vqsub.s32 \r0, d2, \r0
+ vld1.32 {d4}, [r6, :64]!
+ vqadd.s32 d7, d3, \r1
+ vqsub.s32 \r1, d3, \r1
+ vmin.s32 d6, d6, d1
+ vmin.s32 \r0, \r0, d1
+ vld1.32 {d5}, [r6, :64]!
+ vqadd.s32 d2, d4, \r2
+ sub r6, r6, #8*4
+ vmax.s32 d6, d6, d0
+ vmax.s32 \r0, \r0, d0
+ vqsub.s32 \r2, d4, \r2
+ vmin.s32 d7, d7, d1
+ vmin.s32 \r1, \r1, d1
+ vst1.32 {d6}, [r6, :64]!
+ vst1.32 {\r0}, [r10, :64], r9
+ vmin.s32 d2, d2, d1
+ vmin.s32 \r2, \r2, d1
+ vmax.s32 d7, d7, d0
+ vmax.s32 \r1, \r1, d0
+ vqadd.s32 d3, d5, \r3
+ vqsub.s32 \r3, d5, \r3
+ vmax.s32 d2, d2, d0
+ vmax.s32 \r2, \r2, d0
+ vmin.s32 d3, d3, d1
+ vmin.s32 \r3, \r3, d1
+ vst1.32 {d7}, [r6, :64]!
+ vst1.32 {\r1}, [r10, :64], r9
+ vmax.s32 d3, d3, d0
+ vmax.s32 \r3, \r3, d0
+ vst1.32 {d2}, [r6, :64]!
+ vst1.32 {\r2}, [r10, :64], r9
+ vst1.32 {d3}, [r6, :64]!
+ vst1.32 {\r3}, [r10, :64], r9
+.endm
+ store_addsub d31, d30, d29, d28
+ store_addsub d27, d26, d25, d24
+ store_addsub d23, d22, d21, d20
+ store_addsub d19, d18, d17, d16
+.purgem store_addsub
+
+ add r6, r6, #2*4*16
+
+ movrel_local r12, idct64_coeffs
+ vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+ movdup_if d0, lr, 2896*8*(1<<16), \scale
+ vmov_if d7, #0, \clear
+ add r9, r7, r8, lsl #4 // offset 16
+ add r10, r7, r8, lsl #3 // offset 8
+ sub r9, r9, r8 // offset 15
+ sub r11, r10, r8 // offset 7
+ vld1.32 {d16}, [r7, :64] // in1 (offset 0)
+ vld1.32 {d17}, [r9, :64] // in31 (offset 15)
+ vld1.32 {d18}, [r10, :64] // in17 (offset 8)
+ vld1.32 {d19}, [r11, :64] // in15 (offset 7)
+ vst1_if {d7}, [r7, :64], \clear
+ vst1_if {d7}, [r9, :64], \clear
+ vst1_if {d7}, [r10, :64], \clear
+ vst1_if {d7}, [r11, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, 2896*8*(1<<16), \scale
+ vmov_if d7, #0, \clear
+ add r7, r7, r8, lsl #2 // offset 4
+ sub r9, r9, r8, lsl #2 // offset 11
+ sub r10, r7, r8 // offset 3
+ add r11, r9, r8 // offset 12
+ vld1.32 {d16}, [r10, :64] // in7 (offset 3)
+ vld1.32 {d17}, [r11, :64] // in25 (offset 12)
+ vld1.32 {d18}, [r9, :64] // in23 (offset 11)
+ vld1.32 {d19}, [r7, :64] // in9 (offset 4)
+ vst1_if {d7}, [r7, :64], \clear
+ vst1_if {d7}, [r9, :64], \clear
+ vst1_if {d7}, [r10, :64], \clear
+ vst1_if {d7}, [r11, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, 2896*8*(1<<16), \scale
+ vmov_if d7, #0, \clear
+ sub r10, r10, r8, lsl #1 // offset 1
+ sub r9, r9, r8, lsl #1 // offset 9
+ add r10, r10, r8 // offset 2
+ add r9, r9, r8 // offset 10
+ add r7, r7, r8 // offset 5
+ add r11, r11, r8 // offset 13
+ vld1.32 d16, [r10, :64] // in5 (offset 2)
+ vld1.32 d17, [r11, :64] // in27 (offset 13)
+ vld1.32 d18, [r9, :64] // in21 (offset 10)
+ vld1.32 d19, [r7, :64] // in11 (offset 5)
+ vst1_if d7, [r10, :64], \clear
+ vst1_if d7, [r11, :64], \clear
+ vst1_if d7, [r9, :64], \clear
+ vst1_if d7, [r7, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, 2896*8*(1<<16), \scale
+ vmov_if d7, #0, \clear
+ sub r10, r10, r8 // offset 1
+ sub r9, r9, r8 // offset 9
+ add r11, r11, r8 // offset 14
+ add r7, r7, r8 // offset 6
+ vld1.32 d16, [r10, :64] // in3 (offset 1)
+ vld1.32 d17, [r11, :64] // in29 (offset 14)
+ vld1.32 d18, [r9, :64] // in19 (offset 9)
+ vld1.32 d19, [r7, :64] // in13 (offset 6)
+ vst1_if d7, [r10, :64], \clear
+ vst1_if d7, [r11, :64], \clear
+ vst1_if d7, [r9, :64], \clear
+ vst1_if d7, [r7, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+
+ sub r6, r6, #2*4*32
+ add r9, r6, #2*4*7
+
+ bl inv_dct64_step2_neon
+
+ pop {r10-r11,pc}
+endfunc
+.endm
+
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
+
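+// Finishes two rows of the 64-point DCT: adds/subtracts the mirrored halves
+// of the intermediate buffer at sp, applies the rounding shift passed in r9
+// via vrshl, narrows to 16 bit and stores the sums forwards from r6 and the
+// (reversed) differences backwards from r6 + 2*56.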
+function inv_txfm_horz_dct_64x2_neon
+ vdup.32 q4, r9
+
+ mov r7, sp
+ add r8, sp, #2*4*(64 - 4)
+ add r9, r6, #2*56
+
+ push {r10-r11,lr}
+
+ mov r10, #2*64
+ mov r11, #-2*4*4
+
+1:
+ vld1.32 {d16, d17, d18, d19}, [r7, :128]!
+ vld1.32 {d28, d29, d30, d31}, [r8, :128], r11
+ vld1.32 {d20, d21, d22, d23}, [r7, :128]!
+ vld1.32 {d24, d25, d26, d27}, [r8, :128], r11
+ vtrn.32 d16, d17
+ vtrn.32 d18, d19
+ vtrn.32 d20, d21
+ vtrn.32 d22, d23
+ vtrn.32 d31, d30
+ vtrn.32 d29, d28
+ vtrn.32 d27, d26
+ vtrn.32 d25, d24
+
+.macro store_addsub src0, src1, src2, src3, src4, src5, src6, src7
+ vqsub.s32 d7, \src0, \src1
+ vqsub.s32 d6, \src2, \src3
+ vqsub.s32 d5, \src4, \src5
+ vqsub.s32 d4, \src6, \src7
+ vqadd.s32 d0, \src0, \src1
+ vqadd.s32 d1, \src2, \src3
+ vqadd.s32 d2, \src4, \src5
+ vqadd.s32 d3, \src6, \src7
+ vrshl.s32 q3, q3, q4
+ vrshl.s32 q2, q2, q4
+ vrshl.s32 q0, q0, q4
+ vrshl.s32 q1, q1, q4
+ vqmovn.s32 d7, q3
+ vqmovn.s32 d6, q2
+ vqmovn.s32 d0, q0
+ vqmovn.s32 d1, q1
+ vrev32.16 q3, q3
+ vst1.16 {q0}, [r6, :128], r10
+ vst1.16 {q3}, [r9, :128], r10
+.endm
+ store_addsub d16, d31, d18, d29, d20, d27, d22, d25
+ store_addsub d17, d30, d19, d28, d21, d26, d23, d24
+.purgem store_addsub
+ sub r6, r6, r10, lsl #1
+ sub r9, r9, r10, lsl #1
+ add r6, r6, #16
+ sub r9, r9, #16
+
+ cmp r7, r8
+ blt 1b
+ pop {r10-r11,pc}
+endfunc
+
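+// Adds one 4-pixel wide, 64-tap vertical DCT column to the destination at r6
+// (stride r1): sums and differences of the mirrored halves of the buffer at
+// sp are rounded by 4 bits, added to the pixels and clamped to [0, 0x3ff].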
+function inv_txfm_add_vert_dct_4x64_neon
+ lsl r8, r8, #1
+
+ mov r7, sp
+ add r8, sp, #2*4*(64 - 4)
+ add r9, r6, r1, lsl #6
+ sub r9, r9, r1
+
+ push {r10-r11,lr}
+
+ neg r10, r1
+ mov r11, #-2*4*4
+
+1:
+ vld1.16 {d16, d17, d18, d19}, [r7, :128]!
+ vld1.16 {d28, d29, d30, d31}, [r8, :128], r11
+ vld1.16 {d20, d21, d22, d23}, [r7, :128]!
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11
+
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+.macro add_dest_addsub src0, src1, src2, src3
+ vld1.16 {d0}, [r6, :64], r1
+ vld1.16 {d1}, [r9, :64], r10
+ vqadd.s16 d4, \src0, \src1
+ vld1.16 {d2}, [r6, :64]
+ vqsub.s16 d5, \src0, \src1
+ vld1.16 {d3}, [r9, :64]
+ vqadd.s16 d6, \src2, \src3
+ vqsub.s16 d7, \src2, \src3
+ sub r6, r6, r1
+ sub r9, r9, r10
+ vrshr.s16 q2, q2, #4
+ vrshr.s16 q3, q3, #4
+ vqadd.s16 q2, q2, q0
+ vqadd.s16 q3, q3, q1
+ vmax.s16 q2, q2, q6
+ vmax.s16 q3, q3, q6
+ vmin.s16 q2, q2, q7
+ vmin.s16 q3, q3, q7
+ vst1.16 {d4}, [r6, :64], r1
+ vst1.16 {d5}, [r9, :64], r10
+ vst1.16 {d6}, [r6, :64], r1
+ vst1.16 {d7}, [r9, :64], r10
+.endm
+ add_dest_addsub d16, d31, d17, d30
+ add_dest_addsub d18, d29, d19, d28
+ add_dest_addsub d20, d27, d21, d26
+ add_dest_addsub d22, d25, d23, d24
+.purgem add_dest_addsub
+ cmp r7, r8
+ blt 1b
+
+ pop {r10-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
+ idct_dc 64, 64, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+
+ sub_sp_align 64*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_32x32
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, r5, #(\i*64*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #32*4
+ bl inv_txfm_dct_clear_2s_x64_neon
+ add r6, r5, #(\i*64*2)
+ mov r9, #-2 // shift
+ bl inv_txfm_horz_dct_64x2_neon
+.if \i < 30
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r7, r5, #(\i*2)
+ mov r8, #64*2
+ bl X(inv_txfm_dct_4h_x64_neon)
+ add r6, r0, #(\i*2)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 64*32*2+64*4*2
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
+ idct_dc 64, 32, 1
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+
+ sub_sp_align 64*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_32x32
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, r5, #(\i*64*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #32*4
+ bl inv_txfm_dct_clear_scale_2s_x64_neon
+ add r6, r5, #(\i*64*2)
+ mov r9, #-1 // shift
+ bl inv_txfm_horz_dct_64x2_neon
+.if \i < 30
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r6, r0, #(\i*2)
+ add r7, r5, #(\i*2)
+ mov r8, #64*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 64*32*2+64*4*2
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
+ idct_dc 32, 64, 1
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+
+ sub_sp_align 32*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_32x32
+ ldrh r11, [r10], #2
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, r5, #(\i*32*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 30
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #32*4
+ bl inv_txfm_horz_scale_dct_32x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r7, r5, #(\i*2)
+ mov r8, #32*2
+ bl X(inv_txfm_dct_4h_x64_neon)
+ add r6, r0, #(\i*2)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 32*32*2+64*4*2
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
+ idct_dc 64, 16, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+
+ sub_sp_align 64*16*2+64*4*2
+ add r4, sp, #64*4*2
+
+ movrel_local r10, eob_16x32
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+ add r6, r4, #(\i*64*2)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #16*4
+ bl inv_txfm_dct_clear_2s_x64_neon
+ add r6, r4, #(\i*64*2)
+ mov r9, #-2 // shift
+ bl inv_txfm_horz_dct_64x2_neon
+.if \i < 8
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+ movrel r5, X(inv_dct_4h_x16_neon)
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r6, r0, #(\i*2)
+ add r7, r4, #(\i*2)
+ mov r8, #64*2
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 64*16*2+64*4*2
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
+ idct_dc 16, 64, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+
+ sub_sp_align 16*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2
+
+ movrel_local r4, inv_dct_2s_x16_neon
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, r5, #(\i*16*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 30
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #32*4
+ bl inv_txfm_horz_16x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12
+ add r7, r5, #(\i*2)
+ mov r8, #16*2
+ bl X(inv_txfm_dct_4h_x64_neon)
+ add r6, r0, #(\i*2)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 16*32*2+64*4*2
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/loopfilter.S b/third_party/dav1d/src/arm/32/loopfilter.S
new file mode 100644
index 0000000000..97b960534f
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/loopfilter.S
@@ -0,0 +1,868 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+.macro loop_filter wd
+function lpf_8_wd\wd\()_neon
+ vabd.u8 d0, d22, d23 // abs(p1 - p0)
+ vabd.u8 d1, d25, d24 // abs(q1 - q0)
+ vabd.u8 d2, d23, d24 // abs(p0 - q0)
+ vabd.u8 d3, d22, d25 // abs(p1 - q1)
+.if \wd >= 6
+ vabd.u8 d4, d21, d22 // abs(p2 - p1)
+ vabd.u8 d5, d26, d25 // abs(q2 - q1)
+.endif
+.if \wd >= 8
+ vabd.u8 d6, d20, d21 // abs(p3 - p2)
+ vabd.u8 d7, d27, d26 // abs(q3 - q2)
+.endif
+.if \wd >= 6
+ vmax.u8 d4, d4, d5
+.endif
+ vqadd.u8 d2, d2, d2 // abs(p0 - q0) * 2
+.if \wd >= 8
+ vmax.u8 d6, d6, d7
+.endif
+ vshr.u8 d3, d3, #1
+.if \wd >= 8
+ vmax.u8 d4, d4, d6
+.endif
+.if \wd >= 6
+ vand d4, d4, d14
+.endif
+ vmax.u8 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0))
+ vqadd.u8 d2, d2, d3 // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
+.if \wd >= 6
+ vmax.u8 d4, d0, d4
+ vcge.u8 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
+.else
+ vcge.u8 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+ vcge.u8 d2, d10, d2 // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
+ vand d1, d1, d2 // fm
+ vand d1, d1, d13 // fm && wd >= 4
+.if \wd >= 6
+ vand d14, d14, d1 // fm && wd > 4
+.endif
+.if \wd >= 16
+ vand d15, d15, d1 // fm && wd == 16
+.endif
+
+ vmov r10, r11, d1
+ orrs r10, r10, r11
+ beq 9f // if (!fm || wd < 4) return;
+
+.if \wd >= 6
+ vmov.i8 d10, #1
+ vabd.u8 d2, d21, d23 // abs(p2 - p0)
+ vabd.u8 d3, d22, d23 // abs(p1 - p0)
+ vabd.u8 d4, d25, d24 // abs(q1 - q0)
+ vabd.u8 d5, d26, d24 // abs(q2 - q0)
+.if \wd >= 8
+ vabd.u8 d6, d20, d23 // abs(p3 - p0)
+ vabd.u8 d7, d27, d24 // abs(q3 - q0)
+.endif
+ vmax.u8 d2, d2, d3
+ vmax.u8 d4, d4, d5
+.if \wd >= 8
+ vmax.u8 d6, d6, d7
+.endif
+ vmax.u8 d2, d2, d4
+.if \wd >= 8
+ vmax.u8 d2, d2, d6
+.endif
+
+.if \wd == 16
+ vabd.u8 d3, d17, d23 // abs(p6 - p0)
+ vabd.u8 d4, d18, d23 // abs(p5 - p0)
+ vabd.u8 d5, d19, d23 // abs(p4 - p0)
+.endif
+ vcge.u8 d2, d10, d2 // flat8in
+.if \wd == 16
+ vabd.u8 d6, d28, d24 // abs(q4 - q0)
+ vabd.u8 d7, d29, d24 // abs(q5 - q0)
+ vabd.u8 d8, d30, d24 // abs(q6 - q0)
+.endif
+ vand d14, d2, d14 // flat8in && fm && wd > 4
+ vbic d1, d1, d14 // fm && wd >= 4 && !flat8in
+.if \wd == 16
+ vmax.u8 d3, d3, d4
+ vmax.u8 d5, d5, d6
+.endif
+ vmov r10, r11, d1
+.if \wd == 16
+ vmax.u8 d7, d7, d8
+ vmax.u8 d3, d3, d5
+ vmax.u8 d3, d3, d7
+ vcge.u8 d3, d10, d3 // flat8out
+.endif
+ orrs r10, r10, r11
+.if \wd == 16
+ vand d15, d15, d3 // flat8out && fm && wd == 16
+ vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16
+ vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out
+.endif
+ beq 1f // skip wd == 4 case
+.endif
+
+ vsubl.u8 q1, d22, d25 // p1 - q1
+ vcgt.u8 d0, d0, d12 // hev
+ vqmovn.s16 d2, q1
+ vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1)
+ vbic d0, d1, d0 // (fm && wd >= 4 && !hev)
+ vsubl.u8 q1, d24, d23
+ vmov.i16 q3, #3
+ vmul.i16 q1, q1, q3
+ vmov.i8 d6, #4
+ vaddw.s8 q1, q1, d4
+ vmov.i8 d7, #3
+ vqmovn.s16 d2, q1 // f
+ vqadd.s8 d4, d6, d2 // imin(f + 4, 127)
+ vqadd.s8 d5, d7, d2 // imin(f + 3, 127)
+ vshr.s8 d4, d4, #3 // f1
+ vshr.s8 d5, d5, #3 // f2
+ vmovl.u8 q1, d23 // p0
+ vmovl.u8 q3, d24 // q0
+ vaddw.s8 q1, q1, d5
+ vsubw.s8 q3, q3, d4
+ vrshr.s8 d4, d4, #1 // (f1 + 1) >> 1
+ vqmovun.s16 d2, q1 // out p0
+ vqmovun.s16 d6, q3 // out q0
+ vbit d23, d2, d1 // if (fm && wd >= 4)
+ vmovl.u8 q1, d22 // p1
+ vbit d24, d6, d1 // if (fm && wd >= 4)
+ vmovl.u8 q3, d25 // q1
+ vaddw.s8 q1, q1, d4
+ vsubw.s8 q3, q3, d4
+ vqmovun.s16 d2, q1 // out p1
+ vqmovun.s16 d6, q3 // out q1
+ vbit d22, d2, d0 // if (fm && wd >= 4 && !hev)
+ vbit d25, d6, d0 // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+ beq 2f // skip if there's no flat8in
+
+ vaddl.u8 q0, d21, d21 // p2 * 2
+ vaddl.u8 q1, d21, d22 // p2 + p1
+ vaddl.u8 q2, d22, d23 // p1 + p0
+ vaddl.u8 q3, d23, d24 // p0 + q0
+ vadd.i16 q4, q0, q1
+ vadd.i16 q5, q2, q3
+ vaddl.u8 q6, d24, d25 // q0 + q1
+ vadd.i16 q4, q4, q5
+ vsub.i16 q6, q6, q0
+ vaddl.u8 q5, d25, d26 // q1 + q2
+ vrshrn.i16 d0, q4, #3 // out p1
+
+ vadd.i16 q4, q4, q6
+ vsub.i16 q5, q5, q1
+ vaddl.u8 q6, d26, d26 // q2 + q2
+ vrshrn.i16 d1, q4, #3 // out p0
+
+ vadd.i16 q4, q4, q5
+ vsub.i16 q6, q6, q2
+ vrshrn.i16 d2, q4, #3 // out q0
+
+ vbit d22, d0, d14 // p1 if (flat8in)
+ vadd.i16 q4, q4, q6
+ vbit d23, d1, d14 // p0 if (flat8in)
+ vrshrn.i16 d3, q4, #3 // out q1
+ vbit d24, d2, d14 // q0 if (flat8in)
+ vbit d25, d3, d14 // q1 if (flat8in)
+.elseif \wd >= 8
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+.if \wd == 8
+ beq 8f // skip if there's no flat8in
+.else
+ beq 2f // skip if there's no flat8in
+.endif
+
+ vaddl.u8 q0, d20, d21 // p3 + p2
+ vaddl.u8 q1, d22, d25 // p1 + q1
+ vaddl.u8 q2, d20, d22 // p3 + p1
+ vaddl.u8 q3, d23, d26 // p0 + q2
+ vadd.i16 q4, q0, q0 // 2 * (p3 + p2)
+ vaddw.u8 q4, q4, d23 // + p0
+ vaddw.u8 q4, q4, d24 // + q0
+ vadd.i16 q4, q4, q2 // + p3 + p1
+ vsub.i16 q1, q1, q0 // p1 + q1 - p3 - p2
+ vsub.i16 q3, q3, q2 // p0 + q2 - p3 - p1
+ vrshrn.i16 d10, q4, #3 // out p2
+
+ vadd.i16 q4, q4, q1
+ vaddl.u8 q0, d20, d23 // p3 + p0
+ vaddl.u8 q1, d24, d27 // q0 + q3
+ vrshrn.i16 d11, q4, #3 // out p1
+
+ vadd.i16 q4, q4, q3
+ vsub.i16 q1, q1, q0 // q0 + q3 - p3 - p0
+ vaddl.u8 q2, d21, d24 // p2 + q0
+ vaddl.u8 q3, d25, d27 // q1 + q3
+ vrshrn.i16 d12, q4, #3 // out p0
+
+ vadd.i16 q4, q4, q1
+ vsub.i16 q3, q3, q2 // q1 + q3 - p2 - q0
+ vaddl.u8 q0, d22, d25 // p1 + q1
+ vaddl.u8 q1, d26, d27 // q2 + q3
+ vrshrn.i16 d13, q4, #3 // out q0
+
+ vadd.i16 q4, q4, q3
+ vsub.i16 q1, q1, q0 // q2 + q3 - p1 - q1
+ vrshrn.i16 d0, q4, #3 // out q1
+
+ vadd.i16 q4, q4, q1
+
+ vbit d21, d10, d14
+ vbit d22, d11, d14
+ vbit d23, d12, d14
+ vrshrn.i16 d1, q4, #3 // out q2
+ vbit d24, d13, d14
+ vbit d25, d0, d14
+ vbit d26, d1, d14
+.endif
+2:
+.if \wd == 16
+ vmov r10, r11, d15
+ orrs r10, r10, r11
+ bne 1f // check if flat8out is needed
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+ beq 8f // if there was no flat8in, just write the inner 4 pixels
+ b 7f // if flat8in was used, write the inner 6 pixels
+1:
+
+ vaddl.u8 q1, d17, d17 // p6 + p6
+ vaddl.u8 q2, d17, d18 // p6 + p5
+ vaddl.u8 q3, d17, d19 // p6 + p4
+ vaddl.u8 q4, d17, d20 // p6 + p3
+ vadd.i16 q6, q1, q2
+ vadd.i16 q5, q3, q4
+ vaddl.u8 q3, d17, d21 // p6 + p2
+ vadd.i16 q6, q6, q5
+ vaddl.u8 q4, d17, d22 // p6 + p1
+ vaddl.u8 q5, d18, d23 // p5 + p0
+ vadd.i16 q3, q3, q4
+ vaddl.u8 q4, d19, d24 // p4 + q0
+ vadd.i16 q6, q6, q3
+ vadd.i16 q5, q5, q4
+ vaddl.u8 q3, d20, d25 // p3 + q1
+ vadd.i16 q6, q6, q5
+ vsub.i16 q3, q3, q1
+ vaddl.u8 q1, d21, d26 // p2 + q2
+ vrshrn.i16 d0, q6, #4 // out p5
+ vadd.i16 q6, q6, q3 // - (p6 + p6) + (p3 + q1)
+ vsub.i16 q1, q1, q2
+ vaddl.u8 q2, d22, d27 // p1 + q3
+ vaddl.u8 q3, d17, d19 // p6 + p4
+ vrshrn.i16 d1, q6, #4 // out p4
+ vadd.i16 q6, q6, q1 // - (p6 + p5) + (p2 + q2)
+ vsub.i16 q2, q2, q3
+ vaddl.u8 q3, d23, d28 // p0 + q4
+ vaddl.u8 q4, d17, d20 // p6 + p3
+ vrshrn.i16 d2, q6, #4 // out p3
+ vadd.i16 q6, q6, q2 // - (p6 + p4) + (p1 + q3)
+ vsub.i16 q3, q3, q4
+ vaddl.u8 q4, d24, d29 // q0 + q5
+ vaddl.u8 q2, d17, d21 // p6 + p2
+ vrshrn.i16 d3, q6, #4 // out p2
+ vadd.i16 q6, q6, q3 // - (p6 + p3) + (p0 + q4)
+ vsub.i16 q4, q4, q2
+ vaddl.u8 q3, d25, d30 // q1 + q6
+ vaddl.u8 q5, d17, d22 // p6 + p1
+ vrshrn.i16 d4, q6, #4 // out p1
+ vadd.i16 q6, q6, q4 // - (p6 + p2) + (q0 + q5)
+ vsub.i16 q3, q3, q5
+ vaddl.u8 q4, d26, d30 // q2 + q6
+ vbif d0, d18, d15 // out p5
+ vaddl.u8 q5, d18, d23 // p5 + p0
+ vrshrn.i16 d5, q6, #4 // out p0
+ vadd.i16 q6, q6, q3 // - (p6 + p1) + (q1 + q6)
+ vsub.i16 q4, q4, q5
+ vaddl.u8 q5, d27, d30 // q3 + q6
+ vbif d1, d19, d15 // out p4
+ vaddl.u8 q9, d19, d24 // p4 + q0
+ vrshrn.i16 d6, q6, #4 // out q0
+ vadd.i16 q6, q6, q4 // - (p5 + p0) + (q2 + q6)
+ vsub.i16 q5, q5, q9
+ vaddl.u8 q4, d28, d30 // q4 + q6
+ vbif d2, d20, d15 // out p3
+ vaddl.u8 q9, d20, d25 // p3 + q1
+ vrshrn.i16 d7, q6, #4 // out q1
+ vadd.i16 q6, q6, q5 // - (p4 + q0) + (q3 + q6)
+ vsub.i16 q9, q4, q9
+ vaddl.u8 q5, d29, d30 // q5 + q6
+ vbif d3, d21, d15 // out p2
+ vaddl.u8 q10, d21, d26 // p2 + q2
+ vrshrn.i16 d8, q6, #4 // out q2
+ vadd.i16 q6, q6, q9 // - (p3 + q1) + (q4 + q6)
+ vsub.i16 q5, q5, q10
+ vaddl.u8 q9, d30, d30 // q6 + q6
+ vbif d4, d22, d15 // out p1
+ vaddl.u8 q10, d22, d27 // p1 + q3
+ vrshrn.i16 d9, q6, #4 // out q3
+ vadd.i16 q6, q6, q5 // - (p2 + q2) + (q5 + q6)
+ vsub.i16 q9, q9, q10
+ vbif d5, d23, d15 // out p0
+ vrshrn.i16 d10, q6, #4 // out q4
+ vadd.i16 q6, q6, q9 // - (p1 + q3) + (q6 + q6)
+ vrshrn.i16 d11, q6, #4 // out q5
+ vbif d6, d24, d15 // out q0
+ vbif d7, d25, d15 // out q1
+ vbif d8, d26, d15 // out q2
+ vbif d9, d27, d15 // out q3
+ vbif d10, d28, d15 // out q4
+ vbif d11, d29, d15 // out q5
+.endif
+
+ bx lr
+.if \wd == 16
+7:
+ // Return to a shorter epilogue, writing only the inner 6 pixels
+ bx r8
+.endif
+.if \wd >= 8
+8:
+ // Return to a shorter epilogue, writing only the inner 4 pixels
+ bx r9
+.endif
+9:
+ // Return directly without writing back any pixels
+ bx r12
+endfunc
+.endm
+
+loop_filter 16
+loop_filter 8
+loop_filter 6
+loop_filter 4
+
+.macro lpf_8_wd16
+ adr r8, 7f + CONFIG_THUMB
+ adr r9, 8f + CONFIG_THUMB
+ bl lpf_8_wd16_neon
+.endm
+
+.macro lpf_8_wd8
+ adr r9, 8f + CONFIG_THUMB
+ bl lpf_8_wd8_neon
+.endm
+
+.macro lpf_8_wd6
+ bl lpf_8_wd6_neon
+.endm
+
+.macro lpf_8_wd4
+ bl lpf_8_wd4_neon
+.endm
+
+function lpf_v_4_8_neon
+ mov r12, lr
+ sub r10, r0, r1, lsl #1
+ vld1.8 {d22}, [r10, :64], r1 // p1
+ vld1.8 {d24}, [r0, :64], r1 // q0
+ vld1.8 {d23}, [r10, :64], r1 // p0
+ vld1.8 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+
+ lpf_8_wd4
+
+ sub r10, r0, r1, lsl #1
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_4_8_neon
+ mov r12, lr
+ sub r10, r0, #2
+ add r0, r10, r1, lsl #2
+ vld1.32 {d22[0]}, [r10], r1
+ vld1.32 {d22[1]}, [r0], r1
+ vld1.32 {d23[0]}, [r10], r1
+ vld1.32 {d23[1]}, [r0], r1
+ vld1.32 {d24[0]}, [r10], r1
+ vld1.32 {d24[1]}, [r0], r1
+ vld1.32 {d25[0]}, [r10], r1
+ vld1.32 {d25[1]}, [r0], r1
+ add r0, r0, #2
+
+ transpose_4x8b q11, q12, d22, d23, d24, d25
+
+ lpf_8_wd4
+
+ sub r10, r0, r1, lsl #3
+ sub r10, r10, #2
+ transpose_4x8b q11, q12, d22, d23, d24, d25
+ add r0, r10, r1, lsl #2
+
+ vst1.32 {d22[0]}, [r10], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r10], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r10], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r10], r1
+ vst1.32 {d25[1]}, [r0], r1
+ add r0, r0, #2
+ bx r12
+endfunc
+
+function lpf_v_6_8_neon
+ mov r12, lr
+ sub r10, r0, r1, lsl #1
+ sub r10, r10, r1
+ vld1.8 {d21}, [r10, :64], r1 // p2
+ vld1.8 {d24}, [r0, :64], r1 // q0
+ vld1.8 {d22}, [r10, :64], r1 // p1
+ vld1.8 {d25}, [r0, :64], r1 // q1
+ vld1.8 {d23}, [r10, :64], r1 // p0
+ vld1.8 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+
+ lpf_8_wd6
+
+ sub r10, r0, r1, lsl #1
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_6_8_neon
+ mov r12, lr
+ sub r10, r0, #4
+ add r0, r10, r1, lsl #2
+ vld1.8 {d20}, [r10], r1
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d21}, [r10], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d22}, [r10], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d23}, [r10], r1
+ vld1.8 {d27}, [r0], r1
+ add r0, r0, #4
+
+ transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+
+ lpf_8_wd6
+
+ sub r10, r0, r1, lsl #3
+ sub r10, r10, #2
+ transpose_4x8b q11, q12, d22, d23, d24, d25
+ add r0, r10, r1, lsl #2
+
+ vst1.32 {d22[0]}, [r10], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r10], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r10], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r10], r1
+ vst1.32 {d25[1]}, [r0], r1
+ add r0, r0, #2
+ bx r12
+endfunc
+
+function lpf_v_8_8_neon
+ mov r12, lr
+ sub r10, r0, r1, lsl #2
+ vld1.8 {d20}, [r10, :64], r1 // p3
+ vld1.8 {d24}, [r0, :64], r1 // q0
+ vld1.8 {d21}, [r10, :64], r1 // p2
+ vld1.8 {d25}, [r0, :64], r1 // q1
+ vld1.8 {d22}, [r10, :64], r1 // p1
+ vld1.8 {d26}, [r0, :64], r1 // q2
+ vld1.8 {d23}, [r10, :64], r1 // p0
+ vld1.8 {d27}, [r0, :64], r1 // q3
+ sub r0, r0, r1, lsl #2
+
+ lpf_8_wd8
+
+ sub r10, r0, r1, lsl #1
+ sub r10, r10, r1
+ vst1.8 {d21}, [r10, :64], r1 // p2
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+ bx r12
+
+8:
+ sub r10, r0, r1, lsl #1
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_8_8_neon
+ mov r12, lr
+ sub r10, r0, #4
+ add r0, r10, r1, lsl #2
+ vld1.8 {d20}, [r10], r1
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d21}, [r10], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d22}, [r10], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d23}, [r10], r1
+ vld1.8 {d27}, [r0], r1
+ add r0, r0, #4
+
+ transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+
+ lpf_8_wd8
+
+ sub r10, r0, r1, lsl #3
+ sub r10, r10, #4
+ transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+ add r0, r10, r1, lsl #2
+
+ vst1.8 {d20}, [r10], r1
+ vst1.8 {d24}, [r0], r1
+ vst1.8 {d21}, [r10], r1
+ vst1.8 {d25}, [r0], r1
+ vst1.8 {d22}, [r10], r1
+ vst1.8 {d26}, [r0], r1
+ vst1.8 {d23}, [r10], r1
+ vst1.8 {d27}, [r0], r1
+ add r0, r0, #4
+ bx r12
+8:
+ sub r10, r0, r1, lsl #3
+ sub r10, r10, #2
+ transpose_4x8b q11, q12, d22, d23, d24, d25
+ add r0, r10, r1, lsl #2
+
+ vst1.32 {d22[0]}, [r10], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r10], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r10], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r10], r1
+ vst1.32 {d25[1]}, [r0], r1
+ add r0, r0, #2
+ bx r12
+endfunc
+
+function lpf_v_16_8_neon
+ mov r12, lr
+
+ sub r10, r0, r1, lsl #3
+ add r10, r10, r1
+ vld1.8 {d17}, [r10, :64], r1 // p6
+ vld1.8 {d24}, [r0, :64], r1 // q0
+ vld1.8 {d18}, [r10, :64], r1 // p5
+ vld1.8 {d25}, [r0, :64], r1 // q1
+ vld1.8 {d19}, [r10, :64], r1 // p4
+ vld1.8 {d26}, [r0, :64], r1 // q2
+ vld1.8 {d20}, [r10, :64], r1 // p3
+ vld1.8 {d27}, [r0, :64], r1 // q3
+ vld1.8 {d21}, [r10, :64], r1 // p2
+ vld1.8 {d28}, [r0, :64], r1 // q4
+ vld1.8 {d22}, [r10, :64], r1 // p1
+ vld1.8 {d29}, [r0, :64], r1 // q5
+ vld1.8 {d23}, [r10, :64], r1 // p0
+ vld1.8 {d30}, [r0, :64], r1 // q6
+ sub r0, r0, r1, lsl #3
+ add r0, r0, r1
+
+ lpf_8_wd16
+
+ sub r10, r0, r1, lsl #2
+ sub r10, r10, r1, lsl #1
+ vst1.8 {d0}, [r10, :64], r1 // p5
+ vst1.8 {d6}, [r0, :64], r1 // q0
+ vst1.8 {d1}, [r10, :64], r1 // p4
+ vst1.8 {d7}, [r0, :64], r1 // q1
+ vst1.8 {d2}, [r10, :64], r1 // p3
+ vst1.8 {d8}, [r0, :64], r1 // q2
+ vst1.8 {d3}, [r10, :64], r1 // p2
+ vst1.8 {d9}, [r0, :64], r1 // q3
+ vst1.8 {d4}, [r10, :64], r1 // p1
+ vst1.8 {d10}, [r0, :64], r1 // q4
+ vst1.8 {d5}, [r10, :64], r1 // p0
+ vst1.8 {d11}, [r0, :64], r1 // q5
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+ bx r12
+7:
+ sub r10, r0, r1
+ sub r10, r10, r1, lsl #1
+ vst1.8 {d21}, [r10, :64], r1 // p2
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+ bx r12
+
+8:
+ sub r10, r0, r1, lsl #1
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_16_8_neon
+ mov r12, lr
+ sub r10, r0, #8
+ vld1.8 {d16}, [r10, :64], r1
+ vld1.8 {d24}, [r0, :64], r1
+ vld1.8 {d17}, [r10, :64], r1
+ vld1.8 {d25}, [r0, :64], r1
+ vld1.8 {d18}, [r10, :64], r1
+ vld1.8 {d26}, [r0, :64], r1
+ vld1.8 {d19}, [r10, :64], r1
+ vld1.8 {d27}, [r0, :64], r1
+ vld1.8 {d20}, [r10, :64], r1
+ vld1.8 {d28}, [r0, :64], r1
+ vld1.8 {d21}, [r10, :64], r1
+ vld1.8 {d29}, [r0, :64], r1
+ vld1.8 {d22}, [r10, :64], r1
+ vld1.8 {d30}, [r0, :64], r1
+ vld1.8 {d23}, [r10, :64], r1
+ vld1.8 {d31}, [r0, :64], r1
+
+ transpose_8x8b q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23
+ transpose_8x8b q12, q13, q14, q15, d24, d25, d26, d27, d28, d29, d30, d31
+
+ lpf_8_wd16
+
+ sub r0, r0, r1, lsl #3
+ sub r10, r0, #8
+
+ transpose_8x8b q8, q0, q1, q2, d16, d17, d0, d1, d2, d3, d4, d5
+ transpose_8x8b q3, q4, q5, q15, d6, d7, d8, d9, d10, d11, d30, d31
+
+ vst1.8 {d16}, [r10, :64], r1
+ vst1.8 {d6}, [r0, :64], r1
+ vst1.8 {d17}, [r10, :64], r1
+ vst1.8 {d7}, [r0, :64], r1
+ vst1.8 {d0}, [r10, :64], r1
+ vst1.8 {d8}, [r0, :64], r1
+ vst1.8 {d1}, [r10, :64], r1
+ vst1.8 {d9}, [r0, :64], r1
+ vst1.8 {d2}, [r10, :64], r1
+ vst1.8 {d10}, [r0, :64], r1
+ vst1.8 {d3}, [r10, :64], r1
+ vst1.8 {d11}, [r0, :64], r1
+ vst1.8 {d4}, [r10, :64], r1
+ vst1.8 {d30}, [r0, :64], r1
+ vst1.8 {d5}, [r10, :64], r1
+ vst1.8 {d31}, [r0, :64], r1
+ bx r12
+
+7:
+ sub r10, r0, r1, lsl #3
+ sub r10, r10, #4
+ transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+ add r0, r10, r1, lsl #2
+
+ vst1.8 {d20}, [r10], r1
+ vst1.8 {d24}, [r0], r1
+ vst1.8 {d21}, [r10], r1
+ vst1.8 {d25}, [r0], r1
+ vst1.8 {d22}, [r10], r1
+ vst1.8 {d26}, [r0], r1
+ vst1.8 {d23}, [r10], r1
+ vst1.8 {d27}, [r0], r1
+ add r0, r0, #4
+ bx r12
+8:
+ sub r10, r0, r1, lsl #3
+ sub r10, r10, #2
+ transpose_4x8b q11, q12, d22, d23, d24, d25
+ add r0, r10, r1, lsl #2
+
+ vst1.32 {d22[0]}, [r10], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r10], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r10], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r10], r1
+ vst1.32 {d25[1]}, [r0], r1
+ add r0, r0, #2
+ bx r12
+endfunc
+
+// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint32_t *const vmask,
+// const uint8_t (*l)[4], ptrdiff_t b4_stride,
+// const Av1FilterLUT *lut, const int w)
+
+.macro lpf_func dir, type
+function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [r2] // vmask[0], vmask[1]
+.ifc \type, y
+ ldr r2, [r2, #8] // vmask[2]
+.endif
+ add r5, r5, #128 // Move to sharp part of lut
+.ifc \type, y
+ orr r7, r7, r2 // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+ sub r4, r3, r4, lsl #2
+.else
+ sub r3, r3, #4
+ lsl r4, r4, #2
+.endif
+ orr r6, r6, r7 // vmask[0] |= vmask[1]
+
+1:
+ tst r6, #0x03
+.ifc \dir, v
+ vld1.8 {d0}, [r4]!
+ vld1.8 {d1}, [r3]!
+.else
+ vld2.32 {d0[0], d1[0]}, [r3], r4
+ vld2.32 {d0[1], d1[1]}, [r3], r4
+.endif
+ beq 7f // if (!(vm & bits)) continue;
+
+ vld1.8 {d5[]}, [r5] // sharp[0]
+ add r5, r5, #8
+ vmov.i32 d2, #0xff
+ vdup.32 d13, r6 // vmask[0]
+
+ vand d0, d0, d2 // Keep only lowest byte in each 32 bit word
+ vand d1, d1, d2
+ vtst.8 d3, d1, d2 // Check for nonzero values in l[0][0]
+ vmov.i8 d4, #1
+ vld1.8 {d6[]}, [r5] // sharp[1]
+ sub r5, r5, #8
+ vbif d1, d0, d3 // if (!l[0][0]) L = l[offset][0]
+ vtst.32 d2, d1, d2 // L != 0
+ vmul.i32 d1, d1, d4 // L
+.ifc \type, y
+ vdup.32 d15, r2 // vmask[2]
+.endif
+ vdup.32 d14, r7 // vmask[1]
+ vmov r10, r11, d2
+ orrs r10, r10, r11
+ beq 7f // if (!L) continue;
+ vneg.s8 d5, d5 // -sharp[0]
+ movrel_local r10, word_12
+ vshr.u8 d12, d1, #4 // H
+ vld1.32 {d16}, [r10, :64]
+ vshl.s8 d3, d1, d5 // L >> sharp[0]
+.ifc \type, y
+ vtst.32 d15, d15, d16 // if (vmask[2] & bits)
+.endif
+ vmov.i8 d7, #2
+ vmin.u8 d3, d3, d6 // imin(L >> sharp[0], sharp[1])
+ vadd.i8 d0, d1, d7 // L + 2
+ vmax.u8 d11, d3, d4 // imax(imin(), 1) = limit = I
+ vadd.u8 d0, d0, d0 // 2*(L + 2)
+ vtst.32 d14, d14, d16 // if (vmask[1] & bits)
+ vadd.i8 d10, d0, d11 // 2*(L + 2) + limit = E
+ vtst.32 d13, d13, d16 // if (vmask[0] & bits)
+ vand d13, d13, d2 // vmask[0] &= L != 0
+
+.ifc \type, y
+ tst r2, #0x03
+ beq 2f
+ // wd16
+ bl lpf_\dir\()_16_8_neon
+ b 8f
+2:
+.endif
+ tst r7, #0x03
+ beq 3f
+.ifc \type, y
+ // wd8
+ bl lpf_\dir\()_8_8_neon
+.else
+ // wd6
+ bl lpf_\dir\()_6_8_neon
+.endif
+ b 8f
+3:
+ // wd4
+ bl lpf_\dir\()_4_8_neon
+.ifc \dir, h
+ b 8f
+7:
+ // For dir h, the functions above increment r0.
+ // If the whole function is skipped, increment it here instead.
+ add r0, r0, r1, lsl #3
+.else
+7:
+.endif
+8:
+ lsrs r6, r6, #2 // vmask[0] >>= 2
+ lsr r7, r7, #2 // vmask[1] >>= 2
+.ifc \type, y
+ lsr r2, r2, #2 // vmask[2] >>= 2
+.endif
+.ifc \dir, v
+ add r0, r0, #8
+.else
+ // For dir h, r0 is returned incremented
+.endif
+ bne 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+lpf_func v, y
+lpf_func h, y
+lpf_func v, uv
+lpf_func h, uv
+
+const word_12, align=4
+ .word 1, 2
+endconst
diff --git a/third_party/dav1d/src/arm/32/loopfilter16.S b/third_party/dav1d/src/arm/32/loopfilter16.S
new file mode 100644
index 0000000000..d7daf21f1a
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/loopfilter16.S
@@ -0,0 +1,859 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+.macro loop_filter wd
+function lpf_4_wd\wd\()_neon
+ vabd.u16 d0, d22, d23 // abs(p1 - p0)
+ vabd.u16 d1, d25, d24 // abs(q1 - q0)
+ vabd.u16 d2, d23, d24 // abs(p0 - q0)
+ vabd.u16 d3, d22, d25 // abs(p1 - q1)
+.if \wd >= 6
+ vabd.u16 d4, d21, d22 // abs(p2 - p1)
+ vabd.u16 d5, d26, d25 // abs(q2 - q1)
+.endif
+.if \wd >= 8
+ vabd.u16 d6, d20, d21 // abs(p3 - p2)
+ vabd.u16 d7, d27, d26 // abs(q3 - q2)
+.endif
+.if \wd >= 6
+ vmax.u16 d4, d4, d5
+.endif
+ vqadd.u16 d2, d2, d2 // abs(p0 - q0) * 2
+.if \wd >= 8
+ vmax.u16 d6, d6, d7
+.endif
+ vshr.u16 d3, d3, #1
+.if \wd >= 8
+ vmax.u16 d4, d4, d6
+.endif
+ vmax.u16 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0))
+ vqadd.u16 d2, d2, d3 // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
+.if \wd >= 6
+ vmax.u16 d4, d0, d4
+ vcge.u16 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
+.else
+ vcge.u16 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+ vcge.u16 d2, d10, d2 // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
+ vand d1, d1, d2 // fm && wd >= 4 (implicit)
+.if \wd >= 6
+ vmov d14, d1 // fm && wd > 4 (implicit)
+.endif
+.if \wd >= 16
+ vmov d15, d1 // fm && wd == 16 (implicit)
+.endif
+
+ vmov r10, r11, d1
+ orrs r10, r10, r11
+ beq 9f // if (!fm || wd < 4) return;
+
+.if \wd >= 6
+ vmov.i16 d10, #1
+ vabd.u16 d2, d21, d23 // abs(p2 - p0)
+ vabd.u16 d3, d22, d23 // abs(p1 - p0)
+ vabd.u16 d4, d25, d24 // abs(q1 - q0)
+ vabd.u16 d5, d26, d24 // abs(q2 - q0)
+ vdup.16 d9, r9 // bitdepth_min_8
+.if \wd >= 8
+ vabd.u16 d6, d20, d23 // abs(p3 - p0)
+ vabd.u16 d7, d27, d24 // abs(q3 - q0)
+.endif
+ vmax.u16 d2, d2, d3
+ vmax.u16 d4, d4, d5
+.if \wd >= 8
+ vmax.u16 d6, d6, d7
+.endif
+ vmax.u16 d2, d2, d4
+ vshl.u16 d10, d10, d9 // F = 1 << bitdepth_min_8
+.if \wd >= 8
+ vmax.u16 d2, d2, d6
+.endif
+
+.if \wd == 16
+ vabd.u16 d3, d17, d23 // abs(p6 - p0)
+ vabd.u16 d4, d18, d23 // abs(p5 - p0)
+ vabd.u16 d5, d19, d23 // abs(p4 - p0)
+.endif
+ vcge.u16 d2, d10, d2 // flat8in
+.if \wd == 16
+ vabd.u16 d6, d28, d24 // abs(q4 - q0)
+ vabd.u16 d7, d29, d24 // abs(q5 - q0)
+ vabd.u16 d8, d30, d24 // abs(q6 - q0)
+.endif
+ vand d14, d2, d14 // flat8in && fm && wd > 4
+ vbic d1, d1, d14 // fm && wd >= 4 && !flat8in
+.if \wd == 16
+ vmax.u16 d3, d3, d4
+ vmax.u16 d5, d5, d6
+.endif
+ vmov r10, r11, d1
+.if \wd == 16
+ vmax.u16 d7, d7, d8
+ vmax.u16 d3, d3, d5
+ vmax.u16 d3, d3, d7
+ vcge.u16 d3, d10, d3 // flat8out
+.endif
+ orrs r10, r10, r11
+.if \wd == 16
+ vand d15, d15, d3 // flat8out && fm && wd == 16
+ vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16
+ vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out
+.endif
+ beq 1f // skip wd == 4 case
+.endif
+
+ vdup.16 d3, r8 // bitdepth_max
+ vsub.u16 d2, d22, d25 // p1 - q1
+ vshr.u16 d3, d3, #1 // 128 << bitdepth_min_8 - 1
+ vcgt.u16 d0, d0, d12 // hev
+ vmvn d9, d3 // - 128 * (1 << bitdepth_min_8)
+ vmin.s16 d2, d2, d3 // iclip_diff(p1 - q1)
+ vmax.s16 d2, d2, d9 // iclip_diff(p1 - q1)
+ vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1)
+ vsub.u16 d2, d24, d23
+ vmov.i16 d6, #3
+ vbic d0, d1, d0 // (fm && wd >= 4 && !hev)
+ vmul.i16 d2, d2, d6
+ vmov.i16 d7, #4
+ vadd.i16 d2, d2, d4
+ vmin.s16 d2, d2, d3 // f = iclip_diff()
+ vmax.s16 d2, d2, d9 // f = iclip_diff()
+ vqadd.s16 d4, d7, d2 // f + 4
+ vqadd.s16 d5, d6, d2 // f + 3
+ vmin.s16 d4, d4, d3 // imin(f + 4, 128 << bitdepth_min_8 - 1)
+ vmin.s16 d5, d5, d3 // imin(f + 3, 128 << bitdepth_min_8 - 1)
+ vshr.s16 d4, d4, #3 // f1
+ vshr.s16 d5, d5, #3 // f2
+ vmov.i16 d9, #0
+ vdup.16 d3, r8 // bitdepth_max
+ vqadd.s16 d2, d23, d5 // p0 + f2
+ vqsub.s16 d6, d24, d4 // q0 - f1
+ vrshr.s16 d4, d4, #1 // (f1 + 1) >> 1
+ vmin.s16 d2, d2, d3 // out p0 = iclip_pixel()
+ vmin.s16 d6, d6, d3 // out q0 = iclip_pixel()
+ vmax.s16 d2, d2, d9 // out p0 = iclip_pixel()
+ vmax.s16 d6, d6, d9 // out q0 = iclip_pixel()
+ vbit d23, d2, d1 // if (fm && wd >= 4)
+ vbit d24, d6, d1 // if (fm && wd >= 4)
+ vqadd.s16 d2, d22, d4 // p1 + f
+ vqsub.s16 d6, d25, d4 // q1 - f
+ vmin.s16 d2, d2, d3 // out p1 = iclip_pixel()
+ vmin.s16 d6, d6, d3 // out q1 = iclip_pixel()
+ vmax.s16 d2, d2, d9 // out p1 = iclip_pixel()
+ vmax.s16 d6, d6, d9 // out q1 = iclip_pixel()
+ vbit d22, d2, d0 // if (fm && wd >= 4 && !hev)
+ vbit d25, d6, d0 // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+ beq 2f // skip if there's no flat8in
+
+ vadd.i16 d0, d21, d21 // p2 * 2
+ vadd.i16 d2, d21, d22 // p2 + p1
+ vadd.i16 d4, d22, d23 // p1 + p0
+ vadd.i16 d6, d23, d24 // p0 + q0
+ vadd.i16 d8, d0, d2
+ vadd.i16 d10, d4, d6
+ vadd.i16 d12, d24, d25 // q0 + q1
+ vadd.i16 d8, d8, d10
+ vsub.i16 d12, d12, d0
+ vadd.i16 d10, d25, d26 // q1 + q2
+ vrshr.u16 d0, d8, #3 // out p1
+
+ vadd.i16 d8, d8, d12
+ vsub.i16 d10, d10, d2
+ vadd.i16 d12, d26, d26 // q2 + q2
+ vrshr.u16 d1, d8, #3 // out p0
+
+ vadd.i16 d8, d8, d10
+ vsub.i16 d12, d12, d4
+ vrshr.u16 d2, d8, #3 // out q0
+
+ vbit d22, d0, d14 // p1 if (flat8in)
+ vadd.i16 d8, d8, d12
+ vbit d23, d1, d14 // p0 if (flat8in)
+ vrshr.u16 d3, d8, #3 // out q1
+ vbit d24, d2, d14 // q0 if (flat8in)
+ vbit d25, d3, d14 // q1 if (flat8in)
+.elseif \wd >= 8
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+.if \wd == 8
+ beq 8f // skip if there's no flat8in
+.else
+ beq 2f // skip if there's no flat8in
+.endif
+
+ vadd.i16 d0, d20, d21 // p3 + p2
+ vadd.i16 d2, d22, d25 // p1 + q1
+ vadd.i16 d4, d20, d22 // p3 + p1
+ vadd.i16 d6, d23, d26 // p0 + q2
+ vadd.i16 d8, d0, d0 // 2 * (p3 + p2)
+ vadd.i16 d9, d23, d24 // p0 + q0
+ vadd.i16 d8, d8, d4 // + p3 + p1
+ vsub.i16 d2, d2, d0 // p1 + q1 - p3 - p2
+ vadd.i16 d8, d8, d9 // + p0 + q0
+ vsub.i16 d6, d6, d4 // p0 + q2 - p3 - p1
+ vrshr.u16 d10, d8, #3 // out p2
+
+ vadd.i16 d8, d8, d2
+ vadd.i16 d0, d20, d23 // p3 + p0
+ vadd.i16 d2, d24, d27 // q0 + q3
+ vrshr.u16 d11, d8, #3 // out p1
+
+ vadd.i16 d8, d8, d6
+ vsub.i16 d2, d2, d0 // q0 + q3 - p3 - p0
+ vadd.i16 d4, d21, d24 // p2 + q0
+ vadd.i16 d6, d25, d27 // q1 + q3
+ vrshr.u16 d12, d8, #3 // out p0
+
+ vadd.i16 d8, d8, d2
+ vsub.i16 d6, d6, d4 // q1 + q3 - p2 - q0
+ vadd.i16 d0, d22, d25 // p1 + q1
+ vadd.i16 d2, d26, d27 // q2 + q3
+ vrshr.u16 d13, d8, #3 // out q0
+
+ vadd.i16 d8, d8, d6
+ vsub.i16 d2, d2, d0 // q2 + q3 - p1 - q1
+ vrshr.u16 d0, d8, #3 // out q1
+
+ vadd.i16 d8, d8, d2
+
+ vbit d21, d10, d14
+ vbit d22, d11, d14
+ vbit d23, d12, d14
+ vrshr.u16 d1, d8, #3 // out q2
+ vbit d24, d13, d14
+ vbit d25, d0, d14
+ vbit d26, d1, d14
+.endif
+2:
+.if \wd == 16
+ vmov r10, r11, d15
+ orrs r10, r10, r11
+ bne 1f // check if flat8out is needed
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+ beq 8f // if there was no flat8in, just write the inner 4 pixels
+ b 7f // if flat8in was used, write the inner 6 pixels
+1:
+
+ vadd.i16 d2, d17, d17 // p6 + p6
+ vadd.i16 d4, d17, d18 // p6 + p5
+ vadd.i16 d6, d17, d19 // p6 + p4
+ vadd.i16 d8, d17, d20 // p6 + p3
+ vadd.i16 d12, d2, d4
+ vadd.i16 d10, d6, d8
+ vadd.i16 d6, d17, d21 // p6 + p2
+ vadd.i16 d12, d12, d10
+ vadd.i16 d8, d17, d22 // p6 + p1
+ vadd.i16 d10, d18, d23 // p5 + p0
+ vadd.i16 d6, d6, d8
+ vadd.i16 d8, d19, d24 // p4 + q0
+ vadd.i16 d12, d12, d6
+ vadd.i16 d10, d10, d8
+ vadd.i16 d6, d20, d25 // p3 + q1
+ vadd.i16 d12, d12, d10
+ vsub.i16 d6, d6, d2
+ vadd.i16 d2, d21, d26 // p2 + q2
+ vrshr.u16 d0, d12, #4 // out p5
+ vadd.i16 d12, d12, d6 // - (p6 + p6) + (p3 + q1)
+ vsub.i16 d2, d2, d4
+ vadd.i16 d4, d22, d27 // p1 + q3
+ vadd.i16 d6, d17, d19 // p6 + p4
+ vrshr.u16 d1, d12, #4 // out p4
+ vadd.i16 d12, d12, d2 // - (p6 + p5) + (p2 + q2)
+ vsub.i16 d4, d4, d6
+ vadd.i16 d6, d23, d28 // p0 + q4
+ vadd.i16 d8, d17, d20 // p6 + p3
+ vrshr.u16 d2, d12, #4 // out p3
+ vadd.i16 d12, d12, d4 // - (p6 + p4) + (p1 + q3)
+ vsub.i16 d6, d6, d8
+ vadd.i16 d8, d24, d29 // q0 + q5
+ vadd.i16 d4, d17, d21 // p6 + p2
+ vrshr.u16 d3, d12, #4 // out p2
+ vadd.i16 d12, d12, d6 // - (p6 + p3) + (p0 + q4)
+ vsub.i16 d8, d8, d4
+ vadd.i16 d6, d25, d30 // q1 + q6
+ vadd.i16 d10, d17, d22 // p6 + p1
+ vrshr.u16 d4, d12, #4 // out p1
+ vadd.i16 d12, d12, d8 // - (p6 + p2) + (q0 + q5)
+ vsub.i16 d6, d6, d10
+ vadd.i16 d8, d26, d30 // q2 + q6
+ vbif d0, d18, d15 // out p5
+ vadd.i16 d10, d18, d23 // p5 + p0
+ vrshr.u16 d5, d12, #4 // out p0
+ vadd.i16 d12, d12, d6 // - (p6 + p1) + (q1 + q6)
+ vsub.i16 d8, d8, d10
+ vadd.i16 d10, d27, d30 // q3 + q6
+ vbif d1, d19, d15 // out p4
+ vadd.i16 d18, d19, d24 // p4 + q0
+ vrshr.u16 d6, d12, #4 // out q0
+ vadd.i16 d12, d12, d8 // - (p5 + p0) + (q2 + q6)
+ vsub.i16 d10, d10, d18
+ vadd.i16 d8, d28, d30 // q4 + q6
+ vbif d2, d20, d15 // out p3
+ vadd.i16 d18, d20, d25 // p3 + q1
+ vrshr.u16 d7, d12, #4 // out q1
+ vadd.i16 d12, d12, d10 // - (p4 + q0) + (q3 + q6)
+ vsub.i16 d18, d8, d18
+ vadd.i16 d10, d29, d30 // q5 + q6
+ vbif d3, d21, d15 // out p2
+ vadd.i16 d20, d21, d26 // p2 + q2
+ vrshr.u16 d8, d12, #4 // out q2
+ vadd.i16 d12, d12, d18 // - (p3 + q1) + (q4 + q6)
+ vsub.i16 d10, d10, d20
+ vadd.i16 d18, d30, d30 // q6 + q6
+ vbif d4, d22, d15 // out p1
+ vadd.i16 d20, d22, d27 // p1 + q3
+ vrshr.u16 d9, d12, #4 // out q3
+ vadd.i16 d12, d12, d10 // - (p2 + q2) + (q5 + q6)
+ vsub.i16 d18, d18, d20
+ vbif d5, d23, d15 // out p0
+ vrshr.u16 d10, d12, #4 // out q4
+ vadd.i16 d12, d12, d18 // - (p1 + q3) + (q6 + q6)
+ vrshr.u16 d11, d12, #4 // out q5
+ vbif d6, d24, d15 // out q0
+ vbif d7, d25, d15 // out q1
+ vbif d8, d26, d15 // out q2
+ vbif d9, d27, d15 // out q3
+ vbif d10, d28, d15 // out q4
+ vbif d11, d29, d15 // out q5
+.endif
+
+ bx lr
+.if \wd == 16
+7:
+ // Return to a shorter epilogue, writing only the inner 6 pixels
+ bx r6
+.endif
+.if \wd >= 8
+8:
+ // Return to a shorter epilogue, writing only the inner 4 pixels
+ bx r7
+.endif
+9:
+ // Return directly without writing back any pixels
+ bx r12
+endfunc
+.endm
+
+loop_filter 16
+loop_filter 8
+loop_filter 6
+loop_filter 4
+
+.macro lpf_4_wd16
+ adr r6, 7f + CONFIG_THUMB
+ adr r7, 8f + CONFIG_THUMB
+ bl lpf_4_wd16_neon
+.endm
+
+.macro lpf_4_wd8
+ adr r7, 8f + CONFIG_THUMB
+ bl lpf_4_wd8_neon
+.endm
+
+.macro lpf_4_wd6
+ bl lpf_4_wd6_neon
+.endm
+
+.macro lpf_4_wd4
+ bl lpf_4_wd4_neon
+.endm
+
+function lpf_v_4_4_neon
+ mov r12, lr
+ sub r10, r0, r1, lsl #1
+ vld1.16 {d22}, [r10, :64], r1 // p1
+ vld1.16 {d24}, [r0, :64], r1 // q0
+ vld1.16 {d23}, [r10, :64], r1 // p0
+ vld1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+
+ lpf_4_wd4
+
+ sub r10, r0, r1, lsl #1
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_4_4_neon
+ mov r12, lr
+ sub r10, r0, #4
+ add r0, r10, r1, lsl #1
+ vld1.16 {d22}, [r10], r1
+ vld1.16 {d24}, [r0], r1
+ vld1.16 {d23}, [r10], r1
+ vld1.16 {d25}, [r0], r1
+ add r0, r0, #4
+
+ transpose_4x4h q11, q12, d22, d23, d24, d25
+
+ lpf_4_wd4
+
+ sub r10, r0, r1, lsl #2
+ sub r10, r10, #4
+ transpose_4x4h q11, q12, d22, d23, d24, d25
+ add r0, r10, r1, lsl #1
+
+ vst1.16 {d22}, [r10], r1
+ vst1.16 {d24}, [r0], r1
+ vst1.16 {d23}, [r10], r1
+ vst1.16 {d25}, [r0], r1
+ add r0, r0, #4
+ bx r12
+endfunc
+
+function lpf_v_6_4_neon
+ mov r12, lr
+ sub r10, r0, r1, lsl #1
+ sub r10, r10, r1
+ vld1.16 {d21}, [r10, :64], r1 // p2
+ vld1.16 {d24}, [r0, :64], r1 // q0
+ vld1.16 {d22}, [r10, :64], r1 // p1
+ vld1.16 {d25}, [r0, :64], r1 // q1
+ vld1.16 {d23}, [r10, :64], r1 // p0
+ vld1.16 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+
+ lpf_4_wd6
+
+ sub r10, r0, r1, lsl #1
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_6_4_neon
+ mov r12, lr
+ sub r10, r0, #8
+ vld1.16 {d20}, [r10, :64], r1
+ vld1.16 {d24}, [r0, :64], r1
+ vld1.16 {d21}, [r10, :64], r1
+ vld1.16 {d25}, [r0, :64], r1
+ vld1.16 {d22}, [r10, :64], r1
+ vld1.16 {d26}, [r0, :64], r1
+ vld1.16 {d23}, [r10, :64], r1
+ vld1.16 {d27}, [r0, :64], r1
+
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+
+ lpf_4_wd6
+
+ sub r0, r0, #4
+ transpose_4x4h q11, q12, d22, d23, d24, d25
+ sub r10, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+
+ vst1.16 {d22}, [r10], r1
+ vst1.16 {d24}, [r0], r1
+ vst1.16 {d23}, [r10], r1
+ vst1.16 {d25}, [r0], r1
+ add r0, r0, #4
+ bx r12
+endfunc
+
+function lpf_v_8_4_neon
+ mov r12, lr
+ sub r10, r0, r1, lsl #2
+ vld1.16 {d20}, [r10, :64], r1 // p3
+ vld1.16 {d24}, [r0, :64], r1 // q0
+ vld1.16 {d21}, [r10, :64], r1 // p2
+ vld1.16 {d25}, [r0, :64], r1 // q1
+ vld1.16 {d22}, [r10, :64], r1 // p1
+ vld1.16 {d26}, [r0, :64], r1 // q2
+ vld1.16 {d23}, [r10, :64], r1 // p0
+ vld1.16 {d27}, [r0, :64], r1 // q3
+ sub r0, r0, r1, lsl #2
+
+ lpf_4_wd8
+
+ sub r10, r0, r1, lsl #1
+ sub r10, r10, r1
+ vst1.16 {d21}, [r10, :64], r1 // p2
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+ bx r12
+
+8:
+ sub r10, r0, r1, lsl #1
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_8_4_neon
+ mov r12, lr
+ sub r10, r0, #8
+ vld1.16 {d20}, [r10, :64], r1
+ vld1.16 {d24}, [r0, :64], r1
+ vld1.16 {d21}, [r10, :64], r1
+ vld1.16 {d25}, [r0, :64], r1
+ vld1.16 {d22}, [r10, :64], r1
+ vld1.16 {d26}, [r0, :64], r1
+ vld1.16 {d23}, [r10, :64], r1
+ vld1.16 {d27}, [r0, :64], r1
+
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+
+ lpf_4_wd8
+
+ sub r0, r0, r1, lsl #2
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ sub r10, r0, #8
+
+ vst1.16 {d20}, [r10, :64], r1
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d21}, [r10, :64], r1
+ vst1.16 {d25}, [r0, :64], r1
+ vst1.16 {d22}, [r10, :64], r1
+ vst1.16 {d26}, [r0, :64], r1
+ vst1.16 {d23}, [r10, :64], r1
+ vst1.16 {d27}, [r0, :64], r1
+ bx r12
+8:
+ sub r0, r0, #4
+ transpose_4x4h q11, q12, d22, d23, d24, d25
+ sub r10, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+
+ vst1.16 {d22}, [r10], r1
+ vst1.16 {d24}, [r0], r1
+ vst1.16 {d23}, [r10], r1
+ vst1.16 {d25}, [r0], r1
+ add r0, r0, #4
+ bx r12
+endfunc
+
+function lpf_v_16_4_neon
+ mov r12, lr
+
+ sub r10, r0, r1, lsl #3
+ add r10, r10, r1
+ vld1.16 {d17}, [r10, :64], r1 // p6
+ vld1.16 {d24}, [r0, :64], r1 // q0
+ vld1.16 {d18}, [r10, :64], r1 // p5
+ vld1.16 {d25}, [r0, :64], r1 // q1
+ vld1.16 {d19}, [r10, :64], r1 // p4
+ vld1.16 {d26}, [r0, :64], r1 // q2
+ vld1.16 {d20}, [r10, :64], r1 // p3
+ vld1.16 {d27}, [r0, :64], r1 // q3
+ vld1.16 {d21}, [r10, :64], r1 // p2
+ vld1.16 {d28}, [r0, :64], r1 // q4
+ vld1.16 {d22}, [r10, :64], r1 // p1
+ vld1.16 {d29}, [r0, :64], r1 // q5
+ vld1.16 {d23}, [r10, :64], r1 // p0
+ vld1.16 {d30}, [r0, :64], r1 // q6
+ sub r0, r0, r1, lsl #3
+ add r0, r0, r1
+
+ lpf_4_wd16
+
+ sub r10, r0, r1, lsl #2
+ sub r10, r10, r1, lsl #1
+ vst1.16 {d0}, [r10, :64], r1 // p5
+ vst1.16 {d6}, [r0, :64], r1 // q0
+ vst1.16 {d1}, [r10, :64], r1 // p4
+ vst1.16 {d7}, [r0, :64], r1 // q1
+ vst1.16 {d2}, [r10, :64], r1 // p3
+ vst1.16 {d8}, [r0, :64], r1 // q2
+ vst1.16 {d3}, [r10, :64], r1 // p2
+ vst1.16 {d9}, [r0, :64], r1 // q3
+ vst1.16 {d4}, [r10, :64], r1 // p1
+ vst1.16 {d10}, [r0, :64], r1 // q4
+ vst1.16 {d5}, [r10, :64], r1 // p0
+ vst1.16 {d11}, [r0, :64], r1 // q5
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+ bx r12
+7:
+ sub r10, r0, r1
+ sub r10, r10, r1, lsl #1
+ vst1.16 {d21}, [r10, :64], r1 // p2
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+ bx r12
+
+8:
+ sub r10, r0, r1, lsl #1
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_16_4_neon
+ mov r12, lr
+ sub r10, r0, #16
+ sub r0, r0, #8
+ vld1.16 {d16}, [r10, :64], r1
+ vld1.16 {d20}, [r0, :64], r1
+ vld1.16 {d17}, [r10, :64], r1
+ vld1.16 {d21}, [r0, :64], r1
+ vld1.16 {d18}, [r10, :64], r1
+ vld1.16 {d22}, [r0, :64], r1
+ vld1.16 {d19}, [r10, :64], r1
+ vld1.16 {d23}, [r0, :64], r1
+ sub r10, r10, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ add r10, r10, #16
+ add r0, r0, #16
+ vld1.16 {d24}, [r10, :64], r1
+ vld1.16 {d28}, [r0, :64], r1
+ vld1.16 {d25}, [r10, :64], r1
+ vld1.16 {d29}, [r0, :64], r1
+ vld1.16 {d26}, [r10, :64], r1
+ vld1.16 {d30}, [r0, :64], r1
+ vld1.16 {d27}, [r10, :64], r1
+ vld1.16 {d31}, [r0, :64], r1
+ sub r0, r0, #8
+
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ transpose_4x4h q14, q15, d28, d29, d30, d31
+
+ lpf_4_wd16
+
+ sub r0, r0, r1, lsl #2
+ transpose_4x4h q8, q0, d16, d17, d0, d1
+ transpose_4x4h q1, q2, d2, d3, d4, d5
+ transpose_4x4h q3, q4, d6, d7, d8, d9
+ transpose_4x4h q5, q15, d10, d11, d30, d31
+ sub r10, r0, #16
+ sub r0, r0, #8
+
+ vst1.16 {d16}, [r10, :64], r1
+ vst1.16 {d2}, [r0, :64], r1
+ vst1.16 {d17}, [r10, :64], r1
+ vst1.16 {d3}, [r0, :64], r1
+ vst1.16 {d0}, [r10, :64], r1
+ vst1.16 {d4}, [r0, :64], r1
+ vst1.16 {d1}, [r10, :64], r1
+ vst1.16 {d5}, [r0, :64], r1
+ sub r10, r10, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ add r10, r10, #16
+ add r0, r0, #16
+ vst1.16 {d6}, [r10, :64], r1
+ vst1.16 {d10}, [r0, :64], r1
+ vst1.16 {d7}, [r10, :64], r1
+ vst1.16 {d11}, [r0, :64], r1
+ vst1.16 {d8}, [r10, :64], r1
+ vst1.16 {d30}, [r0, :64], r1
+ vst1.16 {d9}, [r10, :64], r1
+ vst1.16 {d31}, [r0, :64], r1
+ sub r0, r0, #8
+
+ bx r12
+
+7:
+ sub r0, r0, r1, lsl #2
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ sub r10, r0, #8
+
+ vst1.16 {d20}, [r10, :64], r1
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d21}, [r10, :64], r1
+ vst1.16 {d25}, [r0, :64], r1
+ vst1.16 {d22}, [r10, :64], r1
+ vst1.16 {d26}, [r0, :64], r1
+ vst1.16 {d23}, [r10, :64], r1
+ vst1.16 {d27}, [r0, :64], r1
+ bx r12
+8:
+ sub r0, r0, #4
+ transpose_4x4h q11, q12, d22, d23, d24, d25
+ sub r10, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+
+ vst1.16 {d22}, [r10], r1
+ vst1.16 {d24}, [r0], r1
+ vst1.16 {d23}, [r10], r1
+ vst1.16 {d25}, [r0], r1
+ add r0, r0, #4
+ bx r12
+endfunc
+
+// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint32_t *const vmask,
+// const uint8_t (*l)[4], ptrdiff_t b4_stride,
+// const Av1FilterLUT *lut, const int w,
+// const int bitdepth_max)
+
+.macro lpf_func dir, type
+function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldr r8, [sp, #112] // bitdepth_max; the 'w' parameter isn't loaded
+ sub sp, sp, #8
+ clz r9, r8
+ rsb r9, r9, #24 // bitdepth_min_8
+ ldrd r6, r7, [r2] // vmask[0], vmask[1]
+.ifc \type, y
+ ldr r2, [r2, #8] // vmask[2]
+.endif
+ add r5, r5, #128 // Move to sharp part of lut
+.ifc \type, y
+ orr r7, r7, r2 // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+ sub r4, r3, r4, lsl #2
+.else
+ sub r3, r3, #4
+ lsl r4, r4, #2
+.endif
+ orr r6, r6, r7 // vmask[0] |= vmask[1]
+
+1:
+ tst r6, #0x01
+ strd r6, r7, [sp]
+.ifc \dir, v
+ ldrb r10, [r4], #4
+ ldrb r11, [r3], #4
+.else
+ ldrb r10, [r3]
+ ldrb r11, [r3, #4]
+ add r3, r3, r4
+.endif
+ beq 7f // if (!(vm & bits)) continue;
+
+ orrs r12, r10, r11
+ vdup.16 d31, r9 // bitdepth_min_8
+ beq 7f // if (!(l[0][0] | l[offset][0])) continue;
+ cmp r11, #0 // Check for nonzero values in l[0][0]
+ ldrb r6, [r5], #8 // sharp[0]
+ it eq
+ moveq r11, r10 // if (!l[0][0]) L = l[offset][0]
+ ldrb r12, [r5] // sharp[1]
+ lsr r6, r11, r6 // L >> sharp[0]
+ sub r5, r5, #8
+ cmp r12, r6
+ lsr r10, r11, #4 // H
+ add r11, r11, #2 // L + 2
+ it lt
+ movlt r6, r12 // imin(L >> sharp[0], sharp[1])
+ add r11, r11, r11 // 2*(L + 2)
+ cmp r6, #1
+ lsl r10, r10, r9 // H << bitdepth_min_8
+ it lt
+ movlt r6, #1 // imax(imin(), 1) = limit = I
+ vdup.16 d12, r10 // H << bitdepth_min_8
+ add r11, r11, r6 // 2*(L + 2) + limit = E
+ lsl r6, r6, r9 // I << bitdepth_min_8
+ lsl r11, r11, r9 // E << bitdepth_min_8
+ vdup.16 d11, r6 // I << bitdepth_min_8
+ vdup.16 d10, r11 // E << bitdepth_min_8
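+
+ // In C terms, the thresholds derived above are (restating the inline
+ // comments; illustrative only):
+ //   L = l[0][0] ? l[0][0] : l[offset][0]
+ //   H = L >> 4
+ //   I = imax(imin(L >> sharp[0], sharp[1]), 1)   // limit
+ //   E = 2 * (L + 2) + I
+ // with H, I and E each left-shifted by bitdepth_min_8.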
+
+.ifc \type, y
+ tst r2, #0x01
+ beq 2f
+ // wd16
+ bl lpf_\dir\()_16_4_neon
+ b 8f
+2:
+.endif
+ tst r7, #0x01
+ beq 3f
+.ifc \type, y
+ // wd8
+ bl lpf_\dir\()_8_4_neon
+.else
+ // wd6
+ bl lpf_\dir\()_6_4_neon
+.endif
+ b 8f
+3:
+ // wd4
+ bl lpf_\dir\()_4_4_neon
+.ifc \dir, h
+ b 8f
+7:
+ // For dir h, the functions above increment r0.
+ // If the whole function is skipped, increment it here instead.
+ add r0, r0, r1, lsl #2
+.else
+7:
+.endif
+8:
+ ldrd r6, r7, [sp]
+.ifc \type, y
+ lsr r2, r2, #1 // vmask[2] >>= 1
+.endif
+.ifc \dir, v
+ add r0, r0, #8
+.else
+ // For dir h, r0 is returned incremented
+.endif
+ lsrs r6, r6, #1 // vmask[0] >>= 1
+ lsr r7, r7, #1 // vmask[1] >>= 1
+ bne 1b
+
+ add sp, sp, #8
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+lpf_func v, y
+lpf_func h, y
+lpf_func v, uv
+lpf_func h, uv
diff --git a/third_party/dav1d/src/arm/32/looprestoration.S b/third_party/dav1d/src/arm/32/looprestoration.S
new file mode 100644
index 0000000000..be5c658d6d
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/looprestoration.S
@@ -0,0 +1,791 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+const right_ext_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+right_ext_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
+// const pixel *src, ptrdiff_t stride,
+// const int16_t fh[8], intptr_t w,
+// int h, enum LrEdgeFlags edges);
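+//
+// Rough C sketch of what one output row of this pass computes (a hypothetical
+// reference helper, not dav1d API; it assumes a padded src row and a symmetric
+// 7-tap filter, fh[i] == fh[6 - i], since the asm below only multiplies taps
+// 3..6 and glosses over the exact offset/rounding):
+//
+//   static void wiener_h_row_ref(int16_t *mid, const uint8_t *src,
+//                                const int16_t fh[8], int w)
+//   {
+//       for (int x = 0; x < w; x++) {
+//           int sum = 0;
+//           for (int i = 0; i < 7; i++)        // fh[7] is unused
+//               sum += fh[i] * src[x + i - 3];
+//           mid[x] = (int16_t)(sum >> 3);      // the asm also adds a fixed offset
+//       }
+//   }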
+function wiener_filter_h_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ mov r8, r5
+ vld1.16 {q0}, [r4, :128]
+ movw r9, #(1 << 14) - (1 << 2)
+ vdup.16 q14, r9
+ vmov.s16 q15, #2048
+ // Calculate mid_stride
+ add r10, r5, #7
+ bic r10, r10, #7
+ lsl r10, r10, #1
+
+ // Set up pointers for reading/writing alternate rows
+ add r12, r0, r10
+ lsl r10, r10, #1
+ add lr, r2, r3
+ lsl r3, r3, #1
+
+ // Subtract the aligned width from mid_stride
+ add r11, r5, #7
+ bic r11, r11, #7
+ sub r10, r10, r11, lsl #1
+
+ // Subtract the number of pixels read from the source stride
+ add r11, r11, #8
+ sub r3, r3, r11
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r1, #0
+ bne 0f
+ // left == NULL
+ sub r2, r2, #3
+ sub lr, lr, #3
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r3, r3, #3
+
+
+1: // Loop vertically
+ vld1.8 {q2}, [r2]!
+ vld1.8 {q9}, [lr]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r1, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.32 {d3[1]}, [r1]!
+ // Move r2/lr back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub r2, r2, #3
+ sub lr, lr, #3
+ vld1.32 {d17[1]}, [r1]!
+ vext.8 q2, q1, q2, #13
+ vext.8 q9, q8, q9, #13
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q1 with the leftmost byte
+ // and shift q2 to have 3x the first byte at the front.
+ vdup.8 q1, d4[0]
+ vdup.8 q8, d18[0]
+ // Move r2 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub r2, r2, #3
+ sub lr, lr, #3
+ vext.8 q2, q1, q2, #13
+ vext.8 q9, q8, q9, #13
+
+2:
+ vmovl.u8 q1, d4
+ vmovl.u8 q2, d5
+ vmovl.u8 q8, d18
+ vmovl.u8 q9, d19
+
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub r9, r5, #14
+ ldrb r11, [r2, r9]
+ ldrb r9, [lr, r9]
+ // Fill q12/q13 with the right padding pixel
+ vdup.16 q12, r11
+ vdup.16 q13, r9
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #11
+ bge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but doing so keeps the code as
+ // simple as possible.
+
+ // Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel_local r4, right_ext_mask, -6
+ sub r4, r4, r5, lsl #1
+ vld1.8 {q10, q11}, [r4]
+
+ vbit q1, q12, q10
+ vbit q2, q12, q11
+ vbit q8, q13, q10
+ vbit q9, q13, q11
+
+4: // Loop horizontally
+ vext.8 q11, q1, q2, #4
+ vext.8 q5, q1, q2, #8
+ vext.8 q10, q1, q2, #2
+ vext.8 q6, q1, q2, #10
+ vext.8 q7, q1, q2, #12
+ vext.8 q4, q1, q2, #6
+ vadd.i16 q5, q5, q11
+ vadd.i16 q6, q6, q10
+ vadd.i16 q7, q7, q1
+ vmul.s16 q3, q4, d0[3]
+ vmla.s16 q3, q5, d1[0]
+ vmla.s16 q3, q6, d1[1]
+ vmla.s16 q3, q7, d1[2]
+
+ vext.8 q4, q8, q9, #4
+ vext.8 q6, q8, q9, #8
+ vext.8 q11, q8, q9, #2
+ vext.8 q7, q8, q9, #10
+ vadd.i16 q6, q6, q4
+ vext.8 q4, q8, q9, #12
+ vext.8 q5, q8, q9, #6
+ vadd.i16 q7, q7, q11
+ vadd.i16 q4, q4, q8
+ vmul.s16 q10, q5, d0[3]
+ vmla.s16 q10, q6, d1[0]
+ vmla.s16 q10, q7, d1[1]
+ vmla.s16 q10, q4, d1[2]
+
+ vext.8 q1, q1, q2, #6
+ vext.8 q8, q8, q9, #6
+ vshl.s16 q1, q1, #7
+ vshl.s16 q8, q8, #7
+ vsub.s16 q1, q1, q14
+ vsub.s16 q8, q8, q14
+ vqadd.s16 q3, q3, q1
+ vqadd.s16 q10, q10, q8
+ vshr.s16 q3, q3, #3
+ vshr.s16 q10, q10, #3
+ vadd.s16 q3, q3, q15
+ vadd.s16 q10, q10, q15
+ subs r5, r5, #8
+ vst1.16 {q3}, [r0, :128]!
+ vst1.16 {q10}, [r12, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q1, q2
+ vmov q8, q9
+ vld1.8 {d4}, [r2]!
+ vld1.8 {d18}, [lr]!
+ vmovl.u8 q2, d4
+ vmovl.u8 q9, d18
+ bne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r10
+ add r12, r12, r10
+ add r2, r2, r3
+ add lr, lr, r3
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
+// const int16_t *mid, int w, int h,
+// const int16_t fv[8], enum LrEdgeFlags edges,
+// ptrdiff_t mid_stride);
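+//
+// Rough C sketch of one output row of this vertical pass (hypothetical
+// reference helper; mid points at the centre row of the 7-row window,
+// mid_stride is taken in elements, fv is assumed symmetric, and the offset
+// carried in the mid buffer is glossed over):
+//
+//   static void wiener_v_row_ref(uint8_t *dst, const int16_t *mid,
+//                                ptrdiff_t mid_stride, const int16_t fv[8], int w)
+//   {
+//       for (int x = 0; x < w; x++) {
+//           int sum = 0;
+//           for (int i = 0; i < 7; i++)        // fv[7] is unused
+//               sum += fv[i] * mid[(i - 3) * mid_stride + x];
+//           sum = (sum + (1 << 10)) >> 11;     // matches the #11 shift below
+//           dst[x] = sum < 0 ? 0 : sum > 255 ? 255 : sum;
+//       }
+//   }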
+function wiener_filter_v_8bpc_neon, export=1
+ push {r4-r7,lr}
+ vpush {q4-q6}
+ ldrd r4, r5, [sp, #68]
+ ldrd r6, r7, [sp, #76]
+ mov lr, r4
+ vld1.16 {q0}, [r5, :128]
+
+ // Calculate the number of rows to move back when looping vertically
+ mov r12, r4
+ tst r6, #4 // LR_HAVE_TOP
+ beq 0f
+ sub r2, r2, r7, lsl #1
+ add r12, r12, #2
+0:
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 1f
+ add r12, r12, #2
+
+1: // Start of horizontal loop; start one vertical filter slice.
+ // Load rows into q8-q11 and pad properly.
+ tst r6, #4 // LR_HAVE_TOP
+ vld1.16 {q8}, [r2, :128], r7
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.16 {q10}, [r2, :128], r7
+ vmov q9, q8
+ vld1.16 {q11}, [r2, :128], r7
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q9, q8
+ vmov q10, q8
+ vmov q11, q8
+
+3:
+ cmp r4, #4
+ blt 5f
+ // Start filtering normally; fill in q12-q14 with unique rows.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ vld1.16 {q14}, [r2, :128], r7
+
+4:
+.macro filter compare
+ subs r4, r4, #1
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, so the mul/mla instructions are kept
+ // tightly chained like this.
+ vadd.i16 q4, q10, q12
+ vadd.i16 q5, q9, q13
+ vadd.i16 q6, q8, q14
+ vmull.s16 q2, d22, d0[3]
+ vmlal.s16 q2, d8, d1[0]
+ vmlal.s16 q2, d10, d1[1]
+ vmlal.s16 q2, d12, d1[2]
+ vmull.s16 q3, d23, d0[3]
+ vmlal.s16 q3, d9, d1[0]
+ vmlal.s16 q3, d11, d1[1]
+ vmlal.s16 q3, d13, d1[2]
+ vqrshrun.s32 d4, q2, #11
+ vqrshrun.s32 d5, q3, #11
+ vqmovun.s16 d4, q2
+ vst1.8 {d4}, [r0, :64], r1
+.if \compare
+ cmp r4, #4
+.else
+ ble 9f
+.endif
+ vmov q8, q9
+ vmov q9, q10
+ vmov q10, q11
+ vmov q11, q12
+ vmov q12, q13
+ vmov q13, q14
+.endm
+ filter 1
+ blt 7f
+ vld1.16 {q14}, [r2, :128], r7
+ b 4b
+
+5: // Less than 4 rows in total; not all of q12-q13 are filled yet.
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 6f
+ // LR_HAVE_BOTTOM
+ cmp r4, #2
+ // We load at least 2 rows in all cases.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ bgt 53f // 3 rows in total
+ beq 52f // 2 rows in total
+51: // 1 row in total, q11 already loaded, load edge into q12-q14.
+ vmov q13, q12
+ b 8f
+52: // 2 rows in total, q11 already loaded, load q12 with content data
+ // and 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vmov q15, q14
+ b 8f
+53:
+ // 3 rows in total, q11 already loaded, load q12 and q13 with content
+ // and 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vld1.16 {q15}, [r2, :128], r7
+ vmov q1, q15
+ b 8f
+
+6:
+ // !LR_HAVE_BOTTOM
+ cmp r4, #2
+ bgt 63f // 3 rows in total
+ beq 62f // 2 rows in total
+61: // 1 row in total, q11 already loaded, pad that into q12-q14.
+ vmov q12, q11
+ vmov q13, q11
+ vmov q14, q11
+ b 8f
+62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
+ vld1.16 {q12}, [r2, :128], r7
+ vmov q13, q12
+ vmov q14, q12
+ vmov q15, q12
+ b 8f
+63:
+ // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ vmov q14, q13
+ vmov q15, q13
+ vmov q1, q13
+ b 8f
+
+7:
+ // All registers up to q13 are filled already, 3 valid rows left.
+ // < 4 valid rows left; fill in padding and filter the last
+ // few rows.
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 71f
+ // LR_HAVE_BOTTOM; load 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vld1.16 {q15}, [r2, :128], r7
+ vmov q1, q15
+ b 8f
+71:
+ // !LR_HAVE_BOTTOM, pad 3 rows
+ vmov q14, q13
+ vmov q15, q13
+ vmov q1, q13
+
+8: // At this point, all registers up to q14-15,q1 are loaded with
+ // edge/padding (depending on how many rows are left).
+ filter 0 // This branches to 9f when done
+ vmov q14, q15
+ vmov q15, q1
+ b 8b
+
+9: // End of one vertical slice.
+ subs r3, r3, #8
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ mls r0, r1, lr, r0
+ mls r2, r7, r12, r2
+ add r0, r0, #8
+ add r2, r2, #16
+ mov r4, lr
+ b 1b
+
+0:
+ vpop {q4-q6}
+ pop {r4-r7,pc}
+.purgem filter
+endfunc
+
+#define SUM_STRIDE (384+16)
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
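+//
+// Per output position this is a 3-tap horizontal box sum of the pixels and of
+// their squares; a rough C sketch (hypothetical reference helper, ignoring the
+// left/right edge padding set up below):
+//
+//   static void box3_h_row_ref(int32_t *sumsq, int16_t *sum,
+//                              const uint8_t *src, int w)
+//   {
+//       for (int x = 0; x < w; x++) {
+//           int a = src[x], b = src[x + 1], c = src[x + 2];
+//           sum[x]   = a + b + c;
+//           sumsq[x] = a * a + b * b + c * c;
+//       }
+//   }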
+function sgr_box3_h_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ add r5, r5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add r10, r0, #(4*SUM_STRIDE) // sumsq
+ add r11, r1, #(2*SUM_STRIDE) // sum
+ add r12, r3, r4 // src
+ lsl r4, r4, #1
+ mov r9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add lr, r5, #7
+ bic lr, lr, #7
+ sub r9, r9, lr, lsl #1
+
+ // Store the width for the vertical loop
+ mov r8, r5
+
+ // Subtract the number of pixels read from the input from the stride
+ add lr, lr, #8
+ sub r4, r4, lr
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r2, #0
+ bne 0f
+ // left == NULL
+ sub r3, r3, #2
+ sub r12, r12, #2
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 2 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r4, r4, #2
+
+
+1: // Loop vertically
+ vld1.8 {q0}, [r3]!
+ vld1.8 {q4}, [r12]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r2, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.32 {d3[]}, [r2]!
+ // Move r3/r12 back to account for the last 2 bytes we loaded earlier,
+ // which we'll shift out.
+ sub r3, r3, #2
+ sub r12, r12, #2
+ vld1.32 {d11[]}, [r2]!
+ vext.8 q0, q1, q0, #14
+ vext.8 q4, q5, q4, #14
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q1 with the leftmost byte
+ // and shift q0 to have 2x the first byte at the front.
+ vdup.8 q1, d0[0]
+ vdup.8 q5, d8[0]
+ // Move r3 back to account for the last 2 bytes we loaded before,
+ // which we shifted out.
+ sub r3, r3, #2
+ sub r12, r12, #2
+ vext.8 q0, q1, q0, #14
+ vext.8 q4, q5, q4, #14
+
+2:
+ vmull.u8 q1, d0, d0
+ vmull.u8 q2, d1, d1
+ vmull.u8 q5, d8, d8
+ vmull.u8 q6, d9, d9
+
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub lr, r5, #(2 + 16 - 2 + 1)
+ ldrb r11, [r3, lr]
+ ldrb lr, [r12, lr]
+ // Fill q14/q15 with the right padding pixel
+ vdup.8 q14, r11
+ vdup.8 q15, lr
+ // Restore r11 after using it for a temporary value
+ add r11, r1, #(2*SUM_STRIDE)
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #10
+ bge 4f // If w >= 10, all used input pixels are valid
+
+ // 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called
+ // again; it's not strictly needed in that case (we pad enough here),
+ // but doing so keeps the code as simple as possible.
+
+ // Insert padding in q0/4.b[w] onwards
+ movrel_local lr, right_ext_mask
+ sub lr, lr, r5
+ vld1.8 {q13}, [lr]
+
+ vbit q0, q14, q13
+ vbit q4, q15, q13
+
+ // Update the precalculated squares
+ vmull.u8 q1, d0, d0
+ vmull.u8 q2, d1, d1
+ vmull.u8 q5, d8, d8
+ vmull.u8 q6, d9, d9
+
+4: // Loop horizontally
+ vext.8 d16, d0, d1, #1
+ vext.8 d17, d0, d1, #2
+ vext.8 d18, d8, d9, #1
+ vext.8 d19, d8, d9, #2
+ vaddl.u8 q3, d0, d16
+ vaddw.u8 q3, q3, d17
+ vaddl.u8 q7, d8, d18
+ vaddw.u8 q7, q7, d19
+
+ vext.8 q8, q1, q2, #2
+ vext.8 q9, q1, q2, #4
+ vext.8 q10, q5, q6, #2
+ vext.8 q11, q5, q6, #4
+
+ vaddl.u16 q12, d2, d16
+ vaddl.u16 q13, d3, d17
+ vaddw.u16 q12, q12, d18
+ vaddw.u16 q13, q13, d19
+
+ vaddl.u16 q8, d10, d20
+ vaddl.u16 q9, d11, d21
+ vaddw.u16 q8, q8, d22
+ vaddw.u16 q9, q9, d23
+
+ subs r5, r5, #8
+ vst1.16 {q3}, [r1, :128]!
+ vst1.16 {q7}, [r11, :128]!
+ vst1.32 {q12, q13}, [r0, :128]!
+ vst1.32 {q8, q9}, [r10, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vld1.8 {d6}, [r3]!
+ vld1.8 {d14}, [r12]!
+ vmov q1, q2
+ vmov q5, q6
+ vext.8 q0, q0, q3, #8
+ vext.8 q4, q4, q7, #8
+ vmull.u8 q2, d6, d6
+ vmull.u8 q6, d14, d14
+
+ bne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r9, lsl #1
+ add r10, r10, r9, lsl #1
+ add r1, r1, r9
+ add r11, r11, r9
+ add r3, r3, r4
+ add r12, r12, r4
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
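+//
+// Same idea as sgr_box3_h above, but with a 5-tap horizontal box; a rough C
+// sketch (hypothetical reference helper, edge padding ignored):
+//
+//   static void box5_h_row_ref(int32_t *sumsq, int16_t *sum,
+//                              const uint8_t *src, int w)
+//   {
+//       for (int x = 0; x < w; x++) {
+//           int s = 0, sq = 0;
+//           for (int i = 0; i < 5; i++) {
+//               int p = src[x + i];
+//               s  += p;
+//               sq += p * p;
+//           }
+//           sum[x]   = (int16_t)s;
+//           sumsq[x] = sq;
+//       }
+//   }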
+function sgr_box5_h_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ add r5, r5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add r10, r0, #(4*SUM_STRIDE) // sumsq
+ add r11, r1, #(2*SUM_STRIDE) // sum
+ add r12, r3, r4 // src
+ lsl r4, r4, #1
+ mov r9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add lr, r5, #7
+ bic lr, lr, #7
+ sub r9, r9, lr, lsl #1
+ add lr, lr, #8
+ sub r4, r4, lr
+
+ // Store the width for the vertical loop
+ mov r8, r5
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r2, #0
+ bne 0f
+ // left == NULL
+ sub r3, r3, #3
+ sub r12, r12, #3
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r4, r4, #3
+
+1: // Loop vertically
+ vld1.8 {q0}, [r3]!
+ vld1.8 {q4}, [r12]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r2, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.32 {d3[]}, [r2]!
+ // Move r3/r12 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub r3, r3, #3
+ sub r12, r12, #3
+ vld1.32 {d11[]}, [r2]!
+ vext.8 q0, q1, q0, #13
+ vext.8 q4, q5, q4, #13
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q1 with the leftmost byte
+ // and shift q0 to have 3x the first byte at the front.
+ vdup.8 q1, d0[0]
+ vdup.8 q5, d8[0]
+ // Move r3 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub r3, r3, #3
+ sub r12, r12, #3
+ vext.8 q0, q1, q0, #13
+ vext.8 q4, q5, q4, #13
+
+2:
+ vmull.u8 q1, d0, d0
+ vmull.u8 q2, d1, d1
+ vmull.u8 q5, d8, d8
+ vmull.u8 q6, d9, d9
+
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub lr, r5, #(2 + 16 - 3 + 1)
+ ldrb r11, [r3, lr]
+ ldrb lr, [r12, lr]
+ // Fill q14/q15 with the right padding pixel
+ vdup.8 q14, r11
+ vdup.8 q15, lr
+ // Restore r11 after using it for a temporary value
+ add r11, r1, #(2*SUM_STRIDE)
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #11
+ bge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but doing so keeps the code as
+ // simple as possible.
+
+ // Insert padding in q0/4.b[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel_local lr, right_ext_mask, -1
+ sub lr, lr, r5
+ vld1.8 {q13}, [lr]
+
+ vbit q0, q14, q13
+ vbit q4, q15, q13
+
+ // Update the precalculated squares
+ vmull.u8 q1, d0, d0
+ vmull.u8 q2, d1, d1
+ vmull.u8 q5, d8, d8
+ vmull.u8 q6, d9, d9
+
+4: // Loop horizontally
+ vext.8 d16, d0, d1, #1
+ vext.8 d17, d0, d1, #2
+ vext.8 d18, d0, d1, #3
+ vext.8 d19, d0, d1, #4
+ vext.8 d20, d8, d9, #1
+ vext.8 d21, d8, d9, #2
+ vext.8 d22, d8, d9, #3
+ vext.8 d23, d8, d9, #4
+ vaddl.u8 q3, d0, d16
+ vaddl.u8 q12, d17, d18
+ vaddl.u8 q7, d8, d20
+ vaddl.u8 q13, d21, d22
+ vaddw.u8 q3, q3, d19
+ vaddw.u8 q7, q7, d23
+ vadd.u16 q3, q3, q12
+ vadd.u16 q7, q7, q13
+
+ vext.8 q8, q1, q2, #2
+ vext.8 q9, q1, q2, #4
+ vext.8 q10, q1, q2, #6
+ vext.8 q11, q1, q2, #8
+ vaddl.u16 q12, d2, d16
+ vaddl.u16 q13, d3, d17
+ vaddl.u16 q8, d18, d20
+ vaddl.u16 q9, d19, d21
+ vaddw.u16 q12, q12, d22
+ vaddw.u16 q13, q13, d23
+ vadd.i32 q12, q12, q8
+ vadd.i32 q13, q13, q9
+ vext.8 q8, q5, q6, #2
+ vext.8 q9, q5, q6, #4
+ vext.8 q10, q5, q6, #6
+ vext.8 q11, q5, q6, #8
+ vaddl.u16 q1, d10, d16
+ vaddl.u16 q5, d11, d17
+ vaddl.u16 q8, d18, d20
+ vaddl.u16 q9, d19, d21
+ vaddw.u16 q1, q1, d22
+ vaddw.u16 q5, q5, d23
+ vadd.i32 q10, q1, q8
+ vadd.i32 q11, q5, q9
+
+ subs r5, r5, #8
+ vst1.16 {q3}, [r1, :128]!
+ vst1.16 {q7}, [r11, :128]!
+ vst1.32 {q12, q13}, [r0, :128]!
+ vst1.32 {q10, q11}, [r10, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vld1.8 {d6}, [r3]!
+ vld1.8 {d14}, [r12]!
+ vmov q1, q2
+ vmov q5, q6
+ vext.8 q0, q0, q3, #8
+ vext.8 q4, q4, q7, #8
+ vmull.u8 q2, d6, d6
+ vmull.u8 q6, d14, d14
+ bne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r9, lsl #1
+ add r10, r10, r9, lsl #1
+ add r1, r1, r9
+ add r11, r11, r9
+ add r3, r3, r4
+ add r12, r12, r4
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+sgr_funcs 8
diff --git a/third_party/dav1d/src/arm/32/looprestoration16.S b/third_party/dav1d/src/arm/32/looprestoration16.S
new file mode 100644
index 0000000000..d699617a87
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/looprestoration16.S
@@ -0,0 +1,801 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+const right_ext_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+right_ext_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
+// const pixel *src, ptrdiff_t stride,
+// const int16_t fh[7], const intptr_t w,
+// int h, enum LrEdgeFlags edges,
+// const int bitdepth_max);
+function wiener_filter_h_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ ldr r8, [sp, #116] // bitdepth_max
+ vld1.16 {q0}, [r4, :128]
+ clz r8, r8
+ vmov.i32 q14, #1
+ sub r9, r8, #38 // -(bitdepth + 6)
+ sub r8, r8, #25 // -round_bits_h
+ neg r9, r9 // bitdepth + 6
+ vdup.32 q1, r9
+ vdup.32 q13, r8 // -round_bits_h
+ vmov.i16 q15, #8192
+ vshl.u32 q14, q14, q1 // 1 << (bitdepth + 6)
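+
+ // Worked example of the constant setup above, assuming 10 bpc
+ // (bitdepth_max == 0x3ff): clz(0x3ff) == 22, so bitdepth + 6 == 38 - 22 == 16
+ // and q14 holds 1 << 16, while -round_bits_h == 22 - 25 == -3, i.e. the
+ // vrshl.s32 below performs a rounding right shift by 3.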
+ mov r8, r5
+ // Calculate mid_stride
+ add r10, r5, #7
+ bic r10, r10, #7
+ lsl r10, r10, #1
+
+ // Set up pointers for reading/writing alternate rows
+ add r12, r0, r10
+ lsl r10, r10, #1
+ add lr, r2, r3
+ lsl r3, r3, #1
+
+ // Subtract the aligned width from mid_stride
+ add r11, r5, #7
+ bic r11, r11, #7
+ sub r10, r10, r11, lsl #1
+
+ // Subtract the number of pixels read from the source stride
+ add r11, r11, #8
+ sub r3, r3, r11, lsl #1
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r1, #0
+ bne 0f
+ // left == NULL
+ sub r2, r2, #6
+ sub lr, lr, #6
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r3, r3, #6
+
+
+1: // Loop vertically
+ vld1.16 {q2, q3}, [r2]!
+ vld1.16 {q4, q5}, [lr]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r1, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.16 {d3}, [r1]!
+ // Move r2/lr back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub r2, r2, #6
+ sub lr, lr, #6
+ vld1.16 {d13}, [r1]!
+ vext.8 q3, q2, q3, #10
+ vext.8 q2, q1, q2, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q1 with the leftmost pixel
+ // and shift q2/q3 to have 3x the first pixel at the front.
+ vdup.16 q1, d4[0]
+ vdup.16 q6, d8[0]
+ // Move r2 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub r2, r2, #6
+ sub lr, lr, #6
+ vext.8 q3, q2, q3, #10
+ vext.8 q2, q1, q2, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+
+2:
+
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub r9, r5, #14
+ lsl r9, r9, #1
+ ldrh r11, [r2, r9]
+ ldrh r9, [lr, r9]
+ // Fill q11/q12 with the right padding pixel
+ vdup.16 q11, r11
+ vdup.16 q12, r9
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #11
+ bge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but doing so keeps the code as
+ // simple as possible.
+
+ // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel_local r4, right_ext_mask, -6
+ sub r4, r4, r5, lsl #1
+ vld1.8 {q9, q10}, [r4]
+
+ vbit q2, q11, q9
+ vbit q3, q11, q10
+ vbit q4, q12, q9
+ vbit q5, q12, q10
+
+4: // Loop horizontally
+ vext.8 q7, q2, q3, #4
+ vext.8 q8, q2, q3, #8
+ vext.8 q6, q2, q3, #2
+ vext.8 q9, q2, q3, #10
+ vadd.i16 q8, q8, q7
+ vadd.i16 q9, q9, q6
+ vext.8 q6, q2, q3, #12
+ vext.8 q7, q2, q3, #6
+ vadd.i16 q2, q2, q6
+ vmull.s16 q6, d14, d0[3]
+ vmlal.s16 q6, d16, d1[0]
+ vmlal.s16 q6, d18, d1[1]
+ vmlal.s16 q6, d4, d1[2]
+ vmull.s16 q7, d15, d0[3]
+ vmlal.s16 q7, d17, d1[0]
+ vmlal.s16 q7, d19, d1[1]
+ vmlal.s16 q7, d5, d1[2]
+
+ vext.8 q8, q4, q5, #4
+ vext.8 q10, q4, q5, #8
+ vext.8 q9, q4, q5, #2
+ vext.8 q2, q4, q5, #10
+ vadd.i16 q10, q10, q8
+ vadd.i16 q2, q2, q9
+ vext.8 q8, q4, q5, #12
+ vext.8 q9, q4, q5, #6
+ vadd.i16 q4, q4, q8
+ vmull.s16 q8, d18, d0[3]
+ vmlal.s16 q8, d20, d1[0]
+ vmlal.s16 q8, d4, d1[1]
+ vmlal.s16 q8, d8, d1[2]
+ vmull.s16 q9, d19, d0[3]
+ vmlal.s16 q9, d21, d1[0]
+ vmlal.s16 q9, d5, d1[1]
+ vmlal.s16 q9, d9, d1[2]
+
+ vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
+ vadd.i32 q6, q6, q14
+ vadd.i32 q7, q7, q14
+ vadd.i32 q8, q8, q14
+ vadd.i32 q9, q9, q14
+ vrshl.s32 q6, q6, q13
+ vrshl.s32 q7, q7, q13
+ vrshl.s32 q8, q8, q13
+ vrshl.s32 q9, q9, q13
+ vqmovun.s32 d12, q6
+ vqmovun.s32 d13, q7
+ vqmovun.s32 d14, q8
+ vqmovun.s32 d15, q9
+ vmin.u16 q6, q6, q10
+ vmin.u16 q7, q7, q10
+ vsub.i16 q6, q6, q15
+ vsub.i16 q7, q7, q15
+ subs r5, r5, #8
+ vst1.16 {q6}, [r0, :128]!
+ vst1.16 {q7}, [r12, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q2, q3
+ vmov q4, q5
+ vld1.16 {q3}, [r2]!
+ vld1.16 {q5}, [lr]!
+ bne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r10
+ add r12, r12, r10
+ add r2, r2, r3
+ add lr, lr, r3
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
+// const int16_t *mid, int w, int h,
+// const int16_t fv[7], enum LrEdgeFlags edges,
+// ptrdiff_t mid_stride, const int bitdepth_max);
+function wiener_filter_v_16bpc_neon, export=1
+ push {r4-r7,lr}
+ vpush {q4-q5}
+ ldrd r4, r5, [sp, #52]
+ ldrd r6, r7, [sp, #60]
+ ldr lr, [sp, #68] // bitdepth_max
+ vld1.16 {q0}, [r5, :128]
+ vdup.16 q5, lr
+ clz lr, lr
+ sub lr, lr, #11 // round_bits_v
+ vdup.32 q4, lr
+ mov lr, r4
+ vneg.s32 q4, q4 // -round_bits_v
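+
+ // For example, at 10 bpc (bitdepth_max == 0x3ff): clz(0x3ff) == 22, so
+ // round_bits_v == 22 - 11 == 11, and q4 holds -11 for the rounding right
+ // shift done by vrshl.s32 in the filter macro below.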
+
+ // Calculate the number of rows to move back when looping vertically
+ mov r12, r4
+ tst r6, #4 // LR_HAVE_TOP
+ beq 0f
+ sub r2, r2, r7, lsl #1
+ add r12, r12, #2
+0:
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 1f
+ add r12, r12, #2
+
+1: // Start of horizontal loop; start one vertical filter slice.
+ // Load rows into q8-q11 and pad properly.
+ tst r6, #4 // LR_HAVE_TOP
+ vld1.16 {q8}, [r2, :128], r7
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.16 {q10}, [r2, :128], r7
+ vmov q9, q8
+ vld1.16 {q11}, [r2, :128], r7
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q9, q8
+ vmov q10, q8
+ vmov q11, q8
+
+3:
+ cmp r4, #4
+ blt 5f
+ // Start filtering normally; fill in q12-q14 with unique rows.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ vld1.16 {q14}, [r2, :128], r7
+
+4:
+.macro filter compare
+ subs r4, r4, #1
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, so the mul/mla instructions are kept
+ // tightly chained like this.
+ vmull.s16 q2, d16, d0[0]
+ vmlal.s16 q2, d18, d0[1]
+ vmlal.s16 q2, d20, d0[2]
+ vmlal.s16 q2, d22, d0[3]
+ vmlal.s16 q2, d24, d1[0]
+ vmlal.s16 q2, d26, d1[1]
+ vmlal.s16 q2, d28, d1[2]
+ vmull.s16 q3, d17, d0[0]
+ vmlal.s16 q3, d19, d0[1]
+ vmlal.s16 q3, d21, d0[2]
+ vmlal.s16 q3, d23, d0[3]
+ vmlal.s16 q3, d25, d1[0]
+ vmlal.s16 q3, d27, d1[1]
+ vmlal.s16 q3, d29, d1[2]
+ vrshl.s32 q2, q2, q4 // round_bits_v
+ vrshl.s32 q3, q3, q4
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vmin.u16 q2, q2, q5 // bitdepth_max
+ vst1.16 {q2}, [r0, :128], r1
+.if \compare
+ cmp r4, #4
+.else
+ ble 9f
+.endif
+ vmov q8, q9
+ vmov q9, q10
+ vmov q10, q11
+ vmov q11, q12
+ vmov q12, q13
+ vmov q13, q14
+.endm
+ filter 1
+ blt 7f
+ vld1.16 {q14}, [r2, :128], r7
+ b 4b
+
+5: // Less than 4 rows in total; not all of q12-q13 are filled yet.
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 6f
+ // LR_HAVE_BOTTOM
+ cmp r4, #2
+ // We load at least 2 rows in all cases.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ bgt 53f // 3 rows in total
+ beq 52f // 2 rows in total
+51: // 1 row in total, q11 already loaded, load edge into q12-q14.
+ vmov q13, q12
+ b 8f
+52: // 2 rows in total, q11 already loaded, load q12 with content data
+ // and 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vmov q15, q14
+ b 8f
+53:
+ // 3 rows in total, q11 already loaded, load q12 and q13 with content
+ // and 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vld1.16 {q15}, [r2, :128], r7
+ vmov q1, q15
+ b 8f
+
+6:
+ // !LR_HAVE_BOTTOM
+ cmp r4, #2
+ bgt 63f // 3 rows in total
+ beq 62f // 2 rows in total
+61: // 1 row in total, q11 already loaded, pad that into q12-q14.
+ vmov q12, q11
+ vmov q13, q11
+ vmov q14, q11
+ b 8f
+62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
+ vld1.16 {q12}, [r2, :128], r7
+ vmov q13, q12
+ vmov q14, q12
+ vmov q15, q12
+ b 8f
+63:
+ // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ vmov q14, q13
+ vmov q15, q13
+ vmov q1, q13
+ b 8f
+
+7:
+ // All registers up to q13 are filled already, 3 valid rows left.
+ // < 4 valid rows left; fill in padding and filter the last
+ // few rows.
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 71f
+ // LR_HAVE_BOTTOM; load 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vld1.16 {q15}, [r2, :128], r7
+ vmov q1, q15
+ b 8f
+71:
+ // !LR_HAVE_BOTTOM, pad 3 rows
+ vmov q14, q13
+ vmov q15, q13
+ vmov q1, q13
+
+8: // At this point, all registers up to q14-q15,q1 are loaded with
+ // edge/padding (depending on how many rows are left).
+ filter 0 // This branches to 9f when done
+ vmov q14, q15
+ vmov q15, q1
+ b 8b
+
+9: // End of one vertical slice.
+ subs r3, r3, #8
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ mls r0, r1, lr, r0
+ mls r2, r7, r12, r2
+ add r0, r0, #16
+ add r2, r2, #16
+ mov r4, lr
+ b 1b
+
+0:
+ vpop {q4-q5}
+ pop {r4-r7,pc}
+.purgem filter
+endfunc
+
+#define SUM_STRIDE (384+16)
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box3_h_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ add r5, r5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add r10, r0, #(4*SUM_STRIDE) // sumsq
+ add r11, r1, #(2*SUM_STRIDE) // sum
+ add r12, r3, r4 // src
+ lsl r4, r4, #1
+ mov r9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add lr, r5, #7
+ bic lr, lr, #7
+ sub r9, r9, lr, lsl #1
+
+ // Store the width for the vertical loop
+ mov r8, r5
+
+ // Subtract the number of pixels read from the input from the stride
+ add lr, lr, #8
+ sub r4, r4, lr, lsl #1
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r2, #0
+ bne 0f
+ // left == NULL
+ sub r3, r3, #4
+ sub r12, r12, #4
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 2 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r4, r4, #4
+
+
+1: // Loop vertically
+ vld1.16 {q0, q1}, [r3]!
+ vld1.16 {q4, q5}, [r12]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r2, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.16 {d5}, [r2]!
+ // Move r3/r12 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub r3, r3, #4
+ sub r12, r12, #4
+ vld1.16 {d13}, [r2]!
+ vext.8 q1, q0, q1, #12
+ vext.8 q0, q2, q0, #12
+ vext.8 q5, q4, q5, #12
+ vext.8 q4, q6, q4, #12
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
+ // and shift q0 to have 2x the first pixel at the front.
+ vdup.16 q2, d0[0]
+ vdup.16 q6, d8[0]
+ // Move r3 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub r3, r3, #4
+ sub r12, r12, #4
+ vext.8 q1, q0, q1, #12
+ vext.8 q0, q2, q0, #12
+ vext.8 q5, q4, q5, #12
+ vext.8 q4, q6, q4, #12
+
+2:
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub lr, r5, #(2 + 16 - 2 + 1)
+ lsl lr, lr, #1
+ ldrh r11, [r3, lr]
+ ldrh lr, [r12, lr]
+ // Fill q14/q15 with the right padding pixel
+ vdup.16 q14, r11
+ vdup.16 q15, lr
+ // Restore r11 after using it for a temporary value
+ add r11, r1, #(2*SUM_STRIDE)
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #10
+ bge 4f // If w >= 10, all used input pixels are valid
+
+ // 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called
+ // again; it's not strictly needed in that case (we pad enough here),
+ // but doing so keeps the code as simple as possible.
+
+ // Insert padding in q0/1.h[w] onwards
+ movrel_local lr, right_ext_mask
+ sub lr, lr, r5, lsl #1
+ vld1.8 {q12, q13}, [lr]
+
+ vbit q0, q14, q12
+ vbit q1, q14, q13
+ vbit q4, q15, q12
+ vbit q5, q15, q13
+
+4: // Loop horizontally
+ vext.8 q8, q0, q1, #2
+ vext.8 q10, q4, q5, #2
+ vext.8 q9, q0, q1, #4
+ vext.8 q11, q4, q5, #4
+ vadd.i16 q2, q0, q8
+ vadd.i16 q3, q4, q10
+ vadd.i16 q2, q2, q9
+ vadd.i16 q3, q3, q11
+
+ vmull.u16 q6, d0, d0
+ vmlal.u16 q6, d16, d16
+ vmlal.u16 q6, d18, d18
+ vmull.u16 q12, d8, d8
+ vmlal.u16 q12, d20, d20
+ vmlal.u16 q12, d22, d22
+ vmull.u16 q7, d1, d1
+ vmlal.u16 q7, d17, d17
+ vmlal.u16 q7, d19, d19
+ vmull.u16 q13, d9, d9
+ vmlal.u16 q13, d21, d21
+ vmlal.u16 q13, d23, d23
+ subs r5, r5, #8
+ vst1.16 {q2}, [r1, :128]!
+ vst1.16 {q3}, [r11, :128]!
+ vst1.32 {q6, q7}, [r0, :128]!
+ vst1.32 {q12, q13}, [r10, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q0, q1
+ vmov q4, q5
+ vld1.16 {q1}, [r3]!
+ vld1.16 {q5}, [r12]!
+
+ bne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r9, lsl #1
+ add r10, r10, r9, lsl #1
+ add r1, r1, r9
+ add r11, r11, r9
+ add r3, r3, r4
+ add r12, r12, r4
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box5_h_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ add r5, r5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add r10, r0, #(4*SUM_STRIDE) // sumsq
+ add r11, r1, #(2*SUM_STRIDE) // sum
+ add r12, r3, r4 // src
+ lsl r4, r4, #1
+ mov r9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add lr, r5, #7
+ bic lr, lr, #7
+ sub r9, r9, lr, lsl #1
+ add lr, lr, #8
+ sub r4, r4, lr, lsl #1
+
+ // Store the width for the vertical loop
+ mov r8, r5
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r2, #0
+ bne 0f
+ // left == NULL
+ sub r3, r3, #6
+ sub r12, r12, #6
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r4, r4, #6
+
+1: // Loop vertically
+ vld1.16 {q0, q1}, [r3]!
+ vld1.16 {q4, q5}, [r12]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r2, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.16 {d5}, [r2]!
+ // Move r3/r12 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub r3, r3, #6
+ sub r12, r12, #6
+ vld1.16 {d13}, [r2]!
+ vext.8 q1, q0, q1, #10
+ vext.8 q0, q2, q0, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
+ // and shift q0 to have 3x the first pixel at the front.
+ vdup.16 q2, d0[0]
+ vdup.16 q6, d8[0]
+ // Move r3 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub r3, r3, #6
+ sub r12, r12, #6
+ vext.8 q1, q0, q1, #10
+ vext.8 q0, q2, q0, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+
+2:
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub lr, r5, #(2 + 16 - 3 + 1)
+ lsl lr, lr, #1
+ ldrh r11, [r3, lr]
+ ldrh lr, [r12, lr]
+ // Fill q14/q15 with the right padding pixel
+ vdup.16 q14, r11
+ vdup.16 q15, lr
+ // Restore r11 after using it for a temporary value
+ add r11, r1, #(2*SUM_STRIDE)
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #11
+ bge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but doing so keeps the code as
+ // simple as possible.
+
+ // Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel_local lr, right_ext_mask, -2
+ sub lr, lr, r5, lsl #1
+ vld1.8 {q12, q13}, [lr]
+
+ vbit q0, q14, q12
+ vbit q1, q14, q13
+ vbit q4, q15, q12
+ vbit q5, q15, q13
+
+4: // Loop horizontally
+ vext.8 q8, q0, q1, #2
+ vext.8 q10, q4, q5, #2
+ vext.8 q9, q0, q1, #4
+ vext.8 q11, q4, q5, #4
+ vadd.i16 q2, q0, q8
+ vadd.i16 q3, q4, q10
+ vadd.i16 q2, q2, q9
+ vadd.i16 q3, q3, q11
+
+ vmull.u16 q6, d0, d0
+ vmlal.u16 q6, d16, d16
+ vmlal.u16 q6, d18, d18
+ vmull.u16 q12, d8, d8
+ vmlal.u16 q12, d20, d20
+ vmlal.u16 q12, d22, d22
+ vmull.u16 q7, d1, d1
+ vmlal.u16 q7, d17, d17
+ vmlal.u16 q7, d19, d19
+ vmull.u16 q13, d9, d9
+ vmlal.u16 q13, d21, d21
+ vmlal.u16 q13, d23, d23
+
+ vext.8 q8, q0, q1, #6
+ vext.8 q10, q4, q5, #6
+ vext.8 q9, q0, q1, #8
+ vext.8 q11, q4, q5, #8
+ vadd.i16 q2, q2, q8
+ vadd.i16 q3, q3, q10
+ vadd.i16 q2, q2, q9
+ vadd.i16 q3, q3, q11
+
+ vmlal.u16 q6, d16, d16
+ vmlal.u16 q6, d1, d1
+ vmlal.u16 q12, d20, d20
+ vmlal.u16 q12, d9, d9
+ vmlal.u16 q7, d17, d17
+ vmlal.u16 q7, d19, d19
+ vmlal.u16 q13, d21, d21
+ vmlal.u16 q13, d23, d23
+
+ subs r5, r5, #8
+ vst1.16 {q2}, [r1, :128]!
+ vst1.16 {q3}, [r11, :128]!
+ vst1.32 {q6, q7}, [r0, :128]!
+ vst1.32 {q12, q13}, [r10, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q0, q1
+ vmov q4, q5
+ vld1.16 {q1}, [r3]!
+ vld1.16 {q5}, [r12]!
+ bne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r9, lsl #1
+ add r10, r10, r9, lsl #1
+ add r1, r1, r9
+ add r11, r11, r9
+ add r3, r3, r4
+ add r12, r12, r4
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+sgr_funcs 16
diff --git a/third_party/dav1d/src/arm/32/looprestoration_common.S b/third_party/dav1d/src/arm/32/looprestoration_common.S
new file mode 100644
index 0000000000..b080bb5115
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/looprestoration_common.S
@@ -0,0 +1,453 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define SUM_STRIDE (384+16)
+
+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
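+//
+// Conceptually, each output row here is the sum of three consecutive input
+// rows of the horizontally summed buffers (top/bottom edge replication and
+// the in-place, register-resident implementation below are glossed over):
+//
+//   out_sum[y][x]   = in_sum[y - 1][x]   + in_sum[y][x]   + in_sum[y + 1][x];
+//   out_sumsq[y][x] = in_sumsq[y - 1][x] + in_sumsq[y][x] + in_sumsq[y + 1][x];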
+function sgr_box3_v_neon, export=1
+ push {r4-r9,lr}
+ ldr r4, [sp, #28]
+ add r12, r3, #2 // Number of output rows to move back
+ mov lr, r3 // Number of input rows to move back
+ add r2, r2, #2 // Actual summed width
+ mov r7, #(4*SUM_STRIDE) // sumsq stride
+ mov r8, #(2*SUM_STRIDE) // sum stride
+ sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
+ sub r1, r1, #(2*SUM_STRIDE) // sum -= stride
+
+ tst r4, #4 // LR_HAVE_TOP
+ beq 0f
+ // If have top, read from row -2.
+ sub r5, r0, #(4*SUM_STRIDE)
+ sub r6, r1, #(2*SUM_STRIDE)
+ add lr, lr, #2
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add r5, r0, #(4*SUM_STRIDE)
+ add r6, r1, #(2*SUM_STRIDE)
+1:
+
+ tst r4, #8 // LR_HAVE_BOTTOM
+ beq 1f
+ // LR_HAVE_BOTTOM
+ add r3, r3, #2 // Sum all h+2 lines with the main loop
+ add lr, lr, #2
+1:
+ mov r9, r3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into q8-q13 and q0-q2 taking top
+ // padding into consideration.
+ tst r4, #4 // LR_HAVE_TOP
+ vld1.32 {q8, q9}, [r5, :128], r7
+ vld1.16 {q0}, [r6, :128], r8
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.32 {q10, q11}, [r5, :128], r7
+ vld1.16 {q1}, [r6, :128], r8
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q2}, [r6, :128], r8
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q10, q8
+ vmov q11, q9
+ vmov q1, q0
+ vmov q12, q8
+ vmov q13, q9
+ vmov q2, q0
+
+3:
+ subs r3, r3, #1
+.macro add3
+ vadd.i32 q8, q8, q10
+ vadd.i32 q9, q9, q11
+ vadd.i16 q0, q0, q1
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vadd.i16 q0, q0, q2
+ vst1.32 {q8, q9}, [r0, :128], r7
+ vst1.16 {q0}, [r1, :128], r8
+.endm
+ add3
+ vmov q8, q10
+ vmov q9, q11
+ vmov q0, q1
+ vmov q10, q12
+ vmov q11, q13
+ vmov q1, q2
+ ble 4f
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q2}, [r6, :128], r8
+ b 3b
+
+4:
+ tst r4, #8 // LR_HAVE_BOTTOM
+ bne 5f
+ // !LR_HAVE_BOTTOM
+ // Produce two more rows, extending the already loaded rows.
+ add3
+ vmov q8, q10
+ vmov q9, q11
+ vmov q0, q1
+ add3
+
+5: // End of one vertical slice.
+ subs r2, r2, #8
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ mls r5, r7, lr, r5
+ mls r6, r8, lr, r6
+ // Output pointers
+ mls r0, r7, r12, r0
+ mls r1, r8, r12, r1
+ add r0, r0, #32
+ add r1, r1, #16
+ add r5, r5, #32
+ add r6, r6, #16
+ mov r3, r9
+ b 1b
+
+0:
+ pop {r4-r9,pc}
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
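+//
+// Conceptually the same as sgr_box3_v above, but each output row is the sum
+// of five consecutive input rows (with edge rows replicated as needed):
+//
+//   out_sum[y][x] = in_sum[y - 2][x] + in_sum[y - 1][x] + in_sum[y][x]
+//                 + in_sum[y + 1][x] + in_sum[y + 2][x];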
+function sgr_box5_v_neon, export=1
+ push {r4-r9,lr}
+ vpush {q5-q7}
+ ldr r4, [sp, #76]
+ add r12, r3, #2 // Number of output rows to move back
+ mov lr, r3 // Number of input rows to move back
+ add r2, r2, #8 // Actual summed width
+ mov r7, #(4*SUM_STRIDE) // sumsq stride
+ mov r8, #(2*SUM_STRIDE) // sum stride
+ sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
+ sub r1, r1, #(2*SUM_STRIDE) // sum -= stride
+
+ tst r4, #4 // LR_HAVE_TOP
+ beq 0f
+ // If we have top, read from row -2.
+ sub r5, r0, #(4*SUM_STRIDE)
+ sub r6, r1, #(2*SUM_STRIDE)
+ add lr, lr, #2
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add r5, r0, #(4*SUM_STRIDE)
+ add r6, r1, #(2*SUM_STRIDE)
+1:
+
+ tst r4, #8 // LR_HAVE_BOTTOM
+ beq 0f
+ // LR_HAVE_BOTTOM
+ add r3, r3, #2 // Handle h+2 lines with the main loop
+ add lr, lr, #2
+ b 1f
+0:
+ // !LR_HAVE_BOTTOM
+ sub r3, r3, #1 // Handle h-1 lines with the main loop
+1:
+ mov r9, r3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into q6-q15 and q0-q3,q5 taking top
+ // padding into consideration.
+ tst r4, #4 // LR_HAVE_TOP
+ vld1.32 {q6, q7}, [r5, :128], r7
+ vld1.16 {q0}, [r6, :128], r8
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.32 {q10, q11}, [r5, :128], r7
+ vld1.16 {q2}, [r6, :128], r8
+ vmov q8, q6
+ vmov q9, q7
+ vmov q1, q0
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q3}, [r6, :128], r8
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q8, q6
+ vmov q9, q7
+ vmov q1, q0
+ vmov q10, q6
+ vmov q11, q7
+ vmov q2, q0
+ vmov q12, q6
+ vmov q13, q7
+ vmov q3, q0
+
+3:
+ cmp r3, #0
+ beq 4f
+ vld1.32 {q14, q15}, [r5, :128], r7
+ vld1.16 {q5}, [r6, :128], r8
+
+3:
+ // Start of vertical loop
+ subs r3, r3, #2
+.macro add5
+ vadd.i32 q6, q6, q8
+ vadd.i32 q7, q7, q9
+ vadd.i16 q0, q0, q1
+ vadd.i32 q6, q6, q10
+ vadd.i32 q7, q7, q11
+ vadd.i16 q0, q0, q2
+ vadd.i32 q6, q6, q12
+ vadd.i32 q7, q7, q13
+ vadd.i16 q0, q0, q3
+ vadd.i32 q6, q6, q14
+ vadd.i32 q7, q7, q15
+ vadd.i16 q0, q0, q5
+ vst1.32 {q6, q7}, [r0, :128], r7
+ vst1.16 {q0}, [r1, :128], r8
+.endm
+ add5
+.macro shift2
+ vmov q6, q10
+ vmov q7, q11
+ vmov q0, q2
+ vmov q8, q12
+ vmov q9, q13
+ vmov q1, q3
+ vmov q10, q14
+ vmov q11, q15
+ vmov q2, q5
+.endm
+ shift2
+ add r0, r0, r7
+ add r1, r1, r8
+ ble 5f
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q3}, [r6, :128], r8
+ vld1.32 {q14, q15}, [r5, :128], r7
+ vld1.16 {q5}, [r6, :128], r8
+ b 3b
+
+4:
+ // h == 1, !LR_HAVE_BOTTOM.
+ // Pad the last row with the only content row, and add.
+ vmov q14, q12
+ vmov q15, q13
+ vmov q5, q3
+ add5
+ shift2
+ add r0, r0, r7
+ add r1, r1, r8
+ add5
+ b 6f
+
+5:
+ tst r4, #8 // LR_HAVE_BOTTOM
+ bne 6f
+ // !LR_HAVE_BOTTOM
+ cmp r3, #0
+ bne 5f
+ // Three edge rows of the intended output remain; output the one at h-2
+ // and the past-edge one at h.
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q3}, [r6, :128], r8
+ // Pad the past-edge row from the last content row.
+ vmov q14, q12
+ vmov q15, q13
+ vmov q5, q3
+ add5
+ shift2
+ add r0, r0, r7
+ add r1, r1, r8
+ // The last two rows are already padded properly here.
+ add5
+ b 6f
+
+5:
+ // r3 == -1, two rows left, output one.
+ // Pad the last two rows from the mid one.
+ vmov q12, q10
+ vmov q13, q11
+ vmov q3, q2
+ vmov q14, q10
+ vmov q15, q11
+ vmov q5, q2
+ add5
+ add r0, r0, r7
+ add r1, r1, r8
+ b 6f
+
+6: // End of one vertical slice.
+ subs r2, r2, #8
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ mls r5, r7, lr, r5
+ mls r6, r8, lr, r6
+ // Output pointers
+ mls r0, r7, r12, r0
+ mls r1, r8, r12, r1
+ add r0, r0, #32
+ add r1, r1, #16
+ add r5, r5, #32
+ add r6, r6, #16
+ mov r3, r9
+ b 1b
+
+0:
+ vpop {q5-q7}
+ pop {r4-r9,pc}
+.purgem add5
+endfunc
+
+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength,
+// const int bitdepth_max);
+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength,
+// const int bitdepth_max);
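+// Both entry points continue into sgr_calc_ab_neon below, which (per the
+// inline annotations) computes, per element, with n = 9 or 25, s = strength
+// and one_by_x = 455 or 164, roughly:
+//   p = max(a*n - b*b, 0)         (a, b scaled down by the bitdepth first)
+//   z = min((p*s) >> 20, 255)     (rounded, via the two narrowing shifts)
+//   x = sgr_x_by_x[z]
+//   a_out = (x*b*one_by_x + (1 << 11)) >> 12,  b_out = 256 - x
+// Only 48 bytes of sgr_x_by_x are loaded; the vcgt/vadd sequence rebuilds
+// the remaining, smaller entries from the table's run-length structure.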
+function sgr_calc_ab1_neon, export=1
+ push {r4-r7,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #84]
+ add r3, r3, #2 // h += 2
+ clz r6, r5
+ vmov.i32 q15, #9 // n
+ movw r5, #455
+ mov lr, #SUM_STRIDE
+ b sgr_calc_ab_neon
+endfunc
+
+function sgr_calc_ab2_neon, export=1
+ push {r4-r7,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #84]
+ add r3, r3, #3 // h += 3
+ clz r6, r5
+ asr r3, r3, #1 // h /= 2
+ vmov.i32 q15, #25 // n
+ mov r5, #164
+ mov lr, #(2*SUM_STRIDE)
+endfunc
+
+function sgr_calc_ab_neon
+ movrel r12, X(sgr_x_by_x)
+ sub r6, r6, #24 // -bitdepth_min_8
+ vld1.8 {q8, q9}, [r12, :128]!
+ add r7, r6, r6 // -2*bitdepth_min_8
+ vmov.i8 q11, #5
+ vmov.i8 d10, #55 // idx of last 5
+ vld1.8 {q10}, [r12, :128]
+ vmov.i8 d11, #72 // idx of last 4
+ vmov.i8 d12, #101 // idx of last 3
+ vmov.i8 d13, #169 // idx of last 2
+ vmov.i8 d14, #254 // idx of last 1
+ vmov.i8 d15, #32 // elements consumed in first vtbl
+ add r2, r2, #2 // w += 2
+ add r12, r2, #7
+ bic r12, r12, #7 // aligned w
+ sub r12, lr, r12 // increment between rows
+ vdup.32 q12, r4
+ sub r0, r0, #(4*(SUM_STRIDE))
+ sub r1, r1, #(2*(SUM_STRIDE))
+ mov r4, r2 // backup of w
+ vsub.i8 q8, q8, q11
+ vsub.i8 q9, q9, q11
+ vsub.i8 q10, q10, q11
+1:
+ vld1.32 {q0, q1}, [r0, :128] // a
+ vld1.16 {q2}, [r1, :128] // b
+ vdup.32 q13, r7 // -2*bitdepth_min_8
+ vdup.16 q14, r6 // -bitdepth_min_8
+ subs r2, r2, #8
+ vrshl.s32 q0, q0, q13
+ vrshl.s32 q1, q1, q13
+ vrshl.s16 q4, q2, q14
+ vmul.i32 q0, q0, q15 // a * n
+ vmul.i32 q1, q1, q15 // a * n
+ vmull.u16 q3, d8, d8 // b * b
+ vmull.u16 q4, d9, d9 // b * b
+ vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0)
+ vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0)
+ vmul.i32 q0, q0, q12 // p * s
+ vmul.i32 q1, q1, q12 // p * s
+ vqshrn.u32 d0, q0, #16
+ vqshrn.u32 d1, q1, #16
+ vqrshrn.u16 d0, q0, #4 // imin(z, 255)
+
+ vcgt.u8 d2, d0, d10 // = -1 if sgr_x_by_x[d0] < 5
+ vcgt.u8 d3, d0, d11 // = -1 if sgr_x_by_x[d0] < 4
+ vtbl.8 d1, {q8, q9}, d0
+ vcgt.u8 d6, d0, d12 // = -1 if sgr_x_by_x[d0] < 3
+ vsub.i8 d9, d0, d15 // indices for vtbx
+ vcgt.u8 d7, d0, d13 // = -1 if sgr_x_by_x[d0] < 2
+ vadd.i8 d2, d2, d3
+ vtbx.8 d1, {q10}, d9
+ vcgt.u8 d8, d0, d14 // = -1 if sgr_x_by_x[d0] < 1
+ vadd.i8 d6, d6, d7
+ vadd.i8 d8, d8, d22
+ vadd.i8 d2, d2, d6
+ vadd.i8 d1, d1, d8
+ vadd.i8 d1, d1, d2
+ vmovl.u8 q0, d1 // x
+
+ vmov.i16 q13, #256
+ vdup.32 q14, r5 // one_by_x
+
+ vmull.u16 q1, d0, d4 // x * BB[i]
+ vmull.u16 q2, d1, d5 // x * BB[i]
+ vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x
+ vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x
+ vrshr.s32 q1, q1, #12 // AA[i]
+ vrshr.s32 q2, q2, #12 // AA[i]
+ vsub.i16 q0, q13, q0 // 256 - x
+
+ vst1.32 {q1, q2}, [r0, :128]!
+ vst1.16 {q0}, [r1, :128]!
+ bgt 1b
+
+ subs r3, r3, #1
+ ble 0f
+ add r0, r0, r12, lsl #2
+ add r1, r1, r12, lsl #1
+ mov r2, r4
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r7,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/looprestoration_tmpl.S b/third_party/dav1d/src/arm/32/looprestoration_tmpl.S
new file mode 100644
index 0000000000..8a9940bb3a
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/looprestoration_tmpl.S
@@ -0,0 +1,600 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#define FILTER_OUT_STRIDE 384
+
+.macro sgr_funcs bpc
+// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
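+// A sketch of the per-pixel computation in the loop below: each of the two
+// input planes is summed over its 3x3 neighbourhood with weight 4 for the
+// centre cross and 3 for the four diagonals; calling these sums A (from the
+// int32_t plane) and B (from the int16_t plane), the output is roughly
+//   tmp[x] = (A + B*src[x] + (1 << 8)) >> 9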
+function sgr_finish_filter1_\bpc\()bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldr r6, [sp, #108]
+ sub r7, r3, #(4*SUM_STRIDE)
+ add r8, r3, #(4*SUM_STRIDE)
+ sub r9, r4, #(2*SUM_STRIDE)
+ add r10, r4, #(2*SUM_STRIDE)
+ mov r11, #SUM_STRIDE
+ mov r12, #FILTER_OUT_STRIDE
+ add lr, r5, #3
+ bic lr, lr, #3 // Aligned width
+.if \bpc == 8
+ sub r2, r2, lr
+.else
+ sub r2, r2, lr, lsl #1
+.endif
+ sub r12, r12, lr
+ sub r11, r11, lr
+ sub r11, r11, #4 // We read 4 extra elements from both a and b
+ mov lr, r5
+ vmov.i16 q14, #3
+ vmov.i32 q15, #3
+1:
+ vld1.16 {q0}, [r9, :128]!
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q2}, [r10, :128]!
+ vld1.32 {q8, q9}, [r7, :128]!
+ vld1.32 {q10, q11}, [r3, :128]!
+ vld1.32 {q12, q13}, [r8, :128]!
+
+2:
+ subs r5, r5, #4
+ vext.8 d6, d0, d1, #2 // -stride
+ vext.8 d7, d2, d3, #2 // 0
+ vext.8 d8, d4, d5, #2 // +stride
+ vext.8 d9, d0, d1, #4 // +1-stride
+ vext.8 d10, d2, d3, #4 // +1
+ vext.8 d11, d4, d5, #4 // +1+stride
+ vadd.i16 d2, d2, d6 // -1, -stride
+ vadd.i16 d7, d7, d8 // 0, +stride
+ vadd.i16 d0, d0, d9 // -1-stride, +1-stride
+ vadd.i16 d2, d2, d7
+ vadd.i16 d4, d4, d11 // -1+stride, +1+stride
+ vadd.i16 d2, d2, d10 // +1
+ vadd.i16 d0, d0, d4
+
+ vext.8 q3, q8, q9, #4 // -stride
+ vshl.i16 d2, d2, #2
+ vext.8 q4, q8, q9, #8 // +1-stride
+ vext.8 q5, q10, q11, #4 // 0
+ vext.8 q6, q10, q11, #8 // +1
+ vmla.i16 d2, d0, d28 // * 3 -> a
+ vadd.i32 q3, q3, q10 // -stride, -1
+ vadd.i32 q8, q8, q4 // -1-stride, +1-stride
+ vadd.i32 q5, q5, q6 // 0, +1
+ vadd.i32 q8, q8, q12 // -1+stride
+ vadd.i32 q3, q3, q5
+ vext.8 q7, q12, q13, #4 // +stride
+ vext.8 q10, q12, q13, #8 // +1+stride
+.if \bpc == 8
+ vld1.32 {d24[0]}, [r1, :32]! // src
+.else
+ vld1.16 {d24}, [r1, :64]! // src
+.endif
+ vadd.i32 q3, q3, q7 // +stride
+ vadd.i32 q8, q8, q10 // +1+stride
+ vshl.i32 q3, q3, #2
+ vmla.i32 q3, q8, q15 // * 3 -> b
+.if \bpc == 8
+ vmovl.u8 q12, d24 // src
+.endif
+ vmov d0, d1
+ vmlal.u16 q3, d2, d24 // b + a * src
+ vmov d2, d3
+ vrshrn.i32 d6, q3, #9
+ vmov d4, d5
+ vst1.16 {d6}, [r0]!
+
+ ble 3f
+ vmov q8, q9
+ vmov q10, q11
+ vmov q12, q13
+ vld1.16 {d1}, [r9, :64]!
+ vld1.16 {d3}, [r4, :64]!
+ vld1.16 {d5}, [r10, :64]!
+ vld1.32 {q9}, [r7, :128]!
+ vld1.32 {q11}, [r3, :128]!
+ vld1.32 {q13}, [r8, :128]!
+ b 2b
+
+3:
+ subs r6, r6, #1
+ ble 0f
+ mov r5, lr
+ add r0, r0, r12, lsl #1
+ add r1, r1, r2
+ add r3, r3, r11, lsl #2
+ add r7, r7, r11, lsl #2
+ add r8, r8, r11, lsl #2
+ add r4, r4, r11, lsl #1
+ add r9, r9, r11, lsl #1
+ add r10, r10, r11, lsl #1
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
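+// Same structure as finish_filter1, but with the 5x5 weighting split over
+// row pairs: rows that line up with the box5 sums combine the elements
+// directly above and below with weight 6 and the four diagonals with weight
+// 5, giving roughly (A + B*src + (1 << 8)) >> 9; the in-between rows (the
+// loop at 4: below) weight a single row with 6 for the centre and 5 for
+// left/right, shifting by 8 instead.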
+function sgr_finish_filter2_\bpc\()bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldr r6, [sp, #108]
+ add r7, r3, #(4*(SUM_STRIDE))
+ sub r3, r3, #(4*(SUM_STRIDE))
+ add r8, r4, #(2*(SUM_STRIDE))
+ sub r4, r4, #(2*(SUM_STRIDE))
+ mov r9, #(2*SUM_STRIDE)
+ mov r10, #FILTER_OUT_STRIDE
+ add r11, r5, #7
+ bic r11, r11, #7 // Aligned width
+.if \bpc == 8
+ sub r2, r2, r11
+.else
+ sub r2, r2, r11, lsl #1
+.endif
+ sub r10, r10, r11
+ sub r9, r9, r11
+ sub r9, r9, #4 // We read 4 extra elements from a
+ sub r12, r9, #4 // We read 8 extra elements from b
+ mov lr, r5
+
+1:
+ vld1.16 {q0, q1}, [r4, :128]!
+ vld1.16 {q2, q3}, [r8, :128]!
+ vld1.32 {q8, q9}, [r3, :128]!
+ vld1.32 {q11, q12}, [r7, :128]!
+ vld1.32 {q10}, [r3, :128]!
+ vld1.32 {q13}, [r7, :128]!
+
+2:
+ vmov.i16 q14, #5
+ vmov.i16 q15, #6
+ subs r5, r5, #8
+ vext.8 q4, q0, q1, #4 // +1-stride
+ vext.8 q5, q2, q3, #4 // +1+stride
+ vext.8 q6, q0, q1, #2 // -stride
+ vext.8 q7, q2, q3, #2 // +stride
+ vadd.i16 q0, q0, q4 // -1-stride, +1-stride
+ vadd.i16 q5, q2, q5 // -1+stride, +1+stride
+ vadd.i16 q2, q6, q7 // -stride, +stride
+ vadd.i16 q0, q0, q5
+
+ vext.8 q4, q8, q9, #8 // +1-stride
+ vext.8 q5, q9, q10, #8
+ vext.8 q6, q11, q12, #8 // +1+stride
+ vext.8 q7, q12, q13, #8
+ vmul.i16 q0, q0, q14 // * 5
+ vmla.i16 q0, q2, q15 // * 6
+ vadd.i32 q4, q4, q8 // -1-stride, +1-stride
+ vadd.i32 q5, q5, q9
+ vadd.i32 q6, q6, q11 // -1+stride, +1+stride
+ vadd.i32 q7, q7, q12
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q7
+ vext.8 q6, q8, q9, #4 // -stride
+ vext.8 q7, q9, q10, #4
+ vext.8 q8, q11, q12, #4 // +stride
+ vext.8 q11, q12, q13, #4
+
+.if \bpc == 8
+ vld1.8 {d4}, [r1, :64]!
+.else
+ vld1.8 {q2}, [r1, :128]!
+.endif
+
+ vmov.i32 q14, #5
+ vmov.i32 q15, #6
+
+ vadd.i32 q6, q6, q8 // -stride, +stride
+ vadd.i32 q7, q7, q11
+ vmul.i32 q4, q4, q14 // * 5
+ vmla.i32 q4, q6, q15 // * 6
+ vmul.i32 q5, q5, q14 // * 5
+ vmla.i32 q5, q7, q15 // * 6
+
+.if \bpc == 8
+ vmovl.u8 q2, d4
+.endif
+ vmlal.u16 q4, d0, d4 // b + a * src
+ vmlal.u16 q5, d1, d5 // b + a * src
+ vmov q0, q1
+ vrshrn.i32 d8, q4, #9
+ vrshrn.i32 d9, q5, #9
+ vmov q2, q3
+ vst1.16 {q4}, [r0, :128]!
+
+ ble 3f
+ vmov q8, q10
+ vmov q11, q13
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q3}, [r8, :128]!
+ vld1.32 {q9, q10}, [r3, :128]!
+ vld1.32 {q12, q13}, [r7, :128]!
+ b 2b
+
+3:
+ subs r6, r6, #1
+ ble 0f
+ mov r5, lr
+ add r0, r0, r10, lsl #1
+ add r1, r1, r2
+ add r3, r3, r9, lsl #2
+ add r7, r7, r9, lsl #2
+ add r4, r4, r12, lsl #1
+ add r8, r8, r12, lsl #1
+
+ vld1.32 {q8, q9}, [r3, :128]!
+ vld1.16 {q0, q1}, [r4, :128]!
+ vld1.32 {q10}, [r3, :128]!
+
+ vmov.i16 q12, #5
+ vmov.i16 q13, #6
+
+4:
+ subs r5, r5, #8
+ vext.8 q3, q0, q1, #4 // +1
+ vext.8 q2, q0, q1, #2 // 0
+ vadd.i16 q0, q0, q3 // -1, +1
+
+ vext.8 q4, q8, q9, #4 // 0
+ vext.8 q5, q9, q10, #4
+ vext.8 q6, q8, q9, #8 // +1
+ vext.8 q7, q9, q10, #8
+ vmul.i16 q2, q2, q13 // * 6
+ vmla.i16 q2, q0, q12 // * 5 -> a
+.if \bpc == 8
+ vld1.8 {d22}, [r1, :64]!
+.else
+ vld1.16 {q11}, [r1, :128]!
+.endif
+ vadd.i32 q8, q8, q6 // -1, +1
+ vadd.i32 q9, q9, q7
+.if \bpc == 8
+ vmovl.u8 q11, d22
+.endif
+ vmul.i32 q4, q4, q15 // * 6
+ vmla.i32 q4, q8, q14 // * 5 -> b
+ vmul.i32 q5, q5, q15 // * 6
+ vmla.i32 q5, q9, q14 // * 5 -> b
+
+ vmlal.u16 q4, d4, d22 // b + a * src
+ vmlal.u16 q5, d5, d23
+ vmov q0, q1
+ vrshrn.i32 d8, q4, #8
+ vrshrn.i32 d9, q5, #8
+ vmov q8, q10
+ vst1.16 {q4}, [r0, :128]!
+
+ ble 5f
+ vld1.16 {q1}, [r4, :128]!
+ vld1.32 {q9, q10}, [r3, :128]!
+ b 4b
+
+5:
+ subs r6, r6, #1
+ ble 0f
+ mov r5, lr
+ sub r3, r3, r11, lsl #2 // Rewind r3/r4 to where they started
+ sub r4, r4, r11, lsl #1
+ add r0, r0, r10, lsl #1
+ add r1, r1, r2
+ sub r3, r3, #16
+ sub r4, r4, #16
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int w, const int h,
+// const int wt, const int bitdepth_max);
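+// Roughly, per pixel, with u = src << 4:
+//   dst = clip(((u << 7) + wt*(t1 - u) + (1 << 10)) >> 11)
+// i.e. a weighted blend of the source and the filtered plane t1; two rows
+// are processed per iteration, with a single-row tail at 2: below.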
+function sgr_weighted1_\bpc\()bpc_neon, export=1
+ push {r4-r9,lr}
+ ldrd r4, r5, [sp, #28]
+ ldrd r6, r7, [sp, #36]
+.if \bpc == 16
+ ldr r8, [sp, #44]
+.endif
+ vdup.16 d31, r7
+ cmp r6, #2
+.if \bpc == 16
+ vdup.16 q14, r8
+.endif
+ add r9, r0, r1
+ add r12, r2, r3
+ add lr, r4, #2*FILTER_OUT_STRIDE
+ mov r7, #(4*FILTER_OUT_STRIDE)
+ lsl r1, r1, #1
+ lsl r3, r3, #1
+ add r8, r5, #7
+ bic r8, r8, #7 // Aligned width
+.if \bpc == 8
+ sub r1, r1, r8
+ sub r3, r3, r8
+.else
+ sub r1, r1, r8, lsl #1
+ sub r3, r3, r8, lsl #1
+.endif
+ sub r7, r7, r8, lsl #1
+ mov r8, r5
+ blt 2f
+1:
+.if \bpc == 8
+ vld1.8 {d0}, [r2, :64]!
+ vld1.8 {d16}, [r12, :64]!
+.else
+ vld1.16 {q0}, [r2, :128]!
+ vld1.16 {q8}, [r12, :128]!
+.endif
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q9}, [lr, :128]!
+ subs r5, r5, #8
+.if \bpc == 8
+ vshll.u8 q0, d0, #4 // u
+ vshll.u8 q8, d16, #4 // u
+.else
+ vshl.i16 q0, q0, #4 // u
+ vshl.i16 q8, q8, #4 // u
+.endif
+ vsub.i16 q1, q1, q0 // t1 - u
+ vsub.i16 q9, q9, q8 // t1 - u
+ vshll.u16 q2, d0, #7 // u << 7
+ vshll.u16 q3, d1, #7 // u << 7
+ vshll.u16 q10, d16, #7 // u << 7
+ vshll.u16 q11, d17, #7 // u << 7
+ vmlal.s16 q2, d2, d31 // v
+ vmlal.s16 q3, d3, d31 // v
+ vmlal.s16 q10, d18, d31 // v
+ vmlal.s16 q11, d19, d31 // v
+.if \bpc == 8
+ vrshrn.i32 d4, q2, #11
+ vrshrn.i32 d5, q3, #11
+ vrshrn.i32 d20, q10, #11
+ vrshrn.i32 d21, q11, #11
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d20, q10
+ vst1.8 {d4}, [r0, :64]!
+ vst1.8 {d20}, [r9, :64]!
+.else
+ vqrshrun.s32 d4, q2, #11
+ vqrshrun.s32 d5, q3, #11
+ vqrshrun.s32 d20, q10, #11
+ vqrshrun.s32 d21, q11, #11
+ vmin.u16 q2, q2, q14
+ vmin.u16 q10, q10, q14
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q10}, [r9, :128]!
+.endif
+ bgt 1b
+
+ sub r6, r6, #2
+ cmp r6, #1
+ blt 0f
+ mov r5, r8
+ add r0, r0, r1
+ add r9, r9, r1
+ add r2, r2, r3
+ add r12, r12, r3
+ add r4, r4, r7
+ add lr, lr, r7
+ beq 2f
+ b 1b
+
+2:
+.if \bpc == 8
+ vld1.8 {d0}, [r2, :64]!
+.else
+ vld1.16 {q0}, [r2, :128]!
+.endif
+ vld1.16 {q1}, [r4, :128]!
+ subs r5, r5, #8
+.if \bpc == 8
+ vshll.u8 q0, d0, #4 // u
+.else
+ vshl.i16 q0, q0, #4 // u
+.endif
+ vsub.i16 q1, q1, q0 // t1 - u
+ vshll.u16 q2, d0, #7 // u << 7
+ vshll.u16 q3, d1, #7 // u << 7
+ vmlal.s16 q2, d2, d31 // v
+ vmlal.s16 q3, d3, d31 // v
+.if \bpc == 8
+ vrshrn.i32 d4, q2, #11
+ vrshrn.i32 d5, q3, #11
+ vqmovun.s16 d2, q2
+ vst1.8 {d2}, [r0, :64]!
+.else
+ vqrshrun.s32 d4, q2, #11
+ vqrshrun.s32 d5, q3, #11
+ vmin.u16 q2, q2, q14
+ vst1.16 {q2}, [r0, :128]!
+.endif
+ bgt 2b
+0:
+ pop {r4-r9,pc}
+endfunc
+
+// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int16_t *t2,
+// const int w, const int h,
+// const int16_t wt[2], const int bitdepth_max);
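+// As sgr_weighted1 above, but blending two filtered planes; roughly:
+//   dst = clip(((u << 7) + wt[0]*(t1 - u) + wt[1]*(t2 - u) + (1 << 10)) >> 11)
+// with u = src << 4.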
+function sgr_weighted2_\bpc\()bpc_neon, export=1
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+.if \bpc == 8
+ ldr r8, [sp, #52]
+.else
+ ldrd r8, r9, [sp, #52]
+.endif
+ cmp r7, #2
+ add r10, r0, r1
+ add r11, r2, r3
+ add r12, r4, #2*FILTER_OUT_STRIDE
+ add lr, r5, #2*FILTER_OUT_STRIDE
+ vld2.16 {d30[], d31[]}, [r8] // wt[0], wt[1]
+.if \bpc == 16
+ vdup.16 q14, r9
+.endif
+ mov r8, #4*FILTER_OUT_STRIDE
+ lsl r1, r1, #1
+ lsl r3, r3, #1
+ add r9, r6, #7
+ bic r9, r9, #7 // Aligned width
+.if \bpc == 8
+ sub r1, r1, r9
+ sub r3, r3, r9
+.else
+ sub r1, r1, r9, lsl #1
+ sub r3, r3, r9, lsl #1
+.endif
+ sub r8, r8, r9, lsl #1
+ mov r9, r6
+ blt 2f
+1:
+.if \bpc == 8
+ vld1.8 {d0}, [r2, :64]!
+ vld1.8 {d16}, [r11, :64]!
+.else
+ vld1.16 {q0}, [r2, :128]!
+ vld1.16 {q8}, [r11, :128]!
+.endif
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q9}, [r12, :128]!
+ vld1.16 {q2}, [r5, :128]!
+ vld1.16 {q10}, [lr, :128]!
+ subs r6, r6, #8
+.if \bpc == 8
+ vshll.u8 q0, d0, #4 // u
+ vshll.u8 q8, d16, #4 // u
+.else
+ vshl.i16 q0, q0, #4 // u
+ vshl.i16 q8, q8, #4 // u
+.endif
+ vsub.i16 q1, q1, q0 // t1 - u
+ vsub.i16 q2, q2, q0 // t2 - u
+ vsub.i16 q9, q9, q8 // t1 - u
+ vsub.i16 q10, q10, q8 // t2 - u
+ vshll.u16 q3, d0, #7 // u << 7
+ vshll.u16 q0, d1, #7 // u << 7
+ vshll.u16 q11, d16, #7 // u << 7
+ vshll.u16 q8, d17, #7 // u << 7
+ vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q11, d18, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q11, d20, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q8, d19, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q8, d21, d31 // wt[1] * (t2 - u)
+.if \bpc == 8
+ vrshrn.i32 d6, q3, #11
+ vrshrn.i32 d7, q0, #11
+ vrshrn.i32 d22, q11, #11
+ vrshrn.i32 d23, q8, #11
+ vqmovun.s16 d6, q3
+ vqmovun.s16 d22, q11
+ vst1.8 {d6}, [r0, :64]!
+ vst1.8 {d22}, [r10, :64]!
+.else
+ vqrshrun.s32 d6, q3, #11
+ vqrshrun.s32 d7, q0, #11
+ vqrshrun.s32 d22, q11, #11
+ vqrshrun.s32 d23, q8, #11
+ vmin.u16 q3, q3, q14
+ vmin.u16 q11, q11, q14
+ vst1.16 {q3}, [r0, :128]!
+ vst1.16 {q11}, [r10, :128]!
+.endif
+ bgt 1b
+
+ subs r7, r7, #2
+ cmp r7, #1
+ blt 0f
+ mov r6, r9
+ add r0, r0, r1
+ add r10, r10, r1
+ add r2, r2, r3
+ add r11, r11, r3
+ add r4, r4, r8
+ add r12, r12, r8
+ add r5, r5, r8
+ add lr, lr, r8
+ beq 2f
+ b 1b
+
+2:
+.if \bpc == 8
+ vld1.8 {d0}, [r2, :64]!
+.else
+ vld1.16 {q0}, [r2, :128]!
+.endif
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q2}, [r5, :128]!
+ subs r6, r6, #8
+.if \bpc == 8
+ vshll.u8 q0, d0, #4 // u
+.else
+ vshl.i16 q0, q0, #4 // u
+.endif
+ vsub.i16 q1, q1, q0 // t1 - u
+ vsub.i16 q2, q2, q0 // t2 - u
+ vshll.u16 q3, d0, #7 // u << 7
+ vshll.u16 q0, d1, #7 // u << 7
+ vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u)
+.if \bpc == 8
+ vrshrn.i32 d6, q3, #11
+ vrshrn.i32 d7, q0, #11
+ vqmovun.s16 d6, q3
+ vst1.8 {d6}, [r0, :64]!
+.else
+ vqrshrun.s32 d6, q3, #11
+ vqrshrun.s32 d7, q0, #11
+ vmin.u16 q3, q3, q14
+ vst1.16 {q3}, [r0, :128]!
+.endif
+ bgt 2b
+0:
+ pop {r4-r11,pc}
+endfunc
+.endm
diff --git a/third_party/dav1d/src/arm/32/mc.S b/third_party/dav1d/src/arm/32/mc.S
new file mode 100644
index 0000000000..1b60a7bdb3
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/mc.S
@@ -0,0 +1,3340 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
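+// The three compositing macros below combine two 16-bit intermediates (as
+// produced by the prep functions, i.e. pixel values scaled by 1 << 4).
+// Roughly, per pixel:
+//   avg:   dst = clip8((tmp1 + tmp2 + 16) >> 5)
+//   w_avg: dst = clip8((weight*tmp1 + (16 - weight)*tmp2 + 128) >> 8),
+//          folded into tmp2 + (tmp1 - tmp2)*weight/16 via vqdmulh
+//   mask:  dst = clip8((m*tmp1 + (64 - m)*tmp2 + 512) >> 10), m in [0, 64],
+//          using the same vqdmulh trick with the mask scaled to -m << 9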
+.macro avg dst0, dst1, t0, t1, t2, t3
+ vld1.16 {\t0,\t1}, [r2, :128]!
+ vld1.16 {\t2,\t3}, [r3, :128]!
+ vadd.i16 \t0, \t0, \t2
+ vadd.i16 \t1, \t1, \t3
+ vqrshrun.s16 \dst0, \t0, #5
+ vqrshrun.s16 \dst1, \t1, #5
+.endm
+
+.macro w_avg dst0, dst1, t0, t1, t2, t3
+ vld1.16 {\t0,\t1}, [r2, :128]!
+ vld1.16 {\t2,\t3}, [r3, :128]!
+ vsub.i16 \t0, \t2, \t0
+ vsub.i16 \t1, \t3, \t1
+ vqdmulh.s16 \t0, \t0, q15
+ vqdmulh.s16 \t1, \t1, q15
+ vadd.i16 \t0, \t2, \t0
+ vadd.i16 \t1, \t3, \t1
+ vqrshrun.s16 \dst0, \t0, #4
+ vqrshrun.s16 \dst1, \t1, #4
+.endm
+
+.macro mask dst0, dst1, t0, t1, t2, t3
+ vld1.8 {q14}, [lr, :128]!
+ vld1.16 {\t0,\t1}, [r2, :128]!
+ vmul.i8 q14, q14, q15
+ vld1.16 {\t2,\t3}, [r3, :128]!
+ vshll.i8 q13, d28, #8
+ vshll.i8 q14, d29, #8
+ vsub.i16 \t0, \t2, \t0
+ vsub.i16 \t1, \t3, \t1
+ vqdmulh.s16 \t0, \t0, q13
+ vqdmulh.s16 \t1, \t1, q14
+ vadd.i16 \t0, \t2, \t0
+ vadd.i16 \t1, \t3, \t1
+ vqrshrun.s16 \dst0, \t0, #4
+ vqrshrun.s16 \dst1, \t1, #4
+.endm
+
+.macro bidir_fn type
+function \type\()_8bpc_neon, export=1
+ push {r4-r6,lr}
+ ldrd r4, r5, [sp, #16]
+ clz r4, r4
+.ifnc \type, avg
+ ldr lr, [sp, #24]
+.endif
+.ifc \type, w_avg
+ vdup.s16 q15, lr
+ vneg.s16 q15, q15
+ vshl.i16 q15, q15, #11
+.endif
+.ifc \type, mask
+ vmov.i8 q15, #256-2
+.endif
+ adr r12, L(\type\()_tbl)
+ sub r4, r4, #24
+ ldr r4, [r12, r4, lsl #2]
+ \type d16, d17, q0, q1, q2, q3
+ add r12, r12, r4
+ bx r12
+
+ .align 2
+L(\type\()_tbl):
+ .word 1280f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 4f - L(\type\()_tbl) + CONFIG_THUMB
+
+4:
+ add r6, r0, r1
+ lsl r1, r1, #1
+ cmp r5, #4
+ vst1.32 {d16[0]}, [r0, :32], r1
+ vst1.32 {d16[1]}, [r6, :32], r1
+ vst1.32 {d17[0]}, [r0, :32], r1
+ vst1.32 {d17[1]}, [r6, :32], r1
+ beq 0f
+ \type d18, d19, q0, q1, q2, q3
+ cmp r5, #8
+ vst1.32 {d18[0]}, [r0, :32], r1
+ vst1.32 {d18[1]}, [r6, :32], r1
+ vst1.32 {d19[0]}, [r0, :32], r1
+ vst1.32 {d19[1]}, [r6, :32], r1
+ beq 0f
+ \type d16, d17, q0, q1, q2, q3
+ vst1.32 {d16[0]}, [r0, :32], r1
+ vst1.32 {d16[1]}, [r6, :32], r1
+ \type d18, d19, q0, q1, q2, q3
+ vst1.32 {d17[0]}, [r0, :32], r1
+ vst1.32 {d17[1]}, [r6, :32], r1
+ vst1.32 {d18[0]}, [r0, :32], r1
+ vst1.32 {d18[1]}, [r6, :32], r1
+ vst1.32 {d19[0]}, [r0, :32], r1
+ vst1.32 {d19[1]}, [r6, :32], r1
+ pop {r4-r6,pc}
+80:
+ add r6, r0, r1
+ lsl r1, r1, #1
+8:
+ vst1.8 {d16}, [r0, :64], r1
+ \type d18, d19, q0, q1, q2, q3
+ vst1.8 {d17}, [r6, :64], r1
+ vst1.8 {d18}, [r0, :64], r1
+ subs r5, r5, #4
+ vst1.8 {d19}, [r6, :64], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 8b
+160:
+ add r6, r0, r1
+ lsl r1, r1, #1
+16:
+ \type d18, d19, q0, q1, q2, q3
+ vst1.8 {q8}, [r0, :128], r1
+ \type d20, d21, q0, q1, q2, q3
+ vst1.8 {q9}, [r6, :128], r1
+ \type d22, d23, q0, q1, q2, q3
+ vst1.8 {q10}, [r0, :128], r1
+ subs r5, r5, #4
+ vst1.8 {q11}, [r6, :128], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 16b
+320:
+ add r6, r0, r1
+ lsl r1, r1, #1
+32:
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r0, :128], r1
+ \type d22, d23, q0, q1, q2, q3
+ subs r5, r5, #2
+ vst1.8 {q10, q11}, [r6, :128], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 32b
+640:
+ add r6, r0, #32
+64:
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ \type d22, d23, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r0, :128], r1
+ \type d16, d17, q0, q1, q2, q3
+ vst1.8 {q10, q11}, [r6, :128], r1
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r0, :128], r1
+ \type d22, d23, q0, q1, q2, q3
+ subs r5, r5, #2
+ vst1.8 {q10, q11}, [r6, :128], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 64b
+1280:
+ sub r1, r1, #32
+ add r6, r0, #64
+128:
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ \type d22, d23, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r0, :128]!
+ \type d16, d17, q0, q1, q2, q3
+ vst1.8 {q10, q11}, [r0, :128], r1
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r6, :128]!
+ \type d22, d23, q0, q1, q2, q3
+ subs r5, r5, #1
+ vst1.8 {q10, q11}, [r6, :128], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 128b
+
+0:
+ pop {r4-r6,pc}
+endfunc
+.endm
+
+bidir_fn avg
+bidir_fn w_avg
+bidir_fn mask
+
+
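+// w_mask blends the two intermediates like the mask macro above, but derives
+// the per-pixel weight from their difference and also stores the mask:
+//   64 - m = max(6903 - abs(tmp1 - tmp2), 0) >> 8   (so m is roughly
+//            38 + abs(diff)/256, capped at 64)
+//   dst    = clip8((m*tmp1 + (64 - m)*tmp2 + 512) >> 10)
+// The 444 variant stores m per pixel, 422 sums pairs of columns and 420 sums
+// 2x2 blocks (with the sign argument folded in) before storing.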
+.macro w_mask_fn type
+function w_mask_\type\()_8bpc_neon, export=1
+ push {r4-r9,lr}
+ ldrd r4, r5, [sp, #28]
+ ldrd r6, r7, [sp, #36]
+ clz r8, r4
+ adr r9, L(w_mask_\type\()_tbl)
+ sub r8, r8, #24
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ movw r12, #6903
+ vdup.16 q14, r12
+.if \type == 444
+ vmov.i8 q15, #64
+.elseif \type == 422
+ vdup.8 d0, r7 // d0[] <- sign
+ vmov.i8 d30, #129
+ vsub.i8 d30, d30, d0 // 129 - sign
+.elseif \type == 420
+ vdup.16 q0, r7 // d0[] <- sign
+ vmov.i16 q15, #256
+ vsub.i16 q15, q15, q0 // 256 - sign
+.endif
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r9
+
+ .align 2
+L(w_mask_\type\()_tbl):
+ .word 1280f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 640f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 320f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 160f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 8f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 4f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+
+4:
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1 (four rows at once)
+ vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2 (four rows at once)
+ subs r5, r5, #4
+ vsub.i16 q8, q2, q0 // tmp2-tmp1
+ vsub.i16 q9, q3, q1
+ vabd.s16 q10, q0, q2 // (abs(tmp1[x] - tmp2[x]))
+ vabd.s16 q11, q1, q3
+ vqsub.u16 q10, q14, q10 // 6903 - abs ()
+ vqsub.u16 q11, q14, q11
+ vshr.s16 q10, q10, #8 // 64-m = (6903 - abs()) >> 8
+ vshr.s16 q11, q11, #8
+ vshl.s16 q12, q10, #9 // (64-m)<<9
+ vshl.s16 q13, q11, #9
+ vqdmulh.s16 q12, q12, q8 // ((tmp2-tmp1)*(64-m)<<9)>>15
+ vqdmulh.s16 q13, q13, q9
+ vadd.i16 q12, q12, q0 // (((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1
+ vadd.i16 q13, q13, q1
+ vqrshrun.s16 d24, q12, #4 // (((((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1) + 8) >> 4
+ vqrshrun.s16 d25, q13, #4
+.if \type == 444
+ vmovn.u16 d20, q10 // 64 - m
+ vmovn.u16 d21, q11
+ vsub.i8 q10, q15, q10 // m
+ vst1.8 {d20, d21}, [r6, :128]!
+.elseif \type == 422
+ vpadd.s16 d20, d20, d21 // (64 - m) + (64 - n) (column wise addition)
+ vpadd.s16 d21, d22, d23
+ vmovn.s16 d6, q10
+ vhsub.u8 d6, d30, d6 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+ vst1.8 {d6}, [r6, :64]!
+.elseif \type == 420
+ vadd.s16 d20, d20, d21 // (64 - my1) + (64 - my2) (row wise addition)
+ vadd.s16 d21, d22, d23
+ vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
+ vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.32 {d20[0]}, [r6, :32]!
+.endif
+ vst1.32 {d24[0]}, [r0, :32], r1
+ vst1.32 {d24[1]}, [r12, :32], r1
+ vst1.32 {d25[0]}, [r0, :32], r1
+ vst1.32 {d25[1]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r9,pc}
+8:
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1, tmp1y2
+ vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2y1, tmp2y2
+ subs r5, r5, #2
+ vsub.i16 q8, q2, q0 // tmp2y1 - tmp1y1
+ vsub.i16 q9, q3, q1 // tmp2y2 - tmp1y2
+ vabd.s16 q10, q0, q2 // abs(tmp1y1 - tmp2y1)
+ vabd.s16 q11, q1, q3 // abs(tmp1y2 - tmp2y2)
+ vqsub.u16 q10, q14, q10 // 6903 - abs(tmp1y1 - tmp2y1)
+ vqsub.u16 q11, q14, q11 // 6903 - abs(tmp1y2 - tmp2y2)
+ vshr.s16 q10, q10, #8 // 64 - my1 = 6903 - abs(tmp1y1 - tmp2y1) >> 8
+ vshr.s16 q11, q11, #8 // 64 - my2 = 6903 - abs(tmp1y2 - tmp2y2) >> 8
+ vshl.s16 q12, q10, #9 // (64 - my1) << 9
+ vshl.s16 q13, q11, #9 // (64 - my2) << 9
+ vqdmulh.s16 q12, q12, q8 // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15
+ vqdmulh.s16 q13, q13, q9 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15
+ vadd.s16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1
+ vadd.s16 q13, q13, q1 // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2
+ vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4
+ vqrshrun.s16 d25, q13, #4 // (((((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4
+.if \type == 444
+ vmovn.u16 d20, q10 // 64 - m
+ vmovn.u16 d21, q11
+ vsub.i8 q10, q15, q10 // m
+ vst1.8 {d20, d21}, [r6, :128]!
+.elseif \type == 422
+ vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition)
+ vpadd.s16 d21, d22, d23 // (64 - my2) + (64 - ny2)
+ vmovn.s16 d20, q10
+ vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1/y2) + (64 - ny1/y2))) >> 1
+ vst1.8 {d20}, [r6, :64]!
+.elseif \type == 420
+ vadd.s16 q10, q10, q11 // (64 - my1) + (64 - my2) (row wise addition)
+ vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
+ vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.32 {d20[0]}, [r6, :32]!
+.endif
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d25}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r9,pc}
+1280:
+640:
+320:
+160:
+ sub r1, r1, r4
+.if \type == 444
+ add lr, r6, r4
+.elseif \type == 422
+ add lr, r6, r4, lsr #1
+.endif
+ add r9, r3, r4, lsl #1
+ add r7, r2, r4, lsl #1
+161:
+ mov r8, r4
+16:
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1
+ vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2y1
+ vld1.16 {d16, d17, d18, d19}, [r7, :128]! // tmp1y2
+ subs r8, r8, #16
+ vsub.i16 q2, q2, q0 // tmp2y1 - tmp1y1
+ vsub.i16 q3, q3, q1
+ vabs.s16 q10, q2 // abs(tmp2y1 - tmp1y1)
+ vabs.s16 q11, q3
+ vqsub.u16 q10, q14, q10 // 6903 - abs(tmp1y1 - tmp2y1)
+ vqsub.u16 q11, q14, q11
+ vshr.s16 q10, q10, #8 // 64 - my1 = 6903 - abs(tmp1y1 - tmp2y1) >> 8
+ vshr.s16 q11, q11, #8
+ vshl.s16 q12, q10, #9 // (64 - my1) << 9
+ vshl.s16 q13, q11, #9
+ vqdmulh.s16 q12, q12, q2 // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15
+ vqdmulh.s16 q13, q13, q3
+ vadd.i16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1
+ vadd.i16 q13, q13, q1
+ vld1.16 {d0, d1, d2, d3}, [r9, :128]! // tmp2y2
+.if \type == 444
+ vmovn.u16 d20, q10 // 64 - my1
+ vmovn.u16 d21, q11
+ vsub.i8 q10, q15, q10 // my1
+ vst1.8 {d20, d21}, [r6, :128]!
+.elseif \type == 422
+ vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition)
+ vpadd.s16 d21, d22, d23
+ vmovn.s16 d20, q10
+ vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1) + (64 - ny1))) >> 1
+ vst1.8 {d20}, [r6, :64]!
+.endif
+ vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1)*(64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4
+ vqrshrun.s16 d25, q13, #4
+ vsub.i16 q0, q0, q8 // tmp2y2 - tmp1y2
+ vsub.i16 q1, q1, q9
+ vst1.16 {d24, d25}, [r0, :128]! // store dsty1
+ vabs.s16 q2, q0 // abs(tmp2y2 - tmp1y2)
+ vabs.s16 q3, q1
+ vqsub.u16 q2, q14, q2 // 6903 - abs(tmp2y2 - tmp1y2)
+ vqsub.u16 q3, q14, q3
+ vshr.s16 q2, q2, #8 // (6903 - abs(tmp2y2 - tmp1y2)) >> 8
+ vshr.s16 q3, q3, #8
+ vshl.s16 q12, q2, #9 // (64 - my2) << 9
+ vshl.s16 q13, q3, #9
+.if \type == 444
+ vmovn.u16 d4, q2 // 64 - my2
+ vmovn.u16 d5, q3
+ vsub.i8 q2, q15, q2 // my2
+ vst1.8 {d4, d5}, [lr, :128]!
+.elseif \type == 422
+ vpadd.s16 d4, d4, d5 // (64 - my2) + (64 - ny2) (column wise addition)
+ vpadd.s16 d5, d6, d7
+ vmovn.s16 d4, q2
+ vhsub.u8 d4, d30, d4 // ((129 - sign) - ((64 - my2) + (64 - ny2))) >> 1
+ vst1.8 {d4}, [lr, :64]!
+.elseif \type == 420
+ vadd.s16 q10, q10, q2 // (64 - my1) + (64 - my2) (row wise addition)
+ vadd.s16 q11, q11, q3
+ vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
+ vpadd.s16 d21, d22, d23
+ vsub.s16 q10, q15, q10 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.8 {d20}, [r6, :64]!
+.endif
+ vqdmulh.s16 q12, q12, q0 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15
+ vqdmulh.s16 q13, q13, q1
+ vadd.i16 q12, q12, q8 // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2
+ vadd.i16 q13, q13, q9
+ vqrshrun.s16 d24, q12, #4 // (((((tmp2y2 - tmp1y2)*(64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4
+ vqrshrun.s16 d25, q13, #4
+ vst1.16 {d24, d25}, [r12, :128]! // store dsty2
+ bgt 16b
+ subs r5, r5, #2
+ add r2, r2, r4, lsl #1
+ add r3, r3, r4, lsl #1
+ add r7, r7, r4, lsl #1
+ add r9, r9, r4, lsl #1
+.if \type == 444
+ add r6, r6, r4
+ add lr, lr, r4
+.elseif \type == 422
+ add r6, r6, r4, lsr #1
+ add lr, lr, r4, lsr #1
+.endif
+ add r0, r0, r1
+ add r12, r12, r1
+ bgt 161b
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+w_mask_fn 444
+w_mask_fn 422
+w_mask_fn 420
+
+
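+// blend: dst[x] = (tmp[x]*mask[x] + dst[x]*(64 - mask[x]) + 32) >> 6, with a
+// per-pixel mask.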
+function blend_8bpc_neon, export=1
+ push {r4-r5,lr}
+ ldrd r4, r5, [sp, #12]
+ clz lr, r3
+ adr r3, L(blend_tbl)
+ sub lr, lr, #26
+ ldr lr, [r3, lr, lsl #2]
+ add r3, r3, lr
+ bx r3
+
+ .align 2
+L(blend_tbl):
+ .word 320f - L(blend_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_tbl) + CONFIG_THUMB
+
+40:
+ vmov.i8 d22, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+4:
+ vld1.u8 {d2}, [r5, :64]!
+ vld1.u8 {d1}, [r2, :64]!
+ vld1.32 {d0[]}, [r0, :32]
+ subs r4, r4, #2
+ vld1.32 {d0[1]}, [r12, :32]
+ vsub.i8 d3, d22, d2
+ vmull.u8 q8, d1, d2
+ vmlal.u8 q8, d0, d3
+ vrshrn.i16 d20, q8, #6
+ vst1.32 {d20[0]}, [r0, :32], r1
+ vst1.32 {d20[1]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r5,pc}
+80:
+ vmov.i8 d16, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+8:
+ vld1.u8 {q1}, [r5, :128]!
+ vld1.u8 {q2}, [r2, :128]!
+ vld1.u8 {d0}, [r0, :64]
+ vsub.i8 d17, d16, d2
+ vld1.u8 {d1}, [r12, :64]
+ subs r4, r4, #2
+ vsub.i8 d18, d16, d3
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d17
+ vmull.u8 q10, d3, d5
+ vmlal.u8 q10, d1, d18
+ vrshrn.i16 d22, q3, #6
+ vrshrn.i16 d23, q10, #6
+ vst1.u8 {d22}, [r0, :64], r1
+ vst1.u8 {d23}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r5,pc}
+160:
+ vmov.i8 q12, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+16:
+ vld1.u8 {q1, q2}, [r5, :128]!
+ vld1.u8 {q8, q9}, [r2, :128]!
+ vld1.u8 {q0}, [r0, :128]
+ subs r4, r4, #2
+ vsub.i8 q15, q12, q1
+ vld1.u8 {q13}, [r12, :128]
+ vmull.u8 q3, d16, d2
+ vmlal.u8 q3, d0, d30
+ vmull.u8 q14, d17, d3
+ vmlal.u8 q14, d1, d31
+ vsub.i8 q15, q12, q2
+ vrshrn.i16 d20, q3, #6
+ vrshrn.i16 d21, q14, #6
+ vmull.u8 q3, d18, d4
+ vmlal.u8 q3, d26, d30
+ vmull.u8 q14, d19, d5
+ vmlal.u8 q14, d27, d31
+ vrshrn.i16 d22, q3, #6
+ vrshrn.i16 d23, q14, #6
+ vst1.u8 {q10}, [r0, :128], r1
+ vst1.u8 {q11}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5,pc}
+320:
+ vmov.i8 q10, #64
+32:
+ vld1.u8 {q2, q3}, [r5, :128]!
+ vld1.u8 {q8, q9}, [r2, :128]!
+ vld1.u8 {q0, q1}, [r0, :128]
+ subs r4, r4, #1
+ vsub.i8 q11, q10, q2
+ vmull.u8 q15, d16, d4
+ vmlal.u8 q15, d0, d22
+ vmull.u8 q14, d17, d5
+ vmlal.u8 q14, d1, d23
+ vsub.i8 q11, q10, q3
+ vrshrn.i16 d24, q15, #6
+ vrshrn.i16 d25, q14, #6
+ vmull.u8 q15, d18, d6
+ vmlal.u8 q15, d2, d22
+ vmull.u8 q14, d19, d7
+ vmlal.u8 q14, d3, d23
+ vrshrn.i16 d26, q15, #6
+ vrshrn.i16 d27, q14, #6
+ vst1.u8 {q12, q13}, [r0, :128], r1
+ bgt 32b
+ pop {r4-r5,pc}
+endfunc
+
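+// blend_h: same arithmetic as blend above, but with one mask value per row
+// (taken from obmc_masks + h) and only the first h - h/4 rows written.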
+function blend_h_8bpc_neon, export=1
+ push {r4-r5,lr}
+ ldr r4, [sp, #12]
+ movrel r5, X(obmc_masks)
+ add r5, r5, r4
+ sub r4, r4, r4, lsr #2
+ clz lr, r3
+ adr r12, L(blend_h_tbl)
+ sub lr, lr, #24
+ ldr lr, [r12, lr, lsl #2]
+ add r12, r12, lr
+ bx r12
+
+ .align 2
+L(blend_h_tbl):
+ .word 1280f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 640f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 320f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 20f - L(blend_h_tbl) + CONFIG_THUMB
+
+20:
+ vmov.i8 d22, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+2:
+ vld1.16 {d2[], d3[]}, [r5, :16]!
+ vld1.32 {d1[]}, [r2, :32]!
+ subs r4, r4, #2
+ vld1.16 {d0[]}, [r0, :16]
+ vzip.8 d2, d3
+ vsub.i8 d4, d22, d2
+ vld1.16 {d0[1]}, [r12, :16]
+ vmull.u8 q8, d1, d2
+ vmlal.u8 q8, d0, d4
+ vrshrn.i16 d20, q8, #6
+ vst1.16 {d20[0]}, [r0, :16], r1
+ vst1.16 {d20[1]}, [r12, :16], r1
+ bgt 2b
+ pop {r4-r5,pc}
+40:
+ vmov.i8 d22, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+4:
+ vld2.u8 {d2[], d3[]}, [r5, :16]!
+ vld1.u8 {d1}, [r2, :64]!
+ subs r4, r4, #2
+ vext.u8 d2, d2, d3, #4
+ vld1.32 {d0[]}, [r0, :32]
+ vsub.i8 d6, d22, d2
+ vld1.32 {d0[1]}, [r12, :32]
+ vmull.u8 q8, d1, d2
+ vmlal.u8 q8, d0, d6
+ vrshrn.i16 d20, q8, #6
+ vst1.32 {d20[0]}, [r0, :32], r1
+ vst1.32 {d20[1]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r5,pc}
+80:
+ vmov.i8 q8, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+8:
+ vld2.u8 {d2[], d3[]}, [r5, :16]!
+ vld1.u8 {d4, d5}, [r2, :128]!
+ vld1.u8 {d0}, [r0, :64]
+ vsub.i8 q9, q8, q1
+ vld1.u8 {d1}, [r12, :64]
+ subs r4, r4, #2
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d18
+ vmull.u8 q10, d3, d5
+ vmlal.u8 q10, d1, d19
+ vrshrn.i16 d22, q3, #6
+ vrshrn.i16 d23, q10, #6
+ vst1.u8 {d22}, [r0, :64], r1
+ vst1.u8 {d23}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r5,pc}
+160:
+ vmov.i8 q12, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+16:
+ vld2.u8 {d28[], d29[]}, [r5, :16]!
+ vld1.u8 {d2, d3, d4, d5}, [r2, :128]!
+ vsub.i8 q15, q12, q14
+ vld1.u8 {q0}, [r0, :128]
+ subs r4, r4, #2
+ vld1.u8 {q13}, [r12, :128]
+ vmull.u8 q3, d2, d28
+ vmlal.u8 q3, d0, d30
+ vmull.u8 q8, d3, d28
+ vmlal.u8 q8, d1, d30
+ vrshrn.i16 d18, q3, #6
+ vrshrn.i16 d19, q8, #6
+ vmull.u8 q3, d4, d29
+ vmlal.u8 q3, d26, d31
+ vmull.u8 q8, d5, d29
+ vmlal.u8 q8, d27, d31
+ vrshrn.i16 d20, q3, #6
+ vrshrn.i16 d21, q8, #6
+ vst1.u8 {q9}, [r0, :128], r1
+ vst1.u8 {q10}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5,pc}
+320:
+640:
+1280:
+ vmov.i8 d20, #64
+ sub r1, r1, r3
+321:
+ vld1.u8 {d6[]}, [r5]!
+ vsub.i8 d7, d20, d6
+ mov r12, r3
+32:
+ vld1.u8 {q8, q9}, [r2, :128]!
+ vld1.u8 {q0, q1}, [r0, :128]
+ vmull.u8 q15, d16, d6
+ vmlal.u8 q15, d0, d7
+ vmull.u8 q14, d17, d6
+ vmlal.u8 q14, d1, d7
+ vrshrn.i16 d0, q15, #6
+ vrshrn.i16 d1, q14, #6
+ vmull.u8 q15, d18, d6
+ vmlal.u8 q15, d2, d7
+ vmull.u8 q14, d19, d6
+ vmlal.u8 q14, d3, d7
+ vrshrn.i16 d2, q15, #6
+ vrshrn.i16 d3, q14, #6
+ subs r12, r12, #32
+ vst1.u8 {q0, q1}, [r0, :128]!
+ bgt 32b
+ add r0, r0, r1
+ subs r4, r4, #1
+ bgt 321b
+ pop {r4-r5,pc}
+endfunc
+
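+// blend_v: same arithmetic again, but with one mask value per column (from
+// obmc_masks + w) and only the leftmost w*3/4 pixels of each row written.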
+function blend_v_8bpc_neon, export=1
+ push {r4,lr}
+ ldr r4, [sp, #8]
+ movrel lr, X(obmc_masks)
+ add lr, lr, r3
+ clz r12, r3
+ adr r3, L(blend_v_tbl)
+ sub r12, r12, #26
+ ldr r12, [r3, r12, lsl #2]
+ add r3, r3, r12
+ bx r3
+
+ .align 2
+L(blend_v_tbl):
+ .word 320f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 20f - L(blend_v_tbl) + CONFIG_THUMB
+
+20:
+ vmov.i8 d22, #64
+ vld1.8 {d2[]}, [lr]
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vsub.i8 d3, d22, d2
+2:
+ vld1.16 {d1[0]}, [r2, :16]!
+ vld1.8 {d0[]}, [r0]
+ subs r4, r4, #2
+ vld1.8 {d1[1]}, [r2]
+ vld1.8 {d0[1]}, [r12]
+ vmull.u8 q2, d1, d2
+ vmlal.u8 q2, d0, d3
+ vrshrn.i16 d6, q2, #6
+ add r2, r2, #2
+ vst1.8 {d6[0]}, [r0], r1
+ vst1.8 {d6[1]}, [r12], r1
+ bgt 2b
+ pop {r4,pc}
+40:
+ vmov.i8 d22, #64
+ vld1.32 {d4[]}, [lr, :32]
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vsub.i8 d5, d22, d4
+ sub r1, r1, #2
+4:
+ vld1.u8 {d2}, [r2, :64]!
+ vld1.32 {d0[]}, [r0, :32]
+ vld1.32 {d0[1]}, [r12, :32]
+ subs r4, r4, #2
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d5
+ vrshrn.i16 d20, q3, #6
+ vst1.16 {d20[0]}, [r0, :16]!
+ vst1.16 {d20[2]}, [r12, :16]!
+ vst1.8 {d20[2]}, [r0], r1
+ vst1.8 {d20[6]}, [r12], r1
+ bgt 4b
+ pop {r4,pc}
+80:
+ vmov.i8 d16, #64
+ vld1.u8 {d2}, [lr, :64]
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vsub.i8 d17, d16, d2
+ sub r1, r1, #4
+8:
+ vld1.u8 {d4, d5}, [r2, :128]!
+ vld1.u8 {d0}, [r0, :64]
+ vld1.u8 {d1}, [r12, :64]
+ subs r4, r4, #2
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d17
+ vmull.u8 q10, d2, d5
+ vmlal.u8 q10, d1, d17
+ vrshrn.i16 d22, q3, #6
+ vrshrn.i16 d23, q10, #6
+ vst1.32 {d22[0]}, [r0, :32]!
+ vst1.32 {d23[0]}, [r12, :32]!
+ vst1.16 {d22[2]}, [r0, :16], r1
+ vst1.16 {d23[2]}, [r12, :16], r1
+ bgt 8b
+ pop {r4,pc}
+160:
+ vmov.i8 q12, #64
+ vld1.u8 {q14}, [lr, :128]
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vsub.i8 q11, q12, q14
+ sub r1, r1, #8
+16:
+ vld1.u8 {q1, q2}, [r2, :128]!
+ vld1.u8 {q0}, [r0, :128]
+ subs r4, r4, #2
+ vld1.u8 {q13}, [r12, :128]
+ vmull.u8 q3, d2, d28
+ vmlal.u8 q3, d0, d22
+ vmull.u8 q8, d3, d29
+ vmlal.u8 q8, d1, d23
+ vrshrn.i16 d18, q3, #6
+ vrshrn.i16 d19, q8, #6
+ vmull.u8 q3, d4, d28
+ vmlal.u8 q3, d26, d22
+ vmull.u8 q8, d5, d29
+ vmlal.u8 q8, d27, d23
+ vrshrn.i16 d20, q3, #6
+ vrshrn.i16 d21, q8, #6
+ vst1.u8 {d18}, [r0, :64]!
+ vst1.u8 {d20}, [r12, :64]!
+ vst1.32 {d19[0]}, [r0, :32], r1
+ vst1.32 {d21[0]}, [r12, :32], r1
+ bgt 16b
+ pop {r4,pc}
+320:
+ vmov.i8 q10, #64
+ vld1.u8 {q2, q3}, [lr, :128]
+ vsub.i8 q11, q10, q2
+ vsub.i8 d24, d20, d6
+32:
+ vld1.u8 {q8, q9}, [r2, :128]!
+ vld1.u8 {d0, d1, d2}, [r0, :64]
+ subs r4, r4, #1
+ vmull.u8 q15, d16, d4
+ vmlal.u8 q15, d0, d22
+ vmull.u8 q14, d17, d5
+ vmlal.u8 q14, d1, d23
+ vrshrn.i16 d0, q15, #6
+ vrshrn.i16 d1, q14, #6
+ vmull.u8 q15, d18, d6
+ vmlal.u8 q15, d2, d24
+ vrshrn.i16 d2, q15, #6
+ vst1.u8 {d0, d1, d2}, [r0, :64], r1
+ bgt 32b
+ pop {r4,pc}
+endfunc
+
+
+// This has the same signature as the put_8tap functions,
+// assumes that the caller has loaded the h argument into r5,
+// and assumes that r8 is set to (clz(w)-24).
+function put_neon
+ adr r9, L(put_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(put_tbl):
+ .word 1280f - L(put_tbl) + CONFIG_THUMB
+ .word 640f - L(put_tbl) + CONFIG_THUMB
+ .word 32f - L(put_tbl) + CONFIG_THUMB
+ .word 160f - L(put_tbl) + CONFIG_THUMB
+ .word 8f - L(put_tbl) + CONFIG_THUMB
+ .word 4f - L(put_tbl) + CONFIG_THUMB
+ .word 2f - L(put_tbl) + CONFIG_THUMB
+
+2:
+ vld1.16 {d0[]}, [r2], r3
+ vld1.16 {d1[]}, [r2], r3
+ subs r5, r5, #2
+ vst1.16 {d0[0]}, [r0, :16], r1
+ vst1.16 {d1[0]}, [r0, :16], r1
+ bgt 2b
+ pop {r4-r11,pc}
+4:
+ vld1.32 {d0[]}, [r2], r3
+ vld1.32 {d1[]}, [r2], r3
+ subs r5, r5, #2
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d1[0]}, [r0, :32], r1
+ bgt 4b
+ pop {r4-r11,pc}
+8:
+ vld1.8 {d0}, [r2], r3
+ vld1.8 {d1}, [r2], r3
+ subs r5, r5, #2
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d1}, [r0, :64], r1
+ bgt 8b
+ pop {r4-r11,pc}
+160:
+ add r8, r0, r1
+ lsl r1, r1, #1
+ add r9, r2, r3
+ lsl r3, r3, #1
+16:
+ vld1.8 {q0}, [r2], r3
+ vld1.8 {q1}, [r9], r3
+ subs r5, r5, #2
+ vst1.8 {q0}, [r0, :128], r1
+ vst1.8 {q1}, [r8, :128], r1
+ bgt 16b
+ pop {r4-r11,pc}
+32:
+ vld1.8 {q0, q1}, [r2], r3
+ subs r5, r5, #1
+ vst1.8 {q0, q1}, [r0, :128], r1
+ bgt 32b
+ pop {r4-r11,pc}
+640:
+ sub r1, r1, #32
+ sub r3, r3, #32
+64:
+ vld1.8 {q0, q1}, [r2]!
+ vst1.8 {q0, q1}, [r0, :128]!
+ vld1.8 {q2, q3}, [r2], r3
+ subs r5, r5, #1
+ vst1.8 {q2, q3}, [r0, :128], r1
+ bgt 64b
+ pop {r4-r11,pc}
+1280:
+ sub r1, r1, #96
+ sub r3, r3, #96
+128:
+ vld1.8 {q8, q9}, [r2]!
+ vst1.8 {q8, q9}, [r0, :128]!
+ vld1.8 {q10, q11}, [r2]!
+ vst1.8 {q10, q11}, [r0, :128]!
+ vld1.8 {q12, q13}, [r2]!
+ vst1.8 {q12, q13}, [r0, :128]!
+ vld1.8 {q14, q15}, [r2], r3
+ subs r5, r5, #1
+ vst1.8 {q14, q15}, [r0, :128], r1
+ bgt 128b
+ pop {r4-r11,pc}
+endfunc
+
+
+// This has the same signature as the put_8tap functions,
+// assumes that the caller has loaded the h argument into r4,
+// and assumes that r8 is set to (clz(w)-24), and r7 to w*2.
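+// The output of prep is pixels widened to 16 bits and scaled by 1 << 4 (see
+// the vshll.u8 #4 below), matching what the bidir functions above expect.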
+function prep_neon
+ adr r9, L(prep_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(prep_tbl):
+ .word 1280f - L(prep_tbl) + CONFIG_THUMB
+ .word 640f - L(prep_tbl) + CONFIG_THUMB
+ .word 320f - L(prep_tbl) + CONFIG_THUMB
+ .word 160f - L(prep_tbl) + CONFIG_THUMB
+ .word 8f - L(prep_tbl) + CONFIG_THUMB
+ .word 4f - L(prep_tbl) + CONFIG_THUMB
+
+4:
+ vld1.32 {d0[]}, [r1], r2
+ vld1.32 {d2[]}, [r1], r2
+ subs r4, r4, #2
+ vshll.u8 q0, d0, #4
+ vshll.u8 q1, d2, #4
+ vst1.16 {d1, d2}, [r0, :64]!
+ bgt 4b
+ pop {r4-r11,pc}
+8:
+ vld1.8 {d0}, [r1], r2
+ vld1.8 {d2}, [r1], r2
+ subs r4, r4, #2
+ vshll.u8 q0, d0, #4
+ vshll.u8 q1, d2, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ bgt 8b
+ pop {r4-r11,pc}
+160:
+ add r9, r1, r2
+ lsl r2, r2, #1
+ add r8, r0, r7
+ lsl r7, r7, #1
+16:
+ vld1.8 {q2}, [r1], r2
+ vld1.8 {q3}, [r9], r2
+ subs r4, r4, #2
+ vshll.u8 q0, d4, #4
+ vshll.u8 q1, d5, #4
+ vshll.u8 q2, d6, #4
+ vshll.u8 q3, d7, #4
+ vst1.16 {q0, q1}, [r0, :128], r7
+ vst1.16 {q2, q3}, [r8, :128], r7
+ bgt 16b
+ pop {r4-r11,pc}
+320:
+ add r8, r0, r3
+32:
+ vld1.8 {q0, q1}, [r1], r2
+ subs r4, r4, #2
+ vshll.u8 q8, d0, #4
+ vshll.u8 q9, d1, #4
+ vld1.8 {q2, q3}, [r1], r2
+ vshll.u8 q10, d2, #4
+ vshll.u8 q11, d3, #4
+ vshll.u8 q12, d4, #4
+ vst1.16 {q8, q9}, [r0, :128], r7
+ vshll.u8 q13, d5, #4
+ vst1.16 {q10, q11}, [r8, :128], r7
+ vshll.u8 q14, d6, #4
+ vst1.16 {q12, q13}, [r0, :128], r7
+ vshll.u8 q15, d7, #4
+ vst1.16 {q14, q15}, [r8, :128], r7
+ bgt 32b
+ pop {r4-r11,pc}
+640:
+ sub r2, r2, #32
+ add r8, r0, #32
+ mov r6, #64
+64:
+ vld1.8 {q0, q1}, [r1]!
+ subs r4, r4, #1
+ vshll.u8 q8, d0, #4
+ vshll.u8 q9, d1, #4
+ vld1.8 {q2, q3}, [r1], r2
+ vshll.u8 q10, d2, #4
+ vshll.u8 q11, d3, #4
+ vshll.u8 q12, d4, #4
+ vst1.16 {q8, q9}, [r0, :128], r6
+ vshll.u8 q13, d5, #4
+ vshll.u8 q14, d6, #4
+ vst1.16 {q10, q11}, [r8, :128], r6
+ vshll.u8 q15, d7, #4
+ vst1.16 {q12, q13}, [r0, :128], r6
+ vst1.16 {q14, q15}, [r8, :128], r6
+ bgt 64b
+ pop {r4-r11,pc}
+1280:
+ sub r2, r2, #96
+ add r8, r0, #32
+ mov r6, #64
+128:
+ vld1.8 {q0, q1}, [r1]!
+ vld1.8 {q2, q3}, [r1]!
+ vshll.u8 q10, d0, #4
+ vshll.u8 q11, d1, #4
+ vshll.u8 q12, d2, #4
+ vshll.u8 q13, d3, #4
+ vshll.u8 q14, d4, #4
+ vshll.u8 q15, d5, #4
+ vld1.8 {q8, q9}, [r1]!
+ vst1.16 {q10, q11}, [r0, :128], r6
+ vst1.16 {q12, q13}, [r8, :128], r6
+ vshll.u8 q0, d6, #4
+ vshll.u8 q1, d7, #4
+ vshll.u8 q2, d16, #4
+ vshll.u8 q3, d17, #4
+ vshll.u8 q8, d18, #4
+ vshll.u8 q9, d19, #4
+ vld1.8 {q10, q11}, [r1], r2
+ vst1.16 {q14, q15}, [r0, :128], r6
+ vst1.16 {q0, q1}, [r8, :128], r6
+ vshll.u8 q12, d20, #4
+ vshll.u8 q13, d21, #4
+ vshll.u8 q14, d22, #4
+ vshll.u8 q15, d23, #4
+ subs r4, r4, #1
+ vst1.16 {q2, q3}, [r0, :128], r6
+ vst1.16 {q8, q9}, [r8, :128], r6
+ vst1.16 {q12, q13}, [r0, :128], r6
+ vst1.16 {q14, q15}, [r8, :128], r6
+ bgt 128b
+ pop {r4-r11,pc}
+endfunc
+
+
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ vld1.\wd {\d0[]}, [\s0], \strd
+ vld1.\wd {\d1[]}, [\s1], \strd
+.ifnb \d2
+ vld1.\wd {\d2[]}, [\s0], \strd
+ vld1.\wd {\d3[]}, [\s1], \strd
+.endif
+.ifnb \d4
+ vld1.\wd {\d4[]}, [\s0], \strd
+.endif
+.ifnb \d5
+ vld1.\wd {\d5[]}, [\s1], \strd
+.endif
+.ifnb \d6
+ vld1.\wd {\d6[]}, [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ vld1.8 {\d0}, [\s0], \strd
+ vld1.8 {\d1}, [\s1], \strd
+.ifnb \d2
+ vld1.8 {\d2}, [\s0], \strd
+ vld1.8 {\d3}, [\s1], \strd
+.endif
+.ifnb \d4
+ vld1.8 {\d4}, [\s0], \strd
+.endif
+.ifnb \d5
+ vld1.8 {\d5}, [\s1], \strd
+.endif
+.ifnb \d6
+ vld1.8 {\d6}, [\s0], \strd
+.endif
+.endm
+.macro load_16 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_slice \s0, \s1, \strd, 16, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_slice \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro interleave_1_16 r0, r1, r2, r3, r4
+ vext.8 \r0, \r0, \r1, #6
+ vext.8 \r1, \r1, \r2, #6
+.ifnb \r3
+ vext.8 \r2, \r2, \r3, #6
+ vext.8 \r3, \r3, \r4, #6
+.endif
+.endm
+.macro interleave_1_32 r0, r1, r2, r3, r4
+ vext.8 \r0, \r0, \r1, #4
+ vext.8 \r1, \r1, \r2, #4
+.ifnb \r3
+ vext.8 \r2, \r2, \r3, #4
+ vext.8 \r3, \r3, \r4, #4
+.endif
+.endm
+.macro vmovl_u8 q0, d0, q1, d1, q2, d2, q3, d3, q4, d4, q5, d5, q6, d6
+ vmovl.u8 \q0, \d0
+ vmovl.u8 \q1, \d1
+.ifnb \q2
+ vmovl.u8 \q2, \d2
+ vmovl.u8 \q3, \d3
+.endif
+.ifnb \q4
+ vmovl.u8 \q4, \d4
+.endif
+.ifnb \q5
+ vmovl.u8 \q5, \d5
+.endif
+.ifnb \q6
+ vmovl.u8 \q6, \d6
+.endif
+.endm
+.macro mul_mla_4 d, s0, s1, s2, s3
+ vmul.s16 \d, \s0, d0[0]
+ vmla.s16 \d, \s1, d0[1]
+ vmla.s16 \d, \s2, d0[2]
+ vmla.s16 \d, \s3, d0[3]
+.endm
+.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+ vmul.s16 \d0, \s0, d0[0]
+ vmla.s16 \d0, \s1, d0[1]
+ vmla.s16 \d0, \s2, d0[2]
+ vmla.s16 \d0, \s3, d0[3]
+ vmla.s16 \d0, \s4, d1[0]
+ vmla.s16 \d0, \s5, d1[1]
+ vmla.s16 \d0, \s6, d1[2]
+ vmla.s16 \d0, \s7, d1[3]
+.endm
+.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ vmul.s16 \d0, \s0, d0[0]
+ vmla.s16 \d0, \s1, d0[1]
+ vmla.s16 \d0, \s2, d0[2]
+ vmla.s16 \d0, \s3, d0[3]
+ vmla.s16 \d0, \s4, d1[0]
+ vmla.s16 \d0, \s5, d1[1]
+ vmla.s16 \d0, \s6, d1[2]
+ vmla.s16 \d0, \s7, d1[3]
+ vmul.s16 \d1, \s1, d0[0]
+ vmla.s16 \d1, \s2, d0[1]
+ vmla.s16 \d1, \s3, d0[2]
+ vmla.s16 \d1, \s4, d0[3]
+ vmla.s16 \d1, \s5, d1[0]
+ vmla.s16 \d1, \s6, d1[1]
+ vmla.s16 \d1, \s7, d1[2]
+ vmla.s16 \d1, \s8, d1[3]
+.endm
+.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+ vmul.s16 \d0, \s0, d0[0]
+ vmla.s16 \d0, \s1, d0[1]
+ vmla.s16 \d0, \s2, d0[2]
+ vmla.s16 \d0, \s3, d0[3]
+ vmla.s16 \d0, \s4, d1[0]
+ vmla.s16 \d0, \s5, d1[1]
+ vmla.s16 \d0, \s6, d1[2]
+ vmla.s16 \d0, \s7, d1[3]
+ vmul.s16 \d1, \s2, d0[0]
+ vmla.s16 \d1, \s3, d0[1]
+ vmla.s16 \d1, \s4, d0[2]
+ vmla.s16 \d1, \s5, d0[3]
+ vmla.s16 \d1, \s6, d1[0]
+ vmla.s16 \d1, \s7, d1[1]
+ vmla.s16 \d1, \s8, d1[2]
+ vmla.s16 \d1, \s9, d1[3]
+.endm
+.macro vqrshrun_s16 shift, q0, d0, q1, d1, q2, d2, q3, d3
+ vqrshrun.s16 \d0, \q0, #\shift
+.ifnb \q1
+ vqrshrun.s16 \d1, \q1, #\shift
+.endif
+.ifnb \q2
+ vqrshrun.s16 \d2, \q2, #\shift
+ vqrshrun.s16 \d3, \q3, #\shift
+.endif
+.endm
+.macro vrshr_s16 shift, r0, r1, r2, r3
+ vrshr.s16 \r0, \r0, #\shift
+.ifnb \r1
+ vrshr.s16 \r1, \r1, #\shift
+.endif
+.ifnb \r2
+ vrshr.s16 \r2, \r2, #\shift
+ vrshr.s16 \r3, \r3, #\shift
+.endif
+.endm
+.macro st_16 strd, reg, lanes
+ vst1.16 {\reg[0]}, [r0, :16], \strd
+ vst1.16 {\reg[1]}, [r8, :16], \strd
+.if \lanes > 2
+ vst1.16 {\reg[2]}, [r0, :16], \strd
+ vst1.16 {\reg[3]}, [r8, :16], \strd
+.endif
+.endm
+.macro st_32 strd, r0, r1
+ vst1.32 {\r0[0]}, [r0, :32], \strd
+ vst1.32 {\r0[1]}, [r8, :32], \strd
+.ifnb \r1
+ vst1.32 {\r1[0]}, [r0, :32], \strd
+ vst1.32 {\r1[1]}, [r8, :32], \strd
+.endif
+.endm
+.macro st_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7
+ vst1.8 {\r0}, [r0, \align], \strd
+ vst1.8 {\r1}, [r8, \align], \strd
+.ifnb \r2
+ vst1.8 {\r2}, [r0, \align], \strd
+ vst1.8 {\r3}, [r8, \align], \strd
+.endif
+.ifnb \r4
+ vst1.8 {\r4}, [r0, \align], \strd
+ vst1.8 {\r5}, [r8, \align], \strd
+ vst1.8 {\r6}, [r0, \align], \strd
+ vst1.8 {\r7}, [r8, \align], \strd
+.endif
+.endm
+.macro shift_store_4 type, strd, q0, d0, d1, q1, d2, d3
+.ifc \type, put
+ vqrshrun_s16 6, \q0, \d0, \q1, \d2
+ st_32 \strd, \d0, \d2
+.else
+ vrshr_s16 2, \q0, \q1
+ st_reg \strd, :64, \d0, \d1, \d2, \d3
+.endif
+.endm
+.macro shift_store_8 type, strd, q0, d0, q1, d1, q2, d2, q3, d3
+.ifc \type, put
+ vqrshrun_s16 6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
+ st_reg \strd, :64, \d0, \d1, \d2, \d3
+.else
+ vrshr_s16 2, \q0, \q1, \q2, \q3
+ st_reg \strd, :128,\q0, \q1, \q2, \q3
+.endif
+.endm
+.macro shift_store_16 type, strd, q0, d0, d1, q1, q2, d4, d5, q3
+.ifc \type, put
+ vqrshrun.s16 \d0, \q0, #6
+ vqrshrun.s16 \d1, \q1, #6
+ vqrshrun.s16 \d4, \q2, #6
+ vqrshrun.s16 \d5, \q3, #6
+ st_reg \strd, :128, \q0, \q2
+.else
+ vrshr_s16 2, \q0, \q1, \q2, \q3
+ vst1.16 {\q0, \q1}, [r0, :128], \strd
+ vst1.16 {\q2, \q3}, [r8, :128], \strd
+.endif
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+ movw r8, \type_h
+ movw r9, \type_v
+ b \op\()_8tap_neon
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH ((1*15<<7)|4*15)
+#define SHARP ((2*15<<7)|3*15)
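+// Each define packs two filter-set indices, scaled by 15 (the number of
+// subpel entries per filter in mc_subpel_filters): bits 7-13 hold the filter
+// used for w > 4 and bits 0-6 the one used for w <= 4. Multiplying mx/my by
+// 0x4081 below replicates the subpel position into the same fields (and into
+// bit 14 and up, which is what the tst #(0x7f << 14) checks) before the
+// packed type is added.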
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, ds2, sr2, shift_hv
+make_8tap_fn \type, regular, REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp, REGULAR, SHARP
+make_8tap_fn \type, smooth, SMOOTH, SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
+make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
+make_8tap_fn \type, sharp, SHARP, SHARP
+make_8tap_fn \type, sharp_regular, SHARP, REGULAR
+make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
+
+function \type\()_8tap_neon
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+ movw r10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
+ mul \mx, \mx, r10
+ mul \my, \my, r10
+ add \mx, \mx, r8 // mx, 8tap_h, 4tap_h
+ add \my, \my, r9 // my, 8tap_v, 4tap_v
+.ifc \type, prep
+ lsl \d_strd, \w, #1
+.endif
+
+ clz r8, \w
+ tst \mx, #(0x7f << 14)
+ sub r8, r8, #24
+ movrel r10, X(mc_subpel_filters), -8
+ bne L(\type\()_8tap_h)
+ tst \my, #(0x7f << 14)
+ bne L(\type\()_8tap_v)
+ b \type\()_neon
+
+L(\type\()_8tap_h):
+ cmp \w, #4
+ ubfx r9, \mx, #7, #7
+ and \mx, \mx, #0x7f
+ it gt
+ movgt \mx, r9
+ tst \my, #(0x7f << 14)
+ add \mx, r10, \mx, lsl #3
+ bne L(\type\()_8tap_hv)
+
+ adr r9, L(\type\()_8tap_h_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_8tap_h_tbl):
+ .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+
+20: // 2xN h
+.ifc \type, put
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ sub \src, \src, #1
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+2:
+ vld1.8 {d4}, [\src], \s_strd
+ vld1.8 {d6}, [\sr2], \s_strd
+ vmovl.u8 q2, d4
+ vmovl.u8 q3, d6
+ vext.8 d5, d4, d5, #2
+ vext.8 d7, d6, d7, #2
+ subs \h, \h, #2
+ vtrn.32 d4, d6
+ vtrn.32 d5, d7
+ vmul.s16 d2, d4, d0[0]
+ vmla.s16 d2, d5, d0[1]
+ vmla.s16 d2, d6, d0[2]
+ vmla.s16 d2, d7, d0[3]
+ vrshr.s16 d2, d2, #2
+ vqrshrun.s16 d2, q1, #4
+ vst1.16 {d2[0]}, [\dst, :16], \d_strd
+ vst1.16 {d2[1]}, [\ds2, :16], \d_strd
+ bgt 2b
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN h
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ sub \src, \src, #1
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+4:
+ vld1.8 {d16}, [\src], \s_strd
+ vld1.8 {d24}, [\sr2], \s_strd
+ vmovl.u8 q8, d16
+ vmovl.u8 q12, d24
+ vext.8 d18, d16, d17, #2
+ vext.8 d20, d16, d17, #4
+ vext.8 d22, d16, d17, #6
+ vext.8 d26, d24, d25, #2
+ vext.8 d28, d24, d25, #4
+ vext.8 d30, d24, d25, #6
+ subs \h, \h, #2
+ vmul.s16 d4, d16, d0[0]
+ vmla.s16 d4, d18, d0[1]
+ vmla.s16 d4, d20, d0[2]
+ vmla.s16 d4, d22, d0[3]
+ vmul.s16 d5, d24, d0[0]
+ vmla.s16 d5, d26, d0[1]
+ vmla.s16 d5, d28, d0[2]
+ vmla.s16 d5, d30, d0[3]
+ vrshr.s16 q2, q2, #2
+.ifc \type, put
+ vqrshrun.s16 d4, q2, #4
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d4[1]}, [\ds2, :32], \d_strd
+.else
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+.endif
+ bgt 4b
+ pop {r4-r11,pc}
+
+80: // 8xN h
+ vld1.8 {d0}, [\mx, :64]
+ sub \src, \src, #3
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+8:
+ vld1.8 {q8}, [\src], \s_strd
+ vld1.8 {q12}, [\sr2], \s_strd
+ vmovl.u8 q9, d17
+ vmovl.u8 q8, d16
+ vmovl.u8 q13, d25
+ vmovl.u8 q12, d24
+
+ vmul.s16 q10, q8, d0[0]
+ vmul.s16 q14, q12, d0[0]
+.irpc i, 1234567
+ vext.8 q11, q8, q9, #(2*\i)
+ vext.8 q15, q12, q13, #(2*\i)
+.if \i < 4
+ vmla.s16 q10, q11, d0[\i]
+ vmla.s16 q14, q15, d0[\i]
+.else
+ vmla.s16 q10, q11, d1[\i-4]
+ vmla.s16 q14, q15, d1[\i-4]
+.endif
+.endr
+ subs \h, \h, #2
+ vrshr.s16 q10, q10, #2
+ vrshr.s16 q14, q14, #2
+.ifc \type, put
+ vqrshrun.s16 d20, q10, #4
+ vqrshrun.s16 d28, q14, #4
+ vst1.8 {d20}, [\dst, :64], \d_strd
+ vst1.8 {d28}, [\ds2, :64], \d_strd
+.else
+ vst1.16 {q10}, [\dst, :128], \d_strd
+ vst1.16 {q14}, [\ds2, :128], \d_strd
+.endif
+ bgt 8b
+ pop {r4-r11,pc}
+
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ // This could be done without touching q4-q6, by using only
+ // one temporary for vext in the loop. That's slower on A7 and A53
+ // (but, surprisingly, marginally faster on A8 and A73).
+ vpush {q4-q6}
+ vld1.8 {d0}, [\mx, :64]
+ sub \src, \src, #3
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+
+ sub \s_strd, \s_strd, \w
+ sub \s_strd, \s_strd, #8
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w
+.endif
+161:
+ vld1.8 {d16, d17, d18}, [\src]!
+ vld1.8 {d24, d25, d26}, [\sr2]!
+ mov \mx, \w
+ vmovl.u8 q10, d18
+ vmovl.u8 q9, d17
+ vmovl.u8 q8, d16
+ vmovl.u8 q14, d26
+ vmovl.u8 q13, d25
+ vmovl.u8 q12, d24
+
+16:
+ vmul.s16 q1, q8, d0[0]
+ vmul.s16 q2, q9, d0[0]
+ vmul.s16 q3, q12, d0[0]
+ vmul.s16 q4, q13, d0[0]
+.irpc i, 1234567
+ vext.8 q5, q8, q9, #(2*\i)
+ vext.8 q6, q9, q10, #(2*\i)
+ vext.8 q11, q12, q13, #(2*\i)
+ vext.8 q15, q13, q14, #(2*\i)
+.if \i < 4
+ vmla.s16 q1, q5, d0[\i]
+ vmla.s16 q2, q6, d0[\i]
+ vmla.s16 q3, q11, d0[\i]
+ vmla.s16 q4, q15, d0[\i]
+.else
+ vmla.s16 q1, q5, d1[\i-4]
+ vmla.s16 q2, q6, d1[\i-4]
+ vmla.s16 q3, q11, d1[\i-4]
+ vmla.s16 q4, q15, d1[\i-4]
+.endif
+.endr
+ vrshr.s16 q1, q1, #2
+ vrshr.s16 q2, q2, #2
+ vrshr.s16 q3, q3, #2
+ vrshr.s16 q4, q4, #2
+ subs \mx, \mx, #16
+.ifc \type, put
+ vqrshrun.s16 d2, q1, #4
+ vqrshrun.s16 d3, q2, #4
+ vqrshrun.s16 d4, q3, #4
+ vqrshrun.s16 d5, q4, #4
+ vst1.8 {q1}, [\dst, :128]!
+ vst1.8 {q2}, [\ds2, :128]!
+.else
+ vst1.16 {q1, q2}, [\dst, :128]!
+ vst1.16 {q3, q4}, [\ds2, :128]!
+.endif
+ ble 9f
+
+ vmov q8, q10
+ vmov q12, q14
+ vld1.8 {d18, d19}, [\src]!
+ vld1.8 {d26, d27}, [\sr2]!
+ vmovl.u8 q10, d19
+ vmovl.u8 q9, d18
+ vmovl.u8 q14, d27
+ vmovl.u8 q13, d26
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ bgt 161b
+ vpop {q4-q6}
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_v):
+ cmp \h, #4
+ ubfx r9, \my, #7, #7
+ and \my, \my, #0x7f
+ it gt
+ movgt \my, r9
+ add \my, r10, \my, lsl #3
+
+ adr r9, L(\type\()_8tap_v_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_8tap_v_tbl):
+ .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+
+20: // 2xN v
+.ifc \type, put
+ bgt 28f
+
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ // 2x2 v
+ load_16 \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ interleave_1_16 d1, d2, d3, d4, d5
+ bgt 24f
+ vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4
+ mul_mla_4 d6, d16, d18, d20, d22
+ vqrshrun_s16 6, q3, d6
+ st_16 \d_strd, d6, 2
+ pop {r4-r11,pc}
+
+24: // 2x4 v
+ load_16 \sr2, \src, \s_strd, d6, d7
+ interleave_1_16 d5, d6, d7
+ vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5, q13, d6
+ vmov d17, d20
+ vmov d19, d22
+ vmov d21, d24
+ vmov d23, d26
+ mul_mla_4 q3, q8, q9, q10, q11
+ vqrshrun_s16 6, q3, d6
+ st_16 \d_strd, d6, 4
+ pop {r4-r11,pc}
+
+28: // 2x6, 2x8, 2x12, 2x16 v
+ vpush {q4-q7}
+ vld1.8 {d0}, [\my, :64]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+
+ load_16 \src, \sr2, \s_strd, d2, d4, d6, d8, d10, d12, d14
+ interleave_1_16 d2, d4, d6, d8, d10
+ interleave_1_16 d10, d12, d14
+ vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q5, d10, q6, d12
+ vmov d3, d6
+ vmov d5, d8
+ vmov d7, d10
+ vmov d9, d12
+216:
+ subs \h, \h, #4
+ load_16 \sr2, \src, \s_strd, d16, d18, d20, d22
+ interleave_1_16 d14, d16, d18, d20, d22
+ vmovl_u8 q7, d14, q8, d16, q9, d18, q10, d20
+ vmov d11, d14
+ vmov d13, d16
+ vmov d15, d18
+ vmov d17, d20
+ mul_mla_8_0 q1, q1, q2, q3, q4, q5, q6, q7, q8
+ vqrshrun_s16 6, q1, d2
+ st_16 \d_strd, d2, 4
+ ble 0f
+ cmp \h, #2
+ vmov q1, q5
+ vmov q2, q6
+ vmov q3, q7
+ vmov q4, q8
+ vmov q5, q9
+ vmov q6, q10
+ vmov d14, d22
+ beq 26f
+ b 216b
+26:
+ load_16 \sr2, \src, \s_strd, d16, d18
+ interleave_1_16 d14, d16, d18
+ vmovl_u8 q7, d14, q8, d16
+ vmov d11, d14
+ vmov d13, d16
+ mul_mla_8_0 d2, d2, d4, d6, d8, d10, d12, d14, d16
+ vqrshrun_s16 6, q1, d2
+ st_16 \d_strd, d2, 2
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+.endif
+
+40:
+ bgt 480f
+
+ // 4x2, 4x4 v
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ interleave_1_32 d1, d2, d3, d4, d5
+ vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4
+ mul_mla_4 q3, q8, q9, q10, q11
+ shift_store_4 \type, \d_strd, q3, d6, d7
+ ble 0f
+ load_32 \sr2, \src, \s_strd, d6, d7
+ interleave_1_32 d5, d6, d7
+ vmovl_u8 q12, d5, q13, d6
+ mul_mla_4 q3, q10, q11, q12, q13
+ shift_store_4 \type, \d_strd, q3, d6, d7
+0:
+ pop {r4-r11,pc}
+
+480: // 4x6, 4x8, 4x12, 4x16 v
+ vpush {q4}
+ vld1.8 {d0}, [\my, :64]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_32 \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20
+ interleave_1_32 d2, d4, d6
+ interleave_1_32 d6, d8, d16, d18, d20
+ vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18
+
+48:
+ subs \h, \h, #4
+ load_32 \sr2, \src, \s_strd, d22, d24, d26, d28
+ interleave_1_32 d20, d22, d24, d26, d28
+ vmovl_u8 q10, d20, q11, d22, q12, d24, q13, d26
+ mul_mla_8_2 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12, q13
+ shift_store_4 \type, \d_strd, q1, d2, d3, q2, d4, d5
+ ble 0f
+ load_32 \sr2, \src, \s_strd, d30, d2
+ subs \h, \h, #2
+ interleave_1_32 d28, d30, d2
+ vmovl_u8 q14, d28, q15, d30
+ mul_mla_8_0 q8, q8, q9, q10, q11, q12, q13, q14, q15
+ shift_store_4 \type, \d_strd, q8, d16, d17
+ ble 0f
+ load_32 \sr2, \src, \s_strd, d4, d6
+ subs \h, \h, #2
+ interleave_1_32 d2, d4, d6
+ vmovl_u8 q1, d2, q2, d4
+ mul_mla_8_0 q9, q10, q11, q12, q13, q14, q15, q1, q2
+ shift_store_4 \type, \d_strd, q9, d18, d19
+ ble 0f
+ subs \h, \h, #4
+ load_32 \sr2, \src, \s_strd, d8, d16, d18, d20
+ interleave_1_32 d6, d8, d16, d18, d20
+ vmovl_u8 q3, d6, q4, d8, q8, d16, q9, d18
+ mul_mla_8_2 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8, q9
+ shift_store_4 \type, \d_strd, q12, d24, d25, q13, d26, d27
+ bgt 48b
+0:
+ vpop {q4}
+ pop {r4-r11,pc}
+
+80:
+ bgt 880f
+
+ // 8x2, 8x4 v
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5
+ mul_mla_4 q1, q8, q9, q10, q11
+ mul_mla_4 q2, q9, q10, q11, q12
+ shift_store_8 \type, \d_strd, q1, d2, q2, d4
+ ble 0f
+ load_reg \sr2, \src, \s_strd, d6, d7
+ vmovl_u8 q13, d6, q14, d7
+ mul_mla_4 q1, q10, q11, q12, q13
+ mul_mla_4 q2, q11, q12, q13, q14
+ shift_store_8 \type, \d_strd, q1, d2, q2, d4
+0:
+ pop {r4-r11,pc}
+
+880: // 8x6, 8x8, 8x16, 8x32 v
+1680: // 16x8, 16x16, ...
+320: // 32x8, 32x16, ...
+640:
+1280:
+ vpush {q4}
+ vld1.8 {d0}, [\my, :64]
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ vmovl.s8 q0, d0
+ mov \my, \h
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ load_reg \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20
+ vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18, q10, d20
+
+88:
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, d22, d24
+ vmovl_u8 q11, d22, q12, d24
+ mul_mla_8_1 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12
+ shift_store_8 \type, \d_strd, q1, d2, q2, d4
+ ble 9f
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, d26, d28
+ vmovl_u8 q13, d26, q14, d28
+ mul_mla_8_1 q3, q4, q3, q4, q8, q9, q10, q11, q12, q13, q14
+ shift_store_8 \type, \d_strd, q3, d6, q4, d8
+ ble 9f
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, d30, d2
+ vmovl_u8 q15, d30, q1, d2
+ mul_mla_8_1 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1
+ shift_store_8 \type, \d_strd, q8, d16, q9, d18
+ ble 9f
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, d4, d6
+ vmovl_u8 q2, d4, q3, d6
+ mul_mla_8_1 q10, q11, q10, q11, q12, q13, q14, q15, q1, q2, q3
+ shift_store_8 \type, \d_strd, q10, d20, q11, d22
+ ble 9f
+ subs \h, \h, #4
+ load_reg \sr2, \src, \s_strd, d8, d16, d18, d20
+ vmovl_u8 q4, d8, q8, d16, q9, d18, q10, d20
+ mul_mla_8_1 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8
+ mul_mla_8_1 q14, q15, q14, q15, q1, q2, q3, q4, q8, q9, q10
+ shift_store_8 \type, \d_strd, q12, d24, q13, d26, q14, d28, q15, d30
+ bgt 88b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 168b
+0:
+ vpop {q4}
+ pop {r4-r11,pc}
+
+160:
+ bgt 1680b
+
+ // 16x2, 16x4 v
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ cmp \h, #2
+ load_reg \src, \sr2, \s_strd, q11, q12, q13, q14, q15
+ vmovl.u8 q1, d22
+ vmovl.u8 q2, d24
+ vmovl.u8 q3, d26
+ vmovl.u8 q8, d28
+ vmovl.u8 q9, d30
+ vmovl.u8 q11, d23
+ vmovl.u8 q12, d25
+ vmovl.u8 q13, d27
+ vmovl.u8 q14, d29
+ vmovl.u8 q15, d31
+ mul_mla_4 q1, q1, q2, q3, q8
+ mul_mla_4 q10, q2, q3, q8, q9
+ mul_mla_4 q2, q11, q12, q13, q14
+ mul_mla_4 q11, q12, q13, q14, q15
+ shift_store_16 \type, \d_strd, q1, d2, d3, q2, q10, d20, d21, q11
+ ble 0f
+ load_reg \sr2, \src, \s_strd, q10, q11
+ vmovl.u8 q1, d20
+ vmovl.u8 q10, d21
+ vmovl.u8 q12, d22
+ vmovl.u8 q11, d23
+ mul_mla_4 q2, q3, q8, q9, q1
+ mul_mla_4 q3, q13, q14, q15, q10
+ mul_mla_4 q13, q8, q9, q1, q12
+ mul_mla_4 q14, q14, q15, q10, q11
+ shift_store_16 \type, \d_strd, q2, d4, d5, q3, q13, d26, d27, q14
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_hv):
+ cmp \h, #4
+ ubfx r9, \my, #7, #7
+ and \my, \my, #0x7f
+ it gt
+ movgt \my, r9
+ add \my, r10, \my, lsl #3
+
+ adr r9, L(\type\()_8tap_hv_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_8tap_hv_tbl):
+ .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+
+20:
+.ifc \type, put
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ bgt 280f
+ add \my, \my, #2
+ vld1.32 {d2[]}, [\my]
+
+ // 2x2, 2x4 hv
+ sub \sr2, \src, #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+
+ vld1.8 {d26}, [\src], \s_strd
+ vmovl.u8 q13, d26
+ vext.8 q14, q13, q13, #2
+ vmul.s16 d26, d26, d0
+ vmul.s16 d28, d28, d0
+ vpadd.s16 d26, d26, d28
+ vpadd.s16 d26, d26, d26
+ vrshr.s16 d16, d26, #2
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d16, d16, d16, #4
+ vmov d17, d26
+ vext.8 d16, d16, d26, #4
+
+2:
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d18, d17, d26, #4
+ vmull.s16 q2, d16, d2[0]
+ vmlal.s16 q2, d17, d2[1]
+ vmlal.s16 q2, d18, d2[2]
+ vmlal.s16 q2, d26, d2[3]
+
+ vqrshrn.s32 d4, q2, #\shift_hv
+ vqmovun.s16 d4, q2
+ subs \h, \h, #2
+ vst1.16 {d4[0]}, [\dst, :16], \d_strd
+ vst1.16 {d4[1]}, [\ds2, :16], \d_strd
+ ble 0f
+ vmov d16, d18
+ vmov d17, d26
+ b 2b
+
+280: // 2x8, 2x16, 2x32 hv
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #1
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.8 {d26}, [\src], \s_strd
+ vmovl.u8 q13, d26
+ vext.8 q14, q13, q13, #2
+ vmul.s16 d26, d26, d0
+ vmul.s16 d28, d28, d0
+ vpadd.s16 d26, d26, d28
+ vpadd.s16 d26, d26, d26
+ vrshr.s16 d16, d26, #2
+
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d16, d16, d16, #4
+ vmov d17, d26
+ vext.8 d16, d16, d26, #4
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d18, d17, d26, #4
+ vmov d19, d26
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d20, d19, d26, #4
+ vmov d21, d26
+
+28:
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d22, d21, d26, #4
+ vmull.s16 q2, d16, d2[0]
+ vmlal.s16 q2, d17, d2[1]
+ vmlal.s16 q2, d18, d2[2]
+ vmlal.s16 q2, d19, d2[3]
+ vmlal.s16 q2, d20, d3[0]
+ vmlal.s16 q2, d21, d3[1]
+ vmlal.s16 q2, d22, d3[2]
+ vmlal.s16 q2, d26, d3[3]
+
+ vqrshrn.s32 d4, q2, #\shift_hv
+ vqmovun.s16 d4, q2
+ subs \h, \h, #2
+ vst1.16 {d4[0]}, [\dst, :16], \d_strd
+ vst1.16 {d4[1]}, [\ds2, :16], \d_strd
+ ble 0f
+ vmov d16, d18
+ vmov d17, d19
+ vmov d18, d20
+ vmov d19, d21
+ vmov d20, d22
+ vmov d21, d26
+ b 28b
+
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_filter_2):
+ vld1.8 {d28}, [\sr2], \s_strd
+ vld1.8 {d30}, [\src], \s_strd
+ vext.8 d29, d28, d28, #1
+ vext.8 d31, d30, d30, #1
+ vmovl.u8 q13, d28
+ vmovl.u8 q14, d29
+ vmov d27, d28
+ vmovl.u8 q14, d30
+ vmovl.u8 q15, d31
+ vtrn.32 d26, d28
+ vtrn.32 d27, d30
+ vmul.s16 d26, d26, d0[0]
+ vmla.s16 d26, d27, d0[1]
+ vmla.s16 d26, d28, d0[2]
+ vmla.s16 d26, d30, d0[3]
+ vrshr.s16 d26, d26, #2
+ vext.8 d27, d26, d26, #4
+ bx lr
+.endif
+
+40:
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ bgt 480f
+ add \my, \my, #2
+ vld1.32 {d2[]}, [\my]
+ sub \sr2, \src, #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ // 4x2, 4x4 hv
+ vld1.8 {d30}, [\src], \s_strd
+ vmovl.u8 q14, d30
+ vext.8 d27, d28, d29, #2
+ vext.8 d30, d28, d29, #4
+ vext.8 d31, d28, d29, #6
+ vmul.s16 d26, d28, d0[0]
+ vmla.s16 d26, d27, d0[1]
+ vmla.s16 d26, d30, d0[2]
+ vmla.s16 d26, d31, d0[3]
+ vrshr.s16 d16, d26, #2
+
+ bl L(\type\()_8tap_filter_4)
+ vmov d17, d26
+ vmov d18, d27
+
+4:
+ bl L(\type\()_8tap_filter_4)
+ vmull.s16 q2, d16, d2[0]
+ vmlal.s16 q2, d17, d2[1]
+ vmlal.s16 q2, d18, d2[2]
+ vmlal.s16 q2, d26, d2[3]
+ vmull.s16 q3, d17, d2[0]
+ vmlal.s16 q3, d18, d2[1]
+ vmlal.s16 q3, d26, d2[2]
+ vmlal.s16 q3, d27, d2[3]
+ vqrshrn.s32 d4, q2, #\shift_hv
+ vqrshrn.s32 d6, q3, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d6, q3
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d6[0]}, [\ds2, :32], \d_strd
+.else
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d6}, [\ds2, :64], \d_strd
+.endif
+ ble 0f
+ vmov d16, d18
+ vmov d17, d26
+ vmov d18, d27
+ b 4b
+
+480: // 4x8, 4x16, 4x32 hv
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #1
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.8 {d30}, [\src], \s_strd
+ vmovl.u8 q14, d30
+ vext.8 d27, d28, d29, #2
+ vext.8 d30, d28, d29, #4
+ vext.8 d31, d28, d29, #6
+ vmul.s16 d26, d28, d0[0]
+ vmla.s16 d26, d27, d0[1]
+ vmla.s16 d26, d30, d0[2]
+ vmla.s16 d26, d31, d0[3]
+ vrshr.s16 d16, d26, #2
+
+ bl L(\type\()_8tap_filter_4)
+ vmov d17, d26
+ vmov d18, d27
+ bl L(\type\()_8tap_filter_4)
+ vmov d19, d26
+ vmov d20, d27
+ bl L(\type\()_8tap_filter_4)
+ vmov d21, d26
+ vmov d22, d27
+
+48:
+ bl L(\type\()_8tap_filter_4)
+ vmull.s16 q2, d16, d2[0]
+ vmlal.s16 q2, d17, d2[1]
+ vmlal.s16 q2, d18, d2[2]
+ vmlal.s16 q2, d19, d2[3]
+ vmlal.s16 q2, d20, d3[0]
+ vmlal.s16 q2, d21, d3[1]
+ vmlal.s16 q2, d22, d3[2]
+ vmlal.s16 q2, d26, d3[3]
+ vmull.s16 q3, d17, d2[0]
+ vmlal.s16 q3, d18, d2[1]
+ vmlal.s16 q3, d19, d2[2]
+ vmlal.s16 q3, d20, d2[3]
+ vmlal.s16 q3, d21, d3[0]
+ vmlal.s16 q3, d22, d3[1]
+ vmlal.s16 q3, d26, d3[2]
+ vmlal.s16 q3, d27, d3[3]
+ vqrshrn.s32 d4, q2, #\shift_hv
+ vqrshrn.s32 d6, q3, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d6, q3
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d6[0]}, [\ds2, :32], \d_strd
+.else
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d6}, [\ds2, :64], \d_strd
+.endif
+ ble 0f
+ vmov d16, d18
+ vmov d17, d19
+ vmov d18, d20
+ vmov d19, d21
+ vmov d20, d22
+ vmov d21, d26
+ vmov d22, d27
+ b 48b
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_filter_4):
+ vld1.8 {d30}, [\sr2], \s_strd
+ vld1.8 {d31}, [\src], \s_strd
+ vmovl.u8 q14, d30
+ vext.8 d27, d28, d29, #2
+ vext.8 d30, d28, d29, #4
+ vext.8 d1, d28, d29, #6
+ vmul.s16 d26, d28, d0[0]
+ vmla.s16 d26, d27, d0[1]
+ vmla.s16 d26, d30, d0[2]
+ vmla.s16 d26, d1, d0[3]
+
+ vmovl.u8 q14, d31
+ vext.8 d30, d28, d29, #2
+ vext.8 d31, d28, d29, #4
+ vext.8 d1, d28, d29, #6
+ vmul.s16 d27, d28, d0[0]
+ vmla.s16 d27, d30, d0[1]
+ vmla.s16 d27, d31, d0[2]
+ vmla.s16 d27, d1, d0[3]
+ vrshr.s16 d26, d26, #2
+ vrshr.s16 d27, d27, #2
+ bx lr
+
+80:
+160:
+320:
+ bgt 880f
+ vpush {q4-q7}
+ add \my, \my, #2
+ vld1.8 {d0}, [\mx, :64]
+ vld1.32 {d2[]}, [\my]
+ sub \src, \src, #3
+ sub \src, \src, \s_strd
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+ mov \my, \h
+
+164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ vld1.8 {q14}, [\src], \s_strd
+ vmovl.u8 q12, d28
+ vmovl.u8 q13, d29
+ vmul.s16 q10, q12, d0[0]
+.irpc i, 123
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q10, q14, d0[\i]
+.endr
+.irpc i, 4567
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q10, q14, d1[\i-4]
+.endr
+ vrshr.s16 q3, q10, #2
+
+ bl L(\type\()_8tap_filter_8)
+ vmov q4, q10
+ vmov q5, q11
+
+8:
+ bl L(\type\()_8tap_filter_8)
+ vmull.s16 q12, d6, d2[0]
+ vmull.s16 q13, d7, d2[0]
+ vmull.s16 q14, d8, d2[0]
+ vmull.s16 q15, d9, d2[0]
+ vmlal.s16 q12, d8, d2[1]
+ vmlal.s16 q13, d9, d2[1]
+ vmlal.s16 q14, d10, d2[1]
+ vmlal.s16 q15, d11, d2[1]
+ vmlal.s16 q12, d10, d2[2]
+ vmlal.s16 q13, d11, d2[2]
+ vmlal.s16 q14, d20, d2[2]
+ vmlal.s16 q15, d21, d2[2]
+ vmlal.s16 q12, d20, d2[3]
+ vmlal.s16 q13, d21, d2[3]
+ vmlal.s16 q14, d22, d2[3]
+ vmlal.s16 q15, d23, d2[3]
+ vqrshrn.s32 d24, q12, #\shift_hv
+ vqrshrn.s32 d25, q13, #\shift_hv
+ vqrshrn.s32 d28, q14, #\shift_hv
+ vqrshrn.s32 d29, q15, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d28, q14
+ vst1.8 {d24}, [\dst, :64], \d_strd
+ vst1.8 {d28}, [\ds2, :64], \d_strd
+.else
+ vst1.16 {q12}, [\dst, :128], \d_strd
+ vst1.16 {q14}, [\ds2, :128], \d_strd
+.endif
+ ble 9f
+ vmov q3, q5
+ vmov q4, q10
+ vmov q5, q11
+ b 8b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #2
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 164b
+
+880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+ vpush {q4-q7}
+ vld1.8 {d0}, [\mx, :64]
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #3
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+ mov \my, \h
+
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ vld1.8 {q14}, [\src], \s_strd
+ vmovl.u8 q12, d28
+ vmovl.u8 q13, d29
+ vmul.s16 q10, q12, d0[0]
+.irpc i, 123
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q10, q14, d0[\i]
+.endr
+.irpc i, 4567
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q10, q14, d1[\i-4]
+.endr
+ vrshr.s16 q3, q10, #2
+
+ bl L(\type\()_8tap_filter_8)
+ vmov q4, q10
+ vmov q5, q11
+ bl L(\type\()_8tap_filter_8)
+ vmov q6, q10
+ vmov q7, q11
+ bl L(\type\()_8tap_filter_8)
+ vmov q8, q10
+ vmov q9, q11
+
+88:
+ bl L(\type\()_8tap_filter_8)
+ vmull.s16 q12, d6, d2[0]
+ vmull.s16 q13, d7, d2[0]
+ vmull.s16 q14, d8, d2[0]
+ vmull.s16 q15, d9, d2[0]
+ vmlal.s16 q12, d8, d2[1]
+ vmlal.s16 q13, d9, d2[1]
+ vmlal.s16 q14, d10, d2[1]
+ vmlal.s16 q15, d11, d2[1]
+ vmlal.s16 q12, d10, d2[2]
+ vmlal.s16 q13, d11, d2[2]
+ vmlal.s16 q14, d12, d2[2]
+ vmlal.s16 q15, d13, d2[2]
+ vmlal.s16 q12, d12, d2[3]
+ vmlal.s16 q13, d13, d2[3]
+ vmlal.s16 q14, d14, d2[3]
+ vmlal.s16 q15, d15, d2[3]
+ vmlal.s16 q12, d14, d3[0]
+ vmlal.s16 q13, d15, d3[0]
+ vmlal.s16 q14, d16, d3[0]
+ vmlal.s16 q15, d17, d3[0]
+ vmlal.s16 q12, d16, d3[1]
+ vmlal.s16 q13, d17, d3[1]
+ vmlal.s16 q14, d18, d3[1]
+ vmlal.s16 q15, d19, d3[1]
+ vmlal.s16 q12, d18, d3[2]
+ vmlal.s16 q13, d19, d3[2]
+ vmlal.s16 q14, d20, d3[2]
+ vmlal.s16 q15, d21, d3[2]
+ vmlal.s16 q12, d20, d3[3]
+ vmlal.s16 q13, d21, d3[3]
+ vmlal.s16 q14, d22, d3[3]
+ vmlal.s16 q15, d23, d3[3]
+ vqrshrn.s32 d24, q12, #\shift_hv
+ vqrshrn.s32 d25, q13, #\shift_hv
+ vqrshrn.s32 d28, q14, #\shift_hv
+ vqrshrn.s32 d29, q15, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d28, q14
+ vst1.8 {d24}, [\dst, :64], \d_strd
+ vst1.8 {d28}, [\ds2, :64], \d_strd
+.else
+ vst1.16 {q12}, [\dst, :128], \d_strd
+ vst1.16 {q14}, [\ds2, :128], \d_strd
+.endif
+ ble 9f
+ vmov q3, q5
+ vmov q4, q6
+ vmov q5, q7
+ vmov q6, q8
+ vmov q7, q9
+ vmov q8, q10
+ vmov q9, q11
+ b 88b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 168b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_filter_8):
+ vld1.8 {q14}, [\sr2], \s_strd
+ vld1.8 {q15}, [\src], \s_strd
+ vmovl.u8 q12, d28
+ vmovl.u8 q13, d29
+ vmul.s16 q10, q12, d0[0]
+.irpc i, 123
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q10, q14, d0[\i]
+.endr
+.irpc i, 4567
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q10, q14, d1[\i-4]
+.endr
+ vmovl.u8 q12, d30
+ vmovl.u8 q13, d31
+ vmul.s16 q11, q12, d0[0]
+.irpc i, 123
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q11, q14, d0[\i]
+.endr
+.irpc i, 4567
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q11, q14, d1[\i-4]
+.endr
+ vrshr.s16 q10, q10, #2
+ vrshr.s16 q11, q11, #2
+ bx lr
+endfunc
+
+
+function \type\()_bilin_8bpc_neon, export=1
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+ vdup.8 d1, \mx
+ vdup.8 d3, \my
+ rsb r8, \mx, #16
+ rsb r9, \my, #16
+ vdup.8 d0, r8
+ vdup.8 d2, r9
+.ifc \type, prep
+ lsl \d_strd, \w, #1
+.endif
+ clz r8, \w
+ cmp \mx, #0
+ sub r8, r8, #24
+ bne L(\type\()_bilin_h)
+ cmp \my, #0
+ bne L(\type\()_bilin_v)
+ b \type\()_neon
+
+L(\type\()_bilin_h):
+ cmp \my, #0
+ bne L(\type\()_bilin_hv)
+
+ adr r9, L(\type\()_bilin_h_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_bilin_h_tbl):
+ .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+
+20: // 2xN h
+.ifc \type, put
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+2:
+ vld1.32 {d4[]}, [\src], \s_strd
+ vld1.32 {d6[]}, [\sr2], \s_strd
+ vext.8 d5, d4, d4, #1
+ vext.8 d7, d6, d6, #1
+ vtrn.16 q2, q3
+ subs \h, \h, #2
+ vmull.u8 q3, d4, d0
+ vmlal.u8 q3, d5, d1
+ vqrshrn.u16 d4, q3, #4
+ vst1.16 {d4[0]}, [\dst, :16], \d_strd
+ vst1.16 {d4[1]}, [\ds2, :16], \d_strd
+ bgt 2b
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+4:
+ vld1.8 {d4}, [\src], \s_strd
+ vld1.8 {d6}, [\sr2], \s_strd
+ vext.8 d5, d4, d4, #1
+ vext.8 d7, d6, d6, #1
+ vtrn.32 q2, q3
+ subs \h, \h, #2
+ vmull.u8 q3, d4, d0
+ vmlal.u8 q3, d5, d1
+.ifc \type, put
+ vqrshrn.u16 d4, q3, #4
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d4[1]}, [\ds2, :32], \d_strd
+.else
+ vst1.16 {d6}, [\dst, :64], \d_strd
+ vst1.16 {d7}, [\ds2, :64], \d_strd
+.endif
+ bgt 4b
+ pop {r4-r11,pc}
+
+80: // 8xN h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+8:
+ vld1.8 {q8}, [\src], \s_strd
+ vld1.8 {q10}, [\sr2], \s_strd
+ vext.8 q9, q8, q8, #1
+ vext.8 q11, q10, q10, #1
+ subs \h, \h, #2
+ vmull.u8 q8, d16, d0
+ vmull.u8 q10, d20, d0
+ vmlal.u8 q8, d18, d1
+ vmlal.u8 q10, d22, d1
+.ifc \type, put
+ vqrshrn.u16 d16, q8, #4
+ vqrshrn.u16 d18, q10, #4
+ vst1.8 {d16}, [\dst, :64], \d_strd
+ vst1.8 {d18}, [\ds2, :64], \d_strd
+.else
+ vst1.16 {q8}, [\dst, :128], \d_strd
+ vst1.16 {q10}, [\ds2, :128], \d_strd
+.endif
+ bgt 8b
+ pop {r4-r11,pc}
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+
+ sub \s_strd, \s_strd, \w
+ sub \s_strd, \s_strd, #8
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w
+.endif
+161:
+ vld1.8 {d16}, [\src]!
+ vld1.8 {d22}, [\sr2]!
+ mov \mx, \w
+
+16:
+ vld1.8 {d17,d18}, [\src]!
+ vld1.8 {d23,d24}, [\sr2]!
+ vext.8 q10, q8, q9, #1
+ vext.8 q13, q11, q12, #1
+ vmull.u8 q2, d16, d0
+ vmull.u8 q3, d17, d0
+ vmull.u8 q14, d22, d0
+ vmull.u8 q15, d23, d0
+ vmlal.u8 q2, d20, d1
+ vmlal.u8 q3, d21, d1
+ vmlal.u8 q14, d26, d1
+ vmlal.u8 q15, d27, d1
+ subs \mx, \mx, #16
+.ifc \type, put
+ vqrshrn.u16 d4, q2, #4
+ vqrshrn.u16 d5, q3, #4
+ vqrshrn.u16 d28, q14, #4
+ vqrshrn.u16 d29, q15, #4
+ vst1.8 {q2}, [\dst, :128]!
+ vst1.8 {q14}, [\ds2, :128]!
+.else
+ vst1.16 {q2, q3}, [\dst, :128]!
+ vst1.16 {q14, q15}, [\ds2, :128]!
+.endif
+ ble 9f
+
+ vmov d16, d18
+ vmov d22, d24
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ bgt 161b
+ pop {r4-r11,pc}
+
+L(\type\()_bilin_v):
+ cmp \h, #4
+ adr r9, L(\type\()_bilin_v_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_bilin_v_tbl):
+ .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+
+20: // 2xN v
+.ifc \type, put
+ cmp \h, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ // 2x2 v
+ vld1.16 {d16[]}, [\src], \s_strd
+ bgt 24f
+22:
+ vld1.16 {d17[]}, [\sr2], \s_strd
+ vld1.16 {d18[]}, [\src], \s_strd
+ vext.8 d16, d16, d17, #6
+ vext.8 d17, d17, d18, #6
+ vmull.u8 q2, d16, d2
+ vmlal.u8 q2, d17, d3
+ vqrshrn.u16 d4, q2, #4
+ vst1.16 {d4[0]}, [\dst, :16]
+ vst1.16 {d4[1]}, [\ds2, :16]
+ pop {r4-r11,pc}
+24: // 2x4, 2x6, 2x8, ... v
+ vld1.16 {d17[]}, [\sr2], \s_strd
+ vld1.16 {d18[]}, [\src], \s_strd
+ vld1.16 {d19[]}, [\sr2], \s_strd
+ vld1.16 {d20[]}, [\src], \s_strd
+ sub \h, \h, #4
+ vext.8 d16, d16, d17, #6
+ vext.8 d17, d17, d18, #6
+ vext.8 d18, d18, d19, #6
+ vext.8 d19, d19, d20, #6
+ vtrn.32 d16, d18
+ vtrn.32 d17, d19
+ vmull.u8 q2, d16, d2
+ vmlal.u8 q2, d17, d3
+ cmp \h, #2
+ vqrshrn.u16 d4, q2, #4
+ vst1.16 {d4[0]}, [\dst, :16], \d_strd
+ vst1.16 {d4[1]}, [\ds2, :16], \d_strd
+ vst1.16 {d4[2]}, [\dst, :16], \d_strd
+ vst1.16 {d4[3]}, [\ds2, :16], \d_strd
+ blt 0f
+ vmov d16, d20
+ beq 22b
+ b 24b
+0:
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN v
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vld1.32 {d16[]}, [\src], \s_strd
+4:
+ vld1.32 {d17[]}, [\sr2], \s_strd
+ vld1.32 {d18[]}, [\src], \s_strd
+ vext.8 d16, d16, d17, #4
+ vext.8 d17, d17, d18, #4
+ vmull.u8 q2, d16, d2
+ vmlal.u8 q2, d17, d3
+ subs \h, \h, #2
+.ifc \type, put
+ vqrshrn.u16 d4, q2, #4
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d4[1]}, [\ds2, :32], \d_strd
+.else
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+.endif
+ ble 0f
+ vmov d16, d18
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+80: // 8xN v
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vld1.8 {d16}, [\src], \s_strd
+8:
+ vld1.8 {d17}, [\sr2], \s_strd
+ vld1.8 {d18}, [\src], \s_strd
+ vmull.u8 q2, d16, d2
+ vmull.u8 q3, d17, d2
+ vmlal.u8 q2, d17, d3
+ vmlal.u8 q3, d18, d3
+ subs \h, \h, #2
+.ifc \type, put
+ vqrshrn.u16 d4, q2, #4
+ vqrshrn.u16 d6, q3, #4
+ vst1.8 {d4}, [\dst, :64], \d_strd
+ vst1.8 {d6}, [\ds2, :64], \d_strd
+.else
+ vst1.16 {q2}, [\dst, :128], \d_strd
+ vst1.16 {q3}, [\ds2, :128], \d_strd
+.endif
+ ble 0f
+ vmov d16, d18
+ b 8b
+0:
+ pop {r4-r11,pc}
+
+160: // 16xN, 32xN, ...
+320:
+640:
+1280:
+ mov \my, \h
+1:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.8 {q8}, [\src], \s_strd
+2:
+ vld1.8 {q9}, [\sr2], \s_strd
+ vld1.8 {q10}, [\src], \s_strd
+ vmull.u8 q12, d16, d2
+ vmull.u8 q13, d17, d2
+ vmull.u8 q14, d18, d2
+ vmull.u8 q15, d19, d2
+ vmlal.u8 q12, d18, d3
+ vmlal.u8 q13, d19, d3
+ vmlal.u8 q14, d20, d3
+ vmlal.u8 q15, d21, d3
+ subs \h, \h, #2
+.ifc \type, put
+ vqrshrn.u16 d24, q12, #4
+ vqrshrn.u16 d25, q13, #4
+ vqrshrn.u16 d28, q14, #4
+ vqrshrn.u16 d29, q15, #4
+ vst1.8 {q12}, [\dst, :128], \d_strd
+ vst1.8 {q14}, [\ds2, :128], \d_strd
+.else
+ vst1.16 {q12, q13}, [\dst, :128], \d_strd
+ vst1.16 {q14, q15}, [\ds2, :128], \d_strd
+.endif
+ ble 9f
+ vmov q8, q10
+ b 2b
+9:
+ subs \w, \w, #16
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #16
+.ifc \type, put
+ add \dst, \dst, #16
+.else
+ add \dst, \dst, #32
+.endif
+ b 1b
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_bilin_hv):
+ vmovl.u8 q2, d2
+ vmovl.u8 q3, d3
+ adr r9, L(\type\()_bilin_hv_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_bilin_hv_tbl):
+ .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+
+20: // 2xN hv
+.ifc \type, put
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.32 {d28[]}, [\src], \s_strd
+ vext.8 d29, d28, d28, #1
+ vmull.u8 q8, d28, d0
+ vmlal.u8 q8, d29, d1
+
+2:
+ vld1.32 {d28[]}, [\sr2], \s_strd
+ vld1.32 {d30[]}, [\src], \s_strd
+ vext.8 d29, d28, d28, #1
+ vext.8 d31, d30, d30, #1
+ vtrn.16 d28, d30
+ vtrn.16 d29, d31
+ vmull.u8 q9, d28, d0
+ vmlal.u8 q9, d29, d1
+
+ vtrn.32 d16, d18
+
+ vmul.u16 d20, d16, d4
+ vmla.u16 d20, d19, d6
+ vqrshrn.u16 d20, q10, #8
+ subs \h, \h, #2
+ vst1.16 {d20[0]}, [\dst, :16], \d_strd
+ vst1.16 {d20[1]}, [\ds2, :16], \d_strd
+ ble 0f
+ vtrn.32 d19, d16
+ b 2b
+0:
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN hv
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.8 {d28}, [\src], \s_strd
+ vext.8 d29, d28, d28, #1
+ vmull.u8 q8, d28, d0
+ vmlal.u8 q8, d29, d1
+
+4:
+ vld1.8 {d28}, [\sr2], \s_strd
+ vld1.8 {d30}, [\src], \s_strd
+ vext.8 d29, d28, d28, #1
+ vext.8 d31, d30, d30, #1
+ vtrn.32 d28, d30
+ vtrn.32 d29, d31
+ vmull.u8 q9, d28, d0
+ vmlal.u8 q9, d29, d1
+
+ vmov d17, d18
+
+ vmul.u16 q10, q8, q2
+ vmla.u16 q10, q9, q3
+ subs \h, \h, #2
+.ifc \type, put
+ vqrshrn.u16 d20, q10, #8
+ vst1.32 {d20[0]}, [\dst, :32], \d_strd
+ vst1.32 {d20[1]}, [\ds2, :32], \d_strd
+.else
+ vrshr.u16 q10, q10, #4
+ vst1.16 {d20}, [\dst, :64], \d_strd
+ vst1.16 {d21}, [\ds2, :64], \d_strd
+.endif
+ ble 0f
+ vmov d16, d19
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+80: // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+ mov \my, \h
+
+1:
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.8 {q12}, [\src], \s_strd
+ vext.8 q13, q12, q12, #1
+ vmull.u8 q8, d24, d0
+ vmlal.u8 q8, d26, d1
+
+2:
+ vld1.8 {q12}, [\sr2], \s_strd
+ vld1.8 {q14}, [\src], \s_strd
+ vext.8 q13, q12, q12, #1
+ vext.8 q15, q14, q14, #1
+ vmull.u8 q9, d24, d0
+ vmlal.u8 q9, d26, d1
+ vmull.u8 q10, d28, d0
+ vmlal.u8 q10, d30, d1
+
+ vmul.u16 q8, q8, q2
+ vmla.u16 q8, q9, q3
+ vmul.u16 q9, q9, q2
+ vmla.u16 q9, q10, q3
+ subs \h, \h, #2
+.ifc \type, put
+ vqrshrn.u16 d16, q8, #8
+ vqrshrn.u16 d18, q9, #8
+ vst1.8 {d16}, [\dst, :64], \d_strd
+ vst1.8 {d18}, [\ds2, :64], \d_strd
+.else
+ vrshr.u16 q8, q8, #4
+ vrshr.u16 q9, q9, #4
+ vst1.16 {q8}, [\dst, :128], \d_strd
+ vst1.16 {q9}, [\ds2, :128], \d_strd
+.endif
+ ble 9f
+ vmov q8, q10
+ b 2b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 1b
+0:
+ pop {r4-r11,pc}
+endfunc
+.endm
+
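+// The final shift_hv argument is the narrowing shift used at the end of the
+// combined h+v path: 10 for put, which reduces all the way to 8 bit pixels,
+// and 6 for prep, which keeps 16 bit intermediates.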
+filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, 10
+filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6
+
+.macro load_filter_ptr src
+ asr r12, \src, #10
+ add r12, r11, r12, lsl #3
+.endm
+
+.macro load_filter_coef dst, src, inc
+ add \src, \src, \inc
+ vld1.8 {\dst}, [r12, :64]
+.endm
+
+.macro load_filter_row dst, src, inc
+ load_filter_ptr \src
+ load_filter_coef \dst, \src, \inc
+.endm
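+// \src accumulates the subpel position in 1/1024 pel units; the position
+// >> 10 selects the filter (the callers add 512 for rounding) and each
+// mc_warp_filter entry is 8 bytes, hence the lsl #3. r11 is pre-offset by
+// 64*8 so that negative indices resolve into the table.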
+
+function warp_filter_horz_neon
+ load_filter_ptr r5 // filter 0
+ vld1.16 {q7}, [r2], r3
+ vmov.i8 q6, #128
+
+ load_filter_coef d0, r5, r7 // filter 0
+ load_filter_row d1, r5, r7 // filter 1
+ load_filter_row d2, r5, r7 // filter 2
+ load_filter_ptr r5 // filter 3
+ veor q7, q7, q6 // subtract 128 to allow using signed vmull
+ load_filter_coef d3, r5, r7 // filter 3
+ vext.8 d12, d14, d15, #1 // filter 1 pixels
+ vext.8 d13, d14, d15, #2 // filter 2 pixels
+ load_filter_ptr r5 // filter 4
+ vmull.s8 q2, d14, d0 // filter 0 output
+ vmull.s8 q3, d12, d1 // filter 1 output
+ load_filter_coef d0, r5, r7 // filter 4
+ load_filter_ptr r5 // filter 5
+ vext.8 d12, d14, d15, #3 // filter 3 pixels
+ vmull.s8 q4, d13, d2 // filter 2 output
+ vext.8 d13, d14, d15, #4 // filter 4 pixels
+ vpadd.i16 d4, d4, d5 // pixel 0 (4x16)
+ vpadd.i16 d5, d6, d7 // pixel 1 (4x16)
+ load_filter_coef d1, r5, r7 // filter 5
+ load_filter_ptr r5 // filter 6
+ vmull.s8 q5, d12, d3 // filter 3 output
+ vext.8 d12, d14, d15, #5 // filter 5 pixels
+ vmull.s8 q3, d13, d0 // filter 4 output
+ load_filter_coef d0, r5, r7 // filter 6
+ vext.8 d13, d14, d15, #6 // filter 6 pixels
+ load_filter_ptr r5 // filter 7
+ vpadd.i16 d8, d8, d9 // pixel 2 (4x16)
+ vpadd.i16 d9, d10, d11 // pixel 3 (4x16)
+ vmull.s8 q5, d12, d1 // filter 5 output
+ load_filter_coef d1, r5, r7 // filter 7
+ vext.8 d14, d14, d15, #7 // filter 7 pixels
+ vpadd.i16 d6, d6, d7 // pixel 4 (4x16)
+ vpadd.i16 d10, d10, d11 // pixel 5 (4x16)
+ vmull.s8 q6, d13, d0 // filter 6 output
+ vmull.s8 q7, d14, d1 // filter 7 output
+
+ sub r5, r5, r7, lsl #3
+
+ vpadd.i16 d4, d4, d5 // pixel 0,1 (2x16)
+ vpadd.i16 d5, d8, d9 // pixel 2,3 (2x16)
+ vpadd.i16 d12, d12, d13 // pixel 6 (4x16)
+ vpadd.i16 d14, d14, d15 // pixel 7 (4x16)
+ vpadd.i16 d6, d6, d10 // pixel 4,5 (2x16)
+ vpadd.i16 d10, d12, d14 // pixel 6,7 (2x16)
+ vpadd.i16 d4, d4, d5 // pixel 0-3
+ vpadd.i16 d5, d6, d10 // pixel 4-7
+
+ add r5, r5, r8
+
+ bx lr
+endfunc
+
+// void dav1d_warp_affine_8x8_8bpc_neon(
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *const abcd, int mx, int my)
+.macro warp t, shift
+function warp_affine_8x8\t\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldr r6, [sp, #108]
+ ldrd r8, r9, [r4]
+ sxth r7, r8
+ asr r8, r8, #16
+ asr r4, r9, #16
+ sxth r9, r9
+ mov r10, #8
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3
+ sub r2, r2, #3
+ movrel r11, X(mc_warp_filter), 64*8
+.ifnb \t
+ lsl r1, r1, #1
+.endif
+ add r5, r5, #512
+ add r6, r6, #512
+
+ bl warp_filter_horz_neon
+ vrshr.s16 q8, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q9, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q10, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q11, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q12, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q13, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q14, q2, #3
+
+1:
+ bl warp_filter_horz_neon
+ vrshr.s16 q15, q2, #3
+
+ load_filter_row d8, r6, r9
+ load_filter_row d9, r6, r9
+ load_filter_row d10, r6, r9
+ load_filter_row d11, r6, r9
+ load_filter_row d12, r6, r9
+ load_filter_row d13, r6, r9
+ load_filter_row d14, r6, r9
+ load_filter_row d15, r6, r9
+ transpose_8x8b q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15
+ vmovl.s8 q1, d8
+ vmovl.s8 q2, d9
+ vmovl.s8 q3, d10
+ vmovl.s8 q4, d11
+ vmovl.s8 q5, d12
+ vmovl.s8 q6, d13
+
+ sub r6, r6, r9, lsl #3
+
+ // This ordering of vmull/vmlal is highly beneficial for
+ // Cortex A8/A9/A53 here, but harmful for Cortex A7.
+ vmull.s16 q0, d16, d2
+ vmlal.s16 q0, d18, d4
+ vmlal.s16 q0, d20, d6
+ vmlal.s16 q0, d22, d8
+ vmlal.s16 q0, d24, d10
+ vmlal.s16 q0, d26, d12
+ vmull.s16 q1, d17, d3
+ vmlal.s16 q1, d19, d5
+ vmlal.s16 q1, d21, d7
+ vmlal.s16 q1, d23, d9
+ vmlal.s16 q1, d25, d11
+ vmlal.s16 q1, d27, d13
+
+ vmovl.s8 q2, d14
+ vmovl.s8 q3, d15
+
+ vmlal.s16 q0, d28, d4
+ vmlal.s16 q0, d30, d6
+ vmlal.s16 q1, d29, d5
+ vmlal.s16 q1, d31, d7
+
+.ifb \t
+ vmov.i16 q7, #128
+.else
+ vmov.i16 q7, #0x800
+.endif
+
+ vmov q8, q9
+ vmov q9, q10
+ vqrshrn.s32 d0, q0, #\shift
+ vmov q10, q11
+ vqrshrn.s32 d1, q1, #\shift
+ vmov q11, q12
+ vadd.i16 q0, q0, q7
+ vmov q12, q13
+.ifb \t
+ vqmovun.s16 d0, q0
+.endif
+ vmov q13, q14
+ vmov q14, q15
+ subs r10, r10, #1
+.ifnb \t
+ vst1.16 {q0}, [r0, :128], r1
+.else
+ vst1.8 {d0}, [r0, :64], r1
+.endif
+
+ add r6, r6, r4
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
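+// The plain variant writes 8 bit pixels (final shift 11); the "t" variant
+// writes the 16 bit intermediate form used by the compound paths (shift 7).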
+warp , 11
+warp t, 7
+
+// void dav1d_emu_edge_8bpc_neon(
+// const intptr_t bw, const intptr_t bh,
+// const intptr_t iw, const intptr_t ih,
+// const intptr_t x, const intptr_t y,
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_8bpc_neon, export=1
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+ ldrd r8, r9, [sp, #52]
+
+ // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ // ref += iclip(x, 0, iw - 1)
+ sub r12, r3, #1 // ih - 1
+ cmp r5, r3
+ sub lr, r2, #1 // iw - 1
+ it lt
+ movlt r12, r5 // min(y, ih - 1)
+ cmp r4, r2
+ bic r12, r12, r12, asr #31 // max(min(y, ih - 1), 0)
+ it lt
+ movlt lr, r4 // min(x, iw - 1)
+ bic lr, lr, lr, asr #31 // max(min(x, iw - 1), 0)
+ mla r8, r12, r9, r8 // ref += iclip() * stride
+ add r8, r8, lr // ref += iclip()
+
+ // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ // top_ext = iclip(-y, 0, bh - 1)
+ add r10, r5, r1 // y + bh
+ neg r5, r5 // -y
+ sub r10, r10, r3 // y + bh - ih
+ sub r12, r1, #1 // bh - 1
+ cmp r10, r1
+ bic r5, r5, r5, asr #31 // max(-y, 0)
+ it ge
+ movge r10, r12 // min(y + bh - ih, bh-1)
+ cmp r5, r1
+ bic r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0)
+ it ge
+ movge r5, r12 // min(max(-y, 0), bh-1)
+
+ // right_ext = iclip(x + bw - iw, 0, bw - 1)
+ // left_ext = iclip(-x, 0, bw - 1)
+ add r11, r4, r0 // x + bw
+ neg r4, r4 // -x
+ sub r11, r11, r2 // x + bw - iw
+ sub lr, r0, #1 // bw - 1
+ cmp r11, r0
+ bic r4, r4, r4, asr #31 // max(-x, 0)
+ it ge
+ movge r11, lr // min(x + bw - iw, bw-1)
+ cmp r4, r0
+ bic r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0)
+ it ge
+ movge r4, lr // min(max(-x, 0), bw - 1)
+
+ // center_h = bh - top_ext - bottom_ext
+ // dst += top_ext * PXSTRIDE(dst_stride)
+ // center_w = bw - left_ext - right_ext
+ sub r1, r1, r5 // bh - top_ext
+ mla r6, r5, r7, r6
+ sub r2, r0, r4 // bw - left_ext
+ sub r1, r1, r10 // center_h = bh - top_ext - bottom_ext
+ sub r2, r2, r11 // center_w = bw - left_ext - right_ext
+
+ mov r0, r6 // backup of dst
+
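+// Copies the center_h middle rows: the center_w source pixels are copied
+// as-is, while the left/right extensions (when needed) are filled by
+// replicating the first/last pixel of each row. The top/bottom extensions
+// are filled afterwards by duplicating whole rows.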
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+ vld1.8 {d0[], d1[]}, [r8]
+ mov r12, r6 // out = dst
+ mov r3, r4
+1:
+ subs r3, r3, #16
+ vst1.8 {q0}, [r12, :128]!
+ bgt 1b
+.endif
+ mov lr, r8
+ add r12, r6, r4 // out = dst + left_ext
+ mov r3, r2
+1:
+ vld1.8 {q0, q1}, [lr]!
+ subs r3, r3, #32
+.if \need_left
+ vst1.8 {q0, q1}, [r12]!
+.else
+ vst1.8 {q0, q1}, [r12, :128]!
+.endif
+ bgt 1b
+.if \need_right
+ add r3, r8, r2 // in + center_w
+ sub r3, r3, #1 // in + center_w - 1
+ add r12, r6, r4 // dst + left_ext
+ vld1.8 {d0[], d1[]}, [r3]
+ add r12, r12, r2 // out = dst + left_ext + center_w
+ mov r3, r11
+1:
+ subs r3, r3, #16
+ vst1.8 {q0}, [r12]!
+ bgt 1b
+.endif
+
+ subs r1, r1, #1 // center_h--
+ add r6, r6, r7
+ add r8, r8, r9
+ bgt 0b
+.endm
+
+ cmp r4, #0
+ beq 2f
+ // need_left
+ cmp r11, #0
+ beq 3f
+ // need_left + need_right
+ v_loop 1, 1
+ b 5f
+
+2:
+ // !need_left
+ cmp r11, #0
+ beq 4f
+ // !need_left + need_right
+ v_loop 0, 1
+ b 5f
+
+3:
+ // need_left + !need_right
+ v_loop 1, 0
+ b 5f
+
+4:
+ // !need_left + !need_right
+ v_loop 0, 0
+
+5:
+ cmp r10, #0
+ // Storing the original dst in r0 overwrote bw, recalculate it here
+ add r2, r2, r4 // center_w + left_ext
+ add r2, r2, r11 // bw = center_w + left_ext + right_ext
+
+ beq 3f
+ // need_bottom
+ sub r8, r6, r7 // ref = dst - stride
+ mov r4, r2
+1:
+ vld1.8 {q0, q1}, [r8, :128]!
+ mov r3, r10
+2:
+ subs r3, r3, #1
+ vst1.8 {q0, q1}, [r6, :128], r7
+ bgt 2b
+ mls r6, r7, r10, r6 // dst -= bottom_ext * stride
+ subs r4, r4, #32 // bw -= 32
+ add r6, r6, #32 // dst += 32
+ bgt 1b
+
+3:
+ cmp r5, #0
+ beq 3f
+ // need_top
+ mls r6, r7, r5, r0 // dst = stored_dst - top_ext * stride
+1:
+ vld1.8 {q0, q1}, [r0, :128]!
+ mov r3, r5
+2:
+ subs r3, r3, #1
+ vst1.8 {q0, q1}, [r6, :128], r7
+ bgt 2b
+ mls r6, r7, r5, r6 // dst -= top_ext * stride
+ subs r2, r2, #32 // bw -= 32
+ add r6, r6, #32 // dst += 32
+ bgt 1b
+
+3:
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/mc16.S b/third_party/dav1d/src/arm/32/mc16.S
new file mode 100644
index 0000000000..b7d845e219
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/mc16.S
@@ -0,0 +1,3658 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define PREP_BIAS 8192
+
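+// The 16 bpc prep buffers hold (pixel << intermediate_bits) - PREP_BIAS;
+// avg clamps the saturated sum from below, re-adds 2*PREP_BIAS plus the
+// rounding term and shifts right by intermediate_bits+1 to get back to
+// pixel range.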
+.macro avg d0, d00, d01, d1, d10, d11
+ vld1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q2, q3}, [r3, :128]!
+ vqadd.s16 q0, q0, q2
+ vqadd.s16 q1, q1, q3
+ vmax.s16 q0, q0, q12 // -2*PREP_BIAS - (1 << intermediate_bits)
+ vmax.s16 q1, q1, q12 // -2*PREP_BIAS - (1 << intermediate_bits)
+ vqsub.s16 q0, q0, q12 // -2*PREP_BIAS - (1 << intermediate_bits)
+ vqsub.s16 q1, q1, q12 // -2*PREP_BIAS - (1 << intermediate_bits)
+ vshl.s16 \d0, q0, q13 // -(intermediate_bits+1)
+ vshl.s16 \d1, q1, q13 // -(intermediate_bits+1)
+.endm
+
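+// w_avg blends as tmp2 + (((tmp1 - tmp2) * weight) >> 4), i.e.
+// (tmp1*weight + tmp2*(16 - weight)) >> 4, followed by the rounded shift by
+// intermediate_bits, the PREP_BIAS correction and the clamp to
+// [0, bitdepth_max].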
+.macro w_avg d0, d00, d01, d1, d10, d11
+ vld1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q2, q3}, [r3, :128]!
+ // This difference requires a 17 bit range, and all bits are
+ // significant for the following multiplication.
+ vsubl.s16 \d0, d4, d0
+ vsubl.s16 q0, d5, d1
+ vsubl.s16 \d1, d6, d2
+ vsubl.s16 q1, d7, d3
+ vmul.s32 \d0, \d0, q4
+ vmul.s32 q0, q0, q4
+ vmul.s32 \d1, \d1, q4
+ vmul.s32 q1, q1, q4
+ vshr.s32 \d0, \d0, #4
+ vshr.s32 q0, q0, #4
+ vshr.s32 \d1, \d1, #4
+ vshr.s32 q1, q1, #4
+ vaddw.s16 \d0, \d0, d4
+ vaddw.s16 q0, q0, d5
+ vaddw.s16 \d1, \d1, d6
+ vaddw.s16 q1, q1, d7
+ vmovn.i32 \d00, \d0
+ vmovn.i32 \d01, q0
+ vmovn.i32 \d10, \d1
+ vmovn.i32 \d11, q1
+ vrshl.s16 \d0, \d0, q13 // -intermediate_bits
+ vrshl.s16 \d1, \d1, q13 // -intermediate_bits
+ vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
+ vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
+ vmin.s16 \d0, \d0, q15 // bitdepth_max
+ vmin.s16 \d1, \d1, q15 // bitdepth_max
+ vmax.s16 \d0, \d0, q14 // 0
+ vmax.s16 \d1, \d1, q14 // 0
+.endm
+
+.macro mask d0, d00, d01, d1, d10, d11
+ vld1.8 {q7}, [r6, :128]!
+ vld1.16 {q0, q1}, [r2, :128]!
+ vneg.s8 q7, q7
+ vld1.16 {q2, q3}, [r3, :128]!
+ vmovl.s8 q6, d14
+ vmovl.s8 q7, d15
+ vmovl.s16 q4, d12
+ vmovl.s16 q5, d13
+ vmovl.s16 q6, d14
+ vmovl.s16 q7, d15
+ vsubl.s16 \d0, d4, d0
+ vsubl.s16 q0, d5, d1
+ vsubl.s16 \d1, d6, d2
+ vsubl.s16 q1, d7, d3
+ vmul.s32 \d0, \d0, q4
+ vmul.s32 q0, q0, q5
+ vmul.s32 \d1, \d1, q6
+ vmul.s32 q1, q1, q7
+ vshr.s32 \d0, \d0, #6
+ vshr.s32 q0, q0, #6
+ vshr.s32 \d1, \d1, #6
+ vshr.s32 q1, q1, #6
+ vaddw.s16 \d0, \d0, d4
+ vaddw.s16 q0, q0, d5
+ vaddw.s16 \d1, \d1, d6
+ vaddw.s16 q1, q1, d7
+ vmovn.i32 \d00, \d0
+ vmovn.i32 \d01, q0
+ vmovn.i32 \d10, \d1
+ vmovn.i32 \d11, q1
+ vrshl.s16 \d0, \d0, q13 // -intermediate_bits
+ vrshl.s16 \d1, \d1, q13 // -intermediate_bits
+ vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
+ vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
+ vmin.s16 \d0, \d0, q15 // bitdepth_max
+ vmin.s16 \d1, \d1, q15 // bitdepth_max
+ vmax.s16 \d0, \d0, q14 // 0
+ vmax.s16 \d1, \d1, q14 // 0
+.endm
+
+.macro bidir_fn type, bdmax
+function \type\()_16bpc_neon, export=1
+ push {r4-r7,lr}
+ ldrd r4, r5, [sp, #20]
+ ldr r6, [sp, #28]
+ clz r4, r4
+.ifnc \type, avg
+ ldr r7, [sp, #32]
+ vmov.i16 q14, #0
+ vdup.16 q15, r7 // bitdepth_max
+.endif
+.ifc \type, w_avg
+ vpush {q4}
+.endif
+.ifc \type, mask
+ vpush {q4-q7}
+.endif
+ clz r7, \bdmax
+ sub r7, r7, #18 // intermediate_bits = clz(bitdepth_max) - 18
+.ifc \type, avg
+ mov lr, #1
+ movw r12, #2*PREP_BIAS
+ lsl lr, lr, r7 // 1 << intermediate_bits
+ neg r12, r12 // -2*PREP_BIAS
+ add r7, r7, #1
+ sub r12, r12, lr // -2*PREP_BIAS - (1 << intermediate_bits)
+ neg r7, r7 // -(intermediate_bits+1)
+ vdup.16 q12, r12 // -2*PREP_BIAS - (1 << intermediate_bits)
+ vdup.16 q13, r7 // -(intermediate_bits+1)
+.else
+ mov r12, #PREP_BIAS
+ lsr r12, r12, r7 // PREP_BIAS >> intermediate_bits
+ neg r7, r7 // -intermediate_bits
+ vdup.16 q12, r12 // PREP_BIAS >> intermediate_bits
+ vdup.16 q13, r7 // -intermediate_bits
+.endif
+.ifc \type, w_avg
+ vdup.32 q4, r6
+ vneg.s32 q4, q4
+.endif
+ adr r7, L(\type\()_tbl)
+ sub r4, r4, #24
+ \type q8, d16, d17, q9, d18, d19
+ ldr r4, [r7, r4, lsl #2]
+ add r7, r7, r4
+ bx r7
+
+ .align 2
+L(\type\()_tbl):
+ .word 1280f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_tbl) + CONFIG_THUMB
+
+40:
+ add r7, r0, r1
+ lsl r1, r1, #1
+4:
+ subs r5, r5, #4
+ vst1.16 {d16}, [r0, :64], r1
+ vst1.16 {d17}, [r7, :64], r1
+ vst1.16 {d18}, [r0, :64], r1
+ vst1.16 {d19}, [r7, :64], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 4b
+80:
+ add r7, r0, r1
+ lsl r1, r1, #1
+8:
+ vst1.16 {q8}, [r0, :128], r1
+ subs r5, r5, #2
+ vst1.16 {q9}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 8b
+160:
+16:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #2
+ vst1.16 {q10, q11}, [r0, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 16b
+320:
+ add r7, r0, #32
+32:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #1
+ vst1.16 {q10, q11}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 32b
+640:
+ add r7, r0, #32
+ mov r12, #64
+ sub r1, r1, #64
+64:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #1
+ vst1.16 {q10, q11}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 64b
+1280:
+ add r7, r0, #32
+ mov r12, #64
+ sub r1, r1, #192
+128:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #1
+ vst1.16 {q10, q11}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 128b
+0:
+.ifc \type, mask
+ vpop {q4-q7}
+.endif
+.ifc \type, w_avg
+ vpop {q4}
+.endif
+ pop {r4-r7,pc}
+endfunc
+.endm
+
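+// The second argument is the register holding bitdepth_max on entry: r6 for
+// avg, r7 for w_avg (weight in r6) and mask (mask pointer in r6).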
+bidir_fn avg, r6
+bidir_fn w_avg, r7
+bidir_fn mask, r7
+
+
+.macro w_mask_fn type
+function w_mask_\type\()_16bpc_neon, export=1
+ push {r4-r10,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #96]
+ ldrd r6, r7, [sp, #104]
+ ldr r8, [sp, #112]
+ clz r9, r4
+ adr lr, L(w_mask_\type\()_tbl)
+ vdup.16 q15, r8 // bitdepth_max
+ sub r9, r9, #24
+ clz r8, r8 // clz(bitdepth_max)
+ ldr r9, [lr, r9, lsl #2]
+ add r9, lr, r9
+ sub r8, r8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
+ mov r10, #PREP_BIAS*64
+ neg r8, r8 // -sh
+ movw r12, #27615 // ((64 + 1 - 38) << mask_sh) - 1 - mask_rnd
+ vdup.32 q14, r8 // -sh
+ vdup.16 q0, r12
+.if \type == 444
+ vmov.i8 q1, #64
+.elseif \type == 422
+ vdup.8 d4, r7
+ vmov.i8 d2, #129
+ vsub.i16 d2, d2, d4
+.elseif \type == 420
+ vdup.16 q2, r7
+ vmov.i16 q1, #0x100
+ vsub.i16 q1, q1, q2
+.endif
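+ // The three layouts differ only in how the per-pixel 64-m values are
+ // condensed into the output mask: 444 stores one byte per pixel, 422
+ // averages horizontal pairs and 420 averages 2x2 blocks, with the sign
+ // term folded into the constant prepared in q1/d2 above.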
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r9
+
+ .align 2
+L(w_mask_\type\()_tbl):
+ .word 1280f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 640f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 320f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 160f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 8f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 4f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+
+4:
+ vld1.16 {q2, q3}, [r2, :128]! // tmp1 (four rows at once)
+ vld1.16 {q4, q5}, [r3, :128]! // tmp2 (four rows at once)
+ subs r5, r5, #4
+ vdup.32 q13, r10 // PREP_BIAS*64
+ vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2)
+ vabd.s16 q7, q3, q5
+ vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit)
+ vsubl.s16 q9, d9, d5
+ vsubl.s16 q10, d10, d6
+ vsubl.s16 q11, d11, d7
+ vqsub.u16 q6, q0, q6 // 27615 - abs()
+ vqsub.u16 q7, q0, q7
+ vshll.s16 q5, d7, #6 // tmp1 << 6
+ vshll.s16 q4, d6, #6
+ vshll.s16 q3, d5, #6
+ vshll.s16 q2, d4, #6
+ vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh
+ vshr.u16 q7, q7, #10
+ vadd.i32 q2, q2, q13 // += PREP_BIAS*64
+ vadd.i32 q3, q3, q13
+ vadd.i32 q4, q4, q13
+ vadd.i32 q5, q5, q13
+ vmovl.u16 q12, d12
+ vmovl.u16 q13, d13
+ vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m)
+ vmovl.u16 q12, d14
+ vmla.i32 q3, q9, q13
+ vmovl.u16 q13, d15
+ vmla.i32 q4, q10, q12
+ vmla.i32 q5, q11, q13
+ vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ vrshl.s32 q3, q3, q14
+ vrshl.s32 q4, q4, q14
+ vrshl.s32 q5, q5, q14
+ vqmovun.s32 d4, q2 // iclip_pixel
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q4
+ vqmovun.s32 d7, q5
+ vmin.u16 q2, q2, q15 // iclip_pixel
+ vmin.u16 q3, q3, q15 // iclip_pixel
+.if \type == 444
+ vmovn.i16 d12, q6 // 64 - m
+ vmovn.i16 d13, q7
+ vsub.i16 q6, q1, q6 // m
+ vst1.8 {q6}, [r6, :128]!
+.elseif \type == 422
+ vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition)
+ vpadd.i16 d13, d14, d15
+ vmovn.i16 d12, q6
+ vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+ vst1.8 {d12}, [r6, :64]!
+.elseif \type == 420
+ vadd.i16 d12, d12, d13 // (64 - my1) + (64 - my2) (row wise addition)
+ vadd.i16 d13, d14, d15
+ vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition)
+ vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.32 {d12[0]}, [r6, :32]!
+.endif
+ vst1.16 {d4}, [r0, :64], r1
+ vst1.16 {d5}, [r12, :64], r1
+ vst1.16 {d6}, [r0, :64], r1
+ vst1.16 {d7}, [r12, :64], r1
+ bgt 4b
+ vpop {q4-q7}
+ pop {r4-r10,pc}
+8:
+ vld1.16 {q2, q3}, [r2, :128]! // tmp1
+ vld1.16 {q4, q5}, [r3, :128]! // tmp2
+ subs r5, r5, #2
+ vdup.32 q13, r10 // PREP_BIAS*64
+ vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2)
+ vabd.s16 q7, q3, q5
+ vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit)
+ vsubl.s16 q9, d9, d5
+ vsubl.s16 q10, d10, d6
+ vsubl.s16 q11, d11, d7
+ vqsub.u16 q6, q0, q6 // 27615 - abs()
+ vqsub.u16 q7, q0, q7
+ vshll.s16 q5, d7, #6 // tmp1 << 6
+ vshll.s16 q4, d6, #6
+ vshll.s16 q3, d5, #6
+ vshll.s16 q2, d4, #6
+ vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh
+ vshr.u16 q7, q7, #10
+ vadd.i32 q2, q2, q13 // += PREP_BIAS*64
+ vadd.i32 q3, q3, q13
+ vadd.i32 q4, q4, q13
+ vadd.i32 q5, q5, q13
+ vmovl.u16 q12, d12
+ vmovl.u16 q13, d13
+ vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m)
+ vmovl.u16 q12, d14
+ vmla.i32 q3, q9, q13
+ vmovl.u16 q13, d15
+ vmla.i32 q4, q10, q12
+ vmla.i32 q5, q11, q13
+ vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ vrshl.s32 q3, q3, q14
+ vrshl.s32 q4, q4, q14
+ vrshl.s32 q5, q5, q14
+ vqmovun.s32 d4, q2 // iclip_pixel
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q4
+ vqmovun.s32 d7, q5
+ vmin.u16 q2, q2, q15 // iclip_pixel
+ vmin.u16 q3, q3, q15 // iclip_pixel
+.if \type == 444
+ vmovn.i16 d12, q6 // 64 - m
+ vmovn.i16 d13, q7
+ vsub.i16 q6, q1, q6 // m
+ vst1.8 {q6}, [r6, :128]!
+.elseif \type == 422
+ vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition)
+ vpadd.i16 d13, d14, d15
+ vmovn.i16 d12, q6
+ vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+ vst1.8 {d12}, [r6, :64]!
+.elseif \type == 420
+ vadd.i16 q6, q6, q7 // (64 - my1) + (64 - my2) (row wise addition)
+ vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition)
+ vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.32 {d12[0]}, [r6, :32]!
+.endif
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r12, :128], r1
+ bgt 8b
+ vpop {q4-q7}
+ pop {r4-r10,pc}
+1280:
+640:
+320:
+160:
+ sub r1, r1, r4, lsl #1
+.if \type == 444
+ add lr, r6, r4
+.elseif \type == 422
+ add lr, r6, r4, lsr #1
+.endif
+ add r7, r2, r4, lsl #1
+ add r9, r3, r4, lsl #1
+161:
+ mov r8, r4
+16:
+ vld1.16 {q2}, [r2, :128]! // tmp1
+ vld1.16 {q4}, [r3, :128]! // tmp2
+ vld1.16 {q3}, [r7, :128]!
+ vld1.16 {q5}, [r9, :128]!
+ subs r8, r8, #8
+ vdup.32 q13, r10 // PREP_BIAS*64
+ vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2)
+ vabd.s16 q7, q3, q5
+ vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bits)
+ vsubl.s16 q9, d9, d5
+ vsubl.s16 q10, d10, d6
+ vsubl.s16 q11, d11, d7
+ vqsub.u16 q6, q0, q6 // 27615 - abs()
+ vqsub.u16 q7, q0, q7
+ vshll.s16 q5, d7, #6 // tmp1 << 6
+ vshll.s16 q4, d6, #6
+ vshll.s16 q3, d5, #6
+ vshll.s16 q2, d4, #6
+ vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh
+ vshr.u16 q7, q7, #10
+ vadd.i32 q2, q2, q13 // += PREP_BIAS*64
+ vadd.i32 q3, q3, q13
+ vadd.i32 q4, q4, q13
+ vadd.i32 q5, q5, q13
+ vmovl.u16 q12, d12
+ vmovl.u16 q13, d13
+ vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m)
+ vmovl.u16 q12, d14
+ vmla.i32 q3, q9, q13
+ vmovl.u16 q13, d15
+ vmla.i32 q4, q10, q12
+ vmla.i32 q5, q11, q13
+ vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ vrshl.s32 q3, q3, q14
+ vrshl.s32 q4, q4, q14
+ vrshl.s32 q5, q5, q14
+ vqmovun.s32 d4, q2 // iclip_pixel
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q4
+ vqmovun.s32 d7, q5
+ vmin.u16 q2, q2, q15 // iclip_pixel
+ vmin.u16 q3, q3, q15 // iclip_pixel
+.if \type == 444
+ vmovn.i16 d12, q6 // 64 - m
+ vmovn.i16 d13, q7
+ vsub.i16 q6, q1, q6 // m
+ vst1.8 {d12}, [r6, :64]!
+ vst1.8 {d13}, [lr, :64]!
+.elseif \type == 422
+ vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition)
+ vpadd.i16 d13, d14, d15
+ vmovn.i16 d12, q6
+ vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+ vst1.32 {d12[0]}, [r6, :32]!
+ vst1.32 {d12[1]}, [lr, :32]!
+.elseif \type == 420
+ vadd.i16 q6, q6, q7 // (64 - my1) + (64 - my2) (row wise addition)
+ vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition)
+ vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.32 {d12[0]}, [r6, :32]!
+.endif
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q3}, [r12, :128]!
+ bgt 16b
+ subs r5, r5, #2
+ add r2, r2, r4, lsl #1
+ add r3, r3, r4, lsl #1
+ add r7, r7, r4, lsl #1
+ add r9, r9, r4, lsl #1
+.if \type == 444
+ add r6, r6, r4
+ add lr, lr, r4
+.elseif \type == 422
+ add r6, r6, r4, lsr #1
+ add lr, lr, r4, lsr #1
+.endif
+ add r0, r0, r1
+ add r12, r12, r1
+ bgt 161b
+ vpop {q4-q7}
+ pop {r4-r10,pc}
+endfunc
+.endm
+
+w_mask_fn 444
+w_mask_fn 422
+w_mask_fn 420
+
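+// The three blend functions below effectively compute, per pixel,
+//   dst = dst + (((tmp - dst)*m + 32) >> 6)
+//       = (dst*(64 - m) + tmp*m + 32) >> 6
+// via VQRDMULH with the mask prescaled to (-m) << 9, since
+//   vqrdmulh(a, -m << 9) = (2*a*(-m << 9) + (1 << 15)) >> 16
+//                        = (a*(-m) + 32) >> 6
+// with a = dst - tmp (a rough derivation, assuming no saturation).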
+function blend_16bpc_neon, export=1
+ push {r4-r5,lr}
+ ldrd r4, r5, [sp, #12]
+ clz lr, r3
+ adr r3, L(blend_tbl)
+ sub lr, lr, #26
+ ldr lr, [r3, lr, lsl #2]
+ add r3, r3, lr
+ bx r3
+
+ .align 2
+L(blend_tbl):
+ .word 320f - L(blend_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_tbl) + CONFIG_THUMB
+
+40:
+ add r12, r0, r1
+ lsl r1, r1, #1
+4:
+ vld1.8 {d4}, [r5, :64]!
+ vld1.16 {q1}, [r2, :128]!
+ vld1.16 {d0}, [r0, :64]
+ vneg.s8 d4, d4 // -m
+ subs r4, r4, #2
+ vld1.16 {d1}, [r12, :64]
+ vmovl.s8 q2, d4
+ vshl.i16 q2, q2, #9 // -m << 9
+ vsub.i16 q1, q0, q1 // a - b
+ vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6
+ vadd.i16 q0, q0, q1
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d1}, [r12, :64], r1
+ bgt 4b
+ pop {r4-r5,pc}
+80:
+ add r12, r0, r1
+ lsl r1, r1, #1
+8:
+ vld1.8 {q8}, [r5, :128]!
+ vld1.16 {q2, q3}, [r2, :128]!
+ vneg.s8 q9, q8 // -m
+ vld1.16 {q0}, [r0, :128]
+ vld1.16 {q1}, [r12, :128]
+ vmovl.s8 q8, d18
+ vmovl.s8 q9, d19
+ vshl.i16 q8, q8, #9 // -m << 9
+ vshl.i16 q9, q9, #9
+ vsub.i16 q2, q0, q2 // a - b
+ vsub.i16 q3, q1, q3
+ subs r4, r4, #2
+ vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q3, q3, q9
+ vadd.i16 q0, q0, q2
+ vadd.i16 q1, q1, q3
+ vst1.16 {q0}, [r0, :128], r1
+ vst1.16 {q1}, [r12, :128], r1
+ bgt 8b
+ pop {r4-r5,pc}
+160:
+ add r12, r0, r1
+ lsl r1, r1, #1
+16:
+ vld1.8 {q12, q13}, [r5, :128]!
+ vld1.16 {q8, q9}, [r2, :128]!
+ subs r4, r4, #2
+ vneg.s8 q14, q12 // -m
+ vld1.16 {q0, q1}, [r0, :128]
+ vneg.s8 q15, q13
+ vld1.16 {q10, q11}, [r2, :128]!
+ vmovl.s8 q12, d28
+ vmovl.s8 q13, d29
+ vmovl.s8 q14, d30
+ vmovl.s8 q15, d31
+ vld1.16 {q2, q3}, [r12, :128]
+ vshl.i16 q12, q12, #9 // -m << 9
+ vshl.i16 q13, q13, #9
+ vshl.i16 q14, q14, #9
+ vshl.i16 q15, q15, #9
+ vsub.i16 q8, q0, q8 // a - b
+ vsub.i16 q9, q1, q9
+ vsub.i16 q10, q2, q10
+ vsub.i16 q11, q3, q11
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q9, q9, q13
+ vqrdmulh.s16 q10, q10, q14
+ vqrdmulh.s16 q11, q11, q15
+ vadd.i16 q0, q0, q8
+ vadd.i16 q1, q1, q9
+ vadd.i16 q2, q2, q10
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vadd.i16 q3, q3, q11
+ vst1.16 {q2, q3}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5,pc}
+320:
+ add r12, r0, #32
+32:
+ vld1.8 {q12, q13}, [r5, :128]!
+ vld1.16 {q8, q9}, [r2, :128]!
+ subs r4, r4, #1
+ vneg.s8 q14, q12 // -m
+ vld1.16 {q0, q1}, [r0, :128]
+ vneg.s8 q15, q13
+ vld1.16 {q10, q11}, [r2, :128]!
+ vmovl.s8 q12, d28
+ vmovl.s8 q13, d29
+ vmovl.s8 q14, d30
+ vmovl.s8 q15, d31
+ vld1.16 {q2, q3}, [r12, :128]
+ vshl.i16 q12, q12, #9 // -m << 9
+ vshl.i16 q13, q13, #9
+ vshl.i16 q14, q14, #9
+ vshl.i16 q15, q15, #9
+ vsub.i16 q8, q0, q8 // a - b
+ vsub.i16 q9, q1, q9
+ vsub.i16 q10, q2, q10
+ vsub.i16 q11, q3, q11
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q9, q9, q13
+ vqrdmulh.s16 q10, q10, q14
+ vqrdmulh.s16 q11, q11, q15
+ vadd.i16 q0, q0, q8
+ vadd.i16 q1, q1, q9
+ vadd.i16 q2, q2, q10
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vadd.i16 q3, q3, q11
+ vst1.16 {q2, q3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5,pc}
+endfunc
+
+function blend_h_16bpc_neon, export=1
+ push {r4-r5,lr}
+ ldr r4, [sp, #12]
+ movrel r5, X(obmc_masks)
+ add r5, r5, r4
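+ // Only the top h - (h >> 2) rows are blended (the OBMC overlap region);
+ // the remaining rows are left untouched.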
+ sub r4, r4, r4, lsr #2
+ clz lr, r3
+ adr r12, L(blend_h_tbl)
+ sub lr, lr, #24
+ ldr lr, [r12, lr, lsl #2]
+ add r12, r12, lr
+ bx r12
+
+ .align 2
+L(blend_h_tbl):
+ .word 1280f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 640f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 320f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 20f - L(blend_h_tbl) + CONFIG_THUMB
+
+20:
+ add r12, r0, r1
+ lsl r1, r1, #1
+2:
+ vld2.8 {d4[], d5[]}, [r5, :16]!
+ vld1.16 {d2}, [r2, :64]!
+ vext.8 d4, d4, d5, #6
+ subs r4, r4, #2
+ vneg.s8 d4, d4 // -m
+ vld1.32 {d0[]}, [r0, :32]
+ vld1.32 {d0[1]}, [r12, :32]
+ vmovl.s8 q2, d4
+ vshl.i16 d4, d4, #9 // -m << 9
+ vsub.i16 d2, d0, d2 // a - b
+ vqrdmulh.s16 d2, d2, d4 // ((a-b)*-m + 32) >> 6
+ vadd.i16 d0, d0, d2
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[1]}, [r12, :32], r1
+ bgt 2b
+ pop {r4-r5,pc}
+40:
+ add r12, r0, r1
+ lsl r1, r1, #1
+4:
+ vld2.8 {d4[], d5[]}, [r5, :16]!
+ vld1.16 {q1}, [r2, :128]!
+ vext.8 d4, d4, d5, #4
+ subs r4, r4, #2
+ vneg.s8 d4, d4 // -m
+ vld1.16 {d0}, [r0, :64]
+ vld1.16 {d1}, [r12, :64]
+ vmovl.s8 q2, d4
+ vshl.i16 q2, q2, #9 // -m << 9
+ vsub.i16 q1, q0, q1 // a - b
+ vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6
+ vadd.i16 q0, q0, q1
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d1}, [r12, :64], r1
+ bgt 4b
+ pop {r4-r5,pc}
+80:
+ add r12, r0, r1
+ lsl r1, r1, #1
+8:
+ vld2.8 {d16[], d17[]}, [r5, :16]!
+ vld1.16 {q2, q3}, [r2, :128]!
+ vneg.s8 q9, q8 // -m
+ vld1.16 {q0}, [r0, :128]
+ subs r4, r4, #2
+ vmovl.s8 q8, d18
+ vmovl.s8 q9, d19
+ vld1.16 {q1}, [r12, :128]
+ vshl.i16 q8, q8, #9 // -m << 9
+ vshl.i16 q9, q9, #9
+ vsub.i16 q2, q0, q2 // a - b
+ vsub.i16 q3, q1, q3
+ vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q3, q3, q9
+ vadd.i16 q0, q0, q2
+ vadd.i16 q1, q1, q3
+ vst1.16 {q0}, [r0, :128], r1
+ vst1.16 {q1}, [r12, :128], r1
+ bgt 8b
+ pop {r4-r5,pc}
+160:
+ add r12, r0, r1
+ lsl r1, r1, #1
+16:
+ vld2.8 {d24[], d25[]}, [r5, :16]!
+ vld1.16 {q8, q9}, [r2, :128]!
+ subs r4, r4, #2
+ vneg.s8 q13, q12 // -m
+ vld1.16 {q0, q1}, [r0, :128]
+ vmovl.s8 q12, d26
+ vld1.16 {q10, q11}, [r2, :128]!
+ vmovl.s8 q13, d27
+ vld1.16 {q2, q3}, [r12, :128]
+ vshl.i16 q12, q12, #9 // -m << 9
+ vshl.i16 q13, q13, #9
+ vsub.i16 q8, q0, q8 // a - b
+ vsub.i16 q9, q1, q9
+ vsub.i16 q10, q2, q10
+ vsub.i16 q11, q3, q11
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q9, q9, q12
+ vqrdmulh.s16 q10, q10, q13
+ vqrdmulh.s16 q11, q11, q13
+ vadd.i16 q0, q0, q8
+ vadd.i16 q1, q1, q9
+ vadd.i16 q2, q2, q10
+ vadd.i16 q3, q3, q11
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vst1.16 {q2, q3}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5,pc}
+1280:
+640:
+320:
+ sub r1, r1, r3, lsl #1
+321:
+ vld1.8 {d24[]}, [r5]!
+ mov r12, r3
+ vneg.s8 d24, d24 // -m
+ vmovl.s8 q12, d24
+ vshl.i16 q12, q12, #9 // -m << 9
+32:
+ vld1.16 {q8, q9}, [r2, :128]!
+ vld1.16 {q0, q1}, [r0, :128]!
+ subs r12, r12, #32
+ vld1.16 {q10, q11}, [r2, :128]!
+ vld1.16 {q2, q3}, [r0, :128]
+ vsub.i16 q8, q0, q8 // a - b
+ vsub.i16 q9, q1, q9
+ vsub.i16 q10, q2, q10
+ vsub.i16 q11, q3, q11
+ sub r0, r0, #32
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q9, q9, q12
+ vqrdmulh.s16 q10, q10, q12
+ vqrdmulh.s16 q11, q11, q12
+ vadd.i16 q0, q0, q8
+ vadd.i16 q1, q1, q9
+ vadd.i16 q2, q2, q10
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q3, q3, q11
+ vst1.16 {q2, q3}, [r0, :128]!
+ bgt 32b
+ subs r4, r4, #1
+ add r0, r0, r1
+ bgt 321b
+ pop {r4-r5,pc}
+endfunc
+
+function blend_v_16bpc_neon, export=1
+ push {r4,lr}
+ ldr r4, [sp, #8]
+ movrel lr, X(obmc_masks)
+ add lr, lr, r3
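+ // Only the leftmost (w*3) >> 2 pixels of each row are blended (the OBMC
+ // overlap region); the per-width branches below store just that many
+ // pixels per row.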
+ clz r12, r3
+ adr r3, L(blend_v_tbl)
+ sub r12, r12, #26
+ ldr r12, [r3, r12, lsl #2]
+ add r3, r3, r12
+ bx r3
+
+ .align 2
+L(blend_v_tbl):
+ .word 320f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 20f - L(blend_v_tbl) + CONFIG_THUMB
+
+20:
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vld1.8 {d4[]}, [lr]
+ vneg.s8 d4, d4 // -m
+ vmovl.s8 q2, d4
+ vshl.i16 d4, d4, #9 // -m << 9
+2:
+ vld1.32 {d2[]}, [r2, :32]!
+ vld1.16 {d0[]}, [r0, :16]
+ subs r4, r4, #2
+ vld1.16 {d2[1]}, [r2, :16]
+ vld1.16 {d0[1]}, [r12, :16]
+ add r2, r2, #4
+ vsub.i16 d2, d0, d2 // a - b
+ vqrdmulh.s16 d2, d2, d4 // ((a-b)*-m + 32) >> 6
+ vadd.i16 d0, d0, d2
+ vst1.16 {d0[0]}, [r0, :16], r1
+ vst1.16 {d0[1]}, [r12, :16], r1
+ bgt 2b
+ pop {r4,pc}
+40:
+ vld1.32 {d4[]}, [lr, :32]
+ add r12, r0, r1
+ vneg.s8 d4, d4 // -m
+ lsl r1, r1, #1
+ vmovl.s8 q2, d4
+ sub r1, r1, #4
+ vshl.i16 q2, q2, #9 // -m << 9
+4:
+ vld1.16 {q1}, [r2, :128]!
+ vld1.16 {d0}, [r0, :64]
+ vld1.16 {d1}, [r12, :64]
+ subs r4, r4, #2
+ vsub.i16 q1, q0, q1 // a - b
+ vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6
+ vadd.i16 q0, q0, q1
+ vst1.32 {d0[0]}, [r0, :32]!
+ vst1.32 {d1[0]}, [r12, :32]!
+ vst1.16 {d0[2]}, [r0, :16], r1
+ vst1.16 {d1[2]}, [r12, :16], r1
+ bgt 4b
+ pop {r4,pc}
+80:
+ vld1.8 {d16}, [lr, :64]
+ add r12, r0, r1
+ vneg.s8 d16, d16 // -m
+ lsl r1, r1, #1
+ vmovl.s8 q8, d16
+ sub r1, r1, #8
+ vshl.i16 q8, q8, #9 // -m << 9
+8:
+ vld1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {q0}, [r0, :128]
+ vld1.16 {q1}, [r12, :128]
+ subs r4, r4, #2
+ vsub.i16 q2, q0, q2 // a - b
+ vsub.i16 q3, q1, q3
+ vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q3, q3, q8
+ vadd.i16 q0, q0, q2
+ vadd.i16 q1, q1, q3
+ vst1.16 {d0}, [r0, :64]!
+ vst1.16 {d2}, [r12, :64]!
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d3[0]}, [r12, :32], r1
+ bgt 8b
+ pop {r4,pc}
+160:
+ vld1.8 {q12}, [lr, :128]
+ add r12, r0, r1
+ vneg.s8 q13, q12 // -m
+ lsl r1, r1, #1
+ vmovl.s8 q12, d26
+ vmovl.s8 q13, d27
+ vshl.i16 q12, q12, #9 // -m << 9
+ vshl.i16 d26, d26, #9
+16:
+ vld1.16 {q8, q9}, [r2, :128]!
+ vld1.16 {d0, d1, d2}, [r0, :64]
+ subs r4, r4, #2
+ vld1.16 {q10, q11}, [r2, :128]!
+ vsub.i16 q8, q0, q8 // a - b
+ vld1.16 {d4, d5, d6}, [r12, :64]
+ vsub.i16 d18, d2, d18
+ vsub.i16 q10, q2, q10
+ vsub.i16 d22, d6, d22
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 d18, d18, d26
+ vqrdmulh.s16 q10, q10, q12
+ vqrdmulh.s16 d22, d22, d26
+ vadd.i16 q0, q0, q8
+ vadd.i16 d2, d2, d18
+ vadd.i16 q2, q2, q10
+ vst1.16 {d0, d1, d2}, [r0, :64], r1
+ vadd.i16 d6, d6, d22
+ vst1.16 {d4, d5, d6}, [r12, :64], r1
+ bgt 16b
+ pop {r4,pc}
+320:
+ vld1.8 {d24, d25, d26}, [lr, :64]
+ vneg.s8 q14, q12 // -m
+ vneg.s8 d30, d26
+ vmovl.s8 q12, d28
+ vmovl.s8 q13, d29
+ vmovl.s8 q14, d30
+ sub r1, r1, #32
+ vshl.i16 q12, q12, #9 // -m << 9
+ vshl.i16 q13, q13, #9
+ vshl.i16 q14, q14, #9
+32:
+ vld1.16 {q8, q9}, [r2, :128]!
+ vld1.16 {q0, q1}, [r0, :128]!
+ subs r4, r4, #1
+ vld1.16 {q10}, [r2, :128]
+ vsub.i16 q8, q0, q8 // a - b
+ vld1.16 {q2}, [r0, :128]
+ sub r0, r0, #32
+ vsub.i16 q9, q1, q9
+ vsub.i16 q10, q2, q10
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q9, q9, q13
+ vqrdmulh.s16 q10, q10, q14
+ vadd.i16 q0, q0, q8
+ vadd.i16 q1, q1, q9
+ vadd.i16 q2, q2, q10
+ vst1.16 {q0, q1}, [r0, :128]!
+ add r2, r2, #32
+ vst1.16 {q2}, [r0, :128], r1
+ bgt 32b
+ pop {r4,pc}
+endfunc
+
+// This has the same signature as the put_8tap functions,
+// and assumes that r9 is set to (clz(w)-24).
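+// A rough equivalent of the copy below (illustrative only):
+//   for (int y = 0; y < h; y++)
+//       memcpy(&dst[y*dst_stride], &src[y*src_stride], w*2);
+// where dst/src are byte pointers and the strides are in bytes.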
+function put_neon
+ adr r10, L(put_tbl)
+ ldr r9, [r10, r9, lsl #2]
+ add r10, r10, r9
+ bx r10
+
+ .align 2
+L(put_tbl):
+ .word 1280f - L(put_tbl) + CONFIG_THUMB
+ .word 640f - L(put_tbl) + CONFIG_THUMB
+ .word 320f - L(put_tbl) + CONFIG_THUMB
+ .word 16f - L(put_tbl) + CONFIG_THUMB
+ .word 80f - L(put_tbl) + CONFIG_THUMB
+ .word 4f - L(put_tbl) + CONFIG_THUMB
+ .word 2f - L(put_tbl) + CONFIG_THUMB
+
+2:
+ vld1.32 {d0[]}, [r2], r3
+ vld1.32 {d1[]}, [r2], r3
+ subs r5, r5, #2
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d1[1]}, [r0, :32], r1
+ bgt 2b
+ pop {r4-r11,pc}
+4:
+ vld1.16 {d0}, [r2], r3
+ vld1.16 {d1}, [r2], r3
+ subs r5, r5, #2
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d1}, [r0, :64], r1
+ bgt 4b
+ pop {r4-r11,pc}
+80:
+ add r8, r0, r1
+ lsl r1, r1, #1
+ add r9, r2, r3
+ lsl r3, r3, #1
+8:
+ vld1.16 {q0}, [r2], r3
+ vld1.16 {q1}, [r9], r3
+ subs r5, r5, #2
+ vst1.16 {q0}, [r0, :128], r1
+ vst1.16 {q1}, [r8, :128], r1
+ bgt 8b
+ pop {r4-r11,pc}
+16:
+ vld1.16 {q0, q1}, [r2], r3
+ subs r5, r5, #1
+ vst1.16 {q0, q1}, [r0, :128], r1
+ bgt 16b
+ pop {r4-r11,pc}
+320:
+ sub r1, r1, #32
+ sub r3, r3, #32
+32:
+ vld1.16 {q0, q1}, [r2]!
+ vst1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q2, q3}, [r2], r3
+ subs r5, r5, #1
+ vst1.16 {q2, q3}, [r0, :128], r1
+ bgt 32b
+ pop {r4-r11,pc}
+640:
+ sub r1, r1, #96
+ sub r3, r3, #96
+64:
+ vld1.16 {q8, q9}, [r2]!
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2]!
+ vst1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q12, q13}, [r2]!
+ vst1.16 {q12, q13}, [r0, :128]!
+ vld1.16 {q14, q15}, [r2], r3
+ subs r5, r5, #1
+ vst1.16 {q14, q15}, [r0, :128], r1
+ bgt 64b
+ pop {r4-r11,pc}
+1280:
+ sub r1, r1, #224
+ sub r3, r3, #224
+128:
+ vld1.16 {q8, q9}, [r2]!
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2]!
+ vst1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q12, q13}, [r2]!
+ vst1.16 {q12, q13}, [r0, :128]!
+ vld1.16 {q14, q15}, [r2]!
+ vst1.16 {q14, q15}, [r0, :128]!
+ vld1.16 {q8, q9}, [r2]!
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2]!
+ vst1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q12, q13}, [r2]!
+ vst1.16 {q12, q13}, [r0, :128]!
+ vld1.16 {q14, q15}, [r2], r3
+ subs r5, r5, #1
+ vst1.16 {q14, q15}, [r0, :128], r1
+ bgt 128b
+ pop {r4-r11,pc}
+endfunc
+
+// This has the same signature as the prep_8tap functions,
+// and assumes that r9 is set to (clz(w)-24), r7 to intermediate_bits and
+// r8 to w*2.
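+// A rough per-pixel equivalent (illustrative only):
+//   tmp[x] = (src[x] << intermediate_bits) - PREP_BIAS
+// with the output rows written contiguously (w*2 bytes per row).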
+function prep_neon
+ adr r10, L(prep_tbl)
+ ldr r9, [r10, r9, lsl #2]
+ vdup.16 q15, r7 // intermediate_bits
+ vmov.i16 q14, #PREP_BIAS
+ add r10, r10, r9
+ bx r10
+
+ .align 2
+L(prep_tbl):
+ .word 1280f - L(prep_tbl) + CONFIG_THUMB
+ .word 640f - L(prep_tbl) + CONFIG_THUMB
+ .word 320f - L(prep_tbl) + CONFIG_THUMB
+ .word 16f - L(prep_tbl) + CONFIG_THUMB
+ .word 80f - L(prep_tbl) + CONFIG_THUMB
+ .word 40f - L(prep_tbl) + CONFIG_THUMB
+
+40:
+ add r9, r1, r2
+ lsl r2, r2, #1
+4:
+ vld1.16 {d0}, [r1], r2
+ vld1.16 {d1}, [r9], r2
+ subs r4, r4, #2
+ vshl.s16 q0, q0, q15
+ vsub.i16 q0, q0, q14
+ vst1.16 {q0}, [r0, :128]!
+ bgt 4b
+ pop {r4-r11,pc}
+80:
+ add r9, r1, r2
+ lsl r2, r2, #1
+8:
+ vld1.16 {q0}, [r1], r2
+ vld1.16 {q1}, [r9], r2
+ subs r4, r4, #2
+ vshl.s16 q0, q0, q15
+ vshl.s16 q1, q1, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ bgt 8b
+ pop {r4-r11,pc}
+16:
+ vld1.16 {q0, q1}, [r1], r2
+ vshl.s16 q0, q0, q15
+ vld1.16 {q2, q3}, [r1], r2
+ subs r4, r4, #2
+ vshl.s16 q1, q1, q15
+ vshl.s16 q2, q2, q15
+ vshl.s16 q3, q3, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vsub.i16 q3, q3, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ bgt 16b
+ pop {r4-r11,pc}
+320:
+ sub r2, r2, #32
+32:
+ vld1.16 {q0, q1}, [r1]!
+ subs r4, r4, #1
+ vshl.s16 q0, q0, q15
+ vld1.16 {q2, q3}, [r1], r2
+ vshl.s16 q1, q1, q15
+ vshl.s16 q2, q2, q15
+ vshl.s16 q3, q3, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vsub.i16 q3, q3, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ bgt 32b
+ pop {r4-r11,pc}
+640:
+ sub r2, r2, #96
+64:
+ vld1.16 {q0, q1}, [r1]!
+ subs r4, r4, #1
+ vshl.s16 q0, q0, q15
+ vld1.16 {q2, q3}, [r1]!
+ vshl.s16 q1, q1, q15
+ vld1.16 {q8, q9}, [r1]!
+ vshl.s16 q2, q2, q15
+ vld1.16 {q10, q11}, [r1], r2
+ vshl.s16 q3, q3, q15
+ vshl.s16 q8, q8, q15
+ vshl.s16 q9, q9, q15
+ vshl.s16 q10, q10, q15
+ vshl.s16 q11, q11, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vsub.i16 q3, q3, q14
+ vsub.i16 q8, q8, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vsub.i16 q9, q9, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ vsub.i16 q10, q10, q14
+ vst1.16 {q8, q9}, [r0, :128]!
+ vsub.i16 q11, q11, q14
+ vst1.16 {q10, q11}, [r0, :128]!
+ bgt 64b
+ pop {r4-r11,pc}
+1280:
+ sub r2, r2, #224
+128:
+ vld1.16 {q0, q1}, [r1]!
+ subs r4, r4, #1
+ vshl.s16 q0, q0, q15
+ vld1.16 {q2, q3}, [r1]!
+ vshl.s16 q1, q1, q15
+ vld1.16 {q8, q9}, [r1]!
+ vshl.s16 q2, q2, q15
+ vld1.16 {q10, q11}, [r1]!
+ vshl.s16 q3, q3, q15
+ vshl.s16 q8, q8, q15
+ vshl.s16 q9, q9, q15
+ vshl.s16 q10, q10, q15
+ vshl.s16 q11, q11, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vsub.i16 q3, q3, q14
+ vsub.i16 q8, q8, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q0, q1}, [r1]!
+ vsub.i16 q9, q9, q14
+ vsub.i16 q10, q10, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ vld1.16 {q2, q3}, [r1]!
+ vsub.i16 q11, q11, q14
+ vshl.s16 q0, q0, q15
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q8, q9}, [r1]!
+ vshl.s16 q1, q1, q15
+ vshl.s16 q2, q2, q15
+ vst1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q10, q11}, [r1], r2
+ vshl.s16 q3, q3, q15
+ vshl.s16 q8, q8, q15
+ vshl.s16 q9, q9, q15
+ vshl.s16 q10, q10, q15
+ vshl.s16 q11, q11, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vsub.i16 q3, q3, q14
+ vsub.i16 q8, q8, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vsub.i16 q9, q9, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ vsub.i16 q10, q10, q14
+ vst1.16 {q8, q9}, [r0, :128]!
+ vsub.i16 q11, q11, q14
+ vst1.16 {q10, q11}, [r0, :128]!
+ bgt 128b
+ pop {r4-r11,pc}
+endfunc
+
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ vld1.\wd {\d0[]}, [\s0], \strd
+ vld1.\wd {\d1[]}, [\s1], \strd
+.ifnb \d2
+ vld1.\wd {\d2[]}, [\s0], \strd
+ vld1.\wd {\d3[]}, [\s1], \strd
+.endif
+.ifnb \d4
+ vld1.\wd {\d4[]}, [\s0], \strd
+.endif
+.ifnb \d5
+ vld1.\wd {\d5[]}, [\s1], \strd
+.endif
+.ifnb \d6
+ vld1.\wd {\d6[]}, [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ vld1.16 {\d0}, [\s0], \strd
+ vld1.16 {\d1}, [\s1], \strd
+.ifnb \d2
+ vld1.16 {\d2}, [\s0], \strd
+ vld1.16 {\d3}, [\s1], \strd
+.endif
+.ifnb \d4
+ vld1.16 {\d4}, [\s0], \strd
+.endif
+.ifnb \d5
+ vld1.16 {\d5}, [\s1], \strd
+.endif
+.ifnb \d6
+ vld1.16 {\d6}, [\s0], \strd
+.endif
+.endm
+.macro load_regpair s0, s1, strd, d0, d1, d2, d3, d4, d5
+ vld1.16 {\d0, \d1}, [\s0], \strd
+.ifnb \d2
+ vld1.16 {\d2, \d3}, [\s1], \strd
+.endif
+.ifnb \d4
+ vld1.16 {\d4, \d5}, [\s0], \strd
+.endif
+.endm
+.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_slice \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16s16 s0, s1, strd, d0, d1, d2, d3, d4, d5
+ load_regpair \s0, \s1, \strd, \d0, \d1, \d2, \d3, \d4, \d5
+.endm
+.macro interleave_1_32 r0, r1, r2, r3, r4
+ vext.8 \r0, \r0, \r1, #4
+ vext.8 \r1, \r1, \r2, #4
+.ifnb \r3
+ vext.8 \r2, \r2, \r3, #4
+ vext.8 \r3, \r3, \r4, #4
+.endif
+.endm
+.macro vmin_u16 c, r0, r1, r2, r3
+ vmin.u16 \r0, \r0, \c
+.ifnb \r1
+ vmin.u16 \r1, \r1, \c
+.endif
+.ifnb \r2
+ vmin.u16 \r2, \r2, \c
+ vmin.u16 \r3, \r3, \c
+.endif
+.endm
+.macro vsub_i16 c, r0, r1, r2, r3
+ vsub.i16 \r0, \r0, \c
+.ifnb \r1
+ vsub.i16 \r1, \r1, \c
+.endif
+.ifnb \r2
+ vsub.i16 \r2, \r2, \c
+ vsub.i16 \r3, \r3, \c
+.endif
+.endm
+.macro vmull_vmlal_4 d, s0, s1, s2, s3
+ vmull.s16 \d, \s0, d0[0]
+ vmlal.s16 \d, \s1, d0[1]
+ vmlal.s16 \d, \s2, d0[2]
+ vmlal.s16 \d, \s3, d0[3]
+.endm
+.macro vmull_vmlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+ vmull.s16 \d, \s0, d0[0]
+ vmlal.s16 \d, \s1, d0[1]
+ vmlal.s16 \d, \s2, d0[2]
+ vmlal.s16 \d, \s3, d0[3]
+ vmlal.s16 \d, \s4, d1[0]
+ vmlal.s16 \d, \s5, d1[1]
+ vmlal.s16 \d, \s6, d1[2]
+ vmlal.s16 \d, \s7, d1[3]
+.endm
+.macro vqrshrun_s32 shift, q0, d0, q1, d1, q2, d2, q3, d3
+ vqrshrun.s32 \d0, \q0, #\shift
+.ifnb \q1
+ vqrshrun.s32 \d1, \q1, #\shift
+.endif
+.ifnb \q2
+ vqrshrun.s32 \d2, \q2, #\shift
+ vqrshrun.s32 \d3, \q3, #\shift
+.endif
+.endm
+.macro vmovn_i32 q0, d0, q1, d1, q2, d2, q3, d3
+ vmovn.i32 \d0, \q0
+.ifnb \q1
+ vmovn.i32 \d1, \q1
+.endif
+.ifnb \q2
+ vmovn.i32 \d2, \q2
+ vmovn.i32 \d3, \q3
+.endif
+.endm
+.macro vrshl_s32 shift, r0, r1, r2, r3
+ vrshl.s32 \r0, \r0, \shift
+ vrshl.s32 \r1, \r1, \shift
+.ifnb \r2
+ vrshl.s32 \r2, \r2, \shift
+ vrshl.s32 \r3, \r3, \shift
+.endif
+.endm
+.macro vst1_32 strd, r0, r1
+ vst1.32 {\r0[0]}, [r0, :32], \strd
+ vst1.32 {\r0[1]}, [r9, :32], \strd
+.ifnb \r1
+ vst1.32 {\r1[0]}, [r0, :32], \strd
+ vst1.32 {\r1[1]}, [r9, :32], \strd
+.endif
+.endm
+.macro vst1_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7
+ vst1.16 {\r0}, [r0, \align], \strd
+ vst1.16 {\r1}, [r9, \align], \strd
+.ifnb \r2
+ vst1.16 {\r2}, [r0, \align], \strd
+ vst1.16 {\r3}, [r9, \align], \strd
+.endif
+.ifnb \r4
+ vst1.16 {\r4}, [r0, \align], \strd
+ vst1.16 {\r5}, [r9, \align], \strd
+ vst1.16 {\r6}, [r0, \align], \strd
+ vst1.16 {\r7}, [r9, \align], \strd
+.endif
+.endm
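+// finalize: for put, round the 32 bit sums by 6, saturate to unsigned and
+// clamp to bitdepth_max (q15); for prep, shift by -(6-intermediate_bits)
+// (q14), narrow and subtract PREP_BIAS (q15).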
+.macro finalize type, q0, q1, d0, d1, q2, q3, d2, d3
+.ifc \type, put
+ vqrshrun_s32 6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
+ vmin_u16 q15, \q0, \q1
+.else
+ vrshl_s32 q14, \q0, \q1, \q2, \q3 // -(6-intermediate_bits)
+ vmovn_i32 \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
+ vsub_i16 q15, \q0, \q1 // PREP_BIAS
+.endif
+.endm
+.macro shift_store_4 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
+ finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
+ vst1_reg \strd, :64, \d0, \d1, \d2, \d3
+.endm
+.macro shift_store_8 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
+ finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
+ vst1_reg \strd, :128, \q0, \q1
+.endm
+.macro shift_store_16 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
+ finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
+ vst1.16 {\q0, \q1}, [r0, :128], \strd
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+ movw r9, \type_h
+ movw r10, \type_v
+ b \op\()_8tap_neon
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH ((1*15<<7)|4*15)
+#define SHARP ((2*15<<7)|3*15)
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, ds2, sr2
+make_8tap_fn \type, regular, REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp, REGULAR, SHARP
+make_8tap_fn \type, smooth, SMOOTH, SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
+make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
+make_8tap_fn \type, sharp, SHARP, SHARP
+make_8tap_fn \type, sharp_regular, SHARP, REGULAR
+make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
+
+function \type\()_8tap_neon
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+.ifc \bdmax, r8
+ ldr r8, [sp, #52]
+.endif
+ movw r11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
+ mul \mx, \mx, r11
+ mul \my, \my, r11
+ add \mx, \mx, r9 // mx, 8tap_h, 4tap_h
+ add \my, \my, r10 // my, 8tap_v, 4tap_v
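+ // Multiplying by 0x4081 replicates the 4-bit subpel position into bits
+ // 0, 7 and 14 of mx/my; after adding REGULAR/SMOOTH/SHARP, bits 14+ only
+ // serve to test for a nonzero subpel offset, bits 7-13 select the 8-tap
+ // mc_subpel_filters row (used for w/h > 4) and bits 0-6 the 4-tap variant
+ // (used for w/h <= 4), with 15 table entries per filter type.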
+
+.ifc \type, prep
+ lsl \d_strd, \w, #1
+.endif
+
+ vdup.16 q15, \bdmax // bitdepth_max
+ clz \bdmax, \bdmax
+ clz r9, \w
+ sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
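+ // e.g. bitdepth_max = 0x3ff (10 bpc): clz = 22, intermediate_bits = 4;
+ // bitdepth_max = 0xfff (12 bpc): clz = 20, intermediate_bits = 2.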
+ tst \mx, #(0x7f << 14)
+ sub r9, r9, #24
+ add lr, \bdmax, #6 // 6 + intermediate_bits
+ rsb r12, \bdmax, #6 // 6 - intermediate_bits
+ movrel r11, X(mc_subpel_filters), -8
+ bne L(\type\()_8tap_h)
+ tst \my, #(0x7f << 14)
+ bne L(\type\()_8tap_v)
+ b \type\()_neon
+
+L(\type\()_8tap_h):
+ cmp \w, #4
+ ubfx r10, \mx, #7, #7
+ and \mx, \mx, #0x7f
+ it gt
+ movgt \mx, r10
+ tst \my, #(0x7f << 14)
+ add \mx, r11, \mx, lsl #3
+ bne L(\type\()_8tap_hv)
+
+ adr r10, L(\type\()_8tap_h_tbl)
+ vdup.32 q14, r12 // 6 - intermediate_bits
+ ldr r9, [r10, r9, lsl #2]
+ vneg.s32 q14, q14 // -(6-intermediate_bits)
+.ifc \type, put
+ vdup.16 q13, \bdmax // intermediate_bits
+.else
+ vmov.i16 q13, #PREP_BIAS
+.endif
+ add r10, r10, r9
+.ifc \type, put
+ vneg.s16 q13, q13 // -intermediate_bits
+.endif
+ bx r10
+
+ .align 2
+L(\type\()_8tap_h_tbl):
+ .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+
+20: // 2xN h
+.ifc \type, put
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ sub \src, \src, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+2:
+ vld1.16 {q2}, [\src], \s_strd
+ vld1.16 {q3}, [\sr2], \s_strd
+ vext.8 d5, d4, d5, #2
+ vext.8 d7, d6, d7, #2
+ subs \h, \h, #2
+ vtrn.32 d4, d6
+ vtrn.32 d5, d7
+ vmull.s16 q1, d4, d0[0]
+ vmlal.s16 q1, d5, d0[1]
+ vmlal.s16 q1, d6, d0[2]
+ vmlal.s16 q1, d7, d0[3]
+ vrshl.s32 q1, q1, q14 // -(6-intermediate_bits)
+ vqmovun.s32 d2, q1
+ vrshl.s16 d2, d2, d26 // -intermediate_bits
+ vmin.u16 d2, d2, d30
+ vst1.32 {d2[0]}, [\dst, :32], \d_strd
+ vst1.32 {d2[1]}, [\ds2, :32], \d_strd
+ bgt 2b
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN h
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ sub \src, \src, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+4:
+ vld1.16 {q8}, [\src], \s_strd
+ vld1.16 {q11}, [\sr2], \s_strd
+ vext.8 d18, d16, d17, #2
+ vext.8 d19, d16, d17, #4
+ vext.8 d20, d16, d17, #6
+ vext.8 d24, d22, d23, #2
+ vext.8 d25, d22, d23, #4
+ vext.8 d21, d22, d23, #6
+ subs \h, \h, #2
+ vmull.s16 q2, d16, d0[0]
+ vmlal.s16 q2, d18, d0[1]
+ vmlal.s16 q2, d19, d0[2]
+ vmlal.s16 q2, d20, d0[3]
+ vmull.s16 q3, d22, d0[0]
+ vmlal.s16 q3, d24, d0[1]
+ vmlal.s16 q3, d25, d0[2]
+ vmlal.s16 q3, d21, d0[3]
+ vrshl.s32 q2, q2, q14 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+.ifc \type, put
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vrshl.s16 q2, q2, q13 // -intermediate_bits
+ vmin.u16 q2, q2, q15
+.else
+ vmovn.s32 d4, q2
+ vmovn.s32 d5, q3
+ vsub.i16 q2, q2, q13 // PREP_BIAS
+.endif
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+ bgt 4b
+ pop {r4-r11,pc}
+
+80:
+160:
+320:
+640:
+1280: // 8xN, 16xN, 32xN, ... h
+ vpush {q4-q5}
+ vld1.8 {d0}, [\mx, :64]
+ sub \src, \src, #6
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+
+ sub \s_strd, \s_strd, \w, lsl #1
+ sub \s_strd, \s_strd, #16
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, lsl #1
+.endif
+81:
+ vld1.16 {q8, q9}, [\src]!
+ vld1.16 {q10, q11}, [\sr2]!
+ mov \mx, \w
+
+8:
+ vmull.s16 q1, d16, d0[0]
+ vmull.s16 q2, d17, d0[0]
+ vmull.s16 q3, d20, d0[0]
+ vmull.s16 q4, d21, d0[0]
+.irpc i, 1234567
+ vext.8 q12, q8, q9, #(2*\i)
+ vext.8 q5, q10, q11, #(2*\i)
+.if \i < 4
+ vmlal.s16 q1, d24, d0[\i]
+ vmlal.s16 q2, d25, d0[\i]
+ vmlal.s16 q3, d10, d0[\i]
+ vmlal.s16 q4, d11, d0[\i]
+.else
+ vmlal.s16 q1, d24, d1[\i-4]
+ vmlal.s16 q2, d25, d1[\i-4]
+ vmlal.s16 q3, d10, d1[\i-4]
+ vmlal.s16 q4, d11, d1[\i-4]
+.endif
+.endr
+ subs \mx, \mx, #8
+ vrshl.s32 q1, q1, q14 // -(6-intermediate_bits)
+ vrshl.s32 q2, q2, q14 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vrshl.s32 q4, q4, q14 // -(6-intermediate_bits)
+.ifc \type, put
+ vqmovun.s32 d2, q1
+ vqmovun.s32 d3, q2
+ vqmovun.s32 d4, q3
+ vqmovun.s32 d5, q4
+ vrshl.s16 q1, q1, q13 // -intermediate_bits
+ vrshl.s16 q2, q2, q13 // -intermediate_bits
+ vmin.u16 q1, q1, q15
+ vmin.u16 q2, q2, q15
+.else
+ vmovn.s32 d2, q1
+ vmovn.s32 d3, q2
+ vmovn.s32 d4, q3
+ vmovn.s32 d5, q4
+ vsub.i16 q1, q1, q13 // PREP_BIAS
+ vsub.i16 q2, q2, q13 // PREP_BIAS
+.endif
+ vst1.16 {q1}, [\dst, :128]!
+ vst1.16 {q2}, [\ds2, :128]!
+ ble 9f
+
+ vmov q8, q9
+ vmov q10, q11
+ vld1.16 {q9}, [\src]!
+ vld1.16 {q11}, [\sr2]!
+ b 8b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ bgt 81b
+ vpop {q4-q5}
+ pop {r4-r11,pc}
+
+
+L(\type\()_8tap_v):
+ cmp \h, #4
+ ubfx r10, \my, #7, #7
+ and \my, \my, #0x7f
+ it gt
+ movgt \my, r10
+ add \my, r11, \my, lsl #3
+
+.ifc \type, prep
+ vdup.32 q14, r12 // 6 - intermediate_bits
+ vmov.i16 q15, #PREP_BIAS
+.endif
+ adr r10, L(\type\()_8tap_v_tbl)
+ ldr r9, [r10, r9, lsl #2]
+.ifc \type, prep
+ vneg.s32 q14, q14 // -(6-intermediate_bits)
+.endif
+ add r10, r10, r9
+ bx r10
+
+ .align 2
+L(\type\()_8tap_v_tbl):
+ .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+
+20: // 2xN v
+.ifc \type, put
+ bgt 28f
+
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ // 2x2 v
+ load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ interleave_1_32 d1, d2, d3, d4, d5
+ bgt 24f
+ vmull_vmlal_4 q8, d1, d2, d3, d4
+ vqrshrun_s32 6, q8, d16
+ vmin_u16 d30, d16
+ vst1_32 \d_strd, d16
+ pop {r4-r11,pc}
+
+24: // 2x4 v
+ load_32 \sr2, \src, \s_strd, d6, d7
+ interleave_1_32 d5, d6, d7
+ vmull_vmlal_4 q8, d1, d2, d3, d4
+ vmull_vmlal_4 q9, d3, d4, d5, d6
+ vqrshrun_s32 6, q8, d16, q9, d17
+ vmin_u16 q15, q8
+ vst1_32 \d_strd, d16, d17
+ pop {r4-r11,pc}
+
+28: // 2x6, 2x8, 2x12, 2x16 v
+ vld1.8 {d0}, [\my, :64]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+
+ load_32 \src, \sr2, \s_strd, d2, d3, d4, d5, d6, d7, d16
+ interleave_1_32 d2, d3, d4, d5, d6
+ interleave_1_32 d6, d7, d16
+216:
+ subs \h, \h, #4
+ load_32 \sr2, \src, \s_strd, d17, d18, d19, d20
+ interleave_1_32 d16, d17, d18, d19, d20
+ vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17
+ vmull_vmlal_8 q1, d4, d5, d6, d7, d16, d17, d18, d19
+ vqrshrun_s32 6, q13, d26, q1, d27
+ vmin_u16 q15, q13
+ vst1_32 \d_strd, d26, d27
+ ble 0f
+ cmp \h, #2
+ vmov q1, q3
+ vmov q2, q8
+ vmov q3, q9
+ vmov d16, d20
+ beq 26f
+ b 216b
+26:
+ load_32 \sr2, \src, \s_strd, d17, d18
+ interleave_1_32 d16, d17, d18
+ vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17
+ vqrshrun_s32 6, q13, d26
+ vmin_u16 d30, d26
+ vst1_32 \d_strd, d26
+0:
+ pop {r4-r11,pc}
+.endif
+
+40:
+ bgt 480f
+
+ // 4x2, 4x4 v
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ vmull_vmlal_4 q8, d1, d2, d3, d4
+ vmull_vmlal_4 q9, d2, d3, d4, d5
+ shift_store_4 \type, \d_strd, q8, q9, d16, d17
+ ble 0f
+ load_reg \sr2, \src, \s_strd, d6, d7
+ vmull_vmlal_4 q8, d3, d4, d5, d6
+ vmull_vmlal_4 q9, d4, d5, d6, d7
+ shift_store_4 \type, \d_strd, q8, q9, d16, d17
+0:
+ pop {r4-r11,pc}
+
+480: // 4x6, 4x8, 4x12, 4x16 v
+ vld1.8 {d0}, [\my, :64]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_reg \src, \sr2, \s_strd, d16, d17, d18, d19, d20, d21, d22
+
+48:
+ subs \h, \h, #4
+ load_reg \sr2, \src, \s_strd, d23, d24, d25, d26
+ vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23
+ vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24
+ vmull_vmlal_8 q3, d18, d19, d20, d21, d22, d23, d24, d25
+ vmull_vmlal_8 q8, d19, d20, d21, d22, d23, d24, d25, d26
+ shift_store_4 \type, \d_strd, q1, q2, d2, d3, q3, q8, d4, d5
+ ble 0f
+ cmp \h, #2
+ vmov q8, q10
+ vmov q9, q11
+ vmov q10, q12
+ vmov d22, d26
+ beq 46f
+ b 48b
+46:
+ load_reg \sr2, \src, \s_strd, d23, d24
+ vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23
+ vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24
+ shift_store_4 \type, \d_strd, q1, q2, d2, d3
+0:
+ pop {r4-r11,pc}
+
+80:
+ bgt 880f
+
+ // 8x2, 8x4 v
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_reg \src, \sr2, \s_strd, q1, q2, q3, q8, q9
+ vmull_vmlal_4 q10, d2, d4, d6, d16
+ vmull_vmlal_4 q11, d3, d5, d7, d17
+ vmull_vmlal_4 q12, d4, d6, d16, d18
+ vmull_vmlal_4 q13, d5, d7, d17, d19
+ shift_store_8 \type, \d_strd, q10, q11, d20, d21, q12, q13, d22, d23
+ ble 0f
+ load_reg \sr2, \src, \s_strd, q10, q11
+ vmull_vmlal_4 q1, d6, d16, d18, d20
+ vmull_vmlal_4 q2, d7, d17, d19, d21
+ vmull_vmlal_4 q12, d16, d18, d20, d22
+ vmull_vmlal_4 q13, d17, d19, d21, d23
+ shift_store_8 \type, \d_strd, q1, q2, d2, d3, q12, q13, d4, d5
+0:
+ pop {r4-r11,pc}
+
+880: // 8x6, 8x8, 8x16, 8x32 v
+1680: // 16x8, 16x16, ...
+320: // 32x8, 32x16, ...
+640:
+1280:
+ vpush {q4-q7}
+ vld1.8 {d0}, [\my, :64]
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ vmovl.s8 q0, d0
+ mov \my, \h
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ load_reg \src, \sr2, \s_strd, q5, q6, q7, q8, q9, q10, q11
+
+88:
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, q12, q13
+ vmull_vmlal_8 q1, d10, d12, d14, d16, d18, d20, d22, d24
+ vmull_vmlal_8 q2, d11, d13, d15, d17, d19, d21, d23, d25
+ vmull_vmlal_8 q3, d12, d14, d16, d18, d20, d22, d24, d26
+ vmull_vmlal_8 q4, d13, d15, d17, d19, d21, d23, d25, d27
+ shift_store_8 \type, \d_strd, q1, q2, d2, d3, q3, q4, d4, d5
+ ble 9f
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, q1, q2
+ vmull_vmlal_8 q3, d14, d16, d18, d20, d22, d24, d26, d2
+ vmull_vmlal_8 q4, d15, d17, d19, d21, d23, d25, d27, d3
+ vmull_vmlal_8 q5, d16, d18, d20, d22, d24, d26, d2, d4
+ vmull_vmlal_8 q6, d17, d19, d21, d23, d25, d27, d3, d5
+ shift_store_8 \type, \d_strd, q3, q4, d6, d7, q5, q6, d8, d9
+ ble 9f
+ vmov q5, q9
+ vmov q6, q10
+ vmov q7, q11
+ vmov q8, q12
+ vmov q9, q13
+ vmov q10, q1
+ vmov q11, q2
+ b 88b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 168b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+160:
+ bgt 1680b
+
+ // 16x2, 16x4 v
+ vpush {q6-q7}
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ vmovl.s8 q0, d0
+
+ load_16s16 \src, \src, \s_strd, q6, q7, q8, q9, q10, q11
+16:
+ load_16s16 \src, \src, \s_strd, q12, q13
+ subs \h, \h, #1
+ vmull_vmlal_4 q1, d12, d16, d20, d24
+ vmull_vmlal_4 q2, d13, d17, d21, d25
+ vmull_vmlal_4 q3, d14, d18, d22, d26
+ vmull_vmlal_4 q6, d15, d19, d23, d27
+ shift_store_16 \type, \d_strd, q1, q2, d2, d3, q3, q6, d4, d5
+ ble 0f
+ vmov q6, q8
+ vmov q7, q9
+ vmov q8, q10
+ vmov q9, q11
+ vmov q10, q12
+ vmov q11, q13
+ b 16b
+0:
+ vpop {q6-q7}
+ pop {r4-r11,pc}
+
+
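+ // Horizontal+vertical filtering: the 8tap_filter_2/4/8 helpers below run
+ // the horizontal filter on the next two source rows and return them at
+ // intermediate precision (already shifted by -(6-intermediate_bits));
+ // the loops then apply the vertical filter across the buffered rows.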
+L(\type\()_8tap_hv):
+ cmp \h, #4
+ ubfx r10, \my, #7, #7
+ and \my, \my, #0x7f
+ it gt
+ movgt \my, r10
+4:
+ add \my, r11, \my, lsl #3
+
+ adr r10, L(\type\()_8tap_hv_tbl)
+ neg r12, r12 // -(6-intermediate_bits)
+ ldr r9, [r10, r9, lsl #2]
+ vdup.32 q14, r12 // -(6-intermediate_bits)
+.ifc \type, put
+ neg r8, lr // -(6+intermediate_bits)
+.else
+ vmov.i16 q13, #PREP_BIAS
+.endif
+ add r10, r10, r9
+.ifc \type, put
+ vdup.32 q13, r8 // -(6+intermediate_bits)
+.endif
+ bx r10
+
+ .align 2
+L(\type\()_8tap_hv_tbl):
+ .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+
+20:
+.ifc \type, put
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ bgt 280f
+ add \my, \my, #2
+ vld1.32 {d2[]}, [\my]
+
+ // 2x2, 2x4 hv
+ sub \sr2, \src, #2
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d22, d23, #2
+ vmull.s16 q11, d22, d0
+ vmull.s16 q12, d24, d0
+ vpadd.s32 d22, d22, d23
+ vpadd.s32 d23, d24, d25
+ vpadd.s32 d22, d22, d23
+ vrshl.s32 d16, d22, d28 // -(6-intermediate_bits)
+ vmovn.i32 d16, q8
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d16, d16, d16, #4
+ vext.8 d16, d16, d24, #4
+ vmov d17, d24
+
+2:
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d18, d17, d24, #4
+ vmull.s16 q2, d16, d2[0]
+ vmlal.s16 q2, d17, d2[1]
+ vmlal.s16 q2, d18, d2[2]
+ vmlal.s16 q2, d24, d2[3]
+
+ vrshl.s32 q2, q2, q13 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vmin.u16 d4, d4, d30
+ subs \h, \h, #2
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d4[1]}, [\ds2, :32], \d_strd
+ ble 0f
+ vmov d16, d18
+ vmov d17, d24
+ b 2b
+
+280: // 2x8, 2x16, 2x32 hv
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #2
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d22, d23, #2
+ vmull.s16 q11, d22, d0
+ vmull.s16 q12, d24, d0
+ vpadd.s32 d22, d22, d23
+ vpadd.s32 d23, d24, d25
+ vpadd.s32 d22, d22, d23
+ vrshl.s32 d16, d22, d28 // -(6-intermediate_bits)
+ vmovn.i32 d16, q8
+
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d16, d16, d16, #4
+ vext.8 d16, d16, d24, #4
+ vmov d17, d24
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d18, d17, d24, #4
+ vmov d19, d24
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d20, d19, d24, #4
+ vmov d21, d24
+
+28:
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d22, d21, d24, #4
+ vmull.s16 q3, d16, d2[0]
+ vmlal.s16 q3, d17, d2[1]
+ vmlal.s16 q3, d18, d2[2]
+ vmlal.s16 q3, d19, d2[3]
+ vmlal.s16 q3, d20, d3[0]
+ vmlal.s16 q3, d21, d3[1]
+ vmlal.s16 q3, d22, d3[2]
+ vmlal.s16 q3, d24, d3[3]
+
+ vrshl.s32 q3, q3, q13 // -(6+intermediate_bits)
+ vqmovun.s32 d6, q3
+ vmin.u16 d6, d6, d30
+ subs \h, \h, #2
+ vst1.32 {d6[0]}, [\dst, :32], \d_strd
+ vst1.32 {d6[1]}, [\ds2, :32], \d_strd
+ ble 0f
+ vmov q8, q9
+ vmov q9, q10
+ vmov d20, d22
+ vmov d21, d24
+ b 28b
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_filter_2):
+ vld1.16 {q11}, [\sr2], \s_strd
+ vld1.16 {q12}, [\src], \s_strd
+ vext.8 d23, d22, d23, #2
+ vext.8 d25, d24, d25, #2
+ vtrn.32 q11, q12
+ vmull.s16 q3, d22, d0[0]
+ vmlal.s16 q3, d23, d0[1]
+ vmlal.s16 q3, d24, d0[2]
+ vmlal.s16 q3, d25, d0[3]
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vmovn.i32 d24, q3
+ bx lr
+.endif
+
+40:
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ bgt 480f
+ add \my, \my, #2
+ vld1.32 {d2[]}, [\my]
+ sub \sr2, \src, #2
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ // 4x2, 4x4 hv
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d22, d23, #2
+ vext.8 d25, d22, d23, #4
+ vext.8 d23, d22, d23, #6
+ vmull.s16 q10, d22, d0[0]
+ vmlal.s16 q10, d24, d0[1]
+ vmlal.s16 q10, d25, d0[2]
+ vmlal.s16 q10, d23, d0[3]
+ vrshl.s32 q10, q10, q14 // -(6-intermediate_bits)
+ vmovn.i32 d17, q10
+
+ bl L(\type\()_8tap_filter_4)
+ vmov q9, q12
+
+4:
+ bl L(\type\()_8tap_filter_4)
+ vmull.s16 q2, d17, d2[0]
+ vmlal.s16 q2, d18, d2[1]
+ vmlal.s16 q2, d19, d2[2]
+ vmlal.s16 q2, d24, d2[3]
+ vmull.s16 q3, d18, d2[0]
+ vmlal.s16 q3, d19, d2[1]
+ vmlal.s16 q3, d24, d2[2]
+ vmlal.s16 q3, d25, d2[3]
+.ifc \type, put
+ vrshl.s32 q2, q2, q13 // -(6+intermediate_bits)
+ vrshl.s32 q3, q3, q13 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vmin.u16 q2, q2, q15
+.else
+ vrshrn.i32 d4, q2, #6
+ vrshrn.i32 d5, q3, #6
+ vsub.i16 q2, q2, q13 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+ ble 0f
+ vmov d17, d19
+ vmov q9, q12
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+480: // 4x8, 4x16, 4x32 hv
+ vpush {d13-d15}
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #2
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d22, d23, #2
+ vext.8 d25, d22, d23, #4
+ vext.8 d23, d22, d23, #6
+ vmull.s16 q10, d22, d0[0]
+ vmlal.s16 q10, d24, d0[1]
+ vmlal.s16 q10, d25, d0[2]
+ vmlal.s16 q10, d23, d0[3]
+ vrshl.s32 q10, q10, q14 // -(6-intermediate_bits)
+ vmovn.i32 d13, q10
+
+ bl L(\type\()_8tap_filter_4)
+ vmov q7, q12
+ bl L(\type\()_8tap_filter_4)
+ vmov q8, q12
+ bl L(\type\()_8tap_filter_4)
+ vmov q9, q12
+
+48:
+ bl L(\type\()_8tap_filter_4)
+ vmull.s16 q2, d13, d2[0]
+ vmlal.s16 q2, d14, d2[1]
+ vmlal.s16 q2, d15, d2[2]
+ vmlal.s16 q2, d16, d2[3]
+ vmlal.s16 q2, d17, d3[0]
+ vmlal.s16 q2, d18, d3[1]
+ vmlal.s16 q2, d19, d3[2]
+ vmlal.s16 q2, d24, d3[3]
+ vmull.s16 q3, d14, d2[0]
+ vmlal.s16 q3, d15, d2[1]
+ vmlal.s16 q3, d16, d2[2]
+ vmlal.s16 q3, d17, d2[3]
+ vmlal.s16 q3, d18, d3[0]
+ vmlal.s16 q3, d19, d3[1]
+ vmlal.s16 q3, d24, d3[2]
+ vmlal.s16 q3, d25, d3[3]
+.ifc \type, put
+ vrshl.s32 q2, q2, q13 // -(6+intermediate_bits)
+ vrshl.s32 q3, q3, q13 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vmin.u16 q2, q2, q15
+.else
+ vrshrn.i32 d4, q2, #6
+ vrshrn.i32 d5, q3, #6
+ vsub.i16 q2, q2, q13 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+ ble 0f
+ vmov d13, d15
+ vmov q7, q8
+ vmov q8, q9
+ vmov q9, q12
+ b 48b
+0:
+ vpop {d13-d15}
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_filter_4):
+ vld1.16 {q10}, [\sr2], \s_strd
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d20, d21, #2
+ vext.8 d25, d20, d21, #4
+ vext.8 d21, d20, d21, #6
+ vmull.s16 q3, d20, d0[0]
+ vmlal.s16 q3, d24, d0[1]
+ vmlal.s16 q3, d25, d0[2]
+ vmlal.s16 q3, d21, d0[3]
+ vext.8 d24, d22, d23, #2
+ vext.8 d25, d22, d23, #4
+ vext.8 d23, d22, d23, #6
+ vmull.s16 q10, d22, d0[0]
+ vmlal.s16 q10, d24, d0[1]
+ vmlal.s16 q10, d25, d0[2]
+ vmlal.s16 q10, d23, d0[3]
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vrshl.s32 q10, q10, q14 // -(6-intermediate_bits)
+ vmovn.i32 d24, q3
+ vmovn.i32 d25, q10
+ bx lr
+
+80:
+160:
+320:
+ bgt 880f
+ add \my, \my, #2
+ vld1.8 {d0}, [\mx, :64]
+ vld1.32 {d2[]}, [\my]
+ sub \src, \src, #6
+ sub \src, \src, \s_strd
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+ mov \my, \h
+
+164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ vld1.16 {q11, q12}, [\src], \s_strd
+ vmull.s16 q2, d22, d0[0]
+ vmull.s16 q3, d23, d0[0]
+ vdup.32 q14, r12 // -(6-intermediate_bits)
+.irpc i, 1234567
+ vext.8 q10, q11, q12, #(2*\i)
+.if \i < 4
+ vmlal.s16 q2, d20, d0[\i]
+ vmlal.s16 q3, d21, d0[\i]
+.else
+ vmlal.s16 q2, d20, d1[\i - 4]
+ vmlal.s16 q3, d21, d1[\i - 4]
+.endif
+.endr
+ vrshl.s32 q2, q2, q14 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vmovn.i32 d16, q2
+ vmovn.i32 d17, q3
+
+ bl L(\type\()_8tap_filter_8)
+ vmov q9, q11
+ vmov q10, q12
+
+8:
+ bl L(\type\()_8tap_filter_8)
+ vmull.s16 q2, d16, d2[0]
+ vmull.s16 q3, d17, d2[0]
+ vmull.s16 q13, d18, d2[0]
+ vmull.s16 q14, d19, d2[0]
+.ifc \type, put
+ vdup.32 q8, r8 // -(6+intermediate_bits)
+.endif
+ vmlal.s16 q2, d18, d2[1]
+ vmlal.s16 q3, d19, d2[1]
+ vmlal.s16 q13, d20, d2[1]
+ vmlal.s16 q14, d21, d2[1]
+ vmlal.s16 q2, d20, d2[2]
+ vmlal.s16 q3, d21, d2[2]
+ vmlal.s16 q13, d22, d2[2]
+ vmlal.s16 q14, d23, d2[2]
+ vmlal.s16 q2, d22, d2[3]
+ vmlal.s16 q3, d23, d2[3]
+ vmlal.s16 q13, d24, d2[3]
+ vmlal.s16 q14, d25, d2[3]
+.ifc \type, put
+ vdup.16 q9, \bdmax // bitdepth_max
+ vrshl.s32 q2, q2, q8 // -(6+intermediate_bits)
+ vrshl.s32 q3, q3, q8 // -(6+intermediate_bits)
+ vrshl.s32 q13, q13, q8 // -(6+intermediate_bits)
+ vrshl.s32 q14, q14, q8 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q13
+ vqmovun.s32 d7, q14
+ vmin.u16 q2, q2, q15
+ vmin.u16 q3, q3, q15
+.else
+ vmov.i16 q9, #PREP_BIAS
+ vrshrn.i32 d4, q2, #6
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q13, #6
+ vrshrn.i32 d7, q14, #6
+ vsub.i16 q2, q2, q9 // PREP_BIAS
+ vsub.i16 q3, q3, q9 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ vst1.16 {q2}, [\dst, :128], \d_strd
+ vst1.16 {q3}, [\ds2, :128], \d_strd
+ ble 9f
+ vmov q8, q10
+ vmov q9, q11
+ vmov q10, q12
+ b 8b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #2
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 164b
+0:
+ pop {r4-r11,pc}
+
+880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+ vpush {q4-q7}
+ vld1.8 {d0}, [\mx, :64]
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #6
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+ mov \my, \h
+
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ vld1.16 {q11, q12}, [\src], \s_strd
+ vmull.s16 q2, d22, d0[0]
+ vmull.s16 q3, d23, d0[0]
+ vdup.32 q14, r12 // -(6-intermediate_bits)
+.irpc i, 1234567
+ vext.8 q10, q11, q12, #(2*\i)
+.if \i < 4
+ vmlal.s16 q2, d20, d0[\i]
+ vmlal.s16 q3, d21, d0[\i]
+.else
+ vmlal.s16 q2, d20, d1[\i - 4]
+ vmlal.s16 q3, d21, d1[\i - 4]
+.endif
+.endr
+ vrshl.s32 q2, q2, q14 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vmovn.i32 d8, q2
+ vmovn.i32 d9, q3
+
+ bl L(\type\()_8tap_filter_8)
+ vmov q5, q11
+ vmov q6, q12
+ bl L(\type\()_8tap_filter_8)
+ vmov q7, q11
+ vmov q8, q12
+ bl L(\type\()_8tap_filter_8)
+ vmov q9, q11
+ vmov q10, q12
+
+88:
+ bl L(\type\()_8tap_filter_8)
+ vmull.s16 q2, d8, d2[0]
+ vmull.s16 q3, d9, d2[0]
+ vmull.s16 q13, d10, d2[0]
+ vmull.s16 q14, d11, d2[0]
+.ifc \type, put
+ vdup.32 q4, r8 // -(6+intermediate_bits)
+.endif
+ vmlal.s16 q2, d10, d2[1]
+ vmlal.s16 q3, d11, d2[1]
+ vmlal.s16 q13, d12, d2[1]
+ vmlal.s16 q14, d13, d2[1]
+ vmlal.s16 q2, d12, d2[2]
+ vmlal.s16 q3, d13, d2[2]
+ vmlal.s16 q13, d14, d2[2]
+ vmlal.s16 q14, d15, d2[2]
+ vmlal.s16 q2, d14, d2[3]
+ vmlal.s16 q3, d15, d2[3]
+ vmlal.s16 q13, d16, d2[3]
+ vmlal.s16 q14, d17, d2[3]
+ vmlal.s16 q2, d16, d3[0]
+ vmlal.s16 q3, d17, d3[0]
+ vmlal.s16 q13, d18, d3[0]
+ vmlal.s16 q14, d19, d3[0]
+ vmlal.s16 q2, d18, d3[1]
+ vmlal.s16 q3, d19, d3[1]
+ vmlal.s16 q13, d20, d3[1]
+ vmlal.s16 q14, d21, d3[1]
+ vmlal.s16 q2, d20, d3[2]
+ vmlal.s16 q3, d21, d3[2]
+ vmlal.s16 q13, d22, d3[2]
+ vmlal.s16 q14, d23, d3[2]
+ vmlal.s16 q2, d22, d3[3]
+ vmlal.s16 q3, d23, d3[3]
+ vmlal.s16 q13, d24, d3[3]
+ vmlal.s16 q14, d25, d3[3]
+.ifc \type, put
+ vrshl.s32 q2, q2, q4 // -(6+intermediate_bits)
+ vrshl.s32 q3, q3, q4 // -(6+intermediate_bits)
+ vrshl.s32 q13, q13, q4 // -(6+intermediate_bits)
+ vrshl.s32 q14, q14, q4 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q13
+ vqmovun.s32 d7, q14
+ vmin.u16 q2, q2, q15
+ vmin.u16 q3, q3, q15
+.else
+ vmov.i16 q5, #PREP_BIAS
+ vrshrn.i32 d4, q2, #6
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q13, #6
+ vrshrn.i32 d7, q14, #6
+ vsub.i16 q2, q2, q5 // PREP_BIAS
+ vsub.i16 q3, q3, q5 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ vst1.16 {q2}, [\dst, :128], \d_strd
+ vst1.16 {q3}, [\ds2, :128], \d_strd
+ ble 9f
+ vmov q4, q6
+ vmov q5, q7
+ vmov q6, q8
+ vmov q7, q9
+ vmov q8, q10
+ vmov q9, q11
+ vmov q10, q12
+ b 88b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 168b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_filter_8):
+ vld1.16 {q13, q14}, [\sr2], \s_strd
+ vmull.s16 q2, d26, d0[0]
+ vmull.s16 q3, d27, d0[0]
+.irpc i, 1234567
+ vext.8 q12, q13, q14, #(2*\i)
+.if \i < 4
+ vmlal.s16 q2, d24, d0[\i]
+ vmlal.s16 q3, d25, d0[\i]
+.else
+ vmlal.s16 q2, d24, d1[\i - 4]
+ vmlal.s16 q3, d25, d1[\i - 4]
+.endif
+.endr
+ vdup.32 q12, r12 // -(6-intermediate_bits)
+ vld1.16 {q13, q14}, [\src], \s_strd
+ vrshl.s32 q2, q2, q12 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q12 // -(6-intermediate_bits)
+ vmovn.i32 d4, q2
+ vmovn.i32 d5, q3
+
+ vmull.s16 q3, d26, d0[0]
+ vmull.s16 q11, d27, d0[0]
+.irpc i, 1234567
+ vext.8 q12, q13, q14, #(2*\i)
+.if \i < 4
+ vmlal.s16 q3, d24, d0[\i]
+ vmlal.s16 q11, d25, d0[\i]
+.else
+ vmlal.s16 q3, d24, d1[\i - 4]
+ vmlal.s16 q11, d25, d1[\i - 4]
+.endif
+.endr
+ vdup.32 q13, r12 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q13 // -(6-intermediate_bits)
+ vrshl.s32 q11, q11, q13 // -(6-intermediate_bits)
+
+ vmovn.i32 d24, q3
+ vmovn.i32 d25, q11
+ vmov q11, q2
+ bx lr
+endfunc
+
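+// Bilinear filtering: each pass computes, per pixel, roughly
+//   px = src[x]*(16 - mx) + src[x + 1]*mx    (horizontal; vertical uses my)
+// followed by rounding shifts back to either pixel range (put) or to the
+// intermediate precision minus PREP_BIAS (prep); see the inline comments
+// for the exact shift amounts.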
+function \type\()_bilin_16bpc_neon, export=1
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+.ifc \bdmax, r8
+ ldr r8, [sp, #52]
+.endif
+ vdup.16 q1, \mx
+ vdup.16 q3, \my
+ rsb r9, \mx, #16
+ rsb r10, \my, #16
+ vdup.16 q0, r9
+ vdup.16 q2, r10
+.ifc \type, prep
+ lsl \d_strd, \w, #1
+.endif
+ clz \bdmax, \bdmax // bitdepth_max
+ clz r9, \w
+ sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
+ cmp \mx, #0
+ sub r9, r9, #24
+ rsb r11, \bdmax, #4 // 4 - intermediate_bits
+ add r12, \bdmax, #4 // 4 + intermediate_bits
+ bne L(\type\()_bilin_h)
+ cmp \my, #0
+ bne L(\type\()_bilin_v)
+ b \type\()_neon
+
+L(\type\()_bilin_h):
+ cmp \my, #0
+ bne L(\type\()_bilin_hv)
+
+ adr r10, L(\type\()_bilin_h_tbl)
+ vdup.16 q15, r11 // 4 - intermediate_bits
+ ldr r9, [r10, r9, lsl #2]
+ vneg.s16 q15, q15 // -(4-intermediate_bits)
+.ifc \type, put
+ vdup.16 q14, \bdmax // intermediate_bits
+.else
+ vmov.i16 q14, #PREP_BIAS
+.endif
+ add r10, r10, r9
+.ifc \type, put
+ vneg.s16 q14, q14 // -intermediate_bits
+.endif
+ bx r10
+
+ .align 2
+L(\type\()_bilin_h_tbl):
+ .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+
+20: // 2xN h
+.ifc \type, put
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+2:
+ vld1.16 {d16}, [\src], \s_strd
+ vld1.16 {d18}, [\sr2], \s_strd
+ vext.8 d17, d16, d16, #2
+ vext.8 d19, d18, d18, #2
+ vtrn.32 d16, d18
+ vtrn.32 d17, d19
+ subs \h, \h, #2
+ vmul.i16 d16, d16, d0
+ vmla.i16 d16, d17, d2
+ vrshl.u16 d16, d16, d30
+ vrshl.u16 d16, d16, d28
+ vst1.32 {d16[0]}, [\dst, :32], \d_strd
+ vst1.32 {d16[1]}, [\ds2, :32], \d_strd
+ bgt 2b
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+4:
+ vld1.16 {q8}, [\src], \s_strd
+ vld1.16 {q10}, [\sr2], \s_strd
+ vext.8 q9, q8, q8, #2
+ vext.8 q11, q10, q10, #2
+ vmov d17, d20
+ vmov d19, d22
+ subs \h, \h, #2
+ vmul.i16 q8, q8, q0
+ vmla.i16 q8, q9, q1
+ vrshl.u16 q8, q8, q15
+.ifc \type, put
+ vrshl.u16 q8, q8, q14
+.else
+ vsub.i16 q8, q8, q14
+.endif
+ vst1.16 {d16}, [\dst, :64], \d_strd
+ vst1.16 {d17}, [\ds2, :64], \d_strd
+ bgt 4b
+ pop {r4-r11,pc}
+
+80: // 8xN h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+8:
+ vld1.16 {d16, d17, d18}, [\src], \s_strd
+ vld1.16 {d20, d21, d22}, [\sr2], \s_strd
+ vext.8 q9, q8, q9, #2
+ vext.8 q11, q10, q11, #2
+ subs \h, \h, #2
+ vmul.i16 q8, q8, q0
+ vmla.i16 q8, q9, q1
+ vmul.i16 q10, q10, q0
+ vmla.i16 q10, q11, q1
+ vrshl.u16 q8, q8, q15
+ vrshl.u16 q10, q10, q15
+.ifc \type, put
+ vrshl.u16 q8, q8, q14
+ vrshl.u16 q10, q10, q14
+.else
+ vsub.i16 q8, q8, q14
+ vsub.i16 q10, q10, q14
+.endif
+ vst1.16 {q8}, [\dst, :128], \d_strd
+ vst1.16 {q10}, [\ds2, :128], \d_strd
+ bgt 8b
+ pop {r4-r11,pc}
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ vpush {q4-q7}
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+
+ sub \s_strd, \s_strd, \w, lsl #1
+ sub \s_strd, \s_strd, #16
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, lsl #1
+.endif
+161:
+ vld1.16 {q4}, [\src]!
+ vld1.16 {q9}, [\sr2]!
+ mov \mx, \w
+
+16:
+ vld1.16 {q5, q6}, [\src]!
+ vld1.16 {q10, q11}, [\sr2]!
+ vext.8 q7, q4, q5, #2
+ vext.8 q8, q5, q6, #2
+ vext.8 q12, q9, q10, #2
+ vext.8 q13, q10, q11, #2
+ vmul.i16 q4, q4, q0
+ vmla.i16 q4, q7, q1
+ vmul.i16 q5, q5, q0
+ vmla.i16 q5, q8, q1
+ vmul.i16 q9, q9, q0
+ vmla.i16 q9, q12, q1
+ vmul.i16 q10, q10, q0
+ vmla.i16 q10, q13, q1
+ vrshl.u16 q4, q4, q15
+ vrshl.u16 q5, q5, q15
+ vrshl.u16 q9, q9, q15
+ vrshl.u16 q10, q10, q15
+ subs \mx, \mx, #16
+.ifc \type, put
+ vrshl.u16 q4, q4, q14
+ vrshl.u16 q5, q5, q14
+ vrshl.u16 q9, q9, q14
+ vrshl.u16 q10, q10, q14
+.else
+ vsub.i16 q4, q4, q14
+ vsub.i16 q5, q5, q14
+ vsub.i16 q9, q9, q14
+ vsub.i16 q10, q10, q14
+.endif
+ vst1.16 {q4, q5}, [\dst, :128]!
+ vst1.16 {q9, q10}, [\ds2, :128]!
+ ble 9f
+
+ vmov q4, q6
+ vmov q9, q11
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ bgt 161b
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+
+L(\type\()_bilin_v):
+ cmp \h, #4
+ adr r10, L(\type\()_bilin_v_tbl)
+.ifc \type, prep
+ vdup.16 q15, r11 // 4 - intermediate_bits
+.endif
+ ldr r9, [r10, r9, lsl #2]
+.ifc \type, prep
+ vmov.i16 q14, #PREP_BIAS
+ vneg.s16 q15, q15 // -(4-intermediate_bits)
+.endif
+ add r10, r10, r9
+ bx r10
+
+ .align 2
+L(\type\()_bilin_v_tbl):
+ .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+
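+        // All width cases below blend two consecutive source rows; roughly,
+        // per output pixel (assuming q2/q3 hold the weights 16-my and my):
+        //   sum  = src[x]*(16-my) + src[x + stride]*my
+        //   put:  dst[x] = (sum + 8) >> 4
+        //   prep: dst[x] = rnd_shift(sum, 4-intermediate_bits) - PREP_BIAS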
+20: // 2xN v
+.ifc \type, put
+ cmp \h, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ // 2x2 v
+ vld1.32 {d16[]}, [\src], \s_strd
+ bgt 24f
+22:
+ vld1.32 {d17[]}, [\sr2], \s_strd
+ vld1.32 {d18[]}, [\src], \s_strd
+ vext.8 d16, d16, d17, #4
+ vext.8 d17, d17, d18, #4
+ vmul.i16 d16, d16, d4
+ vmla.i16 d16, d17, d6
+ vrshr.u16 d16, d16, #4
+ vst1.32 {d16[0]}, [\dst, :32]
+ vst1.32 {d16[1]}, [\ds2, :32]
+ pop {r4-r11,pc}
+24: // 2x4, 2x6, 2x8, ... v
+ vld1.32 {d17[]}, [\sr2], \s_strd
+ vld1.32 {d18[]}, [\src], \s_strd
+ vld1.32 {d19[]}, [\sr2], \s_strd
+ vld1.32 {d20[]}, [\src], \s_strd
+ subs \h, \h, #4
+ vext.8 d16, d16, d17, #4
+ vext.8 d17, d17, d18, #4
+ vext.8 d18, d18, d19, #4
+ vext.8 d19, d19, d20, #4
+ vswp d17, d18
+ vmul.i16 q8, q8, q2
+ vmla.i16 q8, q9, q3
+ cmp \h, #2
+ vrshr.u16 q8, q8, #4
+ vst1.32 {d16[0]}, [\dst, :32], \d_strd
+ vst1.32 {d16[1]}, [\ds2, :32], \d_strd
+ vst1.32 {d17[0]}, [\dst, :32], \d_strd
+ vst1.32 {d17[1]}, [\ds2, :32], \d_strd
+ blt 0f
+ vmov d16, d20
+ beq 22b
+ b 24b
+0:
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN v
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vld1.16 {d16}, [\src], \s_strd
+4:
+ vld1.16 {d17}, [\sr2], \s_strd
+ vld1.16 {d19}, [\src], \s_strd
+ vmov d18, d17
+ vmul.i16 q8, q8, q2
+ vmla.i16 q8, q9, q3
+ subs \h, \h, #2
+.ifc \type, put
+ vrshr.u16 q8, q8, #4
+.else
+ vrshl.u16 q8, q8, q15
+ vsub.i16 q8, q8, q14
+.endif
+ vst1.16 {d16}, [\dst, :64], \d_strd
+ vst1.16 {d17}, [\ds2, :64], \d_strd
+ ble 0f
+ vmov d16, d19
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+80: // 8xN v
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vld1.16 {q8}, [\src], \s_strd
+8:
+ vld1.16 {q9}, [\sr2], \s_strd
+ vld1.16 {q10}, [\src], \s_strd
+ vmul.i16 q8, q8, q2
+ vmla.i16 q8, q9, q3
+ vmul.i16 q9, q9, q2
+ vmla.i16 q9, q10, q3
+ subs \h, \h, #2
+.ifc \type, put
+ vrshr.u16 q8, q8, #4
+ vrshr.u16 q9, q9, #4
+.else
+ vrshl.u16 q8, q8, q15
+ vrshl.u16 q9, q9, q15
+ vsub.i16 q8, q8, q14
+ vsub.i16 q9, q9, q14
+.endif
+ vst1.16 {q8}, [\dst, :128], \d_strd
+ vst1.16 {q9}, [\ds2, :128], \d_strd
+ ble 0f
+ vmov q8, q10
+ b 8b
+0:
+ pop {r4-r11,pc}
+
+160: // 16xN, 32xN, ...
+320:
+640:
+1280:
+ mov \my, \h
+1:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.16 {q8, q9}, [\src], \s_strd
+2:
+ vld1.16 {q10, q11}, [\sr2], \s_strd
+ vld1.16 {q12, q13}, [\src], \s_strd
+ vmul.i16 q8, q8, q2
+ vmla.i16 q8, q10, q3
+ vmul.i16 q9, q9, q2
+ vmla.i16 q9, q11, q3
+ vmul.i16 q10, q10, q2
+ vmla.i16 q10, q12, q3
+ vmul.i16 q11, q11, q2
+ vmla.i16 q11, q13, q3
+ subs \h, \h, #2
+.ifc \type, put
+ vrshr.u16 q8, q8, #4
+ vrshr.u16 q9, q9, #4
+ vrshr.u16 q10, q10, #4
+ vrshr.u16 q11, q11, #4
+.else
+ vrshl.u16 q8, q8, q15
+ vrshl.u16 q9, q9, q15
+ vrshl.u16 q10, q10, q15
+ vrshl.u16 q11, q11, q15
+ vsub.i16 q8, q8, q14
+ vsub.i16 q9, q9, q14
+ vsub.i16 q10, q10, q14
+ vsub.i16 q11, q11, q14
+.endif
+ vst1.16 {q8, q9}, [\dst, :128], \d_strd
+ vst1.16 {q10, q11}, [\ds2, :128], \d_strd
+ ble 9f
+ vmov q8, q12
+ vmov q9, q13
+ b 2b
+9:
+ subs \w, \w, #16
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #32
+ add \dst, \dst, #32
+ b 1b
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_bilin_hv):
+ adr r10, L(\type\()_bilin_hv_tbl)
+ vdup.16 q15, r11 // 4 - intermediate_bits
+ ldr r9, [r10, r9, lsl #2]
+ vneg.s16 q15, q15 // -(4-intermediate_bits)
+.ifc \type, put
+ vdup.32 q14, r12 // 4 + intermediate_bits
+.else
+ vmov.i16 q14, #PREP_BIAS
+.endif
+ add r10, r10, r9
+.ifc \type, put
+ vneg.s32 q14, q14 // -(4+intermediate_bits)
+.endif
+ bx r10
+
+ .align 2
+L(\type\()_bilin_hv_tbl):
+ .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+
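+        // The hv cases filter each input row horizontally first (keeping
+        // intermediate precision), then blend vertically with widening 32-bit
+        // multiplies; roughly, per output pixel:
+        //   mid[y][x] = rnd_shift(src[x]*(16-mx) + src[x+1]*mx, 4-intermediate_bits)
+        //   sum       = mid[y][x]*(16-my) + mid[y+1][x]*my
+        //   put:  dst[x] = narrow(rnd_shift(sum, 4+intermediate_bits))
+        //   prep: dst[x] = narrow(rnd_shift(sum, 4)) - PREP_BIAS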
+20: // 2xN hv
+.ifc \type, put
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.16 {d20}, [\src], \s_strd
+ vext.8 d21, d20, d20, #2
+ vmul.i16 d16, d20, d0
+ vmla.i16 d16, d21, d2
+ vrshl.u16 d16, d16, d30
+ vext.8 d16, d16, d16, #4
+
+2:
+ vld1.16 {d20}, [\sr2], \s_strd
+ vld1.16 {d22}, [\src], \s_strd
+ vext.8 d21, d20, d20, #2
+ vext.8 d23, d22, d22, #2
+ vtrn.32 d20, d22
+ vtrn.32 d21, d23
+ vmul.i16 d18, d20, d0
+ vmla.i16 d18, d21, d2
+ vrshl.u16 d18, d18, d30
+
+ vext.8 d16, d16, d18, #4
+
+ vmull.u16 q8, d16, d4
+ vmlal.u16 q8, d18, d6
+ vrshl.u32 q8, q8, q14
+ vmovn.i32 d16, q8
+ subs \h, \h, #2
+ vst1.32 {d16[0]}, [\dst, :32], \d_strd
+ vst1.32 {d16[1]}, [\ds2, :32], \d_strd
+ ble 0f
+ vmov d16, d18
+ b 2b
+0:
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN hv
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.16 {q10}, [\src], \s_strd
+ vext.8 d21, d20, d21, #2
+ vmul.i16 d16, d20, d0
+ vmla.i16 d16, d21, d2
+ vrshl.u16 d16, d16, d30
+
+4:
+ vld1.16 {q10}, [\sr2], \s_strd
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d21, d20, d21, #2
+ vext.8 d23, d22, d23, #2
+ vswp d21, d22
+ vmul.i16 q9, q10, q0
+ vmla.i16 q9, q11, q1
+ vrshl.u16 q9, q9, q15
+
+ vmull.u16 q10, d16, d4
+ vmlal.u16 q10, d18, d6
+ vmull.u16 q11, d18, d4
+ vmlal.u16 q11, d19, d6
+.ifc \type, put
+ vrshl.u32 q10, q10, q14
+ vrshl.u32 q11, q11, q14
+ vmovn.i32 d20, q10
+ vmovn.i32 d21, q11
+.else
+ vrshrn.i32 d20, q10, #4
+ vrshrn.i32 d21, q11, #4
+ vsub.i16 q10, q10, q14
+.endif
+ subs \h, \h, #2
+ vst1.16 {d20}, [\dst, :64], \d_strd
+ vst1.16 {d21}, [\ds2, :64], \d_strd
+ ble 0f
+ vmov d16, d19
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+80: // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+ mov \my, \h
+
+1:
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.16 {d20, d21, d22}, [\src], \s_strd
+ vext.8 q11, q10, q11, #2
+ vmul.i16 q8, q10, q0
+ vmla.i16 q8, q11, q1
+ vrshl.u16 q8, q8, q15
+
+2:
+ vld1.16 {d20, d21, d22}, [\sr2], \s_strd
+ vld1.16 {d24, d25, d26}, [\src], \s_strd
+ vext.8 q11, q10, q11, #2
+ vext.8 q13, q12, q13, #2
+ vmul.i16 q9, q10, q0
+ vmla.i16 q9, q11, q1
+ vmul.i16 q10, q12, q0
+ vmla.i16 q10, q13, q1
+ vrshl.u16 q9, q9, q15
+ vrshl.u16 q10, q10, q15
+
+ vmull.u16 q11, d16, d4
+ vmlal.u16 q11, d18, d6
+ vmull.u16 q12, d17, d4
+ vmlal.u16 q12, d19, d6
+ vmull.u16 q8, d18, d4
+ vmlal.u16 q8, d20, d6
+ vmull.u16 q9, d19, d4
+ vmlal.u16 q9, d21, d6
+.ifc \type, put
+ vrshl.u32 q11, q11, q14
+ vrshl.u32 q12, q12, q14
+ vrshl.u32 q8, q8, q14
+ vrshl.u32 q9, q9, q14
+ vmovn.i32 d22, q11
+ vmovn.i32 d23, q12
+ vmovn.i32 d16, q8
+ vmovn.i32 d17, q9
+.else
+ vrshrn.i32 d22, q11, #4
+ vrshrn.i32 d23, q12, #4
+ vrshrn.i32 d16, q8, #4
+ vrshrn.i32 d17, q9, #4
+ vsub.i16 q11, q11, q14
+ vsub.i16 q8, q8, q14
+.endif
+ subs \h, \h, #2
+ vst1.16 {q11}, [\dst, :128], \d_strd
+ vst1.16 {q8}, [\ds2, :128], \d_strd
+ ble 9f
+ vmov q8, q10
+ b 2b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 1b
+0:
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
+filter_fn prep, r0, r8, r1, r2, r3, r4, r5, r6, r7, r9, r10
+
+.macro load_filter_ptr src
+ asr r12, \src, #10
+ add r12, r11, r12, lsl #3
+.endm
+
+.macro load_filter_coef dst, src, inc
+ add \src, \src, \inc
+ vld1.8 {\dst}, [r12, :64]
+.endm
+
+.macro load_filter_row dst, src, inc
+ load_filter_ptr \src
+ load_filter_coef \dst, \src, \inc
+.endm
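+
+// The macros above fetch the 8-tap warp filter for one output pixel: the
+// filter index is the integer part of the accumulated position (\src >> 10),
+// used to address an 8-byte coefficient row relative to r11 (which points
+// into the mc_warp_filter table), while \inc steps the position on to the
+// next pixel.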
+
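+// Helper for the warp functions below: filters one row of 8 pixels
+// horizontally. Each output pixel gets its own 8-tap filter, selected from
+// the x position accumulated in r5 (stepped by r7 per pixel, rewound and
+// advanced by r8 at the end of the row); the eight 32-bit sums are reduced
+// with pairwise adds and rounded by (7 - intermediate_bits), leaving the
+// results in q4/q5 for the caller to narrow.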
+function warp_filter_horz_neon
+ load_filter_ptr r5 // filter 0
+ vld1.16 {q6,q7}, [r2], r3
+
+ load_filter_coef d0, r5, r7 // filter 0
+ load_filter_row d2, r5, r7 // filter 1
+ vmovl.s8 q0, d0 // filter 0
+ vext.8 q3, q6, q7, #2*1 // filter 1 pixels
+ vmovl.s8 q1, d2 // filter 1
+
+ vmull.s16 q4, d12, d0 // filter 0 output (0-3)
+ vmull.s16 q5, d13, d1 // filter 0 output (4-7)
+
+ load_filter_ptr r5 // filter 2
+
+ vmull.s16 q2, d6, d2 // filter 1 output (0-3)
+ vmull.s16 q3, d7, d3 // filter 1 output (4-7)
+
+ load_filter_coef d0, r5, r7 // filter 2
+
+ vpadd.i32 d8, d8, d9 // half pixel 0 (2x32)
+ vpadd.i32 d9, d10, d11 // half pixel 0 (2x32)
+
+ load_filter_ptr r5 // filter 3
+
+ vpadd.i32 d4, d4, d5 // half pixel 1 (2x32)
+ vpadd.i32 d5, d6, d7 // half pixel 1 (2x32)
+
+ vmovl.s8 q0, d0 // filter 2
+ vext.8 q3, q6, q7, #2*2 // filter 2 pixels
+
+ vpadd.i32 d8, d8, d9 // pixel 0 (2x32)
+ vpadd.i32 d9, d4, d5 // pixel 1 (2x32)
+
+ load_filter_coef d2, r5, r7 // filter 3
+
+ vmull.s16 q2, d6, d0 // filter 2 output (0-3)
+ vmull.s16 q3, d7, d1 // filter 2 output (4-7)
+
+ load_filter_ptr r5 // filter 4
+
+ vpadd.i32 d8, d8, d9 // pixel 0,1
+
+ vpadd.i32 d9, d4, d5 // half pixel 2 (2x32)
+ vpadd.i32 d10, d6, d7 // half pixel 2 (2x32)
+
+ vmovl.s8 q1, d2 // filter 3
+ vext.8 q3, q6, q7, #2*3 // filter 3 pixels
+
+ load_filter_coef d0, r5, r7 // filter 4
+
+ vpadd.i32 d9, d9, d10 // pixel 2 (2x32)
+
+ vmull.s16 q2, d6, d2 // filter 3 output (0-3)
+ vmull.s16 q3, d7, d3 // filter 3 output (4-7)
+
+ vmovl.s8 q0, d0 // filter 4
+ load_filter_ptr r5 // filter 5
+
+ vpadd.i32 d10, d4, d5 // half pixel 3 (2x32)
+ vpadd.i32 d11, d6, d7 // half pixel 3 (2x32)
+
+ vext.8 q3, q6, q7, #2*4 // filter 4 pixels
+ load_filter_coef d2, r5, r7 // filter 5
+
+ vpadd.i32 d10, d10, d11 // pixel 3 (2x32)
+
+ vpadd.i32 d9, d9, d10 // pixel 2,3
+
+ vmull.s16 q2, d6, d0 // filter 4 output (0-3)
+ vmull.s16 q3, d7, d1 // filter 4 output (4-7)
+
+ vmovl.s8 q1, d2 // filter 5
+ load_filter_ptr r5 // filter 6
+
+ vpadd.i32 d10, d4, d5 // half pixel 4 (2x32)
+ vpadd.i32 d11, d6, d7 // half pixel 4 (2x32)
+
+ vext.8 q3, q6, q7, #2*5 // filter 5 pixels
+ load_filter_coef d0, r5, r7 // filter 6
+
+ vpadd.i32 d10, d10, d11 // pixel 4 (2x32)
+
+ vmull.s16 q2, d6, d2 // filter 5 output (0-3)
+ vmull.s16 q3, d7, d3 // filter 5 output (4-7)
+
+ vmovl.s8 q0, d0 // filter 6
+ load_filter_ptr r5 // filter 7
+
+ vpadd.i32 d4, d4, d5 // half pixel 5 (2x32)
+ vpadd.i32 d5, d6, d7 // half pixel 5 (2x32)
+
+ vext.8 q3, q6, q7, #2*6 // filter 6 pixels
+ load_filter_coef d2, r5, r7 // filter 7
+
+ vpadd.i32 d11, d4, d5 // pixel 5 (2x32)
+
+ vmull.s16 q2, d6, d0 // filter 6 output (0-3)
+ vmull.s16 q3, d7, d1 // filter 6 output (4-7)
+
+ vmovl.s8 q1, d2 // filter 7
+
+ vpadd.i32 d10, d10, d11 // pixel 4,5
+
+ vpadd.i32 d4, d4, d5 // half pixel 6 (2x32)
+ vpadd.i32 d5, d6, d7 // half pixel 6 (2x32)
+
+ vext.8 q3, q6, q7, #2*7 // filter 7 pixels
+
+ vpadd.i32 d11, d4, d5 // pixel 6 (2x32)
+
+ vmull.s16 q2, d6, d2 // filter 7 output (0-3)
+ vmull.s16 q3, d7, d3 // filter 7 output (4-7)
+
+ vld1.32 {d14[],d15[]}, [sp] // -(7 - intermediate_bits)
+
+ vpadd.i32 d4, d4, d5 // half pixel 7 (2x32)
+ vpadd.i32 d5, d6, d7 // half pixel 7 (2x32)
+
+ sub r5, r5, r7, lsl #3
+
+ vpadd.i32 d4, d4, d5 // pixel 7 (2x32)
+
+ add r5, r5, r8
+
+ vpadd.i32 d11, d11, d4 // pixel 6,7
+
+ vrshl.s32 q4, q4, q7 // -(7 - intermediate_bits)
+ vrshl.s32 q5, q5, q7 // -(7 - intermediate_bits)
+
+ bx lr
+endfunc
+
+// void dav1d_warp_affine_8x8_16bpc_neon(
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *const abcd, int mx, int my,
+// const int bitdepth_max)
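+//
+// Roughly: the horizontal pass (warp_filter_horz_neon) produces the 15
+// intermediate rows needed for an 8x8 output block, keeping a sliding window
+// of 8 rows as int16 in q8-q15. For each output row, eight vertical 8-tap
+// filters (one per column, loaded from the y position in r6 and transposed so
+// each d register holds one tap for all 8 columns) are applied; the 32-bit
+// sums are rounded by (7 + intermediate_bits) and clamped to bitdepth_max for
+// the put variant, or rounded by 7 and offset by -PREP_BIAS for the t (prep)
+// variant.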
+.macro warp t
+function warp_affine_8x8\t\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ sub sp, sp, #8
+
+ clz r7, r7
+ // intermediate_bits = clz(bitdepth_max) - 18
+.ifb \t
+ sub r8, r7, #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
+.endif
+ sub r7, r7, #25 // -(7 - intermediate_bits)
+.ifb \t
+ neg r8, r8 // -(7 + intermediate_bits)
+.endif
+ str r7, [sp] // spill -(7 - intermediate_bits) on stack
+.ifb \t
+ str r8, [sp, #4] // spill -(7 + intermediate_bits) on stack
+.endif
+
+ ldrd r8, r9, [r4]
+ sxth r7, r8
+ asr r8, r8, #16
+ asr r4, r9, #16
+ sxth r9, r9
+ mov r10, #8
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3
+ sub r2, r2, #6
+ movrel r11, X(mc_warp_filter), 64*8
+.ifnb \t
+ lsl r1, r1, #1
+.endif
+ add r5, r5, #512
+ add r6, r6, #512
+
+ bl warp_filter_horz_neon
+ vmovn.i32 d16, q4
+ vmovn.i32 d17, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d18, q4
+ vmovn.i32 d19, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d20, q4
+ vmovn.i32 d21, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d22, q4
+ vmovn.i32 d23, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d24, q4
+ vmovn.i32 d25, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d26, q4
+ vmovn.i32 d27, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d28, q4
+ vmovn.i32 d29, q5
+
+1:
+ bl warp_filter_horz_neon
+ vmovn.i32 d30, q4
+ vmovn.i32 d31, q5
+
+ load_filter_row d8, r6, r9
+ load_filter_row d9, r6, r9
+ load_filter_row d10, r6, r9
+ load_filter_row d11, r6, r9
+ load_filter_row d12, r6, r9
+ load_filter_row d13, r6, r9
+ load_filter_row d14, r6, r9
+ load_filter_row d15, r6, r9
+ transpose_8x8b q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15
+ vmovl.s8 q1, d8
+ vmovl.s8 q2, d9
+ vmovl.s8 q3, d10
+ vmovl.s8 q4, d11
+ vmovl.s8 q5, d12
+ vmovl.s8 q6, d13
+
+ sub r6, r6, r9, lsl #3
+
+ // This ordering of vmull/vmlal is highly beneficial for
+ // Cortex A8/A9/A53 here, but harmful for Cortex A7.
+ vmull.s16 q0, d16, d2
+ vmlal.s16 q0, d18, d4
+ vmlal.s16 q0, d20, d6
+ vmlal.s16 q0, d22, d8
+ vmlal.s16 q0, d24, d10
+ vmlal.s16 q0, d26, d12
+ vmull.s16 q1, d17, d3
+ vmlal.s16 q1, d19, d5
+ vmlal.s16 q1, d21, d7
+ vmlal.s16 q1, d23, d9
+ vmlal.s16 q1, d25, d11
+ vmlal.s16 q1, d27, d13
+
+ vmovl.s8 q2, d14
+ vmovl.s8 q3, d15
+
+ vmlal.s16 q0, d28, d4
+ vmlal.s16 q0, d30, d6
+ vmlal.s16 q1, d29, d5
+ vmlal.s16 q1, d31, d7
+
+.ifb \t
+ ldr lr, [sp, #4] // -(7 + intermediate_bits)
+ ldr r12, [sp, #120] // bitdepth_max
+ vdup.32 q2, lr // -(7 + intermediate_bits)
+ vdup.16 q3, r12 // bitdepth_max
+.endif
+
+ vmov q8, q9
+ vmov q9, q10
+.ifb \t
+ vrshl.s32 q0, q0, q2 // -(7 + intermediate_bits)
+ vrshl.s32 q1, q1, q2 // -(7 + intermediate_bits)
+.else
+ vrshrn.s32 d0, q0, #7
+ vrshrn.s32 d1, q1, #7
+ vmov.i16 q3, #PREP_BIAS
+.endif
+ vmov q10, q11
+.ifb \t
+ vqmovun.s32 d0, q0
+ vqmovun.s32 d1, q1
+.else
+ vsub.i16 q0, q0, q3 // PREP_BIAS
+.endif
+ vmov q11, q12
+ vmov q12, q13
+.ifb \t
+ vmin.u16 q0, q0, q3 // bitdepth_max
+.endif
+ vmov q13, q14
+ vmov q14, q15
+ subs r10, r10, #1
+ vst1.16 {q0}, [r0, :128], r1
+
+ add r6, r6, r4
+ bgt 1b
+
+ add sp, sp, #8
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+warp
+warp t
+
+// void dav1d_emu_edge_16bpc_neon(
+// const intptr_t bw, const intptr_t bh,
+// const intptr_t iw, const intptr_t ih,
+// const intptr_t x, const intptr_t y,
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *ref, const ptrdiff_t ref_stride)
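+//
+// Roughly: clamp (x, y) into the frame to find the nearest valid source
+// pixel, work out how far the block sticks out on each side (left/right/
+// top/bottom extensions), copy the valid center rows while replicating the
+// leftmost/rightmost pixel into the horizontal extensions, and finally
+// replicate the first/last written row into the top/bottom extensions.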
+function emu_edge_16bpc_neon, export=1
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+ ldrd r8, r9, [sp, #52]
+
+ // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ // ref += iclip(x, 0, iw - 1)
+ sub r12, r3, #1 // ih - 1
+ cmp r5, r3
+ sub lr, r2, #1 // iw - 1
+ it lt
+ movlt r12, r5 // min(y, ih - 1)
+ cmp r4, r2
+ bic r12, r12, r12, asr #31 // max(min(y, ih - 1), 0)
+ it lt
+ movlt lr, r4 // min(x, iw - 1)
+ bic lr, lr, lr, asr #31 // max(min(x, iw - 1), 0)
+ mla r8, r12, r9, r8 // ref += iclip() * stride
+ add r8, r8, lr, lsl #1 // ref += iclip()
+
+ // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ // top_ext = iclip(-y, 0, bh - 1)
+ add r10, r5, r1 // y + bh
+ neg r5, r5 // -y
+ sub r10, r10, r3 // y + bh - ih
+ sub r12, r1, #1 // bh - 1
+ cmp r10, r1
+ bic r5, r5, r5, asr #31 // max(-y, 0)
+ it ge
+ movge r10, r12 // min(y + bh - ih, bh-1)
+ cmp r5, r1
+ bic r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0)
+ it ge
+ movge r5, r12 // min(max(-y, 0), bh-1)
+
+ // right_ext = iclip(x + bw - iw, 0, bw - 1)
+ // left_ext = iclip(-x, 0, bw - 1)
+ add r11, r4, r0 // x + bw
+ neg r4, r4 // -x
+ sub r11, r11, r2 // x + bw - iw
+ sub lr, r0, #1 // bw - 1
+ cmp r11, r0
+ bic r4, r4, r4, asr #31 // max(-x, 0)
+ it ge
+ movge r11, lr // min(x + bw - iw, bw-1)
+ cmp r4, r0
+ bic r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0)
+ it ge
+ movge r4, lr // min(max(-x, 0), bw - 1)
+
+ // center_h = bh - top_ext - bottom_ext
+ // dst += top_ext * PXSTRIDE(dst_stride)
+ // center_w = bw - left_ext - right_ext
+ sub r1, r1, r5 // bh - top_ext
+ mla r6, r5, r7, r6
+ sub r2, r0, r4 // bw - left_ext
+ sub r1, r1, r10 // center_h = bh - top_ext - bottom_ext
+ sub r2, r2, r11 // center_w = bw - left_ext - right_ext
+
+ mov r0, r6 // backup of dst
+
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+ vld1.16 {d0[], d1[]}, [r8]
+ mov r12, r6 // out = dst
+ mov r3, r4
+ vmov q1, q0
+1:
+ subs r3, r3, #16
+ vst1.16 {q0, q1}, [r12, :128]!
+ bgt 1b
+.endif
+ mov lr, r8
+ add r12, r6, r4, lsl #1 // out = dst + left_ext
+ mov r3, r2
+1:
+ vld1.16 {q0, q1}, [lr]!
+ subs r3, r3, #32
+ vld1.16 {q2, q3}, [lr]!
+.if \need_left
+ vst1.16 {q0, q1}, [r12]!
+ vst1.16 {q2, q3}, [r12]!
+.else
+ vst1.16 {q0, q1}, [r12, :128]!
+ vst1.16 {q2, q3}, [r12, :128]!
+.endif
+ bgt 1b
+.if \need_right
+ add r3, r8, r2, lsl #1 // in + center_w
+ sub r3, r3, #2 // in + center_w - 1
+ add r12, r6, r4, lsl #1 // dst + left_ext
+ vld1.16 {d0[], d1[]}, [r3]
+ add r12, r12, r2, lsl #1 // out = dst + left_ext + center_w
+ mov r3, r11
+ vmov q1, q0
+1:
+ subs r3, r3, #16
+ vst1.16 {q0, q1}, [r12]!
+ bgt 1b
+.endif
+
+ subs r1, r1, #1 // center_h--
+ add r6, r6, r7
+ add r8, r8, r9
+ bgt 0b
+.endm
+
+ cmp r4, #0
+ beq 2f
+ // need_left
+ cmp r11, #0
+ beq 3f
+ // need_left + need_right
+ v_loop 1, 1
+ b 5f
+
+2:
+ // !need_left
+ cmp r11, #0
+ beq 4f
+ // !need_left + need_right
+ v_loop 0, 1
+ b 5f
+
+3:
+ // need_left + !need_right
+ v_loop 1, 0
+ b 5f
+
+4:
+ // !need_left + !need_right
+ v_loop 0, 0
+
+5:
+ cmp r10, #0
+ // Storing the original dst in r0 overwrote bw, recalculate it here
+ add r2, r2, r4 // center_w + left_ext
+ add r2, r2, r11 // bw = center_w + left_ext + right_ext
+
+ beq 3f
+ // need_bottom
+ sub r8, r6, r7 // ref = dst - stride
+ mov r4, r2
+ sub r12, r7, #32
+1:
+ vld1.16 {q0, q1}, [r8, :128]!
+ mov r3, r10
+ vld1.16 {q2, q3}, [r8, :128]!
+2:
+ vst1.16 {q0, q1}, [r6, :128]!
+ subs r3, r3, #1
+ vst1.16 {q2, q3}, [r6, :128], r12
+ bgt 2b
+ mls r6, r7, r10, r6 // dst -= bottom_ext * stride
+ subs r4, r4, #32 // bw -= 32
+ add r6, r6, #64 // dst += 32
+ bgt 1b
+
+3:
+ cmp r5, #0
+ beq 3f
+ // need_top
+ mls r6, r7, r5, r0 // dst = stored_dst - top_ext * stride
+ sub r12, r7, #32
+1:
+ vld1.16 {q0, q1}, [r0, :128]!
+ mov r3, r5
+ vld1.16 {q2, q3}, [r0, :128]!
+2:
+ vst1.16 {q0, q1}, [r6, :128]!
+ subs r3, r3, #1
+ vst1.16 {q2, q3}, [r6, :128], r12
+ bgt 2b
+ mls r6, r7, r5, r6 // dst -= top_ext * stride
+ subs r2, r2, #32 // bw -= 32
+ add r6, r6, #64 // dst += 32
+ bgt 1b
+
+3:
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/msac.S b/third_party/dav1d/src/arm/32/msac.S
new file mode 100644
index 0000000000..b06e109dda
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/msac.S
@@ -0,0 +1,575 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define BUF_POS 0
+#define BUF_END 4
+#define DIF 8
+#define RNG 12
+#define CNT 16
+#define ALLOW_UPDATE_CDF 20
+
+const coeffs
+ .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+ .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+endconst
+
+const bits, align=4
+ .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+ .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
+endconst
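+
+// coeffs is laid out so that, indexed as done below, element k reads as
+// EC_MIN_PROB * (n_symbols - k) (EC_MIN_PROB assumed to be 4, matching the
+// table's spacing); the trailing zeros cover the overread for small symbol
+// counts. bits holds one distinct bit per 16-bit lane and is used to turn a
+// vector of lane-wise compares into a scalar bitmask via vand + vpadd.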
+
+.macro vld1_align_n d0, q0, q1, src, n
+.if \n == 4
+ vld1.16 {\d0}, [\src, :64]
+.elseif \n == 8
+ vld1.16 {\q0}, [\src, :128]
+.else
+ vld1.16 {\q0, \q1}, [\src, :128]
+.endif
+.endm
+
+.macro vld1_n d0, q0, q1, src, n
+.if \n == 4
+ vld1.16 {\d0}, [\src]
+.elseif \n == 8
+ vld1.16 {\q0}, [\src]
+.else
+ vld1.16 {\q0, \q1}, [\src]
+.endif
+.endm
+
+.macro vst1_align_n d0, q0, q1, src, n
+.if \n == 4
+ vst1.16 {\d0}, [\src, :64]
+.elseif \n == 8
+ vst1.16 {\q0}, [\src, :128]
+.else
+ vst1.16 {\q0, \q1}, [\src, :128]
+.endif
+.endm
+
+.macro vst1_n d0, q0, q1, src, n
+.if \n == 4
+ vst1.16 {\d0}, [\src]
+.elseif \n == 8
+ vst1.16 {\q0}, [\src]
+.else
+ vst1.16 {\q0, \q1}, [\src]
+.endif
+.endm
+
+.macro vshr_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vshr.u16 \d0, \s0, \s3
+.else
+ vshr.u16 \d1, \s1, \s4
+.if \n == 16
+ vshr.u16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vadd.i16 \d0, \s0, \s3
+.else
+ vadd.i16 \d1, \s1, \s4
+.if \n == 16
+ vadd.i16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vsub_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vsub.i16 \d0, \s0, \s3
+.else
+ vsub.i16 \d1, \s1, \s4
+.if \n == 16
+ vsub.i16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vand_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vand \d0, \s0, \s3
+.else
+ vand \d1, \s1, \s4
+.if \n == 16
+ vand \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vcge_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vcge.u16 \d0, \s0, \s3
+.else
+ vcge.u16 \d1, \s1, \s4
+.if \n == 16
+ vcge.u16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vrhadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vrhadd.u16 \d0, \s0, \s3
+.else
+ vrhadd.u16 \d1, \s1, \s4
+.if \n == 16
+ vrhadd.u16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vshl_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vshl.s16 \d0, \s0, \s3
+.else
+ vshl.s16 \d1, \s1, \s4
+.if \n == 16
+ vshl.s16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vqdmulh_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vqdmulh.s16 \d0, \s0, \s3
+.else
+ vqdmulh.s16 \d1, \s1, \s4
+.if \n == 16
+ vqdmulh.s16 \d2, \s2, \s5
+.endif
+.endif
+.endm
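+
+// The *_n macros above are width-generic wrappers: with \n == 4 they operate
+// on a single d register, with \n == 8 on one q register and with \n == 16 on
+// two q registers, so the decode_update macro below can be expanded for the
+// 4-, 8- and 16-symbol variants from the same source.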
+
+// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+// size_t n_symbols);
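+//
+// Roughly, in C terms:
+//   v[k] = ((rng >> 8) * (cdf[k] >> EC_PROB_SHIFT) >> 1)
+//          + EC_MIN_PROB * (n_symbols - k)
+// is evaluated for all symbols in parallel, ret is the first k for which
+// (dif >> (EC_WIN_SIZE - 16)) >= v[k] (recovered from the compare mask via
+// the bits table, vpadd, rbit and clz), the CDF is then optionally adapted
+// towards the decoded symbol, and u/v for the chosen k are spilled to the
+// stack so that L(renorm) can renormalize rng, dif and cnt.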
+
+function msac_decode_symbol_adapt4_neon, export=1
+.macro decode_update n
+ push {r4-r10,lr}
+ sub sp, sp, #48
+ add r8, r0, #RNG
+
+ vld1_align_n d0, q0, q1, r1, \n // cdf
+ vld1.16 {d16[]}, [r8, :16] // rng
+ movrel_local r9, coeffs, 30
+ vmov.i16 d30, #0x7f00 // 0x7f00
+ sub r9, r9, r2, lsl #1
+ vmvn.i16 q14, #0x3f // 0xffc0
+ add r8, sp, #14
+ vand d22, d16, d30 // rng & 0x7f00
+ vst1.16 {d16[0]}, [r8, :16] // store original u = s->rng
+ vand_n d4, q2, q3, d0, q0, q1, d28, q14, q14, \n // cdf & 0xffc0
+.if \n > 4
+ vmov d23, d22
+.endif
+
+ vld1_n d16, q8, q9, r9, \n // EC_MIN_PROB * (n_symbols - ret)
+ vqdmulh_n d20, q10, q11, d4, q2, q3, d22, q11, q11, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+ add r8, r0, #DIF + 2
+
+ vadd_n d16, q8, q9, d4, q2, q3, d16, q8, q9, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+.if \n == 4
+ vmov.i16 d17, #0
+.endif
+ vadd_n d16, q8, q9, d20, q10, q11, d16, q8, q9, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+
+ add r9, sp, #16
+ vld1.16 {d20[]}, [r8, :16] // dif >> (EC_WIN_SIZE - 16)
+ movrel_local r8, bits
+ vst1_n q8, q8, q9, r9, \n // store v values to allow indexed access
+
+ vmov d21, d20
+ vld1_align_n q12, q12, q13, r8, \n
+.if \n == 16
+ vmov q11, q10
+.endif
+
+ vcge_n q2, q2, q3, q10, q10, q11, q8, q8, q9, \n // c >= v
+
+ vand_n q10, q10, q11, q2, q2, q3, q12, q12, q13, \n // One bit per halfword set in the mask
+.if \n == 16
+ vadd.i16 q10, q10, q11
+.endif
+ vadd.i16 d20, d20, d21 // Aggregate mask bits
+ ldr r4, [r0, #ALLOW_UPDATE_CDF]
+ vpadd.i16 d20, d20, d20
+ lsl r10, r2, #1
+ vpadd.i16 d20, d20, d20
+ vmov.u16 r3, d20[0]
+ cmp r4, #0
+ rbit r3, r3
+ clz lr, r3 // ret
+
+ beq L(renorm)
+ // update_cdf
+ ldrh r3, [r1, r10] // count = cdf[n_symbols]
+ vmov.i8 q10, #0xff
+.if \n == 16
+ mov r4, #-5
+.else
+ mvn r12, r2
+ mov r4, #-4
+ cmn r12, #3 // set C if n_symbols <= 2
+.endif
+ vrhadd_n d16, q8, q9, d20, q10, q10, d4, q2, q3, \n // i >= val ? -1 : 32768
+.if \n == 16
+ sub r4, r4, r3, lsr #4 // -((count >> 4) + 5)
+.else
+ lsr r12, r3, #4 // count >> 4
+ sbc r4, r4, r12 // -((count >> 4) + (n_symbols > 2) + 4)
+.endif
+ vsub_n d16, q8, q9, d16, q8, q9, d0, q0, q1, \n // (32768 - cdf[i]) or (-1 - cdf[i])
+.if \n == 4
+ vdup.16 d20, r4 // -rate
+.else
+ vdup.16 q10, r4 // -rate
+.endif
+
+ sub r3, r3, r3, lsr #5 // count - (count == 32)
+ vsub_n d0, q0, q1, d0, q0, q1, d4, q2, q3, \n // cdf + (i >= val ? 1 : 0)
+ vshl_n d16, q8, q9, d16, q8, q9, d20, q10, q10, \n // ({32768,-1} - cdf[i]) >> rate
+ add r3, r3, #1 // count + (count < 32)
+ vadd_n d0, q0, q1, d0, q0, q1, d16, q8, q9, \n // cdf + (32768 - cdf[i]) >> rate
+ vst1_align_n d0, q0, q1, r1, \n
+ strh r3, [r1, r10]
+.endm
+
+ decode_update 4
+
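+// Renormalization, shared with the other decode entry points below:
+// rng = u - v for the decoded symbol, rng and dif are shifted left by
+// d = clz(rng) ^ 16 and cnt is reduced by d; if cnt goes negative, up to
+// 4 bytes are refilled into dif from the buffer, with a byte-wise fallback
+// near the end of the buffer (refill_eob).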
+L(renorm):
+ add r8, sp, #16
+ add r8, r8, lr, lsl #1
+ ldrh r3, [r8] // v
+ ldrh r4, [r8, #-2] // u
+ ldr r6, [r0, #CNT]
+ ldr r7, [r0, #DIF]
+ sub r4, r4, r3 // rng = u - v
+ clz r5, r4 // clz(rng)
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mvn r7, r7 // ~dif
+ add r7, r7, r3, lsl #16 // ~dif + (v << 16)
+L(renorm2):
+ lsl r4, r4, r5 // rng << d
+ subs r6, r6, r5 // cnt -= d
+ lsl r7, r7, r5 // (~dif + (v << 16)) << d
+ str r4, [r0, #RNG]
+ mvn r7, r7 // ~dif
+ bhs 9f
+
+ // refill
+ ldr r3, [r0, #BUF_POS] // BUF_POS
+ ldr r4, [r0, #BUF_END] // BUF_END
+ add r5, r3, #4
+ cmp r5, r4
+ bgt 2f
+
+ ldr r3, [r3] // next_bits
+ add r8, r6, #23 // shift_bits = cnt + 23
+ add r6, r6, #16 // cnt += 16
+ rev r3, r3 // next_bits = bswap(next_bits)
+ sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3
+ and r8, r8, #24 // shift_bits &= 24
+ lsr r3, r3, r8 // next_bits >>= shift_bits
+ sub r8, r8, r6 // shift_bits -= 16 + cnt
+ str r5, [r0, #BUF_POS]
+ lsl r3, r3, r8 // next_bits <<= shift_bits
+ rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits
+ eor r7, r7, r3 // dif ^= next_bits
+ b 9f
+
+2: // refill_eob
+ rsb r5, r6, #8 // c = 8 - cnt
+3:
+ cmp r3, r4
+ bge 4f
+ ldrb r8, [r3], #1
+ lsl r8, r8, r5
+ eor r7, r7, r8
+ subs r5, r5, #8
+ bge 3b
+
+4: // refill_eob_end
+ str r3, [r0, #BUF_POS]
+ rsb r6, r5, #8 // cnt = 8 - c
+
+9:
+ str r6, [r0, #CNT]
+ str r7, [r0, #DIF]
+
+ mov r0, lr
+ add sp, sp, #48
+
+ pop {r4-r10,pc}
+endfunc
+
+function msac_decode_symbol_adapt8_neon, export=1
+ decode_update 8
+ b L(renorm)
+endfunc
+
+function msac_decode_symbol_adapt16_neon, export=1
+ decode_update 16
+ b L(renorm)
+endfunc
+
+function msac_decode_hi_tok_neon, export=1
+ push {r4-r10,lr}
+ vld1.16 {d0}, [r1, :64] // cdf
+ add r4, r0, #RNG
+ vmov.i16 d31, #0x7f00 // 0x7f00
+ movrel_local r5, coeffs, 30-2*3
+ vmvn.i16 d30, #0x3f // 0xffc0
+ ldrh r9, [r1, #6] // count = cdf[n_symbols]
+ vld1.16 {d1[]}, [r4, :16] // rng
+ movrel_local r4, bits
+ vld1.16 {d29}, [r5] // EC_MIN_PROB * (n_symbols - ret)
+ add r5, r0, #DIF + 2
+ vld1.16 {q8}, [r4, :128]
+ mov r2, #-24
+ vand d20, d0, d30 // cdf & 0xffc0
+ ldr r10, [r0, #ALLOW_UPDATE_CDF]
+ vld1.16 {d2[]}, [r5, :16] // dif >> (EC_WIN_SIZE - 16)
+ sub sp, sp, #48
+ ldr r6, [r0, #CNT]
+ ldr r7, [r0, #DIF]
+ vmov d3, d2
+1:
+ vand d23, d1, d31 // rng & 0x7f00
+ vqdmulh.s16 d18, d20, d23 // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+ add r12, sp, #14
+ vadd.i16 d6, d20, d29 // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+ vadd.i16 d6, d18, d6 // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+ vmov.i16 d7, #0
+ vst1.16 {d1[0]}, [r12, :16] // store original u = s->rng
+ add r12, sp, #16
+ vcge.u16 q2, q1, q3 // c >= v
+ vst1.16 {q3}, [r12] // store v values to allow indexed access
+ vand q9, q2, q8 // One bit per halfword set in the mask
+
+ vadd.i16 d18, d18, d19 // Aggregate mask bits
+ vpadd.i16 d18, d18, d18
+ vpadd.i16 d18, d18, d18
+ vmov.u16 r3, d18[0]
+ cmp r10, #0
+ add r2, r2, #5
+ rbit r3, r3
+ add r8, sp, #16
+ clz lr, r3 // ret
+
+ beq 2f
+ // update_cdf
+ vmov.i8 d22, #0xff
+ mov r4, #-5
+ vrhadd.u16 d6, d22, d4 // i >= val ? -1 : 32768
+ sub r4, r4, r9, lsr #4 // -((count >> 4) + 5)
+ vsub.i16 d6, d6, d0 // (32768 - cdf[i]) or (-1 - cdf[i])
+ vdup.16 d18, r4 // -rate
+
+ sub r9, r9, r9, lsr #5 // count - (count == 32)
+ vsub.i16 d0, d0, d4 // cdf + (i >= val ? 1 : 0)
+ vshl.s16 d6, d6, d18 // ({32768,-1} - cdf[i]) >> rate
+ add r9, r9, #1 // count + (count < 32)
+ vadd.i16 d0, d0, d6 // cdf + (32768 - cdf[i]) >> rate
+ vst1.16 {d0}, [r1, :64]
+ vand d20, d0, d30 // cdf & 0xffc0
+ strh r9, [r1, #6]
+
+2:
+ add r8, r8, lr, lsl #1
+ ldrh r3, [r8] // v
+ ldrh r4, [r8, #-2] // u
+ sub r4, r4, r3 // rng = u - v
+ clz r5, r4 // clz(rng)
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mvn r7, r7 // ~dif
+ add r7, r7, r3, lsl #16 // ~dif + (v << 16)
+ lsl r4, r4, r5 // rng << d
+ subs r6, r6, r5 // cnt -= d
+ lsl r7, r7, r5 // (~dif + (v << 16)) << d
+ str r4, [r0, #RNG]
+ vdup.16 d1, r4
+ mvn r7, r7 // ~dif
+ bhs 9f
+
+ // refill
+ ldr r3, [r0, #BUF_POS] // BUF_POS
+ ldr r4, [r0, #BUF_END] // BUF_END
+ add r5, r3, #4
+ cmp r5, r4
+ bgt 2f
+
+ ldr r3, [r3] // next_bits
+ add r8, r6, #23 // shift_bits = cnt + 23
+ add r6, r6, #16 // cnt += 16
+ rev r3, r3 // next_bits = bswap(next_bits)
+ sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3
+ and r8, r8, #24 // shift_bits &= 24
+ lsr r3, r3, r8 // next_bits >>= shift_bits
+ sub r8, r8, r6 // shift_bits -= 16 + cnt
+ str r5, [r0, #BUF_POS]
+ lsl r3, r3, r8 // next_bits <<= shift_bits
+ rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits
+ eor r7, r7, r3 // dif ^= next_bits
+ b 9f
+
+2: // refill_eob
+        rsb             r5,  r6,  #8   // c = 8 - cnt
+3:
+ cmp r3, r4
+ bge 4f
+ ldrb r8, [r3], #1
+ lsl r8, r8, r5
+ eor r7, r7, r8
+ subs r5, r5, #8
+ bge 3b
+
+4: // refill_eob_end
+ str r3, [r0, #BUF_POS]
+        rsb             r6,  r5,  #8   // cnt = 8 - c
+
+9:
+ lsl lr, lr, #1
+ sub lr, lr, #5
+ lsr r12, r7, #16
+ adds r2, r2, lr // carry = tok_br < 3 || tok == 15
+ vdup.16 q1, r12
+ bcc 1b // loop if !carry
+ add r2, r2, #30
+ str r6, [r0, #CNT]
+ add sp, sp, #48
+ str r7, [r0, #DIF]
+ lsr r0, r2, #1
+ pop {r4-r10,pc}
+endfunc
+
+function msac_decode_bool_equi_neon, export=1
+ push {r4-r10,lr}
+ ldr r5, [r0, #RNG]
+ ldr r6, [r0, #CNT]
+ sub sp, sp, #48
+ ldr r7, [r0, #DIF]
+ bic r4, r5, #0xff // r &= 0xff00
+ add r4, r4, #8
+ mov r2, #0
+ subs r8, r7, r4, lsl #15 // dif - vw
+ lsr r4, r4, #1 // v
+ sub r5, r5, r4 // r - v
+ itee lo
+ movlo r2, #1
+ movhs r4, r5 // if (ret) v = r - v;
+ movhs r7, r8 // if (ret) dif = dif - vw;
+
+ clz r5, r4 // clz(rng)
+ mvn r7, r7 // ~dif
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mov lr, r2
+ b L(renorm2)
+endfunc
+
+function msac_decode_bool_neon, export=1
+ push {r4-r10,lr}
+ ldr r5, [r0, #RNG]
+ ldr r6, [r0, #CNT]
+ sub sp, sp, #48
+ ldr r7, [r0, #DIF]
+ lsr r4, r5, #8 // r >> 8
+ bic r1, r1, #0x3f // f &= ~63
+ mul r4, r4, r1
+ mov r2, #0
+ lsr r4, r4, #7
+ add r4, r4, #4 // v
+ subs r8, r7, r4, lsl #16 // dif - vw
+ sub r5, r5, r4 // r - v
+ itee lo
+ movlo r2, #1
+ movhs r4, r5 // if (ret) v = r - v;
+ movhs r7, r8 // if (ret) dif = dif - vw;
+
+ clz r5, r4 // clz(rng)
+ mvn r7, r7 // ~dif
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mov lr, r2
+ b L(renorm2)
+endfunc
+
+function msac_decode_bool_adapt_neon, export=1
+ push {r4-r10,lr}
+ ldr r9, [r1] // cdf[0-1]
+ ldr r5, [r0, #RNG]
+ movw lr, #0xffc0
+ ldr r6, [r0, #CNT]
+ sub sp, sp, #48
+ ldr r7, [r0, #DIF]
+ lsr r4, r5, #8 // r >> 8
+ and r2, r9, lr // f &= ~63
+ mul r4, r4, r2
+ mov r2, #0
+ lsr r4, r4, #7
+ add r4, r4, #4 // v
+ subs r8, r7, r4, lsl #16 // dif - vw
+ sub r5, r5, r4 // r - v
+ ldr r10, [r0, #ALLOW_UPDATE_CDF]
+ itee lo
+ movlo r2, #1
+ movhs r4, r5 // if (ret) v = r - v;
+ movhs r7, r8 // if (ret) dif = dif - vw;
+
+ cmp r10, #0
+ clz r5, r4 // clz(rng)
+ mvn r7, r7 // ~dif
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mov lr, r2
+
+ beq L(renorm2)
+
+ lsr r2, r9, #16 // count = cdf[1]
+ uxth r9, r9 // cdf[0]
+
+ sub r3, r2, r2, lsr #5 // count - (count >= 32)
+ lsr r2, r2, #4 // count >> 4
+ add r10, r3, #1 // count + (count < 32)
+ add r2, r2, #4 // rate = (count >> 4) | 4
+
+ sub r9, r9, lr // cdf[0] -= bit
+ sub r3, r9, lr, lsl #15 // {cdf[0], cdf[0] - 32769}
+ asr r3, r3, r2 // {cdf[0], cdf[0] - 32769} >> rate
+ sub r9, r9, r3 // cdf[0]
+
+ strh r9, [r1]
+ strh r10, [r1, #2]
+
+ b L(renorm2)
+endfunc
diff --git a/third_party/dav1d/src/arm/32/refmvs.S b/third_party/dav1d/src/arm/32/refmvs.S
new file mode 100644
index 0000000000..7f31db11eb
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/refmvs.S
@@ -0,0 +1,303 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
+// int bx4, int bw4, int bh4)
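+//
+// The 12-byte refmvs_block from r1 is expanded into q0-q3 as a repeating
+// 12-byte pattern (48 bytes plus q3 = q0 continuing the cycle), so each row
+// can be written with whole 16/32-byte stores; the jump table picks the
+// store sequence matching the block width (bw4).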
+
+function splat_mv_neon, export=1
+ push {r4, lr}
+ vld1.8 {q3}, [r1]
+ ldr r4, [sp, #8]
+ clz r3, r3
+ adr lr, L(splat_tbl)
+ sub r3, r3, #26
+ vext.8 q2, q3, q3, #12
+ ldr r3, [lr, r3, lsl #2]
+ add r2, r2, r2, lsl #1
+ vext.8 q0, q2, q3, #4
+ add r3, lr, r3
+ vext.8 q1, q2, q3, #8
+ lsl r2, r2, #2
+ vext.8 q2, q2, q3, #12
+ vmov q3, q0
+1:
+ ldr r1, [r0], #4
+ subs r4, r4, #1
+ add r1, r1, r2
+ bx r3
+
+ .align 2
+L(splat_tbl):
+ .word 320f - L(splat_tbl) + CONFIG_THUMB
+ .word 160f - L(splat_tbl) + CONFIG_THUMB
+ .word 80f - L(splat_tbl) + CONFIG_THUMB
+ .word 40f - L(splat_tbl) + CONFIG_THUMB
+ .word 20f - L(splat_tbl) + CONFIG_THUMB
+ .word 10f - L(splat_tbl) + CONFIG_THUMB
+
+10:
+ vst1.8 {d0}, [r1]
+ vstr s2, [r1, #8]
+ bgt 1b
+ pop {r4, pc}
+20:
+ vst1.8 {q0}, [r1]
+ vstr d2, [r1, #16]
+ bgt 1b
+ pop {r4, pc}
+40:
+ vst1.8 {q0, q1}, [r1]!
+ vst1.8 {q2}, [r1]
+ bgt 1b
+ pop {r4, pc}
+320:
+ vst1.8 {q0, q1}, [r1]!
+ vst1.8 {q2, q3}, [r1]!
+ vst1.8 {q1, q2}, [r1]!
+ vst1.8 {q0, q1}, [r1]!
+ vst1.8 {q2, q3}, [r1]!
+ vst1.8 {q1, q2}, [r1]!
+160:
+ vst1.8 {q0, q1}, [r1]!
+ vst1.8 {q2, q3}, [r1]!
+ vst1.8 {q1, q2}, [r1]!
+80:
+ vst1.8 {q0, q1}, [r1]!
+ vst1.8 {q2, q3}, [r1]!
+ vst1.8 {q1, q2}, [r1]
+ bgt 1b
+ pop {r4, pc}
+endfunc
+
+const mv_tbls, align=4
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+ .byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
+ .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
+ .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
+endconst
+
+const mask_mult, align=4
+ .byte 1, 2, 1, 2, 0, 0, 0, 0
+endconst
+
+// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
+// refmvs_block **rr, const uint8_t *ref_sign,
+// int col_end8, int row_end8,
+// int col_start8, int row_start8)
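+//
+// Roughly: for each 8-unit row, walk the row's candidate blocks and convert
+// each one into 5-byte refmvs_temporal_block entries. A permutation table
+// (mv_tbls) selected per candidate decides whether mv[1], mv[0] or an empty
+// entry is stored (mv[1] preferred), based on ref_sign[ref] and on
+// |mv.x|, |mv.y| < 4096, and the per-block-size jump table then stores
+// 1-16 output entries at a time.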
+function save_tmvs_neon, export=1
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+
+ vmov.i8 d30, #0
+ vld1.8 {d31}, [r3]
+ adr r8, L(save_tmvs_tbl)
+ movrel_local lr, mask_mult
+ movrel_local r12, mv_tbls
+ vld1.8 {d29}, [lr]
+ vext.8 d31, d30, d31, #7 // [0, ref_sign]
+ mov r3, #5
+ mul r1, r1, r3 // stride *= 5
+ sub r5, r5, r7 // h = row_end8 - row_start8
+ lsl r7, r7, #1 // row_start8 <<= 1
+1:
+ mov r3, #5
+ mov r11, #12*2
+ and r9, r7, #30 // (y & 15) * 2
+ ldr r9, [r2, r9, lsl #2] // b = rr[(y & 15) * 2]
+ add r9, r9, #12 // &b[... + 1]
+ mla r10, r4, r11, r9 // end_cand_b = &b[col_end8*2 + 1]
+ mla r9, r6, r11, r9 // cand_b = &b[x*2 + 1]
+
+ mla r3, r6, r3, r0 // &rp[x]
+
+ push {r2,r4,r6}
+
+2:
+ ldrb r11, [r9, #10] // cand_b->bs
+ add lr, r9, #8
+ vld1.8 {d0, d1}, [r9] // cand_b->mv
+ add r11, r8, r11, lsl #3
+ vld1.16 {d2[]}, [lr] // cand_b->ref
+ ldrh lr, [r11] // bw8
+ mov r2, r8
+ add r9, r9, lr, lsl #1 // cand_b += bw8*2
+ cmp r9, r10
+ vmov d4, d0
+ bge 3f
+
+ ldrb r2, [r9, #10] // cand_b->bs
+ add lr, r9, #8
+ vld1.8 {d6, d7}, [r9] // cand_b->mv
+ add r2, r8, r2, lsl #3
+ vld1.16 {d2[1]}, [lr] // cand_b->ref
+ ldrh lr, [r2] // bw8
+ add r9, r9, lr, lsl #1 // cand_b += bw8*2
+ vmov d5, d6
+
+3:
+ vabs.s16 q2, q2 // abs(mv[].xy)
+ vtbl.8 d2, {d31}, d2 // ref_sign[ref]
+ vshr.u16 q2, q2, #12 // abs(mv[].xy) >> 12
+ vmull.u8 q1, d2, d29 // ref_sign[ref] * {1, 2}
+        vceq.i32        q2,  q2,  #0   // abs(mv[].xy) < 4096
+ vmovn.i32 d4, q2 // abs() condition to 16 bit
+ vand d2, d2, d4 // h[0-3] contains conditions for mv[0-1]
+ vpadd.i16 d2, d2, d2 // Combine condition for [1] and [0]
+ vmov.u16 r4, d2[0] // Extract case for first block
+ vmov.u16 r6, d2[1]
+ ldr r11, [r11, #4] // Fetch jump table entry
+ ldr r2, [r2, #4]
+ add r4, r12, r4, lsl #4
+ add r6, r12, r6, lsl #4
+        vld1.8          {d2, d3},  [r4]  // Load permutation table based on case
+ vld1.8 {d4, d5}, [r6]
+ add r11, r8, r11 // Find jump table target
+ add r2, r8, r2
+ vtbl.8 d16, {d0, d1}, d2 // Permute cand_b to output refmvs_temporal_block
+ vtbl.8 d17, {d0, d1}, d3
+ vtbl.8 d18, {d6, d7}, d4
+ vtbl.8 d19, {d6, d7}, d5
+ vmov q0, q8
+
+ // q1 follows on q0 (q8), with another 3 full repetitions of the pattern.
+ vext.8 q1, q8, q8, #1
+ vext.8 q10, q9, q9, #1
+ // q2 ends with 3 complete repetitions of the pattern.
+ vext.8 q2, q8, q1, #4
+ vext.8 q11, q9, q10, #4
+
+ blx r11
+ bge 4f // if (cand_b >= end)
+ vmov q0, q9
+ vmov q1, q10
+ vmov q2, q11
+ cmp r9, r10
+ blx r2
+ blt 2b // if (cand_b < end)
+
+4:
+ pop {r2,r4,r6}
+
+ subs r5, r5, #1 // h--
+ add r7, r7, #2 // y += 2
+ add r0, r0, r1 // rp += stride
+ bgt 1b
+
+ pop {r4-r11,pc}
+
+ .align 2
+L(save_tmvs_tbl):
+ .word 16 * 12
+ .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 16 * 12
+ .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 8 * 12
+ .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 8 * 12
+ .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 8 * 12
+ .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 8 * 12
+ .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 4 * 12
+ .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 4 * 12
+ .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 4 * 12
+ .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 4 * 12
+ .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 2 * 12
+ .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 2 * 12
+ .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 2 * 12
+ .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 2 * 12
+ .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 2 * 12
+ .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 1 * 12
+ .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 1 * 12
+ .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 1 * 12
+ .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 1 * 12
+ .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 1 * 12
+ .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 1 * 12
+ .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 1 * 12
+ .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+
+10:
+ add r4, r3, #4
+ vst1.32 {d0[0]}, [r3]
+ vst1.8 {d0[4]}, [r4]
+ add r3, r3, #5
+ bx lr
+20:
+ add r4, r3, #8
+ vst1.8 {d0}, [r3]
+ vst1.16 {d1[0]}, [r4]
+ add r3, r3, #2*5
+ bx lr
+40:
+ add r4, r3, #16
+ vst1.8 {q0}, [r3]
+ vst1.32 {d2[0]}, [r4]
+ add r3, r3, #4*5
+ bx lr
+80:
+ add r4, r3, #(8*5-16)
+ // This writes 6 full entries plus 2 extra bytes
+ vst1.8 {q0, q1}, [r3]
+ // Write the last few, overlapping with the first write.
+ vst1.8 {q2}, [r4]
+ add r3, r3, #8*5
+ bx lr
+160:
+ add r4, r3, #6*5
+ add r6, r3, #12*5
+ // This writes 6 full entries plus 2 extra bytes
+ vst1.8 {q0, q1}, [r3]
+ // Write another 6 full entries, slightly overlapping with the first set
+ vst1.8 {q0, q1}, [r4]
+ add r4, r3, #(16*5-16)
+ // Write 8 bytes (one full entry) after the first 12
+ vst1.8 {d0}, [r6]
+ // Write the last 3 entries
+ vst1.8 {q2}, [r4]
+ add r3, r3, #16*5
+ bx lr
+endfunc
diff --git a/third_party/dav1d/src/arm/32/util.S b/third_party/dav1d/src/arm/32/util.S
new file mode 100644
index 0000000000..c3710d3767
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/util.S
@@ -0,0 +1,184 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2015 Martin Storsjo
+ * Copyright © 2015 Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#ifndef DAV1D_SRC_ARM_32_UTIL_S
+#define DAV1D_SRC_ARM_32_UTIL_S
+
+#include "config.h"
+#include "src/arm/asm.S"
+
+.macro movrel_local rd, val, offset=0
+#if defined(PIC)
+ ldr \rd, 90001f
+ b 90002f
+90001:
+ .word \val + \offset - (90002f + 8 - 4 * CONFIG_THUMB)
+90002:
+ add \rd, \rd, pc
+#else
+ movw \rd, #:lower16:\val+\offset
+ movt \rd, #:upper16:\val+\offset
+#endif
+.endm
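+// (In the PIC case the literal holds the distance from the pc value seen by
+// the final add to the target: pc reads as '.' + 8 in ARM mode and '.' + 4
+// in Thumb mode, hence the 8 - 4 * CONFIG_THUMB term.)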
+
+.macro movrel rd, val, offset=0
+#if defined(PIC) && defined(__APPLE__)
+ ldr \rd, 1f
+ b 2f
+1:
+ .word 3f - (2f + 8 - 4 * CONFIG_THUMB)
+2:
+ ldr \rd, [pc, \rd]
+.if \offset < 0
+ sub \rd, \rd, #-(\offset)
+.elseif \offset > 0
+ add \rd, \rd, #\offset
+.endif
+ .non_lazy_symbol_pointer
+3:
+ .indirect_symbol \val
+ .word 0
+ .text
+#else
+ movrel_local \rd, \val, \offset
+#endif
+.endm
+
+// This macro clobbers r7 (and r12 on windows) and stores data at the
+// bottom of the stack; sp is the start of the allocated space that
+// the caller can use.
+.macro sub_sp_align space
+#if CONFIG_THUMB
+ mov r7, sp
+ and r7, r7, #15
+#else
+ and r7, sp, #15
+#endif
+ sub sp, sp, r7
+        // Now that the stack is aligned, store the amount of adjustment
+        // back on the stack, as we don't want to waste a register as a
+        // frame pointer.
+ str r7, [sp, #-16]!
+#ifdef _WIN32
+.if \space > 8192
+ // Here, we'd need to touch two (or more) pages while decrementing
+ // the stack pointer.
+ .error "sub_sp_align doesn't support values over 8K at the moment"
+.elseif \space > 4096
+ sub r7, sp, #4096
+ ldr r12, [r7]
+ sub r7, r7, #(\space - 4096)
+ mov sp, r7
+.else
+ sub sp, sp, #\space
+.endif
+#else
+.if \space >= 4096
+ sub sp, sp, #(\space)/4096*4096
+.endif
+.if (\space % 4096) != 0
+ sub sp, sp, #(\space)%4096
+.endif
+#endif
+.endm
+
+.macro add_sp_align space
+.if \space >= 4096
+ add sp, sp, #(\space)/4096*4096
+.endif
+.if (\space % 4096) != 0
+ add sp, sp, #(\space)%4096
+.endif
+ ldr r7, [sp], #16
+ // Add back the original stack adjustment
+ add sp, sp, r7
+.endm
+
+.macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
+ vtrn.32 \q0, \q2
+ vtrn.32 \q1, \q3
+
+ vtrn.16 \r0, \r2
+ vtrn.16 \r1, \r3
+ vtrn.16 \r4, \r6
+ vtrn.16 \r5, \r7
+
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+ vtrn.8 \r4, \r5
+ vtrn.8 \r6, \r7
+.endm
+
+.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, d0, d1, d2, d3, d4, d5, d6, d7
+ vswp \d0, \d4
+ vswp \d1, \d5
+ vswp \d2, \d6
+ vswp \d3, \d7
+
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+ vtrn.32 \r4, \r6
+ vtrn.32 \r5, \r7
+
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+ vtrn.16 \r4, \r5
+ vtrn.16 \r6, \r7
+.endm
+
+.macro transpose_4x8b q0, q1, r0, r1, r2, r3
+ vtrn.16 \q0, \q1
+
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+.endm
+
+.macro transpose_4x4s q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
+ vswp \r1, \r4 // vtrn.64 \q0, \q2
+ vswp \r3, \r6 // vtrn.64 \q1, \q3
+
+ vtrn.32 \q0, \q1
+ vtrn.32 \q2, \q3
+.endm
+
+.macro transpose_4x4h q0, q1, r0, r1, r2, r3
+ vtrn.32 \q0, \q1
+
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+.endm
+
+.macro transpose_4x8h r0, r1, r2, r3
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+.endm
+
+#endif /* DAV1D_SRC_ARM_32_UTIL_S */
diff --git a/third_party/dav1d/src/arm/64/cdef.S b/third_party/dav1d/src/arm/64/cdef.S
new file mode 100644
index 0000000000..32b258aba8
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/cdef.S
@@ -0,0 +1,520 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
+.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
+ tst w7, #1 // CDEF_HAVE_LEFT
+ b.eq 2f
+ // CDEF_HAVE_LEFT
+ sub \s1, \s1, #2
+ sub \s2, \s2, #2
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ ldr \rn\()0, [\s1]
+ ldr s1, [\s1, #\w]
+ ldr \rn\()2, [\s2]
+ ldr s3, [\s2, #\w]
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ uxtl v2.8h, v2.8b
+ uxtl v3.8h, v3.8b
+ str \rw\()0, [x0]
+ str d1, [x0, #2*\w]
+ add x0, x0, #2*\stride
+ str \rw\()2, [x0]
+ str d3, [x0, #2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ldr \rn\()0, [\s1]
+ ldr h1, [\s1, #\w]
+ ldr \rn\()2, [\s2]
+ ldr h3, [\s2, #\w]
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ uxtl v2.8h, v2.8b
+ uxtl v3.8h, v3.8b
+ str \rw\()0, [x0]
+ str s1, [x0, #2*\w]
+ str s31, [x0, #2*\w+4]
+ add x0, x0, #2*\stride
+ str \rw\()2, [x0]
+ str s3, [x0, #2*\w]
+ str s31, [x0, #2*\w+4]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+2:
+ // !CDEF_HAVE_LEFT
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ ldr \rn\()0, [\s1]
+ ldr h1, [\s1, #\w]
+ ldr \rn\()2, [\s2]
+ ldr h3, [\s2, #\w]
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ uxtl v2.8h, v2.8b
+ uxtl v3.8h, v3.8b
+ str s31, [x0]
+ stur \rw\()0, [x0, #4]
+ str s1, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ str s31, [x0]
+ stur \rw\()2, [x0, #4]
+ str s3, [x0, #4+2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ldr \rn\()0, [\s1]
+ ldr \rn\()1, [\s2]
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ str s31, [x0]
+ stur \rw\()0, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ str s31, [x0]
+ stur \rw\()1, [x0, #4]
+ str s31, [x0, #4+2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+.endif
+3:
+.endm
+
+.macro load_n_incr dst, src, incr, w
+.if \w == 4
+ ld1 {\dst\().s}[0], [\src], \incr
+.else
+ ld1 {\dst\().8b}, [\src], \incr
+.endif
+.endm
+
+// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
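+//
+// The padding functions copy the w-wide block plus a 2-pixel border on each
+// side into a 16-bit temporary buffer, widening the 8-bit pixels; wherever
+// the edges flags mark a border as unavailable, the cells are filled with
+// the constant 0x8000 (v30/v31) instead. When all four edges are present,
+// the faster 8-bit "edged" path below is taken.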
+
+.macro padding_func w, stride, rn, rw
+function cdef_padding\w\()_8bpc_neon, export=1
+ cmp w7, #0xf // fully edged
+ b.eq cdef_padding\w\()_edged_8bpc_neon
+ movi v30.8h, #0x80, lsl #8
+ mov v31.16b, v30.16b
+ sub x0, x0, #2*(2*\stride+2)
+ tst w7, #4 // CDEF_HAVE_TOP
+ b.ne 1f
+ // !CDEF_HAVE_TOP
+ st1 {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+ st1 {v30.8h, v31.8h}, [x0], #32
+.endif
+ b 3f
+1:
+ // CDEF_HAVE_TOP
+ add x9, x4, x2
+ pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0
+
+ // Middle section
+3:
+ tst w7, #1 // CDEF_HAVE_LEFT
+ b.eq 2f
+ // CDEF_HAVE_LEFT
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ ld1 {v0.h}[0], [x3], #2
+ ldr h2, [x1, #\w]
+ load_n_incr v1, x1, x2, \w
+ subs w6, w6, #1
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ uxtl v2.8h, v2.8b
+ str s0, [x0]
+ stur \rw\()1, [x0, #4]
+ str s2, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 0b
+ b 3f
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ld1 {v0.h}[0], [x3], #2
+ load_n_incr v1, x1, x2, \w
+ subs w6, w6, #1
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ str s0, [x0]
+ stur \rw\()1, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 1b
+ b 3f
+2:
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ ldr h1, [x1, #\w]
+ load_n_incr v0, x1, x2, \w
+ subs w6, w6, #1
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ str s31, [x0]
+ stur \rw\()0, [x0, #4]
+ str s1, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 0b
+ b 3f
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ load_n_incr v0, x1, x2, \w
+ subs w6, w6, #1
+ uxtl v0.8h, v0.8b
+ str s31, [x0]
+ stur \rw\()0, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 1b
+
+3:
+ tst w7, #8 // CDEF_HAVE_BOTTOM
+ b.ne 1f
+ // !CDEF_HAVE_BOTTOM
+ st1 {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+ st1 {v30.8h, v31.8h}, [x0], #32
+.endif
+ ret
+1:
+ // CDEF_HAVE_BOTTOM
+ add x9, x5, x2
+ pad_top_bottom x5, x9, \w, \stride, \rn, \rw, 1
+endfunc
+.endm
+
+padding_func 8, 16, d, q
+padding_func 4, 8, s, d
+
+// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
+
+.macro padding_func_edged w, stride, reg
+function cdef_padding\w\()_edged_8bpc_neon, export=1
+ sub x4, x4, #2
+ sub x5, x5, #2
+ sub x0, x0, #(2*\stride+2)
+
+.if \w == 4
+ ldr d0, [x4]
+ ldr d1, [x4, x2]
+ st1 {v0.8b, v1.8b}, [x0], #16
+.else
+ add x9, x4, x2
+ ldr d0, [x4]
+ ldr s1, [x4, #8]
+ ldr d2, [x9]
+ ldr s3, [x9, #8]
+ str d0, [x0]
+ str s1, [x0, #8]
+ str d2, [x0, #\stride]
+ str s3, [x0, #\stride+8]
+ add x0, x0, #2*\stride
+.endif
+
+0:
+ ld1 {v0.h}[0], [x3], #2
+ ldr h2, [x1, #\w]
+ load_n_incr v1, x1, x2, \w
+ subs w6, w6, #1
+ str h0, [x0]
+ stur \reg\()1, [x0, #2]
+ str h2, [x0, #2+\w]
+ add x0, x0, #\stride
+ b.gt 0b
+
+.if \w == 4
+ ldr d0, [x5]
+ ldr d1, [x5, x2]
+ st1 {v0.8b, v1.8b}, [x0], #16
+.else
+ add x9, x5, x2
+ ldr d0, [x5]
+ ldr s1, [x5, #8]
+ ldr d2, [x9]
+ ldr s3, [x9, #8]
+ str d0, [x0]
+ str s1, [x0, #8]
+ str d2, [x0, #\stride]
+ str s3, [x0, #\stride+8]
+.endif
+ ret
+endfunc
+.endm
+
+padding_func_edged 8, 16, d
+padding_func_edged 4, 8, s
+
+tables
+
+filter 8, 8
+filter 4, 8
+
+find_dir 8
+
+.macro load_px_8 d1, d2, w
+.if \w == 8
+ add x6, x2, w9, sxtb // x + off
+ sub x9, x2, w9, sxtb // x - off
+ ld1 {\d1\().d}[0], [x6] // p0
+ add x6, x6, #16 // += stride
+ ld1 {\d2\().d}[0], [x9] // p1
+ add x9, x9, #16 // += stride
+ ld1 {\d1\().d}[1], [x6] // p0
+ ld1 {\d2\().d}[1], [x9] // p0
+.else
+ add x6, x2, w9, sxtb // x + off
+ sub x9, x2, w9, sxtb // x - off
+ ld1 {\d1\().s}[0], [x6] // p0
+ add x6, x6, #8 // += stride
+ ld1 {\d2\().s}[0], [x9] // p1
+ add x9, x9, #8 // += stride
+ ld1 {\d1\().s}[1], [x6] // p0
+ add x6, x6, #8 // += stride
+ ld1 {\d2\().s}[1], [x9] // p1
+ add x9, x9, #8 // += stride
+ ld1 {\d1\().s}[2], [x6] // p0
+ add x6, x6, #8 // += stride
+ ld1 {\d2\().s}[2], [x9] // p1
+ add x9, x9, #8 // += stride
+ ld1 {\d1\().s}[3], [x6] // p0
+ ld1 {\d2\().s}[3], [x9] // p1
+.endif
+.endm
+.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
+.if \min
+ umin v3.16b, v3.16b, \s1\().16b
+ umax v4.16b, v4.16b, \s1\().16b
+ umin v3.16b, v3.16b, \s2\().16b
+ umax v4.16b, v4.16b, \s2\().16b
+.endif
+ uabd v16.16b, v0.16b, \s1\().16b // abs(diff)
+ uabd v20.16b, v0.16b, \s2\().16b // abs(diff)
+ ushl v17.16b, v16.16b, \shift // abs(diff) >> shift
+ ushl v21.16b, v20.16b, \shift // abs(diff) >> shift
+ uqsub v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift))
+ uqsub v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift))
+ cmhi v18.16b, v0.16b, \s1\().16b // px > p0
+ cmhi v22.16b, v0.16b, \s2\().16b // px > p1
+ umin v17.16b, v17.16b, v16.16b // imin(abs(diff), clip)
+ umin v21.16b, v21.16b, v20.16b // imin(abs(diff), clip)
+ dup v19.16b, \tap // taps[k]
+ neg v16.16b, v17.16b // -imin()
+ neg v20.16b, v21.16b // -imin()
+ bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign()
+ bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign()
+ mla v1.16b, v18.16b, v19.16b // sum += taps[k] * constrain()
+ mla v2.16b, v22.16b, v19.16b // sum += taps[k] * constrain()
+.endm
+
+// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
+// const uint8_t *tmp, int pri_strength,
+// int sec_strength, int dir, int damping,
+// int h);
+.macro filter_func_8 w, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_edged_8bpc_neon
+.if \pri
+ movrel x8, pri_taps
+ and w9, w3, #1
+ add x8, x8, w9, uxtw #1
+.endif
+ movrel x9, directions\w
+ add x5, x9, w5, uxtw #1
+ movi v30.8b, #7
+ dup v28.8b, w6 // damping
+
+.if \pri
+ dup v25.16b, w3 // threshold
+.endif
+.if \sec
+ dup v27.16b, w4 // threshold
+.endif
+ trn1 v24.8b, v25.8b, v27.8b
+ clz v24.8b, v24.8b // clz(threshold)
+ sub v24.8b, v30.8b, v24.8b // ulog2(threshold)
+ uqsub v24.8b, v28.8b, v24.8b // shift = imax(0, damping - ulog2(threshold))
+ neg v24.8b, v24.8b // -shift
+.if \sec
+ dup v26.16b, v24.b[1]
+.endif
+.if \pri
+ dup v24.16b, v24.b[0]
+.endif
+
+1:
+.if \w == 8
+ add x12, x2, #16
+ ld1 {v0.d}[0], [x2] // px
+ ld1 {v0.d}[1], [x12] // px
+.else
+ add x12, x2, #1*8
+ add x13, x2, #2*8
+ add x14, x2, #3*8
+ ld1 {v0.s}[0], [x2] // px
+ ld1 {v0.s}[1], [x12] // px
+ ld1 {v0.s}[2], [x13] // px
+ ld1 {v0.s}[3], [x14] // px
+.endif
+
+ // We need 9 bits or two 8-bit accumulators to fit the sum.
+ // Max of |sum| > 15*2*6(pri) + 4*4*3(sec) = 228.
+ // Start sum at -1 instead of 0 to help handle rounding later.
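+ // v1 holds the p0 contributions (seeded with -1, i.e. 0xff per byte) and
+ // v2 the p1 contributions; the 9-bit total is reconstructed from the two
+ // 8-bit halves with halving adds further down.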
+ movi v1.16b, #255 // sum
+ movi v2.16b, #0 // sum
+.if \min
+ mov v3.16b, v0.16b // min
+ mov v4.16b, v0.16b // max
+.endif
+
+ // Instead of loading sec_taps 2, 1 from memory, just set it
+ // to 2 initially and decrease for the second round.
+ // This is also used as loop counter.
+ mov w11, #2 // sec_taps[0]
+
+2:
+.if \pri
+ ldrb w9, [x5] // off1
+
+ load_px_8 v5, v6, \w
+.endif
+
+.if \sec
+ add x5, x5, #4 // +2*2
+ ldrb w9, [x5] // off2
+ load_px_8 v28, v29, \w
+.endif
+
+.if \pri
+ ldrb w10, [x8] // *pri_taps
+
+ handle_pixel_8 v5, v6, v25.16b, v24.16b, w10, \min
+.endif
+
+.if \sec
+ add x5, x5, #8 // +2*4
+ ldrb w9, [x5] // off3
+ load_px_8 v5, v6, \w
+
+ handle_pixel_8 v28, v29, v27.16b, v26.16b, w11, \min
+
+ handle_pixel_8 v5, v6, v27.16b, v26.16b, w11, \min
+
+ sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
+.else
+ add x5, x5, #1 // x5 += 1
+.endif
+ subs w11, w11, #1 // sec_tap-- (value)
+.if \pri
+ add x8, x8, #1 // pri_taps++ (pointer)
+.endif
+ b.ne 2b
+
+ // Perform halving adds since the value won't fit otherwise.
+ // To handle the offset for negative values, use both halving w/ and w/o rounding.
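+ // srhadd gives (v1 + v2 + 1) >> 1 and shadd gives (v1 + v2) >> 1; since
+ // v1 + v2 == sum - 1, these are sum >> 1 and (sum - 1) >> 1 respectively.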
+ srhadd v5.16b, v1.16b, v2.16b // sum >> 1
+ shadd v6.16b, v1.16b, v2.16b // (sum - 1) >> 1
+ cmlt v1.16b, v5.16b, #0 // sum < 0
+ bsl v1.16b, v6.16b, v5.16b // (sum - (sum < 0)) >> 1
+
+ srshr v1.16b, v1.16b, #3 // (8 + sum - (sum < 0)) >> 4
+
+ usqadd v0.16b, v1.16b // px + (8 + sum ...) >> 4
+.if \min
+ umin v0.16b, v0.16b, v4.16b
+ umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max)
+.endif
+.if \w == 8
+ st1 {v0.d}[0], [x0], x1
+ add x2, x2, #2*16 // tmp += 2*tmp_stride
+ subs w7, w7, #2 // h -= 2
+ st1 {v0.d}[1], [x0], x1
+.else
+ st1 {v0.s}[0], [x0], x1
+ add x2, x2, #4*8 // tmp += 4*tmp_stride
+ st1 {v0.s}[1], [x0], x1
+ subs w7, w7, #4 // h -= 4
+ st1 {v0.s}[2], [x0], x1
+ st1 {v0.s}[3], [x0], x1
+.endif
+
+ // Reset pri_taps and directions back to the original point
+ sub x5, x5, #2
+.if \pri
+ sub x8, x8, #2
+.endif
+
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+.macro filter_8 w
+filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
+filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
+filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
+.endm
+
+filter_8 8
+filter_8 4
diff --git a/third_party/dav1d/src/arm/64/cdef16.S b/third_party/dav1d/src/arm/64/cdef16.S
new file mode 100644
index 0000000000..ecf864a26d
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/cdef16.S
@@ -0,0 +1,229 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
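+// Write the two padded rows above (or below) the block into the tmp buffer,
+// handling all CDEF_HAVE_LEFT/CDEF_HAVE_RIGHT combinations; unavailable
+// pixels are filled with the padding constant kept in v31.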
+.macro pad_top_bot_16 s1, s2, w, stride, reg, ret
+ tst w7, #1 // CDEF_HAVE_LEFT
+ b.eq 2f
+ // CDEF_HAVE_LEFT
+ sub \s1, \s1, #4
+ sub \s2, \s2, #4
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ ldr \reg\()0, [\s1]
+ ldr d1, [\s1, #2*\w]
+ ldr \reg\()2, [\s2]
+ ldr d3, [\s2, #2*\w]
+ str \reg\()0, [x0]
+ str d1, [x0, #2*\w]
+ add x0, x0, #2*\stride
+ str \reg\()2, [x0]
+ str d3, [x0, #2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ldr \reg\()0, [\s1]
+ ldr s1, [\s1, #2*\w]
+ ldr \reg\()2, [\s2]
+ ldr s3, [\s2, #2*\w]
+ str \reg\()0, [x0]
+ str s1, [x0, #2*\w]
+ str s31, [x0, #2*\w+4]
+ add x0, x0, #2*\stride
+ str \reg\()2, [x0]
+ str s3, [x0, #2*\w]
+ str s31, [x0, #2*\w+4]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+2:
+ // !CDEF_HAVE_LEFT
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ ldr \reg\()0, [\s1]
+ ldr s1, [\s1, #2*\w]
+ ldr \reg\()2, [\s2]
+ ldr s3, [\s2, #2*\w]
+ str s31, [x0]
+ stur \reg\()0, [x0, #4]
+ str s1, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ str s31, [x0]
+ stur \reg\()2, [x0, #4]
+ str s3, [x0, #4+2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ldr \reg\()0, [\s1]
+ ldr \reg\()1, [\s2]
+ str s31, [x0]
+ stur \reg\()0, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ str s31, [x0]
+ stur \reg\()1, [x0, #4]
+ str s31, [x0, #4+2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+.endif
+3:
+.endm
+
+.macro load_n_incr_16 dst, src, incr, w
+.if \w == 4
+ ld1 {\dst\().4h}, [\src], \incr
+.else
+ ld1 {\dst\().8h}, [\src], \incr
+.endif
+.endm
+
+// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
+
+.macro padding_func_16 w, stride, reg
+function cdef_padding\w\()_16bpc_neon, export=1
+ movi v30.8h, #0x80, lsl #8
+ mov v31.16b, v30.16b
+ sub x0, x0, #2*(2*\stride+2)
+ tst w7, #4 // CDEF_HAVE_TOP
+ b.ne 1f
+ // !CDEF_HAVE_TOP
+ st1 {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+ st1 {v30.8h, v31.8h}, [x0], #32
+.endif
+ b 3f
+1:
+ // CDEF_HAVE_TOP
+ add x9, x4, x2
+ pad_top_bot_16 x4, x9, \w, \stride, \reg, 0
+
+ // Middle section
+3:
+ tst w7, #1 // CDEF_HAVE_LEFT
+ b.eq 2f
+ // CDEF_HAVE_LEFT
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ ld1 {v0.s}[0], [x3], #4
+ ldr s2, [x1, #2*\w]
+ load_n_incr_16 v1, x1, x2, \w
+ subs w6, w6, #1
+ str s0, [x0]
+ stur \reg\()1, [x0, #4]
+ str s2, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 0b
+ b 3f
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ld1 {v0.s}[0], [x3], #4
+ load_n_incr_16 v1, x1, x2, \w
+ subs w6, w6, #1
+ str s0, [x0]
+ stur \reg\()1, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 1b
+ b 3f
+2:
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ ldr s1, [x1, #2*\w]
+ load_n_incr_16 v0, x1, x2, \w
+ subs w6, w6, #1
+ str s31, [x0]
+ stur \reg\()0, [x0, #4]
+ str s1, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 0b
+ b 3f
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ load_n_incr_16 v0, x1, x2, \w
+ subs w6, w6, #1
+ str s31, [x0]
+ stur \reg\()0, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 1b
+
+3:
+ tst w7, #8 // CDEF_HAVE_BOTTOM
+ b.ne 1f
+ // !CDEF_HAVE_BOTTOM
+ st1 {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+ st1 {v30.8h, v31.8h}, [x0], #32
+.endif
+ ret
+1:
+ // CDEF_HAVE_BOTTOM
+ add x9, x5, x2
+ pad_top_bot_16 x5, x9, \w, \stride, \reg, 1
+endfunc
+.endm
+
+padding_func_16 8, 16, q
+padding_func_16 4, 8, d
+
+tables
+
+filter 8, 16
+filter 4, 16
+
+find_dir 16
diff --git a/third_party/dav1d/src/arm/64/cdef_tmpl.S b/third_party/dav1d/src/arm/64/cdef_tmpl.S
new file mode 100644
index 0000000000..d35d7a09ba
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/cdef_tmpl.S
@@ -0,0 +1,511 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
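+// Byte offsets (dy * stride + dx) of the two taps for each of the 8 CDEF
+// directions; the first six entries are repeated so that the dir + 2 and
+// dir + 6 secondary-tap lookups need no & 7 wraparound.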
+.macro dir_table w, stride
+const directions\w
+ .byte -1 * \stride + 1, -2 * \stride + 2
+ .byte 0 * \stride + 1, -1 * \stride + 2
+ .byte 0 * \stride + 1, 0 * \stride + 2
+ .byte 0 * \stride + 1, 1 * \stride + 2
+ .byte 1 * \stride + 1, 2 * \stride + 2
+ .byte 1 * \stride + 0, 2 * \stride + 1
+ .byte 1 * \stride + 0, 2 * \stride + 0
+ .byte 1 * \stride + 0, 2 * \stride - 1
+// Repeated, to avoid & 7
+ .byte -1 * \stride + 1, -2 * \stride + 2
+ .byte 0 * \stride + 1, -1 * \stride + 2
+ .byte 0 * \stride + 1, 0 * \stride + 2
+ .byte 0 * \stride + 1, 1 * \stride + 2
+ .byte 1 * \stride + 1, 2 * \stride + 2
+ .byte 1 * \stride + 0, 2 * \stride + 1
+endconst
+.endm
+
+.macro tables
+dir_table 8, 16
+dir_table 4, 8
+
+const pri_taps
+ .byte 4, 2, 3, 3
+endconst
+.endm
+
+.macro load_px d1, d2, w
+.if \w == 8
+ add x6, x2, w9, sxtb #1 // x + off
+ sub x9, x2, w9, sxtb #1 // x - off
+ ld1 {\d1\().8h}, [x6] // p0
+ ld1 {\d2\().8h}, [x9] // p1
+.else
+ add x6, x2, w9, sxtb #1 // x + off
+ sub x9, x2, w9, sxtb #1 // x - off
+ ld1 {\d1\().4h}, [x6] // p0
+ add x6, x6, #2*8 // += stride
+ ld1 {\d2\().4h}, [x9] // p1
+ add x9, x9, #2*8 // += stride
+ ld1 {\d1\().d}[1], [x6] // p0
+ ld1 {\d2\().d}[1], [x9] // p1
+.endif
+.endm
+.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
+.if \min
+ umin v2.8h, v2.8h, \s1\().8h
+ smax v3.8h, v3.8h, \s1\().8h
+ umin v2.8h, v2.8h, \s2\().8h
+ smax v3.8h, v3.8h, \s2\().8h
+.endif
+ uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
+ uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
+ ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
+ ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
+ uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
+ uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
+ sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
+ sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
+ neg v16.8h, v17.8h // -clip
+ neg v20.8h, v21.8h // -clip
+ smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
+ smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
+ dup v19.8h, \tap // taps[k]
+ smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
+ smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
+ mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
+ mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
+.endm
+
+// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
+// const uint16_t *tmp, int pri_strength,
+// int sec_strength, int dir, int damping,
+// int h, size_t edges);
+.macro filter_func w, bpc, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_\bpc\()bpc_neon
+.if \bpc == 8
+ ldr w8, [sp] // edges
+ cmp w8, #0xf
+ b.eq cdef_filter\w\suffix\()_edged_8bpc_neon
+.endif
+.if \pri
+.if \bpc == 16
+ ldr w9, [sp, #8] // bitdepth_max
+ clz w9, w9
+ sub w9, w9, #24 // -bitdepth_min_8
+ neg w9, w9 // bitdepth_min_8
+.endif
+ movrel x8, pri_taps
+.if \bpc == 16
+ lsr w9, w3, w9 // pri_strength >> bitdepth_min_8
+ and w9, w9, #1 // (pri_strength >> bitdepth_min_8) & 1
+.else
+ and w9, w3, #1
+.endif
+ add x8, x8, w9, uxtw #1
+.endif
+ movrel x9, directions\w
+ add x5, x9, w5, uxtw #1
+ movi v30.4h, #15
+ dup v28.4h, w6 // damping
+
+.if \pri
+ dup v25.8h, w3 // threshold
+.endif
+.if \sec
+ dup v27.8h, w4 // threshold
+.endif
+ trn1 v24.4h, v25.4h, v27.4h
+ clz v24.4h, v24.4h // clz(threshold)
+ sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
+ uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
+ neg v24.4h, v24.4h // -shift
+.if \sec
+ dup v26.8h, v24.h[1]
+.endif
+.if \pri
+ dup v24.8h, v24.h[0]
+.endif
+
+1:
+.if \w == 8
+ ld1 {v0.8h}, [x2] // px
+.else
+ add x12, x2, #2*8
+ ld1 {v0.4h}, [x2] // px
+ ld1 {v0.d}[1], [x12] // px
+.endif
+
+ movi v1.8h, #0 // sum
+.if \min
+ mov v2.16b, v0.16b // min
+ mov v3.16b, v0.16b // max
+.endif
+
+ // Instead of loading sec_taps 2, 1 from memory, just set it
+ // to 2 initially and decrease for the second round.
+ // This is also used as loop counter.
+ mov w11, #2 // sec_taps[0]
+
+2:
+.if \pri
+ ldrb w9, [x5] // off1
+
+ load_px v4, v5, \w
+.endif
+
+.if \sec
+ add x5, x5, #4 // +2*2
+ ldrb w9, [x5] // off2
+ load_px v6, v7, \w
+.endif
+
+.if \pri
+ ldrb w10, [x8] // *pri_taps
+
+ handle_pixel v4, v5, v25.8h, v24.8h, w10, \min
+.endif
+
+.if \sec
+ add x5, x5, #8 // +2*4
+ ldrb w9, [x5] // off3
+ load_px v4, v5, \w
+
+ handle_pixel v6, v7, v27.8h, v26.8h, w11, \min
+
+ handle_pixel v4, v5, v27.8h, v26.8h, w11, \min
+
+ sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
+.else
+ add x5, x5, #1 // x5 += 1
+.endif
+ subs w11, w11, #1 // sec_tap-- (value)
+.if \pri
+ add x8, x8, #1 // pri_taps++ (pointer)
+.endif
+ b.ne 2b
+
+ cmlt v4.8h, v1.8h, #0 // -(sum < 0)
+ add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
+ srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
+ add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
+.if \min
+ smin v0.8h, v0.8h, v3.8h
+ smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
+.endif
+.if \bpc == 8
+ xtn v0.8b, v0.8h
+.endif
+.if \w == 8
+ add x2, x2, #2*16 // tmp += tmp_stride
+ subs w7, w7, #1 // h--
+.if \bpc == 8
+ st1 {v0.8b}, [x0], x1
+.else
+ st1 {v0.8h}, [x0], x1
+.endif
+.else
+.if \bpc == 8
+ st1 {v0.s}[0], [x0], x1
+.else
+ st1 {v0.d}[0], [x0], x1
+.endif
+ add x2, x2, #2*16 // tmp += 2*tmp_stride
+ subs w7, w7, #2 // h -= 2
+.if \bpc == 8
+ st1 {v0.s}[1], [x0], x1
+.else
+ st1 {v0.d}[1], [x0], x1
+.endif
+.endif
+
+ // Reset pri_taps and directions back to the original point
+ sub x5, x5, #2
+.if \pri
+ sub x8, x8, #2
+.endif
+
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+.macro filter w, bpc
+filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
+filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
+filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
+
+function cdef_filter\w\()_\bpc\()bpc_neon, export=1
+ cbnz w3, 1f // pri_strength
+ b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
+1:
+ cbnz w4, 1f // sec_strength
+ b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
+1:
+ b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
+endfunc
+.endm
+
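+// div_table[i] = 840 / (i + 1): normalization factors for the diagonal
+// partial sums of length i + 1 (840 is the lcm of 1..8).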
+const div_table
+ .short 840, 420, 280, 210, 168, 140, 120, 105
+endconst
+
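+// Per-position weights for the semi-diagonal (alt) direction sums: the three
+// shorter outer sums on each side use div_table[2*m+1] (420, 210, 140), the
+// five full-length middle ones use 105.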
+const alt_fact
+ .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
+endconst
+
+.macro cost_alt d1, d2, s1, s2, s3, s4
+ smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
+ smull2 v23.4s, \s1\().8h, \s1\().8h
+ smull v24.4s, \s2\().4h, \s2\().4h
+ smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
+ smull2 v26.4s, \s3\().8h, \s3\().8h
+ smull v27.4s, \s4\().4h, \s4\().4h
+ mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact
+ mla v22.4s, v23.4s, v30.4s
+ mla v22.4s, v24.4s, v31.4s
+ mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact
+ mla v25.4s, v26.4s, v30.4s
+ mla v25.4s, v27.4s, v31.4s
+ addv \d1, v22.4s // *cost_ptr
+ addv \d2, v25.4s // *cost_ptr
+.endm
+
+.macro find_best s1, s2, s3
+.ifnb \s2
+ mov w5, \s2\().s[0]
+.endif
+ cmp w4, w1 // cost[n] > best_cost
+ csel w0, w3, w0, gt // best_dir = n
+ csel w1, w4, w1, gt // best_cost = cost[n]
+.ifnb \s2
+ add w3, w3, #1 // n++
+ cmp w5, w1 // cost[n] > best_cost
+ mov w4, \s3\().s[0]
+ csel w0, w3, w0, gt // best_dir = n
+ csel w1, w5, w1, gt // best_cost = cost[n]
+ add w3, w3, #1 // n++
+.endif
+.endm
+
+// Steps for loading and preparing each row
+.macro dir_load_step1 s1, bpc
+.if \bpc == 8
+ ld1 {\s1\().8b}, [x0], x1
+.else
+ ld1 {\s1\().8h}, [x0], x1
+.endif
+.endm
+
+.macro dir_load_step2 s1, bpc
+.if \bpc == 8
+ usubl \s1\().8h, \s1\().8b, v31.8b
+.else
+ ushl \s1\().8h, \s1\().8h, v8.8h
+.endif
+.endm
+
+.macro dir_load_step3 s1, bpc
+// Nothing for \bpc == 8
+.if \bpc != 8
+ sub \s1\().8h, \s1\().8h, v31.8h
+.endif
+.endm
+
+// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
+// unsigned *const var)
+.macro find_dir bpc
+function cdef_find_dir_\bpc\()bpc_neon, export=1
+.if \bpc == 16
+ str d8, [sp, #-0x10]!
+ clz w3, w3 // clz(bitdepth_max)
+ sub w3, w3, #24 // -bitdepth_min_8
+ dup v8.8h, w3
+.endif
+ sub sp, sp, #32 // cost
+ mov w3, #8
+.if \bpc == 8
+ movi v31.16b, #128
+.else
+ movi v31.8h, #128
+.endif
+ movi v30.16b, #0
+ movi v1.8h, #0 // v0-v1 sum_diag[0]
+ movi v3.8h, #0 // v2-v3 sum_diag[1]
+ movi v5.8h, #0 // v4-v5 sum_hv[0-1]
+ movi v7.8h, #0 // v6-v7 sum_alt[0]
+ dir_load_step1 v26, \bpc // Setup first row early
+ movi v17.8h, #0 // v16-v17 sum_alt[1]
+ movi v18.8h, #0 // v18-v19 sum_alt[2]
+ dir_load_step2 v26, \bpc
+ movi v19.8h, #0
+ dir_load_step3 v26, \bpc
+ movi v21.8h, #0 // v20-v21 sum_alt[3]
+
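+ // For each of the 8 input rows, accumulate partial sums along the
+ // horizontal/vertical, diagonal and semi-diagonal directions by
+ // shifting the row with ext against the zero vector in v30.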
+.irpc i, 01234567
+ addv h25, v26.8h // [y]
+ rev64 v27.8h, v26.8h
+ addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
+ add v5.8h, v5.8h, v26.8h // sum_hv[1]
+ ext v27.16b, v27.16b, v27.16b, #8 // [-x]
+ rev64 v29.4h, v28.4h // [-(x >> 1)]
+ ins v4.h[\i], v25.h[0] // sum_hv[0]
+.if \i < 6
+ ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
+ ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
+ add v18.8h, v18.8h, v22.8h // sum_alt[2]
+ add v19.4h, v19.4h, v23.4h // sum_alt[2]
+.else
+ add v18.8h, v18.8h, v26.8h // sum_alt[2]
+.endif
+.if \i == 0
+ mov v20.16b, v26.16b // sum_alt[3]
+.elseif \i == 1
+ add v20.8h, v20.8h, v26.8h // sum_alt[3]
+.else
+ ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
+ ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
+ add v20.8h, v20.8h, v24.8h // sum_alt[3]
+ add v21.4h, v21.4h, v25.4h // sum_alt[3]
+.endif
+.if \i == 0
+ mov v0.16b, v26.16b // sum_diag[0]
+ dir_load_step1 v26, \bpc
+ mov v2.16b, v27.16b // sum_diag[1]
+ dir_load_step2 v26, \bpc
+ mov v6.16b, v28.16b // sum_alt[0]
+ dir_load_step3 v26, \bpc
+ mov v16.16b, v29.16b // sum_alt[1]
+.else
+ ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
+ ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
+ ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
+ ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
+.if \i != 7 // Nothing to load for the final row
+ dir_load_step1 v26, \bpc // Start setting up the next row early.
+.endif
+ add v0.8h, v0.8h, v22.8h // sum_diag[0]
+ add v1.8h, v1.8h, v23.8h // sum_diag[0]
+ add v2.8h, v2.8h, v24.8h // sum_diag[1]
+ add v3.8h, v3.8h, v25.8h // sum_diag[1]
+.if \i != 7
+ dir_load_step2 v26, \bpc
+.endif
+ ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
+ ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
+ ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
+ ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
+.if \i != 7
+ dir_load_step3 v26, \bpc
+.endif
+ add v6.8h, v6.8h, v22.8h // sum_alt[0]
+ add v7.4h, v7.4h, v23.4h // sum_alt[0]
+ add v16.8h, v16.8h, v24.8h // sum_alt[1]
+ add v17.4h, v17.4h, v25.4h // sum_alt[1]
+.endif
+.endr
+
+ movi v31.4s, #105
+
+ smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0]
+ smlal2 v26.4s, v4.8h, v4.8h
+ smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1]
+ smlal2 v27.4s, v5.8h, v5.8h
+ mul v26.4s, v26.4s, v31.4s // cost[2] *= 105
+ mul v27.4s, v27.4s, v31.4s // cost[6] *= 105
+ addv s4, v26.4s // cost[2]
+ addv s5, v27.4s // cost[6]
+
+ rev64 v1.8h, v1.8h
+ rev64 v3.8h, v3.8h
+ ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
+ ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]
+
+ str s4, [sp, #2*4] // cost[2]
+ str s5, [sp, #6*4] // cost[6]
+
+ movrel x4, div_table
+ ld1 {v31.8h}, [x4]
+
+ smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0]
+ smull2 v23.4s, v0.8h, v0.8h
+ smlal v22.4s, v1.4h, v1.4h
+ smlal2 v23.4s, v1.8h, v1.8h
+ smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1]
+ smull2 v25.4s, v2.8h, v2.8h
+ smlal v24.4s, v3.4h, v3.4h
+ smlal2 v25.4s, v3.8h, v3.8h
+ uxtl v30.4s, v31.4h // div_table
+ uxtl2 v31.4s, v31.8h
+ mul v22.4s, v22.4s, v30.4s // cost[0]
+ mla v22.4s, v23.4s, v31.4s // cost[0]
+ mul v24.4s, v24.4s, v30.4s // cost[4]
+ mla v24.4s, v25.4s, v31.4s // cost[4]
+ addv s0, v22.4s // cost[0]
+ addv s2, v24.4s // cost[4]
+
+ movrel x5, alt_fact
+ ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105
+
+ str s0, [sp, #0*4] // cost[0]
+ str s2, [sp, #4*4] // cost[4]
+
+ uxtl v29.4s, v29.4h // div_table[2*m+1] + 105
+ uxtl v30.4s, v30.4h
+ uxtl v31.4s, v31.4h
+
+ cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
+ cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
+ str s6, [sp, #1*4] // cost[1]
+ str s16, [sp, #3*4] // cost[3]
+
+ mov w0, #0 // best_dir
+ mov w1, v0.s[0] // best_cost
+ mov w3, #1 // n
+
+ str s18, [sp, #5*4] // cost[5]
+ str s20, [sp, #7*4] // cost[7]
+
+ mov w4, v6.s[0]
+
+ find_best v6, v4, v16
+ find_best v16, v2, v18
+ find_best v18, v5, v20
+ find_best v20
+
+ eor w3, w0, #4 // best_dir ^4
+ ldr w4, [sp, w3, uxtw #2]
+ sub w1, w1, w4 // best_cost - cost[best_dir ^ 4]
+ lsr w1, w1, #10
+ str w1, [x2] // *var
+
+ add sp, sp, #32
+.if \bpc == 16
+ ldr d8, [sp], 0x10
+.endif
+ ret
+endfunc
+.endm
diff --git a/third_party/dav1d/src/arm/64/filmgrain.S b/third_party/dav1d/src/arm/64/filmgrain.S
new file mode 100644
index 0000000000..aa7f18bf39
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/filmgrain.S
@@ -0,0 +1,2010 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
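+// Advance the film grain LFSR state held in w2 by \steps bits; the new bits
+// come from r ^ (r >> 1) ^ (r >> 3) ^ (r >> 12).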
+.macro increment_seed steps, shift=1
+ lsr w11, w2, #3
+ lsr w12, w2, #12
+ lsr w13, w2, #1
+ eor w11, w2, w11 // (r >> 0) ^ (r >> 3)
+ eor w12, w12, w13 // (r >> 12) ^ (r >> 1)
+ eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr w2, w2, #\steps
+.endif
+ and w11, w11, #((1 << \steps) - 1) // bit
+.if \shift
+ orr w2, w2, w11, lsl #(16 - \steps) // *state
+.else
+ orr w2, w2, w11, lsl #16 // *state
+.endif
+.endm
+
+.macro read_rand dest, bits, age
+ ubfx \dest, x2, #16 - \bits - \age, #\bits
+.endm
+
+.macro read_shift_rand dest, bits
+ ubfx \dest, x2, #17 - \bits, #\bits
+ lsr w2, w2, #1
+.endm
+
+// special calling convention:
+// w2 holds seed
+// x3 holds dav1d_gaussian_sequence
+// clobbers x11-x15
+// returns in v0.8h
+function get_gaussian_neon
+ increment_seed 4
+ read_rand x14, 11, 3
+ read_rand x15, 11, 2
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ read_rand x14, 11, 1
+ ld1 {v0.h}[1], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 0
+ increment_seed 4
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[2], [x14]
+ read_rand x14, 11, 3
+ ld1 {v0.h}[3], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 2
+ ld1 {v0.h}[4], [x14]
+ add x15, x3, x15, lsl #1
+ read_rand x14, 11, 1
+ ld1 {v0.h}[5], [x15]
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[6], [x14]
+ ld1 {v0.h}[7], [x15]
+ ret
+endfunc
+
+.macro get_grain_row r0, r1, r2, r3, r4, r5
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r0\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r0\().16b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r1\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r1\().16b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r2\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r2\().16b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r3\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r3\().16b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r4\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r4\().16b, \r5\().8h
+ increment_seed 2
+ read_rand x14, 11, 1
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {\r5\().h}[0], [x14]
+ ld1 {\r5\().h}[1], [x15]
+ srshl v0.4h, \r5\().4h, v31.4h
+ xtn \r5\().8b, v0.8h
+.endm
+
+.macro store_grain_row r0, r1, r2, r3, r4, r5
+ st1 {\r0\().16b,\r1\().16b}, [x0], #32
+ st1 {\r2\().16b,\r3\().16b}, [x0], #32
+ st1 {\r4\().16b}, [x0], #16
+ st1 {\r5\().h}[0], [x0], #2
+.endm
+
+.macro get_grain_row_44 r0, r1, r2
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn \r0\().8b, \r2\().8h
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn2 \r0\().16b, \r2\().8h
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn \r1\().8b, \r2\().8h
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn2 \r1\().16b, \r2\().8h
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn \r2\().8b, \r2\().8h
+
+ increment_seed 4
+ read_rand x14, 11, 3
+ read_rand x15, 11, 2
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ read_rand x14, 11, 1
+ ld1 {v0.h}[1], [x15]
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[2], [x14]
+ ld1 {v0.h}[3], [x15]
+ srshl v0.4h, v0.4h, v31.4h
+ xtn2 \r2\().16b, v0.8h
+.endm
+
+.macro store_grain_row_44 r0, r1, r2
+ st1 {\r0\().16b,\r1\().16b}, [x0], #32
+ st1 {\r2\().16b}, [x0]
+ add x0, x0, #GRAIN_WIDTH-32
+.endm
+
+function get_grain_2_neon
+ increment_seed 2
+ read_rand x14, 11, 1
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ ld1 {v0.h}[1], [x15]
+ srshl v0.4h, v0.4h, v31.4h
+ xtn v0.8b, v0.8h
+ ret
+endfunc
+
+.macro get_grain_2 dst
+ bl get_grain_2_neon
+.ifnc \dst, v0
+ mov \dst\().8b, v0.8b
+.endif
+.endm
+
+// w15 holds the number of entries to produce
+// w14, w16 and w17 hold the previous output entries
+// v0 holds the vector of produced entries
+// v1 holds the input vector of sums from above
+.macro output_lag n
+function output_lag\n\()_neon
+1:
+ read_shift_rand x13, 11
+ mov w11, v1.s[0]
+ ldrsh w12, [x3, x13, lsl #1]
+ ext v0.16b, v0.16b, v0.16b, #1
+.if \n == 1
+ madd w11, w14, w4, w11 // sum (above) + *coeff * prev output
+.elseif \n == 2
+ madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1
+ madd w11, w14, w17, w11 // += *coeff * prev output 2
+ mov w16, w14
+.else
+ madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1
+ madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2
+ madd w11, w14, w21, w11 // += *coeff * prev output 3
+ mov w17, w16
+ mov w16, w14
+.endif
+ add w14, w11, w8 // 1 << (ar_coeff_shift - 1)
+ add w12, w12, w10 // 1 << (4 + grain_scale_shift - 1)
+ asr w14, w14, w7 // >> ar_coeff_shift
+ asr w12, w12, w9 // >> (4 + grain_scale_shift)
+ add w14, w14, w12
+ cmp w14, w5
+ csel w14, w14, w5, le
+ cmp w14, w6
+ csel w14, w14, w6, ge
+ subs w15, w15, #1
+ ext v1.16b, v1.16b, v1.16b, #4
+ ins v0.b[15], w14
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
+
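+// v0/v3/v1 hold the above-left, above and above-right neighbours of the row
+// being generated; weight them with ar_coeffs[0..2] (v27/v28/v29) and widen
+// the products into 32-bit partial sums in v4-v7.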
+function sum_lag1_above_neon
+ smull v2.8h, v3.8b, v28.8b
+ smull2 v3.8h, v3.16b, v28.16b
+ smull v4.8h, v0.8b, v27.8b
+ smull2 v5.8h, v0.16b, v27.16b
+ smull v6.8h, v1.8b, v29.8b
+ smull2 v7.8h, v1.16b, v29.16b
+ saddl v0.4s, v2.4h, v4.4h
+ saddl2 v1.4s, v2.8h, v4.8h
+ saddl v2.4s, v3.4h, v5.4h
+ saddl2 v3.4s, v3.8h, v5.8h
+ saddw v4.4s, v0.4s, v6.4h
+ saddw2 v5.4s, v1.4s, v6.8h
+ saddw v6.4s, v2.4s, v7.4h
+ saddw2 v7.4s, v3.4s, v7.8h
+ ret
+endfunc
+
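+// Produce one 16-byte chunk of AR-filtered grain: take the weighted sums of
+// the rows above from sum_\lag\()_above_neon, optionally add the (subsampled)
+// collocated luma grain for the chroma layouts, then run the serial per-pixel
+// recursion through output_\lag\()_neon.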
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
+ bl sum_\lag\()_above_neon
+.ifc \type, uv_420
+ add x12, x19, #GRAIN_WIDTH
+ ld1 {v22.16b, v23.16b}, [x19], #32
+ ld1 {v24.16b, v25.16b}, [x12]
+ saddlp v22.8h, v22.16b
+ saddlp v23.8h, v23.16b
+ saddlp v24.8h, v24.16b
+ saddlp v25.8h, v25.16b
+ add v22.8h, v22.8h, v24.8h
+ add v23.8h, v23.8h, v25.8h
+ rshrn v0.8b, v22.8h, #2
+ rshrn2 v0.16b, v23.8h, #2
+.endif
+.ifc \type, uv_422
+ ld1 {v22.16b, v23.16b}, [x19], #32
+ saddlp v22.8h, v22.16b
+ saddlp v23.8h, v23.16b
+ rshrn v0.8b, v22.8h, #1
+ rshrn2 v0.16b, v23.8h, #1
+.endif
+.ifc \type, uv_444
+ ld1 {v0.16b}, [x19], #16
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ dup v1.16b, \uv_coeff
+ smull v2.8h, v0.8b, v1.8b
+ smull2 v3.8h, v0.16b, v1.16b
+.else
+ smull v2.8h, v0.8b, v30.8b
+ smull2 v3.8h, v0.16b, v30.16b
+.endif
+ saddw v4.4s, v4.4s, v2.4h
+ saddw2 v5.4s, v5.4s, v2.8h
+ saddw v6.4s, v6.4s, v3.4h
+ saddw2 v7.4s, v7.4s, v3.8h
+.endif
+.if \uv_layout && \elems == 16
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 15
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 9
+ b sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+.ifc \edge, left
+ increment_seed 4
+ read_rand x12, 11, 3
+ read_rand x13, 11, 2
+ read_rand x14, 11, 1
+ add x12, x3, x12, lsl #1
+ add x13, x3, x13, lsl #1
+ add x14, x3, x14, lsl #1
+ ld1 {v0.h}[5], [x12]
+ ld1 {v0.h}[6], [x13]
+ ld1 {v0.h}[7], [x14]
+ lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ srshl v0.8h, v0.8h, v31.8h
+ xtn2 v0.16b, v0.8h
+ ext v4.16b, v4.16b, v4.16b, #12
+.ifc \lag, lag3
+ smov w17, v0.b[13]
+.endif
+.ifnc \lag, lag1
+ smov w16, v0.b[14]
+.endif
+ smov w14, v0.b[15]
+
+ mov v1.16b, v4.16b
+ mov w15, #1
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0
+ mov v1.16b, v4.16b
+ mov w15, #4
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ mov v1.16b, v5.16b
+ mov w15, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ mov v1.16b, v6.16b
+.if \elems == 9
+ mov w15, #1
+ bl output_\lag\()_neon
+ lsr w2, w2, #3
+
+ read_rand x12, 11, 2
+ read_rand x13, 11, 1
+ read_rand x14, 11, 0
+ add x12, x3, x12, lsl #1
+ add x13, x3, x13, lsl #1
+ add x14, x3, x14, lsl #1
+ ld1 {v1.h}[0], [x12]
+ ld1 {v1.h}[1], [x13]
+ ld1 {v1.h}[2], [x14]
+ srshl v1.4h, v1.4h, v31.4h
+ xtn v1.8b, v1.8h
+ ext v0.16b, v0.16b, v1.16b, #7
+.else
+ mov w15, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ mov v1.16b, v7.16b
+
+.ifc \edge, right
+ mov w15, #3
+ bl output_\lag\()_neon
+ read_shift_rand x15, 11
+ add x15, x3, x15, lsl #1
+ ld1 {v1.h}[0], [x15]
+ srshl v1.4h, v1.4h, v31.4h
+ ext v0.16b, v0.16b, v1.16b, #1
+.else
+ mov w15, #4
+ bl output_\lag\()_neon
+.endif
+.endif
+.if \store
+ st1 {v0.16b}, [x0], #16
+.endif
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.endif
+.endm
+
+.macro sum_lag1_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag1_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0
+endfunc
+.endm
+
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 15
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 15
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 9
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 9
+
+.macro sum_lag1 type, dst, left, mid, right, edge=mid
+ mov v3.16b, \mid\().16b
+ ext v0.16b, \left\().16b, \mid\().16b, #15
+ ext v1.16b, \mid\().16b, \right\().16b, #1
+ bl sum_\type\()_lag1_\edge\()_neon
+ mov \dst\().16b, v0.16b
+.endm
+
+.macro sum_y_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 y, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_444, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_422, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_420, \dst, \left, \mid, \right, \edge
+.endm
+
+
+function sum_lag2_above_neon
+ sub x12, x0, #2*GRAIN_WIDTH - 16
+ sub x13, x0, #1*GRAIN_WIDTH - 16
+ ld1 {v18.16b}, [x12] // load top right
+ ld1 {v21.16b}, [x13]
+
+ ext v22.16b, v16.16b, v17.16b, #14 // top left, top mid
+ dup v26.16b, v30.b[0]
+ ext v23.16b, v16.16b, v17.16b, #15
+ dup v27.16b, v30.b[1]
+ ext v0.16b, v17.16b, v18.16b, #1 // top mid, top right
+ dup v28.16b, v30.b[3]
+ ext v1.16b, v17.16b, v18.16b, #2
+ dup v29.16b, v30.b[4]
+
+ smull v2.8h, v22.8b, v26.8b
+ smull2 v3.8h, v22.16b, v26.16b
+ smull v4.8h, v23.8b, v27.8b
+ smull2 v5.8h, v23.16b, v27.16b
+ smull v6.8h, v0.8b, v28.8b
+ smull2 v7.8h, v0.16b, v28.16b
+ smull v0.8h, v1.8b, v29.8b
+ smull2 v1.8h, v1.16b, v29.16b
+ saddl v22.4s, v2.4h, v4.4h
+ saddl2 v23.4s, v2.8h, v4.8h
+ saddl v26.4s, v3.4h, v5.4h
+ saddl2 v27.4s, v3.8h, v5.8h
+ saddl v2.4s, v0.4h, v6.4h
+ saddl2 v3.4s, v0.8h, v6.8h
+ saddl v6.4s, v1.4h, v7.4h
+ saddl2 v7.4s, v1.8h, v7.8h
+ add v4.4s, v22.4s, v2.4s
+ add v5.4s, v23.4s, v3.4s
+ add v6.4s, v26.4s, v6.4s
+ add v7.4s, v27.4s, v7.4s
+
+ ext v22.16b, v19.16b, v20.16b, #14 // top left, top mid
+ dup v26.16b, v30.b[5]
+ ext v23.16b, v19.16b, v20.16b, #15
+ dup v27.16b, v30.b[6]
+ ext v0.16b, v20.16b, v21.16b, #1 // top mid, top right
+ dup v28.16b, v30.b[8]
+ ext v1.16b, v20.16b, v21.16b, #2
+ dup v29.16b, v30.b[9]
+
+ smull v2.8h, v22.8b, v26.8b
+ smull2 v3.8h, v22.16b, v26.16b
+ smull v22.8h, v23.8b, v27.8b
+ smull2 v23.8h, v23.16b, v27.16b
+ smull v26.8h, v0.8b, v28.8b
+ smull2 v27.8h, v0.16b, v28.16b
+ smull v28.8h, v1.8b, v29.8b
+ smull2 v29.8h, v1.16b, v29.16b
+ saddl v0.4s, v2.4h, v22.4h
+ saddl2 v1.4s, v2.8h, v22.8h
+ saddl v2.4s, v3.4h, v23.4h
+ saddl2 v3.4s, v3.8h, v23.8h
+ saddl v22.4s, v26.4h, v28.4h
+ saddl2 v23.4s, v26.8h, v28.8h
+ saddl v26.4s, v27.4h, v29.4h
+ saddl2 v27.4s, v27.8h, v29.8h
+ add v0.4s, v0.4s, v22.4s
+ add v1.4s, v1.4s, v23.4s
+ add v2.4s, v2.4s, v26.4s
+ add v3.4s, v3.4s, v27.4s
+ dup v26.16b, v30.b[2]
+ dup v27.16b, v30.b[7]
+ smull v22.8h, v17.8b, v26.8b
+ smull2 v23.8h, v17.16b, v26.16b
+ smull v24.8h, v20.8b, v27.8b
+ smull2 v25.8h, v20.16b, v27.16b
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+
+ saddl v0.4s, v22.4h, v24.4h
+ saddl2 v1.4s, v22.8h, v24.8h
+ saddl v2.4s, v23.4h, v25.4h
+ saddl2 v3.4s, v23.8h, v25.8h
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+ ret
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag2_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+.ifc \edge, left
+ sub x12, x0, #2*GRAIN_WIDTH
+ sub x13, x0, #1*GRAIN_WIDTH
+ ld1 {v17.16b}, [x12] // load the previous block right above
+ ld1 {v20.16b}, [x13]
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[12]
+endfunc
+.endm
+
+sum_lag2_func y, 0, left
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 15
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 15
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 9
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 9
+
+
+function sum_lag3_above_neon
+ sub x11, x0, #3*GRAIN_WIDTH - 16
+ sub x12, x0, #2*GRAIN_WIDTH - 16
+ sub x13, x0, #1*GRAIN_WIDTH - 16
+ ld1 {v15.16b}, [x11] // load top right
+ ld1 {v18.16b}, [x12]
+ ld1 {v21.16b}, [x13]
+
+ ext v8.16b, v13.16b, v14.16b, #13 // top left, top mid
+ dup v22.16b, v29.b[0]
+ ext v9.16b, v13.16b, v14.16b, #14
+ dup v23.16b, v29.b[1]
+ ext v10.16b, v13.16b, v14.16b, #15
+ dup v24.16b, v29.b[2]
+ dup v25.16b, v29.b[3]
+ ext v11.16b, v14.16b, v15.16b, #1 // top mid, top right
+ dup v26.16b, v29.b[4]
+ ext v12.16b, v14.16b, v15.16b, #2
+ dup v27.16b, v29.b[5]
+ ext v13.16b, v14.16b, v15.16b, #3
+ dup v28.16b, v29.b[6]
+
+ smull v0.8h, v8.8b, v22.8b
+ smull2 v1.8h, v8.16b, v22.16b
+ smull v2.8h, v9.8b, v23.8b
+ smull2 v3.8h, v9.16b, v23.16b
+ smull v8.8h, v10.8b, v24.8b
+ smull2 v9.8h, v10.16b, v24.16b
+ smull v10.8h, v11.8b, v26.8b
+ smull2 v11.8h, v11.16b, v26.16b
+ saddl v22.4s, v0.4h, v2.4h
+ saddl2 v23.4s, v0.8h, v2.8h
+ saddl v24.4s, v1.4h, v3.4h
+ saddl2 v26.4s, v1.8h, v3.8h
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ smull v8.8h, v12.8b, v27.8b
+ smull2 v9.8h, v12.16b, v27.16b
+ smull v10.8h, v13.8b, v28.8b
+ smull2 v11.8h, v13.16b, v28.16b
+ smull v12.8h, v14.8b, v25.8b
+ smull2 v13.8h, v14.16b, v25.16b
+ add v4.4s, v22.4s, v0.4s
+ add v5.4s, v23.4s, v1.4s
+ add v6.4s, v24.4s, v2.4s
+ add v7.4s, v26.4s, v3.4s
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+ saddw v4.4s, v4.4s, v12.4h
+ saddw2 v5.4s, v5.4s, v12.8h
+ saddw v6.4s, v6.4s, v13.4h
+ saddw2 v7.4s, v7.4s, v13.8h
+
+ ext v8.16b, v16.16b, v17.16b, #13 // top left, top mid
+ dup v22.16b, v29.b[7]
+ ext v9.16b, v16.16b, v17.16b, #14
+ dup v23.16b, v29.b[8]
+ ext v10.16b, v16.16b, v17.16b, #15
+ dup v24.16b, v29.b[9]
+ dup v25.16b, v29.b[10]
+ ext v11.16b, v17.16b, v18.16b, #1 // top mid, top right
+ dup v26.16b, v29.b[11]
+ ext v12.16b, v17.16b, v18.16b, #2
+ dup v27.16b, v29.b[12]
+ ext v13.16b, v17.16b, v18.16b, #3
+ dup v28.16b, v29.b[13]
+
+ smull v0.8h, v8.8b, v22.8b
+ smull2 v1.8h, v8.16b, v22.16b
+ smull v2.8h, v9.8b, v23.8b
+ smull2 v3.8h, v9.16b, v23.16b
+ smull v8.8h, v10.8b, v24.8b
+ smull2 v9.8h, v10.16b, v24.16b
+ smull v10.8h, v11.8b, v26.8b
+ smull2 v11.8h, v11.16b, v26.16b
+ saddl v22.4s, v0.4h, v2.4h
+ saddl2 v23.4s, v0.8h, v2.8h
+ saddl v24.4s, v1.4h, v3.4h
+ saddl2 v26.4s, v1.8h, v3.8h
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ smull v8.8h, v12.8b, v27.8b
+ smull2 v9.8h, v12.16b, v27.16b
+ smull v10.8h, v13.8b, v28.8b
+ smull2 v11.8h, v13.16b, v28.16b
+ smull v12.8h, v17.8b, v25.8b
+ smull2 v13.8h, v17.16b, v25.16b
+ add v22.4s, v22.4s, v0.4s
+ add v23.4s, v23.4s, v1.4s
+ add v24.4s, v24.4s, v2.4s
+ add v26.4s, v26.4s, v3.4s
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ add v4.4s, v4.4s, v22.4s
+ add v5.4s, v5.4s, v23.4s
+ add v6.4s, v6.4s, v24.4s
+ add v7.4s, v7.4s, v26.4s
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+ saddw v4.4s, v4.4s, v12.4h
+ saddw2 v5.4s, v5.4s, v12.8h
+ saddw v6.4s, v6.4s, v13.4h
+ saddw2 v7.4s, v7.4s, v13.8h
+
+ ext v8.16b, v19.16b, v20.16b, #13 // top left, top mid
+ dup v22.16b, v29.b[14]
+ ext v9.16b, v19.16b, v20.16b, #14
+ dup v23.16b, v29.b[15]
+ ext v10.16b, v19.16b, v20.16b, #15
+ dup v24.16b, v30.b[0]
+ dup v25.16b, v30.b[1]
+ ext v11.16b, v20.16b, v21.16b, #1 // top mid, top right
+ dup v26.16b, v30.b[2]
+ ext v12.16b, v20.16b, v21.16b, #2
+ dup v27.16b, v30.b[3]
+ ext v13.16b, v20.16b, v21.16b, #3
+ dup v28.16b, v30.b[4]
+
+ smull v0.8h, v8.8b, v22.8b
+ smull2 v1.8h, v8.16b, v22.16b
+ smull v2.8h, v9.8b, v23.8b
+ smull2 v3.8h, v9.16b, v23.16b
+ smull v8.8h, v10.8b, v24.8b
+ smull2 v9.8h, v10.16b, v24.16b
+ smull v10.8h, v11.8b, v26.8b
+ smull2 v11.8h, v11.16b, v26.16b
+ saddl v22.4s, v0.4h, v2.4h
+ saddl2 v23.4s, v0.8h, v2.8h
+ saddl v24.4s, v1.4h, v3.4h
+ saddl2 v26.4s, v1.8h, v3.8h
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ smull v8.8h, v12.8b, v27.8b
+ smull2 v9.8h, v12.16b, v27.16b
+ smull v10.8h, v13.8b, v28.8b
+ smull2 v11.8h, v13.16b, v28.16b
+ smull v12.8h, v20.8b, v25.8b
+ smull2 v19.8h, v20.16b, v25.16b
+ add v22.4s, v22.4s, v0.4s
+ add v23.4s, v23.4s, v1.4s
+ add v24.4s, v24.4s, v2.4s
+ add v26.4s, v26.4s, v3.4s
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ add v4.4s, v4.4s, v22.4s
+ add v5.4s, v5.4s, v23.4s
+ add v6.4s, v6.4s, v24.4s
+ add v7.4s, v7.4s, v26.4s
+ mov v13.16b, v14.16b
+ mov v14.16b, v15.16b
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+ saddw v4.4s, v4.4s, v12.4h
+ saddw2 v5.4s, v5.4s, v12.8h
+ saddw v6.4s, v6.4s, v19.4h
+ saddw2 v7.4s, v7.4s, v19.8h
+
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ ret
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag3_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+.ifc \edge, left
+ sub x11, x0, #3*GRAIN_WIDTH
+ sub x12, x0, #2*GRAIN_WIDTH
+ sub x13, x0, #1*GRAIN_WIDTH
+ ld1 {v14.16b}, [x11] // load the previous block right above
+ ld1 {v17.16b}, [x12]
+ ld1 {v20.16b}, [x13]
+.endif
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[8]
+endfunc
+.endm
+
+sum_lag3_func y, 0, left
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 15
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 15
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 9
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 9
+
+function generate_grain_rows_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+1:
+ get_grain_row v16, v17, v18, v19, v20, v21
+ subs w1, w1, #1
+ store_grain_row v16, v17, v18, v19, v20, v21
+ b.gt 1b
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function generate_grain_rows_44_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+1:
+ get_grain_row_44 v16, v17, v18
+ subs w1, w1, #1
+ store_grain_row_44 v16, v17, v18
+ b.gt 1b
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function get_grain_row_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ get_grain_row v16, v17, v18, v19, v20, v21
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function get_grain_row_44_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ get_grain_row_44 v16, v17, v18
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function add_uv_444_coeff_lag0_neon
+add_coeff_lag0_start:
+ smull v2.8h, v0.8b, v27.8b
+ smull2 v3.8h, v0.16b, v27.16b
+ srshl v2.8h, v2.8h, v28.8h
+ srshl v3.8h, v3.8h, v28.8h
+ saddw v2.8h, v2.8h, v1.8b
+ saddw2 v3.8h, v3.8h, v1.16b
+ sqxtn v2.8b, v2.8h
+ sqxtn2 v2.16b, v3.8h
+ ret
+endfunc
+
+function add_uv_420_coeff_lag0_neon
+ ld1 {v4.16b, v5.16b}, [x19], #32
+ ld1 {v6.16b, v7.16b}, [x12], #32
+ saddlp v4.8h, v4.16b
+ saddlp v5.8h, v5.16b
+ saddlp v6.8h, v6.16b
+ saddlp v7.8h, v7.16b
+ add v4.8h, v4.8h, v6.8h
+ add v5.8h, v5.8h, v7.8h
+ rshrn v4.8b, v4.8h, #2
+ rshrn2 v4.16b, v5.8h, #2
+ and v0.16b, v4.16b, v0.16b
+ b add_coeff_lag0_start
+endfunc
+
+function add_uv_422_coeff_lag0_neon
+ ld1 {v4.16b, v5.16b}, [x19], #32
+ saddlp v4.8h, v4.16b
+ saddlp v5.8h, v5.16b
+ rshrn v4.8b, v4.8h, #1
+ rshrn2 v4.16b, v5.8h, #1
+ and v0.16b, v4.16b, v0.16b
+ b add_coeff_lag0_start
+endfunc
+
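+// Generate a full GRAIN_WIDTH x GRAIN_HEIGHT (82x73) grain LUT for luma or
+// 4:4:4 chroma, dispatching on ar_coeff_lag.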
+.macro gen_grain_82 type
+function generate_grain_\type\()_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x30, x19, [sp, #-96]!
+
+.ifc \type, uv_444
+ mov w13, w3
+ mov w14, #28
+ add x19, x1, #3*GRAIN_WIDTH
+ mov x1, x2
+ mul w13, w13, w14
+.endif
+ movrel x3, X(gaussian_sequence)
+ ldr w2, [x1, #FGD_SEED]
+ ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add x4, x1, #FGD_AR_COEFFS_Y
+.else
+ add x4, x1, #FGD_AR_COEFFS_UV
+.endif
+ adr x16, L(gen_grain_\type\()_tbl)
+ ldr w17, [x1, #FGD_AR_COEFF_LAG]
+ add w9, w9, #4
+ ldrh w17, [x16, w17, uxtw #1]
+ dup v31.8h, w9 // 4 + data->grain_scale_shift
+ sub x16, x16, w17, uxtw
+ neg v31.8h, v31.8h
+
+.ifc \type, uv_444
+ cmp w13, #0
+ mov w11, #0x49d8
+ mov w14, #0xb524
+ add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
+ csel w11, w11, w14, ne
+.endif
+
+ ldr w7, [x1, #FGD_AR_COEFF_SHIFT]
+ mov w8, #1
+ mov w10, #1
+ lsl w8, w8, w7 // 1 << ar_coeff_shift
+ lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift)
+ lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ mov w5, #127
+ mov w6, #-128
+
+.ifc \type, uv_444
+ eor w2, w2, w11
+.endif
+
+ br x16
+
+L(generate_grain_\type\()_lag0):
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, y
+ mov w1, #GRAIN_HEIGHT
+ bl generate_grain_rows_neon
+.else
+ dup v28.8h, w7
+ ld1r {v27.16b}, [x4] // ar_coeffs_uv[0]
+ movi v0.16b, #0
+ movi v1.16b, #255
+ ext v29.16b, v0.16b, v1.16b, #13
+ ext v30.16b, v1.16b, v0.16b, #1
+ neg v28.8h, v28.8h
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+ mov w1, #GRAIN_HEIGHT-3
+1:
+ ld1 {v22.16b, v23.16b, v24.16b, v25.16b}, [x19], #64
+ bl get_grain_row_neon
+ and v0.16b, v22.16b, v29.16b
+ mov v1.16b, v16.16b
+ bl add_uv_444_coeff_lag0_neon
+ mov v0.16b, v23.16b
+ mov v1.16b, v17.16b
+ mov v16.16b, v2.16b
+ bl add_uv_444_coeff_lag0_neon
+ ld1 {v26.16b}, [x19], #16
+ mov v0.16b, v24.16b
+ mov v1.16b, v18.16b
+ mov v17.16b, v2.16b
+ bl add_uv_444_coeff_lag0_neon
+ add x19, x19, #2
+ mov v0.16b, v25.16b
+ mov v1.16b, v19.16b
+ mov v18.16b, v2.16b
+ bl add_uv_444_coeff_lag0_neon
+ and v0.16b, v26.16b, v30.16b
+ mov v1.16b, v20.16b
+ mov v19.16b, v2.16b
+ bl add_uv_444_coeff_lag0_neon
+ mov v20.16b, v2.16b
+ subs w1, w1, #1
+ store_grain_row v16, v17, v18, v19, v20, v21
+ b.gt 1b
+.endif
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag1):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v27.16b}, [x4], #1 // ar_coeffs_y[0]
+ ld1r {v28.16b}, [x4], #1 // ar_coeffs_y[1]
+ ld1r {v29.16b}, [x4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb w4, [x4, #1] // ar_coeffs_y[3]
+.else
+ add x4, x4, #2
+.endif
+
+ mov w1, #3
+.ifc \type, uv_444
+ ld1r {v30.16b}, [x4] // ar_coeffs_uv[4]
+ ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ sum_\type\()_lag1 v22, v16, v16, v17, left
+ sum_\type\()_lag1 v23, v16, v17, v18
+ sum_\type\()_lag1 v24, v17, v18, v19
+ sum_\type\()_lag1 v25, v18, v19, v20
+ sum_\type\()_lag1 v20, v19, v20, v21, right
+ get_grain_2 v21
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #2
+.endif
+ store_grain_row v22, v23, v24, v25, v20, v21
+ mov v16.16b, v22.16b
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+ mov v19.16b, v25.16b
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag2):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ smov w4, v30.b[10]
+ smov w17, v30.b[11]
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag2_left_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ get_grain_2 v16
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #2
+.endif
+ st1 {v16.h}[0], [x0], #2
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag3):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x20, x21, [sp, #80]
+
+ smov w4, v30.b[5]
+ smov w20, v30.b[6]
+ smov w21, v30.b[7]
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag3_left_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ get_grain_2 v16
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #2
+.endif
+ st1 {v16.h}[0], [x0], #2
+ b.gt 1b
+
+ ldp x20, x21, [sp, #80]
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(gen_grain_\type\()_tbl):
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type
+.ifc \type, uv_420
+ mov \dst, #SUB_GRAIN_HEIGHT-3
+.else
+ mov \dst, #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type
+.ifc \type, uv_420
+ add \reg, \reg, #2*GRAIN_WIDTH-(3*32)
+.else
+ sub \reg, \reg, #3*32-GRAIN_WIDTH
+.endif
+.endm
+
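+// Generate the horizontally sub-sampled (44-wide) grain LUTs for 4:2:0 and
+// 4:2:2 chroma, stored with the full GRAIN_WIDTH stride; 4:2:0 additionally
+// halves the number of rows.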
+.macro gen_grain_44 type
+function generate_grain_\type\()_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x30, x19, [sp, #-96]!
+
+ mov w13, w3
+ mov w14, #28
+ add x19, x1, #3*GRAIN_WIDTH-3
+ mov x1, x2
+ mul w13, w13, w14
+
+ movrel x3, X(gaussian_sequence)
+ ldr w2, [x1, #FGD_SEED]
+ ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
+ add x4, x1, #FGD_AR_COEFFS_UV
+ adr x16, L(gen_grain_\type\()_tbl)
+ ldr w17, [x1, #FGD_AR_COEFF_LAG]
+ add w9, w9, #4
+ ldrh w17, [x16, w17, uxtw #1]
+ dup v31.8h, w9 // 4 + data->grain_scale_shift
+ sub x16, x16, w17, uxtw
+ neg v31.8h, v31.8h
+
+ cmp w13, #0
+ mov w11, #0x49d8
+ mov w14, #0xb524
+ add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
+ csel w11, w11, w14, ne
+
+ ldr w7, [x1, #FGD_AR_COEFF_SHIFT]
+ mov w8, #1
+ mov w10, #1
+ lsl w8, w8, w7 // 1 << ar_coeff_shift
+ lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift)
+ lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ mov w5, #127
+ mov w6, #-128
+
+ eor w2, w2, w11
+
+ br x16
+
+L(generate_grain_\type\()_lag0):
+ AARCH64_VALID_JUMP_TARGET
+ dup v28.8h, w7
+ ld1r {v27.16b}, [x4] // ar_coeffs_uv[0]
+ movi v0.16b, #0
+ movi v1.16b, #255
+ ext v29.16b, v0.16b, v1.16b, #13
+ ext v30.16b, v1.16b, v0.16b, #7
+ neg v28.8h, v28.8h
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+ set_height w1, \type
+1:
+ bl get_grain_row_44_neon
+.ifc \type, uv_420
+ add x12, x19, #GRAIN_WIDTH
+.endif
+ mov v0.16b, v29.16b
+ mov v1.16b, v16.16b
+ bl add_\type\()_coeff_lag0_neon
+ movi v0.16b, #255
+ mov v1.16b, v17.16b
+ mov v16.16b, v2.16b
+ bl add_\type\()_coeff_lag0_neon
+ mov v0.16b, v30.16b
+ mov v1.16b, v18.16b
+ mov v17.16b, v2.16b
+ bl add_\type\()_coeff_lag0_neon
+ mov v18.16b, v2.16b
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ store_grain_row_44 v16, v17, v18
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag1):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v27.16b}, [x4], #1 // ar_coeffs_uv[0]
+ ld1r {v28.16b}, [x4], #1 // ar_coeffs_uv[1]
+ ld1r {v29.16b}, [x4] // ar_coeffs_uv[2]
+ add x4, x4, #2
+
+ mov w1, #3
+ ld1r {v30.16b}, [x4] // ar_coeffs_uv[4]
+ ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ sum_\type\()_lag1 v20, v16, v16, v17, left
+ sum_\type\()_lag1 v21, v16, v17, v18
+ sum_\type\()_lag1 v18, v17, v18, v18, right
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ store_grain_row_44 v20, v21, v18
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag2):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12]
+
+ smov w4, v30.b[10]
+ smov w17, v30.b[11]
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag2_left_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH-48
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag3):
+ AARCH64_VALID_JUMP_TARGET
+ ldr q29, [x4] // ar_coeffs_uv[0-15]
+ ldr q30, [x4, #16] // ar_coeffs_uv[16-24]
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x20, x21, [sp, #80]
+
+ smov w4, v30.b[5]
+ smov w20, v30.b[6]
+ smov w21, v30.b[7]
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag3_left_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH-48
+ b.gt 1b
+
+ ldp x20, x21, [sp, #80]
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(gen_grain_\type\()_tbl):
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
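+// NEON has no vector gather, so the per-pixel scaling[] lookups are done by
+// moving lane values to general registers and loading the bytes one at a
+// time, interleaving two destination vectors to hide the load latency.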
+.macro gather_interleaved dst1, dst2, src1, src2, off
+ umov w14, \src1[0+\off]
+ umov w15, \src2[8+\off]
+ umov w16, \src1[2+\off]
+ add x14, x14, x3
+ umov w17, \src2[10+\off]
+ add x15, x15, x3
+ ld1 {\dst1}[0+\off], [x14]
+ umov w14, \src1[4+\off]
+ add x16, x16, x3
+ ld1 {\dst2}[8+\off], [x15]
+ umov w15, \src2[12+\off]
+ add x17, x17, x3
+ ld1 {\dst1}[2+\off], [x16]
+ umov w16, \src1[6+\off]
+ add x14, x14, x3
+ ld1 {\dst2}[10+\off], [x17]
+ umov w17, \src2[14+\off]
+ add x15, x15, x3
+ ld1 {\dst1}[4+\off], [x14]
+ add x16, x16, x3
+ ld1 {\dst2}[12+\off], [x15]
+ add x17, x17, x3
+ ld1 {\dst1}[6+\off], [x16]
+ ld1 {\dst2}[14+\off], [x17]
+.endm
+
+.macro gather dst1, dst2, src1, src2
+ gather_interleaved \dst1, \dst2, \src1, \src2, 0
+ gather_interleaved \dst2, \dst1, \src2, \src1, 0
+ gather_interleaved \dst1, \dst2, \src1, \src2, 1
+ gather_interleaved \dst2, \dst1, \src2, \src1, 1
+.endm
+
+function gather32_neon
+ gather v4.b, v5.b, v0.b, v1.b
+ ret
+endfunc
+
+function gather16_neon
+ gather_interleaved v4.b, v5.b, v0.b, v0.b, 0
+ gather_interleaved v4.b, v5.b, v0.b, v0.b, 1
+ ins v4.d[1], v5.d[1]
+ ret
+endfunc
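+
+// The two gather helpers above implement a per-pixel table lookup: each byte
+// of v0/v1 is used as an index into the scaling LUT pointed to by x3, and the
+// looked-up bytes are packed back into v4/v5. A hedged C sketch of the
+// 32-pixel variant (identifiers are illustrative, not the dav1d C API):
+//
+//     static void gather32(uint8_t scaling[32], const uint8_t *scaling_lut,
+//                          const uint8_t src[32]) {
+//         for (int i = 0; i < 32; i++)
+//             scaling[i] = scaling_lut[src[i]];
+//     }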
+
+const overlap_coeffs_0, align=4
+ .byte 27, 17, 0, 0, 0, 0, 0, 0
+ .byte 17, 27, 32, 32, 32, 32, 32, 32
+endconst
+
+const overlap_coeffs_1, align=4
+ .byte 23, 0, 0, 0, 0, 0, 0, 0
+ .byte 22, 32, 32, 32, 32, 32, 32, 32
+endconst
+
+.macro calc_offset offx, offy, src, sx, sy
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride
+ madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx, uxtw // grain_lut += offx
+.endm
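+
+// calc_offset and add_offset together turn one 8-bit random value into the
+// top-left position of a block inside the grain LUT. A hedged C sketch of the
+// combined effect (identifiers are illustrative):
+//
+//     static const uint8_t *grain_origin(const uint8_t *grain_lut,
+//                                        ptrdiff_t stride, unsigned randval,
+//                                        int sx, int sy) {
+//         int offx = (randval >> 4)  << (sx ? 0 : 1); // doubled if not subsampled
+//         int offy = (randval & 0xF) << (sy ? 0 : 1);
+//         return grain_lut + offy * stride + offx;
+//     }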
+
+// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type);
+function fgy_32x32_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ldr w11, [x6, #8] // offsets[1][0]
+ ldr w13, [x6, #4] // offsets[0][1]
+ ldr w15, [x6, #12] // offsets[1][1]
+ ldr w6, [x6] // offsets[0][0]
+ ldr w8, [sp, #16] // clip
+ mov x9, #GRAIN_WIDTH // grain_lut stride
+
+ neg w4, w4
+ dup v29.8h, w4 // -scaling_shift
+
+ movrel x16, overlap_coeffs_0
+
+ cbz w8, 1f
+ // clip
+ movi v30.16b, #16
+ movi v31.16b, #235
+ b 2f
+1:
+ // no clip
+ movi v30.16b, #0
+ movi v31.16b, #255
+2:
+
+ ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs
+
+ add x5, x5, #9 // grain_lut += 9
+ add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride
+ add x5, x5, x9 // grain_lut += grain_stride
+
+ calc_offset w11, w12, w11, 0, 0
+ calc_offset w13, w14, w13, 0, 0
+ calc_offset w15, w16, w15, 0, 0
+ calc_offset w6, w10, w6, 0, 0
+
+ add_offset x12, w11, x12, x5, x9
+ add_offset x14, w13, x14, x5, x9
+ add_offset x16, w15, x16, x5, x9
+ add_offset x5, w6, x10, x5, x9
+
+ ldr w11, [sp, #24] // type
+ adr x13, L(fgy_loop_tbl)
+
+ add x4, x12, #32 // grain_lut += FG_BLOCK_SIZE * bx
+ add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+
+ tst w11, #1
+ ldrh w11, [x13, w11, uxtw #1]
+
+ add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add x8, x8, #32 // grain_lut += FG_BLOCK_SIZE * bx
+
+ sub x11, x13, w11, uxtw
+
+ b.eq 1f
+ // y overlap
+ dup v6.16b, v27.b[0]
+ dup v7.16b, v27.b[1]
+ mov w10, w7 // backup actual h
+ mov w7, #2
+1:
+ br x11
+endfunc
+
+function fgy_loop_neon
+.macro fgy ox, oy
+L(loop_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+ ld1 {v0.16b, v1.16b}, [x1], x2 // src
+.if \ox
+ ld1 {v20.8b}, [x4], x9 // grain_lut old
+.endif
+.if \oy
+ ld1 {v22.16b, v23.16b}, [x6], x9 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v21.8b}, [x8], x9 // grain_lut top old
+.endif
+ ld1 {v18.16b, v19.16b}, [x5], x9 // grain_lut
+
+ bl gather32_neon
+
+.if \ox
+ smull v20.8h, v20.8b, v27.8b
+ smlal v20.8h, v18.8b, v28.8b
+.endif
+
+.if \oy
+.if \ox
+ smull v21.8h, v21.8b, v27.8b
+ smlal v21.8h, v22.8b, v28.8b
+ sqrshrn v20.8b, v20.8h, #5
+ sqrshrn v21.8b, v21.8h, #5
+.endif
+
+.if \ox
+ smull v16.8h, v20.8b, v7.8b
+.else
+ smull v16.8h, v18.8b, v7.8b
+.endif
+ smull2 v17.8h, v18.16b, v7.16b
+ smull v18.8h, v19.8b, v7.8b
+ smull2 v19.8h, v19.16b, v7.16b
+.if \ox
+ smlal v16.8h, v21.8b, v6.8b
+.else
+ smlal v16.8h, v22.8b, v6.8b
+.endif
+ smlal2 v17.8h, v22.16b, v6.16b
+ smlal v18.8h, v23.8b, v6.8b
+ smlal2 v19.8h, v23.16b, v6.16b
+ sqrshrn v22.8b, v16.8h, #5
+ sqrshrn2 v22.16b, v17.8h, #5
+ sqrshrn v23.8b, v18.8h, #5
+ sqrshrn2 v23.16b, v19.8h, #5
+.endif
+
+ // sxtl of grain
+.if \oy
+ sxtl v16.8h, v22.8b
+ sxtl2 v17.8h, v22.16b
+ sxtl v18.8h, v23.8b
+ sxtl2 v19.8h, v23.16b
+.elseif \ox
+ sqrshrn v20.8b, v20.8h, #5
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+ sxtl v16.8h, v20.8b
+.else
+ sxtl v16.8h, v18.8b
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+.endif
+
+ uxtl v2.8h, v4.8b // scaling
+ uxtl2 v3.8h, v4.16b
+ uxtl v4.8h, v5.8b
+ uxtl2 v5.8h, v5.16b
+
+ mul v16.8h, v16.8h, v2.8h // scaling * grain
+ mul v17.8h, v17.8h, v3.8h
+ mul v18.8h, v18.8h, v4.8h
+ mul v19.8h, v19.8h, v5.8h
+
+ srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift)
+ srshl v17.8h, v17.8h, v29.8h
+ srshl v18.8h, v18.8h, v29.8h
+ srshl v19.8h, v19.8h, v29.8h
+
+ uaddw v16.8h, v16.8h, v0.8b // *src + noise
+ uaddw2 v17.8h, v17.8h, v0.16b
+ uaddw v18.8h, v18.8h, v1.8b
+ uaddw2 v19.8h, v19.8h, v1.16b
+
+ sqxtun v0.8b, v16.8h
+ sqxtun2 v0.16b, v17.8h
+ sqxtun v1.8b, v18.8h
+ sqxtun2 v1.16b, v19.8h
+
+ umax v0.16b, v0.16b, v30.16b
+ umax v1.16b, v1.16b, v30.16b
+ umin v0.16b, v0.16b, v31.16b
+ umin v1.16b, v1.16b, v31.16b
+
+ subs w7, w7, #1
+.if \oy
+ dup v6.16b, v28.b[0]
+ dup v7.16b, v28.b[1]
+.endif
+ st1 {v0.16b, v1.16b}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w10, #2
+ sub w7, w10, #2 // restore actual remaining h
+ b.gt L(loop_\ox\()0)
+.endif
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+
+L(fgy_loop_tbl):
+ .hword L(fgy_loop_tbl) - L(loop_00)
+ .hword L(fgy_loop_tbl) - L(loop_01)
+ .hword L(fgy_loop_tbl) - L(loop_10)
+ .hword L(fgy_loop_tbl) - L(loop_11)
+endfunc
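+
+// The loops above are the vector form of the per-pixel operation sketched
+// below for the no-overlap case (8 bpc; identifiers are illustrative and the
+// rounding of negative noise values is simplified):
+//
+//     // noise = round2(scaling[src] * grain, scaling_shift)
+//     // dst   = clip(src + noise, min_value, max_value)
+//     static uint8_t fgy_pixel(uint8_t src, int8_t grain,
+//                              const uint8_t *scaling_lut, int scaling_shift,
+//                              int min_value, int max_value) {
+//         int noise = (scaling_lut[src] * grain + (1 << (scaling_shift - 1)))
+//                     >> scaling_shift;
+//         int px = src + noise;
+//         if (px < min_value) px = min_value;
+//         if (px > max_value) px = max_value;
+//         return (uint8_t)px;
+//     }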
+
+// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type);
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-32]!
+ str d8, [sp, #16]
+ ldp x8, x9, [sp, #32] // offsets, h
+ ldp x10, x11, [sp, #48] // uv, is_id
+
+ ldr w13, [x4, #FGD_SCALING_SHIFT]
+ ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ neg w13, w13 // -scaling_shift
+
+ // !csfl
+ add x10, x4, x10, lsl #2 // + 4*uv
+ add x14, x10, #FGD_UV_LUMA_MULT
+ add x15, x10, #FGD_UV_MULT
+ add x10, x10, #FGD_UV_OFFSET
+ ld1 {v8.h}[0], [x14] // uv_luma_mult
+ ld1r {v24.8h}, [x10] // uv_offset
+ ld1 {v8.h}[1], [x15] // uv_mult
+
+ dup v29.8h, w13 // -scaling_shift
+
+ cbz w12, 1f
+ // clip
+ movi v30.16b, #16
+ movi v31.16b, #240
+ cbz w11, 2f
+ // is_id
+ movi v31.16b, #235
+ b 2f
+1:
+ // no clip
+ movi v30.16b, #0
+ movi v31.16b, #255
+2:
+
+ ldr w12, [x8, #8] // offsets[1][0]
+ ldr w14, [x8, #4] // offsets[0][1]
+ ldr w16, [x8, #12] // offsets[1][1]
+ ldr w8, [x8] // offsets[0][0]
+
+ mov x10, #GRAIN_WIDTH // grain_lut stride
+
+ add x5, x5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
+.if \sy
+ add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride
+ add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride
+ add x5, x5, x10 // grain_lut += grain_stride
+.endif
+
+ calc_offset w12, w13, w12, \sx, \sy
+ calc_offset w14, w15, w14, \sx, \sy
+ calc_offset w16, w17, w16, \sx, \sy
+ calc_offset w8, w11, w8, \sx, \sy
+
+ add_offset x13, w12, x13, x5, x10
+ add_offset x15, w14, x15, x5, x10
+ add_offset x17, w16, x17, x5, x10
+ add_offset x5, w8, x11, x5, x10
+
+ add x4, x13, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
+ add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add x11, x11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
+
+ ldr w13, [sp, #64] // type
+
+ movrel x16, overlap_coeffs_\sx
+ adr x14, L(fguv_loop_sx\sx\()_tbl)
+
+ ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs
+ tst w13, #1
+ ldrh w13, [x14, w13, uxtw #1]
+
+ b.eq 1f
+ // y overlap
+ sub w12, w9, #(2 >> \sy) // backup remaining h
+ mov w9, #(2 >> \sy)
+
+1:
+ sub x13, x14, w13, uxtw
+
+.if \sy
+ movi v25.16b, #23
+ movi v26.16b, #22
+.else
+ movi v25.16b, #27
+ movi v26.16b, #17
+.endif
+
+.if \sy
+ add x7, x7, x7 // luma_stride *= 2
+.endif
+
+ br x13
+endfunc
+.endm
+
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+ ld1 {v0.16b, v1.16b}, [x6], x7 // luma
+ ld1 {v6.16b, v7.16b}, [x1], x2 // src
+.if \ox
+ ld1 {v20.8b}, [x4], x10 // grain_lut old
+.endif
+.if \oy
+ ld1 {v22.16b, v23.16b}, [x8], x10 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v21.8b}, [x11], x10 // grain_lut top old
+.endif
+ ld1 {v18.16b, v19.16b}, [x5], x10 // grain_lut
+
+.if !\csfl
+ uxtl v2.8h, v0.8b
+ uxtl2 v3.8h, v0.16b
+ uxtl v4.8h, v1.8b
+ uxtl2 v5.8h, v1.16b
+ uxtl v0.8h, v6.8b
+ uxtl2 v1.8h, v6.16b
+ uxtl v16.8h, v7.8b
+ uxtl2 v17.8h, v7.16b
+ mul v2.8h, v2.8h, v8.h[0]
+ mul v3.8h, v3.8h, v8.h[0]
+ mul v4.8h, v4.8h, v8.h[0]
+ mul v5.8h, v5.8h, v8.h[0]
+ mul v0.8h, v0.8h, v8.h[1]
+ mul v1.8h, v1.8h, v8.h[1]
+ mul v16.8h, v16.8h, v8.h[1]
+ mul v17.8h, v17.8h, v8.h[1]
+ sqadd v2.8h, v2.8h, v0.8h
+ sqadd v3.8h, v3.8h, v1.8h
+ sqadd v4.8h, v4.8h, v16.8h
+ sqadd v5.8h, v5.8h, v17.8h
+ sshr v2.8h, v2.8h, #6
+ sshr v3.8h, v3.8h, #6
+ sshr v4.8h, v4.8h, #6
+ sshr v5.8h, v5.8h, #6
+ add v2.8h, v2.8h, v24.8h
+ add v3.8h, v3.8h, v24.8h
+ add v4.8h, v4.8h, v24.8h
+ add v5.8h, v5.8h, v24.8h
+ sqxtun v0.8b, v2.8h
+ sqxtun2 v0.16b, v3.8h
+ sqxtun v1.8b, v4.8h
+ sqxtun2 v1.16b, v5.8h
+.endif
+
+ bl gather32_neon
+
+.if \ox
+ smull v20.8h, v20.8b, v27.8b
+ smlal v20.8h, v18.8b, v28.8b
+.endif
+
+.if \oy
+.if \ox
+ smull v21.8h, v21.8b, v27.8b
+ smlal v21.8h, v22.8b, v28.8b
+ sqrshrn v20.8b, v20.8h, #5
+ sqrshrn v21.8b, v21.8h, #5
+.endif
+
+.if \ox
+ smull v16.8h, v20.8b, v26.8b
+.else
+ smull v16.8h, v18.8b, v26.8b
+.endif
+ smull2 v17.8h, v18.16b, v26.16b
+ smull v18.8h, v19.8b, v26.8b
+ smull2 v19.8h, v19.16b, v26.16b
+.if \ox
+ smlal v16.8h, v21.8b, v25.8b
+.else
+ smlal v16.8h, v22.8b, v25.8b
+.endif
+ smlal2 v17.8h, v22.16b, v25.16b
+ smlal v18.8h, v23.8b, v25.8b
+ smlal2 v19.8h, v23.16b, v25.16b
+ sqrshrn v22.8b, v16.8h, #5
+ sqrshrn2 v22.16b, v17.8h, #5
+ sqrshrn v23.8b, v18.8h, #5
+ sqrshrn2 v23.16b, v19.8h, #5
+.endif
+
+ // sxtl of grain
+.if \oy
+ sxtl v16.8h, v22.8b
+ sxtl2 v17.8h, v22.16b
+ sxtl v18.8h, v23.8b
+ sxtl2 v19.8h, v23.16b
+.elseif \ox
+ sqrshrn v20.8b, v20.8h, #5
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+ sxtl v16.8h, v20.8b
+.else
+ sxtl v16.8h, v18.8b
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+.endif
+
+ uxtl v2.8h, v4.8b // scaling
+ uxtl2 v3.8h, v4.16b
+ uxtl v4.8h, v5.8b
+ uxtl2 v5.8h, v5.16b
+
+ mul v16.8h, v16.8h, v2.8h // scaling * grain
+ mul v17.8h, v17.8h, v3.8h
+ mul v18.8h, v18.8h, v4.8h
+ mul v19.8h, v19.8h, v5.8h
+
+ srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift)
+ srshl v17.8h, v17.8h, v29.8h
+ srshl v18.8h, v18.8h, v29.8h
+ srshl v19.8h, v19.8h, v29.8h
+
+ uaddw v16.8h, v16.8h, v6.8b // *src + noise
+ uaddw2 v17.8h, v17.8h, v6.16b
+ uaddw v18.8h, v18.8h, v7.8b
+ uaddw2 v19.8h, v19.8h, v7.16b
+
+ sqxtun v0.8b, v16.8h
+ sqxtun2 v0.16b, v17.8h
+ sqxtun v1.8b, v18.8h
+ sqxtun2 v1.16b, v19.8h
+
+ umax v0.16b, v0.16b, v30.16b
+ umax v1.16b, v1.16b, v30.16b
+ umin v0.16b, v0.16b, v31.16b
+ umin v1.16b, v1.16b, v31.16b
+
+ subs w9, w9, #1
+.if \oy
+ dup v25.16b, v28.b[0]
+ dup v26.16b, v28.b[1]
+.endif
+ st1 {v0.16b, v1.16b}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w12, #0
+ mov w9, w12 // restore actual remaining h
+ b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
+.endif
+ b 9f
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+
+9:
+ ldr d8, [sp, #16]
+ ldr x30, [sp], #32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(fguv_loop_sx0_tbl):
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
+endfunc
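+
+// When chroma is not scaled straight from its own value (!csfl), the loops
+// above first remap each (luma, chroma) pair and use the result as the
+// scaling index. A hedged scalar sketch of that remap (8 bpc; identifiers
+// are illustrative):
+//
+//     static uint8_t csfl_remap(int luma, int chroma, int uv_luma_mult,
+//                               int uv_mult, int uv_offset) {
+//         int t = ((luma * uv_luma_mult + chroma * uv_mult) >> 6) + uv_offset;
+//         if (t < 0)   t = 0;
+//         if (t > 255) t = 255;
+//         return (uint8_t)t;
+//     }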
+
+function fguv_loop_sx1_neon
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+ ld1 {v0.16b, v1.16b}, [x6], x7 // luma
+ ld1 {v6.16b}, [x1], x2 // src
+.if \ox
+ ld1 {v20.8b}, [x4], x10 // grain_lut old
+.endif
+.if \oy
+ ld1 {v22.16b}, [x8], x10 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v21.8b}, [x11], x10 // grain_lut top old
+.endif
+ ld1 {v18.16b}, [x5], x10 // grain_lut
+
+ uaddlp v2.8h, v0.16b
+ uaddlp v3.8h, v1.16b
+.if \csfl
+ rshrn v0.8b, v2.8h, #1
+ rshrn2 v0.16b, v3.8h, #1
+.else
+ urshr v2.8h, v2.8h, #1
+ urshr v3.8h, v3.8h, #1
+ uxtl v0.8h, v6.8b
+ uxtl2 v1.8h, v6.16b
+ mul v2.8h, v2.8h, v8.h[0]
+ mul v3.8h, v3.8h, v8.h[0]
+ mul v0.8h, v0.8h, v8.h[1]
+ mul v1.8h, v1.8h, v8.h[1]
+ sqadd v2.8h, v2.8h, v0.8h
+ sqadd v3.8h, v3.8h, v1.8h
+ sshr v2.8h, v2.8h, #6
+ sshr v3.8h, v3.8h, #6
+ add v2.8h, v2.8h, v24.8h
+ add v3.8h, v3.8h, v24.8h
+ sqxtun v0.8b, v2.8h
+ sqxtun2 v0.16b, v3.8h
+.endif
+
+ bl gather16_neon
+
+.if \ox
+ smull v20.8h, v20.8b, v27.8b
+ smlal v20.8h, v18.8b, v28.8b
+.endif
+
+.if \oy
+.if \ox
+ smull v21.8h, v21.8b, v27.8b
+ smlal v21.8h, v22.8b, v28.8b
+ sqrshrn v20.8b, v20.8h, #5
+ sqrshrn v21.8b, v21.8h, #5
+.endif
+
+.if \ox
+ smull v16.8h, v20.8b, v26.8b
+.else
+ smull v16.8h, v18.8b, v26.8b
+.endif
+ smull2 v17.8h, v18.16b, v26.16b
+.if \ox
+ smlal v16.8h, v21.8b, v25.8b
+.else
+ smlal v16.8h, v22.8b, v25.8b
+.endif
+ smlal2 v17.8h, v22.16b, v25.16b
+ sqrshrn v22.8b, v16.8h, #5
+ sqrshrn2 v22.16b, v17.8h, #5
+.endif
+
+ // sxtl of grain
+.if \oy
+ sxtl v16.8h, v22.8b
+ sxtl2 v17.8h, v22.16b
+.elseif \ox
+ sqrshrn v20.8b, v20.8h, #5
+ sxtl2 v17.8h, v18.16b
+ sxtl v16.8h, v20.8b
+.else
+ sxtl v16.8h, v18.8b
+ sxtl2 v17.8h, v18.16b
+.endif
+
+ uxtl v2.8h, v4.8b // scaling
+ uxtl2 v3.8h, v4.16b
+
+ mul v16.8h, v16.8h, v2.8h // scaling * grain
+ mul v17.8h, v17.8h, v3.8h
+
+ srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift)
+ srshl v17.8h, v17.8h, v29.8h
+
+ uaddw v16.8h, v16.8h, v6.8b // *src + noise
+ uaddw2 v17.8h, v17.8h, v6.16b
+
+ sqxtun v0.8b, v16.8h
+ sqxtun2 v0.16b, v17.8h
+
+ umax v0.16b, v0.16b, v30.16b
+ umin v0.16b, v0.16b, v31.16b
+
+.if \oy
+ mov v16.16b, v25.16b
+.endif
+ subs w9, w9, #1
+.if \oy
+ mov v25.16b, v26.16b
+ mov v26.16b, v16.16b
+.endif
+ st1 {v0.16b}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w12, #0
+ mov w9, w12 // restore actual remaining h
+ b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
+.endif
+
+ b 9f
+.endm
+ fguv_loop_sx1 0, 0, 0
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+
+9:
+ ldr d8, [sp, #16]
+ ldr x30, [sp], #32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(fguv_loop_sx1_tbl):
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
+endfunc
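+
+// In the subsampled (sx=1) loops above, each pair of horizontally adjacent
+// luma pixels is averaged with rounding before being used as the scaling
+// index; vertical subsampling is handled by doubling luma_stride. Hedged
+// scalar sketch (identifiers are illustrative):
+//
+//     static uint8_t luma_avg_sx1(uint8_t l0, uint8_t l1) {
+//         return (uint8_t)((l0 + l1 + 1) >> 1);
+//     }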
diff --git a/third_party/dav1d/src/arm/64/filmgrain16.S b/third_party/dav1d/src/arm/64/filmgrain16.S
new file mode 100644
index 0000000000..75252acfb1
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/filmgrain16.S
@@ -0,0 +1,1997 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+.macro increment_seed steps, shift=1
+ lsr w11, w2, #3
+ lsr w12, w2, #12
+ lsr w13, w2, #1
+ eor w11, w2, w11 // (r >> 0) ^ (r >> 3)
+ eor w12, w12, w13 // (r >> 12) ^ (r >> 1)
+ eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr w2, w2, #\steps
+.endif
+ and w11, w11, #((1 << \steps) - 1) // bit
+.if \shift
+ orr w2, w2, w11, lsl #(16 - \steps) // *state
+.else
+ orr w2, w2, w11, lsl #16 // *state
+.endif
+.endm
+
+.macro read_rand dest, bits, age
+ ubfx \dest, x2, #16 - \bits - \age, #\bits
+.endm
+
+.macro read_shift_rand dest, bits
+ ubfx \dest, x2, #17 - \bits, #\bits
+ lsr w2, w2, #1
+.endm
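+
+// These macros implement the AV1 film grain pseudo-random generator: a 16-bit
+// LFSR whose feedback bit is the XOR of bits 0, 1, 3 and 12, unrolled so that
+// several output bits can be produced per invocation. A scalar C sketch of a
+// single step (identifiers are illustrative):
+//
+//     static unsigned get_random_number(int bits, unsigned *state) {
+//         const unsigned r = *state;
+//         unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
+//         *state = (r >> 1) | (bit << 15);
+//         return (*state >> (16 - bits)) & ((1 << bits) - 1);
+//     }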
+
+// special calling convention:
+// w2 holds seed
+// x3 holds dav1d_gaussian_sequence
+// clobbers x11-x15
+// returns in v0.8h
+function get_gaussian_neon
+ increment_seed 4
+ read_rand x14, 11, 3
+ read_rand x15, 11, 2
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ read_rand x14, 11, 1
+ ld1 {v0.h}[1], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 0
+ increment_seed 4
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[2], [x14]
+ read_rand x14, 11, 3
+ ld1 {v0.h}[3], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 2
+ ld1 {v0.h}[4], [x14]
+ add x15, x3, x15, lsl #1
+ read_rand x14, 11, 1
+ ld1 {v0.h}[5], [x15]
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[6], [x14]
+ ld1 {v0.h}[7], [x15]
+ ret
+endfunc
+
+.macro store_grain_row r0, r1, r2, r3, r4, r5
+ st1 {\r0\().16b,\r1\().16b}, [x0], #32
+ st1 {\r2\().16b,\r3\().16b}, [x0], #32
+ st1 {\r4\().16b}, [x0], #16
+ st1 {\r5\().h}[0], [x0], #2
+.endm
+
+function get_grain_2_neon
+ increment_seed 2
+ read_rand x14, 11, 1
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ ld1 {v0.h}[1], [x15]
+ srshl v0.4h, v0.4h, v31.4h
+ ret
+endfunc
+
+.macro get_grain_2 dst
+ bl get_grain_2_neon
+.ifnc \dst, v0
+ mov \dst\().8b, v0.8b
+.endif
+.endm
+
+function get_grain_4_neon
+ increment_seed 4
+ read_rand x14, 11, 3
+ read_rand x15, 11, 2
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ read_rand x14, 11, 1
+ ld1 {v0.h}[1], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 0
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[2], [x14]
+ ld1 {v0.h}[3], [x15]
+ srshl v0.4h, v0.4h, v31.4h
+ ret
+endfunc
+
+.macro get_grain_4 dst
+ bl get_grain_4_neon
+.ifnc \dst, v0
+ mov \dst\().8b, v0.8b
+.endif
+.endm
+
+// w15 holds the number of entries to produce
+// w14, w16 and w17 hold the previous output entries
+// v0 holds the vector of produced entries
+// v1 holds the input vector of sums from above
+.macro output_lag n
+function output_lag\n\()_neon
+1:
+ read_shift_rand x13, 11
+ mov w11, v1.s[0]
+ ldrsh w12, [x3, x13, lsl #1]
+ ext v0.16b, v0.16b, v0.16b, #2
+.if \n == 1
+ madd w11, w14, w4, w11 // sum (above) + *coeff * prev output
+.elseif \n == 2
+ madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1
+ madd w11, w14, w17, w11 // += *coeff * prev output 2
+ mov w16, w14
+.else
+ madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1
+ madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2
+ madd w11, w14, w21, w11 // += *coeff * prev output 3
+ mov w17, w16
+ mov w16, w14
+.endif
+ add w14, w11, w8 // 1 << (ar_coeff_shift - 1)
+ add w12, w12, w10 // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+ asr w14, w14, w7 // >> ar_coeff_shift
+ asr w12, w12, w9 // >> (4 - bitdepth_min_8 + grain_scale_shift)
+ add w14, w14, w12
+ cmp w14, w5
+ csel w14, w14, w5, le
+ cmp w14, w6
+ csel w14, w14, w6, ge
+ subs w15, w15, #1
+ ext v1.16b, v1.16b, v1.16b, #4
+ ins v0.h[7], w14
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
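+
+// Each output_lagN iteration finishes one grain sample: the precomputed sum of
+// the rows above (passed in v1) is combined with the serially dependent terms
+// for the previous outputs, rounded, added to the scaled gaussian value and
+// clamped. A hedged scalar sketch for lag 1 (identifiers are illustrative;
+// grain_shift stands for 4 - bitdepth_min_8 + grain_scale_shift):
+//
+//     static int output_lag1(int sum_above, int prev, int coeff, int gauss,
+//                            int ar_coeff_shift, int grain_shift,
+//                            int grain_min, int grain_max) {
+//         int sum   = sum_above + prev * coeff;
+//         int grain = ((sum + (1 << (ar_coeff_shift - 1))) >> ar_coeff_shift)
+//                   + ((gauss + (1 << (grain_shift - 1))) >> grain_shift);
+//         if (grain > grain_max) grain = grain_max;
+//         if (grain < grain_min) grain = grain_min;
+//         return grain;
+//     }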
+
+
+function sum_lag1_above_neon
+ sub x12, x0, #1*GRAIN_WIDTH*2 - 16
+ ld1 {v18.8h}, [x12] // load top right
+
+ ext v0.16b, v16.16b, v17.16b, #14 // top left, top mid
+ ext v1.16b, v17.16b, v18.16b, #2 // top mid, top right
+
+ smull v4.4s, v17.4h, v28.4h
+ smlal v4.4s, v0.4h, v27.4h
+ smlal v4.4s, v1.4h, v29.4h
+ smull2 v5.4s, v17.8h, v28.8h
+ smlal2 v5.4s, v0.8h, v27.8h
+ smlal2 v5.4s, v1.8h, v29.8h
+
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+
+ ret
+endfunc
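+
+// sum_lag1_above computes, for 8 columns at a time, the contribution of the
+// row above the current one: coeff[0]*top[x-1] + coeff[1]*top[x] +
+// coeff[2]*top[x+1], with v27/v28/v29 holding the broadcast coefficients.
+// Hedged scalar sketch (identifiers are illustrative):
+//
+//     static int32_t sum_lag1_above(const int16_t *top, int x,
+//                                   const int16_t coeff[3]) {
+//         return coeff[0] * top[x - 1] + coeff[1] * top[x]
+//              + coeff[2] * top[x + 1];
+//     }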
+
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
+ bl sum_\lag\()_above_neon
+.ifc \type, uv_420
+ add x12, x19, #GRAIN_WIDTH*2
+ ld1 {v22.8h, v23.8h}, [x19], #32
+ ld1 {v24.8h, v25.8h}, [x12]
+ addp v22.8h, v22.8h, v23.8h
+ addp v23.8h, v24.8h, v25.8h
+ add v22.8h, v22.8h, v23.8h
+ srshr v0.8h, v22.8h, #2
+.endif
+.ifc \type, uv_422
+ ld1 {v22.8h, v23.8h}, [x19], #32
+ addp v22.8h, v22.8h, v23.8h
+ srshr v0.8h, v22.8h, #1
+.endif
+.ifc \type, uv_444
+ ld1 {v0.8h}, [x19], #16
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ dup v1.8b, \uv_coeff
+ sxtl v1.8h, v1.8b
+ smlal v4.4s, v0.4h, v1.4h
+ smlal2 v5.4s, v0.8h, v1.8h
+.else
+ smlal v4.4s, v0.4h, v30.4h
+ smlal2 v5.4s, v0.8h, v30.8h
+.endif
+.endif
+.if \uv_layout && \elems == 8
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 7
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 1
+ b sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+.if \elems > 4
+.ifc \edge, left
+ increment_seed 4
+ read_rand x12, 11, 3
+ read_rand x13, 11, 2
+ read_rand x14, 11, 1
+ add x12, x3, x12, lsl #1
+ add x13, x3, x13, lsl #1
+ add x14, x3, x14, lsl #1
+ ld1 {v0.h}[5], [x12]
+ ld1 {v0.h}[6], [x13]
+ ld1 {v0.h}[7], [x14]
+ lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ srshl v0.8h, v0.8h, v31.8h
+ ext v4.16b, v4.16b, v4.16b, #12
+.ifc \lag, lag3
+ smov w17, v0.h[5]
+.endif
+.ifnc \lag, lag1
+ smov w16, v0.h[6]
+.endif
+ smov w14, v0.h[7]
+
+ mov v1.16b, v4.16b
+ mov w15, #1
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0
+ mov v1.16b, v4.16b
+ mov w15, #4
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ mov v1.16b, v5.16b
+.ifc \edge, right
+ mov w15, #3
+ bl output_\lag\()_neon
+ read_shift_rand x15, 11
+ add x15, x3, x15, lsl #1
+ ld1 {v1.h}[0], [x15]
+ srshl v1.4h, v1.4h, v31.4h
+ ext v0.16b, v0.16b, v1.16b, #2
+.else
+ mov w15, #4
+ bl output_\lag\()_neon
+.endif
+.else
+ // elems == 1
+ increment_seed 4, shift=0
+ mov v1.16b, v4.16b
+ mov w15, #1
+ bl output_\lag\()_neon
+ lsr w2, w2, #3
+
+ read_rand x12, 11, 2
+ read_rand x13, 11, 1
+ read_rand x14, 11, 0
+ add x12, x3, x12, lsl #1
+ add x13, x3, x13, lsl #1
+ add x14, x3, x14, lsl #1
+ ld1 {v1.h}[0], [x12]
+ ld1 {v1.h}[1], [x13]
+ ld1 {v1.h}[2], [x14]
+ srshl v1.4h, v1.4h, v31.4h
+ ext v0.16b, v0.16b, v1.16b, #14
+.endif
+ st1 {v0.8h}, [x0], #16
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.endif
+.endm
+
+.macro sum_lag1_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag1_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+.ifc \edge, left
+ sub x12, x0, #1*GRAIN_WIDTH*2
+ ld1 {v17.8h}, [x12] // load the previous block right above
+.endif
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems
+endfunc
+.endm
+
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 7
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 7
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 1
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 1
+
+
+function sum_lag2_above_neon
+ sub x12, x0, #2*GRAIN_WIDTH*2 - 16
+ sub x13, x0, #1*GRAIN_WIDTH*2 - 16
+ ld1 {v18.8h}, [x12] // load top right
+ ld1 {v21.8h}, [x13]
+
+ dup v26.8b, v30.b[0]
+ ext v22.16b, v16.16b, v17.16b, #12 // top left, top mid
+ dup v27.8b, v30.b[1]
+ ext v23.16b, v16.16b, v17.16b, #14
+ sxtl v26.8h, v26.8b
+ dup v28.8b, v30.b[3]
+ ext v0.16b, v17.16b, v18.16b, #2 // top mid, top right
+ sxtl v27.8h, v27.8b
+ dup v29.8b, v30.b[4]
+ ext v1.16b, v17.16b, v18.16b, #4
+ sxtl v28.8h, v28.8b
+ sxtl v29.8h, v29.8b
+
+ smull v4.4s, v22.4h, v26.4h
+ smlal v4.4s, v23.4h, v27.4h
+ smlal v4.4s, v0.4h, v28.4h
+ smlal v4.4s, v1.4h, v29.4h
+ smull2 v5.4s, v22.8h, v26.8h
+ smlal2 v5.4s, v23.8h, v27.8h
+ smlal2 v5.4s, v0.8h, v28.8h
+ smlal2 v5.4s, v1.8h, v29.8h
+
+ dup v26.16b, v30.b[5]
+ ext v22.16b, v19.16b, v20.16b, #12 // top left, top mid
+ dup v27.16b, v30.b[6]
+ ext v23.16b, v19.16b, v20.16b, #14
+ sxtl v26.8h, v26.8b
+ dup v28.16b, v30.b[8]
+ ext v0.16b, v20.16b, v21.16b, #2 // top mid, top right
+ sxtl v27.8h, v27.8b
+ dup v29.16b, v30.b[9]
+ ext v1.16b, v20.16b, v21.16b, #4
+ sxtl v28.8h, v28.8b
+ sxtl v29.8h, v29.8b
+
+ smlal v4.4s, v22.4h, v26.4h
+ smlal v4.4s, v23.4h, v27.4h
+ smlal v4.4s, v0.4h, v28.4h
+ smlal v4.4s, v1.4h, v29.4h
+ smlal2 v5.4s, v22.8h, v26.8h
+ smlal2 v5.4s, v23.8h, v27.8h
+ smlal2 v5.4s, v0.8h, v28.8h
+ smlal2 v5.4s, v1.8h, v29.8h
+
+ dup v26.16b, v30.b[2]
+ dup v27.16b, v30.b[7]
+ sxtl v26.8h, v26.8b
+ sxtl v27.8h, v27.8b
+
+ smlal v4.4s, v17.4h, v26.4h
+ smlal v4.4s, v20.4h, v27.4h
+ smlal2 v5.4s, v17.8h, v26.8h
+ smlal2 v5.4s, v20.8h, v27.8h
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ ret
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag2_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+.ifc \edge, left
+ sub x12, x0, #2*GRAIN_WIDTH*2
+ sub x13, x0, #1*GRAIN_WIDTH*2
+ ld1 {v17.8h}, [x12] // load the previous block right above
+ ld1 {v20.8h}, [x13]
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, v30.b[12]
+endfunc
+.endm
+
+sum_lag2_func y, 0, left
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 7
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 7
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 1
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 1
+
+
+function sum_lag3_above_neon
+ sub x11, x0, #3*GRAIN_WIDTH*2 - 16
+ sub x12, x0, #2*GRAIN_WIDTH*2 - 16
+ sub x13, x0, #1*GRAIN_WIDTH*2 - 16
+ ld1 {v15.8h}, [x11] // load top right
+ ld1 {v18.8h}, [x12]
+ ld1 {v21.8h}, [x13]
+
+ dup v22.8b, v29.b[0]
+ ext v8.16b, v13.16b, v14.16b, #10 // top left, top mid
+ dup v23.8b, v29.b[1]
+ ext v9.16b, v13.16b, v14.16b, #12
+ sxtl v22.8h, v22.8b
+ dup v24.8b, v29.b[2]
+ sxtl v23.8h, v23.8b
+ dup v25.8b, v29.b[3]
+ ext v10.16b, v13.16b, v14.16b, #14
+ sxtl v24.8h, v24.8b
+ dup v26.8b, v29.b[4]
+ ext v11.16b, v14.16b, v15.16b, #2 // top mid, top right
+ sxtl v25.8h, v25.8b
+ dup v27.8b, v29.b[5]
+ ext v12.16b, v14.16b, v15.16b, #4
+ sxtl v26.8h, v26.8b
+ dup v28.8b, v29.b[6]
+ ext v13.16b, v14.16b, v15.16b, #6
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+
+ smull v4.4s, v8.4h, v22.4h
+ smlal v4.4s, v9.4h, v23.4h
+ smlal v4.4s, v10.4h, v24.4h
+ smlal v4.4s, v11.4h, v26.4h
+ smlal v4.4s, v12.4h, v27.4h
+ smlal v4.4s, v13.4h, v28.4h
+ smlal v4.4s, v14.4h, v25.4h
+ smull2 v5.4s, v8.8h, v22.8h
+ smlal2 v5.4s, v9.8h, v23.8h
+ smlal2 v5.4s, v10.8h, v24.8h
+ smlal2 v5.4s, v11.8h, v26.8h
+ smlal2 v5.4s, v12.8h, v27.8h
+ smlal2 v5.4s, v13.8h, v28.8h
+ smlal2 v5.4s, v14.8h, v25.8h
+
+ dup v22.8b, v29.b[7]
+ ext v8.16b, v16.16b, v17.16b, #10 // top left, top mid
+ dup v23.8b, v29.b[8]
+ ext v9.16b, v16.16b, v17.16b, #12
+ sxtl v22.8h, v22.8b
+ dup v24.8b, v29.b[9]
+ sxtl v23.8h, v23.8b
+ dup v25.8b, v29.b[10]
+ ext v10.16b, v16.16b, v17.16b, #14
+ sxtl v24.8h, v24.8b
+ dup v26.8b, v29.b[11]
+ ext v11.16b, v17.16b, v18.16b, #2 // top mid, top right
+ sxtl v25.8h, v25.8b
+ dup v27.8b, v29.b[12]
+ ext v12.16b, v17.16b, v18.16b, #4
+ sxtl v26.8h, v26.8b
+ dup v28.8b, v29.b[13]
+ ext v13.16b, v17.16b, v18.16b, #6
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+
+ smlal v4.4s, v8.4h, v22.4h
+ smlal v4.4s, v9.4h, v23.4h
+ smlal v4.4s, v10.4h, v24.4h
+ smlal v4.4s, v11.4h, v26.4h
+ smlal v4.4s, v12.4h, v27.4h
+ smlal v4.4s, v13.4h, v28.4h
+ smlal v4.4s, v17.4h, v25.4h
+ smlal2 v5.4s, v8.8h, v22.8h
+ smlal2 v5.4s, v9.8h, v23.8h
+ smlal2 v5.4s, v10.8h, v24.8h
+ smlal2 v5.4s, v11.8h, v26.8h
+ smlal2 v5.4s, v12.8h, v27.8h
+ smlal2 v5.4s, v13.8h, v28.8h
+ smlal2 v5.4s, v17.8h, v25.8h
+
+ dup v22.8b, v29.b[14]
+ ext v8.16b, v19.16b, v20.16b, #10 // top left, top mid
+ dup v23.8b, v29.b[15]
+ ext v9.16b, v19.16b, v20.16b, #12
+ sxtl v22.8h, v22.8b
+ dup v24.8b, v30.b[0]
+ sxtl v23.8h, v23.8b
+ dup v25.8b, v30.b[1]
+ ext v10.16b, v19.16b, v20.16b, #14
+ sxtl v24.8h, v24.8b
+ dup v26.8b, v30.b[2]
+ ext v11.16b, v20.16b, v21.16b, #2 // top mid, top right
+ sxtl v25.8h, v25.8b
+ dup v27.8b, v30.b[3]
+ ext v12.16b, v20.16b, v21.16b, #4
+ sxtl v26.8h, v26.8b
+ dup v28.8b, v30.b[4]
+ ext v13.16b, v20.16b, v21.16b, #6
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+
+ smlal v4.4s, v8.4h, v22.4h
+ smlal v4.4s, v9.4h, v23.4h
+ smlal v4.4s, v10.4h, v24.4h
+ smlal v4.4s, v11.4h, v26.4h
+ smlal v4.4s, v12.4h, v27.4h
+ smlal v4.4s, v13.4h, v28.4h
+ smlal v4.4s, v20.4h, v25.4h
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+ smlal2 v5.4s, v8.8h, v22.8h
+ smlal2 v5.4s, v9.8h, v23.8h
+ smlal2 v5.4s, v10.8h, v24.8h
+ smlal2 v5.4s, v11.8h, v26.8h
+ smlal2 v5.4s, v12.8h, v27.8h
+ smlal2 v5.4s, v13.8h, v28.8h
+ smlal2 v5.4s, v20.8h, v25.8h
+
+ mov v13.16b, v14.16b
+ mov v14.16b, v15.16b
+
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ ret
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag3_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+.ifc \edge, left
+ sub x11, x0, #3*GRAIN_WIDTH*2
+ sub x12, x0, #2*GRAIN_WIDTH*2
+ sub x13, x0, #1*GRAIN_WIDTH*2
+ ld1 {v14.8h}, [x11] // load the previous block right above
+ ld1 {v17.8h}, [x12]
+ ld1 {v20.8h}, [x13]
+.endif
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, v30.b[8]
+endfunc
+.endm
+
+sum_lag3_func y, 0, left
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 7
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 7
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 1
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 1
+
+function generate_grain_rows_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+1:
+ mov w16, #80
+2:
+ bl get_gaussian_neon
+ srshl v0.8h, v0.8h, v31.8h
+ subs w16, w16, #8
+ st1 {v0.8h}, [x0], #16
+ b.gt 2b
+ get_grain_2 v0
+ subs w1, w1, #1
+ st1 {v0.s}[0], [x0], #4
+ b.gt 1b
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function generate_grain_rows_44_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+1:
+ mov w16, #40
+2:
+ bl get_gaussian_neon
+ srshl v0.8h, v0.8h, v31.8h
+ subs w16, w16, #8
+ st1 {v0.8h}, [x0], #16
+ b.gt 2b
+ get_grain_4 v0
+ subs w1, w1, #1
+ st1 {v0.4h}, [x0]
+ add x0, x0, #GRAIN_WIDTH*2-80
+ b.gt 1b
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function gen_grain_uv_444_lag0_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ld1 {v4.8h}, [x19], #16
+gen_grain_uv_lag0_8_start:
+ bl get_gaussian_neon
+ srshl v0.8h, v0.8h, v31.8h
+gen_grain_uv_lag0_8_add:
+ and v4.16b, v4.16b, v1.16b
+ smull v2.4s, v4.4h, v27.4h
+ smull2 v3.4s, v4.8h, v27.8h
+ srshl v2.4s, v2.4s, v28.4s
+ srshl v3.4s, v3.4s, v28.4s
+ sqxtn v2.4h, v2.4s
+ sqxtn2 v2.8h, v3.4s
+ sqadd v2.8h, v2.8h, v0.8h
+ smin v2.8h, v2.8h, v25.8h
+ smax v2.8h, v2.8h, v26.8h
+ st1 {v2.8h}, [x0], #16
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
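+
+// gen_grain_uv_444_lag0_neon applies the single lag-0 chroma AR coefficient to
+// the co-located luma grain (masked at the block edges via v1), rounds by
+// ar_coeff_shift, adds the fresh gaussian grain and clamps to the grain range.
+// Hedged scalar sketch (identifiers are illustrative):
+//
+//     static int16_t uv_lag0_sample(int16_t luma_grain, int16_t gauss,
+//                                   int coeff, int ar_coeff_shift,
+//                                   int grain_min, int grain_max) {
+//         int g = ((luma_grain * coeff + (1 << (ar_coeff_shift - 1)))
+//                  >> ar_coeff_shift) + gauss;
+//         if (g > grain_max) g = grain_max;
+//         if (g < grain_min) g = grain_min;
+//         return (int16_t)g;
+//     }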
+
+function gen_grain_uv_420_lag0_8_neon
+ AARCH64_SIGN_LINK_REGISTER
+ add x12, x19, #GRAIN_WIDTH*2
+ str x30, [sp, #-16]!
+ ld1 {v16.8h, v17.8h}, [x19], #32
+ ld1 {v18.8h, v19.8h}, [x12]
+ addp v16.8h, v16.8h, v17.8h
+ addp v17.8h, v18.8h, v19.8h
+ add v16.8h, v16.8h, v17.8h
+ srshr v4.8h, v16.8h, #2
+ b gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_422_lag0_8_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ld1 {v16.8h, v17.8h}, [x19], #32
+ addp v16.8h, v16.8h, v17.8h
+ srshr v4.8h, v16.8h, #1
+ b gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_420_lag0_4_neon
+ add x12, x19, #GRAIN_WIDTH*2
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ld1 {v16.4h, v17.4h}, [x19]
+ ld1 {v18.4h, v19.4h}, [x12]
+ add x19, x19, #32
+ addp v16.4h, v16.4h, v17.4h
+ addp v17.4h, v18.4h, v19.4h
+ add v16.4h, v16.4h, v17.4h
+ srshr v4.4h, v16.4h, #2
+ get_grain_4 v0
+ b gen_grain_uv_lag0_8_add
+endfunc
+
+function gen_grain_uv_422_lag0_4_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ld1 {v16.4h, v17.4h}, [x19]
+ add x19, x19, #32
+ addp v16.4h, v16.4h, v17.4h
+ srshr v4.4h, v16.4h, #1
+ get_grain_4 v0
+ b gen_grain_uv_lag0_8_add
+endfunc
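+
+// The lag-0 helpers above downsample the co-located luma grain before it is
+// fed into gen_grain_uv_lag0_8_add: 420 averages a 2x2 block (rounded shift
+// by 2), 422 averages a horizontal pair (rounded shift by 1). Hedged scalar
+// sketch for the 420 case (identifiers are illustrative):
+//
+//     static int16_t luma_grain_avg_420(const int16_t *luma, ptrdiff_t stride,
+//                                       int x) {
+//         int sum = luma[x] + luma[x + 1] +
+//                   luma[x + stride] + luma[x + stride + 1];
+//         return (int16_t)((sum + 2) >> 2);
+//     }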
+
+.macro gen_grain_82 type
+function generate_grain_\type\()_16bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x30, x19, [sp, #-96]!
+
+.ifc \type, uv_444
+ mov w13, w3
+ mov w14, #28
+ add x19, x1, #3*GRAIN_WIDTH*2
+ mov x1, x2
+ mul w13, w13, w14
+ clz w15, w4
+.else
+ clz w15, w2
+.endif
+ movrel x3, X(gaussian_sequence)
+ sub w15, w15, #24 // -bitdepth_min_8
+ ldr w2, [x1, #FGD_SEED]
+ ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add x4, x1, #FGD_AR_COEFFS_Y
+.else
+ add x4, x1, #FGD_AR_COEFFS_UV
+.endif
+ add w9, w9, w15 // grain_scale_shift - bitdepth_min_8
+ adr x16, L(gen_grain_\type\()_tbl)
+ ldr w17, [x1, #FGD_AR_COEFF_LAG]
+ add w9, w9, #4
+ ldrh w17, [x16, w17, uxtw #1]
+ dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
+ sub x16, x16, w17, uxtw
+ neg v31.8h, v31.8h
+
+.ifc \type, uv_444
+ cmp w13, #0
+ mov w11, #0x49d8
+ mov w14, #0xb524
+ add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
+ csel w11, w11, w14, ne
+.endif
+
+ ldr w7, [x1, #FGD_AR_COEFF_SHIFT]
+ neg w15, w15 // bitdepth_min_8
+ mov w8, #1
+ mov w10, #1
+ lsl w8, w8, w7 // 1 << ar_coeff_shift
+ lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift)
+ lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ mov w5, #128
+ lsl w5, w5, w15 // 128 << bitdepth_min_8
+ neg w6, w5 // -(128 << bitdepth_min_8)
+ sub w5, w5, #1 // (128 << bitdepth_min_8) - 1
+
+.ifc \type, uv_444
+ eor w2, w2, w11
+.endif
+
+ br x16
+
+L(generate_grain_\type\()_lag0):
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, y
+ mov w1, #GRAIN_HEIGHT
+ bl generate_grain_rows_neon
+.else
+ dup v28.4s, w7
+ ld1r {v27.8b}, [x4] // ar_coeffs_uv[0]
+ movi v0.16b, #0
+ movi v1.16b, #255
+ dup v25.8h, w5
+ dup v26.8h, w6
+ ext v29.16b, v0.16b, v1.16b, #10
+ ext v30.16b, v1.16b, v0.16b, #2
+ neg v28.4s, v28.4s
+ sxtl v27.8h, v27.8b
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+ mov w1, #GRAIN_HEIGHT-3
+1:
+ mov v1.16b, v29.16b
+ bl gen_grain_uv_444_lag0_neon // 8
+ movi v1.16b, #255
+ bl gen_grain_uv_444_lag0_neon // 16
+ bl gen_grain_uv_444_lag0_neon // 24
+ bl gen_grain_uv_444_lag0_neon // 32
+ bl gen_grain_uv_444_lag0_neon // 40
+ bl gen_grain_uv_444_lag0_neon // 48
+ bl gen_grain_uv_444_lag0_neon // 56
+ bl gen_grain_uv_444_lag0_neon // 64
+ bl gen_grain_uv_444_lag0_neon // 72
+ mov v1.16b, v30.16b
+ bl gen_grain_uv_444_lag0_neon // 80
+ get_grain_2 v16
+ subs w1, w1, #1
+ add x19, x19, #4
+ st1 {v16.s}[0], [x0], #4
+ b.gt 1b
+.endif
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag1):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v27.8b}, [x4], #1 // ar_coeffs_y[0]
+ ld1r {v28.8b}, [x4], #1 // ar_coeffs_y[1]
+ ld1r {v29.8b}, [x4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb w4, [x4, #1] // ar_coeffs_y[3]
+.else
+ add x4, x4, #2
+.endif
+
+ mov w1, #3
+.ifc \type, uv_444
+ ld1r {v30.8b}, [x4] // ar_coeffs_uv[4]
+ ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+ sxtl v29.8h, v29.8b
+.ifc \type, uv_444
+ sxtl v30.8h, v30.8b
+.endif
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag1_left_neon // 8
+ bl sum_\type\()_lag1_mid_neon // 16
+ bl sum_\type\()_lag1_mid_neon // 24
+ bl sum_\type\()_lag1_mid_neon // 32
+ bl sum_\type\()_lag1_mid_neon // 40
+ bl sum_\type\()_lag1_mid_neon // 48
+ bl sum_\type\()_lag1_mid_neon // 56
+ bl sum_\type\()_lag1_mid_neon // 64
+ bl sum_\type\()_lag1_mid_neon // 72
+ bl sum_\type\()_lag1_right_neon // 80
+ get_grain_2 v16
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #4
+.endif
+ st1 {v16.s}[0], [x0], #4
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag2):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ smov w4, v30.b[10]
+ smov w17, v30.b[11]
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag2_left_neon // 8
+ bl sum_\type\()_lag2_mid_neon // 16
+ bl sum_\type\()_lag2_mid_neon // 24
+ bl sum_\type\()_lag2_mid_neon // 32
+ bl sum_\type\()_lag2_mid_neon // 40
+ bl sum_\type\()_lag2_mid_neon // 48
+ bl sum_\type\()_lag2_mid_neon // 56
+ bl sum_\type\()_lag2_mid_neon // 64
+ bl sum_\type\()_lag2_mid_neon // 72
+ bl sum_\type\()_lag2_right_neon // 80
+ get_grain_2 v16
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #4
+.endif
+ st1 {v16.s}[0], [x0], #4
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag3):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x20, x21, [sp, #80]
+
+ smov w4, v30.b[5]
+ smov w20, v30.b[6]
+ smov w21, v30.b[7]
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag3_left_neon // 8
+ bl sum_\type\()_lag3_mid_neon // 16
+ bl sum_\type\()_lag3_mid_neon // 24
+ bl sum_\type\()_lag3_mid_neon // 32
+ bl sum_\type\()_lag3_mid_neon // 40
+ bl sum_\type\()_lag3_mid_neon // 48
+ bl sum_\type\()_lag3_mid_neon // 56
+ bl sum_\type\()_lag3_mid_neon // 64
+ bl sum_\type\()_lag3_mid_neon // 72
+ bl sum_\type\()_lag3_right_neon // 80
+ get_grain_2 v16
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #4
+.endif
+ st1 {v16.s}[0], [x0], #4
+ b.gt 1b
+
+ ldp x20, x21, [sp, #80]
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(gen_grain_\type\()_tbl):
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type
+.ifc \type, uv_420
+ mov \dst, #SUB_GRAIN_HEIGHT-3
+.else
+ mov \dst, #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type
+.ifc \type, uv_420
+ add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32)
+.else
+ sub \reg, \reg, #6*32-GRAIN_WIDTH*2
+.endif
+.endm
+
+.macro gen_grain_44 type
+function generate_grain_\type\()_16bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x30, x19, [sp, #-96]!
+
+ mov w13, w3
+ mov w14, #28
+ add x19, x1, #(3*GRAIN_WIDTH-3)*2
+ mov x1, x2
+ mul w13, w13, w14
+ clz w15, w4
+
+ movrel x3, X(gaussian_sequence)
+ sub w15, w15, #24 // -bitdepth_min_8
+ ldr w2, [x1, #FGD_SEED]
+ ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
+ add x4, x1, #FGD_AR_COEFFS_UV
+ add w9, w9, w15 // grain_scale_shift - bitdepth_min_8
+ adr x16, L(gen_grain_\type\()_tbl)
+ ldr w17, [x1, #FGD_AR_COEFF_LAG]
+ add w9, w9, #4
+ ldrh w17, [x16, w17, uxtw #1]
+ dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
+ sub x16, x16, w17, uxtw
+ neg v31.8h, v31.8h
+
+ cmp w13, #0
+ mov w11, #0x49d8
+ mov w14, #0xb524
+ add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
+ csel w11, w11, w14, ne
+
+ ldr w7, [x1, #FGD_AR_COEFF_SHIFT]
+ neg w15, w15 // bitdepth_min_8
+ mov w8, #1
+ mov w10, #1
+ lsl w8, w8, w7 // 1 << ar_coeff_shift
+ lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift)
+ lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ mov w5, #128
+ lsl w5, w5, w15 // 128 << bitdepth_min_8
+ neg w6, w5 // -(128 << bitdepth_min_8)
+ sub w5, w5, #1 // (128 << bitdepth_min_8) - 1
+
+ eor w2, w2, w11
+
+ br x16
+
+L(generate_grain_\type\()_lag0):
+ AARCH64_VALID_JUMP_TARGET
+ dup v28.4s, w7
+ ld1r {v27.8b}, [x4] // ar_coeffs_uv[0]
+ movi v0.16b, #0
+ movi v1.16b, #255
+ dup v25.8h, w5
+ dup v26.8h, w6
+ ext v29.16b, v0.16b, v1.16b, #10
+ ext v30.16b, v1.16b, v0.16b, #14
+ neg v28.4s, v28.4s
+ sxtl v27.8h, v27.8b
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+ set_height w1, \type
+1:
+ mov v1.16b, v29.16b
+ bl gen_grain_\type\()_lag0_8_neon // 8
+ movi v1.16b, #255
+ bl gen_grain_\type\()_lag0_8_neon // 16
+ bl gen_grain_\type\()_lag0_8_neon // 24
+ bl gen_grain_\type\()_lag0_8_neon // 32
+ bl gen_grain_\type\()_lag0_8_neon // 40
+ mov v1.16b, v30.16b
+ bl gen_grain_\type\()_lag0_4_neon // 44
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH*2-6*16
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag1):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v27.8b}, [x4], #1 // ar_coeffs_uv[0]
+ ld1r {v28.8b}, [x4], #1 // ar_coeffs_uv[1]
+ ld1r {v29.8b}, [x4] // ar_coeffs_uv[2]
+ add x4, x4, #2
+
+ mov w1, #3
+ ld1r {v30.8b}, [x4] // ar_coeffs_uv[4]
+ ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon
+
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+ sxtl v29.8h, v29.8b
+ sxtl v30.8h, v30.8b
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag1_left_neon // 8
+ bl sum_\type\()_lag1_mid_neon // 16
+ bl sum_\type\()_lag1_mid_neon // 24
+ bl sum_\type\()_lag1_mid_neon // 32
+ bl sum_\type\()_lag1_mid_neon // 40
+ bl sum_\type\()_lag1_right_neon // 44
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH*2-6*16
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag2):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12]
+
+ smov w4, v30.b[10]
+ smov w17, v30.b[11]
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag2_left_neon // 8
+ bl sum_\type\()_lag2_mid_neon // 16
+ bl sum_\type\()_lag2_mid_neon // 24
+ bl sum_\type\()_lag2_mid_neon // 32
+ bl sum_\type\()_lag2_mid_neon // 40
+ bl sum_\type\()_lag2_right_neon // 44
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH*2-6*16
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag3):
+ AARCH64_VALID_JUMP_TARGET
+ ldr q29, [x4] // ar_coeffs_uv[0-15]
+ ldr q30, [x4, #16] // ar_coeffs_uv[16-24]
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x20, x21, [sp, #80]
+
+ smov w4, v30.b[5]
+ smov w20, v30.b[6]
+ smov w21, v30.b[7]
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag3_left_neon // 8
+ bl sum_\type\()_lag3_mid_neon // 16
+ bl sum_\type\()_lag3_mid_neon // 24
+ bl sum_\type\()_lag3_mid_neon // 32
+ bl sum_\type\()_lag3_mid_neon // 40
+ bl sum_\type\()_lag3_right_neon // 44
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH*2-6*16
+ b.gt 1b
+
+ ldp x20, x21, [sp, #80]
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(gen_grain_\type\()_tbl):
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
+.macro gather_interleaved dst1, dst2, src1, src2, off
+ umov w14, \src1[0]
+ umov w15, \src2[1]
+ umov w16, \src1[2]
+ add x14, x14, x3
+ umov w17, \src2[3]
+ add x15, x15, x3
+ ld1 {\dst1}[0+\off], [x14]
+ umov w14, \src1[4]
+ add x16, x16, x3
+ ld1 {\dst2}[1+\off], [x15]
+ umov w15, \src2[5]
+ add x17, x17, x3
+ ld1 {\dst1}[2+\off], [x16]
+ umov w16, \src1[6]
+ add x14, x14, x3
+ ld1 {\dst2}[3+\off], [x17]
+ umov w17, \src2[7]
+ add x15, x15, x3
+ ld1 {\dst1}[4+\off], [x14]
+ add x16, x16, x3
+ ld1 {\dst2}[5+\off], [x15]
+ add x17, x17, x3
+ ld1 {\dst1}[6+\off], [x16]
+ ld1 {\dst2}[7+\off], [x17]
+.endm
+
+.macro gather dst1, dst2, src1, src2, src3, src4
+ gather_interleaved \dst1, \dst2, \src1, \src3, 0
+ gather_interleaved \dst2, \dst1, \src3, \src1, 0
+ gather_interleaved \dst1, \dst2, \src2, \src4, 8
+ gather_interleaved \dst2, \dst1, \src4, \src2, 8
+.endm
+
+function gather32_neon
+ gather v6.b, v7.b, v0.h, v1.h, v2.h, v3.h
+ ret
+endfunc
+
+function gather16_neon
+ gather_interleaved v6.b, v7.b, v0.h, v1.h, 0
+ gather_interleaved v7.b, v6.b, v1.h, v0.h, 0
+ ins v6.d[1], v7.d[0]
+ ret
+endfunc
+
+const overlap_coeffs_0, align=4
+ .short 27, 17, 0, 0
+ .short 17, 27, 32, 32
+endconst
+
+const overlap_coeffs_1, align=4
+ .short 23, 0, 0, 0
+ .short 22, 32, 32, 32
+endconst
+
+.macro calc_offset offx, offy, src, sx, sy
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride
+ madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx, uxtw #1 // grain_lut += offx
+.endm
+
+// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type,
+// const int bitdepth_max);
+function fgy_32x32_16bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-80]!
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ str d14, [sp, #64]
+ eor w4, w4, #15 // 15 - scaling_shift
+ ldr w11, [x6, #8] // offsets[1][0]
+ ldr w13, [x6, #4] // offsets[0][1]
+ ldr w15, [x6, #12] // offsets[1][1]
+ ldr w10, [sp, #96] // bitdepth_max
+ ldr w6, [x6] // offsets[0][0]
+ dup v26.8h, w10 // bitdepth_max
+ clz w10, w10
+ ldr w8, [sp, #80] // clip
+ sub w10, w10, #24 // -bitdepth_min_8
+ mov x9, #GRAIN_WIDTH*2 // grain_lut stride
+ neg w10, w10 // bitdepth_min_8
+
+ dup v29.8h, w4 // 15 - scaling_shift
+ dup v27.8h, w10 // bitdepth_min_8
+
+ movrel x16, overlap_coeffs_0
+
+ cbz w8, 1f
+ // clip
+ movi v30.8h, #16
+ movi v31.8h, #235
+ sshl v30.8h, v30.8h, v27.8h
+ sshl v31.8h, v31.8h, v27.8h
+ b 2f
+1:
+ // no clip
+ movi v30.8h, #0
+ mov v31.16b, v26.16b // bitdepth_max
+2:
+
+ ushr v26.8h, v26.8h, #1 // grain_max
+ not v25.16b, v26.16b // grain_min
+
+ ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs
+
+ add x5, x5, #18 // grain_lut += 9
+ add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride
+ add x5, x5, x9 // grain_lut += grain_stride
+
+ calc_offset w11, w12, w11, 0, 0
+ calc_offset w13, w14, w13, 0, 0
+ calc_offset w15, w16, w15, 0, 0
+ calc_offset w6, w10, w6, 0, 0
+
+ add_offset x12, w11, x12, x5, x9
+ add_offset x14, w13, x14, x5, x9
+ add_offset x16, w15, x16, x5, x9
+ add_offset x5, w6, x10, x5, x9
+
+ ldr w11, [sp, #88] // type
+ adr x13, L(fgy_loop_tbl)
+
+ add x4, x12, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
+ add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+
+ tst w11, #1
+ ldrh w11, [x13, w11, uxtw #1]
+
+ add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add x8, x8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
+
+ sub x11, x13, w11, uxtw
+
+ b.eq 1f
+ // y overlap
+ dup v8.8h, v27.h[0]
+ dup v9.8h, v27.h[1]
+ mov w10, w7 // backup actual h
+ mov w7, #2
+1:
+ br x11
+endfunc
+
+function fgy_loop_neon
+.macro fgy ox, oy
+L(loop_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // src
+.if \ox
+ ld1 {v20.4h}, [x4], x9 // grain_lut old
+.endif
+.if \oy
+ ld1 {v21.8h, v22.8h, v23.8h, v24.8h}, [x6], x9 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v14.4h}, [x8], x9 // grain_lut top old
+.endif
+ mvni v4.8h, #0xf0, lsl #8 // 0x0fff
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x9 // grain_lut
+
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ and v0.16b, v0.16b, v4.16b
+ and v1.16b, v1.16b, v4.16b
+ and v2.16b, v2.16b, v4.16b
+ and v3.16b, v3.16b, v4.16b
+ bl gather32_neon
+
+.if \ox
+ smull v20.4s, v20.4h, v27.4h
+ smlal v20.4s, v16.4h, v28.4h
+.endif
+
+.if \oy
+.if \ox
+ smull v14.4s, v14.4h, v27.4h
+ smlal v14.4s, v21.4h, v28.4h
+ sqrshrn v20.4h, v20.4s, #5
+ sqrshrn v14.4h, v14.4s, #5
+ smin v20.4h, v20.4h, v26.4h
+ smin v14.4h, v14.4h, v26.4h
+ smax v20.4h, v20.4h, v25.4h
+ smax v14.4h, v14.4h, v25.4h
+.endif
+
+.if \ox
+ smull v10.4s, v20.4h, v9.4h
+.else
+ smull v10.4s, v16.4h, v9.4h
+.endif
+ smull2 v11.4s, v16.8h, v9.8h
+ smull v12.4s, v17.4h, v9.4h
+ smull2 v13.4s, v17.8h, v9.8h
+ smull v16.4s, v18.4h, v9.4h
+ smull2 v17.4s, v18.8h, v9.8h
+ smull v18.4s, v19.4h, v9.4h
+ smull2 v19.4s, v19.8h, v9.8h
+.if \ox
+ smlal v10.4s, v14.4h, v8.4h
+.else
+ smlal v10.4s, v21.4h, v8.4h
+.endif
+ smlal2 v11.4s, v21.8h, v8.8h
+ smlal v12.4s, v22.4h, v8.4h
+ smlal2 v13.4s, v22.8h, v8.8h
+ smlal v16.4s, v23.4h, v8.4h
+ smlal2 v17.4s, v23.8h, v8.8h
+ smlal v18.4s, v24.4h, v8.4h
+ smlal2 v19.4s, v24.8h, v8.8h
+ sqrshrn v10.4h, v10.4s, #5
+ sqrshrn2 v10.8h, v11.4s, #5
+ sqrshrn v11.4h, v12.4s, #5
+ sqrshrn2 v11.8h, v13.4s, #5
+ sqrshrn v12.4h, v16.4s, #5
+ sqrshrn2 v12.8h, v17.4s, #5
+ sqrshrn v13.4h, v18.4s, #5
+ sqrshrn2 v13.8h, v19.4s, #5
+ smin v16.8h, v10.8h, v26.8h
+ smin v17.8h, v11.8h, v26.8h
+ smin v18.8h, v12.8h, v26.8h
+ smin v19.8h, v13.8h, v26.8h
+ smax v16.8h, v16.8h, v25.8h
+ smax v17.8h, v17.8h, v25.8h
+ smax v18.8h, v18.8h, v25.8h
+ smax v19.8h, v19.8h, v25.8h
+.endif
+
+ uxtl v4.8h, v6.8b // scaling
+.if \ox && !\oy
+ sqrshrn v20.4h, v20.4s, #5
+.endif
+ uxtl2 v5.8h, v6.16b
+.if \ox && !\oy
+ smin v20.4h, v20.4h, v26.4h
+.endif
+ uxtl v6.8h, v7.8b
+.if \ox && !\oy
+ smax v20.4h, v20.4h, v25.4h
+.endif
+ uxtl2 v7.8h, v7.16b
+.if \ox && !\oy
+ ins v16.d[0], v20.d[0]
+.endif
+ ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
+ ushl v5.8h, v5.8h, v29.8h
+ ushl v6.8h, v6.8h, v29.8h
+ ushl v7.8h, v7.8h, v29.8h
+
+ sqrdmulh v20.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ sqrdmulh v21.8h, v17.8h, v5.8h
+ sqrdmulh v22.8h, v18.8h, v6.8h
+ sqrdmulh v23.8h, v19.8h, v7.8h
+
+ usqadd v0.8h, v20.8h // *src + noise
+ usqadd v1.8h, v21.8h
+ usqadd v2.8h, v22.8h
+ usqadd v3.8h, v23.8h
+
+ umax v0.8h, v0.8h, v30.8h
+ umax v1.8h, v1.8h, v30.8h
+ umax v2.8h, v2.8h, v30.8h
+ umax v3.8h, v3.8h, v30.8h
+ umin v0.8h, v0.8h, v31.8h
+ umin v1.8h, v1.8h, v31.8h
+ umin v2.8h, v2.8h, v31.8h
+ umin v3.8h, v3.8h, v31.8h
+
+ subs w7, w7, #1
+.if \oy
+ dup v8.8h, v28.h[0]
+ dup v9.8h, v28.h[1]
+.endif
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w10, #2
+ sub w7, w10, #2 // restore actual remaining h
+ b.gt L(loop_\ox\()0)
+.endif
+ ldr d14, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldr x30, [sp], #80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+
+L(fgy_loop_tbl):
+ .hword L(fgy_loop_tbl) - L(loop_00)
+ .hword L(fgy_loop_tbl) - L(loop_01)
+ .hword L(fgy_loop_tbl) - L(loop_10)
+ .hword L(fgy_loop_tbl) - L(loop_11)
+endfunc
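+
+// The 16 bpc loop avoids a full 32-bit widening multiply for scaling * grain
+// by pre-shifting the scaling value and letting sqrdmulh do the rounding:
+// sqrdmulh(a, b) is round2(a * b, 15), so with b = scaling << (15 - scaling_shift)
+// the result equals round2(grain * scaling, scaling_shift). Hedged scalar
+// sketch (identifiers are illustrative):
+//
+//     static int16_t noise_16bpc(int16_t grain, int scaling, int scaling_shift) {
+//         int32_t b = scaling << (15 - scaling_shift);
+//         return (int16_t)(((int64_t)grain * b * 2 + (1 << 15)) >> 16);
+//     }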
+
+// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type,
+// const int bitdepth_max);
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_16bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-80]!
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+
+ ldp x8, x9, [sp, #80] // offsets, h
+ ldp x10, x11, [sp, #96] // uv, is_id
+ ldr w16, [sp, #120] // bitdepth_max
+
+ ldr w13, [x4, #FGD_SCALING_SHIFT]
+ ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ dup v23.8h, w16 // bitdepth_max
+ clz w16, w16
+ eor w13, w13, #15 // 15 - scaling_shift
+ sub w16, w16, #24 // -bitdepth_min_8
+
+ // !csfl
+ add x10, x4, x10, lsl #2 // + 4*uv
+ add x14, x10, #FGD_UV_LUMA_MULT
+ add x15, x10, #FGD_UV_MULT
+ add x10, x10, #FGD_UV_OFFSET
+ neg w16, w16 // bitdepth_min_8
+ ld1r {v8.8h}, [x14] // uv_luma_mult
+ ld1r {v24.8h}, [x10] // uv_offset
+ ld1r {v9.8h}, [x15] // uv_mult
+
+ dup v29.8h, w13 // 15 - scaling_shift
+ dup v27.8h, w16 // bitdepth_min_8
+
+ cbz w12, 1f
+ // clip
+ movi v30.8h, #16
+ movi v31.8h, #240
+ sshl v30.8h, v30.8h, v27.8h
+ sshl v31.8h, v31.8h, v27.8h
+ cbz w11, 2f
+ // is_id
+ movi v31.8h, #235
+ sshl v31.8h, v31.8h, v27.8h
+ b 2f
+1:
+ // no clip
+ movi v30.8h, #0
+ mov v31.16b, v23.16b // bitdepth_max
+2:
+
+ ushr v15.8h, v23.8h, #1 // grain_max
+ sshl v24.8h, v24.8h, v27.8h // uv_offset << bitdepth_min_8
+ not v14.16b, v15.16b // grain_min
+
+ ldr w12, [x8, #8] // offsets[1][0]
+ ldr w14, [x8, #4] // offsets[0][1]
+ ldr w16, [x8, #12] // offsets[1][1]
+ ldr w8, [x8] // offsets[0][0]
+
+ mov x10, #GRAIN_WIDTH*2 // grain_lut stride
+
+ add x5, x5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
+.if \sy
+ add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride
+ add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride
+ add x5, x5, x10 // grain_lut += grain_stride
+.endif
+
+ calc_offset w12, w13, w12, \sx, \sy
+ calc_offset w14, w15, w14, \sx, \sy
+ calc_offset w16, w17, w16, \sx, \sy
+ calc_offset w8, w11, w8, \sx, \sy
+
+ add_offset x13, w12, x13, x5, x10
+ add_offset x15, w14, x15, x5, x10
+ add_offset x17, w16, x17, x5, x10
+ add_offset x5, w8, x11, x5, x10
+
+ add x4, x13, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
+ add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add x11, x11, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
+
+ ldr w13, [sp, #112] // type
+
+ movrel x16, overlap_coeffs_\sx
+ adr x14, L(fguv_loop_sx\sx\()_tbl)
+
+ ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs
+ tst w13, #1
+ ldrh w13, [x14, w13, uxtw #1]
+
+ b.eq 1f
+ // y overlap
+ sub w12, w9, #(2 >> \sy) // backup remaining h
+ mov w9, #(2 >> \sy)
+
+1:
+ sub x13, x14, w13, uxtw
+
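+ // Vertical overlap blend weights for the overlapping rows: 23/22 when the
+ // chroma plane is vertically subsampled (a single overlapping row), 27/17
+ // otherwise, matching the AV1 film grain overlap coefficients.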
+.if \sy
+ movi v25.8h, #23
+ movi v26.8h, #22
+.else
+ movi v25.8h, #27
+ movi v26.8h, #17
+.endif
+
+.if \sy
+ add x7, x7, x7 // luma_stride *= 2
+.endif
+
+ br x13
+endfunc
+.endm
+
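+// One entry point per chroma layout; sx/sy are the horizontal and vertical
+// chroma subsampling flags (420: 1,1; 422: 1,0; 444: 0,0).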
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+.if \ox
+ ld1 {v4.4h}, [x4], x10 // grain_lut old
+.endif
+.if \oy
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], x10 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v5.4h}, [x11], x10 // grain_lut top old
+.endif
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x10 // grain_lut
+
+.if \ox
+ smull v4.4s, v4.4h, v27.4h
+ smlal v4.4s, v16.4h, v28.4h
+.endif
+
+.if \oy
+.if \ox
+ smull v5.4s, v5.4h, v27.4h
+ smlal v5.4s, v0.4h, v28.4h
+ sqrshrn v4.4h, v4.4s, #5
+ sqrshrn v5.4h, v5.4s, #5
+ smin v4.4h, v4.4h, v15.4h
+ smin v5.4h, v5.4h, v15.4h
+ smax v4.4h, v4.4h, v14.4h
+ smax v5.4h, v5.4h, v14.4h
+ ins v16.d[0], v4.d[0]
+ ins v0.d[0], v5.d[0]
+.endif
+
+ smull v6.4s, v16.4h, v26.4h
+ smull2 v7.4s, v16.8h, v26.8h
+ smull v10.4s, v17.4h, v26.4h
+ smull2 v11.4s, v17.8h, v26.8h
+ smull v16.4s, v18.4h, v26.4h
+ smull2 v17.4s, v18.8h, v26.8h
+ smull v18.4s, v19.4h, v26.4h
+ smull2 v19.4s, v19.8h, v26.8h
+ smlal v6.4s, v0.4h, v25.4h
+ smlal2 v7.4s, v0.8h, v25.8h
+ smlal v10.4s, v1.4h, v25.4h
+ smlal2 v11.4s, v1.8h, v25.8h
+ smlal v16.4s, v2.4h, v25.4h
+ smlal2 v17.4s, v2.8h, v25.8h
+ smlal v18.4s, v3.4h, v25.4h
+ smlal2 v19.4s, v3.8h, v25.8h
+ sqrshrn v6.4h, v6.4s, #5
+ sqrshrn2 v6.8h, v7.4s, #5
+ sqrshrn v7.4h, v10.4s, #5
+ sqrshrn2 v7.8h, v11.4s, #5
+ sqrshrn v10.4h, v16.4s, #5
+ sqrshrn2 v10.8h, v17.4s, #5
+ sqrshrn v11.4h, v18.4s, #5
+ sqrshrn2 v11.8h, v19.4s, #5
+.endif
+
+.if \ox && !\oy
+ sqrshrn v4.4h, v4.4s, #5
+ smin v4.4h, v4.4h, v15.4h
+.endif
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma
+.if \oy
+ smin v16.8h, v6.8h, v15.8h
+ smin v17.8h, v7.8h, v15.8h
+ smin v18.8h, v10.8h, v15.8h
+ smin v19.8h, v11.8h, v15.8h
+ smax v16.8h, v16.8h, v14.8h
+ smax v17.8h, v17.8h, v14.8h
+ smax v18.8h, v18.8h, v14.8h
+ smax v19.8h, v19.8h, v14.8h
+.endif
+
+.if \ox && !\oy
+ smax v4.4h, v4.4h, v14.4h
+.endif
+ ld1 {v10.8h, v11.8h, v12.8h, v13.8h}, [x1], x2 // src
+.if \ox && !\oy
+ ins v16.d[0], v4.d[0]
+.endif
+
+.if !\csfl
+ smull v4.4s, v0.4h, v8.4h
+ smull2 v5.4s, v0.8h, v8.8h
+ smull v6.4s, v1.4h, v8.4h
+ smull2 v7.4s, v1.8h, v8.8h
+ smull v0.4s, v2.4h, v8.4h
+ smull2 v1.4s, v2.8h, v8.8h
+ smull v2.4s, v3.4h, v8.4h
+ smull2 v3.4s, v3.8h, v8.8h
+ smlal v4.4s, v10.4h, v9.4h
+ smlal2 v5.4s, v10.8h, v9.8h
+ smlal v6.4s, v11.4h, v9.4h
+ smlal2 v7.4s, v11.8h, v9.8h
+ smlal v0.4s, v12.4h, v9.4h
+ smlal2 v1.4s, v12.8h, v9.8h
+ smlal v2.4s, v13.4h, v9.4h
+ smlal2 v3.4s, v13.8h, v9.8h
+ shrn v4.4h, v4.4s, #6
+ shrn2 v4.8h, v5.4s, #6
+ shrn v5.4h, v6.4s, #6
+ shrn2 v5.8h, v7.4s, #6
+ shrn v6.4h, v0.4s, #6
+ shrn2 v6.8h, v1.4s, #6
+ shrn v7.4h, v2.4s, #6
+ shrn2 v7.8h, v3.4s, #6
+ add v0.8h, v4.8h, v24.8h
+ add v1.8h, v5.8h, v24.8h
+ add v2.8h, v6.8h, v24.8h
+ add v3.8h, v7.8h, v24.8h
+ movi v20.8h, #0
+ smin v0.8h, v0.8h, v23.8h
+ smin v1.8h, v1.8h, v23.8h
+ smin v2.8h, v2.8h, v23.8h
+ smin v3.8h, v3.8h, v23.8h
+ smax v0.8h, v0.8h, v20.8h
+ smax v1.8h, v1.8h, v20.8h
+ smax v2.8h, v2.8h, v20.8h
+ smax v3.8h, v3.8h, v20.8h
+.else
+ // Make sure that uninitialized pixels past the right edge are within
+ // the valid pixel range; their actual values shouldn't matter.
+ and v0.16b, v0.16b, v23.16b
+ and v1.16b, v1.16b, v23.16b
+ and v2.16b, v2.16b, v23.16b
+ and v3.16b, v3.16b, v23.16b
+.endif
+
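+ // gather32_neon (defined earlier in this file) looks up scaling[] for the
+ // 32 pixel values in v0-v3 and returns the bytes packed in v6/v7.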
+ bl gather32_neon
+
+ uxtl v4.8h, v6.8b // scaling
+ uxtl2 v5.8h, v6.16b
+ uxtl v6.8h, v7.8b
+ uxtl2 v7.8h, v7.16b
+
+ ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
+ ushl v5.8h, v5.8h, v29.8h
+ ushl v6.8h, v6.8h, v29.8h
+ ushl v7.8h, v7.8h, v29.8h
+
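+ // sqrdmulh computes round2(a * b, 15); with b = scaling << (15 - scaling_shift)
+ // this yields round2(scaling * grain, scaling_shift).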
+ sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ sqrdmulh v17.8h, v17.8h, v5.8h
+ sqrdmulh v18.8h, v18.8h, v6.8h
+ sqrdmulh v19.8h, v19.8h, v7.8h
+
+ usqadd v10.8h, v16.8h // *src + noise
+ usqadd v11.8h, v17.8h
+ usqadd v12.8h, v18.8h
+ usqadd v13.8h, v19.8h
+
+ umax v0.8h, v10.8h, v30.8h
+ umax v1.8h, v11.8h, v30.8h
+ umax v2.8h, v12.8h, v30.8h
+ umax v3.8h, v13.8h, v30.8h
+ umin v0.8h, v0.8h, v31.8h
+ umin v1.8h, v1.8h, v31.8h
+ umin v2.8h, v2.8h, v31.8h
+ umin v3.8h, v3.8h, v31.8h
+
+ subs w9, w9, #1
+.if \oy
+ dup v25.8h, v28.h[0]
+ dup v26.8h, v28.h[1]
+.endif
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w12, #0
+ mov w9, w12 // restore actual remaining h
+ b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
+.endif
+ b 9f
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+
+9:
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldr x30, [sp], #80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(fguv_loop_sx0_tbl):
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
+endfunc
+
+function fguv_loop_sx1_neon
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+.if \ox
+ ld1 {v18.4h}, [x4], x10 // grain_lut old
+.endif
+.if \oy
+ ld1 {v20.8h, v21.8h}, [x8], x10 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v19.4h}, [x11], x10 // grain_lut top old
+.endif
+ ld1 {v16.8h, v17.8h}, [x5], x10 // grain_lut
+
+.if \ox
+ smull v18.4s, v18.4h, v27.4h
+ smlal v18.4s, v16.4h, v28.4h
+.endif
+
+.if \oy
+.if \ox
+ smull v19.4s, v19.4h, v27.4h
+ smlal v19.4s, v20.4h, v28.4h
+ sqrshrn v18.4h, v18.4s, #5
+ sqrshrn v19.4h, v19.4s, #5
+ smin v18.4h, v18.4h, v15.4h
+ smin v19.4h, v19.4h, v15.4h
+ smax v18.4h, v18.4h, v14.4h
+ smax v19.4h, v19.4h, v14.4h
+ ins v16.d[0], v18.d[0]
+ ins v20.d[0], v19.d[0]
+.endif
+
+ smull v0.4s, v16.4h, v26.4h
+ smull2 v1.4s, v16.8h, v26.8h
+ smull v2.4s, v17.4h, v26.4h
+ smull2 v3.4s, v17.8h, v26.8h
+ smlal v0.4s, v20.4h, v25.4h
+ smlal2 v1.4s, v20.8h, v25.8h
+ smlal v2.4s, v21.4h, v25.4h
+ smlal2 v3.4s, v21.8h, v25.8h
+ sqrshrn v16.4h, v0.4s, #5
+ sqrshrn2 v16.8h, v1.4s, #5
+ sqrshrn v17.4h, v2.4s, #5
+ sqrshrn2 v17.8h, v3.4s, #5
+.endif
+
+.if \ox && !\oy
+ sqrshrn v18.4h, v18.4s, #5
+ smin v18.4h, v18.4h, v15.4h
+.endif
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma
+.if \oy
+ smin v16.8h, v16.8h, v15.8h
+ smin v17.8h, v17.8h, v15.8h
+ smax v16.8h, v16.8h, v14.8h
+ smax v17.8h, v17.8h, v14.8h
+.endif
+
+.if \ox && !\oy
+ smax v18.4h, v18.4h, v14.4h
+.endif
+ ld1 {v10.8h, v11.8h}, [x1], x2 // src
+.if \ox && !\oy
+ ins v16.d[0], v18.d[0]
+.endif
+ addp v0.8h, v0.8h, v1.8h
+ addp v1.8h, v2.8h, v3.8h
+ urshr v0.8h, v0.8h, #1
+ urshr v1.8h, v1.8h, #1
+.if !\csfl
+ smull v2.4s, v0.4h, v8.4h
+ smull2 v3.4s, v0.8h, v8.8h
+ smull v0.4s, v1.4h, v8.4h
+ smull2 v1.4s, v1.8h, v8.8h
+ smlal v2.4s, v10.4h, v9.4h
+ smlal2 v3.4s, v10.8h, v9.8h
+ smlal v0.4s, v11.4h, v9.4h
+ smlal2 v1.4s, v11.8h, v9.8h
+ shrn v2.4h, v2.4s, #6
+ shrn2 v2.8h, v3.4s, #6
+ shrn v3.4h, v0.4s, #6
+ shrn2 v3.8h, v1.4s, #6
+ add v0.8h, v2.8h, v24.8h
+ add v1.8h, v3.8h, v24.8h
+ movi v2.8h, #0
+ smin v0.8h, v0.8h, v23.8h
+ smin v1.8h, v1.8h, v23.8h
+ smax v0.8h, v0.8h, v2.8h
+ smax v1.8h, v1.8h, v2.8h
+.else
+ // Make sure that uninitialized pixels past the right edge are within
+ // the valid pixel range; their actual values shouldn't matter.
+ and v0.16b, v0.16b, v23.16b
+ and v1.16b, v1.16b, v23.16b
+.endif
+
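+ // gather16_neon (defined earlier in this file) looks up scaling[] for the
+ // 16 pixel values in v0-v1 and returns the bytes packed in v6.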
+ bl gather16_neon
+
+ uxtl v4.8h, v6.8b // scaling
+ uxtl2 v5.8h, v6.16b
+
+ ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
+ ushl v5.8h, v5.8h, v29.8h
+
+ sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ sqrdmulh v17.8h, v17.8h, v5.8h
+
+ usqadd v10.8h, v16.8h // *src + noise
+ usqadd v11.8h, v17.8h
+
+ umax v0.8h, v10.8h, v30.8h
+ umax v1.8h, v11.8h, v30.8h
+ umin v0.8h, v0.8h, v31.8h
+ umin v1.8h, v1.8h, v31.8h
+
+.if \oy
+ mov v16.16b, v25.16b
+.endif
+ subs w9, w9, #1
+.if \oy
+ mov v25.16b, v26.16b
+ mov v26.16b, v16.16b
+.endif
+ st1 {v0.8h, v1.8h}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w12, #0
+ mov w9, w12 // restore actual remaining h
+ b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
+.endif
+
+ b 9f
+.endm
+ fguv_loop_sx1 0, 0, 0
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+
+9:
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldr x30, [sp], #80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(fguv_loop_sx1_tbl):
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/ipred.S b/third_party/dav1d/src/arm/64/ipred.S
new file mode 100644
index 0000000000..709238e2f8
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/ipred.S
@@ -0,0 +1,5294 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_128_8bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_dc_128_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ movi v0.16b, #128
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+4:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ movi v1.16b, #128
+32:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ movi v1.16b, #128
+ movi v2.16b, #128
+ movi v3.16b, #128
+64:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_dc_128_tbl):
+ .hword L(ipred_dc_128_tbl) - 640b
+ .hword L(ipred_dc_128_tbl) - 320b
+ .hword L(ipred_dc_128_tbl) - 16b
+ .hword L(ipred_dc_128_tbl) - 8b
+ .hword L(ipred_dc_128_tbl) - 4b
+endfunc
+
+// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_v_8bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_v_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ add x2, x2, #1
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x2]
+4:
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2]
+8:
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2]
+16:
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b}, [x2]
+32:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+64:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_v_tbl):
+ .hword L(ipred_v_tbl) - 640b
+ .hword L(ipred_v_tbl) - 320b
+ .hword L(ipred_v_tbl) - 160b
+ .hword L(ipred_v_tbl) - 80b
+ .hword L(ipred_v_tbl) - 40b
+endfunc
+
+// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_h_8bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_h_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ sub x2, x2, #4
+ sub x5, x5, w3, uxtw
+ mov x7, #-4
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
+ st1 {v3.s}[0], [x0], x1
+ st1 {v2.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v1.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
+ st1 {v3.8b}, [x0], x1
+ st1 {v2.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+ st1 {v3.16b}, [x0], x1
+ st1 {v2.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ st1 {v3.16b}, [x0], x1
+ st1 {v2.16b}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ st1 {v1.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 32b
+ ret
+64:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ stp q3, q3, [x0, #32]
+ stp q2, q2, [x6, #32]
+ st1 {v3.16b}, [x0], x1
+ st1 {v2.16b}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ stp q1, q1, [x0, #32]
+ stp q0, q0, [x6, #32]
+ st1 {v1.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_h_tbl):
+ .hword L(ipred_h_tbl) - 64b
+ .hword L(ipred_h_tbl) - 32b
+ .hword L(ipred_h_tbl) - 16b
+ .hword L(ipred_h_tbl) - 8b
+ .hword L(ipred_h_tbl) - 4b
+endfunc
+
+// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_top_8bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_dc_top_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ add x2, x2, #1
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2s}, [x2]
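+ // the 4 top pixels are loaded into both halves of v0, so the uaddlv below
+ // sums them twice; rshrn #3 then gives the rounded average of 4 pixels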
+ uaddlv h0, v0.8b
+ rshrn v0.8b, v0.8h, #3
+ dup v0.8b, v0.b[0]
+4:
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2]
+ uaddlv h0, v0.8b
+ rshrn v0.8b, v0.8h, #3
+ dup v0.8b, v0.b[0]
+8:
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b
+ rshrn v0.8b, v0.8h, #4
+ dup v0.16b, v0.b[0]
+16:
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add v2.4h, v0.4h, v1.4h
+ rshrn v2.8b, v2.8h, #5
+ dup v0.16b, v2.b[0]
+ dup v1.16b, v2.b[0]
+32:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v4.4h, v0.4h, v1.4h
+ add v5.4h, v2.4h, v3.4h
+ add v4.4h, v4.4h, v5.4h
+ rshrn v4.8b, v4.8h, #6
+ dup v0.16b, v4.b[0]
+ dup v1.16b, v4.b[0]
+ dup v2.16b, v4.b[0]
+ dup v3.16b, v4.b[0]
+64:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_dc_top_tbl):
+ .hword L(ipred_dc_top_tbl) - 640b
+ .hword L(ipred_dc_top_tbl) - 320b
+ .hword L(ipred_dc_top_tbl) - 160b
+ .hword L(ipred_dc_top_tbl) - 80b
+ .hword L(ipred_dc_top_tbl) - 40b
+endfunc
+
+// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_left_8bpc_neon, export=1
+ sub x2, x2, w4, uxtw
+ clz w3, w3
+ clz w7, w4
+ adr x5, L(ipred_dc_left_tbl)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w7, w7, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w7, [x5, w7, uxtw #1]
+ sub x3, x5, w3, uxtw
+ sub x5, x5, w7, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
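+ // x5 (selected by height) jumps to the summing code, which in turn jumps
+ // via x3 (selected by width) to the matching store loop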
+ br x5
+
+L(ipred_dc_left_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2s}, [x2]
+ uaddlv h0, v0.8b
+ rshrn v0.8b, v0.8h, #3
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w4):
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt L(ipred_dc_left_w4)
+ ret
+
+L(ipred_dc_left_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2]
+ uaddlv h0, v0.8b
+ rshrn v0.8b, v0.8h, #3
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w8):
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt L(ipred_dc_left_w8)
+ ret
+
+L(ipred_dc_left_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b
+ rshrn v0.8b, v0.8h, #4
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w16):
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt L(ipred_dc_left_w16)
+ ret
+
+L(ipred_dc_left_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add v0.4h, v0.4h, v1.4h
+ rshrn v0.8b, v0.8h, #5
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w32):
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+1:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_h64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v0.4h, v0.4h, v1.4h
+ add v2.4h, v2.4h, v3.4h
+ add v0.4h, v0.4h, v2.4h
+ rshrn v0.8b, v0.8h, #6
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w64):
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+1:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_tbl):
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
+endfunc
+
+// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_8bpc_neon, export=1
+ sub x2, x2, w4, uxtw
+ add w7, w3, w4 // width + height
+ clz w3, w3
+ clz w6, w4
+ dup v16.8h, w7 // width + height
+ adr x5, L(ipred_dc_tbl)
+ rbit w7, w7 // rbit(width + height)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w6, w6, #25
+ clz w7, w7 // ctz(width + height)
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w6, [x5, w6, uxtw #1]
+ neg w7, w7 // -ctz(width + height)
+ sub x3, x5, w3, uxtw
+ sub x5, x5, w6, uxtw
+ ushr v16.8h, v16.8h, #1 // (width + height) >> 1
+ dup v17.8h, w7 // -ctz(width + height)
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+
+L(ipred_dc_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x2], #4
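+ // zero the upper half so the 8-byte uaddlv sums only the 4 pixels just loaded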
+ ins v0.s[1], wzr
+ uaddlv h0, v0.8b
+ add x2, x2, #1
+ br x3
+L(ipred_dc_w4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.s}[0], [x2]
+ ins v1.s[1], wzr
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.8b
+ cmp w4, #4
+ add v0.4h, v0.4h, v1.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 8/16
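+ // width+height is 12 or 20; the ushl above already divided by 4 (1 << ctz),
+ // so finish with an approximate multiply by 1/3 (0x5556/0x10000) or
+ // 1/5 (0x3334/0x10000) via sqdmulh, selected by the shift by 2*h below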
+ mov w16, #(0x3334/2)
+ movk w16, #(0x5556/2), lsl #16
+ add w17, w4, w4 // w17 = 2*h = 16 or 32
+ lsr w16, w16, w17
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.8b, v0.b[0]
+2:
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2], #8
+ uaddlv h0, v0.8b
+ add x2, x2, #1
+ br x3
+L(ipred_dc_w8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.8b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.8b
+ cmp w4, #8
+ add v0.4h, v0.4h, v1.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 4/16/32
+ cmp w4, #32
+ mov w16, #(0x3334/2)
+ mov w17, #(0x5556/2)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.8b, v0.b[0]
+2:
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2], #16
+ uaddlv h0, v0.16b
+ add x2, x2, #1
+ br x3
+L(ipred_dc_w16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.16b
+ cmp w4, #16
+ add v0.4h, v0.4h, v1.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 4/8/32/64
+ tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
+ mov w16, #(0x3334/2)
+ mov w17, #(0x5556/2)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.16b, v0.b[0]
+2:
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b}, [x2], #32
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add x2, x2, #1
+ add v0.4h, v0.4h, v1.4h
+ br x3
+L(ipred_dc_w32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b, v2.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ cmp w4, #32
+ add v0.4h, v0.4h, v1.4h
+ add v0.4h, v0.4h, v2.4h
+ ushl v4.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 8/16/64
+ cmp w4, #8
+ mov w16, #(0x3334/2)
+ mov w17, #(0x5556/2)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v4.4h, v4.4h, v16.4h
+1:
+ dup v0.16b, v4.b[0]
+ dup v1.16b, v4.b[0]
+2:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v0.4h, v0.4h, v1.4h
+ add v2.4h, v2.4h, v3.4h
+ add x2, x2, #1
+ add v0.4h, v0.4h, v2.4h
+ br x3
+L(ipred_dc_w64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ uaddlv h4, v4.16b
+ add v1.4h, v1.4h, v2.4h
+ add v3.4h, v3.4h, v4.4h
+ cmp w4, #64
+ add v0.4h, v0.4h, v1.4h
+ add v0.4h, v0.4h, v3.4h
+ ushl v4.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 16/32
+ mov w16, #(0x5556/2)
+ movk w16, #(0x3334/2), lsl #16
+ lsr w16, w16, w4
+ dup v16.4h, w16
+ sqdmulh v4.4h, v4.4h, v16.4h
+1:
+ dup v0.16b, v4.b[0]
+ dup v1.16b, v4.b[0]
+ dup v2.16b, v4.b[0]
+ dup v3.16b, v4.b[0]
+2:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_tbl):
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
+endfunc
+
+// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
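+// Paeth prediction: base = left + top - topleft; each output pixel is whichever
+// of left, top or topleft is closest to base (ties prefer left, then top).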
+function ipred_paeth_8bpc_neon, export=1
+ clz w9, w3
+ adr x5, L(ipred_paeth_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.16b}, [x2]
+ add x8, x2, #1
+ sub x2, x2, #4
+ sub x5, x5, w9, uxtw
+ mov x7, #-4
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v5.4s}, [x8]
+ usubl v6.8h, v5.8b, v4.8b // top - topleft
+4:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
+ zip1 v0.2s, v0.2s, v1.2s
+ zip1 v2.2s, v2.2s, v3.2s
+ uaddw v16.8h, v6.8h, v0.8b
+ uaddw v17.8h, v6.8h, v2.8b
+ sqxtun v16.8b, v16.8h // base
+ sqxtun2 v16.16b, v17.8h
+ zip1 v0.2d, v0.2d, v2.2d
+ uabd v20.16b, v5.16b, v16.16b // tdiff
+ uabd v22.16b, v4.16b, v16.16b // tldiff
+ uabd v16.16b, v0.16b, v16.16b // ldiff
+ umin v18.16b, v20.16b, v22.16b // min(tdiff, tldiff)
+ cmhs v20.16b, v22.16b, v20.16b // tldiff >= tdiff
+ cmhs v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff
+ bsl v20.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bit v20.16b, v0.16b, v16.16b // ldiff <= min ? left : ...
+ st1 {v20.s}[3], [x0], x1
+ st1 {v20.s}[2], [x6], x1
+ subs w4, w4, #4
+ st1 {v20.s}[1], [x0], x1
+ st1 {v20.s}[0], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v5.2d}, [x8]
+ usubl v6.8h, v5.8b, v4.8b // top - topleft
+8:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
+ uaddw v16.8h, v6.8h, v0.8b
+ uaddw v17.8h, v6.8h, v1.8b
+ uaddw v18.8h, v6.8h, v2.8b
+ uaddw v19.8h, v6.8h, v3.8b
+ sqxtun v16.8b, v16.8h // base
+ sqxtun2 v16.16b, v17.8h
+ sqxtun v18.8b, v18.8h
+ sqxtun2 v18.16b, v19.8h
+ zip1 v2.2d, v2.2d, v3.2d
+ zip1 v0.2d, v0.2d, v1.2d
+ uabd v21.16b, v5.16b, v18.16b // tdiff
+ uabd v20.16b, v5.16b, v16.16b
+ uabd v23.16b, v4.16b, v18.16b // tldiff
+ uabd v22.16b, v4.16b, v16.16b
+ uabd v17.16b, v2.16b, v18.16b // ldiff
+ uabd v16.16b, v0.16b, v16.16b
+ umin v19.16b, v21.16b, v23.16b // min(tdiff, tldiff)
+ umin v18.16b, v20.16b, v22.16b
+ cmhs v21.16b, v23.16b, v21.16b // tldiff >= tdiff
+ cmhs v20.16b, v22.16b, v20.16b
+ cmhs v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff
+ cmhs v16.16b, v18.16b, v16.16b
+ bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bsl v20.16b, v5.16b, v4.16b
+ bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ...
+ bit v20.16b, v0.16b, v16.16b
+ st1 {v21.d}[1], [x0], x1
+ st1 {v21.d}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v20.d}[1], [x0], x1
+ st1 {v20.d}[0], [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v5.16b}, [x8], #16
+ mov w9, w3
+ // Set up pointers for four rows in parallel; x0, x6, x5, x10
+ add x5, x0, x1
+ add x10, x6, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw
+1:
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+2:
+ usubl v6.8h, v5.8b, v4.8b // top - topleft
+ usubl2 v7.8h, v5.16b, v4.16b
+ uaddw v24.8h, v6.8h, v0.8b
+ uaddw v25.8h, v7.8h, v0.8b
+ uaddw v26.8h, v6.8h, v1.8b
+ uaddw v27.8h, v7.8h, v1.8b
+ uaddw v28.8h, v6.8h, v2.8b
+ uaddw v29.8h, v7.8h, v2.8b
+ uaddw v30.8h, v6.8h, v3.8b
+ uaddw v31.8h, v7.8h, v3.8b
+ sqxtun v17.8b, v26.8h // base
+ sqxtun2 v17.16b, v27.8h
+ sqxtun v16.8b, v24.8h
+ sqxtun2 v16.16b, v25.8h
+ sqxtun v19.8b, v30.8h
+ sqxtun2 v19.16b, v31.8h
+ sqxtun v18.8b, v28.8h
+ sqxtun2 v18.16b, v29.8h
+ uabd v23.16b, v5.16b, v19.16b // tdiff
+ uabd v22.16b, v5.16b, v18.16b
+ uabd v21.16b, v5.16b, v17.16b
+ uabd v20.16b, v5.16b, v16.16b
+ uabd v27.16b, v4.16b, v19.16b // tldiff
+ uabd v26.16b, v4.16b, v18.16b
+ uabd v25.16b, v4.16b, v17.16b
+ uabd v24.16b, v4.16b, v16.16b
+ uabd v19.16b, v3.16b, v19.16b // ldiff
+ uabd v18.16b, v2.16b, v18.16b
+ uabd v17.16b, v1.16b, v17.16b
+ uabd v16.16b, v0.16b, v16.16b
+ umin v31.16b, v23.16b, v27.16b // min(tdiff, tldiff)
+ umin v30.16b, v22.16b, v26.16b
+ umin v29.16b, v21.16b, v25.16b
+ umin v28.16b, v20.16b, v24.16b
+ cmhs v23.16b, v27.16b, v23.16b // tldiff >= tdiff
+ cmhs v22.16b, v26.16b, v22.16b
+ cmhs v21.16b, v25.16b, v21.16b
+ cmhs v20.16b, v24.16b, v20.16b
+ cmhs v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff
+ cmhs v18.16b, v30.16b, v18.16b
+ cmhs v17.16b, v29.16b, v17.16b
+ cmhs v16.16b, v28.16b, v16.16b
+ bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bsl v22.16b, v5.16b, v4.16b
+ bsl v21.16b, v5.16b, v4.16b
+ bsl v20.16b, v5.16b, v4.16b
+ bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ...
+ bit v22.16b, v2.16b, v18.16b
+ bit v21.16b, v1.16b, v17.16b
+ bit v20.16b, v0.16b, v16.16b
+ subs w3, w3, #16
+ st1 {v23.16b}, [x0], #16
+ st1 {v22.16b}, [x6], #16
+ st1 {v21.16b}, [x5], #16
+ st1 {v20.16b}, [x10], #16
+ b.le 8f
+ ld1 {v5.16b}, [x8], #16
+ b 2b
+8:
+ subs w4, w4, #4
+ b.le 9f
+ // End of horizontal loop, move pointers to next four rows
+ sub x8, x8, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ // Load the top row as early as possible
+ ld1 {v5.16b}, [x8], #16
+ add x5, x5, x1
+ add x10, x10, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_paeth_tbl):
+ .hword L(ipred_paeth_tbl) - 640b
+ .hword L(ipred_paeth_tbl) - 320b
+ .hword L(ipred_paeth_tbl) - 160b
+ .hword L(ipred_paeth_tbl) - 80b
+ .hword L(ipred_paeth_tbl) - 40b
+endfunc
+
+// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
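+// Smooth prediction: blend top with bottom using the vertical sm_weights and
+// left with right using the horizontal sm_weights (both in 1/256 units), then
+// average the two blends.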
+function ipred_smooth_8bpc_neon, export=1
+ movrel x10, X(sm_weights)
+ add x11, x10, w4, uxtw
+ add x10, x10, w3, uxtw
+ clz w9, w3
+ adr x5, L(ipred_smooth_tbl)
+ sub x12, x2, w4, uxtw
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.16b}, [x12] // bottom
+ add x8, x2, #1
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v6.2s}, [x8] // top
+ ld1r {v7.2s}, [x10] // weights_hor
+ sub x2, x2, #4
+ mov x7, #-4
+ dup v5.16b, v6.b[3] // right
+ usubl v6.8h, v6.8b, v4.8b // top-bottom
+ uxtl v7.8h, v7.8b // weights_hor
+4:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ zip1 v1.2s, v1.2s, v0.2s // left, flipped
+ zip1 v0.2s, v3.2s, v2.2s
+ zip1 v16.2s, v16.2s, v17.2s // weights_ver
+ zip1 v18.2s, v18.2s, v19.2s
+ shll v22.8h, v4.8b, #8 // bottom*256
+ shll v23.8h, v4.8b, #8
+ usubl v0.8h, v0.8b, v5.8b // left-right
+ usubl v1.8h, v1.8b, v5.8b
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v18.8h, v18.8b
+ mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v1.8h, v7.8h
+ mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v23.8h, v6.8h, v18.8h
+ uhadd v20.8h, v20.8h, v22.8h
+ uhadd v21.8h, v21.8h, v23.8h
+ rshrn v20.8b, v20.8h, #8
+ rshrn v21.8b, v21.8h, #8
+ st1 {v20.s}[0], [x0], x1
+ st1 {v20.s}[1], [x6], x1
+ subs w4, w4, #4
+ st1 {v21.s}[0], [x0], x1
+ st1 {v21.s}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v6.8b}, [x8] // top
+ ld1 {v7.8b}, [x10] // weights_hor
+ sub x2, x2, #4
+ mov x7, #-4
+ dup v5.16b, v6.b[7] // right
+ usubl v6.8h, v6.8b, v4.8b // top-bottom
+ uxtl v7.8h, v7.8b // weights_hor
+8:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ shll v22.8h, v5.8b, #8
+ shll v23.8h, v5.8b, #8
+ usubl v0.8h, v0.8b, v5.8b // left-right
+ usubl v1.8h, v1.8b, v5.8b
+ usubl v2.8h, v2.8b, v5.8b
+ usubl v3.8h, v3.8b, v5.8b
+ shll v24.8h, v4.8b, #8 // bottom*256
+ shll v25.8h, v4.8b, #8
+ shll v26.8h, v4.8b, #8
+ shll v27.8h, v4.8b, #8
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+ mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v2.8h, v7.8h // (left flipped)
+ mla v22.8h, v1.8h, v7.8h
+ mla v23.8h, v0.8h, v7.8h
+ mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v25.8h, v6.8h, v17.8h
+ mla v26.8h, v6.8h, v18.8h
+ mla v27.8h, v6.8h, v19.8h
+ uhadd v20.8h, v20.8h, v24.8h
+ uhadd v21.8h, v21.8h, v25.8h
+ uhadd v22.8h, v22.8h, v26.8h
+ uhadd v23.8h, v23.8h, v27.8h
+ rshrn v20.8b, v20.8h, #8
+ rshrn v21.8b, v21.8h, #8
+ rshrn v22.8b, v22.8h, #8
+ rshrn v23.8b, v23.8h, #8
+ st1 {v20.8b}, [x0], x1
+ st1 {v21.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.8b}, [x0], x1
+ st1 {v23.8b}, [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x12, x2, w3, uxtw
+ sub x2, x2, #2
+ mov x7, #-2
+ ld1r {v5.16b}, [x12] // right
+ sub x1, x1, w3, uxtw
+ mov w9, w3
+
+1:
+ ld2r {v0.8b, v1.8b}, [x2], x7 // left
+ ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver
+ usubl v0.8h, v0.8b, v5.8b // left-right
+ usubl v1.8h, v1.8b, v5.8b
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+2:
+ ld1 {v7.16b}, [x10], #16 // weights_hor
+ ld1 {v3.16b}, [x8], #16 // top
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ shll v22.8h, v5.8b, #8
+ shll v23.8h, v5.8b, #8
+ uxtl v6.8h, v7.8b // weights_hor
+ uxtl2 v7.8h, v7.16b
+ usubl v2.8h, v3.8b, v4.8b // top-bottom
+ usubl2 v3.8h, v3.16b, v4.16b
+ mla v20.8h, v1.8h, v6.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v1.8h, v7.8h // (left flipped)
+ mla v22.8h, v0.8h, v6.8h
+ mla v23.8h, v0.8h, v7.8h
+ shll v24.8h, v4.8b, #8 // bottom*256
+ shll v25.8h, v4.8b, #8
+ shll v26.8h, v4.8b, #8
+ shll v27.8h, v4.8b, #8
+ mla v24.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v25.8h, v3.8h, v16.8h
+ mla v26.8h, v2.8h, v17.8h
+ mla v27.8h, v3.8h, v17.8h
+ uhadd v20.8h, v20.8h, v24.8h
+ uhadd v21.8h, v21.8h, v25.8h
+ uhadd v22.8h, v22.8h, v26.8h
+ uhadd v23.8h, v23.8h, v27.8h
+ rshrn v20.8b, v20.8h, #8
+ rshrn2 v20.16b, v21.8h, #8
+ rshrn v22.8b, v22.8h, #8
+ rshrn2 v22.16b, v23.8h, #8
+ subs w3, w3, #16
+ st1 {v20.16b}, [x0], #16
+ st1 {v22.16b}, [x6], #16
+ b.gt 2b
+ subs w4, w4, #2
+ b.le 9f
+ sub x8, x8, w9, uxtw
+ sub x10, x10, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_tbl):
+ .hword L(ipred_smooth_tbl) - 640b
+ .hword L(ipred_smooth_tbl) - 320b
+ .hword L(ipred_smooth_tbl) - 160b
+ .hword L(ipred_smooth_tbl) - 80b
+ .hword L(ipred_smooth_tbl) - 40b
+endfunc
+
+// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_v_8bpc_neon, export=1
+ movrel x7, X(sm_weights)
+ add x7, x7, w4, uxtw
+ clz w9, w3
+ adr x5, L(ipred_smooth_v_tbl)
+ sub x8, x2, w4, uxtw
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.16b}, [x8] // bottom
+ add x2, x2, #1
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v6.2s}, [x2] // top
+ usubl v6.8h, v6.8b, v4.8b // top-bottom
+4:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ shll v22.8h, v4.8b, #8 // bottom*256
+ shll v23.8h, v4.8b, #8
+ zip1 v16.2s, v16.2s, v17.2s // weights_ver
+ zip1 v18.2s, v18.2s, v19.2s
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v18.8h, v18.8b
+ mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v23.8h, v6.8h, v18.8h
+ rshrn v22.8b, v22.8h, #8
+ rshrn v23.8b, v23.8h, #8
+ st1 {v22.s}[0], [x0], x1
+ st1 {v22.s}[1], [x6], x1
+ subs w4, w4, #4
+ st1 {v23.s}[0], [x0], x1
+ st1 {v23.s}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v6.8b}, [x2] // top
+ usubl v6.8h, v6.8b, v4.8b // top-bottom
+8:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ shll v24.8h, v4.8b, #8 // bottom*256
+ shll v25.8h, v4.8b, #8
+ shll v26.8h, v4.8b, #8
+ shll v27.8h, v4.8b, #8
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+ mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v25.8h, v6.8h, v17.8h
+ mla v26.8h, v6.8h, v18.8h
+ mla v27.8h, v6.8h, v19.8h
+ rshrn v24.8b, v24.8h, #8
+ rshrn v25.8b, v25.8h, #8
+ rshrn v26.8b, v26.8h, #8
+ rshrn v27.8b, v27.8h, #8
+ st1 {v24.8b}, [x0], x1
+ st1 {v25.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v26.8b}, [x0], x1
+ st1 {v27.8b}, [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ // Set up pointers for four rows in parallel; x0, x6, x5, x8
+ add x5, x0, x1
+ add x8, x6, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw
+ mov w9, w3
+
+1:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+2:
+ ld1 {v3.16b}, [x2], #16 // top
+ shll v20.8h, v4.8b, #8 // bottom*256
+ shll v21.8h, v4.8b, #8
+ shll v22.8h, v4.8b, #8
+ shll v23.8h, v4.8b, #8
+ shll v24.8h, v4.8b, #8
+ shll v25.8h, v4.8b, #8
+ shll v26.8h, v4.8b, #8
+ shll v27.8h, v4.8b, #8
+ usubl v2.8h, v3.8b, v4.8b // top-bottom
+ usubl2 v3.8h, v3.16b, v4.16b
+ mla v20.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v21.8h, v3.8h, v16.8h
+ mla v22.8h, v2.8h, v17.8h
+ mla v23.8h, v3.8h, v17.8h
+ mla v24.8h, v2.8h, v18.8h
+ mla v25.8h, v3.8h, v18.8h
+ mla v26.8h, v2.8h, v19.8h
+ mla v27.8h, v3.8h, v19.8h
+ rshrn v20.8b, v20.8h, #8
+ rshrn2 v20.16b, v21.8h, #8
+ rshrn v22.8b, v22.8h, #8
+ rshrn2 v22.16b, v23.8h, #8
+ rshrn v24.8b, v24.8h, #8
+ rshrn2 v24.16b, v25.8h, #8
+ rshrn v26.8b, v26.8h, #8
+ rshrn2 v26.16b, v27.8h, #8
+ subs w3, w3, #16
+ st1 {v20.16b}, [x0], #16
+ st1 {v22.16b}, [x6], #16
+ st1 {v24.16b}, [x5], #16
+ st1 {v26.16b}, [x8], #16
+ b.gt 2b
+ subs w4, w4, #4
+ b.le 9f
+ sub x2, x2, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ add x5, x5, x1
+ add x8, x8, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_v_tbl):
+ .hword L(ipred_smooth_v_tbl) - 640b
+ .hword L(ipred_smooth_v_tbl) - 320b
+ .hword L(ipred_smooth_v_tbl) - 160b
+ .hword L(ipred_smooth_v_tbl) - 80b
+ .hword L(ipred_smooth_v_tbl) - 40b
+endfunc
+
+// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_h_8bpc_neon, export=1
+ movrel x8, X(sm_weights)
+ add x8, x8, w3, uxtw
+ clz w9, w3
+ adr x5, L(ipred_smooth_h_tbl)
+ add x12, x2, w3, uxtw
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v5.16b}, [x12] // right
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v7.2s}, [x8] // weights_hor
+ sub x2, x2, #4
+ mov x7, #-4
+ uxtl v7.8h, v7.8b // weights_hor
+4:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ zip1 v1.2s, v1.2s, v0.2s // left, flipped
+ zip1 v0.2s, v3.2s, v2.2s
+ usubl v0.8h, v0.8b, v5.8b // left-right
+ usubl v1.8h, v1.8b, v5.8b
+ mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v1.8h, v7.8h
+ rshrn v20.8b, v20.8h, #8
+ rshrn v21.8b, v21.8h, #8
+ st1 {v20.s}[0], [x0], x1
+ st1 {v20.s}[1], [x6], x1
+ subs w4, w4, #4
+ st1 {v21.s}[0], [x0], x1
+ st1 {v21.s}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v7.8b}, [x8] // weights_hor
+ sub x2, x2, #4
+ mov x7, #-4
+ uxtl v7.8h, v7.8b // weights_hor
+8:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ shll v22.8h, v5.8b, #8
+ shll v23.8h, v5.8b, #8
+ usubl v3.8h, v3.8b, v5.8b // left-right
+ usubl v2.8h, v2.8b, v5.8b
+ usubl v1.8h, v1.8b, v5.8b
+ usubl v0.8h, v0.8b, v5.8b
+ mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v2.8h, v7.8h // (left flipped)
+ mla v22.8h, v1.8h, v7.8h
+ mla v23.8h, v0.8h, v7.8h
+ rshrn v20.8b, v20.8h, #8
+ rshrn v21.8b, v21.8h, #8
+ rshrn v22.8b, v22.8h, #8
+ rshrn v23.8b, v23.8h, #8
+ st1 {v20.8b}, [x0], x1
+ st1 {v21.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.8b}, [x0], x1
+ st1 {v23.8b}, [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ sub x2, x2, #4
+ mov x7, #-4
+ // Set up pointers for four rows in parallel; x0, x6, x5, x10
+ add x5, x0, x1
+ add x10, x6, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw
+ mov w9, w3
+
+1:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
+ usubl v0.8h, v0.8b, v5.8b // left-right
+ usubl v1.8h, v1.8b, v5.8b
+ usubl v2.8h, v2.8b, v5.8b
+ usubl v3.8h, v3.8b, v5.8b
+2:
+ ld1 {v7.16b}, [x8], #16 // weights_hor
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ shll v22.8h, v5.8b, #8
+ shll v23.8h, v5.8b, #8
+ shll v24.8h, v5.8b, #8
+ shll v25.8h, v5.8b, #8
+ shll v26.8h, v5.8b, #8
+ shll v27.8h, v5.8b, #8
+ uxtl v6.8h, v7.8b // weights_hor
+ uxtl2 v7.8h, v7.16b
+ mla v20.8h, v3.8h, v6.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v3.8h, v7.8h // (left flipped)
+ mla v22.8h, v2.8h, v6.8h
+ mla v23.8h, v2.8h, v7.8h
+ mla v24.8h, v1.8h, v6.8h
+ mla v25.8h, v1.8h, v7.8h
+ mla v26.8h, v0.8h, v6.8h
+ mla v27.8h, v0.8h, v7.8h
+ rshrn v20.8b, v20.8h, #8
+ rshrn2 v20.16b, v21.8h, #8
+ rshrn v22.8b, v22.8h, #8
+ rshrn2 v22.16b, v23.8h, #8
+ rshrn v24.8b, v24.8h, #8
+ rshrn2 v24.16b, v25.8h, #8
+ rshrn v26.8b, v26.8h, #8
+ rshrn2 v26.16b, v27.8h, #8
+ subs w3, w3, #16
+ st1 {v20.16b}, [x0], #16
+ st1 {v22.16b}, [x6], #16
+ st1 {v24.16b}, [x5], #16
+ st1 {v26.16b}, [x10], #16
+ b.gt 2b
+ subs w4, w4, #4
+ b.le 9f
+ sub x8, x8, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ add x5, x5, x1
+ add x10, x10, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_h_tbl):
+ .hword L(ipred_smooth_h_tbl) - 640b
+ .hword L(ipred_smooth_h_tbl) - 320b
+ .hword L(ipred_smooth_h_tbl) - 160b
+ .hword L(ipred_smooth_h_tbl) - 80b
+ .hword L(ipred_smooth_h_tbl) - 40b
+endfunc
+
+const padding_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+padding_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
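+// Loading 16 (or 32) bytes from padding_mask - n gives a mask whose first n
+// bytes are 0x00 and the rest 0xff; bit-selecting with it replaces everything
+// from in[n] onwards with the padding value.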
+
+// void ipred_z1_upsample_edge_8bpc_neon(pixel *out, const int hsz,
+// const pixel *const in, const int end);
+function ipred_z1_upsample_edge_8bpc_neon, export=1
+ movrel x4, padding_mask
+ ld1 {v0.16b}, [x2] // in[]
+ add x5, x2, w3, uxtw // in[end]
+ sub x4, x4, w3, uxtw
+
+ ld1r {v1.16b}, [x5] // padding
+ ld1 {v3.16b}, [x4] // padding_mask
+
+ movi v31.8h, #9
+
+ bit v0.16b, v1.16b, v3.16b // padded in[]
+
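+ // 2x edge upsampling: each new half-sample is
+ // (9*(p1 + p2) - (p0 + p3) + 8) >> 4, clipped to pixel range, and
+ // interleaved with the original samples below.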
+ ext v4.16b, v0.16b, v1.16b, #1
+ ext v5.16b, v0.16b, v1.16b, #2
+ ext v6.16b, v0.16b, v1.16b, #3
+
+ uaddl v16.8h, v4.8b, v5.8b // in[i+1] + in[i+2]
+ uaddl2 v17.8h, v4.16b, v5.16b
+ uaddl v18.8h, v0.8b, v6.8b // in[i+0] + in[i+3]
+ uaddl2 v19.8h, v0.16b, v6.16b
+ mul v16.8h, v16.8h, v31.8h // 9*(in[i+1] + in[i+2])
+ mul v17.8h, v17.8h, v31.8h
+ sub v16.8h, v16.8h, v18.8h
+ sub v17.8h, v17.8h, v19.8h
+
+ sqrshrun v16.8b, v16.8h, #4
+ sqrshrun2 v16.16b, v17.8h, #4
+
+ zip1 v0.16b, v4.16b, v16.16b
+ zip2 v1.16b, v4.16b, v16.16b
+
+ st1 {v0.16b, v1.16b}, [x0]
+
+ ret
+endfunc
+
+// void ipred_z2_upsample_edge_8bpc_neon(pixel *out, const int sz,
+// const pixel *const in);
+function ipred_z2_upsample_edge_8bpc_neon, export=1
+ // Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
+ movrel x4, padding_mask
+ ld1 {v0.16b}, [x2] // in[]
+ add x5, x2, w1, uxtw // in[sz]
+ sub x4, x4, w1, uxtw
+
+ ld1r {v2.16b}, [x2] // in[0] for padding
+ ld1r {v1.16b}, [x5] // padding
+ ld1 {v3.16b}, [x4] // padding_mask
+
+ movi v31.8h, #9
+
+ bit v0.16b, v1.16b, v3.16b // padded in[]
+
+ ext v4.16b, v2.16b, v0.16b, #15
+ ext v5.16b, v0.16b, v1.16b, #1
+ ext v6.16b, v0.16b, v1.16b, #2
+
+ uaddl v16.8h, v0.8b, v5.8b // in[i+0] + in[i+1]
+ uaddl v18.8h, v4.8b, v6.8b // in[i-1] + in[i+2]
+ mul v16.8h, v16.8h, v31.8h // 9*(in[i+0] + in[i+1])
+ sub v16.8h, v16.8h, v18.8h
+
+ sqrshrun v16.8b, v16.8h, #4
+
+ add x5, x0, #16
+
+ zip1 v2.16b, v0.16b, v16.16b
+
+ // For sz=8, a single extra pixel is needed at out[16]; store it here.
+ st1 {v1.b}[0], [x5]
+ st1 {v2.16b}, [x0]
+
+ ret
+endfunc
+
+const edge_filter
+ .byte 0, 4, 8, 0
+ .byte 0, 5, 6, 0
+// Leaving out the coeffs for strength=3
+// .byte 2, 4, 4, 0
+endconst
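+// The rows store the taps of the 3-tap kernels used for strength 1 ((4,8,4)/16)
+// and strength 2 ((5,6,5)/16); strength 3 uses the 5-tap (2,4,4,4,2)/16 kernel
+// handled by L(fivetap) below.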
+
+// void ipred_z1_filter_edge_8bpc_neon(pixel *out, const int sz,
+// const pixel *const in, const int end,
+// const int strength);
+function ipred_z1_filter_edge_8bpc_neon, export=1
+ cmp w4, #3
+ b.eq L(fivetap) // if (strength == 3) goto fivetap
+
+ movrel x5, edge_filter, -3
+ add x5, x5, w4, uxtw #2 // edge_filter + (strength - 1)*4 + 1
+
+ ld1 {v31.h}[0], [x5] // kernel[1-2]
+
+ ld1 {v0.16b}, [x2], #16
+
+ dup v30.16b, v31.b[0]
+ dup v31.16b, v31.b[1]
+1:
+ // in[end] is the last valid pixel. We produce 16 pixels out by
+ // using 18 pixels in - the last pixel used is [17] of the ones
+ // read/buffered.
+ cmp w3, #17
+ ld1 {v1.16b}, [x2], #16
+ b.lt 2f
+ ext v2.16b, v0.16b, v1.16b, #1
+ ext v3.16b, v0.16b, v1.16b, #2
+ umull v4.8h, v0.8b, v30.8b
+ umlal v4.8h, v2.8b, v31.8b
+ umlal v4.8h, v3.8b, v30.8b
+ umull2 v5.8h, v0.16b, v30.16b
+ umlal2 v5.8h, v2.16b, v31.16b
+ umlal2 v5.8h, v3.16b, v30.16b
+ subs w1, w1, #16
+ mov v0.16b, v1.16b
+ rshrn v4.8b, v4.8h, #4
+ rshrn2 v4.16b, v5.8h, #4
+ sub w3, w3, #16
+ st1 {v4.16b}, [x0], #16
+ b.gt 1b
+ ret
+2:
+ // Right padding
+
+ // x2[w3-32] is the padding pixel (x2 points 32 bytes ahead)
+ movrel x5, padding_mask
+ sub w6, w3, #32
+ sub x5, x5, w3, uxtw
+ add x6, x2, w6, sxtw
+
+ ld1 {v2.16b}, [x5] // padding_mask
+
+ ld1r {v1.16b}, [x6]
+ bit v0.16b, v1.16b, v2.16b // Pad v0-v1
+
+ // Filter one block
+ ext v2.16b, v0.16b, v1.16b, #1
+ ext v3.16b, v0.16b, v1.16b, #2
+ umull v4.8h, v0.8b, v30.8b
+ umlal v4.8h, v2.8b, v31.8b
+ umlal v4.8h, v3.8b, v30.8b
+ umull2 v5.8h, v0.16b, v30.16b
+ umlal2 v5.8h, v2.16b, v31.16b
+ umlal2 v5.8h, v3.16b, v30.16b
+ subs w1, w1, #16
+ rshrn v4.8b, v4.8h, #4
+ rshrn2 v4.16b, v5.8h, #4
+ st1 {v4.16b}, [x0], #16
+ b.le 9f
+5:
+ // After one block, any remaining output would only be filtering
+ // padding - thus just store the padding.
+ subs w1, w1, #16
+ st1 {v1.16b}, [x0], #16
+ b.gt 5b
+9:
+ ret
+
+L(fivetap):
+ sub x2, x2, #1 // topleft -= 1
+ movi v29.16b, #2
+ ld1 {v0.16b}, [x2], #16
+ movi v30.16b, #4
+ movi v31.16b, #4
+ ins v0.b[0], v0.b[1]
+1:
+ // in[end+1] is the last valid pixel. We produce 16 pixels out by
+ // using 20 pixels in - the last pixel used is [19] of the ones
+ // read/buffered.
+ cmp w3, #18
+ ld1 {v1.16b}, [x2], #16
+ b.lt 2f // if (end + 1 < 19)
+ ext v2.16b, v0.16b, v1.16b, #1
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v0.16b, v1.16b, #3
+ ext v5.16b, v0.16b, v1.16b, #4
+ umull v6.8h, v0.8b, v29.8b
+ umlal v6.8h, v2.8b, v30.8b
+ umlal v6.8h, v3.8b, v31.8b
+ umlal v6.8h, v4.8b, v30.8b
+ umlal v6.8h, v5.8b, v29.8b
+ umull2 v7.8h, v0.16b, v29.16b
+ umlal2 v7.8h, v2.16b, v30.16b
+ umlal2 v7.8h, v3.16b, v31.16b
+ umlal2 v7.8h, v4.16b, v30.16b
+ umlal2 v7.8h, v5.16b, v29.16b
+ subs w1, w1, #16
+ mov v0.16b, v1.16b
+ rshrn v6.8b, v6.8h, #4
+ rshrn2 v6.16b, v7.8h, #4
+ sub w3, w3, #16
+ st1 {v6.16b}, [x0], #16
+ b.gt 1b
+ ret
+2:
+ // Right padding
+
+ // x2[w3+1-32] is the padding pixel (x2 points 32 bytes ahead)
+ movrel x5, padding_mask, -1
+ sub w6, w3, #31
+ sub x5, x5, w3, uxtw
+ add x6, x2, w6, sxtw
+
+ ld1 {v2.16b, v3.16b}, [x5] // padding_mask
+
+ ld1r {v28.16b}, [x6]
+ bit v0.16b, v28.16b, v2.16b // Pad v0-v1
+ bit v1.16b, v28.16b, v3.16b
+4:
+ // Filter one block
+ ext v2.16b, v0.16b, v1.16b, #1
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v0.16b, v1.16b, #3
+ ext v5.16b, v0.16b, v1.16b, #4
+ umull v6.8h, v0.8b, v29.8b
+ umlal v6.8h, v2.8b, v30.8b
+ umlal v6.8h, v3.8b, v31.8b
+ umlal v6.8h, v4.8b, v30.8b
+ umlal v6.8h, v5.8b, v29.8b
+ umull2 v7.8h, v0.16b, v29.16b
+ umlal2 v7.8h, v2.16b, v30.16b
+ umlal2 v7.8h, v3.16b, v31.16b
+ umlal2 v7.8h, v4.16b, v30.16b
+ umlal2 v7.8h, v5.16b, v29.16b
+ subs w1, w1, #16
+ mov v0.16b, v1.16b
+ mov v1.16b, v28.16b
+ rshrn v6.8b, v6.8h, #4
+ rshrn2 v6.16b, v7.8h, #4
+ sub w3, w3, #16
+ st1 {v6.16b}, [x0], #16
+ b.le 9f
+ // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
+ // filter properly once more - aka (w3 >= 0).
+ cmp w3, #0
+ b.ge 4b
+5:
+ // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
+ // last valid pixel - thus just output that without filtering.
+ subs w1, w1, #16
+ st1 {v1.16b}, [x0], #16
+ b.gt 5b
+9:
+ ret
+endfunc
+
+// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px,
+// const int n);
+function ipred_pixel_set_8bpc_neon, export=1
+ dup v0.16b, w1
+1:
+ subs w2, w2, #16
+ st1 {v0.16b}, [x0], #16
+ b.gt 1b
+ ret
+endfunc
+
+// void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const top,
+// const int width, const int height,
+// const int dx, const int max_base_x);
+function ipred_z1_fill1_8bpc_neon, export=1
+ clz w9, w3
+ adr x8, L(ipred_z1_fill1_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x8, w9, uxtw #1]
+ add x10, x2, w6, uxtw // top[max_base_x]
+ sub x8, x8, w9, uxtw
+ ld1r {v31.16b}, [x10] // padding
+ mov w7, w5
+ mov w15, #64
+ br x8
+40:
+ AARCH64_VALID_JUMP_TARGET
+4:
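+ // xpos has 6 fractional bits: base = xpos >> 6 indexes top[], and
+ // frac = xpos & 0x3e is the interpolation weight in 1/64 units (with the
+ // lowest bit dropped)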
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 49f
+ ldr d0, [x2, w8, uxtw] // top[base]
+ ldr d2, [x2, w10, uxtw]
+ dup v4.4h, w9 // frac
+ dup v5.4h, w11
+ ext v1.8b, v0.8b, v0.8b, #1 // top[base+1]
+ ext v3.8b, v2.8b, v2.8b, #1
+ usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base]
+ usubl v7.8h, v3.8b, v2.8b
+ ushll v16.8h, v0.8b, #6 // top[base]*64
+ ushll v17.8h, v2.8b, #6
+ mla v16.4h, v6.4h, v4.4h // + top[base+1]*frac
+ mla v17.4h, v7.4h, v5.4h
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.s}[0], [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.s}[0], [x0], x1
+ b.gt 4b
+ ret
+
+49:
+ st1 {v31.s}[0], [x0], x1
+ subs w4, w4, #2
+ st1 {v31.s}[0], [x0], x1
+ b.gt 49b
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+8:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 89f
+ ldr q0, [x2, w8, uxtw] // top[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.8b, w9 // frac
+ dup v5.8b, w11
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.8b, w9 // 64 - frac
+ dup v7.8b, w11
+ ext v1.16b, v0.16b, v0.16b, #1 // top[base+1]
+ ext v3.16b, v2.16b, v2.16b, #1
+ umull v16.8h, v0.8b, v6.8b // top[base]*(64-frac)
+ umlal v16.8h, v1.8b, v4.8b // + top[base+1]*frac
+ umull v17.8h, v2.8b, v7.8b
+ umlal v17.8h, v3.8b, v5.8b
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.8b}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.8b}, [x0], x1
+ b.gt 8b
+ ret
+
+89:
+ st1 {v31.8b}, [x0], x1
+ subs w4, w4, #2
+ st1 {v31.8b}, [x0], x1
+ b.gt 89b
+ ret
+
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+
+ mov w12, w3
+
+ add x13, x0, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw
+1:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 169f
+ add x8, x2, w8, uxtw
+ add x10, x2, w10, uxtw
+ dup v4.16b, w9 // frac
+ dup v5.16b, w11
+ ld1 {v0.16b, v1.16b}, [x8], #32 // top[base]
+ ld1 {v2.16b, v3.16b}, [x10], #32
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.16b, w9 // 64 - frac
+ dup v7.16b, w11
+ add w7, w7, w5 // xpos += dx
+2:
+ ext v16.16b, v0.16b, v1.16b, #1 // top[base+1]
+ ext v17.16b, v2.16b, v3.16b, #1
+ subs w3, w3, #16
+ umull v18.8h, v0.8b, v6.8b // top[base]*(64-frac)
+ umlal v18.8h, v16.8b, v4.8b // + top[base+1]*frac
+ umull2 v19.8h, v0.16b, v6.16b
+ umlal2 v19.8h, v16.16b, v4.16b
+ umull v20.8h, v2.8b, v7.8b
+ umlal v20.8h, v17.8b, v5.8b
+ umull2 v21.8h, v2.16b, v7.16b
+ umlal2 v21.8h, v17.16b, v5.16b
+ rshrn v16.8b, v18.8h, #6
+ rshrn2 v16.16b, v19.8h, #6
+ rshrn v17.8b, v20.8h, #6
+ rshrn2 v17.16b, v21.8h, #6
+ st1 {v16.16b}, [x0], #16
+ st1 {v17.16b}, [x13], #16
+ b.le 3f
+ mov v0.16b, v1.16b
+ ld1 {v1.16b}, [x8], #16 // top[base]
+ mov v2.16b, v3.16b
+ ld1 {v3.16b}, [x10], #16
+ b 2b
+
+3:
+ subs w4, w4, #2
+ b.le 9f
+ add x0, x0, x1
+ add x13, x13, x1
+ mov w3, w12
+ b 1b
+9:
+ ret
+
+169:
+ st1 {v31.16b}, [x0], #16
+ subs w3, w3, #16
+ st1 {v31.16b}, [x13], #16
+ b.gt 169b
+ subs w4, w4, #2
+ b.le 9b
+ add x0, x0, x1
+ add x13, x13, x1
+ mov w3, w12
+ b 169b
+
+L(ipred_z1_fill1_tbl):
+ .hword L(ipred_z1_fill1_tbl) - 640b
+ .hword L(ipred_z1_fill1_tbl) - 320b
+ .hword L(ipred_z1_fill1_tbl) - 160b
+ .hword L(ipred_z1_fill1_tbl) - 80b
+ .hword L(ipred_z1_fill1_tbl) - 40b
+endfunc
+
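+// The fill2 variant appears to handle the 2x upsampled top edge (only
+// the 4- and 8-wide paths are needed): each output pixel interpolates
+// between an even/odd pair of top[] samples, split out with uzp1/uzp2
+// below, so successive pixels step two source samples apart.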
+function ipred_z1_fill2_8bpc_neon, export=1
+ cmp w3, #8
+ add x10, x2, w6, uxtw // top[max_base_x]
+ ld1r {v31.16b}, [x10] // padding
+ mov w7, w5
+ mov w15, #64
+ b.eq 8f
+
+4: // w == 4
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 49f
+ ldr d0, [x2, w8, uxtw] // top[base]
+ ldr d2, [x2, w10, uxtw]
+ dup v4.4h, w9 // frac
+ dup v5.4h, w11
+ uzp2 v1.8b, v0.8b, v0.8b // top[base+1]
+ uzp1 v0.8b, v0.8b, v0.8b // top[base]
+ uzp2 v3.8b, v2.8b, v2.8b
+ uzp1 v2.8b, v2.8b, v2.8b
+ usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base]
+ usubl v7.8h, v3.8b, v2.8b
+ ushll v16.8h, v0.8b, #6 // top[base]*64
+ ushll v17.8h, v2.8b, #6
+ mla v16.4h, v6.4h, v4.4h // + top[base+1]*frac
+ mla v17.4h, v7.4h, v5.4h
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.s}[0], [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.s}[0], [x0], x1
+ b.gt 4b
+ ret
+
+49:
+ st1 {v31.s}[0], [x0], x1
+ subs w4, w4, #2
+ st1 {v31.s}[0], [x0], x1
+ b.gt 49b
+ ret
+
+8: // w == 8
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 89f
+ ldr q0, [x2, w8, uxtw] // top[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.8b, w9 // frac
+ dup v5.8b, w11
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.8b, w9 // 64 - frac
+ dup v7.8b, w11
+ uzp2 v1.16b, v0.16b, v0.16b // top[base+1]
+ uzp1 v0.16b, v0.16b, v0.16b // top[base]
+ uzp2 v3.16b, v2.16b, v2.16b
+ uzp1 v2.16b, v2.16b, v2.16b
+ umull v16.8h, v1.8b, v4.8b // top[base+1]*frac
+ umlal v16.8h, v0.8b, v6.8b // + top[base]*(64-frac)
+ umull v17.8h, v3.8b, v5.8b
+ umlal v17.8h, v2.8b, v7.8b
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.8b}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.8b}, [x0], x1
+ b.gt 8b
+ ret
+
+89:
+ st1 {v31.8b}, [x0], x1
+ subs w4, w4, #2
+ st1 {v31.8b}, [x0], x1
+ b.gt 89b
+ ret
+endfunc
+
+// void ipred_reverse_8bpc_neon(pixel *dst, const pixel *const src,
+// const int n);
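+// Rough scalar equivalent (n is processed in 16-pixel blocks, so it is
+// effectively rounded up to a multiple of 16):
+//   for (int i = 0; i < n; i++)
+//       dst[i] = src[-(i + 1)];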
+function ipred_reverse_8bpc_neon, export=1
+ sub x1, x1, #16
+ add x3, x0, #8
+ mov x4, #16
+1:
+ ld1 {v0.16b}, [x1]
+ subs w2, w2, #16
+ rev64 v0.16b, v0.16b
+ sub x1, x1, #16
+ st1 {v0.d}[1], [x0], x4
+ st1 {v0.d}[0], [x3], x4
+ b.gt 1b
+ ret
+endfunc
+
+const increments
+ .short 0, 1, 2, 3, 4, 5, 6, 7
+ .short 8, 9, 10, 11, 12, 13, 14, 15
+endconst
+
+// void ipred_z2_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const top,
+// const pixel *const left,
+// const int width, const int height,
+// const int dx, const int dy);
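+// Per output pixel, both candidates are formed as 6-bit interpolations,
+// one between two adjacent top[] samples and one between two adjacent
+// left[] samples; the cmge/bit pairs in the loops below then pick the
+// top[] result for lanes whose base_x (as tracked here) is still
+// non-negative, and the left[] result otherwise. Rough per-pixel sketch:
+//   t = (top[bx]  * (64 - frac_x) + top[bx + 1]  * frac_x + 32) >> 6;
+//   l = (left[by] * (64 - frac_y) + left[by + 1] * frac_y + 32) >> 6;
+//   dst = base_x >= 0 ? t : l;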
+function ipred_z2_fill1_8bpc_neon, export=1
+ clz w10, w4
+ adr x9, L(ipred_z2_fill1_tbl)
+ sub w10, w10, #25
+ ldrh w10, [x9, w10, uxtw #1]
+ mov w8, #(1 << 6) // xpos = 1 << 6
+ sub x9, x9, w10, uxtw
+ sub w8, w8, w6 // xpos -= dx
+
+ movrel x11, increments
+ ld1 {v31.8h}, [x11] // increments
+ neg w7, w7 // -dy
+
+ br x9
+40:
+ AARCH64_VALID_JUMP_TARGET
+
+ dup v30.4h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
+ movi v25.16b, #0x3e
+ add v30.4h, v16.4h, v30.4h // -= dy
+
+ xtn v31.8b, v31.8h // {0,1,2,3}
+
+ // Worst case height for w=4 is 16, but we need at least h+1 elements
+ ld1 {v0.16b, v1.16b}, [x3] // left[]
+
+ movi v26.16b, #64
+ movi v19.16b, #2
+
+ xtn v27.8b, v30.8h // (uint8_t)ypos
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v27.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
+
+ add v30.8b, v29.8b, v17.8b // base_y + 1
+ add v28.8b, v29.8b, v19.8b // base_y + 2
+
+ tbl v16.8b, {v0.16b}, v29.8b // left[base_y]
+
+ trn1 v30.2s, v30.2s, v28.2s // base_y + 1, base_y + 2
+
+ sub v28.8b, v26.8b, v27.8b // 64 - frac_y
+
+ trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3}
+
+ trn1 v27.2s, v27.2s, v27.2s // frac_y
+ trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y
+
+ movi v29.8b, #2
+4:
+ asr w9, w8, #6 // base_x
+ dup v6.4h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-4 // base_x <= -4
+ asr w11, w8, #6 // base_x
+ b.le 49f
+
+ dup v7.4h, w8 // xpos
+
+ ldr d2, [x2, w9, sxtw] // top[base_x]
+ ldr d4, [x2, w11, sxtw]
+
+ trn1 v6.2d, v6.2d, v7.2d // xpos
+
+ // Cut corners here; only doing tbl over v0; we only
+ // seem to need the last pixel, from v1, after skipping to the
+ // left-only codepath below.
+ tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2]
+
+ shrn v20.8b, v6.8h, #6 // first base_x for each row
+ xtn v6.8b, v6.8h // (uint8_t)xpos
+
+ ext v3.8b, v2.8b, v2.8b, #1 // top[base_x+1]
+ ext v5.8b, v4.8b, v4.8b, #1
+
+ and v6.8b, v6.8b, v25.8b // frac_x
+
+ trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1]
+
+ trn1 v2.2s, v2.2s, v4.2s // top[base_x]
+ trn1 v3.2s, v3.2s, v5.2s // top[base_x+1]
+
+ sub v7.8b, v26.8b, v6.8b // 64 - frac_x
+
+ add v20.8b, v20.8b, v31.8b // actual base_x
+
+ umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
+
+ umull v22.8h, v2.8b, v7.8b // top[base_x]*(64-frac_x)
+ umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x
+
+ cmge v20.8b, v20.8b, #0
+
+ rshrn v16.8b, v16.8h, #6
+ rshrn v22.8b, v22.8h, #6
+
+ bit v16.8b, v22.8b, v20.8b
+
+ st1 {v16.s}[0], [x0], x1
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2
+ st1 {v16.s}[1], [x0], x1
+ b.le 9f
+
+ ext v16.8b, v17.8b, v17.8b, #4
+ add v30.8b, v30.8b, v29.8b // base_y += 2
+ b 4b
+
+49:
+ tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+2]
+
+ trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1]
+
+ umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
+ rshrn v18.8b, v18.8h, #6
+
+ st1 {v18.s}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v18.s}[1], [x0], x1
+ b.le 9f
+
+ ext v16.8b, v17.8b, v17.8b, #4
+ add v30.8b, v30.8b, v29.8b // base_y += 2
+ b 49b
+
+9:
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+
+ dup v30.8h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy
+ movi v25.16b, #0x3e
+ add v30.8h, v16.8h, v30.8h // -= dy
+
+ xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
+
+ // Worst case height for w=8 is 32, but we need at least h+1 elements
+ ld1 {v0.16b, v1.16b, v2.16b}, [x3] // left[]
+
+ movi v26.16b, #64
+ movi v19.16b, #2
+
+ xtn v27.8b, v30.8h // (uint8_t)ypos
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v27.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
+
+ // Cut corners here; for the first row we don't expect to need to
+ // read outside of v0.
+ tbl v18.8b, {v0.16b}, v29.8b // left[base_y]
+
+ add v30.8b, v29.8b, v19.8b // base_y + 2
+ add v29.8b, v29.8b, v17.8b // base_y + 1
+
+ sub v28.8b, v26.8b, v27.8b // 64 - frac_y
+
+ trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
+
+ movi v24.8b, #2 // 2
+8:
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-8 // base_x <= -8
+ asr w11, w8, #6 // base_x
+ b.le 89f
+
+ dup v17.8h, w8 // xpos
+
+ ldr q4, [x2, w9, sxtw] // top[base_x]
+ ldr q6, [x2, w11, sxtw]
+
+ // Cut corners here; only doing tbl over v0-v1 here; we only
+ // seem to need the last pixel, from v2, after skipping to the
+ // left-only codepath below.
+ tbl v19.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+1]
+
+ shrn v21.8b, v16.8h, #6 // first base_x
+ shrn2 v21.16b, v17.8h, #6
+ xtn v16.8b, v16.8h // (uint8_t)xpos
+ xtn2 v16.16b, v17.8h
+
+ tbl v20.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+2]
+
+ ext v5.16b, v4.16b, v4.16b, #1 // top[base_x+1]
+ ext v7.16b, v6.16b, v6.16b, #1
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+
+ trn1 v4.2d, v4.2d, v6.2d // top[base_x]
+ trn1 v5.2d, v5.2d, v7.2d // top[base_x+1]
+
+ sub v7.16b, v26.16b, v16.16b // 64 - frac_x
+
+ add v21.16b, v21.16b, v31.16b // actual base_x
+
+ umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull v17.8h, v19.8b, v28.8b
+ umlal v17.8h, v20.8b, v27.8b
+
+ umull v22.8h, v4.8b, v7.8b // top[base_x]*(64-frac_x)
+ umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x
+ umull2 v23.8h, v4.16b, v7.16b
+ umlal2 v23.8h, v5.16b, v16.16b
+
+ cmge v21.16b, v21.16b, #0
+
+ rshrn v6.8b, v6.8h, #6
+ rshrn2 v6.16b, v17.8h, #6
+ rshrn v22.8b, v22.8h, #6
+ rshrn2 v22.16b, v23.8h, #6
+
+ bit v6.16b, v22.16b, v21.16b
+
+ st1 {v6.d}[0], [x0], x1
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2
+ st1 {v6.d}[1], [x0], x1
+ b.le 9f
+
+ mov v18.8b, v20.8b
+ add v29.8b, v29.8b, v24.8b // base_y += 2
+ add v30.8b, v30.8b, v24.8b // base_y += 2
+ b 8b
+
+89:
+ tbl v19.8b, {v0.16b, v1.16b, v2.16b}, v29.8b // left[base_y+1]
+ tbl v20.8b, {v0.16b, v1.16b, v2.16b}, v30.8b // left[base_y+2]
+
+ umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull v17.8h, v19.8b, v28.8b
+ umlal v17.8h, v20.8b, v27.8b
+
+ rshrn v6.8b, v6.8h, #6
+ rshrn2 v6.16b, v17.8h, #6
+
+ st1 {v6.d}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v6.d}[1], [x0], x1
+ b.le 9f
+
+ mov v18.8b, v20.8b
+ add v29.8b, v29.8b, v24.8b // base_y += 2
+ add v30.8b, v30.8b, v24.8b // base_y += 2
+ b 89b
+
+9:
+ ret
+
+160:
+ AARCH64_VALID_JUMP_TARGET
+
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ add x11, x11, #16 // increments
+
+ dup v18.8h, w7 // -dy
+ movi v17.16b, #1
+ add x3, x3, #1 // Skip past left[0]
+
+ ld1 {v14.8h}, [x11] // {8,9,10,11,12,13,14,15}
+
+ mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy
+ mul v19.8h, v14.8h, v18.8h // {8,9,10,11,12,13,14,15}* -dy
+ movi v25.16b, #0x3e
+ add v16.8h, v16.8h, v18.8h // -= dy
+ add v18.8h, v19.8h, v18.8h
+
+ xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
+ xtn2 v31.16b, v14.8h // {8,9,10,11,12,13,14,15}
+
+ // Worst case height is 64.
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[]
+ ld1r {v15.16b}, [x2] // left[0] == top[0]
+
+ movi v26.16b, #64
+ movi v19.16b, #2
+
+ xtn v27.8b, v16.8h // (uint8_t)ypos
+ xtn2 v27.16b, v18.8h
+ shrn v29.8b, v16.8h, #6 // ypos >> 6
+ shrn2 v29.16b, v18.8h, #6
+ mov v18.16b, v15.16b // left[0]
+ and v27.16b, v27.16b, v25.16b // frac_y
+
+ // Cut corners here; for the first row we don't expect to need to
+ // read outside of v0.
+ tbx v18.16b, {v0.16b}, v29.16b // left[base_y]
+
+ add v30.16b, v29.16b, v19.16b // base_y + 2
+ add v29.16b, v29.16b, v17.16b // base_y + 1
+
+ sub v28.16b, v26.16b, v27.16b // 64 - frac_y
+
+ movi v24.16b, #2 // 2
+16:
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-16 // base_x <= -16
+ asr w11, w8, #6 // base_x
+ b.le 169f
+
+ dup v17.8h, w8 // xpos
+
+ add x9, x2, w9, sxtw
+ add x11, x2, w11, sxtw
+
+ ld1 {v4.16b, v5.16b}, [x9] // top[base_x]
+ mov v19.16b, v15.16b // left[0]
+ ld1 {v6.16b, v7.16b}, [x11]
+
+ tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
+
+ mov v20.16b, v15.16b // left[0]
+
+ shrn v21.8b, v16.8h, #6 // first base_x
+ shrn v22.8b, v17.8h, #6
+ xtn v16.8b, v16.8h // (uint8_t)xpos
+ xtn v17.8b, v17.8h
+
+ tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
+
+ trn1 v21.2d, v21.2d, v21.2d // first base_x
+ trn1 v22.2d, v22.2d, v22.2d
+ trn1 v16.2d, v16.2d, v16.2d // (uint8_t)xpos
+ trn1 v17.2d, v17.2d, v17.2d
+
+ ext v5.16b, v4.16b, v5.16b, #1 // top[base_x+1]
+ ext v7.16b, v6.16b, v7.16b, #1
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+ and v17.16b, v17.16b, v25.16b
+
+ umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+
+ sub v8.16b, v26.16b, v16.16b // 64 - frac_x
+ sub v9.16b, v26.16b, v17.16b
+
+ umull2 v11.8h, v18.16b, v28.16b
+ umlal2 v11.8h, v19.16b, v27.16b
+
+ add v21.16b, v21.16b, v31.16b // actual base_x
+ add v22.16b, v22.16b, v31.16b
+
+ umull v12.8h, v19.8b, v28.8b
+ umlal v12.8h, v20.8b, v27.8b
+ umull2 v13.8h, v19.16b, v28.16b
+ umlal2 v13.8h, v20.16b, v27.16b
+
+ rshrn v10.8b, v10.8h, #6
+ rshrn2 v10.16b, v11.8h, #6
+ rshrn v11.8b, v12.8h, #6
+ rshrn2 v11.16b, v13.8h, #6
+
+ umull v12.8h, v4.8b, v8.8b // top[base_x]*(64-frac_x)
+ umlal v12.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x
+ umull2 v13.8h, v4.16b, v8.16b
+ umlal2 v13.8h, v5.16b, v16.16b
+ umull v14.8h, v6.8b, v9.8b
+ umlal v14.8h, v7.8b, v17.8b
+ umull2 v18.8h, v6.16b, v9.16b
+ umlal2 v18.8h, v7.16b, v17.16b
+
+ cmge v21.16b, v21.16b, #0
+ cmge v22.16b, v22.16b, #0
+
+ rshrn v12.8b, v12.8h, #6
+ rshrn2 v12.16b, v13.8h, #6
+ rshrn v13.8b, v14.8h, #6
+ rshrn2 v13.16b, v18.8h, #6
+
+ bit v10.16b, v12.16b, v21.16b
+ bit v11.16b, v13.16b, v22.16b
+
+ st1 {v10.16b}, [x0], x1
+ subs w5, w5, #2
+ sub w8, w8, w6 // xpos -= dx
+ st1 {v11.16b}, [x0], x1
+ b.le 9f
+
+ mov v18.16b, v20.16b
+ add v29.16b, v29.16b, v24.16b // base_y += 2
+ add v30.16b, v30.16b, v24.16b // base_y += 2
+ b 16b
+
+169:
+ mov v19.16b, v15.16b
+ mov v20.16b, v15.16b
+ tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
+ tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
+
+ umull v4.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v4.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull2 v5.8h, v18.16b, v28.16b
+ umlal2 v5.8h, v19.16b, v27.16b
+ umull v6.8h, v19.8b, v28.8b
+ umlal v6.8h, v20.8b, v27.8b
+ umull2 v7.8h, v19.16b, v28.16b
+ umlal2 v7.8h, v20.16b, v27.16b
+
+ rshrn v4.8b, v4.8h, #6
+ rshrn2 v4.16b, v5.8h, #6
+ rshrn v5.8b, v6.8h, #6
+ rshrn2 v5.16b, v7.8h, #6
+
+ st1 {v4.16b}, [x0], x1
+ subs w5, w5, #2
+ st1 {v5.16b}, [x0], x1
+ b.le 9f
+
+ mov v18.16b, v20.16b
+ add v29.16b, v29.16b, v24.16b // base_y += 2
+ add v30.16b, v30.16b, v24.16b // base_y += 2
+ b 169b
+
+9:
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ add x11, x11, #16 // increments
+
+ dup v25.8h, w7 // -dy
+ add x3, x3, #1 // Skip past left[0]
+
+ ld1 {v14.8h}, [x11] // {8,9,10,11,12,13,14,15}
+
+ add x13, x0, x1 // alternating row
+ lsl x1, x1, #1 // stride *= 2
+ sub x1, x1, w4, uxtw // stride -= width
+
+ movi v11.8h, #8
+ mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy
+ add v26.8h, v26.8h, v25.8h // -= dy
+ mul v25.8h, v25.8h, v11.8h // -8*dy
+
+ xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
+ xtn2 v31.16b, v14.8h // {8,9,10,11,12,13,14,15}
+
+ // Worst case height is 64.
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[]
+ ld1r {v15.16b}, [x2] // left[0] == top[0]
+
+ mov w12, w4 // orig w
+ neg w14, w4 // -w
+
+1:
+ mov v23.16b, v26.16b // reset ypos
+
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, w14 // base_x <= -w
+ asr w11, w8, #6 // base_x
+ b.le 329f
+
+ dup v17.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+
+ add x9, x2, w9, sxtw
+ add x11, x2, w11, sxtw
+
+ sqshrn v21.8b, v16.8h, #6 // first base_x
+ sqshrn v22.8b, v17.8h, #6
+ xtn v16.8b, v16.8h // (uint8_t)xpos
+ xtn v17.8b, v17.8h
+
+ ld1 {v4.16b}, [x9], #16 // top[base_x]
+ ld1 {v6.16b}, [x11], #16
+
+ trn1 v21.2d, v21.2d, v21.2d // first base_x
+ trn1 v22.2d, v22.2d, v22.2d
+ trn1 v16.2d, v16.2d, v16.2d // (uint8_t)xpos
+ trn1 v17.2d, v17.2d, v17.2d
+
+ movi v10.16b, #0x3e
+ movi v11.16b, #64
+
+ and v16.16b, v16.16b, v10.16b // frac_x
+ and v17.16b, v17.16b, v10.16b
+
+ sub v8.16b, v11.16b, v16.16b // 64 - frac_x
+ sub v9.16b, v11.16b, v17.16b
+
+ add v21.16b, v21.16b, v31.16b // actual base_x
+ add v22.16b, v22.16b, v31.16b
+
+2:
+ add v13.8h, v23.8h, v25.8h // ypos -= 8*dy
+ movi v12.16b, #64
+ movi v20.16b, #2
+ movi v10.16b, #0x3e
+
+ smov w10, v22.b[0]
+
+ xtn v27.8b, v23.8h // (uint8_t)ypos
+ xtn2 v27.16b, v13.8h
+ shrn v29.8b, v23.8h, #6 // ypos >> 6
+ shrn2 v29.16b, v13.8h, #6
+ cmp w10, #0 // base_x (bottom left) >= 0
+ and v27.16b, v27.16b, v10.16b // frac_y
+
+ mov v18.16b, v15.16b // left[0]
+
+ b.ge 4f
+
+ add v23.8h, v13.8h, v25.8h // ypos -= 8*dy
+ movi v13.16b, #1
+
+ tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
+ add v29.16b, v29.16b, v13.16b // base_y + 1
+ mov v19.16b, v15.16b // left[0]
+
+ sub v28.16b, v12.16b, v27.16b // 64 - frac_y
+
+ ld1 {v5.16b}, [x9], #16 // top[base_x]
+ ld1 {v7.16b}, [x11], #16
+
+ tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
+ add v29.16b, v29.16b, v13.16b // base_y + 2
+
+ mov v20.16b, v15.16b // left[0]
+ tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
+
+ umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull2 v11.8h, v18.16b, v28.16b
+ umlal2 v11.8h, v19.16b, v27.16b
+ umull v12.8h, v19.8b, v28.8b
+ umlal v12.8h, v20.8b, v27.8b
+ umull2 v13.8h, v19.16b, v28.16b
+ umlal2 v13.8h, v20.16b, v27.16b
+
+ ext v18.16b, v4.16b, v5.16b, #1 // top[base_x+1]
+ ext v19.16b, v6.16b, v7.16b, #1
+
+ rshrn v10.8b, v10.8h, #6
+ rshrn2 v10.16b, v11.8h, #6
+ rshrn v11.8b, v12.8h, #6
+ rshrn2 v11.16b, v13.8h, #6
+
+ umull v12.8h, v4.8b, v8.8b // top[base_x]*(64-frac_x)
+ umlal v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x
+ umull2 v13.8h, v4.16b, v8.16b
+ umlal2 v13.8h, v18.16b, v16.16b
+ umull v14.8h, v6.8b, v9.8b
+ umlal v14.8h, v19.8b, v17.8b
+ umull2 v20.8h, v6.16b, v9.16b
+ umlal2 v20.8h, v19.16b, v17.16b
+
+ cmge v18.16b, v21.16b, #0
+ cmge v19.16b, v22.16b, #0
+
+ rshrn v12.8b, v12.8h, #6
+ rshrn2 v12.16b, v13.8h, #6
+ rshrn v13.8b, v14.8h, #6
+ rshrn2 v13.16b, v20.8h, #6
+
+ bit v10.16b, v12.16b, v18.16b
+ bit v11.16b, v13.16b, v19.16b
+
+ st1 {v10.16b}, [x0], #16
+ subs w4, w4, #16
+ st1 {v11.16b}, [x13], #16
+ b.le 3f
+
+ movi v10.16b, #16
+ mov v4.16b, v5.16b
+ mov v6.16b, v7.16b
+ add v21.16b, v21.16b, v10.16b // base_x += 16
+ add v22.16b, v22.16b, v10.16b
+ b 2b
+
+3:
+ subs w5, w5, #2
+ b.le 9f
+ movi v10.8h, #128
+ add x0, x0, x1
+ add x13, x13, x1
+ mov w4, w12 // reset w
+ add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6)
+ b 1b
+
+4: // The rest of the row only predicted from top[]
+ ld1 {v5.16b}, [x9], #16 // top[base_x]
+ ld1 {v7.16b}, [x11], #16
+
+ ext v18.16b, v4.16b, v5.16b, #1 // top[base_x+1]
+ ext v19.16b, v6.16b, v7.16b, #1
+
+ umull v12.8h, v4.8b, v8.8b // top[base_x]*(64-frac_x)
+ umlal v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x
+ umull2 v13.8h, v4.16b, v8.16b
+ umlal2 v13.8h, v18.16b, v16.16b
+ umull v14.8h, v6.8b, v9.8b
+ umlal v14.8h, v19.8b, v17.8b
+ umull2 v20.8h, v6.16b, v9.16b
+ umlal2 v20.8h, v19.16b, v17.16b
+
+ rshrn v12.8b, v12.8h, #6
+ rshrn2 v12.16b, v13.8h, #6
+ rshrn v13.8b, v14.8h, #6
+ rshrn2 v13.16b, v20.8h, #6
+
+ st1 {v12.16b}, [x0], #16
+ subs w4, w4, #16
+ st1 {v13.16b}, [x13], #16
+ b.le 3b
+
+ mov v4.16b, v5.16b
+ mov v6.16b, v7.16b
+ b 4b
+
+329: // The rest of the block only predicted from left[]
+ add x1, x1, w4, uxtw // restore stride
+ mov w12, w5 // orig remaining h
+1:
+ add v13.8h, v23.8h, v25.8h // ypos -= 8*dy
+ movi v12.16b, #64
+ movi v10.16b, #0x3e
+
+ xtn v27.8b, v23.8h // (uint8_t)ypos
+ xtn2 v27.16b, v13.8h
+ shrn v29.8b, v23.8h, #6 // ypos >> 6
+ shrn2 v29.16b, v13.8h, #6
+ and v27.16b, v27.16b, v10.16b // frac_y
+
+ mov v18.16b, v15.16b // left[0]
+ add v23.8h, v13.8h, v25.8h // ypos -= 8*dy
+ movi v21.16b, #1
+
+ tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
+ add v29.16b, v29.16b, v21.16b // base_y + 1
+
+ sub v28.16b, v12.16b, v27.16b // 64 - frac_y
+2:
+ mov v19.16b, v15.16b // left[0]
+ tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
+ add v29.16b, v29.16b, v21.16b // base_y + 2
+ mov v20.16b, v15.16b // left[0]
+ tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
+ add v29.16b, v29.16b, v21.16b // next base_y
+
+ umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull2 v11.8h, v18.16b, v28.16b
+ umlal2 v11.8h, v19.16b, v27.16b
+ umull v12.8h, v19.8b, v28.8b
+ umlal v12.8h, v20.8b, v27.8b
+ umull2 v13.8h, v19.16b, v28.16b
+ umlal2 v13.8h, v20.16b, v27.16b
+
+ rshrn v10.8b, v10.8h, #6
+ rshrn2 v10.16b, v11.8h, #6
+ rshrn v11.8b, v12.8h, #6
+ rshrn2 v11.16b, v13.8h, #6
+
+ st1 {v10.16b}, [x0], x1
+ subs w5, w5, #2
+ st1 {v11.16b}, [x13], x1
+ b.le 3f
+ mov v18.16b, v20.16b
+ b 2b
+
+3:
+ subs w4, w4, #16
+ b.le 9f
+
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ lsl x1, x1, #1
+ add x0, x0, #16
+ add x13, x13, #16
+ mov w5, w12 // reset h
+ b 1b
+
+9:
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+
+L(ipred_z2_fill1_tbl):
+ .hword L(ipred_z2_fill1_tbl) - 640b
+ .hword L(ipred_z2_fill1_tbl) - 320b
+ .hword L(ipred_z2_fill1_tbl) - 160b
+ .hword L(ipred_z2_fill1_tbl) - 80b
+ .hword L(ipred_z2_fill1_tbl) - 40b
+endfunc
+
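+// The fill2 variant appears to handle the 2x upsampled top edge: xpos
+// starts at 2 << 6 and each output pixel reads an even/odd pair of
+// top[] samples (uzp1/uzp2 below), i.e. base_x advances by 2 per output
+// pixel; the left-edge side matches fill1.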
+function ipred_z2_fill2_8bpc_neon, export=1
+ cmp w4, #8
+ mov w8, #(2 << 6) // xpos = 2 << 6
+ sub w8, w8, w6 // xpos -= dx
+
+ movrel x11, increments
+ ld1 {v31.8h}, [x11] // increments
+ neg w7, w7 // -dy
+ b.eq 80f
+
+40:
+ dup v30.4h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
+ movi v25.16b, #0x3e
+ add v30.4h, v16.4h, v30.4h // -= dy
+
+ xtn v31.8b, v31.8h // {0,1,2,3}
+
+ // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
+ // from left.
+ ld1 {v0.16b}, [x3] // left[]
+
+ movi v26.16b, #64
+ movi v19.16b, #2
+
+ xtn v27.8b, v30.8h // (uint8_t)ypos
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v27.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
+
+ add v30.8b, v29.8b, v17.8b // base_y + 1
+ add v28.8b, v29.8b, v19.8b // base_y + 2
+
+ tbl v16.8b, {v0.16b}, v29.8b // left[base_y]
+
+ trn1 v30.2s, v30.2s, v28.2s // base_y + 1, base_y + 2
+
+ sub v28.8b, v26.8b, v27.8b // 64 - frac_y
+
+ trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3}
+
+ trn1 v27.2s, v27.2s, v27.2s // frac_y
+ trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y
+
+ movi v29.8b, #2
+ add v31.8b, v31.8b, v31.8b // {0,2,4,6,0,2,4,6}
+4:
+ asr w9, w8, #6 // base_x
+ dup v6.4h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-8 // base_x <= -8
+ asr w11, w8, #6 // base_x
+ b.le 49f
+
+ dup v7.4h, w8 // xpos
+
+ ldr d2, [x2, w9, sxtw] // top[base_x]
+ ldr d4, [x2, w11, sxtw]
+
+ trn1 v6.2d, v6.2d, v7.2d // xpos
+
+ tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2]
+
+ shrn v20.8b, v6.8h, #6 // first base_x for each row
+ xtn v6.8b, v6.8h // (uint8_t)xpos
+
+ uzp2 v3.8b, v2.8b, v4.8b // top[base_x+1]
+ uzp1 v2.8b, v2.8b, v4.8b // top[base_x]
+
+ and v6.8b, v6.8b, v25.8b // frac_x
+
+ trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1]
+
+ sub v7.8b, v26.8b, v6.8b // 64 - frac_x
+
+ add v20.8b, v20.8b, v31.8b // actual base_x
+
+ umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
+
+ umull v22.8h, v2.8b, v7.8b // top[base_x]*(64-frac_x)
+ umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x
+
+ cmge v20.8b, v20.8b, #0
+
+ rshrn v16.8b, v16.8h, #6
+ rshrn v22.8b, v22.8h, #6
+
+ bit v16.8b, v22.8b, v20.8b
+
+ st1 {v16.s}[0], [x0], x1
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2
+ st1 {v16.s}[1], [x0], x1
+ b.le 9f
+
+ ext v16.8b, v17.8b, v17.8b, #4
+ add v30.8b, v30.8b, v29.8b // base_y += 2
+ b 4b
+
+49:
+ tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2]
+
+ trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1]
+
+ umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
+ rshrn v18.8b, v18.8h, #6
+
+ st1 {v18.s}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v18.s}[1], [x0], x1
+ b.le 9f
+
+ ext v16.8b, v17.8b, v17.8b, #4
+ add v30.8b, v30.8b, v29.8b // base_y += 2
+ b 49b
+
+9:
+ ret
+
+80:
+ dup v30.8h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy
+ movi v25.16b, #0x3e
+ add v30.8h, v16.8h, v30.8h // -= dy
+
+ xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
+
+ // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
+ // from left.
+ ld1 {v0.16b}, [x3] // left[]
+
+ movi v26.16b, #64
+ movi v19.16b, #2
+
+ xtn v27.8b, v30.8h // (uint8_t)ypos
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v27.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
+
+ tbl v18.8b, {v0.16b}, v29.8b // left[base_y]
+
+ add v30.8b, v29.8b, v19.8b // base_y + 2
+ add v29.8b, v29.8b, v17.8b // base_y + 1
+
+ sub v28.8b, v26.8b, v27.8b // 64 - frac_y
+
+ trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
+
+ movi v24.8b, #2 // 2
+ add v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14}
+8:
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-16 // base_x <= -16
+ asr w11, w8, #6 // base_x
+ b.le 89f
+
+ dup v17.8h, w8 // xpos
+
+ ldr q4, [x2, w9, sxtw] // top[base_x]
+ ldr q6, [x2, w11, sxtw]
+
+ tbl v19.8b, {v0.16b}, v29.8b // left[base_y+1]
+
+ shrn v21.8b, v16.8h, #6 // first base_x
+ shrn2 v21.16b, v17.8h, #6
+ xtn v16.8b, v16.8h // (uint8_t)xpos
+ xtn2 v16.16b, v17.8h
+
+ tbl v20.8b, {v0.16b}, v30.8b // left[base_y+2]
+
+ uzp2 v5.16b, v4.16b, v6.16b // top[base_x+1]
+ uzp1 v4.16b, v4.16b, v6.16b // top[base_x]
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+
+ sub v7.16b, v26.16b, v16.16b // 64 - frac_x
+
+ add v21.16b, v21.16b, v31.16b // actual base_x
+
+ umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull v17.8h, v19.8b, v28.8b
+ umlal v17.8h, v20.8b, v27.8b
+
+ umull v22.8h, v4.8b, v7.8b // top[base_x]*(64-frac_x)
+ umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x
+ umull2 v23.8h, v4.16b, v7.16b
+ umlal2 v23.8h, v5.16b, v16.16b
+
+ cmge v21.16b, v21.16b, #0
+
+ rshrn v6.8b, v6.8h, #6
+ rshrn2 v6.16b, v17.8h, #6
+ rshrn v22.8b, v22.8h, #6
+ rshrn2 v22.16b, v23.8h, #6
+
+ bit v6.16b, v22.16b, v21.16b
+
+ st1 {v6.d}[0], [x0], x1
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2
+ st1 {v6.d}[1], [x0], x1
+ b.le 9f
+
+ mov v18.8b, v20.8b
+ add v29.8b, v29.8b, v24.8b // base_y += 2
+ add v30.8b, v30.8b, v24.8b // base_y += 2
+ b 8b
+
+89:
+ tbl v19.8b, {v0.16b}, v29.8b // left[base_y+1]
+ tbl v20.8b, {v0.16b}, v30.8b // left[base_y+2]
+
+ umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull v17.8h, v19.8b, v28.8b
+ umlal v17.8h, v20.8b, v27.8b
+
+ rshrn v6.8b, v6.8h, #6
+ rshrn2 v6.16b, v17.8h, #6
+
+ st1 {v6.d}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v6.d}[1], [x0], x1
+ b.le 9f
+
+ mov v18.8b, v20.8b
+ add v29.8b, v29.8b, v24.8b // base_y += 2
+ add v30.8b, v30.8b, v24.8b // base_y += 2
+ b 89b
+
+9:
+ ret
+endfunc
+
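+// The fill3 variant appears to handle the 2x upsampled left edge:
+// base_y starts at (ypos >> 6) + 2 and advances by 2 per output row
+// (hence the base_y+0/+2 and base_y+1/+3 index pairs and the += 4 step
+// per two rows); the top-edge side matches fill1.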
+function ipred_z2_fill3_8bpc_neon, export=1
+ cmp w4, #8
+ mov w8, #(1 << 6) // xpos = 1 << 6
+ sub w8, w8, w6 // xpos -= dx
+
+ movrel x11, increments
+ ld1 {v31.8h}, [x11] // increments
+ neg w7, w7 // -dy
+ b.eq 80f
+
+40:
+ dup v30.4h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
+ movi v25.16b, #0x3e
+ add v30.4h, v16.4h, v30.4h // -= dy
+
+ xtn v31.8b, v31.8h // {0,1,2,3}
+
+ // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
+ ld1 {v0.16b, v1.16b}, [x3] // left[]
+
+ movi v26.16b, #64
+ movi v19.16b, #2
+
+ xtn v27.8b, v30.8h // (uint8_t)ypos
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v27.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2
+
+ add v30.8b, v29.8b, v17.8b // base_y + 1
+ add v28.8b, v29.8b, v19.8b // base_y + 2
+
+ trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3}
+
+ add v24.8b, v30.8b, v19.8b // base_y + 3
+
+ trn1 v29.2s, v29.2s, v28.2s // base_y + 0, base_y + 2
+ trn1 v30.2s, v30.2s, v24.2s // base_y + 1, base_y + 3
+
+ sub v28.8b, v26.8b, v27.8b // 64 - frac_y
+
+ trn1 v27.2s, v27.2s, v27.2s // frac_y
+ trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y
+
+ movi v24.8b, #4
+4:
+ asr w9, w8, #6 // base_x
+ dup v6.4h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-4 // base_x <= -4
+ asr w11, w8, #6 // base_x
+ b.le 49f
+
+ dup v7.4h, w8 // xpos
+
+ ldr d2, [x2, w9, sxtw] // top[base_x]
+ ldr d4, [x2, w11, sxtw]
+
+ trn1 v6.2d, v6.2d, v7.2d // xpos
+
+ tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2]
+ tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3]
+
+ shrn v20.8b, v6.8h, #6 // first base_x for each row
+ xtn v6.8b, v6.8h // (uint8_t)xpos
+
+ ext v3.8b, v2.8b, v2.8b, #1 // top[base_x+1]
+ ext v5.8b, v4.8b, v4.8b, #1
+
+ and v6.8b, v6.8b, v25.8b // frac_x
+
+ trn1 v2.2s, v2.2s, v4.2s // top[base_x]
+ trn1 v3.2s, v3.2s, v5.2s // top[base_x+1]
+
+ sub v7.8b, v26.8b, v6.8b // 64 - frac_x
+
+ add v20.8b, v20.8b, v31.8b // actual base_x
+
+ umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
+
+ umull v22.8h, v2.8b, v7.8b // top[base_x]*(64-frac_x)
+ umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x
+
+ cmge v20.8b, v20.8b, #0
+
+ rshrn v16.8b, v16.8h, #6
+ rshrn v22.8b, v22.8h, #6
+
+ bit v16.8b, v22.8b, v20.8b
+
+ st1 {v16.s}[0], [x0], x1
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2
+ st1 {v16.s}[1], [x0], x1
+ b.le 9f
+
+ add v29.8b, v29.8b, v24.8b // base_y += 4
+ add v30.8b, v30.8b, v24.8b // base_y += 4
+ b 4b
+
+49:
+ tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2]
+ tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3]
+
+ umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
+ rshrn v18.8b, v18.8h, #6
+
+ st1 {v18.s}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v18.s}[1], [x0], x1
+ b.le 9f
+
+ add v29.8b, v29.8b, v24.8b // base_y += 4
+ add v30.8b, v30.8b, v24.8b // base_y += 4
+ b 49b
+
+9:
+ ret
+
+80:
+ dup v30.8h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy
+ movi v25.16b, #0x3e
+ add v30.8h, v16.8h, v30.8h // -= dy
+
+ xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
+
+ // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
+ ld1 {v0.16b, v1.16b, v2.16b}, [x3] // left[]
+
+ movi v26.16b, #64
+ movi v19.16b, #2
+
+ xtn v27.8b, v30.8h // (uint8_t)ypos
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v27.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2
+
+ add v28.8b, v29.8b, v17.8b // base_y + 1
+ add v30.8b, v29.8b, v19.8b // base_y + 2
+
+ trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
+ add v24.8b, v28.8b, v19.8b // base_y + 3
+
+ trn1 v29.2d, v29.2d, v30.2d // base_y + 0, base_y + 2
+ trn1 v30.2d, v28.2d, v24.2d // base_y + 1, base_y + 3
+
+ sub v28.8b, v26.8b, v27.8b // 64 - frac_y
+
+ movi v24.16b, #4
+
+ trn1 v27.2d, v27.2d, v27.2d // frac_y
+ trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y
+8:
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-8 // base_x <= -8
+ asr w11, w8, #6 // base_x
+ b.le 89f
+
+ dup v17.8h, w8 // xpos
+
+ ldr q4, [x2, w9, sxtw] // top[base_x]
+ ldr q6, [x2, w11, sxtw]
+
+ tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
+ tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]
+
+ shrn v21.8b, v16.8h, #6 // first base_x
+ shrn2 v21.16b, v17.8h, #6
+ xtn v16.8b, v16.8h // (uint8_t)xpos
+ xtn2 v16.16b, v17.8h
+
+ ext v5.16b, v4.16b, v4.16b, #1 // top[base_x+1]
+ ext v7.16b, v6.16b, v6.16b, #1
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+
+ trn1 v4.2d, v4.2d, v6.2d // top[base_x]
+ trn1 v5.2d, v5.2d, v7.2d // top[base_x+1]
+
+ sub v7.16b, v26.16b, v16.16b // 64 - frac_x
+
+ add v21.16b, v21.16b, v31.16b // actual base_x
+
+ umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull2 v17.8h, v18.16b, v28.16b
+ umlal2 v17.8h, v19.16b, v27.16b
+
+ umull v22.8h, v4.8b, v7.8b // top[base_x]*(64-frac_x)
+ umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x
+ umull2 v23.8h, v4.16b, v7.16b
+ umlal2 v23.8h, v5.16b, v16.16b
+
+ cmge v21.16b, v21.16b, #0
+
+ rshrn v6.8b, v6.8h, #6
+ rshrn2 v6.16b, v17.8h, #6
+ rshrn v22.8b, v22.8h, #6
+ rshrn2 v22.16b, v23.8h, #6
+
+ bit v6.16b, v22.16b, v21.16b
+
+ st1 {v6.d}[0], [x0], x1
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2
+ st1 {v6.d}[1], [x0], x1
+ b.le 9f
+
+ add v29.16b, v29.16b, v24.16b // base_y += 4
+ add v30.16b, v30.16b, v24.16b // base_y += 4
+ b 8b
+
+89:
+ tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
+ tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]
+
+ umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull2 v17.8h, v18.16b, v28.16b
+ umlal2 v17.8h, v19.16b, v27.16b
+
+ rshrn v6.8b, v6.8h, #6
+ rshrn2 v6.16b, v17.8h, #6
+
+ st1 {v6.d}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v6.d}[1], [x0], x1
+ b.le 9f
+
+ add v29.16b, v29.16b, v24.16b // base_y += 4
+ add v30.16b, v30.16b, v24.16b // base_y += 4
+ b 89b
+
+9:
+ ret
+endfunc
+
+
+// void ipred_z3_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const left,
+// const int width, const int height,
+// const int dy, const int max_base_y);
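+// z3 predicts purely from the left edge (essentially z1 rotated by 90
+// degrees). Rough scalar sketch per column x, with ypos = (x + 1) * dy,
+// frac = ypos & 0x3e and base = ypos >> 6 (no upsampling):
+//   for (y = 0; y < height; y++, base++)
+//       dst[y][x] = base < max_base_y
+//           ? (left[base] * (64 - frac) + left[base + 1] * frac + 32) >> 6
+//           : left[max_base_y];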
+function ipred_z3_fill1_8bpc_neon, export=1
+ cmp w6, #64
+ clz w9, w3
+ adr x8, L(ipred_z3_fill1_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x8, w9, uxtw #1]
+ add x10, x2, w6, uxtw // left[max_base_y]
+ sub x8, x8, w9, uxtw
+ movrel x11, increments
+ ld1r {v31.16b}, [x10] // padding
+ ld1 {v30.8h}, [x11] // increments
+ mov w7, w5
+ b.gt L(ipred_z3_fill1_large_h16)
+ br x8
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+ dup v29.4h, w5 // dy
+
+ mul v30.4h, v30.4h, v29.4h // {0,1,2,3}*dy
+ movi v23.16b, #0x3e
+
+ // Worst case max_base_y is width+height-1, for w=4, h=16, <= 32
+ ld1 {v0.16b, v1.16b}, [x2] // left[]
+ add v30.4h, v29.4h, v30.4h // ypos
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+ xtn v24.8b, v30.8h // (uint8_t)ypos
+ uqshrn v26.8b, v30.8h, #6 // base
+ and v24.8b, v24.8b, v23.8b // frac
+
+ mov v4.8b, v31.8b
+ uqadd v27.8b, v26.8b, v20.8b // base + 1
+ uqadd v28.8b, v26.8b, v21.8b // base + 2
+ sub v25.8b, v22.8b, v24.8b // 64 - frac
+
+ tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base]
+
+ trn1 v27.2s, v27.2s, v28.2s // base + 1, base + 2
+ trn1 v24.2s, v24.2s, v24.2s // frac
+ trn1 v25.2s, v25.2s, v25.2s // 64 - frac
+1:
+ mov v5.8b, v31.8b
+ tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2]
+
+ trn1 v4.2s, v4.2s, v5.2s // left[base], left[base+1]
+
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ rshrn v16.8b, v16.8h, #6
+ st1 {v16.s}[0], [x0], x1
+ subs w4, w4, #2
+ st1 {v16.s}[1], [x0], x1
+ b.le 9f
+
+ ext v4.8b, v5.8b, v5.8b, #4
+ uqadd v27.8b, v27.8b, v21.8b // base += 2
+ b 1b
+
+9:
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+ dup v29.8h, w5 // dy
+
+ mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy
+ movi v23.16b, #0x3e
+
+ // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48
+ ld1 {v0.16b, v1.16b, v2.16b}, [x2] // left[]
+ add v30.8h, v29.8h, v30.8h // ypos
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+ xtn v24.8b, v30.8h // (uint8_t)ypos
+ uqshrn v26.8b, v30.8h, #6 // base
+ and v24.8b, v24.8b, v23.8b // frac
+
+ mov v4.8b, v31.8b
+ uqadd v27.8b, v26.8b, v20.8b // base + 1
+ uqadd v28.8b, v26.8b, v21.8b // base + 2
+ sub v25.8b, v22.8b, v24.8b // 64 - frac
+
+ tbx v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base]
+1:
+ mov v5.8b, v31.8b
+ mov v6.8b, v31.8b
+ tbx v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1]
+ tbx v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2]
+
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ umull v17.8h, v5.8b, v25.8b
+ umlal v17.8h, v6.8b, v24.8b
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.8b}, [x0], x1
+ subs w4, w4, #2
+ st1 {v17.8b}, [x0], x1
+ b.le 9f
+
+ mov v4.8b, v6.8b
+ uqadd v27.8b, v27.8b, v21.8b // base += 2
+ uqadd v28.8b, v28.8b, v21.8b // base += 2
+ b 1b
+
+9:
+ ret
+
+160:
+ AARCH64_VALID_JUMP_TARGET
+ dup v28.8h, w5 // dy
+
+ shl v29.8h, v28.8h, #3 // 8*dy
+ mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy
+ movi v23.16b, #0x3e
+
+ // This is only executed if we've checked that max_base_y <= 64.
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
+ add v28.8h, v28.8h, v30.8h // ypos
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+ add v29.8h, v28.8h, v29.8h // ypos + 8*dy
+
+ xtn v24.8b, v28.8h // (uint8_t)ypos
+ xtn2 v24.16b, v29.8h
+ uqshrn v26.8b, v28.8h, #6 // base
+ uqshrn2 v26.16b, v29.8h, #6
+ and v24.16b, v24.16b, v23.16b // frac
+
+ mov v4.16b, v31.16b
+ uqadd v27.16b, v26.16b, v20.16b // base + 1
+ uqadd v28.16b, v26.16b, v21.16b // base + 2
+ sub v25.16b, v22.16b, v24.16b // 64 - frac
+
+ tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base]
+1:
+ mov v5.16b, v31.16b
+ mov v6.16b, v31.16b
+ tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1]
+ tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.16b // left[base+2]
+
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ umull2 v17.8h, v4.16b, v25.16b
+ umlal2 v17.8h, v5.16b, v24.16b
+ umull v18.8h, v5.8b, v25.8b
+ umlal v18.8h, v6.8b, v24.8b
+ umull2 v19.8h, v5.16b, v25.16b
+ umlal2 v19.8h, v6.16b, v24.16b
+ rshrn v16.8b, v16.8h, #6
+ rshrn2 v16.16b, v17.8h, #6
+ rshrn v17.8b, v18.8h, #6
+ rshrn2 v17.16b, v19.8h, #6
+ st1 {v16.16b}, [x0], x1
+ subs w4, w4, #2
+ st1 {v17.16b}, [x0], x1
+ b.le 9f
+
+ mov v4.16b, v6.16b
+ uqadd v27.16b, v27.16b, v21.16b // base += 2
+ uqadd v28.16b, v28.16b, v21.16b // base += 2
+ b 1b
+
+9:
+ ret
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ dup v28.8h, w5 // dy
+ mov w12, w3
+
+ add x13, x0, x1
+
+ shl v29.8h, v28.8h, #3 // 8*dy
+ mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy
+ movi v23.16b, #0x3e
+
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw
+ add v30.8h, v28.8h, v30.8h // ypos
+
+ // This is only executed if we've checked that max_base_y <= 64.
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+1:
+ mov v26.16b, v30.16b // reset ypos
+
+2:
+ add v27.8h, v26.8h, v29.8h // ypos + 8*dy
+ uqshrn v16.8b, v26.8h, #6 // base
+ uqshrn2 v16.16b, v27.8h, #6
+ xtn v24.8b, v26.8h // (uint8_t)ypos
+ xtn2 v24.16b, v27.8h
+ umov w14, v16.b[0]
+ and v24.16b, v24.16b, v23.16b // frac
+
+ uqadd v17.16b, v16.16b, v20.16b // base + 1
+ cmp w14, w6 // base >= max_base_y
+ uqadd v18.16b, v16.16b, v21.16b // base + 2
+ sub v25.16b, v22.16b, v24.16b // 64 - frac
+
+ b.ge 4f
+
+ mov v4.16b, v31.16b
+ mov v5.16b, v31.16b
+ mov v6.16b, v31.16b
+ tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v16.16b // left[base]
+ tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v17.16b // left[base+1]
+ tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v18.16b // left[base+2]
+
+ subs w3, w3, #16
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ umull2 v17.8h, v4.16b, v25.16b
+ umlal2 v17.8h, v5.16b, v24.16b
+ umull v18.8h, v5.8b, v25.8b
+ umlal v18.8h, v6.8b, v24.8b
+ umull2 v19.8h, v5.16b, v25.16b
+ umlal2 v19.8h, v6.16b, v24.16b
+ rshrn v16.8b, v16.8h, #6
+ rshrn2 v16.16b, v17.8h, #6
+ rshrn v17.8b, v18.8h, #6
+ rshrn2 v17.16b, v19.8h, #6
+ st1 {v16.16b}, [x0], #16
+ st1 {v17.16b}, [x13], #16
+ b.le 3f
+ add v26.8h, v27.8h, v29.8h // ypos += 16*dy
+ b 2b
+
+3:
+ subs w4, w4, #2
+ b.le 9f
+ movi v16.8h, #128
+ add x0, x0, x1
+ add x13, x13, x1
+ add v30.8h, v30.8h, v16.8h // ypos = dy + y*(1<<6)*2
+ mov w3, w12
+ b 1b
+
+4:
+ subs w3, w3, #16
+ st1 {v31.16b}, [x0], #16
+ st1 {v31.16b}, [x13], #16
+ b.gt 4b
+ b 3b
+
+9:
+ ret
+
+L(ipred_z3_fill1_large_h16):
+ // Fallback case for max_base_y > 64; similar to the z1
+ // implementation. This does the filtering vertically, filling out
+ // a 2x pixel column at a time.
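+ // (The tbl/tbx based paths above can only index 64 bytes of left[],
+ // which is presumably why they aren't used once max_base_y > 64.)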
+ mov w15, #64
+ add x13, x0, x1
+ lsl x1, x1, #1
+
+ mov w12, w4
+1:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // ypos += dy
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon
+ add x8, x2, w8, uxtw
+ add x10, x2, w10, uxtw
+ dup v4.16b, w9 // frac
+ dup v5.16b, w11
+ ld1 {v0.16b, v1.16b}, [x8], #32 // left[base]
+ ld1 {v2.16b, v3.16b}, [x10], #32
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.16b, w9 // 64 - frac
+ dup v7.16b, w11
+ add w7, w7, w5 // ypos += dy
+2:
+ ext v16.16b, v0.16b, v1.16b, #1 // left[base+1]
+ ext v17.16b, v2.16b, v3.16b, #1
+ subs w4, w4, #16
+ umull v18.8h, v16.8b, v4.8b // left[base+1]*frac
+ umlal v18.8h, v0.8b, v6.8b // + left[base]*(64-frac)
+ umull2 v19.8h, v16.16b, v4.16b
+ umlal2 v19.8h, v0.16b, v6.16b
+ umull v20.8h, v17.8b, v5.8b
+ umlal v20.8h, v2.8b, v7.8b
+ umull2 v21.8h, v17.16b, v5.16b
+ umlal2 v21.8h, v2.16b, v7.16b
+ rshrn v16.8b, v18.8h, #6
+ rshrn2 v16.16b, v19.8h, #6
+ rshrn v17.8b, v20.8h, #6
+ rshrn2 v17.16b, v21.8h, #6
+ zip1 v18.16b, v16.16b, v17.16b
+ zip2 v19.16b, v16.16b, v17.16b
+ st1 {v18.h}[0], [x0], x1
+ st1 {v18.h}[1], [x13], x1
+ st1 {v18.h}[2], [x0], x1
+ st1 {v18.h}[3], [x13], x1
+ st1 {v18.h}[4], [x0], x1
+ st1 {v18.h}[5], [x13], x1
+ st1 {v18.h}[6], [x0], x1
+ st1 {v18.h}[7], [x13], x1
+ st1 {v19.h}[0], [x0], x1
+ st1 {v19.h}[1], [x13], x1
+ st1 {v19.h}[2], [x0], x1
+ st1 {v19.h}[3], [x13], x1
+ st1 {v19.h}[4], [x0], x1
+ st1 {v19.h}[5], [x13], x1
+ st1 {v19.h}[6], [x0], x1
+ st1 {v19.h}[7], [x13], x1
+ b.le 3f
+ mov v0.16b, v1.16b
+ ld1 {v1.16b}, [x8], #16 // left[base]
+ mov v2.16b, v3.16b
+ ld1 {v3.16b}, [x10], #16
+ b 2b
+
+3:
+ subs w3, w3, #2
+ b.le 9f
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ lsl x1, x1, #1
+ add x0, x0, #2
+ add x13, x13, #2
+ mov w4, w12
+ b 1b
+9:
+ ret
+
+L(ipred_z3_fill1_tbl):
+ .hword L(ipred_z3_fill1_tbl) - 640b
+ .hword L(ipred_z3_fill1_tbl) - 320b
+ .hword L(ipred_z3_fill1_tbl) - 160b
+ .hword L(ipred_z3_fill1_tbl) - 80b
+ .hword L(ipred_z3_fill1_tbl) - 40b
+endfunc
+
+function ipred_z3_fill_padding_neon, export=0
+ cmp w3, #16
+ adr x8, L(ipred_z3_fill_padding_tbl)
+ b.gt L(ipred_z3_fill_padding_wide)
+ // w3 = remaining width, w4 = constant height
+ mov w12, w4
+
+1:
+ // Fill a WxH rectangle with padding. W can be any number;
+ // this fills the exact width by filling in the largest
+ // power of two in the remaining width, and repeating.
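+ // e.g. a remaining width of 12 is filled as an 8-wide column over
+ // the full height, then a 4-wide one, stepping the pointers right
+ // between columns.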
+ clz w9, w3
+ sub w9, w9, #25
+ ldrh w9, [x8, w9, uxtw #1]
+ sub x9, x8, w9, uxtw
+ br x9
+
+2:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.h}[0], [x0], x1
+ subs w4, w4, #4
+ st1 {v31.h}[0], [x13], x1
+ st1 {v31.h}[0], [x0], x1
+ st1 {v31.h}[0], [x13], x1
+ b.gt 2b
+ subs w3, w3, #2
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #2
+ add x13, x13, #2
+ mov w4, w12
+ b 1b
+
+4:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.s}[0], [x0], x1
+ subs w4, w4, #4
+ st1 {v31.s}[0], [x13], x1
+ st1 {v31.s}[0], [x0], x1
+ st1 {v31.s}[0], [x13], x1
+ b.gt 4b
+ subs w3, w3, #4
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #4
+ add x13, x13, #4
+ mov w4, w12
+ b 1b
+
+8:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.8b}, [x0], x1
+ subs w4, w4, #4
+ st1 {v31.8b}, [x13], x1
+ st1 {v31.8b}, [x0], x1
+ st1 {v31.8b}, [x13], x1
+ b.gt 8b
+ subs w3, w3, #8
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #8
+ add x13, x13, #8
+ mov w4, w12
+ b 1b
+
+16:
+32:
+64:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.16b}, [x0], x1
+ subs w4, w4, #4
+ st1 {v31.16b}, [x13], x1
+ st1 {v31.16b}, [x0], x1
+ st1 {v31.16b}, [x13], x1
+ b.gt 16b
+ subs w3, w3, #16
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #16
+ add x13, x13, #16
+ mov w4, w12
+ b 1b
+
+9:
+ ret
+
+L(ipred_z3_fill_padding_tbl):
+ .hword L(ipred_z3_fill_padding_tbl) - 64b
+ .hword L(ipred_z3_fill_padding_tbl) - 32b
+ .hword L(ipred_z3_fill_padding_tbl) - 16b
+ .hword L(ipred_z3_fill_padding_tbl) - 8b
+ .hword L(ipred_z3_fill_padding_tbl) - 4b
+ .hword L(ipred_z3_fill_padding_tbl) - 2b
+
+L(ipred_z3_fill_padding_wide):
+ // Fill a WxH rectangle with padding, with W > 16.
+ lsr x1, x1, #1
+ mov w12, w3
+ sub x1, x1, w3, uxtw
+1:
+ ands w5, w3, #15
+ b.eq 2f
+ // If the width isn't aligned to 16, first do one 16 byte write
+ // and align the start pointer.
+ sub w3, w3, w5
+ st1 {v31.16b}, [x0]
+ add x0, x0, w5, uxtw
+2:
+ // Fill the rest of the line with aligned 16 byte writes.
+ subs w3, w3, #16
+ st1 {v31.16b}, [x0], #16
+ b.gt 2b
+ subs w4, w4, #1
+ add x0, x0, x1
+ b.le 9f
+ mov w3, w12
+ b 1b
+9:
+ ret
+endfunc
+
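+// The fill2 variant appears to handle the 2x upsampled left edge: each
+// output row interpolates between an even/odd pair of left[] samples
+// (the base+0/+2 and base+1/+3 index pairs below), so base advances by
+// 2 per row, i.e. by 4 per two output rows.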
+function ipred_z3_fill2_8bpc_neon, export=1
+ cmp w3, #8
+ add x10, x2, w6, uxtw // left[max_base_y]
+ movrel x11, increments
+ ld1r {v31.16b}, [x10] // padding
+ ld1 {v30.8h}, [x11] // increments
+ b.eq 80f
+
+40: // w == 4
+ dup v29.4h, w5 // dy
+
+ mul v30.4h, v30.4h, v29.4h // {0,1,2,3}*dy
+ movi v23.16b, #0x3e
+
+ // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
+ // so max_base_y <= 32.
+ ld1 {v0.16b, v1.16b}, [x2] // left[]
+ add v30.4h, v29.4h, v30.4h // ypos
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+ xtn v24.8b, v30.8h // (uint8_t)ypos
+ uqshrn v26.8b, v30.8h, #6 // base
+ and v24.8b, v24.8b, v23.8b // frac
+
+ uqadd v27.8b, v26.8b, v20.8b // base + 1
+ uqadd v28.8b, v26.8b, v21.8b // base + 2
+ sub v25.8b, v22.8b, v24.8b // 64 - frac
+ uqadd v29.8b, v27.8b, v21.8b // base + 3
+
+ trn1 v24.2s, v24.2s, v24.2s // frac
+ trn1 v26.2s, v26.2s, v28.2s // base + 0, base + 2
+ trn1 v27.2s, v27.2s, v29.2s // base + 1, base + 3
+ trn1 v25.2s, v25.2s, v25.2s // 64 - frac
+
+ movi v21.16b, #4
+1:
+ mov v4.8b, v31.8b
+ mov v5.8b, v31.8b
+ tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2]
+ tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3]
+
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ rshrn v16.8b, v16.8h, #6
+ st1 {v16.s}[0], [x0], x1
+ subs w4, w4, #2
+ st1 {v16.s}[1], [x0], x1
+ b.le 9f
+
+ uqadd v26.8b, v26.8b, v21.8b // base += 4
+ uqadd v27.8b, v27.8b, v21.8b // base += 4
+ b 1b
+
+9:
+ ret
+
+80: // w == 8
+ dup v29.8h, w5 // dy
+
+ mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy
+ movi v23.16b, #0x3e
+
+ // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
+ // so max_base_y <= 32.
+ ld1 {v0.16b, v1.16b}, [x2] // left[]
+ add v30.8h, v29.8h, v30.8h // ypos
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+ xtn v24.8b, v30.8h // (uint8_t)ypos
+ uqshrn v26.8b, v30.8h, #6 // base
+ and v24.8b, v24.8b, v23.8b // frac
+
+ uqadd v27.8b, v26.8b, v20.8b // base + 1
+ uqadd v28.8b, v26.8b, v21.8b // base + 2
+ sub v25.8b, v22.8b, v24.8b // 64 - frac
+ uqadd v29.8b, v27.8b, v21.8b // base + 3
+
+ trn1 v24.2d, v24.2d, v24.2d // frac
+ trn1 v26.2d, v26.2d, v28.2d // base + 0, base + 2
+ trn1 v27.2d, v27.2d, v29.2d // base + 1, base + 3
+ trn1 v25.2d, v25.2d, v25.2d // 64 - frac
+
+ movi v21.16b, #4
+1:
+ mov v4.16b, v31.16b
+ mov v5.16b, v31.16b
+ tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base], left[base+2]
+ tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1], left[base+3]
+
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ umull2 v17.8h, v4.16b, v25.16b
+ umlal2 v17.8h, v5.16b, v24.16b
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.8b}, [x0], x1
+ subs w4, w4, #2
+ st1 {v17.8b}, [x0], x1
+ b.le 9f
+
+ uqadd v26.16b, v26.16b, v21.16b // base += 4
+ uqadd v27.16b, v27.16b, v21.16b // base += 4
+ b 1b
+
+9:
+ ret
+endfunc
+
+
+// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int filt_idx,
+// const int max_width, const int max_height);
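+// Each of the 7 coefficient vectors loaded below holds one neighbour's
+// tap for all 8 outputs of a 4x2 block. Rough scalar sketch for one
+// block, with p0 = topleft, p1..p4 = top[0..3], p5 = left[0],
+// p6 = left[1] and fltN[] the widened coefficient vectors:
+//   out[i] = iclip_pixel((p0*flt0[i] + p1*flt1[i] + ... + p6*flt6[i] + 8) >> 4);
+// each finished block then supplies the top/left neighbours of the
+// blocks to its right and below.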
+function ipred_filter_8bpc_neon, export=1
+ and w5, w5, #511
+ movrel x6, X(filter_intra_taps)
+ lsl w5, w5, #6
+ add x6, x6, w5, uxtw
+ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
+ clz w9, w3
+ adr x5, L(ipred_filter_tbl)
+ ld1 {v20.8b, v21.8b, v22.8b}, [x6]
+ sub w9, w9, #26
+ ldrh w9, [x5, w9, uxtw #1]
+ sxtl v16.8h, v16.8b
+ sxtl v17.8h, v17.8b
+ sub x5, x5, w9, uxtw
+ sxtl v18.8h, v18.8b
+ sxtl v19.8h, v19.8b
+ add x6, x0, x1
+ lsl x1, x1, #1
+ sxtl v20.8h, v20.8b
+ sxtl v21.8h, v21.8b
+ sxtl v22.8h, v22.8b
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ldur s0, [x2, #1] // top (0-3)
+ sub x2, x2, #2
+ mov x7, #-2
+ uxtl v0.8h, v0.8b // top (0-3)
+4:
+ ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2)
+ mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ uxtl v1.8h, v1.8b // left (0-1) + topleft (2)
+ mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ sqrshrun v2.8b, v2.8h, #4
+ subs w4, w4, #2
+ st1 {v2.s}[0], [x0], x1
+ uxtl v0.8h, v2.8b
+ st1 {v2.s}[1], [x6], x1
+ ext v0.16b, v0.16b, v0.16b, #8 // move top from [4-7] to [0-3]
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ldur d0, [x2, #1] // top (0-7)
+ sub x2, x2, #2
+ mov x7, #-2
+ uxtl v0.8h, v0.8b // top (0-7)
+8:
+ ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2)
+ mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ uxtl v1.8h, v1.8b // left (0-1) + topleft (2)
+ mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
+ mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
+ mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v2.8b, v2.8h, #4
+ uxtl v1.8h, v2.8b // first block, in 16 bit
+ mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
+ mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0)
+ mla v3.8h, v21.8h, v1.h[3] // p5(left[0]) * filter(5)
+ mla v3.8h, v22.8h, v1.h[7] // p6(left[1]) * filter(6)
+ sqrshrun v3.8b, v3.8h, #4
+ subs w4, w4, #2
+ st2 {v2.s, v3.s}[0], [x0], x1
+ zip2 v0.2s, v2.2s, v3.2s
+ st2 {v2.s, v3.s}[1], [x6], x1
+ uxtl v0.8h, v0.8b
+ b.gt 8b
+ ret
+160:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x2, #1
+ sub x2, x2, #2
+ mov x7, #-2
+ sub x1, x1, w3, uxtw
+ mov w9, w3
+
+1:
+ ld1 {v0.s}[0], [x2], x7 // left (0-1) + topleft (2)
+ uxtl v0.8h, v0.8b // left (0-1) + topleft (2)
+2:
+ ld1 {v2.16b}, [x8], #16 // top(0-15)
+ mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0)
+ mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
+ uxtl v1.8h, v2.8b // top(0-7)
+ uxtl2 v2.8h, v2.16b // top(8-15)
+ mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
+ mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
+ mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
+ mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
+ mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4)
+
+ mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
+ mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
+ mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v3.8b, v3.8h, #4
+ uxtl v0.8h, v3.8b // first block, in 16 bit
+ mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
+ mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0)
+ mla v4.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
+ mla v4.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)
+
+ mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
+ mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
+ mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
+ sqrshrun v4.8b, v4.8h, #4
+ uxtl v0.8h, v4.8b // second block, in 16 bit
+ mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
+ mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0)
+ mla v5.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
+ mla v5.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)
+
+ mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
+ mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
+ mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v5.8b, v5.8h, #4
+ uxtl v0.8h, v5.8b // third block, in 16 bit
+ mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
+ mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0)
+ mla v6.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
+ mla v6.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)
+
+ subs w3, w3, #16
+ sqrshrun v6.8b, v6.8h, #4
+
+ st4 {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16
+ st4 {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16
+ b.le 8f
+ ins v0.h[2], v2.h[7]
+ ins v0.b[0], v6.b[7]
+ ins v0.b[2], v6.b[3]
+ b 2b
+8:
+ subs w4, w4, #2
+ b.le 9f
+ sub x8, x6, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_filter_tbl):
+ .hword L(ipred_filter_tbl) - 320b
+ .hword L(ipred_filter_tbl) - 160b
+ .hword L(ipred_filter_tbl) - 80b
+ .hword L(ipred_filter_tbl) - 40b
+endfunc
+
+// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const pal, const uint8_t *idx,
+// const int w, const int h);
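+// idx packs two palette indices per byte (low nibble first); the
+// and/ushr/zip pairs below unpack them and a single tbl per vector then
+// maps each index to one of the (at most 8) palette entries.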
+function pal_pred_8bpc_neon, export=1
+ ld1 {v0.8b}, [x2]
+ clz w9, w4
+ adr x6, L(pal_pred_tbl)
+ sub w9, w9, #25
+ movi v31.16b, #7
+ ldrh w9, [x6, w9, uxtw #1]
+ sub x6, x6, w9, uxtw
+ add x2, x0, x1
+ lsl x1, x1, #1
+ br x6
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.8b}, [x3], #8
+ subs w5, w5, #4
+ ushr v3.8b, v1.8b, #4
+ and v2.8b, v1.8b, v31.8b
+ zip1 v1.16b, v2.16b, v3.16b
+ tbl v1.16b, {v0.16b}, v1.16b
+ st1 {v1.s}[0], [x0], x1
+ st1 {v1.s}[1], [x2], x1
+ st1 {v1.s}[2], [x0], x1
+ st1 {v1.s}[3], [x2], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b}, [x3], #16
+ subs w5, w5, #4
+ ushr v4.16b, v1.16b, #4
+ and v3.16b, v1.16b, v31.16b
+ zip1 v1.16b, v3.16b, v4.16b
+ zip2 v2.16b, v3.16b, v4.16b
+ tbl v1.16b, {v0.16b}, v1.16b
+ st1 {v1.d}[0], [x0], x1
+ tbl v2.16b, {v0.16b}, v2.16b
+ st1 {v1.d}[1], [x2], x1
+ st1 {v2.d}[0], [x0], x1
+ st1 {v2.d}[1], [x2], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b, v2.16b}, [x3], #32
+ subs w5, w5, #4
+ ushr v5.16b, v1.16b, #4
+ and v4.16b, v1.16b, v31.16b
+ ushr v7.16b, v2.16b, #4
+ and v6.16b, v2.16b, v31.16b
+ zip1 v1.16b, v4.16b, v5.16b
+ zip2 v2.16b, v4.16b, v5.16b
+ zip1 v3.16b, v6.16b, v7.16b
+ tbl v1.16b, {v0.16b}, v1.16b
+ zip2 v4.16b, v6.16b, v7.16b
+ tbl v2.16b, {v0.16b}, v2.16b
+ st1 {v1.16b}, [x0], x1
+ tbl v3.16b, {v0.16b}, v3.16b
+ st1 {v2.16b}, [x2], x1
+ tbl v4.16b, {v0.16b}, v4.16b
+ st1 {v3.16b}, [x0], x1
+ st1 {v4.16b}, [x2], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
+ subs w5, w5, #4
+ ushr v21.16b, v16.16b, #4
+ and v20.16b, v16.16b, v31.16b
+ ushr v23.16b, v17.16b, #4
+ and v22.16b, v17.16b, v31.16b
+ ushr v25.16b, v18.16b, #4
+ and v24.16b, v18.16b, v31.16b
+ ushr v27.16b, v19.16b, #4
+ and v26.16b, v19.16b, v31.16b
+ zip1 v16.16b, v20.16b, v21.16b
+ zip2 v17.16b, v20.16b, v21.16b
+ zip1 v18.16b, v22.16b, v23.16b
+ zip2 v19.16b, v22.16b, v23.16b
+ zip1 v20.16b, v24.16b, v25.16b
+ zip2 v21.16b, v24.16b, v25.16b
+ tbl v16.16b, {v0.16b}, v16.16b
+ zip1 v22.16b, v26.16b, v27.16b
+ tbl v17.16b, {v0.16b}, v17.16b
+ zip2 v23.16b, v26.16b, v27.16b
+ tbl v18.16b, {v0.16b}, v18.16b
+ tbl v19.16b, {v0.16b}, v19.16b
+ tbl v20.16b, {v0.16b}, v20.16b
+ st1 {v16.16b, v17.16b}, [x0], x1
+ tbl v21.16b, {v0.16b}, v21.16b
+ st1 {v18.16b, v19.16b}, [x2], x1
+ tbl v22.16b, {v0.16b}, v22.16b
+ st1 {v20.16b, v21.16b}, [x0], x1
+ tbl v23.16b, {v0.16b}, v23.16b
+ st1 {v22.16b, v23.16b}, [x2], x1
+ b.gt 32b
+ ret
+64:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
+ subs w5, w5, #2
+ ushr v21.16b, v16.16b, #4
+ and v20.16b, v16.16b, v31.16b
+ ushr v23.16b, v17.16b, #4
+ and v22.16b, v17.16b, v31.16b
+ ushr v25.16b, v18.16b, #4
+ and v24.16b, v18.16b, v31.16b
+ ushr v27.16b, v19.16b, #4
+ and v26.16b, v19.16b, v31.16b
+ zip1 v16.16b, v20.16b, v21.16b
+ zip2 v17.16b, v20.16b, v21.16b
+ zip1 v18.16b, v22.16b, v23.16b
+ zip2 v19.16b, v22.16b, v23.16b
+ zip1 v20.16b, v24.16b, v25.16b
+ zip2 v21.16b, v24.16b, v25.16b
+ tbl v16.16b, {v0.16b}, v16.16b
+ zip1 v22.16b, v26.16b, v27.16b
+ tbl v17.16b, {v0.16b}, v17.16b
+ zip2 v23.16b, v26.16b, v27.16b
+ tbl v18.16b, {v0.16b}, v18.16b
+ tbl v19.16b, {v0.16b}, v19.16b
+ st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
+ tbl v20.16b, {v0.16b}, v20.16b
+ tbl v21.16b, {v0.16b}, v21.16b
+ tbl v22.16b, {v0.16b}, v22.16b
+ tbl v23.16b, {v0.16b}, v23.16b
+ st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1
+ b.gt 64b
+ ret
+
+L(pal_pred_tbl):
+ .hword L(pal_pred_tbl) - 64b
+ .hword L(pal_pred_tbl) - 32b
+ .hword L(pal_pred_tbl) - 16b
+ .hword L(pal_pred_tbl) - 8b
+ .hword L(pal_pred_tbl) - 4b
+endfunc
+
+// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
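+//
+// CfL prediction, in effect:
+//   dst[x] = iclip_pixel(dc + apply_sign((abs(alpha*ac[x]) + 32) >> 6, alpha*ac[x]))
+// The shared splat code below folds apply_sign() into adding the sign bit of
+// the product before a signed rounding shift; this _128 variant uses dc = 128.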
+function ipred_cfl_128_8bpc_neon, export=1
+ clz w9, w3
+ adr x7, L(ipred_cfl_128_tbl)
+ sub w9, w9, #26
+ ldrh w9, [x7, w9, uxtw #1]
+ movi v0.8h, #128 // dc
+ dup v1.8h, w6 // alpha
+ sub x7, x7, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x7
+L(ipred_cfl_splat_w4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h}, [x5], #32
+ mul v2.8h, v2.8h, v1.8h // diff = ac * alpha
+ mul v3.8h, v3.8h, v1.8h
+ cmlt v4.8h, v2.8h, #0 // sign
+ cmlt v5.8h, v3.8h, #0
+ add v2.8h, v2.8h, v4.8h // diff + sign
+ add v3.8h, v3.8h, v5.8h
+ srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ srshr v3.8h, v3.8h, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
+ sqxtun v3.8b, v3.8h
+ st1 {v2.s}[0], [x0], x1
+ st1 {v2.s}[1], [x6], x1
+ subs w4, w4, #4
+ st1 {v3.s}[0], [x0], x1
+ st1 {v3.s}[1], [x6], x1
+ b.gt L(ipred_cfl_splat_w4)
+ ret
+L(ipred_cfl_splat_w8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64
+ mul v2.8h, v2.8h, v1.8h // diff = ac * alpha
+ mul v3.8h, v3.8h, v1.8h
+ mul v4.8h, v4.8h, v1.8h
+ mul v5.8h, v5.8h, v1.8h
+ cmlt v16.8h, v2.8h, #0 // sign
+ cmlt v17.8h, v3.8h, #0
+ cmlt v18.8h, v4.8h, #0
+ cmlt v19.8h, v5.8h, #0
+ add v2.8h, v2.8h, v16.8h // diff + sign
+ add v3.8h, v3.8h, v17.8h
+ add v4.8h, v4.8h, v18.8h
+ add v5.8h, v5.8h, v19.8h
+ srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ srshr v3.8h, v3.8h, #6
+ srshr v4.8h, v4.8h, #6
+ srshr v5.8h, v5.8h, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ add v4.8h, v4.8h, v0.8h
+ add v5.8h, v5.8h, v0.8h
+ sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
+ sqxtun v3.8b, v3.8h
+ sqxtun v4.8b, v4.8h
+ sqxtun v5.8b, v5.8h
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v4.8b}, [x0], x1
+ st1 {v5.8b}, [x6], x1
+ b.gt L(ipred_cfl_splat_w8)
+ ret
+L(ipred_cfl_splat_w16):
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x5, w3, uxtw #1
+ sub x1, x1, w3, uxtw
+ mov w9, w3
+1:
+ ld1 {v2.8h, v3.8h}, [x5], #32
+ ld1 {v4.8h, v5.8h}, [x7], #32
+ mul v2.8h, v2.8h, v1.8h // diff = ac * alpha
+ mul v3.8h, v3.8h, v1.8h
+ mul v4.8h, v4.8h, v1.8h
+ mul v5.8h, v5.8h, v1.8h
+ cmlt v16.8h, v2.8h, #0 // sign
+ cmlt v17.8h, v3.8h, #0
+ cmlt v18.8h, v4.8h, #0
+ cmlt v19.8h, v5.8h, #0
+ add v2.8h, v2.8h, v16.8h // diff + sign
+ add v3.8h, v3.8h, v17.8h
+ add v4.8h, v4.8h, v18.8h
+ add v5.8h, v5.8h, v19.8h
+ srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ srshr v3.8h, v3.8h, #6
+ srshr v4.8h, v4.8h, #6
+ srshr v5.8h, v5.8h, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ add v4.8h, v4.8h, v0.8h
+ add v5.8h, v5.8h, v0.8h
+ sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
+ sqxtun v3.8b, v3.8h
+ sqxtun v4.8b, v4.8h
+ sqxtun v5.8b, v5.8h
+ subs w3, w3, #16
+ st1 {v2.8b, v3.8b}, [x0], #16
+ st1 {v4.8b, v5.8b}, [x6], #16
+ b.gt 1b
+ subs w4, w4, #2
+ add x5, x5, w9, uxtw #1
+ add x7, x7, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b.gt 1b
+ ret
+
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
+endfunc
+
+// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_top_8bpc_neon, export=1
+ clz w9, w3
+ adr x7, L(ipred_cfl_top_tbl)
+ sub w9, w9, #26
+ ldrh w9, [x7, w9, uxtw #1]
+ dup v1.8h, w6 // alpha
+ add x2, x2, #1
+ sub x7, x7, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x7
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2s}, [x2]
+ uaddlv h0, v0.8b
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w4)
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2]
+ uaddlv h0, v0.8b
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w8)
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b
+ urshr v0.4h, v0.4h, #4
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b, v3.16b}, [x2]
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v2.4h, v2.4h, v3.4h
+ urshr v2.4h, v2.4h, #5
+ dup v0.8h, v2.h[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_top_tbl):
+ .hword L(ipred_cfl_top_tbl) - 32b
+ .hword L(ipred_cfl_top_tbl) - 16b
+ .hword L(ipred_cfl_top_tbl) - 8b
+ .hword L(ipred_cfl_top_tbl) - 4b
+endfunc
+
+// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_left_8bpc_neon, export=1
+ sub x2, x2, w4, uxtw
+ clz w9, w3
+ clz w8, w4
+ adr x10, L(ipred_cfl_splat_tbl)
+ adr x7, L(ipred_cfl_left_tbl)
+ sub w9, w9, #26
+ sub w8, w8, #26
+ ldrh w9, [x10, w9, uxtw #1]
+ ldrh w8, [x7, w8, uxtw #1]
+ dup v1.8h, w6 // alpha
+ sub x9, x10, w9, uxtw
+ sub x7, x7, w8, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x7
+
+L(ipred_cfl_left_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2s}, [x2]
+ uaddlv h0, v0.8b
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2]
+ uaddlv h0, v0.8b
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b
+ urshr v0.4h, v0.4h, #4
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b, v3.16b}, [x2]
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v2.4h, v2.4h, v3.4h
+ urshr v2.4h, v2.4h, #5
+ dup v0.8h, v2.h[0]
+ br x9
+
+L(ipred_cfl_left_tbl):
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
+endfunc
+
+// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
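+//
+// dc = (sum(top) + sum(left) + ((width+height) >> 1)) / (width + height).
+// The height-indexed h* handlers below sum the left edge and then tail-jump
+// via x9 to the width-indexed w* handlers, which add the top edge and divide.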
+function ipred_cfl_8bpc_neon, export=1
+ sub x2, x2, w4, uxtw
+ add w8, w3, w4 // width + height
+ dup v1.8h, w6 // alpha
+ clz w9, w3
+ clz w6, w4
+ dup v16.8h, w8 // width + height
+ adr x7, L(ipred_cfl_tbl)
+ rbit w8, w8 // rbit(width + height)
+ sub w9, w9, #22 // 26 leading bits, minus table offset 4
+ sub w6, w6, #26
+ clz w8, w8 // ctz(width + height)
+ ldrh w9, [x7, w9, uxtw #1]
+ ldrh w6, [x7, w6, uxtw #1]
+ neg w8, w8 // -ctz(width + height)
+ sub x9, x7, w9, uxtw
+ sub x7, x7, w6, uxtw
+ ushr v16.8h, v16.8h, #1 // (width + height) >> 1
+ dup v17.8h, w8 // -ctz(width + height)
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x7
+
+L(ipred_cfl_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x2], #4
+ ins v0.s[1], wzr
+ add x2, x2, #1
+ uaddlv h0, v0.8b
+ br x9
+L(ipred_cfl_w4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.s}[0], [x2]
+ ins v2.s[1], wzr
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h2, v2.8b
+ cmp w4, #4
+ add v0.4h, v0.4h, v2.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 8/16
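+ // After the ushl by -ctz(width+height) above, dc still needs dividing by
+ // 3 (h=8, w+h=12) or 5 (h=16, w+h=20). That is done as a fixed-point
+ // multiply: sqdmulh by 0x5556/2 ~= (1<<15)/3 or 0x3334/2 ~= (1<<15)/5.
+ // The lsr by 2*h picks the constant: a shift by 16 selects the high half,
+ // and a shift by 32 wraps to 0 on a w register, leaving the low half for dup.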
+ mov w16, #(0x3334/2)
+ movk w16, #(0x5556/2), lsl #16
+ add w17, w4, w4 // w17 = 2*h = 16 or 32
+ lsr w16, w16, w17
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w4)
+
+L(ipred_cfl_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2], #8
+ uaddlv h0, v0.8b
+ add x2, x2, #1
+ br x9
+L(ipred_cfl_w8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h2, v2.8b
+ cmp w4, #8
+ add v0.4h, v0.4h, v2.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 4/16/32
+ cmp w4, #32
+ mov w16, #(0x3334/2)
+ mov w17, #(0x5556/2)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2], #16
+ uaddlv h0, v0.16b
+ add x2, x2, #1
+ br x9
+L(ipred_cfl_w16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h2, v2.16b
+ cmp w4, #16
+ add v0.4h, v0.4h, v2.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 4/8/32
+ cmp w4, #4
+ mov w16, #(0x3334/2)
+ mov w17, #(0x5556/2)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b, v3.16b}, [x2], #32
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add x2, x2, #1
+ add v0.4h, v2.4h, v3.4h
+ br x9
+L(ipred_cfl_w32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b, v3.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ cmp w4, #32
+ add v0.4h, v0.4h, v2.4h
+ add v0.4h, v0.4h, v3.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 8/16
+ mov w16, #(0x5556/2)
+ movk w16, #(0x3334/2), lsl #16
+ add w17, w4, w4 // w17 = 2*h = 16 or 32
+ lsr w16, w16, w17
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_tbl):
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
+endfunc
+
+// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
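+//
+// The luma block is subsampled 2x2 (uaddlp of horizontal pairs plus the row
+// below) and scaled so that every stored ac value is 8x a luma sample: sums
+// of 4 are shifted left by 1 here, sums of 2 by 2 in the 4:2:2 version and
+// single samples by 3 in the 4:4:4 version, which lets them share this
+// function's padding and dc-subtraction code. w_pad/h_pad replicate the last
+// column/row, and at the end the rounded block average is subtracted from
+// every entry.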
+function ipred_cfl_ac_420_8bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_420_tbl)
+ sub w8, w8, #27
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_420_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v1.8b}, [x10], x2
+ ld1 {v0.d}[1], [x1], x2
+ ld1 {v1.d}[1], [x10], x2
+ uaddlp v0.8h, v0.16b
+ uaddlp v1.8h, v1.16b
+ add v0.8h, v0.8h, v1.8h
+ shl v0.8h, v0.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h}, [x0], #16
+ add v16.8h, v16.8h, v0.8h
+ b.gt 1b
+ trn2 v1.2d, v0.2d, v0.2d
+ trn2 v0.2d, v0.2d, v0.2d
+L(ipred_cfl_ac_420_w4_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ b.gt 2b
+3:
+ // Aggregate the sums
+ add v0.8h, v16.8h, v17.8h
+ uaddlv s0, v0.8h // sum
+ sub x0, x0, w6, uxtw #3
+ urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
+ dup v4.8h, v4.h[0]
+6: // Subtract dc from ac
+ ld1 {v0.8h, v1.8h}, [x0]
+ subs w6, w6, #4
+ sub v0.8h, v0.8h, v4.8h
+ sub v1.8h, v1.8h, v4.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 6b
+ ret
+
+L(ipred_cfl_ac_420_w8):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_420_w8_wpad)
+1: // Copy and subsample input, without padding
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v1.16b}, [x10], x2
+ ld1 {v2.16b}, [x1], x2
+ uaddlp v0.8h, v0.16b
+ ld1 {v3.16b}, [x10], x2
+ uaddlp v1.8h, v1.16b
+ uaddlp v2.8h, v2.16b
+ uaddlp v3.8h, v3.16b
+ add v0.8h, v0.8h, v1.8h
+ add v2.8h, v2.8h, v3.8h
+ shl v0.8h, v0.8h, #1
+ shl v1.8h, v2.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h}, [x0], #32
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ b.gt 1b
+ mov v0.16b, v1.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1: // Copy and subsample input, padding 4
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v1.8b}, [x10], x2
+ ld1 {v0.d}[1], [x1], x2
+ ld1 {v1.d}[1], [x10], x2
+ uaddlp v0.8h, v0.16b
+ uaddlp v1.8h, v1.16b
+ add v0.8h, v0.8h, v1.8h
+ shl v0.8h, v0.8h, #1
+ dup v1.4h, v0.h[3]
+ dup v3.4h, v0.h[7]
+ trn2 v2.2d, v0.2d, v0.2d
+ subs w8, w8, #2
+ st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
+ add v16.4h, v16.4h, v0.4h
+ add v17.4h, v17.4h, v1.4h
+ add v18.4h, v18.4h, v2.4h
+ add v19.4h, v19.4h, v3.4h
+ b.gt 1b
+ trn1 v0.2d, v2.2d, v3.2d
+ trn1 v1.2d, v2.2d, v3.2d
+
+L(ipred_cfl_ac_420_w8_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ add v18.8h, v18.8h, v0.8h
+ add v19.8h, v19.8h, v1.8h
+ b.gt 2b
+3:
+
+L(ipred_cfl_ac_420_w8_calc_subtract_dc):
+ // Aggregate the sums
+ add v0.8h, v16.8h, v17.8h
+ add v2.8h, v18.8h, v19.8h
+ uaddlp v0.4s, v0.8h
+ uaddlp v2.4s, v2.8h
+ add v0.4s, v0.4s, v2.4s
+ addv s0, v0.4s // sum
+ sub x0, x0, w6, uxtw #4
+ urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
+ dup v4.8h, v4.h[0]
+L(ipred_cfl_ac_420_w8_subtract_dc):
+6: // Subtract dc from ac
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
+ subs w6, w6, #4
+ sub v0.8h, v0.8h, v4.8h
+ sub v1.8h, v1.8h, v4.8h
+ sub v2.8h, v2.8h, v4.8h
+ sub v3.8h, v3.8h, v4.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ b.gt 6b
+ ret
+
+L(ipred_cfl_ac_420_w16):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_420_w16_tbl)
+ ldrh w3, [x7, w3, uxtw #1]
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_420_w16_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, without padding
+ ld1 {v0.16b, v1.16b}, [x1], x2
+ ld1 {v2.16b, v3.16b}, [x10], x2
+ uaddlp v0.8h, v0.16b
+ ld1 {v4.16b, v5.16b}, [x1], x2
+ uaddlp v1.8h, v1.16b
+ ld1 {v6.16b, v7.16b}, [x10], x2
+ uaddlp v2.8h, v2.16b
+ uaddlp v3.8h, v3.16b
+ uaddlp v4.8h, v4.16b
+ uaddlp v5.8h, v5.16b
+ uaddlp v6.8h, v6.16b
+ uaddlp v7.8h, v7.16b
+ add v0.8h, v0.8h, v2.8h
+ add v1.8h, v1.8h, v3.8h
+ add v4.8h, v4.8h, v6.8h
+ add v5.8h, v5.8h, v7.8h
+ shl v0.8h, v0.8h, #1
+ shl v1.8h, v1.8h, #1
+ shl v2.8h, v4.8h, #1
+ shl v3.8h, v5.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 4
+ ldr d1, [x1, #16]
+ ld1 {v0.16b}, [x1], x2
+ ldr d3, [x10, #16]
+ ld1 {v2.16b}, [x10], x2
+ uaddlp v1.4h, v1.8b
+ ldr d5, [x1, #16]
+ uaddlp v0.8h, v0.16b
+ ld1 {v4.16b}, [x1], x2
+ uaddlp v3.4h, v3.8b
+ ldr d7, [x10, #16]
+ uaddlp v2.8h, v2.16b
+ ld1 {v6.16b}, [x10], x2
+ uaddlp v5.4h, v5.8b
+ uaddlp v4.8h, v4.16b
+ uaddlp v7.4h, v7.8b
+ uaddlp v6.8h, v6.16b
+ add v1.4h, v1.4h, v3.4h
+ add v0.8h, v0.8h, v2.8h
+ add v5.4h, v5.4h, v7.4h
+ add v4.8h, v4.8h, v6.8h
+ shl v1.4h, v1.4h, #1
+ shl v0.8h, v0.8h, #1
+ shl v3.4h, v5.4h, #1
+ shl v2.8h, v4.8h, #1
+ dup v4.4h, v1.h[3]
+ dup v5.4h, v3.h[3]
+ trn1 v1.2d, v1.2d, v4.2d
+ trn1 v3.2d, v3.2d, v5.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 8
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v2.16b}, [x10], x2
+ ld1 {v4.16b}, [x1], x2
+ uaddlp v0.8h, v0.16b
+ ld1 {v6.16b}, [x10], x2
+ uaddlp v2.8h, v2.16b
+ uaddlp v4.8h, v4.16b
+ uaddlp v6.8h, v6.16b
+ add v0.8h, v0.8h, v2.8h
+ add v4.8h, v4.8h, v6.8h
+ shl v0.8h, v0.8h, #1
+ shl v2.8h, v4.8h, #1
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 12
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v2.8b}, [x10], x2
+ ld1 {v4.8b}, [x1], x2
+ uaddlp v0.4h, v0.8b
+ ld1 {v6.8b}, [x10], x2
+ uaddlp v2.4h, v2.8b
+ uaddlp v4.4h, v4.8b
+ uaddlp v6.4h, v6.8b
+ add v0.4h, v0.4h, v2.4h
+ add v4.4h, v4.4h, v6.4h
+ shl v0.4h, v0.4h, #1
+ shl v2.4h, v4.4h, #1
+ dup v1.8h, v0.h[3]
+ dup v3.8h, v2.h[3]
+ trn1 v0.2d, v0.2d, v1.2d
+ trn1 v2.2d, v2.2d, v3.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+
+L(ipred_cfl_ac_420_w16_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 2b
+3:
+
+ // Double the height and reuse the width=8 summing/subtracting code
+ lsl w6, w6, #1
+ b L(ipred_cfl_ac_420_w8_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_tbl):
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
+ .hword 0
+
+L(ipred_cfl_ac_420_w16_tbl):
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
+endfunc
+
+// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_422_8bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_422_tbl)
+ sub w8, w8, #27
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_422_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v0.d}[1], [x10], x2
+ ld1 {v1.8b}, [x1], x2
+ ld1 {v1.d}[1], [x10], x2
+ uaddlp v0.8h, v0.16b
+ uaddlp v1.8h, v1.16b
+ shl v0.8h, v0.8h, #2
+ shl v1.8h, v1.8h, #2
+ subs w8, w8, #4
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 1b
+ trn2 v0.2d, v1.2d, v1.2d
+ trn2 v1.2d, v1.2d, v1.2d
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_422_w8):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_422_w8_wpad)
+1: // Copy and subsample input, without padding
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v1.16b}, [x10], x2
+ ld1 {v2.16b}, [x1], x2
+ uaddlp v0.8h, v0.16b
+ ld1 {v3.16b}, [x10], x2
+ uaddlp v1.8h, v1.16b
+ uaddlp v2.8h, v2.16b
+ uaddlp v3.8h, v3.16b
+ shl v0.8h, v0.8h, #2
+ shl v1.8h, v1.8h, #2
+ shl v2.8h, v2.8h, #2
+ shl v3.8h, v3.8h, #2
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1: // Copy and subsample input, padding 4
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v0.d}[1], [x10], x2
+ ld1 {v2.8b}, [x1], x2
+ ld1 {v2.d}[1], [x10], x2
+ uaddlp v0.8h, v0.16b
+ uaddlp v2.8h, v2.16b
+ shl v0.8h, v0.8h, #2
+ shl v2.8h, v2.8h, #2
+ dup v4.4h, v0.h[3]
+ dup v5.8h, v0.h[7]
+ dup v6.4h, v2.h[3]
+ dup v7.8h, v2.h[7]
+ trn2 v1.2d, v0.2d, v5.2d
+ trn1 v0.2d, v0.2d, v4.2d
+ trn2 v3.2d, v2.2d, v7.2d
+ trn1 v2.2d, v2.2d, v6.2d
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_422_w16_tbl)
+ ldrh w3, [x7, w3, uxtw #1]
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_422_w16_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, without padding
+ ld1 {v0.16b, v1.16b}, [x1], x2
+ ld1 {v2.16b, v3.16b}, [x10], x2
+ uaddlp v0.8h, v0.16b
+ uaddlp v1.8h, v1.16b
+ uaddlp v2.8h, v2.16b
+ uaddlp v3.8h, v3.16b
+ shl v0.8h, v0.8h, #2
+ shl v1.8h, v1.8h, #2
+ shl v2.8h, v2.8h, #2
+ shl v3.8h, v3.8h, #2
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 4
+ ldr d1, [x1, #16]
+ ld1 {v0.16b}, [x1], x2
+ ldr d3, [x10, #16]
+ ld1 {v2.16b}, [x10], x2
+ uaddlp v1.4h, v1.8b
+ uaddlp v0.8h, v0.16b
+ uaddlp v3.4h, v3.8b
+ uaddlp v2.8h, v2.16b
+ shl v1.4h, v1.4h, #2
+ shl v0.8h, v0.8h, #2
+ shl v3.4h, v3.4h, #2
+ shl v2.8h, v2.8h, #2
+ dup v4.4h, v1.h[3]
+ dup v5.4h, v3.h[3]
+ trn1 v1.2d, v1.2d, v4.2d
+ trn1 v3.2d, v3.2d, v5.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 8
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v2.16b}, [x10], x2
+ uaddlp v0.8h, v0.16b
+ uaddlp v2.8h, v2.16b
+ shl v0.8h, v0.8h, #2
+ shl v2.8h, v2.8h, #2
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 12
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v2.8b}, [x10], x2
+ uaddlp v0.4h, v0.8b
+ uaddlp v2.4h, v2.8b
+ shl v0.4h, v0.4h, #2
+ shl v2.4h, v2.4h, #2
+ dup v1.8h, v0.h[3]
+ dup v3.8h, v2.h[3]
+ trn1 v0.2d, v0.2d, v1.2d
+ trn1 v2.2d, v2.2d, v3.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_tbl):
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
+ .hword 0
+
+L(ipred_cfl_ac_422_w16_tbl):
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+endfunc
+
+// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_444_8bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_444_tbl)
+ sub w8, w8, #26
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_444_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input
+ ld1 {v0.s}[0], [x1], x2
+ ld1 {v0.s}[1], [x10], x2
+ ld1 {v1.s}[0], [x1], x2
+ ld1 {v1.s}[1], [x10], x2
+ ushll v0.8h, v0.8b, #3
+ ushll v1.8h, v1.8b, #3
+ subs w8, w8, #4
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 1b
+ trn2 v0.2d, v1.2d, v1.2d
+ trn2 v1.2d, v1.2d, v1.2d
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_444_w8):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v1.8b}, [x10], x2
+ ld1 {v2.8b}, [x1], x2
+ ushll v0.8h, v0.8b, #3
+ ld1 {v3.8b}, [x10], x2
+ ushll v1.8h, v1.8b, #3
+ ushll v2.8h, v2.8b, #3
+ ushll v3.8h, v3.8b, #3
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_444_w16_wpad)
+1: // Copy and expand input, without padding
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v2.16b}, [x10], x2
+ ld1 {v4.16b}, [x1], x2
+ ushll2 v1.8h, v0.16b, #3
+ ushll v0.8h, v0.8b, #3
+ ld1 {v6.16b}, [x10], x2
+ ushll2 v3.8h, v2.16b, #3
+ ushll v2.8h, v2.8b, #3
+ ushll2 v5.8h, v4.16b, #3
+ ushll v4.8h, v4.8b, #3
+ ushll2 v7.8h, v6.16b, #3
+ ushll v6.8h, v6.8b, #3
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ mov v0.16b, v6.16b
+ mov v1.16b, v7.16b
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1: // Copy and expand input, padding 8
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v2.8b}, [x10], x2
+ ld1 {v4.8b}, [x1], x2
+ ld1 {v6.8b}, [x10], x2
+ ushll v0.8h, v0.8b, #3
+ ushll v2.8h, v2.8b, #3
+ ushll v4.8h, v4.8b, #3
+ ushll v6.8h, v6.8b, #3
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ dup v5.8h, v4.h[7]
+ dup v7.8h, v6.h[7]
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ mov v0.16b, v6.16b
+ mov v1.16b, v7.16b
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_444_w32_tbl)
+ ldrh w3, [x7, w3, uxtw] // w_pad is even here, so it already equals the hword offset ((w3>>1) << 1)
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_444_w32_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, without padding
+ ld1 {v2.16b, v3.16b}, [x1], x2
+ ld1 {v6.16b, v7.16b}, [x10], x2
+ ushll v0.8h, v2.8b, #3
+ ushll2 v1.8h, v2.16b, #3
+ ushll v2.8h, v3.8b, #3
+ ushll2 v3.8h, v3.16b, #3
+ ushll v4.8h, v6.8b, #3
+ ushll2 v5.8h, v6.16b, #3
+ ushll v6.8h, v7.8b, #3
+ ushll2 v7.8h, v7.16b, #3
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 8
+ ldr d2, [x1, #16]
+ ld1 {v1.16b}, [x1], x2
+ ldr d6, [x10, #16]
+ ld1 {v5.16b}, [x10], x2
+ ushll v2.8h, v2.8b, #3
+ ushll v0.8h, v1.8b, #3
+ ushll2 v1.8h, v1.16b, #3
+ ushll v6.8h, v6.8b, #3
+ ushll v4.8h, v5.8b, #3
+ ushll2 v5.8h, v5.16b, #3
+ dup v3.8h, v2.h[7]
+ dup v7.8h, v6.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 16
+ ld1 {v1.16b}, [x1], x2
+ ld1 {v5.16b}, [x10], x2
+ ushll v0.8h, v1.8b, #3
+ ushll2 v1.8h, v1.16b, #3
+ ushll v4.8h, v5.8b, #3
+ ushll2 v5.8h, v5.16b, #3
+ dup v2.8h, v1.h[7]
+ dup v3.8h, v1.h[7]
+ dup v6.8h, v5.h[7]
+ dup v7.8h, v5.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 24
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v4.8b}, [x10], x2
+ ushll v0.8h, v0.8b, #3
+ ushll v4.8h, v4.8b, #3
+ dup v1.8h, v0.h[7]
+ dup v2.8h, v0.h[7]
+ dup v3.8h, v0.h[7]
+ dup v5.8h, v4.h[7]
+ dup v6.8h, v4.h[7]
+ dup v7.8h, v4.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+
+L(ipred_cfl_ac_444_w32_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 2b
+3:
+
+ // Quadruple the height and reuse the width=8 dc subtraction code
+ lsl w6, w6, #2
+ // Aggregate the sums, with wider intermediates earlier than in
+ // ipred_cfl_ac_420_w8_calc_subtract_dc.
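+ // (With 4:4:4 scaling each 16-bit accumulator lane can approach 0xffff,
+ // so adding two accumulators together in 16 bits first could overflow.)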
+ uaddlp v0.4s, v16.8h
+ uaddlp v1.4s, v17.8h
+ uaddlp v2.4s, v18.8h
+ uaddlp v3.4s, v19.8h
+ add v0.4s, v0.4s, v1.4s
+ add v2.4s, v2.4s, v3.4s
+ add v0.4s, v0.4s, v2.4s
+ addv s0, v0.4s // sum
+ sub x0, x0, w6, uxtw #4
+ urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
+ dup v4.8h, v4.h[0]
+ b L(ipred_cfl_ac_420_w8_subtract_dc)
+
+L(ipred_cfl_ac_444_tbl):
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
+
+L(ipred_cfl_ac_444_w32_tbl):
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/ipred16.S b/third_party/dav1d/src/arm/64/ipred16.S
new file mode 100644
index 0000000000..3f8cff9869
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/ipred16.S
@@ -0,0 +1,5674 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height,
+// const int bitdepth_max);
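+//
+// For high bit depth the mid-grey "128" value is (bitdepth_max + 1) >> 1
+// (e.g. 512 for 10-bit), computed below with urshr #1 of the bitdepth_max
+// argument passed on the stack.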
+function ipred_dc_128_16bpc_neon, export=1
+ ldr w8, [sp]
+ clz w3, w3
+ adr x5, L(ipred_dc_128_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ dup v0.8h, w8
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ urshr v0.8h, v0.8h, #1
+ br x5
+4:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+16:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+32:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+ sub x1, x1, #64
+64:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_dc_128_tbl):
+ .hword L(ipred_dc_128_tbl) - 640b
+ .hword L(ipred_dc_128_tbl) - 320b
+ .hword L(ipred_dc_128_tbl) - 160b
+ .hword L(ipred_dc_128_tbl) - 8b
+ .hword L(ipred_dc_128_tbl) - 4b
+endfunc
+
+// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_v_16bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_v_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ add x2, x2, #2
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2]
+4:
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2]
+8:
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h}, [x2]
+16:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+32:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ sub x1, x1, #64
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+64:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_v_tbl):
+ .hword L(ipred_v_tbl) - 640b
+ .hword L(ipred_v_tbl) - 320b
+ .hword L(ipred_v_tbl) - 160b
+ .hword L(ipred_v_tbl) - 80b
+ .hword L(ipred_v_tbl) - 40b
+endfunc
+
+// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_h_16bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_h_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ sub x2, x2, #8
+ sub x5, x5, w3, uxtw
+ mov x7, #-8
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ st1 {v3.4h}, [x0], x1
+ st1 {v2.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ st1 {v3.8h}, [x0], x1
+ st1 {v2.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ st1 {v3.8h}, [x0], x1
+ st1 {v2.8h}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ st1 {v1.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ stp q3, q3, [x0, #32]
+ stp q2, q2, [x6, #32]
+ st1 {v3.8h}, [x0], x1
+ st1 {v2.8h}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ stp q1, q1, [x0, #32]
+ stp q0, q0, [x6, #32]
+ st1 {v1.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 32b
+ ret
+64:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ stp q3, q3, [x0, #32]
+ stp q2, q2, [x6, #32]
+ stp q3, q3, [x0, #64]
+ stp q2, q2, [x6, #64]
+ stp q3, q3, [x0, #96]
+ stp q2, q2, [x6, #96]
+ st1 {v3.8h}, [x0], x1
+ st1 {v2.8h}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ stp q1, q1, [x0, #32]
+ stp q0, q0, [x6, #32]
+ stp q1, q1, [x0, #64]
+ stp q0, q0, [x6, #64]
+ stp q1, q1, [x0, #96]
+ stp q0, q0, [x6, #96]
+ st1 {v1.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_h_tbl):
+ .hword L(ipred_h_tbl) - 64b
+ .hword L(ipred_h_tbl) - 32b
+ .hword L(ipred_h_tbl) - 16b
+ .hword L(ipred_h_tbl) - 8b
+ .hword L(ipred_h_tbl) - 4b
+endfunc
+
+// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_top_16bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_dc_top_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ add x2, x2, #2
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2]
+ addv h0, v0.4h
+ urshr v0.4h, v0.4h, #2
+ dup v0.4h, v0.h[0]
+4:
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2]
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+8:
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h}, [x2]
+ addp v0.8h, v0.8h, v1.8h
+ addv h0, v0.8h
+ urshr v2.4h, v0.4h, #4
+ dup v0.8h, v2.h[0]
+ dup v1.8h, v2.h[0]
+16:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v0.8h, v0.8h, v2.8h
+ uaddlv s0, v0.8h
+ rshrn v4.4h, v0.4s, #5
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v4.h[0]
+32:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ addp v0.8h, v0.8h, v2.8h
+ addp v4.8h, v4.8h, v6.8h
+ addp v0.8h, v0.8h, v4.8h
+ uaddlv s0, v0.8h
+ rshrn v4.4h, v0.4s, #6
+ sub x1, x1, #64
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v4.h[0]
+64:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_dc_top_tbl):
+ .hword L(ipred_dc_top_tbl) - 640b
+ .hword L(ipred_dc_top_tbl) - 320b
+ .hword L(ipred_dc_top_tbl) - 160b
+ .hword L(ipred_dc_top_tbl) - 80b
+ .hword L(ipred_dc_top_tbl) - 40b
+endfunc
+
+// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_left_16bpc_neon, export=1
+ sub x2, x2, w4, uxtw #1
+ clz w3, w3
+ clz w7, w4
+ adr x5, L(ipred_dc_left_tbl)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w7, w7, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w7, [x5, w7, uxtw #1]
+ sub x3, x5, w3, uxtw
+ sub x5, x5, w7, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+
+L(ipred_dc_left_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2]
+ addv h0, v0.4h
+ urshr v0.4h, v0.4h, #2
+ dup v0.8h, v0.h[0]
+ br x3
+L(ipred_dc_left_w4):
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt L(ipred_dc_left_w4)
+ ret
+
+L(ipred_dc_left_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2]
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ br x3
+L(ipred_dc_left_w8):
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt L(ipred_dc_left_w8)
+ ret
+
+L(ipred_dc_left_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h}, [x2]
+ addp v0.8h, v0.8h, v1.8h
+ addv h0, v0.8h
+ urshr v2.4h, v0.4h, #4
+ dup v0.8h, v2.h[0]
+ dup v1.8h, v2.h[0]
+ br x3
+L(ipred_dc_left_w16):
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+1:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v0.8h, v0.8h, v2.8h
+ uaddlp v0.4s, v0.8h
+ addv s0, v0.4s
+ rshrn v4.4h, v0.4s, #5
+ dup v0.8h, v4.h[0]
+ br x3
+L(ipred_dc_left_w32):
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+1:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_h64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ addp v0.8h, v0.8h, v2.8h
+ addp v4.8h, v4.8h, v6.8h
+ addp v0.8h, v0.8h, v4.8h
+ uaddlv s0, v0.8h
+ rshrn v4.4h, v0.4s, #6
+ dup v0.8h, v4.h[0]
+ br x3
+L(ipred_dc_left_w64):
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+ sub x1, x1, #64
+1:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_tbl):
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
+endfunc
+
+// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
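+//
+// dc = (sum(top) + sum(left) + ((width+height) >> 1)) / (width + height).
+// The height-indexed h* handlers sum the left edge and jump via x3 to the
+// width-indexed w* handlers, which add the top edge and do the division.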
+function ipred_dc_16bpc_neon, export=1
+ sub x2, x2, w4, uxtw #1
+ add w7, w3, w4 // width + height
+ clz w3, w3
+ clz w6, w4
+ dup v16.4s, w7 // width + height
+ adr x5, L(ipred_dc_tbl)
+ rbit w7, w7 // rbit(width + height)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w6, w6, #25
+ clz w7, w7 // ctz(width + height)
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w6, [x5, w6, uxtw #1]
+ neg w7, w7 // -ctz(width + height)
+ sub x3, x5, w3, uxtw
+ sub x5, x5, w6, uxtw
+ ushr v16.4s, v16.4s, #1 // (width + height) >> 1
+ dup v17.4s, w7 // -ctz(width + height)
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+
+L(ipred_dc_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2], #8
+ uaddlv s0, v0.4h
+ add x2, x2, #2
+ br x3
+L(ipred_dc_w4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.4h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ uaddlv s1, v1.4h
+ cmp w4, #4
+ add v0.2s, v0.2s, v1.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 8/16
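+ // The remaining division by 3 (w+h=12) or 5 (w+h=20) is done as a plain
+ // 32-bit fixed-point multiply: 0xAAAB ~= (1<<17)/3, 0x6667 ~= (1<<17)/5,
+ // followed by ushr #17.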
+ cmp w4, #16
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.4h, v0.h[0]
+2:
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2], #16
+ uaddlv s0, v0.8h
+ add x2, x2, #2
+ br x3
+L(ipred_dc_w8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ uaddlv s1, v1.8h
+ cmp w4, #8
+ add v0.2s, v0.2s, v1.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 4/16/32
+ cmp w4, #32
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+2:
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h}, [x2], #32
+ addp v0.8h, v0.8h, v1.8h
+ add x2, x2, #2
+ uaddlv s0, v0.8h
+ br x3
+L(ipred_dc_w16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.8h, v2.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ addp v1.8h, v1.8h, v2.8h
+ uaddlv s1, v1.8h
+ cmp w4, #16
+ add v0.2s, v0.2s, v1.2s
+ ushl v4.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 4/8/32/64
+ tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v4.2s, v4.2s, v16.2s
+ ushr v4.2s, v4.2s, #17
+1:
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+2:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v0.8h, v0.8h, v2.8h
+ add x2, x2, #2
+ uaddlv s0, v0.8h
+ br x3
+L(ipred_dc_w32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ addp v1.8h, v1.8h, v2.8h
+ addp v3.8h, v3.8h, v4.8h
+ addp v1.8h, v1.8h, v3.8h
+ uaddlv s1, v1.8h
+ cmp w4, #32
+ add v0.2s, v0.2s, v1.2s
+ ushl v4.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 8/16/64
+ cmp w4, #8
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v4.2s, v4.2s, v16.2s
+ ushr v4.2s, v4.2s, #17
+1:
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v4.h[0]
+2:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ addp v0.8h, v0.8h, v2.8h
+ addp v4.8h, v4.8h, v6.8h
+ addp v0.8h, v0.8h, v4.8h
+ add x2, x2, #2
+ uaddlv s0, v0.8h
+ br x3
+L(ipred_dc_w64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
+ add v0.2s, v0.2s, v16.2s
+ addp v1.8h, v1.8h, v2.8h
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
+ addp v3.8h, v3.8h, v4.8h
+ addp v20.8h, v20.8h, v21.8h
+ addp v22.8h, v22.8h, v23.8h
+ addp v1.8h, v1.8h, v3.8h
+ addp v20.8h, v20.8h, v22.8h
+ addp v1.8h, v1.8h, v20.8h
+ uaddlv s1, v1.8h
+ cmp w4, #64
+ add v0.2s, v0.2s, v1.2s
+ ushl v4.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 16/32
+ cmp w4, #16
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v4.2s, v4.2s, v16.2s
+ ushr v4.2s, v4.2s, #17
+1:
+ sub x1, x1, #64
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v4.h[0]
+2:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_tbl):
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
+endfunc
+
+// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
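+//
+// Paeth: base = left + top - topleft; the prediction is whichever of
+// left/top/topleft is closest to base, preferring left, then top. The code
+// below does the selection with sabd/umin/cmge followed by bsl/bit.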
+function ipred_paeth_16bpc_neon, export=1
+ clz w9, w3
+ adr x5, L(ipred_paeth_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.8h}, [x2]
+ add x8, x2, #2
+ sub x2, x2, #8
+ sub x5, x5, w9, uxtw
+ mov x7, #-8
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v5.2d}, [x8]
+ sub v6.8h, v5.8h, v4.8h // top - topleft
+4:
+ ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7
+ zip1 v0.2d, v0.2d, v1.2d
+ zip1 v2.2d, v2.2d, v3.2d
+ add v16.8h, v6.8h, v0.8h // base
+ add v17.8h, v6.8h, v2.8h
+ sabd v20.8h, v5.8h, v16.8h // tdiff
+ sabd v21.8h, v5.8h, v17.8h
+ sabd v22.8h, v4.8h, v16.8h // tldiff
+ sabd v23.8h, v4.8h, v17.8h
+ sabd v16.8h, v0.8h, v16.8h // ldiff
+ sabd v17.8h, v2.8h, v17.8h
+ umin v18.8h, v20.8h, v22.8h // min(tdiff, tldiff)
+ umin v19.8h, v21.8h, v23.8h
+ cmge v20.8h, v22.8h, v20.8h // tldiff >= tdiff
+ cmge v21.8h, v23.8h, v21.8h
+ cmge v16.8h, v18.8h, v16.8h // min(tdiff, tldiff) >= ldiff
+ cmge v17.8h, v19.8h, v17.8h
+ bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bsl v20.16b, v5.16b, v4.16b
+ bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ...
+ bit v20.16b, v0.16b, v16.16b
+ st1 {v21.d}[1], [x0], x1
+ st1 {v21.d}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v20.d}[1], [x0], x1
+ st1 {v20.d}[0], [x6], x1
+ b.gt 4b
+ ret
+80:
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v5.8h}, [x8], #16
+ mov w9, w3
+ // Set up pointers for four rows in parallel; x0, x6, x5, x10
+ add x5, x0, x1
+ add x10, x6, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw #1
+1:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+2:
+ sub v6.8h, v5.8h, v4.8h // top - topleft
+ add v16.8h, v6.8h, v0.8h // base
+ add v17.8h, v6.8h, v1.8h
+ add v18.8h, v6.8h, v2.8h
+ add v19.8h, v6.8h, v3.8h
+ sabd v20.8h, v5.8h, v16.8h // tdiff
+ sabd v21.8h, v5.8h, v17.8h
+ sabd v22.8h, v5.8h, v18.8h
+ sabd v23.8h, v5.8h, v19.8h
+ sabd v24.8h, v4.8h, v16.8h // tldiff
+ sabd v25.8h, v4.8h, v17.8h
+ sabd v26.8h, v4.8h, v18.8h
+ sabd v27.8h, v4.8h, v19.8h
+ sabd v16.8h, v0.8h, v16.8h // ldiff
+ sabd v17.8h, v1.8h, v17.8h
+ sabd v18.8h, v2.8h, v18.8h
+ sabd v19.8h, v3.8h, v19.8h
+ umin v28.8h, v20.8h, v24.8h // min(tdiff, tldiff)
+ umin v29.8h, v21.8h, v25.8h
+ umin v30.8h, v22.8h, v26.8h
+ umin v31.8h, v23.8h, v27.8h
+ cmge v20.8h, v24.8h, v20.8h // tldiff >= tdiff
+ cmge v21.8h, v25.8h, v21.8h
+ cmge v22.8h, v26.8h, v22.8h
+ cmge v23.8h, v27.8h, v23.8h
+ cmge v16.8h, v28.8h, v16.8h // min(tdiff, tldiff) >= ldiff
+ cmge v17.8h, v29.8h, v17.8h
+ cmge v18.8h, v30.8h, v18.8h
+ cmge v19.8h, v31.8h, v19.8h
+ bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bsl v22.16b, v5.16b, v4.16b
+ bsl v21.16b, v5.16b, v4.16b
+ bsl v20.16b, v5.16b, v4.16b
+ bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ...
+ bit v22.16b, v2.16b, v18.16b
+ bit v21.16b, v1.16b, v17.16b
+ bit v20.16b, v0.16b, v16.16b
+ st1 {v23.8h}, [x0], #16
+ st1 {v22.8h}, [x6], #16
+ subs w3, w3, #8
+ st1 {v21.8h}, [x5], #16
+ st1 {v20.8h}, [x10], #16
+ b.le 8f
+ ld1 {v5.8h}, [x8], #16
+ b 2b
+8:
+ subs w4, w4, #4
+ b.le 9f
+ // End of horizontal loop, move pointers to next four rows
+ sub x8, x8, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ // Load the top row as early as possible
+ ld1 {v5.8h}, [x8], #16
+ add x5, x5, x1
+ add x10, x10, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_paeth_tbl):
+ .hword L(ipred_paeth_tbl) - 640b
+ .hword L(ipred_paeth_tbl) - 320b
+ .hword L(ipred_paeth_tbl) - 160b
+ .hword L(ipred_paeth_tbl) - 80b
+ .hword L(ipred_paeth_tbl) - 40b
+endfunc
+
+// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
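+//
+// Smooth is the sum of a vertical and a horizontal 256-weight blend,
+// computed as (w_ver*(top-bottom) + w_hor*(left-right)
+// + 256*(bottom+right) + 256) >> 9, with w_hor/w_ver taken from the
+// sm_weights table (offset by width and height respectively).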
+function ipred_smooth_16bpc_neon, export=1
+ movrel x10, X(sm_weights)
+ add x11, x10, w4, uxtw
+ add x10, x10, w3, uxtw
+ clz w9, w3
+ adr x5, L(ipred_smooth_tbl)
+ sub x12, x2, w4, uxtw #1
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.8h}, [x12] // bottom
+ add x8, x2, #2
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v6.2d}, [x8] // top
+ ld1r {v7.2s}, [x10] // weights_hor
+ sub x2, x2, #8
+ mov x7, #-8
+ dup v5.8h, v6.h[3] // right
+ sub v6.8h, v6.8h, v4.8h // top-bottom
+ uxtl v7.8h, v7.8b // weights_hor
+ add v31.4h, v4.4h, v5.4h // bottom+right
+4:
+ ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
+ ushll v20.4s, v31.4h, #8 // (bottom+right)*256
+ ushll v21.4s, v31.4h, #8
+ ushll v22.4s, v31.4h, #8
+ ushll v23.4s, v31.4h, #8
+ zip1 v1.2d, v1.2d, v0.2d // left, flipped
+ zip1 v0.2d, v3.2d, v2.2d
+ zip1 v16.2s, v16.2s, v17.2s // weights_ver
+ zip1 v18.2s, v18.2s, v19.2s
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v18.8h, v18.8b
+ smlal v20.4s, v0.4h, v7.4h // += (left-right)*weights_hor
+ smlal2 v21.4s, v0.8h, v7.8h
+ smlal v22.4s, v1.4h, v7.4h
+ smlal2 v23.4s, v1.8h, v7.8h
+ smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver
+ smlal2 v21.4s, v6.8h, v16.8h
+ smlal v22.4s, v6.4h, v18.4h
+ smlal2 v23.4s, v6.8h, v18.8h
+ rshrn v20.4h, v20.4s, #9
+ rshrn v21.4h, v21.4s, #9
+ rshrn v22.4h, v22.4s, #9
+ rshrn v23.4h, v23.4s, #9
+ st1 {v20.4h}, [x0], x1
+ st1 {v21.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.4h}, [x0], x1
+ st1 {v23.4h}, [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v6.8h}, [x8] // top
+ ld1 {v7.8b}, [x10] // weights_hor
+ sub x2, x2, #8
+ mov x7, #-8
+ dup v5.8h, v6.h[7] // right
+ sub v6.8h, v6.8h, v4.8h // top-bottom
+ uxtl v7.8h, v7.8b // weights_hor
+ add v31.4h, v4.4h, v5.4h // bottom+right
+8:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
+ ushll v20.4s, v31.4h, #8 // (bottom+right)*256
+ ushll v21.4s, v31.4h, #8
+ ushll v22.4s, v31.4h, #8
+ ushll v23.4s, v31.4h, #8
+ ushll v24.4s, v31.4h, #8
+ ushll v25.4s, v31.4h, #8
+ ushll v26.4s, v31.4h, #8
+ ushll v27.4s, v31.4h, #8
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ sub v2.8h, v2.8h, v5.8h
+ sub v3.8h, v3.8h, v5.8h
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+ smlal v20.4s, v3.4h, v7.4h // += (left-right)*weights_hor
+ smlal2 v21.4s, v3.8h, v7.8h // (left flipped)
+ smlal v22.4s, v2.4h, v7.4h
+ smlal2 v23.4s, v2.8h, v7.8h
+ smlal v24.4s, v1.4h, v7.4h
+ smlal2 v25.4s, v1.8h, v7.8h
+ smlal v26.4s, v0.4h, v7.4h
+ smlal2 v27.4s, v0.8h, v7.8h
+ smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver
+ smlal2 v21.4s, v6.8h, v16.8h
+ smlal v22.4s, v6.4h, v17.4h
+ smlal2 v23.4s, v6.8h, v17.8h
+ smlal v24.4s, v6.4h, v18.4h
+ smlal2 v25.4s, v6.8h, v18.8h
+ smlal v26.4s, v6.4h, v19.4h
+ smlal2 v27.4s, v6.8h, v19.8h
+ rshrn v20.4h, v20.4s, #9
+ rshrn2 v20.8h, v21.4s, #9
+ rshrn v21.4h, v22.4s, #9
+ rshrn2 v21.8h, v23.4s, #9
+ rshrn v22.4h, v24.4s, #9
+ rshrn2 v22.8h, v25.4s, #9
+ rshrn v23.4h, v26.4s, #9
+ rshrn2 v23.8h, v27.4s, #9
+ st1 {v20.8h}, [x0], x1
+ st1 {v21.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.8h}, [x0], x1
+ st1 {v23.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x12, x2, w3, uxtw #1
+ sub x1, x1, w3, uxtw #1
+ ld1r {v5.8h}, [x12] // right
+ sub x2, x2, #4
+ mov x7, #-4
+ mov w9, w3
+ add v31.4h, v4.4h, v5.4h // bottom+right
+
+1:
+ ld2r {v0.8h, v1.8h}, [x2], x7 // left
+ ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+2:
+ ld1 {v7.16b}, [x10], #16 // weights_hor
+ ld1 {v2.8h, v3.8h}, [x8], #32 // top
+ ushll v20.4s, v31.4h, #8 // (bottom+right)*256
+ ushll v21.4s, v31.4h, #8
+ ushll v22.4s, v31.4h, #8
+ ushll v23.4s, v31.4h, #8
+ ushll v24.4s, v31.4h, #8
+ ushll v25.4s, v31.4h, #8
+ ushll v26.4s, v31.4h, #8
+ ushll v27.4s, v31.4h, #8
+ uxtl v6.8h, v7.8b // weights_hor
+ uxtl2 v7.8h, v7.16b
+ sub v2.8h, v2.8h, v4.8h // top-bottom
+ sub v3.8h, v3.8h, v4.8h
+ smlal v20.4s, v1.4h, v6.4h // += (left-right)*weights_hor
+ smlal2 v21.4s, v1.8h, v6.8h // (left flipped)
+ smlal v22.4s, v1.4h, v7.4h
+ smlal2 v23.4s, v1.8h, v7.8h
+ smlal v24.4s, v0.4h, v6.4h
+ smlal2 v25.4s, v0.8h, v6.8h
+ smlal v26.4s, v0.4h, v7.4h
+ smlal2 v27.4s, v0.8h, v7.8h
+ smlal v20.4s, v2.4h, v16.4h // += (top-bottom)*weights_ver
+ smlal2 v21.4s, v2.8h, v16.8h
+ smlal v22.4s, v3.4h, v16.4h
+ smlal2 v23.4s, v3.8h, v16.8h
+ smlal v24.4s, v2.4h, v17.4h
+ smlal2 v25.4s, v2.8h, v17.8h
+ smlal v26.4s, v3.4h, v17.4h
+ smlal2 v27.4s, v3.8h, v17.8h
+ rshrn v20.4h, v20.4s, #9
+ rshrn2 v20.8h, v21.4s, #9
+ rshrn v21.4h, v22.4s, #9
+ rshrn2 v21.8h, v23.4s, #9
+ rshrn v22.4h, v24.4s, #9
+ rshrn2 v22.8h, v25.4s, #9
+ rshrn v23.4h, v26.4s, #9
+ rshrn2 v23.8h, v27.4s, #9
+ subs w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ st1 {v22.8h, v23.8h}, [x6], #32
+ b.gt 2b
+ subs w4, w4, #2
+ b.le 9f
+ sub x8, x8, w9, uxtw #1
+ sub x10, x10, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_tbl):
+ .hword L(ipred_smooth_tbl) - 640b
+ .hword L(ipred_smooth_tbl) - 320b
+ .hword L(ipred_smooth_tbl) - 160b
+ .hword L(ipred_smooth_tbl) - 80b
+ .hword L(ipred_smooth_tbl) - 40b
+endfunc
+
+// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
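+// This computes pred = bottom + (((top-bottom)*w_ver + 128) >> 8); the
+// weights are pre-shifted left by 7 so that sqrdmulh does the multiply,
+// round and shift in one instruction.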
+function ipred_smooth_v_16bpc_neon, export=1
+ movrel x7, X(sm_weights)
+ add x7, x7, w4, uxtw
+ clz w9, w3
+ adr x5, L(ipred_smooth_v_tbl)
+ sub x8, x2, w4, uxtw #1
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.8h}, [x8] // bottom
+ add x2, x2, #2
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v6.2d}, [x2] // top
+ sub v6.8h, v6.8h, v4.8h // top-bottom
+4:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ zip1 v16.2s, v16.2s, v17.2s // weights_ver
+ zip1 v18.2s, v18.2s, v19.2s
+ ushll v16.8h, v16.8b, #7 // weights_ver << 7
+ ushll v18.8h, v18.8b, #7
+ sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
+ sqrdmulh v21.8h, v6.8h, v18.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v4.8h
+ st1 {v20.d}[0], [x0], x1
+ st1 {v20.d}[1], [x6], x1
+ subs w4, w4, #4
+ st1 {v21.d}[0], [x0], x1
+ st1 {v21.d}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v6.8h}, [x2] // top
+ sub v6.8h, v6.8h, v4.8h // top-bottom
+8:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ ushll v16.8h, v16.8b, #7 // weights_ver << 7
+ ushll v17.8h, v17.8b, #7
+ ushll v18.8h, v18.8b, #7
+ ushll v19.8h, v19.8b, #7
+ sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
+ sqrdmulh v21.8h, v6.8h, v17.8h
+ sqrdmulh v22.8h, v6.8h, v18.8h
+ sqrdmulh v23.8h, v6.8h, v19.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v4.8h
+ add v22.8h, v22.8h, v4.8h
+ add v23.8h, v23.8h, v4.8h
+ st1 {v20.8h}, [x0], x1
+ st1 {v21.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.8h}, [x0], x1
+ st1 {v23.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ // Set up pointers for four rows in parallel; x0, x6, x5, x8
+ add x5, x0, x1
+ add x8, x6, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw #1
+ mov w9, w3
+
+1:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ ushll v16.8h, v16.8b, #7 // weights_ver << 7
+ ushll v17.8h, v17.8b, #7
+ ushll v18.8h, v18.8b, #7
+ ushll v19.8h, v19.8b, #7
+2:
+ ld1 {v2.8h, v3.8h}, [x2], #32 // top
+ sub v2.8h, v2.8h, v4.8h // top-bottom
+ sub v3.8h, v3.8h, v4.8h
+ sqrdmulh v20.8h, v2.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
+ sqrdmulh v21.8h, v3.8h, v16.8h
+ sqrdmulh v22.8h, v2.8h, v17.8h
+ sqrdmulh v23.8h, v3.8h, v17.8h
+ sqrdmulh v24.8h, v2.8h, v18.8h
+ sqrdmulh v25.8h, v3.8h, v18.8h
+ sqrdmulh v26.8h, v2.8h, v19.8h
+ sqrdmulh v27.8h, v3.8h, v19.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v4.8h
+ add v22.8h, v22.8h, v4.8h
+ add v23.8h, v23.8h, v4.8h
+ add v24.8h, v24.8h, v4.8h
+ add v25.8h, v25.8h, v4.8h
+ add v26.8h, v26.8h, v4.8h
+ add v27.8h, v27.8h, v4.8h
+ subs w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ st1 {v22.8h, v23.8h}, [x6], #32
+ st1 {v24.8h, v25.8h}, [x5], #32
+ st1 {v26.8h, v27.8h}, [x8], #32
+ b.gt 2b
+ subs w4, w4, #4
+ b.le 9f
+ sub x2, x2, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ add x5, x5, x1
+ add x8, x8, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_v_tbl):
+ .hword L(ipred_smooth_v_tbl) - 640b
+ .hword L(ipred_smooth_v_tbl) - 320b
+ .hword L(ipred_smooth_v_tbl) - 160b
+ .hword L(ipred_smooth_v_tbl) - 80b
+ .hword L(ipred_smooth_v_tbl) - 40b
+endfunc
+
+// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
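+// This computes pred = right + (((left-right)*w_hor + 128) >> 8), using
+// the same sqrdmulh trick as ipred_smooth_v (weights pre-shifted by 7).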
+function ipred_smooth_h_16bpc_neon, export=1
+ movrel x8, X(sm_weights)
+ add x8, x8, w3, uxtw
+ clz w9, w3
+ adr x5, L(ipred_smooth_h_tbl)
+ add x12, x2, w3, uxtw #1
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v5.8h}, [x12] // right
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v7.2s}, [x8] // weights_hor
+ sub x2, x2, #8
+ mov x7, #-8
+ ushll v7.8h, v7.8b, #7 // weights_hor << 7
+4:
+ ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
+ zip1 v1.2d, v1.2d, v0.2d // left, flipped
+ zip1 v0.2d, v3.2d, v2.2d
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ sqrdmulh v20.8h, v0.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8
+ sqrdmulh v21.8h, v1.8h, v7.8h
+ add v20.8h, v20.8h, v5.8h
+ add v21.8h, v21.8h, v5.8h
+ st1 {v20.d}[0], [x0], x1
+ st1 {v20.d}[1], [x6], x1
+ subs w4, w4, #4
+ st1 {v21.d}[0], [x0], x1
+ st1 {v21.d}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v7.8b}, [x8] // weights_hor
+ sub x2, x2, #8
+ mov x7, #-8
+ ushll v7.8h, v7.8b, #7 // weights_hor << 7
+8:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
+ sub v3.8h, v3.8h, v5.8h // left-right
+ sub v2.8h, v2.8h, v5.8h
+ sub v1.8h, v1.8h, v5.8h
+ sub v0.8h, v0.8h, v5.8h
+ sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8
+ sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped)
+ sqrdmulh v22.8h, v1.8h, v7.8h
+ sqrdmulh v23.8h, v0.8h, v7.8h
+ add v20.8h, v20.8h, v5.8h
+ add v21.8h, v21.8h, v5.8h
+ add v22.8h, v22.8h, v5.8h
+ add v23.8h, v23.8h, v5.8h
+ st1 {v20.8h}, [x0], x1
+ st1 {v21.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.8h}, [x0], x1
+ st1 {v23.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ sub x2, x2, #8
+ mov x7, #-8
+ // Set up pointers for four rows in parallel; x0, x6, x5, x10
+ add x5, x0, x1
+ add x10, x6, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw #1
+ mov w9, w3
+
+1:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ sub v2.8h, v2.8h, v5.8h
+ sub v3.8h, v3.8h, v5.8h
+2:
+ ld1 {v7.16b}, [x8], #16 // weights_hor
+ ushll v6.8h, v7.8b, #7 // weights_hor << 7
+ ushll2 v7.8h, v7.16b, #7
+ sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8
+ sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped)
+ sqrdmulh v22.8h, v2.8h, v6.8h
+ sqrdmulh v23.8h, v2.8h, v7.8h
+ sqrdmulh v24.8h, v1.8h, v6.8h
+ sqrdmulh v25.8h, v1.8h, v7.8h
+ sqrdmulh v26.8h, v0.8h, v6.8h
+ sqrdmulh v27.8h, v0.8h, v7.8h
+ add v20.8h, v20.8h, v5.8h
+ add v21.8h, v21.8h, v5.8h
+ add v22.8h, v22.8h, v5.8h
+ add v23.8h, v23.8h, v5.8h
+ add v24.8h, v24.8h, v5.8h
+ add v25.8h, v25.8h, v5.8h
+ add v26.8h, v26.8h, v5.8h
+ add v27.8h, v27.8h, v5.8h
+ subs w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ st1 {v22.8h, v23.8h}, [x6], #32
+ st1 {v24.8h, v25.8h}, [x5], #32
+ st1 {v26.8h, v27.8h}, [x10], #32
+ b.gt 2b
+ subs w4, w4, #4
+ b.le 9f
+ sub x8, x8, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ add x5, x5, x1
+ add x10, x10, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_h_tbl):
+ .hword L(ipred_smooth_h_tbl) - 640b
+ .hword L(ipred_smooth_h_tbl) - 320b
+ .hword L(ipred_smooth_h_tbl) - 160b
+ .hword L(ipred_smooth_h_tbl) - 80b
+ .hword L(ipred_smooth_h_tbl) - 40b
+endfunc
+
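+// Reading up to 48 bytes starting at padding_mask - 2*n yields a mask
+// whose first n halfwords are zero and the rest all-ones; BIT with this
+// mask replaces the out-of-range tail of an edge buffer with a padding
+// pixel.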
+const padding_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+padding_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz,
+// const pixel *const in, const int end,
+// const int bitdepth_max);
+function ipred_z1_upsample_edge_16bpc_neon, export=1
+ dup v30.8h, w4 // bitdepth_max
+ movrel x4, padding_mask
+ ld1 {v0.8h, v1.8h}, [x2] // in[]
+ add x5, x2, w3, uxtw #1 // in[end]
+ sub x4, x4, w3, uxtw #1
+
+ ld1r {v2.8h}, [x5] // padding
+ ld1 {v3.8h, v4.8h}, [x4] // padding_mask
+
+ movi v31.8h, #9
+
+ bit v0.16b, v2.16b, v3.16b // padded in[]
+ bit v1.16b, v2.16b, v4.16b
+
+ ext v4.16b, v0.16b, v1.16b, #2
+ ext v5.16b, v1.16b, v2.16b, #2
+ ext v6.16b, v0.16b, v1.16b, #4
+ ext v7.16b, v1.16b, v2.16b, #4
+ ext v16.16b, v0.16b, v1.16b, #6
+ ext v17.16b, v1.16b, v2.16b, #6
+
+ add v18.8h, v4.8h, v6.8h // in[i+1] + in[i+2]
+ add v19.8h, v5.8h, v7.8h
+ add v20.8h, v0.8h, v16.8h // in[i+0] + in[i+3]
+ add v21.8h, v1.8h, v17.8h
+ umull v22.4s, v18.4h, v31.4h // 9*(in[i+1] + in[i+2])
+ umull2 v23.4s, v18.8h, v31.8h
+ umull v24.4s, v19.4h, v31.4h
+ umull2 v25.4s, v19.8h, v31.8h
+ usubw v22.4s, v22.4s, v20.4h
+ usubw2 v23.4s, v23.4s, v20.8h
+ usubw v24.4s, v24.4s, v21.4h
+ usubw2 v25.4s, v25.4s, v21.8h
+
+ sqrshrun v16.4h, v22.4s, #4
+ sqrshrun2 v16.8h, v23.4s, #4
+ sqrshrun v17.4h, v24.4s, #4
+ sqrshrun2 v17.8h, v25.4s, #4
+
+ smin v16.8h, v16.8h, v30.8h
+ smin v17.8h, v17.8h, v30.8h
+
+ zip1 v0.8h, v4.8h, v16.8h
+ zip2 v1.8h, v4.8h, v16.8h
+ zip1 v2.8h, v5.8h, v17.8h
+ zip2 v3.8h, v5.8h, v17.8h
+
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
+
+ ret
+endfunc
+
+// void ipred_z2_upsample_edge_16bpc_neon(pixel *out, const int sz,
+// const pixel *const in,
+// const int bitdepth_max);
+function ipred_z2_upsample_edge_16bpc_neon, export=1
+ dup v30.8h, w3 // bitdepth_max
+ // Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
+ movrel x4, padding_mask
+ ld1 {v0.8h, v1.8h}, [x2] // in[]
+ add x5, x2, w1, uxtw #1 // in[sz]
+ sub x4, x4, w1, uxtw #1
+
+ ld1r {v3.8h}, [x2] // in[0] for padding
+ ld1r {v2.8h}, [x5] // padding
+ ld1 {v4.8h, v5.8h}, [x4] // padding_mask
+
+ movi v31.8h, #9
+
+ bit v0.16b, v2.16b, v4.16b // padded in[]
+ bit v1.16b, v2.16b, v5.16b
+
+ ext v4.16b, v3.16b, v0.16b, #14
+ ext v5.16b, v0.16b, v1.16b, #2
+ ext v6.16b, v0.16b, v1.16b, #4
+
+ add v16.8h, v0.8h, v5.8h // in[i+0] + in[i+1]
+ add v17.8h, v4.8h, v6.8h // in[i-1] + in[i+2]
+ umull v18.4s, v16.4h, v31.4h // 9*(in[i+0] + in[i+1])
+ umull2 v19.4s, v16.8h, v31.8h
+ usubw v18.4s, v18.4s, v17.4h
+ usubw2 v19.4s, v19.4s, v17.8h
+
+ sqrshrun v16.4h, v18.4s, #4
+ sqrshrun2 v16.8h, v19.4s, #4
+
+ add x5, x0, #2*16
+
+ smin v16.8h, v16.8h, v30.8h
+
+ zip1 v4.8h, v0.8h, v16.8h
+ zip2 v5.8h, v0.8h, v16.8h
+
+ // In case sz=8, output one single pixel in out[16].
+ st1 {v2.h}[0], [x5]
+ st1 {v4.8h, v5.8h}, [x0]
+
+ ret
+endfunc
+
+const edge_filter
+ .short 0, 4, 8, 0
+ .short 0, 5, 6, 0
+// Leaving out the coeffs for strength=3
+// .short 2, 4, 4, 0
+endconst
+
+// void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz,
+// const pixel *const in, const int end,
+// const int strength);
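+// Strength 1 and 2 apply the symmetric 3-tap kernels (4,8,4)/16 and
+// (5,6,5)/16; only kernel[1-2] is stored in edge_filter, the outer tap
+// reusing the first loaded coefficient. Strength 3 uses the 5-tap
+// kernel (2,4,4,4,2)/16 in the L(fivetap) path below.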
+function ipred_z1_filter_edge_16bpc_neon, export=1
+ cmp w4, #3
+ b.eq L(fivetap) // if (strength == 3) goto fivetap
+
+ movrel x5, edge_filter, -6
+ add x5, x5, w4, uxtw #3 // edge_filter + 2*((strength - 1)*4 + 1)
+
+ ld1 {v31.s}[0], [x5] // kernel[1-2]
+
+ ld1 {v0.8h}, [x2], #16
+
+ dup v30.8h, v31.h[0]
+ dup v31.8h, v31.h[1]
+1:
+ // in[end] is the last valid pixel. We produce 16 pixels out by
+ // using 18 pixels in - the last pixel used is [17] of the ones
+ // read/buffered.
+ cmp w3, #17
+ ld1 {v1.8h, v2.8h}, [x2], #32
+ b.lt 2f
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v1.16b, v2.16b, #2
+ ext v5.16b, v0.16b, v1.16b, #4
+ ext v6.16b, v1.16b, v2.16b, #4
+ mul v16.8h, v0.8h, v30.8h
+ mla v16.8h, v3.8h, v31.8h
+ mla v16.8h, v5.8h, v30.8h
+ mul v17.8h, v1.8h, v30.8h
+ mla v17.8h, v4.8h, v31.8h
+ mla v17.8h, v6.8h, v30.8h
+ subs w1, w1, #16
+ mov v0.16b, v2.16b
+ urshr v16.8h, v16.8h, #4
+ urshr v17.8h, v17.8h, #4
+ sub w3, w3, #16
+ st1 {v16.8h, v17.8h}, [x0], #32
+ b.gt 1b
+ ret
+2:
+ // Right padding
+
+ // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead)
+ movrel x5, padding_mask
+ sub w6, w3, #24
+ sub x5, x5, w3, uxtw #1
+ add x6, x2, w6, sxtw #1
+
+ ld1 {v3.8h, v4.8h}, [x5] // padding_mask
+
+ ld1r {v2.8h}, [x6]
+ bit v0.16b, v2.16b, v3.16b // Pad v0-v1
+ bit v1.16b, v2.16b, v4.16b
+
+ // Filter one block
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v1.16b, v2.16b, #2
+ ext v5.16b, v0.16b, v1.16b, #4
+ ext v6.16b, v1.16b, v2.16b, #4
+ mul v16.8h, v0.8h, v30.8h
+ mla v16.8h, v3.8h, v31.8h
+ mla v16.8h, v5.8h, v30.8h
+ mul v17.8h, v1.8h, v30.8h
+ mla v17.8h, v4.8h, v31.8h
+ mla v17.8h, v6.8h, v30.8h
+ subs w1, w1, #16
+ urshr v16.8h, v16.8h, #4
+ urshr v17.8h, v17.8h, #4
+ st1 {v16.8h, v17.8h}, [x0], #32
+ b.le 9f
+5:
+ // After one block, any remaining output would only be filtering
+ // padding - thus just store the padding.
+ subs w1, w1, #16
+ st1 {v2.16b}, [x0], #16
+ b.gt 5b
+9:
+ ret
+
+L(fivetap):
+ sub x2, x2, #2 // topleft -= 1 pixel
+ movi v29.8h, #2
+ ld1 {v0.8h}, [x2], #16
+ movi v30.8h, #4
+ movi v31.8h, #4
+ ins v0.h[0], v0.h[1]
+1:
+ // in[end+1] is the last valid pixel. We produce 16 pixels out by
+ // using 20 pixels in - the last pixel used is [19] of the ones
+ // read/buffered.
+ cmp w3, #18
+ ld1 {v1.8h, v2.8h}, [x2], #32
+ b.lt 2f // if (end + 1 < 19)
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v1.16b, v2.16b, #2
+ ext v5.16b, v0.16b, v1.16b, #4
+ ext v6.16b, v1.16b, v2.16b, #4
+ ext v16.16b, v0.16b, v1.16b, #6
+ ext v17.16b, v1.16b, v2.16b, #6
+ ext v18.16b, v0.16b, v1.16b, #8
+ ext v19.16b, v1.16b, v2.16b, #8
+ mul v20.8h, v0.8h, v29.8h
+ mla v20.8h, v3.8h, v30.8h
+ mla v20.8h, v5.8h, v31.8h
+ mla v20.8h, v16.8h, v30.8h
+ mla v20.8h, v18.8h, v29.8h
+ mul v21.8h, v1.8h, v29.8h
+ mla v21.8h, v4.8h, v30.8h
+ mla v21.8h, v6.8h, v31.8h
+ mla v21.8h, v17.8h, v30.8h
+ mla v21.8h, v19.8h, v29.8h
+ subs w1, w1, #16
+ mov v0.16b, v2.16b
+ urshr v20.8h, v20.8h, #4
+ urshr v21.8h, v21.8h, #4
+ sub w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ b.gt 1b
+ ret
+2:
+ // Right padding
+
+ // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead)
+ movrel x5, padding_mask, -2
+ sub w6, w3, #23
+ sub x5, x5, w3, uxtw #1
+ add x6, x2, w6, sxtw #1
+
+ ld1 {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask
+
+ ld1r {v28.8h}, [x6]
+ bit v0.16b, v28.16b, v3.16b // Pad v0-v2
+ bit v1.16b, v28.16b, v4.16b
+ bit v2.16b, v28.16b, v5.16b
+4:
+ // Filter one block
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v1.16b, v2.16b, #2
+ ext v5.16b, v0.16b, v1.16b, #4
+ ext v6.16b, v1.16b, v2.16b, #4
+ ext v16.16b, v0.16b, v1.16b, #6
+ ext v17.16b, v1.16b, v2.16b, #6
+ ext v18.16b, v0.16b, v1.16b, #8
+ ext v19.16b, v1.16b, v2.16b, #8
+ mul v20.8h, v0.8h, v29.8h
+ mla v20.8h, v3.8h, v30.8h
+ mla v20.8h, v5.8h, v31.8h
+ mla v20.8h, v16.8h, v30.8h
+ mla v20.8h, v18.8h, v29.8h
+ mul v21.8h, v1.8h, v29.8h
+ mla v21.8h, v4.8h, v30.8h
+ mla v21.8h, v6.8h, v31.8h
+ mla v21.8h, v17.8h, v30.8h
+ mla v21.8h, v19.8h, v29.8h
+ subs w1, w1, #16
+ mov v0.16b, v2.16b
+ mov v1.16b, v28.16b
+ mov v2.16b, v28.16b
+ urshr v20.8h, v20.8h, #4
+ urshr v21.8h, v21.8h, #4
+ sub w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ b.le 9f
+ // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
+ // filter properly once more - aka (w3 >= 0).
+ cmp w3, #0
+ b.ge 4b
+5:
+ // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
+ // last valid pixel - thus just output that without filtering.
+ subs w1, w1, #8
+ st1 {v28.8h}, [x0], #16
+ b.gt 5b
+9:
+ ret
+endfunc
+
+// void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px,
+// const int n);
+function ipred_pixel_set_16bpc_neon, export=1
+ dup v0.8h, w1
+1:
+ subs w2, w2, #8
+ st1 {v0.8h}, [x0], #16
+ b.gt 1b
+ ret
+endfunc
+
+// void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const top,
+// const int width, const int height,
+// const int dx, const int max_base_x);
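+// For each row: base = xpos >> 6, frac = xpos & 0x3e, and
+// dst[x] = (top[base+x]*(64-frac) + top[base+x+1]*frac + 32) >> 6,
+// with xpos advancing by dx per row. Once base >= max_base_x, all
+// remaining rows are filled with top[max_base_x].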
+function ipred_z1_fill1_16bpc_neon, export=1
+ clz w9, w3
+ adr x8, L(ipred_z1_fill1_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x8, w9, uxtw #1]
+ add x10, x2, w6, uxtw #1 // top[max_base_x]
+ sub x8, x8, w9, uxtw
+ ld1r {v31.8h}, [x10] // padding
+ mov w7, w5
+ mov w15, #64
+ br x8
+40:
+ AARCH64_VALID_JUMP_TARGET
+4:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 49f
+ lsl w8, w8, #1
+ lsl w10, w10, #1
+ ldr q0, [x2, w8, uxtw] // top[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.4h, w9 // frac
+ dup v5.4h, w11
+ ext v1.16b, v0.16b, v0.16b, #2 // top[base+1]
+ ext v3.16b, v2.16b, v2.16b, #2
+ sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base]
+ sub v7.4h, v3.4h, v2.4h
+ ushll v16.4s, v0.4h, #6 // top[base]*64
+ ushll v17.4s, v2.4h, #6
+ smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac
+ smlal v17.4s, v7.4h, v5.4h
+ rshrn v16.4h, v16.4s, #6
+ rshrn v17.4h, v17.4s, #6
+ st1 {v16.4h}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.4h}, [x0], x1
+ b.gt 4b
+ ret
+
+49:
+ st1 {v31.4h}, [x0], x1
+ subs w4, w4, #2
+ st1 {v31.4h}, [x0], x1
+ b.gt 49b
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+8:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 89f
+ add x8, x2, w8, uxtw #1
+ add x10, x2, w10, uxtw #1
+ dup v4.8h, w9 // frac
+ dup v5.8h, w11
+ ld1 {v0.8h}, [x8] // top[base]
+ ld1 {v2.8h}, [x10]
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ ldr h1, [x8, #16]
+ ldr h3, [x10, #16]
+ dup v6.8h, w9 // 64 - frac
+ dup v7.8h, w11
+ ext v1.16b, v0.16b, v1.16b, #2 // top[base+1]
+ ext v3.16b, v2.16b, v3.16b, #2
+ umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac)
+ umlal v16.4s, v1.4h, v4.4h // + top[base+1]*frac
+ umull2 v17.4s, v0.8h, v6.8h
+ umlal2 v17.4s, v1.8h, v4.8h
+ umull v18.4s, v2.4h, v7.4h
+ umlal v18.4s, v3.4h, v5.4h
+ umull2 v19.4s, v2.8h, v7.8h
+ umlal2 v19.4s, v3.8h, v5.8h
+ rshrn v16.4h, v16.4s, #6
+ rshrn2 v16.8h, v17.4s, #6
+ rshrn v17.4h, v18.4s, #6
+ rshrn2 v17.8h, v19.4s, #6
+ st1 {v16.8h}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.8h}, [x0], x1
+ b.gt 8b
+ ret
+
+89:
+ st1 {v31.8h}, [x0], x1
+ subs w4, w4, #2
+ st1 {v31.8h}, [x0], x1
+ b.gt 89b
+ ret
+
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+
+ mov w12, w3
+
+ add x13, x0, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw #1
+1:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 169f
+ add x8, x2, w8, uxtw #1
+ add x10, x2, w10, uxtw #1
+ dup v6.8h, w9 // frac
+ dup v7.8h, w11
+ ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // top[base]
+ ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v16.8h, w9 // 64 - frac
+ dup v17.8h, w11
+ add w7, w7, w5 // xpos += dx
+2:
+ ext v18.16b, v0.16b, v1.16b, #2 // top[base+1]
+ ext v19.16b, v1.16b, v2.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #2
+ ext v21.16b, v4.16b, v5.16b, #2
+ subs w3, w3, #16
+ umull v22.4s, v0.4h, v16.4h // top[base]*(64-frac)
+ umlal v22.4s, v18.4h, v6.4h // + top[base+1]*frac
+ umull2 v23.4s, v0.8h, v16.8h
+ umlal2 v23.4s, v18.8h, v6.8h
+ umull v24.4s, v1.4h, v16.4h
+ umlal v24.4s, v19.4h, v6.4h
+ umull2 v25.4s, v1.8h, v16.8h
+ umlal2 v25.4s, v19.8h, v6.8h
+ umull v26.4s, v3.4h, v17.4h
+ umlal v26.4s, v20.4h, v7.4h
+ umull2 v27.4s, v3.8h, v17.8h
+ umlal2 v27.4s, v20.8h, v7.8h
+ umull v28.4s, v4.4h, v17.4h
+ umlal v28.4s, v21.4h, v7.4h
+ umull2 v29.4s, v4.8h, v17.8h
+ umlal2 v29.4s, v21.8h, v7.8h
+ rshrn v22.4h, v22.4s, #6
+ rshrn2 v22.8h, v23.4s, #6
+ rshrn v23.4h, v24.4s, #6
+ rshrn2 v23.8h, v25.4s, #6
+ rshrn v24.4h, v26.4s, #6
+ rshrn2 v24.8h, v27.4s, #6
+ rshrn v25.4h, v28.4s, #6
+ rshrn2 v25.8h, v29.4s, #6
+ st1 {v22.8h, v23.8h}, [x0], #32
+ st1 {v24.8h, v25.8h}, [x13], #32
+ b.le 3f
+ mov v0.16b, v2.16b
+ ld1 {v1.8h, v2.8h}, [x8], #32 // top[base]
+ mov v3.16b, v5.16b
+ ld1 {v4.8h, v5.8h}, [x10], #32
+ b 2b
+
+3:
+ subs w4, w4, #2
+ b.le 9f
+ add x0, x0, x1
+ add x13, x13, x1
+ mov w3, w12
+ b 1b
+9:
+ ret
+
+169:
+ st1 {v31.8h}, [x0], #16
+ subs w3, w3, #8
+ st1 {v31.8h}, [x13], #16
+ b.gt 169b
+ subs w4, w4, #2
+ b.le 9b
+ add x0, x0, x1
+ add x13, x13, x1
+ mov w3, w12
+ b 169b
+
+L(ipred_z1_fill1_tbl):
+ .hword L(ipred_z1_fill1_tbl) - 640b
+ .hword L(ipred_z1_fill1_tbl) - 320b
+ .hword L(ipred_z1_fill1_tbl) - 160b
+ .hword L(ipred_z1_fill1_tbl) - 80b
+ .hword L(ipred_z1_fill1_tbl) - 40b
+endfunc
+
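+// Like ipred_z1_fill1, but top[] is read as interleaved sample pairs:
+// uzp1/uzp2 split them into top[base] and top[base+1].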
+function ipred_z1_fill2_16bpc_neon, export=1
+ cmp w3, #8
+ add x10, x2, w6, uxtw // top[max_base_x]
+ ld1r {v31.16b}, [x10] // padding
+ mov w7, w5
+ mov w15, #64
+ b.eq 8f
+
+4: // w == 4
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 49f
+ lsl w8, w8, #1
+ lsl w10, w10, #1
+ ldr q0, [x2, w8, uxtw] // top[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.4h, w9 // frac
+ dup v5.4h, w11
+ uzp2 v1.8h, v0.8h, v0.8h // top[base+1]
+ uzp1 v0.8h, v0.8h, v0.8h // top[base]
+ uzp2 v3.8h, v2.8h, v2.8h
+ uzp1 v2.8h, v2.8h, v2.8h
+ sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base]
+ sub v7.4h, v3.4h, v2.4h
+ ushll v16.4s, v0.4h, #6 // top[base]*64
+ ushll v17.4s, v2.4h, #6
+ smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac
+ smlal v17.4s, v7.4h, v5.4h
+ rshrn v16.4h, v16.4s, #6
+ rshrn v17.4h, v17.4s, #6
+ st1 {v16.4h}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.4h}, [x0], x1
+ b.gt 4b
+ ret
+
+49:
+ st1 {v31.4h}, [x0], x1
+ subs w4, w4, #2
+ st1 {v31.4h}, [x0], x1
+ b.gt 49b
+ ret
+
+8: // w == 8
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 89f
+ add x8, x2, w8, uxtw #1
+ add x10, x2, w10, uxtw #1
+ dup v4.8h, w9 // frac
+ dup v5.8h, w11
+ ld1 {v0.8h, v1.8h}, [x8] // top[base]
+ ld1 {v2.8h, v3.8h}, [x10]
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.8h, w9 // 64 - frac
+ dup v7.8h, w11
+ uzp2 v20.8h, v0.8h, v1.8h // top[base+1]
+ uzp1 v0.8h, v0.8h, v1.8h // top[base]
+ uzp2 v21.8h, v2.8h, v3.8h
+ uzp1 v2.8h, v2.8h, v3.8h
+ umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac)
+ umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac
+ umull2 v17.4s, v0.8h, v6.8h
+ umlal2 v17.4s, v20.8h, v4.8h
+ umull v18.4s, v2.4h, v7.4h
+ umlal v18.4s, v21.4h, v5.4h
+ umull2 v19.4s, v2.8h, v7.8h
+ umlal2 v19.4s, v21.8h, v5.8h
+ rshrn v16.4h, v16.4s, #6
+ rshrn2 v16.8h, v17.4s, #6
+ rshrn v17.4h, v18.4s, #6
+ rshrn2 v17.8h, v19.4s, #6
+ st1 {v16.8h}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.8h}, [x0], x1
+ b.gt 8b
+ ret
+
+89:
+ st1 {v31.8h}, [x0], x1
+ subs w4, w4, #2
+ st1 {v31.8h}, [x0], x1
+ b.gt 89b
+ ret
+endfunc
+
+// void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src,
+// const int n);
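+// Reverses n 16-bit pixels, eight per iteration: rev64 reverses the
+// halfwords within each 64-bit half, and the two halves are stored in
+// swapped order.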
+function ipred_reverse_16bpc_neon, export=1
+ sub x1, x1, #16
+ add x3, x0, #8
+ mov x4, #16
+1:
+ ld1 {v0.8h}, [x1]
+ subs w2, w2, #8
+ rev64 v0.8h, v0.8h
+ sub x1, x1, #16
+ st1 {v0.d}[1], [x0], x4
+ st1 {v0.d}[0], [x3], x4
+ b.gt 1b
+ ret
+endfunc
+
+const increments
+ .short 0, 1, 2, 3, 4, 5, 6, 7
+endconst
+
+// void ipred_z2_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const top,
+// const pixel *const left,
+// const int width, const int height,
+// const int dx, const int dy);
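+// Combined directional prediction: pixels whose per-pixel base_x is
+// non-negative are predicted from top[] (as in z1, using the dx step),
+// the others from left[] (using the dy step); the two candidates are
+// blended per pixel with cmge + bit on the sign of base_x.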
+function ipred_z2_fill1_16bpc_neon, export=1
+ clz w10, w4
+ adr x9, L(ipred_z2_fill1_tbl)
+ sub w10, w10, #25
+ ldrh w10, [x9, w10, uxtw #1]
+ mov w8, #(1 << 6) // xpos = 1 << 6
+ sub x9, x9, w10, uxtw
+ sub w8, w8, w6 // xpos -= dx
+
+ movrel x11, increments
+ ld1 {v31.8h}, [x11] // increments
+ neg w7, w7 // -dy
+
+ br x9
+40:
+ AARCH64_VALID_JUMP_TARGET
+
+ dup v30.4h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
+ movi v25.8h, #0x3e
+ add v30.4h, v16.4h, v30.4h // -= dy
+
+ // Worst case height for w=4 is 16, but we need at least h+1 elements
+ ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[]
+
+ movi v26.8h, #64
+ movi v19.16b, #4
+
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v30.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
+
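+ // Build byte indices for tbl: each base_y is doubled and expanded to
+ // the byte pair (2*base, 2*base+1) so that tbl gathers whole 16-bit
+ // left[] entries.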
+ movi v23.4h, #1, lsl #8
+ shl v29.8b, v29.8b, #1 // 2*base_y
+ zip1 v29.8b, v29.8b, v29.8b // duplicate elements
+ movi v17.8b, #2
+ add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ...
+
+ add v30.8b, v29.8b, v17.8b // base_y + 1 (*2)
+ add v28.8b, v29.8b, v19.8b // base_y + 2 (*2)
+
+ tbl v18.8b, {v0.16b}, v29.8b // left[base_y]
+
+ trn1 v30.2d, v30.2d, v28.2d // base_y + 1, base_y + 2
+
+ sub v28.4h, v26.4h, v27.4h // 64 - frac_y
+
+ trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3}
+
+ trn1 v27.2d, v27.2d, v27.2d // frac_y
+ trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y
+
+ movi v29.16b, #4
+4:
+ asr w9, w8, #6 // base_x
+ dup v16.4h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-4 // base_x <= -4
+ asr w11, w8, #6 // base_x
+ b.le 49f
+
+ lsl w9, w9, #1
+ lsl w11, w11, #1
+
+ dup v17.4h, w8 // xpos
+
+ ldr q4, [x2, w9, sxtw] // top[base_x]
+ ldr q6, [x2, w11, sxtw]
+
+ trn1 v16.2d, v16.2d, v17.2d // xpos
+
+ // Cut corners here; only doing tbl over v0-v1, as the last pixel,
+ // from v2, only seems to be needed after skipping to the left-only
+ // codepath below.
+ tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
+
+ sshr v20.8h, v16.8h, #6 // first base_x for each row
+
+ ext v5.16b, v4.16b, v4.16b, #2 // top[base_x+1]
+ ext v7.16b, v6.16b, v6.16b, #2
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+
+ trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1]
+
+ trn1 v4.2d, v4.2d, v6.2d // top[base_x]
+ trn1 v5.2d, v5.2d, v7.2d // top[base_x+1]
+
+ sub v17.8h, v26.8h, v16.8h // 64 - frac_x
+
+ add v20.8h, v20.8h, v31.8h // actual base_x
+
+ umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v22.4s, v18.8h, v28.8h
+ umlal2 v22.4s, v19.8h, v27.8h
+
+ umull v23.4s, v4.4h, v17.4h // top[base_x]*(64-frac_x)
+ umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
+ umull2 v24.4s, v4.8h, v17.8h
+ umlal2 v24.4s, v5.8h, v16.8h
+
+ cmge v20.8h, v20.8h, #0
+
+ rshrn v21.4h, v21.4s, #6
+ rshrn2 v21.8h, v22.4s, #6
+ rshrn v22.4h, v23.4s, #6
+ rshrn2 v22.8h, v24.4s, #6
+
+ bit v21.16b, v22.16b, v20.16b
+
+ st1 {v21.d}[0], [x0], x1
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2
+ st1 {v21.d}[1], [x0], x1
+ b.le 9f
+
+ ext v18.16b, v19.16b, v19.16b, #8
+ add v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
+ b 4b
+
+49:
+ tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+2]
+
+ trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1]
+
+ umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v21.4s, v18.8h, v28.8h
+ umlal2 v21.4s, v19.8h, v27.8h
+
+ rshrn v20.4h, v20.4s, #6
+ rshrn2 v20.8h, v21.4s, #6
+
+ st1 {v20.d}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v20.d}[1], [x0], x1
+ b.le 9f
+
+ ext v18.16b, v19.16b, v19.16b, #8
+ add v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
+ b 49b
+
+9:
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ dup v18.8h, w7 // -dy
+ add x3, x3, #2 // Skip past left[0]
+
+ mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy
+ movi v25.8h, #0x3e
+ add v16.8h, v16.8h, v18.8h // -= dy
+
+ // Worst case height for w=8 is 32.
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[]
+ ld1r {v15.8h}, [x2] // left[0] == top[0]
+
+ movi v26.8h, #64
+ movi v19.16b, #4
+
+ shrn v29.8b, v16.8h, #6 // ypos >> 6
+ and v27.16b, v16.16b, v25.16b // frac_y
+
+ movi v23.8h, #1, lsl #8
+ shl v29.8b, v29.8b, #1 // 2*base_y
+ mov v18.16b, v15.16b // left[0]
+ zip1 v29.16b, v29.16b, v29.16b // duplicate elements
+ movi v17.16b, #2
+ add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
+
+ // Cut corners here; for the first row we don't expect to need to
+ // read outside of v0.
+ tbx v18.16b, {v0.16b}, v29.16b // left[base_y]
+
+ add v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
+ add v29.16b, v29.16b, v17.16b // base_y + 1 (*2)
+
+ sub v28.8h, v26.8h, v27.8h // 64 - frac_y
+
+ movi v24.16b, #4
+8:
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-16 // base_x <= -16
+ asr w11, w8, #6 // base_x
+ b.le 89f
+
+ dup v17.8h, w8 // xpos
+
+ add x9, x2, w9, sxtw #1
+ add x11, x2, w11, sxtw #1
+
+ ld1 {v4.8h, v5.8h}, [x9] // top[base_x]
+ mov v19.16b, v15.16b // left[0]
+ ld1 {v6.8h, v7.8h}, [x11]
+
+ tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
+
+ mov v20.16b, v15.16b // left[0]
+
+ sshr v21.8h, v16.8h, #6 // first base_x
+ sshr v22.8h, v17.8h, #6
+
+ tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
+
+ ext v5.16b, v4.16b, v5.16b, #2 // top[base_x+1]
+ ext v7.16b, v6.16b, v7.16b, #2
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+ and v17.16b, v17.16b, v25.16b
+
+ umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+
+ sub v8.8h, v26.8h, v16.8h // 64 - frac_x
+ sub v9.8h, v26.8h, v17.8h
+
+ umull2 v11.4s, v18.8h, v28.8h
+ umlal2 v11.4s, v19.8h, v27.8h
+
+ add v21.8h, v21.8h, v31.8h // actual base_x
+ add v22.8h, v22.8h, v31.8h
+
+ umull v12.4s, v19.4h, v28.4h
+ umlal v12.4s, v20.4h, v27.4h
+ umull2 v13.4s, v19.8h, v28.8h
+ umlal2 v13.4s, v20.8h, v27.8h
+
+ rshrn v10.4h, v10.4s, #6
+ rshrn2 v10.8h, v11.4s, #6
+ rshrn v11.4h, v12.4s, #6
+ rshrn2 v11.8h, v13.4s, #6
+
+ umull v12.4s, v4.4h, v8.4h // top[base_x]*(64-frac_x)
+ umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
+ umull2 v13.4s, v4.8h, v8.8h
+ umlal2 v13.4s, v5.8h, v16.8h
+ umull v14.4s, v6.4h, v9.4h
+ umlal v14.4s, v7.4h, v17.4h
+ umull2 v18.4s, v6.8h, v9.8h
+ umlal2 v18.4s, v7.8h, v17.8h
+
+ cmge v21.8h, v21.8h, #0
+ cmge v22.8h, v22.8h, #0
+
+ rshrn v12.4h, v12.4s, #6
+ rshrn2 v12.8h, v13.4s, #6
+ rshrn v13.4h, v14.4s, #6
+ rshrn2 v13.8h, v18.4s, #6
+
+ bit v10.16b, v12.16b, v21.16b
+ bit v11.16b, v13.16b, v22.16b
+
+ st1 {v10.8h}, [x0], x1
+ subs w5, w5, #2
+ sub w8, w8, w6 // xpos -= dx
+ st1 {v11.8h}, [x0], x1
+ b.le 9f
+
+ mov v18.16b, v20.16b
+ add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
+ add v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
+ b 8b
+
+89:
+ mov v19.16b, v15.16b
+ mov v20.16b, v15.16b
+ tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
+ tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
+
+ umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v5.4s, v18.8h, v28.8h
+ umlal2 v5.4s, v19.8h, v27.8h
+ umull v6.4s, v19.4h, v28.4h
+ umlal v6.4s, v20.4h, v27.4h
+ umull2 v7.4s, v19.8h, v28.8h
+ umlal2 v7.4s, v20.8h, v27.8h
+
+ rshrn v4.4h, v4.4s, #6
+ rshrn2 v4.8h, v5.4s, #6
+ rshrn v5.4h, v6.4s, #6
+ rshrn2 v5.8h, v7.4s, #6
+
+ st1 {v4.8h}, [x0], x1
+ subs w5, w5, #2
+ st1 {v5.8h}, [x0], x1
+ b.le 9f
+
+ mov v18.16b, v20.16b
+ add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
+ add v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
+ b 89b
+
+9:
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ dup v25.8h, w7 // -dy
+ add x3, x3, #2 // Skip past left[0]
+
+ add x13, x0, x1 // alternating row
+ lsl x1, x1, #1 // stride *= 2
+ sub x1, x1, w4, uxtw #1 // stride -= width
+
+ movi v11.8h, #8
+ mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy
+ add v26.8h, v26.8h, v25.8h // -= dy
+ mul v25.8h, v25.8h, v11.8h // -8*dy
+
+ // Worst case height is 64, but we can only fit 32 pixels into v0-v3
+ // for use within one tbx instruction. As long as base_y stays below
+ // 32, we use tbx; otherwise we fall back to individual element loads.
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[]
+ ld1r {v15.8h}, [x2] // left[0] == top[0]
+
+ mov w12, w4 // orig w
+ neg w14, w4 // -w
+
+1:
+ mov v23.16b, v26.16b // reset ypos
+
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, w14 // base_x <= -w
+ asr w11, w8, #6 // base_x
+ b.le 169f
+
+ dup v17.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+
+ add x9, x2, w9, sxtw #1
+ add x11, x2, w11, sxtw #1
+
+ sshr v21.8h, v16.8h, #6 // first base_x
+ sshr v22.8h, v17.8h, #6
+
+ ld1 {v4.8h}, [x9], #16 // top[base_x]
+ ld1 {v6.8h}, [x11], #16
+
+ movi v10.8h, #0x3e
+ movi v11.8h, #64
+
+ and v16.16b, v16.16b, v10.16b // frac_x
+ and v17.16b, v17.16b, v10.16b
+
+ sub v8.8h, v11.8h, v16.8h // 64 - frac_x
+ sub v9.8h, v11.8h, v17.8h
+
+ add v21.8h, v21.8h, v31.8h // actual base_x
+ add v22.8h, v22.8h, v31.8h
+
+2:
+ smov w10, v22.h[0]
+
+ shrn v29.8b, v23.8h, #6 // ypos >> 6
+ movi v12.8h, #64
+ cmp w10, #0 // base_x (bottom left) >= 0
+ smov w10, v29.b[0] // base_y[0]
+ movi v10.8h, #0x3e
+
+ b.ge 4f
+ and v27.16b, v23.16b, v10.16b // frac_y
+ cmp w10, #(32-3)
+
+ mov v18.16b, v15.16b // left[0]
+ sub v28.8h, v12.8h, v27.8h // 64 - frac_y
+ b.gt 22f
+
+21:
+ // base_y < 32, using tbx
+ shl v29.8b, v29.8b, #1 // 2*base_y
+ movi v11.8h, #1, lsl #8
+ zip1 v29.16b, v29.16b, v29.16b // duplicate elements
+ add v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ...
+
+ movi v13.16b, #2
+
+ tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
+
+ add v29.16b, v29.16b, v13.16b // base_y + 1 (*2)
+ mov v19.16b, v15.16b // left[0]
+
+ tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
+
+ add v29.16b, v29.16b, v13.16b // base_y + 2 (*2)
+ mov v20.16b, v15.16b // left[0]
+
+ tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
+
+ b 23f
+
+22:
+ // base_y >= 32, using separate loads.
+ smov w15, v29.b[1]
+ smov w16, v29.b[2]
+ add x10, x3, w10, sxtw #1
+ smov w17, v29.b[3]
+ add x15, x3, w15, sxtw #1
+ ld3 {v18.h, v19.h, v20.h}[0], [x10]
+ smov w10, v29.b[4]
+ add x16, x3, w16, sxtw #1
+ ld3 {v18.h, v19.h, v20.h}[1], [x15]
+ smov w15, v29.b[5]
+ add x17, x3, w17, sxtw #1
+ ld3 {v18.h, v19.h, v20.h}[2], [x16]
+ smov w16, v29.b[6]
+ add x10, x3, w10, sxtw #1
+ ld3 {v18.h, v19.h, v20.h}[3], [x17]
+ smov w17, v29.b[7]
+ add x15, x3, w15, sxtw #1
+ add x16, x3, w16, sxtw #1
+ ld3 {v18.h, v19.h, v20.h}[4], [x10]
+ add x17, x3, w17, sxtw #1
+ ld3 {v18.h, v19.h, v20.h}[5], [x15]
+ ld3 {v18.h, v19.h, v20.h}[6], [x16]
+ ld3 {v18.h, v19.h, v20.h}[7], [x17]
+
+23:
+
+ ld1 {v5.8h}, [x9], #16 // top[base_x]
+ ld1 {v7.8h}, [x11], #16
+
+ add v23.8h, v23.8h, v25.8h // ypos -= 8*dy
+
+ umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v11.4s, v18.8h, v28.8h
+ umlal2 v11.4s, v19.8h, v27.8h
+ umull v12.4s, v19.4h, v28.4h
+ umlal v12.4s, v20.4h, v27.4h
+ umull2 v13.4s, v19.8h, v28.8h
+ umlal2 v13.4s, v20.8h, v27.8h
+
+ ext v18.16b, v4.16b, v5.16b, #2 // top[base_x+1]
+ ext v19.16b, v6.16b, v7.16b, #2
+
+ rshrn v10.4h, v10.4s, #6
+ rshrn2 v10.8h, v11.4s, #6
+ rshrn v11.4h, v12.4s, #6
+ rshrn2 v11.8h, v13.4s, #6
+
+ umull v12.4s, v4.4h, v8.4h // top[base_x]*(64-frac_x)
+ umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x
+ umull2 v13.4s, v4.8h, v8.8h
+ umlal2 v13.4s, v18.8h, v16.8h
+ umull v14.4s, v6.4h, v9.4h
+ umlal v14.4s, v19.4h, v17.4h
+ umull2 v20.4s, v6.8h, v9.8h
+ umlal2 v20.4s, v19.8h, v17.8h
+
+ cmge v18.8h, v21.8h, #0
+ cmge v19.8h, v22.8h, #0
+
+ rshrn v12.4h, v12.4s, #6
+ rshrn2 v12.8h, v13.4s, #6
+ rshrn v13.4h, v14.4s, #6
+ rshrn2 v13.8h, v20.4s, #6
+
+ bit v10.16b, v12.16b, v18.16b
+ bit v11.16b, v13.16b, v19.16b
+
+ st1 {v10.8h}, [x0], #16
+ subs w4, w4, #8
+ st1 {v11.8h}, [x13], #16
+ b.le 3f
+
+ movi v10.8h, #8
+ mov v4.16b, v5.16b
+ mov v6.16b, v7.16b
+ add v21.8h, v21.8h, v10.8h // base_x += 8
+ add v22.8h, v22.8h, v10.8h
+ b 2b
+
+3:
+ subs w5, w5, #2
+ b.le 9f
+ movi v10.8h, #128
+ add x0, x0, x1
+ add x13, x13, x1
+ mov w4, w12 // reset w
+ add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6)
+ b 1b
+
+4: // The rest of the row only predicted from top[]
+ ld1 {v5.8h}, [x9], #16 // top[base_x]
+ ld1 {v7.8h}, [x11], #16
+
+ ext v18.16b, v4.16b, v5.16b, #2 // top[base_x+1]
+ ext v19.16b, v6.16b, v7.16b, #2
+
+ umull v12.4s, v4.4h, v8.4h // top[base_x]*(64-frac_x)
+ umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x
+ umull2 v13.4s, v4.8h, v8.8h
+ umlal2 v13.4s, v18.8h, v16.8h
+ umull v14.4s, v6.4h, v9.4h
+ umlal v14.4s, v19.4h, v17.4h
+ umull2 v20.4s, v6.8h, v9.8h
+ umlal2 v20.4s, v19.8h, v17.8h
+
+ rshrn v12.4h, v12.4s, #6
+ rshrn2 v12.8h, v13.4s, #6
+ rshrn v13.4h, v14.4s, #6
+ rshrn2 v13.8h, v20.4s, #6
+
+ st1 {v12.8h}, [x0], #16
+ subs w4, w4, #8
+ st1 {v13.8h}, [x13], #16
+ b.le 3b
+
+ mov v4.16b, v5.16b
+ mov v6.16b, v7.16b
+ b 4b
+
+169: // The rest of the block only predicted from left[]
+ add x1, x1, w4, uxtw #1 // restore stride
+ mov w12, w5 // orig remaining h
+1:
+ movi v12.8h, #64
+ movi v10.8h, #0x3e
+
+ shrn v29.8b, v23.8h, #6 // ypos >> 6
+ and v27.16b, v23.16b, v10.16b // frac_y
+
+ smov w10, v29.b[0] // base_y[0]
+
+ shl v29.8b, v29.8b, #1 // 2*base_y
+ movi v11.8h, #1, lsl #8
+ zip1 v29.16b, v29.16b, v29.16b // duplicate elements
+ add v23.8h, v23.8h, v25.8h // ypos -= 8*dy
+ add v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ...
+
+ cmp w10, #(32-1)
+
+ mov v18.16b, v15.16b // left[0]
+ movi v21.16b, #2
+
+ sub v28.8h, v12.8h, v27.8h // 64 - frac_y
+
+ b.gt 31f
+
+ tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
+ add v29.16b, v29.16b, v21.16b // base_y + 1 (*2)
+
+2:
+ // base_y < 32, using tbx.
+ smov w10, v29.b[0] // base_y[0]
+ mov v19.16b, v15.16b // left[0]
+ cmp w10, #(64-4)
+ b.gt 32f
+ tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
+ add v29.16b, v29.16b, v21.16b // base_y + 2 (*2)
+ mov v20.16b, v15.16b // left[0]
+ tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
+ add v29.16b, v29.16b, v21.16b // next base_y
+
+ umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v11.4s, v18.8h, v28.8h
+ umlal2 v11.4s, v19.8h, v27.8h
+ umull v12.4s, v19.4h, v28.4h
+ umlal v12.4s, v20.4h, v27.4h
+ umull2 v13.4s, v19.8h, v28.8h
+ umlal2 v13.4s, v20.8h, v27.8h
+
+ rshrn v10.4h, v10.4s, #6
+ rshrn2 v10.8h, v11.4s, #6
+ rshrn v11.4h, v12.4s, #6
+ rshrn2 v11.8h, v13.4s, #6
+
+ st1 {v10.8h}, [x0], x1
+ subs w5, w5, #2
+ st1 {v11.8h}, [x13], x1
+ b.le 4f
+ mov v18.16b, v20.16b
+ b 2b
+
+31: // base_y >= 32, using separate loads, loading v18 if we had to bail
+ // in the prologue.
+ smov w10, v29.b[0]
+ smov w15, v29.b[2]
+ movi v21.16b, #2
+ smov w16, v29.b[4]
+ add x10, x3, w10, sxtw
+ smov w17, v29.b[6]
+ add x15, x3, w15, sxtw
+ ld1 {v18.h}[0], [x10]
+ smov w10, v29.b[8]
+ add x16, x3, w16, sxtw
+ ld1 {v18.h}[1], [x15]
+ smov w15, v29.b[10]
+ add x17, x3, w17, sxtw
+ ld1 {v18.h}[2], [x16]
+ smov w16, v29.b[12]
+ add x10, x3, w10, sxtw
+ ld1 {v18.h}[3], [x17]
+ smov w17, v29.b[14]
+ add x15, x3, w15, sxtw
+ add x16, x3, w16, sxtw
+ ld1 {v18.h}[4], [x10]
+ add x17, x3, w17, sxtw
+ ld1 {v18.h}[5], [x15]
+ add v29.16b, v29.16b, v21.16b // next base_y
+ ld1 {v18.h}[6], [x16]
+ ld1 {v18.h}[7], [x17]
+
+32: // base_y >= 32, using separate loads.
+ cmp w5, #4
+ b.lt 34f
+33: // h >= 4, preserving v18 from the previous round, loading v19-v22.
+ smov w10, v29.b[0]
+ subs w5, w5, #4
+ smov w15, v29.b[2]
+ movi v10.16b, #8
+ smov w16, v29.b[4]
+ add x10, x3, w10, sxtw
+ smov w17, v29.b[6]
+ add x15, x3, w15, sxtw
+ ld4 {v19.h, v20.h, v21.h, v22.h}[0], [x10]
+ smov w10, v29.b[8]
+ add x16, x3, w16, sxtw
+ ld4 {v19.h, v20.h, v21.h, v22.h}[1], [x15]
+ smov w15, v29.b[10]
+ add x17, x3, w17, sxtw
+ ld4 {v19.h, v20.h, v21.h, v22.h}[2], [x16]
+ smov w16, v29.b[12]
+ add x10, x3, w10, sxtw
+ ld4 {v19.h, v20.h, v21.h, v22.h}[3], [x17]
+ smov w17, v29.b[14]
+ add x15, x3, w15, sxtw
+ add x16, x3, w16, sxtw
+ ld4 {v19.h, v20.h, v21.h, v22.h}[4], [x10]
+ add x17, x3, w17, sxtw
+ ld4 {v19.h, v20.h, v21.h, v22.h}[5], [x15]
+ ld4 {v19.h, v20.h, v21.h, v22.h}[6], [x16]
+ add v29.16b, v29.16b, v10.16b // next base_y
+ ld4 {v19.h, v20.h, v21.h, v22.h}[7], [x17]
+
+ umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v11.4s, v18.8h, v28.8h
+ umlal2 v11.4s, v19.8h, v27.8h
+ umull v12.4s, v19.4h, v28.4h
+ umlal v12.4s, v20.4h, v27.4h
+ umull2 v13.4s, v19.8h, v28.8h
+ umlal2 v13.4s, v20.8h, v27.8h
+
+ rshrn v10.4h, v10.4s, #6
+ rshrn2 v10.8h, v11.4s, #6
+ rshrn v11.4h, v12.4s, #6
+ rshrn2 v11.8h, v13.4s, #6
+
+ umull v12.4s, v20.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v12.4s, v21.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v13.4s, v20.8h, v28.8h
+ umlal2 v13.4s, v21.8h, v27.8h
+ umull v14.4s, v21.4h, v28.4h
+ umlal v14.4s, v22.4h, v27.4h
+ umull2 v18.4s, v21.8h, v28.8h
+ umlal2 v18.4s, v22.8h, v27.8h
+
+ rshrn v12.4h, v12.4s, #6
+ rshrn2 v12.8h, v13.4s, #6
+ rshrn v13.4h, v14.4s, #6
+ rshrn2 v13.8h, v18.4s, #6
+
+ st1 {v10.8h}, [x0], x1
+ cmp w5, #2
+ st1 {v11.8h}, [x13], x1
+ st1 {v12.8h}, [x0], x1
+ st1 {v13.8h}, [x13], x1
+ b.lt 4f
+ mov v18.16b, v22.16b
+ b.gt 33b
+
+34: // h == 2, preserving v18 from the previous round, loading v19-v20.
+ smov w10, v29.b[0]
+ smov w15, v29.b[2]
+ movi v21.16b, #4
+ smov w16, v29.b[4]
+ add x10, x3, w10, sxtw
+ smov w17, v29.b[6]
+ add x15, x3, w15, sxtw
+ ld2 {v19.h, v20.h}[0], [x10]
+ smov w10, v29.b[8]
+ add x16, x3, w16, sxtw
+ ld2 {v19.h, v20.h}[1], [x15]
+ smov w15, v29.b[10]
+ add x17, x3, w17, sxtw
+ ld2 {v19.h, v20.h}[2], [x16]
+ smov w16, v29.b[12]
+ add x10, x3, w10, sxtw
+ ld2 {v19.h, v20.h}[3], [x17]
+ smov w17, v29.b[14]
+ add x15, x3, w15, sxtw
+ add x16, x3, w16, sxtw
+ ld2 {v19.h, v20.h}[4], [x10]
+ add x17, x3, w17, sxtw
+ ld2 {v19.h, v20.h}[5], [x15]
+ ld2 {v19.h, v20.h}[6], [x16]
+ add v29.16b, v29.16b, v21.16b // next base_y
+ ld2 {v19.h, v20.h}[7], [x17]
+
+ umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v11.4s, v18.8h, v28.8h
+ umlal2 v11.4s, v19.8h, v27.8h
+ umull v12.4s, v19.4h, v28.4h
+ umlal v12.4s, v20.4h, v27.4h
+ umull2 v13.4s, v19.8h, v28.8h
+ umlal2 v13.4s, v20.8h, v27.8h
+
+ rshrn v10.4h, v10.4s, #6
+ rshrn2 v10.8h, v11.4s, #6
+ rshrn v11.4h, v12.4s, #6
+ rshrn2 v11.8h, v13.4s, #6
+
+ st1 {v10.8h}, [x0], x1
+ st1 {v11.8h}, [x13], x1
+ // The h==2 case only happens once at the end, if at all.
+
+4:
+ subs w4, w4, #8
+ b.le 9f
+
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ lsl x1, x1, #1
+ add x0, x0, #16
+ add x13, x13, #16
+ mov w5, w12 // reset h
+ b 1b
+
+9:
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+
+L(ipred_z2_fill1_tbl):
+ .hword L(ipred_z2_fill1_tbl) - 640b
+ .hword L(ipred_z2_fill1_tbl) - 320b
+ .hword L(ipred_z2_fill1_tbl) - 160b
+ .hword L(ipred_z2_fill1_tbl) - 80b
+ .hword L(ipred_z2_fill1_tbl) - 40b
+endfunc
+
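+// Same as ipred_z2_fill1, but for upsample_top: top[] holds interleaved
+// sample pairs (split with uzp1/uzp2) and the per-pixel base_x
+// increments are doubled.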
+function ipred_z2_fill2_16bpc_neon, export=1
+ cmp w4, #8
+ mov w8, #(2 << 6) // xpos = 2 << 6
+ sub w8, w8, w6 // xpos -= dx
+
+ movrel x11, increments
+ ld1 {v31.8h}, [x11] // increments
+ neg w7, w7 // -dy
+ b.eq 80f
+
+40:
+ dup v30.4h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
+ movi v25.8h, #0x3e
+ add v30.4h, v16.4h, v30.4h // -= dy
+
+ // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
+ // from left.
+ ld1 {v0.8h, v1.8h}, [x3] // left[]
+
+ movi v26.8h, #64
+ movi v19.16b, #4
+
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v30.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
+
+ movi v23.4h, #1, lsl #8
+ shl v29.8b, v29.8b, #1 // 2*base_y
+ zip1 v29.8b, v29.8b, v29.8b // duplicate elements
+ movi v17.8b, #2
+ add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ...
+
+ add v30.8b, v29.8b, v17.8b // base_y + 1 (*2)
+ add v28.8b, v29.8b, v19.8b // base_y + 2 (*2)
+
+ tbl v18.8b, {v0.16b}, v29.8b // left[base_y]
+
+ trn1 v30.2d, v30.2d, v28.2d // base_y + 1, base_y + 2
+
+ sub v28.4h, v26.4h, v27.4h // 64 - frac_y
+
+ trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3}
+
+ trn1 v27.2d, v27.2d, v27.2d // frac_y
+ trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y
+
+ movi v29.16b, #4
+ add v31.8h, v31.8h, v31.8h // {0,2,4,6,0,2,4,6}
+4:
+ asr w9, w8, #6 // base_x
+ dup v16.4h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-8 // base_x <= -8
+ asr w11, w8, #6 // base_x
+ b.le 49f
+
+ lsl w9, w9, #1
+ lsl w11, w11, #1
+
+ dup v17.4h, w8 // xpos
+
+ ldr q4, [x2, w9, sxtw] // top[base_x]
+ ldr q6, [x2, w11, sxtw]
+
+ trn1 v16.2d, v16.2d, v17.2d // xpos
+
+ tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
+
+ sshr v20.8h, v16.8h, #6 // first base_x for each row
+
+ uzp2 v5.8h, v4.8h, v6.8h // top[base_x+1]
+ uzp1 v4.8h, v4.8h, v6.8h // top[base_x]
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+
+ trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1]
+
+ sub v17.8h, v26.8h, v16.8h // 64 - frac_x
+
+ add v20.8h, v20.8h, v31.8h // actual base_x
+
+ umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v22.4s, v18.8h, v28.8h
+ umlal2 v22.4s, v19.8h, v27.8h
+
+ umull v23.4s, v4.4h, v17.4h // top[base_x]*(64-frac_x)
+ umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
+ umull2 v24.4s, v4.8h, v17.8h
+ umlal2 v24.4s, v5.8h, v16.8h
+
+ cmge v20.8h, v20.8h, #0
+
+ rshrn v21.4h, v21.4s, #6
+ rshrn2 v21.8h, v22.4s, #6
+ rshrn v22.4h, v23.4s, #6
+ rshrn2 v22.8h, v24.4s, #6
+
+ bit v21.16b, v22.16b, v20.16b
+
+ st1 {v21.d}[0], [x0], x1
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2
+ st1 {v21.d}[1], [x0], x1
+ b.le 9f
+
+ ext v18.16b, v19.16b, v19.16b, #8
+ add v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
+ b 4b
+
+49:
+ tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
+
+ trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1]
+
+ umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v21.4s, v18.8h, v28.8h
+ umlal2 v21.4s, v19.8h, v27.8h
+
+ rshrn v20.4h, v20.4s, #6
+ rshrn2 v20.8h, v21.4s, #6
+
+ st1 {v20.d}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v20.d}[1], [x0], x1
+ b.le 9f
+
+ ext v18.16b, v19.16b, v19.16b, #8
+ add v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
+ b 49b
+
+9:
+ ret
+
+80:
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ dup v18.8h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy
+ movi v25.8h, #0x3e
+ add v16.8h, v16.8h, v18.8h // -= dy
+
+ // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
+ // from left.
+ ld1 {v0.8h, v1.8h}, [x3] // left[]
+
+ movi v26.8h, #64
+ movi v19.16b, #4
+
+ shrn v29.8b, v16.8h, #6 // ypos >> 6
+ and v27.16b, v16.16b, v25.16b // frac_y
+
+ add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
+
+ movi v23.8h, #1, lsl #8
+ shl v29.8b, v29.8b, #1 // 2*base_y
+ zip1 v29.16b, v29.16b, v29.16b // duplicate elements
+ movi v17.16b, #2
+ add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
+
+ // Cut corners here; for the first row we don't expect to need to
+ // read outside of v0.
+ tbl v18.16b, {v0.16b}, v29.16b // left[base_y]
+
+ add v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
+ add v29.16b, v29.16b, v17.16b // base_y + 1 (*2)
+
+ sub v28.8h, v26.8h, v27.8h // 64 - frac_y
+
+ movi v24.16b, #4
+ add v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14}
+8:
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-16 // base_x <= -16
+ asr w11, w8, #6 // base_x
+ b.le 89f
+
+ dup v17.8h, w8 // xpos
+
+ add x9, x2, w9, sxtw #1
+ add x11, x2, w11, sxtw #1
+
+ ld1 {v4.8h, v5.8h}, [x9] // top[base_x]
+ ld1 {v6.8h, v7.8h}, [x11]
+
+ tbl v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]
+
+ sshr v21.8h, v16.8h, #6 // first base_x
+ sshr v22.8h, v17.8h, #6
+
+ tbl v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]
+
+ uzp2 v2.8h, v4.8h, v5.8h // top[base_x+1]
+ uzp1 v4.8h, v4.8h, v5.8h // top[base_x]
+ uzp2 v3.8h, v6.8h, v7.8h
+ uzp1 v6.8h, v6.8h, v7.8h
+ mov v5.16b, v2.16b
+ mov v7.16b, v3.16b
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+ and v17.16b, v17.16b, v25.16b
+
+ umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+
+ sub v8.8h, v26.8h, v16.8h // 64 - frac_x
+ sub v9.8h, v26.8h, v17.8h
+
+ umull2 v11.4s, v18.8h, v28.8h
+ umlal2 v11.4s, v19.8h, v27.8h
+
+ add v21.8h, v21.8h, v31.8h // actual base_x
+ add v22.8h, v22.8h, v31.8h
+
+ umull v12.4s, v19.4h, v28.4h
+ umlal v12.4s, v20.4h, v27.4h
+ umull2 v13.4s, v19.8h, v28.8h
+ umlal2 v13.4s, v20.8h, v27.8h
+
+ rshrn v10.4h, v10.4s, #6
+ rshrn2 v10.8h, v11.4s, #6
+ rshrn v11.4h, v12.4s, #6
+ rshrn2 v11.8h, v13.4s, #6
+
+ umull v12.4s, v4.4h, v8.4h // top[base_x]*(64-frac_x)
+ umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
+ umull2 v13.4s, v4.8h, v8.8h
+ umlal2 v13.4s, v5.8h, v16.8h
+ umull v14.4s, v6.4h, v9.4h
+ umlal v14.4s, v7.4h, v17.4h
+ umull2 v18.4s, v6.8h, v9.8h
+ umlal2 v18.4s, v7.8h, v17.8h
+
+ cmge v21.8h, v21.8h, #0
+ cmge v22.8h, v22.8h, #0
+
+ rshrn v12.4h, v12.4s, #6
+ rshrn2 v12.8h, v13.4s, #6
+ rshrn v13.4h, v14.4s, #6
+ rshrn2 v13.8h, v18.4s, #6
+
+ bit v10.16b, v12.16b, v21.16b
+ bit v11.16b, v13.16b, v22.16b
+
+ st1 {v10.8h}, [x0], x1
+ subs w5, w5, #2
+ sub w8, w8, w6 // xpos -= dx
+ st1 {v11.8h}, [x0], x1
+ b.le 9f
+
+ mov v18.16b, v20.16b
+ add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
+ add v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
+ b 8b
+
+89:
+ tbl v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]
+ tbl v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]
+
+ umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v5.4s, v18.8h, v28.8h
+ umlal2 v5.4s, v19.8h, v27.8h
+ umull v6.4s, v19.4h, v28.4h
+ umlal v6.4s, v20.4h, v27.4h
+ umull2 v7.4s, v19.8h, v28.8h
+ umlal2 v7.4s, v20.8h, v27.8h
+
+ rshrn v4.4h, v4.4s, #6
+ rshrn2 v4.8h, v5.4s, #6
+ rshrn v5.4h, v6.4s, #6
+ rshrn2 v5.8h, v7.4s, #6
+
+ st1 {v4.8h}, [x0], x1
+ subs w5, w5, #2
+ st1 {v5.8h}, [x0], x1
+ b.le 9f
+
+ mov v18.16b, v20.16b
+ add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
+ add v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
+ b 89b
+
+9:
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+endfunc
+
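+// Same as ipred_z2_fill1, but for upsample_left: left[] holds
+// interleaved sample pairs, so base_y starts at (ypos >> 6) + 2 and
+// advances by 2 per row.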
+function ipred_z2_fill3_16bpc_neon, export=1
+ cmp w4, #8
+ mov w8, #(1 << 6) // xpos = 1 << 6
+ sub w8, w8, w6 // xpos -= dx
+
+ movrel x11, increments
+ ld1 {v31.8h}, [x11] // increments
+ neg w7, w7 // -dy
+ b.eq 80f
+
+40:
+ dup v30.4h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
+ movi v25.8h, #0x3e
+ add v30.4h, v16.4h, v30.4h // -= dy
+
+ // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
+ ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[]
+
+ movi v26.8h, #64
+ movi v19.16b, #2
+
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v30.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2
+
+ movi v23.4h, #1, lsl #8
+ shl v29.8b, v29.8b, #1 // 2*base_y
+ movi v19.16b, #4
+ zip1 v29.8b, v29.8b, v29.8b // duplicate elements
+ movi v17.8b, #2
+ add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ...
+
+ add v30.8b, v29.8b, v17.8b // base_y + 1 (*2)
+ add v28.8b, v29.8b, v19.8b // base_y + 2 (*2)
+
+ trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3}
+
+ add v24.8b, v30.8b, v19.8b // base_y + 3 (*2)
+
+ trn1 v29.2d, v29.2d, v28.2d // base_y + 0, base_y + 2
+ trn1 v30.2d, v30.2d, v24.2d // base_y + 1, base_y + 3
+
+ sub v28.4h, v26.4h, v27.4h // 64 - frac_y
+
+ trn1 v27.2d, v27.2d, v27.2d // frac_y
+ trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y
+
+ movi v24.16b, #8
+4:
+ asr w9, w8, #6 // base_x
+ dup v16.4h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-4 // base_x <= -4
+ asr w11, w8, #6 // base_x
+ b.le 49f
+
+ lsl w9, w9, #1
+ lsl w11, w11, #1
+
+ dup v17.4h, w8 // xpos
+
+ ldr q4, [x2, w9, sxtw] // top[base_x]
+ ldr q6, [x2, w11, sxtw]
+
+ trn1 v16.2d, v16.2d, v17.2d // xpos
+
+ tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
+ tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]
+
+ sshr v20.8h, v16.8h, #6 // first base_x for each row
+
+ ext v5.16b, v4.16b, v4.16b, #2 // top[base_x+1]
+ ext v7.16b, v6.16b, v6.16b, #2
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+
+ trn1 v4.2d, v4.2d, v6.2d // top[base_x]
+ trn1 v5.2d, v5.2d, v7.2d // top[base_x+1]
+
+ sub v17.8h, v26.8h, v16.8h // 64 - frac_x
+
+ add v20.8h, v20.8h, v31.8h // actual base_x
+
+ umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v22.4s, v18.8h, v28.8h
+ umlal2 v22.4s, v19.8h, v27.8h
+
+ umull v23.4s, v4.4h, v17.4h // top[base_x]*(64-frac_x)
+ umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
+ umull2 v24.4s, v4.8h, v17.8h
+ umlal2 v24.4s, v5.8h, v16.8h
+
+ cmge v20.8h, v20.8h, #0
+
+ rshrn v21.4h, v21.4s, #6
+ rshrn2 v21.8h, v22.4s, #6
+ rshrn v22.4h, v23.4s, #6
+ rshrn2 v22.8h, v24.4s, #6
+
+ movi v24.16b, #8
+
+ bit v21.16b, v22.16b, v20.16b
+
+ st1 {v21.d}[0], [x0], x1
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2
+ st1 {v21.d}[1], [x0], x1
+ b.le 9f
+
+ add v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
+ add v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
+ b 4b
+
+49:
+ tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
+ tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]
+
+ umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v21.4s, v18.8h, v28.8h
+ umlal2 v21.4s, v19.8h, v27.8h
+
+ rshrn v20.4h, v20.4s, #6
+ rshrn2 v20.8h, v21.4s, #6
+
+ st1 {v20.d}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v20.d}[1], [x0], x1
+ b.le 9f
+
+ add v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
+ add v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
+ b 49b
+
+9:
+ ret
+
+80:
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ dup v18.8h, w7 // -dy
+ movi v17.16b, #2
+
+ mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy
+ movi v25.8h, #0x3e
+ add v16.8h, v16.8h, v18.8h // -= dy
+
+ // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
+ ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[]
+
+ movi v26.8h, #64
+ movi v19.16b, #4
+
+ shrn v29.8b, v16.8h, #6 // ypos >> 6
+ and v27.16b, v16.16b, v25.16b // frac_y
+
+ add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 2
+
+ movi v23.8h, #1, lsl #8
+ shl v29.8b, v29.8b, #1 // 2*base_y
+ mov v18.16b, v0.16b // left[0]
+ zip1 v29.16b, v29.16b, v29.16b // duplicate elements
+ add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
+
+ add v30.16b, v29.16b, v17.16b // base_y + 1 (*2)
+
+ sub v28.8h, v26.8h, v27.8h // 64 - frac_y
+
+ movi v24.16b, #4
+8:
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-16 // base_x <= -16
+ asr w11, w8, #6 // base_x
+ b.le 89f
+
+ dup v17.8h, w8 // xpos
+
+ add x9, x2, w9, sxtw #1
+ add x11, x2, w11, sxtw #1
+
+ ld1 {v4.8h, v5.8h}, [x9] // top[base_x]
+ ld1 {v6.8h, v7.8h}, [x11]
+
+ tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0]
+ add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
+ tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1]
+ add v30.16b, v30.16b, v24.16b
+
+ sshr v22.8h, v16.8h, #6 // first base_x
+ tbl v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2]
+ sshr v23.8h, v17.8h, #6
+ tbl v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3]
+
+ ext v5.16b, v4.16b, v5.16b, #2 // top[base_x+1]
+ ext v7.16b, v6.16b, v7.16b, #2
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+ and v17.16b, v17.16b, v25.16b
+
+ umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+
+ sub v8.8h, v26.8h, v16.8h // 64 - frac_x
+ sub v9.8h, v26.8h, v17.8h
+
+ umull2 v11.4s, v18.8h, v28.8h
+ umlal2 v11.4s, v19.8h, v27.8h
+
+ add v22.8h, v22.8h, v31.8h // actual base_x
+ add v23.8h, v23.8h, v31.8h
+
+ umull v12.4s, v20.4h, v28.4h
+ umlal v12.4s, v21.4h, v27.4h
+ umull2 v13.4s, v20.8h, v28.8h
+ umlal2 v13.4s, v21.8h, v27.8h
+
+ rshrn v10.4h, v10.4s, #6
+ rshrn2 v10.8h, v11.4s, #6
+ rshrn v11.4h, v12.4s, #6
+ rshrn2 v11.8h, v13.4s, #6
+
+ umull v12.4s, v4.4h, v8.4h // top[base_x]*(64-frac_x)
+ umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
+ umull2 v13.4s, v4.8h, v8.8h
+ umlal2 v13.4s, v5.8h, v16.8h
+ umull v14.4s, v6.4h, v9.4h
+ umlal v14.4s, v7.4h, v17.4h
+ umull2 v18.4s, v6.8h, v9.8h
+ umlal2 v18.4s, v7.8h, v17.8h
+
+ cmge v22.8h, v22.8h, #0
+ cmge v23.8h, v23.8h, #0
+
+ rshrn v12.4h, v12.4s, #6
+ rshrn2 v12.8h, v13.4s, #6
+ rshrn v13.4h, v14.4s, #6
+ rshrn2 v13.8h, v18.4s, #6
+
+ bit v10.16b, v12.16b, v22.16b
+ bit v11.16b, v13.16b, v23.16b
+
+ st1 {v10.8h}, [x0], x1
+ subs w5, w5, #2
+ sub w8, w8, w6 // xpos -= dx
+ st1 {v11.8h}, [x0], x1
+ b.le 9f
+
+ add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
+ add v30.16b, v30.16b, v24.16b
+ b 8b
+
+89:
+ tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0]
+ add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
+ tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1]
+ add v30.16b, v30.16b, v24.16b
+ tbl v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2]
+ tbl v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3]
+
+ umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v5.4s, v18.8h, v28.8h
+ umlal2 v5.4s, v19.8h, v27.8h
+ umull v6.4s, v20.4h, v28.4h
+ umlal v6.4s, v21.4h, v27.4h
+ umull2 v7.4s, v20.8h, v28.8h
+ umlal2 v7.4s, v21.8h, v27.8h
+
+ rshrn v4.4h, v4.4s, #6
+ rshrn2 v4.8h, v5.4s, #6
+ rshrn v5.4h, v6.4s, #6
+ rshrn2 v5.8h, v7.4s, #6
+
+ st1 {v4.8h}, [x0], x1
+ subs w5, w5, #2
+ st1 {v5.8h}, [x0], x1
+ b.le 9f
+
+ add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
+ add v30.16b, v30.16b, v24.16b
+ b 89b
+
+9:
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+endfunc
+
+// void ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const left,
+// const int width, const int height,
+// const int dy, const int max_base_y);
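+// z3 predicts purely from the left edge; the loops below effectively run a
+// transposed z1, producing two columns at a time down the block before
+// stepping two pixels to the right.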
+function ipred_z3_fill1_16bpc_neon, export=1
+ clz w9, w4
+ adr x8, L(ipred_z3_fill1_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x8, w9, uxtw #1]
+ add x10, x2, w6, uxtw #1 // left[max_base_y]
+ sub x8, x8, w9, uxtw
+ ld1r {v31.8h}, [x10] // padding
+ mov w7, w5
+ mov w15, #64
+ add x13, x0, x1
+ lsl x1, x1, #1
+ br x8
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+4:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // ypos += dy
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon
+ lsl w8, w8, #1
+ lsl w10, w10, #1
+ ldr q0, [x2, w8, uxtw] // left[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.8h, w9 // frac
+ dup v5.8h, w11
+ ext v1.16b, v0.16b, v0.16b, #2 // left[base+1]
+ ext v3.16b, v2.16b, v2.16b, #2
+ sub v6.4h, v1.4h, v0.4h // left[base+1]-left[base]
+ sub v7.4h, v3.4h, v2.4h
+ ushll v16.4s, v0.4h, #6 // left[base]*64
+ ushll v17.4s, v2.4h, #6
+ smlal v16.4s, v6.4h, v4.4h // + left[base+1]*frac
+ smlal v17.4s, v7.4h, v5.4h
+ rshrn v16.4h, v16.4s, #6
+ rshrn v17.4h, v17.4s, #6
+ subs w3, w3, #2
+ zip1 v18.8h, v16.8h, v17.8h
+ st1 {v18.s}[0], [x0], x1
+ st1 {v18.s}[1], [x13], x1
+ add w7, w7, w5 // ypos += dy
+ st1 {v18.s}[2], [x0]
+ st1 {v18.s}[3], [x13]
+ b.le 9f
+ sub x0, x0, x1 // ptr -= 2 * stride
+ sub x13, x13, x1
+ add x0, x0, #4
+ add x13, x13, #4
+ b 4b
+9:
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+8:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // ypos += dy
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon
+ add x8, x2, w8, uxtw #1
+ add x10, x2, w10, uxtw #1
+ dup v4.8h, w9 // frac
+ dup v5.8h, w11
+ ld1 {v0.8h}, [x8] // left[base]
+ ld1 {v2.8h}, [x10]
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ ldr h1, [x8, #16]
+ ldr h3, [x10, #16]
+ dup v6.8h, w9 // 64 - frac
+ dup v7.8h, w11
+ ext v1.16b, v0.16b, v1.16b, #2 // left[base+1]
+ ext v3.16b, v2.16b, v3.16b, #2
+ umull v16.4s, v0.4h, v6.4h // left[base]*(64-frac)
+ umlal v16.4s, v1.4h, v4.4h // + left[base+1]*frac
+ umull2 v17.4s, v0.8h, v6.8h
+ umlal2 v17.4s, v1.8h, v4.8h
+ umull v18.4s, v2.4h, v7.4h
+ umlal v18.4s, v3.4h, v5.4h
+ umull2 v19.4s, v2.8h, v7.8h
+ umlal2 v19.4s, v3.8h, v5.8h
+ rshrn v16.4h, v16.4s, #6
+ rshrn2 v16.8h, v17.4s, #6
+ rshrn v17.4h, v18.4s, #6
+ rshrn2 v17.8h, v19.4s, #6
+ subs w3, w3, #2
+ zip1 v18.8h, v16.8h, v17.8h
+ zip2 v19.8h, v16.8h, v17.8h
+ add w7, w7, w5 // ypos += dy
+ st1 {v18.s}[0], [x0], x1
+ st1 {v18.s}[1], [x13], x1
+ st1 {v18.s}[2], [x0], x1
+ st1 {v18.s}[3], [x13], x1
+ st1 {v19.s}[0], [x0], x1
+ st1 {v19.s}[1], [x13], x1
+ st1 {v19.s}[2], [x0], x1
+ st1 {v19.s}[3], [x13], x1
+ b.le 9f
+ sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride)
+ sub x13, x13, x1, lsl #2
+ add x0, x0, #4
+ add x13, x13, #4
+ b 8b
+9:
+ ret
+
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ mov w12, w4
+1:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // ypos += dy
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon
+ add x8, x2, w8, uxtw #1
+ add x10, x2, w10, uxtw #1
+ dup v6.8h, w9 // frac
+ dup v7.8h, w11
+ ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // left[base]
+ ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v16.8h, w9 // 64 - frac
+ dup v17.8h, w11
+ add w7, w7, w5 // ypos += dy
+2:
+ ext v18.16b, v0.16b, v1.16b, #2 // left[base+1]
+ ext v19.16b, v1.16b, v2.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #2
+ ext v21.16b, v4.16b, v5.16b, #2
+ subs w4, w4, #16
+ umull v22.4s, v0.4h, v16.4h // left[base]*(64-frac)
+ umlal v22.4s, v18.4h, v6.4h // + left[base+1]*frac
+ umull2 v23.4s, v0.8h, v16.8h
+ umlal2 v23.4s, v18.8h, v6.8h
+ umull v24.4s, v1.4h, v16.4h
+ umlal v24.4s, v19.4h, v6.4h
+ umull2 v25.4s, v1.8h, v16.8h
+ umlal2 v25.4s, v19.8h, v6.8h
+ umull v26.4s, v3.4h, v17.4h
+ umlal v26.4s, v20.4h, v7.4h
+ umull2 v27.4s, v3.8h, v17.8h
+ umlal2 v27.4s, v20.8h, v7.8h
+ umull v28.4s, v4.4h, v17.4h
+ umlal v28.4s, v21.4h, v7.4h
+ umull2 v29.4s, v4.8h, v17.8h
+ umlal2 v29.4s, v21.8h, v7.8h
+ rshrn v22.4h, v22.4s, #6
+ rshrn2 v22.8h, v23.4s, #6
+ rshrn v23.4h, v24.4s, #6
+ rshrn2 v23.8h, v25.4s, #6
+ rshrn v24.4h, v26.4s, #6
+ rshrn2 v24.8h, v27.4s, #6
+ rshrn v25.4h, v28.4s, #6
+ rshrn2 v25.8h, v29.4s, #6
+ zip1 v18.8h, v22.8h, v24.8h
+ zip2 v19.8h, v22.8h, v24.8h
+ zip1 v20.8h, v23.8h, v25.8h
+ zip2 v21.8h, v23.8h, v25.8h
+ st1 {v18.s}[0], [x0], x1
+ st1 {v18.s}[1], [x13], x1
+ st1 {v18.s}[2], [x0], x1
+ st1 {v18.s}[3], [x13], x1
+ st1 {v19.s}[0], [x0], x1
+ st1 {v19.s}[1], [x13], x1
+ st1 {v19.s}[2], [x0], x1
+ st1 {v19.s}[3], [x13], x1
+ st1 {v20.s}[0], [x0], x1
+ st1 {v20.s}[1], [x13], x1
+ st1 {v20.s}[2], [x0], x1
+ st1 {v20.s}[3], [x13], x1
+ st1 {v21.s}[0], [x0], x1
+ st1 {v21.s}[1], [x13], x1
+ st1 {v21.s}[2], [x0], x1
+ st1 {v21.s}[3], [x13], x1
+ b.le 3f
+ mov v0.16b, v2.16b
+ ld1 {v1.8h, v2.8h}, [x8], #32 // left[base]
+ mov v3.16b, v5.16b
+ ld1 {v4.8h, v5.8h}, [x10], #32
+ b 2b
+
+3:
+ subs w3, w3, #2
+ b.le 9f
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ lsl x1, x1, #1
+ add x0, x0, #4
+ add x13, x13, #4
+ mov w4, w12
+ b 1b
+9:
+ ret
+
+L(ipred_z3_fill1_tbl):
+ .hword L(ipred_z3_fill1_tbl) - 640b
+ .hword L(ipred_z3_fill1_tbl) - 320b
+ .hword L(ipred_z3_fill1_tbl) - 160b
+ .hword L(ipred_z3_fill1_tbl) - 80b
+ .hword L(ipred_z3_fill1_tbl) - 40b
+endfunc
+
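+// Tail shared by the z3 fill functions: once base reaches max_base_y the
+// rest of the block is filled with the left[max_base_y] value that the
+// callers replicated into v31.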
+function ipred_z3_fill_padding_neon, export=0
+ cmp w3, #8
+ adr x8, L(ipred_z3_fill_padding_tbl)
+ b.gt L(ipred_z3_fill_padding_wide)
+ // w3 = remaining width, w4 = constant height
+ mov w12, w4
+
+1:
+ // Fill a WxH rectangle with padding. W can be any number;
+ // this fills the exact width by filling in the largest
+ // power of two in the remaining width, and repeating.
+ clz w9, w3
+ sub w9, w9, #25
+ ldrh w9, [x8, w9, uxtw #1]
+ sub x9, x8, w9, uxtw
+ br x9
+
+2:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.s}[0], [x0], x1
+ subs w4, w4, #4
+ st1 {v31.s}[0], [x13], x1
+ st1 {v31.s}[0], [x0], x1
+ st1 {v31.s}[0], [x13], x1
+ b.gt 2b
+ subs w3, w3, #2
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #4
+ add x13, x13, #4
+ mov w4, w12
+ b 1b
+
+4:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.4h}, [x0], x1
+ subs w4, w4, #4
+ st1 {v31.4h}, [x13], x1
+ st1 {v31.4h}, [x0], x1
+ st1 {v31.4h}, [x13], x1
+ b.gt 4b
+ subs w3, w3, #4
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #8
+ add x13, x13, #8
+ mov w4, w12
+ b 1b
+
+8:
+16:
+32:
+64:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.8h}, [x0], x1
+ subs w4, w4, #4
+ st1 {v31.8h}, [x13], x1
+ st1 {v31.8h}, [x0], x1
+ st1 {v31.8h}, [x13], x1
+ b.gt 8b
+ subs w3, w3, #8
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #16
+ add x13, x13, #16
+ mov w4, w12
+ b 1b
+
+9:
+ ret
+
+L(ipred_z3_fill_padding_tbl):
+ .hword L(ipred_z3_fill_padding_tbl) - 64b
+ .hword L(ipred_z3_fill_padding_tbl) - 32b
+ .hword L(ipred_z3_fill_padding_tbl) - 16b
+ .hword L(ipred_z3_fill_padding_tbl) - 8b
+ .hword L(ipred_z3_fill_padding_tbl) - 4b
+ .hword L(ipred_z3_fill_padding_tbl) - 2b
+
+L(ipred_z3_fill_padding_wide):
+ // Fill a WxH rectangle with padding, with W > 8.
+ lsr x1, x1, #1
+ mov w12, w3
+ sub x1, x1, w3, uxtw #1
+1:
+ ands w5, w3, #7
+ b.eq 2f
+ // If the width isn't aligned to 8, first do one 8 pixel write
+ // and align the start pointer.
+ sub w3, w3, w5
+ st1 {v31.8h}, [x0]
+ add x0, x0, w5, uxtw #1
+2:
+ // Fill the rest of the line with aligned 8 pixel writes.
+ subs w3, w3, #8
+ st1 {v31.8h}, [x0], #16
+ b.gt 2b
+ subs w4, w4, #1
+ add x0, x0, x1
+ b.le 9f
+ mov w3, w12
+ b 1b
+9:
+ ret
+endfunc
+
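+// ipred_z3_fill2 is the variant for an upsampled (doubled) left edge; the
+// uzp1/uzp2 pairs below separate the even samples (left[base]) from the odd
+// ones (left[base+1]) of each loaded vector.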
+function ipred_z3_fill2_16bpc_neon, export=1
+ cmp w4, #8
+ add x10, x2, w6, uxtw #1 // left[max_base_y]
+ ld1r {v31.8h}, [x10] // padding
+ mov w7, w5
+ mov w15, #64
+ add x13, x0, x1
+ lsl x1, x1, #1
+ b.eq 8f
+
+4: // h == 4
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // ypos += dy
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon
+ lsl w8, w8, #1
+ lsl w10, w10, #1
+ ldr q0, [x2, w8, uxtw] // left[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.4h, w9 // frac
+ dup v5.4h, w11
+ uzp2 v1.8h, v0.8h, v0.8h // left[base+1]
+ uzp1 v0.8h, v0.8h, v0.8h // left[base]
+ uzp2 v3.8h, v2.8h, v2.8h
+ uzp1 v2.8h, v2.8h, v2.8h
+ sub v6.4h, v1.4h, v0.4h // left[base+1]-left[base]
+ sub v7.4h, v3.4h, v2.4h
+ ushll v16.4s, v0.4h, #6 // left[base]*64
+ ushll v17.4s, v2.4h, #6
+ smlal v16.4s, v6.4h, v4.4h // + left[base+1]*frac
+ smlal v17.4s, v7.4h, v5.4h
+ rshrn v16.4h, v16.4s, #6
+ rshrn v17.4h, v17.4s, #6
+ subs w3, w3, #2
+ zip1 v18.8h, v16.8h, v17.8h
+ st1 {v18.s}[0], [x0], x1
+ st1 {v18.s}[1], [x13], x1
+ add w7, w7, w5 // ypos += dy
+ st1 {v18.s}[2], [x0]
+ st1 {v18.s}[3], [x13]
+ b.le 9f
+ sub x0, x0, x1 // ptr -= 2 * stride
+ sub x13, x13, x1
+ add x0, x0, #4
+ add x13, x13, #4
+ b 4b
+9:
+ ret
+
+8: // h == 8
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // ypos += dy
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon
+ add x8, x2, w8, uxtw #1
+ add x10, x2, w10, uxtw #1
+ dup v4.8h, w9 // frac
+ dup v5.8h, w11
+ ld1 {v0.8h, v1.8h}, [x8] // left[base]
+ ld1 {v2.8h, v3.8h}, [x10]
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.8h, w9 // 64 - frac
+ dup v7.8h, w11
+ uzp2 v20.8h, v0.8h, v1.8h // left[base+1]
+ uzp1 v0.8h, v0.8h, v1.8h // left[base]
+ uzp2 v21.8h, v2.8h, v3.8h
+ uzp1 v2.8h, v2.8h, v3.8h
+ umull v16.4s, v0.4h, v6.4h // left[base]*(64-frac)
+ umlal v16.4s, v20.4h, v4.4h // + left[base+1]*frac
+ umull2 v17.4s, v0.8h, v6.8h
+ umlal2 v17.4s, v20.8h, v4.8h
+ umull v18.4s, v2.4h, v7.4h
+ umlal v18.4s, v21.4h, v5.4h
+ umull2 v19.4s, v2.8h, v7.8h
+ umlal2 v19.4s, v21.8h, v5.8h
+ rshrn v16.4h, v16.4s, #6
+ rshrn2 v16.8h, v17.4s, #6
+ rshrn v17.4h, v18.4s, #6
+ rshrn2 v17.8h, v19.4s, #6
+ subs w3, w3, #2
+ zip1 v18.8h, v16.8h, v17.8h
+ zip2 v19.8h, v16.8h, v17.8h
+ add w7, w7, w5 // ypos += dy
+ st1 {v18.s}[0], [x0], x1
+ st1 {v18.s}[1], [x13], x1
+ st1 {v18.s}[2], [x0], x1
+ st1 {v18.s}[3], [x13], x1
+ st1 {v19.s}[0], [x0], x1
+ st1 {v19.s}[1], [x13], x1
+ st1 {v19.s}[2], [x0], x1
+ st1 {v19.s}[3], [x13], x1
+ b.le 9f
+ sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride)
+ sub x13, x13, x1, lsl #2
+ add x0, x0, #4
+ add x13, x13, #4
+ b 8b
+9:
+ ret
+endfunc
+
+
+// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int filt_idx,
+// const int max_width, const int max_height,
+// const int bitdepth_max);
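+// The filter is instantiated twice below: for 10 bpc the 7-tap sums fit in
+// 16 bits, so plain mul/mla with a rounding shift and clamp is enough,
+// while 12 bpc widens to 32 bits (smull/smlal + sqrshrun) to avoid overflow.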
+.macro filter_fn bpc
+function ipred_filter_\bpc\()bpc_neon
+ and w5, w5, #511
+ movrel x6, X(filter_intra_taps)
+ lsl w5, w5, #6
+ add x6, x6, w5, uxtw
+ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
+ clz w9, w3
+ adr x5, L(ipred_filter\bpc\()_tbl)
+ ld1 {v20.8b, v21.8b, v22.8b}, [x6]
+ sub w9, w9, #26
+ ldrh w9, [x5, w9, uxtw #1]
+ sxtl v16.8h, v16.8b
+ sxtl v17.8h, v17.8b
+ sub x5, x5, w9, uxtw
+ sxtl v18.8h, v18.8b
+ sxtl v19.8h, v19.8b
+ add x6, x0, x1
+ lsl x1, x1, #1
+ sxtl v20.8h, v20.8b
+ sxtl v21.8h, v21.8b
+ sxtl v22.8h, v22.8b
+ dup v31.8h, w8
+.if \bpc == 10
+ movi v30.8h, #0
+.endif
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ldur d0, [x2, #2] // top (0-3)
+ sub x2, x2, #4
+ mov x7, #-4
+4:
+ ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2)
+.if \bpc == 10
+ mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ srshr v2.8h, v2.8h, #4
+ smax v2.8h, v2.8h, v30.8h
+.else
+ smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1)
+ smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2)
+ smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3)
+ smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4)
+ smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0)
+ smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5)
+ smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6)
+ smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ sqrshrun v2.4h, v2.4s, #4
+ sqrshrun2 v2.8h, v3.4s, #4
+.endif
+ smin v2.8h, v2.8h, v31.8h
+ subs w4, w4, #2
+ st1 {v2.d}[0], [x0], x1
+ ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3]
+ st1 {v2.d}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ldur q0, [x2, #2] // top (0-7)
+ sub x2, x2, #4
+ mov x7, #-4
+8:
+ ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2)
+.if \bpc == 10
+ mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
+ mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
+ mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
+ srshr v2.8h, v2.8h, #4
+ smax v2.8h, v2.8h, v30.8h
+ smin v2.8h, v2.8h, v31.8h
+ mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
+ mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0)
+ mla v3.8h, v21.8h, v2.h[3] // p5(left[0]) * filter(5)
+ mla v3.8h, v22.8h, v2.h[7] // p6(left[1]) * filter(6)
+ srshr v3.8h, v3.8h, #4
+ smax v3.8h, v3.8h, v30.8h
+.else
+ smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1)
+ smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2)
+ smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3)
+ smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4)
+ smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0)
+ smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5)
+ smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6)
+ smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ smull v4.4s, v17.4h, v0.h[4] // p1(top[0]) * filter(1)
+ smlal v4.4s, v18.4h, v0.h[5] // p2(top[1]) * filter(2)
+ smlal v4.4s, v19.4h, v0.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v2.4h, v2.4s, #4
+ sqrshrun2 v2.8h, v3.4s, #4
+ smin v2.8h, v2.8h, v31.8h
+ smlal v4.4s, v20.4h, v0.h[7] // p4(top[3]) * filter(4)
+ smlal v4.4s, v16.4h, v0.h[3] // p0(topleft) * filter(0)
+ smlal v4.4s, v21.4h, v2.h[3] // p5(left[0]) * filter(5)
+ smlal v4.4s, v22.4h, v2.h[7] // p6(left[1]) * filter(6)
+ smull2 v5.4s, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
+ smlal2 v5.4s, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
+ smlal2 v5.4s, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
+ smlal2 v5.4s, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
+ smlal2 v5.4s, v16.8h, v0.h[3] // p0(topleft) * filter(0)
+ smlal2 v5.4s, v21.8h, v2.h[3] // p5(left[0]) * filter(5)
+ smlal2 v5.4s, v22.8h, v2.h[7] // p6(left[1]) * filter(6)
+ sqrshrun v3.4h, v4.4s, #4
+ sqrshrun2 v3.8h, v5.4s, #4
+.endif
+ smin v3.8h, v3.8h, v31.8h
+ subs w4, w4, #2
+ st2 {v2.d, v3.d}[0], [x0], x1
+ zip2 v0.2d, v2.2d, v3.2d
+ st2 {v2.d, v3.d}[1], [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x2, #2
+ sub x2, x2, #4
+ mov x7, #-4
+ sub x1, x1, w3, uxtw #1
+ mov w9, w3
+
+1:
+ ld1 {v0.4h}, [x2], x7 // left (0-1) + topleft (2)
+2:
+ ld1 {v1.8h, v2.8h}, [x8], #32 // top(0-15)
+.if \bpc == 10
+ mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0)
+ mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
+ mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
+ mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
+ mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
+ mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
+ mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4)
+
+ mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
+ mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
+ mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
+ srshr v3.8h, v3.8h, #4
+ smax v3.8h, v3.8h, v30.8h
+ smin v3.8h, v3.8h, v31.8h
+ mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
+ mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0)
+ mla v4.8h, v21.8h, v3.h[3] // p5(left[0]) * filter(5)
+ mla v4.8h, v22.8h, v3.h[7] // p6(left[1]) * filter(6)
+
+ mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
+ mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
+ mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
+ srshr v4.8h, v4.8h, #4
+ smax v4.8h, v4.8h, v30.8h
+ smin v4.8h, v4.8h, v31.8h
+ mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
+ mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0)
+ mla v5.8h, v21.8h, v4.h[3] // p5(left[0]) * filter(5)
+ mla v5.8h, v22.8h, v4.h[7] // p6(left[1]) * filter(6)
+
+ mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
+ mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
+ mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
+ srshr v5.8h, v5.8h, #4
+ smax v5.8h, v5.8h, v30.8h
+ smin v5.8h, v5.8h, v31.8h
+ mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
+ mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0)
+ mla v6.8h, v21.8h, v5.h[3] // p5(left[0]) * filter(5)
+ mla v6.8h, v22.8h, v5.h[7] // p6(left[1]) * filter(6)
+
+ subs w3, w3, #16
+ srshr v6.8h, v6.8h, #4
+ smax v6.8h, v6.8h, v30.8h
+.else
+ smull v3.4s, v16.4h, v0.h[2] // p0(topleft) * filter(0)
+ smlal v3.4s, v21.4h, v0.h[1] // p5(left[0]) * filter(5)
+ smlal v3.4s, v22.4h, v0.h[0] // p6(left[1]) * filter(6)
+ smlal v3.4s, v17.4h, v1.h[0] // p1(top[0]) * filter(1)
+ smlal v3.4s, v18.4h, v1.h[1] // p2(top[1]) * filter(2)
+ smlal v3.4s, v19.4h, v1.h[2] // p3(top[2]) * filter(3)
+ smlal v3.4s, v20.4h, v1.h[3] // p4(top[3]) * filter(4)
+ smull2 v4.4s, v16.8h, v0.h[2] // p0(topleft) * filter(0)
+ smlal2 v4.4s, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
+ smlal2 v4.4s, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
+ smlal2 v4.4s, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
+ smlal2 v4.4s, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
+ smlal2 v4.4s, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
+ smlal2 v4.4s, v20.8h, v1.h[3] // p4(top[3]) * filter(4)
+
+ smull v5.4s, v17.4h, v1.h[4] // p1(top[0]) * filter(1)
+ smlal v5.4s, v18.4h, v1.h[5] // p2(top[1]) * filter(2)
+ smlal v5.4s, v19.4h, v1.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v3.4h, v3.4s, #4
+ sqrshrun2 v3.8h, v4.4s, #4
+ smin v3.8h, v3.8h, v31.8h
+ smlal v5.4s, v20.4h, v1.h[7] // p4(top[3]) * filter(4)
+ smlal v5.4s, v16.4h, v1.h[3] // p0(topleft) * filter(0)
+ smlal v5.4s, v21.4h, v3.h[3] // p5(left[0]) * filter(5)
+ smlal v5.4s, v22.4h, v3.h[7] // p6(left[1]) * filter(6)
+ smull2 v6.4s, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
+ smlal2 v6.4s, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
+ smlal2 v6.4s, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
+ smlal2 v6.4s, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
+ smlal2 v6.4s, v16.8h, v1.h[3] // p0(topleft) * filter(0)
+ smlal2 v6.4s, v21.8h, v3.h[3] // p5(left[0]) * filter(5)
+ smlal2 v6.4s, v22.8h, v3.h[7] // p6(left[1]) * filter(6)
+
+ smull v24.4s, v17.4h, v2.h[0] // p1(top[0]) * filter(1)
+ smlal v24.4s, v18.4h, v2.h[1] // p2(top[1]) * filter(2)
+ smlal v24.4s, v19.4h, v2.h[2] // p3(top[2]) * filter(3)
+ sqrshrun v4.4h, v5.4s, #4
+ sqrshrun2 v4.8h, v6.4s, #4
+ smin v4.8h, v4.8h, v31.8h
+ smlal v24.4s, v20.4h, v2.h[3] // p4(top[3]) * filter(4)
+ smlal v24.4s, v16.4h, v1.h[7] // p0(topleft) * filter(0)
+ smlal v24.4s, v21.4h, v4.h[3] // p5(left[0]) * filter(5)
+ smlal v24.4s, v22.4h, v4.h[7] // p6(left[1]) * filter(6)
+ smull2 v25.4s, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
+ smlal2 v25.4s, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
+ smlal2 v25.4s, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
+ smlal2 v25.4s, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
+ smlal2 v25.4s, v16.8h, v1.h[7] // p0(topleft) * filter(0)
+ smlal2 v25.4s, v21.8h, v4.h[3] // p5(left[0]) * filter(5)
+ smlal2 v25.4s, v22.8h, v4.h[7] // p6(left[1]) * filter(6)
+
+ smull v26.4s, v17.4h, v2.h[4] // p1(top[0]) * filter(1)
+ smlal v26.4s, v18.4h, v2.h[5] // p2(top[1]) * filter(2)
+ smlal v26.4s, v19.4h, v2.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v5.4h, v24.4s, #4
+ sqrshrun2 v5.8h, v25.4s, #4
+ smin v5.8h, v5.8h, v31.8h
+ smlal v26.4s, v20.4h, v2.h[7] // p4(top[3]) * filter(4)
+ smlal v26.4s, v16.4h, v2.h[3] // p0(topleft) * filter(0)
+ smlal v26.4s, v21.4h, v5.h[3] // p5(left[0]) * filter(5)
+ smlal v26.4s, v22.4h, v5.h[7] // p6(left[1]) * filter(6)
+ smull2 v27.4s, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
+ smlal2 v27.4s, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
+ smlal2 v27.4s, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
+ smlal2 v27.4s, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
+ smlal2 v27.4s, v16.8h, v2.h[3] // p0(topleft) * filter(0)
+ smlal2 v27.4s, v21.8h, v5.h[3] // p5(left[0]) * filter(5)
+ smlal2 v27.4s, v22.8h, v5.h[7] // p6(left[1]) * filter(6)
+
+ subs w3, w3, #16
+ sqrshrun v6.4h, v26.4s, #4
+ sqrshrun2 v6.8h, v27.4s, #4
+.endif
+ smin v6.8h, v6.8h, v31.8h
+
+ ins v0.h[2], v2.h[7]
+ st4 {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
+ ins v0.h[0], v6.h[7]
+ st4 {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
+ ins v0.h[1], v6.h[3]
+ b.gt 2b
+ subs w4, w4, #2
+ b.le 9f
+ sub x8, x6, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_filter\bpc\()_tbl):
+ .hword L(ipred_filter\bpc\()_tbl) - 320b
+ .hword L(ipred_filter\bpc\()_tbl) - 160b
+ .hword L(ipred_filter\bpc\()_tbl) - 80b
+ .hword L(ipred_filter\bpc\()_tbl) - 40b
+endfunc
+.endm
+
+filter_fn 10
+filter_fn 12
+
+function ipred_filter_16bpc_neon, export=1
+ ldr w8, [sp]
+ cmp w8, 0x3ff
+ b.le ipred_filter_10bpc_neon
+ b ipred_filter_12bpc_neon
+endfunc
+
+// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const pal, const uint8_t *idx,
+// const int w, const int h);
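+// Each idx byte packs two 4-bit palette indices; they are unpacked, doubled
+// and offset by {0,1} per byte (the 0x0100 splat in v31) so that a byte-wise
+// tbl on the 16-byte palette in v30 produces full 16-bit pixels.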
+function pal_pred_16bpc_neon, export=1
+ ld1 {v30.8h}, [x2]
+ clz w9, w4
+ adr x6, L(pal_pred_tbl)
+ sub w9, w9, #25
+ movi v29.16b, #7
+ ldrh w9, [x6, w9, uxtw #1]
+ movi v31.8h, #1, lsl #8
+ sub x6, x6, w9, uxtw
+ br x6
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add x2, x0, x1
+ lsl x1, x1, #1
+4:
+ ld1 {v1.8b}, [x3], #8
+ subs w5, w5, #4
+ ushr v3.8b, v1.8b, #4
+ and v2.8b, v1.8b, v29.8b
+ zip1 v1.16b, v2.16b, v3.16b
+ // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
+ add v1.16b, v1.16b, v1.16b
+ zip1 v0.16b, v1.16b, v1.16b
+ zip2 v1.16b, v1.16b, v1.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ st1 {v0.d}[0], [x0], x1
+ tbl v1.16b, {v30.16b}, v1.16b
+ st1 {v0.d}[1], [x2], x1
+ st1 {v1.d}[0], [x0], x1
+ st1 {v1.d}[1], [x2], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ add x2, x0, x1
+ lsl x1, x1, #1
+8:
+ ld1 {v2.16b}, [x3], #16
+ subs w5, w5, #4
+ ushr v4.16b, v2.16b, #4
+ and v3.16b, v2.16b, v29.16b
+ zip1 v2.16b, v3.16b, v4.16b
+ zip2 v3.16b, v3.16b, v4.16b
+ add v2.16b, v2.16b, v2.16b
+ add v3.16b, v3.16b, v3.16b
+ zip1 v0.16b, v2.16b, v2.16b
+ zip2 v1.16b, v2.16b, v2.16b
+ zip1 v2.16b, v3.16b, v3.16b
+ zip2 v3.16b, v3.16b, v3.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ add v2.8h, v2.8h, v31.8h
+ add v3.8h, v3.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v1.16b, {v30.16b}, v1.16b
+ st1 {v0.8h}, [x0], x1
+ tbl v2.16b, {v30.16b}, v2.16b
+ st1 {v1.8h}, [x2], x1
+ tbl v3.16b, {v30.16b}, v3.16b
+ st1 {v2.8h}, [x0], x1
+ st1 {v3.8h}, [x2], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ add x2, x0, x1
+ lsl x1, x1, #1
+16:
+ ld1 {v4.16b, v5.16b}, [x3], #32
+ subs w5, w5, #4
+ ushr v7.16b, v4.16b, #4
+ and v6.16b, v4.16b, v29.16b
+ ushr v3.16b, v5.16b, #4
+ and v2.16b, v5.16b, v29.16b
+ zip1 v4.16b, v6.16b, v7.16b
+ zip2 v5.16b, v6.16b, v7.16b
+ zip1 v6.16b, v2.16b, v3.16b
+ zip2 v7.16b, v2.16b, v3.16b
+ add v4.16b, v4.16b, v4.16b
+ add v5.16b, v5.16b, v5.16b
+ add v6.16b, v6.16b, v6.16b
+ add v7.16b, v7.16b, v7.16b
+ zip1 v0.16b, v4.16b, v4.16b
+ zip2 v1.16b, v4.16b, v4.16b
+ zip1 v2.16b, v5.16b, v5.16b
+ zip2 v3.16b, v5.16b, v5.16b
+ zip1 v4.16b, v6.16b, v6.16b
+ zip2 v5.16b, v6.16b, v6.16b
+ zip1 v6.16b, v7.16b, v7.16b
+ zip2 v7.16b, v7.16b, v7.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ add v2.8h, v2.8h, v31.8h
+ add v3.8h, v3.8h, v31.8h
+ add v4.8h, v4.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ add v5.8h, v5.8h, v31.8h
+ tbl v1.16b, {v30.16b}, v1.16b
+ add v6.8h, v6.8h, v31.8h
+ tbl v2.16b, {v30.16b}, v2.16b
+ add v7.8h, v7.8h, v31.8h
+ tbl v3.16b, {v30.16b}, v3.16b
+ tbl v4.16b, {v30.16b}, v4.16b
+ tbl v5.16b, {v30.16b}, v5.16b
+ st1 {v0.8h, v1.8h}, [x0], x1
+ tbl v6.16b, {v30.16b}, v6.16b
+ st1 {v2.8h, v3.8h}, [x2], x1
+ tbl v7.16b, {v30.16b}, v7.16b
+ st1 {v4.8h, v5.8h}, [x0], x1
+ st1 {v6.8h, v7.8h}, [x2], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ add x2, x0, x1
+ lsl x1, x1, #1
+32:
+ ld1 {v4.16b, v5.16b}, [x3], #32
+ subs w5, w5, #2
+ ushr v7.16b, v4.16b, #4
+ and v6.16b, v4.16b, v29.16b
+ ushr v3.16b, v5.16b, #4
+ and v2.16b, v5.16b, v29.16b
+ zip1 v4.16b, v6.16b, v7.16b
+ zip2 v5.16b, v6.16b, v7.16b
+ zip1 v6.16b, v2.16b, v3.16b
+ zip2 v7.16b, v2.16b, v3.16b
+ add v4.16b, v4.16b, v4.16b
+ add v5.16b, v5.16b, v5.16b
+ add v6.16b, v6.16b, v6.16b
+ add v7.16b, v7.16b, v7.16b
+ zip1 v0.16b, v4.16b, v4.16b
+ zip2 v1.16b, v4.16b, v4.16b
+ zip1 v2.16b, v5.16b, v5.16b
+ zip2 v3.16b, v5.16b, v5.16b
+ zip1 v4.16b, v6.16b, v6.16b
+ zip2 v5.16b, v6.16b, v6.16b
+ zip1 v6.16b, v7.16b, v7.16b
+ zip2 v7.16b, v7.16b, v7.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ add v2.8h, v2.8h, v31.8h
+ add v3.8h, v3.8h, v31.8h
+ add v4.8h, v4.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ add v5.8h, v5.8h, v31.8h
+ tbl v1.16b, {v30.16b}, v1.16b
+ add v6.8h, v6.8h, v31.8h
+ tbl v2.16b, {v30.16b}, v2.16b
+ add v7.8h, v7.8h, v31.8h
+ tbl v3.16b, {v30.16b}, v3.16b
+ tbl v4.16b, {v30.16b}, v4.16b
+ tbl v5.16b, {v30.16b}, v5.16b
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ tbl v6.16b, {v30.16b}, v6.16b
+ tbl v7.16b, {v30.16b}, v7.16b
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x2, x0, #64
+64:
+ ld1 {v4.16b, v5.16b}, [x3], #32
+ subs w5, w5, #1
+ ushr v7.16b, v4.16b, #4
+ and v6.16b, v4.16b, v29.16b
+ ushr v3.16b, v5.16b, #4
+ and v2.16b, v5.16b, v29.16b
+ zip1 v4.16b, v6.16b, v7.16b
+ zip2 v5.16b, v6.16b, v7.16b
+ zip1 v6.16b, v2.16b, v3.16b
+ zip2 v7.16b, v2.16b, v3.16b
+ add v4.16b, v4.16b, v4.16b
+ add v5.16b, v5.16b, v5.16b
+ add v6.16b, v6.16b, v6.16b
+ add v7.16b, v7.16b, v7.16b
+ zip1 v0.16b, v4.16b, v4.16b
+ zip2 v1.16b, v4.16b, v4.16b
+ zip1 v2.16b, v5.16b, v5.16b
+ zip2 v3.16b, v5.16b, v5.16b
+ zip1 v4.16b, v6.16b, v6.16b
+ zip2 v5.16b, v6.16b, v6.16b
+ zip1 v6.16b, v7.16b, v7.16b
+ zip2 v7.16b, v7.16b, v7.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ add v2.8h, v2.8h, v31.8h
+ add v3.8h, v3.8h, v31.8h
+ add v4.8h, v4.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ add v5.8h, v5.8h, v31.8h
+ tbl v1.16b, {v30.16b}, v1.16b
+ add v6.8h, v6.8h, v31.8h
+ tbl v2.16b, {v30.16b}, v2.16b
+ add v7.8h, v7.8h, v31.8h
+ tbl v3.16b, {v30.16b}, v3.16b
+ tbl v4.16b, {v30.16b}, v4.16b
+ tbl v5.16b, {v30.16b}, v5.16b
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ tbl v6.16b, {v30.16b}, v6.16b
+ tbl v7.16b, {v30.16b}, v7.16b
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
+ b.gt 64b
+ ret
+
+L(pal_pred_tbl):
+ .hword L(pal_pred_tbl) - 640b
+ .hword L(pal_pred_tbl) - 320b
+ .hword L(pal_pred_tbl) - 160b
+ .hword L(pal_pred_tbl) - 80b
+ .hword L(pal_pred_tbl) - 40b
+endfunc
+
+// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
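+// dc = (bitdepth_max + 1) >> 1; the shared splat loops below then apply
+// dst = clip(dc + ((ac * alpha) with sign-aware rounding >> 6)) and are
+// reused by the top/left/full cfl entry points.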
+function ipred_cfl_128_16bpc_neon, export=1
+ dup v31.8h, w7 // bitdepth_max
+ clz w9, w3
+ adr x7, L(ipred_cfl_128_tbl)
+ sub w9, w9, #26
+ ldrh w9, [x7, w9, uxtw #1]
+ urshr v0.8h, v31.8h, #1
+ dup v1.8h, w6 // alpha
+ sub x7, x7, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ movi v30.8h, #0
+ br x7
+L(ipred_cfl_splat_w4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x5], #32
+ subs w4, w4, #4
+ smull v2.4s, v4.4h, v1.4h // diff = ac * alpha
+ smull2 v3.4s, v4.8h, v1.8h
+ smull v4.4s, v5.4h, v1.4h
+ smull2 v5.4s, v5.8h, v1.8h
+ cmlt v16.4s, v2.4s, #0 // sign
+ cmlt v17.4s, v3.4s, #0
+ cmlt v18.4s, v4.4s, #0
+ cmlt v19.4s, v5.4s, #0
+ add v2.4s, v2.4s, v16.4s // diff + sign
+ add v3.4s, v3.4s, v17.4s
+ add v4.4s, v4.4s, v18.4s
+ add v5.4s, v5.4s, v19.4s
+ rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ rshrn2 v2.8h, v3.4s, #6
+ rshrn v3.4h, v4.4s, #6
+ rshrn2 v3.8h, v5.4s, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.d}[0], [x0], x1
+ st1 {v2.d}[1], [x6], x1
+ st1 {v3.d}[0], [x0], x1
+ st1 {v3.d}[1], [x6], x1
+ b.gt L(ipred_cfl_splat_w4)
+ ret
+L(ipred_cfl_splat_w8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x5], #32
+ subs w4, w4, #2
+ smull v2.4s, v4.4h, v1.4h // diff = ac * alpha
+ smull2 v3.4s, v4.8h, v1.8h
+ smull v4.4s, v5.4h, v1.4h
+ smull2 v5.4s, v5.8h, v1.8h
+ cmlt v16.4s, v2.4s, #0 // sign
+ cmlt v17.4s, v3.4s, #0
+ cmlt v18.4s, v4.4s, #0
+ cmlt v19.4s, v5.4s, #0
+ add v2.4s, v2.4s, v16.4s // diff + sign
+ add v3.4s, v3.4s, v17.4s
+ add v4.4s, v4.4s, v18.4s
+ add v5.4s, v5.4s, v19.4s
+ rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ rshrn2 v2.8h, v3.4s, #6
+ rshrn v3.4h, v4.4s, #6
+ rshrn2 v3.8h, v5.4s, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.8h}, [x0], x1
+ st1 {v3.8h}, [x6], x1
+ b.gt L(ipred_cfl_splat_w8)
+ ret
+L(ipred_cfl_splat_w16):
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x5, w3, uxtw #1
+ sub x1, x1, w3, uxtw #1
+ mov w9, w3
+1:
+ ld1 {v2.8h, v3.8h}, [x5], #32
+ ld1 {v4.8h, v5.8h}, [x7], #32
+ subs w3, w3, #16
+ smull v16.4s, v2.4h, v1.4h // diff = ac * alpha
+ smull2 v17.4s, v2.8h, v1.8h
+ smull v18.4s, v3.4h, v1.4h
+ smull2 v19.4s, v3.8h, v1.8h
+ smull v2.4s, v4.4h, v1.4h
+ smull2 v3.4s, v4.8h, v1.8h
+ smull v4.4s, v5.4h, v1.4h
+ smull2 v5.4s, v5.8h, v1.8h
+ cmlt v20.4s, v16.4s, #0 // sign
+ cmlt v21.4s, v17.4s, #0
+ cmlt v22.4s, v18.4s, #0
+ cmlt v23.4s, v19.4s, #0
+ cmlt v24.4s, v2.4s, #0
+ cmlt v25.4s, v3.4s, #0
+ cmlt v26.4s, v4.4s, #0
+ cmlt v27.4s, v5.4s, #0
+ add v16.4s, v16.4s, v20.4s // diff + sign
+ add v17.4s, v17.4s, v21.4s
+ add v18.4s, v18.4s, v22.4s
+ add v19.4s, v19.4s, v23.4s
+ add v2.4s, v2.4s, v24.4s
+ add v3.4s, v3.4s, v25.4s
+ add v4.4s, v4.4s, v26.4s
+ add v5.4s, v5.4s, v27.4s
+ rshrn v16.4h, v16.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ rshrn2 v16.8h, v17.4s, #6
+ rshrn v17.4h, v18.4s, #6
+ rshrn2 v17.8h, v19.4s, #6
+ rshrn v6.4h, v2.4s, #6
+ rshrn2 v6.8h, v3.4s, #6
+ rshrn v7.4h, v4.4s, #6
+ rshrn2 v7.8h, v5.4s, #6
+ add v2.8h, v16.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v17.8h, v0.8h
+ add v4.8h, v6.8h, v0.8h
+ add v5.8h, v7.8h, v0.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smax v4.8h, v4.8h, v30.8h
+ smax v5.8h, v5.8h, v30.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ smin v4.8h, v4.8h, v31.8h
+ smin v5.8h, v5.8h, v31.8h
+ st1 {v2.8h, v3.8h}, [x0], #32
+ st1 {v4.8h, v5.8h}, [x6], #32
+ b.gt 1b
+ subs w4, w4, #2
+ add x5, x5, w9, uxtw #1
+ add x7, x7, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b.gt 1b
+ ret
+
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
+endfunc
+
+// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
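+// Average only the top edge to get dc, then branch into the shared splat
+// loops above; ipred_cfl_left below does the same with the left edge.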
+function ipred_cfl_top_16bpc_neon, export=1
+ dup v31.8h, w7 // bitdepth_max
+ clz w9, w3
+ adr x7, L(ipred_cfl_top_tbl)
+ sub w9, w9, #26
+ ldrh w9, [x7, w9, uxtw #1]
+ dup v1.8h, w6 // alpha
+ add x2, x2, #2
+ sub x7, x7, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ movi v30.8h, #0
+ br x7
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2]
+ addv h0, v0.4h
+ urshr v0.4h, v0.4h, #2
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w4)
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2]
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w8)
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h}, [x2]
+ addp v0.8h, v2.8h, v3.8h
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #4
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v0.8h, v2.8h, v4.8h
+ uaddlv s0, v0.8h
+ rshrn v0.4h, v0.4s, #5
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_top_tbl):
+ .hword L(ipred_cfl_top_tbl) - 32b
+ .hword L(ipred_cfl_top_tbl) - 16b
+ .hword L(ipred_cfl_top_tbl) - 8b
+ .hword L(ipred_cfl_top_tbl) - 4b
+endfunc
+
+// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+function ipred_cfl_left_16bpc_neon, export=1
+ dup v31.8h, w7 // bitdepth_max
+ sub x2, x2, w4, uxtw #1
+ clz w9, w3
+ clz w8, w4
+ adr x10, L(ipred_cfl_splat_tbl)
+ adr x7, L(ipred_cfl_left_tbl)
+ sub w9, w9, #26
+ sub w8, w8, #26
+ ldrh w9, [x10, w9, uxtw #1]
+ ldrh w8, [x7, w8, uxtw #1]
+ dup v1.8h, w6 // alpha
+ sub x9, x10, w9, uxtw
+ sub x7, x7, w8, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ movi v30.8h, #0
+ br x7
+
+L(ipred_cfl_left_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2]
+ addv h0, v0.4h
+ urshr v0.4h, v0.4h, #2
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2]
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h}, [x2]
+ addp v0.8h, v2.8h, v3.8h
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #4
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v0.8h, v2.8h, v4.8h
+ uaddlv s0, v0.8h
+ rshrn v0.4h, v0.4s, #5
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_tbl):
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
+endfunc
+
+// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
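+// dc averages both edges. Since width + height need not be a power of two,
+// the sum is first shifted by ctz(width + height) and, where a factor of 3
+// or 5 remains, divided with a fixed-point reciprocal (0xAAAB ~ 1/3,
+// 0x6667 ~ 1/5) followed by a shift by 17.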
+function ipred_cfl_16bpc_neon, export=1
+ dup v31.8h, w7 // bitdepth_max
+ sub x2, x2, w4, uxtw #1
+ add w8, w3, w4 // width + height
+ dup v1.8h, w6 // alpha
+ clz w9, w3
+ clz w6, w4
+ dup v16.4s, w8 // width + height
+ adr x7, L(ipred_cfl_tbl)
+ rbit w8, w8 // rbit(width + height)
+ sub w9, w9, #22 // 26 leading bits, minus table offset 4
+ sub w6, w6, #26
+ clz w8, w8 // ctz(width + height)
+ ldrh w9, [x7, w9, uxtw #1]
+ ldrh w6, [x7, w6, uxtw #1]
+ neg w8, w8 // -ctz(width + height)
+ sub x9, x7, w9, uxtw
+ sub x7, x7, w6, uxtw
+ ushr v16.4s, v16.4s, #1 // (width + height) >> 1
+ dup v17.4s, w8 // -ctz(width + height)
+ add x6, x0, x1
+ lsl x1, x1, #1
+ movi v30.8h, #0
+ br x7
+
+L(ipred_cfl_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2], #8
+ uaddlv s0, v0.4h
+ add x2, x2, #2
+ br x9
+L(ipred_cfl_w4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.4h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ uaddlv s2, v2.4h
+ cmp w4, #4
+ add v0.2s, v0.2s, v2.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 8/16
+ cmp w4, #16
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w4)
+
+L(ipred_cfl_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2], #16
+ uaddlv s0, v0.8h
+ add x2, x2, #2
+ br x9
+L(ipred_cfl_w8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ uaddlv s2, v2.8h
+ cmp w4, #8
+ add v0.2s, v0.2s, v2.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 4/16/32
+ cmp w4, #32
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h}, [x2], #32
+ addp v0.8h, v2.8h, v3.8h
+ add x2, x2, #2
+ uaddlv s0, v0.8h
+ br x9
+L(ipred_cfl_w16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ addp v2.8h, v2.8h, v3.8h
+ uaddlv s2, v2.8h
+ cmp w4, #16
+ add v0.2s, v0.2s, v2.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 4/8/32
+ tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v0.8h, v2.8h, v4.8h
+ add x2, x2, #2
+ uaddlv s0, v0.8h
+ br x9
+L(ipred_cfl_w32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+ add v0.4s, v0.4s, v16.4s
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v2.8h, v2.8h, v4.8h
+ cmp w4, #32
+ uaddlv s2, v2.8h
+ add v0.2s, v0.2s, v2.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 8/16
+ cmp w4, #8
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_tbl):
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
+endfunc
+
+// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
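+// 4:2:0: each 2x2 luma block is summed and scaled by 2 (3 fractional bits),
+// stored as AC, and accumulated into v24-v27; the rounded average is then
+// subtracted from the whole buffer in the calc_subtract_dc pass, which the
+// wider variants also reuse after scaling the height counter.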
+function ipred_cfl_ac_420_16bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_420_tbl)
+ sub w8, w8, #27
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v24.4s, #0
+ movi v25.4s, #0
+ movi v26.4s, #0
+ movi v27.4s, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_420_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ ld1 {v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v2.8h
+ addp v1.8h, v1.8h, v3.8h
+ add v0.8h, v0.8h, v1.8h
+ shl v0.8h, v0.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h}, [x0], #16
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ b.gt 1b
+ trn2 v1.2d, v0.2d, v0.2d
+ trn2 v0.2d, v0.2d, v0.2d
+L(ipred_cfl_ac_420_w4_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 2b
+3:
+L(ipred_cfl_ac_420_w4_calc_subtract_dc):
+ // Aggregate the sums
+ add v24.4s, v24.4s, v25.4s
+ add v26.4s, v26.4s, v27.4s
+ add v0.4s, v24.4s, v26.4s
+ addv s0, v0.4s // sum
+ sub x0, x0, w6, uxtw #3
+ urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
+ dup v4.8h, v4.h[0]
+6: // Subtract dc from ac
+ ld1 {v0.8h, v1.8h}, [x0]
+ subs w6, w6, #4
+ sub v0.8h, v0.8h, v4.8h
+ sub v1.8h, v1.8h, v4.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 6b
+ ret
+
+L(ipred_cfl_ac_420_w8):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_420_w8_wpad)
+1: // Copy and subsample input, without padding
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ ld1 {v4.8h, v5.8h}, [x1], x2
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v6.8h, v7.8h}, [x10], x2
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ add v0.8h, v0.8h, v2.8h
+ add v4.8h, v4.8h, v6.8h
+ shl v0.8h, v0.8h, #1
+ shl v1.8h, v4.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 1b
+ mov v0.16b, v1.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1: // Copy and subsample input, padding 4
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ ld1 {v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v2.8h
+ addp v1.8h, v1.8h, v3.8h
+ add v0.8h, v0.8h, v1.8h
+ shl v0.8h, v0.8h, #1
+ dup v1.4h, v0.h[3]
+ dup v3.4h, v0.h[7]
+ trn2 v2.2d, v0.2d, v0.2d
+ subs w8, w8, #2
+ st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw v25.4s, v25.4s, v1.4h
+ uaddw v26.4s, v26.4s, v2.4h
+ uaddw v27.4s, v27.4s, v3.4h
+ b.gt 1b
+ trn1 v0.2d, v2.2d, v3.2d
+ trn1 v1.2d, v2.2d, v3.2d
+
+L(ipred_cfl_ac_420_w8_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 2b
+3:
+
+ // Double the height and reuse the w4 summing/subtracting
+ lsl w6, w6, #1
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_w16):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_420_w16_tbl)
+ ldrh w3, [x7, w3, uxtw #1]
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_420_w16_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, without padding
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2
+ add v0.8h, v0.8h, v4.8h
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2
+ add v2.8h, v2.8h, v6.8h
+ addp v16.8h, v16.8h, v17.8h
+ addp v18.8h, v18.8h, v19.8h
+ addp v20.8h, v20.8h, v21.8h
+ addp v22.8h, v22.8h, v23.8h
+ add v16.8h, v16.8h, v20.8h
+ add v18.8h, v18.8h, v22.8h
+ shl v0.8h, v0.8h, #1
+ shl v1.8h, v2.8h, #1
+ shl v2.8h, v16.8h, #1
+ shl v3.8h, v18.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 4
+ ldr q2, [x1, #32]
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ldr q5, [x10, #32]
+ ld1 {v3.8h, v4.8h}, [x10], x2
+ addp v2.8h, v2.8h, v2.8h
+ addp v0.8h, v0.8h, v1.8h
+ addp v5.8h, v5.8h, v5.8h
+ addp v3.8h, v3.8h, v4.8h
+ ldr q18, [x1, #32]
+ add v2.4h, v2.4h, v5.4h
+ ld1 {v16.8h, v17.8h}, [x1], x2
+ add v0.8h, v0.8h, v3.8h
+ ldr q21, [x10, #32]
+ ld1 {v19.8h, v20.8h}, [x10], x2
+ addp v18.8h, v18.8h, v18.8h
+ addp v16.8h, v16.8h, v17.8h
+ addp v21.8h, v21.8h, v21.8h
+ addp v19.8h, v19.8h, v20.8h
+ add v18.4h, v18.4h, v21.4h
+ add v16.8h, v16.8h, v19.8h
+ shl v1.4h, v2.4h, #1
+ shl v0.8h, v0.8h, #1
+ shl v3.4h, v18.4h, #1
+ shl v2.8h, v16.8h, #1
+ dup v4.4h, v1.h[3]
+ dup v5.4h, v3.h[3]
+ trn1 v1.2d, v1.2d, v4.2d
+ trn1 v3.2d, v3.2d, v5.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 8
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ ld1 {v4.8h, v5.8h}, [x1], x2
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v6.8h, v7.8h}, [x10], x2
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ add v0.8h, v0.8h, v2.8h
+ add v4.8h, v4.8h, v6.8h
+ shl v0.8h, v0.8h, #1
+ shl v2.8h, v4.8h, #1
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 12
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v2.8h}, [x10], x2
+ ld1 {v4.8h}, [x1], x2
+ ld1 {v6.8h}, [x10], x2
+ addp v0.8h, v0.8h, v4.8h
+ addp v2.8h, v2.8h, v6.8h
+ add v0.8h, v0.8h, v2.8h
+ shl v0.8h, v0.8h, #1
+ dup v1.8h, v0.h[3]
+ dup v3.8h, v0.h[7]
+ trn2 v2.2d, v0.2d, v3.2d
+ trn1 v0.2d, v0.2d, v1.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+
+L(ipred_cfl_ac_420_w16_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 2b
+3:
+
+ // Quadruple the height and reuse the w4 summing/subtracting
+ lsl w6, w6, #2
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_tbl):
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
+ .hword 0
+
+L(ipred_cfl_ac_420_w16_tbl):
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
+endfunc
+
+// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
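+// 4:2:2 subsamples horizontally only: pairs are summed and scaled by 4
+// (again 3 fractional bits); the vertical-padding and subtract-DC tails of
+// the 4:2:0 code above are reused.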
+function ipred_cfl_ac_422_16bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_422_tbl)
+ sub w8, w8, #27
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v24.4s, #0
+ movi v25.4s, #0
+ movi v26.4s, #0
+ movi v27.4s, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_422_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ ld1 {v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ shl v0.8h, v0.8h, #2
+ shl v1.8h, v2.8h, #2
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 1b
+ trn2 v0.2d, v1.2d, v1.2d
+ trn2 v1.2d, v1.2d, v1.2d
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_422_w8):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_422_w8_wpad)
+1: // Copy and subsample input, without padding
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ ld1 {v4.8h, v5.8h}, [x1], x2
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v6.8h, v7.8h}, [x10], x2
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ shl v0.8h, v0.8h, #2
+ shl v1.8h, v2.8h, #2
+ shl v2.8h, v4.8h, #2
+ shl v3.8h, v6.8h, #2
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1: // Copy and subsample input, padding 4
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ ld1 {v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ shl v0.8h, v0.8h, #2
+ shl v2.8h, v2.8h, #2
+ dup v4.4h, v0.h[3]
+ dup v5.8h, v0.h[7]
+ dup v6.4h, v2.h[3]
+ dup v7.8h, v2.h[7]
+ trn2 v1.2d, v0.2d, v5.2d
+ trn1 v0.2d, v0.2d, v4.2d
+ trn2 v3.2d, v2.2d, v7.2d
+ trn1 v2.2d, v2.2d, v6.2d
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_422_w16_tbl)
+ ldrh w3, [x7, w3, uxtw #1]
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_422_w16_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, without padding
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ shl v0.8h, v0.8h, #2
+ shl v1.8h, v2.8h, #2
+ shl v2.8h, v4.8h, #2
+ shl v3.8h, v6.8h, #2
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 4
+ ldr q2, [x1, #32]
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ldr q6, [x10, #32]
+ ld1 {v4.8h, v5.8h}, [x10], x2
+ addp v2.8h, v2.8h, v2.8h
+ addp v0.8h, v0.8h, v1.8h
+ addp v6.8h, v6.8h, v6.8h
+ addp v4.8h, v4.8h, v5.8h
+ shl v1.4h, v2.4h, #2
+ shl v0.8h, v0.8h, #2
+ shl v3.4h, v6.4h, #2
+ shl v2.8h, v4.8h, #2
+ dup v4.4h, v1.h[3]
+ dup v5.4h, v3.h[3]
+ trn1 v1.2d, v1.2d, v4.2d
+ trn1 v3.2d, v3.2d, v5.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 8
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ shl v0.8h, v0.8h, #2
+ shl v2.8h, v2.8h, #2
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 12
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v2.8h}, [x10], x2
+ addp v0.8h, v0.8h, v0.8h
+ addp v2.8h, v2.8h, v2.8h
+ shl v0.4h, v0.4h, #2
+ shl v2.4h, v2.4h, #2
+ dup v1.8h, v0.h[3]
+ dup v3.8h, v2.h[3]
+ trn1 v0.2d, v0.2d, v1.2d
+ trn1 v2.2d, v2.2d, v3.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_tbl):
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
+ .hword 0
+
+L(ipred_cfl_ac_422_w16_tbl):
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+endfunc
+
+// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
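+// For 4:4:4 input no subsampling is needed; the luma samples are simply scaled
+// by 8 (shl #3) before being accumulated for the DC subtraction.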
+function ipred_cfl_ac_444_16bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_444_tbl)
+ sub w8, w8, #26
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v24.4s, #0
+ movi v25.4s, #0
+ movi v26.4s, #0
+ movi v27.4s, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_444_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input
+ ld1 {v0.4h}, [x1], x2
+ ld1 {v0.d}[1], [x10], x2
+ ld1 {v1.4h}, [x1], x2
+ ld1 {v1.d}[1], [x10], x2
+ shl v0.8h, v0.8h, #3
+ shl v1.8h, v1.8h, #3
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 1b
+ trn2 v0.2d, v1.2d, v1.2d
+ trn2 v1.2d, v1.2d, v1.2d
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_444_w8):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ shl v0.8h, v0.8h, #3
+ ld1 {v3.8h}, [x10], x2
+ shl v1.8h, v1.8h, #3
+ shl v2.8h, v2.8h, #3
+ shl v3.8h, v3.8h, #3
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_444_w16_wpad)
+1: // Copy and expand input, without padding
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ shl v0.8h, v0.8h, #3
+ shl v1.8h, v1.8h, #3
+ shl v2.8h, v2.8h, #3
+ shl v3.8h, v3.8h, #3
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1: // Copy and expand input, padding 8
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v2.8h}, [x10], x2
+ shl v0.8h, v0.8h, #3
+ shl v2.8h, v2.8h, #3
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_444_w32_tbl)
+ ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1
+ lsr x2, x2, #1 // Restore the stride to one line increments
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_444_w32_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, without padding
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2
+ shl v0.8h, v0.8h, #3
+ shl v1.8h, v1.8h, #3
+ shl v2.8h, v2.8h, #3
+ shl v3.8h, v3.8h, #3
+ subs w8, w8, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 8
+ ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2
+ shl v2.8h, v2.8h, #3
+ shl v0.8h, v0.8h, #3
+ shl v1.8h, v1.8h, #3
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 16
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ shl v1.8h, v1.8h, #3
+ shl v0.8h, v0.8h, #3
+ dup v2.8h, v1.h[7]
+ dup v3.8h, v1.h[7]
+ subs w8, w8, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 24
+ ld1 {v0.8h}, [x1], x2
+ shl v0.8h, v0.8h, #3
+ dup v1.8h, v0.h[7]
+ dup v2.8h, v0.h[7]
+ dup v3.8h, v0.h[7]
+ subs w8, w8, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+
+L(ipred_cfl_ac_444_w32_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 2b
+3:
+
+ // Multiply the height by eight and reuse the w4 summing/subtracting
+ lsl w6, w6, #3
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_444_tbl):
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
+
+L(ipred_cfl_ac_444_w32_tbl):
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/itx.S b/third_party/dav1d/src/arm/64/itx.S
new file mode 100644
index 0000000000..b1b2f8fe65
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/itx.S
@@ -0,0 +1,3270 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);
+
+// Most of the functions use the following register layout:
+// x0-x3 external parameters
+// x4 function pointer to first transform
+// x5 function pointer to second transform
+// x6 output parameter for helper function
+// x7 input parameter for helper function
+// x8 input stride for helper function
+// x9-x12 scratch variables for helper functions
+// x13 pointer to list of eob thresholds
+// x14 return pointer for helper function
+// x15 return pointer for main function
+
+// The SIMD registers most often use the following layout:
+// v0-v1 multiplication coefficients
+// v2-v7 scratch registers
+// v8-v15 unused
+// v16-v31 inputs/outputs of transforms
+
+// Potential further optimizations that are left unimplemented for now:
+// - Trying to keep multiplication coefficients in registers across multiple
+// transform functions. (The register layout is designed to potentially
+// allow this.)
+// - Use a simplified version of the transforms themselves for cases where
+// we know a significant number of inputs are zero. E.g. if the eob value
+// indicates only a quarter of input values are set, for idct16 and up,
+// a significant amount of calculation can be skipped, at the cost of more
+// code duplication and special casing.
+
+const idct_coeffs, align=4
+ // idct4
+ .short 2896, 2896*8, 1567, 3784
+ // idct8
+ .short 799, 4017, 3406, 2276
+ // idct16
+ .short 401, 4076, 3166, 2598
+ .short 1931, 3612, 3920, 1189
+ // idct32
+ .short 201, 4091, 3035, 2751
+ .short 1751, 3703, 3857, 1380
+ .short 995, 3973, 3513, 2106
+ .short 2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4
+ .short 101*8, 4095*8, 2967*8, -2824*8
+ .short 1660*8, 3745*8, 3822*8, -1474*8
+ .short 4076, 401, 4017, 799
+ .short 0, 0, 0, 0
+
+ .short 4036*8, -700*8, 2359*8, 3349*8
+ .short 3461*8, -2191*8, 897*8, 3996*8
+ .short -3166, -2598, -799, -4017
+ .short 0, 0, 0, 0
+
+ .short 501*8, 4065*8, 3229*8, -2520*8
+ .short 2019*8, 3564*8, 3948*8, -1092*8
+ .short 3612, 1931, 2276, 3406
+ .short 0, 0, 0, 0
+
+ .short 4085*8, -301*8, 2675*8, 3102*8
+ .short 3659*8, -1842*8, 1285*8, 3889*8
+ .short -3920, -1189, -3406, -2276
+ .short 0, 0, 0, 0
+endconst
+
+const iadst4_coeffs, align=4
+ // .h[4-5] can be interpreted as .s[2]
+ .short 1321, 3803, 2482, 3344, 3344, 0
+endconst
+
+const iadst8_coeffs, align=4
+ .short 4076, 401, 3612, 1931
+ .short 2598, 3166, 1189, 3920
+ // idct_coeffs
+ .short 2896, 0, 1567, 3784, 0, 0, 0, 0
+endconst
+
+const iadst16_coeffs, align=4
+ .short 4091, 201, 3973, 995
+ .short 3703, 1751, 3290, 2440
+ .short 2751, 3035, 2106, 3513
+ .short 1380, 3857, 601, 4052
+endconst
+
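+// Widening multiply-accumulate helpers: compute \s0*\c0 +/- \s1*\c1 into the
+// 32-bit accumulators \d0 (and \d1 when \sz is .8h).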
+.macro smull_smlal d0, d1, s0, s1, c0, c1, sz
+ smull \d0\().4s, \s0\().4h, \c0
+ smlal \d0\().4s, \s1\().4h, \c1
+.ifc \sz, .8h
+ smull2 \d1\().4s, \s0\().8h, \c0
+ smlal2 \d1\().4s, \s1\().8h, \c1
+.endif
+.endm
+
+.macro smull_smlsl d0, d1, s0, s1, c0, c1, sz
+ smull \d0\().4s, \s0\().4h, \c0
+ smlsl \d0\().4s, \s1\().4h, \c1
+.ifc \sz, .8h
+ smull2 \d1\().4s, \s0\().8h, \c0
+ smlsl2 \d1\().4s, \s1\().8h, \c1
+.endif
+.endm
+
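+// Narrow the 32-bit accumulators back to 16 bit with a rounding right shift.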
+.macro sqrshrn_sz d0, s0, s1, shift, sz
+ sqrshrn \d0\().4h, \s0\().4s, \shift
+.ifc \sz, .8h
+ sqrshrn2 \d0\().8h, \s1\().4s, \shift
+.endif
+.endm
+
+.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7
+ sqrdmulh \r0\sz, \r0\sz, \c
+ sqrdmulh \r1\sz, \r1\sz, \c
+ sqrdmulh \r2\sz, \r2\sz, \c
+ sqrdmulh \r3\sz, \r3\sz, \c
+.ifnb \r4
+ sqrdmulh \r4\sz, \r4\sz, \c
+ sqrdmulh \r5\sz, \r5\sz, \c
+ sqrdmulh \r6\sz, \r6\sz, \c
+ sqrdmulh \r7\sz, \r7\sz, \c
+.endif
+.endm
+
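+// One stage group of the final add-to-destination loop (load, round, widen+add,
+// narrow, store); empty arguments skip a stage, which lets the load_add_store_*
+// macros below software-pipeline consecutive rows.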
+.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4
+.ifnb \load
+ ld1 {\load}, [\src], x1
+.endif
+.ifnb \shift
+ srshr \shift, \shift, #\shiftbits
+.endif
+.ifnb \addsrc
+ uaddw \adddst, \adddst, \addsrc
+.endif
+.ifnb \narrowsrc
+ sqxtun \narrowdst, \narrowsrc
+.endif
+.ifnb \store
+ st1 {\store}, [\dst], x1
+.endif
+.endm
+.macro load_add_store_8x16 dst, src
+ mov \src, \dst
+ load_add_store v2.8b, v16.8h, , , , , , \dst, \src
+ load_add_store v3.8b, v17.8h, , , , , , \dst, \src
+ load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src
+ load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src
+ load_add_store v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src
+ load_add_store v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src
+ load_add_store v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src
+ load_add_store v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src
+ load_add_store v4.8b, v24.8h, v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src
+ load_add_store v5.8b, v25.8h, v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src
+ load_add_store v6.8b, v26.8h, v4.8b, v24.8h, v23.8h, v3.8b, v2.8b, \dst, \src
+ load_add_store v7.8b, v27.8h, v5.8b, v25.8h, v24.8h, v4.8b, v3.8b, \dst, \src
+ load_add_store v2.8b, v28.8h, v6.8b, v26.8h, v25.8h, v5.8b, v4.8b, \dst, \src
+ load_add_store v3.8b, v29.8h, v7.8b, v27.8h, v26.8h, v6.8b, v5.8b, \dst, \src
+ load_add_store v4.8b, v30.8h, v2.8b, v28.8h, v27.8h, v7.8b, v6.8b, \dst, \src
+ load_add_store v5.8b, v31.8h, v3.8b, v29.8h, v28.8h, v2.8b, v7.8b, \dst, \src
+ load_add_store , , v4.8b, v30.8h, v29.8h, v3.8b, v2.8b, \dst, \src
+ load_add_store , , v5.8b, v31.8h, v30.8h, v4.8b, v3.8b, \dst, \src
+ load_add_store , , , , v31.8h, v5.8b, v4.8b, \dst, \src
+ load_add_store , , , , , , v5.8b, \dst, \src
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4
+ mov \src, \dst
+ load_add_store v2.8b, v16.8h, , , , , , \dst, \src, \shiftbits
+ load_add_store v3.8b, v17.8h, , , , , , \dst, \src, \shiftbits
+ load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src, \shiftbits
+ load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src, \shiftbits
+ load_add_store v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src, \shiftbits
+ load_add_store v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src, \shiftbits
+ load_add_store v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src, \shiftbits
+ load_add_store v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src, \shiftbits
+ load_add_store , , v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src, \shiftbits
+ load_add_store , , v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src, \shiftbits
+ load_add_store , , , , v23.8h, v3.8b, v2.8b, \dst, \src, \shiftbits
+ load_add_store , , , , , , v3.8b, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src
+ mov \src, \dst
+ load_add_store v2.8b, v16.8h, , , , , , \dst, \src
+ load_add_store v3.8b, v17.8h, , , , , , \dst, \src
+ load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src
+ load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src
+ load_add_store , , v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src
+ load_add_store , , v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src
+ load_add_store , , , , v19.8h, v5.8b, v4.8b, \dst, \src
+ load_add_store , , , , , , v5.8b, \dst, \src
+.endm
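+// 4-pixel-wide variant, handling two rows per vector via 32-bit lane loads/stores.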
+.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src
+.ifnb \load
+ ld1 {\load}[0], [\src], x1
+.endif
+.ifnb \inssrc
+ ins \insdst\().d[1], \inssrc\().d[0]
+.endif
+.ifnb \shift
+ srshr \shift, \shift, #4
+.endif
+.ifnb \load
+ ld1 {\load}[1], [\src], x1
+.endif
+.ifnb \addsrc
+ uaddw \adddst, \adddst, \addsrc
+.endif
+.ifnb \store
+ st1 {\store}[0], [\dst], x1
+.endif
+.ifnb \narrowsrc
+ sqxtun \narrowdst, \narrowsrc
+.endif
+.ifnb \store
+ st1 {\store}[1], [\dst], x1
+.endif
+.endm
+.macro load_add_store_4x16 dst, src
+ mov \src, \dst
+ load_add_store4 v0.s, v17, v16, , , , , , , \dst, \src
+ load_add_store4 v1.s, v19, v18, , , , , , , \dst, \src
+ load_add_store4 v2.s, v21, v20, v16.8h, , , , , , \dst, \src
+ load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h, , , , \dst, \src
+ load_add_store4 v4.s, v25, v24, v20.8h, v1.8b, v18.8h, v16.8h, v0.8b, , \dst, \src
+ load_add_store4 v5.s, v27, v26, v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src
+ load_add_store4 v6.s, v29, v28, v24.8h, v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src
+ load_add_store4 v7.s, v31, v30, v26.8h, v4.8b, v24.8h, v22.8h, v3.8b, v2.s, \dst, \src
+ load_add_store4 , , , v28.8h, v5.8b, v26.8h, v24.8h, v4.8b, v3.s, \dst, \src
+ load_add_store4 , , , v30.8h, v6.8b, v28.8h, v26.8h, v5.8b, v4.s, \dst, \src
+ load_add_store4 , , , , v7.8b, v30.8h, v28.8h, v6.8b, v5.s, \dst, \src
+ load_add_store4 , , , , , , v30.8h, v7.8b, v6.s, \dst, \src
+ load_add_store4 , , , , , , , , v7.s, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src
+ mov \src, \dst
+ load_add_store4 v0.s, v17, v16, , , , , , , \dst, \src
+ load_add_store4 v1.s, v19, v18, , , , , , , \dst, \src
+ load_add_store4 v2.s, v21, v20, v16.8h, , , , , , \dst, \src
+ load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h, , , , \dst, \src
+ load_add_store4 , , , v20.8h, v1.8b, v18.8h, v16.8h, v0.8b, , \dst, \src
+ load_add_store4 , , , v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src
+ load_add_store4 , , , , v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src
+ load_add_store4 , , , , , , v22.8h, v3.8b, v2.s, \dst, \src
+ load_add_store4 , , , , , , , , v3.s, \dst, \src
+.endm
+
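+// DC-only fast path, taken when eob (w3) is zero: load the single DC coefficient,
+// clear it in memory, scale it by 2896/4096 (1/sqrt(2)) once per pass plus once
+// more for rectangular blocks, apply the inter-pass and final downshifts and
+// branch to a width-specific add-and-store loop with the block height in w4.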
+.macro idct_dc w, h, shift
+ cbnz w3, 1f
+ mov w16, #2896*8
+ ld1r {v16.8h}, [x2]
+ dup v0.4h, w16
+ sqrdmulh v16.8h, v16.8h, v0.h[0]
+ strh wzr, [x2]
+.if (\w == 2*\h) || (2*\w == \h)
+ sqrdmulh v16.8h, v16.8h, v0.h[0]
+.endif
+.if \shift > 0
+ srshr v16.8h, v16.8h, #\shift
+.endif
+ sqrdmulh v16.8h, v16.8h, v0.h[0]
+ srshr v16.8h, v16.8h, #4
+ mov w4, #\h
+ b idct_dc_w\w\()_neon
+1:
+.endm
+
+function idct_dc_w4_neon
+1:
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v0.s}[1], [x0], x1
+ ld1 {v1.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x0], x1
+ subs w4, w4, #4
+ sub x0, x0, x1, lsl #2
+ uaddw v0.8h, v16.8h, v0.8b
+ sqxtun v0.8b, v0.8h
+ uaddw v1.8h, v16.8h, v1.8b
+ st1 {v0.s}[0], [x0], x1
+ sqxtun v1.8b, v1.8h
+ st1 {v0.s}[1], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ st1 {v1.s}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w8_neon
+1:
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x0], x1
+ ld1 {v2.8b}, [x0], x1
+ uaddw v20.8h, v16.8h, v0.8b
+ ld1 {v3.8b}, [x0], x1
+ sub x0, x0, x1, lsl #2
+ subs w4, w4, #4
+ uaddw v21.8h, v16.8h, v1.8b
+ sqxtun v0.8b, v20.8h
+ uaddw v22.8h, v16.8h, v2.8b
+ sqxtun v1.8b, v21.8h
+ uaddw v23.8h, v16.8h, v3.8b
+ st1 {v0.8b}, [x0], x1
+ sqxtun v2.8b, v22.8h
+ st1 {v1.8b}, [x0], x1
+ sqxtun v3.8b, v23.8h
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w16_neon
+1:
+ ld1 {v0.16b}, [x0], x1
+ ld1 {v1.16b}, [x0], x1
+ ld1 {v2.16b}, [x0], x1
+ subs w4, w4, #4
+ uaddw v20.8h, v16.8h, v0.8b
+ uaddw2 v21.8h, v16.8h, v0.16b
+ ld1 {v3.16b}, [x0], x1
+ uaddw v22.8h, v16.8h, v1.8b
+ uaddw2 v23.8h, v16.8h, v1.16b
+ sub x0, x0, x1, lsl #2
+ uaddw v24.8h, v16.8h, v2.8b
+ uaddw2 v25.8h, v16.8h, v2.16b
+ sqxtun v0.8b, v20.8h
+ sqxtun2 v0.16b, v21.8h
+ uaddw v26.8h, v16.8h, v3.8b
+ uaddw2 v27.8h, v16.8h, v3.16b
+ sqxtun v1.8b, v22.8h
+ sqxtun2 v1.16b, v23.8h
+ sqxtun v2.8b, v24.8h
+ sqxtun2 v2.16b, v25.8h
+ st1 {v0.16b}, [x0], x1
+ sqxtun v3.8b, v26.8h
+ sqxtun2 v3.16b, v27.8h
+ st1 {v1.16b}, [x0], x1
+ st1 {v2.16b}, [x0], x1
+ st1 {v3.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w32_neon
+1:
+ ld1 {v0.16b, v1.16b}, [x0], x1
+ subs w4, w4, #2
+ uaddw v20.8h, v16.8h, v0.8b
+ uaddw2 v21.8h, v16.8h, v0.16b
+ ld1 {v2.16b, v3.16b}, [x0]
+ uaddw v22.8h, v16.8h, v1.8b
+ uaddw2 v23.8h, v16.8h, v1.16b
+ sub x0, x0, x1
+ uaddw v24.8h, v16.8h, v2.8b
+ uaddw2 v25.8h, v16.8h, v2.16b
+ sqxtun v0.8b, v20.8h
+ sqxtun2 v0.16b, v21.8h
+ uaddw v26.8h, v16.8h, v3.8b
+ uaddw2 v27.8h, v16.8h, v3.16b
+ sqxtun v1.8b, v22.8h
+ sqxtun2 v1.16b, v23.8h
+ sqxtun v2.8b, v24.8h
+ sqxtun2 v2.16b, v25.8h
+ st1 {v0.16b, v1.16b}, [x0], x1
+ sqxtun v3.8b, v26.8h
+ sqxtun2 v3.16b, v27.8h
+ st1 {v2.16b, v3.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w64_neon
+1:
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
+ subs w4, w4, #1
+ uaddw v20.8h, v16.8h, v0.8b
+ uaddw2 v21.8h, v16.8h, v0.16b
+ uaddw v22.8h, v16.8h, v1.8b
+ uaddw2 v23.8h, v16.8h, v1.16b
+ uaddw v24.8h, v16.8h, v2.8b
+ uaddw2 v25.8h, v16.8h, v2.16b
+ sqxtun v0.8b, v20.8h
+ sqxtun2 v0.16b, v21.8h
+ uaddw v26.8h, v16.8h, v3.8b
+ uaddw2 v27.8h, v16.8h, v3.16b
+ sqxtun v1.8b, v22.8h
+ sqxtun2 v1.16b, v23.8h
+ sqxtun v2.8b, v24.8h
+ sqxtun2 v2.16b, v25.8h
+ sqxtun v3.8b, v26.8h
+ sqxtun2 v3.16b, v27.8h
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
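+// 4-point inverse Walsh-Hadamard transform (lossless mode) on v16-v19.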
+.macro iwht4
+ add v16.4h, v16.4h, v17.4h
+ sub v21.4h, v18.4h, v19.4h
+ sub v20.4h, v16.4h, v21.4h
+ sshr v20.4h, v20.4h, #1
+ sub v18.4h, v20.4h, v17.4h
+ sub v17.4h, v20.4h, v19.4h
+ add v19.4h, v21.4h, v18.4h
+ sub v16.4h, v16.4h, v17.4h
+.endm
+
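+// 4-point inverse DCT on \r0-\r3; \sz selects 4 or 8 lanes per register.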
+.macro idct_4 r0, r1, r2, r3, sz
+ smull_smlal v6, v7, \r1, \r3, v0.h[3], v0.h[2], \sz
+ smull_smlsl v4, v5, \r1, \r3, v0.h[2], v0.h[3], \sz
+ smull_smlal v2, v3, \r0, \r2, v0.h[0], v0.h[0], \sz
+ sqrshrn_sz v6, v6, v7, #12, \sz
+ sqrshrn_sz v7, v4, v5, #12, \sz
+ smull_smlsl v4, v5, \r0, \r2, v0.h[0], v0.h[0], \sz
+ sqrshrn_sz v2, v2, v3, #12, \sz
+ sqrshrn_sz v3, v4, v5, #12, \sz
+ sqadd \r0\sz, v2\sz, v6\sz
+ sqsub \r3\sz, v2\sz, v6\sz
+ sqadd \r1\sz, v3\sz, v7\sz
+ sqsub \r2\sz, v3\sz, v7\sz
+.endm
+
+function inv_dct_4h_x4_neon, export=1
+ movrel x16, idct_coeffs
+ ld1 {v0.4h}, [x16]
+ idct_4 v16, v17, v18, v19, .4h
+ ret
+endfunc
+
+function inv_dct_8h_x4_neon, export=1
+ movrel x16, idct_coeffs
+ ld1 {v0.4h}, [x16]
+ idct_4 v16, v17, v18, v19, .8h
+ ret
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3
+ movrel x16, iadst4_coeffs
+ ld1 {v0.8h}, [x16]
+
+ ssubl v3.4s, v16.4h, v18.4h
+ smull v4.4s, v16.4h, v0.h[0]
+ smlal v4.4s, v18.4h, v0.h[1]
+ smlal v4.4s, v19.4h, v0.h[2]
+ smull v7.4s, v17.4h, v0.h[3]
+ saddw v3.4s, v3.4s, v19.4h
+ smull v5.4s, v16.4h, v0.h[2]
+ smlsl v5.4s, v18.4h, v0.h[0]
+ smlsl v5.4s, v19.4h, v0.h[1]
+
+ add \o3\().4s, v4.4s, v5.4s
+ mul \o2\().4s, v3.4s, v0.s[2]
+ add \o0\().4s, v4.4s, v7.4s
+ add \o1\().4s, v5.4s, v7.4s
+ sub \o3\().4s, \o3\().4s, v7.4s
+
+ sqrshrn \o0\().4h, \o0\().4s, #12
+ sqrshrn \o2\().4h, \o2\().4s, #12
+ sqrshrn \o1\().4h, \o1\().4s, #12
+ sqrshrn \o3\().4h, \o3\().4s, #12
+.endm
+
+function inv_adst_4h_x4_neon, export=1
+ iadst_4x4 v16, v17, v18, v19
+ ret
+endfunc
+
+function inv_flipadst_4h_x4_neon, export=1
+ iadst_4x4 v19, v18, v17, v16
+ ret
+endfunc
+
+.macro iadst_8x4 o0, o1, o2, o3
+ movrel x16, iadst4_coeffs
+ ld1 {v0.8h}, [x16]
+
+ ssubl v2.4s, v16.4h, v18.4h
+ ssubl2 v3.4s, v16.8h, v18.8h
+ smull v4.4s, v16.4h, v0.h[0]
+ smlal v4.4s, v18.4h, v0.h[1]
+ smlal v4.4s, v19.4h, v0.h[2]
+ smull2 v5.4s, v16.8h, v0.h[0]
+ smlal2 v5.4s, v18.8h, v0.h[1]
+ smlal2 v5.4s, v19.8h, v0.h[2]
+ saddw v2.4s, v2.4s, v19.4h
+ saddw2 v3.4s, v3.4s, v19.8h
+ smull v6.4s, v16.4h, v0.h[2]
+ smlsl v6.4s, v18.4h, v0.h[0]
+ smlsl v6.4s, v19.4h, v0.h[1]
+ smull2 v7.4s, v16.8h, v0.h[2]
+ smlsl2 v7.4s, v18.8h, v0.h[0]
+ smlsl2 v7.4s, v19.8h, v0.h[1]
+
+ mul v18.4s, v2.4s, v0.s[2]
+ mul v19.4s, v3.4s, v0.s[2]
+
+ smull v2.4s, v17.4h, v0.h[3]
+ smull2 v3.4s, v17.8h, v0.h[3]
+
+ add v16.4s, v4.4s, v2.4s // out0
+ add v17.4s, v5.4s, v3.4s
+
+ add v4.4s, v4.4s, v6.4s // out3
+ add v5.4s, v5.4s, v7.4s
+
+ add v6.4s, v6.4s, v2.4s // out1
+ add v7.4s, v7.4s, v3.4s
+
+ sub v4.4s, v4.4s, v2.4s // out3
+ sub v5.4s, v5.4s, v3.4s
+
+ sqrshrn v18.4h, v18.4s, #12
+ sqrshrn2 v18.8h, v19.4s, #12
+
+ sqrshrn \o0\().4h, v16.4s, #12
+ sqrshrn2 \o0\().8h, v17.4s, #12
+
+.ifc \o2, v17
+ mov v17.16b, v18.16b
+.endif
+
+ sqrshrn \o1\().4h, v6.4s, #12
+ sqrshrn2 \o1\().8h, v7.4s, #12
+
+ sqrshrn \o3\().4h, v4.4s, #12
+ sqrshrn2 \o3\().8h, v5.4s, #12
+.endm
+
+function inv_adst_8h_x4_neon, export=1
+ iadst_8x4 v16, v17, v18, v19
+ ret
+endfunc
+
+function inv_flipadst_8h_x4_neon, export=1
+ iadst_8x4 v19, v18, v17, v16
+ ret
+endfunc
+
+function inv_identity_4h_x4_neon, export=1
+ mov w16, #(5793-4096)*8
+ dup v0.4h, w16
+ sqrdmulh v4.4h, v16.4h, v0.h[0]
+ sqrdmulh v5.4h, v17.4h, v0.h[0]
+ sqrdmulh v6.4h, v18.4h, v0.h[0]
+ sqrdmulh v7.4h, v19.4h, v0.h[0]
+ sqadd v16.4h, v16.4h, v4.4h
+ sqadd v17.4h, v17.4h, v5.4h
+ sqadd v18.4h, v18.4h, v6.4h
+ sqadd v19.4h, v19.4h, v7.4h
+ ret
+endfunc
+
+function inv_identity_8h_x4_neon, export=1
+ mov w16, #(5793-4096)*8
+ dup v0.4h, w16
+ sqrdmulh v4.8h, v16.8h, v0.h[0]
+ sqrdmulh v5.8h, v17.8h, v0.h[0]
+ sqrdmulh v6.8h, v18.8h, v0.h[0]
+ sqrdmulh v7.8h, v19.8h, v0.h[0]
+ sqadd v16.8h, v16.8h, v4.8h
+ sqadd v17.8h, v17.8h, v5.8h
+ sqadd v18.8h, v18.8h, v6.8h
+ sqadd v19.8h, v19.8h, v7.8h
+ ret
+endfunc
+
+.macro identity_8x4_shift1 r0, r1, r2, r3, c
+.irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h
+ sqrdmulh v2.8h, \i, \c
+ srhadd \i, \i, v2.8h
+.endr
+.endm
+
+function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1
+ mov x15, x30
+ movi v31.8h, #0
+ ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
+ st1 {v31.8h}, [x2], #16
+
+ sshr v16.4h, v16.4h, #2
+ sshr v17.4h, v17.4h, #2
+ sshr v18.4h, v18.4h, #2
+ sshr v19.4h, v19.4h, #2
+
+ iwht4
+
+ st1 {v31.8h}, [x2], #16
+ transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23
+
+ iwht4
+
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v0.s}[1], [x0], x1
+ ins v16.d[1], v17.d[0]
+ ins v18.d[1], v19.d[0]
+ ld1 {v1.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x0], x1
+
+ b L(itx_4x4_end)
+endfunc
+
+function inv_txfm_add_4x4_neon
+ movi v31.8h, #0
+ ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
+ st1 {v31.8h}, [x2], #16
+
+ blr x4
+
+ st1 {v31.8h}, [x2], #16
+ transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23
+
+ blr x5
+
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v0.s}[1], [x0], x1
+ ins v16.d[1], v17.d[0]
+ ins v18.d[1], v19.d[0]
+ ld1 {v1.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x0], x1
+ srshr v16.8h, v16.8h, #4
+ srshr v18.8h, v18.8h, #4
+
+L(itx_4x4_end):
+ sub x0, x0, x1, lsl #2
+ uaddw v16.8h, v16.8h, v0.8b
+ sqxtun v0.8b, v16.8h
+ uaddw v18.8h, v18.8h, v1.8b
+ st1 {v0.s}[0], [x0], x1
+ sqxtun v1.8b, v18.8h
+ st1 {v0.s}[1], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ st1 {v1.s}[1], [x0], x1
+
+ ret x15
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ cbnz w3, 1f
+ mov w16, #2896*8
+ ld1r {v16.8h}, [x2]
+ dup v4.8h, w16
+ strh wzr, [x2]
+ sqrdmulh v16.8h, v16.8h, v4.h[0]
+ ld1 {v0.s}[0], [x0], x1
+ sqrdmulh v20.8h, v16.8h, v4.h[0]
+ ld1 {v0.s}[1], [x0], x1
+ srshr v16.8h, v20.8h, #4
+ ld1 {v1.s}[0], [x0], x1
+ srshr v18.8h, v20.8h, #4
+ ld1 {v1.s}[1], [x0], x1
+ b L(itx_4x4_end)
+1:
+.endif
+ adr x4, inv_\txfm1\()_4h_x4_neon
+ adr x5, inv_\txfm2\()_4h_x4_neon
+ b inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
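+// 8-point inverse DCT: the even half reuses idct_4, the odd half uses the idct8
+// coefficients (799/4017 and 3406/2276) followed by the usual butterflies.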
+.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7, sz, szb
+ idct_4 \r0, \r2, \r4, \r6, \sz
+
+ smull_smlsl v2, v3, \r1, \r7, v0.h[4], v0.h[5], \sz // -> t4a
+ smull_smlal v4, v5, \r1, \r7, v0.h[5], v0.h[4], \sz // -> t7a
+ smull_smlsl v6, v7, \r5, \r3, v0.h[6], v0.h[7], \sz // -> t5a
+ sqrshrn_sz \r1, v2, v3, #12, \sz // t4a
+ sqrshrn_sz \r7, v4, v5, #12, \sz // t7a
+ smull_smlal v2, v3, \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a
+ sqrshrn_sz \r3, v6, v7, #12, \sz // t5a
+ sqrshrn_sz \r5, v2, v3, #12, \sz // t6a
+
+ sqadd v2\sz, \r1\sz, \r3\sz // t4
+ sqsub \r1\sz, \r1\sz, \r3\sz // t5a
+ sqadd v3\sz, \r7\sz, \r5\sz // t7
+ sqsub \r3\sz, \r7\sz, \r5\sz // t6a
+
+ smull_smlsl v4, v5, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5
+ smull_smlal v6, v7, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6
+ sqrshrn_sz v4, v4, v5, #12, \sz // t5
+ sqrshrn_sz v5, v6, v7, #12, \sz // t6
+
+ sqsub \r7\sz, \r0\sz, v3\sz // out7
+ sqadd \r0\sz, \r0\sz, v3\sz // out0
+ sqadd \r1\sz, \r2\sz, v5\sz // out1
+ sqsub v6\sz, \r2\sz, v5\sz // out6
+ sqadd \r2\sz, \r4\sz, v4\sz // out2
+ sqsub \r5\sz, \r4\sz, v4\sz // out5
+ sqadd \r3\sz, \r6\sz, v2\sz // out3
+ sqsub \r4\sz, \r6\sz, v2\sz // out4
+ mov \r6\szb, v6\szb // out6
+.endm
+
+function inv_dct_8h_x8_neon, export=1
+ movrel x16, idct_coeffs
+ ld1 {v0.8h}, [x16]
+ idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b
+ ret
+endfunc
+
+function inv_dct_4h_x8_neon, export=1
+ movrel x16, idct_coeffs
+ ld1 {v0.8h}, [x16]
+ idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b
+ ret
+endfunc
+
+.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7, sz
+ movrel x16, iadst8_coeffs
+ ld1 {v0.8h, v1.8h}, [x16]
+
+ smull_smlal v2, v3, v23, v16, v0.h[0], v0.h[1], \sz
+ smull_smlsl v4, v5, v23, v16, v0.h[1], v0.h[0], \sz
+ smull_smlal v6, v7, v21, v18, v0.h[2], v0.h[3], \sz
+ sqrshrn_sz v16, v2, v3, #12, \sz // t0a
+ sqrshrn_sz v23, v4, v5, #12, \sz // t1a
+ smull_smlsl v2, v3, v21, v18, v0.h[3], v0.h[2], \sz
+ smull_smlal v4, v5, v19, v20, v0.h[4], v0.h[5], \sz
+ sqrshrn_sz v18, v6, v7, #12, \sz // t2a
+ sqrshrn_sz v21, v2, v3, #12, \sz // t3a
+ smull_smlsl v6, v7, v19, v20, v0.h[5], v0.h[4], \sz
+ smull_smlal v2, v3, v17, v22, v0.h[6], v0.h[7], \sz
+ sqrshrn_sz v20, v4, v5, #12, \sz // t4a
+ sqrshrn_sz v19, v6, v7, #12, \sz // t5a
+ smull_smlsl v4, v5, v17, v22, v0.h[7], v0.h[6], \sz
+ sqrshrn_sz v22, v2, v3, #12, \sz // t6a
+ sqrshrn_sz v17, v4, v5, #12, \sz // t7a
+
+ sqadd v2\sz, v16\sz, v20\sz // t0
+ sqsub v3\sz, v16\sz, v20\sz // t4
+ sqadd v4\sz, v23\sz, v19\sz // t1
+ sqsub v5\sz, v23\sz, v19\sz // t5
+ sqadd v6\sz, v18\sz, v22\sz // t2
+ sqsub v7\sz, v18\sz, v22\sz // t6
+ sqadd v18\sz, v21\sz, v17\sz // t3
+ sqsub v19\sz, v21\sz, v17\sz // t7
+
+ smull_smlal v16, v17, v3, v5, v1.h[3], v1.h[2], \sz
+ smull_smlsl v20, v21, v3, v5, v1.h[2], v1.h[3], \sz
+ smull_smlsl v22, v23, v19, v7, v1.h[3], v1.h[2], \sz
+
+ sqrshrn_sz v3, v16, v17, #12, \sz // t4a
+ sqrshrn_sz v5, v20, v21, #12, \sz // t5a
+
+ smull_smlal v16, v17, v19, v7, v1.h[2], v1.h[3], \sz
+
+ sqrshrn_sz v7, v22, v23, #12, \sz // t6a
+ sqrshrn_sz v19, v16, v17, #12, \sz // t7a
+
+ sqadd \o0\()\sz, v2\sz, v6\sz // out0
+ sqsub v2\sz, v2\sz, v6\sz // t2
+ sqadd \o7\()\sz, v4\sz, v18\sz // out7
+ sqsub v4\sz, v4\sz, v18\sz // t3
+ sqneg \o7\()\sz, \o7\()\sz // out7
+
+ sqadd \o1\()\sz, v3\sz, v7\sz // out1
+ sqsub v3\sz, v3\sz, v7\sz // t6
+ sqadd \o6\()\sz, v5\sz, v19\sz // out6
+ sqsub v5\sz, v5\sz, v19\sz // t7
+ sqneg \o1\()\sz, \o1\()\sz // out1
+
+ smull_smlal v18, v19, v2, v4, v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20)
+ smull_smlsl v6, v7, v2, v4, v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19)
+ smull_smlsl v20, v21, v3, v5, v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18)
+ sqrshrn_sz v2, v18, v19, #12, \sz // out3
+ smull_smlal v18, v19, v3, v5, v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21)
+ sqrshrn_sz v3, v20, v21, #12, \sz // out5
+ sqrshrn_sz \o2, v18, v19, #12, \sz // out2 (v18 or v21)
+ sqrshrn_sz \o4, v6, v7, #12, \sz // out4 (v20 or v19)
+
+ sqneg \o3\()\sz, v2\sz // out3
+ sqneg \o5\()\sz, v3\sz // out5
+.endm
+
+function inv_adst_8h_x8_neon, export=1
+ iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h
+ ret
+endfunc
+
+function inv_flipadst_8h_x8_neon, export=1
+ iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .8h
+ ret
+endfunc
+
+function inv_adst_4h_x8_neon, export=1
+ iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h
+ ret
+endfunc
+
+function inv_flipadst_4h_x8_neon, export=1
+ iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .4h
+ ret
+endfunc
+
+function inv_identity_8h_x8_neon, export=1
+ sqshl v16.8h, v16.8h, #1
+ sqshl v17.8h, v17.8h, #1
+ sqshl v18.8h, v18.8h, #1
+ sqshl v19.8h, v19.8h, #1
+ sqshl v20.8h, v20.8h, #1
+ sqshl v21.8h, v21.8h, #1
+ sqshl v22.8h, v22.8h, #1
+ sqshl v23.8h, v23.8h, #1
+ ret
+endfunc
+
+function inv_identity_4h_x8_neon, export=1
+ sqshl v16.4h, v16.4h, #1
+ sqshl v17.4h, v17.4h, #1
+ sqshl v18.4h, v18.4h, #1
+ sqshl v19.4h, v19.4h, #1
+ sqshl v20.4h, v20.4h, #1
+ sqshl v21.4h, v21.4h, #1
+ sqshl v22.4h, v22.4h, #1
+ sqshl v23.4h, v23.4h, #1
+ ret
+endfunc
+
+.macro def_fn_8x8_base variant
+function inv_txfm_\variant\()add_8x8_neon
+ movi v28.8h, #0
+ movi v29.8h, #0
+ movi v30.8h, #0
+ movi v31.8h, #0
+ ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2]
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2], #64
+ ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2]
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]
+
+.ifc \variant, identity_
+ // The identity shl #1 and downshift srshr #1 cancel out
+.else
+ blr x4
+
+ srshr v16.8h, v16.8h, #1
+ srshr v17.8h, v17.8h, #1
+ srshr v18.8h, v18.8h, #1
+ srshr v19.8h, v19.8h, #1
+ srshr v20.8h, v20.8h, #1
+ srshr v21.8h, v21.8h, #1
+ srshr v22.8h, v22.8h, #1
+ srshr v23.8h, v23.8h, #1
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+
+ blr x5
+
+ load_add_store_8x8 x0, x7
+ ret x15
+endfunc
+.endm
+
+def_fn_8x8_base
+def_fn_8x8_base identity_
+
+.macro def_fn_8x8 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 8, 8, 1
+.endif
+ adr x5, inv_\txfm2\()_8h_x8_neon
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_8x8_neon
+.else
+ adr x4, inv_\txfm1\()_8h_x8_neon
+ b inv_txfm_add_8x8_neon
+.endif
+endfunc
+.endm
+
+def_fn_8x8 dct, dct
+def_fn_8x8 identity, identity
+def_fn_8x8 dct, adst
+def_fn_8x8 dct, flipadst
+def_fn_8x8 dct, identity
+def_fn_8x8 adst, dct
+def_fn_8x8 adst, adst
+def_fn_8x8 adst, flipadst
+def_fn_8x8 flipadst, dct
+def_fn_8x8 flipadst, adst
+def_fn_8x8 flipadst, flipadst
+def_fn_8x8 identity, dct
+def_fn_8x8 adst, identity
+def_fn_8x8 flipadst, identity
+def_fn_8x8 identity, adst
+def_fn_8x8 identity, flipadst
+
+function inv_txfm_add_8x4_neon
+ movi v30.8h, #0
+ movi v31.8h, #0
+ mov w16, #2896*8
+ dup v0.4h, w16
+ ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
+ st1 {v30.8h,v31.8h}, [x2], #32
+ ld1 {v20.4h,v21.4h,v22.4h,v23.4h}, [x2]
+ st1 {v30.8h,v31.8h}, [x2]
+
+ scale_input .4h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ blr x4
+
+ transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+ ins v16.d[1], v20.d[0]
+ ins v17.d[1], v21.d[0]
+ ins v18.d[1], v22.d[0]
+ ins v19.d[1], v23.d[0]
+
+ blr x5
+
+ load_add_store_8x4 x0, x7
+ ret x15
+endfunc
+
+function inv_txfm_add_4x8_neon
+ movi v28.8h, #0
+ movi v29.8h, #0
+ movi v30.8h, #0
+ movi v31.8h, #0
+ mov w16, #2896*8
+ dup v0.4h, w16
+ ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2]
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]
+
+ scale_input .8h, v0.h[0], v16, v17, v18, v19
+
+ blr x4
+
+ transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
+ ins v20.d[0], v16.d[1]
+ ins v21.d[0], v17.d[1]
+ ins v22.d[0], v18.d[1]
+ ins v23.d[0], v19.d[1]
+
+ blr x5
+
+ load_add_store_4x8 x0, x7
+ ret x15
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 0
+.endif
+ adr x4, inv_\txfm1\()_\h\()h_x\w\()_neon
+ adr x5, inv_\txfm2\()_\w\()h_x\h\()_neon
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct
+def_fn_48 \w, \h, identity, identity
+def_fn_48 \w, \h, dct, adst
+def_fn_48 \w, \h, dct, flipadst
+def_fn_48 \w, \h, dct, identity
+def_fn_48 \w, \h, adst, dct
+def_fn_48 \w, \h, adst, adst
+def_fn_48 \w, \h, adst, flipadst
+def_fn_48 \w, \h, flipadst, dct
+def_fn_48 \w, \h, flipadst, adst
+def_fn_48 \w, \h, flipadst, flipadst
+def_fn_48 \w, \h, identity, dct
+def_fn_48 \w, \h, adst, identity
+def_fn_48 \w, \h, flipadst, identity
+def_fn_48 \w, \h, identity, adst
+def_fn_48 \w, \h, identity, flipadst
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+
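+// 16-point inverse DCT: the even half is an idct_8 on the even-numbered
+// registers, the odd half uses the idct16 coefficients loaded into v1.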
+.macro idct_16 sz, szb
+ idct_8 v16, v18, v20, v22, v24, v26, v28, v30, \sz, \szb
+
+ smull_smlsl v2, v3, v17, v31, v1.h[0], v1.h[1], \sz // -> t8a
+ smull_smlal v4, v5, v17, v31, v1.h[1], v1.h[0], \sz // -> t15a
+ smull_smlsl v6, v7, v25, v23, v1.h[2], v1.h[3], \sz // -> t9a
+ sqrshrn_sz v17, v2, v3, #12, \sz // t8a
+ sqrshrn_sz v31, v4, v5, #12, \sz // t15a
+ smull_smlal v2, v3, v25, v23, v1.h[3], v1.h[2], \sz // -> t14a
+ smull_smlsl v4, v5, v21, v27, v1.h[4], v1.h[5], \sz // -> t10a
+ sqrshrn_sz v23, v6, v7, #12, \sz // t9a
+ sqrshrn_sz v25, v2, v3, #12, \sz // t14a
+ smull_smlal v6, v7, v21, v27, v1.h[5], v1.h[4], \sz // -> t13a
+ smull_smlsl v2, v3, v29, v19, v1.h[6], v1.h[7], \sz // -> t11a
+ sqrshrn_sz v21, v4, v5, #12, \sz // t10a
+ sqrshrn_sz v27, v6, v7, #12, \sz // t13a
+ smull_smlal v4, v5, v29, v19, v1.h[7], v1.h[6], \sz // -> t12a
+ sqrshrn_sz v19, v2, v3, #12, \sz // t11a
+ sqrshrn_sz v29, v4, v5, #12, \sz // t12a
+
+ sqsub v2\sz, v17\sz, v23\sz // t9
+ sqadd v17\sz, v17\sz, v23\sz // t8
+ sqsub v3\sz, v31\sz, v25\sz // t14
+ sqadd v31\sz, v31\sz, v25\sz // t15
+ sqsub v23\sz, v19\sz, v21\sz // t10
+ sqadd v19\sz, v19\sz, v21\sz // t11
+ sqadd v25\sz, v29\sz, v27\sz // t12
+ sqsub v29\sz, v29\sz, v27\sz // t13
+
+ smull_smlsl v4, v5, v3, v2, v0.h[2], v0.h[3], \sz // -> t9a
+ smull_smlal v6, v7, v3, v2, v0.h[3], v0.h[2], \sz // -> t14a
+ sqrshrn_sz v21, v4, v5, #12, \sz // t9a
+ sqrshrn_sz v27, v6, v7, #12, \sz // t14a
+
+ smull_smlsl v4, v5, v29, v23, v0.h[2], v0.h[3], \sz // -> t13a
+ smull_smlal v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
+ sqrshrn_sz v29, v4, v5, #12, \sz // t13a
+ neg v6.4s, v6.4s
+.ifc \sz, .8h
+ neg v7.4s, v7.4s
+.endif
+ sqrshrn_sz v23, v6, v7, #12, \sz // t10a
+
+ sqsub v2\sz, v17\sz, v19\sz // t11a
+ sqadd v17\sz, v17\sz, v19\sz // t8a
+ sqsub v3\sz, v31\sz, v25\sz // t12a
+ sqadd v31\sz, v31\sz, v25\sz // t15a
+ sqadd v19\sz, v21\sz, v23\sz // t9
+ sqsub v21\sz, v21\sz, v23\sz // t10
+ sqsub v25\sz, v27\sz, v29\sz // t13
+ sqadd v27\sz, v27\sz, v29\sz // t14
+
+ smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], \sz // -> t11
+ smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], \sz // -> t12
+ smull_smlsl v2, v3, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
+
+ sqrshrn_sz v4, v4, v5, #12, \sz // t11
+ sqrshrn_sz v5, v6, v7, #12, \sz // t12
+ smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t13a
+ sqrshrn_sz v2, v2, v3, #12, \sz // t10a
+ sqrshrn_sz v3, v6, v7, #12, \sz // t13a
+
+ sqadd v6\sz, v16\sz, v31\sz // out0
+ sqsub v31\sz, v16\sz, v31\sz // out15
+ mov v16\szb, v6\szb
+ sqadd v23\sz, v30\sz, v17\sz // out7
+ sqsub v7\sz, v30\sz, v17\sz // out8
+ sqadd v17\sz, v18\sz, v27\sz // out1
+ sqsub v30\sz, v18\sz, v27\sz // out14
+ sqadd v18\sz, v20\sz, v3\sz // out2
+ sqsub v29\sz, v20\sz, v3\sz // out13
+ sqadd v3\sz, v28\sz, v19\sz // out6
+ sqsub v25\sz, v28\sz, v19\sz // out9
+ sqadd v19\sz, v22\sz, v5\sz // out3
+ sqsub v28\sz, v22\sz, v5\sz // out12
+ sqadd v20\sz, v24\sz, v4\sz // out4
+ sqsub v27\sz, v24\sz, v4\sz // out11
+ sqadd v21\sz, v26\sz, v2\sz // out5
+ sqsub v26\sz, v26\sz, v2\sz // out10
+ mov v24\szb, v7\szb
+ mov v22\szb, v3\szb
+.endm
+
+function inv_dct_8h_x16_neon, export=1
+ movrel x16, idct_coeffs
+ ld1 {v0.8h, v1.8h}, [x16]
+ idct_16 .8h, .16b
+ ret
+endfunc
+
+function inv_dct_4h_x16_neon, export=1
+ movrel x16, idct_coeffs
+ ld1 {v0.8h, v1.8h}, [x16]
+ idct_16 .4h, .8b
+ ret
+endfunc
+
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15, sz, szb
+ movrel x16, iadst16_coeffs
+ ld1 {v0.8h, v1.8h}, [x16]
+ movrel x16, idct_coeffs
+
+ smull_smlal v2, v3, v31, v16, v0.h[0], v0.h[1], \sz // -> t0
+ smull_smlsl v4, v5, v31, v16, v0.h[1], v0.h[0], \sz // -> t1
+ smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t2
+ sqrshrn_sz v16, v2, v3, #12, \sz // t0
+ sqrshrn_sz v31, v4, v5, #12, \sz // t1
+ smull_smlsl v2, v3, v29, v18, v0.h[3], v0.h[2], \sz // -> t3
+ smull_smlal v4, v5, v27, v20, v0.h[4], v0.h[5], \sz // -> t4
+ sqrshrn_sz v18, v6, v7, #12, \sz // t2
+ sqrshrn_sz v29, v2, v3, #12, \sz // t3
+ smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t5
+ smull_smlal v2, v3, v25, v22, v0.h[6], v0.h[7], \sz // -> t6
+ sqrshrn_sz v20, v4, v5, #12, \sz // t4
+ sqrshrn_sz v27, v6, v7, #12, \sz // t5
+ smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t7
+ smull_smlal v6, v7, v23, v24, v1.h[0], v1.h[1], \sz // -> t8
+ sqrshrn_sz v22, v2, v3, #12, \sz // t6
+ sqrshrn_sz v25, v4, v5, #12, \sz // t7
+ smull_smlsl v2, v3, v23, v24, v1.h[1], v1.h[0], \sz // -> t9
+ smull_smlal v4, v5, v21, v26, v1.h[2], v1.h[3], \sz // -> t10
+ sqrshrn_sz v23, v6, v7, #12, \sz // t8
+ sqrshrn_sz v24, v2, v3, #12, \sz // t9
+ smull_smlsl v6, v7, v21, v26, v1.h[3], v1.h[2], \sz // -> t11
+ smull_smlal v2, v3, v19, v28, v1.h[4], v1.h[5], \sz // -> t12
+ sqrshrn_sz v21, v4, v5, #12, \sz // t10
+ sqrshrn_sz v26, v6, v7, #12, \sz // t11
+ smull_smlsl v4, v5, v19, v28, v1.h[5], v1.h[4], \sz // -> t13
+ smull_smlal v6, v7, v17, v30, v1.h[6], v1.h[7], \sz // -> t14
+ sqrshrn_sz v19, v2, v3, #12, \sz // t12
+ sqrshrn_sz v28, v4, v5, #12, \sz // t13
+ smull_smlsl v2, v3, v17, v30, v1.h[7], v1.h[6], \sz // -> t15
+ sqrshrn_sz v17, v6, v7, #12, \sz // t14
+ sqrshrn_sz v30, v2, v3, #12, \sz // t15
+
+ ld1 {v0.8h}, [x16]
+
+ sqsub v2\sz, v16\sz, v23\sz // t8a
+ sqadd v16\sz, v16\sz, v23\sz // t0a
+ sqsub v3\sz, v31\sz, v24\sz // t9a
+ sqadd v31\sz, v31\sz, v24\sz // t1a
+ sqadd v23\sz, v18\sz, v21\sz // t2a
+ sqsub v18\sz, v18\sz, v21\sz // t10a
+ sqadd v24\sz, v29\sz, v26\sz // t3a
+ sqsub v29\sz, v29\sz, v26\sz // t11a
+ sqadd v21\sz, v20\sz, v19\sz // t4a
+ sqsub v20\sz, v20\sz, v19\sz // t12a
+ sqadd v26\sz, v27\sz, v28\sz // t5a
+ sqsub v27\sz, v27\sz, v28\sz // t13a
+ sqadd v19\sz, v22\sz, v17\sz // t6a
+ sqsub v22\sz, v22\sz, v17\sz // t14a
+ sqadd v28\sz, v25\sz, v30\sz // t7a
+ sqsub v25\sz, v25\sz, v30\sz // t15a
+
+ smull_smlal v4, v5, v2, v3, v0.h[5], v0.h[4], \sz // -> t8
+ smull_smlsl v6, v7, v2, v3, v0.h[4], v0.h[5], \sz // -> t9
+ smull_smlal v2, v3, v18, v29, v0.h[7], v0.h[6], \sz // -> t10
+ sqrshrn_sz v17, v4, v5, #12, \sz // t8
+ sqrshrn_sz v30, v6, v7, #12, \sz // t9
+ smull_smlsl v4, v5, v18, v29, v0.h[6], v0.h[7], \sz // -> t11
+ smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t12
+ sqrshrn_sz v18, v2, v3, #12, \sz // t10
+ sqrshrn_sz v29, v4, v5, #12, \sz // t11
+ smull_smlal v2, v3, v27, v20, v0.h[4], v0.h[5], \sz // -> t13
+ smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t14
+ sqrshrn_sz v27, v6, v7, #12, \sz // t12
+ sqrshrn_sz v20, v2, v3, #12, \sz // t13
+ smull_smlal v6, v7, v25, v22, v0.h[6], v0.h[7], \sz // -> t15
+ sqrshrn_sz v25, v4, v5, #12, \sz // t14
+ sqrshrn_sz v22, v6, v7, #12, \sz // t15
+
+ sqsub v2\sz, v16\sz, v21\sz // t4
+ sqadd v16\sz, v16\sz, v21\sz // t0
+ sqsub v3\sz, v31\sz, v26\sz // t5
+ sqadd v31\sz, v31\sz, v26\sz // t1
+ sqadd v21\sz, v23\sz, v19\sz // t2
+ sqsub v23\sz, v23\sz, v19\sz // t6
+ sqadd v26\sz, v24\sz, v28\sz // t3
+ sqsub v24\sz, v24\sz, v28\sz // t7
+ sqadd v19\sz, v17\sz, v27\sz // t8a
+ sqsub v17\sz, v17\sz, v27\sz // t12a
+ sqadd v28\sz, v30\sz, v20\sz // t9a
+ sqsub v30\sz, v30\sz, v20\sz // t13a
+ sqadd v27\sz, v18\sz, v25\sz // t10a
+ sqsub v18\sz, v18\sz, v25\sz // t14a
+ sqadd v20\sz, v29\sz, v22\sz // t11a
+ sqsub v29\sz, v29\sz, v22\sz // t15a
+
+ smull_smlal v4, v5, v2, v3, v0.h[3], v0.h[2], \sz // -> t4a
+ smull_smlsl v6, v7, v2, v3, v0.h[2], v0.h[3], \sz // -> t5a
+ smull_smlsl v2, v3, v24, v23, v0.h[3], v0.h[2], \sz // -> t6a
+ sqrshrn_sz v22, v4, v5, #12, \sz // t4a
+ sqrshrn_sz v25, v6, v7, #12, \sz // t5a
+ smull_smlal v4, v5, v24, v23, v0.h[2], v0.h[3], \sz // -> t7a
+ smull_smlal v6, v7, v17, v30, v0.h[3], v0.h[2], \sz // -> t12
+ sqrshrn_sz v24, v2, v3, #12, \sz // t6a
+ sqrshrn_sz v23, v4, v5, #12, \sz // t7a
+ smull_smlsl v2, v3, v17, v30, v0.h[2], v0.h[3], \sz // -> t13
+ smull_smlsl v4, v5, v29, v18, v0.h[3], v0.h[2], \sz // -> t14
+ sqrshrn_sz v17, v6, v7, #12, \sz // t12
+ smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t15
+ sqrshrn_sz v29, v2, v3, #12, \sz // t13
+ sqrshrn_sz v30, v4, v5, #12, \sz // t14
+ sqrshrn_sz v18, v6, v7, #12, \sz // t15
+
+ sqsub v2\sz, v16\sz, v21\sz // t2a
+.ifc \o0, v16
+ sqadd \o0\sz, v16\sz, v21\sz // out0
+ sqsub v21\sz, v31\sz, v26\sz // t3a
+ sqadd \o15\sz, v31\sz, v26\sz // out15
+.else
+ sqadd v4\sz, v16\sz, v21\sz // out0
+ sqsub v21\sz, v31\sz, v26\sz // t3a
+ sqadd \o15\sz, v31\sz, v26\sz // out15
+ mov \o0\szb, v4\szb
+.endif
+ sqneg \o15\sz, \o15\sz // out15
+
+ sqsub v3\sz, v29\sz, v18\sz // t15a
+ sqadd \o13\sz, v29\sz, v18\sz // out13
+ sqadd \o2\sz, v17\sz, v30\sz // out2
+ sqsub v26\sz, v17\sz, v30\sz // t14a
+ sqneg \o13\sz, \o13\sz // out13
+
+ sqadd \o1\sz, v19\sz, v27\sz // out1
+ sqsub v27\sz, v19\sz, v27\sz // t10
+ sqadd \o14\sz, v28\sz, v20\sz // out14
+ sqsub v20\sz, v28\sz, v20\sz // t11
+ sqneg \o1\sz, \o1\sz // out1
+
+ sqadd \o3\sz, v22\sz, v24\sz // out3
+ sqsub v22\sz, v22\sz, v24\sz // t6
+ sqadd \o12\sz, v25\sz, v23\sz // out12
+ sqsub v23\sz, v25\sz, v23\sz // t7
+ sqneg \o3\sz, \o3\sz // out3
+
+ smull_smlsl v24, v25, v2, v21, v0.h[0], v0.h[0], \sz // -> out8 (v24 or v23)
+ smull_smlal v4, v5, v2, v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24)
+ smull_smlal v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26)
+
+ sqrshrn_sz v24, v24, v25, #12, \sz // out8
+ sqrshrn_sz v4, v4, v5, #12, \sz // out7
+ sqrshrn_sz v5, v6, v7, #12, \sz // out5
+ smull_smlsl v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21)
+ smull_smlal v2, v3, v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27)
+ sqrshrn_sz v26, v6, v7, #12, \sz // out10
+
+ smull_smlsl v6, v7, v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20)
+ smull_smlal v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25)
+ smull_smlsl v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22)
+
+ sqrshrn_sz \o4, v2, v3, #12, \sz // out4
+ sqrshrn_sz v6, v6, v7, #12, \sz // out11
+ sqrshrn_sz v7, v21, v25, #12, \sz // out9
+ sqrshrn_sz \o6, v22, v23, #12, \sz // out6
+
+.ifc \o8, v23
+ mov \o8\szb, v24\szb
+ mov \o10\szb, v26\szb
+.endif
+
+ sqneg \o7\sz, v4\sz // out7
+ sqneg \o5\sz, v5\sz // out5
+ sqneg \o11\sz, v6\sz // out11
+ sqneg \o9\sz, v7\sz // out9
+.endm
+
+function inv_adst_8h_x16_neon, export=1
+ iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b
+ ret
+endfunc
+
+function inv_flipadst_8h_x16_neon, export=1
+ iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b
+ ret
+endfunc
+
+function inv_adst_4h_x16_neon, export=1
+ iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b
+ ret
+endfunc
+
+function inv_flipadst_4h_x16_neon, export=1
+ iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b
+ ret
+endfunc
+
+function inv_identity_8h_x16_neon, export=1
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ sqrdmulh v2.8h, v\i\().8h, v0.h[0]
+ sqadd v\i\().8h, v\i\().8h, v\i\().8h
+ sqadd v\i\().8h, v\i\().8h, v2.8h
+.endr
+ ret
+endfunc
+
+function inv_identity_4h_x16_neon, export=1
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ sqrdmulh v2.4h, v\i\().4h, v0.h[0]
+ sqadd v\i\().4h, v\i\().4h, v\i\().4h
+ sqadd v\i\().4h, v\i\().4h, v2.4h
+.endr
+ ret
+endfunc
+
+.macro identity_8x16_shift2 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ sqrdmulh v2.8h, \i, \c
+ sshr v2.8h, v2.8h, #1
+ srhadd \i, \i, v2.8h
+.endr
+.endm
+
+.macro identity_8x16_shift1 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ sqrdmulh v2.8h, \i, \c
+ srshr v2.8h, v2.8h, #1
+ sqadd \i, \i, v2.8h
+.endr
+.endm
+
+.macro identity_8x8_shift1 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ sqrdmulh v2.8h, \i, \c
+ srshr v2.8h, v2.8h, #1
+ sqadd \i, \i, v2.8h
+.endr
+.endm
+
+.macro identity_8x8 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ sqrdmulh v2.8h, \i, \c
+ sqadd \i, \i, \i
+ sqadd \i, \i, v2.8h
+.endr
+.endm
+
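+// Horizontal pass over a 16x8 block of coefficients: load them (zeroing the
+// source buffer as it goes), optionally scale them or apply the identity
+// transform, downshift, transpose and store to the intermediate buffer at x6.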
+.macro def_horz_16 scale=0, identity=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x8_neon
+ AARCH64_VALID_CALL_TARGET
+ mov x14, x30
+ movi v7.8h, #0
+.if \identity
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+.elseif \scale
+ mov w16, #2896*8
+ dup v0.4h, w16
+.endif
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x7]
+ st1 {v7.8h}, [x7], x8
+.endr
+.if \scale
+ scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+.if \identity
+ identity_8x16_shift2 v0.h[0]
+.else
+ blr x4
+.endif
+.if \shift > 0
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ srshr \i, \i, #\shift
+.endr
+.endif
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+ transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
+
+.irp i, v16.8h, v24.8h, v17.8h, v25.8h, v18.8h, v26.8h, v19.8h, v27.8h, v20.8h, v28.8h, v21.8h, v29.8h, v22.8h, v30.8h, v23.8h, v31.8h
+ st1 {\i}, [x6], #16
+.endr
+
+ ret x14
+endfunc
+.endm
+
+def_horz_16 scale=0, identity=0, shift=2
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=1, shift=0, suffix=_identity
+
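+// Vertical pass: load 16 rows of 8 coefficients from the intermediate buffer,
+// run the column transform in x5 and add the result to the destination at x6.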
+function inv_txfm_add_vert_8x16_neon
+ mov x14, x30
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ blr x5
+ load_add_store_8x16 x6, x7
+ ret x14
+endfunc
+
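+// Full 16x16 inverse transform: the horizontal pass (x9) is run on both halves
+// of the coefficients, but the second half is skipped and zero-filled when eob
+// (w3) is below the threshold in w13; two vertical 8x16 passes then add the
+// result to the destination.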
+function inv_txfm_add_16x16_neon
+ mov x15, x30
+ sub sp, sp, #512
+.irp i, 0, 8
+ add x6, sp, #(\i*16*2)
+.if \i == 8
+ cmp w3, w13
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #16*2
+ blr x9
+.endr
+ b 2f
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+2:
+.irp i, 0, 8
+ add x6, x0, #(\i)
+ add x7, sp, #(\i*2)
+ mov x8, #32
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
+.macro def_fn_16x16 txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 16, 16, 2
+.endif
+.ifc \txfm1, identity
+ adr x9, inv_txfm_horz_identity_16x8_neon
+.else
+ adr x9, inv_txfm_horz_16x8_neon
+ adr x4, inv_\txfm1\()_8h_x16_neon
+.endif
+ adr x5, inv_\txfm2\()_8h_x16_neon
+ mov x13, #\eob_half
+ b inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct, 36
+def_fn_16x16 identity, identity, 36
+def_fn_16x16 dct, adst, 36
+def_fn_16x16 dct, flipadst, 36
+def_fn_16x16 dct, identity, 8
+def_fn_16x16 adst, dct, 36
+def_fn_16x16 adst, adst, 36
+def_fn_16x16 adst, flipadst, 36
+def_fn_16x16 flipadst, dct, 36
+def_fn_16x16 flipadst, adst, 36
+def_fn_16x16 flipadst, flipadst, 36
+def_fn_16x16 identity, dct, 8
+
+.macro def_fn_416_base variant
+function inv_txfm_\variant\()add_16x4_neon
+ mov x15, x30
+ movi v4.8h, #0
+
+.ifc \variant, identity_
+.irp i, v16.4h, v17.4h, v18.4h, v19.4h
+ ld1 {\i}, [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+.irp i, v16.d, v17.d, v18.d, v19.d
+ ld1 {\i}[1], [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+.irp i, v20.4h, v21.4h, v22.4h, v23.4h
+ ld1 {\i}, [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+.irp i, v20.d, v21.d, v22.d, v23.d
+ ld1 {\i}[1], [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+
+ identity_8x16_shift1 v0.h[0]
+.else
+.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
+ ld1 {\i}, [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+
+ blr x4
+
+ ins v16.d[1], v20.d[0]
+ ins v17.d[1], v21.d[0]
+ ins v18.d[1], v22.d[0]
+ ins v19.d[1], v23.d[0]
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ srshr \i, \i, #1
+.endr
+.endif
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+ blr x5
+ mov x6, x0
+ load_add_store_8x4 x6, x7
+
+.ifc \variant, identity_
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ mov v18.16b, v22.16b
+ mov v19.16b, v23.16b
+.else
+ ins v24.d[1], v28.d[0]
+ ins v25.d[1], v29.d[0]
+ ins v26.d[1], v30.d[0]
+ ins v27.d[1], v31.d[0]
+ srshr v16.8h, v24.8h, #1
+ srshr v17.8h, v25.8h, #1
+ srshr v18.8h, v26.8h, #1
+ srshr v19.8h, v27.8h, #1
+.endif
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+ blr x5
+ add x6, x0, #8
+ load_add_store_8x4 x6, x7
+
+ ret x15
+endfunc
+
+function inv_txfm_\variant\()add_4x16_neon
+ mov x15, x30
+ movi v2.8h, #0
+
+ mov x11, #32
+ cmp w3, w13
+ b.lt 1f
+
+ add x6, x2, #16
+.ifc \variant, identity_
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+ ld1 {\i}, [x6]
+ st1 {v2.8h}, [x6], x11
+.endr
+ mov w16, #(5793-4096)*8
+ dup v0.4h, w16
+ identity_8x4_shift1 v24, v25, v26, v27, v0.h[0]
+.else
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ ld1 {\i}, [x6]
+ st1 {v2.8h}, [x6], x11
+.endr
+ blr x4
+ srshr v24.8h, v16.8h, #1
+ srshr v25.8h, v17.8h, #1
+ srshr v26.8h, v18.8h, #1
+ srshr v27.8h, v19.8h, #1
+.endif
+ transpose_4x8h v24, v25, v26, v27, v4, v5, v6, v7
+ ins v28.d[0], v24.d[1]
+ ins v29.d[0], v25.d[1]
+ ins v30.d[0], v26.d[1]
+ ins v31.d[0], v27.d[1]
+
+ b 2f
+1:
+.irp i, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
+ movi \i, #0
+.endr
+2:
+ movi v2.8h, #0
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ ld1 {\i}, [x2]
+ st1 {v2.8h}, [x2], x11
+.endr
+.ifc \variant, identity_
+ mov w16, #(5793-4096)*8
+ dup v0.4h, w16
+ identity_8x4_shift1 v16, v17, v18, v19, v0.h[0]
+.else
+ blr x4
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ srshr \i, \i, #1
+.endr
+.endif
+ transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
+ ins v20.d[0], v16.d[1]
+ ins v21.d[0], v17.d[1]
+ ins v22.d[0], v18.d[1]
+ ins v23.d[0], v19.d[1]
+
+ blr x5
+
+ load_add_store_4x16 x0, x6
+
+ ret x15
+endfunc
+.endm
+
+def_fn_416_base
+def_fn_416_base identity_
+
+.macro def_fn_416 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+.if \w == 4
+ adr x4, inv_\txfm1\()_8h_x\w\()_neon
+ adr x5, inv_\txfm2\()_4h_x\h\()_neon
+ mov w13, #\eob_half
+.else
+ adr x4, inv_\txfm1\()_4h_x\w\()_neon
+ adr x5, inv_\txfm2\()_8h_x\h\()_neon
+.endif
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+ b inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct, 29
+def_fn_416 \w, \h, identity, identity, 29
+def_fn_416 \w, \h, dct, adst, 29
+def_fn_416 \w, \h, dct, flipadst, 29
+def_fn_416 \w, \h, dct, identity, 8
+def_fn_416 \w, \h, adst, dct, 29
+def_fn_416 \w, \h, adst, adst, 29
+def_fn_416 \w, \h, adst, flipadst, 29
+def_fn_416 \w, \h, flipadst, dct, 29
+def_fn_416 \w, \h, flipadst, adst, 29
+def_fn_416 \w, \h, flipadst, flipadst, 29
+def_fn_416 \w, \h, identity, dct, 32
+def_fn_416 \w, \h, adst, identity, 8
+def_fn_416 \w, \h, flipadst, identity, 8
+def_fn_416 \w, \h, identity, adst, 32
+def_fn_416 \w, \h, identity, flipadst, 32
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+
+.macro def_fn_816_base variant
+function inv_txfm_\variant\()add_16x8_neon
+ mov x15, x30
+ movi v4.8h, #0
+ mov w16, #2896*8
+ dup v0.4h, w16
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x2]
+ st1 {v4.8h}, [x2], #16
+.endr
+
+ scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+.ifc \variant, identity_
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+ identity_8x16_shift1 v0.h[0]
+.else
+ blr x4
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ srshr \i, \i, #1
+.endr
+.endif
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+
+ blr x5
+
+ mov x6, x0
+ load_add_store_8x8 x6, x7
+
+.ifc \variant, identity_
+ mov v16.16b, v24.16b
+ mov v17.16b, v25.16b
+ mov v18.16b, v26.16b
+ mov v19.16b, v27.16b
+ mov v20.16b, v28.16b
+ mov v21.16b, v29.16b
+ mov v22.16b, v30.16b
+ mov v23.16b, v31.16b
+.else
+ srshr v16.8h, v24.8h, #1
+ srshr v17.8h, v25.8h, #1
+ srshr v18.8h, v26.8h, #1
+ srshr v19.8h, v27.8h, #1
+ srshr v20.8h, v28.8h, #1
+ srshr v21.8h, v29.8h, #1
+ srshr v22.8h, v30.8h, #1
+ srshr v23.8h, v31.8h, #1
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+
+ blr x5
+
+ add x0, x0, #8
+ load_add_store_8x8 x0, x7
+
+ ret x15
+endfunc
+
+function inv_txfm_\variant\()add_8x16_neon
+ mov x15, x30
+ movi v4.8h, #0
+ mov w16, #2896*8
+ dup v0.4h, w16
+ mov x11, #32
+
+ cmp w3, w13
+ b.lt 1f
+
+ add x6, x2, #16
+.ifc \variant, identity_
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x6]
+ st1 {v4.8h}, [x6], x11
+.endr
+ scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+ // The identity shl #1 and downshift srshr #1 cancel out
+.else
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ ld1 {\i}, [x6]
+ st1 {v4.8h}, [x6], x11
+.endr
+ scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ srshr v24.8h, v16.8h, #1
+ srshr v25.8h, v17.8h, #1
+ srshr v26.8h, v18.8h, #1
+ srshr v27.8h, v19.8h, #1
+ srshr v28.8h, v20.8h, #1
+ srshr v29.8h, v21.8h, #1
+ srshr v30.8h, v22.8h, #1
+ srshr v31.8h, v23.8h, #1
+.endif
+ transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+ b 2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ movi \i, #0
+.endr
+
+2:
+ movi v4.8h, #0
+ mov w16, #2896*8
+ dup v0.4h, w16
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ ld1 {\i}, [x2]
+ st1 {v4.8h}, [x2], x11
+.endr
+ scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+.ifc \variant, identity_
+ // The identity shl #1 and downshift srshr #1 cancel out
+.else
+ blr x4
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ srshr \i, \i, #1
+.endr
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+
+ blr x5
+
+ load_add_store_8x16 x0, x6
+
+ ret x15
+endfunc
+.endm
+
+def_fn_816_base
+def_fn_816_base identity_
+
+.macro def_fn_816 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ adr x4, inv_\txfm1\()_8h_x\w\()_neon
+ adr x5, inv_\txfm2\()_8h_x\h\()_neon
+.if \w == 8
+ mov x13, #\eob_half
+.endif
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+ b inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct, 43
+def_fn_816 \w, \h, identity, identity, 43
+def_fn_816 \w, \h, dct, adst, 43
+def_fn_816 \w, \h, dct, flipadst, 43
+def_fn_816 \w, \h, dct, identity, 8
+def_fn_816 \w, \h, adst, dct, 43
+def_fn_816 \w, \h, adst, adst, 43
+def_fn_816 \w, \h, adst, flipadst, 43
+def_fn_816 \w, \h, flipadst, dct, 43
+def_fn_816 \w, \h, flipadst, adst, 43
+def_fn_816 \w, \h, flipadst, flipadst, 43
+def_fn_816 \w, \h, identity, dct, 64
+def_fn_816 \w, \h, adst, identity, 8
+def_fn_816 \w, \h, flipadst, identity, 8
+def_fn_816 \w, \h, identity, adst, 64
+def_fn_816 \w, \h, identity, flipadst, 64
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
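+// Computes the odd half of a 32-point inverse DCT from the 16 odd-indexed
+// input coefficients in v16-v31. The callers form the final outputs by
+// adding/subtracting these against the even half (inv_dct_8h_x16_neon),
+// pairing even output i with register v(31-i).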
+function inv_dct32_odd_8h_x16_neon, export=1
+ movrel x16, idct_coeffs, 2*16
+ ld1 {v0.8h, v1.8h}, [x16]
+ sub x16, x16, #2*16
+
+ smull_smlsl v2, v3, v16, v31, v0.h[0], v0.h[1], .8h // -> t16a
+ smull_smlal v4, v5, v16, v31, v0.h[1], v0.h[0], .8h // -> t31a
+ smull_smlsl v6, v7, v24, v23, v0.h[2], v0.h[3], .8h // -> t17a
+ sqrshrn_sz v16, v2, v3, #12, .8h // t16a
+ sqrshrn_sz v31, v4, v5, #12, .8h // t31a
+ smull_smlal v2, v3, v24, v23, v0.h[3], v0.h[2], .8h // -> t30a
+ smull_smlsl v4, v5, v20, v27, v0.h[4], v0.h[5], .8h // -> t18a
+ sqrshrn_sz v24, v6, v7, #12, .8h // t17a
+ sqrshrn_sz v23, v2, v3, #12, .8h // t30a
+ smull_smlal v6, v7, v20, v27, v0.h[5], v0.h[4], .8h // -> t29a
+ smull_smlsl v2, v3, v28, v19, v0.h[6], v0.h[7], .8h // -> t19a
+ sqrshrn_sz v20, v4, v5, #12, .8h // t18a
+ sqrshrn_sz v27, v6, v7, #12, .8h // t29a
+ smull_smlal v4, v5, v28, v19, v0.h[7], v0.h[6], .8h // -> t28a
+ smull_smlsl v6, v7, v18, v29, v1.h[0], v1.h[1], .8h // -> t20a
+ sqrshrn_sz v28, v2, v3, #12, .8h // t19a
+ sqrshrn_sz v19, v4, v5, #12, .8h // t28a
+ smull_smlal v2, v3, v18, v29, v1.h[1], v1.h[0], .8h // -> t27a
+ smull_smlsl v4, v5, v26, v21, v1.h[2], v1.h[3], .8h // -> t21a
+ sqrshrn_sz v18, v6, v7, #12, .8h // t20a
+ sqrshrn_sz v29, v2, v3, #12, .8h // t27a
+ smull_smlal v6, v7, v26, v21, v1.h[3], v1.h[2], .8h // -> t26a
+ smull_smlsl v2, v3, v22, v25, v1.h[4], v1.h[5], .8h // -> t22a
+ sqrshrn_sz v26, v4, v5, #12, .8h // t21a
+ sqrshrn_sz v21, v6, v7, #12, .8h // t26a
+ smull_smlal v4, v5, v22, v25, v1.h[5], v1.h[4], .8h // -> t25a
+ smull_smlsl v6, v7, v30, v17, v1.h[6], v1.h[7], .8h // -> t23a
+ sqrshrn_sz v22, v2, v3, #12, .8h // t22a
+ sqrshrn_sz v25, v4, v5, #12, .8h // t25a
+ smull_smlal v2, v3, v30, v17, v1.h[7], v1.h[6], .8h // -> t24a
+ sqrshrn_sz v30, v6, v7, #12, .8h // t23a
+ sqrshrn_sz v17, v2, v3, #12, .8h // t24a
+
+ ld1 {v0.8h}, [x16]
+
+ sqsub v2.8h, v16.8h, v24.8h // t17
+ sqadd v16.8h, v16.8h, v24.8h // t16
+ sqsub v3.8h, v31.8h, v23.8h // t30
+ sqadd v31.8h, v31.8h, v23.8h // t31
+ sqsub v24.8h, v28.8h, v20.8h // t18
+ sqadd v28.8h, v28.8h, v20.8h // t19
+ sqadd v23.8h, v18.8h, v26.8h // t20
+ sqsub v18.8h, v18.8h, v26.8h // t21
+ sqsub v20.8h, v30.8h, v22.8h // t22
+ sqadd v30.8h, v30.8h, v22.8h // t23
+ sqadd v26.8h, v17.8h, v25.8h // t24
+ sqsub v17.8h, v17.8h, v25.8h // t25
+ sqsub v22.8h, v29.8h, v21.8h // t26
+ sqadd v29.8h, v29.8h, v21.8h // t27
+ sqadd v25.8h, v19.8h, v27.8h // t28
+ sqsub v19.8h, v19.8h, v27.8h // t29
+
+ smull_smlsl v4, v5, v3, v2, v0.h[4], v0.h[5], .8h // -> t17a
+ smull_smlal v6, v7, v3, v2, v0.h[5], v0.h[4], .8h // -> t30a
+ smull_smlal v2, v3, v19, v24, v0.h[5], v0.h[4], .8h // -> t18a
+ sqrshrn_sz v21, v4, v5, #12, .8h // t17a
+ sqrshrn_sz v27, v6, v7, #12, .8h // t30a
+ neg v2.4s, v2.4s // -> t18a
+ neg v3.4s, v3.4s // -> t18a
+ smull_smlsl v4, v5, v19, v24, v0.h[4], v0.h[5], .8h // -> t29a
+ smull_smlsl v6, v7, v22, v18, v0.h[6], v0.h[7], .8h // -> t21a
+ sqrshrn_sz v19, v2, v3, #12, .8h // t18a
+ sqrshrn_sz v24, v4, v5, #12, .8h // t29a
+ smull_smlal v2, v3, v22, v18, v0.h[7], v0.h[6], .8h // -> t26a
+ smull_smlal v4, v5, v17, v20, v0.h[7], v0.h[6], .8h // -> t22a
+ sqrshrn_sz v22, v6, v7, #12, .8h // t21a
+ sqrshrn_sz v18, v2, v3, #12, .8h // t26a
+ neg v4.4s, v4.4s // -> t22a
+ neg v5.4s, v5.4s // -> t22a
+ smull_smlsl v6, v7, v17, v20, v0.h[6], v0.h[7], .8h // -> t25a
+ sqrshrn_sz v17, v4, v5, #12, .8h // t22a
+ sqrshrn_sz v20, v6, v7, #12, .8h // t25a
+
+ sqsub v2.8h, v27.8h, v24.8h // t29
+ sqadd v27.8h, v27.8h, v24.8h // t30
+ sqsub v3.8h, v21.8h, v19.8h // t18
+ sqadd v21.8h, v21.8h, v19.8h // t17
+ sqsub v24.8h, v16.8h, v28.8h // t19a
+ sqadd v16.8h, v16.8h, v28.8h // t16a
+ sqsub v19.8h, v30.8h, v23.8h // t20a
+ sqadd v30.8h, v30.8h, v23.8h // t23a
+ sqsub v28.8h, v17.8h, v22.8h // t21
+ sqadd v17.8h, v17.8h, v22.8h // t22
+ sqadd v23.8h, v26.8h, v29.8h // t24a
+ sqsub v26.8h, v26.8h, v29.8h // t27a
+ sqadd v22.8h, v20.8h, v18.8h // t25
+ sqsub v20.8h, v20.8h, v18.8h // t26
+ sqsub v29.8h, v31.8h, v25.8h // t28a
+ sqadd v31.8h, v31.8h, v25.8h // t31a
+
+ smull_smlsl v4, v5, v2, v3, v0.h[2], v0.h[3], .8h // -> t18a
+ smull_smlal v6, v7, v2, v3, v0.h[3], v0.h[2], .8h // -> t29a
+ smull_smlsl v2, v3, v29, v24, v0.h[2], v0.h[3], .8h // -> t19
+ sqrshrn_sz v18, v4, v5, #12, .8h // t18a
+ sqrshrn_sz v25, v6, v7, #12, .8h // t29a
+ smull_smlal v4, v5, v29, v24, v0.h[3], v0.h[2], .8h // -> t28
+ smull_smlal v6, v7, v26, v19, v0.h[3], v0.h[2], .8h // -> t20
+ sqrshrn_sz v29, v2, v3, #12, .8h // t19
+ sqrshrn_sz v24, v4, v5, #12, .8h // t28
+ neg v6.4s, v6.4s // -> t20
+ neg v7.4s, v7.4s // -> t20
+ smull_smlsl v2, v3, v26, v19, v0.h[2], v0.h[3], .8h // -> t27
+ smull_smlal v4, v5, v20, v28, v0.h[3], v0.h[2], .8h // -> t21a
+ sqrshrn_sz v26, v6, v7, #12, .8h // t20
+ sqrshrn_sz v19, v2, v3, #12, .8h // t27
+ neg v4.4s, v4.4s // -> t21a
+ neg v5.4s, v5.4s // -> t21a
+ smull_smlsl v6, v7, v20, v28, v0.h[2], v0.h[3], .8h // -> t26a
+ sqrshrn_sz v20, v4, v5, #12, .8h // t21a
+ sqrshrn_sz v28, v6, v7, #12, .8h // t26a
+
+ sqsub v2.8h, v16.8h, v30.8h // t23
+ sqadd v16.8h, v16.8h, v30.8h // t16 = out16
+ sqsub v3.8h, v31.8h, v23.8h // t24
+ sqadd v31.8h, v31.8h, v23.8h // t31 = out31
+ sqsub v23.8h, v21.8h, v17.8h // t22a
+ sqadd v17.8h, v21.8h, v17.8h // t17a = out17
+ sqadd v30.8h, v27.8h, v22.8h // t30a = out30
+ sqsub v21.8h, v27.8h, v22.8h // t25a
+ sqsub v27.8h, v18.8h, v20.8h // t21
+ sqadd v18.8h, v18.8h, v20.8h // t18 = out18
+ sqadd v4.8h, v29.8h, v26.8h // t19a = out19
+ sqsub v26.8h, v29.8h, v26.8h // t20a
+ sqadd v29.8h, v25.8h, v28.8h // t29 = out29
+ sqsub v25.8h, v25.8h, v28.8h // t26
+ sqadd v28.8h, v24.8h, v19.8h // t28a = out28
+ sqsub v24.8h, v24.8h, v19.8h // t27a
+ mov v19.16b, v4.16b // out19
+
+ smull_smlsl v4, v5, v24, v26, v0.h[0], v0.h[0], .8h // -> t20
+ smull_smlal v6, v7, v24, v26, v0.h[0], v0.h[0], .8h // -> t27
+ sqrshrn_sz v20, v4, v5, #12, .8h // t20
+ sqrshrn_sz v22, v6, v7, #12, .8h // t27
+
+ smull_smlal v4, v5, v25, v27, v0.h[0], v0.h[0], .8h // -> t26a
+ smull_smlsl v6, v7, v25, v27, v0.h[0], v0.h[0], .8h // -> t21a
+ mov v27.16b, v22.16b // t27
+ sqrshrn_sz v26, v4, v5, #12, .8h // t26a
+
+ smull_smlsl v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22
+ smull_smlal v4, v5, v21, v23, v0.h[0], v0.h[0], .8h // -> t25
+ sqrshrn_sz v21, v6, v7, #12, .8h // t21a
+ sqrshrn_sz v22, v24, v25, #12, .8h // t22
+ sqrshrn_sz v25, v4, v5, #12, .8h // t25
+
+ smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], .8h // -> t23a
+ smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], .8h // -> t24a
+ sqrshrn_sz v23, v4, v5, #12, .8h // t23a
+ sqrshrn_sz v24, v6, v7, #12, .8h // t24a
+
+ ret
+endfunc
+
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x8_neon
+ mov x14, x30
+ movi v7.8h, #0
+ lsl x8, x8, #1
+.if \scale
+ mov w16, #2896*8
+ dup v0.4h, w16
+.endif
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x7]
+ st1 {v7.8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ add x7, x7, x8, lsr #1
+.if \scale
+ scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ bl inv_dct_8h_x16_neon
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+ transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
+
+.macro store1 r0, r1
+ st1 {\r0}, [x6], #16
+ st1 {\r1}, [x6], #16
+ add x6, x6, #32
+.endm
+ store1 v16.8h, v24.8h
+ store1 v17.8h, v25.8h
+ store1 v18.8h, v26.8h
+ store1 v19.8h, v27.8h
+ store1 v20.8h, v28.8h
+ store1 v21.8h, v29.8h
+ store1 v22.8h, v30.8h
+ store1 v23.8h, v31.8h
+.purgem store1
+ sub x6, x6, #64*8
+
+ movi v7.8h, #0
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x7]
+ st1 {v7.8h}, [x7], x8
+.endr
+.if \scale
+ // This relies on the fact that the idct also leaves the right coeff in v0.h[1]
+ scale_input .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ bl inv_dct32_odd_8h_x16_neon
+ transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5
+ transpose_8x8h v23, v22, v21, v20, v19, v18, v17, v16, v4, v5
+.macro store2 r0, r1, shift
+ ld1 {v4.8h, v5.8h}, [x6]
+ sqsub v7.8h, v4.8h, \r0
+ sqsub v6.8h, v5.8h, \r1
+ sqadd v4.8h, v4.8h, \r0
+ sqadd v5.8h, v5.8h, \r1
+ rev64 v6.8h, v6.8h
+ rev64 v7.8h, v7.8h
+ srshr v4.8h, v4.8h, #\shift
+ srshr v5.8h, v5.8h, #\shift
+ srshr v6.8h, v6.8h, #\shift
+ srshr v7.8h, v7.8h, #\shift
+ ext v6.16b, v6.16b, v6.16b, #8
+ st1 {v4.8h, v5.8h}, [x6], #32
+ ext v7.16b, v7.16b, v7.16b, #8
+ st1 {v6.8h, v7.8h}, [x6], #32
+.endm
+
+ store2 v31.8h, v23.8h, \shift
+ store2 v30.8h, v22.8h, \shift
+ store2 v29.8h, v21.8h, \shift
+ store2 v28.8h, v20.8h, \shift
+ store2 v27.8h, v19.8h, \shift
+ store2 v26.8h, v18.8h, \shift
+ store2 v25.8h, v17.8h, \shift
+ store2 v24.8h, v16.8h, \shift
+.purgem store2
+ ret x14
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
+function inv_txfm_add_vert_dct_8x32_neon
+ mov x14, x30
+ lsl x8, x8, #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+
+ bl inv_dct_8h_x16_neon
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ st1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ add x7, x7, x8, lsr #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ sub x7, x7, x8, lsr #1
+ bl inv_dct32_odd_8h_x16_neon
+
+ neg x9, x8
+ mov x10, x6
+.macro combine r0, r1, r2, r3, op, stride
+ ld1 {v5.8h}, [x7], \stride
+ ld1 {v2.8b}, [x10], x1
+ ld1 {v6.8h}, [x7], \stride
+ ld1 {v3.8b}, [x10], x1
+ \op v5.8h, v5.8h, \r0
+ ld1 {v7.8h}, [x7], \stride
+ ld1 {v4.8b}, [x10], x1
+ srshr v5.8h, v5.8h, #4
+ \op v6.8h, v6.8h, \r1
+ uaddw v5.8h, v5.8h, v2.8b
+ srshr v6.8h, v6.8h, #4
+ \op v7.8h, v7.8h, \r2
+ sqxtun v2.8b, v5.8h
+ ld1 {v5.8h}, [x7], \stride
+ uaddw v6.8h, v6.8h, v3.8b
+ srshr v7.8h, v7.8h, #4
+ \op v5.8h, v5.8h, \r3
+ st1 {v2.8b}, [x6], x1
+ ld1 {v2.8b}, [x10], x1
+ sqxtun v3.8b, v6.8h
+ uaddw v7.8h, v7.8h, v4.8b
+ srshr v5.8h, v5.8h, #4
+ st1 {v3.8b}, [x6], x1
+ sqxtun v4.8b, v7.8h
+ uaddw v5.8h, v5.8h, v2.8b
+ st1 {v4.8b}, [x6], x1
+ sqxtun v2.8b, v5.8h
+ st1 {v2.8b}, [x6], x1
+.endm
+ combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
+ combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
+ combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
+ combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
+ sub x7, x7, x8
+ combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
+ combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
+ combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
+ combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
+.purgem combine
+
+ ret x14
+endfunc
+
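+// eob thresholds for the 8-unit strips of the coefficient block: when eob
+// (w3) is below the next entry, the remaining strips are known to be all
+// zero, so the loops below skip or zero-fill them.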
+const eob_32x32
+ .short 36, 136, 300, 1024
+endconst
+
+const eob_16x32
+ .short 36, 151, 279, 512
+endconst
+
+const eob_16x32_shortside
+ .short 36, 512
+endconst
+
+const eob_8x32
+ .short 43, 107, 171, 256
+endconst
+
+function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1
+ movi v0.8h, #0
+ movrel x13, eob_32x32
+
+ mov x8, #2*32
+1:
+ mov w9, #0
+ movrel x12, eob_32x32
+2:
+ add w9, w9, #8
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x2]
+ st1 {v0.8h}, [x2], x8
+.endr
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+ load_add_store_8x8 x0, x7, shiftbits=2
+ ldrh w11, [x12], #2
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #8
+ cmp w3, w11
+ b.ge 2b
+
+ ldrh w11, [x13], #2
+ cmp w3, w11
+ b.lt 9f
+
+ sub x0, x0, w9, uxtw
+ add x0, x0, x1, lsl #3
+ msub x2, x8, x9, x2
+ add x2, x2, #2*8
+ b 1b
+9:
+ ret
+endfunc
+
+.macro shift_8_regs op, shift
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ \op \i, \i, #\shift
+.endr
+.endm
+
+.macro def_identity_1632 w, h, wshort, hshort
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+ mov w16, #2896*8
+ mov w17, #2*(5793-4096)*8
+ dup v1.4h, w16
+ movi v0.8h, #0
+ mov v1.h[1], w17
+ movrel x13, eob_16x32\hshort
+
+ mov x8, #2*\h
+1:
+ mov w9, #0
+ movrel x12, eob_16x32\wshort
+2:
+ add w9, w9, #8
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ ld1 {\i}, [x2]
+ st1 {v0.8h}, [x2], x8
+.endr
+ scale_input .8h, v1.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+.if \w == 16
+ // 16x32
+ identity_8x8_shift1 v1.h[1]
+.else
+ // 32x16
+ shift_8_regs sqshl, 1
+ identity_8x8 v1.h[1]
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+.if \w == 16
+ load_add_store_8x8 x0, x7, shiftbits=2
+.else
+ load_add_store_8x8 x0, x7, shiftbits=4
+.endif
+ ldrh w11, [x12], #2
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #8
+ cmp w3, w11
+ b.ge 2b
+
+ ldrh w11, [x13], #2
+ cmp w3, w11
+ b.lt 9f
+
+ sub x0, x0, w9, uxtw
+ add x0, x0, x1, lsl #3
+ msub x2, x8, x9, x2
+ add x2, x2, #2*8
+ b 1b
+9:
+ ret
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+.macro def_identity_832 w, h
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+ movi v0.8h, #0
+ movrel x13, eob_8x32
+
+ mov w8, #2*\h
+1:
+ ldrh w12, [x13], #2
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ ld1 {\i}, [x2]
+ st1 {v0.8h}, [x2], x8
+.endr
+
+.if \w == 8
+ // 8x32
+ shift_8_regs srshr, 1
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+ cmp w3, w12
+.if \w == 8
+ load_add_store_8x8 x0, x7, shiftbits=2
+.else
+ load_add_store_8x8 x0, x7, shiftbits=3
+.endif
+
+ b.lt 9f
+.if \w == 8
+ sub x2, x2, x8, lsl #3
+ add x2, x2, #2*8
+.else
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #8
+.endif
+ b 1b
+
+9:
+ ret
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1
+ idct_dc 32, 32, 2
+
+ mov x15, x30
+ sub sp, sp, #2048
+ movrel x13, eob_32x32
+ ldrh w12, [x13], #2
+
+.irp i, 0, 8, 16, 24
+ add x6, sp, #(\i*32*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 24
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_horz_dct_32x8_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x6, x0, #(\i)
+ add x7, sp, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, sp, #2048
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1
+ idct_dc 16, 32, 1
+
+ mov x15, x30
+ sub sp, sp, #1024
+ movrel x13, eob_16x32
+ ldrh w12, [x13], #2
+ adr x4, inv_dct_8h_x16_neon
+
+.irp i, 0, 8, 16, 24
+ add x6, sp, #(\i*16*2)
+ add x7, x2, #(\i*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 24
+ ldrh w12, [x13], #2
+.endif
+.endif
+ mov x8, #2*32
+ bl inv_txfm_horz_scale_16x8_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #8
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8
+ add x6, x0, #(\i)
+ add x7, sp, #(\i*2)
+ mov x8, #16*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, sp, #1024
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1
+ idct_dc 32, 16, 1
+
+ mov x15, x30
+ sub sp, sp, #1024
+
+ adr x5, inv_dct_8h_x16_neon
+
+.irp i, 0, 8
+ add x6, sp, #(\i*32*2)
+ add x7, x2, #(\i*2)
+.if \i > 0
+ mov w8, #(16 - \i)
+ cmp w3, #36
+ b.lt 1f
+.endif
+ mov x8, #2*16
+ bl inv_txfm_horz_scale_dct_32x8_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x6, x0, #(\i)
+ add x7, sp, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, sp, #1024
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1
+ idct_dc 8, 32, 2
+
+ mov x15, x30
+ sub sp, sp, #512
+
+ movrel x13, eob_8x32
+
+ movi v28.8h, #0
+ mov x8, #2*32
+ mov w9, #32
+ mov x6, sp
+1:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x2]
+ st1 {v28.8h}, [x2], x8
+.endr
+ ldrh w12, [x13], #2
+ sub x2, x2, x8, lsl #3
+ sub w9, w9, #8
+ add x2, x2, #2*8
+
+ bl inv_dct_8h_x8_neon
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ srshr v\i\().8h, v\i\().8h, #2
+.endr
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
+ cmp w3, w12
+ st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
+
+ b.ge 1b
+ cbz w9, 3f
+
+ movi v29.8h, #0
+ movi v30.8h, #0
+ movi v31.8h, #0
+2:
+ subs w9, w9, #8
+.rept 2
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+ mov x6, x0
+ mov x7, sp
+ mov x8, #8*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1
+ idct_dc 32, 8, 2
+
+ mov x15, x30
+ sub sp, sp, #512
+
+ mov x6, sp
+ mov x7, x2
+ mov x8, #8*2
+ bl inv_txfm_horz_dct_32x8_neon
+
+ mov x8, #2*32
+ mov w9, #0
+1:
+ add x6, x0, x9
+ add x7, sp, x9, lsl #1 // #(\i*2)
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ add w9, w9, #8
+
+ bl inv_dct_8h_x8_neon
+
+ cmp w9, #32
+
+ load_add_store_8x8 x6, x7
+
+ b.lt 1b
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
+function inv_dct64_step1_neon
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
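+ // Each call consumes four input vectors (v16-v19) plus 16 coefficients
+ // from x17 and appends the eight resulting t values (v16-v23) at x6.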
+
+ ld1 {v0.8h, v1.8h}, [x17], #32
+
+ sqrdmulh v23.8h, v16.8h, v0.h[1] // t63a
+ sqrdmulh v16.8h, v16.8h, v0.h[0] // t32a
+ sqrdmulh v22.8h, v17.8h, v0.h[2] // t62a
+ sqrdmulh v17.8h, v17.8h, v0.h[3] // t33a
+ sqrdmulh v21.8h, v18.8h, v0.h[5] // t61a
+ sqrdmulh v18.8h, v18.8h, v0.h[4] // t34a
+ sqrdmulh v20.8h, v19.8h, v0.h[6] // t60a
+ sqrdmulh v19.8h, v19.8h, v0.h[7] // t35a
+
+ sqadd v24.8h, v16.8h, v17.8h // t32
+ sqsub v25.8h, v16.8h, v17.8h // t33
+ sqsub v26.8h, v19.8h, v18.8h // t34
+ sqadd v27.8h, v19.8h, v18.8h // t35
+ sqadd v28.8h, v20.8h, v21.8h // t60
+ sqsub v29.8h, v20.8h, v21.8h // t61
+ sqsub v30.8h, v23.8h, v22.8h // t62
+ sqadd v31.8h, v23.8h, v22.8h // t63
+
+ smull_smlal v2, v3, v29, v26, v1.h[0], v1.h[1], .8h // -> t34a
+ smull_smlsl v4, v5, v29, v26, v1.h[1], v1.h[0], .8h // -> t61a
+ neg v2.4s, v2.4s // t34a
+ neg v3.4s, v3.4s // t34a
+ smull_smlsl v6, v7, v30, v25, v1.h[1], v1.h[0], .8h // -> t33a
+ sqrshrn_sz v26, v2, v3, #12, .8h // t34a
+ smull_smlal v2, v3, v30, v25, v1.h[0], v1.h[1], .8h // -> t62a
+ sqrshrn_sz v29, v4, v5, #12, .8h // t61a
+ sqrshrn_sz v25, v6, v7, #12, .8h // t33a
+ sqrshrn_sz v30, v2, v3, #12, .8h // t62a
+
+ sqadd v16.8h, v24.8h, v27.8h // t32a
+ sqsub v19.8h, v24.8h, v27.8h // t35a
+ sqadd v17.8h, v25.8h, v26.8h // t33
+ sqsub v18.8h, v25.8h, v26.8h // t34
+ sqsub v20.8h, v31.8h, v28.8h // t60a
+ sqadd v23.8h, v31.8h, v28.8h // t63a
+ sqsub v21.8h, v30.8h, v29.8h // t61
+ sqadd v22.8h, v30.8h, v29.8h // t62
+
+ smull_smlal v2, v3, v21, v18, v1.h[2], v1.h[3], .8h // -> t61a
+ smull_smlsl v4, v5, v21, v18, v1.h[3], v1.h[2], .8h // -> t34a
+ smull_smlal v6, v7, v20, v19, v1.h[2], v1.h[3], .8h // -> t60
+ sqrshrn_sz v21, v2, v3, #12, .8h // t61a
+ sqrshrn_sz v18, v4, v5, #12, .8h // t34a
+ smull_smlsl v2, v3, v20, v19, v1.h[3], v1.h[2], .8h // -> t35
+ sqrshrn_sz v20, v6, v7, #12, .8h // t60
+ sqrshrn_sz v19, v2, v3, #12, .8h // t35
+
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
+ st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
+
+ ret
+endfunc
+
+function inv_dct64_step2_neon
+ movrel x16, idct_coeffs
+ ld1 {v0.4h}, [x16]
+1:
+ // t32a/33/34a/35/60/61a/62/63a
+ // t56a/57/58a/59/36/37a/38/39a
+ // t40a/41/42a/43/52/53a/54/55a
+ // t48a/49/50a/51/44/45a/46/47a
+ ldr q16, [x6, #2*8*0] // t32a
+ ldr q17, [x9, #2*8*8] // t39a
+ ldr q18, [x9, #2*8*0] // t63a
+ ldr q19, [x6, #2*8*8] // t56a
+ ldr q20, [x6, #2*8*16] // t40a
+ ldr q21, [x9, #2*8*24] // t47a
+ ldr q22, [x9, #2*8*16] // t55a
+ ldr q23, [x6, #2*8*24] // t48a
+
+ sqadd v24.8h, v16.8h, v17.8h // t32
+ sqsub v25.8h, v16.8h, v17.8h // t39
+ sqadd v26.8h, v18.8h, v19.8h // t63
+ sqsub v27.8h, v18.8h, v19.8h // t56
+ sqsub v28.8h, v21.8h, v20.8h // t40
+ sqadd v29.8h, v21.8h, v20.8h // t47
+ sqadd v30.8h, v23.8h, v22.8h // t48
+ sqsub v31.8h, v23.8h, v22.8h // t55
+
+ smull_smlal v2, v3, v27, v25, v0.h[3], v0.h[2], .8h // -> t56a
+ smull_smlsl v4, v5, v27, v25, v0.h[2], v0.h[3], .8h // -> t39a
+ smull_smlal v6, v7, v31, v28, v0.h[3], v0.h[2], .8h // -> t40a
+ sqrshrn_sz v25, v2, v3, #12, .8h // t56a
+ sqrshrn_sz v27, v4, v5, #12, .8h // t39a
+ neg v6.4s, v6.4s // t40a
+ neg v7.4s, v7.4s // t40a
+ smull_smlsl v2, v3, v31, v28, v0.h[2], v0.h[3], .8h // -> t55a
+ sqrshrn_sz v31, v6, v7, #12, .8h // t40a
+ sqrshrn_sz v28, v2, v3, #12, .8h // t55a
+
+ sqadd v16.8h, v24.8h, v29.8h // t32a
+ sqsub v19.8h, v24.8h, v29.8h // t47a
+ sqadd v17.8h, v27.8h, v31.8h // t39
+ sqsub v18.8h, v27.8h, v31.8h // t40
+ sqsub v20.8h, v26.8h, v30.8h // t48a
+ sqadd v23.8h, v26.8h, v30.8h // t63a
+ sqsub v21.8h, v25.8h, v28.8h // t55
+ sqadd v22.8h, v25.8h, v28.8h // t56
+
+ smull_smlsl v2, v3, v21, v18, v0.h[0], v0.h[0], .8h // -> t40a
+ smull_smlal v4, v5, v21, v18, v0.h[0], v0.h[0], .8h // -> t55a
+ smull_smlsl v6, v7, v20, v19, v0.h[0], v0.h[0], .8h // -> t47
+ sqrshrn_sz v18, v2, v3, #12, .8h // t40a
+ sqrshrn_sz v21, v4, v5, #12, .8h // t55a
+ smull_smlal v2, v3, v20, v19, v0.h[0], v0.h[0], .8h // -> t48
+ sqrshrn_sz v19, v6, v7, #12, .8h // t47
+ sqrshrn_sz v20, v2, v3, #12, .8h // t48
+
+ str q16, [x6, #2*8*0] // t32a
+ str q17, [x9, #2*8*0] // t39
+ str q18, [x6, #2*8*8] // t40a
+ str q19, [x9, #2*8*8] // t47
+ str q20, [x6, #2*8*16] // t48
+ str q21, [x9, #2*8*16] // t55a
+ str q22, [x6, #2*8*24] // t56
+ str q23, [x9, #2*8*24] // t63a
+
+ add x6, x6, #2*8
+ sub x9, x9, #2*8
+ cmp x6, x9
+ b.lt 1b
+ ret
+endfunc
+
+.macro load8 src, strd, zero, clear
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+.if \clear
+ ld1 {\i}, [\src]
+ st1 {\zero}, [\src], \strd
+.else
+ ld1 {\i}, [\src], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ st1 {\i}, [\dst], #16
+.endr
+.endm
+
+.macro clear_upper8
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ movi \i, #0
+.endr
+.endm
+
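+// Conditional helpers: each *_if macro emits its instruction only when \cond
+// is nonzero, letting def_dct64_func below generate the plain, _clear (zero
+// the coefficients as they are read) and _clear_scale (additionally scale by
+// 2896/4096, i.e. 1/sqrt(2)) variants of the 64-point first pass from one body.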
+.macro movi_if reg, val, cond
+.if \cond
+ movi \reg, \val
+.endif
+.endm
+
+.macro movdup_if reg, gpr, val, cond
+.if \cond
+ mov \gpr, \val
+ dup \reg, \gpr
+.endif
+.endm
+
+.macro st1_if regs, dst, cond
+.if \cond
+ st1 \regs, \dst
+.endif
+.endm
+
+.macro str_if reg, dst, cond
+.if \cond
+ str \reg, \dst
+.endif
+.endm
+
+.macro stroff_if reg, dst, dstoff, cond
+.if \cond
+ str \reg, \dst, \dstoff
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
+.if \cond
+ scale_input .8h, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
+.macro def_dct64_func suffix, clear=0, scale=0
+function inv_txfm_dct\suffix\()_8h_x64_neon, export=1
+ mov x14, x30
+ mov x6, sp
+ lsl x8, x8, #2
+
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ load8 x7, x8, v7.8h, \clear
+ clear_upper8
+ sub x7, x7, x8, lsl #3
+ add x7, x7, x8, lsr #1
+ scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ bl inv_dct_8h_x16_neon
+
+ store16 x6
+
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ load8 x7, x8, v7.8h, \clear
+ clear_upper8
+ sub x7, x7, x8, lsl #3
+ lsr x8, x8, #1
+ sub x7, x7, x8, lsr #1
+ scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ bl inv_dct32_odd_8h_x16_neon
+
+ add x10, x6, #16*15
+ sub x6, x6, #16*16
+
+ mov x9, #-16
+
+.macro store_addsub r0, r1, r2, r3
+ ld1 {v2.8h}, [x6], #16
+ ld1 {v3.8h}, [x6], #16
+ sqadd v6.8h, v2.8h, \r0
+ sqsub \r0, v2.8h, \r0
+ ld1 {v4.8h}, [x6], #16
+ sqadd v7.8h, v3.8h, \r1
+ sqsub \r1, v3.8h, \r1
+ ld1 {v5.8h}, [x6], #16
+ sqadd v2.8h, v4.8h, \r2
+ sub x6, x6, #16*4
+ sqsub \r2, v4.8h, \r2
+ st1 {v6.8h}, [x6], #16
+ st1 {\r0}, [x10], x9
+ sqadd v3.8h, v5.8h, \r3
+ sqsub \r3, v5.8h, \r3
+ st1 {v7.8h}, [x6], #16
+ st1 {\r1}, [x10], x9
+ st1 {v2.8h}, [x6], #16
+ st1 {\r2}, [x10], x9
+ st1 {v3.8h}, [x6], #16
+ st1 {\r3}, [x10], x9
+.endm
+ store_addsub v31.8h, v30.8h, v29.8h, v28.8h
+ store_addsub v27.8h, v26.8h, v25.8h, v24.8h
+ store_addsub v23.8h, v22.8h, v21.8h, v20.8h
+ store_addsub v19.8h, v18.8h, v17.8h, v16.8h
+.purgem store_addsub
+
+ add x6, x6, #2*8*16
+
+ movrel x17, idct64_coeffs
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ add x9, x7, x8, lsl #4 // offset 16
+ add x10, x7, x8, lsl #3 // offset 8
+ sub x9, x9, x8 // offset 15
+ sub x11, x10, x8 // offset 7
+ ld1 {v16.8h}, [x7] // in1 (offset 0)
+ ld1 {v17.8h}, [x9] // in31 (offset 15)
+ ld1 {v18.8h}, [x10] // in17 (offset 8)
+ ld1 {v19.8h}, [x11] // in15 (offset 7)
+ st1_if {v7.8h}, [x7], \clear
+ st1_if {v7.8h}, [x9], \clear
+ st1_if {v7.8h}, [x10], \clear
+ st1_if {v7.8h}, [x11], \clear
+ scale_if \scale, v0.h[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ add x7, x7, x8, lsl #2 // offset 4
+ sub x9, x9, x8, lsl #2 // offset 11
+ sub x10, x7, x8 // offset 3
+ add x11, x9, x8 // offset 12
+ ld1 {v16.8h}, [x10] // in7 (offset 3)
+ ld1 {v17.8h}, [x11] // in25 (offset 12)
+ ld1 {v18.8h}, [x9] // in23 (offset 11)
+ ld1 {v19.8h}, [x7] // in9 (offset 4)
+ st1_if {v7.8h}, [x7], \clear
+ st1_if {v7.8h}, [x9], \clear
+ st1_if {v7.8h}, [x10], \clear
+ st1_if {v7.8h}, [x11], \clear
+ scale_if \scale, v0.h[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ sub x10, x10, x8, lsl #1 // offset 1
+ sub x9, x9, x8, lsl #1 // offset 9
+ add x7, x7, x8 // offset 5
+ add x11, x11, x8 // offset 13
+ ldr q16, [x10, x8] // in5 (offset 2)
+ ldr q17, [x11] // in27 (offset 13)
+ ldr q18, [x9, x8] // in21 (offset 10)
+ ldr q19, [x7] // in11 (offset 5)
+ stroff_if q7, [x10, x8], \clear
+ str_if q7, [x11], \clear
+ stroff_if q7, [x9, x8], \clear
+ str_if q7, [x7], \clear
+ scale_if \scale, v0.h[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ ldr q16, [x10] // in3 (offset 1)
+ ldr q17, [x11, x8] // in29 (offset 14)
+ ldr q18, [x9] // in19 (offset 9)
+ ldr q19, [x7, x8] // in13 (offset 6)
+ str_if q7, [x10], \clear
+ stroff_if q7, [x11, x8], \clear
+ str_if q7, [x9], \clear
+ stroff_if q7, [x7, x8], \clear
+ scale_if \scale, v0.h[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+
+ sub x6, x6, #2*8*32
+ add x9, x6, #2*8*7
+
+ bl inv_dct64_step2_neon
+
+ ret x14
+endfunc
+.endm
+
+def_dct64_func
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
+
+
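+// Second half of the 64-point first pass: reads the 64 outputs left in the
+// stack scratch by inv_txfm_dct*_8h_x64_neon (even half forwards via x7, odd
+// half backwards via x8), transposes 8x8 tiles, applies the rounding shift
+// passed in w12 (as a negative srshl amount in v7) and stores 64-wide rows
+// at x6.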
+function inv_txfm_horz_dct_64x8_neon
+ mov x14, x30
+
+ mov x7, sp
+ add x8, sp, #2*8*(64 - 4)
+ add x9, x6, #2*56
+ mov x10, #2*64
+ mov x11, #-2*8*4
+
+ dup v7.8h, w12
+1:
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
+ ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+ transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5
+
+.macro store_addsub src0, src1, src2, src3
+ sqsub v1.8h, \src0, \src1
+ sqadd v0.8h, \src0, \src1
+ sqsub v3.8h, \src2, \src3
+ srshl v1.8h, v1.8h, v7.8h
+ sqadd v2.8h, \src2, \src3
+ srshl v0.8h, v0.8h, v7.8h
+ srshl v3.8h, v3.8h, v7.8h
+ rev64 v1.8h, v1.8h
+ srshl v2.8h, v2.8h, v7.8h
+ rev64 v3.8h, v3.8h
+ ext v1.16b, v1.16b, v1.16b, #8
+ st1 {v0.8h}, [x6], x10
+ ext v3.16b, v3.16b, v3.16b, #8
+ st1 {v1.8h}, [x9], x10
+ st1 {v2.8h}, [x6], x10
+ st1 {v3.8h}, [x9], x10
+.endm
+ store_addsub v16.8h, v31.8h, v17.8h, v30.8h
+ store_addsub v18.8h, v29.8h, v19.8h, v28.8h
+ store_addsub v20.8h, v27.8h, v21.8h, v26.8h
+ store_addsub v22.8h, v25.8h, v23.8h, v24.8h
+.purgem store_addsub
+ sub x6, x6, x10, lsl #3
+ sub x9, x9, x10, lsl #3
+ add x6, x6, #16
+ sub x9, x9, #16
+
+ cmp x7, x8
+ b.lt 1b
+ ret x14
+endfunc
+
+function inv_txfm_add_vert_dct_8x64_neon
+ mov x14, x30
+ lsl x8, x8, #1
+
+ mov x7, sp
+ add x8, sp, #2*8*(64 - 4)
+ add x9, x6, x1, lsl #6
+ sub x9, x9, x1
+ neg x10, x1
+ mov x11, #-2*8*4
+
+1:
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
+ ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
+
+.macro add_dest_addsub src0, src1, src2, src3
+ ld1 {v0.8b}, [x6], x1
+ ld1 {v1.8b}, [x9], x10
+ sqadd v4.8h, \src0, \src1
+ ld1 {v2.8b}, [x6]
+ sqsub v5.8h, \src0, \src1
+ ld1 {v3.8b}, [x9]
+ sqadd v6.8h, \src2, \src3
+ sqsub v7.8h, \src2, \src3
+ sub x6, x6, x1
+ sub x9, x9, x10
+ srshr v4.8h, v4.8h, #4
+ srshr v5.8h, v5.8h, #4
+ srshr v6.8h, v6.8h, #4
+ uaddw v4.8h, v4.8h, v0.8b
+ srshr v7.8h, v7.8h, #4
+ uaddw v5.8h, v5.8h, v1.8b
+ uaddw v6.8h, v6.8h, v2.8b
+ sqxtun v0.8b, v4.8h
+ uaddw v7.8h, v7.8h, v3.8b
+ sqxtun v1.8b, v5.8h
+ st1 {v0.8b}, [x6], x1
+ sqxtun v2.8b, v6.8h
+ st1 {v1.8b}, [x9], x10
+ sqxtun v3.8b, v7.8h
+ st1 {v2.8b}, [x6], x1
+ st1 {v3.8b}, [x9], x10
+.endm
+ add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h
+ add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h
+ add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h
+ add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h
+.purgem add_dest_addsub
+ cmp x7, x8
+ b.lt 1b
+
+ ret x14
+endfunc
+
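+// The 64-point composite transforms reserve 64*8*2 bytes at the bottom of the
+// frame as scratch for the x64 1-D helpers (which run with their work buffer
+// at sp) and keep the transposed first-pass rows in the larger buffer above
+// it (x5, or x4 for 64x16).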
+function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
+ idct_dc 64, 64, 2
+
+ mov x15, x30
+
+ sub_sp 64*32*2+64*8*2
+ add x5, sp, #64*8*2
+
+ movrel x13, eob_32x32
+
+.irp i, 0, 8, 16, 24
+ add x6, x5, #(\i*64*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #32*2
+ mov x12, #-2 // shift
+ bl inv_txfm_dct_clear_8h_x64_neon
+ add x6, x5, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x8_neon
+.if \i < 24
+ ldrh w12, [x13], #2
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x7, x5, #(\i*2)
+ mov x8, #64*2
+ bl inv_txfm_dct_8h_x64_neon
+ add x6, x0, #(\i)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #64*32*2
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1
+ idct_dc 64, 32, 1
+
+ mov x15, x30
+
+ sub_sp 64*32*2+64*8*2
+ add x5, sp, #64*8*2
+
+ movrel x13, eob_32x32
+
+.irp i, 0, 8, 16, 24
+ add x6, x5, #(\i*64*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #32*2
+ mov x12, #-1 // shift
+ bl inv_txfm_dct_clear_scale_8h_x64_neon
+ add x6, x5, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x8_neon
+.if \i < 24
+ ldrh w12, [x13], #2
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x6, x0, #(\i)
+ add x7, x5, #(\i*2)
+ mov x8, #64*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, x5, #64*32*2
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
+ idct_dc 32, 64, 1
+
+ mov x15, x30
+
+ sub_sp 32*32*2+64*8*2
+ add x5, sp, #64*8*2
+
+ movrel x13, eob_32x32
+ ldrh w12, [x13], #2
+
+.irp i, 0, 8, 16, 24
+ add x6, x5, #(\i*32*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 24
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_horz_scale_dct_32x8_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x7, x5, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_dct_8h_x64_neon
+ add x6, x0, #(\i)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #32*32*2
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
+ idct_dc 64, 16, 2
+
+ mov x15, x30
+
+ sub_sp 64*16*2+64*8*2
+ add x4, sp, #64*8*2
+
+ movrel x13, eob_16x32
+
+.irp i, 0, 8
+ add x6, x4, #(\i*64*2)
+.if \i > 0
+ mov w8, #(16 - \i)
+ cmp w3, w12
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #16*2
+ mov x12, #-2 // shift
+ bl inv_txfm_dct_clear_8h_x64_neon
+ add x6, x4, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x8_neon
+.if \i < 8
+ ldrh w12, [x13], #2
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+ adr x5, inv_dct_8h_x16_neon
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x6, x0, #(\i)
+ add x7, x4, #(\i*2)
+ mov x8, #64*2
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, x4, #64*16*2
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
+ idct_dc 16, 64, 2
+
+ mov x15, x30
+
+ sub_sp 16*32*2+64*8*2
+ add x5, sp, #64*8*2
+
+ movrel x13, eob_16x32
+ ldrh w12, [x13], #2
+
+ adr x4, inv_dct_8h_x16_neon
+.irp i, 0, 8, 16, 24
+ add x6, x5, #(\i*16*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 24
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_horz_16x8_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #8
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8
+ add x7, x5, #(\i*2)
+ mov x8, #16*2
+ bl inv_txfm_dct_8h_x64_neon
+ add x6, x0, #(\i)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #16*32*2
+ ret x15
+endfunc
diff --git a/third_party/dav1d/src/arm/64/itx16.S b/third_party/dav1d/src/arm/64/itx16.S
new file mode 100644
index 0000000000..eee3a9636d
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/itx16.S
@@ -0,0 +1,3648 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob,
+// int bitdepth_max);
+
+// Most of the functions use the following register layout:
+// x0-x3 external parameters
+// x4 function pointer to first transform
+// x5 function pointer to second transform
+// x6 output parameter for helper function
+// x7 input parameter for helper function
+// x8 input stride for helper function
+// x9-x12 scratch variables for helper functions
+// x13 pointer to list of eob thresholds
+// x14 return pointer for helper function
+// x15 return pointer for main function
+
+// The SIMD registers most often use the following layout:
+// v0-v1 multiplication coefficients
+// v2-v7 scratch registers
+// v8-v15 unused
+// v16-v31 inputs/outputs of transforms
+
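+// The transform coefficients are cos/sin constants in 12 bit fixed point
+// (e.g. 2896 ~= 4096/sqrt(2), 1567/3784 ~= 4096*cos/sin(6*pi/16)). Entries
+// written as c*8*(1<<16) are pre-shifted so that an sqrdmulh of a 32-bit lane
+// by them multiplies by c/4096.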
+const idct_coeffs, align=4
+ // idct4
+ .int 2896, 2896*8*(1<<16), 1567, 3784
+ // idct8
+ .int 799, 4017, 3406, 2276
+ // idct16
+ .int 401, 4076, 3166, 2598
+ .int 1931, 3612, 3920, 1189
+ // idct32
+ .int 201, 4091, 3035, 2751
+ .int 1751, 3703, 3857, 1380
+ .int 995, 3973, 3513, 2106
+ .int 2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4
+ .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16)
+ .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16)
+ .int 4076, 401, 4017, 799
+
+ .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16)
+ .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16)
+ .int -3166, -2598, -799, -4017
+
+ .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16)
+ .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16)
+ .int 3612, 1931, 2276, 3406
+
+ .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16)
+ .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16)
+ .int -3920, -1189, -3406, -2276
+endconst
+
+const iadst4_coeffs, align=4
+ .int 1321, 3803, 2482, 3344
+endconst
+
+const iadst8_coeffs, align=4
+ .int 4076, 401, 3612, 1931
+ .int 2598, 3166, 1189, 3920
+ // idct_coeffs
+ .int 2896, 0, 1567, 3784
+endconst
+
+const iadst16_coeffs, align=4
+ .int 4091, 201, 3973, 995
+ .int 3703, 1751, 3290, 2440
+ .int 2751, 3035, 2106, 3513
+ .int 1380, 3857, 601, 4052
+endconst
+
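+// mul_mla/mul_mls: d = s0*c0 +/- s1*c1 on 32-bit lanes; the callers shift the
+// result back down to coefficient scale afterwards (srshr #12).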
+.macro mul_mla d, s0, s1, c0, c1
+ mul \d\().4s, \s0\().4s, \c0
+ mla \d\().4s, \s1\().4s, \c1
+.endm
+
+.macro mul_mls d, s0, s1, c0, c1
+ mul \d\().4s, \s0\().4s, \c0
+ mls \d\().4s, \s1\().4s, \c1
+.endm
+
+.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7
+ sqrdmulh \r0\sz, \r0\sz, \c
+ sqrdmulh \r1\sz, \r1\sz, \c
+ sqrdmulh \r2\sz, \r2\sz, \c
+ sqrdmulh \r3\sz, \r3\sz, \c
+.ifnb \r4
+ sqrdmulh \r4\sz, \r4\sz, \c
+ sqrdmulh \r5\sz, \r5\sz, \c
+ sqrdmulh \r6\sz, \r6\sz, \c
+ sqrdmulh \r7\sz, \r7\sz, \c
+.endif
+.endm
+
+.macro smin_4s r0, r1, r2
+ smin \r0\().4s, \r1\().4s, \r2\().4s
+.endm
+.macro smax_4s r0, r1, r2
+ smax \r0\().4s, \r1\().4s, \r2\().4s
+.endm
+
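+// Final output helpers: round the residual (srshr), add it to the destination
+// pixels with unsigned saturation (usqadd), clamp against the limit in v7
+// (smin) and store. The per-row operands are passed in staggered across
+// invocations so loads, arithmetic and stores of neighbouring rows interleave.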
+.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4
+.ifnb \load
+ ld1 {\load}, [\src], x1
+.endif
+.ifnb \shift
+ srshr \shift, \shift, #\shiftbits
+.endif
+.ifnb \addsrc
+ usqadd \adddst, \addsrc
+.endif
+.ifnb \min
+ smin \min, \min, v7.8h
+.endif
+.ifnb \store
+ st1 {\store}, [\dst], x1
+.endif
+.endm
+.macro load_add_store_8x16 dst, src
+ mov \src, \dst
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store v2.8h, v16.8h, , , , , \dst, \src
+ load_add_store v3.8h, v17.8h, , , , , \dst, \src
+ load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src
+ load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src
+ load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src
+ load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src
+ load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src
+ load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src
+ load_add_store v20.8h, v24.8h, v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src
+ load_add_store v21.8h, v25.8h, v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src
+ load_add_store v22.8h, v26.8h, v24.8h, v20.8h, v19.8h, v18.8h, \dst, \src
+ load_add_store v23.8h, v27.8h, v25.8h, v21.8h, v20.8h, v19.8h, \dst, \src
+ load_add_store v24.8h, v28.8h, v26.8h, v22.8h, v21.8h, v20.8h, \dst, \src
+ load_add_store v25.8h, v29.8h, v27.8h, v23.8h, v22.8h, v21.8h, \dst, \src
+ load_add_store v26.8h, v30.8h, v28.8h, v24.8h, v23.8h, v22.8h, \dst, \src
+ load_add_store v27.8h, v31.8h, v29.8h, v25.8h, v24.8h, v23.8h, \dst, \src
+ load_add_store , , v30.8h, v26.8h, v25.8h, v24.8h, \dst, \src
+ load_add_store , , v31.8h, v27.8h, v26.8h, v25.8h, \dst, \src
+ load_add_store , , , , v27.8h, v26.8h, \dst, \src
+ load_add_store , , , , , v27.8h, \dst, \src
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4
+ mov \src, \dst
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits
+ load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits
+ load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits
+ load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits
+ load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits
+ load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits
+ load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src, \shiftbits
+ load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src, \shiftbits
+ load_add_store , , v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
+ load_add_store , , v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
+ load_add_store , , , , v19.8h, v18.8h, \dst, \src, \shiftbits
+ load_add_store , , , , , v19.8h, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src, shiftbits=4
+ mov \src, \dst
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits
+ load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits
+ load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits
+ load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits
+ load_add_store , , v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits
+ load_add_store , , v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits
+ load_add_store , , , , v5.8h, v4.8h, \dst, \src, \shiftbits
+ load_add_store , , , , , v5.8h, \dst, \src, \shiftbits
+.endm
+.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, min, store, dst, src
+.ifnb \load
+ ld1 {\load}[0], [\src], x1
+.endif
+.ifnb \inssrc
+ ins \insdst\().d[1], \inssrc\().d[0]
+.endif
+.ifnb \shift
+ srshr \shift, \shift, #4
+.endif
+.ifnb \load
+ ld1 {\load}[1], [\src], x1
+.endif
+.ifnb \addsrc
+ usqadd \adddst, \addsrc
+.endif
+.ifnb \store
+ st1 {\store}[0], [\dst], x1
+.endif
+.ifnb \min
+ smin \min, \min, v7.8h
+.endif
+.ifnb \store
+ st1 {\store}[1], [\dst], x1
+.endif
+.endm
+.macro load_add_store_4x16 dst, src
+ mov \src, \dst
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store4 v0.d, v17, v16, , , , , , \dst, \src
+ load_add_store4 v1.d, v19, v18, , , , , , \dst, \src
+ load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src
+ load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src
+ load_add_store4 v17.d, v25, v24, v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src
+ load_add_store4 v19.d, v27, v26, v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src
+ load_add_store4 v21.d, v29, v28, v24.8h, v22.8h, v3.8h, v2.8h, v1.d, \dst, \src
+ load_add_store4 v23.d, v31, v30, v26.8h, v24.8h, v17.8h, v3.8h, v2.d, \dst, \src
+ load_add_store4 , , , v28.8h, v26.8h, v19.8h, v17.8h, v3.d, \dst, \src
+ load_add_store4 , , , v30.8h, v28.8h, v21.8h, v19.8h, v17.d, \dst, \src
+ load_add_store4 , , , , v30.8h, v23.8h, v21.8h, v19.d, \dst, \src
+ load_add_store4 , , , , , , v23.8h, v21.d, \dst, \src
+ load_add_store4 , , , , , , , v23.d, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src
+ mov \src, \dst
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store4 v0.d, v17, v16, , , , , , \dst, \src
+ load_add_store4 v1.d, v19, v18, , , , , , \dst, \src
+ load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src
+ load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src
+ load_add_store4 , , , v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src
+ load_add_store4 , , , v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src
+ load_add_store4 , , , , v22.8h, v3.8h, v2.8h, v1.d, \dst, \src
+ load_add_store4 , , , , , , v3.8h, v2.d, \dst, \src
+ load_add_store4 , , , , , , , v3.d, \dst, \src
+.endm
+
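+// DC-only fast path: when eob (w3) is zero only the DC coefficient can be
+// nonzero, so scale it by 2896/4096 (~1/sqrt(2)) once per pass (and once more
+// for 2:1 rectangular sizes), apply the intermediate \shift and the final #4
+// rounding, then branch to the width-specific add loop with w4 = \h rows.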
+.macro idct_dc w, h, shift
+ cbnz w3, 1f
+ movz w16, #2896*8, lsl #16
+ ld1r {v16.4s}, [x2]
+ dup v0.2s, w16
+ sqrdmulh v20.4s, v16.4s, v0.s[0]
+ str wzr, [x2]
+.if (\w == 2*\h) || (2*\w == \h)
+ sqrdmulh v20.4s, v20.4s, v0.s[0]
+.endif
+.if \shift > 0
+ sqrshrn v16.4h, v20.4s, #\shift
+ sqrshrn2 v16.8h, v20.4s, #\shift
+.else
+ sqxtn v16.4h, v20.4s
+ sqxtn2 v16.8h, v20.4s
+.endif
+ sqrdmulh v16.8h, v16.8h, v0.h[1]
+ srshr v16.8h, v16.8h, #4
+ mov w4, #\h
+ b idct_dc_w\w\()_neon
+1:
+.endm
+
+function idct_dc_w4_neon
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+1:
+ ld1 {v0.d}[0], [x0], x1
+ ld1 {v0.d}[1], [x0], x1
+ ld1 {v1.d}[0], [x0], x1
+ subs w4, w4, #4
+ ld1 {v1.d}[1], [x0], x1
+ usqadd v0.8h, v16.8h
+ sub x0, x0, x1, lsl #2
+ usqadd v1.8h, v16.8h
+ smin v0.8h, v0.8h, v31.8h
+ st1 {v0.d}[0], [x0], x1
+ smin v1.8h, v1.8h, v31.8h
+ st1 {v0.d}[1], [x0], x1
+ st1 {v1.d}[0], [x0], x1
+ st1 {v1.d}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w8_neon
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+1:
+ ld1 {v0.8h}, [x0], x1
+ subs w4, w4, #4
+ ld1 {v1.8h}, [x0], x1
+ usqadd v0.8h, v16.8h
+ ld1 {v2.8h}, [x0], x1
+ usqadd v1.8h, v16.8h
+ ld1 {v3.8h}, [x0], x1
+ usqadd v2.8h, v16.8h
+ usqadd v3.8h, v16.8h
+ sub x0, x0, x1, lsl #2
+ smin v0.8h, v0.8h, v31.8h
+ smin v1.8h, v1.8h, v31.8h
+ st1 {v0.8h}, [x0], x1
+ smin v2.8h, v2.8h, v31.8h
+ st1 {v1.8h}, [x0], x1
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.8h}, [x0], x1
+ st1 {v3.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w16_neon
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+1:
+ ld1 {v0.8h, v1.8h}, [x0], x1
+ subs w4, w4, #2
+ ld1 {v2.8h, v3.8h}, [x0], x1
+ usqadd v0.8h, v16.8h
+ usqadd v1.8h, v16.8h
+ sub x0, x0, x1, lsl #1
+ usqadd v2.8h, v16.8h
+ usqadd v3.8h, v16.8h
+ smin v0.8h, v0.8h, v31.8h
+ smin v1.8h, v1.8h, v31.8h
+ smin v2.8h, v2.8h, v31.8h
+ st1 {v0.8h, v1.8h}, [x0], x1
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.8h, v3.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w32_neon
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
+ subs w4, w4, #1
+ usqadd v0.8h, v16.8h
+ usqadd v1.8h, v16.8h
+ usqadd v2.8h, v16.8h
+ usqadd v3.8h, v16.8h
+ smin v0.8h, v0.8h, v31.8h
+ smin v1.8h, v1.8h, v31.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w64_neon
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+ sub x1, x1, #64
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ subs w4, w4, #1
+ usqadd v0.8h, v16.8h
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0]
+ usqadd v1.8h, v16.8h
+ sub x0, x0, #64
+ usqadd v2.8h, v16.8h
+ usqadd v3.8h, v16.8h
+ usqadd v4.8h, v16.8h
+ usqadd v5.8h, v16.8h
+ usqadd v6.8h, v16.8h
+ usqadd v7.8h, v16.8h
+ smin v0.8h, v0.8h, v31.8h
+ smin v1.8h, v1.8h, v31.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ smin v4.8h, v4.8h, v31.8h
+ smin v5.8h, v5.8h, v31.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ smin v6.8h, v6.8h, v31.8h
+ smin v7.8h, v7.8h, v31.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
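+// One pass of the 4-point inverse Walsh-Hadamard transform, used by
+// inv_txfm_add_wht_wht_4x4 below; operates on v16-v19 with v20/v21 as scratch.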
+.macro iwht4
+ add v16.4s, v16.4s, v17.4s
+ sub v21.4s, v18.4s, v19.4s
+ sub v20.4s, v16.4s, v21.4s
+ sshr v20.4s, v20.4s, #1
+ sub v18.4s, v20.4s, v17.4s
+ sub v17.4s, v20.4s, v19.4s
+ add v19.4s, v21.4s, v18.4s
+ sub v16.4s, v16.4s, v17.4s
+.endm
+
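+// 4-point inverse DCT on the four .4s vectors \r0-\r3, using the first four
+// entries of idct_coeffs (expected in v0.4s).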
+.macro idct_4 r0, r1, r2, r3
+ mul_mla v6, \r1, \r3, v0.s[3], v0.s[2]
+ mul_mla v2, \r0, \r2, v0.s[0], v0.s[0]
+ mul_mls v4, \r1, \r3, v0.s[2], v0.s[3]
+ mul_mls v3, \r0, \r2, v0.s[0], v0.s[0]
+ srshr v6.4s, v6.4s, #12
+ srshr v2.4s, v2.4s, #12
+ srshr v7.4s, v4.4s, #12
+ srshr v3.4s, v3.4s, #12
+ sqadd \r0\().4s, v2.4s, v6.4s
+ sqsub \r3\().4s, v2.4s, v6.4s
+ sqadd \r1\().4s, v3.4s, v7.4s
+ sqsub \r2\().4s, v3.4s, v7.4s
+.endm
+
+function inv_dct_4s_x4_neon
+ AARCH64_VALID_CALL_TARGET
+ movrel x16, idct_coeffs
+ ld1 {v0.4s}, [x16]
+ idct_4 v16, v17, v18, v19
+ ret
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3
+ movrel x16, iadst4_coeffs
+ ld1 {v0.4s}, [x16]
+
+ sub v3.4s, v16.4s, v18.4s
+ mul v4.4s, v16.4s, v0.s[0]
+ mla v4.4s, v18.4s, v0.s[1]
+ mla v4.4s, v19.4s, v0.s[2]
+ mul v7.4s, v17.4s, v0.s[3]
+ add v3.4s, v3.4s, v19.4s
+ mul v5.4s, v16.4s, v0.s[2]
+ mls v5.4s, v18.4s, v0.s[0]
+ mls v5.4s, v19.4s, v0.s[1]
+
+ add \o3\().4s, v4.4s, v5.4s
+ mul \o2\().4s, v3.4s, v0.s[3]
+ add \o0\().4s, v4.4s, v7.4s
+ add \o1\().4s, v5.4s, v7.4s
+ sub \o3\().4s, \o3\().4s, v7.4s
+
+ srshr \o0\().4s, \o0\().4s, #12
+ srshr \o2\().4s, \o2\().4s, #12
+ srshr \o1\().4s, \o1\().4s, #12
+ srshr \o3\().4s, \o3\().4s, #12
+.endm
+
+function inv_adst_4s_x4_neon
+ AARCH64_VALID_CALL_TARGET
+ iadst_4x4 v16, v17, v18, v19
+ ret
+endfunc
+
+function inv_flipadst_4s_x4_neon
+ AARCH64_VALID_CALL_TARGET
+ iadst_4x4 v19, v18, v17, v16
+ ret
+endfunc
+
+function inv_identity_4s_x4_neon
+ AARCH64_VALID_CALL_TARGET
+ movz w16, #(5793-4096)*8, lsl #16
+ dup v0.2s, w16
+ sqrdmulh v4.4s, v16.4s, v0.s[0]
+ sqrdmulh v5.4s, v17.4s, v0.s[0]
+ sqrdmulh v6.4s, v18.4s, v0.s[0]
+ sqrdmulh v7.4s, v19.4s, v0.s[0]
+ sqadd v16.4s, v16.4s, v4.4s
+ sqadd v17.4s, v17.4s, v5.4s
+ sqadd v18.4s, v18.4s, v6.4s
+ sqadd v19.4s, v19.4s, v7.4s
+ ret
+endfunc
+
+function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
+ mov x15, x30
+ movi v30.4s, #0
+ movi v31.4s, #0
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
+ st1 {v30.4s, v31.4s}, [x2], #32
+
+ sshr v16.4s, v16.4s, #2
+ sshr v17.4s, v17.4s, #2
+ sshr v18.4s, v18.4s, #2
+ sshr v19.4s, v19.4s, #2
+
+ iwht4
+
+ st1 {v30.4s, v31.4s}, [x2], #32
+ transpose_4x4s v16, v17, v18, v19, v20, v21, v22, v23
+
+ iwht4
+
+ ld1 {v0.d}[0], [x0], x1
+ sqxtn v16.4h, v16.4s
+ ld1 {v0.d}[1], [x0], x1
+ sqxtn2 v16.8h, v17.4s
+ ld1 {v1.d}[0], [x0], x1
+ sqxtn v18.4h, v18.4s
+ ld1 {v1.d}[1], [x0], x1
+ sqxtn2 v18.8h, v19.4s
+
+ b L(itx_4x4_end)
+endfunc
+
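+// Shared 4x4 path: first pass (x4) on .4s coefficients, narrow to .4h and
+// transpose, second pass (x5), then add to the destination with the 10-bit
+// clamp at L(itx_4x4_end).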
+function inv_txfm_add_4x4_neon
+ movi v30.4s, #0
+ movi v31.4s, #0
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
+ st1 {v30.4s, v31.4s}, [x2], #32
+
+ blr x4
+
+ st1 {v30.4s, v31.4s}, [x2], #32
+ sqxtn v16.4h, v16.4s
+ sqxtn v17.4h, v17.4s
+ sqxtn v18.4h, v18.4s
+ sqxtn v19.4h, v19.4s
+ transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23
+
+ blr x5
+
+ ld1 {v0.d}[0], [x0], x1
+ ld1 {v0.d}[1], [x0], x1
+ ins v16.d[1], v17.d[0]
+ ins v18.d[1], v19.d[0]
+ ld1 {v1.d}[0], [x0], x1
+ ld1 {v1.d}[1], [x0], x1
+ srshr v16.8h, v16.8h, #4
+ srshr v18.8h, v18.8h, #4
+
+L(itx_4x4_end):
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+ sub x0, x0, x1, lsl #2
+ usqadd v0.8h, v16.8h
+ usqadd v1.8h, v18.8h
+ smin v0.8h, v0.8h, v31.8h
+ st1 {v0.d}[0], [x0], x1
+ smin v1.8h, v1.8h, v31.8h
+ st1 {v0.d}[1], [x0], x1
+ st1 {v1.d}[0], [x0], x1
+ st1 {v1.d}[1], [x0], x1
+
+ ret x15
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ cbnz w3, 1f
+ movz w16, #2896*8, lsl #16
+ ld1r {v16.4s}, [x2]
+ dup v4.2s, w16
+ str wzr, [x2]
+ sqrdmulh v16.4s, v16.4s, v4.s[0]
+ ld1 {v0.d}[0], [x0], x1
+ sqxtn v20.4h, v16.4s
+ sqxtn2 v20.8h, v16.4s
+ ld1 {v0.d}[1], [x0], x1
+ sqrdmulh v20.8h, v20.8h, v4.h[1]
+ ld1 {v1.d}[0], [x0], x1
+ srshr v16.8h, v20.8h, #4
+ ld1 {v1.d}[1], [x0], x1
+ srshr v18.8h, v20.8h, #4
+ movi v30.8h, #0
+ b L(itx_4x4_end)
+1:
+.endif
+ adr x4, inv_\txfm1\()_4s_x4_neon
+ movrel x5, X(inv_\txfm2\()_4h_x4_neon)
+ b inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
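+// 8-point inverse DCT on \r0-\r7 (.4s): idct_4 handles the even half, and the
+// intermediates are clamped to the row clip range kept in v5 (max) and v4 (min).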
+.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
+ idct_4 \r0, \r2, \r4, \r6
+
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ smin_4s \r, \r, v5
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a
+ mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
+ mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a
+ mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a
+ srshr \r1\().4s, v2.4s, #12 // t4a
+ srshr \r7\().4s, v3.4s, #12 // t7a
+ srshr \r3\().4s, v6.4s, #12 // t5a
+ srshr \r5\().4s, v7.4s, #12 // t6a
+
+ sqadd v2.4s, \r1\().4s, \r3\().4s // t4
+ sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a
+ sqadd v3.4s, \r7\().4s, \r5\().4s // t7
+ sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a
+
+.irp r, v2, \r1, v3, \r3
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, \r1, v3, \r3
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5
+ mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6
+ srshr v7.4s, v7.4s, #12 // t5
+ srshr v6.4s, v6.4s, #12 // t6
+
+ sqsub \r7\().4s, \r0\().4s, v3.4s // out7
+ sqadd \r0\().4s, \r0\().4s, v3.4s // out0
+ sqadd \r1\().4s, \r2\().4s, v6.4s // out1
+ sqsub v6.4s, \r2\().4s, v6.4s // out6
+ sqadd \r2\().4s, \r4\().4s, v7.4s // out2
+ sqsub \r5\().4s, \r4\().4s, v7.4s // out5
+ sqadd \r3\().4s, \r6\().4s, v2.4s // out3
+ sqsub \r4\().4s, \r6\().4s, v2.4s // out4
+ mov \r6\().16b, v6.16b // out6
+.endm
+
+function inv_dct_4s_x8_neon
+ AARCH64_VALID_CALL_TARGET
+ movrel x16, idct_coeffs
+ ld1 {v0.4s, v1.4s}, [x16]
+ idct_8 v16, v17, v18, v19, v20, v21, v22, v23
+ ret
+endfunc
+
+.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
+ movrel x16, iadst8_coeffs
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ mul_mla v2, v23, v16, v0.s[0], v0.s[1]
+ mul_mls v4, v23, v16, v0.s[1], v0.s[0]
+ mul_mla v6, v21, v18, v0.s[2], v0.s[3]
+ srshr v16.4s, v2.4s, #12 // t0a
+ srshr v23.4s, v4.4s, #12 // t1a
+ mul_mls v2, v21, v18, v0.s[3], v0.s[2]
+ mul_mla v4, v19, v20, v1.s[0], v1.s[1]
+ srshr v18.4s, v6.4s, #12 // t2a
+ srshr v21.4s, v2.4s, #12 // t3a
+ mul_mls v6, v19, v20, v1.s[1], v1.s[0]
+ mul_mla v2, v17, v22, v1.s[2], v1.s[3]
+ srshr v20.4s, v4.4s, #12 // t4a
+ srshr v19.4s, v6.4s, #12 // t5a
+ mul_mls v4, v17, v22, v1.s[3], v1.s[2]
+ srshr v22.4s, v2.4s, #12 // t6a
+ srshr v17.4s, v4.4s, #12 // t7a
+
+ ld1 {v0.4s}, [x16]
+
+ movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+
+ sqadd v2.4s, v16.4s, v20.4s // t0
+ sqsub v3.4s, v16.4s, v20.4s // t4
+ mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+ sqadd v4.4s, v23.4s, v19.4s // t1
+ sqsub v5.4s, v23.4s, v19.4s // t5
+ sqadd v6.4s, v18.4s, v22.4s // t2
+ sqsub v7.4s, v18.4s, v22.4s // t6
+ sqadd v18.4s, v21.4s, v17.4s // t3
+ sqsub v19.4s, v21.4s, v17.4s // t7
+
+.irp r, v2, v3, v4, v5, v6, v7, v18, v19
+ smin_4s \r, \r, v1
+.endr
+.irp r, v2, v3, v4, v5, v6, v7, v18, v19
+ smax_4s \r, \r, v20
+.endr
+
+ mul_mla v16, v3, v5, v0.s[3], v0.s[2]
+ mul_mls v20, v3, v5, v0.s[2], v0.s[3]
+ mul_mls v22, v19, v7, v0.s[3], v0.s[2]
+
+ srshr v3.4s, v16.4s, #12 // t4a
+ srshr v5.4s, v20.4s, #12 // t5a
+
+ mul_mla v16, v19, v7, v0.s[2], v0.s[3]
+
+ srshr v7.4s, v22.4s, #12 // t6a
+ srshr v19.4s, v16.4s, #12 // t7a
+
+ sqadd \o0\().4s, v2.4s, v6.4s // out0
+ sqsub v2.4s, v2.4s, v6.4s // t2
+ sqadd \o7\().4s, v4.4s, v18.4s // out7
+ sqsub v4.4s, v4.4s, v18.4s // t3
+
+ mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+ sqadd \o1\().4s, v3.4s, v7.4s // out1
+ sqsub v3.4s, v3.4s, v7.4s // t6
+ sqadd \o6\().4s, v5.4s, v19.4s // out6
+ sqsub v5.4s, v5.4s, v19.4s // t7
+
+ // The already-computed output registers are not clipped, as they will be
+ // downshifted and narrowed afterwards anyway; only the intermediates
+ // feeding the remaining multiplies are clamped here.
+.irp r, v2, v4, v3, v5
+ smin_4s \r, \r, v1
+.endr
+.irp r, v2, v4, v3, v5
+ smax_4s \r, \r, v18
+.endr
+
+ sqneg \o7\().4s, \o7\().4s // out7
+ sqneg \o1\().4s, \o1\().4s // out1
+
+ mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20)
+ mul_mls v6, v2, v4, v0.s[0], v0.s[0] // -> out4 (v20 or v19)
+ mul_mls v20, v3, v5, v0.s[0], v0.s[0] // -> out5 (v21 or v18)
+ srshr v2.4s, v18.4s, #12 // out3
+ mul_mla v18, v3, v5, v0.s[0], v0.s[0] // -> out2 (v18 or v21)
+ srshr v3.4s, v20.4s, #12 // out5
+ srshr \o2\().4s, v18.4s, #12 // out2 (v18 or v21)
+ srshr \o4\().4s, v6.4s, #12 // out4 (v20 or v19)
+
+ sqneg \o3\().4s, v2.4s // out3
+ sqneg \o5\().4s, v3.4s // out5
+.endm
+
+function inv_adst_4s_x8_neon
+ AARCH64_VALID_CALL_TARGET
+ iadst_8 v16, v17, v18, v19, v20, v21, v22, v23
+ ret
+endfunc
+
+function inv_flipadst_4s_x8_neon
+ AARCH64_VALID_CALL_TARGET
+ iadst_8 v23, v22, v21, v20, v19, v18, v17, v16
+ ret
+endfunc
+
+function inv_identity_4s_x8_neon
+ AARCH64_VALID_CALL_TARGET
+ sqshl v16.4s, v16.4s, #1
+ sqshl v17.4s, v17.4s, #1
+ sqshl v18.4s, v18.4s, #1
+ sqshl v19.4s, v19.4s, #1
+ sqshl v20.4s, v20.4s, #1
+ sqshl v21.4s, v21.4s, #1
+ sqshl v22.4s, v22.4s, #1
+ sqshl v23.4s, v23.4s, #1
+ ret
+endfunc
+
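+// Shared 8x8 path: w13 holds the eob_half threshold from def_fn_8x8. If w3 is
+// below it, half of the coefficient block is known to be zero, so one of the
+// two first-pass calls is skipped and its outputs are simply cleared.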
+function inv_txfm_add_8x8_neon
+ movi v31.4s, #0
+
+ cmp w3, w13
+ mov x11, #32
+ b.lt 1f
+
+ add x6, x2, #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v31.4s}, [x6], x11
+.endr
+
+ blr x4
+
+ sqrshrn v24.4h, v16.4s, #1
+ sqrshrn v25.4h, v17.4s, #1
+ sqrshrn v26.4h, v18.4s, #1
+ sqrshrn v27.4h, v19.4s, #1
+ sqrshrn2 v24.8h, v20.4s, #1
+ sqrshrn2 v25.8h, v21.4s, #1
+ sqrshrn2 v26.8h, v22.4s, #1
+ sqrshrn2 v27.8h, v23.4s, #1
+
+ transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+ movi \i, #0
+.endr
+
+2:
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x2]
+ st1 {v31.4s}, [x2], x11
+.endr
+
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+
+ transpose_4x8h v16, v17, v18, v19, v20, v21, v22, v23
+
+ mov v20.16b, v24.16b
+ mov v21.16b, v25.16b
+ mov v22.16b, v26.16b
+ mov v23.16b, v27.16b
+
+ blr x5
+
+ load_add_store_8x8 x0, x7
+ ret x15
+endfunc
+
+.macro def_fn_8x8 txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 8, 8, 1
+.endif
+ movrel x5, X(inv_\txfm2\()_8h_x8_neon)
+ mov w13, #\eob_half
+ adr x4, inv_\txfm1\()_4s_x8_neon
+ b inv_txfm_add_8x8_neon
+endfunc
+.endm
+
+def_fn_8x8 dct, dct, 10
+def_fn_8x8 identity, identity, 10
+def_fn_8x8 dct, adst, 10
+def_fn_8x8 dct, flipadst, 10
+def_fn_8x8 dct, identity, 4
+def_fn_8x8 adst, dct, 10
+def_fn_8x8 adst, adst, 10
+def_fn_8x8 adst, flipadst, 10
+def_fn_8x8 flipadst, dct, 10
+def_fn_8x8 flipadst, adst, 10
+def_fn_8x8 flipadst, flipadst, 10
+def_fn_8x8 identity, dct, 4
+def_fn_8x8 adst, identity, 4
+def_fn_8x8 flipadst, identity, 4
+def_fn_8x8 identity, adst, 4
+def_fn_8x8 identity, flipadst, 4
+
+function inv_txfm_add_8x4_neon
+ movi v28.4s, #0
+ movi v29.4s, #0
+ movi v30.4s, #0
+ movi v31.4s, #0
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
+ st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+ ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2]
+ st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2]
+
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ blr x4
+
+ sqxtn v16.4h, v16.4s
+ sqxtn v17.4h, v17.4s
+ sqxtn v18.4h, v18.4s
+ sqxtn v19.4h, v19.4s
+ sqxtn v20.4h, v20.4s
+ sqxtn v21.4h, v21.4s
+ sqxtn v22.4h, v22.4s
+ sqxtn v23.4h, v23.4s
+
+ transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+ ins v16.d[1], v20.d[0]
+ ins v17.d[1], v21.d[0]
+ ins v18.d[1], v22.d[0]
+ ins v19.d[1], v23.d[0]
+
+ blr x5
+
+ load_add_store_8x4 x0, x7
+ ret x15
+endfunc
+
+function inv_txfm_add_4x8_neon
+ movz w16, #2896*8, lsl #16
+ movi v31.4s, #0
+ dup v30.2s, w16
+
+ cmp w3, w13
+ mov x11, #32
+ b.lt 1f
+
+ add x6, x2, #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v31.4s}, [x6], x11
+.endr
+ scale_input .4s, v30.s[0], v16, v17, v18, v19
+ blr x4
+ sqxtn v20.4h, v16.4s
+ sqxtn v21.4h, v17.4s
+ sqxtn v22.4h, v18.4s
+ sqxtn v23.4h, v19.4s
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+
+ b 2f
+
+1:
+.irp i, v20, v21, v22, v23
+ movi \i\().4h, #0
+.endr
+
+2:
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x2]
+ st1 {v31.4s}, [x2], x11
+.endr
+ scale_input .4s, v30.s[0], v16, v17, v18, v19
+ blr x4
+ sqxtn v16.4h, v16.4s
+ sqxtn v17.4h, v17.4s
+ sqxtn v18.4h, v18.4s
+ sqxtn v19.4h, v19.4s
+ transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7
+
+ blr x5
+
+ load_add_store_4x8 x0, x7
+ ret x15
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 0
+.endif
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+.if \w == 4
+ mov w13, #\eob_half
+.endif
+ movrel x5, X(inv_\txfm2\()_\w\()h_x\h\()_neon)
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct, 13
+def_fn_48 \w, \h, identity, identity, 13
+def_fn_48 \w, \h, dct, adst, 13
+def_fn_48 \w, \h, dct, flipadst, 13
+def_fn_48 \w, \h, dct, identity, 4
+def_fn_48 \w, \h, adst, dct, 13
+def_fn_48 \w, \h, adst, adst, 13
+def_fn_48 \w, \h, adst, flipadst, 13
+def_fn_48 \w, \h, flipadst, dct, 13
+def_fn_48 \w, \h, flipadst, adst, 13
+def_fn_48 \w, \h, flipadst, flipadst, 13
+def_fn_48 \w, \h, identity, dct, 16
+def_fn_48 \w, \h, adst, identity, 4
+def_fn_48 \w, \h, flipadst, identity, 4
+def_fn_48 \w, \h, identity, adst, 16
+def_fn_48 \w, \h, identity, flipadst, 16
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+
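+// 16-point inverse DCT: idct_8 on the even-indexed registers, then the odd
+// half computed with the second half of idct_coeffs.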
+function inv_dct_4s_x16_neon
+ AARCH64_VALID_CALL_TARGET
+ movrel x16, idct_coeffs
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ idct_8 v16, v18, v20, v22, v24, v26, v28, v30
+
+ // idct_8 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v18, v20, v22, v24, v26, v28, v30
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v16, v18, v20, v22, v24, v26, v28, v30
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ ld1 {v0.4s, v1.4s}, [x16]
+ sub x16, x16, #32
+
+ mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a
+ mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a
+ mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a
+ srshr v17.4s, v2.4s, #12 // t8a
+ srshr v31.4s, v3.4s, #12 // t15a
+ mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a
+ mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a
+ srshr v23.4s, v6.4s, #12 // t9a
+ srshr v25.4s, v2.4s, #12 // t14a
+ mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a
+ mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a
+ srshr v21.4s, v3.4s, #12 // t10a
+ srshr v27.4s, v6.4s, #12 // t13a
+ mul_mla v3, v29, v19, v1.s[3], v1.s[2] // -> t12a
+ srshr v19.4s, v2.4s, #12 // t11a
+ srshr v29.4s, v3.4s, #12 // t12a
+
+ ld1 {v0.4s}, [x16]
+
+ sqsub v2.4s, v17.4s, v23.4s // t9
+ sqadd v17.4s, v17.4s, v23.4s // t8
+ sqsub v3.4s, v31.4s, v25.4s // t14
+ sqadd v31.4s, v31.4s, v25.4s // t15
+ sqsub v23.4s, v19.4s, v21.4s // t10
+ sqadd v19.4s, v19.4s, v21.4s // t11
+ sqadd v25.4s, v29.4s, v27.4s // t12
+ sqsub v29.4s, v29.4s, v27.4s // t13
+
+.irp r, v2, v17, v3, v31, v23, v19, v25, v29
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v17, v3, v31, v23, v19, v25, v29
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a
+ mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a
+ srshr v21.4s, v7.4s, #12 // t9a
+ srshr v27.4s, v6.4s, #12 // t14a
+
+ mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a
+ mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a
+ srshr v29.4s, v7.4s, #12 // t13a
+ neg v6.4s, v6.4s
+ srshr v23.4s, v6.4s, #12 // t10a
+
+ sqsub v2.4s, v17.4s, v19.4s // t11a
+ sqadd v17.4s, v17.4s, v19.4s // t8a
+ sqsub v3.4s, v31.4s, v25.4s // t12a
+ sqadd v31.4s, v31.4s, v25.4s // t15a
+ sqadd v19.4s, v21.4s, v23.4s // t9
+ sqsub v21.4s, v21.4s, v23.4s // t10
+ sqsub v25.4s, v27.4s, v29.4s // t13
+ sqadd v27.4s, v27.4s, v29.4s // t14
+
+.irp r, v2, v17, v3, v31, v19, v21, v25, v27
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v17, v3, v31, v19, v21, v25, v27
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11
+ mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12
+ mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a
+
+ srshr v7.4s, v7.4s, #12 // t11
+ srshr v6.4s, v6.4s, #12 // t12
+ mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a
+ srshr v2.4s, v2.4s, #12 // t10a
+ srshr v3.4s, v3.4s, #12 // t13a
+
+ sqadd v1.4s, v16.4s, v31.4s // out0
+ sqsub v31.4s, v16.4s, v31.4s // out15
+ mov v16.16b, v1.16b
+ sqadd v23.4s, v30.4s, v17.4s // out7
+ sqsub v1.4s, v30.4s, v17.4s // out8
+ sqadd v17.4s, v18.4s, v27.4s // out1
+ sqsub v30.4s, v18.4s, v27.4s // out14
+ sqadd v18.4s, v20.4s, v3.4s // out2
+ sqsub v29.4s, v20.4s, v3.4s // out13
+ sqadd v3.4s, v28.4s, v19.4s // out6
+ sqsub v25.4s, v28.4s, v19.4s // out9
+ sqadd v19.4s, v22.4s, v6.4s // out3
+ sqsub v28.4s, v22.4s, v6.4s // out12
+ sqadd v20.4s, v24.4s, v7.4s // out4
+ sqsub v27.4s, v24.4s, v7.4s // out11
+ sqadd v21.4s, v26.4s, v2.4s // out5
+ sqsub v26.4s, v26.4s, v2.4s // out10
+ mov v24.16b, v1.16b
+ mov v22.16b, v3.16b
+
+ ret
+endfunc
+
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
+ movrel x16, iadst16_coeffs
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ mul_mla v2, v31, v16, v0.s[0], v0.s[1] // -> t0
+ mul_mls v4, v31, v16, v0.s[1], v0.s[0] // -> t1
+ mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t2
+ srshr v16.4s, v2.4s, #12 // t0
+ srshr v31.4s, v4.4s, #12 // t1
+ mul_mls v2, v29, v18, v0.s[3], v0.s[2] // -> t3
+ mul_mla v4, v27, v20, v1.s[0], v1.s[1] // -> t4
+ srshr v18.4s, v6.4s, #12 // t2
+ srshr v29.4s, v2.4s, #12 // t3
+ mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t5
+ mul_mla v2, v25, v22, v1.s[2], v1.s[3] // -> t6
+ srshr v20.4s, v4.4s, #12 // t4
+ srshr v27.4s, v6.4s, #12 // t5
+ mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t7
+ ld1 {v0.4s, v1.4s}, [x16]
+ movrel x16, idct_coeffs
+ mul_mla v6, v23, v24, v0.s[0], v0.s[1] // -> t8
+ srshr v22.4s, v2.4s, #12 // t6
+ srshr v25.4s, v4.4s, #12 // t7
+ mul_mls v2, v23, v24, v0.s[1], v0.s[0] // -> t9
+ mul_mla v4, v21, v26, v0.s[2], v0.s[3] // -> t10
+ srshr v23.4s, v6.4s, #12 // t8
+ srshr v24.4s, v2.4s, #12 // t9
+ mul_mls v6, v21, v26, v0.s[3], v0.s[2] // -> t11
+ mul_mla v2, v19, v28, v1.s[0], v1.s[1] // -> t12
+ srshr v21.4s, v4.4s, #12 // t10
+ srshr v26.4s, v6.4s, #12 // t11
+ mul_mls v4, v19, v28, v1.s[1], v1.s[0] // -> t13
+ mul_mla v6, v17, v30, v1.s[2], v1.s[3] // -> t14
+ srshr v19.4s, v2.4s, #12 // t12
+ srshr v28.4s, v4.4s, #12 // t13
+ mul_mls v2, v17, v30, v1.s[3], v1.s[2] // -> t15
+ srshr v17.4s, v6.4s, #12 // t14
+ srshr v30.4s, v2.4s, #12 // t15
+
+ ld1 {v0.4s, v1.4s}, [x16]
+
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+ sqsub v2.4s, v16.4s, v23.4s // t8a
+ sqadd v16.4s, v16.4s, v23.4s // t0a
+ sqsub v3.4s, v31.4s, v24.4s // t9a
+ sqadd v31.4s, v31.4s, v24.4s // t1a
+ sqadd v23.4s, v18.4s, v21.4s // t2a
+ sqsub v18.4s, v18.4s, v21.4s // t10a
+ sqadd v24.4s, v29.4s, v26.4s // t3a
+ sqsub v29.4s, v29.4s, v26.4s // t11a
+ sqadd v21.4s, v20.4s, v19.4s // t4a
+ sqsub v20.4s, v20.4s, v19.4s // t12a
+ sqadd v26.4s, v27.4s, v28.4s // t5a
+ sqsub v27.4s, v27.4s, v28.4s // t13a
+ sqadd v19.4s, v22.4s, v17.4s // t6a
+ sqsub v22.4s, v22.4s, v17.4s // t14a
+ sqadd v28.4s, v25.4s, v30.4s // t7a
+ sqsub v25.4s, v25.4s, v30.4s // t15a
+
+.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
+ smax_4s \r, \r, v7
+.endr
+
+ mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8
+ mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9
+ mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10
+ srshr v17.4s, v4.4s, #12 // t8
+ srshr v30.4s, v6.4s, #12 // t9
+ mul_mls v4, v18, v29, v1.s[2], v1.s[3] // -> t11
+ mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t12
+ srshr v18.4s, v2.4s, #12 // t10
+ srshr v29.4s, v4.4s, #12 // t11
+ mul_mla v2, v27, v20, v1.s[0], v1.s[1] // -> t13
+ mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t14
+ srshr v27.4s, v6.4s, #12 // t12
+ srshr v20.4s, v2.4s, #12 // t13
+ mul_mla v6, v25, v22, v1.s[2], v1.s[3] // -> t15
+ srshr v25.4s, v4.4s, #12 // t14
+ srshr v22.4s, v6.4s, #12 // t15
+
+ sqsub v2.4s, v16.4s, v21.4s // t4
+ sqadd v16.4s, v16.4s, v21.4s // t0
+ sqsub v3.4s, v31.4s, v26.4s // t5
+ sqadd v31.4s, v31.4s, v26.4s // t1
+ sqadd v21.4s, v23.4s, v19.4s // t2
+ sqsub v23.4s, v23.4s, v19.4s // t6
+ sqadd v26.4s, v24.4s, v28.4s // t3
+ sqsub v24.4s, v24.4s, v28.4s // t7
+ sqadd v19.4s, v17.4s, v27.4s // t8a
+ sqsub v17.4s, v17.4s, v27.4s // t12a
+ sqadd v28.4s, v30.4s, v20.4s // t9a
+ sqsub v30.4s, v30.4s, v20.4s // t13a
+ sqadd v27.4s, v18.4s, v25.4s // t10a
+ sqsub v18.4s, v18.4s, v25.4s // t14a
+ sqadd v20.4s, v29.4s, v22.4s // t11a
+ sqsub v29.4s, v29.4s, v22.4s // t15a
+
+.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
+ smax_4s \r, \r, v7
+.endr
+
+ mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a
+ mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a
+ mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a
+ srshr v22.4s, v4.4s, #12 // t4a
+ srshr v25.4s, v6.4s, #12 // t5a
+ mul_mla v4, v24, v23, v0.s[2], v0.s[3] // -> t7a
+ mul_mla v6, v17, v30, v0.s[3], v0.s[2] // -> t12
+ srshr v24.4s, v2.4s, #12 // t6a
+ srshr v23.4s, v4.4s, #12 // t7a
+ mul_mls v2, v17, v30, v0.s[2], v0.s[3] // -> t13
+ mul_mls v4, v29, v18, v0.s[3], v0.s[2] // -> t14
+ srshr v17.4s, v6.4s, #12 // t12
+ mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t15
+ srshr v29.4s, v2.4s, #12 // t13
+ srshr v30.4s, v4.4s, #12 // t14
+ srshr v18.4s, v6.4s, #12 // t15
+
+ sqsub v2.4s, v16.4s, v21.4s // t2a
+.ifc \o0, v16
+ sqadd \o0\().4s, v16.4s, v21.4s // out0
+ sqsub v21.4s, v31.4s, v26.4s // t3a
+ sqadd \o15\().4s, v31.4s, v26.4s // out15
+.else
+ sqadd v4.4s, v16.4s, v21.4s // out0
+ sqsub v21.4s, v31.4s, v26.4s // t3a
+ sqadd \o15\().4s, v31.4s, v26.4s // out15
+ mov \o0\().16b, v4.16b
+.endif
+
+ sqsub v3.4s, v29.4s, v18.4s // t15a
+ sqadd \o13\().4s, v29.4s, v18.4s // out13
+ sqadd \o2\().4s, v17.4s, v30.4s // out2
+ sqsub v26.4s, v17.4s, v30.4s // t14a
+
+ sqadd \o1\().4s, v19.4s, v27.4s // out1
+ sqsub v27.4s, v19.4s, v27.4s // t10
+ sqadd \o14\().4s, v28.4s, v20.4s // out14
+ sqsub v20.4s, v28.4s, v20.4s // t11
+
+ sqadd \o3\().4s, v22.4s, v24.4s // out3
+ sqsub v22.4s, v22.4s, v24.4s // t6
+ sqadd \o12\().4s, v25.4s, v23.4s // out12
+ sqsub v23.4s, v25.4s, v23.4s // t7
+
+ // The already-computed output registers are not clipped, as they will be
+ // downshifted and narrowed afterwards anyway; only the intermediates
+ // feeding the remaining multiplies are clamped here.
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23
+ smax_4s \r, \r, v7
+.endr
+
+ sqneg \o15\().4s, \o15\().4s // out15
+ sqneg \o13\().4s, \o13\().4s // out13
+ sqneg \o1\().4s, \o1\().4s // out1
+ sqneg \o3\().4s, \o3\().4s // out3
+
+ mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
+ mul_mla v4, v2, v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24)
+ mul_mla v6, v26, v3, v0.s[0], v0.s[0] // -> out5 (v21 or v26)
+
+ srshr v24.4s, v24.4s, #12 // out8
+ srshr v4.4s, v4.4s, #12 // out7
+ srshr v5.4s, v6.4s, #12 // out5
+ mul_mls v6, v26, v3, v0.s[0], v0.s[0] // -> out10 (v26 or v21)
+ mul_mla v2, v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27)
+ srshr v26.4s, v6.4s, #12 // out10
+
+ mul_mls v6, v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20)
+ mul_mla v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25)
+ mul_mls v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22)
+
+ srshr \o4\().4s, v2.4s, #12 // out4
+ srshr v6.4s, v6.4s, #12 // out11
+ srshr v7.4s, v21.4s, #12 // out9
+ srshr \o6\().4s, v22.4s, #12 // out6
+
+.ifc \o8, v23
+ mov \o8\().16b, v24.16b
+ mov \o10\().16b, v26.16b
+.endif
+
+ sqneg \o7\().4s, v4.4s // out7
+ sqneg \o5\().4s, v5.4s // out5
+ sqneg \o11\().4s, v6.4s // out11
+ sqneg \o9\().4s, v7.4s // out9
+.endm
+
+function inv_adst_4s_x16_neon
+ AARCH64_VALID_CALL_TARGET
+ iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ ret
+endfunc
+
+function inv_flipadst_4s_x16_neon
+ AARCH64_VALID_CALL_TARGET
+ iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16
+ ret
+endfunc
+
+function inv_identity_4s_x16_neon
+ AARCH64_VALID_CALL_TARGET
+ movz w16, #2*(5793-4096)*8, lsl #16
+ dup v0.2s, w16
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ sqrdmulh v2.4s, v\i\().4s, v0.s[0]
+ sqadd v\i\().4s, v\i\().4s, v\i\().4s
+ sqadd v\i\().4s, v\i\().4s, v2.4s
+.endr
+ ret
+endfunc
+
+.macro identity_4x16_shift1 c
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ sqrdmulh v3.4s, \i, \c
+ srshr v3.4s, v3.4s, #1
+ sqadd \i, \i, v3.4s
+.endr
+.endm
+
+.macro identity_4x16 c
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ sqrdmulh v3.4s, \i, \c
+ sqadd \i, \i, \i
+ sqadd \i, \i, v3.4s
+.endr
+.endm
+
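+// First-pass (horizontal) helper for a 16x4 strip: optionally pre-scales the
+// input by 2896/4096 (~1/sqrt(2), used for 2:1 rectangular sizes), runs the
+// transform in x4, rounds/narrows by \shift and stores the transposed result
+// to the buffer at x6.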
+.macro def_horz_16 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x4_neon
+ mov x14, x30
+ movi v7.4s, #0
+.if \scale
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.endif
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x7]
+ st1 {v7.4s}, [x7], x8
+.endr
+.if \scale
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ blr x4
+ sqrshrn v16.4h, v16.4s, #\shift
+ sqrshrn v17.4h, v17.4s, #\shift
+ sqrshrn v18.4h, v18.4s, #\shift
+ sqrshrn v19.4h, v19.4s, #\shift
+ sqrshrn2 v16.8h, v20.4s, #\shift
+ sqrshrn2 v17.8h, v21.4s, #\shift
+ sqrshrn2 v18.8h, v22.4s, #\shift
+ sqrshrn2 v19.8h, v23.4s, #\shift
+ sqrshrn v20.4h, v24.4s, #\shift
+ sqrshrn v21.4h, v25.4s, #\shift
+ sqrshrn v22.4h, v26.4s, #\shift
+ sqrshrn v23.4h, v27.4s, #\shift
+ sqrshrn2 v20.8h, v28.4s, #\shift
+ sqrshrn2 v21.8h, v29.4s, #\shift
+ sqrshrn2 v22.8h, v30.4s, #\shift
+ sqrshrn2 v23.8h, v31.4s, #\shift
+ transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7
+
+.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h
+ st1 {\i}, [x6], #16
+.endr
+
+ ret x14
+endfunc
+.endm
+
+def_horz_16 scale=0, shift=2
+def_horz_16 scale=1, shift=1, suffix=_scale
+
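+// Second-pass (vertical) helper: loads an 8x16 block of 16-bit intermediates
+// from x7 (stride x8), runs the transform in x5 and adds the result to the
+// destination at x6.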
+function inv_txfm_add_vert_8x16_neon
+ mov x14, x30
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ blr x5
+ load_add_store_8x16 x6, x7
+ ret x14
+endfunc
+
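+// 16x16: the first pass writes 16-bit intermediates to a 512-byte stack
+// buffer in 16x4 strips, consulting the eob thresholds at x13 and zero-filling
+// strips that are known to be empty; the second pass then handles two 8x16
+// vertical strips.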
+function inv_txfm_add_16x16_neon
+ mov x15, x30
+ sub sp, sp, #512
+ ldrh w12, [x13], #2
+.irp i, 0, 4, 8, 12
+ add x6, sp, #(\i*16*2)
+.if \i > 0
+ mov w8, #(16 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 12
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #16*4
+ bl inv_txfm_horz_16x4_neon
+.endr
+ b 3f
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+3:
+.irp i, 0, 8
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #32
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
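+// Per-strip eob thresholds for the 16x16 functions: once w3 (the eob) is
+// below the next entry, the remaining first-pass strips are known to be all
+// zero and are zero-filled instead of being transformed.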
+const eob_16x16
+ .short 10, 36, 78, 256
+endconst
+
+const eob_16x16_identity
+ .short 4, 8, 12, 256
+endconst
+
+.macro def_fn_16x16 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 16, 16, 2
+.endif
+ adr x4, inv_\txfm1\()_4s_x16_neon
+ movrel x5, X(inv_\txfm2\()_8h_x16_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel x13, eob_16x16
+.else
+ movrel x13, eob_16x16_identity
+.endif
+.else
+.ifc \txfm2, identity
+ movrel x13, eob_16x16_identity
+.else
+ movrel x13, eob_16x16
+.endif
+.endif
+ b inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct
+def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
+def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
+def_fn_16x16 identity, dct
+
+function inv_txfm_add_16x4_neon
+ mov x15, x30
+ movi v4.4s, #0
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x2]
+ st1 {v4.4s}, [x2], #16
+.endr
+
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+ blr x5
+ mov x6, x0
+ load_add_store_8x4 x6, x7
+
+ sqrshrn v16.4h, v24.4s, #1
+ sqrshrn v17.4h, v25.4s, #1
+ sqrshrn v18.4h, v26.4s, #1
+ sqrshrn v19.4h, v27.4s, #1
+ sqrshrn2 v16.8h, v28.4s, #1
+ sqrshrn2 v17.8h, v29.4s, #1
+ sqrshrn2 v18.8h, v30.4s, #1
+ sqrshrn2 v19.8h, v31.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+ blr x5
+ add x6, x0, #16
+ load_add_store_8x4 x6, x7
+
+ ret x15
+endfunc
+
+function inv_txfm_add_4x16_neon
+ ldrh w12, [x13, #4]
+ mov x15, x30
+
+ mov x11, #64
+
+ cmp w3, w12
+ ldrh w12, [x13, #2]
+ b.lt 1f
+
+ add x6, x2, #48
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v2.4s}, [x6], x11
+.endr
+ blr x4
+ sqrshrn v28.4h, v16.4s, #1
+ sqrshrn v29.4h, v17.4s, #1
+ sqrshrn v30.4h, v18.4s, #1
+ sqrshrn v31.4h, v19.4s, #1
+ transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7
+
+ b 2f
+1:
+.irp i, v28.4h, v29.4h, v30.4h, v31.4h
+ movi \i, #0
+.endr
+2:
+ cmp w3, w12
+ ldrh w12, [x13, #0]
+ b.lt 1f
+
+ add x6, x2, #32
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v2.4s}, [x6], x11
+.endr
+ blr x4
+ sqrshrn v24.4h, v16.4s, #1
+ sqrshrn v25.4h, v17.4s, #1
+ sqrshrn v26.4h, v18.4s, #1
+ sqrshrn v27.4h, v19.4s, #1
+ transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7
+
+ b 2f
+1:
+.irp i, v24.4h, v25.4h, v26.4h, v27.4h
+ movi \i, #0
+.endr
+2:
+ cmp w3, w12
+ b.lt 1f
+
+ add x6, x2, #16
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v2.4s}, [x6], x11
+.endr
+ blr x4
+ sqrshrn v20.4h, v16.4s, #1
+ sqrshrn v21.4h, v17.4s, #1
+ sqrshrn v22.4h, v18.4s, #1
+ sqrshrn v23.4h, v19.4s, #1
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+
+ b 2f
+1:
+.irp i, v20.4h, v21.4h, v22.4h, v23.4h
+ movi \i, #0
+.endr
+2:
+
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x2]
+ st1 {v2.4s}, [x2], x11
+.endr
+ blr x4
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
+
+ blr x5
+
+ load_add_store_4x16 x0, x6
+
+ ret x15
+endfunc
+
+const eob_4x16
+ .short 13, 29, 45, 64
+endconst
+
+const eob_4x16_identity1
+ .short 16, 32, 48, 64
+endconst
+
+const eob_4x16_identity2
+ .short 4, 8, 12, 64
+endconst
+
+.macro def_fn_416 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+.if \w == 4
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel x5, X(inv_\txfm2\()_4h_x\h\()_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel x13, eob_4x16
+.else
+ movrel x13, eob_4x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+ movrel x13, eob_4x16_identity2
+.else
+ movrel x13, eob_4x16
+.endif
+.endif
+.else
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon)
+.endif
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct
+def_fn_416 \w, \h, identity, identity
+def_fn_416 \w, \h, dct, adst
+def_fn_416 \w, \h, dct, flipadst
+def_fn_416 \w, \h, dct, identity
+def_fn_416 \w, \h, adst, dct
+def_fn_416 \w, \h, adst, adst
+def_fn_416 \w, \h, adst, flipadst
+def_fn_416 \w, \h, flipadst, dct
+def_fn_416 \w, \h, flipadst, adst
+def_fn_416 \w, \h, flipadst, flipadst
+def_fn_416 \w, \h, identity, dct
+def_fn_416 \w, \h, adst, identity
+def_fn_416 \w, \h, flipadst, identity
+def_fn_416 \w, \h, identity, adst
+def_fn_416 \w, \h, identity, flipadst
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+
+function inv_txfm_add_16x8_neon
+ mov x15, x30
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ cmp w3, w13
+ mov x11, #32
+ b.lt 1f
+
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+
+ add x6, x2, #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+ blr x4
+
+ sqrshrn v8.4h, v16.4s, #1
+ sqrshrn v9.4h, v17.4s, #1
+ sqrshrn v10.4h, v18.4s, #1
+ sqrshrn v11.4h, v19.4s, #1
+ sqrshrn2 v8.8h, v20.4s, #1
+ sqrshrn2 v9.8h, v21.4s, #1
+ sqrshrn2 v10.8h, v22.4s, #1
+ sqrshrn2 v11.8h, v23.4s, #1
+ sqrshrn v12.4h, v24.4s, #1
+ sqrshrn v13.4h, v25.4s, #1
+ sqrshrn v14.4h, v26.4s, #1
+ sqrshrn v15.4h, v27.4s, #1
+ sqrshrn2 v12.8h, v28.4s, #1
+ sqrshrn2 v13.8h, v29.4s, #1
+ sqrshrn2 v14.8h, v30.4s, #1
+ sqrshrn2 v15.8h, v31.4s, #1
+
+ transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
+ transpose_4x8h v12, v13, v14, v15, v2, v3, v4, v5
+
+ b 2f
+1:
+.irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h
+ movi \i, #0
+.endr
+2:
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+
+ movi v4.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x2]
+ st1 {v4.4s}, [x2], x11
+.endr
+
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+
+ mov v20.16b, v8.16b
+ mov v21.16b, v9.16b
+ mov v22.16b, v10.16b
+ mov v23.16b, v11.16b
+
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+
+ sqrshrn v8.4h, v24.4s, #1
+ sqrshrn v9.4h, v25.4s, #1
+ sqrshrn v10.4h, v26.4s, #1
+ sqrshrn v11.4h, v27.4s, #1
+ sqrshrn2 v8.8h, v28.4s, #1
+ sqrshrn2 v9.8h, v29.4s, #1
+ sqrshrn2 v10.8h, v30.4s, #1
+ sqrshrn2 v11.8h, v31.4s, #1
+
+ transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
+
+ blr x5
+
+ mov x6, x0
+ load_add_store_8x8 x6, x7
+
+ mov v16.16b, v8.16b
+ mov v17.16b, v9.16b
+ mov v18.16b, v10.16b
+ mov v19.16b, v11.16b
+ mov v20.16b, v12.16b
+ mov v21.16b, v13.16b
+ mov v22.16b, v14.16b
+ mov v23.16b, v15.16b
+
+ blr x5
+
+ add x0, x0, #16
+ load_add_store_8x8 x0, x7
+
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret x15
+endfunc
+
+function inv_txfm_add_8x16_neon
+ mov x15, x30
+ stp d8, d9, [sp, #-0x20]!
+ stp d10, d11, [sp, #0x10]
+ ldrh w12, [x13, #4]
+
+ mov x11, #64
+
+ cmp w3, w12
+ ldrh w12, [x13, #2]
+ b.lt 1f
+
+ add x6, x2, #48
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v28.4h, v16.4s, #1
+ sqrshrn v29.4h, v17.4s, #1
+ sqrshrn v30.4h, v18.4s, #1
+ sqrshrn v31.4h, v19.4s, #1
+ sqrshrn2 v28.8h, v20.4s, #1
+ sqrshrn2 v29.8h, v21.4s, #1
+ sqrshrn2 v30.8h, v22.4s, #1
+ sqrshrn2 v31.8h, v23.4s, #1
+ transpose_4x8h v28, v29, v30, v31, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v28.8h, v29.8h, v30.8h, v31.8h
+ movi \i, #0
+.endr
+
+2:
+ cmp w3, w12
+ ldrh w12, [x13, #0]
+ b.lt 1f
+
+ add x6, x2, #32
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v24.4h, v16.4s, #1
+ sqrshrn v25.4h, v17.4s, #1
+ sqrshrn v26.4h, v18.4s, #1
+ sqrshrn v27.4h, v19.4s, #1
+ sqrshrn2 v24.8h, v20.4s, #1
+ sqrshrn2 v25.8h, v21.4s, #1
+ sqrshrn2 v26.8h, v22.4s, #1
+ sqrshrn2 v27.8h, v23.4s, #1
+ transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+ movi \i, #0
+.endr
+
+2:
+ cmp w3, w12
+ b.lt 1f
+
+ add x6, x2, #16
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v8.4h, v16.4s, #1
+ sqrshrn v9.4h, v17.4s, #1
+ sqrshrn v10.4h, v18.4s, #1
+ sqrshrn v11.4h, v19.4s, #1
+ sqrshrn2 v8.8h, v20.4s, #1
+ sqrshrn2 v9.8h, v21.4s, #1
+ sqrshrn2 v10.8h, v22.4s, #1
+ sqrshrn2 v11.8h, v23.4s, #1
+ transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v8.8h, v9.8h, v10.8h, v11.8h
+ movi \i, #0
+.endr
+
+2:
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x2]
+ st1 {v4.4s}, [x2], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+
+ mov v20.16b, v8.16b
+ mov v21.16b, v9.16b
+ mov v22.16b, v10.16b
+ mov v23.16b, v11.16b
+
+ blr x5
+
+ load_add_store_8x16 x0, x6
+
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x20
+
+ ret x15
+endfunc
+
+const eob_8x16
+ .short 10, 43, 75, 128
+endconst
+
+const eob_8x16_identity1
+ .short 4, 64, 96, 128
+endconst
+
+const eob_8x16_identity2
+ .short 4, 8, 12, 128
+endconst
+
+.macro def_fn_816 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel x13, eob_8x16
+.else
+ movrel x13, eob_8x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+ movrel x13, eob_8x16_identity2
+.else
+ movrel x13, eob_8x16
+.endif
+.endif
+.if \h == 8
+ ldrh w13, [x13]
+.endif
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct
+def_fn_816 \w, \h, identity, identity
+def_fn_816 \w, \h, dct, adst
+def_fn_816 \w, \h, dct, flipadst
+def_fn_816 \w, \h, dct, identity
+def_fn_816 \w, \h, adst, dct
+def_fn_816 \w, \h, adst, adst
+def_fn_816 \w, \h, adst, flipadst
+def_fn_816 \w, \h, flipadst, dct
+def_fn_816 \w, \h, flipadst, adst
+def_fn_816 \w, \h, flipadst, flipadst
+def_fn_816 \w, \h, identity, dct
+def_fn_816 \w, \h, adst, identity
+def_fn_816 \w, \h, flipadst, identity
+def_fn_816 \w, \h, identity, adst
+def_fn_816 \w, \h, identity, flipadst
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
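+// Odd half of the 32-point inverse DCT: v16-v31 hold the 16 odd-indexed input
+// coefficients on entry; the results are combined with the even half by the
+// add/sub butterflies in store2 in def_horz_32 below.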
+function inv_dct32_odd_4s_x16_neon
+ movrel x16, idct_coeffs, 4*16
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ mul_mls v2, v16, v31, v0.s[0], v0.s[1] // -> t16a
+ mul_mla v4, v16, v31, v0.s[1], v0.s[0] // -> t31a
+ mul_mls v6, v24, v23, v0.s[2], v0.s[3] // -> t17a
+ srshr v16.4s, v2.4s, #12 // t16a
+ srshr v31.4s, v4.4s, #12 // t31a
+ mul_mla v2, v24, v23, v0.s[3], v0.s[2] // -> t30a
+ mul_mls v4, v20, v27, v1.s[0], v1.s[1] // -> t18a
+ srshr v24.4s, v6.4s, #12 // t17a
+ srshr v23.4s, v2.4s, #12 // t30a
+ mul_mla v6, v20, v27, v1.s[1], v1.s[0] // -> t29a
+ mul_mls v2, v28, v19, v1.s[2], v1.s[3] // -> t19a
+ srshr v20.4s, v4.4s, #12 // t18a
+ srshr v27.4s, v6.4s, #12 // t29a
+ mul_mla v4, v28, v19, v1.s[3], v1.s[2] // -> t28a
+ ld1 {v0.4s, v1.4s}, [x16]
+ sub x16, x16, #4*24
+ mul_mls v6, v18, v29, v0.s[0], v0.s[1] // -> t20a
+ srshr v28.4s, v2.4s, #12 // t19a
+ srshr v19.4s, v4.4s, #12 // t28a
+ mul_mla v2, v18, v29, v0.s[1], v0.s[0] // -> t27a
+ mul_mls v4, v26, v21, v0.s[2], v0.s[3] // -> t21a
+ srshr v18.4s, v6.4s, #12 // t20a
+ srshr v29.4s, v2.4s, #12 // t27a
+ mul_mla v6, v26, v21, v0.s[3], v0.s[2] // -> t26a
+ mul_mls v2, v22, v25, v1.s[0], v1.s[1] // -> t22a
+ srshr v26.4s, v4.4s, #12 // t21a
+ srshr v21.4s, v6.4s, #12 // t26a
+ mul_mla v4, v22, v25, v1.s[1], v1.s[0] // -> t25a
+ mul_mls v6, v30, v17, v1.s[2], v1.s[3] // -> t23a
+ srshr v22.4s, v2.4s, #12 // t22a
+ srshr v25.4s, v4.4s, #12 // t25a
+ mul_mla v2, v30, v17, v1.s[3], v1.s[2] // -> t24a
+ srshr v30.4s, v6.4s, #12 // t23a
+ srshr v17.4s, v2.4s, #12 // t24a
+
+ ld1 {v0.4s, v1.4s}, [x16]
+
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+ sqsub v2.4s, v16.4s, v24.4s // t17
+ sqadd v16.4s, v16.4s, v24.4s // t16
+ sqsub v3.4s, v31.4s, v23.4s // t30
+ sqadd v31.4s, v31.4s, v23.4s // t31
+ sqsub v24.4s, v28.4s, v20.4s // t18
+ sqadd v28.4s, v28.4s, v20.4s // t19
+ sqadd v23.4s, v18.4s, v26.4s // t20
+ sqsub v18.4s, v18.4s, v26.4s // t21
+ sqsub v20.4s, v30.4s, v22.4s // t22
+ sqadd v30.4s, v30.4s, v22.4s // t23
+ sqadd v26.4s, v17.4s, v25.4s // t24
+ sqsub v17.4s, v17.4s, v25.4s // t25
+ sqsub v22.4s, v29.4s, v21.4s // t26
+ sqadd v29.4s, v29.4s, v21.4s // t27
+ sqadd v25.4s, v19.4s, v27.4s // t28
+ sqsub v19.4s, v19.4s, v27.4s // t29
+
+.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a
+ mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a
+ mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a
+ srshr v21.4s, v7.4s, #12 // t17a
+ srshr v27.4s, v6.4s, #12 // t30a
+ neg v2.4s, v2.4s // -> t18a
+ mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a
+ mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a
+ srshr v19.4s, v2.4s, #12 // t18a
+ srshr v24.4s, v7.4s, #12 // t29a
+ mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a
+ mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a
+ srshr v22.4s, v6.4s, #12 // t21a
+ srshr v18.4s, v2.4s, #12 // t26a
+ neg v7.4s, v7.4s // -> t22a
+ mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a
+ srshr v17.4s, v7.4s, #12 // t22a
+ srshr v20.4s, v6.4s, #12 // t25a
+
+ sqsub v2.4s, v27.4s, v24.4s // t29
+ sqadd v27.4s, v27.4s, v24.4s // t30
+ sqsub v3.4s, v21.4s, v19.4s // t18
+ sqadd v21.4s, v21.4s, v19.4s // t17
+ sqsub v24.4s, v16.4s, v28.4s // t19a
+ sqadd v16.4s, v16.4s, v28.4s // t16a
+ sqsub v19.4s, v30.4s, v23.4s // t20a
+ sqadd v30.4s, v30.4s, v23.4s // t23a
+ sqsub v28.4s, v17.4s, v22.4s // t21
+ sqadd v17.4s, v17.4s, v22.4s // t22
+ sqadd v23.4s, v26.4s, v29.4s // t24a
+ sqsub v26.4s, v26.4s, v29.4s // t27a
+ sqadd v22.4s, v20.4s, v18.4s // t25
+ sqsub v20.4s, v20.4s, v18.4s // t26
+ sqsub v29.4s, v31.4s, v25.4s // t28a
+ sqadd v31.4s, v31.4s, v25.4s // t31a
+
+.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a
+ mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a
+ mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19
+ srshr v18.4s, v7.4s, #12 // t18a
+ srshr v25.4s, v6.4s, #12 // t29a
+ mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28
+ mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20
+ srshr v29.4s, v2.4s, #12 // t19
+ srshr v24.4s, v7.4s, #12 // t28
+ neg v6.4s, v6.4s // -> t20
+ mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27
+ mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a
+ srshr v26.4s, v6.4s, #12 // t20
+ srshr v19.4s, v2.4s, #12 // t27
+ neg v7.4s, v7.4s // -> t21a
+ mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a
+ srshr v20.4s, v7.4s, #12 // t21a
+ srshr v28.4s, v6.4s, #12 // t26a
+
+ sqsub v2.4s, v16.4s, v30.4s // t23
+ sqadd v16.4s, v16.4s, v30.4s // t16 = out16
+ sqsub v3.4s, v31.4s, v23.4s // t24
+ sqadd v31.4s, v31.4s, v23.4s // t31 = out31
+ sqsub v23.4s, v21.4s, v17.4s // t22a
+ sqadd v17.4s, v21.4s, v17.4s // t17a = out17
+ sqadd v30.4s, v27.4s, v22.4s // t30a = out30
+ sqsub v21.4s, v27.4s, v22.4s // t25a
+ sqsub v27.4s, v18.4s, v20.4s // t21
+ sqadd v18.4s, v18.4s, v20.4s // t18 = out18
+ sqadd v7.4s, v29.4s, v26.4s // t19a = out19
+ sqsub v26.4s, v29.4s, v26.4s // t20a
+ sqadd v29.4s, v25.4s, v28.4s // t29 = out29
+ sqsub v25.4s, v25.4s, v28.4s // t26
+ sqadd v28.4s, v24.4s, v19.4s // t28a = out28
+ sqsub v24.4s, v24.4s, v19.4s // t27a
+ mov v19.16b, v7.16b // out19
+
+.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20
+ mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27
+ srshr v20.4s, v7.4s, #12 // t20
+ srshr v22.4s, v6.4s, #12 // t27
+
+ mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a
+ mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a
+ mov v27.16b, v22.16b // t27
+ srshr v26.4s, v7.4s, #12 // t26a
+
+ mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22
+ mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25
+ srshr v21.4s, v6.4s, #12 // t21a
+ srshr v22.4s, v24.4s, #12 // t22
+ srshr v25.4s, v7.4s, #12 // t25
+
+ mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a
+ mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a
+ srshr v23.4s, v7.4s, #12 // t23a
+ srshr v24.4s, v6.4s, #12 // t24a
+
+ ret
+endfunc
+
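+// First-pass helper for a 32x4 strip: the even-indexed inputs go through
+// inv_dct_4s_x16_neon, the odd-indexed inputs through
+// inv_dct32_odd_4s_x16_neon, and store2 merges the two halves with add/sub
+// butterflies while rounding and narrowing to 16 bit.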
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x4_neon
+ mov x14, x30
+ movi v7.4s, #0
+ lsl x8, x8, #1
+.if \scale
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.endif
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x7]
+ st1 {v7.4s}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ add x7, x7, x8, lsr #1
+.if \scale
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ bl inv_dct_4s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
+ transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
+ transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
+ transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5
+ transpose_4x4s v28, v29, v30, v31, v2, v3, v4, v5
+
+.macro store1 r0, r1, r2, r3
+ st1 {\r0}, [x6], #16
+ st1 {\r1}, [x6], #16
+ st1 {\r2}, [x6], #16
+ st1 {\r3}, [x6], #16
+.endm
+ store1 v16.4s, v20.4s, v24.4s, v28.4s
+ store1 v17.4s, v21.4s, v25.4s, v29.4s
+ store1 v18.4s, v22.4s, v26.4s, v30.4s
+ store1 v19.4s, v23.4s, v27.4s, v31.4s
+.purgem store1
+ sub x6, x6, #64*4
+
+ movi v7.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x7]
+ st1 {v7.4s}, [x7], x8
+.endr
+.if \scale
+ // This relies on the fact that the idct also leaves the right coeff in v0.s[1]
+ scale_input .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ bl inv_dct32_odd_4s_x16_neon
+ transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5
+ transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5
+ transpose_4x4s v23, v22, v21, v20, v2, v3, v4, v5
+ transpose_4x4s v19, v18, v17, v16, v2, v3, v4, v5
+.macro store2 r0, r1, r2, r3, shift
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x6]
+ sqsub v4.4s, v0.4s, \r0
+ sqadd v0.4s, v0.4s, \r0
+ sqsub v5.4s, v1.4s, \r1
+ sqadd v1.4s, v1.4s, \r1
+ sqsub v6.4s, v2.4s, \r2
+ sqadd v2.4s, v2.4s, \r2
+ sqsub v7.4s, v3.4s, \r3
+ sqadd v3.4s, v3.4s, \r3
+ sqrshrn v0.4h, v0.4s, #\shift
+ sqrshrn2 v0.8h, v1.4s, #\shift
+ sqrshrn v1.4h, v2.4s, #\shift
+ sqrshrn2 v1.8h, v3.4s, #\shift
+ sqrshrn v2.4h, v7.4s, #\shift
+ sqrshrn2 v2.8h, v6.4s, #\shift
+ sqrshrn v3.4h, v5.4s, #\shift
+ sqrshrn2 v3.8h, v4.4s, #\shift
+ st1 {v0.8h, v1.8h}, [x6], #32
+ rev64 v2.8h, v2.8h
+ rev64 v3.8h, v3.8h
+ st1 {v2.8h, v3.8h}, [x6], #32
+.endm
+
+ store2 v31.4s, v27.4s, v23.4s, v19.4s, \shift
+ store2 v30.4s, v26.4s, v22.4s, v18.4s, \shift
+ store2 v29.4s, v25.4s, v21.4s, v17.4s, \shift
+ store2 v28.4s, v24.4s, v20.4s, v16.4s, \shift
+.purgem store2
+ ret x14
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
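+// Second-pass (vertical) DCT for an 8x32 strip: even inputs via
+// X(inv_dct_8h_x16_neon), odd inputs via X(inv_dct32_odd_8h_x16_neon), then
+// the combine macro applies the final butterflies, rounds, adds to the
+// destination and clamps to 0x3ff.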
+function inv_txfm_add_vert_dct_8x32_neon
+ mov x14, x30
+ lsl x8, x8, #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+
+ bl X(inv_dct_8h_x16_neon)
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ st1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ add x7, x7, x8, lsr #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ sub x7, x7, x8, lsr #1
+ bl X(inv_dct32_odd_8h_x16_neon)
+
+ neg x9, x8
+ mov x10, x6
+ mvni v1.8h, #0xfc, lsl #8 // 0x3ff
+.macro combine r0, r1, r2, r3, op, stride
+ ld1 {v5.8h}, [x7], \stride
+ ld1 {v2.8h}, [x10], x1
+ ld1 {v6.8h}, [x7], \stride
+ ld1 {v3.8h}, [x10], x1
+ \op v5.8h, v5.8h, \r0
+ ld1 {v7.8h}, [x7], \stride
+ ld1 {v4.8h}, [x10], x1
+ srshr v5.8h, v5.8h, #4
+ \op v6.8h, v6.8h, \r1
+ usqadd v2.8h, v5.8h
+ srshr v6.8h, v6.8h, #4
+ \op v7.8h, v7.8h, \r2
+ ld1 {v5.8h}, [x7], \stride
+ usqadd v3.8h, v6.8h
+ smin v2.8h, v2.8h, v1.8h
+ srshr v7.8h, v7.8h, #4
+ \op v5.8h, v5.8h, \r3
+ st1 {v2.8h}, [x6], x1
+ ld1 {v2.8h}, [x10], x1
+ usqadd v4.8h, v7.8h
+ smin v3.8h, v3.8h, v1.8h
+ srshr v5.8h, v5.8h, #4
+ st1 {v3.8h}, [x6], x1
+ usqadd v2.8h, v5.8h
+ smin v4.8h, v4.8h, v1.8h
+ st1 {v4.8h}, [x6], x1
+ smin v2.8h, v2.8h, v1.8h
+ st1 {v2.8h}, [x6], x1
+.endm
+ combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
+ combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
+ combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
+ combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
+ sub x7, x7, x8
+ combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
+ combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
+ combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
+ combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
+.purgem combine
+
+ ret x14
+endfunc
+
+const eob_32x32
+ .short 10, 36, 78, 136, 210, 300, 406, 1024
+endconst
+
+const eob_16x32
+ .short 10, 36, 78, 151, 215, 279, 343, 512
+endconst
+
+const eob_16x32_shortside
+ .short 10, 36, 78, 512
+endconst
+
+const eob_8x32
+ .short 10, 43, 75, 107, 139, 171, 203, 256
+endconst
+
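+// identity_identity 32x32 needs no butterfly passes: each 8x8 sub-block is
+// narrowed to 16 bit, transposed and added to the destination with a 2-bit
+// rounding shift; sub-blocks past the eob thresholds in eob_32x32 are skipped.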
+function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1
+ movi v0.8h, #0
+ movi v1.8h, #0
+ movrel x13, eob_32x32, 2
+
+ mov x8, #4*32
+1:
+ mov w9, #0
+ movrel x12, eob_32x32, 2
+2:
+ add w9, w9, #8
+ ld1 {v16.4s, v17.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v18.4s, v19.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v20.4s, v21.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v22.4s, v23.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v24.4s, v25.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v26.4s, v27.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v28.4s, v29.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v30.4s, v31.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ sqxtn v16.4h, v16.4s
+ sqxtn2 v16.8h, v17.4s
+ sqxtn v17.4h, v18.4s
+ sqxtn2 v17.8h, v19.4s
+ sqxtn v18.4h, v20.4s
+ sqxtn2 v18.8h, v21.4s
+ sqxtn v19.4h, v22.4s
+ sqxtn2 v19.8h, v23.4s
+ sqxtn v20.4h, v24.4s
+ sqxtn2 v20.8h, v25.4s
+ sqxtn v21.4h, v26.4s
+ sqxtn2 v21.8h, v27.4s
+ sqxtn v22.4h, v28.4s
+ sqxtn2 v22.8h, v29.4s
+ sqxtn v23.4h, v30.4s
+ sqxtn2 v23.8h, v31.4s
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+ load_add_store_8x8 x0, x7, shiftbits=2
+ ldrh w11, [x12], #4
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #2*8
+ cmp w3, w11
+ b.ge 2b
+
+ ldrh w11, [x13], #4
+ cmp w3, w11
+ b.lt 9f
+
+ sub x0, x0, w9, uxtw #1
+ add x0, x0, x1, lsl #3
+ msub x2, x8, x9, x2
+ add x2, x2, #4*8
+ b 1b
+9:
+ ret
+endfunc
+
+.macro shift_16_regs op, shift
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ \op \i, \i, #\shift
+.endr
+.endm
+
+.macro def_identity_1632 w, h, wshort, hshort
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+ movz w16, #2896*8, lsl #16
+ movz w17, #2*(5793-4096)*8, lsl #16
+ movi v0.4s, #0
+ movi v1.4s, #0
+ movrel x13, eob_16x32\hshort, 2
+
+ mov x8, #4*\h
+1:
+ mov w9, #0
+ movrel x12, eob_16x32\wshort, 2
+2:
+ add w9, w9, #8
+ ld1 {v16.4s, v17.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ dup v2.2s, w16
+ ld1 {v18.4s, v19.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ mov v2.s[1], w17
+ ld1 {v20.4s, v21.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v22.4s, v23.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v24.4s, v25.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v26.4s, v27.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v28.4s, v29.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v30.4s, v31.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ scale_input .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+
+.if \w == 16
+ // 16x32
+ identity_4x16_shift1 v2.s[1]
+.else
+ // 32x16
+ shift_16_regs sqshl, 1
+ identity_4x16 v2.s[1]
+.endif
+ sqxtn v16.4h, v16.4s
+ sqxtn2 v16.8h, v17.4s
+ sqxtn v17.4h, v18.4s
+ sqxtn2 v17.8h, v19.4s
+ sqxtn v18.4h, v20.4s
+ sqxtn2 v18.8h, v21.4s
+ sqxtn v19.4h, v22.4s
+ sqxtn2 v19.8h, v23.4s
+ sqxtn v20.4h, v24.4s
+ sqxtn2 v20.8h, v25.4s
+ sqxtn v21.4h, v26.4s
+ sqxtn2 v21.8h, v27.4s
+ sqxtn v22.4h, v28.4s
+ sqxtn2 v22.8h, v29.4s
+ sqxtn v23.4h, v30.4s
+ sqxtn2 v23.8h, v31.4s
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+.if \w == 16
+ load_add_store_8x8 x0, x7, shiftbits=2
+.else
+ load_add_store_8x8 x0, x7, shiftbits=4
+.endif
+ ldrh w11, [x12], #4
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #16
+ cmp w3, w11
+ b.ge 2b
+
+ ldrh w11, [x13], #4
+ cmp w3, w11
+ b.lt 9f
+
+ sub x0, x0, w9, uxtw #1
+ add x0, x0, x1, lsl #3
+ msub x2, x8, x9, x2
+ add x2, x2, #4*8
+ b 1b
+9:
+ ret
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+.macro def_identity_832 w, h
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+ movi v0.4s, #0
+ movi v1.4s, #0
+ // Working on 8x8 blocks, read every other entry from eob_8x32
+ movrel x13, eob_8x32, 2
+
+ mov w8, #4*\h
+1:
+ // Working on 8x8 blocks, read every other entry from eob_8x32
+ ldrh w12, [x13], #4
+ ld1 {v16.4s, v17.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v18.4s, v19.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v20.4s, v21.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v22.4s, v23.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v24.4s, v25.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v26.4s, v27.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v28.4s, v29.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v30.4s, v31.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+
+.if \w == 8
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn2 v16.8h, v17.4s, #1
+ sqrshrn v17.4h, v18.4s, #1
+ sqrshrn2 v17.8h, v19.4s, #1
+ sqrshrn v18.4h, v20.4s, #1
+ sqrshrn2 v18.8h, v21.4s, #1
+ sqrshrn v19.4h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+ sqrshrn v20.4h, v24.4s, #1
+ sqrshrn2 v20.8h, v25.4s, #1
+ sqrshrn v21.4h, v26.4s, #1
+ sqrshrn2 v21.8h, v27.4s, #1
+ sqrshrn v22.4h, v28.4s, #1
+ sqrshrn2 v22.8h, v29.4s, #1
+ sqrshrn v23.4h, v30.4s, #1
+ sqrshrn2 v23.8h, v31.4s, #1
+.else
+ sqxtn v16.4h, v16.4s
+ sqxtn2 v16.8h, v17.4s
+ sqxtn v17.4h, v18.4s
+ sqxtn2 v17.8h, v19.4s
+ sqxtn v18.4h, v20.4s
+ sqxtn2 v18.8h, v21.4s
+ sqxtn v19.4h, v22.4s
+ sqxtn2 v19.8h, v23.4s
+ sqxtn v20.4h, v24.4s
+ sqxtn2 v20.8h, v25.4s
+ sqxtn v21.4h, v26.4s
+ sqxtn2 v21.8h, v27.4s
+ sqxtn v22.4h, v28.4s
+ sqxtn2 v22.8h, v29.4s
+ sqxtn v23.4h, v30.4s
+ sqxtn2 v23.8h, v31.4s
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+ cmp w3, w12
+.if \w == 8
+ load_add_store_8x8 x0, x7, shiftbits=2
+.else
+ load_add_store_8x8 x0, x7, shiftbits=3
+.endif
+
+ b.lt 9f
+.if \w == 8
+ sub x2, x2, x8, lsl #3
+ add x2, x2, #4*8
+.else
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #2*8
+.endif
+ b 1b
+
+9:
+ ret
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1
+ idct_dc 32, 32, 2
+
+ mov x15, x30
+ sub sp, sp, #2048
+ movrel x13, eob_32x32
+ ldrh w12, [x13], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, sp, #(\i*32*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 28
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #32*4
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, sp, #2048
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1
+ idct_dc 16, 32, 1
+
+ mov x15, x30
+ sub sp, sp, #1024
+ movrel x13, eob_16x32
+ ldrh w12, [x13], #2
+ adr x4, inv_dct_4s_x16_neon
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, sp, #(\i*16*2)
+ add x7, x2, #(\i*4)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 28
+ ldrh w12, [x13], #2
+.endif
+.endif
+ mov x8, #4*32
+ bl inv_txfm_horz_scale_16x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #16*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, sp, #1024
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
+ idct_dc 32, 16, 1
+
+ mov x15, x30
+ sub sp, sp, #1024
+
+ movrel x13, eob_16x32
+ movrel x5, X(inv_dct_8h_x16_neon)
+ ldrh w12, [x13], #2
+
+.irp i, 0, 4, 8, 12
+ add x6, sp, #(\i*32*2)
+ add x7, x2, #(\i*4)
+.if \i > 0
+ mov w8, #(16 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 12
+ ldrh w12, [x13], #2
+.endif
+.endif
+ mov x8, #4*16
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, sp, #1024
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
+ idct_dc 8, 32, 2
+
+ mov x15, x30
+ sub sp, sp, #512
+
+ movrel x13, eob_8x32
+
+ movi v28.4s, #0
+ mov x8, #4*32
+ mov w9, #32
+ mov x6, sp
+ mov x7, x2
+1:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().4s}, [x7]
+ st1 {v28.4s}, [x7], x8
+.endr
+ ldrh w12, [x13], #2
+ sub w9, w9, #4
+ sub x7, x7, x8, lsl #3
+ add x7, x7, #4*4
+
+ bl inv_dct_4s_x8_neon
+
+ sqrshrn v16.4h, v16.4s, #2
+ sqrshrn v17.4h, v17.4s, #2
+ sqrshrn v18.4h, v18.4s, #2
+ sqrshrn v19.4h, v19.4s, #2
+ sqrshrn2 v16.8h, v20.4s, #2
+ sqrshrn2 v17.8h, v21.4s, #2
+ sqrshrn2 v18.8h, v22.4s, #2
+ sqrshrn2 v19.8h, v23.4s, #2
+
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+
+ cmp w3, w12
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
+
+ b.ge 1b
+ cbz w9, 3f
+
+ movi v29.8h, #0
+ movi v30.8h, #0
+ movi v31.8h, #0
+2:
+ subs w9, w9, #4
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
+ b.gt 2b
+
+3:
+ mov x6, x0
+ mov x7, sp
+ mov x8, #8*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
+ idct_dc 32, 8, 2
+
+ mov x15, x30
+ sub sp, sp, #512
+
+.irp i, 0, 4
+ add x6, sp, #(\i*32*2)
+ add x7, x2, #(\i*4)
+.if \i > 0
+ cmp w3, #10
+ b.lt 1f
+.endif
+ mov x8, #8*4
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 2f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+
+2:
+ mov x8, #2*32
+ mov w9, #0
+1:
+ add x6, x0, x9, lsl #1
+ add x7, sp, x9, lsl #1 // #(\i*2)
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ add w9, w9, #8
+
+ bl X(inv_dct_8h_x8_neon)
+
+ cmp w9, #32
+
+ load_add_store_8x8 x6, x7
+
+ b.lt 1b
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
+function inv_dct64_step1_neon
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+ ld1 {v0.4s, v1.4s}, [x17], #32
+
+ sqrdmulh v23.4s, v16.4s, v0.s[1] // t63a
+ sqrdmulh v16.4s, v16.4s, v0.s[0] // t32a
+ sqrdmulh v22.4s, v17.4s, v0.s[2] // t62a
+ sqrdmulh v17.4s, v17.4s, v0.s[3] // t33a
+ sqrdmulh v21.4s, v18.4s, v1.s[1] // t61a
+ sqrdmulh v18.4s, v18.4s, v1.s[0] // t34a
+ sqrdmulh v20.4s, v19.4s, v1.s[2] // t60a
+ sqrdmulh v19.4s, v19.4s, v1.s[3] // t35a
+
+ ld1 {v0.4s}, [x17], #16
+
+ sqadd v24.4s, v16.4s, v17.4s // t32
+ sqsub v25.4s, v16.4s, v17.4s // t33
+ sqsub v26.4s, v19.4s, v18.4s // t34
+ sqadd v27.4s, v19.4s, v18.4s // t35
+ sqadd v28.4s, v20.4s, v21.4s // t60
+ sqsub v29.4s, v20.4s, v21.4s // t61
+ sqsub v30.4s, v23.4s, v22.4s // t62
+ sqadd v31.4s, v23.4s, v22.4s // t63
+
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a
+ mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a
+ neg v2.4s, v2.4s // t34a
+ mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a
+ srshr v26.4s, v2.4s, #12 // t34a
+ mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a
+ srshr v29.4s, v7.4s, #12 // t61a
+ srshr v25.4s, v6.4s, #12 // t33a
+ srshr v30.4s, v2.4s, #12 // t62a
+
+ sqadd v16.4s, v24.4s, v27.4s // t32a
+ sqsub v19.4s, v24.4s, v27.4s // t35a
+ sqadd v17.4s, v25.4s, v26.4s // t33
+ sqsub v18.4s, v25.4s, v26.4s // t34
+ sqsub v20.4s, v31.4s, v28.4s // t60a
+ sqadd v23.4s, v31.4s, v28.4s // t63a
+ sqsub v21.4s, v30.4s, v29.4s // t61
+ sqadd v22.4s, v30.4s, v29.4s // t62
+
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a
+ mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a
+ mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60
+ srshr v21.4s, v2.4s, #12 // t61a
+ srshr v18.4s, v7.4s, #12 // t34a
+ mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35
+ srshr v20.4s, v6.4s, #12 // t60
+ srshr v19.4s, v2.4s, #12 // t35
+
+ st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64
+ st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64
+
+ ret
+endfunc
+
+function inv_dct64_step2_neon
+ movrel x16, idct_coeffs
+ ld1 {v0.4s}, [x16]
+1:
+ // t32a/33/34a/35/60/61a/62/63a
+ // t56a/57/58a/59/36/37a/38/39a
+ // t40a/41/42a/43/52/53a/54/55a
+ // t48a/49/50a/51/44/45a/46/47a
+ ldr q16, [x6, #4*4*0] // t32a
+ ldr q17, [x9, #4*4*8] // t39a
+ ldr q18, [x9, #4*4*0] // t63a
+ ldr q19, [x6, #4*4*8] // t56a
+ ldr q20, [x6, #4*4*16] // t40a
+ ldr q21, [x9, #4*4*24] // t47a
+ ldr q22, [x9, #4*4*16] // t55a
+ ldr q23, [x6, #4*4*24] // t48a
+
+ sqadd v24.4s, v16.4s, v17.4s // t32
+ sqsub v25.4s, v16.4s, v17.4s // t39
+ sqadd v26.4s, v18.4s, v19.4s // t63
+ sqsub v27.4s, v18.4s, v19.4s // t56
+ sqsub v28.4s, v21.4s, v20.4s // t40
+ sqadd v29.4s, v21.4s, v20.4s // t47
+ sqadd v30.4s, v23.4s, v22.4s // t48
+ sqsub v31.4s, v23.4s, v22.4s // t55
+
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a
+ mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a
+ mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a
+ srshr v25.4s, v2.4s, #12 // t56a
+ srshr v27.4s, v7.4s, #12 // t39a
+ neg v6.4s, v6.4s // t40a
+ mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a
+ srshr v31.4s, v6.4s, #12 // t40a
+ srshr v28.4s, v2.4s, #12 // t55a
+
+ sqadd v16.4s, v24.4s, v29.4s // t32a
+ sqsub v19.4s, v24.4s, v29.4s // t47a
+ sqadd v17.4s, v27.4s, v31.4s // t39
+ sqsub v18.4s, v27.4s, v31.4s // t40
+ sqsub v20.4s, v26.4s, v30.4s // t48a
+ sqadd v23.4s, v26.4s, v30.4s // t63a
+ sqsub v21.4s, v25.4s, v28.4s // t55
+ sqadd v22.4s, v25.4s, v28.4s // t56
+
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a
+ mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a
+ mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47
+ srshr v18.4s, v2.4s, #12 // t40a
+ srshr v21.4s, v7.4s, #12 // t55a
+ mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48
+ srshr v19.4s, v6.4s, #12 // t47
+ srshr v20.4s, v2.4s, #12 // t48
+
+ str q16, [x6, #4*4*0] // t32a
+ str q17, [x9, #4*4*0] // t39
+ str q18, [x6, #4*4*8] // t40a
+ str q19, [x9, #4*4*8] // t47
+ str q20, [x6, #4*4*16] // t48
+ str q21, [x9, #4*4*16] // t55a
+ str q22, [x6, #4*4*24] // t56
+ str q23, [x9, #4*4*24] // t63a
+
+ add x6, x6, #4*4
+ sub x9, x9, #4*4
+ cmp x6, x9
+ b.lt 1b
+ ret
+endfunc
+
+.macro load8 src, strd, zero, clear
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+.if \clear
+ ld1 {\i}, [\src]
+ st1 {\zero}, [\src], \strd
+.else
+ ld1 {\i}, [\src], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ st1 {\i}, [\dst], #16
+.endr
+.endm
+
+.macro clear_upper8
+.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ movi \i, #0
+.endr
+.endm
+
+.macro movi_if reg, val, cond
+.if \cond
+ movi \reg, \val
+.endif
+.endm
+
+.macro movz16dup_if reg, gpr, val, cond
+.if \cond
+ movz \gpr, \val, lsl #16
+ dup \reg, \gpr
+.endif
+.endm
+
+.macro st1_if regs, dst, cond
+.if \cond
+ st1 \regs, \dst
+.endif
+.endm
+
+.macro str_if reg, dst, cond
+.if \cond
+ str \reg, \dst
+.endif
+.endm
+
+.macro stroff_if reg, dst, dstoff, cond
+.if \cond
+ str \reg, \dst, \dstoff
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
+.if \cond
+ scale_input .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
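+// The 64-point DCT below is assembled from smaller pieces: a 16-point DCT over
+// inputs 0, 4, 8, ..., the 32-point odd half over inputs 2, 6, 10, ..., and
+// four inv_dct64_step1 passes over the odd inputs (see the in1/in31/... notes
+// above step1), which inv_dct64_step2 then folds together. Only 32 input rows
+// are read, since the upper half of a 64-point AV1 transform is always zero;
+// the clear/scale variants additionally zero the coefficients as they are read
+// and pre-scale them by 2896/4096.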
+.macro def_dct64_func suffix, clear=0, scale=0
+function inv_txfm_dct\suffix\()_4s_x64_neon
+ mov x14, x30
+ mov x6, sp
+ lsl x8, x8, #2
+
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ load8 x7, x8, v7.4s, \clear
+ clear_upper8
+ sub x7, x7, x8, lsl #3
+ add x7, x7, x8, lsr #1
+ scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ bl inv_dct_4s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
+ store16 x6
+
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ load8 x7, x8, v7.4s, \clear
+ clear_upper8
+ sub x7, x7, x8, lsl #3
+ lsr x8, x8, #1
+ sub x7, x7, x8, lsr #1
+ scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ bl inv_dct32_odd_4s_x16_neon
+
+ add x10, x6, #16*15
+ sub x6, x6, #16*16
+
+ mov x9, #-16
+
+ movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+.macro store_addsub r0, r1, r2, r3
+ ld1 {v2.4s}, [x6], #16
+ ld1 {v3.4s}, [x6], #16
+ sqadd v6.4s, v2.4s, \r0
+ sqsub \r0, v2.4s, \r0
+ ld1 {v4.4s}, [x6], #16
+ sqadd v7.4s, v3.4s, \r1
+ sqsub \r1, v3.4s, \r1
+ smin v6.4s, v6.4s, v1.4s
+ smin \r0, \r0, v1.4s
+ ld1 {v5.4s}, [x6], #16
+ sqadd v2.4s, v4.4s, \r2
+ sub x6, x6, #16*4
+ smax v6.4s, v6.4s, v0.4s
+ smax \r0, \r0, v0.4s
+ sqsub \r2, v4.4s, \r2
+ smin v7.4s, v7.4s, v1.4s
+ smin \r1, \r1, v1.4s
+ st1 {v6.4s}, [x6], #16
+ st1 {\r0}, [x10], x9
+ smin v2.4s, v2.4s, v1.4s
+ smin \r2, \r2, v1.4s
+ smax v7.4s, v7.4s, v0.4s
+ smax \r1, \r1, v0.4s
+ sqadd v3.4s, v5.4s, \r3
+ sqsub \r3, v5.4s, \r3
+ smax v2.4s, v2.4s, v0.4s
+ smax \r2, \r2, v0.4s
+ smin v3.4s, v3.4s, v1.4s
+ smin \r3, \r3, v1.4s
+ st1 {v7.4s}, [x6], #16
+ st1 {\r1}, [x10], x9
+ smax v3.4s, v3.4s, v0.4s
+ smax \r3, \r3, v0.4s
+ st1 {v2.4s}, [x6], #16
+ st1 {\r2}, [x10], x9
+ st1 {v3.4s}, [x6], #16
+ st1 {\r3}, [x10], x9
+.endm
+ store_addsub v31.4s, v30.4s, v29.4s, v28.4s
+ store_addsub v27.4s, v26.4s, v25.4s, v24.4s
+ store_addsub v23.4s, v22.4s, v21.4s, v20.4s
+ store_addsub v19.4s, v18.4s, v17.4s, v16.4s
+.purgem store_addsub
+
+ add x6, x6, #4*4*16
+
+ movrel x17, idct64_coeffs
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ add x9, x7, x8, lsl #4 // offset 16
+ add x10, x7, x8, lsl #3 // offset 8
+ sub x9, x9, x8 // offset 15
+ sub x11, x10, x8 // offset 7
+ ld1 {v16.4s}, [x7] // in1 (offset 0)
+ ld1 {v17.4s}, [x9] // in31 (offset 15)
+ ld1 {v18.4s}, [x10] // in17 (offset 8)
+ ld1 {v19.4s}, [x11] // in15 (offset 7)
+ st1_if {v7.4s}, [x7], \clear
+ st1_if {v7.4s}, [x9], \clear
+ st1_if {v7.4s}, [x10], \clear
+ st1_if {v7.4s}, [x11], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ add x7, x7, x8, lsl #2 // offset 4
+ sub x9, x9, x8, lsl #2 // offset 11
+ sub x10, x7, x8 // offset 3
+ add x11, x9, x8 // offset 12
+ ld1 {v16.4s}, [x10] // in7 (offset 3)
+ ld1 {v17.4s}, [x11] // in25 (offset 12)
+ ld1 {v18.4s}, [x9] // in23 (offset 11)
+ ld1 {v19.4s}, [x7] // in9 (offset 4)
+ st1_if {v7.4s}, [x7], \clear
+ st1_if {v7.4s}, [x9], \clear
+ st1_if {v7.4s}, [x10], \clear
+ st1_if {v7.4s}, [x11], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ sub x10, x10, x8, lsl #1 // offset 1
+ sub x9, x9, x8, lsl #1 // offset 9
+ add x7, x7, x8 // offset 5
+ add x11, x11, x8 // offset 13
+ ldr q16, [x10, x8] // in5 (offset 2)
+ ldr q17, [x11] // in27 (offset 13)
+ ldr q18, [x9, x8] // in21 (offset 10)
+ ldr q19, [x7] // in11 (offset 5)
+ stroff_if q7, [x10, x8], \clear
+ str_if q7, [x11], \clear
+ stroff_if q7, [x9, x8], \clear
+ str_if q7, [x7], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ ldr q16, [x10] // in3 (offset 1)
+ ldr q17, [x11, x8] // in29 (offset 14)
+ ldr q18, [x9] // in19 (offset 9)
+ ldr q19, [x7, x8] // in13 (offset 6)
+ str_if q7, [x10], \clear
+ stroff_if q7, [x11, x8], \clear
+ str_if q7, [x9], \clear
+ stroff_if q7, [x7, x8], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+
+ sub x6, x6, #4*4*32
+ add x9, x6, #4*4*7
+
+ bl inv_dct64_step2_neon
+
+ ret x14
+endfunc
+.endm
+
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
+
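+// Horizontal pass helper: reads the dct64 intermediate from the stack, the
+// first half ascending (x7) and the mirrored half descending (x8), applies the
+// final out[i] = t[i] + t[63-i] / out[63-i] = t[i] - t[63-i] butterfly, rounds
+// by the (negative) shift passed in w12 and stores narrowed 16-bit rows, with
+// the mirrored half written out in reversed order.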
+function inv_txfm_horz_dct_64x4_neon
+ mov x14, x30
+
+ mov x7, sp
+ add x8, sp, #4*4*(64 - 4)
+ add x9, x6, #2*56
+ mov x10, #2*64
+ mov x11, #-4*4*4
+
+ dup v7.4s, w12
+1:
+ ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64
+ ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11
+ ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11
+ transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
+ transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
+ transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5
+ transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5
+
+.macro store_addsub src0, src1, src2, src3
+ sqsub v1.4s, \src0, \src1
+ sqadd v0.4s, \src0, \src1
+ sqsub v3.4s, \src2, \src3
+ srshl v1.4s, v1.4s, v7.4s
+ sqadd v2.4s, \src2, \src3
+ srshl v3.4s, v3.4s, v7.4s
+ srshl v0.4s, v0.4s, v7.4s
+ srshl v2.4s, v2.4s, v7.4s
+ sqxtn v3.4h, v3.4s
+ sqxtn2 v3.8h, v1.4s
+ sqxtn v0.4h, v0.4s
+ sqxtn2 v0.8h, v2.4s
+ rev64 v3.8h, v3.8h
+ st1 {v0.8h}, [x6], x10
+ st1 {v3.8h}, [x9], x10
+.endm
+ store_addsub v16.4s, v31.4s, v20.4s, v27.4s
+ store_addsub v17.4s, v30.4s, v21.4s, v26.4s
+ store_addsub v18.4s, v29.4s, v22.4s, v25.4s
+ store_addsub v19.4s, v28.4s, v23.4s, v24.4s
+.purgem store_addsub
+ sub x6, x6, x10, lsl #2
+ sub x9, x9, x10, lsl #2
+ add x6, x6, #16
+ sub x9, x9, #16
+
+ cmp x7, x8
+ b.lt 1b
+ ret x14
+endfunc
+
+function inv_txfm_add_vert_dct_8x64_neon
+ mov x14, x30
+ lsl x8, x8, #1
+
+ mov x7, sp
+ add x8, sp, #2*8*(64 - 4)
+ add x9, x6, x1, lsl #6
+ sub x9, x9, x1
+ neg x10, x1
+ mov x11, #-2*8*4
+
+1:
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
+ ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
+
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+.macro add_dest_addsub src0, src1, src2, src3
+ ld1 {v0.8h}, [x6], x1
+ ld1 {v1.8h}, [x9], x10
+ sqadd v4.8h, \src0, \src1
+ ld1 {v2.8h}, [x6]
+ sqsub \src0, \src0, \src1
+ ld1 {v3.8h}, [x9]
+ sqadd v5.8h, \src2, \src3
+ sqsub \src2, \src2, \src3
+ sub x6, x6, x1
+ sub x9, x9, x10
+ srshr v4.8h, v4.8h, #4
+ srshr v5.8h, v5.8h, #4
+ srshr \src0, \src0, #4
+ usqadd v0.8h, v4.8h
+ srshr \src2, \src2, #4
+ usqadd v1.8h, \src0
+ usqadd v2.8h, v5.8h
+ smin v0.8h, v0.8h, v7.8h
+ usqadd v3.8h, \src2
+ smin v1.8h, v1.8h, v7.8h
+ st1 {v0.8h}, [x6], x1
+ smin v2.8h, v2.8h, v7.8h
+ st1 {v1.8h}, [x9], x10
+ smin v3.8h, v3.8h, v7.8h
+ st1 {v2.8h}, [x6], x1
+ st1 {v3.8h}, [x9], x10
+.endm
+ add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h
+ add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h
+ add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h
+ add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h
+.purgem add_dest_addsub
+ cmp x7, x8
+ b.lt 1b
+
+ ret x14
+endfunc
+
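+// 64x64: the first pass runs the 64-point DCT over four rows at a time into a
+// stack buffer (row groups beyond the eob_32x32 threshold are simply
+// zero-filled), then the second pass runs the halfword 64-point DCT over eight
+// columns at a time and accumulates the result into the destination.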
+function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
+ idct_dc 64, 64, 2
+
+ mov x15, x30
+
+ sub_sp 64*32*2+64*4*4
+ add x5, sp, #64*4*4
+
+ movrel x13, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, x5, #(\i*64*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #32*4
+ mov x12, #-2 // shift
+ bl inv_txfm_dct_clear_4s_x64_neon
+ add x6, x5, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+ ldrh w12, [x13], #2
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x7, x5, #(\i*2)
+ mov x8, #64*2
+ bl X(inv_txfm_dct_8h_x64_neon)
+ add x6, x0, #(\i*2)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #64*32*2
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
+ idct_dc 64, 32, 1
+
+ mov x15, x30
+
+ sub_sp 64*32*2+64*4*4
+ add x5, sp, #64*4*4
+
+ movrel x13, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, x5, #(\i*64*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #32*4
+ mov x12, #-1 // shift
+ bl inv_txfm_dct_clear_scale_4s_x64_neon
+ add x6, x5, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+ ldrh w12, [x13], #2
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x6, x0, #(\i*2)
+ add x7, x5, #(\i*2)
+ mov x8, #64*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, x5, #64*32*2
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
+ idct_dc 32, 64, 1
+
+ mov x15, x30
+
+ sub_sp 32*32*2+64*8*2
+ add x5, sp, #64*8*2
+
+ movrel x13, eob_32x32
+ ldrh w12, [x13], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, x5, #(\i*32*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+ ldrh w12, [x13], #2
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #32*4
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x7, x5, #(\i*2)
+ mov x8, #32*2
+ bl X(inv_txfm_dct_8h_x64_neon)
+ add x6, x0, #(\i*2)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #32*32*2
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
+ idct_dc 64, 16, 2
+
+ mov x15, x30
+
+ sub_sp 64*16*2+64*4*4
+ add x4, sp, #64*4*4
+
+ movrel x13, eob_16x32
+
+.irp i, 0, 4, 8, 12
+ add x6, x4, #(\i*64*2)
+.if \i > 0
+ mov w8, #(16 - \i)
+ cmp w3, w12
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #16*4
+ mov x12, #-2 // shift
+ bl inv_txfm_dct_clear_4s_x64_neon
+ add x6, x4, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 12
+ ldrh w12, [x13], #2
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+ movrel x5, X(inv_dct_8h_x16_neon)
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x6, x0, #(\i*2)
+ add x7, x4, #(\i*2)
+ mov x8, #64*2
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, x4, #64*16*2
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
+ idct_dc 16, 64, 2
+
+ mov x15, x30
+
+ sub_sp 16*32*2+64*8*2
+ add x5, sp, #64*8*2
+
+ movrel x13, eob_16x32
+ ldrh w12, [x13], #2
+
+ adr x4, inv_dct_4s_x16_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, x5, #(\i*16*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 28
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #32*4
+ bl inv_txfm_horz_16x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8
+ add x7, x5, #(\i*2)
+ mov x8, #16*2
+ bl X(inv_txfm_dct_8h_x64_neon)
+ add x6, x0, #(\i*2)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #16*32*2
+ ret x15
+endfunc
diff --git a/third_party/dav1d/src/arm/64/loopfilter.S b/third_party/dav1d/src/arm/64/loopfilter.S
new file mode 100644
index 0000000000..63d5de10ad
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/loopfilter.S
@@ -0,0 +1,1129 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// depending on how many pixels need to be stored, returns:
+// x14 = (1 << 0) : 0 pixels
+// x14 = (1 << 4) : inner 4 pixels
+// x14 = (1 << 6) : inner 6 pixels
+// x14 = 0 : all pixels
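+// The lpf_16_wd* wrapper macros below test these bits with cbz/tbnz to decide
+// how many of the filtered pixels to write back.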
+.macro loop_filter wd
+function lpf_16_wd\wd\()_neon
+ uabd v0.16b, v22.16b, v23.16b // abs(p1 - p0)
+ uabd v1.16b, v25.16b, v24.16b // abs(q1 - q0)
+ uabd v2.16b, v23.16b, v24.16b // abs(p0 - q0)
+ uabd v3.16b, v22.16b, v25.16b // abs(p1 - q1)
+.if \wd >= 6
+ uabd v4.16b, v21.16b, v22.16b // abs(p2 - p1)
+ uabd v5.16b, v26.16b, v25.16b // abs(q2 - q1)
+.endif
+.if \wd >= 8
+ uabd v6.16b, v20.16b, v21.16b // abs(p3 - p2)
+ uabd v7.16b, v27.16b, v26.16b // abs(q3 - q2)
+.endif
+.if \wd >= 6
+ umax v4.16b, v4.16b, v5.16b
+.endif
+ uqadd v2.16b, v2.16b, v2.16b // abs(p0 - q0) * 2
+.if \wd >= 8
+ umax v6.16b, v6.16b, v7.16b
+.endif
+ ushr v3.16b, v3.16b, #1
+.if \wd >= 8
+ umax v4.16b, v4.16b, v6.16b
+.endif
+.if \wd >= 6
+ and v4.16b, v4.16b, v14.16b
+.endif
+ umax v0.16b, v0.16b, v1.16b // max(abs(p1 - p0), abs(q1 - q0))
+ uqadd v2.16b, v2.16b, v3.16b // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
+.if \wd >= 6
+ umax v4.16b, v0.16b, v4.16b
+ cmhs v1.16b, v11.16b, v4.16b // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
+.else
+ cmhs v1.16b, v11.16b, v0.16b // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+ cmhs v2.16b, v10.16b, v2.16b // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
+ and v1.16b, v1.16b, v2.16b // fm
+ and v1.16b, v1.16b, v13.16b // fm && wd >= 4
+.if \wd >= 6
+ and v14.16b, v14.16b, v1.16b // fm && wd > 4
+.endif
+.if \wd >= 16
+ and v15.16b, v15.16b, v1.16b // fm && wd == 16
+.endif
+
+ mov x16, v1.d[0]
+ mov x17, v1.d[1]
+ adds x16, x16, x17
+ b.ne 9f // if (!fm || wd < 4) return;
+ mov x14, #(1 << 0)
+ ret
+9:
+.if \wd >= 6
+ movi v10.16b, #1
+ uabd v2.16b, v21.16b, v23.16b // abs(p2 - p0)
+ uabd v3.16b, v22.16b, v23.16b // abs(p1 - p0)
+ uabd v4.16b, v25.16b, v24.16b // abs(q1 - q0)
+ uabd v5.16b, v26.16b, v24.16b // abs(q2 - q0)
+.if \wd >= 8
+ uabd v6.16b, v20.16b, v23.16b // abs(p3 - p0)
+ uabd v7.16b, v27.16b, v24.16b // abs(q3 - q0)
+.endif
+ umax v2.16b, v2.16b, v3.16b
+ umax v4.16b, v4.16b, v5.16b
+.if \wd >= 8
+ umax v6.16b, v6.16b, v7.16b
+.endif
+ umax v2.16b, v2.16b, v4.16b
+.if \wd >= 8
+ umax v2.16b, v2.16b, v6.16b
+.endif
+
+.if \wd == 16
+ uabd v3.16b, v17.16b, v23.16b // abs(p6 - p0)
+ uabd v4.16b, v18.16b, v23.16b // abs(p5 - p0)
+ uabd v5.16b, v19.16b, v23.16b // abs(p4 - p0)
+.endif
+ cmhs v2.16b, v10.16b, v2.16b // flat8in
+.if \wd == 16
+ uabd v6.16b, v28.16b, v24.16b // abs(q4 - q0)
+ uabd v7.16b, v29.16b, v24.16b // abs(q5 - q0)
+ uabd v8.16b, v30.16b, v24.16b // abs(q6 - q0)
+.endif
+ and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4
+ bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in
+.if \wd == 16
+ umax v3.16b, v3.16b, v4.16b
+ umax v5.16b, v5.16b, v6.16b
+.endif
+ mov x16, v1.d[0]
+ mov x17, v1.d[1]
+.if \wd == 16
+ umax v7.16b, v7.16b, v8.16b
+ umax v3.16b, v3.16b, v5.16b
+ umax v3.16b, v3.16b, v7.16b
+ cmhs v3.16b, v10.16b, v3.16b // flat8out
+.endif
+ adds x16, x16, x17
+.if \wd == 16
+ and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16
+ and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16
+ bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out
+.endif
+ b.eq 1f // skip wd == 4 case
+.endif
+ movi v3.16b, #128
+ eor v2.16b, v22.16b, v3.16b // p1 - 128
+ eor v3.16b, v25.16b, v3.16b // q1 - 128
+ cmhi v0.16b, v0.16b, v12.16b // hev
+ sqsub v2.16b, v2.16b, v3.16b // iclip_diff(p1 - q1)
+ and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1)
+ bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev)
+ usubl v2.8h, v24.8b, v23.8b
+ movi v5.8h, #3
+ usubl2 v3.8h, v24.16b, v23.16b
+ mul v2.8h, v2.8h, v5.8h
+ mul v3.8h, v3.8h, v5.8h
+ movi v6.16b, #4
+ saddw v2.8h, v2.8h, v4.8b
+ saddw2 v3.8h, v3.8h, v4.16b
+ movi v7.16b, #3
+ sqxtn v2.8b, v2.8h // f
+ sqxtn2 v2.16b, v3.8h
+ sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 127)
+ sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127)
+ sshr v4.16b, v4.16b, #3 // f1
+ sshr v5.16b, v5.16b, #3 // f2
+ mov v2.16b, v23.16b // p0
+ mov v3.16b, v24.16b // q0
+ neg v6.16b, v4.16b // -f1
+ srshr v4.16b, v4.16b, #1 // (f1 + 1) >> 1
+ // p0 + f2, q0 - f1
+ usqadd v2.16b, v5.16b // out p0
+ usqadd v3.16b, v6.16b // out q0
+ neg v6.16b, v4.16b // -((f1 + 1) >> 1)
+ bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4)
+ bit v24.16b, v3.16b, v1.16b // if (fm && wd >= 4)
+ mov v2.16b, v22.16b // p1
+ mov v3.16b, v25.16b // q1
+ // p1 + ((f1 + 1) >> 1), q1 - ((f1 + 1) >> 1)
+ usqadd v2.16b, v4.16b // out p1
+ usqadd v3.16b, v6.16b // out q1
+ bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev)
+ bit v25.16b, v3.16b, v0.16b // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+ b.eq 2f // skip if there's no flat8in
+
+ uaddl v0.8h, v21.8b, v21.8b // p2 * 2
+ uaddl2 v1.8h, v21.16b, v21.16b
+ uaddl v2.8h, v21.8b, v22.8b // p2 + p1
+ uaddl2 v3.8h, v21.16b, v22.16b
+ uaddl v4.8h, v22.8b, v23.8b // p1 + p0
+ uaddl2 v5.8h, v22.16b, v23.16b
+ uaddl v6.8h, v23.8b, v24.8b // p0 + q0
+ uaddl2 v7.8h, v23.16b, v24.16b
+ add v8.8h, v0.8h, v2.8h
+ add v9.8h, v1.8h, v3.8h
+ add v10.8h, v4.8h, v6.8h
+ add v11.8h, v5.8h, v7.8h
+ uaddl v12.8h, v24.8b, v25.8b // q0 + q1
+ uaddl2 v13.8h, v24.16b, v25.16b
+ add v8.8h, v8.8h, v10.8h
+ add v9.8h, v9.8h, v11.8h
+ sub v12.8h, v12.8h, v0.8h
+ sub v13.8h, v13.8h, v1.8h
+ uaddl v10.8h, v25.8b, v26.8b // q1 + q2
+ uaddl2 v11.8h, v25.16b, v26.16b
+ rshrn v0.8b, v8.8h, #3 // out p1
+ rshrn2 v0.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v12.8h
+ add v9.8h, v9.8h, v13.8h
+ sub v10.8h, v10.8h, v2.8h
+ sub v11.8h, v11.8h, v3.8h
+ uaddl v12.8h, v26.8b, v26.8b // q2 + q2
+ uaddl2 v13.8h, v26.16b, v26.16b
+ rshrn v1.8b, v8.8h, #3 // out p0
+ rshrn2 v1.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v10.8h
+ add v9.8h, v9.8h, v11.8h
+ sub v12.8h, v12.8h, v4.8h
+ sub v13.8h, v13.8h, v5.8h
+ rshrn v2.8b, v8.8h, #3 // out q0
+ rshrn2 v2.16b, v9.8h, #3
+
+ bit v22.16b, v0.16b, v14.16b // p1 if (flat8in)
+ add v8.8h, v8.8h, v12.8h
+ add v9.8h, v9.8h, v13.8h
+ bit v23.16b, v1.16b, v14.16b // p0 if (flat8in)
+ rshrn v3.8b, v8.8h, #3 // out q1
+ rshrn2 v3.16b, v9.8h, #3
+ bit v24.16b, v2.16b, v14.16b // q0 if (flat8in)
+ bit v25.16b, v3.16b, v14.16b // q1 if (flat8in)
+.elseif \wd >= 8
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+.if \wd == 8
+ b.eq 8f // skip if there's no flat8in
+.else
+ b.eq 2f // skip if there's no flat8in
+.endif
+
+ uaddl v0.8h, v20.8b, v21.8b // p3 + p2
+ uaddl2 v1.8h, v20.16b, v21.16b
+ uaddl v2.8h, v22.8b, v25.8b // p1 + q1
+ uaddl2 v3.8h, v22.16b, v25.16b
+ uaddl v4.8h, v20.8b, v22.8b // p3 + p1
+ uaddl2 v5.8h, v20.16b, v22.16b
+ uaddl v6.8h, v23.8b, v26.8b // p0 + q2
+ uaddl2 v7.8h, v23.16b, v26.16b
+ add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2)
+ add v9.8h, v1.8h, v1.8h
+ uaddw v8.8h, v8.8h, v23.8b // + p0
+ uaddw2 v9.8h, v9.8h, v23.16b
+ uaddw v8.8h, v8.8h, v24.8b // + q0
+ uaddw2 v9.8h, v9.8h, v24.16b
+ add v8.8h, v8.8h, v4.8h
+ add v9.8h, v9.8h, v5.8h // + p3 + p1
+ sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2
+ sub v3.8h, v3.8h, v1.8h
+ sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1
+ sub v7.8h, v7.8h, v5.8h
+ rshrn v10.8b, v8.8h, #3 // out p2
+ rshrn2 v10.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v2.8h
+ add v9.8h, v9.8h, v3.8h
+ uaddl v0.8h, v20.8b, v23.8b // p3 + p0
+ uaddl2 v1.8h, v20.16b, v23.16b
+ uaddl v2.8h, v24.8b, v27.8b // q0 + q3
+ uaddl2 v3.8h, v24.16b, v27.16b
+ rshrn v11.8b, v8.8h, #3 // out p1
+ rshrn2 v11.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v6.8h
+ add v9.8h, v9.8h, v7.8h
+ sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0
+ sub v3.8h, v3.8h, v1.8h
+ uaddl v4.8h, v21.8b, v24.8b // p2 + q0
+ uaddl2 v5.8h, v21.16b, v24.16b
+ uaddl v6.8h, v25.8b, v27.8b // q1 + q3
+ uaddl2 v7.8h, v25.16b, v27.16b
+ rshrn v12.8b, v8.8h, #3 // out p0
+ rshrn2 v12.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v2.8h
+ add v9.8h, v9.8h, v3.8h
+ sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0
+ sub v7.8h, v7.8h, v5.8h
+ uaddl v0.8h, v22.8b, v25.8b // p1 + q1
+ uaddl2 v1.8h, v22.16b, v25.16b
+ uaddl v2.8h, v26.8b, v27.8b // q2 + q3
+ uaddl2 v3.8h, v26.16b, v27.16b
+ rshrn v13.8b, v8.8h, #3 // out q0
+ rshrn2 v13.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v6.8h
+ add v9.8h, v9.8h, v7.8h
+ sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1
+ sub v3.8h, v3.8h, v1.8h
+ rshrn v0.8b, v8.8h, #3 // out q1
+ rshrn2 v0.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v2.8h
+ add v9.8h , v9.8h, v3.8h
+
+ bit v21.16b, v10.16b, v14.16b
+ bit v22.16b, v11.16b, v14.16b
+ bit v23.16b, v12.16b, v14.16b
+ rshrn v1.8b, v8.8h, #3 // out q2
+ rshrn2 v1.16b, v9.8h, #3
+ bit v24.16b, v13.16b, v14.16b
+ bit v25.16b, v0.16b, v14.16b
+ bit v26.16b, v1.16b, v14.16b
+.endif
+2:
+.if \wd == 16
+ mov x16, v15.d[0]
+ mov x17, v15.d[1]
+ adds x16, x16, x17
+ b.ne 1f // check if flat8out is needed
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+ b.eq 8f // if there was no flat8in, just write the inner 4 pixels
+ b 7f // if flat8in was used, write the inner 6 pixels
+1:
+
+ uaddl v2.8h, v17.8b, v17.8b // p6 + p6
+ uaddl2 v3.8h, v17.16b, v17.16b
+ uaddl v4.8h, v17.8b, v18.8b // p6 + p5
+ uaddl2 v5.8h, v17.16b, v18.16b
+ uaddl v6.8h, v17.8b, v19.8b // p6 + p4
+ uaddl2 v7.8h, v17.16b, v19.16b
+ uaddl v8.8h, v17.8b, v20.8b // p6 + p3
+ uaddl2 v9.8h, v17.16b, v20.16b
+ add v12.8h, v2.8h, v4.8h
+ add v13.8h, v3.8h, v5.8h
+ add v10.8h, v6.8h, v8.8h
+ add v11.8h, v7.8h, v9.8h
+ uaddl v6.8h, v17.8b, v21.8b // p6 + p2
+ uaddl2 v7.8h, v17.16b, v21.16b
+ add v12.8h, v12.8h, v10.8h
+ add v13.8h, v13.8h, v11.8h
+ uaddl v8.8h, v17.8b, v22.8b // p6 + p1
+ uaddl2 v9.8h, v17.16b, v22.16b
+ uaddl v10.8h, v18.8b, v23.8b // p5 + p0
+ uaddl2 v11.8h, v18.16b, v23.16b
+ add v6.8h, v6.8h, v8.8h
+ add v7.8h, v7.8h, v9.8h
+ uaddl v8.8h, v19.8b, v24.8b // p4 + q0
+ uaddl2 v9.8h, v19.16b, v24.16b
+ add v12.8h, v12.8h, v6.8h
+ add v13.8h, v13.8h, v7.8h
+ add v10.8h, v10.8h, v8.8h
+ add v11.8h, v11.8h, v9.8h
+ uaddl v6.8h, v20.8b, v25.8b // p3 + q1
+ uaddl2 v7.8h, v20.16b, v25.16b
+ add v12.8h, v12.8h, v10.8h
+ add v13.8h, v13.8h, v11.8h
+ sub v6.8h, v6.8h, v2.8h
+ sub v7.8h, v7.8h, v3.8h
+ uaddl v2.8h, v21.8b, v26.8b // p2 + q2
+ uaddl2 v3.8h, v21.16b, v26.16b
+ rshrn v0.8b, v12.8h, #4 // out p5
+ rshrn2 v0.16b, v13.8h, #4
+ add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1)
+ add v13.8h, v13.8h, v7.8h
+ sub v2.8h, v2.8h, v4.8h
+ sub v3.8h, v3.8h, v5.8h
+ uaddl v4.8h, v22.8b, v27.8b // p1 + q3
+ uaddl2 v5.8h, v22.16b, v27.16b
+ uaddl v6.8h, v17.8b, v19.8b // p6 + p4
+ uaddl2 v7.8h, v17.16b, v19.16b
+ rshrn v1.8b, v12.8h, #4 // out p4
+ rshrn2 v1.16b, v13.8h, #4
+ add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2)
+ add v13.8h, v13.8h, v3.8h
+ sub v4.8h, v4.8h, v6.8h
+ sub v5.8h, v5.8h, v7.8h
+ uaddl v6.8h, v23.8b, v28.8b // p0 + q4
+ uaddl2 v7.8h, v23.16b, v28.16b
+ uaddl v8.8h, v17.8b, v20.8b // p6 + p3
+ uaddl2 v9.8h, v17.16b, v20.16b
+ rshrn v2.8b, v12.8h, #4 // out p3
+ rshrn2 v2.16b, v13.8h, #4
+ add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3)
+ add v13.8h, v13.8h, v5.8h
+ sub v6.8h, v6.8h, v8.8h
+ sub v7.8h, v7.8h, v9.8h
+ uaddl v8.8h, v24.8b, v29.8b // q0 + q5
+ uaddl2 v9.8h, v24.16b, v29.16b
+ uaddl v4.8h, v17.8b, v21.8b // p6 + p2
+ uaddl2 v5.8h, v17.16b, v21.16b
+ rshrn v3.8b, v12.8h, #4 // out p2
+ rshrn2 v3.16b, v13.8h, #4
+ add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4)
+ add v13.8h, v13.8h, v7.8h
+ sub v8.8h, v8.8h, v4.8h
+ sub v9.8h, v9.8h, v5.8h
+ uaddl v6.8h, v25.8b, v30.8b // q1 + q6
+ uaddl2 v7.8h, v25.16b, v30.16b
+ uaddl v10.8h, v17.8b, v22.8b // p6 + p1
+ uaddl2 v11.8h, v17.16b, v22.16b
+ rshrn v4.8b, v12.8h, #4 // out p1
+ rshrn2 v4.16b, v13.8h, #4
+ add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5)
+ add v13.8h, v13.8h, v9.8h
+ sub v6.8h, v6.8h, v10.8h
+ sub v7.8h, v7.8h, v11.8h
+ uaddl v8.8h, v26.8b, v30.8b // q2 + q6
+ uaddl2 v9.8h, v26.16b, v30.16b
+ bif v0.16b, v18.16b, v15.16b // out p5
+ uaddl v10.8h, v18.8b, v23.8b // p5 + p0
+ uaddl2 v11.8h, v18.16b, v23.16b
+ rshrn v5.8b, v12.8h, #4 // out p0
+ rshrn2 v5.16b, v13.8h, #4
+ add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6)
+ add v13.8h, v13.8h, v7.8h
+ sub v8.8h, v8.8h, v10.8h
+ sub v9.8h, v9.8h, v11.8h
+ uaddl v10.8h, v27.8b, v30.8b // q3 + q6
+ uaddl2 v11.8h, v27.16b, v30.16b
+ bif v1.16b, v19.16b, v15.16b // out p4
+ uaddl v18.8h, v19.8b, v24.8b // p4 + q0
+ uaddl2 v19.8h, v19.16b, v24.16b
+ rshrn v6.8b, v12.8h, #4 // out q0
+ rshrn2 v6.16b, v13.8h, #4
+ add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
+ add v13.8h, v13.8h, v9.8h
+ sub v10.8h, v10.8h, v18.8h
+ sub v11.8h, v11.8h, v19.8h
+ uaddl v8.8h, v28.8b, v30.8b // q4 + q6
+ uaddl2 v9.8h, v28.16b, v30.16b
+ bif v2.16b, v20.16b, v15.16b // out p3
+ uaddl v18.8h, v20.8b, v25.8b // p3 + q1
+ uaddl2 v19.8h, v20.16b, v25.16b
+ rshrn v7.8b, v12.8h, #4 // out q1
+ rshrn2 v7.16b, v13.8h, #4
+ add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
+ add v13.8h, v13.8h, v11.8h
+ sub v18.8h, v8.8h, v18.8h
+ sub v19.8h, v9.8h, v19.8h
+ uaddl v10.8h, v29.8b, v30.8b // q5 + q6
+ uaddl2 v11.8h, v29.16b, v30.16b
+ bif v3.16b, v21.16b, v15.16b // out p2
+ uaddl v20.8h, v21.8b, v26.8b // p2 + q2
+ uaddl2 v21.8h, v21.16b, v26.16b
+ rshrn v8.8b, v12.8h, #4 // out q2
+ rshrn2 v8.16b, v13.8h, #4
+ add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6)
+ add v13.8h, v13.8h, v19.8h
+ sub v10.8h, v10.8h, v20.8h
+ sub v11.8h, v11.8h, v21.8h
+ uaddl v18.8h, v30.8b, v30.8b // q6 + q6
+ uaddl2 v19.8h, v30.16b, v30.16b
+ bif v4.16b, v22.16b, v15.16b // out p1
+ uaddl v20.8h, v22.8b, v27.8b // p1 + q3
+ uaddl2 v21.8h, v22.16b, v27.16b
+ rshrn v9.8b, v12.8h, #4 // out q3
+ rshrn2 v9.16b, v13.8h, #4
+ add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
+ add v13.8h, v13.8h, v11.8h
+ sub v18.8h, v18.8h, v20.8h
+ sub v19.8h, v19.8h, v21.8h
+ bif v5.16b, v23.16b, v15.16b // out p0
+ rshrn v10.8b, v12.8h, #4 // out q4
+ rshrn2 v10.16b, v13.8h, #4
+ add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6)
+ add v13.8h, v13.8h, v19.8h
+ rshrn v11.8b, v12.8h, #4 // out q5
+ rshrn2 v11.16b, v13.8h, #4
+ bif v6.16b, v24.16b, v15.16b // out q0
+ bif v7.16b, v25.16b, v15.16b // out q1
+ bif v8.16b, v26.16b, v15.16b // out q2
+ bif v9.16b, v27.16b, v15.16b // out q3
+ bif v10.16b, v28.16b, v15.16b // out q4
+ bif v11.16b, v29.16b, v15.16b // out q5
+.endif
+
+ mov x14, #0
+ ret
+.if \wd == 16
+7:
+ // Return to a shorter epilogue, writing only the inner 6 pixels
+ mov x14, #(1 << 6)
+ ret
+.endif
+.if \wd >= 8
+8:
+ // Return to a shorter epilogue, writing only the inner 4 pixels
+ mov x14, #(1 << 4)
+ ret
+.endif
+endfunc
+.endm
+
+loop_filter 16
+loop_filter 8
+loop_filter 6
+loop_filter 4
+
+.macro lpf_16_wd16
+ bl lpf_16_wd16_neon
+ cbz x14, 1f
+ tbnz x14, #6, 7f
+ tbnz x14, #4, 8f
+ ret x15
+1:
+.endm
+
+.macro lpf_16_wd8
+ bl lpf_16_wd8_neon
+ cbz x14, 1f
+ tbnz x14, #4, 8f
+ ret x15
+1:
+.endm
+
+.macro lpf_16_wd6
+ bl lpf_16_wd6_neon
+ cbz x14, 1f
+ ret x15
+1:
+.endm
+
+.macro lpf_16_wd4
+ bl lpf_16_wd4_neon
+ cbz x14, 1f
+ ret x15
+1:
+.endm
+
+function lpf_v_4_16_neon
+ mov x15, x30
+ sub x16, x0, x1, lsl #1
+ ld1 {v22.16b}, [x16], x1 // p1
+ ld1 {v24.16b}, [x0], x1 // q0
+ ld1 {v23.16b}, [x16], x1 // p0
+ ld1 {v25.16b}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+
+ lpf_16_wd4
+
+ sub x16, x0, x1, lsl #1
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v25.16b}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+ ret x15
+endfunc
+
+function lpf_h_4_16_neon
+ mov x15, x30
+ sub x16, x0, #2
+ add x0, x16, x1, lsl #3
+ ld1 {v22.s}[0], [x16], x1
+ ld1 {v22.s}[2], [x0], x1
+ ld1 {v23.s}[0], [x16], x1
+ ld1 {v23.s}[2], [x0], x1
+ ld1 {v24.s}[0], [x16], x1
+ ld1 {v24.s}[2], [x0], x1
+ ld1 {v25.s}[0], [x16], x1
+ ld1 {v25.s}[2], [x0], x1
+ ld1 {v22.s}[1], [x16], x1
+ ld1 {v22.s}[3], [x0], x1
+ ld1 {v23.s}[1], [x16], x1
+ ld1 {v23.s}[3], [x0], x1
+ ld1 {v24.s}[1], [x16], x1
+ ld1 {v24.s}[3], [x0], x1
+ ld1 {v25.s}[1], [x16], x1
+ ld1 {v25.s}[3], [x0], x1
+ add x0, x0, #2
+
+ transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+
+ lpf_16_wd4
+
+ sub x16, x0, x1, lsl #4
+ sub x16, x16, #2
+ transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v22.s}[0], [x16], x1
+ st1 {v22.s}[2], [x0], x1
+ st1 {v23.s}[0], [x16], x1
+ st1 {v23.s}[2], [x0], x1
+ st1 {v24.s}[0], [x16], x1
+ st1 {v24.s}[2], [x0], x1
+ st1 {v25.s}[0], [x16], x1
+ st1 {v25.s}[2], [x0], x1
+ st1 {v22.s}[1], [x16], x1
+ st1 {v22.s}[3], [x0], x1
+ st1 {v23.s}[1], [x16], x1
+ st1 {v23.s}[3], [x0], x1
+ st1 {v24.s}[1], [x16], x1
+ st1 {v24.s}[3], [x0], x1
+ st1 {v25.s}[1], [x16], x1
+ st1 {v25.s}[3], [x0], x1
+ add x0, x0, #2
+ ret x15
+endfunc
+
+function lpf_v_6_16_neon
+ mov x15, x30
+ sub x16, x0, x1, lsl #1
+ sub x16, x16, x1
+ ld1 {v21.16b}, [x16], x1 // p2
+ ld1 {v24.16b}, [x0], x1 // q0
+ ld1 {v22.16b}, [x16], x1 // p1
+ ld1 {v25.16b}, [x0], x1 // q1
+ ld1 {v23.16b}, [x16], x1 // p0
+ ld1 {v26.16b}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1
+
+ lpf_16_wd6
+
+ sub x16, x0, x1, lsl #1
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v25.16b}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+ ret x15
+endfunc
+
+function lpf_h_6_16_neon
+ mov x15, x30
+ sub x16, x0, #4
+ add x0, x16, x1, lsl #3
+ ld1 {v20.d}[0], [x16], x1
+ ld1 {v20.d}[1], [x0], x1
+ ld1 {v21.d}[0], [x16], x1
+ ld1 {v21.d}[1], [x0], x1
+ ld1 {v22.d}[0], [x16], x1
+ ld1 {v22.d}[1], [x0], x1
+ ld1 {v23.d}[0], [x16], x1
+ ld1 {v23.d}[1], [x0], x1
+ ld1 {v24.d}[0], [x16], x1
+ ld1 {v24.d}[1], [x0], x1
+ ld1 {v25.d}[0], [x16], x1
+ ld1 {v25.d}[1], [x0], x1
+ ld1 {v26.d}[0], [x16], x1
+ ld1 {v26.d}[1], [x0], x1
+ ld1 {v27.d}[0], [x16], x1
+ ld1 {v27.d}[1], [x0], x1
+ add x0, x0, #4
+
+ transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ lpf_16_wd6
+
+ sub x16, x0, x1, lsl #4
+ sub x16, x16, #2
+ transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v22.s}[0], [x16], x1
+ st1 {v22.s}[2], [x0], x1
+ st1 {v23.s}[0], [x16], x1
+ st1 {v23.s}[2], [x0], x1
+ st1 {v24.s}[0], [x16], x1
+ st1 {v24.s}[2], [x0], x1
+ st1 {v25.s}[0], [x16], x1
+ st1 {v25.s}[2], [x0], x1
+ st1 {v22.s}[1], [x16], x1
+ st1 {v22.s}[3], [x0], x1
+ st1 {v23.s}[1], [x16], x1
+ st1 {v23.s}[3], [x0], x1
+ st1 {v24.s}[1], [x16], x1
+ st1 {v24.s}[3], [x0], x1
+ st1 {v25.s}[1], [x16], x1
+ st1 {v25.s}[3], [x0], x1
+ add x0, x0, #2
+ ret x15
+endfunc
+
+function lpf_v_8_16_neon
+ mov x15, x30
+ sub x16, x0, x1, lsl #2
+ ld1 {v20.16b}, [x16], x1 // p3
+ ld1 {v24.16b}, [x0], x1 // q0
+ ld1 {v21.16b}, [x16], x1 // p2
+ ld1 {v25.16b}, [x0], x1 // q1
+ ld1 {v22.16b}, [x16], x1 // p1
+ ld1 {v26.16b}, [x0], x1 // q2
+ ld1 {v23.16b}, [x16], x1 // p0
+ ld1 {v27.16b}, [x0], x1 // q3
+ sub x0, x0, x1, lsl #2
+
+ lpf_16_wd8
+
+ sub x16, x0, x1, lsl #1
+ sub x16, x16, x1
+ st1 {v21.16b}, [x16], x1 // p2
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v25.16b}, [x0], x1 // q1
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v26.16b}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1
+ ret x15
+
+8:
+ sub x16, x0, x1, lsl #1
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v25.16b}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+ ret x15
+endfunc
+
+function lpf_h_8_16_neon
+ mov x15, x30
+ sub x16, x0, #4
+ add x0, x16, x1, lsl #3
+ ld1 {v20.d}[0], [x16], x1
+ ld1 {v20.d}[1], [x0], x1
+ ld1 {v21.d}[0], [x16], x1
+ ld1 {v21.d}[1], [x0], x1
+ ld1 {v22.d}[0], [x16], x1
+ ld1 {v22.d}[1], [x0], x1
+ ld1 {v23.d}[0], [x16], x1
+ ld1 {v23.d}[1], [x0], x1
+ ld1 {v24.d}[0], [x16], x1
+ ld1 {v24.d}[1], [x0], x1
+ ld1 {v25.d}[0], [x16], x1
+ ld1 {v25.d}[1], [x0], x1
+ ld1 {v26.d}[0], [x16], x1
+ ld1 {v26.d}[1], [x0], x1
+ ld1 {v27.d}[0], [x16], x1
+ ld1 {v27.d}[1], [x0], x1
+ add x0, x0, #4
+
+ transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ lpf_16_wd8
+
+ sub x16, x0, x1, lsl #4
+ sub x16, x16, #4
+ transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v20.d}[0], [x16], x1
+ st1 {v20.d}[1], [x0], x1
+ st1 {v21.d}[0], [x16], x1
+ st1 {v21.d}[1], [x0], x1
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ st1 {v26.d}[0], [x16], x1
+ st1 {v26.d}[1], [x0], x1
+ st1 {v27.d}[0], [x16], x1
+ st1 {v27.d}[1], [x0], x1
+ add x0, x0, #4
+ ret x15
+8:
+ sub x16, x0, x1, lsl #4
+ sub x16, x16, #2
+ transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v22.s}[0], [x16], x1
+ st1 {v22.s}[2], [x0], x1
+ st1 {v23.s}[0], [x16], x1
+ st1 {v23.s}[2], [x0], x1
+ st1 {v24.s}[0], [x16], x1
+ st1 {v24.s}[2], [x0], x1
+ st1 {v25.s}[0], [x16], x1
+ st1 {v25.s}[2], [x0], x1
+ st1 {v22.s}[1], [x16], x1
+ st1 {v22.s}[3], [x0], x1
+ st1 {v23.s}[1], [x16], x1
+ st1 {v23.s}[3], [x0], x1
+ st1 {v24.s}[1], [x16], x1
+ st1 {v24.s}[3], [x0], x1
+ st1 {v25.s}[1], [x16], x1
+ st1 {v25.s}[3], [x0], x1
+ add x0, x0, #2
+ ret x15
+endfunc
+
+function lpf_v_16_16_neon
+ mov x15, x30
+
+ sub x16, x0, x1, lsl #3
+ add x16, x16, x1
+ ld1 {v17.16b}, [x16], x1 // p6
+ ld1 {v24.16b}, [x0], x1 // q0
+ ld1 {v18.16b}, [x16], x1 // p5
+ ld1 {v25.16b}, [x0], x1 // q1
+ ld1 {v19.16b}, [x16], x1 // p4
+ ld1 {v26.16b}, [x0], x1 // q2
+ ld1 {v20.16b}, [x16], x1 // p3
+ ld1 {v27.16b}, [x0], x1 // q3
+ ld1 {v21.16b}, [x16], x1 // p2
+ ld1 {v28.16b}, [x0], x1 // q4
+ ld1 {v22.16b}, [x16], x1 // p1
+ ld1 {v29.16b}, [x0], x1 // q5
+ ld1 {v23.16b}, [x16], x1 // p0
+ ld1 {v30.16b}, [x0], x1 // q6
+ sub x0, x0, x1, lsl #3
+ add x0, x0, x1
+
+ lpf_16_wd16
+
+ sub x16, x0, x1, lsl #2
+ sub x16, x16, x1, lsl #1
+ st1 {v0.16b}, [x16], x1 // p5
+ st1 {v6.16b}, [x0], x1 // q0
+ st1 {v1.16b}, [x16], x1 // p4
+ st1 {v7.16b}, [x0], x1 // q1
+ st1 {v2.16b}, [x16], x1 // p3
+ st1 {v8.16b}, [x0], x1 // q2
+ st1 {v3.16b}, [x16], x1 // p2
+ st1 {v9.16b}, [x0], x1 // q3
+ st1 {v4.16b}, [x16], x1 // p1
+ st1 {v10.16b}, [x0], x1 // q4
+ st1 {v5.16b}, [x16], x1 // p0
+ st1 {v11.16b}, [x0], x1 // q5
+ sub x0, x0, x1, lsl #2
+ sub x0, x0, x1, lsl #1
+ ret x15
+7:
+ sub x16, x0, x1
+ sub x16, x16, x1, lsl #1
+ st1 {v21.16b}, [x16], x1 // p2
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v25.16b}, [x0], x1 // q1
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v26.16b}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1
+ ret x15
+
+8:
+ sub x16, x0, x1, lsl #1
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v25.16b}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+ ret x15
+endfunc
+
+function lpf_h_16_16_neon
+ mov x15, x30
+ sub x16, x0, #8
+ ld1 {v16.d}[0], [x16], x1
+ ld1 {v24.d}[0], [x0], x1
+ ld1 {v17.d}[0], [x16], x1
+ ld1 {v25.d}[0], [x0], x1
+ ld1 {v18.d}[0], [x16], x1
+ ld1 {v26.d}[0], [x0], x1
+ ld1 {v19.d}[0], [x16], x1
+ ld1 {v27.d}[0], [x0], x1
+ ld1 {v20.d}[0], [x16], x1
+ ld1 {v28.d}[0], [x0], x1
+ ld1 {v21.d}[0], [x16], x1
+ ld1 {v29.d}[0], [x0], x1
+ ld1 {v22.d}[0], [x16], x1
+ ld1 {v30.d}[0], [x0], x1
+ ld1 {v23.d}[0], [x16], x1
+ ld1 {v31.d}[0], [x0], x1
+ ld1 {v16.d}[1], [x16], x1
+ ld1 {v24.d}[1], [x0], x1
+ ld1 {v17.d}[1], [x16], x1
+ ld1 {v25.d}[1], [x0], x1
+ ld1 {v18.d}[1], [x16], x1
+ ld1 {v26.d}[1], [x0], x1
+ ld1 {v19.d}[1], [x16], x1
+ ld1 {v27.d}[1], [x0], x1
+ ld1 {v20.d}[1], [x16], x1
+ ld1 {v28.d}[1], [x0], x1
+ ld1 {v21.d}[1], [x16], x1
+ ld1 {v29.d}[1], [x0], x1
+ ld1 {v22.d}[1], [x16], x1
+ ld1 {v30.d}[1], [x0], x1
+ ld1 {v23.d}[1], [x16], x1
+ ld1 {v31.d}[1], [x0], x1
+
+ transpose_8x16b v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
+ transpose_8x16b v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
+
+ lpf_16_wd16
+
+ sub x0, x0, x1, lsl #4
+ sub x16, x0, #8
+
+ transpose_8x16b v16, v17, v0, v1, v2, v3, v4, v5, v18, v19
+ transpose_8x16b v6, v7, v8, v9, v10, v11, v30, v31, v18, v19
+
+ st1 {v16.d}[0], [x16], x1
+ st1 {v6.d}[0], [x0], x1
+ st1 {v17.d}[0], [x16], x1
+ st1 {v7.d}[0], [x0], x1
+ st1 {v0.d}[0], [x16], x1
+ st1 {v8.d}[0], [x0], x1
+ st1 {v1.d}[0], [x16], x1
+ st1 {v9.d}[0], [x0], x1
+ st1 {v2.d}[0], [x16], x1
+ st1 {v10.d}[0], [x0], x1
+ st1 {v3.d}[0], [x16], x1
+ st1 {v11.d}[0], [x0], x1
+ st1 {v4.d}[0], [x16], x1
+ st1 {v30.d}[0], [x0], x1
+ st1 {v5.d}[0], [x16], x1
+ st1 {v31.d}[0], [x0], x1
+ st1 {v16.d}[1], [x16], x1
+ st1 {v6.d}[1], [x0], x1
+ st1 {v17.d}[1], [x16], x1
+ st1 {v7.d}[1], [x0], x1
+ st1 {v0.d}[1], [x16], x1
+ st1 {v8.d}[1], [x0], x1
+ st1 {v1.d}[1], [x16], x1
+ st1 {v9.d}[1], [x0], x1
+ st1 {v2.d}[1], [x16], x1
+ st1 {v10.d}[1], [x0], x1
+ st1 {v3.d}[1], [x16], x1
+ st1 {v11.d}[1], [x0], x1
+ st1 {v4.d}[1], [x16], x1
+ st1 {v30.d}[1], [x0], x1
+ st1 {v5.d}[1], [x16], x1
+ st1 {v31.d}[1], [x0], x1
+ ret x15
+
+7:
+ sub x16, x0, x1, lsl #4
+ sub x16, x16, #4
+ transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v20.d}[0], [x16], x1
+ st1 {v20.d}[1], [x0], x1
+ st1 {v21.d}[0], [x16], x1
+ st1 {v21.d}[1], [x0], x1
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ st1 {v26.d}[0], [x16], x1
+ st1 {v26.d}[1], [x0], x1
+ st1 {v27.d}[0], [x16], x1
+ st1 {v27.d}[1], [x0], x1
+ add x0, x0, #4
+ ret x15
+8:
+ sub x16, x0, x1, lsl #4
+ sub x16, x16, #2
+ transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v22.s}[0], [x16], x1
+ st1 {v22.s}[2], [x0], x1
+ st1 {v23.s}[0], [x16], x1
+ st1 {v23.s}[2], [x0], x1
+ st1 {v24.s}[0], [x16], x1
+ st1 {v24.s}[2], [x0], x1
+ st1 {v25.s}[0], [x16], x1
+ st1 {v25.s}[2], [x0], x1
+ st1 {v22.s}[1], [x16], x1
+ st1 {v22.s}[3], [x0], x1
+ st1 {v23.s}[1], [x16], x1
+ st1 {v23.s}[3], [x0], x1
+ st1 {v24.s}[1], [x16], x1
+ st1 {v24.s}[3], [x0], x1
+ st1 {v25.s}[1], [x16], x1
+ st1 {v25.s}[3], [x0], x1
+ add x0, x0, #2
+ ret x15
+endfunc
+
+// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint32_t *const vmask,
+// const uint8_t (*l)[4], ptrdiff_t b4_stride,
+// const Av1FilterLUT *lut, const int w)
+
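+// Each pass of the loop below covers a 16-pixel stripe (four 4-pixel blocks):
+// the low 4 bits of vmask[2]/[1]/[0] pick the widest filter needed for the
+// stripe (wd16, wd8/wd6 or wd4), the per-block masks in v13-v15 refine which
+// blocks each width actually applies to, and the masks are then shifted right
+// by 4 before advancing to the next stripe.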
+.macro lpf_func dir, type
+function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
+ mov x11, x30
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+ ldp w6, w7, [x2] // vmask[0], vmask[1]
+.ifc \type, y
+ ldr w2, [x2, #8] // vmask[2]
+.endif
+ add x5, x5, #128 // Move to sharp part of lut
+.ifc \type, y
+ orr w7, w7, w2 // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+ sub x4, x3, x4, lsl #2
+.else
+ sub x3, x3, #4
+ lsl x4, x4, #2
+.endif
+ orr w6, w6, w7 // vmask[0] |= vmask[1]
+
+1:
+ tst w6, #0x0f
+.ifc \dir, v
+ ld1 {v0.16b}, [x4], #16
+ ld1 {v1.16b}, [x3], #16
+.else
+ ld2 {v0.s,v1.s}[0], [x3], x4
+ ld2 {v0.s,v1.s}[1], [x3], x4
+ ld2 {v0.s,v1.s}[2], [x3], x4
+ ld2 {v0.s,v1.s}[3], [x3], x4
+.endif
+ b.eq 7f // if (!(vm & bits)) continue;
+
+ ld1r {v5.16b}, [x5] // sharp[0]
+ add x5, x5, #8
+ movi v2.4s, #0xff
+ dup v13.4s, w6 // vmask[0]
+
+ and v0.16b, v0.16b, v2.16b // Keep only lowest byte in each 32 bit word
+ and v1.16b, v1.16b, v2.16b
+ cmtst v3.16b, v1.16b, v2.16b // Check for nonzero values in l[0][0]
+ movi v4.16b, #1
+ ld1r {v6.16b}, [x5] // sharp[1]
+ sub x5, x5, #8
+ bif v1.16b, v0.16b, v3.16b // if (!l[0][0]) L = l[offset][0]
+ cmtst v2.4s, v1.4s, v2.4s // L != 0
+ mul v1.4s, v1.4s, v4.4s // L, replicated into every byte of each lane
+.ifc \type, y
+ dup v15.4s, w2 // vmask[2]
+.endif
+ dup v14.4s, w7 // vmask[1]
+ mov x16, v2.d[0]
+ mov x17, v2.d[1]
+ adds x16, x16, x17
+ b.eq 7f // if (!L) continue;
+ neg v5.16b, v5.16b // -sharp[0]
+ movrel x16, word_1248
+ ushr v12.16b, v1.16b, #4 // H
+ ld1 {v16.4s}, [x16]
+ sshl v3.16b, v1.16b, v5.16b // L >> sharp[0]
+.ifc \type, y
+ cmtst v15.4s, v15.4s, v16.4s // if (vmask[2] & bits)
+.endif
+ movi v7.16b, #2
+ umin v3.16b, v3.16b, v6.16b // imin(L >> sharp[0], sharp[1])
+ add v0.16b, v1.16b, v7.16b // L + 2
+ umax v11.16b, v3.16b, v4.16b // imax(imin(), 1) = limit = I
+ add v0.16b, v0.16b, v0.16b // 2*(L + 2)
+ cmtst v14.4s, v14.4s, v16.4s // if (vmask[1] & bits)
+ add v10.16b, v0.16b, v11.16b // 2*(L + 2) + limit = E
+ cmtst v13.4s, v13.4s, v16.4s // if (vmask[0] & bits)
+ and v13.16b, v13.16b, v2.16b // vmask[0] &= L != 0
+
+.ifc \type, y
+ tst w2, #0x0f
+ b.eq 2f
+ // wd16
+ bl lpf_\dir\()_16_16_neon
+ b 8f
+2:
+.endif
+ tst w7, #0x0f
+ b.eq 3f
+.ifc \type, y
+ // wd8
+ bl lpf_\dir\()_8_16_neon
+.else
+ // wd6
+ bl lpf_\dir\()_6_16_neon
+.endif
+ b 8f
+3:
+ // wd4
+ bl lpf_\dir\()_4_16_neon
+.ifc \dir, h
+ b 8f
+7:
+ // For dir h, the functions above increment x0.
+ // If the whole function is skipped, increment it here instead.
+ add x0, x0, x1, lsl #4
+.else
+7:
+.endif
+8:
+ lsr w6, w6, #4 // vmask[0] >>= 4
+ lsr w7, w7, #4 // vmask[1] >>= 4
+.ifc \type, y
+ lsr w2, w2, #4 // vmask[2] >>= 4
+.endif
+.ifc \dir, v
+ add x0, x0, #16
+.else
+ // For dir h, x0 is returned incremented
+.endif
+ cbnz w6, 1b
+
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret x11
+endfunc
+.endm
+
+lpf_func v, y
+lpf_func h, y
+lpf_func v, uv
+lpf_func h, uv
+
+const word_1248
+ .word 1, 2, 4, 8
+endconst
diff --git a/third_party/dav1d/src/arm/64/loopfilter16.S b/third_party/dav1d/src/arm/64/loopfilter16.S
new file mode 100644
index 0000000000..d181a3e623
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/loopfilter16.S
@@ -0,0 +1,925 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// depending on how many pixels need to be stored, returns:
+// x14 = (1 << 0) : 0 pixels
+// x14 = (1 << 4) : inner 4 pixels
+// x14 = (1 << 6) : inner 6 pixels
+// x14 = 0 : all pixels
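+// (These values are set at the tail of the loop_filter macro below and are
+// consumed with cbz/tbnz by the lpf_8_wd* wrapper macros further down.)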
+.macro loop_filter wd
+function lpf_8_wd\wd\()_neon
+ uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0)
+ uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0)
+ uabd v2.8h, v23.8h, v24.8h // abs(p0 - q0)
+ uabd v3.8h, v22.8h, v25.8h // abs(p1 - q1)
+.if \wd >= 6
+ uabd v4.8h, v21.8h, v22.8h // abs(p2 - p1)
+ uabd v5.8h, v26.8h, v25.8h // abs(q2 - q1)
+.endif
+.if \wd >= 8
+ uabd v6.8h, v20.8h, v21.8h // abs(p3 - p2)
+ uabd v7.8h, v27.8h, v26.8h // abs(q3 - q2)
+.endif
+.if \wd >= 6
+ umax v4.8h, v4.8h, v5.8h
+.endif
+ uqadd v2.8h, v2.8h, v2.8h // abs(p0 - q0) * 2
+.if \wd >= 8
+ umax v6.8h, v6.8h, v7.8h
+.endif
+ ushr v3.8h, v3.8h, #1
+.if \wd >= 8
+ umax v4.8h, v4.8h, v6.8h
+.endif
+.if \wd >= 6
+ and v4.16b, v4.16b, v14.16b
+.endif
+ umax v0.8h, v0.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0))
+ uqadd v2.8h, v2.8h, v3.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+.if \wd >= 6
+ umax v4.8h, v0.8h, v4.8h
+ cmhs v1.8h, v11.8h, v4.8h // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I
+.else
+ cmhs v1.8h, v11.8h, v0.8h // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+ cmhs v2.8h, v10.8h, v2.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
+ and v1.16b, v1.16b, v2.16b // fm
+ and v1.16b, v1.16b, v13.16b // fm && wd >= 4
+.if \wd >= 6
+ and v14.16b, v14.16b, v1.16b // fm && wd > 4
+.endif
+.if \wd >= 16
+ and v15.16b, v15.16b, v1.16b // fm && wd == 16
+.endif
+
+ mov x16, v1.d[0]
+ mov x17, v1.d[1]
+ adds x16, x16, x17
+ b.ne 9f // if (!fm || wd < 4) return;
+ mov x14, #(1 << 0)
+ ret
+9:
+.if \wd >= 6
+ movi v10.8h, #1
+ uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
+ uabd v3.8h, v22.8h, v23.8h // abs(p1 - p0)
+ uabd v4.8h, v25.8h, v24.8h // abs(q1 - q0)
+ uabd v5.8h, v26.8h, v24.8h // abs(q2 - q0)
+ dup v9.8h, w9 // bitdepth_min_8
+.if \wd >= 8
+ uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0)
+ uabd v7.8h, v27.8h, v24.8h // abs(q3 - q0)
+.endif
+ umax v2.8h, v2.8h, v3.8h
+ umax v4.8h, v4.8h, v5.8h
+.if \wd >= 8
+ umax v6.8h, v6.8h, v7.8h
+.endif
+ umax v2.8h, v2.8h, v4.8h
+ ushl v10.8h, v10.8h, v9.8h // F = 1 << bitdepth_min_8
+.if \wd >= 8
+ umax v2.8h, v2.8h, v6.8h
+.endif
+
+.if \wd == 16
+ uabd v3.8h, v17.8h, v23.8h // abs(p6 - p0)
+ uabd v4.8h, v18.8h, v23.8h // abs(p5 - p0)
+ uabd v5.8h, v19.8h, v23.8h // abs(p4 - p0)
+.endif
+ cmhs v2.8h, v10.8h, v2.8h // flat8in
+.if \wd == 16
+ uabd v6.8h, v28.8h, v24.8h // abs(q4 - q0)
+ uabd v7.8h, v29.8h, v24.8h // abs(q5 - q0)
+ uabd v8.8h, v30.8h, v24.8h // abs(q6 - q0)
+.endif
+ and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4
+ bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in
+.if \wd == 16
+ umax v3.8h, v3.8h, v4.8h
+ umax v5.8h, v5.8h, v6.8h
+.endif
+ mov x16, v1.d[0]
+ mov x17, v1.d[1]
+.if \wd == 16
+ umax v7.8h, v7.8h, v8.8h
+ umax v3.8h, v3.8h, v5.8h
+ umax v3.8h, v3.8h, v7.8h
+ cmhs v3.8h, v10.8h, v3.8h // flat8out
+.endif
+ adds x16, x16, x17
+.if \wd == 16
+ and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16
+ and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16
+ bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out
+.endif
+ b.eq 1f // skip wd == 4 case
+.endif
+
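+ // Roughly, in C-like pseudocode (an illustrative restatement of the code
+ // below; it is applied under the fm mask, minus the flat8in lanes for wd >= 6):
+ //   hev = max(abs(p1 - p0), abs(q1 - q0)) > H
+ //   f = iclip_diff(3 * (q0 - p0) + (hev ? iclip_diff(p1 - q1) : 0))
+ //   f1 = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3
+ //   f2 = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3
+ //   p0 = iclip_pixel(p0 + f2); q0 = iclip_pixel(q0 - f1)
+ //   if (!hev) { f = (f1 + 1) >> 1;
+ //               p1 = iclip_pixel(p1 + f); q1 = iclip_pixel(q1 - f); }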
+ dup v3.8h, w8 // bitdepth_max
+ sub v2.8h, v22.8h, v25.8h // p1 - q1
+ ushr v3.8h, v3.8h, #1 // 128 << bitdepth_min_8 - 1
+ cmhi v0.8h, v0.8h, v12.8h // hev
+ not v9.16b, v3.16b // - 128 * (1 << bitdepth_min_8)
+ smin v2.8h, v2.8h, v3.8h // iclip_diff(p1 - q1)
+ smax v2.8h, v2.8h, v9.8h // iclip_diff(p1 - q1)
+ and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1)
+ sub v2.8h, v24.8h, v23.8h
+ movi v5.8h, #3
+ bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev)
+ mul v2.8h, v2.8h, v5.8h
+ movi v6.8h, #4
+ add v2.8h, v2.8h, v4.8h
+ smin v2.8h, v2.8h, v3.8h // f = iclip_diff()
+ smax v2.8h, v2.8h, v9.8h // f = iclip_diff()
+ sqadd v4.8h, v6.8h, v2.8h // f + 4
+ sqadd v5.8h, v5.8h, v2.8h // f + 3
+ smin v4.8h, v4.8h, v3.8h // imin(f + 4, 128 << bitdepth_min_8 - 1)
+ smin v5.8h, v5.8h, v3.8h // imin(f + 3, 128 << bitdepth_min_8 - 1)
+ sshr v4.8h, v4.8h, #3 // f1
+ sshr v5.8h, v5.8h, #3 // f2
+ movi v9.8h, #0
+ dup v3.8h, w8 // bitdepth_max
+ sqadd v2.8h, v23.8h, v5.8h // p0 + f2
+ sqsub v6.8h, v24.8h, v4.8h // q0 - f1
+ srshr v4.8h, v4.8h, #1 // (f1 + 1) >> 1
+ smin v2.8h, v2.8h, v3.8h // out p0 = iclip_pixel()
+ smin v6.8h, v6.8h, v3.8h // out q0 = iclip_pixel()
+ smax v2.8h, v2.8h, v9.8h // out p0 = iclip_pixel()
+ smax v6.8h, v6.8h, v9.8h // out q0 = iclip_pixel()
+ bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4)
+ bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4)
+ sqadd v2.8h, v22.8h, v4.8h // p1 + f
+ sqsub v6.8h, v25.8h, v4.8h // q1 - f
+ smin v2.8h, v2.8h, v3.8h // out p1 = iclip_pixel()
+ smin v6.8h, v6.8h, v3.8h // out q1 = iclip_pixel()
+ smax v2.8h, v2.8h, v9.8h // out p1 = iclip_pixel()
+ smax v6.8h, v6.8h, v9.8h // out q1 = iclip_pixel()
+ bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev)
+ bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+ b.eq 2f // skip if there's no flat8in
+
+ add v0.8h, v21.8h, v21.8h // p2 * 2
+ add v2.8h, v21.8h, v22.8h // p2 + p1
+ add v4.8h, v22.8h, v23.8h // p1 + p0
+ add v6.8h, v23.8h, v24.8h // p0 + q0
+ add v8.8h, v0.8h, v2.8h
+ add v10.8h, v4.8h, v6.8h
+ add v12.8h, v24.8h, v25.8h // q0 + q1
+ add v8.8h, v8.8h, v10.8h
+ sub v12.8h, v12.8h, v0.8h
+ add v10.8h, v25.8h, v26.8h // q1 + q2
+ urshr v0.8h, v8.8h, #3 // out p1
+
+ add v8.8h, v8.8h, v12.8h
+ sub v10.8h, v10.8h, v2.8h
+ add v12.8h, v26.8h, v26.8h // q2 + q2
+ urshr v1.8h, v8.8h, #3 // out p0
+
+ add v8.8h, v8.8h, v10.8h
+ sub v12.8h, v12.8h, v4.8h
+ urshr v2.8h, v8.8h, #3 // out q0
+
+ bit v22.16b, v0.16b, v14.16b // p1 if (flat8in)
+ add v8.8h, v8.8h, v12.8h
+ bit v23.16b, v1.16b, v14.16b // p0 if (flat8in)
+ urshr v3.8h, v8.8h, #3 // out q1
+ bit v24.16b, v2.16b, v14.16b // q0 if (flat8in)
+ bit v25.16b, v3.16b, v14.16b // q1 if (flat8in)
+.elseif \wd >= 8
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+.if \wd == 8
+ b.eq 8f // skip if there's no flat8in
+.else
+ b.eq 2f // skip if there's no flat8in
+.endif
+
+ add v0.8h, v20.8h, v21.8h // p3 + p2
+ add v2.8h, v22.8h, v25.8h // p1 + q1
+ add v4.8h, v20.8h, v22.8h // p3 + p1
+ add v6.8h, v23.8h, v26.8h // p0 + q2
+ add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2)
+ add v9.8h, v23.8h, v24.8h // p0 + q0
+ add v8.8h, v8.8h, v4.8h // + p3 + p1
+ sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2
+ add v8.8h, v8.8h, v9.8h // + p0 + q0
+ sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1
+ urshr v10.8h, v8.8h, #3 // out p2
+
+ add v8.8h, v8.8h, v2.8h
+ add v0.8h, v20.8h, v23.8h // p3 + p0
+ add v2.8h, v24.8h, v27.8h // q0 + q3
+ urshr v11.8h, v8.8h, #3 // out p1
+
+ add v8.8h, v8.8h, v6.8h
+ sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0
+ add v4.8h, v21.8h, v24.8h // p2 + q0
+ add v6.8h, v25.8h, v27.8h // q1 + q3
+ urshr v12.8h, v8.8h, #3 // out p0
+
+ add v8.8h, v8.8h, v2.8h
+ sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0
+ add v0.8h, v22.8h, v25.8h // p1 + q1
+ add v2.8h, v26.8h, v27.8h // q2 + q3
+ urshr v13.8h, v8.8h, #3 // out q0
+
+ add v8.8h, v8.8h, v6.8h
+ sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1
+ urshr v0.8h, v8.8h, #3 // out q1
+
+ add v8.8h, v8.8h, v2.8h
+
+ bit v21.16b, v10.16b, v14.16b
+ bit v22.16b, v11.16b, v14.16b
+ bit v23.16b, v12.16b, v14.16b
+ urshr v1.8h, v8.8h, #3 // out q2
+ bit v24.16b, v13.16b, v14.16b
+ bit v25.16b, v0.16b, v14.16b
+ bit v26.16b, v1.16b, v14.16b
+.endif
+2:
+.if \wd == 16
+ mov x16, v15.d[0]
+ mov x17, v15.d[1]
+ adds x16, x16, x17
+ b.ne 1f // check if flat8out is needed
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+ b.eq 8f // if there was no flat8in, just write the inner 4 pixels
+ b 7f // if flat8in was used, write the inner 6 pixels
+1:
+
+ add v2.8h, v17.8h, v17.8h // p6 + p6
+ add v4.8h, v17.8h, v18.8h // p6 + p5
+ add v6.8h, v17.8h, v19.8h // p6 + p4
+ add v8.8h, v17.8h, v20.8h // p6 + p3
+ add v12.8h, v2.8h, v4.8h
+ add v10.8h, v6.8h, v8.8h
+ add v6.8h, v17.8h, v21.8h // p6 + p2
+ add v12.8h, v12.8h, v10.8h
+ add v8.8h, v17.8h, v22.8h // p6 + p1
+ add v10.8h, v18.8h, v23.8h // p5 + p0
+ add v6.8h, v6.8h, v8.8h
+ add v8.8h, v19.8h, v24.8h // p4 + q0
+ add v12.8h, v12.8h, v6.8h
+ add v10.8h, v10.8h, v8.8h
+ add v6.8h, v20.8h, v25.8h // p3 + q1
+ add v12.8h, v12.8h, v10.8h
+ sub v6.8h, v6.8h, v2.8h
+ add v2.8h, v21.8h, v26.8h // p2 + q2
+ urshr v0.8h, v12.8h, #4 // out p5
+ add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1)
+ sub v2.8h, v2.8h, v4.8h
+ add v4.8h, v22.8h, v27.8h // p1 + q3
+ add v6.8h, v17.8h, v19.8h // p6 + p4
+ urshr v1.8h, v12.8h, #4 // out p4
+ add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2)
+ sub v4.8h, v4.8h, v6.8h
+ add v6.8h, v23.8h, v28.8h // p0 + q4
+ add v8.8h, v17.8h, v20.8h // p6 + p3
+ urshr v2.8h, v12.8h, #4 // out p3
+ add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3)
+ sub v6.8h, v6.8h, v8.8h
+ add v8.8h, v24.8h, v29.8h // q0 + q5
+ add v4.8h, v17.8h, v21.8h // p6 + p2
+ urshr v3.8h, v12.8h, #4 // out p2
+ add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4)
+ sub v8.8h, v8.8h, v4.8h
+ add v6.8h, v25.8h, v30.8h // q1 + q6
+ add v10.8h, v17.8h, v22.8h // p6 + p1
+ urshr v4.8h, v12.8h, #4 // out p1
+ add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5)
+ sub v6.8h, v6.8h, v10.8h
+ add v8.8h, v26.8h, v30.8h // q2 + q6
+ bif v0.16b, v18.16b, v15.16b // out p5
+ add v10.8h, v18.8h, v23.8h // p5 + p0
+ urshr v5.8h, v12.8h, #4 // out p0
+ add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6)
+ sub v8.8h, v8.8h, v10.8h
+ add v10.8h, v27.8h, v30.8h // q3 + q6
+ bif v1.16b, v19.16b, v15.16b // out p4
+ add v18.8h, v19.8h, v24.8h // p4 + q0
+ urshr v6.8h, v12.8h, #4 // out q0
+ add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
+ sub v10.8h, v10.8h, v18.8h
+ add v8.8h, v28.8h, v30.8h // q4 + q6
+ bif v2.16b, v20.16b, v15.16b // out p3
+ add v18.8h, v20.8h, v25.8h // p3 + q1
+ urshr v7.8h, v12.8h, #4 // out q1
+ add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
+ sub v18.8h, v8.8h, v18.8h
+ add v10.8h, v29.8h, v30.8h // q5 + q6
+ bif v3.16b, v21.16b, v15.16b // out p2
+ add v20.8h, v21.8h, v26.8h // p2 + q2
+ urshr v8.8h, v12.8h, #4 // out q2
+ add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6)
+ sub v10.8h, v10.8h, v20.8h
+ add v18.8h, v30.8h, v30.8h // q6 + q6
+ bif v4.16b, v22.16b, v15.16b // out p1
+ add v20.8h, v22.8h, v27.8h // p1 + q3
+ urshr v9.8h, v12.8h, #4 // out q3
+ add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
+ sub v18.8h, v18.8h, v20.8h
+ bif v5.16b, v23.16b, v15.16b // out p0
+ urshr v10.8h, v12.8h, #4 // out q4
+ add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6)
+ urshr v11.8h, v12.8h, #4 // out q5
+ bif v6.16b, v24.16b, v15.16b // out q0
+ bif v7.16b, v25.16b, v15.16b // out q1
+ bif v8.16b, v26.16b, v15.16b // out q2
+ bif v9.16b, v27.16b, v15.16b // out q3
+ bif v10.16b, v28.16b, v15.16b // out q4
+ bif v11.16b, v29.16b, v15.16b // out q5
+.endif
+
+ mov x14, #0
+ ret
+.if \wd == 16
+7:
+ // Return to a shorter epilogue, writing only the inner 6 pixels
+ mov x14, #(1 << 6)
+ ret
+.endif
+.if \wd >= 8
+8:
+ // Return to a shorter epilogue, writing only the inner 4 pixels
+ mov x14, #(1 << 4)
+ ret
+.endif
+endfunc
+.endm
+
+loop_filter 16
+loop_filter 8
+loop_filter 6
+loop_filter 4
+
+.macro lpf_8_wd16
+ bl lpf_8_wd16_neon
+ cbz x14, 1f
+ tbnz x14, #6, 7f
+ tbnz x14, #4, 8f
+ ret x15
+1:
+.endm
+
+.macro lpf_8_wd8
+ bl lpf_8_wd8_neon
+ cbz x14, 1f
+ tbnz x14, #4, 8f
+ ret x15
+1:
+.endm
+
+.macro lpf_8_wd6
+ bl lpf_8_wd6_neon
+ cbz x14, 1f
+ ret x15
+1:
+.endm
+
+.macro lpf_8_wd4
+ bl lpf_8_wd4_neon
+ cbz x14, 1f
+ ret x15
+1:
+.endm
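+
+// The tbnz branches above land on local 7: and 8: labels in the function that
+// expands the macro; the wd8/wd16 functions below define those labels as
+// shorter store epilogues (7: writes the inner 6 pixels, 8: the inner 4).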
+
+function lpf_v_4_8_neon
+ mov x15, x30
+ sub x16, x0, x1, lsl #1
+ ld1 {v22.8h}, [x16], x1 // p1
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v23.8h}, [x16], x1 // p0
+ ld1 {v25.8h}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+
+ lpf_8_wd4
+
+ sub x16, x0, x1, lsl #1
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v25.8h}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+ ret x15
+endfunc
+
+function lpf_h_4_8_neon
+ mov x15, x30
+ sub x16, x0, #4
+ add x0, x16, x1, lsl #2
+ ld1 {v22.d}[0], [x16], x1
+ ld1 {v22.d}[1], [x0], x1
+ ld1 {v23.d}[0], [x16], x1
+ ld1 {v23.d}[1], [x0], x1
+ ld1 {v24.d}[0], [x16], x1
+ ld1 {v24.d}[1], [x0], x1
+ ld1 {v25.d}[0], [x16], x1
+ ld1 {v25.d}[1], [x0], x1
+ add x0, x0, #4
+
+ transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
+
+ lpf_8_wd4
+
+ sub x16, x0, x1, lsl #3
+ sub x16, x16, #4
+ transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ add x0, x0, #4
+ ret x15
+endfunc
+
+function lpf_v_6_8_neon
+ mov x15, x30
+ sub x16, x0, x1, lsl #1
+ sub x16, x16, x1
+ ld1 {v21.8h}, [x16], x1 // p2
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v22.8h}, [x16], x1 // p1
+ ld1 {v25.8h}, [x0], x1 // q1
+ ld1 {v23.8h}, [x16], x1 // p0
+ ld1 {v26.8h}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1
+
+ lpf_8_wd6
+
+ sub x16, x0, x1, lsl #1
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v25.8h}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+ ret x15
+endfunc
+
+function lpf_h_6_8_neon
+ mov x15, x30
+ sub x16, x0, #8
+ add x0, x16, x1, lsl #2
+ ld1 {v20.8h}, [x16], x1
+ ld1 {v24.8h}, [x0], x1
+ ld1 {v21.8h}, [x16], x1
+ ld1 {v25.8h}, [x0], x1
+ ld1 {v22.8h}, [x16], x1
+ ld1 {v26.8h}, [x0], x1
+ ld1 {v23.8h}, [x16], x1
+ ld1 {v27.8h}, [x0], x1
+ add x0, x0, #8
+
+ transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ lpf_8_wd6
+
+ sub x16, x0, x1, lsl #3
+ sub x16, x16, #4
+ transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ add x0, x0, #4
+ ret x15
+endfunc
+
+function lpf_v_8_8_neon
+ mov x15, x30
+ sub x16, x0, x1, lsl #2
+ ld1 {v20.8h}, [x16], x1 // p3
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v21.8h}, [x16], x1 // p2
+ ld1 {v25.8h}, [x0], x1 // q1
+ ld1 {v22.8h}, [x16], x1 // p1
+ ld1 {v26.8h}, [x0], x1 // q2
+ ld1 {v23.8h}, [x16], x1 // p0
+ ld1 {v27.8h}, [x0], x1 // q3
+ sub x0, x0, x1, lsl #2
+
+ lpf_8_wd8
+
+ sub x16, x0, x1, lsl #1
+ sub x16, x16, x1
+ st1 {v21.8h}, [x16], x1 // p2
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v25.8h}, [x0], x1 // q1
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v26.8h}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1
+ ret x15
+
+8:
+ sub x16, x0, x1, lsl #1
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v25.8h}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+ ret x15
+endfunc
+
+function lpf_h_8_8_neon
+ mov x15, x30
+ sub x16, x0, #8
+ add x0, x16, x1, lsl #2
+ ld1 {v20.8h}, [x16], x1
+ ld1 {v24.8h}, [x0], x1
+ ld1 {v21.8h}, [x16], x1
+ ld1 {v25.8h}, [x0], x1
+ ld1 {v22.8h}, [x16], x1
+ ld1 {v26.8h}, [x0], x1
+ ld1 {v23.8h}, [x16], x1
+ ld1 {v27.8h}, [x0], x1
+ add x0, x0, #8
+
+ transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ lpf_8_wd8
+
+ sub x16, x0, x1, lsl #3
+ sub x16, x16, #8
+ transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v20.8h}, [x16], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v21.8h}, [x16], x1
+ st1 {v25.8h}, [x0], x1
+ st1 {v22.8h}, [x16], x1
+ st1 {v26.8h}, [x0], x1
+ st1 {v23.8h}, [x16], x1
+ st1 {v27.8h}, [x0], x1
+ add x0, x0, #8
+ ret x15
+8:
+ sub x16, x0, x1, lsl #3
+ sub x16, x16, #4
+ transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ add x0, x0, #4
+ ret x15
+endfunc
+
+function lpf_v_16_8_neon
+ mov x15, x30
+
+ sub x16, x0, x1, lsl #3
+ add x16, x16, x1
+ ld1 {v17.8h}, [x16], x1 // p6
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v18.8h}, [x16], x1 // p5
+ ld1 {v25.8h}, [x0], x1 // q1
+ ld1 {v19.8h}, [x16], x1 // p4
+ ld1 {v26.8h}, [x0], x1 // q2
+ ld1 {v20.8h}, [x16], x1 // p3
+ ld1 {v27.8h}, [x0], x1 // q3
+ ld1 {v21.8h}, [x16], x1 // p2
+ ld1 {v28.8h}, [x0], x1 // q4
+ ld1 {v22.8h}, [x16], x1 // p1
+ ld1 {v29.8h}, [x0], x1 // q5
+ ld1 {v23.8h}, [x16], x1 // p0
+ ld1 {v30.8h}, [x0], x1 // q6
+ sub x0, x0, x1, lsl #3
+ add x0, x0, x1
+
+ lpf_8_wd16
+
+ sub x16, x0, x1, lsl #2
+ sub x16, x16, x1, lsl #1
+ st1 {v0.8h}, [x16], x1 // p5
+ st1 {v6.8h}, [x0], x1 // q0
+ st1 {v1.8h}, [x16], x1 // p4
+ st1 {v7.8h}, [x0], x1 // q1
+ st1 {v2.8h}, [x16], x1 // p3
+ st1 {v8.8h}, [x0], x1 // q2
+ st1 {v3.8h}, [x16], x1 // p2
+ st1 {v9.8h}, [x0], x1 // q3
+ st1 {v4.8h}, [x16], x1 // p1
+ st1 {v10.8h}, [x0], x1 // q4
+ st1 {v5.8h}, [x16], x1 // p0
+ st1 {v11.8h}, [x0], x1 // q5
+ sub x0, x0, x1, lsl #2
+ sub x0, x0, x1, lsl #1
+ ret x15
+7:
+ sub x16, x0, x1
+ sub x16, x16, x1, lsl #1
+ st1 {v21.8h}, [x16], x1 // p2
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v25.8h}, [x0], x1 // q1
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v26.8h}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1
+ ret x15
+
+8:
+ sub x16, x0, x1, lsl #1
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v25.8h}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+ ret x15
+endfunc
+
+function lpf_h_16_8_neon
+ mov x15, x30
+ sub x16, x0, #16
+ ld1 {v16.8h}, [x16], x1
+ ld1 {v24.8h}, [x0], x1
+ ld1 {v17.8h}, [x16], x1
+ ld1 {v25.8h}, [x0], x1
+ ld1 {v18.8h}, [x16], x1
+ ld1 {v26.8h}, [x0], x1
+ ld1 {v19.8h}, [x16], x1
+ ld1 {v27.8h}, [x0], x1
+ ld1 {v20.8h}, [x16], x1
+ ld1 {v28.8h}, [x0], x1
+ ld1 {v21.8h}, [x16], x1
+ ld1 {v29.8h}, [x0], x1
+ ld1 {v22.8h}, [x16], x1
+ ld1 {v30.8h}, [x0], x1
+ ld1 {v23.8h}, [x16], x1
+ ld1 {v31.8h}, [x0], x1
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
+ transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
+
+ lpf_8_wd16
+
+ sub x0, x0, x1, lsl #3
+ sub x16, x0, #16
+
+ transpose_8x8h v16, v17, v0, v1, v2, v3, v4, v5, v18, v19
+ transpose_8x8h v6, v7, v8, v9, v10, v11, v30, v31, v18, v19
+
+ st1 {v16.8h}, [x16], x1
+ st1 {v6.8h}, [x0], x1
+ st1 {v17.8h}, [x16], x1
+ st1 {v7.8h}, [x0], x1
+ st1 {v0.8h}, [x16], x1
+ st1 {v8.8h}, [x0], x1
+ st1 {v1.8h}, [x16], x1
+ st1 {v9.8h}, [x0], x1
+ st1 {v2.8h}, [x16], x1
+ st1 {v10.8h}, [x0], x1
+ st1 {v3.8h}, [x16], x1
+ st1 {v11.8h}, [x0], x1
+ st1 {v4.8h}, [x16], x1
+ st1 {v30.8h}, [x0], x1
+ st1 {v5.8h}, [x16], x1
+ st1 {v31.8h}, [x0], x1
+ ret x15
+
+7:
+ sub x16, x0, x1, lsl #3
+ sub x16, x16, #8
+ transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v20.8h}, [x16], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v21.8h}, [x16], x1
+ st1 {v25.8h}, [x0], x1
+ st1 {v22.8h}, [x16], x1
+ st1 {v26.8h}, [x0], x1
+ st1 {v23.8h}, [x16], x1
+ st1 {v27.8h}, [x0], x1
+ add x0, x0, #8
+ ret x15
+8:
+ sub x16, x0, x1, lsl #3
+ sub x16, x16, #4
+ transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ add x0, x0, #4
+ ret x15
+endfunc
+
+// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint32_t *const vmask,
+// const uint8_t (*l)[4], ptrdiff_t b4_stride,
+// const Av1FilterLUT *lut, const int w,
+// const int bitdepth_max)
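+//
+// Roughly, per 4-pixel unit, the macro below does (an illustrative C-like
+// sketch reconstructed from the comments in the code):
+//   if (!(vm & bit)) continue;            // vm = vmask[0] | vmask[1] | vmask[2]
+//   L = l[0][0] ? l[0][0] : l[offset][0]; // previous column (h) / row above (v)
+//   if (!L) continue;
+//   H = L >> 4;
+//   I = imax(imin(L >> sharp[0], sharp[1]), 1);
+//   E = 2 * (L + 2) + I;
+//   (E, I and H are then scaled up by bitdepth_min_8)
+//   wd = (vmask[2] & bit) ? 16 : (vmask[1] & bit) ? 8 (y) / 6 (uv) : 4;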
+
+.macro lpf_func dir, type
+function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1
+ mov x11, x30
+ mov w8, w7 // bitdepth_max
+ clz w9, w8
+ mov w10, #24
+ sub w9, w10, w9 // bitdepth_min_8
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+ ldp w6, w7, [x2] // vmask[0], vmask[1]
+.ifc \type, y
+ ldr w2, [x2, #8] // vmask[2]
+.endif
+ add x5, x5, #128 // Move to sharp part of lut
+.ifc \type, y
+ orr w7, w7, w2 // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+ sub x4, x3, x4, lsl #2
+.else
+ sub x3, x3, #4
+ lsl x4, x4, #2
+.endif
+ orr w6, w6, w7 // vmask[0] |= vmask[1]
+
+1:
+ tst w6, #0x03
+.ifc \dir, v
+ ld1 {v0.8b}, [x4], #8
+ ld1 {v1.8b}, [x3], #8
+.else
+ ld2 {v0.s,v1.s}[0], [x3], x4
+ ld2 {v0.s,v1.s}[1], [x3], x4
+.endif
+ b.eq 7f // if (!(vm & bits)) continue;
+
+ ld1r {v5.8b}, [x5] // sharp[0]
+ add x5, x5, #8
+ movi v2.2s, #0xff
+ dup v13.2s, w6 // vmask[0]
+ dup v31.8h, w9 // bitdepth_min_8
+
+ and v0.8b, v0.8b, v2.8b // Keep only lowest byte in each 32 bit word
+ and v1.8b, v1.8b, v2.8b
+ cmtst v3.8b, v1.8b, v2.8b // Check for nonzero values in l[0][0]
+ movi v4.8b, #1
+ ld1r {v6.8b}, [x5] // sharp[1]
+ sub x5, x5, #8
+ bif v1.8b, v0.8b, v3.8b // if (!l[0][0]) L = l[offset][0]
+ cmtst v2.2s, v1.2s, v2.2s // L != 0
+ mul v1.2s, v1.2s, v4.2s // L
+.ifc \type, y
+ dup v15.2s, w2 // vmask[2]
+.endif
+ dup v14.2s, w7 // vmask[1]
+ mov x16, v2.d[0]
+ cmp x16, #0
+ b.eq 7f // if (!L) continue;
+ neg v5.8b, v5.8b // -sharp[0]
+ movrel x16, word_12
+ ushr v12.8b, v1.8b, #4 // H
+ ld1 {v16.2s}, [x16]
+ sshl v3.8b, v1.8b, v5.8b // L >> sharp[0]
+.ifc \type, y
+ cmtst v15.2s, v15.2s, v16.2s // if (vmask[2] & bits)
+.endif
+ movi v7.8b, #2
+ umin v3.8b, v3.8b, v6.8b // imin(L >> sharp[0], sharp[1])
+ add v0.8b, v1.8b, v7.8b // L + 2
+ umax v11.8b, v3.8b, v4.8b // imax(imin(), 1) = limit = I
+ add v0.8b, v0.8b, v0.8b // 2*(L + 2)
+ cmtst v14.2s, v14.2s, v16.2s // if (vmask[1] & bits)
+ uxtl v12.8h, v12.8b
+ add v10.8b, v0.8b, v11.8b // 2*(L + 2) + limit = E
+ cmtst v13.2s, v13.2s, v16.2s // if (vmask[0] & bits)
+ uxtl v11.8h, v11.8b
+ uxtl v10.8h, v10.8b
+ and v13.8b, v13.8b, v2.8b // vmask[0] &= L != 0
+ sxtl v14.8h, v14.8b
+ sxtl v13.8h, v13.8b
+.ifc \type, y
+ sxtl v15.8h, v15.8b
+.endif
+ ushl v12.8h, v12.8h, v31.8h
+ ushl v11.8h, v11.8h, v31.8h
+ ushl v10.8h, v10.8h, v31.8h
+
+.ifc \type, y
+ tst w2, #0x03
+ b.eq 2f
+ // wd16
+ bl lpf_\dir\()_16_8_neon
+ b 8f
+2:
+.endif
+ tst w7, #0x03
+ b.eq 3f
+.ifc \type, y
+ // wd8
+ bl lpf_\dir\()_8_8_neon
+.else
+ // wd6
+ bl lpf_\dir\()_6_8_neon
+.endif
+ b 8f
+3:
+ // wd4
+ bl lpf_\dir\()_4_8_neon
+.ifc \dir, h
+ b 8f
+7:
+ // For dir h, the functions above increment x0.
+ // If the whole function is skipped, increment it here instead.
+ add x0, x0, x1, lsl #3
+.else
+7:
+.endif
+8:
+ lsr w6, w6, #2 // vmask[0] >>= 2
+ lsr w7, w7, #2 // vmask[1] >>= 2
+.ifc \type, y
+ lsr w2, w2, #2 // vmask[2] >>= 2
+.endif
+.ifc \dir, v
+ add x0, x0, #16
+.else
+ // For dir h, x0 is returned incremented
+.endif
+ cbnz w6, 1b
+
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret x11
+endfunc
+.endm
+
+lpf_func v, y
+lpf_func h, y
+lpf_func v, uv
+lpf_func h, uv
+
+const word_12
+ .word 1, 2
+endconst
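+
+// Note: at 16 bpc each iteration handles 8 pixels (two 4-pixel units), so only
+// mask bits 1 and 2 are needed here, unlike word_1248 in the 8 bpc version,
+// which covers four units per iteration.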
diff --git a/third_party/dav1d/src/arm/64/looprestoration.S b/third_party/dav1d/src/arm/64/looprestoration.S
new file mode 100644
index 0000000000..f8dc0df4d8
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/looprestoration.S
@@ -0,0 +1,1303 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+const right_ext_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+right_ext_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t stride,
+// const pixel (*left)[4], const pixel *lpf,
+// const int w, int h,
+// const int16_t filter[2][8],
+// const enum LrEdgeFlags edges);
+function wiener_filter7_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ ld1 {v0.8h, v1.8h}, [x6]
+ tst w7, #4 // LR_HAVE_TOP
+ sub_sp 384*2*6
+
+ mov w17, #(1 << 14) - (1 << 2)
+ dup v30.8h, w17
+ movi v31.8h, #8, lsl #8
+
+ // x9 - t6
+ // x10 - t5
+ // x11 - t4
+ // x12 - t3
+ // x13 - t2
+ // x14 - t1
+ // x15 - t0
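+ //
+ // t1..t6 point at rows of intermediate horizontal results (384*2 bytes each)
+ // in the stack buffer reserved by sub_sp above; the v/hv helpers restore
+ // their saved registers shifted by one so that these row pointers rotate as
+ // the filter advances down the image.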
+ mov x14, sp // t1
+ b.eq L(no_top_7)
+
+ mov x16, x2 // backup left
+ mov x2, #0
+ bl wiener_filter7_h_8bpc_neon
+ add x3, x3, x1 // lpf += stride
+ mov x9, x14 // t6
+ mov x10, x14 // t5
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
+ mov x11, x14 // t4
+ add x14, x14, #384*2 // t1 += 384*2
+ mov x2, x16 // left
+ mov x16, x3 // backup lpf
+ mov x3, x0 // lpf = p
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ mov x13, x14 // t2
+ subs w5, w5, #1 // h--
+ b.eq L(v2_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x3, x3, x1 // src += stride
+
+L(main_7):
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+L(main_loop_7):
+ bl wiener_filter7_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_loop_7)
+ tst w7, #8 // LR_HAVE_BOTTOM
+ b.eq L(v3_7)
+
+ mov x3, x16 // restore lpf
+ mov x2, #0 // left = NULL
+ bl wiener_filter7_hv_8bpc_neon
+ bl wiener_filter7_hv_8bpc_neon
+L(v1_7):
+ bl wiener_filter7_v_8bpc_neon
+
+ mov sp, x29
+ ldp x29, x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(no_top_7):
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
+ mov x3, x0 // lpf = p
+
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x9, x14 // t6
+ mov x10, x14 // t5
+ mov x11, x14 // t4
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x13, x14 // t2
+ b.eq L(v2_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x3, x3, x1 // src += stride
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+ bl wiener_filter7_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x15, x15, #384*2*4 // t0 += 384*2*4
+ bl wiener_filter7_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_7)
+L(v3_7):
+ bl wiener_filter7_v_8bpc_neon
+L(v2_7):
+ bl wiener_filter7_v_8bpc_neon
+ b L(v1_7)
+endfunc
+
+
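+// Rough scalar view of the horizontal pass below (illustrative only; the code
+// keeps a signed, biased 16-bit intermediate rather than this plain sum):
+//   sum = (x[i] << 7) + f[3]*x[i]
+//       + f[4]*(x[i-1] + x[i+1])
+//       + f[5]*(x[i-2] + x[i+2])
+//       + f[6]*(x[i-3] + x[i+3])
+//   t1[i] = saturate_16(sum >> 3), stored with a constant bias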
+function wiener_filter7_h_8bpc_neon
+ stp x3, x4, [sp, #-32]!
+ str x14, [sp, #16]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #3
+ ld1 {v3.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v3.16b}, [x3], #16
+ ld1 {v2.s}[3], [x2], #4
+ // Move x3 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+ b 2f
+
+1:
+ ld1 {v3.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 3x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ // Move x3 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+
+2:
+ ld1 {v4.8b}, [x3], #8
+ uxtl v2.8h, v3.8b
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v4.8b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #19
+ b.ge 4f // If w >= 19, all used input pixels are valid
+
+ // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
+ sub w17, w4, #22
+ // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -6
+ ldr b28, [x3, w17, sxtw]
+ sub x6, x6, w4, uxtw #1
+ dup v28.8h, v28.h[0]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
+
+ bit v2.16b, v28.16b, v25.16b
+ bit v3.16b, v28.16b, v26.16b
+ bit v4.16b, v28.16b, v27.16b
+
+4: // Loop horizontally
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ ext v18.16b, v2.16b, v3.16b, #6
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v2.8h
+ shl v22.8h, v18.8h, #7
+ mul v6.8h, v18.8h, v0.h[3]
+ mla v6.8h, v19.8h, v0.h[4]
+ mla v6.8h, v20.8h, v0.h[5]
+ mla v6.8h, v21.8h, v0.h[6]
+
+ ext v17.16b, v3.16b, v4.16b, #4
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #10
+ ext v21.16b, v3.16b, v4.16b, #12
+ ext v18.16b, v3.16b, v4.16b, #6
+
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v3.8h
+ shl v23.8h, v18.8h, #7
+ mul v7.8h, v18.8h, v0.h[3]
+ mla v7.8h, v19.8h, v0.h[4]
+ mla v7.8h, v20.8h, v0.h[5]
+ mla v7.8h, v21.8h, v0.h[6]
+
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ sqadd v6.8h, v6.8h, v22.8h
+ sqadd v7.8h, v7.8h, v23.8h
+ sshr v6.8h, v6.8h, #3
+ sshr v7.8h, v7.8h, #3
+ add v6.8h, v6.8h, v31.8h
+ add v7.8h, v7.8h, v31.8h
+
+ subs w4, w4, #16
+
+ st1 {v6.8h, v7.8h}, [x14], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ ld1 {v4.16b}, [x3], #16
+ tst w7, #2 // LR_HAVE_RIGHT
+ uxtl v3.8h, v4.8b
+ uxtl2 v4.8h, v4.16b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldr x14, [sp, #16]
+ ldp x3, x4, [sp], #32
+ ret
+endfunc
+
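+// Rough scalar view of the vertical pass (illustrative; it ignores the
+// constant bias carried in the intermediate rows): with t0..t6 being seven
+// buffered rows of horizontal results,
+//   out[x] = clip_pixel((f[3]*t3[x] + f[4]*(t2[x] + t4[x])
+//                      + f[5]*(t1[x] + t5[x]) + f[6]*(t0[x] + t6[x])
+//                      + rounding) >> 11)
+// In this tail-only function no fresh t0 row exists, so t1 is used in its place.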
+function wiener_filter7_v_8bpc_neon
+ // Backing up/restoring registers shifted, so that x9 gets the value
+ // of x10, etc, afterwards.
+ stp x10, x11, [sp, #-64]!
+ stp x12, x13, [sp, #16]
+ stp x14, x14, [sp, #32]
+ stp x0, x4, [sp, #48]
+1:
+ ld1 {v20.8h, v21.8h}, [x11], #32
+ ld1 {v24.8h, v25.8h}, [x13], #32
+
+ ld1 {v18.8h, v19.8h}, [x10], #32
+ add v24.8h, v24.8h, v20.8h
+ ld1 {v26.8h, v27.8h}, [x14], #32
+
+ ld1 {v16.8h, v17.8h}, [x9], #32
+ add v28.8h, v26.8h, v18.8h
+ ld1 {v22.8h, v23.8h}, [x12], #32
+
+ add v16.8h, v26.8h, v16.8h
+ add v25.8h, v25.8h, v21.8h
+
+ smull v2.4s, v22.4h, v1.h[3]
+ smlal v2.4s, v24.4h, v1.h[4]
+ smlal v2.4s, v28.4h, v1.h[5]
+ smlal v2.4s, v16.4h, v1.h[6]
+ add v29.8h, v27.8h, v19.8h
+ smull2 v3.4s, v22.8h, v1.h[3]
+ smlal2 v3.4s, v24.8h, v1.h[4]
+ smlal2 v3.4s, v28.8h, v1.h[5]
+ smlal2 v3.4s, v16.8h, v1.h[6]
+ add v17.8h, v27.8h, v17.8h
+ smull v4.4s, v23.4h, v1.h[3]
+ smlal v4.4s, v25.4h, v1.h[4]
+ smlal v4.4s, v29.4h, v1.h[5]
+ smlal v4.4s, v17.4h, v1.h[6]
+ smull2 v5.4s, v23.8h, v1.h[3]
+ smlal2 v5.4s, v25.8h, v1.h[4]
+ smlal2 v5.4s, v29.8h, v1.h[5]
+ smlal2 v5.4s, v17.8h, v1.h[6]
+ sqrshrun v2.4h, v2.4s, #11
+ sqrshrun2 v2.8h, v3.4s, #11
+ sqrshrun v3.4h, v4.4s, #11
+ sqrshrun2 v3.8h, v5.4s, #11
+ sqxtun v2.8b, v2.8h
+ sqxtun2 v2.16b, v3.8h
+ subs w4, w4, #16
+ st1 {v2.16b}, [x0], #16
+ b.gt 1b
+
+ ldp x0, x4, [sp, #48]
+ ldp x13, x14, [sp, #32]
+ ldp x11, x12, [sp, #16]
+ ldp x9, x10, [sp], #64
+
+ add x0, x0, x1
+ ret
+endfunc
+
+function wiener_filter7_hv_8bpc_neon
+ // Backing up/restoring registers shifted, so that x9 gets the value
+ // of x10, etc, and x15==x9, afterwards.
+ stp x10, x11, [sp, #-80]!
+ stp x12, x13, [sp, #16]
+ stp x14, x15, [sp, #32]
+ stp x10, x0, [sp, #48]
+ stp x3, x4, [sp, #64]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #3
+ ld1 {v3.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v3.16b}, [x3], #16
+ ld1 {v2.s}[3], [x2], #4
+ // Move x3 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+ b 2f
+1:
+ ld1 {v3.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 3x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ // Move x3 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+
+2:
+ ld1 {v4.8b}, [x3], #8
+ uxtl v2.8h, v3.8b
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v4.8b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #19
+ b.ge 4f // If w >= 19, all used input pixels are valid
+
+ // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
+ sub w17, w4, #22
+ // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -6
+ ldr b28, [x3, w17, sxtw]
+ sub x6, x6, w4, uxtw #1
+ dup v28.8h, v28.h[0]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
+
+ bit v2.16b, v28.16b, v25.16b
+ bit v3.16b, v28.16b, v26.16b
+ bit v4.16b, v28.16b, v27.16b
+
+4: // Loop horizontally
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ ext v18.16b, v2.16b, v3.16b, #6
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v2.8h
+ shl v22.8h, v18.8h, #7
+ mul v6.8h, v18.8h, v0.h[3]
+ mla v6.8h, v19.8h, v0.h[4]
+ mla v6.8h, v20.8h, v0.h[5]
+ mla v6.8h, v21.8h, v0.h[6]
+
+ ext v17.16b, v3.16b, v4.16b, #4
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #10
+ ext v21.16b, v3.16b, v4.16b, #12
+ ext v18.16b, v3.16b, v4.16b, #6
+
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v3.8h
+ shl v23.8h, v18.8h, #7
+ mul v7.8h, v18.8h, v0.h[3]
+ mla v7.8h, v19.8h, v0.h[4]
+ mla v7.8h, v20.8h, v0.h[5]
+ mla v7.8h, v21.8h, v0.h[6]
+
+ ld1 {v20.8h, v21.8h}, [x11], #32
+
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ ld1 {v26.8h, v27.8h}, [x13], #32
+ sqadd v6.8h, v6.8h, v22.8h
+ sqadd v7.8h, v7.8h, v23.8h
+ ld1 {v18.8h, v19.8h}, [x10], #32
+ sshr v6.8h, v6.8h, #3
+ sshr v7.8h, v7.8h, #3
+ ld1 {v28.8h, v29.8h}, [x14], #32
+ add v6.8h, v6.8h, v31.8h
+ add v7.8h, v7.8h, v31.8h
+
+ ld1 {v16.8h, v17.8h}, [x9], #32
+ add v26.8h, v20.8h, v26.8h
+
+ ld1 {v24.8h, v25.8h}, [x12], #32
+ add v28.8h, v18.8h, v28.8h
+
+ add v16.8h, v16.8h, v6.8h
+ add v27.8h, v21.8h, v27.8h
+
+ smull v18.4s, v24.4h, v1.h[3]
+ smlal v18.4s, v26.4h, v1.h[4]
+ smlal v18.4s, v28.4h, v1.h[5]
+ smlal v18.4s, v16.4h, v1.h[6]
+ add v29.8h, v19.8h, v29.8h
+ smull2 v19.4s, v24.8h, v1.h[3]
+ smlal2 v19.4s, v26.8h, v1.h[4]
+ smlal2 v19.4s, v28.8h, v1.h[5]
+ smlal2 v19.4s, v16.8h, v1.h[6]
+ add v17.8h, v17.8h, v7.8h
+ smull v20.4s, v25.4h, v1.h[3]
+ smlal v20.4s, v27.4h, v1.h[4]
+ smlal v20.4s, v29.4h, v1.h[5]
+ smlal v20.4s, v17.4h, v1.h[6]
+ smull2 v21.4s, v25.8h, v1.h[3]
+ smlal2 v21.4s, v27.8h, v1.h[4]
+ smlal2 v21.4s, v29.8h, v1.h[5]
+ smlal2 v21.4s, v17.8h, v1.h[6]
+ sqrshrun v18.4h, v18.4s, #11
+ sqrshrun2 v18.8h, v19.4s, #11
+ sqrshrun v19.4h, v20.4s, #11
+ sqrshrun2 v19.8h, v21.4s, #11
+ st1 {v6.8h, v7.8h}, [x15], #32
+ sqxtun v18.8b, v18.8h
+ sqxtun2 v18.16b, v19.8h
+ subs w4, w4, #16
+
+ st1 {v18.16b}, [x0], #16
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ ld1 {v4.16b}, [x3], #16
+ tst w7, #2 // LR_HAVE_RIGHT
+ uxtl v3.8h, v4.8b
+ uxtl2 v4.8h, v4.16b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldp x3, x4, [sp, #64]
+ ldp x15, x0, [sp, #48]
+ ldp x13, x14, [sp, #32]
+ ldp x11, x12, [sp, #16]
+ ldp x9, x10, [sp], #80
+
+ add x3, x3, x1
+ add x0, x0, x1
+
+ ret
+endfunc
+
+// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t stride,
+// const pixel (*left)[4], const pixel *lpf,
+// const int w, int h,
+// const int16_t filter[2][8],
+// const enum LrEdgeFlags edges);
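+//
+// Same structure as wiener_filter7 above, but for the 5-tap filter: only taps
+// f[3..5] and four buffered rows (t1..t4) are needed, hence the smaller
+// 384*2*4 stack buffer.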
+function wiener_filter5_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ ld1 {v0.8h, v1.8h}, [x6]
+ tst w7, #4 // LR_HAVE_TOP
+ sub_sp 384*2*4
+
+ mov w17, #(1 << 14) - (1 << 2)
+ dup v30.8h, w17
+ movi v31.8h, #8, lsl #8
+
+ // x11 - t4
+ // x12 - t3
+ // x13 - t2
+ // x14 - t1
+ // x15 - t0
+ mov x14, sp // t1
+ b.eq L(no_top_5)
+
+ mov x16, x2 // backup left
+ mov x2, #0
+ bl wiener_filter5_h_8bpc_neon
+ add x3, x3, x1 // lpf += stride
+ mov x11, x14 // t4
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_8bpc_neon
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
+ mov x12, x14 // t3
+ add x14, x14, #384*2 // t1 += 384*2
+ mov x2, x16 // left
+ mov x16, x3 // backup lpf
+ mov x3, x0 // lpf = p
+ bl wiener_filter5_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x13, x14 // t2
+ b.eq L(v1_5)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x3, x3, x1 // src += stride
+
+L(main_5):
+ mov x15, x11 // t0 = t4
+L(main_loop_5):
+ bl wiener_filter5_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_loop_5)
+ tst w7, #8 // LR_HAVE_BOTTOM
+ b.eq L(v2_5)
+
+ mov x3, x16 // restore lpf
+ mov x2, #0 // left = NULL
+ bl wiener_filter5_hv_8bpc_neon
+ bl wiener_filter5_hv_8bpc_neon
+L(end_5):
+
+ mov sp, x29
+ ldp x29, x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(no_top_5):
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
+ mov x3, x0 // lpf = p
+
+ bl wiener_filter5_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x11, x14 // t4
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_5)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x3, x3, x1 // src += stride
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+ bl wiener_filter5_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x15, x15, #384*2*3 // t0 += 384*2*3
+ bl wiener_filter5_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_5)
+L(v2_5):
+ bl wiener_filter5_v_8bpc_neon
+ add x0, x0, x1
+ mov x11, x12
+ mov x12, x13
+ mov x13, x14
+L(v1_5):
+ bl wiener_filter5_v_8bpc_neon
+ b L(end_5)
+endfunc
+
+
+function wiener_filter5_h_8bpc_neon
+ stp x3, x4, [sp, #-32]!
+ str x14, [sp, #16]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #2
+ ld1 {v3.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v3.16b}, [x3], #16
+ ld1 {v2.s}[3], [x2], #4
+ // Move x3 back to account for the last 2 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #2
+ ext v3.16b, v2.16b, v3.16b, #14
+ b 2f
+
+1:
+ ld1 {v3.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 2x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ // Move x3 back to account for the last 2 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #2
+ ext v3.16b, v2.16b, v3.16b, #14
+
+2:
+ ld1 {v4.8b}, [x3], #8
+ uxtl v2.8h, v3.8b
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v4.8b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #18
+ b.ge 4f // If w >= 18, all used input pixels are valid
+
+ // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
+ sub w17, w4, #23
+ // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -4
+ ldr b28, [x3, w17, sxtw]
+ sub x6, x6, w4, uxtw #1
+ dup v28.8h, v28.h[0]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
+
+ bit v2.16b, v28.16b, v25.16b
+ bit v3.16b, v28.16b, v26.16b
+ bit v4.16b, v28.16b, v27.16b
+
+4: // Loop horizontally
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v17.16b, v2.16b, v3.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v2.8h
+ shl v22.8h, v17.8h, #7
+ mul v6.8h, v17.8h, v0.h[3]
+ mla v6.8h, v18.8h, v0.h[4]
+ mla v6.8h, v19.8h, v0.h[5]
+
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v18.16b, v3.16b, v4.16b, #6
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v17.16b, v3.16b, v4.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v3.8h
+ shl v23.8h, v17.8h, #7
+ mul v7.8h, v17.8h, v0.h[3]
+ mla v7.8h, v18.8h, v0.h[4]
+ mla v7.8h, v19.8h, v0.h[5]
+
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ sqadd v6.8h, v6.8h, v22.8h
+ sqadd v7.8h, v7.8h, v23.8h
+ sshr v6.8h, v6.8h, #3
+ sshr v7.8h, v7.8h, #3
+ add v6.8h, v6.8h, v31.8h
+ add v7.8h, v7.8h, v31.8h
+
+ subs w4, w4, #16
+
+ st1 {v6.8h, v7.8h}, [x14], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ ld1 {v4.16b}, [x3], #16
+ tst w7, #2 // LR_HAVE_RIGHT
+ uxtl v3.8h, v4.8b
+ uxtl2 v4.8h, v4.16b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldr x14, [sp, #16]
+ ldp x3, x4, [sp], #32
+ ret
+endfunc
+
+function wiener_filter5_v_8bpc_neon
+ stp x11, x12, [sp, #-48]!
+ stp x13, x14, [sp, #16]
+ stp x0, x4, [sp, #32]
+1:
+ ld1 {v18.8h, v19.8h}, [x12], #32
+ ld1 {v22.8h, v23.8h}, [x14], #32
+ ld1 {v16.8h, v17.8h}, [x11], #32
+
+ add v24.8h, v22.8h, v18.8h
+ ld1 {v20.8h, v21.8h}, [x13], #32
+ add v16.8h, v22.8h, v16.8h
+ add v25.8h, v23.8h, v19.8h
+
+ smull v2.4s, v20.4h, v1.h[3]
+ smlal v2.4s, v24.4h, v1.h[4]
+ smlal v2.4s, v16.4h, v1.h[5]
+ add v17.8h, v23.8h, v17.8h
+ smull2 v3.4s, v20.8h, v1.h[3]
+ smlal2 v3.4s, v24.8h, v1.h[4]
+ smlal2 v3.4s, v16.8h, v1.h[5]
+ smull v4.4s, v21.4h, v1.h[3]
+ smlal v4.4s, v25.4h, v1.h[4]
+ smlal v4.4s, v17.4h, v1.h[5]
+ smull2 v5.4s, v21.8h, v1.h[3]
+ smlal2 v5.4s, v25.8h, v1.h[4]
+ smlal2 v5.4s, v17.8h, v1.h[5]
+ sqrshrun v2.4h, v2.4s, #11
+ sqrshrun2 v2.8h, v3.4s, #11
+ sqrshrun v3.4h, v4.4s, #11
+ sqrshrun2 v3.8h, v5.4s, #11
+ sqxtun v2.8b, v2.8h
+ sqxtun2 v2.16b, v3.8h
+ subs w4, w4, #16
+ st1 {v2.16b}, [x0], #16
+ b.gt 1b
+
+ ldp x0, x4, [sp, #32]
+ ldp x13, x14, [sp, #16]
+ ldp x11, x12, [sp], #48
+
+ ret
+endfunc
+
+function wiener_filter5_hv_8bpc_neon
+ // Backing up/restoring registers shifted, so that x11 gets the value
+ // of x12, etc, and x15==x11, afterwards.
+ stp x12, x13, [sp, #-64]!
+ stp x14, x15, [sp, #16]
+ stp x12, x0, [sp, #32]
+ stp x3, x4, [sp, #48]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #2
+ ld1 {v3.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v3.16b}, [x3], #16
+ ld1 {v2.s}[3], [x2], #4
+ // Move x3 back to account for the last 2 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #2
+ ext v3.16b, v2.16b, v3.16b, #14
+ b 2f
+1:
+ ld1 {v3.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 2x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ // Move x3 back to account for the last 2 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #2
+ ext v3.16b, v2.16b, v3.16b, #14
+
+2:
+ ld1 {v4.8b}, [x3], #8
+ uxtl v2.8h, v3.8b
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v4.8b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #18
+ b.ge 4f // If w >= 18, all used input pixels are valid
+
+ // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
+ sub w17, w4, #23
+ // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -4
+ ldr b28, [x3, w17, sxtw]
+ sub x6, x6, w4, uxtw #1
+ dup v28.8h, v28.h[0]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
+
+ bit v2.16b, v28.16b, v25.16b
+ bit v3.16b, v28.16b, v26.16b
+ bit v4.16b, v28.16b, v27.16b
+
+4: // Loop horizontally
+
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v17.16b, v2.16b, v3.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v2.8h
+ shl v22.8h, v17.8h, #7
+ mul v6.8h, v17.8h, v0.h[3]
+ mla v6.8h, v18.8h, v0.h[4]
+ mla v6.8h, v19.8h, v0.h[5]
+
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v18.16b, v3.16b, v4.16b, #6
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v17.16b, v3.16b, v4.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v3.8h
+ shl v23.8h, v17.8h, #7
+ mul v7.8h, v17.8h, v0.h[3]
+ mla v7.8h, v18.8h, v0.h[4]
+ mla v7.8h, v19.8h, v0.h[5]
+
+ ld1 {v18.8h, v19.8h}, [x12], #32
+
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ ld1 {v24.8h, v25.8h}, [x14], #32
+ sqadd v6.8h, v6.8h, v22.8h
+ sqadd v7.8h, v7.8h, v23.8h
+ ld1 {v16.8h, v17.8h}, [x11], #32
+ sshr v6.8h, v6.8h, #3
+ sshr v7.8h, v7.8h, #3
+ ld1 {v20.8h, v21.8h}, [x13], #32
+ add v6.8h, v6.8h, v31.8h
+ add v7.8h, v7.8h, v31.8h
+
+ add v24.8h, v24.8h, v18.8h
+ add v16.8h, v16.8h, v6.8h
+
+ smull v18.4s, v20.4h, v1.h[3]
+ smlal v18.4s, v24.4h, v1.h[4]
+ smlal v18.4s, v16.4h, v1.h[5]
+ add v25.8h, v25.8h, v19.8h
+ smull2 v19.4s, v20.8h, v1.h[3]
+ smlal2 v19.4s, v24.8h, v1.h[4]
+ smlal2 v19.4s, v16.8h, v1.h[5]
+ add v17.8h, v17.8h, v7.8h
+ smull v20.4s, v21.4h, v1.h[3]
+ smlal v20.4s, v25.4h, v1.h[4]
+ smlal v20.4s, v17.4h, v1.h[5]
+ smull2 v21.4s, v21.8h, v1.h[3]
+ smlal2 v21.4s, v25.8h, v1.h[4]
+ smlal2 v21.4s, v17.8h, v1.h[5]
+ sqrshrun v18.4h, v18.4s, #11
+ sqrshrun2 v18.8h, v19.4s, #11
+ sqrshrun v19.4h, v20.4s, #11
+ sqrshrun2 v19.8h, v21.4s, #11
+ st1 {v6.8h, v7.8h}, [x15], #32
+ sqxtun v18.8b, v18.8h
+ sqxtun2 v18.16b, v19.8h
+ subs w4, w4, #16
+
+ st1 {v18.16b}, [x0], #16
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ ld1 {v4.16b}, [x3], #16
+ tst w7, #2 // LR_HAVE_RIGHT
+ uxtl v3.8h, v4.8b
+ uxtl2 v4.8h, v4.16b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldp x3, x4, [sp, #48]
+ ldp x15, x0, [sp, #32]
+ ldp x13, x14, [sp, #16]
+ ldp x11, x12, [sp], #64
+
+ add x3, x3, x1
+ add x0, x0, x1
+
+ ret
+endfunc
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const int w,
+// const enum LrEdgeFlags edges);
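+//
+// Per output position this produces a horizontal 3-pixel box sum and sum of
+// squares (illustrative C-like sketch):
+//   sum[i]   = x[i] + x[i+1] + x[i+2]
+//   sumsq[i] = x[i]*x[i] + x[i+1]*x[i+1] + x[i+2]*x[i+2]
+// (The w += 2 at the top widens the row, since later SGR stages read sums
+// past the nominal width.)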
+function sgr_box3_row_h_8bpc_neon, export=1
+ add w4, w4, #2 // w += 2
+
+ tst w5, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ cbnz x2, 0f
+
+ // LR_HAVE_LEFT && left == NULL
+ sub x3, x3, #2
+ ld1 {v0.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v0.16b}, [x3], #16
+ ld1 {v1.s}[3], [x2]
+ // Move x3 back to account for the last 2 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #2
+ ext v0.16b, v1.16b, v0.16b, #14
+ b 2f
+
+1:
+ ld1 {v0.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v1 with the leftmost byte
+ // and shift v0 to have 2x the first byte at the front.
+ dup v1.16b, v0.b[0]
+ // Move x3 back to account for the last 2 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #2
+ ext v0.16b, v1.16b, v0.16b, #14
+
+2:
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+
+ tst w5, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load the byte to pad with now,
+ // since it is easy to locate from the current source pointer.
+ sub w13, w4, #(2 + 16 - 2 + 1)
+ ldr b30, [x3, w13, sxtw]
+ // Fill v30 with the right padding pixel
+ dup v30.16b, v30.b[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #10
+ b.ge 4f // If w >= 10, all used input pixels are valid
+
+ // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
+ // again; it's not strictly needed in those cases (we pad enough here),
+ // but keeping the code as simple as possible.
+
+ // Insert padding in v0.b[w] onwards
+ movrel x13, right_ext_mask
+ sub x13, x13, w4, uxtw
+ ld1 {v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v29.16b
+
+ // Update the precalculated squares
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+
+4: // Loop horizontally
+ ext v16.16b, v0.16b, v0.16b, #1
+ ext v17.16b, v0.16b, v0.16b, #2
+ uaddl v3.8h, v0.8b, v16.8b
+ ext v20.16b, v1.16b, v2.16b, #2
+ uaddw v3.8h, v3.8h, v17.8b
+
+ ext v21.16b, v1.16b, v2.16b, #4
+
+ uaddl v26.4s, v1.4h, v20.4h
+ uaddl2 v27.4s, v1.8h, v20.8h
+ uaddw v26.4s, v26.4s, v21.4h
+ uaddw2 v27.4s, v27.4s, v21.8h
+
+ subs w4, w4, #8
+
+ st1 {v3.8h}, [x1], #16
+ st1 {v26.4s,v27.4s}, [x0], #32
+
+ b.le 9f
+ tst w5, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8b}, [x3], #8
+ mov v1.16b, v2.16b
+ ext v0.16b, v0.16b, v3.16b, #8
+ umull v2.8h, v3.8b, v3.8b
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ ret
+endfunc
+
+// void dav1d_sgr_box5_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const int w,
+// const enum LrEdgeFlags edges);
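+//
+// Same as sgr_box3_row_h above, but with a 5-pixel box:
+//   sum[i]   = x[i] + x[i+1] + x[i+2] + x[i+3] + x[i+4]
+//   sumsq[i] = x[i]*x[i] + ... + x[i+4]*x[i+4]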
+function sgr_box5_row_h_8bpc_neon, export=1
+ add w4, w4, #2 // w += 2
+
+ tst w5, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ cbnz x2, 0f
+
+ // LR_HAVE_LEFT && left == NULL
+ sub x3, x3, #3
+ ld1 {v0.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v0.16b}, [x3], #16
+ ld1 {v1.s}[3], [x2], #4
+ // Move x3 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #3
+ ext v0.16b, v1.16b, v0.16b, #13
+ b 2f
+
+1:
+ ld1 {v0.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v1 with the leftmost byte
+ // and shift v0 to have 3x the first byte at the front.
+ dup v1.16b, v0.b[0]
+ // Move x3 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #3
+ ext v0.16b, v1.16b, v0.16b, #13
+
+2:
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+
+ tst w5, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load the byte to pad with now,
+ // since it is easy to locate from the current source pointer.
+ sub w13, w4, #(2 + 16 - 3 + 1)
+ ldr b30, [x3, w13, sxtw]
+ // Fill v30 with the right padding pixel
+ dup v30.16b, v30.b[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel x13, right_ext_mask, -1
+ sub x13, x13, w4, uxtw
+ ld1 {v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v29.16b
+
+ // Update the precalculated squares
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+
+4: // Loop horizontally
+ ext v16.16b, v0.16b, v0.16b, #1
+ ext v17.16b, v0.16b, v0.16b, #2
+ ext v18.16b, v0.16b, v0.16b, #3
+ ext v19.16b, v0.16b, v0.16b, #4
+ uaddl v3.8h, v0.8b, v16.8b
+ uaddl v24.8h, v17.8b, v18.8b
+ uaddw v3.8h, v3.8h, v19.8b
+ add v3.8h, v3.8h, v24.8h
+
+ ext v16.16b, v1.16b, v2.16b, #2
+ ext v17.16b, v1.16b, v2.16b, #4
+ ext v18.16b, v1.16b, v2.16b, #6
+ ext v19.16b, v1.16b, v2.16b, #8
+
+ uaddl v26.4s, v1.4h, v16.4h
+ uaddl2 v27.4s, v1.8h, v16.8h
+ uaddl v16.4s, v17.4h, v18.4h
+ uaddl2 v17.4s, v17.8h, v18.8h
+ uaddw v26.4s, v26.4s, v19.4h
+ uaddw2 v27.4s, v27.4s, v19.8h
+ add v26.4s, v26.4s, v16.4s
+ add v27.4s, v27.4s, v17.4s
+
+ subs w4, w4, #8
+
+ st1 {v3.8h}, [x1], #16
+ st1 {v26.4s,v27.4s}, [x0], #32
+
+ b.le 9f
+ tst w5, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8b}, [x3], #8
+ mov v1.16b, v2.16b
+ ext v0.16b, v0.16b, v3.16b, #8
+ umull v2.8h, v3.8b, v3.8b
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ ret
+endfunc
+
+// void dav1d_sgr_box35_row_h_8bpc_neon(int32_t *sumsq3, int16_t *sum3,
+// int32_t *sumsq5, int16_t *sum5,
+// const pixel (*left)[4],
+// const pixel *src, const int w,
+// const enum LrEdgeFlags edges);
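+//
+// Combined variant: one pass over the row produces both the 3-pixel box sums
+// (stored to sumsq3/sum3) and the 5-pixel box sums (stored to sumsq5/sum5).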
+function sgr_box35_row_h_8bpc_neon, export=1
+ add w6, w6, #2 // w += 2
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ cbnz x4, 0f
+
+ // LR_HAVE_LEFT && left == NULL
+ sub x5, x5, #3
+ ld1 {v0.16b}, [x5], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v0.16b}, [x5], #16
+ ld1 {v1.s}[3], [x4], #4
+ // Move x5 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x5, x5, #3
+ ext v0.16b, v1.16b, v0.16b, #13
+ b 2f
+
+1:
+ ld1 {v0.16b}, [x5], #16
+ // !LR_HAVE_LEFT, fill v1 with the leftmost byte
+ // and shift v0 to have 3x the first byte at the front.
+ dup v1.16b, v0.b[0]
+ // Move x5 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub x5, x5, #3
+ ext v0.16b, v1.16b, v0.16b, #13
+
+2:
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load the byte to pad with now,
+ // since it is easy to locate from the current source pointer.
+ sub w13, w6, #(2 + 16 - 3 + 1)
+ ldr b30, [x5, w13, sxtw]
+ // Fill v30 with the right padding pixel
+ dup v30.16b, v30.b[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w6, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel x13, right_ext_mask, -1
+ sub x13, x13, w6, uxtw
+ ld1 {v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v29.16b
+
+ // Update the precalculated squares
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+
+4: // Loop horizontally
+ ext v16.16b, v0.16b, v0.16b, #1
+ ext v17.16b, v0.16b, v0.16b, #2
+ ext v19.16b, v0.16b, v0.16b, #4
+ ext v18.16b, v0.16b, v0.16b, #3
+ uaddl v3.8h, v16.8b, v17.8b
+ uaddl v24.8h, v0.8b, v19.8b
+ uaddw v3.8h, v3.8h, v18.8b
+
+ ext v16.16b, v1.16b, v2.16b, #2
+ ext v17.16b, v1.16b, v2.16b, #4
+ ext v19.16b, v1.16b, v2.16b, #8
+ ext v18.16b, v1.16b, v2.16b, #6
+
+ st1 {v3.8h}, [x1], #16
+ add v3.8h, v3.8h, v24.8h
+
+ uaddl v26.4s, v16.4h, v17.4h
+ uaddl2 v27.4s, v16.8h, v17.8h
+ uaddl v16.4s, v1.4h, v19.4h
+ uaddl2 v17.4s, v1.8h, v19.8h
+ uaddw v26.4s, v26.4s, v18.4h
+ uaddw2 v27.4s, v27.4s, v18.8h
+
+ st1 {v26.4s,v27.4s}, [x0], #32
+ add v26.4s, v26.4s, v16.4s
+ add v27.4s, v27.4s, v17.4s
+
+ subs w6, w6, #8
+
+ st1 {v3.8h}, [x3], #16
+ st1 {v26.4s,v27.4s}, [x2], #32
+
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8b}, [x5], #8
+ mov v1.16b, v2.16b
+ ext v0.16b, v0.16b, v3.16b, #8
+ umull v2.8h, v3.8b, v3.8b
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ ret
+endfunc
+
+sgr_funcs 8
diff --git a/third_party/dav1d/src/arm/64/looprestoration16.S b/third_party/dav1d/src/arm/64/looprestoration16.S
new file mode 100644
index 0000000000..3b76b1ee2a
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/looprestoration16.S
@@ -0,0 +1,1388 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+const right_ext_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+right_ext_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
+// const pixel (*left)[4], const pixel *lpf,
+// const int w, int h,
+// const int16_t filter[2][8],
+// const enum LrEdgeFlags edges,
+// const int bitdepth_max);
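+//
+// Arguments (AAPCS64): x0 = p, x1 = p_stride, x2 = left, x3 = lpf, w4 = w,
+// w5 = h, x6 = filter, w7 = edges; bitdepth_max is read from the stack below.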
+function wiener_filter7_16bpc_neon, export=1
+ ldr w8, [sp]
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-32]!
+ stp d8, d9, [sp, #16]
+ mov x29, sp
+ ld1 {v0.8h, v1.8h}, [x6]
+ tst w7, #4 // LR_HAVE_TOP
+ sub_sp 384*2*6
+
+ dup v28.8h, w8 // bitdepth_max
+ clz w8, w8
+ movi v30.4s, #1
+ sub w10, w8, #38 // -(bitdepth + 6)
+ sub w11, w8, #11 // round_bits_v
+ sub w8, w8, #25 // -round_bits_h
+ neg w10, w10 // bitdepth + 6
+ neg w11, w11 // -round_bits_v
+ dup v2.4s, w10
+ dup v29.4s, w8 // -round_bits_h
+ dup v27.4s, w11 // -round_bits_v
+ movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192
+ ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6)
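+        // E.g. for 10 bpc input (bitdepth_max == 0x3ff): clz == 22, so
+        // w10 == 16 == bitdepth + 6, w11 == -11 == -round_bits_v,
+        // w8 == -3 == -round_bits_h and v30 == 1 << 16.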
+
+ zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1
+
+ // x9 - t6
+ // x10 - t5
+ // x11 - t4
+ // x12 - t3
+ // x13 - t2
+ // x14 - t1
+ // x15 - t0
+ mov x14, sp // t1
+ b.eq L(no_top_7)
+
+ mov x16, x2 // backup left
+ mov x2, #0
+ bl wiener_filter7_h_16bpc_neon
+ add x3, x3, x1 // lpf += stride
+ mov x9, x14 // t6
+ mov x10, x14 // t5
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
+ mov x11, x14 // t4
+ add x14, x14, #384*2 // t1 += 384*2
+ mov x2, x16 // left
+ mov x16, x3 // backup lpf
+ mov x3, x0 // lpf = p
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ mov x13, x14 // t2
+ subs w5, w5, #1 // h--
+ b.eq L(v2_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x3, x3, x1 // src += stride
+
+L(main_7):
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+L(main_loop_7):
+ bl wiener_filter7_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_loop_7)
+ tst w7, #8 // LR_HAVE_BOTTOM
+ b.eq L(v3_7)
+
+ mov x3, x16 // restore lpf
+ mov x2, #0 // left = NULL
+ bl wiener_filter7_hv_16bpc_neon
+ bl wiener_filter7_hv_16bpc_neon
+L(v1_7):
+ bl wiener_filter7_v_16bpc_neon
+
+ mov sp, x29
+ ldp d8, d9, [sp, #16]
+ ldp x29, x30, [sp], #32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(no_top_7):
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
+ mov x3, x0 // lpf = p
+
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x9, x14 // t6
+ mov x10, x14 // t5
+ mov x11, x14 // t4
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_7)
+ add x3, x3, x1 // src += p_stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x13, x14 // t2
+ b.eq L(v2_7)
+ add x3, x3, x1 // src += p_stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x3, x3, x1 // src += p_stride
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+ bl wiener_filter7_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x15, x15, #384*2*4 // t0 += 384*2*4
+ bl wiener_filter7_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_7)
+L(v3_7):
+ bl wiener_filter7_v_16bpc_neon
+L(v2_7):
+ bl wiener_filter7_v_16bpc_neon
+ b L(v1_7)
+endfunc
+
+
+function wiener_filter7_h_16bpc_neon
+ stp x3, x4, [sp, #-32]!
+ str x14, [sp, #16]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #6
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ ld1 {v4.d}[1], [x2], #8
+ // Move x3 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v4.16b, v2.16b, #10
+ b 2f
+
+1:
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
+ // and shift v3 to have 3x the first pixel at the front.
+ dup v4.8h, v2.h[0]
+ // Move x3 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v4.16b, v2.16b, #10
+
+2:
+ ld1 {v4.8h}, [x3], #16
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #19
+ b.ge 4f // If w >= 19, all used input pixels are valid
+
+ // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
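+        // (E.g. for w == 8 the padding pixel is h[10], read from x3[-14].)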
+ sub w17, w4, #22
+ // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -6
+ ldr h26, [x3, w17, sxtw #1]
+ sub x6, x6, w4, uxtw #1
+ dup v26.8h, v26.h[0]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
+
+ bit v2.16b, v26.16b, v23.16b
+ bit v3.16b, v26.16b, v24.16b
+ bit v4.16b, v26.16b, v25.16b
+
+4: // Loop horizontally
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
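+        // The 7-tap filter is symmetric, so the pixel pairs equidistant from the
+        // centre tap are added first and each pair shares one coefficient
+        // (v0.h[0]-v0.h[2]), leaving one smull plus three smlal per vector half.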
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ ext v18.16b, v2.16b, v3.16b, #6
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v2.8h
+ smull v6.4s, v18.4h, v0.h[3]
+ smlal v6.4s, v19.4h, v0.h[2]
+ smlal v6.4s, v20.4h, v0.h[1]
+ smlal v6.4s, v21.4h, v0.h[0]
+ smull2 v7.4s, v18.8h, v0.h[3]
+ smlal2 v7.4s, v19.8h, v0.h[2]
+ smlal2 v7.4s, v20.8h, v0.h[1]
+ smlal2 v7.4s, v21.8h, v0.h[0]
+
+ ext v17.16b, v3.16b, v4.16b, #4
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #10
+ ext v21.16b, v3.16b, v4.16b, #12
+ ext v18.16b, v3.16b, v4.16b, #6
+
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v3.8h
+ smull v16.4s, v18.4h, v0.h[3]
+ smlal v16.4s, v19.4h, v0.h[2]
+ smlal v16.4s, v20.4h, v0.h[1]
+ smlal v16.4s, v21.4h, v0.h[0]
+ smull2 v17.4s, v18.8h, v0.h[3]
+ smlal2 v17.4s, v19.8h, v0.h[2]
+ smlal2 v17.4s, v20.8h, v0.h[1]
+ smlal2 v17.4s, v21.8h, v0.h[0]
+
+ mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ add v16.4s, v16.4s, v30.4s
+ add v17.4s, v17.4s, v30.4s
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ srshl v16.4s, v16.4s, v29.4s
+ srshl v17.4s, v17.4s, v29.4s
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v6.8h, v7.4s
+ sqxtun v7.4h, v16.4s
+ sqxtun2 v7.8h, v17.4s
+ umin v6.8h, v6.8h, v24.8h
+ umin v7.8h, v7.8h, v24.8h
+ sub v6.8h, v6.8h, v31.8h
+ sub v7.8h, v7.8h, v31.8h
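+        // v6/v7 now hold the horizontal intermediates,
+        // clip((sum + (1 << (bitdepth + 6))) >> round_bits_h, 0, 0x7fff) - 8192
+        // (with rounding on the shift), stored as signed 16 bit for the vertical pass.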
+
+ subs w4, w4, #16
+
+ st1 {v6.8h, v7.8h}, [x14], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8h, v4.8h}, [x3], #32
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldr x14, [sp, #16]
+ ldp x3, x4, [sp], #32
+ ret
+endfunc
+
+function wiener_filter7_v_16bpc_neon
+ // Backing up/restoring registers shifted, so that x9 gets the value
+ // of x10, etc, afterwards.
+ stp x10, x11, [sp, #-64]!
+ stp x12, x13, [sp, #16]
+ stp x14, x14, [sp, #32]
+ stp x0, x4, [sp, #48]
+1:
+ ld1 {v16.8h, v17.8h}, [x9], #32
+ ld1 {v18.8h, v19.8h}, [x10], #32
+ ld1 {v20.8h, v21.8h}, [x11], #32
+ ld1 {v22.8h, v23.8h}, [x12], #32
+ ld1 {v24.8h, v25.8h}, [x13], #32
+ ld1 {v6.8h, v7.8h}, [x14], #32
+
+ smull v2.4s, v16.4h, v0.h[4]
+ smlal v2.4s, v18.4h, v0.h[5]
+ smlal v2.4s, v20.4h, v0.h[6]
+ smlal v2.4s, v22.4h, v0.h[7]
+ smlal v2.4s, v24.4h, v0.h[6]
+ smlal v2.4s, v6.4h, v0.h[5]
+ smlal v2.4s, v6.4h, v0.h[4]
+ smull2 v3.4s, v16.8h, v0.h[4]
+ smlal2 v3.4s, v18.8h, v0.h[5]
+ smlal2 v3.4s, v20.8h, v0.h[6]
+ smlal2 v3.4s, v22.8h, v0.h[7]
+ smlal2 v3.4s, v24.8h, v0.h[6]
+ smlal2 v3.4s, v6.8h, v0.h[5]
+ smlal2 v3.4s, v6.8h, v0.h[4]
+ smull v4.4s, v17.4h, v0.h[4]
+ smlal v4.4s, v19.4h, v0.h[5]
+ smlal v4.4s, v21.4h, v0.h[6]
+ smlal v4.4s, v23.4h, v0.h[7]
+ smlal v4.4s, v25.4h, v0.h[6]
+ smlal v4.4s, v7.4h, v0.h[5]
+ smlal v4.4s, v7.4h, v0.h[4]
+ smull2 v5.4s, v17.8h, v0.h[4]
+ smlal2 v5.4s, v19.8h, v0.h[5]
+ smlal2 v5.4s, v21.8h, v0.h[6]
+ smlal2 v5.4s, v23.8h, v0.h[7]
+ smlal2 v5.4s, v25.8h, v0.h[6]
+ smlal2 v5.4s, v7.8h, v0.h[5]
+ smlal2 v5.4s, v7.8h, v0.h[4]
+ srshl v2.4s, v2.4s, v27.4s // -round_bits_v
+ srshl v3.4s, v3.4s, v27.4s
+ srshl v4.4s, v4.4s, v27.4s
+ srshl v5.4s, v5.4s, v27.4s
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ sqxtun v3.4h, v4.4s
+ sqxtun2 v3.8h, v5.4s
+ umin v2.8h, v2.8h, v28.8h // bitdepth_max
+ umin v3.8h, v3.8h, v28.8h
+ subs w4, w4, #16
+ st1 {v2.8h, v3.8h}, [x0], #32
+ b.gt 1b
+
+ ldp x0, x4, [sp, #48]
+ ldp x13, x14, [sp, #32]
+ ldp x11, x12, [sp, #16]
+ ldp x9, x10, [sp], #64
+
+ add x0, x0, x1
+ ret
+endfunc
+
+function wiener_filter7_hv_16bpc_neon
+ // Backing up/restoring registers shifted, so that x9 gets the value
+ // of x10, etc, and x15==x9, afterwards.
+ stp x10, x11, [sp, #-80]!
+ stp x12, x13, [sp, #16]
+ stp x14, x15, [sp, #32]
+ stp x10, x0, [sp, #48]
+ stp x3, x4, [sp, #64]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #6
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ ld1 {v4.d}[1], [x2], #8
+ // Move x3 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v4.16b, v2.16b, #10
+ b 2f
+1:
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
+ // and shift v3 to have 3x the first pixel at the front.
+ dup v4.8h, v2.h[0]
+ // Move x3 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v4.16b, v2.16b, #10
+
+2:
+ ld1 {v4.8h}, [x3], #16
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #19
+ b.ge 4f // If w >= 19, all used input pixels are valid
+
+ // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
+ sub w17, w4, #22
+ // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -6
+ ldr h26, [x3, w17, sxtw #1]
+ sub x6, x6, w4, uxtw #1
+ dup v26.8h, v26.h[0]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
+
+ bit v2.16b, v26.16b, v23.16b
+ bit v3.16b, v26.16b, v24.16b
+ bit v4.16b, v26.16b, v25.16b
+
+4: // Loop horizontally
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ ext v18.16b, v2.16b, v3.16b, #6
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v2.8h
+ smull v6.4s, v18.4h, v0.h[3]
+ smlal v6.4s, v19.4h, v0.h[2]
+ smlal v6.4s, v20.4h, v0.h[1]
+ smlal v6.4s, v21.4h, v0.h[0]
+ smull2 v7.4s, v18.8h, v0.h[3]
+ smlal2 v7.4s, v19.8h, v0.h[2]
+ smlal2 v7.4s, v20.8h, v0.h[1]
+ smlal2 v7.4s, v21.8h, v0.h[0]
+
+ ext v17.16b, v3.16b, v4.16b, #4
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #10
+ ext v21.16b, v3.16b, v4.16b, #12
+ ext v18.16b, v3.16b, v4.16b, #6
+
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v3.8h
+ smull v24.4s, v18.4h, v0.h[3]
+ smlal v24.4s, v19.4h, v0.h[2]
+ smlal v24.4s, v20.4h, v0.h[1]
+ smlal v24.4s, v21.4h, v0.h[0]
+ smull2 v25.4s, v18.8h, v0.h[3]
+ smlal2 v25.4s, v19.8h, v0.h[2]
+ smlal2 v25.4s, v20.8h, v0.h[1]
+ smlal2 v25.4s, v21.8h, v0.h[0]
+
+ ld1 {v16.8h, v17.8h}, [x9], #32
+
+ mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ add v24.4s, v24.4s, v30.4s
+ add v25.4s, v25.4s, v30.4s
+ ld1 {v18.8h, v19.8h}, [x10], #32
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ srshl v24.4s, v24.4s, v29.4s
+ srshl v25.4s, v25.4s, v29.4s
+ ld1 {v20.8h, v21.8h}, [x11], #32
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v6.8h, v7.4s
+ sqxtun v7.4h, v24.4s
+ sqxtun2 v7.8h, v25.4s
+ ld1 {v22.8h, v23.8h}, [x12], #32
+ umin v6.8h, v6.8h, v26.8h
+ umin v7.8h, v7.8h, v26.8h
+ ld1 {v24.8h, v25.8h}, [x13], #32
+ sub v6.8h, v6.8h, v31.8h
+ sub v7.8h, v7.8h, v31.8h
+
+ ld1 {v8.8h, v9.8h}, [x14], #32
+
+ smull v1.4s, v16.4h, v0.h[4]
+ smlal v1.4s, v18.4h, v0.h[5]
+ smlal v1.4s, v20.4h, v0.h[6]
+ smlal v1.4s, v22.4h, v0.h[7]
+ smlal v1.4s, v24.4h, v0.h[6]
+ smlal v1.4s, v8.4h, v0.h[5]
+ smlal v1.4s, v6.4h, v0.h[4]
+ smull2 v5.4s, v16.8h, v0.h[4]
+ smlal2 v5.4s, v18.8h, v0.h[5]
+ smlal2 v5.4s, v20.8h, v0.h[6]
+ smlal2 v5.4s, v22.8h, v0.h[7]
+ smlal2 v5.4s, v24.8h, v0.h[6]
+ smlal2 v5.4s, v8.8h, v0.h[5]
+ smlal2 v5.4s, v6.8h, v0.h[4]
+ smull v26.4s, v17.4h, v0.h[4]
+ smlal v26.4s, v19.4h, v0.h[5]
+ smlal v26.4s, v21.4h, v0.h[6]
+ smlal v26.4s, v23.4h, v0.h[7]
+ smlal v26.4s, v25.4h, v0.h[6]
+ smlal v26.4s, v9.4h, v0.h[5]
+ smlal v26.4s, v7.4h, v0.h[4]
+ smull2 v16.4s, v17.8h, v0.h[4]
+ smlal2 v16.4s, v19.8h, v0.h[5]
+ smlal2 v16.4s, v21.8h, v0.h[6]
+ smlal2 v16.4s, v23.8h, v0.h[7]
+ smlal2 v16.4s, v25.8h, v0.h[6]
+ smlal2 v16.4s, v9.8h, v0.h[5]
+ smlal2 v16.4s, v7.8h, v0.h[4]
+ srshl v1.4s, v1.4s, v27.4s // -round_bits_v
+ srshl v5.4s, v5.4s, v27.4s
+ srshl v26.4s, v26.4s, v27.4s
+ srshl v16.4s, v16.4s, v27.4s
+ sqxtun v18.4h, v1.4s
+ sqxtun2 v18.8h, v5.4s
+ sqxtun v19.4h, v26.4s
+ sqxtun2 v19.8h, v16.4s
+ st1 {v6.8h, v7.8h}, [x15], #32
+ umin v18.8h, v18.8h, v28.8h // bitdepth_max
+ umin v19.8h, v19.8h, v28.8h
+ subs w4, w4, #16
+
+ st1 {v18.8h, v19.8h}, [x0], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8h, v4.8h}, [x3], #32
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldp x3, x4, [sp, #64]
+ ldp x15, x0, [sp, #48]
+ ldp x13, x14, [sp, #32]
+ ldp x11, x12, [sp, #16]
+ ldp x9, x10, [sp], #80
+
+ add x3, x3, x1
+ add x0, x0, x1
+
+ ret
+endfunc
+
+// void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
+// const pixel (*left)[4], const pixel *lpf,
+// const int w, int h,
+// const int16_t filter[2][8],
+// const enum LrEdgeFlags edges,
+// const int bitdepth_max);
+function wiener_filter5_16bpc_neon, export=1
+ ldr w8, [sp]
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-32]!
+ stp d8, d9, [sp, #16]
+ mov x29, sp
+ ld1 {v0.8h, v1.8h}, [x6]
+ tst w7, #4 // LR_HAVE_TOP
+ sub_sp 384*2*4
+
+ dup v28.8h, w8 // bitdepth_max
+ clz w8, w8
+ movi v30.4s, #1
+ sub w10, w8, #38 // -(bitdepth + 6)
+ sub w11, w8, #11 // round_bits_v
+ sub w8, w8, #25 // -round_bits_h
+ neg w10, w10 // bitdepth + 6
+ neg w11, w11 // -round_bits_v
+ dup v2.4s, w10
+ dup v29.4s, w8 // -round_bits_h
+ dup v27.4s, w11 // -round_bits_v
+ movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192
+ ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6)
+
+ zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1
+
+ // x11 - t4
+ // x12 - t3
+ // x13 - t2
+ // x14 - t1
+ // x15 - t0
+ mov x14, sp // t1
+ b.eq L(no_top_5)
+
+ mov x16, x2 // backup left
+ mov x2, #0
+ bl wiener_filter5_h_16bpc_neon
+ add x3, x3, x1 // lpf += stride
+ mov x11, x14 // t4
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_16bpc_neon
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
+ mov x12, x14 // t3
+ add x14, x14, #384*2 // t1 += 384*2
+ mov x2, x16 // left
+ mov x16, x3 // backup lpf
+ mov x3, x0 // lpf = p
+ bl wiener_filter5_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x13, x14 // t2
+ b.eq L(v1_5)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x3, x3, x1 // src += stride
+
+L(main_5):
+ mov x15, x11 // t0 = t4
+L(main_loop_5):
+ bl wiener_filter5_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_loop_5)
+ tst w7, #8 // LR_HAVE_BOTTOM
+ b.eq L(v2_5)
+
+ mov x3, x16 // restore lpf
+ mov x2, #0 // left = NULL
+ bl wiener_filter5_hv_16bpc_neon
+ bl wiener_filter5_hv_16bpc_neon
+L(end_5):
+
+ mov sp, x29
+ ldp d8, d9, [sp, #16]
+ ldp x29, x30, [sp], #32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(no_top_5):
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
+ mov x3, x0 // lpf = p
+
+ bl wiener_filter5_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x11, x14 // t4
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_5)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x3, x3, x1 // src += stride
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+ bl wiener_filter5_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x15, x15, #384*2*3 // t0 += 384*2*3
+ bl wiener_filter5_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_5)
+L(v2_5):
+ bl wiener_filter5_v_16bpc_neon
+ add x0, x0, x1
+ mov x11, x12
+ mov x12, x13
+ mov x13, x14
+L(v1_5):
+ bl wiener_filter5_v_16bpc_neon
+ b L(end_5)
+endfunc
+
+
+function wiener_filter5_h_16bpc_neon
+ stp x3, x4, [sp, #-32]!
+ str x14, [sp, #16]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #4
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ ld1 {v4.d}[1], [x2], #8
+ // Move x3 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #4
+ ext v3.16b, v2.16b, v3.16b, #12
+ ext v2.16b, v4.16b, v2.16b, #12
+ b 2f
+
+1:
+ ld1 {v2.8h, v3.8h}, [x3], #32
+        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
+        // and shift v3 to have 2x the first pixel at the front.
+ dup v4.8h, v2.h[0]
+ // Move x3 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #4
+ ext v3.16b, v2.16b, v3.16b, #12
+ ext v2.16b, v4.16b, v2.16b, #12
+
+2:
+ ld1 {v4.8h}, [x3], #16
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #18
+ b.ge 4f // If w >= 18, all used input pixels are valid
+
+ // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
+ sub w17, w4, #23
+ // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -4
+ ldr h26, [x3, w17, sxtw #1]
+ sub x6, x6, w4, uxtw #1
+ dup v26.8h, v26.h[0]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
+
+ bit v2.16b, v26.16b, v23.16b
+ bit v3.16b, v26.16b, v24.16b
+ bit v4.16b, v26.16b, v25.16b
+
+4: // Loop horizontally
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v17.16b, v2.16b, v3.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v2.8h
+ smull v6.4s, v17.4h, v0.h[3]
+ smlal v6.4s, v18.4h, v0.h[2]
+ smlal v6.4s, v19.4h, v0.h[1]
+ smull2 v7.4s, v17.8h, v0.h[3]
+ smlal2 v7.4s, v18.8h, v0.h[2]
+ smlal2 v7.4s, v19.8h, v0.h[1]
+
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v18.16b, v3.16b, v4.16b, #6
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v17.16b, v3.16b, v4.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v3.8h
+ smull v16.4s, v17.4h, v0.h[3]
+ smlal v16.4s, v18.4h, v0.h[2]
+ smlal v16.4s, v19.4h, v0.h[1]
+ smull2 v17.4s, v17.8h, v0.h[3]
+ smlal2 v17.4s, v18.8h, v0.h[2]
+ smlal2 v17.4s, v19.8h, v0.h[1]
+
+ mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ add v16.4s, v16.4s, v30.4s
+ add v17.4s, v17.4s, v30.4s
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ srshl v16.4s, v16.4s, v29.4s
+ srshl v17.4s, v17.4s, v29.4s
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v6.8h, v7.4s
+ sqxtun v7.4h, v16.4s
+ sqxtun2 v7.8h, v17.4s
+ umin v6.8h, v6.8h, v24.8h
+ umin v7.8h, v7.8h, v24.8h
+ sub v6.8h, v6.8h, v31.8h
+ sub v7.8h, v7.8h, v31.8h
+
+ subs w4, w4, #16
+
+ st1 {v6.8h, v7.8h}, [x14], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8h, v4.8h}, [x3], #32
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldr x14, [sp, #16]
+ ldp x3, x4, [sp], #32
+ ret
+endfunc
+
+function wiener_filter5_v_16bpc_neon
+ stp x11, x12, [sp, #-48]!
+ stp x13, x14, [sp, #16]
+ stp x0, x4, [sp, #32]
+1:
+ ld1 {v16.8h, v17.8h}, [x11], #32
+ ld1 {v18.8h, v19.8h}, [x12], #32
+ ld1 {v20.8h, v21.8h}, [x13], #32
+ ld1 {v22.8h, v23.8h}, [x14], #32
+
+ smull v2.4s, v16.4h, v0.h[5]
+ smlal v2.4s, v18.4h, v0.h[6]
+ smlal v2.4s, v20.4h, v0.h[7]
+ smlal v2.4s, v22.4h, v0.h[6]
+ smlal v2.4s, v22.4h, v0.h[5]
+ smull2 v3.4s, v16.8h, v0.h[5]
+ smlal2 v3.4s, v18.8h, v0.h[6]
+ smlal2 v3.4s, v20.8h, v0.h[7]
+ smlal2 v3.4s, v22.8h, v0.h[6]
+ smlal2 v3.4s, v22.8h, v0.h[5]
+ smull v4.4s, v17.4h, v0.h[5]
+ smlal v4.4s, v19.4h, v0.h[6]
+ smlal v4.4s, v21.4h, v0.h[7]
+ smlal v4.4s, v23.4h, v0.h[6]
+ smlal v4.4s, v23.4h, v0.h[5]
+ smull2 v5.4s, v17.8h, v0.h[5]
+ smlal2 v5.4s, v19.8h, v0.h[6]
+ smlal2 v5.4s, v21.8h, v0.h[7]
+ smlal2 v5.4s, v23.8h, v0.h[6]
+ smlal2 v5.4s, v23.8h, v0.h[5]
+ srshl v2.4s, v2.4s, v27.4s // -round_bits_v
+ srshl v3.4s, v3.4s, v27.4s
+ srshl v4.4s, v4.4s, v27.4s
+ srshl v5.4s, v5.4s, v27.4s
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ sqxtun v3.4h, v4.4s
+ sqxtun2 v3.8h, v5.4s
+ umin v2.8h, v2.8h, v28.8h // bitdepth_max
+ umin v3.8h, v3.8h, v28.8h
+
+ subs w4, w4, #16
+ st1 {v2.8h, v3.8h}, [x0], #32
+ b.gt 1b
+
+ ldp x0, x4, [sp, #32]
+ ldp x13, x14, [sp, #16]
+ ldp x11, x12, [sp], #48
+
+ ret
+endfunc
+
+function wiener_filter5_hv_16bpc_neon
+ // Backing up/restoring registers shifted, so that x11 gets the value
+ // of x12, etc, and x15==x11, afterwards.
+ stp x12, x13, [sp, #-64]!
+ stp x14, x15, [sp, #16]
+ stp x12, x0, [sp, #32]
+ stp x3, x4, [sp, #48]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #4
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ ld1 {v4.d}[1], [x2], #8
+ // Move x3 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #4
+ ext v3.16b, v2.16b, v3.16b, #12
+ ext v2.16b, v4.16b, v2.16b, #12
+ b 2f
+1:
+ ld1 {v2.8h, v3.8h}, [x3], #32
+        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
+ // and shift v3 to have 2x the first pixel at the front.
+ dup v4.8h, v2.h[0]
+ // Move x3 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #4
+ ext v3.16b, v2.16b, v3.16b, #12
+ ext v2.16b, v4.16b, v2.16b, #12
+
+2:
+ ld1 {v4.8h}, [x3], #16
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #18
+ b.ge 4f // If w >= 18, all used input pixels are valid
+
+ // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
+ sub w17, w4, #23
+ // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -4
+ ldr h26, [x3, w17, sxtw #1]
+ sub x6, x6, w4, uxtw #1
+ dup v26.8h, v26.h[0]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
+
+ bit v2.16b, v26.16b, v23.16b
+ bit v3.16b, v26.16b, v24.16b
+ bit v4.16b, v26.16b, v25.16b
+
+4: // Loop horizontally
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v17.16b, v2.16b, v3.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v2.8h
+ smull v6.4s, v17.4h, v0.h[3]
+ smlal v6.4s, v18.4h, v0.h[2]
+ smlal v6.4s, v19.4h, v0.h[1]
+ smull2 v7.4s, v17.8h, v0.h[3]
+ smlal2 v7.4s, v18.8h, v0.h[2]
+ smlal2 v7.4s, v19.8h, v0.h[1]
+
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v18.16b, v3.16b, v4.16b, #6
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v17.16b, v3.16b, v4.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v3.8h
+ smull v24.4s, v17.4h, v0.h[3]
+ smlal v24.4s, v18.4h, v0.h[2]
+ smlal v24.4s, v19.4h, v0.h[1]
+ smull2 v25.4s, v17.8h, v0.h[3]
+ smlal2 v25.4s, v18.8h, v0.h[2]
+ smlal2 v25.4s, v19.8h, v0.h[1]
+
+ ld1 {v16.8h, v17.8h}, [x11], #32
+ mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ add v24.4s, v24.4s, v30.4s
+ add v25.4s, v25.4s, v30.4s
+ ld1 {v18.8h, v19.8h}, [x12], #32
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ srshl v24.4s, v24.4s, v29.4s
+ srshl v25.4s, v25.4s, v29.4s
+ ld1 {v20.8h, v21.8h}, [x13], #32
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v6.8h, v7.4s
+ sqxtun v7.4h, v24.4s
+ sqxtun2 v7.8h, v25.4s
+ ld1 {v22.8h, v23.8h}, [x14], #32
+ umin v6.8h, v6.8h, v26.8h
+ umin v7.8h, v7.8h, v26.8h
+ sub v6.8h, v6.8h, v31.8h
+ sub v7.8h, v7.8h, v31.8h
+
+ smull v8.4s, v16.4h, v0.h[5]
+ smlal v8.4s, v18.4h, v0.h[6]
+ smlal v8.4s, v20.4h, v0.h[7]
+ smlal v8.4s, v22.4h, v0.h[6]
+ smlal v8.4s, v6.4h, v0.h[5]
+ smull2 v9.4s, v16.8h, v0.h[5]
+ smlal2 v9.4s, v18.8h, v0.h[6]
+ smlal2 v9.4s, v20.8h, v0.h[7]
+ smlal2 v9.4s, v22.8h, v0.h[6]
+ smlal2 v9.4s, v6.8h, v0.h[5]
+ smull v1.4s, v17.4h, v0.h[5]
+ smlal v1.4s, v19.4h, v0.h[6]
+ smlal v1.4s, v21.4h, v0.h[7]
+ smlal v1.4s, v23.4h, v0.h[6]
+ smlal v1.4s, v7.4h, v0.h[5]
+ smull2 v5.4s, v17.8h, v0.h[5]
+ smlal2 v5.4s, v19.8h, v0.h[6]
+ smlal2 v5.4s, v21.8h, v0.h[7]
+ smlal2 v5.4s, v23.8h, v0.h[6]
+ smlal2 v5.4s, v7.8h, v0.h[5]
+ srshl v8.4s, v8.4s, v27.4s // -round_bits_v
+ srshl v9.4s, v9.4s, v27.4s
+ srshl v1.4s, v1.4s, v27.4s
+ srshl v5.4s, v5.4s, v27.4s
+ sqxtun v8.4h, v8.4s
+ sqxtun2 v8.8h, v9.4s
+ sqxtun v9.4h, v1.4s
+ sqxtun2 v9.8h, v5.4s
+ st1 {v6.8h, v7.8h}, [x15], #32
+ umin v8.8h, v8.8h, v28.8h // bitdepth_max
+ umin v9.8h, v9.8h, v28.8h
+
+ subs w4, w4, #16
+
+ st1 {v8.8h, v9.8h}, [x0], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8h, v4.8h}, [x3], #32
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldp x3, x4, [sp, #48]
+ ldp x15, x0, [sp, #32]
+ ldp x13, x14, [sp, #16]
+ ldp x11, x12, [sp], #64
+
+ add x3, x3, x1
+ add x0, x0, x1
+
+ ret
+endfunc
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const int w,
+// const enum LrEdgeFlags edges);
+function sgr_box3_row_h_16bpc_neon, export=1
+ add w4, w4, #2 // w += 2
+
+ tst w5, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ cbnz x2, 0f
+
+ // LR_HAVE_LEFT && left == NULL
+ sub x3, x3, #4
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ ld1 {v2.d}[1], [x2]
+ // Move x3 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #4
+ ext v1.16b, v0.16b, v1.16b, #12
+ ext v0.16b, v2.16b, v0.16b, #12
+ b 2f
+
+1:
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
+ // and shift v0/v1 to have 2x the first pixel at the front.
+ dup v2.8h, v0.h[0]
+ // Move x3 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #4
+ ext v1.16b, v0.16b, v1.16b, #12
+ ext v0.16b, v2.16b, v0.16b, #12
+
+2:
+ tst w5, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w4, #(2 + 16 - 2 + 1)
+ ldr h30, [x3, w13, sxtw #1]
+ // Fill v30 with the right padding pixel
+ dup v30.8h, v30.h[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #10
+ b.ge 4f // If w >= 10, all used input pixels are valid
+
+ // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
+ // again; it's not strictly needed in those cases (we pad enough here),
+ // but keeping the code as simple as possible.
+
+        // Insert padding in v0.h[w] onwards
+ movrel x13, right_ext_mask
+ sub x13, x13, w4, uxtw #1
+ ld1 {v28.16b, v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v28.16b
+ bit v1.16b, v30.16b, v29.16b
+
+4: // Loop horizontally
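+        // Per output x: sum[x]   = src[x] + src[x+1] + src[x+2]
+        //               sumsq[x] = src[x]^2 + src[x+1]^2 + src[x+2]^2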
+ ext v26.16b, v0.16b, v1.16b, #2
+ ext v27.16b, v0.16b, v1.16b, #4
+
+ add v6.8h, v0.8h, v26.8h
+ umull v22.4s, v0.4h, v0.4h
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v6.8h, v6.8h, v27.8h
+ umull2 v23.4s, v0.8h, v0.8h
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
+
+ subs w4, w4, #8
+
+ st1 {v6.8h}, [x1], #16
+ st1 {v22.4s,v23.4s}, [x0], #32
+
+ b.le 9f
+ tst w5, #2 // LR_HAVE_RIGHT
+ mov v0.16b, v1.16b
+ ld1 {v1.8h}, [x3], #16
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ ret
+endfunc
+
+// void dav1d_sgr_box5_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const int w,
+// const enum LrEdgeFlags edges);
+function sgr_box5_row_h_16bpc_neon, export=1
+ add w4, w4, #2 // w += 2
+
+ tst w5, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ cbnz x2, 0f
+
+ // LR_HAVE_LEFT && left == NULL
+ sub x3, x3, #6
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ ld1 {v2.d}[1], [x2], #8
+ // Move x3 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #6
+ ext v1.16b, v0.16b, v1.16b, #10
+ ext v0.16b, v2.16b, v0.16b, #10
+ b 2f
+
+1:
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
+ // and shift v0/v1 to have 3x the first pixel at the front.
+ dup v2.8h, v0.h[0]
+ // Move x3 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #6
+ ext v1.16b, v0.16b, v1.16b, #10
+ ext v0.16b, v2.16b, v0.16b, #10
+
+2:
+ tst w5, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w4, #(2 + 16 - 3 + 1)
+ ldr h30, [x3, w13, sxtw #1]
+ // Fill v30 with the right padding pixel
+ dup v30.8h, v30.h[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+        // Insert padding in v0.h[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel x13, right_ext_mask, -1
+ sub x13, x13, w4, uxtw #1
+ ld1 {v28.16b, v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v28.16b
+ bit v1.16b, v30.16b, v29.16b
+
+4: // Loop horizontally
+ ext v26.16b, v0.16b, v1.16b, #2
+ ext v27.16b, v0.16b, v1.16b, #4
+
+ add v6.8h, v0.8h, v26.8h
+ umull v22.4s, v0.4h, v0.4h
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v6.8h, v6.8h, v27.8h
+ umull2 v23.4s, v0.8h, v0.8h
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
+
+ ext v26.16b, v0.16b, v1.16b, #6
+ ext v27.16b, v0.16b, v1.16b, #8
+
+ add v6.8h, v6.8h, v26.8h
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v6.8h, v6.8h, v27.8h
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
+
+ subs w4, w4, #8
+
+ st1 {v6.8h}, [x1], #16
+ st1 {v22.4s,v23.4s}, [x0], #32
+
+ b.le 9f
+ tst w5, #2 // LR_HAVE_RIGHT
+ mov v0.16b, v1.16b
+ ld1 {v1.8h}, [x3], #16
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ ret
+endfunc
+
+// void dav1d_sgr_box35_row_h_16bpc_neon(int32_t *sumsq3, int16_t *sum3,
+// int32_t *sumsq5, int16_t *sum5,
+// const pixel (*left)[4],
+// const pixel *src, const int w,
+// const enum LrEdgeFlags edges);
+function sgr_box35_row_h_16bpc_neon, export=1
+ add w6, w6, #2 // w += 2
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ cbnz x4, 0f
+
+ // LR_HAVE_LEFT && left == NULL
+ sub x5, x5, #6
+ ld1 {v0.8h, v1.8h}, [x5], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v0.8h, v1.8h}, [x5], #32
+ ld1 {v2.d}[1], [x4], #8
+        // Move x5 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x5, x5, #6
+ ext v1.16b, v0.16b, v1.16b, #10
+ ext v0.16b, v2.16b, v0.16b, #10
+ b 2f
+
+1:
+ ld1 {v0.8h, v1.8h}, [x5], #32
+ // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
+ // and shift v0/v1 to have 3x the first pixel at the front.
+ dup v2.8h, v0.h[0]
+ // Move x5 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub x5, x5, #6
+ ext v1.16b, v0.16b, v1.16b, #10
+ ext v0.16b, v2.16b, v0.16b, #10
+
+2:
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w6, #(2 + 16 - 3 + 1)
+ ldr h30, [x5, w13, sxtw #1]
+ // Fill v30 with the right padding pixel
+ dup v30.8h, v30.h[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w6, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+        // Insert padding in v0.h[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel x13, right_ext_mask, -1
+ sub x13, x13, w6, uxtw #1
+ ld1 {v28.16b, v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v28.16b
+ bit v1.16b, v30.16b, v29.16b
+
+4: // Loop horizontally
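+        // The 3-tap sums over src[x+1..x+3] are stored to sumsq3/sum3, then
+        // src[x] and src[x+4] are added to extend them into the 5-tap sums
+        // stored to sumsq5/sum5.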
+ ext v16.16b, v0.16b, v1.16b, #2
+ ext v17.16b, v0.16b, v1.16b, #4
+ ext v19.16b, v0.16b, v1.16b, #8
+ ext v18.16b, v0.16b, v1.16b, #6
+
+ add v20.8h, v16.8h, v17.8h
+ add v21.8h, v0.8h, v19.8h
+ add v20.8h, v20.8h, v18.8h
+
+ umull v22.4s, v16.4h, v16.4h
+ umlal v22.4s, v17.4h, v17.4h
+ umlal v22.4s, v18.4h, v18.4h
+
+ umull2 v23.4s, v16.8h, v16.8h
+ umlal2 v23.4s, v17.8h, v17.8h
+ umlal2 v23.4s, v18.8h, v18.8h
+
+ add v21.8h, v21.8h, v20.8h
+ st1 {v20.8h}, [x1], #16
+ st1 {v22.4s,v23.4s}, [x0], #32
+
+ umlal v22.4s, v0.4h, v0.4h
+ umlal v22.4s, v19.4h, v19.4h
+
+ umlal2 v23.4s, v0.8h, v0.8h
+ umlal2 v23.4s, v19.8h, v19.8h
+
+ subs w6, w6, #8
+
+ st1 {v21.8h}, [x3], #16
+ st1 {v22.4s,v23.4s}, [x2], #32
+
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ mov v0.16b, v1.16b
+ ld1 {v1.8h}, [x5], #16
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ ret
+endfunc
+
+sgr_funcs 16
diff --git a/third_party/dav1d/src/arm/64/looprestoration_common.S b/third_party/dav1d/src/arm/64/looprestoration_common.S
new file mode 100644
index 0000000000..745f6c20f4
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/looprestoration_common.S
@@ -0,0 +1,272 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
+// int32_t *AA, int16_t *BB,
+// const int w, const int s,
+// const int bitdepth_max);
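+//
+// Sums three precomputed rows of sumsq/sum vertically and converts each lane to
+// the selfguided filter coefficients: with a the summed sumsq and b the summed
+// sum (normalized by the bitdepth), p = max(a*n - b*b, 0),
+// z ~= min(p*s >> 20, 255) via the saturating narrows, x = sgr_x_by_x[z];
+// the outputs are AA[i] = (x * b * one_by_x + (1 << 11)) >> 12 and BB[i] = 256 - x.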
+function sgr_box3_vert_neon, export=1
+ stp d8, d9, [sp, #-0x30]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+
+ add w4, w4, #2
+ clz w9, w6 // bitdepth_max
+ dup v28.4s, w5 // strength
+
+ ldp x5, x6, [x0]
+ ldr x0, [x0, #16]
+ ldp x7, x8, [x1]
+ ldr x1, [x1, #16]
+
+ movi v31.4s, #9 // n
+
+ sub w9, w9, #24 // -bitdepth_min_8
+ movrel x12, X(sgr_x_by_x)
+ mov w13, #455 // one_by_x
+ ld1 {v16.16b, v17.16b, v18.16b}, [x12]
+ dup v6.8h, w9 // -bitdepth_min_8
+ movi v19.16b, #5
+ movi v20.8b, #55 // idx of last 5
+ movi v21.8b, #72 // idx of last 4
+ movi v22.8b, #101 // idx of last 3
+ movi v23.8b, #169 // idx of last 2
+ movi v24.8b, #254 // idx of last 1
+ saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
+ movi v29.8h, #1, lsl #8
+ dup v30.4s, w13 // one_by_x
+
+ sub v16.16b, v16.16b, v19.16b
+ sub v17.16b, v17.16b, v19.16b
+ sub v18.16b, v18.16b, v19.16b
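+        // tbl below can only index the first 48 sgr_x_by_x entries (biased by -5
+        // above); for larger indices it returns 0 and the cmhi comparisons against
+        // the "idx of last N" thresholds rebuild the value as 5 minus the number
+        // of thresholds exceeded.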
+
+ ld1 {v8.4s, v9.4s}, [x5], #32
+ ld1 {v10.4s, v11.4s}, [x6], #32
+ ld1 {v12.8h}, [x7], #16
+ ld1 {v13.8h}, [x8], #16
+ ld1 {v0.4s, v1.4s}, [x0], #32
+ ld1 {v2.8h}, [x1], #16
+1:
+
+ add v8.4s, v8.4s, v10.4s
+ add v9.4s, v9.4s, v11.4s
+
+ add v12.8h, v12.8h, v13.8h
+
+ subs w4, w4, #8
+ add v0.4s, v0.4s, v8.4s
+ add v1.4s, v1.4s, v9.4s
+ add v2.8h, v2.8h, v12.8h
+
+ srshl v0.4s, v0.4s, v7.4s
+ srshl v1.4s, v1.4s, v7.4s
+ srshl v4.8h, v2.8h, v6.8h
+ mul v0.4s, v0.4s, v31.4s // a * n
+ mul v1.4s, v1.4s, v31.4s // a * n
+ umull v3.4s, v4.4h, v4.4h // b * b
+ umull2 v4.4s, v4.8h, v4.8h // b * b
+ uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
+ uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
+ mul v0.4s, v0.4s, v28.4s // p * s
+ mul v1.4s, v1.4s, v28.4s // p * s
+ ld1 {v8.4s, v9.4s}, [x5], #32
+ uqshrn v0.4h, v0.4s, #16
+ uqshrn2 v0.8h, v1.4s, #16
+ ld1 {v10.4s, v11.4s}, [x6], #32
+ uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
+
+ ld1 {v12.8h}, [x7], #16
+
+ cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
+ cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
+ tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
+ cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
+ cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
+ add v25.8b, v25.8b, v26.8b
+ cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
+ add v27.8b, v27.8b, v4.8b
+ add v5.8b, v5.8b, v19.8b
+ add v25.8b, v25.8b, v27.8b
+ add v5.8b, v1.8b, v5.8b
+ ld1 {v13.8h}, [x8], #16
+ add v5.8b, v5.8b, v25.8b
+ ld1 {v0.4s, v1.4s}, [x0], #32
+ uxtl v5.8h, v5.8b // x
+
+ umull v3.4s, v5.4h, v2.4h // x * BB[i]
+ umull2 v4.4s, v5.8h, v2.8h // x * BB[i]
+ mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ srshr v3.4s, v3.4s, #12 // AA[i]
+ srshr v4.4s, v4.4s, #12 // AA[i]
+ sub v5.8h, v29.8h, v5.8h // 256 - x
+ ld1 {v2.8h}, [x1], #16
+
+ st1 {v3.4s, v4.4s}, [x2], #32
+ st1 {v5.8h}, [x3], #16
+ b.gt 1b
+
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x30
+ ret
+endfunc
+
+// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
+// int32_t *AA, int16_t *BB,
+// const int w, const int s,
+// const int bitdepth_max);
+function sgr_box5_vert_neon, export=1
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ add w4, w4, #2
+ clz w15, w6 // bitdepth_max
+ dup v28.4s, w5 // strength
+
+ ldp x5, x6, [x0]
+ ldp x7, x8, [x0, #16]
+ ldr x0, [x0, #32]
+ ldp x9, x10, [x1]
+ ldp x11, x12, [x1, #16]
+ ldr x1, [x1, #32]
+
+ movi v31.4s, #25 // n
+
+ sub w15, w15, #24 // -bitdepth_min_8
+ movrel x13, X(sgr_x_by_x)
+ mov w14, #164 // one_by_x
+ ld1 {v16.16b, v17.16b, v18.16b}, [x13]
+ dup v6.8h, w15 // -bitdepth_min_8
+ movi v19.16b, #5
+ movi v24.8b, #254 // idx of last 1
+ saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
+ movi v29.8h, #1, lsl #8
+ dup v30.4s, w14 // one_by_x
+
+ sub v16.16b, v16.16b, v19.16b
+ sub v17.16b, v17.16b, v19.16b
+ sub v18.16b, v18.16b, v19.16b
+
+ ld1 {v8.4s, v9.4s}, [x5], #32
+ ld1 {v10.4s, v11.4s}, [x6], #32
+ ld1 {v12.4s, v13.4s}, [x7], #32
+ ld1 {v14.4s, v15.4s}, [x8], #32
+ ld1 {v20.8h}, [x9], #16
+ ld1 {v21.8h}, [x10], #16
+ ld1 {v22.8h}, [x11], #16
+ ld1 {v23.8h}, [x12], #16
+ ld1 {v0.4s, v1.4s}, [x0], #32
+ ld1 {v2.8h}, [x1], #16
+
+1:
+ add v8.4s, v8.4s, v10.4s
+ add v9.4s, v9.4s, v11.4s
+ add v12.4s, v12.4s, v14.4s
+ add v13.4s, v13.4s, v15.4s
+
+ add v20.8h, v20.8h, v21.8h
+ add v22.8h, v22.8h, v23.8h
+
+ add v0.4s, v0.4s, v8.4s
+ add v1.4s, v1.4s, v9.4s
+ add v2.8h, v2.8h, v20.8h
+
+ add v0.4s, v0.4s, v12.4s
+ add v1.4s, v1.4s, v13.4s
+ add v2.8h, v2.8h, v22.8h
+
+ subs w4, w4, #8
+
+ movi v20.8b, #55 // idx of last 5
+ movi v21.8b, #72 // idx of last 4
+ movi v22.8b, #101 // idx of last 3
+ movi v23.8b, #169 // idx of last 2
+
+ srshl v0.4s, v0.4s, v7.4s
+ srshl v1.4s, v1.4s, v7.4s
+ srshl v4.8h, v2.8h, v6.8h
+ mul v0.4s, v0.4s, v31.4s // a * n
+ mul v1.4s, v1.4s, v31.4s // a * n
+ umull v3.4s, v4.4h, v4.4h // b * b
+ umull2 v4.4s, v4.8h, v4.8h // b * b
+ uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
+ uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
+ mul v0.4s, v0.4s, v28.4s // p * s
+ mul v1.4s, v1.4s, v28.4s // p * s
+ ld1 {v8.4s, v9.4s}, [x5], #32
+ uqshrn v0.4h, v0.4s, #16
+ uqshrn2 v0.8h, v1.4s, #16
+ ld1 {v10.4s, v11.4s}, [x6], #32
+ uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
+
+ ld1 {v12.4s, v13.4s}, [x7], #32
+
+ cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
+ cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
+ tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
+ cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
+ cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
+ ld1 {v14.4s, v15.4s}, [x8], #32
+ add v25.8b, v25.8b, v26.8b
+ cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
+ add v27.8b, v27.8b, v4.8b
+ ld1 {v20.8h}, [x9], #16
+ add v5.8b, v5.8b, v19.8b
+ add v25.8b, v25.8b, v27.8b
+ ld1 {v21.8h}, [x10], #16
+ add v5.8b, v1.8b, v5.8b
+ ld1 {v22.8h}, [x11], #16
+ add v5.8b, v5.8b, v25.8b
+ ld1 {v23.8h}, [x12], #16
+ uxtl v5.8h, v5.8b // x
+
+ ld1 {v0.4s, v1.4s}, [x0], #32
+ umull v3.4s, v5.4h, v2.4h // x * BB[i]
+ umull2 v4.4s, v5.8h, v2.8h // x * BB[i]
+ mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ srshr v3.4s, v3.4s, #12 // AA[i]
+ srshr v4.4s, v4.4s, #12 // AA[i]
+ sub v5.8h, v29.8h, v5.8h // 256 - x
+ ld1 {v2.8h}, [x1], #16
+
+ st1 {v3.4s, v4.4s}, [x2], #32
+ st1 {v5.8h}, [x3], #16
+ b.gt 1b
+
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+endfunc
diff --git a/third_party/dav1d/src/arm/64/looprestoration_tmpl.S b/third_party/dav1d/src/arm/64/looprestoration_tmpl.S
new file mode 100644
index 0000000000..1373f9ace3
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/looprestoration_tmpl.S
@@ -0,0 +1,751 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#define FILTER_OUT_STRIDE 384
+
+.macro sgr_funcs bpc
+// void dav1d_sgr_finish_filter1_2rows_Xbpc_neon(int16_t *tmp,
+// const pixel *src,
+// const ptrdiff_t src_stride,
+// const int32_t **a,
+// const int16_t **b,
+// const int w, const int h);
+function sgr_finish_filter1_2rows_\bpc\()bpc_neon, export=1
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ ldp x7, x8, [x3]
+ ldp x9, x3, [x3, #16]
+ ldp x10, x11, [x4]
+ ldp x12, x4, [x4, #16]
+
+ mov x13, #FILTER_OUT_STRIDE
+ cmp w6, #1
+ add x2, x1, x2 // src + stride
+ csel x2, x1, x2, le // if (h <= 1) x2 = x1
+ add x13, x0, x13, lsl #1
+
+ movi v30.8h, #3
+ movi v31.4s, #3
+1:
+ ld1 {v0.8h, v1.8h}, [x10], #32
+ ld1 {v2.8h, v3.8h}, [x11], #32
+ ld1 {v4.8h, v5.8h}, [x12], #32
+ ld1 {v6.8h, v7.8h}, [x4], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
+ ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48
+ ld1 {v22.4s, v23.4s, v24.4s}, [x9], #48
+ ld1 {v25.4s, v26.4s, v27.4s}, [x3], #48
+
+2:
+ ext v8.16b, v0.16b, v1.16b, #2 // [0][1]
+ ext v9.16b, v2.16b, v3.16b, #2 // [1][1]
+ ext v10.16b, v4.16b, v5.16b, #2 // [2][1]
+ ext v11.16b, v0.16b, v1.16b, #4 // [0][2]
+ ext v12.16b, v2.16b, v3.16b, #4 // [1][2]
+ ext v13.16b, v4.16b, v5.16b, #4 // [2][2]
+
+ add v14.8h, v2.8h, v8.8h // [1][0] + [0][1]
+ add v15.8h, v9.8h, v10.8h // [1][1] + [2][1]
+
+ add v28.8h, v0.8h, v11.8h // [0][0] + [0][2]
+ add v14.8h, v14.8h, v12.8h // () + [1][2]
+ add v29.8h, v4.8h, v13.8h // [2][0] + [2][2]
+
+ ext v8.16b, v6.16b, v7.16b, #2 // [3][1]
+ ext v11.16b, v6.16b, v7.16b, #4 // [3][2]
+
+ add v14.8h, v14.8h, v15.8h // mid
+ add v15.8h, v28.8h, v29.8h // corners
+
+ add v28.8h, v4.8h, v9.8h // [2][0] + [1][1]
+ add v29.8h, v10.8h, v8.8h // [2][1] + [3][1]
+
+ add v2.8h, v2.8h, v12.8h // [1][0] + [1][2]
+ add v28.8h, v28.8h, v13.8h // () + [2][2]
+ add v4.8h, v6.8h, v11.8h // [3][0] + [3][2]
+
+ add v0.8h, v28.8h, v29.8h // mid
+ add v2.8h, v2.8h, v4.8h // corners
+
+ shl v4.8h, v14.8h, #2
+ mla v4.8h, v15.8h, v30.8h // * 3 -> a
+
+ shl v0.8h, v0.8h, #2
+ mla v0.8h, v2.8h, v30.8h // * 3 -> a
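+        // i.e. a = 4 * (sum of the centre and its four edge neighbours)
+        //        + 3 * (sum of its four diagonal neighbours), for each of the
+        // two output rows.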
+
+ ext v8.16b, v16.16b, v17.16b, #4 // [0][1]
+ ext v9.16b, v17.16b, v18.16b, #4
+ ext v10.16b, v16.16b, v17.16b, #8 // [0][2]
+ ext v11.16b, v17.16b, v18.16b, #8
+ ext v12.16b, v19.16b, v20.16b, #4 // [1][1]
+ ext v13.16b, v20.16b, v21.16b, #4
+ add v8.4s, v8.4s, v19.4s // [0][1] + [1][0]
+ add v9.4s, v9.4s, v20.4s
+ add v16.4s, v16.4s, v10.4s // [0][0] + [0][2]
+ add v17.4s, v17.4s, v11.4s
+ ext v14.16b, v19.16b, v20.16b, #8 // [1][2]
+ ext v15.16b, v20.16b, v21.16b, #8
+ add v16.4s, v16.4s, v22.4s // () + [2][0]
+ add v17.4s, v17.4s, v23.4s
+ add v28.4s, v12.4s, v14.4s // [1][1] + [1][2]
+ add v29.4s, v13.4s, v15.4s
+ ext v10.16b, v22.16b, v23.16b, #4 // [2][1]
+ ext v11.16b, v23.16b, v24.16b, #4
+ add v8.4s, v8.4s, v28.4s // mid (incomplete)
+ add v9.4s, v9.4s, v29.4s
+
+ add v19.4s, v19.4s, v14.4s // [1][0] + [1][2]
+ add v20.4s, v20.4s, v15.4s
+ add v14.4s, v22.4s, v12.4s // [2][0] + [1][1]
+ add v15.4s, v23.4s, v13.4s
+
+ ext v12.16b, v22.16b, v23.16b, #8 // [2][2]
+ ext v13.16b, v23.16b, v24.16b, #8
+ ext v28.16b, v25.16b, v26.16b, #4 // [3][1]
+ ext v29.16b, v26.16b, v27.16b, #4
+ add v8.4s, v8.4s, v10.4s // () + [2][1] = mid
+ add v9.4s, v9.4s, v11.4s
+ add v14.4s, v14.4s, v10.4s // () + [2][1]
+ add v15.4s, v15.4s, v11.4s
+ ext v10.16b, v25.16b, v26.16b, #8 // [3][2]
+ ext v11.16b, v26.16b, v27.16b, #8
+ add v16.4s, v16.4s, v12.4s // () + [2][2] = corner
+ add v17.4s, v17.4s, v13.4s
+
+ add v12.4s, v12.4s, v28.4s // [2][2] + [3][1]
+ add v13.4s, v13.4s, v29.4s
+ add v25.4s, v25.4s, v10.4s // [3][0] + [3][2]
+ add v26.4s, v26.4s, v11.4s
+
+ add v14.4s, v14.4s, v12.4s // mid
+ add v15.4s, v15.4s, v13.4s
+ add v19.4s, v19.4s, v25.4s // corner
+ add v20.4s, v20.4s, v26.4s
+
+.if \bpc == 8
+ ld1 {v25.8b}, [x1], #8 // src
+ ld1 {v26.8b}, [x2], #8
+.else
+ ld1 {v25.8h}, [x1], #16 // src
+ ld1 {v26.8h}, [x2], #16
+.endif
+
+ shl v8.4s, v8.4s, #2
+ shl v9.4s, v9.4s, #2
+ mla v8.4s, v16.4s, v31.4s // * 3 -> b
+ mla v9.4s, v17.4s, v31.4s
+
+.if \bpc == 8
+ uxtl v25.8h, v25.8b // src
+ uxtl v26.8h, v26.8b
+.endif
+
+ shl v14.4s, v14.4s, #2
+ shl v15.4s, v15.4s, #2
+ mla v14.4s, v19.4s, v31.4s // * 3 -> b
+ mla v15.4s, v20.4s, v31.4s
+
+ umlal v8.4s, v4.4h, v25.4h // b + a * src
+ umlal2 v9.4s, v4.8h, v25.8h
+ umlal v14.4s, v0.4h, v26.4h // b + a * src
+ umlal2 v15.4s, v0.8h, v26.8h
+ mov v0.16b, v1.16b
+ rshrn v8.4h, v8.4s, #9
+ rshrn2 v8.8h, v9.4s, #9
+ mov v2.16b, v3.16b
+ rshrn v14.4h, v14.4s, #9
+ rshrn2 v14.8h, v15.4s, #9
+ subs w5, w5, #8
+ mov v4.16b, v5.16b
+ st1 {v8.8h}, [x0], #16
+ mov v6.16b, v7.16b
+ st1 {v14.8h}, [x13], #16
+
+ b.le 3f
+ mov v16.16b, v18.16b
+ mov v19.16b, v21.16b
+ mov v22.16b, v24.16b
+ mov v25.16b, v27.16b
+ ld1 {v1.8h}, [x10], #16
+ ld1 {v3.8h}, [x11], #16
+ ld1 {v5.8h}, [x12], #16
+ ld1 {v7.8h}, [x4], #16
+ ld1 {v17.4s, v18.4s}, [x7], #32
+ ld1 {v20.4s, v21.4s}, [x8], #32
+ ld1 {v23.4s, v24.4s}, [x9], #32
+ ld1 {v26.4s, v27.4s}, [x3], #32
+ b 2b
+
+3:
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+endfunc
+
+// void dav1d_sgr_finish_weighted1_Xbpc_neon(pixel *dst,
+// const int32_t **a, const int16_t **b,
+// const int w, const int w1,
+// const int bitdepth_max);
+function sgr_finish_weighted1_\bpc\()bpc_neon, export=1
+ ldp x7, x8, [x1]
+ ldr x1, [x1, #16]
+ ldp x9, x10, [x2]
+ ldr x2, [x2, #16]
+
+ dup v31.8h, w4
+ dup v30.8h, w5
+
+ movi v6.8h, #3
+ movi v7.4s, #3
+1:
+ ld1 {v0.8h, v1.8h}, [x9], #32
+ ld1 {v2.8h, v3.8h}, [x10], #32
+ ld1 {v4.8h, v5.8h}, [x2], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
+ ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48
+ ld1 {v22.4s, v23.4s, v24.4s}, [x1], #48
+
+2:
+ ext v25.16b, v0.16b, v1.16b, #2 // -stride
+ ext v26.16b, v2.16b, v3.16b, #2 // 0
+ ext v27.16b, v4.16b, v5.16b, #2 // +stride
+ ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
+ ext v29.16b, v2.16b, v3.16b, #4 // +1
+ add v2.8h, v2.8h, v25.8h // -1, -stride
+ ext v25.16b, v4.16b, v5.16b, #4 // +1+stride
+ add v26.8h, v26.8h, v27.8h // 0, +stride
+ add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
+ add v2.8h, v2.8h, v26.8h
+ add v4.8h, v4.8h, v25.8h // -1+stride, +1+stride
+ add v2.8h, v2.8h, v29.8h // +1
+ add v0.8h, v0.8h, v4.8h
+
+ ext v25.16b, v16.16b, v17.16b, #4 // -stride
+ ext v26.16b, v17.16b, v18.16b, #4
+ shl v2.8h, v2.8h, #2
+ ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
+ ext v28.16b, v17.16b, v18.16b, #8
+ ext v29.16b, v19.16b, v20.16b, #4 // 0
+ ext v4.16b, v20.16b, v21.16b, #4
+ mla v2.8h, v0.8h, v6.8h // * 3 -> a
+ add v25.4s, v25.4s, v19.4s // -stride, -1
+ add v26.4s, v26.4s, v20.4s
+ add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
+ add v17.4s, v17.4s, v28.4s
+ ext v27.16b, v19.16b, v20.16b, #8 // +1
+ ext v28.16b, v20.16b, v21.16b, #8
+ add v16.4s, v16.4s, v22.4s // -1+stride
+ add v17.4s, v17.4s, v23.4s
+ add v29.4s, v29.4s, v27.4s // 0, +1
+ add v4.4s, v4.4s, v28.4s
+ add v25.4s, v25.4s, v29.4s
+ add v26.4s, v26.4s, v4.4s
+ ext v27.16b, v22.16b, v23.16b, #4 // +stride
+ ext v28.16b, v23.16b, v24.16b, #4
+ ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
+ ext v4.16b, v23.16b, v24.16b, #8
+.if \bpc == 8
+ ld1 {v19.8b}, [x0] // src
+.else
+ ld1 {v19.8h}, [x0] // src
+.endif
+ add v25.4s, v25.4s, v27.4s // +stride
+ add v26.4s, v26.4s, v28.4s
+ add v16.4s, v16.4s, v29.4s // +1+stride
+ add v17.4s, v17.4s, v4.4s
+ shl v25.4s, v25.4s, #2
+ shl v26.4s, v26.4s, #2
+ mla v25.4s, v16.4s, v7.4s // * 3 -> b
+ mla v26.4s, v17.4s, v7.4s
+.if \bpc == 8
+ uxtl v19.8h, v19.8b // src
+.endif
+ mov v0.16b, v1.16b
+ umlal v25.4s, v2.4h, v19.4h // b + a * src
+ umlal2 v26.4s, v2.8h, v19.8h
+ mov v2.16b, v3.16b
+ rshrn v25.4h, v25.4s, #9
+ rshrn2 v25.8h, v26.4s, #9
+
+ subs w3, w3, #8
+
+ // weighted1
+ shl v19.8h, v19.8h, #4 // u
+ mov v4.16b, v5.16b
+
+ sub v25.8h, v25.8h, v19.8h // t1 - u
+ ld1 {v1.8h}, [x9], #16
+ ushll v26.4s, v19.4h, #7 // u << 7
+ ushll2 v27.4s, v19.8h, #7 // u << 7
+ ld1 {v3.8h}, [x10], #16
+ smlal v26.4s, v25.4h, v31.4h // v
+ smlal2 v27.4s, v25.8h, v31.8h // v
+ ld1 {v5.8h}, [x2], #16
+.if \bpc == 8
+ rshrn v26.4h, v26.4s, #11
+ rshrn2 v26.8h, v27.4s, #11
+ mov v16.16b, v18.16b
+ sqxtun v26.8b, v26.8h
+ mov v19.16b, v21.16b
+ mov v22.16b, v24.16b
+ st1 {v26.8b}, [x0], #8
+.else
+ sqrshrun v26.4h, v26.4s, #11
+ sqrshrun2 v26.8h, v27.4s, #11
+ mov v16.16b, v18.16b
+ umin v26.8h, v26.8h, v30.8h
+ mov v19.16b, v21.16b
+ mov v22.16b, v24.16b
+ st1 {v26.8h}, [x0], #16
+.endif
+
+ b.le 3f
+ ld1 {v17.4s, v18.4s}, [x7], #32
+ ld1 {v20.4s, v21.4s}, [x8], #32
+ ld1 {v23.4s, v24.4s}, [x1], #32
+ b 2b
+
+3:
+ ret
+endfunc
+
+// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp,
+// const pixel *src,
+// const ptrdiff_t stride,
+// const int32_t **a,
+// const int16_t **b,
+// const int w, const int h);
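+//
+// This 5x5 (filter2) variant emits two output rows per pass. Roughly, the
+// first row uses the full 5/6-weighted neighbourhood with a rounding shift
+// by 9, while the second row (stored one FILTER_OUT_STRIDE further on) is
+// built from the lower input row's sums only and shifted by 8.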
+function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ ldp x3, x7, [x3]
+ ldp x4, x8, [x4]
+ mov x10, #FILTER_OUT_STRIDE
+ cmp w6, #1
+ add x2, x1, x2 // src + stride
+ csel x2, x1, x2, le // if (h <= 1) x2 = x1
+ add x10, x0, x10, lsl #1
+ movi v4.8h, #5
+ movi v5.4s, #5
+ movi v6.8h, #6
+ movi v7.4s, #6
+1:
+ ld1 {v0.8h, v1.8h}, [x4], #32
+ ld1 {v2.8h, v3.8h}, [x8], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
+ ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
+
+2:
+ ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
+ ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
+ ext v22.16b, v0.16b, v1.16b, #2 // -stride
+ ext v23.16b, v2.16b, v3.16b, #2 // +stride
+ add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
+ add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
+ add v2.8h, v22.8h, v23.8h // -stride, +stride
+ add v0.8h, v0.8h, v25.8h
+
+ mul v8.8h, v25.8h, v4.8h // * 5
+ mla v8.8h, v23.8h, v6.8h // * 6
+
+ ext v22.16b, v16.16b, v17.16b, #4 // -stride
+ ext v23.16b, v17.16b, v18.16b, #4
+ ext v24.16b, v19.16b, v20.16b, #4 // +stride
+ ext v25.16b, v20.16b, v21.16b, #4
+ ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
+ ext v27.16b, v17.16b, v18.16b, #8
+ ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
+ ext v29.16b, v20.16b, v21.16b, #8
+ mul v0.8h, v0.8h, v4.8h // * 5
+ mla v0.8h, v2.8h, v6.8h // * 6
+.if \bpc == 8
+ ld1 {v31.8b}, [x1], #8
+ ld1 {v30.8b}, [x2], #8
+.else
+ ld1 {v31.8h}, [x1], #16
+ ld1 {v30.8h}, [x2], #16
+.endif
+ add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
+ add v17.4s, v17.4s, v27.4s
+ add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
+ add v20.4s, v20.4s, v29.4s
+ add v16.4s, v16.4s, v19.4s
+ add v17.4s, v17.4s, v20.4s
+
+ mul v9.4s, v19.4s, v5.4s // * 5
+ mla v9.4s, v24.4s, v7.4s // * 6
+ mul v10.4s, v20.4s, v5.4s // * 5
+ mla v10.4s, v25.4s, v7.4s // * 6
+
+ add v22.4s, v22.4s, v24.4s // -stride, +stride
+ add v23.4s, v23.4s, v25.4s
+ // This is, surprisingly, faster than other variants where the
+ // mul+mla pairs are further apart, on Cortex A53.
+ mul v16.4s, v16.4s, v5.4s // * 5
+ mla v16.4s, v22.4s, v7.4s // * 6
+ mul v17.4s, v17.4s, v5.4s // * 5
+ mla v17.4s, v23.4s, v7.4s // * 6
+
+.if \bpc == 8
+ uxtl v31.8h, v31.8b
+ uxtl v30.8h, v30.8b
+.endif
+ umlal v16.4s, v0.4h, v31.4h // b + a * src
+ umlal2 v17.4s, v0.8h, v31.8h
+ umlal v9.4s, v8.4h, v30.4h // b + a * src
+ umlal2 v10.4s, v8.8h, v30.8h
+ mov v0.16b, v1.16b
+ rshrn v16.4h, v16.4s, #9
+ rshrn2 v16.8h, v17.4s, #9
+ rshrn v9.4h, v9.4s, #8
+ rshrn2 v9.8h, v10.4s, #8
+ subs w5, w5, #8
+ mov v2.16b, v3.16b
+ st1 {v16.8h}, [x0], #16
+ st1 {v9.8h}, [x10], #16
+
+ b.le 9f
+ mov v16.16b, v18.16b
+ mov v19.16b, v21.16b
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v3.8h}, [x8], #16
+ ld1 {v17.4s, v18.4s}, [x3], #32
+ ld1 {v20.4s, v21.4s}, [x7], #32
+ b 2b
+
+9:
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+endfunc
+
+// void dav1d_sgr_finish_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
+// const int32_t **a,
+// const int16_t **b,
+// const int w, const int h,
+// const int w1,
+// const int bitdepth_max);
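+//
+// Like sgr_finish_filter2_2rows above, but with the weighted blend fused in:
+// two rows of dst are produced per pass, each roughly as
+//   dst = clip((((src << 4) << 7) + w1*(t - (src << 4)) + (1 << 10)) >> 11)
+// where t is the 5/6-weighted filter output for that row.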
+function sgr_finish_weighted2_\bpc\()bpc_neon, export=1
+ stp d8, d9, [sp, #-0x30]!
+ str d10, [sp, #0x10]
+ stp d14, d15, [sp, #0x20]
+
+ dup v14.8h, w6
+ dup v15.8h, w7
+
+ ldp x2, x7, [x2]
+ ldp x3, x8, [x3]
+ cmp w5, #1
+ add x1, x0, x1 // src + stride
+ // if (h <= 1), set the pointer to the second row to any dummy buffer
+ // we can clobber (x2 in this case)
+ csel x1, x2, x1, le
+ movi v4.8h, #5
+ movi v5.4s, #5
+ movi v6.8h, #6
+ movi v7.4s, #6
+1:
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ ld1 {v2.8h, v3.8h}, [x8], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x2], #48
+ ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
+
+2:
+ ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
+ ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
+ ext v22.16b, v0.16b, v1.16b, #2 // -stride
+ ext v23.16b, v2.16b, v3.16b, #2 // +stride
+ add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
+ add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
+ add v2.8h, v22.8h, v23.8h // -stride, +stride
+ add v0.8h, v0.8h, v25.8h
+
+ mul v8.8h, v25.8h, v4.8h // * 5
+ mla v8.8h, v23.8h, v6.8h // * 6
+
+ ext v22.16b, v16.16b, v17.16b, #4 // -stride
+ ext v23.16b, v17.16b, v18.16b, #4
+ ext v24.16b, v19.16b, v20.16b, #4 // +stride
+ ext v25.16b, v20.16b, v21.16b, #4
+ ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
+ ext v27.16b, v17.16b, v18.16b, #8
+ ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
+ ext v29.16b, v20.16b, v21.16b, #8
+ mul v0.8h, v0.8h, v4.8h // * 5
+ mla v0.8h, v2.8h, v6.8h // * 6
+.if \bpc == 8
+ ld1 {v31.8b}, [x0]
+ ld1 {v30.8b}, [x1]
+.else
+ ld1 {v31.8h}, [x0]
+ ld1 {v30.8h}, [x1]
+.endif
+ add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
+ add v17.4s, v17.4s, v27.4s
+ add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
+ add v20.4s, v20.4s, v29.4s
+ add v16.4s, v16.4s, v19.4s
+ add v17.4s, v17.4s, v20.4s
+
+ mul v9.4s, v19.4s, v5.4s // * 5
+ mla v9.4s, v24.4s, v7.4s // * 6
+ mul v10.4s, v20.4s, v5.4s // * 5
+ mla v10.4s, v25.4s, v7.4s // * 6
+
+ add v22.4s, v22.4s, v24.4s // -stride, +stride
+ add v23.4s, v23.4s, v25.4s
+ // This is, surprisingly, faster than other variants where the
+ // mul+mla pairs are further apart, on Cortex A53.
+ mul v16.4s, v16.4s, v5.4s // * 5
+ mla v16.4s, v22.4s, v7.4s // * 6
+ mul v17.4s, v17.4s, v5.4s // * 5
+ mla v17.4s, v23.4s, v7.4s // * 6
+
+.if \bpc == 8
+ uxtl v31.8h, v31.8b
+ uxtl v30.8h, v30.8b
+.endif
+ umlal v16.4s, v0.4h, v31.4h // b + a * src
+ umlal2 v17.4s, v0.8h, v31.8h
+ umlal v9.4s, v8.4h, v30.4h // b + a * src
+ umlal2 v10.4s, v8.8h, v30.8h
+ mov v0.16b, v1.16b
+ rshrn v16.4h, v16.4s, #9
+ rshrn2 v16.8h, v17.4s, #9
+ rshrn v9.4h, v9.4s, #8
+ rshrn2 v9.8h, v10.4s, #8
+
+ subs w4, w4, #8
+
+ // weighted1
+ shl v31.8h, v31.8h, #4 // u
+ shl v30.8h, v30.8h, #4
+ mov v2.16b, v3.16b
+
+ sub v16.8h, v16.8h, v31.8h // t1 - u
+ sub v9.8h, v9.8h, v30.8h
+ ld1 {v1.8h}, [x3], #16
+ ushll v22.4s, v31.4h, #7 // u << 7
+ ushll2 v23.4s, v31.8h, #7
+ ushll v24.4s, v30.4h, #7
+ ushll2 v25.4s, v30.8h, #7
+ ld1 {v3.8h}, [x8], #16
+ smlal v22.4s, v16.4h, v14.4h // v
+ smlal2 v23.4s, v16.8h, v14.8h
+ mov v16.16b, v18.16b
+ smlal v24.4s, v9.4h, v14.4h
+ smlal2 v25.4s, v9.8h, v14.8h
+ mov v19.16b, v21.16b
+.if \bpc == 8
+ rshrn v22.4h, v22.4s, #11
+ rshrn2 v22.8h, v23.4s, #11
+ rshrn v23.4h, v24.4s, #11
+ rshrn2 v23.8h, v25.4s, #11
+ sqxtun v22.8b, v22.8h
+ sqxtun v23.8b, v23.8h
+ st1 {v22.8b}, [x0], #8
+ st1 {v23.8b}, [x1], #8
+.else
+ sqrshrun v22.4h, v22.4s, #11
+ sqrshrun2 v22.8h, v23.4s, #11
+ sqrshrun v23.4h, v24.4s, #11
+ sqrshrun2 v23.8h, v25.4s, #11
+ umin v22.8h, v22.8h, v15.8h
+ umin v23.8h, v23.8h, v15.8h
+ st1 {v22.8h}, [x0], #16
+ st1 {v23.8h}, [x1], #16
+.endif
+
+ b.le 3f
+ ld1 {v17.4s, v18.4s}, [x2], #32
+ ld1 {v20.4s, v21.4s}, [x7], #32
+ b 2b
+
+3:
+ ldp d14, d15, [sp, #0x20]
+ ldr d10, [sp, #0x10]
+ ldp d8, d9, [sp], 0x30
+ ret
+endfunc
+
+// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int16_t *t2,
+// const int w, const int h,
+// const int16_t wt[2], const int bitdepth_max);
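+//
+// Roughly, per pixel (with u = src << 4):
+//   dst = clip(((u << 7) + wt[0]*(t1 - u) + wt[1]*(t2 - u) + (1 << 10)) >> 11)
+// i.e. the combination of the two filter outputs, clamped to the bitdepth.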
+function sgr_weighted2_\bpc\()bpc_neon, export=1
+.if \bpc == 8
+ ldr x8, [sp]
+.else
+ ldp x8, x9, [sp]
+.endif
+ cmp w7, #2
+ add x10, x0, x1
+ add x11, x2, x3
+ add x12, x4, #2*FILTER_OUT_STRIDE
+ add x13, x5, #2*FILTER_OUT_STRIDE
+ ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
+.if \bpc == 16
+ dup v29.8h, w9
+.endif
+ mov x8, #4*FILTER_OUT_STRIDE
+ lsl x1, x1, #1
+ lsl x3, x3, #1
+ add x9, x6, #7
+ bic x9, x9, #7 // Aligned width
+.if \bpc == 8
+ sub x1, x1, x9
+ sub x3, x3, x9
+.else
+ sub x1, x1, x9, lsl #1
+ sub x3, x3, x9, lsl #1
+.endif
+ sub x8, x8, x9, lsl #1
+ mov w9, w6
+ b.lt 2f
+1:
+.if \bpc == 8
+ ld1 {v0.8b}, [x2], #8
+ ld1 {v16.8b}, [x11], #8
+.else
+ ld1 {v0.8h}, [x2], #16
+ ld1 {v16.8h}, [x11], #16
+.endif
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v17.8h}, [x12], #16
+ ld1 {v2.8h}, [x5], #16
+ ld1 {v18.8h}, [x13], #16
+ subs w6, w6, #8
+.if \bpc == 8
+ ushll v0.8h, v0.8b, #4 // u
+ ushll v16.8h, v16.8b, #4 // u
+.else
+ shl v0.8h, v0.8h, #4 // u
+ shl v16.8h, v16.8h, #4 // u
+.endif
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v2.8h, v2.8h, v0.8h // t2 - u
+ sub v17.8h, v17.8h, v16.8h // t1 - u
+ sub v18.8h, v18.8h, v16.8h // t2 - u
+ ushll v3.4s, v0.4h, #7 // u << 7
+ ushll2 v4.4s, v0.8h, #7 // u << 7
+ ushll v19.4s, v16.4h, #7 // u << 7
+ ushll2 v20.4s, v16.8h, #7 // u << 7
+ smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
+ smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
+.if \bpc == 8
+ rshrn v3.4h, v3.4s, #11
+ rshrn2 v3.8h, v4.4s, #11
+ rshrn v19.4h, v19.4s, #11
+ rshrn2 v19.8h, v20.4s, #11
+ sqxtun v3.8b, v3.8h
+ sqxtun v19.8b, v19.8h
+ st1 {v3.8b}, [x0], #8
+ st1 {v19.8b}, [x10], #8
+.else
+ sqrshrun v3.4h, v3.4s, #11
+ sqrshrun2 v3.8h, v4.4s, #11
+ sqrshrun v19.4h, v19.4s, #11
+ sqrshrun2 v19.8h, v20.4s, #11
+ umin v3.8h, v3.8h, v29.8h
+ umin v19.8h, v19.8h, v29.8h
+ st1 {v3.8h}, [x0], #16
+ st1 {v19.8h}, [x10], #16
+.endif
+ b.gt 1b
+
+ subs w7, w7, #2
+ cmp w7, #1
+ b.lt 0f
+ mov w6, w9
+ add x0, x0, x1
+ add x10, x10, x1
+ add x2, x2, x3
+ add x11, x11, x3
+ add x4, x4, x8
+ add x12, x12, x8
+ add x5, x5, x8
+ add x13, x13, x8
+ b.eq 2f
+ b 1b
+
+2:
+.if \bpc == 8
+ ld1 {v0.8b}, [x2], #8
+.else
+ ld1 {v0.8h}, [x2], #16
+.endif
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v2.8h}, [x5], #16
+ subs w6, w6, #8
+.if \bpc == 8
+ ushll v0.8h, v0.8b, #4 // u
+.else
+ shl v0.8h, v0.8h, #4 // u
+.endif
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v2.8h, v2.8h, v0.8h // t2 - u
+ ushll v3.4s, v0.4h, #7 // u << 7
+ ushll2 v4.4s, v0.8h, #7 // u << 7
+ smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
+.if \bpc == 8
+ rshrn v3.4h, v3.4s, #11
+ rshrn2 v3.8h, v4.4s, #11
+ sqxtun v3.8b, v3.8h
+ st1 {v3.8b}, [x0], #8
+.else
+ sqrshrun v3.4h, v3.4s, #11
+ sqrshrun2 v3.8h, v4.4s, #11
+ umin v3.8h, v3.8h, v29.8h
+ st1 {v3.8h}, [x0], #16
+.endif
+ b.gt 1b
+0:
+ ret
+endfunc
+.endm
diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S
new file mode 100644
index 0000000000..9f7b4e7a89
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/mc.S
@@ -0,0 +1,3310 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+.macro avg dst, t0, t1, t2, t3
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ add \t0\().8h, \t0\().8h, \t2\().8h
+ add \t1\().8h, \t1\().8h, \t3\().8h
+ sqrshrun \dst\().8b, \t0\().8h, #5
+ sqrshrun2 \dst\().16b, \t1\().8h, #5
+.endm
+
+.macro w_avg dst, t0, t1, t2, t3
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ sub \t0\().8h, \t2\().8h, \t0\().8h
+ sub \t1\().8h, \t3\().8h, \t1\().8h
+ sqdmulh \t0\().8h, \t0\().8h, v30.8h
+ sqdmulh \t1\().8h, \t1\().8h, v30.8h
+ add \t0\().8h, \t2\().8h, \t0\().8h
+ add \t1\().8h, \t3\().8h, \t1\().8h
+ sqrshrun \dst\().8b, \t0\().8h, #4
+ sqrshrun2 \dst\().16b, \t1\().8h, #4
+.endm
+
+.macro mask dst, t0, t1, t2, t3
+ ld1 {v30.16b}, [x6], 16
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ mul v30.16b, v30.16b, v31.16b
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ shll v28.8h, v30.8b, #8
+ shll2 v29.8h, v30.16b, #8
+ sub \t0\().8h, \t2\().8h, \t0\().8h
+ sub \t1\().8h, \t3\().8h, \t1\().8h
+ sqdmulh \t0\().8h, \t0\().8h, v28.8h
+ sqdmulh \t1\().8h, \t1\().8h, v29.8h
+ add \t0\().8h, \t2\().8h, \t0\().8h
+ add \t1\().8h, \t3\().8h, \t1\().8h
+ sqrshrun \dst\().8b, \t0\().8h, #4
+ sqrshrun2 \dst\().16b, \t1\().8h, #4
+.endm
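+// Rough scalar equivalents of the three macros above (tmp1 = [x2], tmp2 = [x3],
+// both carrying 4 bits of extra precision):
+//   avg:   dst = sat((tmp1 + tmp2 + 16) >> 5)
+//   w_avg: v30 holds -(weight << 11), so the sqdmulh scales (tmp2 - tmp1) by
+//          about -weight/16, giving ~(tmp1*w + tmp2*(16-w) + 128) >> 8
+//   mask:  the 0..64 mask times 254, widened with shll #8, acts as ~-(m << 9),
+//          giving ~(tmp1*m + tmp2*(64-m) + 512) >> 10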
+
+.macro bidir_fn type
+function \type\()_8bpc_neon, export=1
+ clz w4, w4
+.ifc \type, w_avg
+ dup v30.8h, w6
+ neg v30.8h, v30.8h
+ shl v30.8h, v30.8h, #11
+.endif
+.ifc \type, mask
+ movi v31.16b, #256-2
+.endif
+ adr x7, L(\type\()_tbl)
+ sub w4, w4, #24
+ ldrh w4, [x7, x4, lsl #1]
+ \type v4, v0, v1, v2, v3
+ sub x7, x7, w4, uxtw
+ br x7
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1
+ lsl x1, x1, #1
+4:
+ cmp w5, #4
+ st1 {v4.s}[0], [x0], x1
+ st1 {v4.s}[1], [x7], x1
+ st1 {v4.s}[2], [x0], x1
+ st1 {v4.s}[3], [x7], x1
+ b.eq 0f
+ \type v5, v0, v1, v2, v3
+ cmp w5, #8
+ st1 {v5.s}[0], [x0], x1
+ st1 {v5.s}[1], [x7], x1
+ st1 {v5.s}[2], [x0], x1
+ st1 {v5.s}[3], [x7], x1
+ b.eq 0f
+ \type v4, v0, v1, v2, v3
+ st1 {v4.s}[0], [x0], x1
+ st1 {v4.s}[1], [x7], x1
+ \type v5, v0, v1, v2, v3
+ st1 {v4.s}[2], [x0], x1
+ st1 {v4.s}[3], [x7], x1
+ st1 {v5.s}[0], [x0], x1
+ st1 {v5.s}[1], [x7], x1
+ st1 {v5.s}[2], [x0], x1
+ st1 {v5.s}[3], [x7], x1
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1
+ lsl x1, x1, #1
+8:
+ st1 {v4.d}[0], [x0], x1
+ \type v5, v0, v1, v2, v3
+ st1 {v4.d}[1], [x7], x1
+ st1 {v5.d}[0], [x0], x1
+ subs w5, w5, #4
+ st1 {v5.d}[1], [x7], x1
+ b.le 0f
+ \type v4, v0, v1, v2, v3
+ b 8b
+16:
+ AARCH64_VALID_JUMP_TARGET
+ \type v5, v0, v1, v2, v3
+ st1 {v4.16b}, [x0], x1
+ \type v6, v0, v1, v2, v3
+ st1 {v5.16b}, [x0], x1
+ \type v7, v0, v1, v2, v3
+ st1 {v6.16b}, [x0], x1
+ subs w5, w5, #4
+ st1 {v7.16b}, [x0], x1
+ b.le 0f
+ \type v4, v0, v1, v2, v3
+ b 16b
+320:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1
+ lsl x1, x1, #1
+32:
+ \type v5, v0, v1, v2, v3
+ \type v6, v0, v1, v2, v3
+ st1 {v4.16b,v5.16b}, [x0], x1
+ \type v7, v0, v1, v2, v3
+ subs w5, w5, #2
+ st1 {v6.16b,v7.16b}, [x7], x1
+ b.le 0f
+ \type v4, v0, v1, v2, v3
+ b 32b
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1
+ lsl x1, x1, #1
+64:
+ \type v5, v0, v1, v2, v3
+ \type v6, v0, v1, v2, v3
+ \type v7, v0, v1, v2, v3
+ \type v16, v0, v1, v2, v3
+ \type v17, v0, v1, v2, v3
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
+ \type v18, v0, v1, v2, v3
+ \type v19, v0, v1, v2, v3
+ subs w5, w5, #2
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
+ b.le 0f
+ \type v4, v0, v1, v2, v3
+ b 64b
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, #64
+128:
+ \type v5, v0, v1, v2, v3
+ \type v6, v0, v1, v2, v3
+ \type v7, v0, v1, v2, v3
+ \type v16, v0, v1, v2, v3
+ \type v17, v0, v1, v2, v3
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
+ \type v18, v0, v1, v2, v3
+ \type v19, v0, v1, v2, v3
+ subs w5, w5, #1
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
+ b.le 0f
+ \type v4, v0, v1, v2, v3
+ b 128b
+0:
+ ret
+L(\type\()_tbl):
+ .hword L(\type\()_tbl) - 1280b
+ .hword L(\type\()_tbl) - 640b
+ .hword L(\type\()_tbl) - 320b
+ .hword L(\type\()_tbl) - 16b
+ .hword L(\type\()_tbl) - 80b
+ .hword L(\type\()_tbl) - 40b
+endfunc
+.endm
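+// Each bidir_fn instantiation dispatches on the block width: clz(w)-24 maps
+// w = 128..4 to indices 0..5 into the .hword table at the end of the
+// function, which stores backwards offsets from the table label to the
+// per-width entry points (resolved with sub + br).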
+
+bidir_fn avg
+bidir_fn w_avg
+bidir_fn mask
+
+
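+// Rough outline of w_mask: the per-pixel blend factor is derived from the
+// difference of the two intermediates, m' = sat(6903 - |tmp1 - tmp2|) >> 8
+// (about 0..26); the output pixel is ~tmp1 + (tmp2 - tmp1)*m'/64 (narrowed
+// with a rounding shift by 4), and the stored mask is 64 - m' for 444, or
+// that value summed over 1x2 / 2x2 blocks and renormalised for 422 / 420
+// (with the w7 "sign" argument folded into the rounding).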
+.macro w_mask_fn type
+function w_mask_\type\()_8bpc_neon, export=1
+ clz w8, w4
+ adr x9, L(w_mask_\type\()_tbl)
+ sub w8, w8, #24
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ mov w10, #6903
+ dup v0.8h, w10
+.if \type == 444
+ movi v1.16b, #64
+.elseif \type == 422
+ dup v2.8b, w7
+ movi v3.8b, #129
+ sub v3.8b, v3.8b, v2.8b
+.elseif \type == 420
+ dup v2.8h, w7
+ movi v3.8h, #1, lsl #8
+ sub v3.8h, v3.8h, v2.8h
+.endif
+ add x12, x0, x1
+ lsl x1, x1, #1
+ br x9
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
+ ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
+ subs w5, w5, #4
+ sub v16.8h, v6.8h, v4.8h
+ sub v17.8h, v7.8h, v5.8h
+ sabd v18.8h, v4.8h, v6.8h
+ sabd v19.8h, v5.8h, v7.8h
+ uqsub v18.8h, v0.8h, v18.8h
+ uqsub v19.8h, v0.8h, v19.8h
+ ushr v18.8h, v18.8h, #8
+ ushr v19.8h, v19.8h, #8
+ shl v20.8h, v18.8h, #9
+ shl v21.8h, v19.8h, #9
+ sqdmulh v20.8h, v20.8h, v16.8h
+ sqdmulh v21.8h, v21.8h, v17.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v5.8h
+ sqrshrun v22.8b, v20.8h, #4
+ sqrshrun v23.8b, v21.8h, #4
+.if \type == 444
+ uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2
+ sub v18.16b, v1.16b, v18.16b
+ st1 {v18.16b}, [x6], #16
+.elseif \type == 422
+ addp v18.8h, v18.8h, v19.8h
+ xtn v18.8b, v18.8h
+ uhsub v18.8b, v3.8b, v18.8b
+ st1 {v18.8b}, [x6], #8
+.elseif \type == 420
+ trn1 v24.2d, v18.2d, v19.2d
+ trn2 v25.2d, v18.2d, v19.2d
+ add v24.8h, v24.8h, v25.8h
+ addp v18.8h, v24.8h, v24.8h
+ sub v18.4h, v3.4h, v18.4h
+ rshrn v18.8b, v18.8h, #2
+ st1 {v18.s}[0], [x6], #4
+.endif
+ st1 {v22.s}[0], [x0], x1
+ st1 {v22.s}[1], [x12], x1
+ st1 {v23.s}[0], [x0], x1
+ st1 {v23.s}[1], [x12], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x2], #32
+ ld1 {v6.8h, v7.8h}, [x3], #32
+ subs w5, w5, #2
+ sub v16.8h, v6.8h, v4.8h
+ sub v17.8h, v7.8h, v5.8h
+ sabd v18.8h, v4.8h, v6.8h
+ sabd v19.8h, v5.8h, v7.8h
+ uqsub v18.8h, v0.8h, v18.8h
+ uqsub v19.8h, v0.8h, v19.8h
+ ushr v18.8h, v18.8h, #8
+ ushr v19.8h, v19.8h, #8
+ shl v20.8h, v18.8h, #9
+ shl v21.8h, v19.8h, #9
+ sqdmulh v20.8h, v20.8h, v16.8h
+ sqdmulh v21.8h, v21.8h, v17.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v5.8h
+ sqrshrun v22.8b, v20.8h, #4
+ sqrshrun v23.8b, v21.8h, #4
+.if \type == 444
+ uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2
+ sub v18.16b, v1.16b, v18.16b
+ st1 {v18.16b}, [x6], #16
+.elseif \type == 422
+ addp v18.8h, v18.8h, v19.8h
+ xtn v18.8b, v18.8h
+ uhsub v18.8b, v3.8b, v18.8b
+ st1 {v18.8b}, [x6], #8
+.elseif \type == 420
+ add v18.8h, v18.8h, v19.8h
+ addp v18.8h, v18.8h, v18.8h
+ sub v18.4h, v3.4h, v18.4h
+ rshrn v18.8b, v18.8h, #2
+ st1 {v18.s}[0], [x6], #4
+.endif
+ st1 {v22.8b}, [x0], x1
+ st1 {v23.8b}, [x12], x1
+ b.gt 8b
+ ret
+1280:
+640:
+320:
+160:
+ AARCH64_VALID_JUMP_TARGET
+ mov w11, w4
+ sub x1, x1, w4, uxtw
+.if \type == 444
+ add x10, x6, w4, uxtw
+.elseif \type == 422
+ add x10, x6, x11, lsr #1
+.endif
+ add x9, x3, w4, uxtw #1
+ add x7, x2, w4, uxtw #1
+161:
+ mov w8, w4
+16:
+ ld1 {v4.8h, v5.8h}, [x2], #32
+ ld1 {v6.8h, v7.8h}, [x3], #32
+ ld1 {v16.8h, v17.8h}, [x7], #32
+ ld1 {v18.8h, v19.8h}, [x9], #32
+ subs w8, w8, #16
+ sub v6.8h, v6.8h, v4.8h
+ sub v7.8h, v7.8h, v5.8h
+ sub v18.8h, v18.8h, v16.8h
+ sub v19.8h, v19.8h, v17.8h
+ abs v20.8h, v6.8h
+ abs v21.8h, v7.8h
+ abs v22.8h, v18.8h
+ abs v23.8h, v19.8h
+ uqsub v20.8h, v0.8h, v20.8h
+ uqsub v21.8h, v0.8h, v21.8h
+ uqsub v22.8h, v0.8h, v22.8h
+ uqsub v23.8h, v0.8h, v23.8h
+ ushr v20.8h, v20.8h, #8
+ ushr v21.8h, v21.8h, #8
+ ushr v22.8h, v22.8h, #8
+ ushr v23.8h, v23.8h, #8
+ shl v24.8h, v20.8h, #9
+ shl v25.8h, v21.8h, #9
+ shl v26.8h, v22.8h, #9
+ shl v27.8h, v23.8h, #9
+ sqdmulh v24.8h, v24.8h, v6.8h
+ sqdmulh v25.8h, v25.8h, v7.8h
+ sqdmulh v26.8h, v26.8h, v18.8h
+ sqdmulh v27.8h, v27.8h, v19.8h
+ add v24.8h, v24.8h, v4.8h
+ add v25.8h, v25.8h, v5.8h
+ add v26.8h, v26.8h, v16.8h
+ add v27.8h, v27.8h, v17.8h
+ sqrshrun v24.8b, v24.8h, #4
+ sqrshrun v25.8b, v25.8h, #4
+ sqrshrun v26.8b, v26.8h, #4
+ sqrshrun v27.8b, v27.8h, #4
+.if \type == 444
+ uzp1 v20.16b, v20.16b, v21.16b // Same as xtn, xtn2
+ uzp1 v21.16b, v22.16b, v23.16b // Ditto
+ sub v20.16b, v1.16b, v20.16b
+ sub v21.16b, v1.16b, v21.16b
+ st1 {v20.16b}, [x6], #16
+ st1 {v21.16b}, [x10], #16
+.elseif \type == 422
+ addp v20.8h, v20.8h, v21.8h
+ addp v21.8h, v22.8h, v23.8h
+ xtn v20.8b, v20.8h
+ xtn v21.8b, v21.8h
+ uhsub v20.8b, v3.8b, v20.8b
+ uhsub v21.8b, v3.8b, v21.8b
+ st1 {v20.8b}, [x6], #8
+ st1 {v21.8b}, [x10], #8
+.elseif \type == 420
+ add v20.8h, v20.8h, v22.8h
+ add v21.8h, v21.8h, v23.8h
+ addp v20.8h, v20.8h, v21.8h
+ sub v20.8h, v3.8h, v20.8h
+ rshrn v20.8b, v20.8h, #2
+ st1 {v20.8b}, [x6], #8
+.endif
+ st1 {v24.8b, v25.8b}, [x0], #16
+ st1 {v26.8b, v27.8b}, [x12], #16
+ b.gt 16b
+ subs w5, w5, #2
+ add x2, x2, w4, uxtw #1
+ add x3, x3, w4, uxtw #1
+ add x7, x7, w4, uxtw #1
+ add x9, x9, w4, uxtw #1
+.if \type == 444
+ add x6, x6, w4, uxtw
+ add x10, x10, w4, uxtw
+.elseif \type == 422
+ add x6, x6, x11, lsr #1
+ add x10, x10, x11, lsr #1
+.endif
+ add x0, x0, x1
+ add x12, x12, x1
+ b.gt 161b
+ ret
+L(w_mask_\type\()_tbl):
+ .hword L(w_mask_\type\()_tbl) - 1280b
+ .hword L(w_mask_\type\()_tbl) - 640b
+ .hword L(w_mask_\type\()_tbl) - 320b
+ .hword L(w_mask_\type\()_tbl) - 160b
+ .hword L(w_mask_\type\()_tbl) - 8b
+ .hword L(w_mask_\type\()_tbl) - 4b
+endfunc
+.endm
+
+w_mask_fn 444
+w_mask_fn 422
+w_mask_fn 420
+
+
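+// Per pixel, blend computes roughly
+//   dst = (tmp*m + dst*(64 - m) + 32) >> 6
+// with the 0..64 mask read from x5 and tmp from x2.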
+function blend_8bpc_neon, export=1
+ adr x6, L(blend_tbl)
+ clz w3, w3
+ sub w3, w3, #26
+ ldrh w3, [x6, x3, lsl #1]
+ sub x6, x6, w3, uxtw
+ movi v4.16b, #64
+ add x8, x0, x1
+ lsl x1, x1, #1
+ br x6
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8b}, [x5], #8
+ ld1 {v1.d}[0], [x2], #8
+ ld1 {v0.s}[0], [x0]
+ subs w4, w4, #2
+ ld1 {v0.s}[1], [x8]
+ sub v3.8b, v4.8b, v2.8b
+ umull v5.8h, v1.8b, v2.8b
+ umlal v5.8h, v0.8b, v3.8b
+ rshrn v6.8b, v5.8h, #6
+ st1 {v6.s}[0], [x0], x1
+ st1 {v6.s}[1], [x8], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b}, [x5], #16
+ ld1 {v1.16b}, [x2], #16
+ ld1 {v0.d}[0], [x0]
+ ld1 {v0.d}[1], [x8]
+ sub v3.16b, v4.16b, v2.16b
+ subs w4, w4, #2
+ umull v5.8h, v1.8b, v2.8b
+ umlal v5.8h, v0.8b, v3.8b
+ umull2 v6.8h, v1.16b, v2.16b
+ umlal2 v6.8h, v0.16b, v3.16b
+ rshrn v7.8b, v5.8h, #6
+ rshrn2 v7.16b, v6.8h, #6
+ st1 {v7.d}[0], [x0], x1
+ st1 {v7.d}[1], [x8], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b, v2.16b}, [x5], #32
+ ld1 {v5.16b, v6.16b}, [x2], #32
+ ld1 {v0.16b}, [x0]
+ subs w4, w4, #2
+ sub v7.16b, v4.16b, v1.16b
+ sub v20.16b, v4.16b, v2.16b
+ ld1 {v3.16b}, [x8]
+ umull v16.8h, v5.8b, v1.8b
+ umlal v16.8h, v0.8b, v7.8b
+ umull2 v17.8h, v5.16b, v1.16b
+ umlal2 v17.8h, v0.16b, v7.16b
+ umull v21.8h, v6.8b, v2.8b
+ umlal v21.8h, v3.8b, v20.8b
+ umull2 v22.8h, v6.16b, v2.16b
+ umlal2 v22.8h, v3.16b, v20.16b
+ rshrn v18.8b, v16.8h, #6
+ rshrn2 v18.16b, v17.8h, #6
+ rshrn v19.8b, v21.8h, #6
+ rshrn2 v19.16b, v22.8h, #6
+ st1 {v18.16b}, [x0], x1
+ st1 {v19.16b}, [x8], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
+ ld1 {v20.16b, v21.16b}, [x0]
+ subs w4, w4, #2
+ ld1 {v22.16b, v23.16b}, [x8]
+ sub v5.16b, v4.16b, v0.16b
+ sub v6.16b, v4.16b, v1.16b
+ sub v30.16b, v4.16b, v2.16b
+ sub v31.16b, v4.16b, v3.16b
+ umull v24.8h, v16.8b, v0.8b
+ umlal v24.8h, v20.8b, v5.8b
+ umull2 v26.8h, v16.16b, v0.16b
+ umlal2 v26.8h, v20.16b, v5.16b
+ umull v28.8h, v17.8b, v1.8b
+ umlal v28.8h, v21.8b, v6.8b
+ umull2 v7.8h, v17.16b, v1.16b
+ umlal2 v7.8h, v21.16b, v6.16b
+ umull v27.8h, v18.8b, v2.8b
+ umlal v27.8h, v22.8b, v30.8b
+ umull2 v1.8h, v18.16b, v2.16b
+ umlal2 v1.8h, v22.16b, v30.16b
+ umull v29.8h, v19.8b, v3.8b
+ umlal v29.8h, v23.8b, v31.8b
+ umull2 v21.8h, v19.16b, v3.16b
+ umlal2 v21.8h, v23.16b, v31.16b
+ rshrn v24.8b, v24.8h, #6
+ rshrn2 v24.16b, v26.8h, #6
+ rshrn v25.8b, v28.8h, #6
+ rshrn2 v25.16b, v7.8h, #6
+ rshrn v27.8b, v27.8h, #6
+ rshrn2 v27.16b, v1.8h, #6
+ rshrn v28.8b, v29.8h, #6
+ rshrn2 v28.16b, v21.8h, #6
+ st1 {v24.16b, v25.16b}, [x0], x1
+ st1 {v27.16b, v28.16b}, [x8], x1
+ b.gt 32b
+ ret
+L(blend_tbl):
+ .hword L(blend_tbl) - 32b
+ .hword L(blend_tbl) - 16b
+ .hword L(blend_tbl) - 8b
+ .hword L(blend_tbl) - 4b
+endfunc
+
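+// blend_h reads one obmc_masks coefficient per row (table indexed from
+// obmc_masks + h) and blends only the first h - h/4 rows, using the same
+// (tmp*m + dst*(64 - m) + 32) >> 6 formula as plain blend above.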
+function blend_h_8bpc_neon, export=1
+ adr x6, L(blend_h_tbl)
+ movrel x5, X(obmc_masks)
+ add x5, x5, w4, uxtw
+ sub w4, w4, w4, lsr #2
+ clz w7, w3
+ movi v4.16b, #64
+ add x8, x0, x1
+ lsl x1, x1, #1
+ sub w7, w7, #24
+ ldrh w7, [x6, x7, lsl #1]
+ sub x6, x6, w7, uxtw
+ br x6
+2:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.h}[0], [x5], #2
+ ld1 {v1.s}[0], [x2], #4
+ subs w4, w4, #2
+ ld1 {v2.h}[0], [x0]
+ zip1 v0.8b, v0.8b, v0.8b
+ sub v3.8b, v4.8b, v0.8b
+ ld1 {v2.h}[1], [x8]
+ umull v5.8h, v1.8b, v0.8b
+ umlal v5.8h, v2.8b, v3.8b
+ rshrn v5.8b, v5.8h, #6
+ st1 {v5.h}[0], [x0], x1
+ st1 {v5.h}[1], [x8], x1
+ b.gt 2b
+ ret
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v0.8b, v1.8b}, [x5], #2
+ ld1 {v2.8b}, [x2], #8
+ subs w4, w4, #2
+ ext v0.8b, v0.8b, v1.8b, #4
+ ld1 {v3.s}[0], [x0]
+ sub v5.8b, v4.8b, v0.8b
+ ld1 {v3.s}[1], [x8]
+ umull v6.8h, v2.8b, v0.8b
+ umlal v6.8h, v3.8b, v5.8b
+ rshrn v6.8b, v6.8h, #6
+ st1 {v6.s}[0], [x0], x1
+ st1 {v6.s}[1], [x8], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v0.16b, v1.16b}, [x5], #2
+ ld1 {v2.16b}, [x2], #16
+ ld1 {v3.d}[0], [x0]
+ ext v0.16b, v0.16b, v1.16b, #8
+ sub v5.16b, v4.16b, v0.16b
+ ld1 {v3.d}[1], [x8]
+ subs w4, w4, #2
+ umull v6.8h, v0.8b, v2.8b
+ umlal v6.8h, v3.8b, v5.8b
+ umull2 v7.8h, v0.16b, v2.16b
+ umlal2 v7.8h, v3.16b, v5.16b
+ rshrn v16.8b, v6.8h, #6
+ rshrn2 v16.16b, v7.8h, #6
+ st1 {v16.d}[0], [x0], x1
+ st1 {v16.d}[1], [x8], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v0.16b, v1.16b}, [x5], #2
+ ld1 {v2.16b, v3.16b}, [x2], #32
+ ld1 {v5.16b}, [x0]
+ sub v7.16b, v4.16b, v0.16b
+ sub v16.16b, v4.16b, v1.16b
+ ld1 {v6.16b}, [x8]
+ subs w4, w4, #2
+ umull v17.8h, v0.8b, v2.8b
+ umlal v17.8h, v5.8b, v7.8b
+ umull2 v18.8h, v0.16b, v2.16b
+ umlal2 v18.8h, v5.16b, v7.16b
+ umull v19.8h, v1.8b, v3.8b
+ umlal v19.8h, v6.8b, v16.8b
+ umull2 v20.8h, v1.16b, v3.16b
+ umlal2 v20.8h, v6.16b, v16.16b
+ rshrn v21.8b, v17.8h, #6
+ rshrn2 v21.16b, v18.8h, #6
+ rshrn v22.8b, v19.8h, #6
+ rshrn2 v22.16b, v20.8h, #6
+ st1 {v21.16b}, [x0], x1
+ st1 {v22.16b}, [x8], x1
+ b.gt 16b
+ ret
+1280:
+640:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ sub x1, x1, w3, uxtw
+ add x7, x2, w3, uxtw
+321:
+ ld2r {v0.16b, v1.16b}, [x5], #2
+ mov w6, w3
+ sub v20.16b, v4.16b, v0.16b
+ sub v21.16b, v4.16b, v1.16b
+32:
+ ld1 {v16.16b, v17.16b}, [x2], #32
+ ld1 {v2.16b, v3.16b}, [x0]
+ subs w6, w6, #32
+ umull v23.8h, v0.8b, v16.8b
+ umlal v23.8h, v2.8b, v20.8b
+ ld1 {v18.16b, v19.16b}, [x7], #32
+ umull2 v27.8h, v0.16b, v16.16b
+ umlal2 v27.8h, v2.16b, v20.16b
+ ld1 {v6.16b, v7.16b}, [x8]
+ umull v24.8h, v0.8b, v17.8b
+ umlal v24.8h, v3.8b, v20.8b
+ umull2 v28.8h, v0.16b, v17.16b
+ umlal2 v28.8h, v3.16b, v20.16b
+ umull v25.8h, v1.8b, v18.8b
+ umlal v25.8h, v6.8b, v21.8b
+ umull2 v5.8h, v1.16b, v18.16b
+ umlal2 v5.8h, v6.16b, v21.16b
+ rshrn v29.8b, v23.8h, #6
+ rshrn2 v29.16b, v27.8h, #6
+ umull v26.8h, v1.8b, v19.8b
+ umlal v26.8h, v7.8b, v21.8b
+ umull2 v31.8h, v1.16b, v19.16b
+ umlal2 v31.8h, v7.16b, v21.16b
+ rshrn v30.8b, v24.8h, #6
+ rshrn2 v30.16b, v28.8h, #6
+ rshrn v23.8b, v25.8h, #6
+ rshrn2 v23.16b, v5.8h, #6
+ rshrn v24.8b, v26.8h, #6
+ st1 {v29.16b, v30.16b}, [x0], #32
+ rshrn2 v24.16b, v31.8h, #6
+ st1 {v23.16b, v24.16b}, [x8], #32
+ b.gt 32b
+ subs w4, w4, #2
+ add x0, x0, x1
+ add x8, x8, x1
+ add x2, x2, w3, uxtw
+ add x7, x7, w3, uxtw
+ b.gt 321b
+ ret
+L(blend_h_tbl):
+ .hword L(blend_h_tbl) - 1280b
+ .hword L(blend_h_tbl) - 640b
+ .hword L(blend_h_tbl) - 320b
+ .hword L(blend_h_tbl) - 16b
+ .hword L(blend_h_tbl) - 8b
+ .hword L(blend_h_tbl) - 4b
+ .hword L(blend_h_tbl) - 2b
+endfunc
+
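+// blend_v applies a fixed per-column mask (obmc_masks + w) down the block
+// and only writes roughly the left three quarters of each row (1 of 2,
+// 3 of 4, 6 of 8, 12 of 16, 24 of 32 pixels); the remaining columns of dst
+// are left untouched.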
+function blend_v_8bpc_neon, export=1
+ adr x6, L(blend_v_tbl)
+ movrel x5, X(obmc_masks)
+ add x5, x5, w3, uxtw
+ clz w3, w3
+ movi v4.16b, #64
+ add x8, x0, x1
+ lsl x1, x1, #1
+ sub w3, w3, #26
+ ldrh w3, [x6, x3, lsl #1]
+ sub x6, x6, w3, uxtw
+ br x6
+20:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.8b}, [x5]
+ sub v1.8b, v4.8b, v0.8b
+2:
+ ld1 {v2.h}[0], [x2], #2
+ ld1 {v3.b}[0], [x0]
+ subs w4, w4, #2
+ ld1 {v2.b}[1], [x2]
+ ld1 {v3.b}[1], [x8]
+ umull v5.8h, v2.8b, v0.8b
+ umlal v5.8h, v3.8b, v1.8b
+ rshrn v5.8b, v5.8h, #6
+ add x2, x2, #2
+ st1 {v5.b}[0], [x0], x1
+ st1 {v5.b}[1], [x8], x1
+ b.gt 2b
+ ret
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2s}, [x5]
+ sub x1, x1, #2
+ sub v1.8b, v4.8b, v0.8b
+4:
+ ld1 {v2.8b}, [x2], #8
+ ld1 {v3.s}[0], [x0]
+ ld1 {v3.s}[1], [x8]
+ subs w4, w4, #2
+ umull v5.8h, v2.8b, v0.8b
+ umlal v5.8h, v3.8b, v1.8b
+ rshrn v5.8b, v5.8h, #6
+ st1 {v5.h}[0], [x0], #2
+ st1 {v5.h}[2], [x8], #2
+ st1 {v5.b}[2], [x0], x1
+ st1 {v5.b}[6], [x8], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2d}, [x5]
+ sub x1, x1, #4
+ sub v1.16b, v4.16b, v0.16b
+8:
+ ld1 {v2.16b}, [x2], #16
+ ld1 {v3.d}[0], [x0]
+ ld1 {v3.d}[1], [x8]
+ subs w4, w4, #2
+ umull v5.8h, v0.8b, v2.8b
+ umlal v5.8h, v3.8b, v1.8b
+ umull2 v6.8h, v0.16b, v2.16b
+ umlal2 v6.8h, v3.16b, v1.16b
+ rshrn v7.8b, v5.8h, #6
+ rshrn2 v7.16b, v6.8h, #6
+ st1 {v7.s}[0], [x0], #4
+ st1 {v7.s}[2], [x8], #4
+ st1 {v7.h}[2], [x0], x1
+ st1 {v7.h}[6], [x8], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x5]
+ sub x1, x1, #8
+ sub v2.16b, v4.16b, v0.16b
+16:
+ ld1 {v5.16b, v6.16b}, [x2], #32
+ ld1 {v7.16b}, [x0]
+ subs w4, w4, #2
+ ld1 {v16.16b}, [x8]
+ umull v17.8h, v5.8b, v0.8b
+ umlal v17.8h, v7.8b, v2.8b
+ umull2 v18.8h, v5.16b, v0.16b
+ umlal2 v18.8h, v7.16b, v2.16b
+ umull v20.8h, v6.8b, v0.8b
+ umlal v20.8h, v16.8b, v2.8b
+ umull2 v21.8h, v6.16b, v0.16b
+ umlal2 v21.8h, v16.16b, v2.16b
+ rshrn v19.8b, v17.8h, #6
+ rshrn2 v19.16b, v18.8h, #6
+ rshrn v22.8b, v20.8h, #6
+ rshrn2 v22.16b, v21.8h, #6
+ st1 {v19.8b}, [x0], #8
+ st1 {v22.8b}, [x8], #8
+ st1 {v19.s}[2], [x0], x1
+ st1 {v22.s}[2], [x8], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b}, [x5]
+ sub x1, x1, #16
+ sub v2.16b, v4.16b, v0.16b
+ sub v3.8b, v4.8b, v1.8b
+32:
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
+ ld1 {v5.16b, v6.16b}, [x0]
+ subs w4, w4, #2
+ ld1 {v20.16b, v21.16b}, [x8]
+ umull v22.8h, v16.8b, v0.8b
+ umlal v22.8h, v5.8b, v2.8b
+ umull2 v23.8h, v16.16b, v0.16b
+ umlal2 v23.8h, v5.16b, v2.16b
+ umull v28.8h, v17.8b, v1.8b
+ umlal v28.8h, v6.8b, v3.8b
+ umull v30.8h, v18.8b, v0.8b
+ umlal v30.8h, v20.8b, v2.8b
+ umull2 v31.8h, v18.16b, v0.16b
+ umlal2 v31.8h, v20.16b, v2.16b
+ umull v25.8h, v19.8b, v1.8b
+ umlal v25.8h, v21.8b, v3.8b
+ rshrn v24.8b, v22.8h, #6
+ rshrn2 v24.16b, v23.8h, #6
+ rshrn v28.8b, v28.8h, #6
+ rshrn v30.8b, v30.8h, #6
+ rshrn2 v30.16b, v31.8h, #6
+ rshrn v27.8b, v25.8h, #6
+ st1 {v24.16b}, [x0], #16
+ st1 {v30.16b}, [x8], #16
+ st1 {v28.8b}, [x0], x1
+ st1 {v27.8b}, [x8], x1
+ b.gt 32b
+ ret
+L(blend_v_tbl):
+ .hword L(blend_v_tbl) - 320b
+ .hword L(blend_v_tbl) - 160b
+ .hword L(blend_v_tbl) - 80b
+ .hword L(blend_v_tbl) - 40b
+ .hword L(blend_v_tbl) - 20b
+endfunc
+
+
+// This has the same signature as the put_8tap functions,
+// and assumes that x8 is set to (clz(w)-24).
+function put_neon
+ adr x9, L(put_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+2:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.h}[0], [x2], x3
+ ld1 {v1.h}[0], [x2], x3
+ subs w5, w5, #2
+ st1 {v0.h}[0], [x0], x1
+ st1 {v1.h}[0], [x0], x1
+ b.gt 2b
+ ret
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v1.s}[0], [x2], x3
+ subs w5, w5, #2
+ st1 {v0.s}[0], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v1.8b}, [x2], x3
+ subs w5, w5, #2
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x0], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x0, x1
+ lsl x1, x1, #1
+ add x9, x2, x3
+ lsl x3, x3, #1
+16:
+ ld1 {v0.16b}, [x2], x3
+ ld1 {v1.16b}, [x9], x3
+ subs w5, w5, #2
+ st1 {v0.16b}, [x0], x1
+ st1 {v1.16b}, [x8], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x0]
+ subs w5, w5, #1
+ stp x8, x9, [x0, #16]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 32b
+ ret
+64:
+ AARCH64_VALID_JUMP_TARGET
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x0]
+ ldp x10, x11, [x2, #32]
+ stp x8, x9, [x0, #16]
+ subs w5, w5, #1
+ ldp x12, x13, [x2, #48]
+ stp x10, x11, [x0, #32]
+ stp x12, x13, [x0, #48]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 64b
+ ret
+128:
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x2]
+ ldp q2, q3, [x2, #32]
+ stp q0, q1, [x0]
+ ldp q4, q5, [x2, #64]
+ stp q2, q3, [x0, #32]
+ ldp q6, q7, [x2, #96]
+ subs w5, w5, #1
+ stp q4, q5, [x0, #64]
+ stp q6, q7, [x0, #96]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 128b
+ ret
+
+L(put_tbl):
+ .hword L(put_tbl) - 128b
+ .hword L(put_tbl) - 64b
+ .hword L(put_tbl) - 32b
+ .hword L(put_tbl) - 160b
+ .hword L(put_tbl) - 8b
+ .hword L(put_tbl) - 4b
+ .hword L(put_tbl) - 2b
+endfunc
+
+
+// This has the same signature as the prep_8tap functions,
+// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
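+// With no subpel filtering selected, this just widens the pixels and shifts
+// them left by 4 into the 16-bit intermediate format consumed by
+// avg/w_avg/mask above.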
+function prep_neon
+ adr x9, L(prep_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x1], x2
+ ld1 {v1.s}[0], [x1], x2
+ subs w4, w4, #2
+ ushll v0.8h, v0.8b, #4
+ ushll v1.8h, v1.8b, #4
+ st1 {v0.4h, v1.4h}, [x0], #16
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v1.8b}, [x1], x2
+ subs w4, w4, #2
+ ushll v0.8h, v0.8b, #4
+ ushll v1.8h, v1.8b, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ add x9, x1, x2
+ lsl x2, x2, #1
+16:
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v1.16b}, [x9], x2
+ subs w4, w4, #2
+ ushll v4.8h, v0.8b, #4
+ ushll2 v5.8h, v0.16b, #4
+ ushll v6.8h, v1.8b, #4
+ ushll2 v7.8h, v1.16b, #4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x0, w3, uxtw
+32:
+ ld1 {v0.16b, v1.16b}, [x1], x2
+ subs w4, w4, #2
+ ushll v4.8h, v0.8b, #4
+ ushll2 v5.8h, v0.16b, #4
+ ld1 {v2.16b, v3.16b}, [x1], x2
+ ushll v6.8h, v1.8b, #4
+ ushll2 v7.8h, v1.16b, #4
+ ushll v16.8h, v2.8b, #4
+ st1 {v4.8h, v5.8h}, [x0], x7
+ ushll2 v17.8h, v2.16b, #4
+ st1 {v6.8h, v7.8h}, [x8], x7
+ ushll v18.8h, v3.8b, #4
+ st1 {v16.8h, v17.8h}, [x0], x7
+ ushll2 v19.8h, v3.16b, #4
+ st1 {v18.8h, v19.8h}, [x8], x7
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x0, #32
+ mov x6, #64
+64:
+ ldp q0, q1, [x1]
+ subs w4, w4, #1
+ ushll v4.8h, v0.8b, #4
+ ushll2 v5.8h, v0.16b, #4
+ ldp q2, q3, [x1, #32]
+ ushll v6.8h, v1.8b, #4
+ ushll2 v7.8h, v1.16b, #4
+ add x1, x1, x2
+ ushll v16.8h, v2.8b, #4
+ st1 {v4.8h, v5.8h}, [x0], x6
+ ushll2 v17.8h, v2.16b, #4
+ ushll v18.8h, v3.8b, #4
+ st1 {v6.8h, v7.8h}, [x8], x6
+ ushll2 v19.8h, v3.16b, #4
+ st1 {v16.8h, v17.8h}, [x0], x6
+ st1 {v18.8h, v19.8h}, [x8], x6
+ b.gt 64b
+ ret
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x0, #64
+ mov x6, #128
+128:
+ ldp q0, q1, [x1]
+ ldp q2, q3, [x1, #32]
+ ushll v16.8h, v0.8b, #4
+ ushll2 v17.8h, v0.16b, #4
+ ushll v18.8h, v1.8b, #4
+ ushll2 v19.8h, v1.16b, #4
+ ushll v20.8h, v2.8b, #4
+ ushll2 v21.8h, v2.16b, #4
+ ldp q4, q5, [x1, #64]
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6
+ ushll v22.8h, v3.8b, #4
+ ushll2 v23.8h, v3.16b, #4
+ ushll v24.8h, v4.8b, #4
+ ushll2 v25.8h, v4.16b, #4
+ ushll v26.8h, v5.8b, #4
+ ushll2 v27.8h, v5.16b, #4
+ ldp q6, q7, [x1, #96]
+ st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6
+ ushll v28.8h, v6.8b, #4
+ ushll2 v29.8h, v6.16b, #4
+ ushll v30.8h, v7.8b, #4
+ ushll2 v31.8h, v7.16b, #4
+ subs w4, w4, #1
+ add x1, x1, x2
+ st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6
+ st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6
+ b.gt 128b
+ ret
+
+L(prep_tbl):
+ .hword L(prep_tbl) - 1280b
+ .hword L(prep_tbl) - 640b
+ .hword L(prep_tbl) - 320b
+ .hword L(prep_tbl) - 160b
+ .hword L(prep_tbl) - 8b
+ .hword L(prep_tbl) - 4b
+endfunc
+
+
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ ld1 {\d0\wd}[0], [\s0], \strd
+ ld1 {\d1\wd}[0], [\s1], \strd
+.ifnb \d2
+ ld1 {\d2\wd}[0], [\s0], \strd
+ ld1 {\d3\wd}[0], [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd}[0], [\s0], \strd
+.endif
+.ifnb \d5
+ ld1 {\d5\wd}[0], [\s1], \strd
+.endif
+.ifnb \d6
+ ld1 {\d6\wd}[0], [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ ld1 {\d0\wd}, [\s0], \strd
+ ld1 {\d1\wd}, [\s1], \strd
+.ifnb \d2
+ ld1 {\d2\wd}, [\s0], \strd
+ ld1 {\d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd}, [\s0], \strd
+.endif
+.ifnb \d5
+ ld1 {\d5\wd}, [\s1], \strd
+.endif
+.ifnb \d6
+ ld1 {\d6\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_slice \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_reg \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_reg \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro interleave_1 wd, r0, r1, r2, r3, r4
+ trn1 \r0\wd, \r0\wd, \r1\wd
+ trn1 \r1\wd, \r1\wd, \r2\wd
+.ifnb \r3
+ trn1 \r2\wd, \r2\wd, \r3\wd
+ trn1 \r3\wd, \r3\wd, \r4\wd
+.endif
+.endm
+.macro interleave_1_h r0, r1, r2, r3, r4
+ interleave_1 .4h, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro interleave_1_s r0, r1, r2, r3, r4
+ interleave_1 .2s, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro interleave_2 wd, r0, r1, r2, r3, r4, r5
+ trn1 \r0\wd, \r0\wd, \r2\wd
+ trn1 \r1\wd, \r1\wd, \r3\wd
+ trn1 \r2\wd, \r2\wd, \r4\wd
+ trn1 \r3\wd, \r3\wd, \r5\wd
+.endm
+.macro interleave_2_s r0, r1, r2, r3, r4, r5
+ interleave_2 .2s, \r0, \r1, \r2, \r3, \r4, \r5
+.endm
+.macro uxtl_b r0, r1, r2, r3, r4, r5, r6
+ uxtl \r0\().8h, \r0\().8b
+ uxtl \r1\().8h, \r1\().8b
+.ifnb \r2
+ uxtl \r2\().8h, \r2\().8b
+ uxtl \r3\().8h, \r3\().8b
+.endif
+.ifnb \r4
+ uxtl \r4\().8h, \r4\().8b
+.endif
+.ifnb \r5
+ uxtl \r5\().8h, \r5\().8b
+.endif
+.ifnb \r6
+ uxtl \r6\().8h, \r6\().8b
+.endif
+.endm
+.macro mul_mla_4 d, s0, s1, s2, s3, wd
+ mul \d\wd, \s0\wd, v0.h[0]
+ mla \d\wd, \s1\wd, v0.h[1]
+ mla \d\wd, \s2\wd, v0.h[2]
+ mla \d\wd, \s3\wd, v0.h[3]
+.endm
+// Interleaving the mul/mla chains actually hurts performance
+// significantly on Cortex A53, so the mul/mla sequences are kept
+// tightly chained like this.
+.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
+ mul \d0\().4h, \s0\().4h, v0.h[0]
+ mla \d0\().4h, \s1\().4h, v0.h[1]
+ mla \d0\().4h, \s2\().4h, v0.h[2]
+ mla \d0\().4h, \s3\().4h, v0.h[3]
+ mla \d0\().4h, \s4\().4h, v0.h[4]
+ mla \d0\().4h, \s5\().4h, v0.h[5]
+ mla \d0\().4h, \s6\().4h, v0.h[6]
+ mla \d0\().4h, \s7\().4h, v0.h[7]
+.endm
+.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+ mul \d0\().8h, \s0\().8h, v0.h[0]
+ mla \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mla \d0\().8h, \s7\().8h, v0.h[7]
+.endm
+.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ mul \d0\().8h, \s0\().8h, v0.h[0]
+ mla \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mla \d0\().8h, \s7\().8h, v0.h[7]
+ mul \d1\().8h, \s1\().8h, v0.h[0]
+ mla \d1\().8h, \s2\().8h, v0.h[1]
+ mla \d1\().8h, \s3\().8h, v0.h[2]
+ mla \d1\().8h, \s4\().8h, v0.h[3]
+ mla \d1\().8h, \s5\().8h, v0.h[4]
+ mla \d1\().8h, \s6\().8h, v0.h[5]
+ mla \d1\().8h, \s7\().8h, v0.h[6]
+ mla \d1\().8h, \s8\().8h, v0.h[7]
+.endm
+.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+ mul \d0\().8h, \s0\().8h, v0.h[0]
+ mla \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mla \d0\().8h, \s7\().8h, v0.h[7]
+ mul \d1\().8h, \s2\().8h, v0.h[0]
+ mla \d1\().8h, \s3\().8h, v0.h[1]
+ mla \d1\().8h, \s4\().8h, v0.h[2]
+ mla \d1\().8h, \s5\().8h, v0.h[3]
+ mla \d1\().8h, \s6\().8h, v0.h[4]
+ mla \d1\().8h, \s7\().8h, v0.h[5]
+ mla \d1\().8h, \s8\().8h, v0.h[6]
+ mla \d1\().8h, \s9\().8h, v0.h[7]
+.endm
+.macro sqrshrun_b shift, r0, r1, r2, r3
+ sqrshrun \r0\().8b, \r0\().8h, #\shift
+.ifnb \r1
+ sqrshrun \r1\().8b, \r1\().8h, #\shift
+.endif
+.ifnb \r2
+ sqrshrun \r2\().8b, \r2\().8h, #\shift
+ sqrshrun \r3\().8b, \r3\().8h, #\shift
+.endif
+.endm
+.macro srshr_h shift, r0, r1, r2, r3
+ srshr \r0\().8h, \r0\().8h, #\shift
+.ifnb \r1
+ srshr \r1\().8h, \r1\().8h, #\shift
+.endif
+.ifnb \r2
+ srshr \r2\().8h, \r2\().8h, #\shift
+ srshr \r3\().8h, \r3\().8h, #\shift
+.endif
+.endm
+.macro st_h strd, reg, lanes
+ st1 {\reg\().h}[0], [x0], \strd
+ st1 {\reg\().h}[1], [x8], \strd
+.if \lanes > 2
+ st1 {\reg\().h}[2], [x0], \strd
+ st1 {\reg\().h}[3], [x8], \strd
+.endif
+.endm
+.macro st_s strd, r0, r1
+ st1 {\r0\().s}[0], [x0], \strd
+ st1 {\r0\().s}[1], [x8], \strd
+.ifnb \r1
+ st1 {\r1\().s}[0], [x0], \strd
+ st1 {\r1\().s}[1], [x8], \strd
+.endif
+.endm
+.macro st_d strd, r0, r1
+ st1 {\r0\().d}[0], [x0], \strd
+ st1 {\r0\().d}[1], [x8], \strd
+.ifnb \r1
+ st1 {\r1\().d}[0], [x0], \strd
+ st1 {\r1\().d}[1], [x8], \strd
+.endif
+.endm
+.macro shift_store_4 type, strd, r0, r1
+.ifc \type, put
+ sqrshrun_b 6, \r0, \r1
+ st_s \strd, \r0, \r1
+.else
+ srshr_h 2, \r0, \r1
+ st_d \strd, \r0, \r1
+.endif
+.endm
+.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
+ st1 {\r0\wd}, [x0], \strd
+ st1 {\r1\wd}, [x8], \strd
+.ifnb \r2
+ st1 {\r2\wd}, [x0], \strd
+ st1 {\r3\wd}, [x8], \strd
+.endif
+.ifnb \r4
+ st1 {\r4\wd}, [x0], \strd
+ st1 {\r5\wd}, [x8], \strd
+ st1 {\r6\wd}, [x0], \strd
+ st1 {\r7\wd}, [x8], \strd
+.endif
+.endm
+.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7
+ st_reg \strd, .8b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7
+ st_reg \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro shift_store_8 type, strd, r0, r1, r2, r3
+.ifc \type, put
+ sqrshrun_b 6, \r0, \r1, \r2, \r3
+ st_8b \strd, \r0, \r1, \r2, \r3
+.else
+ srshr_h 2, \r0, \r1, \r2, \r3
+ st_16b \strd, \r0, \r1, \r2, \r3
+.endif
+.endm
+.macro shift_store_16 type, strd, r0, r1, r2, r3
+.ifc \type, put
+ sqrshrun \r0\().8b, \r0\().8h, #6
+ sqrshrun2 \r0\().16b, \r1\().8h, #6
+ sqrshrun \r2\().8b, \r2\().8h, #6
+ sqrshrun2 \r2\().16b, \r3\().8h, #6
+ st_16b \strd, \r0, \r2
+.else
+ srshr_h 2, \r0, \r1, \r2, \r3
+ st1 {\r0\().8h, \r1\().8h}, [x0], \strd
+ st1 {\r2\().8h, \r3\().8h}, [x8], \strd
+.endif
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_8bpc_neon, export=1
+ mov x8, \type_h
+ mov x9, \type_v
+ b \op\()_8tap_neon
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH ((1*15<<7)|4*15)
+#define SHARP ((2*15<<7)|3*15)
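+
+// Each value packs two 7-bit offsets into the mc_subpel_filters table:
+// bits 0-6 select the 4-tap set used for small blocks and bits 7-13 the
+// 8-tap set. Multiplying mx/my by 0x4081 below replicates the subpel
+// position into the same fields (plus bit 14), so a single add yields
+// ready-to-use table indices, and the bit-14 field doubles as the
+// "any filtering needed at all" test.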
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
+make_8tap_fn \type, regular, REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp, REGULAR, SHARP
+make_8tap_fn \type, smooth, SMOOTH, SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
+make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
+make_8tap_fn \type, sharp, SHARP, SHARP
+make_8tap_fn \type, sharp_regular, SHARP, REGULAR
+make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
+
+function \type\()_8tap_neon
+ mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
+ mul \mx, \mx, w10
+ mul \my, \my, w10
+ add \mx, \mx, w8 // mx, 8tap_h, 4tap_h
+ add \my, \my, w9 // my, 8tap_v, 4tap_v
+.ifc \type, prep
+ uxtw \d_strd, \w
+ lsl \d_strd, \d_strd, #1
+.endif
+
+ clz w8, \w
+ tst \mx, #(0x7f << 14)
+ sub w8, w8, #24
+ movrel x10, X(mc_subpel_filters), -8
+ b.ne L(\type\()_8tap_h)
+ tst \my, #(0x7f << 14)
+ b.ne L(\type\()_8tap_v)
+ b \type\()_neon
+
+L(\type\()_8tap_h):
+ cmp \w, #4
+ ubfx w9, \mx, #7, #7
+ and \mx, \mx, #0x7f
+ b.le 4f
+ mov \mx, w9
+4:
+ tst \my, #(0x7f << 14)
+ add \xmx, x10, \mx, uxtw #3
+ b.ne L(\type\()_8tap_hv)
+
+ adr x9, L(\type\()_8tap_h_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20: // 2xN h
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ sub \src, \src, #1
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+2:
+ ld1 {v4.8b}, [\src], \s_strd
+ ld1 {v6.8b}, [\sr2], \s_strd
+ uxtl v4.8h, v4.8b
+ uxtl v6.8h, v6.8b
+ ext v5.16b, v4.16b, v4.16b, #2
+ ext v7.16b, v6.16b, v6.16b, #2
+ subs \h, \h, #2
+ trn1 v3.2s, v4.2s, v6.2s
+ trn2 v6.2s, v4.2s, v6.2s
+ trn1 v4.2s, v5.2s, v7.2s
+ trn2 v7.2s, v5.2s, v7.2s
+ mul v3.4h, v3.4h, v0.h[0]
+ mla v3.4h, v4.4h, v0.h[1]
+ mla v3.4h, v6.4h, v0.h[2]
+ mla v3.4h, v7.4h, v0.h[3]
+ srshr v3.4h, v3.4h, #2
+ sqrshrun v3.8b, v3.8h, #4
+ st1 {v3.h}[0], [\dst], \d_strd
+ st1 {v3.h}[1], [\ds2], \d_strd
+ b.gt 2b
+ ret
+.endif
+
+40: // 4xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ sub \src, \src, #1
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+4:
+ ld1 {v16.8b}, [\src], \s_strd
+ ld1 {v20.8b}, [\sr2], \s_strd
+ uxtl v16.8h, v16.8b
+ uxtl v20.8h, v20.8b
+ ext v17.16b, v16.16b, v16.16b, #2
+ ext v18.16b, v16.16b, v16.16b, #4
+ ext v19.16b, v16.16b, v16.16b, #6
+ ext v21.16b, v20.16b, v20.16b, #2
+ ext v22.16b, v20.16b, v20.16b, #4
+ ext v23.16b, v20.16b, v20.16b, #6
+ subs \h, \h, #2
+ mul v16.4h, v16.4h, v0.h[0]
+ mla v16.4h, v17.4h, v0.h[1]
+ mla v16.4h, v18.4h, v0.h[2]
+ mla v16.4h, v19.4h, v0.h[3]
+ mul v20.4h, v20.4h, v0.h[0]
+ mla v20.4h, v21.4h, v0.h[1]
+ mla v20.4h, v22.4h, v0.h[2]
+ mla v20.4h, v23.4h, v0.h[3]
+ srshr v16.4h, v16.4h, #2
+ srshr v20.4h, v20.4h, #2
+.ifc \type, put
+ sqrshrun v16.8b, v16.8h, #4
+ sqrshrun v20.8b, v20.8h, #4
+ st1 {v16.s}[0], [\dst], \d_strd
+ st1 {v20.s}[0], [\ds2], \d_strd
+.else
+ st1 {v16.4h}, [\dst], \d_strd
+ st1 {v20.4h}, [\ds2], \d_strd
+.endif
+ b.gt 4b
+ ret
+
+80: // 8xN h
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmx]
+ sub \src, \src, #3
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+8:
+ ld1 {v16.8b, v17.8b}, [\src], \s_strd
+ ld1 {v20.8b, v21.8b}, [\sr2], \s_strd
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+
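+        // The mul below seeds tap 0 and the .irpc loop adds taps 1-7: each
+        // ext by 2*i bytes shifts the widened source by i pixels (16-bit lanes).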
+ mul v18.8h, v16.8h, v0.h[0]
+ mul v22.8h, v20.8h, v0.h[0]
+.irpc i, 1234567
+ ext v19.16b, v16.16b, v17.16b, #(2*\i)
+ ext v23.16b, v20.16b, v21.16b, #(2*\i)
+ mla v18.8h, v19.8h, v0.h[\i]
+ mla v22.8h, v23.8h, v0.h[\i]
+.endr
+ subs \h, \h, #2
+ srshr v18.8h, v18.8h, #2
+ srshr v22.8h, v22.8h, #2
+.ifc \type, put
+ sqrshrun v18.8b, v18.8h, #4
+ sqrshrun v22.8b, v22.8h, #4
+ st1 {v18.8b}, [\dst], \d_strd
+ st1 {v22.8b}, [\ds2], \d_strd
+.else
+ st1 {v18.8h}, [\dst], \d_strd
+ st1 {v22.8h}, [\ds2], \d_strd
+.endif
+ b.gt 8b
+ ret
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmx]
+ sub \src, \src, #3
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+
+ sub \s_strd, \s_strd, \w, uxtw
+ sub \s_strd, \s_strd, #8
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, uxtw
+.endif
+161:
+ ld1 {v16.8b, v17.8b, v18.8b}, [\src], #24
+ ld1 {v20.8b, v21.8b, v22.8b}, [\sr2], #24
+ mov \mx, \w
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+
+16:
+ mul v24.8h, v16.8h, v0.h[0]
+ mul v25.8h, v17.8h, v0.h[0]
+ mul v26.8h, v20.8h, v0.h[0]
+ mul v27.8h, v21.8h, v0.h[0]
+.irpc i, 1234567
+ ext v28.16b, v16.16b, v17.16b, #(2*\i)
+ ext v29.16b, v17.16b, v18.16b, #(2*\i)
+ ext v30.16b, v20.16b, v21.16b, #(2*\i)
+ ext v31.16b, v21.16b, v22.16b, #(2*\i)
+ mla v24.8h, v28.8h, v0.h[\i]
+ mla v25.8h, v29.8h, v0.h[\i]
+ mla v26.8h, v30.8h, v0.h[\i]
+ mla v27.8h, v31.8h, v0.h[\i]
+.endr
+ srshr v24.8h, v24.8h, #2
+ srshr v25.8h, v25.8h, #2
+ srshr v26.8h, v26.8h, #2
+ srshr v27.8h, v27.8h, #2
+ subs \mx, \mx, #16
+.ifc \type, put
+ sqrshrun v24.8b, v24.8h, #4
+ sqrshrun2 v24.16b, v25.8h, #4
+ sqrshrun v26.8b, v26.8h, #4
+ sqrshrun2 v26.16b, v27.8h, #4
+ st1 {v24.16b}, [\dst], #16
+ st1 {v26.16b}, [\ds2], #16
+.else
+ st1 {v24.8h, v25.8h}, [\dst], #32
+ st1 {v26.8h, v27.8h}, [\ds2], #32
+.endif
+ b.le 9f
+
+ mov v16.16b, v18.16b
+ mov v20.16b, v22.16b
+ ld1 {v17.8b, v18.8b}, [\src], #16
+ ld1 {v21.8b, v22.8b}, [\sr2], #16
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ b.gt 161b
+ ret
+
+L(\type\()_8tap_h_tbl):
+ .hword L(\type\()_8tap_h_tbl) - 1280b
+ .hword L(\type\()_8tap_h_tbl) - 640b
+ .hword L(\type\()_8tap_h_tbl) - 320b
+ .hword L(\type\()_8tap_h_tbl) - 160b
+ .hword L(\type\()_8tap_h_tbl) - 80b
+ .hword L(\type\()_8tap_h_tbl) - 40b
+ .hword L(\type\()_8tap_h_tbl) - 20b
+ .hword 0
+
+
+L(\type\()_8tap_v):
+ cmp \h, #4
+ ubfx w9, \my, #7, #7
+ and \my, \my, #0x7f
+ b.le 4f
+ mov \my, w9
+4:
+ add \xmy, x10, \my, uxtw #3
+
+ adr x9, L(\type\()_8tap_v_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20: // 2xN v
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ b.gt 28f
+
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ // 2x2 v
+ load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ interleave_1_h v1, v2, v3, v4, v5
+ b.gt 24f
+ uxtl_b v1, v2, v3, v4
+ mul_mla_4 v6, v1, v2, v3, v4, .4h
+ sqrshrun_b 6, v6
+ st_h \d_strd, v6, 2
+ ret
+
+24: // 2x4 v
+ load_h \sr2, \src, \s_strd, v6, v7
+ interleave_1_h v5, v6, v7
+ interleave_2_s v1, v2, v3, v4, v5, v6
+ uxtl_b v1, v2, v3, v4
+ mul_mla_4 v6, v1, v2, v3, v4, .8h
+ sqrshrun_b 6, v6
+ st_h \d_strd, v6, 4
+ ret
+
+28: // 2x6, 2x8, 2x12, 2x16 v
+ ld1 {v0.8b}, [\xmy]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7
+ interleave_1_h v1, v2, v3, v4, v5
+ interleave_1_h v5, v6, v7
+ interleave_2_s v1, v2, v3, v4, v5, v6
+ uxtl_b v1, v2, v3, v4
+216:
+ subs \h, \h, #4
+ load_h \sr2, \src, \s_strd, v16, v17, v18, v19
+ interleave_1_h v7, v16, v17, v18, v19
+ interleave_2_s v5, v6, v7, v16, v17, v18
+ uxtl_b v5, v6, v7, v16
+ mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
+ sqrshrun_b 6, v30
+ st_h \d_strd, v30, 4
+ b.le 0f
+ cmp \h, #2
+ mov v1.16b, v5.16b
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+ mov v4.16b, v16.16b
+ mov v5.16b, v17.16b
+ mov v6.16b, v18.16b
+ mov v7.16b, v19.16b
+ b.eq 26f
+ b 216b
+26:
+ load_h \sr2, \src, \s_strd, v16, v17
+ interleave_1_h v7, v16, v17
+ uxtl_b v5, v6, v7, v16
+ mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
+ sqrshrun_b 6, v30
+ st_h \d_strd, v30, 2
+0:
+ ret
+.endif
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 480f
+
+ // 4x2, 4x4 v
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ interleave_1_s v1, v2, v3, v4, v5
+ uxtl_b v1, v2, v3, v4
+ mul_mla_4 v6, v1, v2, v3, v4, .8h
+ shift_store_4 \type, \d_strd, v6
+ b.le 0f
+ load_s \sr2, \src, \s_strd, v6, v7
+ interleave_1_s v5, v6, v7
+ uxtl_b v5, v6
+ mul_mla_4 v7, v3, v4, v5, v6, .8h
+ shift_store_4 \type, \d_strd, v7
+0:
+ ret
+
+480: // 4x6, 4x8, 4x12, 4x16 v
+ ld1 {v0.8b}, [\xmy]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_s \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+ interleave_1_s v16, v17, v18
+ interleave_1_s v18, v19, v20, v21, v22
+ uxtl_b v16, v17
+ uxtl_b v18, v19, v20, v21
+
+48:
+ subs \h, \h, #4
+ load_s \sr2, \src, \s_strd, v23, v24, v25, v26
+ interleave_1_s v22, v23, v24, v25, v26
+ uxtl_b v22, v23, v24, v25
+ mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+ shift_store_4 \type, \d_strd, v1, v2
+ b.le 0f
+ load_s \sr2, \src, \s_strd, v27, v16
+ subs \h, \h, #2
+ interleave_1_s v26, v27, v16
+ uxtl_b v26, v27
+ mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
+ shift_store_4 \type, \d_strd, v1
+ b.le 0f
+ load_s \sr2, \src, \s_strd, v17, v18
+ subs \h, \h, #2
+ interleave_1_s v16, v17, v18
+ uxtl_b v16, v17
+ mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
+ shift_store_4 \type, \d_strd, v2
+ b.le 0f
+ subs \h, \h, #4
+ load_s \sr2, \src, \s_strd, v19, v20, v21, v22
+ interleave_1_s v18, v19, v20, v21, v22
+ uxtl_b v18, v19, v20, v21
+ mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
+ shift_store_4 \type, \d_strd, v1, v2
+ b.gt 48b
+0:
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 880f
+
+ // 8x2, 8x4 v
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ uxtl_b v1, v2, v3, v4, v5
+ mul_mla_4 v6, v1, v2, v3, v4, .8h
+ mul_mla_4 v7, v2, v3, v4, v5, .8h
+ shift_store_8 \type, \d_strd, v6, v7
+ b.le 0f
+ load_8b \sr2, \src, \s_strd, v6, v7
+ uxtl_b v6, v7
+ mul_mla_4 v1, v3, v4, v5, v6, .8h
+ mul_mla_4 v2, v4, v5, v6, v7, .8h
+ shift_store_8 \type, \d_strd, v1, v2
+0:
+ ret
+
+880: // 8x6, 8x8, 8x16, 8x32 v
+1680: // 16x8, 16x16, ...
+320: // 32x8, 32x16, ...
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmy]
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ sxtl v0.8h, v0.8b
+ mov \my, \h
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ load_8b \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+ uxtl_b v16, v17, v18, v19, v20, v21, v22
+
+88:
+ subs \h, \h, #2
+ load_8b \sr2, \src, \s_strd, v23, v24
+ uxtl_b v23, v24
+ mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24
+ shift_store_8 \type, \d_strd, v1, v2
+ b.le 9f
+ subs \h, \h, #2
+ load_8b \sr2, \src, \s_strd, v25, v26
+ uxtl_b v25, v26
+ mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
+ shift_store_8 \type, \d_strd, v3, v4
+ b.le 9f
+ subs \h, \h, #2
+ load_8b \sr2, \src, \s_strd, v27, v16
+ uxtl_b v27, v16
+ mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
+ shift_store_8 \type, \d_strd, v1, v2
+ b.le 9f
+ subs \h, \h, #2
+ load_8b \sr2, \src, \s_strd, v17, v18
+ uxtl_b v17, v18
+ mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
+ shift_store_8 \type, \d_strd, v3, v4
+ b.le 9f
+ subs \h, \h, #4
+ load_8b \sr2, \src, \s_strd, v19, v20, v21, v22
+ uxtl_b v19, v20, v21, v22
+ mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20
+ mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22
+ shift_store_8 \type, \d_strd, v1, v2, v3, v4
+ b.gt 88b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 168b
+0:
+ ret
+
+160:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 1680b
+
+ // 16x2, 16x4 v
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ cmp \h, #2
+ load_16b \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ uxtl v16.8h, v1.8b
+ uxtl v17.8h, v2.8b
+ uxtl v18.8h, v3.8b
+ uxtl v19.8h, v4.8b
+ uxtl v20.8h, v5.8b
+ uxtl2 v23.8h, v1.16b
+ uxtl2 v24.8h, v2.16b
+ uxtl2 v25.8h, v3.16b
+ uxtl2 v26.8h, v4.16b
+ uxtl2 v27.8h, v5.16b
+ mul_mla_4 v1, v16, v17, v18, v19, .8h
+ mul_mla_4 v16, v17, v18, v19, v20, .8h
+ mul_mla_4 v2, v23, v24, v25, v26, .8h
+ mul_mla_4 v17, v24, v25, v26, v27, .8h
+ shift_store_16 \type, \d_strd, v1, v2, v16, v17
+ b.le 0f
+ load_16b \sr2, \src, \s_strd, v6, v7
+ uxtl v21.8h, v6.8b
+ uxtl v22.8h, v7.8b
+ uxtl2 v28.8h, v6.16b
+ uxtl2 v29.8h, v7.16b
+ mul_mla_4 v1, v18, v19, v20, v21, .8h
+ mul_mla_4 v3, v19, v20, v21, v22, .8h
+ mul_mla_4 v2, v25, v26, v27, v28, .8h
+ mul_mla_4 v4, v26, v27, v28, v29, .8h
+ shift_store_16 \type, \d_strd, v1, v2, v3, v4
+0:
+ ret
+
+L(\type\()_8tap_v_tbl):
+ .hword L(\type\()_8tap_v_tbl) - 1280b
+ .hword L(\type\()_8tap_v_tbl) - 640b
+ .hword L(\type\()_8tap_v_tbl) - 320b
+ .hword L(\type\()_8tap_v_tbl) - 160b
+ .hword L(\type\()_8tap_v_tbl) - 80b
+ .hword L(\type\()_8tap_v_tbl) - 40b
+ .hword L(\type\()_8tap_v_tbl) - 20b
+ .hword 0
+
+L(\type\()_8tap_hv):
+ cmp \h, #4
+ ubfx w9, \my, #7, #7
+ and \my, \my, #0x7f
+ b.le 4f
+ mov \my, w9
+4:
+ add \xmy, x10, \my, uxtw #3
+
+ adr x9, L(\type\()_8tap_hv_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20:
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ b.gt 280f
+ add \xmy, \xmy, #2
+ ld1 {v1.s}[0], [\xmy]
+
+ // 2x2, 2x4 hv
+ sub \sr2, \src, #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v28.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ ext v29.16b, v28.16b, v28.16b, #2
+ mul v28.4h, v28.4h, v0.4h
+ mul v29.4h, v29.4h, v0.4h
+ addp v28.4h, v28.4h, v29.4h
+ addp v16.4h, v28.4h, v28.4h
+ srshr v16.4h, v16.4h, #2
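+        // (the mul/addp sequence above evaluates the 4-tap horizontal filter
+        //  for the two output pixels of the first source row)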
+ bl L(\type\()_8tap_filter_2)
+
+ trn1 v16.2s, v16.2s, v28.2s
+ mov v17.8b, v28.8b
+
+2:
+ bl L(\type\()_8tap_filter_2)
+
+ ext v18.8b, v17.8b, v28.8b, #4
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v28.4h, v1.h[3]
+
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqxtun v2.8b, v2.8h
+ subs \h, \h, #2
+ st1 {v2.h}[0], [\dst], \d_strd
+ st1 {v2.h}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v28.8b
+ b 2b
+
+280: // 2x8, 2x16, 2x32 hv
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #1
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v28.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ ext v29.16b, v28.16b, v28.16b, #2
+ mul v28.4h, v28.4h, v0.4h
+ mul v29.4h, v29.4h, v0.4h
+ addp v28.4h, v28.4h, v29.4h
+ addp v16.4h, v28.4h, v28.4h
+ srshr v16.4h, v16.4h, #2
+
+ bl L(\type\()_8tap_filter_2)
+ trn1 v16.2s, v16.2s, v28.2s
+ mov v17.8b, v28.8b
+ bl L(\type\()_8tap_filter_2)
+ ext v18.8b, v17.8b, v28.8b, #4
+ mov v19.8b, v28.8b
+ bl L(\type\()_8tap_filter_2)
+ ext v20.8b, v19.8b, v28.8b, #4
+ mov v21.8b, v28.8b
+
+28:
+ bl L(\type\()_8tap_filter_2)
+ ext v22.8b, v21.8b, v28.8b, #4
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+ smlal v2.4s, v28.4h, v1.h[7]
+
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqxtun v2.8b, v2.8h
+ subs \h, \h, #2
+ st1 {v2.h}[0], [\dst], \d_strd
+ st1 {v2.h}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v28.8b
+ b 28b
+
+0:
+ ret x15
+
+L(\type\()_8tap_filter_2):
+ ld1 {v28.8b}, [\sr2], \s_strd
+ ld1 {v30.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ uxtl v30.8h, v30.8b
+ ext v29.16b, v28.16b, v28.16b, #2
+ ext v31.16b, v30.16b, v30.16b, #2
+ trn1 v27.2s, v28.2s, v30.2s
+ trn2 v30.2s, v28.2s, v30.2s
+ trn1 v28.2s, v29.2s, v31.2s
+ trn2 v31.2s, v29.2s, v31.2s
+ mul v27.4h, v27.4h, v0.h[0]
+ mla v27.4h, v28.4h, v0.h[1]
+ mla v27.4h, v30.4h, v0.h[2]
+ mla v27.4h, v31.4h, v0.h[3]
+ srshr v28.4h, v27.4h, #2
+ ret
+.endif
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ b.gt 480f
+ add \xmy, \xmy, #2
+ ld1 {v1.s}[0], [\xmy]
+ sub \sr2, \src, #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ // 4x2, 4x4 hv
+ ld1 {v26.8b}, [\src], \s_strd
+ uxtl v26.8h, v26.8b
+ ext v28.16b, v26.16b, v26.16b, #2
+ ext v29.16b, v26.16b, v26.16b, #4
+ ext v30.16b, v26.16b, v26.16b, #6
+ mul v31.4h, v26.4h, v0.h[0]
+ mla v31.4h, v28.4h, v0.h[1]
+ mla v31.4h, v29.4h, v0.h[2]
+ mla v31.4h, v30.4h, v0.h[3]
+ srshr v16.4h, v31.4h, #2
+
+ bl L(\type\()_8tap_filter_4)
+ mov v17.8b, v28.8b
+ mov v18.8b, v29.8b
+
+4:
+ bl L(\type\()_8tap_filter_4)
+        // Interleaving the mul/mla chains actually hurts performance
+        // significantly on Cortex A53; hence the mul/mla sequences are kept
+        // tightly chained like this.
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v28.4h, v1.h[3]
+ smull v3.4s, v17.4h, v1.h[0]
+ smlal v3.4s, v18.4h, v1.h[1]
+ smlal v3.4s, v28.4h, v1.h[2]
+ smlal v3.4s, v29.4h, v1.h[3]
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqrshrn v3.4h, v3.4s, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ st1 {v2.s}[0], [\dst], \d_strd
+ st1 {v3.s}[0], [\ds2], \d_strd
+.else
+ st1 {v2.4h}, [\dst], \d_strd
+ st1 {v3.4h}, [\ds2], \d_strd
+.endif
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v28.8b
+ mov v18.8b, v29.8b
+ b 4b
+
+480: // 4x8, 4x16, 4x32 hv
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #1
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v26.8b}, [\src], \s_strd
+ uxtl v26.8h, v26.8b
+ ext v28.16b, v26.16b, v26.16b, #2
+ ext v29.16b, v26.16b, v26.16b, #4
+ ext v30.16b, v26.16b, v26.16b, #6
+ mul v31.4h, v26.4h, v0.h[0]
+ mla v31.4h, v28.4h, v0.h[1]
+ mla v31.4h, v29.4h, v0.h[2]
+ mla v31.4h, v30.4h, v0.h[3]
+ srshr v16.4h, v31.4h, #2
+
+ bl L(\type\()_8tap_filter_4)
+ mov v17.8b, v28.8b
+ mov v18.8b, v29.8b
+ bl L(\type\()_8tap_filter_4)
+ mov v19.8b, v28.8b
+ mov v20.8b, v29.8b
+ bl L(\type\()_8tap_filter_4)
+ mov v21.8b, v28.8b
+ mov v22.8b, v29.8b
+
+48:
+ bl L(\type\()_8tap_filter_4)
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+ smlal v2.4s, v28.4h, v1.h[7]
+ smull v3.4s, v17.4h, v1.h[0]
+ smlal v3.4s, v18.4h, v1.h[1]
+ smlal v3.4s, v19.4h, v1.h[2]
+ smlal v3.4s, v20.4h, v1.h[3]
+ smlal v3.4s, v21.4h, v1.h[4]
+ smlal v3.4s, v22.4h, v1.h[5]
+ smlal v3.4s, v28.4h, v1.h[6]
+ smlal v3.4s, v29.4h, v1.h[7]
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqrshrn v3.4h, v3.4s, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ st1 {v2.s}[0], [\dst], \d_strd
+ st1 {v3.s}[0], [\ds2], \d_strd
+.else
+ st1 {v2.4h}, [\dst], \d_strd
+ st1 {v3.4h}, [\ds2], \d_strd
+.endif
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v28.8b
+ mov v22.8b, v29.8b
+ b 48b
+0:
+ ret x15
+
+L(\type\()_8tap_filter_4):
+ ld1 {v26.8b}, [\sr2], \s_strd
+ ld1 {v27.8b}, [\src], \s_strd
+ uxtl v26.8h, v26.8b
+ uxtl v27.8h, v27.8b
+ ext v28.16b, v26.16b, v26.16b, #2
+ ext v29.16b, v26.16b, v26.16b, #4
+ ext v30.16b, v26.16b, v26.16b, #6
+ mul v31.4h, v26.4h, v0.h[0]
+ mla v31.4h, v28.4h, v0.h[1]
+ mla v31.4h, v29.4h, v0.h[2]
+ mla v31.4h, v30.4h, v0.h[3]
+ ext v28.16b, v27.16b, v27.16b, #2
+ ext v29.16b, v27.16b, v27.16b, #4
+ ext v30.16b, v27.16b, v27.16b, #6
+ mul v27.4h, v27.4h, v0.h[0]
+ mla v27.4h, v28.4h, v0.h[1]
+ mla v27.4h, v29.4h, v0.h[2]
+ mla v27.4h, v30.4h, v0.h[3]
+ srshr v28.4h, v31.4h, #2
+ srshr v29.4h, v27.4h, #2
+ ret
+
+80:
+160:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 880f
+ add \xmy, \xmy, #2
+ ld1 {v0.8b}, [\xmx]
+ ld1 {v1.s}[0], [\xmy]
+ sub \src, \src, #3
+ sub \src, \src, \s_strd
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+ mov \my, \h
+
+164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ bl L(\type\()_8tap_filter_8_first)
+ bl L(\type\()_8tap_filter_8)
+ mov v17.16b, v24.16b
+ mov v18.16b, v25.16b
+
+8:
+ smull v2.4s, v16.4h, v1.h[0]
+ smull2 v3.4s, v16.8h, v1.h[0]
+ bl L(\type\()_8tap_filter_8)
+ smull v4.4s, v17.4h, v1.h[0]
+ smull2 v5.4s, v17.8h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal2 v3.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal2 v5.4s, v18.8h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal2 v3.4s, v18.8h, v1.h[2]
+ smlal v4.4s, v24.4h, v1.h[2]
+ smlal2 v5.4s, v24.8h, v1.h[2]
+ smlal v2.4s, v24.4h, v1.h[3]
+ smlal2 v3.4s, v24.8h, v1.h[3]
+ smlal v4.4s, v25.4h, v1.h[3]
+ smlal2 v5.4s, v25.8h, v1.h[3]
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqrshrn2 v2.8h, v3.4s, #\shift_hv
+ sqrshrn v4.4h, v4.4s, #\shift_hv
+ sqrshrn2 v4.8h, v5.4s, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ sqxtun v2.8b, v2.8h
+ sqxtun v4.8b, v4.8h
+ st1 {v2.8b}, [\dst], \d_strd
+ st1 {v4.8b}, [\ds2], \d_strd
+.else
+ st1 {v2.8h}, [\dst], \d_strd
+ st1 {v4.8h}, [\ds2], \d_strd
+.endif
+ b.le 9f
+ mov v16.16b, v18.16b
+ mov v17.16b, v24.16b
+ mov v18.16b, v25.16b
+ b 8b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #2
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 164b
+
+880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmx]
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #3
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+ mov \my, \h
+
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ bl L(\type\()_8tap_filter_8_first)
+ bl L(\type\()_8tap_filter_8)
+ mov v17.16b, v24.16b
+ mov v18.16b, v25.16b
+ bl L(\type\()_8tap_filter_8)
+ mov v19.16b, v24.16b
+ mov v20.16b, v25.16b
+ bl L(\type\()_8tap_filter_8)
+ mov v21.16b, v24.16b
+ mov v22.16b, v25.16b
+
+88:
+ smull v2.4s, v16.4h, v1.h[0]
+ smull2 v3.4s, v16.8h, v1.h[0]
+ bl L(\type\()_8tap_filter_8)
+ smull v4.4s, v17.4h, v1.h[0]
+ smull2 v5.4s, v17.8h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal2 v3.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal2 v5.4s, v18.8h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal2 v3.4s, v18.8h, v1.h[2]
+ smlal v4.4s, v19.4h, v1.h[2]
+ smlal2 v5.4s, v19.8h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal2 v3.4s, v19.8h, v1.h[3]
+ smlal v4.4s, v20.4h, v1.h[3]
+ smlal2 v5.4s, v20.8h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal2 v3.4s, v20.8h, v1.h[4]
+ smlal v4.4s, v21.4h, v1.h[4]
+ smlal2 v5.4s, v21.8h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal2 v3.4s, v21.8h, v1.h[5]
+ smlal v4.4s, v22.4h, v1.h[5]
+ smlal2 v5.4s, v22.8h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+ smlal2 v3.4s, v22.8h, v1.h[6]
+ smlal v4.4s, v24.4h, v1.h[6]
+ smlal2 v5.4s, v24.8h, v1.h[6]
+ smlal v2.4s, v24.4h, v1.h[7]
+ smlal2 v3.4s, v24.8h, v1.h[7]
+ smlal v4.4s, v25.4h, v1.h[7]
+ smlal2 v5.4s, v25.8h, v1.h[7]
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqrshrn2 v2.8h, v3.4s, #\shift_hv
+ sqrshrn v4.4h, v4.4s, #\shift_hv
+ sqrshrn2 v4.8h, v5.4s, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ sqxtun v2.8b, v2.8h
+ sqxtun v4.8b, v4.8h
+ st1 {v2.8b}, [\dst], \d_strd
+ st1 {v4.8b}, [\ds2], \d_strd
+.else
+ st1 {v2.8h}, [\dst], \d_strd
+ st1 {v4.8h}, [\ds2], \d_strd
+.endif
+ b.le 9f
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v20.16b, v22.16b
+ mov v21.16b, v24.16b
+ mov v22.16b, v25.16b
+ b 88b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 168b
+0:
+ ret x15
+
+L(\type\()_8tap_filter_8_first):
+ ld1 {v28.8b, v29.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ uxtl v29.8h, v29.8b
+ mul v16.8h, v28.8h, v0.h[0]
+ ext v24.16b, v28.16b, v29.16b, #(2*1)
+ ext v25.16b, v28.16b, v29.16b, #(2*2)
+ ext v26.16b, v28.16b, v29.16b, #(2*3)
+ ext v27.16b, v28.16b, v29.16b, #(2*4)
+ mla v16.8h, v24.8h, v0.h[1]
+ mla v16.8h, v25.8h, v0.h[2]
+ mla v16.8h, v26.8h, v0.h[3]
+ mla v16.8h, v27.8h, v0.h[4]
+ ext v24.16b, v28.16b, v29.16b, #(2*5)
+ ext v25.16b, v28.16b, v29.16b, #(2*6)
+ ext v26.16b, v28.16b, v29.16b, #(2*7)
+ mla v16.8h, v24.8h, v0.h[5]
+ mla v16.8h, v25.8h, v0.h[6]
+ mla v16.8h, v26.8h, v0.h[7]
+ srshr v16.8h, v16.8h, #2
+ ret
+
+L(\type\()_8tap_filter_8):
+ ld1 {v28.8b, v29.8b}, [\sr2], \s_strd
+ ld1 {v30.8b, v31.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ uxtl v29.8h, v29.8b
+ uxtl v30.8h, v30.8b
+ uxtl v31.8h, v31.8b
+ mul v24.8h, v28.8h, v0.h[0]
+ mul v25.8h, v30.8h, v0.h[0]
+.irpc i, 1234567
+ ext v26.16b, v28.16b, v29.16b, #(2*\i)
+ ext v27.16b, v30.16b, v31.16b, #(2*\i)
+ mla v24.8h, v26.8h, v0.h[\i]
+ mla v25.8h, v27.8h, v0.h[\i]
+.endr
+ srshr v24.8h, v24.8h, #2
+ srshr v25.8h, v25.8h, #2
+ ret
+
+L(\type\()_8tap_hv_tbl):
+ .hword L(\type\()_8tap_hv_tbl) - 1280b
+ .hword L(\type\()_8tap_hv_tbl) - 640b
+ .hword L(\type\()_8tap_hv_tbl) - 320b
+ .hword L(\type\()_8tap_hv_tbl) - 160b
+ .hword L(\type\()_8tap_hv_tbl) - 80b
+ .hword L(\type\()_8tap_hv_tbl) - 40b
+ .hword L(\type\()_8tap_hv_tbl) - 20b
+ .hword 0
+endfunc
+
+
+function \type\()_bilin_8bpc_neon, export=1
+ dup v1.16b, \mx
+ dup v3.16b, \my
+ mov w9, #16
+ sub w8, w9, \mx
+ sub w9, w9, \my
+ dup v0.16b, w8
+ dup v2.16b, w9
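+        // v0/v1 = horizontal weights (16 - mx, mx),
+        // v2/v3 = vertical weights (16 - my, my)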
+.ifc \type, prep
+ uxtw \d_strd, \w
+ lsl \d_strd, \d_strd, #1
+.endif
+
+ clz w8, \w
+ sub w8, w8, #24
+ cbnz \mx, L(\type\()_bilin_h)
+ cbnz \my, L(\type\()_bilin_v)
+ b \type\()_neon
+
+L(\type\()_bilin_h):
+ cbnz \my, L(\type\()_bilin_hv)
+
+ adr x9, L(\type\()_bilin_h_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20: // 2xN h
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+2:
+ ld1 {v4.s}[0], [\src], \s_strd
+ ld1 {v6.s}[0], [\sr2], \s_strd
+ ext v5.8b, v4.8b, v4.8b, #1
+ ext v7.8b, v6.8b, v6.8b, #1
+ trn1 v4.4h, v4.4h, v6.4h
+ trn1 v5.4h, v5.4h, v7.4h
+ subs \h, \h, #2
+ umull v4.8h, v4.8b, v0.8b
+ umlal v4.8h, v5.8b, v1.8b
+ uqrshrn v4.8b, v4.8h, #4
+ st1 {v4.h}[0], [\dst], \d_strd
+ st1 {v4.h}[1], [\ds2], \d_strd
+ b.gt 2b
+ ret
+.endif
+
+40: // 4xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+4:
+ ld1 {v4.8b}, [\src], \s_strd
+ ld1 {v6.8b}, [\sr2], \s_strd
+ ext v5.8b, v4.8b, v4.8b, #1
+ ext v7.8b, v6.8b, v6.8b, #1
+ trn1 v4.2s, v4.2s, v6.2s
+ trn1 v5.2s, v5.2s, v7.2s
+ subs \h, \h, #2
+ umull v4.8h, v4.8b, v0.8b
+ umlal v4.8h, v5.8b, v1.8b
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+.else
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+.endif
+ b.gt 4b
+ ret
+
+80: // 8xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+8:
+ ld1 {v4.16b}, [\src], \s_strd
+ ld1 {v6.16b}, [\sr2], \s_strd
+ ext v5.16b, v4.16b, v4.16b, #1
+ ext v7.16b, v6.16b, v6.16b, #1
+ subs \h, \h, #2
+ umull v4.8h, v4.8b, v0.8b
+ umull v6.8h, v6.8b, v0.8b
+ umlal v4.8h, v5.8b, v1.8b
+ umlal v6.8h, v7.8b, v1.8b
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4
+ uqrshrn v6.8b, v6.8h, #4
+ st1 {v4.8b}, [\dst], \d_strd
+ st1 {v6.8b}, [\ds2], \d_strd
+.else
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v6.8h}, [\ds2], \d_strd
+.endif
+ b.gt 8b
+ ret
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+
+ sub \s_strd, \s_strd, \w, uxtw
+ sub \s_strd, \s_strd, #8
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, uxtw
+.endif
+161:
+ ld1 {v16.d}[1], [\src], #8
+ ld1 {v20.d}[1], [\sr2], #8
+ mov \mx, \w
+
+16:
+ ld1 {v18.16b}, [\src], #16
+ ld1 {v22.16b}, [\sr2], #16
+ ext v17.16b, v16.16b, v18.16b, #8
+ ext v19.16b, v16.16b, v18.16b, #9
+ ext v21.16b, v20.16b, v22.16b, #8
+ ext v23.16b, v20.16b, v22.16b, #9
+ umull v16.8h, v17.8b, v0.8b
+ umull2 v17.8h, v17.16b, v0.16b
+ umull v20.8h, v21.8b, v0.8b
+ umull2 v21.8h, v21.16b, v0.16b
+ umlal v16.8h, v19.8b, v1.8b
+ umlal2 v17.8h, v19.16b, v1.16b
+ umlal v20.8h, v23.8b, v1.8b
+ umlal2 v21.8h, v23.16b, v1.16b
+ subs \mx, \mx, #16
+.ifc \type, put
+ uqrshrn v16.8b, v16.8h, #4
+ uqrshrn2 v16.16b, v17.8h, #4
+ uqrshrn v20.8b, v20.8h, #4
+ uqrshrn2 v20.16b, v21.8h, #4
+ st1 {v16.16b}, [\dst], #16
+ st1 {v20.16b}, [\ds2], #16
+.else
+ st1 {v16.8h, v17.8h}, [\dst], #32
+ st1 {v20.8h, v21.8h}, [\ds2], #32
+.endif
+ b.le 9f
+
+ mov v16.16b, v18.16b
+ mov v20.16b, v22.16b
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ b.gt 161b
+ ret
+
+L(\type\()_bilin_h_tbl):
+ .hword L(\type\()_bilin_h_tbl) - 1280b
+ .hword L(\type\()_bilin_h_tbl) - 640b
+ .hword L(\type\()_bilin_h_tbl) - 320b
+ .hword L(\type\()_bilin_h_tbl) - 160b
+ .hword L(\type\()_bilin_h_tbl) - 80b
+ .hword L(\type\()_bilin_h_tbl) - 40b
+ .hword L(\type\()_bilin_h_tbl) - 20b
+ .hword 0
+
+
+L(\type\()_bilin_v):
+ cmp \h, #4
+ adr x9, L(\type\()_bilin_v_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20: // 2xN v
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ cmp \h, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ // 2x2 v
+ ld1 {v16.h}[0], [\src], \s_strd
+ b.gt 24f
+22:
+ ld1 {v17.h}[0], [\sr2], \s_strd
+ ld1 {v18.h}[0], [\src], \s_strd
+ trn1 v16.4h, v16.4h, v17.4h
+ trn1 v17.4h, v17.4h, v18.4h
+ umull v4.8h, v16.8b, v2.8b
+ umlal v4.8h, v17.8b, v3.8b
+ uqrshrn v4.8b, v4.8h, #4
+ st1 {v4.h}[0], [\dst]
+ st1 {v4.h}[1], [\ds2]
+ ret
+24: // 2x4, 2x6, 2x8, ... v
+ ld1 {v17.h}[0], [\sr2], \s_strd
+ ld1 {v18.h}[0], [\src], \s_strd
+ ld1 {v19.h}[0], [\sr2], \s_strd
+ ld1 {v20.h}[0], [\src], \s_strd
+ sub \h, \h, #4
+ trn1 v16.4h, v16.4h, v17.4h
+ trn1 v17.4h, v17.4h, v18.4h
+ trn1 v18.4h, v18.4h, v19.4h
+ trn1 v19.4h, v19.4h, v20.4h
+ trn1 v16.2s, v16.2s, v18.2s
+ trn1 v17.2s, v17.2s, v19.2s
+ umull v4.8h, v16.8b, v2.8b
+ umlal v4.8h, v17.8b, v3.8b
+ cmp \h, #2
+ uqrshrn v4.8b, v4.8h, #4
+ st1 {v4.h}[0], [\dst], \d_strd
+ st1 {v4.h}[1], [\ds2], \d_strd
+ st1 {v4.h}[2], [\dst], \d_strd
+ st1 {v4.h}[3], [\ds2], \d_strd
+ b.lt 0f
+ mov v16.8b, v20.8b
+ b.eq 22b
+ b 24b
+0:
+ ret
+.endif
+
+40: // 4xN v
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ ld1 {v16.s}[0], [\src], \s_strd
+4:
+ ld1 {v17.s}[0], [\sr2], \s_strd
+ ld1 {v18.s}[0], [\src], \s_strd
+ trn1 v16.2s, v16.2s, v17.2s
+ trn1 v17.2s, v17.2s, v18.2s
+ umull v4.8h, v16.8b, v2.8b
+ umlal v4.8h, v17.8b, v3.8b
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+.else
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+.endif
+ b.le 0f
+ mov v16.8b, v18.8b
+ b 4b
+0:
+ ret
+
+80: // 8xN v
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ ld1 {v16.8b}, [\src], \s_strd
+8:
+ ld1 {v17.8b}, [\sr2], \s_strd
+ ld1 {v18.8b}, [\src], \s_strd
+ umull v4.8h, v16.8b, v2.8b
+ umull v5.8h, v17.8b, v2.8b
+ umlal v4.8h, v17.8b, v3.8b
+ umlal v5.8h, v18.8b, v3.8b
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4
+ uqrshrn v5.8b, v5.8h, #4
+ st1 {v4.8b}, [\dst], \d_strd
+ st1 {v5.8b}, [\ds2], \d_strd
+.else
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v5.8h}, [\ds2], \d_strd
+.endif
+ b.le 0f
+ mov v16.8b, v18.8b
+ b 8b
+0:
+ ret
+
+160: // 16xN, 32xN, ...
+320:
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ mov \my, \h
+1:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v16.16b}, [\src], \s_strd
+2:
+ ld1 {v17.16b}, [\sr2], \s_strd
+ ld1 {v18.16b}, [\src], \s_strd
+ umull v4.8h, v16.8b, v2.8b
+ umull2 v5.8h, v16.16b, v2.16b
+ umull v6.8h, v17.8b, v2.8b
+ umull2 v7.8h, v17.16b, v2.16b
+ umlal v4.8h, v17.8b, v3.8b
+ umlal2 v5.8h, v17.16b, v3.16b
+ umlal v6.8h, v18.8b, v3.8b
+ umlal2 v7.8h, v18.16b, v3.16b
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4
+ uqrshrn2 v4.16b, v5.8h, #4
+ uqrshrn v6.8b, v6.8h, #4
+ uqrshrn2 v6.16b, v7.8h, #4
+ st1 {v4.16b}, [\dst], \d_strd
+ st1 {v6.16b}, [\ds2], \d_strd
+.else
+ st1 {v4.8h, v5.8h}, [\dst], \d_strd
+ st1 {v6.8h, v7.8h}, [\ds2], \d_strd
+.endif
+ b.le 9f
+ mov v16.16b, v18.16b
+ b 2b
+9:
+ subs \w, \w, #16
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #16
+.ifc \type, put
+ add \dst, \dst, #16
+.else
+ add \dst, \dst, #32
+.endif
+ b 1b
+0:
+ ret
+
+L(\type\()_bilin_v_tbl):
+ .hword L(\type\()_bilin_v_tbl) - 1280b
+ .hword L(\type\()_bilin_v_tbl) - 640b
+ .hword L(\type\()_bilin_v_tbl) - 320b
+ .hword L(\type\()_bilin_v_tbl) - 160b
+ .hword L(\type\()_bilin_v_tbl) - 80b
+ .hword L(\type\()_bilin_v_tbl) - 40b
+ .hword L(\type\()_bilin_v_tbl) - 20b
+ .hword 0
+
+L(\type\()_bilin_hv):
+ uxtl v2.8h, v2.8b
+ uxtl v3.8h, v3.8b
+ adr x9, L(\type\()_bilin_hv_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20: // 2xN hv
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v28.s}[0], [\src], \s_strd
+ ext v29.8b, v28.8b, v28.8b, #1
+ umull v16.8h, v28.8b, v0.8b
+ umlal v16.8h, v29.8b, v1.8b
+
+2:
+ ld1 {v28.s}[0], [\sr2], \s_strd
+ ld1 {v30.s}[0], [\src], \s_strd
+ ext v29.8b, v28.8b, v28.8b, #1
+ ext v31.8b, v30.8b, v30.8b, #1
+ trn1 v28.4h, v28.4h, v30.4h
+ trn1 v29.4h, v29.4h, v31.4h
+ umull v17.8h, v28.8b, v0.8b
+ umlal v17.8h, v29.8b, v1.8b
+
+ trn1 v16.2s, v16.2s, v17.2s
+
+ mul v4.4h, v16.4h, v2.4h
+ mla v4.4h, v17.4h, v3.4h
+ uqrshrn v4.8b, v4.8h, #8
+ subs \h, \h, #2
+ st1 {v4.h}[0], [\dst], \d_strd
+ st1 {v4.h}[1], [\ds2], \d_strd
+ b.le 0f
+ trn2 v16.2s, v17.2s, v17.2s
+ b 2b
+0:
+ ret
+.endif
+
+40: // 4xN hv
+ AARCH64_VALID_JUMP_TARGET
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v28.8b}, [\src], \s_strd
+ ext v29.8b, v28.8b, v28.8b, #1
+ umull v16.8h, v28.8b, v0.8b
+ umlal v16.8h, v29.8b, v1.8b
+
+4:
+ ld1 {v28.8b}, [\sr2], \s_strd
+ ld1 {v30.8b}, [\src], \s_strd
+ ext v29.8b, v28.8b, v28.8b, #1
+ ext v31.8b, v30.8b, v30.8b, #1
+ trn1 v28.2s, v28.2s, v30.2s
+ trn1 v29.2s, v29.2s, v31.2s
+ umull v17.8h, v28.8b, v0.8b
+ umlal v17.8h, v29.8b, v1.8b
+
+ trn1 v16.2d, v16.2d, v17.2d
+
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #8
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+.else
+ urshr v4.8h, v4.8h, #4
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+.endif
+ b.le 0f
+ trn2 v16.2d, v17.2d, v17.2d
+ b 4b
+0:
+ ret
+
+80: // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ mov \my, \h
+
+1:
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v28.16b}, [\src], \s_strd
+ ext v29.16b, v28.16b, v28.16b, #1
+ umull v16.8h, v28.8b, v0.8b
+ umlal v16.8h, v29.8b, v1.8b
+
+2:
+ ld1 {v28.16b}, [\sr2], \s_strd
+ ld1 {v30.16b}, [\src], \s_strd
+ ext v29.16b, v28.16b, v28.16b, #1
+ ext v31.16b, v30.16b, v30.16b, #1
+ umull v17.8h, v28.8b, v0.8b
+ umlal v17.8h, v29.8b, v1.8b
+ umull v18.8h, v30.8b, v0.8b
+ umlal v18.8h, v31.8b, v1.8b
+
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ mul v5.8h, v17.8h, v2.8h
+ mla v5.8h, v18.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #8
+ uqrshrn v5.8b, v5.8h, #8
+ st1 {v4.8b}, [\dst], \d_strd
+ st1 {v5.8b}, [\ds2], \d_strd
+.else
+ urshr v4.8h, v4.8h, #4
+ urshr v5.8h, v5.8h, #4
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v5.8h}, [\ds2], \d_strd
+.endif
+ b.le 9f
+ mov v16.16b, v18.16b
+ b 2b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 1b
+0:
+ ret
+
+L(\type\()_bilin_hv_tbl):
+ .hword L(\type\()_bilin_hv_tbl) - 1280b
+ .hword L(\type\()_bilin_hv_tbl) - 640b
+ .hword L(\type\()_bilin_hv_tbl) - 320b
+ .hword L(\type\()_bilin_hv_tbl) - 160b
+ .hword L(\type\()_bilin_hv_tbl) - 80b
+ .hword L(\type\()_bilin_hv_tbl) - 40b
+ .hword L(\type\()_bilin_hv_tbl) - 20b
+ .hword 0
+endfunc
+.endm
+
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
+filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
+
+.macro load_filter_row dst, src, inc
+ asr w13, \src, #10
+ add \src, \src, \inc
+ ldr \dst, [x11, w13, sxtw #3]
+.endm
+
+function warp_filter_horz_neon
+ add w12, w5, #512
+
+ ld1 {v16.8b, v17.8b}, [x2], x3
+
+ load_filter_row d0, w12, w7
+ load_filter_row d1, w12, w7
+ load_filter_row d2, w12, w7
+ load_filter_row d3, w12, w7
+ load_filter_row d4, w12, w7
+ load_filter_row d5, w12, w7
+ load_filter_row d6, w12, w7
+        // subtract 128 (by xor-ing with 0x80) to allow using smull
+ eor v16.8b, v16.8b, v22.8b
+ eor v17.8b, v17.8b, v22.8b
+ load_filter_row d7, w12, w7
+
+ ext v18.8b, v16.8b, v17.8b, #1
+ ext v19.8b, v16.8b, v17.8b, #2
+ smull v0.8h, v0.8b, v16.8b
+ smull v1.8h, v1.8b, v18.8b
+ ext v18.8b, v16.8b, v17.8b, #3
+ ext v20.8b, v16.8b, v17.8b, #4
+ smull v2.8h, v2.8b, v19.8b
+ smull v3.8h, v3.8b, v18.8b
+ ext v18.8b, v16.8b, v17.8b, #5
+ ext v19.8b, v16.8b, v17.8b, #6
+ smull v4.8h, v4.8b, v20.8b
+ smull v5.8h, v5.8b, v18.8b
+ ext v18.8b, v16.8b, v17.8b, #7
+ smull v6.8h, v6.8b, v19.8b
+ smull v7.8h, v7.8b, v18.8b
+
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+
+ addp v0.8h, v0.8h, v2.8h
+ addp v4.8h, v4.8h, v6.8h
+
+ addp v0.8h, v0.8h, v4.8h
+
+ add w5, w5, w8
+
+ ret
+endfunc
+
+// void dav1d_warp_affine_8x8_8bpc_neon(
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *const abcd, int mx, int my)
+.macro warp t, shift
+function warp_affine_8x8\t\()_8bpc_neon, export=1
+ ldr x4, [x4]
+ sbfx x7, x4, #0, #16
+ sbfx x8, x4, #16, #16
+ sbfx x9, x4, #32, #16
+ sbfx x4, x4, #48, #16
+ mov w10, #8
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ sub x2, x2, #3
+ movrel x11, X(mc_warp_filter), 64*8
+ mov x15, x30
+.ifnb \t
+ lsl x1, x1, #1
+.endif
+
+ movi v22.8b, #128
+.ifb \t
+ movi v23.8h, #128
+.else
+ movi v23.8h, #8, lsl #8
+.endif
+
+ bl warp_filter_horz_neon
+ srshr v24.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v25.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v26.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v27.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v28.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v29.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v30.8h, v0.8h, #3
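+        // v24-v30 now hold the first 7 horizontally filtered rows; each
+        // iteration below filters one more row and applies the per-column
+        // 8-tap vertical filter to produce one output row.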
+
+1:
+ add w14, w6, #512
+ bl warp_filter_horz_neon
+ srshr v31.8h, v0.8h, #3
+
+ load_filter_row d0, w14, w9
+ load_filter_row d1, w14, w9
+ load_filter_row d2, w14, w9
+ load_filter_row d3, w14, w9
+ load_filter_row d4, w14, w9
+ load_filter_row d5, w14, w9
+ load_filter_row d6, w14, w9
+ load_filter_row d7, w14, w9
+ transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
+
+ // This ordering of smull/smlal/smull2/smlal2 is highly
+ // beneficial for Cortex A53 here.
+ smull v16.4s, v24.4h, v0.4h
+ smlal v16.4s, v25.4h, v1.4h
+ smlal v16.4s, v26.4h, v2.4h
+ smlal v16.4s, v27.4h, v3.4h
+ smlal v16.4s, v28.4h, v4.4h
+ smlal v16.4s, v29.4h, v5.4h
+ smlal v16.4s, v30.4h, v6.4h
+ smlal v16.4s, v31.4h, v7.4h
+ smull2 v17.4s, v24.8h, v0.8h
+ smlal2 v17.4s, v25.8h, v1.8h
+ smlal2 v17.4s, v26.8h, v2.8h
+ smlal2 v17.4s, v27.8h, v3.8h
+ smlal2 v17.4s, v28.8h, v4.8h
+ smlal2 v17.4s, v29.8h, v5.8h
+ smlal2 v17.4s, v30.8h, v6.8h
+ smlal2 v17.4s, v31.8h, v7.8h
+
+ mov v24.16b, v25.16b
+ mov v25.16b, v26.16b
+ sqrshrn v16.4h, v16.4s, #\shift
+ mov v26.16b, v27.16b
+ sqrshrn2 v16.8h, v17.4s, #\shift
+ mov v27.16b, v28.16b
+ mov v28.16b, v29.16b
+ add v16.8h, v16.8h, v23.8h
+.ifb \t
+ sqxtun v16.8b, v16.8h
+.endif
+ mov v29.16b, v30.16b
+ mov v30.16b, v31.16b
+ subs w10, w10, #1
+.ifnb \t
+ st1 {v16.8h}, [x0], x1
+.else
+ st1 {v16.8b}, [x0], x1
+.endif
+
+ add w6, w6, w4
+ b.gt 1b
+
+ ret x15
+endfunc
+.endm
+
+warp , 11
+warp t, 7
+
+// void dav1d_emu_edge_8bpc_neon(
+// const intptr_t bw, const intptr_t bh,
+// const intptr_t iw, const intptr_t ih,
+// const intptr_t x, const intptr_t y,
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_8bpc_neon, export=1
+ ldp x8, x9, [sp]
+
+ // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ // ref += iclip(x, 0, iw - 1)
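+        // (iclip(v, 0, max) is done with a csel for the upper bound plus
+        //  "bic x, x, x, asr #63", which zeroes negative values)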
+ sub x12, x3, #1 // ih - 1
+ cmp x5, x3
+ sub x13, x2, #1 // iw - 1
+ csel x12, x12, x5, ge // min(y, ih - 1)
+ cmp x4, x2
+ bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
+ csel x13, x13, x4, ge // min(x, iw - 1)
+ bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
+ madd x8, x12, x9, x8 // ref += iclip() * stride
+ add x8, x8, x13 // ref += iclip()
+
+ // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ // top_ext = iclip(-y, 0, bh - 1)
+ add x10, x5, x1 // y + bh
+ neg x5, x5 // -y
+ sub x10, x10, x3 // y + bh - ih
+ sub x12, x1, #1 // bh - 1
+ cmp x10, x1
+ bic x5, x5, x5, asr #63 // max(-y, 0)
+ csel x10, x10, x12, lt // min(y + bh - ih, bh-1)
+ cmp x5, x1
+ bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
+ csel x5, x5, x12, lt // min(max(-y, 0), bh-1)
+
+ // right_ext = iclip(x + bw - iw, 0, bw - 1)
+ // left_ext = iclip(-x, 0, bw - 1)
+ add x11, x4, x0 // x + bw
+ neg x4, x4 // -x
+ sub x11, x11, x2 // x + bw - iw
+ sub x13, x0, #1 // bw - 1
+ cmp x11, x0
+ bic x4, x4, x4, asr #63 // max(-x, 0)
+ csel x11, x11, x13, lt // min(x + bw - iw, bw-1)
+ cmp x4, x0
+ bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
+ csel x4, x4, x13, lt // min(max(-x, 0), bw - 1)
+
+ // center_h = bh - top_ext - bottom_ext
+ // dst += top_ext * PXSTRIDE(dst_stride)
+ // center_w = bw - left_ext - right_ext
+ sub x1, x1, x5 // bh - top_ext
+ madd x6, x5, x7, x6
+ sub x2, x0, x4 // bw - left_ext
+ sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext
+ sub x2, x2, x11 // center_w = bw - left_ext - right_ext
+
+ mov x14, x6 // backup of dst
+
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+ ld1r {v0.16b}, [x8]
+ mov x12, x6 // out = dst
+ mov x3, x4
+1:
+ subs x3, x3, #16
+ st1 {v0.16b}, [x12], #16
+ b.gt 1b
+.endif
+ mov x13, x8
+ add x12, x6, x4 // out = dst + left_ext
+ mov x3, x2
+1:
+ ld1 {v0.16b, v1.16b}, [x13], #32
+ subs x3, x3, #32
+ st1 {v0.16b, v1.16b}, [x12], #32
+ b.gt 1b
+.if \need_right
+ add x3, x8, x2 // in + center_w
+ sub x3, x3, #1 // in + center_w - 1
+ add x12, x6, x4 // dst + left_ext
+ ld1r {v0.16b}, [x3]
+ add x12, x12, x2 // out = dst + left_ext + center_w
+ mov x3, x11
+1:
+ subs x3, x3, #16
+ st1 {v0.16b}, [x12], #16
+ b.gt 1b
+.endif
+
+ subs x1, x1, #1 // center_h--
+ add x6, x6, x7
+ add x8, x8, x9
+ b.gt 0b
+.endm
+
+ cbz x4, 2f
+ // need_left
+ cbz x11, 3f
+ // need_left + need_right
+ v_loop 1, 1
+ b 5f
+
+2:
+ // !need_left
+ cbz x11, 4f
+ // !need_left + need_right
+ v_loop 0, 1
+ b 5f
+
+3:
+ // need_left + !need_right
+ v_loop 1, 0
+ b 5f
+
+4:
+ // !need_left + !need_right
+ v_loop 0, 0
+
+5:
+
+ cbz x10, 3f
+ // need_bottom
+ sub x8, x6, x7 // ref = dst - stride
+ mov x4, x0
+1:
+ ld1 {v0.16b, v1.16b}, [x8], #32
+ mov x3, x10
+2:
+ subs x3, x3, #1
+ st1 {v0.16b, v1.16b}, [x6], x7
+ b.gt 2b
+ msub x6, x7, x10, x6 // dst -= bottom_ext * stride
+ subs x4, x4, #32 // bw -= 32
+ add x6, x6, #32 // dst += 32
+ b.gt 1b
+
+3:
+ cbz x5, 3f
+ // need_top
+ msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride
+1:
+ ld1 {v0.16b, v1.16b}, [x14], #32
+ mov x3, x5
+2:
+ subs x3, x3, #1
+ st1 {v0.16b, v1.16b}, [x6], x7
+ b.gt 2b
+ msub x6, x7, x5, x6 // dst -= top_ext * stride
+ subs x0, x0, #32 // bw -= 32
+ add x6, x6, #32 // dst += 32
+ b.gt 1b
+
+3:
+ ret
+endfunc
diff --git a/third_party/dav1d/src/arm/64/mc16.S b/third_party/dav1d/src/arm/64/mc16.S
new file mode 100644
index 0000000000..1bfb12ebb3
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/mc16.S
@@ -0,0 +1,3611 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define PREP_BIAS 8192
+
+.macro avg d0, d1, t0, t1, t2, t3
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ sqadd \t0\().8h, \t0\().8h, \t2\().8h
+ sqadd \t1\().8h, \t1\().8h, \t3\().8h
+ smax \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
+ smax \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
+ sqsub \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
+ sqsub \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
+ sshl \d0\().8h, \t0\().8h, v29.8h // -(intermediate_bits+1)
+ sshl \d1\().8h, \t1\().8h, v29.8h // -(intermediate_bits+1)
+.endm
+
+.macro w_avg d0, d1, t0, t1, t2, t3
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ // This difference requires a 17 bit range, and all bits are
+ // significant for the following multiplication.
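+        // The weighted sum is tmp2 + (((tmp2 - tmp1) * -weight) >> 4), i.e.
+        // (tmp1*weight + tmp2*(16 - weight)) >> 4 (weight negated into v27),
+        // before the bias/shift/clamp below.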
+ ssubl \d0\().4s, \t2\().4h, \t0\().4h
+ ssubl2 \t0\().4s, \t2\().8h, \t0\().8h
+ ssubl \d1\().4s, \t3\().4h, \t1\().4h
+ ssubl2 \t1\().4s, \t3\().8h, \t1\().8h
+ mul \d0\().4s, \d0\().4s, v27.4s
+ mul \t0\().4s, \t0\().4s, v27.4s
+ mul \d1\().4s, \d1\().4s, v27.4s
+ mul \t1\().4s, \t1\().4s, v27.4s
+ sshr \d0\().4s, \d0\().4s, #4
+ sshr \t0\().4s, \t0\().4s, #4
+ sshr \d1\().4s, \d1\().4s, #4
+ sshr \t1\().4s, \t1\().4s, #4
+ saddw \d0\().4s, \d0\().4s, \t2\().4h
+ saddw2 \t0\().4s, \t0\().4s, \t2\().8h
+ saddw \d1\().4s, \d1\().4s, \t3\().4h
+ saddw2 \t1\().4s, \t1\().4s, \t3\().8h
+ uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
+ uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto
+ srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
+ srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
+ add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
+ add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
+ smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
+ smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
+ smax \d0\().8h, \d0\().8h, v30.8h // 0
+ smax \d1\().8h, \d1\().8h, v30.8h // 0
+.endm
+
+.macro mask d0, d1, t0, t1, t2, t3
+ ld1 {v27.16b}, [x6], 16
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ neg v27.16b, v27.16b
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ sxtl v26.8h, v27.8b
+ sxtl2 v27.8h, v27.16b
+ sxtl v24.4s, v26.4h
+ sxtl2 v25.4s, v26.8h
+ sxtl v26.4s, v27.4h
+ sxtl2 v27.4s, v27.8h
+ ssubl \d0\().4s, \t2\().4h, \t0\().4h
+ ssubl2 \t0\().4s, \t2\().8h, \t0\().8h
+ ssubl \d1\().4s, \t3\().4h, \t1\().4h
+ ssubl2 \t1\().4s, \t3\().8h, \t1\().8h
+ mul \d0\().4s, \d0\().4s, v24.4s
+ mul \t0\().4s, \t0\().4s, v25.4s
+ mul \d1\().4s, \d1\().4s, v26.4s
+ mul \t1\().4s, \t1\().4s, v27.4s
+ sshr \d0\().4s, \d0\().4s, #6
+ sshr \t0\().4s, \t0\().4s, #6
+ sshr \d1\().4s, \d1\().4s, #6
+ sshr \t1\().4s, \t1\().4s, #6
+ saddw \d0\().4s, \d0\().4s, \t2\().4h
+ saddw2 \t0\().4s, \t0\().4s, \t2\().8h
+ saddw \d1\().4s, \d1\().4s, \t3\().4h
+ saddw2 \t1\().4s, \t1\().4s, \t3\().8h
+ uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
+ uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto
+ srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
+ srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
+ add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
+ add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
+ smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
+ smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
+ smax \d0\().8h, \d0\().8h, v30.8h // 0
+ smax \d1\().8h, \d1\().8h, v30.8h // 0
+.endm
+
+.macro bidir_fn type, bdmax
+function \type\()_16bpc_neon, export=1
+ clz w4, w4
+.ifnc \type, avg
+ dup v31.8h, \bdmax // bitdepth_max
+ movi v30.8h, #0
+.endif
+ clz w7, \bdmax
+ sub w7, w7, #18 // intermediate_bits = clz(bitdepth_max) - 18
+.ifc \type, avg
+ mov w9, #1
+ mov w8, #-2*PREP_BIAS
+ lsl w9, w9, w7 // 1 << intermediate_bits
+ add w7, w7, #1
+ sub w8, w8, w9 // -2*PREP_BIAS - 1 << intermediate_bits
+ neg w7, w7 // -(intermediate_bits+1)
+ dup v28.8h, w8 // -2*PREP_BIAS - 1 << intermediate_bits
+ dup v29.8h, w7 // -(intermediate_bits+1)
+.else
+ mov w8, #PREP_BIAS
+ lsr w8, w8, w7 // PREP_BIAS >> intermediate_bits
+ neg w7, w7 // -intermediate_bits
+ dup v28.8h, w8 // PREP_BIAS >> intermediate_bits
+ dup v29.8h, w7 // -intermediate_bits
+.endif
+.ifc \type, w_avg
+ dup v27.4s, w6
+ neg v27.4s, v27.4s
+.endif
+ adr x7, L(\type\()_tbl)
+ sub w4, w4, #24
+ \type v4, v5, v0, v1, v2, v3
+ ldrh w4, [x7, x4, lsl #1]
+ sub x7, x7, w4, uxtw
+ br x7
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1
+ lsl x1, x1, #1
+4:
+ subs w5, w5, #4
+ st1 {v4.d}[0], [x0], x1
+ st1 {v4.d}[1], [x7], x1
+ st1 {v5.d}[0], [x0], x1
+ st1 {v5.d}[1], [x7], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 4b
+80:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1
+ lsl x1, x1, #1
+8:
+ st1 {v4.8h}, [x0], x1
+ subs w5, w5, #2
+ st1 {v5.8h}, [x7], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 8b
+16:
+ AARCH64_VALID_JUMP_TARGET
+ \type v6, v7, v0, v1, v2, v3
+ st1 {v4.8h, v5.8h}, [x0], x1
+ subs w5, w5, #2
+ st1 {v6.8h, v7.8h}, [x0], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 16b
+32:
+ AARCH64_VALID_JUMP_TARGET
+ \type v6, v7, v0, v1, v2, v3
+ subs w5, w5, #1
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 32b
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, #64
+64:
+ \type v6, v7, v0, v1, v2, v3
+ \type v16, v17, v0, v1, v2, v3
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ \type v18, v19, v0, v1, v2, v3
+ subs w5, w5, #1
+ st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 64b
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, #64
+ mov x8, #128
+ sub x1, x1, #128
+128:
+ \type v6, v7, v0, v1, v2, v3
+ \type v16, v17, v0, v1, v2, v3
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x8
+ \type v18, v19, v0, v1, v2, v3
+ st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
+ \type v4, v5, v0, v1, v2, v3
+ \type v6, v7, v0, v1, v2, v3
+ \type v16, v17, v0, v1, v2, v3
+ subs w5, w5, #1
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ \type v18, v19, v0, v1, v2, v3
+ st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 128b
+0:
+ ret
+L(\type\()_tbl):
+ .hword L(\type\()_tbl) - 1280b
+ .hword L(\type\()_tbl) - 640b
+ .hword L(\type\()_tbl) - 32b
+ .hword L(\type\()_tbl) - 16b
+ .hword L(\type\()_tbl) - 80b
+ .hword L(\type\()_tbl) - 40b
+endfunc
+.endm
+
+bidir_fn avg, w6
+bidir_fn w_avg, w7
+bidir_fn mask, w7
+
+
+.macro w_mask_fn type
+function w_mask_\type\()_16bpc_neon, export=1
+ ldr w8, [sp]
+ clz w9, w4
+ adr x10, L(w_mask_\type\()_tbl)
+ dup v31.8h, w8 // bitdepth_max
+ sub w9, w9, #24
+ clz w8, w8 // clz(bitdepth_max)
+ ldrh w9, [x10, x9, lsl #1]
+ sub x10, x10, w9, uxtw
+ sub w8, w8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
+ mov w9, #PREP_BIAS*64
+ neg w8, w8 // -sh
+ mov w11, #27615 // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
+ dup v30.4s, w9 // PREP_BIAS*64
+ dup v29.4s, w8 // -sh
+ dup v0.8h, w11
+.if \type == 444
+ movi v1.16b, #64
+.elseif \type == 422
+ dup v2.8b, w7
+ movi v3.8b, #129
+ sub v3.8b, v3.8b, v2.8b
+.elseif \type == 420
+ dup v2.8h, w7
+ movi v3.8h, #1, lsl #8
+ sub v3.8h, v3.8h, v2.8h
+.endif
+ add x12, x0, x1
+ lsl x1, x1, #1
+ br x10
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
+ ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
+ subs w5, w5, #4
+ sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2)
+ sabd v21.8h, v5.8h, v7.8h
+ ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
+ ssubl2 v17.4s, v6.8h, v4.8h
+ ssubl v18.4s, v7.4h, v5.4h
+ ssubl2 v19.4s, v7.8h, v5.8h
+ uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
+ uqsub v21.8h, v0.8h, v21.8h
+ sshll2 v7.4s, v5.8h, #6 // tmp1 << 6
+ sshll v6.4s, v5.4h, #6
+ sshll2 v5.4s, v4.8h, #6
+ sshll v4.4s, v4.4h, #6
+ ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
+ ushr v21.8h, v21.8h, #10
+ add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
+ add v5.4s, v5.4s, v30.4s
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ uxtl v22.4s, v20.4h
+ uxtl2 v23.4s, v20.8h
+ uxtl v24.4s, v21.4h
+ uxtl2 v25.4s, v21.8h
+ mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
+ mla v5.4s, v17.4s, v23.4s
+ mla v6.4s, v18.4s, v24.4s
+ mla v7.4s, v19.4s, v25.4s
+ srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ srshl v5.4s, v5.4s, v29.4s
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ sqxtun v4.4h, v4.4s // iclip_pixel
+ sqxtun2 v4.8h, v5.4s
+ sqxtun v5.4h, v6.4s
+ sqxtun2 v5.8h, v7.4s
+ umin v4.8h, v4.8h, v31.8h // iclip_pixel
+ umin v5.8h, v5.8h, v31.8h
+.if \type == 444
+ uzp1 v20.16b, v20.16b, v21.16b // 64 - m
+ sub v20.16b, v1.16b, v20.16b // m
+ st1 {v20.16b}, [x6], #16
+.elseif \type == 422
+ addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
+ xtn v20.8b, v20.8h
+        uhsub           v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+ st1 {v20.8b}, [x6], #8
+.elseif \type == 420
+ trn1 v24.2d, v20.2d, v21.2d
+ trn2 v25.2d, v20.2d, v21.2d
+ add v24.8h, v24.8h, v25.8h // (64 - my1) + (64 - my2) (row wise addition)
+ addp v20.8h, v24.8h, v24.8h // (128 - m) + (128 - n) (column wise addition)
+ sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n))
+ rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ st1 {v20.s}[0], [x6], #4
+.endif
+ st1 {v4.d}[0], [x0], x1
+ st1 {v4.d}[1], [x12], x1
+ st1 {v5.d}[0], [x0], x1
+ st1 {v5.d}[1], [x12], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1
+ ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2
+ subs w5, w5, #2
+ sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2)
+ sabd v21.8h, v5.8h, v7.8h
+ ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
+ ssubl2 v17.4s, v6.8h, v4.8h
+ ssubl v18.4s, v7.4h, v5.4h
+ ssubl2 v19.4s, v7.8h, v5.8h
+ uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
+ uqsub v21.8h, v0.8h, v21.8h
+ sshll2 v7.4s, v5.8h, #6 // tmp1 << 6
+ sshll v6.4s, v5.4h, #6
+ sshll2 v5.4s, v4.8h, #6
+ sshll v4.4s, v4.4h, #6
+ ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
+ ushr v21.8h, v21.8h, #10
+ add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
+ add v5.4s, v5.4s, v30.4s
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ uxtl v22.4s, v20.4h
+ uxtl2 v23.4s, v20.8h
+ uxtl v24.4s, v21.4h
+ uxtl2 v25.4s, v21.8h
+ mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
+ mla v5.4s, v17.4s, v23.4s
+ mla v6.4s, v18.4s, v24.4s
+ mla v7.4s, v19.4s, v25.4s
+ srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ srshl v5.4s, v5.4s, v29.4s
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ sqxtun v4.4h, v4.4s // iclip_pixel
+ sqxtun2 v4.8h, v5.4s
+ sqxtun v5.4h, v6.4s
+ sqxtun2 v5.8h, v7.4s
+ umin v4.8h, v4.8h, v31.8h // iclip_pixel
+ umin v5.8h, v5.8h, v31.8h
+.if \type == 444
+ uzp1 v20.16b, v20.16b, v21.16b // 64 - m
+ sub v20.16b, v1.16b, v20.16b // m
+ st1 {v20.16b}, [x6], #16
+.elseif \type == 422
+ addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
+ xtn v20.8b, v20.8h
+        uhsub           v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+ st1 {v20.8b}, [x6], #8
+.elseif \type == 420
+ add v20.8h, v20.8h, v21.8h // (64 - my1) + (64 - my2) (row wise addition)
+ addp v20.8h, v20.8h, v20.8h // (128 - m) + (128 - n) (column wise addition)
+ sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n))
+ rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ st1 {v20.s}[0], [x6], #4
+.endif
+ st1 {v4.8h}, [x0], x1
+ st1 {v5.8h}, [x12], x1
+ b.gt 8b
+ ret
+1280:
+640:
+320:
+160:
+ AARCH64_VALID_JUMP_TARGET
+ mov w11, w4
+ sub x1, x1, w4, uxtw #1
+.if \type == 444
+ add x10, x6, w4, uxtw
+.elseif \type == 422
+ add x10, x6, x11, lsr #1
+.endif
+ add x9, x3, w4, uxtw #1
+ add x7, x2, w4, uxtw #1
+161:
+ mov w8, w4
+16:
+ ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1
+ ld1 {v16.8h, v17.8h}, [x3], #32 // tmp2
+ ld1 {v6.8h, v7.8h}, [x7], #32
+ ld1 {v18.8h, v19.8h}, [x9], #32
+ subs w8, w8, #16
+ sabd v20.8h, v4.8h, v16.8h // abs(tmp1 - tmp2)
+ sabd v21.8h, v5.8h, v17.8h
+ ssubl v22.4s, v16.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
+ ssubl2 v23.4s, v16.8h, v4.8h
+ ssubl v24.4s, v17.4h, v5.4h
+ ssubl2 v25.4s, v17.8h, v5.8h
+ uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
+ uqsub v21.8h, v0.8h, v21.8h
+ sshll2 v27.4s, v5.8h, #6 // tmp1 << 6
+ sshll v26.4s, v5.4h, #6
+ sshll2 v5.4s, v4.8h, #6
+ sshll v4.4s, v4.4h, #6
+ ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
+ ushr v21.8h, v21.8h, #10
+ add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
+ add v5.4s, v5.4s, v30.4s
+ add v26.4s, v26.4s, v30.4s
+ add v27.4s, v27.4s, v30.4s
+ uxtl v16.4s, v20.4h
+ uxtl2 v17.4s, v20.8h
+ uxtl v28.4s, v21.4h
+ mla v4.4s, v22.4s, v16.4s // (tmp2-tmp1)*(64-m)
+ uxtl2 v16.4s, v21.8h
+ mla v5.4s, v23.4s, v17.4s
+ mla v26.4s, v24.4s, v28.4s
+ mla v27.4s, v25.4s, v16.4s
+ srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ srshl v5.4s, v5.4s, v29.4s
+ srshl v26.4s, v26.4s, v29.4s
+ srshl v27.4s, v27.4s, v29.4s
+ sqxtun v4.4h, v4.4s // iclip_pixel
+ sqxtun2 v4.8h, v5.4s
+ sqxtun v5.4h, v26.4s
+ sqxtun2 v5.8h, v27.4s
+
+ // Start of other half
+ sabd v22.8h, v6.8h, v18.8h // abs(tmp1 - tmp2)
+ sabd v23.8h, v7.8h, v19.8h
+
+ umin v4.8h, v4.8h, v31.8h // iclip_pixel
+ umin v5.8h, v5.8h, v31.8h
+
+ ssubl v16.4s, v18.4h, v6.4h // tmp2 - tmp1 (requires 17 bit)
+ ssubl2 v17.4s, v18.8h, v6.8h
+ ssubl v18.4s, v19.4h, v7.4h
+ ssubl2 v19.4s, v19.8h, v7.8h
+ uqsub v22.8h, v0.8h, v22.8h // 27615 - abs()
+ uqsub v23.8h, v0.8h, v23.8h
+ sshll v24.4s, v6.4h, #6 // tmp1 << 6
+ sshll2 v25.4s, v6.8h, #6
+ sshll v26.4s, v7.4h, #6
+ sshll2 v27.4s, v7.8h, #6
+ ushr v22.8h, v22.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
+ ushr v23.8h, v23.8h, #10
+ add v24.4s, v24.4s, v30.4s // += PREP_BIAS*64
+ add v25.4s, v25.4s, v30.4s
+ add v26.4s, v26.4s, v30.4s
+ add v27.4s, v27.4s, v30.4s
+ uxtl v6.4s, v22.4h
+ uxtl2 v7.4s, v22.8h
+ uxtl v28.4s, v23.4h
+ mla v24.4s, v16.4s, v6.4s // (tmp2-tmp1)*(64-m)
+ uxtl2 v6.4s, v23.8h
+ mla v25.4s, v17.4s, v7.4s
+ mla v26.4s, v18.4s, v28.4s
+ mla v27.4s, v19.4s, v6.4s
+ srshl v24.4s, v24.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ srshl v25.4s, v25.4s, v29.4s
+ srshl v26.4s, v26.4s, v29.4s
+ srshl v27.4s, v27.4s, v29.4s
+ sqxtun v6.4h, v24.4s // iclip_pixel
+ sqxtun2 v6.8h, v25.4s
+ sqxtun v7.4h, v26.4s
+ sqxtun2 v7.8h, v27.4s
+ umin v6.8h, v6.8h, v31.8h // iclip_pixel
+ umin v7.8h, v7.8h, v31.8h
+.if \type == 444
+ uzp1 v20.16b, v20.16b, v21.16b // 64 - m
+ uzp1 v21.16b, v22.16b, v23.16b
+ sub v20.16b, v1.16b, v20.16b // m
+ sub v21.16b, v1.16b, v21.16b
+ st1 {v20.16b}, [x6], #16
+ st1 {v21.16b}, [x10], #16
+.elseif \type == 422
+ addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
+ addp v21.8h, v22.8h, v23.8h
+ xtn v20.8b, v20.8h
+ xtn v21.8b, v21.8h
+        uhsub           v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+ uhsub v21.8b, v3.8b, v21.8b
+ st1 {v20.8b}, [x6], #8
+ st1 {v21.8b}, [x10], #8
+.elseif \type == 420
+ add v20.8h, v20.8h, v22.8h // (64 - my1) + (64 - my2) (row wise addition)
+ add v21.8h, v21.8h, v23.8h
+ addp v20.8h, v20.8h, v21.8h // (128 - m) + (128 - n) (column wise addition)
+ sub v20.8h, v3.8h, v20.8h // (256 - sign) - ((128 - m) + (128 - n))
+ rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ st1 {v20.8b}, [x6], #8
+.endif
+ st1 {v4.8h, v5.8h}, [x0], #32
+ st1 {v6.8h, v7.8h}, [x12], #32
+ b.gt 16b
+ subs w5, w5, #2
+ add x2, x2, w4, uxtw #1
+ add x3, x3, w4, uxtw #1
+ add x7, x7, w4, uxtw #1
+ add x9, x9, w4, uxtw #1
+.if \type == 444
+ add x6, x6, w4, uxtw
+ add x10, x10, w4, uxtw
+.elseif \type == 422
+ add x6, x6, x11, lsr #1
+ add x10, x10, x11, lsr #1
+.endif
+ add x0, x0, x1
+ add x12, x12, x1
+ b.gt 161b
+ ret
+L(w_mask_\type\()_tbl):
+ .hword L(w_mask_\type\()_tbl) - 1280b
+ .hword L(w_mask_\type\()_tbl) - 640b
+ .hword L(w_mask_\type\()_tbl) - 320b
+ .hword L(w_mask_\type\()_tbl) - 160b
+ .hword L(w_mask_\type\()_tbl) - 8b
+ .hword L(w_mask_\type\()_tbl) - 4b
+endfunc
+.endm
+
+w_mask_fn 444
+w_mask_fn 422
+w_mask_fn 420
+
+
+function blend_16bpc_neon, export=1
+ adr x6, L(blend_tbl)
+ clz w3, w3
+ sub w3, w3, #26
+ ldrh w3, [x6, x3, lsl #1]
+ sub x6, x6, w3, uxtw
+ add x8, x0, x1
+ br x6
+40:
+ AARCH64_VALID_JUMP_TARGET
+ lsl x1, x1, #1
+4:
+ ld1 {v2.8b}, [x5], #8
+ ld1 {v1.8h}, [x2], #16
+ ld1 {v0.d}[0], [x0]
+ neg v2.8b, v2.8b // -m
+ subs w4, w4, #2
+ ld1 {v0.d}[1], [x8]
+ sxtl v2.8h, v2.8b
+ shl v2.8h, v2.8h, #9 // -m << 9
+ sub v1.8h, v0.8h, v1.8h // a - b
+ sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
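+        // (sqrdmulh by (-m << 9) computes (2*(a-b)*(-m << 9) + (1 << 15)) >> 16,
+        //  which equals ((a-b)*-m + 32) >> 6)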
+ add v0.8h, v0.8h, v1.8h
+ st1 {v0.d}[0], [x0], x1
+ st1 {v0.d}[1], [x8], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ lsl x1, x1, #1
+8:
+ ld1 {v4.16b}, [x5], #16
+ ld1 {v2.8h, v3.8h}, [x2], #32
+ neg v5.16b, v4.16b // -m
+ ld1 {v0.8h}, [x0]
+ ld1 {v1.8h}, [x8]
+ sxtl v4.8h, v5.8b
+ sxtl2 v5.8h, v5.16b
+ shl v4.8h, v4.8h, #9 // -m << 9
+ shl v5.8h, v5.8h, #9
+ sub v2.8h, v0.8h, v2.8h // a - b
+ sub v3.8h, v1.8h, v3.8h
+ subs w4, w4, #2
+ sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v3.8h, v3.8h, v5.8h
+ add v0.8h, v0.8h, v2.8h
+ add v1.8h, v1.8h, v3.8h
+ st1 {v0.8h}, [x0], x1
+ st1 {v1.8h}, [x8], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ lsl x1, x1, #1
+16:
+ ld1 {v16.16b, v17.16b}, [x5], #32
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+ subs w4, w4, #2
+ neg v18.16b, v16.16b // -m
+ neg v19.16b, v17.16b
+ ld1 {v0.8h, v1.8h}, [x0]
+ sxtl v16.8h, v18.8b
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+ ld1 {v2.8h, v3.8h}, [x8]
+ shl v16.8h, v16.8h, #9 // -m << 9
+ shl v17.8h, v17.8h, #9
+ shl v18.8h, v18.8h, #9
+ shl v19.8h, v19.8h, #9
+ sub v4.8h, v0.8h, v4.8h // a - b
+ sub v5.8h, v1.8h, v5.8h
+ sub v6.8h, v2.8h, v6.8h
+ sub v7.8h, v3.8h, v7.8h
+ sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v5.8h, v5.8h, v17.8h
+ sqrdmulh v6.8h, v6.8h, v18.8h
+ sqrdmulh v7.8h, v7.8h, v19.8h
+ add v0.8h, v0.8h, v4.8h
+ add v1.8h, v1.8h, v5.8h
+ add v2.8h, v2.8h, v6.8h
+ add v3.8h, v3.8h, v7.8h
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v2.8h, v3.8h}, [x8], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v16.16b, v17.16b}, [x5], #32
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+ subs w4, w4, #1
+ neg v18.16b, v16.16b // -m
+ neg v19.16b, v17.16b
+ sxtl v16.8h, v18.8b
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
+ shl v16.8h, v16.8h, #9 // -m << 9
+ shl v17.8h, v17.8h, #9
+ shl v18.8h, v18.8h, #9
+ shl v19.8h, v19.8h, #9
+ sub v4.8h, v0.8h, v4.8h // a - b
+ sub v5.8h, v1.8h, v5.8h
+ sub v6.8h, v2.8h, v6.8h
+ sub v7.8h, v3.8h, v7.8h
+ sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v5.8h, v5.8h, v17.8h
+ sqrdmulh v6.8h, v6.8h, v18.8h
+ sqrdmulh v7.8h, v7.8h, v19.8h
+ add v0.8h, v0.8h, v4.8h
+ add v1.8h, v1.8h, v5.8h
+ add v2.8h, v2.8h, v6.8h
+ add v3.8h, v3.8h, v7.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ b.gt 32b
+ ret
+L(blend_tbl):
+ .hword L(blend_tbl) - 32b
+ .hword L(blend_tbl) - 160b
+ .hword L(blend_tbl) - 80b
+ .hword L(blend_tbl) - 40b
+endfunc
+
+function blend_h_16bpc_neon, export=1
+ adr x6, L(blend_h_tbl)
+ movrel x5, X(obmc_masks)
+ add x5, x5, w4, uxtw
+ sub w4, w4, w4, lsr #2
+ clz w7, w3
+ add x8, x0, x1
+ lsl x1, x1, #1
+ sub w7, w7, #24
+ ldrh w7, [x6, x7, lsl #1]
+ sub x6, x6, w7, uxtw
+ br x6
+2:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v2.8b, v3.8b}, [x5], #2
+ ld1 {v1.4h}, [x2], #8
+ ext v2.8b, v2.8b, v3.8b, #6
+ subs w4, w4, #2
+ neg v2.8b, v2.8b // -m
+ ld1 {v0.s}[0], [x0]
+ ld1 {v0.s}[1], [x8]
+ sxtl v2.8h, v2.8b
+ shl v2.4h, v2.4h, #9 // -m << 9
+ sub v1.4h, v0.4h, v1.4h // a - b
+ sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6
+ add v0.4h, v0.4h, v1.4h
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[1], [x8], x1
+ b.gt 2b
+ ret
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v2.8b, v3.8b}, [x5], #2
+ ld1 {v1.8h}, [x2], #16
+ ext v2.8b, v2.8b, v3.8b, #4
+ subs w4, w4, #2
+ neg v2.8b, v2.8b // -m
+ ld1 {v0.d}[0], [x0]
+ ld1 {v0.d}[1], [x8]
+ sxtl v2.8h, v2.8b
+ shl v2.8h, v2.8h, #9 // -m << 9
+ sub v1.8h, v0.8h, v1.8h // a - b
+ sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
+ add v0.8h, v0.8h, v1.8h
+ st1 {v0.d}[0], [x0], x1
+ st1 {v0.d}[1], [x8], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v4.8b, v5.8b}, [x5], #2
+ ld1 {v2.8h, v3.8h}, [x2], #32
+ neg v4.8b, v4.8b // -m
+ neg v5.8b, v5.8b
+ ld1 {v0.8h}, [x0]
+ subs w4, w4, #2
+ sxtl v4.8h, v4.8b
+ sxtl v5.8h, v5.8b
+ ld1 {v1.8h}, [x8]
+ shl v4.8h, v4.8h, #9 // -m << 9
+ shl v5.8h, v5.8h, #9
+ sub v2.8h, v0.8h, v2.8h // a - b
+ sub v3.8h, v1.8h, v3.8h
+ sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v3.8h, v3.8h, v5.8h
+ add v0.8h, v0.8h, v2.8h
+ add v1.8h, v1.8h, v3.8h
+ st1 {v0.8h}, [x0], x1
+ st1 {v1.8h}, [x8], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v16.8b, v17.8b}, [x5], #2
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+ neg v16.8b, v16.8b // -m
+ neg v17.8b, v17.8b
+ ld1 {v0.8h, v1.8h}, [x0]
+ ld1 {v2.8h, v3.8h}, [x8]
+ subs w4, w4, #2
+ sxtl v16.8h, v16.8b
+ sxtl v17.8h, v17.8b
+ shl v16.8h, v16.8h, #9 // -m << 9
+ shl v17.8h, v17.8h, #9
+ sub v4.8h, v0.8h, v4.8h // a - b
+ sub v5.8h, v1.8h, v5.8h
+ sub v6.8h, v2.8h, v6.8h
+ sub v7.8h, v3.8h, v7.8h
+ sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v5.8h, v5.8h, v16.8h
+ sqrdmulh v6.8h, v6.8h, v17.8h
+ sqrdmulh v7.8h, v7.8h, v17.8h
+ add v0.8h, v0.8h, v4.8h
+ add v1.8h, v1.8h, v5.8h
+ add v2.8h, v2.8h, v6.8h
+ add v3.8h, v3.8h, v7.8h
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v2.8h, v3.8h}, [x8], x1
+ b.gt 16b
+ ret
+1280:
+640:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ sub x1, x1, w3, uxtw #1
+ add x7, x2, w3, uxtw #1
+321:
+ ld2r {v24.8b, v25.8b}, [x5], #2
+ mov w6, w3
+ neg v24.8b, v24.8b // -m
+ neg v25.8b, v25.8b
+ sxtl v24.8h, v24.8b
+ sxtl v25.8h, v25.8b
+ shl v24.8h, v24.8h, #9 // -m << 9
+ shl v25.8h, v25.8h, #9
+32:
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
+ subs w6, w6, #32
+ sub v16.8h, v0.8h, v16.8h // a - b
+ sub v17.8h, v1.8h, v17.8h
+ sub v18.8h, v2.8h, v18.8h
+ sub v19.8h, v3.8h, v19.8h
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8]
+ sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v17.8h, v17.8h, v24.8h
+ sqrdmulh v18.8h, v18.8h, v24.8h
+ sqrdmulh v19.8h, v19.8h, v24.8h
+ sub v20.8h, v4.8h, v20.8h // a - b
+ sub v21.8h, v5.8h, v21.8h
+ sub v22.8h, v6.8h, v22.8h
+ sub v23.8h, v7.8h, v23.8h
+ add v0.8h, v0.8h, v16.8h
+ add v1.8h, v1.8h, v17.8h
+ add v2.8h, v2.8h, v18.8h
+ add v3.8h, v3.8h, v19.8h
+ sqrdmulh v20.8h, v20.8h, v25.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v21.8h, v21.8h, v25.8h
+ sqrdmulh v22.8h, v22.8h, v25.8h
+ sqrdmulh v23.8h, v23.8h, v25.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v4.8h, v4.8h, v20.8h
+ add v5.8h, v5.8h, v21.8h
+ add v6.8h, v6.8h, v22.8h
+ add v7.8h, v7.8h, v23.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8], #64
+ b.gt 32b
+ subs w4, w4, #2
+ add x0, x0, x1
+ add x8, x8, x1
+ add x2, x2, w3, uxtw #1
+ add x7, x7, w3, uxtw #1
+ b.gt 321b
+ ret
+L(blend_h_tbl):
+ .hword L(blend_h_tbl) - 1280b
+ .hword L(blend_h_tbl) - 640b
+ .hword L(blend_h_tbl) - 320b
+ .hword L(blend_h_tbl) - 16b
+ .hword L(blend_h_tbl) - 8b
+ .hword L(blend_h_tbl) - 4b
+ .hword L(blend_h_tbl) - 2b
+endfunc
+
+function blend_v_16bpc_neon, export=1
+ adr x6, L(blend_v_tbl)
+ movrel x5, X(obmc_masks)
+ add x5, x5, w3, uxtw
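+ // x5 now points at obmc_masks + w; blend_v only blends the leftmost
+ // w*3/4 columns, so each width case below stores a partial row.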
+ clz w3, w3
+ add x8, x0, x1
+ lsl x1, x1, #1
+ sub w3, w3, #26
+ ldrh w3, [x6, x3, lsl #1]
+ sub x6, x6, w3, uxtw
+ br x6
+20:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v2.8b}, [x5]
+ neg v2.8b, v2.8b // -m
+ sxtl v2.8h, v2.8b
+ shl v2.4h, v2.4h, #9 // -m << 9
+2:
+ ld1 {v1.s}[0], [x2], #4
+ ld1 {v0.h}[0], [x0]
+ subs w4, w4, #2
+ ld1 {v1.h}[1], [x2]
+ ld1 {v0.h}[1], [x8]
+ add x2, x2, #4
+ sub v1.4h, v0.4h, v1.4h // a - b
+ sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6
+ add v0.4h, v0.4h, v1.4h
+ st1 {v0.h}[0], [x0], x1
+ st1 {v0.h}[1], [x8], x1
+ b.gt 2b
+ ret
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v2.2s}, [x5]
+ sub x1, x1, #4
+ neg v2.8b, v2.8b // -m
+ sxtl v2.8h, v2.8b
+ shl v2.8h, v2.8h, #9 // -m << 9
+4:
+ ld1 {v1.8h}, [x2], #16
+ ld1 {v0.d}[0], [x0]
+ ld1 {v0.d}[1], [x8]
+ subs w4, w4, #2
+ sub v1.8h, v0.8h, v1.8h // a - b
+ sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
+ add v0.8h, v0.8h, v1.8h
+ st1 {v0.s}[0], [x0], #4
+ st1 {v0.s}[2], [x8], #4
+ st1 {v0.h}[2], [x0], x1
+ st1 {v0.h}[6], [x8], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8b}, [x5]
+ sub x1, x1, #8
+ neg v4.8b, v4.8b // -m
+ sxtl v4.8h, v4.8b
+ shl v4.8h, v4.8h, #9 // -m << 9
+8:
+ ld1 {v2.8h, v3.8h}, [x2], #32
+ ld1 {v0.8h}, [x0]
+ ld1 {v1.8h}, [x8]
+ subs w4, w4, #2
+ sub v2.8h, v0.8h, v2.8h // a - b
+ sub v3.8h, v1.8h, v3.8h
+ sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v3.8h, v3.8h, v4.8h
+ add v0.8h, v0.8h, v2.8h
+ add v1.8h, v1.8h, v3.8h
+ st1 {v0.d}[0], [x0], #8
+ st1 {v1.d}[0], [x8], #8
+ st1 {v0.s}[2], [x0], x1
+ st1 {v1.s}[2], [x8], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v16.16b}, [x5]
+ sub x1, x1, #16
+ neg v17.16b, v16.16b // -m
+ sxtl v16.8h, v17.8b
+ sxtl2 v17.8h, v17.16b
+ shl v16.8h, v16.8h, #9 // -m << 9
+ shl v17.4h, v17.4h, #9
+16:
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+ ld1 {v0.8h, v1.8h}, [x0]
+ subs w4, w4, #2
+ ld1 {v2.8h, v3.8h}, [x8]
+ sub v4.8h, v0.8h, v4.8h // a - b
+ sub v5.4h, v1.4h, v5.4h
+ sub v6.8h, v2.8h, v6.8h
+ sub v7.4h, v3.4h, v7.4h
+ sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v5.4h, v5.4h, v17.4h
+ sqrdmulh v6.8h, v6.8h, v16.8h
+ sqrdmulh v7.4h, v7.4h, v17.4h
+ add v0.8h, v0.8h, v4.8h
+ add v1.4h, v1.4h, v5.4h
+ add v2.8h, v2.8h, v6.8h
+ add v3.4h, v3.4h, v7.4h
+ st1 {v0.8h}, [x0], #16
+ st1 {v2.8h}, [x8], #16
+ st1 {v1.4h}, [x0], x1
+ st1 {v3.4h}, [x8], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v24.16b, v25.16b}, [x5]
+ neg v26.16b, v24.16b // -m
+ neg v27.8b, v25.8b
+ sxtl v24.8h, v26.8b
+ sxtl2 v25.8h, v26.16b
+ sxtl v26.8h, v27.8b
+ shl v24.8h, v24.8h, #9 // -m << 9
+ shl v25.8h, v25.8h, #9
+ shl v26.8h, v26.8h, #9
+32:
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
+ ld1 {v0.8h, v1.8h, v2.8h}, [x0]
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
+ ld1 {v4.8h, v5.8h, v6.8h}, [x8]
+ subs w4, w4, #2
+ sub v16.8h, v0.8h, v16.8h // a - b
+ sub v17.8h, v1.8h, v17.8h
+ sub v18.8h, v2.8h, v18.8h
+ sub v20.8h, v4.8h, v20.8h
+ sub v21.8h, v5.8h, v21.8h
+ sub v22.8h, v6.8h, v22.8h
+ sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v17.8h, v17.8h, v25.8h
+ sqrdmulh v18.8h, v18.8h, v26.8h
+ sqrdmulh v20.8h, v20.8h, v24.8h
+ sqrdmulh v21.8h, v21.8h, v25.8h
+ sqrdmulh v22.8h, v22.8h, v26.8h
+ add v0.8h, v0.8h, v16.8h
+ add v1.8h, v1.8h, v17.8h
+ add v2.8h, v2.8h, v18.8h
+ add v4.8h, v4.8h, v20.8h
+ add v5.8h, v5.8h, v21.8h
+ add v6.8h, v6.8h, v22.8h
+ st1 {v0.8h, v1.8h, v2.8h}, [x0], x1
+ st1 {v4.8h, v5.8h, v6.8h}, [x8], x1
+ b.gt 32b
+ ret
+L(blend_v_tbl):
+ .hword L(blend_v_tbl) - 320b
+ .hword L(blend_v_tbl) - 160b
+ .hword L(blend_v_tbl) - 80b
+ .hword L(blend_v_tbl) - 40b
+ .hword L(blend_v_tbl) - 20b
+endfunc
+
+
+// This has the same signature as the put_8tap functions,
+// and assumes that x9 is set to (clz(w)-24).
+function put_neon
+ adr x10, L(put_tbl)
+ ldrh w9, [x10, x9, lsl #1]
+ sub x10, x10, w9, uxtw
+ br x10
+
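+ // Plain copies with no filtering: the 2/4/8 pixel widths go through
+ // NEON loads and stores, 16 and 32 copy via GPR pairs, and 64/128
+ // use q-register ldp/stp.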
+2:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v1.s}[0], [x2], x3
+ subs w5, w5, #2
+ st1 {v0.s}[0], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ b.gt 2b
+ ret
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2], x3
+ ld1 {v1.4h}, [x2], x3
+ subs w5, w5, #2
+ st1 {v0.4h}, [x0], x1
+ st1 {v1.4h}, [x0], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x0, x1
+ lsl x1, x1, #1
+ add x9, x2, x3
+ lsl x3, x3, #1
+8:
+ ld1 {v0.8h}, [x2], x3
+ ld1 {v1.8h}, [x9], x3
+ subs w5, w5, #2
+ st1 {v0.8h}, [x0], x1
+ st1 {v1.8h}, [x8], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x0]
+ subs w5, w5, #1
+ stp x8, x9, [x0, #16]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x0]
+ ldp x10, x11, [x2, #32]
+ stp x8, x9, [x0, #16]
+ subs w5, w5, #1
+ ldp x12, x13, [x2, #48]
+ stp x10, x11, [x0, #32]
+ stp x12, x13, [x0, #48]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 32b
+ ret
+64:
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x2]
+ ldp q2, q3, [x2, #32]
+ stp q0, q1, [x0]
+ ldp q4, q5, [x2, #64]
+ stp q2, q3, [x0, #32]
+ ldp q6, q7, [x2, #96]
+ subs w5, w5, #1
+ stp q4, q5, [x0, #64]
+ stp q6, q7, [x0, #96]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 64b
+ ret
+128:
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x2]
+ ldp q2, q3, [x2, #32]
+ stp q0, q1, [x0]
+ ldp q4, q5, [x2, #64]
+ stp q2, q3, [x0, #32]
+ ldp q6, q7, [x2, #96]
+ subs w5, w5, #1
+ stp q4, q5, [x0, #64]
+ ldp q16, q17, [x2, #128]
+ stp q6, q7, [x0, #96]
+ ldp q18, q19, [x2, #160]
+ stp q16, q17, [x0, #128]
+ ldp q20, q21, [x2, #192]
+ stp q18, q19, [x0, #160]
+ ldp q22, q23, [x2, #224]
+ stp q20, q21, [x0, #192]
+ stp q22, q23, [x0, #224]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 128b
+ ret
+
+L(put_tbl):
+ .hword L(put_tbl) - 128b
+ .hword L(put_tbl) - 64b
+ .hword L(put_tbl) - 32b
+ .hword L(put_tbl) - 16b
+ .hword L(put_tbl) - 80b
+ .hword L(put_tbl) - 4b
+ .hword L(put_tbl) - 2b
+endfunc
+
+
+// This has the same signature as the prep_8tap functions,
+// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
+// x8 to w*2.
+function prep_neon
+ adr x10, L(prep_tbl)
+ ldrh w9, [x10, x9, lsl #1]
+ dup v31.8h, w7 // intermediate_bits
+ movi v30.8h, #(PREP_BIAS >> 8), lsl #8
+ sub x10, x10, w9, uxtw
+ br x10
+
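+ // Each row is shifted left by intermediate_bits (sshl by v31) and has
+ // PREP_BIAS (v30) subtracted, giving the signed 16-bit intermediate
+ // format consumed by the compound functions.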
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add x9, x1, x2
+ lsl x2, x2, #1
+4:
+ ld1 {v0.d}[0], [x1], x2
+ ld1 {v0.d}[1], [x9], x2
+ subs w4, w4, #2
+ sshl v0.8h, v0.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h
+ st1 {v0.8h}, [x0], #16
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ add x9, x1, x2
+ lsl x2, x2, #1
+8:
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x9], x2
+ subs w4, w4, #2
+ sshl v0.8h, v0.8h, v31.8h
+ sshl v1.8h, v1.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h
+ sub v1.8h, v1.8h, v30.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x1]
+ add x1, x1, x2
+ sshl v0.8h, v0.8h, v31.8h
+ ldp q2, q3, [x1]
+ add x1, x1, x2
+ subs w4, w4, #2
+ sshl v1.8h, v1.8h, v31.8h
+ sshl v2.8h, v2.8h, v31.8h
+ sshl v3.8h, v3.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h
+ sub v1.8h, v1.8h, v30.8h
+ sub v2.8h, v2.8h, v30.8h
+ sub v3.8h, v3.8h, v30.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x1]
+ sshl v0.8h, v0.8h, v31.8h
+ ldp q2, q3, [x1, #32]
+ add x1, x1, x2
+ sshl v1.8h, v1.8h, v31.8h
+ sshl v2.8h, v2.8h, v31.8h
+ sshl v3.8h, v3.8h, v31.8h
+ subs w4, w4, #1
+ sub v0.8h, v0.8h, v30.8h
+ sub v1.8h, v1.8h, v30.8h
+ sub v2.8h, v2.8h, v30.8h
+ sub v3.8h, v3.8h, v30.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ b.gt 32b
+ ret
+64:
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x1]
+ subs w4, w4, #1
+ sshl v0.8h, v0.8h, v31.8h
+ ldp q2, q3, [x1, #32]
+ sshl v1.8h, v1.8h, v31.8h
+ ldp q4, q5, [x1, #64]
+ sshl v2.8h, v2.8h, v31.8h
+ sshl v3.8h, v3.8h, v31.8h
+ ldp q6, q7, [x1, #96]
+ add x1, x1, x2
+ sshl v4.8h, v4.8h, v31.8h
+ sshl v5.8h, v5.8h, v31.8h
+ sshl v6.8h, v6.8h, v31.8h
+ sshl v7.8h, v7.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h
+ sub v1.8h, v1.8h, v30.8h
+ sub v2.8h, v2.8h, v30.8h
+ sub v3.8h, v3.8h, v30.8h
+ stp q0, q1, [x0]
+ sub v4.8h, v4.8h, v30.8h
+ sub v5.8h, v5.8h, v30.8h
+ stp q2, q3, [x0, #32]
+ sub v6.8h, v6.8h, v30.8h
+ sub v7.8h, v7.8h, v30.8h
+ stp q4, q5, [x0, #64]
+ stp q6, q7, [x0, #96]
+ add x0, x0, x8
+ b.gt 64b
+ ret
+128:
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x1]
+ subs w4, w4, #1
+ sshl v0.8h, v0.8h, v31.8h
+ ldp q2, q3, [x1, #32]
+ sshl v1.8h, v1.8h, v31.8h
+ ldp q4, q5, [x1, #64]
+ sshl v2.8h, v2.8h, v31.8h
+ sshl v3.8h, v3.8h, v31.8h
+ ldp q6, q7, [x1, #96]
+ sshl v4.8h, v4.8h, v31.8h
+ sshl v5.8h, v5.8h, v31.8h
+ ldp q16, q17, [x1, #128]
+ sshl v6.8h, v6.8h, v31.8h
+ sshl v7.8h, v7.8h, v31.8h
+ ldp q18, q19, [x1, #160]
+ sshl v16.8h, v16.8h, v31.8h
+ sshl v17.8h, v17.8h, v31.8h
+ ldp q20, q21, [x1, #192]
+ sshl v18.8h, v18.8h, v31.8h
+ sshl v19.8h, v19.8h, v31.8h
+ ldp q22, q23, [x1, #224]
+ add x1, x1, x2
+ sshl v20.8h, v20.8h, v31.8h
+ sshl v21.8h, v21.8h, v31.8h
+ sshl v22.8h, v22.8h, v31.8h
+ sshl v23.8h, v23.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h
+ sub v1.8h, v1.8h, v30.8h
+ sub v2.8h, v2.8h, v30.8h
+ sub v3.8h, v3.8h, v30.8h
+ stp q0, q1, [x0]
+ sub v4.8h, v4.8h, v30.8h
+ sub v5.8h, v5.8h, v30.8h
+ stp q2, q3, [x0, #32]
+ sub v6.8h, v6.8h, v30.8h
+ sub v7.8h, v7.8h, v30.8h
+ stp q4, q5, [x0, #64]
+ sub v16.8h, v16.8h, v30.8h
+ sub v17.8h, v17.8h, v30.8h
+ stp q6, q7, [x0, #96]
+ sub v18.8h, v18.8h, v30.8h
+ sub v19.8h, v19.8h, v30.8h
+ stp q16, q17, [x0, #128]
+ sub v20.8h, v20.8h, v30.8h
+ sub v21.8h, v21.8h, v30.8h
+ stp q18, q19, [x0, #160]
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ stp q20, q21, [x0, #192]
+ stp q22, q23, [x0, #224]
+ add x0, x0, x8
+ b.gt 128b
+ ret
+
+L(prep_tbl):
+ .hword L(prep_tbl) - 128b
+ .hword L(prep_tbl) - 64b
+ .hword L(prep_tbl) - 32b
+ .hword L(prep_tbl) - 16b
+ .hword L(prep_tbl) - 80b
+ .hword L(prep_tbl) - 40b
+endfunc
+
+
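+// Helper macros shared by the 8-tap and bilinear code below. The load_*
+// macros fetch up to seven rows, alternating between two source pointers;
+// trailing arguments are optional and only expanded when non-blank (.ifnb).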
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ ld1 {\d0\wd}[0], [\s0], \strd
+ ld1 {\d1\wd}[0], [\s1], \strd
+.ifnb \d2
+ ld1 {\d2\wd}[0], [\s0], \strd
+ ld1 {\d3\wd}[0], [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd}[0], [\s0], \strd
+.endif
+.ifnb \d5
+ ld1 {\d5\wd}[0], [\s1], \strd
+.endif
+.ifnb \d6
+ ld1 {\d6\wd}[0], [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ ld1 {\d0\wd}, [\s0], \strd
+ ld1 {\d1\wd}, [\s1], \strd
+.ifnb \d2
+ ld1 {\d2\wd}, [\s0], \strd
+ ld1 {\d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd}, [\s0], \strd
+.endif
+.ifnb \d5
+ ld1 {\d5\wd}, [\s1], \strd
+.endif
+.ifnb \d6
+ ld1 {\d6\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
+ ld1 {\d0\wd, \d1\wd}, [\s0], \strd
+.ifnb \d2
+ ld1 {\d2\wd, \d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd, \d5\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_reg \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_reg \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
+ load_regpair \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
+.endm
+.macro interleave_1 wd, r0, r1, r2, r3, r4
+ trn1 \r0\wd, \r0\wd, \r1\wd
+ trn1 \r1\wd, \r1\wd, \r2\wd
+.ifnb \r3
+ trn1 \r2\wd, \r2\wd, \r3\wd
+ trn1 \r3\wd, \r3\wd, \r4\wd
+.endif
+.endm
+.macro interleave_1_s r0, r1, r2, r3, r4
+ interleave_1 .2s, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro umin_h c, wd, r0, r1, r2, r3
+ umin \r0\wd, \r0\wd, \c\wd
+.ifnb \r1
+ umin \r1\wd, \r1\wd, \c\wd
+.endif
+.ifnb \r2
+ umin \r2\wd, \r2\wd, \c\wd
+ umin \r3\wd, \r3\wd, \c\wd
+.endif
+.endm
+.macro sub_h c, wd, r0, r1, r2, r3
+ sub \r0\wd, \r0\wd, \c\wd
+.ifnb \r1
+ sub \r1\wd, \r1\wd, \c\wd
+.endif
+.ifnb \r2
+ sub \r2\wd, \r2\wd, \c\wd
+ sub \r3\wd, \r3\wd, \c\wd
+.endif
+.endm
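+// Vertical filtering: widening multiply-accumulate of 4 or 8 source rows
+// against the taps held in v0, producing .4s accumulators; the *2 variants
+// operate on the high halves of .8h inputs.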
+.macro smull_smlal_4 d, s0, s1, s2, s3
+ smull \d\().4s, \s0\().4h, v0.h[0]
+ smlal \d\().4s, \s1\().4h, v0.h[1]
+ smlal \d\().4s, \s2\().4h, v0.h[2]
+ smlal \d\().4s, \s3\().4h, v0.h[3]
+.endm
+.macro smull2_smlal2_4 d, s0, s1, s2, s3
+ smull2 \d\().4s, \s0\().8h, v0.h[0]
+ smlal2 \d\().4s, \s1\().8h, v0.h[1]
+ smlal2 \d\().4s, \s2\().8h, v0.h[2]
+ smlal2 \d\().4s, \s3\().8h, v0.h[3]
+.endm
+.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+ smull \d\().4s, \s0\().4h, v0.h[0]
+ smlal \d\().4s, \s1\().4h, v0.h[1]
+ smlal \d\().4s, \s2\().4h, v0.h[2]
+ smlal \d\().4s, \s3\().4h, v0.h[3]
+ smlal \d\().4s, \s4\().4h, v0.h[4]
+ smlal \d\().4s, \s5\().4h, v0.h[5]
+ smlal \d\().4s, \s6\().4h, v0.h[6]
+ smlal \d\().4s, \s7\().4h, v0.h[7]
+.endm
+.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+ smull2 \d\().4s, \s0\().8h, v0.h[0]
+ smlal2 \d\().4s, \s1\().8h, v0.h[1]
+ smlal2 \d\().4s, \s2\().8h, v0.h[2]
+ smlal2 \d\().4s, \s3\().8h, v0.h[3]
+ smlal2 \d\().4s, \s4\().8h, v0.h[4]
+ smlal2 \d\().4s, \s5\().8h, v0.h[5]
+ smlal2 \d\().4s, \s6\().8h, v0.h[6]
+ smlal2 \d\().4s, \s7\().8h, v0.h[7]
+.endm
+.macro sqrshrun_h shift, r0, r1, r2, r3
+ sqrshrun \r0\().4h, \r0\().4s, #\shift
+.ifnb \r1
+ sqrshrun2 \r0\().8h, \r1\().4s, #\shift
+.endif
+.ifnb \r2
+ sqrshrun \r2\().4h, \r2\().4s, #\shift
+ sqrshrun2 \r2\().8h, \r3\().4s, #\shift
+.endif
+.endm
+.macro xtn_h r0, r1, r2, r3
+ uzp1 \r0\().8h, \r0\().8h, \r1\().8h // Same as xtn, xtn2
+.ifnb \r2
+ uzp1 \r2\().8h, \r2\().8h, \r3\().8h // Ditto
+.endif
+.endm
+.macro srshl_s shift, r0, r1, r2, r3
+ srshl \r0\().4s, \r0\().4s, \shift\().4s
+ srshl \r1\().4s, \r1\().4s, \shift\().4s
+.ifnb \r2
+ srshl \r2\().4s, \r2\().4s, \shift\().4s
+ srshl \r3\().4s, \r3\().4s, \shift\().4s
+.endif
+.endm
+.macro st_s strd, reg, lanes
+ st1 {\reg\().s}[0], [x0], \strd
+ st1 {\reg\().s}[1], [x9], \strd
+.if \lanes > 2
+ st1 {\reg\().s}[2], [x0], \strd
+ st1 {\reg\().s}[3], [x9], \strd
+.endif
+.endm
+.macro st_d strd, r0, r1
+ st1 {\r0\().d}[0], [x0], \strd
+ st1 {\r0\().d}[1], [x9], \strd
+.ifnb \r1
+ st1 {\r1\().d}[0], [x0], \strd
+ st1 {\r1\().d}[1], [x9], \strd
+.endif
+.endm
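+// shift_store_*: final scaling and store. For put, narrow with a rounding
+// shift by 6 (sqrshrun) and clamp to bitdepth_max in v31; for prep, shift
+// by -(6-intermediate_bits) held in v30, narrow, and subtract the PREP_BIAS
+// held in v29.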
+.macro shift_store_4 type, strd, r0, r1, r2, r3
+.ifc \type, put
+ sqrshrun_h 6, \r0, \r1, \r2, \r3
+ umin_h v31, .8h, \r0, \r2
+.else
+ srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
+ xtn_h \r0, \r1, \r2, \r3
+ sub_h v29, .8h, \r0, \r2 // PREP_BIAS
+.endif
+ st_d \strd, \r0, \r2
+.endm
+.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
+ st1 {\r0\wd}, [x0], \strd
+ st1 {\r1\wd}, [x9], \strd
+.ifnb \r2
+ st1 {\r2\wd}, [x0], \strd
+ st1 {\r3\wd}, [x9], \strd
+.endif
+.ifnb \r4
+ st1 {\r4\wd}, [x0], \strd
+ st1 {\r5\wd}, [x9], \strd
+ st1 {\r6\wd}, [x0], \strd
+ st1 {\r7\wd}, [x9], \strd
+.endif
+.endm
+.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
+ st_reg \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro shift_store_8 type, strd, r0, r1, r2, r3
+.ifc \type, put
+ sqrshrun_h 6, \r0, \r1, \r2, \r3
+ umin_h v31, .8h, \r0, \r2
+.else
+ srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
+ xtn_h \r0, \r1, \r2, \r3
+ sub_h v29, .8h, \r0, \r2 // PREP_BIAS
+.endif
+ st_8h \strd, \r0, \r2
+.endm
+.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
+.ifc \type, put
+ sqrshrun_h 6, \r0, \r1, \r2, \r3
+ umin \r0\().8h, \r0\().8h, v31.8h
+ umin \r1\().8h, \r2\().8h, v31.8h
+.else
+ srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
+ xtn_h \r0, \r1, \r2, \r3
+ sub \r0\().8h, \r0\().8h, v29.8h
+ sub \r1\().8h, \r2\().8h, v29.8h
+.endif
+ st1 {\r0\().8h, \r1\().8h}, [\dst], \strd
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_16bpc_neon, export=1
+ mov w9, \type_h
+ mov w10, \type_v
+ b \op\()_8tap_neon
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH ((1*15<<7)|4*15)
+#define SHARP ((2*15<<7)|3*15)
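+// Each value packs two offsets into mc_subpel_filters, scaled by 15 (the
+// number of fractional positions per filter set): bits 7-13 select the
+// 8-tap filter used for larger block dimensions, bits 0-6 the 4-tap filter
+// used when the filtered dimension is <= 4.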
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
+make_8tap_fn \type, regular, REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp, REGULAR, SHARP
+make_8tap_fn \type, smooth, SMOOTH, SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
+make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
+make_8tap_fn \type, sharp, SHARP, SHARP
+make_8tap_fn \type, sharp_regular, SHARP, REGULAR
+make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
+
+function \type\()_8tap_neon
+.ifc \bdmax, w8
+ ldr w8, [sp]
+.endif
+ mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
+ mul \mx, \mx, w11
+ mul \my, \my, w11
+ add \mx, \mx, w9 // mx, 8tap_h, 4tap_h
+ add \my, \my, w10 // my, 8tap_v, 4tap_v
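+ // mx/my are in 0..15, so multiplying by 0x4081 replicates them into bits
+ // 0-6, 7-13 and 14-20 without carries; after adding the type constant the
+ // low field holds the 4-tap table offset, the middle field the 8-tap
+ // offset, and the top field the raw fraction used by the tst checks below.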
+.ifc \type, prep
+ uxtw \d_strd, \w
+ lsl \d_strd, \d_strd, #1
+.endif
+
+ dup v31.8h, \bdmax // bitdepth_max
+ clz \bdmax, \bdmax
+ clz w9, \w
+ sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
+ mov w12, #6
+ tst \mx, #(0x7f << 14)
+ sub w9, w9, #24
+ add w13, w12, \bdmax // 6 + intermediate_bits
+ sub w12, w12, \bdmax // 6 - intermediate_bits
+ movrel x11, X(mc_subpel_filters), -8
+ b.ne L(\type\()_8tap_h)
+ tst \my, #(0x7f << 14)
+ b.ne L(\type\()_8tap_v)
+ b \type\()_neon
+
+L(\type\()_8tap_h):
+ cmp \w, #4
+ ubfx w10, \mx, #7, #7
+ and \mx, \mx, #0x7f
+ b.le 4f
+ mov \mx, w10
+4:
+ tst \my, #(0x7f << 14)
+ add \xmx, x11, \mx, uxtw #3
+ b.ne L(\type\()_8tap_hv)
+
+ adr x10, L(\type\()_8tap_h_tbl)
+ dup v30.4s, w12 // 6 - intermediate_bits
+ ldrh w9, [x10, x9, lsl #1]
+ neg v30.4s, v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+ dup v29.8h, \bdmax // intermediate_bits
+.else
+ movi v28.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ sub x10, x10, w9, uxtw
+.ifc \type, put
+ neg v29.8h, v29.8h // -intermediate_bits
+.endif
+ br x10
+
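+ // On entry to each width case: v30 = -(6-intermediate_bits) as an srshl
+ // shift amount and v31 = bitdepth_max; put additionally has
+ // v29 = -intermediate_bits, prep has v28 = PREP_BIAS.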
+20: // 2xN h
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ sub \src, \src, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+2:
+ ld1 {v4.8h}, [\src], \s_strd
+ ld1 {v6.8h}, [\sr2], \s_strd
+ ext v5.16b, v4.16b, v4.16b, #2
+ ext v7.16b, v6.16b, v6.16b, #2
+ subs \h, \h, #2
+ trn1 v3.2s, v4.2s, v6.2s
+ trn2 v6.2s, v4.2s, v6.2s
+ trn1 v4.2s, v5.2s, v7.2s
+ trn2 v7.2s, v5.2s, v7.2s
+ smull v3.4s, v3.4h, v0.h[0]
+ smlal v3.4s, v4.4h, v0.h[1]
+ smlal v3.4s, v6.4h, v0.h[2]
+ smlal v3.4s, v7.4h, v0.h[3]
+ srshl v3.4s, v3.4s, v30.4s // -(6-intermediate_bits)
+ sqxtun v3.4h, v3.4s
+ srshl v3.4h, v3.4h, v29.4h // -intermediate_bits
+ umin v3.4h, v3.4h, v31.4h
+ st1 {v3.s}[0], [\dst], \d_strd
+ st1 {v3.s}[1], [\ds2], \d_strd
+ b.gt 2b
+ ret
+.endif
+
+40: // 4xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ sub \src, \src, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+4:
+ ld1 {v16.8h}, [\src], \s_strd
+ ld1 {v20.8h}, [\sr2], \s_strd
+ ext v17.16b, v16.16b, v16.16b, #2
+ ext v18.16b, v16.16b, v16.16b, #4
+ ext v19.16b, v16.16b, v16.16b, #6
+ ext v21.16b, v20.16b, v20.16b, #2
+ ext v22.16b, v20.16b, v20.16b, #4
+ ext v23.16b, v20.16b, v20.16b, #6
+ subs \h, \h, #2
+ smull v16.4s, v16.4h, v0.h[0]
+ smlal v16.4s, v17.4h, v0.h[1]
+ smlal v16.4s, v18.4h, v0.h[2]
+ smlal v16.4s, v19.4h, v0.h[3]
+ smull v20.4s, v20.4h, v0.h[0]
+ smlal v20.4s, v21.4h, v0.h[1]
+ smlal v20.4s, v22.4h, v0.h[2]
+ smlal v20.4s, v23.4h, v0.h[3]
+ srshl v16.4s, v16.4s, v30.4s // -(6-intermediate_bits)
+ srshl v20.4s, v20.4s, v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+ sqxtun v16.4h, v16.4s
+ sqxtun2 v16.8h, v20.4s
+ srshl v16.8h, v16.8h, v29.8h // -intermediate_bits
+ umin v16.8h, v16.8h, v31.8h
+.else
+ uzp1 v16.8h, v16.8h, v20.8h // Same as xtn, xtn2
+ sub v16.8h, v16.8h, v28.8h // PREP_BIAS
+.endif
+ st1 {v16.d}[0], [\dst], \d_strd
+ st1 {v16.d}[1], [\ds2], \d_strd
+ b.gt 4b
+ ret
+
+80:
+160:
+320:
+640:
+1280: // 8xN, 16xN, 32xN, ... h
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmx]
+ sub \src, \src, #6
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+
+ sub \s_strd, \s_strd, \w, uxtw #1
+ sub \s_strd, \s_strd, #16
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, uxtw #1
+.endif
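+ // The outer loop (81:) handles one pair of rows; the inner loop (8:)
+ // produces 8 output pixels per row per iteration, keeping the next 8
+ // input pixels in v17/v21 so that ext always has 16 contiguous source
+ // pixels to slide the 8-tap window across.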
+81:
+ ld1 {v16.8h, v17.8h}, [\src], #32
+ ld1 {v20.8h, v21.8h}, [\sr2], #32
+ mov \mx, \w
+
+8:
+ smull v18.4s, v16.4h, v0.h[0]
+ smull2 v19.4s, v16.8h, v0.h[0]
+ smull v22.4s, v20.4h, v0.h[0]
+ smull2 v23.4s, v20.8h, v0.h[0]
+.irpc i, 1234567
+ ext v24.16b, v16.16b, v17.16b, #(2*\i)
+ ext v25.16b, v20.16b, v21.16b, #(2*\i)
+ smlal v18.4s, v24.4h, v0.h[\i]
+ smlal2 v19.4s, v24.8h, v0.h[\i]
+ smlal v22.4s, v25.4h, v0.h[\i]
+ smlal2 v23.4s, v25.8h, v0.h[\i]
+.endr
+ subs \mx, \mx, #8
+ srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits)
+ srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits)
+ srshl v22.4s, v22.4s, v30.4s // -(6-intermediate_bits)
+ srshl v23.4s, v23.4s, v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+ sqxtun v18.4h, v18.4s
+ sqxtun2 v18.8h, v19.4s
+ sqxtun v22.4h, v22.4s
+ sqxtun2 v22.8h, v23.4s
+ srshl v18.8h, v18.8h, v29.8h // -intermediate_bits
+ srshl v22.8h, v22.8h, v29.8h // -intermediate_bits
+ umin v18.8h, v18.8h, v31.8h
+ umin v22.8h, v22.8h, v31.8h
+.else
+ uzp1 v18.8h, v18.8h, v19.8h // Same as xtn, xtn2
+ uzp1 v22.8h, v22.8h, v23.8h // Ditto
+ sub v18.8h, v18.8h, v28.8h // PREP_BIAS
+ sub v22.8h, v22.8h, v28.8h // PREP_BIAS
+.endif
+ st1 {v18.8h}, [\dst], #16
+ st1 {v22.8h}, [\ds2], #16
+ b.le 9f
+
+ mov v16.16b, v17.16b
+ mov v20.16b, v21.16b
+ ld1 {v17.8h}, [\src], #16
+ ld1 {v21.8h}, [\sr2], #16
+ b 8b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ b.gt 81b
+ ret
+
+L(\type\()_8tap_h_tbl):
+ .hword L(\type\()_8tap_h_tbl) - 1280b
+ .hword L(\type\()_8tap_h_tbl) - 640b
+ .hword L(\type\()_8tap_h_tbl) - 320b
+ .hword L(\type\()_8tap_h_tbl) - 160b
+ .hword L(\type\()_8tap_h_tbl) - 80b
+ .hword L(\type\()_8tap_h_tbl) - 40b
+ .hword L(\type\()_8tap_h_tbl) - 20b
+ .hword 0
+
+
+L(\type\()_8tap_v):
+ cmp \h, #4
+ ubfx w10, \my, #7, #7
+ and \my, \my, #0x7f
+ b.le 4f
+ mov \my, w10
+4:
+ add \xmy, x11, \my, uxtw #3
+
+.ifc \type, prep
+ dup v30.4s, w12 // 6 - intermediate_bits
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ adr x10, L(\type\()_8tap_v_tbl)
+ ldrh w9, [x10, x9, lsl #1]
+.ifc \type, prep
+ neg v30.4s, v30.4s // -(6-intermediate_bits)
+.endif
+ sub x10, x10, w9, uxtw
+ br x10
+
+20: // 2xN v
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ b.gt 28f
+
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ // 2x2 v
+ load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ interleave_1_s v1, v2, v3, v4, v5
+ b.gt 24f
+ smull_smlal_4 v6, v1, v2, v3, v4
+ sqrshrun_h 6, v6
+ umin_h v31, .8h, v6
+ st_s \d_strd, v6, 2
+ ret
+
+24: // 2x4 v
+ load_s \sr2, \src, \s_strd, v6, v7
+ interleave_1_s v5, v6, v7
+ smull_smlal_4 v16, v1, v2, v3, v4
+ smull_smlal_4 v17, v3, v4, v5, v6
+ sqrshrun_h 6, v16, v17
+ umin_h v31, .8h, v16
+ st_s \d_strd, v16, 4
+ ret
+
+28: // 2x6, 2x8, 2x12, 2x16 v
+ ld1 {v0.8b}, [\xmy]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7
+ interleave_1_s v1, v2, v3, v4, v5
+ interleave_1_s v5, v6, v7
+216:
+ subs \h, \h, #4
+ load_s \sr2, \src, \s_strd, v16, v17, v18, v19
+ interleave_1_s v7, v16, v17, v18, v19
+ smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
+ smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18
+ sqrshrun_h 6, v24, v25
+ umin_h v31, .8h, v24
+ st_s \d_strd, v24, 4
+ b.le 0f
+ cmp \h, #2
+ mov v1.16b, v5.16b
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+ mov v4.16b, v16.16b
+ mov v5.16b, v17.16b
+ mov v6.16b, v18.16b
+ mov v7.16b, v19.16b
+ b.eq 26f
+ b 216b
+26:
+ load_s \sr2, \src, \s_strd, v16, v17
+ interleave_1_s v7, v16, v17
+ smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
+ sqrshrun_h 6, v24
+ umin_h v31, .4h, v24
+ st_s \d_strd, v24, 2
+0:
+ ret
+.endif
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 480f
+
+ // 4x2, 4x4 v
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ smull_smlal_4 v6, v1, v2, v3, v4
+ smull_smlal_4 v7, v2, v3, v4, v5
+ shift_store_4 \type, \d_strd, v6, v7
+ b.le 0f
+ load_4h \sr2, \src, \s_strd, v6, v7
+ smull_smlal_4 v1, v3, v4, v5, v6
+ smull_smlal_4 v2, v4, v5, v6, v7
+ shift_store_4 \type, \d_strd, v1, v2
+0:
+ ret
+
+480: // 4x6, 4x8, 4x12, 4x16 v
+ ld1 {v0.8b}, [\xmy]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_4h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+
+48:
+ subs \h, \h, #4
+ load_4h \sr2, \src, \s_strd, v23, v24, v25, v26
+ smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
+ smull_smlal_8 v3, v18, v19, v20, v21, v22, v23, v24, v25
+ smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+ shift_store_4 \type, \d_strd, v1, v2, v3, v4
+ b.le 0f
+ cmp \h, #2
+ mov v16.8b, v20.8b
+ mov v17.8b, v21.8b
+ mov v18.8b, v22.8b
+ mov v19.8b, v23.8b
+ mov v20.8b, v24.8b
+ mov v21.8b, v25.8b
+ mov v22.8b, v26.8b
+ b.eq 46f
+ b 48b
+46:
+ load_4h \sr2, \src, \s_strd, v23, v24
+ smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
+ shift_store_4 \type, \d_strd, v1, v2
+0:
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 880f
+
+ // 8x2, 8x4 v
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ smull_smlal_4 v16, v1, v2, v3, v4
+ smull2_smlal2_4 v17, v1, v2, v3, v4
+ smull_smlal_4 v18, v2, v3, v4, v5
+ smull2_smlal2_4 v19, v2, v3, v4, v5
+ shift_store_8 \type, \d_strd, v16, v17, v18, v19
+ b.le 0f
+ load_8h \sr2, \src, \s_strd, v6, v7
+ smull_smlal_4 v16, v3, v4, v5, v6
+ smull2_smlal2_4 v17, v3, v4, v5, v6
+ smull_smlal_4 v18, v4, v5, v6, v7
+ smull2_smlal2_4 v19, v4, v5, v6, v7
+ shift_store_8 \type, \d_strd, v16, v17, v18, v19
+0:
+ ret
+
+880: // 8x6, 8x8, 8x16, 8x32 v
+1680: // 16x8, 16x16, ...
+320: // 32x8, 32x16, ...
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmy]
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ sxtl v0.8h, v0.8b
+ mov \my, \h
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ load_8h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+
+88:
+ subs \h, \h, #2
+ load_8h \sr2, \src, \s_strd, v23, v24
+ smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_8 v3, v17, v18, v19, v20, v21, v22, v23, v24
+ smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24
+ shift_store_8 \type, \d_strd, v1, v2, v3, v4
+ b.le 9f
+ subs \h, \h, #2
+ load_8h \sr2, \src, \s_strd, v25, v26
+ smull_smlal_8 v1, v18, v19, v20, v21, v22, v23, v24, v25
+ smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25
+ smull_smlal_8 v3, v19, v20, v21, v22, v23, v24, v25, v26
+ smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+ shift_store_8 \type, \d_strd, v1, v2, v3, v4
+ b.le 9f
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ mov v18.16b, v22.16b
+ mov v19.16b, v23.16b
+ mov v20.16b, v24.16b
+ mov v21.16b, v25.16b
+ mov v22.16b, v26.16b
+ b 88b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 168b
+0:
+ ret
+
+160:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 1680b
+
+ // 16x2, 16x4 v
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ sxtl v0.8h, v0.8b
+
+ load_16h \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
+16:
+ load_16h \src, \src, \s_strd, v22, v23
+ subs \h, \h, #1
+ smull_smlal_4 v1, v16, v18, v20, v22
+ smull2_smlal2_4 v2, v16, v18, v20, v22
+ smull_smlal_4 v3, v17, v19, v21, v23
+ smull2_smlal2_4 v4, v17, v19, v21, v23
+ shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4
+ b.le 0f
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v20.16b, v22.16b
+ mov v21.16b, v23.16b
+ b 16b
+0:
+ ret
+
+L(\type\()_8tap_v_tbl):
+ .hword L(\type\()_8tap_v_tbl) - 1280b
+ .hword L(\type\()_8tap_v_tbl) - 640b
+ .hword L(\type\()_8tap_v_tbl) - 320b
+ .hword L(\type\()_8tap_v_tbl) - 160b
+ .hword L(\type\()_8tap_v_tbl) - 80b
+ .hword L(\type\()_8tap_v_tbl) - 40b
+ .hword L(\type\()_8tap_v_tbl) - 20b
+ .hword 0
+
+L(\type\()_8tap_hv):
+ cmp \h, #4
+ ubfx w10, \my, #7, #7
+ and \my, \my, #0x7f
+ b.le 4f
+ mov \my, w10
+4:
+ add \xmy, x11, \my, uxtw #3
+
+ adr x10, L(\type\()_8tap_hv_tbl)
+ dup v30.4s, w12 // 6 - intermediate_bits
+ ldrh w9, [x10, x9, lsl #1]
+ neg v30.4s, v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+ dup v29.4s, w13 // 6 + intermediate_bits
+.else
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ sub x10, x10, w9, uxtw
+.ifc \type, put
+ neg v29.4s, v29.4s // -(6+intermediate_bits)
+.endif
+ br x10
+
+20:
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ b.gt 280f
+ add \xmy, \xmy, #2
+ ld1 {v1.s}[0], [\xmy]
+
+ // 2x2, 2x4 hv
+ sub \sr2, \src, #2
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v27.8h}, [\src], \s_strd
+ ext v28.16b, v27.16b, v27.16b, #2
+ smull v27.4s, v27.4h, v0.4h
+ smull v28.4s, v28.4h, v0.4h
+ addp v27.4s, v27.4s, v28.4s
+ addp v16.4s, v27.4s, v27.4s
+ srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
+ bl L(\type\()_8tap_filter_2)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
+ xtn v16.4h, v16.4s
+
+ trn1 v16.2s, v16.2s, v24.2s
+ mov v17.8b, v24.8b
+
+2:
+ bl L(\type\()_8tap_filter_2)
+
+ ext v18.8b, v17.8b, v24.8b, #4
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v24.4h, v1.h[3]
+
+ srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v2.4h, v2.4s
+ umin v2.4h, v2.4h, v31.4h
+ subs \h, \h, #2
+ st1 {v2.s}[0], [\dst], \d_strd
+ st1 {v2.s}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v24.8b
+ b 2b
+
+280: // 2x8, 2x16, 2x32 hv
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #2
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v27.8h}, [\src], \s_strd
+ ext v28.16b, v27.16b, v27.16b, #2
+ smull v27.4s, v27.4h, v0.4h
+ smull v28.4s, v28.4h, v0.4h
+ addp v27.4s, v27.4s, v28.4s
+ addp v16.4s, v27.4s, v27.4s
+ srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
+
+ bl L(\type\()_8tap_filter_2)
+ xtn v16.4h, v16.4s
+ trn1 v16.2s, v16.2s, v24.2s
+ mov v17.8b, v24.8b
+ bl L(\type\()_8tap_filter_2)
+ ext v18.8b, v17.8b, v24.8b, #4
+ mov v19.8b, v24.8b
+ bl L(\type\()_8tap_filter_2)
+ ext v20.8b, v19.8b, v24.8b, #4
+ mov v21.8b, v24.8b
+
+28:
+ bl L(\type\()_8tap_filter_2)
+ ext v22.8b, v21.8b, v24.8b, #4
+ smull v3.4s, v16.4h, v1.h[0]
+ smlal v3.4s, v17.4h, v1.h[1]
+ smlal v3.4s, v18.4h, v1.h[2]
+ smlal v3.4s, v19.4h, v1.h[3]
+ smlal v3.4s, v20.4h, v1.h[4]
+ smlal v3.4s, v21.4h, v1.h[5]
+ smlal v3.4s, v22.4h, v1.h[6]
+ smlal v3.4s, v24.4h, v1.h[7]
+
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v3.4h, v3.4s
+ umin v3.4h, v3.4h, v31.4h
+ subs \h, \h, #2
+ st1 {v3.s}[0], [\dst], \d_strd
+ st1 {v3.s}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v24.8b
+ b 28b
+
+0:
+ ret x15
+
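+// Horizontal 4-tap helper for the 2-pixel-wide hv cases: filters the next
+// row from \sr2 and the next row from \src and returns two pixels from each,
+// narrowed to 16 bit, in v24.4h.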
+L(\type\()_8tap_filter_2):
+ ld1 {v25.8h}, [\sr2], \s_strd
+ ld1 {v27.8h}, [\src], \s_strd
+ ext v26.16b, v25.16b, v25.16b, #2
+ ext v28.16b, v27.16b, v27.16b, #2
+ trn1 v24.2s, v25.2s, v27.2s
+ trn2 v27.2s, v25.2s, v27.2s
+ trn1 v25.2s, v26.2s, v28.2s
+ trn2 v28.2s, v26.2s, v28.2s
+ smull v24.4s, v24.4h, v0.h[0]
+ smlal v24.4s, v25.4h, v0.h[1]
+ smlal v24.4s, v27.4h, v0.h[2]
+ smlal v24.4s, v28.4h, v0.h[3]
+ srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ xtn v24.4h, v24.4s
+ ret
+.endif
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ b.gt 480f
+ add \xmy, \xmy, #2
+ ld1 {v1.s}[0], [\xmy]
+ sub \sr2, \src, #2
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ // 4x2, 4x4 hv
+ ld1 {v25.8h}, [\src], \s_strd
+ ext v26.16b, v25.16b, v25.16b, #2
+ ext v27.16b, v25.16b, v25.16b, #4
+ ext v28.16b, v25.16b, v25.16b, #6
+ smull v25.4s, v25.4h, v0.h[0]
+ smlal v25.4s, v26.4h, v0.h[1]
+ smlal v25.4s, v27.4h, v0.h[2]
+ smlal v25.4s, v28.4h, v0.h[3]
+ srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
+ xtn v16.4h, v16.4s
+
+ bl L(\type\()_8tap_filter_4)
+ mov v17.8b, v24.8b
+ mov v18.8b, v25.8b
+
+4:
+ bl L(\type\()_8tap_filter_4)
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v24.4h, v1.h[3]
+ smull v3.4s, v17.4h, v1.h[0]
+ smlal v3.4s, v18.4h, v1.h[1]
+ smlal v3.4s, v24.4h, v1.h[2]
+ smlal v3.4s, v25.4h, v1.h[3]
+.ifc \type, put
+ srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ umin v2.8h, v2.8h, v31.8h
+.else
+ rshrn v2.4h, v2.4s, #6
+ rshrn2 v2.8h, v3.4s, #6
+ sub v2.8h, v2.8h, v29.8h // PREP_BIAS
+.endif
+ subs \h, \h, #2
+
+ st1 {v2.d}[0], [\dst], \d_strd
+ st1 {v2.d}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v24.8b
+ mov v18.8b, v25.8b
+ b 4b
+
+480: // 4x8, 4x16, 4x32 hv
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #2
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v25.8h}, [\src], \s_strd
+ ext v26.16b, v25.16b, v25.16b, #2
+ ext v27.16b, v25.16b, v25.16b, #4
+ ext v28.16b, v25.16b, v25.16b, #6
+ smull v25.4s, v25.4h, v0.h[0]
+ smlal v25.4s, v26.4h, v0.h[1]
+ smlal v25.4s, v27.4h, v0.h[2]
+ smlal v25.4s, v28.4h, v0.h[3]
+ srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
+ xtn v16.4h, v16.4s
+
+ bl L(\type\()_8tap_filter_4)
+ mov v17.8b, v24.8b
+ mov v18.8b, v25.8b
+ bl L(\type\()_8tap_filter_4)
+ mov v19.8b, v24.8b
+ mov v20.8b, v25.8b
+ bl L(\type\()_8tap_filter_4)
+ mov v21.8b, v24.8b
+ mov v22.8b, v25.8b
+
+48:
+ bl L(\type\()_8tap_filter_4)
+ smull v3.4s, v16.4h, v1.h[0]
+ smlal v3.4s, v17.4h, v1.h[1]
+ smlal v3.4s, v18.4h, v1.h[2]
+ smlal v3.4s, v19.4h, v1.h[3]
+ smlal v3.4s, v20.4h, v1.h[4]
+ smlal v3.4s, v21.4h, v1.h[5]
+ smlal v3.4s, v22.4h, v1.h[6]
+ smlal v3.4s, v24.4h, v1.h[7]
+ smull v4.4s, v17.4h, v1.h[0]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal v4.4s, v19.4h, v1.h[2]
+ smlal v4.4s, v20.4h, v1.h[3]
+ smlal v4.4s, v21.4h, v1.h[4]
+ smlal v4.4s, v22.4h, v1.h[5]
+ smlal v4.4s, v24.4h, v1.h[6]
+ smlal v4.4s, v25.4h, v1.h[7]
+.ifc \type, put
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v3.4h, v3.4s
+ sqxtun2 v3.8h, v4.4s
+ umin v3.8h, v3.8h, v31.8h
+.else
+ rshrn v3.4h, v3.4s, #6
+ rshrn2 v3.8h, v4.4s, #6
+ sub v3.8h, v3.8h, v29.8h // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ st1 {v3.d}[0], [\dst], \d_strd
+ st1 {v3.d}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v24.8b
+ mov v22.8b, v25.8b
+ b 48b
+0:
+ ret x15
+
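+// Horizontal 4-tap helper for the 4-pixel-wide hv cases: filters one row
+// from \sr2 and one row from \src, returning them narrowed in v24.4h and
+// v25.4h.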
+L(\type\()_8tap_filter_4):
+ ld1 {v24.8h}, [\sr2], \s_strd
+ ld1 {v25.8h}, [\src], \s_strd
+ ext v26.16b, v24.16b, v24.16b, #2
+ ext v27.16b, v24.16b, v24.16b, #4
+ ext v28.16b, v24.16b, v24.16b, #6
+ smull v24.4s, v24.4h, v0.h[0]
+ smlal v24.4s, v26.4h, v0.h[1]
+ smlal v24.4s, v27.4h, v0.h[2]
+ smlal v24.4s, v28.4h, v0.h[3]
+ ext v26.16b, v25.16b, v25.16b, #2
+ ext v27.16b, v25.16b, v25.16b, #4
+ ext v28.16b, v25.16b, v25.16b, #6
+ smull v25.4s, v25.4h, v0.h[0]
+ smlal v25.4s, v26.4h, v0.h[1]
+ smlal v25.4s, v27.4h, v0.h[2]
+ smlal v25.4s, v28.4h, v0.h[3]
+ srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ xtn v24.4h, v24.4s
+ xtn v25.4h, v25.4s
+ ret
+
+80:
+160:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 880f
+ add \xmy, \xmy, #2
+ ld1 {v0.8b}, [\xmx]
+ ld1 {v1.s}[0], [\xmy]
+ sub \src, \src, #6
+ sub \src, \src, \s_strd
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+ mov \my, \h
+
+164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ ld1 {v27.8h, v28.8h}, [\src], \s_strd
+ smull v24.4s, v27.4h, v0.h[0]
+ smull2 v25.4s, v27.8h, v0.h[0]
+.irpc i, 1234567
+ ext v26.16b, v27.16b, v28.16b, #(2*\i)
+ smlal v24.4s, v26.4h, v0.h[\i]
+ smlal2 v25.4s, v26.8h, v0.h[\i]
+.endr
+ srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53),
+ // and conserves register space (no need to clobber v8-v15).
+ uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
+
+ bl L(\type\()_8tap_filter_8)
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+
+8:
+ smull v2.4s, v16.4h, v1.h[0]
+ smull2 v3.4s, v16.8h, v1.h[0]
+ bl L(\type\()_8tap_filter_8)
+ smull v4.4s, v17.4h, v1.h[0]
+ smull2 v5.4s, v17.8h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal2 v3.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal2 v5.4s, v18.8h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal2 v3.4s, v18.8h, v1.h[2]
+ smlal v4.4s, v23.4h, v1.h[2]
+ smlal2 v5.4s, v23.8h, v1.h[2]
+ smlal v2.4s, v23.4h, v1.h[3]
+ smlal2 v3.4s, v23.8h, v1.h[3]
+ smlal v4.4s, v24.4h, v1.h[3]
+ smlal2 v5.4s, v24.8h, v1.h[3]
+.ifc \type, put
+ srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
+ srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ sqxtun v3.4h, v4.4s
+ sqxtun2 v3.8h, v5.4s
+ umin v2.8h, v2.8h, v31.8h
+ umin v3.8h, v3.8h, v31.8h
+.else
+ rshrn v2.4h, v2.4s, #6
+ rshrn2 v2.8h, v3.4s, #6
+ rshrn v3.4h, v4.4s, #6
+ rshrn2 v3.8h, v5.4s, #6
+ sub v2.8h, v2.8h, v29.8h // PREP_BIAS
+ sub v3.8h, v3.8h, v29.8h // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ st1 {v2.8h}, [\dst], \d_strd
+ st1 {v3.8h}, [\ds2], \d_strd
+ b.le 9f
+ mov v16.16b, v18.16b
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+ b 8b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #2
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 164b
+
+880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmx]
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #6
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+ mov \my, \h
+
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ ld1 {v27.8h, v28.8h}, [\src], \s_strd
+ smull v24.4s, v27.4h, v0.h[0]
+ smull2 v25.4s, v27.8h, v0.h[0]
+.irpc i, 1234567
+ ext v26.16b, v27.16b, v28.16b, #(2*\i)
+ smlal v24.4s, v26.4h, v0.h[\i]
+ smlal2 v25.4s, v26.8h, v0.h[\i]
+.endr
+ srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53),
+ // and conserves register space (no need to clobber v8-v15).
+ uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
+
+ bl L(\type\()_8tap_filter_8)
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+ bl L(\type\()_8tap_filter_8)
+ mov v19.16b, v23.16b
+ mov v20.16b, v24.16b
+ bl L(\type\()_8tap_filter_8)
+ mov v21.16b, v23.16b
+ mov v22.16b, v24.16b
+
+88:
+ smull v2.4s, v16.4h, v1.h[0]
+ smull2 v3.4s, v16.8h, v1.h[0]
+ bl L(\type\()_8tap_filter_8)
+ smull v4.4s, v17.4h, v1.h[0]
+ smull2 v5.4s, v17.8h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal2 v3.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal2 v5.4s, v18.8h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal2 v3.4s, v18.8h, v1.h[2]
+ smlal v4.4s, v19.4h, v1.h[2]
+ smlal2 v5.4s, v19.8h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal2 v3.4s, v19.8h, v1.h[3]
+ smlal v4.4s, v20.4h, v1.h[3]
+ smlal2 v5.4s, v20.8h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal2 v3.4s, v20.8h, v1.h[4]
+ smlal v4.4s, v21.4h, v1.h[4]
+ smlal2 v5.4s, v21.8h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal2 v3.4s, v21.8h, v1.h[5]
+ smlal v4.4s, v22.4h, v1.h[5]
+ smlal2 v5.4s, v22.8h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+ smlal2 v3.4s, v22.8h, v1.h[6]
+ smlal v4.4s, v23.4h, v1.h[6]
+ smlal2 v5.4s, v23.8h, v1.h[6]
+ smlal v2.4s, v23.4h, v1.h[7]
+ smlal2 v3.4s, v23.8h, v1.h[7]
+ smlal v4.4s, v24.4h, v1.h[7]
+ smlal2 v5.4s, v24.8h, v1.h[7]
+.ifc \type, put
+ srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
+ srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ sqxtun v3.4h, v4.4s
+ sqxtun2 v3.8h, v5.4s
+ umin v2.8h, v2.8h, v31.8h
+ umin v3.8h, v3.8h, v31.8h
+.else
+ rshrn v2.4h, v2.4s, #6
+ rshrn2 v2.8h, v3.4s, #6
+ rshrn v3.4h, v4.4s, #6
+ rshrn2 v3.8h, v5.4s, #6
+ sub v2.8h, v2.8h, v29.8h // PREP_BIAS
+ sub v3.8h, v3.8h, v29.8h // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ st1 {v2.8h}, [\dst], \d_strd
+ st1 {v3.8h}, [\ds2], \d_strd
+ b.le 9f
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v20.16b, v22.16b
+ mov v21.16b, v23.16b
+ mov v22.16b, v24.16b
+ b 88b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 168b
+0:
+ ret x15
+
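+// Horizontal 8-tap helper for the 8-pixel-wide hv cases: filters 8 pixels of
+// one row from \sr2 and one row from \src, returning them in v23.8h and
+// v24.8h.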
+L(\type\()_8tap_filter_8):
+ ld1 {v4.8h, v5.8h}, [\sr2], \s_strd
+ ld1 {v6.8h, v7.8h}, [\src], \s_strd
+ smull v25.4s, v4.4h, v0.h[0]
+ smull2 v26.4s, v4.8h, v0.h[0]
+ smull v27.4s, v6.4h, v0.h[0]
+ smull2 v28.4s, v6.8h, v0.h[0]
+.irpc i, 1234567
+ ext v23.16b, v4.16b, v5.16b, #(2*\i)
+ ext v24.16b, v6.16b, v7.16b, #(2*\i)
+ smlal v25.4s, v23.4h, v0.h[\i]
+ smlal2 v26.4s, v23.8h, v0.h[\i]
+ smlal v27.4s, v24.4h, v0.h[\i]
+ smlal2 v28.4s, v24.8h, v0.h[\i]
+.endr
+ srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits)
+ srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits)
+ srshl v28.4s, v28.4s, v30.4s // -(6-intermediate_bits)
+ uzp1 v23.8h, v25.8h, v26.8h // Same as xtn, xtn2
+ uzp1 v24.8h, v27.8h, v28.8h // Ditto
+ ret
+
+L(\type\()_8tap_hv_tbl):
+ .hword L(\type\()_8tap_hv_tbl) - 1280b
+ .hword L(\type\()_8tap_hv_tbl) - 640b
+ .hword L(\type\()_8tap_hv_tbl) - 320b
+ .hword L(\type\()_8tap_hv_tbl) - 160b
+ .hword L(\type\()_8tap_hv_tbl) - 80b
+ .hword L(\type\()_8tap_hv_tbl) - 40b
+ .hword L(\type\()_8tap_hv_tbl) - 20b
+ .hword 0
+endfunc
+
+
+function \type\()_bilin_16bpc_neon, export=1
+.ifc \bdmax, w8
+ ldr w8, [sp]
+.endif
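+ // Bilinear weights: v0 = 16-mx, v1 = mx (horizontal);
+ // v2 = 16-my, v3 = my (vertical).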
+ dup v1.8h, \mx
+ dup v3.8h, \my
+ mov w10, #16
+ sub w9, w10, \mx
+ sub w10, w10, \my
+ dup v0.8h, w9
+ dup v2.8h, w10
+.ifc \type, prep
+ uxtw \d_strd, \w
+ lsl \d_strd, \d_strd, #1
+.endif
+
+ clz \bdmax, \bdmax // bitdepth_max
+ clz w9, \w
+ sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
+ mov w11, #4
+ sub w9, w9, #24
+ sub w11, w11, \bdmax // 4 - intermediate_bits
+ add w12, \bdmax, #4 // 4 + intermediate_bits
+ cbnz \mx, L(\type\()_bilin_h)
+ cbnz \my, L(\type\()_bilin_v)
+ b \type\()_neon
+
+L(\type\()_bilin_h):
+ cbnz \my, L(\type\()_bilin_hv)
+
+ adr x10, L(\type\()_bilin_h_tbl)
+ dup v31.8h, w11 // 4 - intermediate_bits
+ ldrh w9, [x10, x9, lsl #1]
+ neg v31.8h, v31.8h // -(4-intermediate_bits)
+.ifc \type, put
+ dup v30.8h, \bdmax // intermediate_bits
+.else
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ sub x10, x10, w9, uxtw
+.ifc \type, put
+ neg v30.8h, v30.8h // -intermediate_bits
+.endif
+ br x10
+
+20: // 2xN h
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+2:
+ ld1 {v4.4h}, [\src], \s_strd
+ ld1 {v6.4h}, [\sr2], \s_strd
+ ext v5.8b, v4.8b, v4.8b, #2
+ ext v7.8b, v6.8b, v6.8b, #2
+ trn1 v4.2s, v4.2s, v6.2s
+ trn1 v5.2s, v5.2s, v7.2s
+ subs \h, \h, #2
+ mul v4.4h, v4.4h, v0.4h
+ mla v4.4h, v5.4h, v1.4h
+ urshl v4.4h, v4.4h, v31.4h
+ urshl v4.4h, v4.4h, v30.4h
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+ b.gt 2b
+ ret
+.endif
+
+40: // 4xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+4:
+ ld1 {v4.8h}, [\src], \s_strd
+ ld1 {v6.8h}, [\sr2], \s_strd
+ ext v5.16b, v4.16b, v4.16b, #2
+ ext v7.16b, v6.16b, v6.16b, #2
+ trn1 v4.2d, v4.2d, v6.2d
+ trn1 v5.2d, v5.2d, v7.2d
+ subs \h, \h, #2
+ mul v4.8h, v4.8h, v0.8h
+ mla v4.8h, v5.8h, v1.8h
+ urshl v4.8h, v4.8h, v31.8h
+.ifc \type, put
+ urshl v4.8h, v4.8h, v30.8h
+.else
+ sub v4.8h, v4.8h, v29.8h
+.endif
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+ b.gt 4b
+ ret
+
+80: // 8xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+8:
+ ldr h5, [\src, #16]
+ ldr h7, [\sr2, #16]
+ ld1 {v4.8h}, [\src], \s_strd
+ ld1 {v6.8h}, [\sr2], \s_strd
+ ext v5.16b, v4.16b, v5.16b, #2
+ ext v7.16b, v6.16b, v7.16b, #2
+ subs \h, \h, #2
+ mul v4.8h, v4.8h, v0.8h
+ mla v4.8h, v5.8h, v1.8h
+ mul v6.8h, v6.8h, v0.8h
+ mla v6.8h, v7.8h, v1.8h
+ urshl v4.8h, v4.8h, v31.8h
+ urshl v6.8h, v6.8h, v31.8h
+.ifc \type, put
+ urshl v4.8h, v4.8h, v30.8h
+ urshl v6.8h, v6.8h, v30.8h
+.else
+ sub v4.8h, v4.8h, v29.8h
+ sub v6.8h, v6.8h, v29.8h
+.endif
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v6.8h}, [\ds2], \d_strd
+ b.gt 8b
+ ret
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+
+ sub \s_strd, \s_strd, \w, uxtw #1
+ sub \s_strd, \s_strd, #16
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, uxtw #1
+.endif
+161:
+ ld1 {v16.8h}, [\src], #16
+ ld1 {v21.8h}, [\sr2], #16
+ mov \mx, \w
+
+16:
+ ld1 {v17.8h, v18.8h}, [\src], #32
+ ld1 {v22.8h, v23.8h}, [\sr2], #32
+ ext v19.16b, v16.16b, v17.16b, #2
+ ext v20.16b, v17.16b, v18.16b, #2
+ ext v24.16b, v21.16b, v22.16b, #2
+ ext v25.16b, v22.16b, v23.16b, #2
+ mul v16.8h, v16.8h, v0.8h
+ mla v16.8h, v19.8h, v1.8h
+ mul v17.8h, v17.8h, v0.8h
+ mla v17.8h, v20.8h, v1.8h
+ mul v21.8h, v21.8h, v0.8h
+ mla v21.8h, v24.8h, v1.8h
+ mul v22.8h, v22.8h, v0.8h
+ mla v22.8h, v25.8h, v1.8h
+ urshl v16.8h, v16.8h, v31.8h
+ urshl v17.8h, v17.8h, v31.8h
+ urshl v21.8h, v21.8h, v31.8h
+ urshl v22.8h, v22.8h, v31.8h
+ subs \mx, \mx, #16
+.ifc \type, put
+ urshl v16.8h, v16.8h, v30.8h
+ urshl v17.8h, v17.8h, v30.8h
+ urshl v21.8h, v21.8h, v30.8h
+ urshl v22.8h, v22.8h, v30.8h
+.else
+ sub v16.8h, v16.8h, v29.8h
+ sub v17.8h, v17.8h, v29.8h
+ sub v21.8h, v21.8h, v29.8h
+ sub v22.8h, v22.8h, v29.8h
+.endif
+ st1 {v16.8h, v17.8h}, [\dst], #32
+ st1 {v21.8h, v22.8h}, [\ds2], #32
+ b.le 9f
+
+ mov v16.16b, v18.16b
+ mov v21.16b, v23.16b
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ b.gt 161b
+ ret
+
+L(\type\()_bilin_h_tbl):
+ .hword L(\type\()_bilin_h_tbl) - 1280b
+ .hword L(\type\()_bilin_h_tbl) - 640b
+ .hword L(\type\()_bilin_h_tbl) - 320b
+ .hword L(\type\()_bilin_h_tbl) - 160b
+ .hword L(\type\()_bilin_h_tbl) - 80b
+ .hword L(\type\()_bilin_h_tbl) - 40b
+ .hword L(\type\()_bilin_h_tbl) - 20b
+ .hword 0
+
+
+L(\type\()_bilin_v):
+ cmp \h, #4
+ adr x10, L(\type\()_bilin_v_tbl)
+.ifc \type, prep
+ dup v31.8h, w11 // 4 - intermediate_bits
+.endif
+ ldrh w9, [x10, x9, lsl #1]
+.ifc \type, prep
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8
+ neg v31.8h, v31.8h // -(4-intermediate_bits)
+.endif
+ sub x10, x10, w9, uxtw
+ br x10
+
+20: // 2xN v
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ cmp \h, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ // 2x2 v
+ ld1 {v16.s}[0], [\src], \s_strd
+ b.gt 24f
+22:
+ ld1 {v17.s}[0], [\sr2], \s_strd
+ ld1 {v18.s}[0], [\src], \s_strd
+ trn1 v16.2s, v16.2s, v17.2s
+ trn1 v17.2s, v17.2s, v18.2s
+ mul v4.4h, v16.4h, v2.4h
+ mla v4.4h, v17.4h, v3.4h
+ urshr v4.8h, v4.8h, #4
+ st1 {v4.s}[0], [\dst]
+ st1 {v4.s}[1], [\ds2]
+ ret
+24: // 2x4, 2x6, 2x8, ... v
+ ld1 {v17.s}[0], [\sr2], \s_strd
+ ld1 {v18.s}[0], [\src], \s_strd
+ ld1 {v19.s}[0], [\sr2], \s_strd
+ ld1 {v20.s}[0], [\src], \s_strd
+ sub \h, \h, #4
+ trn1 v16.2s, v16.2s, v17.2s
+ trn1 v17.2s, v17.2s, v18.2s
+ trn1 v18.2s, v18.2s, v19.2s
+ trn1 v19.2s, v19.2s, v20.2s
+ trn1 v16.2d, v16.2d, v18.2d
+ trn1 v17.2d, v17.2d, v19.2d
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ cmp \h, #2
+ urshr v4.8h, v4.8h, #4
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+ st1 {v4.s}[2], [\dst], \d_strd
+ st1 {v4.s}[3], [\ds2], \d_strd
+ b.lt 0f
+ mov v16.8b, v20.8b
+ b.eq 22b
+ b 24b
+0:
+ ret
+.endif
+
+40: // 4xN v
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ ld1 {v16.4h}, [\src], \s_strd
+4:
+ ld1 {v17.4h}, [\sr2], \s_strd
+ ld1 {v18.4h}, [\src], \s_strd
+ trn1 v16.2d, v16.2d, v17.2d
+ trn1 v17.2d, v17.2d, v18.2d
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ urshr v4.8h, v4.8h, #4
+.else
+ urshl v4.8h, v4.8h, v31.8h
+ sub v4.8h, v4.8h, v29.8h
+.endif
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ b 4b
+0:
+ ret
+
+80: // 8xN v
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ ld1 {v16.8h}, [\src], \s_strd
+8:
+ ld1 {v17.8h}, [\sr2], \s_strd
+ ld1 {v18.8h}, [\src], \s_strd
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ mul v5.8h, v17.8h, v2.8h
+ mla v5.8h, v18.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ urshr v4.8h, v4.8h, #4
+ urshr v5.8h, v5.8h, #4
+.else
+ urshl v4.8h, v4.8h, v31.8h
+ urshl v5.8h, v5.8h, v31.8h
+ sub v4.8h, v4.8h, v29.8h
+ sub v5.8h, v5.8h, v29.8h
+.endif
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v5.8h}, [\ds2], \d_strd
+ b.le 0f
+ mov v16.16b, v18.16b
+ b 8b
+0:
+ ret
+
+160: // 16xN, 32xN, ...
+320:
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ mov \my, \h
+1:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v16.8h, v17.8h}, [\src], \s_strd
+2:
+ ld1 {v18.8h, v19.8h}, [\sr2], \s_strd
+ ld1 {v20.8h, v21.8h}, [\src], \s_strd
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v18.8h, v3.8h
+ mul v5.8h, v17.8h, v2.8h
+ mla v5.8h, v19.8h, v3.8h
+ mul v6.8h, v18.8h, v2.8h
+ mla v6.8h, v20.8h, v3.8h
+ mul v7.8h, v19.8h, v2.8h
+ mla v7.8h, v21.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ urshr v4.8h, v4.8h, #4
+ urshr v5.8h, v5.8h, #4
+ urshr v6.8h, v6.8h, #4
+ urshr v7.8h, v7.8h, #4
+.else
+ urshl v4.8h, v4.8h, v31.8h
+ urshl v5.8h, v5.8h, v31.8h
+ urshl v6.8h, v6.8h, v31.8h
+ urshl v7.8h, v7.8h, v31.8h
+ sub v4.8h, v4.8h, v29.8h
+ sub v5.8h, v5.8h, v29.8h
+ sub v6.8h, v6.8h, v29.8h
+ sub v7.8h, v7.8h, v29.8h
+.endif
+ st1 {v4.8h, v5.8h}, [\dst], \d_strd
+ st1 {v6.8h, v7.8h}, [\ds2], \d_strd
+ b.le 9f
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ b 2b
+9:
+ subs \w, \w, #16
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #32
+ add \dst, \dst, #32
+ b 1b
+0:
+ ret
+
+L(\type\()_bilin_v_tbl):
+ .hword L(\type\()_bilin_v_tbl) - 1280b
+ .hword L(\type\()_bilin_v_tbl) - 640b
+ .hword L(\type\()_bilin_v_tbl) - 320b
+ .hword L(\type\()_bilin_v_tbl) - 160b
+ .hword L(\type\()_bilin_v_tbl) - 80b
+ .hword L(\type\()_bilin_v_tbl) - 40b
+ .hword L(\type\()_bilin_v_tbl) - 20b
+ .hword 0
+
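+// Combined horizontal + vertical bilinear filtering: the horizontal pass
+// produces rows with intermediate_bits of extra precision, which the vertical
+// pass accumulates at 32 bits before rounding back down (put) or converting
+// to the prep intermediate format.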
+L(\type\()_bilin_hv):
+ adr x10, L(\type\()_bilin_hv_tbl)
+ dup v31.8h, w11 // 4 - intermediate_bits
+ ldrh w9, [x10, x9, lsl #1]
+ neg v31.8h, v31.8h // -(4-intermediate_bits)
+.ifc \type, put
+ dup v30.4s, w12 // 4 + intermediate_bits
+.else
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ sub x10, x10, w9, uxtw
+.ifc \type, put
+ neg v30.4s, v30.4s // -(4+intermediate_bits)
+.endif
+ br x10
+
+20: // 2xN hv
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v20.4h}, [\src], \s_strd
+ ext v21.8b, v20.8b, v20.8b, #2
+ mul v16.4h, v20.4h, v0.4h
+ mla v16.4h, v21.4h, v1.4h
+ urshl v16.4h, v16.4h, v31.4h
+
+2:
+ ld1 {v22.4h}, [\sr2], \s_strd
+ ld1 {v24.4h}, [\src], \s_strd
+ ext v23.8b, v22.8b, v22.8b, #2
+ ext v25.8b, v24.8b, v24.8b, #2
+ trn1 v22.2s, v22.2s, v24.2s
+ trn1 v23.2s, v23.2s, v25.2s
+ mul v17.4h, v22.4h, v0.4h
+ mla v17.4h, v23.4h, v1.4h
+ urshl v17.4h, v17.4h, v31.4h
+
+ trn1 v16.2s, v16.2s, v17.2s
+
+ umull v4.4s, v16.4h, v2.4h
+ umlal v4.4s, v17.4h, v3.4h
+ urshl v4.4s, v4.4s, v30.4s
+ xtn v4.4h, v4.4s
+ subs \h, \h, #2
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+ b.le 0f
+ trn2 v16.2s, v17.2s, v17.2s
+ b 2b
+0:
+ ret
+.endif
+
+40: // 4xN hv
+ AARCH64_VALID_JUMP_TARGET
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v20.8h}, [\src], \s_strd
+ ext v21.16b, v20.16b, v20.16b, #2
+ mul v16.4h, v20.4h, v0.4h
+ mla v16.4h, v21.4h, v1.4h
+ urshl v16.4h, v16.4h, v31.4h
+
+4:
+ ld1 {v22.8h}, [\sr2], \s_strd
+ ld1 {v24.8h}, [\src], \s_strd
+ ext v23.16b, v22.16b, v22.16b, #2
+ ext v25.16b, v24.16b, v24.16b, #2
+ trn1 v22.2d, v22.2d, v24.2d
+ trn1 v23.2d, v23.2d, v25.2d
+ mul v17.8h, v22.8h, v0.8h
+ mla v17.8h, v23.8h, v1.8h
+ urshl v17.8h, v17.8h, v31.8h
+
+ trn1 v16.2d, v16.2d, v17.2d
+
+ umull v4.4s, v16.4h, v2.4h
+ umlal v4.4s, v17.4h, v3.4h
+ umull2 v5.4s, v16.8h, v2.8h
+ umlal2 v5.4s, v17.8h, v3.8h
+.ifc \type, put
+ urshl v4.4s, v4.4s, v30.4s
+ urshl v5.4s, v5.4s, v30.4s
+ uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2
+.else
+ rshrn v4.4h, v4.4s, #4
+ rshrn2 v4.8h, v5.4s, #4
+ sub v4.8h, v4.8h, v29.8h
+.endif
+ subs \h, \h, #2
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+ b.le 0f
+ trn2 v16.2d, v17.2d, v17.2d
+ b 4b
+0:
+ ret
+
+80: // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ mov \my, \h
+
+1:
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ldr h21, [\src, #16]
+ ld1 {v20.8h}, [\src], \s_strd
+ ext v21.16b, v20.16b, v21.16b, #2
+ mul v16.8h, v20.8h, v0.8h
+ mla v16.8h, v21.8h, v1.8h
+ urshl v16.8h, v16.8h, v31.8h
+
+2:
+ ldr h23, [\sr2, #16]
+ ld1 {v22.8h}, [\sr2], \s_strd
+ ldr h25, [\src, #16]
+ ld1 {v24.8h}, [\src], \s_strd
+ ext v23.16b, v22.16b, v23.16b, #2
+ ext v25.16b, v24.16b, v25.16b, #2
+ mul v17.8h, v22.8h, v0.8h
+ mla v17.8h, v23.8h, v1.8h
+ mul v18.8h, v24.8h, v0.8h
+ mla v18.8h, v25.8h, v1.8h
+ urshl v17.8h, v17.8h, v31.8h
+ urshl v18.8h, v18.8h, v31.8h
+
+ umull v4.4s, v16.4h, v2.4h
+ umlal v4.4s, v17.4h, v3.4h
+ umull2 v5.4s, v16.8h, v2.8h
+ umlal2 v5.4s, v17.8h, v3.8h
+ umull v6.4s, v17.4h, v2.4h
+ umlal v6.4s, v18.4h, v3.4h
+ umull2 v7.4s, v17.8h, v2.8h
+ umlal2 v7.4s, v18.8h, v3.8h
+.ifc \type, put
+ urshl v4.4s, v4.4s, v30.4s
+ urshl v5.4s, v5.4s, v30.4s
+ urshl v6.4s, v6.4s, v30.4s
+ urshl v7.4s, v7.4s, v30.4s
+ uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2
+ uzp1 v5.8h, v6.8h, v7.8h // Ditto
+.else
+ rshrn v4.4h, v4.4s, #4
+ rshrn2 v4.8h, v5.4s, #4
+ rshrn v5.4h, v6.4s, #4
+ rshrn2 v5.8h, v7.4s, #4
+ sub v4.8h, v4.8h, v29.8h
+ sub v5.8h, v5.8h, v29.8h
+.endif
+ subs \h, \h, #2
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v5.8h}, [\ds2], \d_strd
+ b.le 9f
+ mov v16.16b, v18.16b
+ b 2b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 1b
+0:
+ ret
+
+L(\type\()_bilin_hv_tbl):
+ .hword L(\type\()_bilin_hv_tbl) - 1280b
+ .hword L(\type\()_bilin_hv_tbl) - 640b
+ .hword L(\type\()_bilin_hv_tbl) - 320b
+ .hword L(\type\()_bilin_hv_tbl) - 160b
+ .hword L(\type\()_bilin_hv_tbl) - 80b
+ .hword L(\type\()_bilin_hv_tbl) - 40b
+ .hword L(\type\()_bilin_hv_tbl) - 20b
+ .hword 0
+endfunc
+.endm
+
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+
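+// Load one 8-byte row (8 int8 taps) of mc_warp_filter, selected by the top
+// bits of \src (\src >> 10), then advance \src by \inc.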
+.macro load_filter_row dst, src, inc
+ asr w13, \src, #10
+ add \src, \src, \inc
+ ldr \dst, [x11, w13, sxtw #3]
+.endm
+
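+// Horizontal pass for one row of the 8x8 warp: applies eight per-pixel 8-tap
+// filters to the source row at x2 and leaves the 32-bit sums, rounding-shifted
+// right by 7 - intermediate_bits, in v16.4s and v17.4s.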
+function warp_filter_horz_neon
+ add w12, w5, #512
+
+ ld1 {v16.8h, v17.8h}, [x2], x3
+
+ load_filter_row d0, w12, w7
+ load_filter_row d1, w12, w7
+ load_filter_row d2, w12, w7
+ sxtl v0.8h, v0.8b
+ load_filter_row d3, w12, w7
+ sxtl v1.8h, v1.8b
+ load_filter_row d4, w12, w7
+ sxtl v2.8h, v2.8b
+ load_filter_row d5, w12, w7
+ sxtl v3.8h, v3.8b
+ load_filter_row d6, w12, w7
+ sxtl v4.8h, v4.8b
+ load_filter_row d7, w12, w7
+ sxtl v5.8h, v5.8b
+ ext v18.16b, v16.16b, v17.16b, #2*1
+ smull v8.4s, v16.4h, v0.4h
+ smull2 v9.4s, v16.8h, v0.8h
+ sxtl v6.8h, v6.8b
+ ext v19.16b, v16.16b, v17.16b, #2*2
+ smull v10.4s, v18.4h, v1.4h
+ smull2 v11.4s, v18.8h, v1.8h
+ sxtl v7.8h, v7.8b
+ ext v20.16b, v16.16b, v17.16b, #2*3
+ smull v0.4s, v19.4h, v2.4h
+ smull2 v1.4s, v19.8h, v2.8h
+ ext v21.16b, v16.16b, v17.16b, #2*4
+ addp v8.4s, v8.4s, v9.4s
+ smull v2.4s, v20.4h, v3.4h
+ smull2 v3.4s, v20.8h, v3.8h
+ ext v22.16b, v16.16b, v17.16b, #2*5
+ addp v9.4s, v10.4s, v11.4s
+ smull v10.4s, v21.4h, v4.4h
+ smull2 v11.4s, v21.8h, v4.8h
+ ext v23.16b, v16.16b, v17.16b, #2*6
+ addp v0.4s, v0.4s, v1.4s
+ smull v18.4s, v22.4h, v5.4h
+ smull2 v19.4s, v22.8h, v5.8h
+ ext v16.16b, v16.16b, v17.16b, #2*7
+ addp v1.4s, v2.4s, v3.4s
+ addp v2.4s, v10.4s, v11.4s
+ smull v20.4s, v23.4h, v6.4h
+ smull2 v21.4s, v23.8h, v6.8h
+ addp v3.4s, v18.4s, v19.4s
+ smull v22.4s, v16.4h, v7.4h
+ smull2 v23.4s, v16.8h, v7.8h
+ addp v4.4s, v20.4s, v21.4s
+ addp v5.4s, v22.4s, v23.4s
+
+ addp v8.4s, v8.4s, v9.4s
+ addp v0.4s, v0.4s, v1.4s
+ addp v2.4s, v2.4s, v3.4s
+ addp v4.4s, v4.4s, v5.4s
+
+ addp v16.4s, v8.4s, v0.4s
+ addp v17.4s, v2.4s, v4.4s
+
+ add w5, w5, w8
+
+ srshl v16.4s, v16.4s, v14.4s // -(7 - intermediate_bits)
+ srshl v17.4s, v17.4s, v14.4s // -(7 - intermediate_bits)
+
+ ret
+endfunc
+
+// void dav1d_warp_affine_8x8_16bpc_neon(
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *const abcd, int mx, int my,
+// const int bitdepth_max)
+.macro warp t
+function warp_affine_8x8\t\()_16bpc_neon, export=1
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+.ifb \t
+ dup v15.8h, w7 // bitdepth_max
+.else
+ movi v15.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ clz w7, w7
+ // intermediate_bits = clz(bitdepth_max) - 18
+.ifb \t
+ sub w8, w7, #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
+.endif
+ sub w7, w7, #25 // -(7 - intermediate_bits)
+.ifb \t
+ neg w8, w8 // -(7 + intermediate_bits)
+.endif
+ dup v14.4s, w7 // -(7 - intermediate_bits)
+.ifb \t
+ dup v13.4s, w8 // -(7 + intermediate_bits)
+.endif
+
+ ldr x4, [x4]
+ sbfx x7, x4, #0, #16
+ sbfx x8, x4, #16, #16
+ sbfx x9, x4, #32, #16
+ sbfx x4, x4, #48, #16
+ mov w10, #8
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ sub x2, x2, #6
+ movrel x11, X(mc_warp_filter), 64*8
+ mov x15, x30
+.ifnb \t
+ lsl x1, x1, #1
+.endif
+
+ bl warp_filter_horz_neon
+ uzp1 v24.8h, v16.8h, v17.8h // Same as xtn, xtn2
+ bl warp_filter_horz_neon
+ uzp1 v25.8h, v16.8h, v17.8h // Ditto
+ bl warp_filter_horz_neon
+ uzp1 v26.8h, v16.8h, v17.8h // Ditto
+ bl warp_filter_horz_neon
+ uzp1 v27.8h, v16.8h, v17.8h // Ditto
+ bl warp_filter_horz_neon
+ uzp1 v28.8h, v16.8h, v17.8h // Ditto
+ bl warp_filter_horz_neon
+ uzp1 v29.8h, v16.8h, v17.8h // Ditto
+ bl warp_filter_horz_neon
+ uzp1 v30.8h, v16.8h, v17.8h // Ditto
+
+1:
+ add w14, w6, #512
+ bl warp_filter_horz_neon
+ uzp1 v31.8h, v16.8h, v17.8h // Same as xtn, xtn2
+
+ load_filter_row d0, w14, w9
+ load_filter_row d1, w14, w9
+ load_filter_row d2, w14, w9
+ load_filter_row d3, w14, w9
+ load_filter_row d4, w14, w9
+ load_filter_row d5, w14, w9
+ load_filter_row d6, w14, w9
+ load_filter_row d7, w14, w9
+ transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
+
+ // This ordering of smull/smlal/smull2/smlal2 is highly
+ // beneficial for Cortex A53 here.
+ smull v16.4s, v24.4h, v0.4h
+ smlal v16.4s, v25.4h, v1.4h
+ smlal v16.4s, v26.4h, v2.4h
+ smlal v16.4s, v27.4h, v3.4h
+ smlal v16.4s, v28.4h, v4.4h
+ smlal v16.4s, v29.4h, v5.4h
+ smlal v16.4s, v30.4h, v6.4h
+ smlal v16.4s, v31.4h, v7.4h
+ smull2 v17.4s, v24.8h, v0.8h
+ smlal2 v17.4s, v25.8h, v1.8h
+ smlal2 v17.4s, v26.8h, v2.8h
+ smlal2 v17.4s, v27.8h, v3.8h
+ smlal2 v17.4s, v28.8h, v4.8h
+ smlal2 v17.4s, v29.8h, v5.8h
+ smlal2 v17.4s, v30.8h, v6.8h
+ smlal2 v17.4s, v31.8h, v7.8h
+
+ mov v24.16b, v25.16b
+ mov v25.16b, v26.16b
+.ifb \t
+ srshl v16.4s, v16.4s, v13.4s // -(7 + intermediate_bits)
+ srshl v17.4s, v17.4s, v13.4s // -(7 + intermediate_bits)
+.else
+ rshrn v16.4h, v16.4s, #7
+ rshrn2 v16.8h, v17.4s, #7
+.endif
+ mov v26.16b, v27.16b
+.ifb \t
+ sqxtun v16.4h, v16.4s
+ sqxtun2 v16.8h, v17.4s
+.else
+ sub v16.8h, v16.8h, v15.8h // PREP_BIAS
+.endif
+ mov v27.16b, v28.16b
+ mov v28.16b, v29.16b
+.ifb \t
+ umin v16.8h, v16.8h, v15.8h // bitdepth_max
+.endif
+ mov v29.16b, v30.16b
+ mov v30.16b, v31.16b
+ subs w10, w10, #1
+ st1 {v16.8h}, [x0], x1
+
+ add w6, w6, w4
+ b.gt 1b
+
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+
+ ret x15
+endfunc
+.endm
+
+warp
+warp t
+
+// void dav1d_emu_edge_16bpc_neon(
+// const intptr_t bw, const intptr_t bh,
+// const intptr_t iw, const intptr_t ih,
+// const intptr_t x, const intptr_t y,
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_16bpc_neon, export=1
+ ldp x8, x9, [sp]
+
+ // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ // ref += iclip(x, 0, iw - 1)
+ sub x12, x3, #1 // ih - 1
+ cmp x5, x3
+ sub x13, x2, #1 // iw - 1
+ csel x12, x12, x5, ge // min(y, ih - 1)
+ cmp x4, x2
+ bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
+ csel x13, x13, x4, ge // min(x, iw - 1)
+ bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
+ madd x8, x12, x9, x8 // ref += iclip() * stride
+ add x8, x8, x13, lsl #1 // ref += iclip()
+
+ // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ // top_ext = iclip(-y, 0, bh - 1)
+ add x10, x5, x1 // y + bh
+ neg x5, x5 // -y
+ sub x10, x10, x3 // y + bh - ih
+ sub x12, x1, #1 // bh - 1
+ cmp x10, x1
+ bic x5, x5, x5, asr #63 // max(-y, 0)
+ csel x10, x10, x12, lt // min(y + bh - ih, bh-1)
+ cmp x5, x1
+ bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
+ csel x5, x5, x12, lt // min(max(-y, 0), bh-1)
+
+ // right_ext = iclip(x + bw - iw, 0, bw - 1)
+ // left_ext = iclip(-x, 0, bw - 1)
+ add x11, x4, x0 // x + bw
+ neg x4, x4 // -x
+ sub x11, x11, x2 // x + bw - iw
+ sub x13, x0, #1 // bw - 1
+ cmp x11, x0
+ bic x4, x4, x4, asr #63 // max(-x, 0)
+ csel x11, x11, x13, lt // min(x + bw - iw, bw-1)
+ cmp x4, x0
+ bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
+ csel x4, x4, x13, lt // min(max(-x, 0), bw - 1)
+
+ // center_h = bh - top_ext - bottom_ext
+ // dst += top_ext * PXSTRIDE(dst_stride)
+ // center_w = bw - left_ext - right_ext
+ sub x1, x1, x5 // bh - top_ext
+ madd x6, x5, x7, x6
+ sub x2, x0, x4 // bw - left_ext
+ sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext
+ sub x2, x2, x11 // center_w = bw - left_ext - right_ext
+
+ mov x14, x6 // backup of dst
+
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+ ld1r {v0.8h}, [x8]
+ mov x12, x6 // out = dst
+ mov x3, x4
+ mov v1.16b, v0.16b
+1:
+ subs x3, x3, #16
+ st1 {v0.8h, v1.8h}, [x12], #32
+ b.gt 1b
+.endif
+ mov x13, x8
+ add x12, x6, x4, lsl #1 // out = dst + left_ext
+ mov x3, x2
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
+ subs x3, x3, #32
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
+ b.gt 1b
+.if \need_right
+ add x3, x8, x2, lsl #1 // in + center_w
+ sub x3, x3, #2 // in + center_w - 1
+ add x12, x6, x4, lsl #1 // dst + left_ext
+ ld1r {v0.8h}, [x3]
+ add x12, x12, x2, lsl #1 // out = dst + left_ext + center_w
+ mov x3, x11
+ mov v1.16b, v0.16b
+1:
+ subs x3, x3, #16
+ st1 {v0.8h, v1.8h}, [x12], #32
+ b.gt 1b
+.endif
+
+ subs x1, x1, #1 // center_h--
+ add x6, x6, x7
+ add x8, x8, x9
+ b.gt 0b
+.endm
+
+ cbz x4, 2f
+ // need_left
+ cbz x11, 3f
+ // need_left + need_right
+ v_loop 1, 1
+ b 5f
+
+2:
+ // !need_left
+ cbz x11, 4f
+ // !need_left + need_right
+ v_loop 0, 1
+ b 5f
+
+3:
+ // need_left + !need_right
+ v_loop 1, 0
+ b 5f
+
+4:
+ // !need_left + !need_right
+ v_loop 0, 0
+
+5:
+
+ cbz x10, 3f
+ // need_bottom
+ sub x8, x6, x7 // ref = dst - stride
+ mov x4, x0
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
+ mov x3, x10
+2:
+ subs x3, x3, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
+ b.gt 2b
+ msub x6, x7, x10, x6 // dst -= bottom_ext * stride
+ subs x4, x4, #32 // bw -= 32
+ add x6, x6, #64 // dst += 32
+ b.gt 1b
+
+3:
+ cbz x5, 3f
+ // need_top
+ msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64
+ mov x3, x5
+2:
+ subs x3, x3, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
+ b.gt 2b
+ msub x6, x7, x5, x6 // dst -= top_ext * stride
+ subs x0, x0, #32 // bw -= 32
+ add x6, x6, #64 // dst += 32
+ b.gt 1b
+
+3:
+ ret
+endfunc
diff --git a/third_party/dav1d/src/arm/64/msac.S b/third_party/dav1d/src/arm/64/msac.S
new file mode 100644
index 0000000000..3a6cf900a9
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/msac.S
@@ -0,0 +1,480 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define BUF_POS 0
+#define BUF_END 8
+#define DIF 16
+#define RNG 24
+#define CNT 28
+#define ALLOW_UPDATE_CDF 32
+
+const coeffs
+ .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+ .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+endconst
+
+const bits
+ .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+ .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
+endconst
+
+.macro ld1_n d0, d1, src, sz, n
+.if \n <= 8
+ ld1 {\d0\sz}, [\src]
+.else
+ ld1 {\d0\sz, \d1\sz}, [\src]
+.endif
+.endm
+
+.macro st1_n s0, s1, dst, sz, n
+.if \n <= 8
+ st1 {\s0\sz}, [\dst]
+.else
+ st1 {\s0\sz, \s1\sz}, [\dst]
+.endif
+.endm
+
+.macro ushr_n d0, d1, s0, s1, shift, sz, n
+ ushr \d0\sz, \s0\sz, \shift
+.if \n == 16
+ ushr \d1\sz, \s1\sz, \shift
+.endif
+.endm
+
+.macro add_n d0, d1, s0, s1, s2, s3, sz, n
+ add \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ add \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro sub_n d0, d1, s0, s1, s2, s3, sz, n
+ sub \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ sub \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro and_n d0, d1, s0, s1, s2, s3, sz, n
+ and \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ and \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n
+ cmhs \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ cmhs \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro urhadd_n d0, d1, s0, s1, s2, s3, sz, n
+ urhadd \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ urhadd \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
+ sshl \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ sshl \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
+ sqdmulh \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ sqdmulh \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro str_n idx0, idx1, dstreg, dstoff, n
+ str \idx0, [\dstreg, \dstoff]
+.if \n == 16
+ str \idx1, [\dstreg, \dstoff + 16]
+.endif
+.endm
+
+// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+// size_t n_symbols);
+
+function msac_decode_symbol_adapt4_neon, export=1
+.macro decode_update sz, szb, n
+ sub sp, sp, #48
+ add x8, x0, #RNG
+ ld1_n v0, v1, x1, \sz, \n // cdf
+ ld1r {v4\sz}, [x8] // rng
+ movrel x9, coeffs, 30
+ movi v31\sz, #0x7f, lsl #8 // 0x7f00
+ sub x9, x9, x2, lsl #1
+ mvni v30\sz, #0x3f // 0xffc0
+ and v7\szb, v4\szb, v31\szb // rng & 0x7f00
+ str h4, [sp, #14] // store original u = s->rng
+ and_n v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0
+
+ ld1_n v4, v5, x9, \sz, \n // EC_MIN_PROB * (n_symbols - ret)
+ sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+ add x8, x0, #DIF + 6
+
+ add_n v4, v5, v2, v3, v4, v5, \sz, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+ add_n v4, v5, v6, v7, v4, v5, \sz, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+
+ ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16)
+ movrel x8, bits
+ str_n q4, q5, sp, #16, \n // store v values to allow indexed access
+
+ ld1_n v16, v17, x8, .8h, \n
+
+ cmhs_n v2, v3, v6, v6, v4, v5, .8h, \n // c >= v
+
+ and_n v6, v7, v2, v3, v16, v17, .16b, \n // One bit per halfword set in the mask
+.if \n == 16
+ add v6.8h, v6.8h, v7.8h
+.endif
+ addv h6, v6.8h // Aggregate mask bits
+ ldr w4, [x0, #ALLOW_UPDATE_CDF]
+ umov w3, v6.h[0]
+ rbit w3, w3
+ clz w15, w3 // ret
+
+ cbz w4, L(renorm)
+ // update_cdf
+ ldrh w3, [x1, x2, lsl #1] // count = cdf[n_symbols]
+ movi v5\szb, #0xff
+.if \n == 16
+ mov w4, #-5
+.else
+ mvn w14, w2
+ mov w4, #-4
+ cmn w14, #3 // set C if n_symbols <= 2
+.endif
+ urhadd_n v4, v5, v5, v5, v2, v3, \sz, \n // i >= val ? -1 : 32768
+.if \n == 16
+ sub w4, w4, w3, lsr #4 // -((count >> 4) + 5)
+.else
+ lsr w14, w3, #4 // count >> 4
+ sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4)
+.endif
+ sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i])
+ dup v6\sz, w4 // -rate
+
+ sub w3, w3, w3, lsr #5 // count - (count == 32)
+ sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0)
+ sshl_n v4, v5, v4, v5, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate
+ add w3, w3, #1 // count + (count < 32)
+ add_n v0, v1, v0, v1, v4, v5, \sz, \n // cdf + (32768 - cdf[i]) >> rate
+ st1_n v0, v1, x1, \sz, \n
+ strh w3, [x1, x2, lsl #1]
+.endm
+
+ decode_update .4h, .8b, 4
+
+L(renorm):
+ add x8, sp, #16
+ add x8, x8, w15, uxtw #1
+ ldrh w3, [x8] // v
+ ldurh w4, [x8, #-2] // u
+ ldr w6, [x0, #CNT]
+ ldr x7, [x0, #DIF]
+ sub w4, w4, w3 // rng = u - v
+ clz w5, w4 // clz(rng)
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+ mvn x7, x7 // ~dif
+ add x7, x7, x3, lsl #48 // ~dif + (v << 48)
+L(renorm2):
+ lsl w4, w4, w5 // rng << d
+ subs w6, w6, w5 // cnt -= d
+ lsl x7, x7, x5 // (~dif + (v << 48)) << d
+ str w4, [x0, #RNG]
+ mvn x7, x7 // ~dif
+ b.hs 9f
+
+ // refill
+ ldp x3, x4, [x0] // BUF_POS, BUF_END
+ add x5, x3, #8
+ cmp x5, x4
+ b.gt 2f
+
+ ldr x3, [x3] // next_bits
+ add w8, w6, #23 // shift_bits = cnt + 23
+ add w6, w6, #16 // cnt += 16
+ rev x3, x3 // next_bits = bswap(next_bits)
+ sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3
+ and w8, w8, #24 // shift_bits &= 24
+ lsr x3, x3, x8 // next_bits >>= shift_bits
+ sub w8, w8, w6 // shift_bits -= 16 + cnt
+ str x5, [x0, #BUF_POS]
+ lsl x3, x3, x8 // next_bits <<= shift_bits
+ mov w4, #48
+ sub w6, w4, w8 // cnt = cnt + 64 - shift_bits
+ eor x7, x7, x3 // dif ^= next_bits
+ b 9f
+
+2: // refill_eob
+ mov w14, #40
+ sub w5, w14, w6 // c = 40 - cnt
+3:
+ cmp x3, x4
+ b.ge 4f
+ ldrb w8, [x3], #1
+ lsl x8, x8, x5
+ eor x7, x7, x8
+ subs w5, w5, #8
+ b.ge 3b
+
+4: // refill_eob_end
+ str x3, [x0, #BUF_POS]
+ sub w6, w14, w5 // cnt = 40 - c
+
+9:
+ str w6, [x0, #CNT]
+ str x7, [x0, #DIF]
+
+ mov w0, w15
+ add sp, sp, #48
+ ret
+endfunc
+
+function msac_decode_symbol_adapt8_neon, export=1
+ decode_update .8h, .16b, 8
+ b L(renorm)
+endfunc
+
+function msac_decode_symbol_adapt16_neon, export=1
+ decode_update .8h, .16b, 16
+ b L(renorm)
+endfunc
+
+function msac_decode_hi_tok_neon, export=1
+ ld1 {v0.4h}, [x1] // cdf
+ add x16, x0, #RNG
+ movi v31.4h, #0x7f, lsl #8 // 0x7f00
+ movrel x17, coeffs, 30-2*3
+ mvni v30.4h, #0x3f // 0xffc0
+ ldrh w9, [x1, #6] // count = cdf[n_symbols]
+ ld1r {v3.4h}, [x16] // rng
+ movrel x16, bits
+ ld1 {v29.4h}, [x17] // EC_MIN_PROB * (n_symbols - ret)
+ add x17, x0, #DIF + 6
+ ld1 {v16.8h}, [x16]
+ mov w13, #-24
+ and v17.8b, v0.8b, v30.8b // cdf & 0xffc0
+ ldr w10, [x0, #ALLOW_UPDATE_CDF]
+ ld1r {v1.8h}, [x17] // dif >> (EC_WIN_SIZE - 16)
+ sub sp, sp, #48
+ ldr w6, [x0, #CNT]
+ ldr x7, [x0, #DIF]
+1:
+ and v7.8b, v3.8b, v31.8b // rng & 0x7f00
+ sqdmulh v6.4h, v17.4h, v7.4h // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+ add v4.4h, v17.4h, v29.4h // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+ add v4.4h, v6.4h, v4.4h // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+ str h3, [sp, #14] // store original u = s->rng
+ cmhs v2.8h, v1.8h, v4.8h // c >= v
+ str q4, [sp, #16] // store v values to allow indexed access
+ and v6.16b, v2.16b, v16.16b // One bit per halfword set in the mask
+ addv h6, v6.8h // Aggregate mask bits
+ umov w3, v6.h[0]
+ add w13, w13, #5
+ rbit w3, w3
+ add x8, sp, #16
+ clz w15, w3 // ret
+
+ cbz w10, 2f
+ // update_cdf
+ movi v5.8b, #0xff
+ mov w4, #-5
+ urhadd v4.4h, v5.4h, v2.4h // i >= val ? -1 : 32768
+ sub w4, w4, w9, lsr #4 // -((count >> 4) + 5)
+ sub v4.4h, v4.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i])
+ dup v6.4h, w4 // -rate
+
+ sub w9, w9, w9, lsr #5 // count - (count == 32)
+ sub v0.4h, v0.4h, v2.4h // cdf + (i >= val ? 1 : 0)
+ sshl v4.4h, v4.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate
+ add w9, w9, #1 // count + (count < 32)
+ add v0.4h, v0.4h, v4.4h // cdf + (32768 - cdf[i]) >> rate
+ st1 {v0.4h}, [x1]
+ and v17.8b, v0.8b, v30.8b // cdf & 0xffc0
+ strh w9, [x1, #6]
+
+2:
+ add x8, x8, w15, uxtw #1
+ ldrh w3, [x8] // v
+ ldurh w4, [x8, #-2] // u
+ sub w4, w4, w3 // rng = u - v
+ clz w5, w4 // clz(rng)
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+ mvn x7, x7 // ~dif
+ add x7, x7, x3, lsl #48 // ~dif + (v << 48)
+ lsl w4, w4, w5 // rng << d
+ subs w6, w6, w5 // cnt -= d
+ lsl x7, x7, x5 // (~dif + (v << 48)) << d
+ str w4, [x0, #RNG]
+ dup v3.4h, w4
+ mvn x7, x7 // ~dif
+ b.hs 9f
+
+ // refill
+ ldp x3, x4, [x0] // BUF_POS, BUF_END
+ add x5, x3, #8
+ cmp x5, x4
+ b.gt 2f
+
+ ldr x3, [x3] // next_bits
+ add w8, w6, #23 // shift_bits = cnt + 23
+ add w6, w6, #16 // cnt += 16
+ rev x3, x3 // next_bits = bswap(next_bits)
+ sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3
+ and w8, w8, #24 // shift_bits &= 24
+ lsr x3, x3, x8 // next_bits >>= shift_bits
+ sub w8, w8, w6 // shift_bits -= 16 + cnt
+ str x5, [x0, #BUF_POS]
+ lsl x3, x3, x8 // next_bits <<= shift_bits
+ mov w4, #48
+ sub w6, w4, w8 // cnt = cnt + 64 - shift_bits
+ eor x7, x7, x3 // dif ^= next_bits
+ b 9f
+
+2: // refill_eob
+ mov w14, #40
+ sub w5, w14, w6 // c = 40 - cnt
+3:
+ cmp x3, x4
+ b.ge 4f
+ ldrb w8, [x3], #1
+ lsl x8, x8, x5
+ eor x7, x7, x8
+ subs w5, w5, #8
+ b.ge 3b
+
+4: // refill_eob_end
+ str x3, [x0, #BUF_POS]
+ sub w6, w14, w5 // cnt = 40 - c
+
+9:
+ lsl w15, w15, #1
+ sub w15, w15, #5
+ lsr x12, x7, #48
+ adds w13, w13, w15 // carry = tok_br < 3 || tok == 15
+ dup v1.8h, w12
+ b.cc 1b // loop if !carry
+ add w13, w13, #30
+ str w6, [x0, #CNT]
+ add sp, sp, #48
+ str x7, [x0, #DIF]
+ lsr w0, w13, #1
+ ret
+endfunc
+
+function msac_decode_bool_equi_neon, export=1
+ ldp w5, w6, [x0, #RNG] // + CNT
+ sub sp, sp, #48
+ ldr x7, [x0, #DIF]
+ bic w4, w5, #0xff // r &= 0xff00
+ add w4, w4, #8
+ subs x8, x7, x4, lsl #47 // dif - vw
+ lsr w4, w4, #1 // v
+ sub w5, w5, w4 // r - v
+ cset w15, lo
+ csel w4, w5, w4, hs // if (ret) v = r - v;
+ csel x7, x8, x7, hs // if (ret) dif = dif - vw;
+
+ clz w5, w4 // clz(rng)
+ mvn x7, x7 // ~dif
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+ b L(renorm2)
+endfunc
+
+function msac_decode_bool_neon, export=1
+ ldp w5, w6, [x0, #RNG] // + CNT
+ sub sp, sp, #48
+ ldr x7, [x0, #DIF]
+ lsr w4, w5, #8 // r >> 8
+ bic w1, w1, #0x3f // f &= ~63
+ mul w4, w4, w1
+ lsr w4, w4, #7
+ add w4, w4, #4 // v
+ subs x8, x7, x4, lsl #48 // dif - vw
+ sub w5, w5, w4 // r - v
+ cset w15, lo
+ csel w4, w5, w4, hs // if (ret) v = r - v;
+ csel x7, x8, x7, hs // if (ret) dif = dif - vw;
+
+ clz w5, w4 // clz(rng)
+ mvn x7, x7 // ~dif
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+ b L(renorm2)
+endfunc
+
+function msac_decode_bool_adapt_neon, export=1
+ ldr w9, [x1] // cdf[0-1]
+ ldp w5, w6, [x0, #RNG] // + CNT
+ sub sp, sp, #48
+ ldr x7, [x0, #DIF]
+ lsr w4, w5, #8 // r >> 8
+ and w2, w9, #0xffc0 // f &= ~63
+ mul w4, w4, w2
+ lsr w4, w4, #7
+ add w4, w4, #4 // v
+ subs x8, x7, x4, lsl #48 // dif - vw
+ sub w5, w5, w4 // r - v
+ cset w15, lo
+ csel w4, w5, w4, hs // if (ret) v = r - v;
+ csel x7, x8, x7, hs // if (ret) dif = dif - vw;
+
+ ldr w10, [x0, #ALLOW_UPDATE_CDF]
+
+ clz w5, w4 // clz(rng)
+ mvn x7, x7 // ~dif
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+
+ cbz w10, L(renorm2)
+
+ lsr w2, w9, #16 // count = cdf[1]
+ and w9, w9, #0xffff // cdf[0]
+
+ sub w3, w2, w2, lsr #5 // count - (count >= 32)
+ lsr w2, w2, #4 // count >> 4
+ add w10, w3, #1 // count + (count < 32)
+ add w2, w2, #4 // rate = (count >> 4) | 4
+
+ sub w9, w9, w15 // cdf[0] -= bit
+ sub w11, w9, w15, lsl #15 // {cdf[0], cdf[0] - 32769}
+ asr w11, w11, w2 // {cdf[0], cdf[0] - 32769} >> rate
+ sub w9, w9, w11 // cdf[0]
+
+ strh w9, [x1]
+ strh w10, [x1, #2]
+
+ b L(renorm2)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/refmvs.S b/third_party/dav1d/src/arm/64/refmvs.S
new file mode 100644
index 0000000000..e905682f47
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/refmvs.S
@@ -0,0 +1,292 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
+// int bx4, int bw4, int bh4)
+
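+// The 12-byte refmvs_block at x1 is replicated across v0-v2 (48 bytes, i.e.
+// four blocks) and stored over bw4 blocks per row for bh4 rows, starting at
+// column bx4, via a jump table selected by the block width.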
+function splat_mv_neon, export=1
+ ld1 {v3.16b}, [x1]
+ clz w3, w3
+ adr x5, L(splat_tbl)
+ sub w3, w3, #26
+ ext v2.16b, v3.16b, v3.16b, #12
+ ldrh w3, [x5, w3, uxtw #1]
+ add w2, w2, w2, lsl #1
+ ext v0.16b, v2.16b, v3.16b, #4
+ sub x3, x5, w3, uxtw
+ ext v1.16b, v2.16b, v3.16b, #8
+ lsl w2, w2, #2
+ ext v2.16b, v2.16b, v3.16b, #12
+1:
+ ldr x1, [x0], #8
+ subs w4, w4, #1
+ add x1, x1, x2
+ br x3
+
+10:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.8b}, [x1]
+ str s2, [x1, #8]
+ b.gt 1b
+ ret
+20:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b}, [x1]
+ str d1, [x1, #16]
+ b.gt 1b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+160:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+80:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+40:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b, v1.16b, v2.16b}, [x1]
+ b.gt 1b
+ ret
+
+L(splat_tbl):
+ .hword L(splat_tbl) - 320b
+ .hword L(splat_tbl) - 160b
+ .hword L(splat_tbl) - 80b
+ .hword L(splat_tbl) - 40b
+ .hword L(splat_tbl) - 20b
+ .hword L(splat_tbl) - 10b
+endfunc
+
+const mv_tbls, align=4
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+ .byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
+ .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
+ .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
+endconst
+
+const mask_mult, align=4
+ .byte 1, 2, 1, 2, 0, 0, 0, 0
+endconst
+
+// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
+// refmvs_block **rr, const uint8_t *ref_sign,
+// int col_end8, int row_end8,
+// int col_start8, int row_start8)
+function save_tmvs_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ movi v30.8b, #0
+ ld1 {v31.8b}, [x3]
+ adr x8, L(save_tmvs_tbl)
+ movrel x16, mask_mult
+ movrel x13, mv_tbls
+ ld1 {v29.8b}, [x16]
+ ext v31.8b, v30.8b, v31.8b, #7 // [0, ref_sign]
+ mov w15, #5
+ mov w14, #12*2
+ sxtw x4, w4
+ sxtw x6, w6
+ mul w1, w1, w15 // stride *= 5
+ sub w5, w5, w7 // h = row_end8 - row_start8
+ lsl w7, w7, #1 // row_start8 <<= 1
+1:
+ mov w15, #5
+ and w9, w7, #30 // (y & 15) * 2
+ ldr x9, [x2, w9, uxtw #3] // b = rr[(y & 15) * 2]
+ add x9, x9, #12 // &b[... + 1]
+ madd x10, x4, x14, x9 // end_cand_b = &b[col_end8*2 + 1]
+ madd x9, x6, x14, x9 // cand_b = &b[x*2 + 1]
+
+ madd x3, x6, x15, x0 // &rp[x]
+
+2:
+ ldrb w11, [x9, #10] // cand_b->bs
+ ld1 {v0.16b}, [x9] // cand_b->mv
+ add x11, x8, w11, uxtw #2
+ ldr h1, [x9, #8] // cand_b->ref
+ ldrh w12, [x11] // bw8
+ mov x15, x8
+ add x9, x9, w12, uxtw #1 // cand_b += bw8*2
+ cmp x9, x10
+ mov v2.8b, v0.8b
+ b.ge 3f
+
+ ldrb w15, [x9, #10] // cand_b->bs
+ add x16, x9, #8
+ ld1 {v4.16b}, [x9] // cand_b->mv
+ add x15, x8, w15, uxtw #2
+ ld1 {v1.h}[1], [x16] // cand_b->ref
+ ldrh w12, [x15] // bw8
+ add x9, x9, w12, uxtw #1 // cand_b += bw8*2
+ trn1 v2.2d, v0.2d, v4.2d
+
+3:
+ abs v2.8h, v2.8h // abs(mv[].xy)
+ tbl v1.8b, {v31.16b}, v1.8b // ref_sign[ref]
+ ushr v2.8h, v2.8h, #12 // abs(mv[].xy) >> 12
+ umull v1.8h, v1.8b, v29.8b // ref_sign[ref] * {1, 2}
+ cmeq v2.4s, v2.4s, #0 // abs(mv[].xy) <= 4096
+ xtn v2.4h, v2.4s // abs() condition to 16 bit
+ and v1.8b, v1.8b, v2.8b // h[0-3] contains conditions for mv[0-1]
+ addp v1.4h, v1.4h, v1.4h // Combine condition for [1] and [0]
+ umov w16, v1.h[0] // Extract case for first block
+ umov w17, v1.h[1]
+ ldrh w11, [x11, #2] // Fetch jump table entry
+ ldrh w15, [x15, #2]
+ ldr q1, [x13, w16, uxtw #4] // Load permutation table based on case
+ ldr q5, [x13, w17, uxtw #4]
+ sub x11, x8, w11, uxtw // Find jump table target
+ sub x15, x8, w15, uxtw
+ tbl v0.16b, {v0.16b}, v1.16b // Permute cand_b to output refmvs_temporal_block
+ tbl v4.16b, {v4.16b}, v5.16b
+
+ // v1 follows on v0, with another 3 full repetitions of the pattern.
+ ext v1.16b, v0.16b, v0.16b, #1
+ ext v5.16b, v4.16b, v4.16b, #1
+ // v2 ends with 3 complete repetitions of the pattern.
+ ext v2.16b, v0.16b, v1.16b, #4
+ ext v6.16b, v4.16b, v5.16b, #4
+
+ blr x11
+ b.ge 4f // if (cand_b >= end)
+ mov v0.16b, v4.16b
+ mov v1.16b, v5.16b
+ mov v2.16b, v6.16b
+ cmp x9, x10
+ blr x15
+ b.lt 2b // if (cand_b < end)
+
+4:
+ subs w5, w5, #1 // h--
+ add w7, w7, #2 // y += 2
+ add x0, x0, x1 // rp += stride
+ b.gt 1b
+
+ ldp x29, x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+10:
+ AARCH64_VALID_CALL_TARGET
+ add x16, x3, #4
+ st1 {v0.s}[0], [x3]
+ st1 {v0.b}[4], [x16]
+ add x3, x3, #5
+ ret
+20:
+ AARCH64_VALID_CALL_TARGET
+ add x16, x3, #8
+ st1 {v0.d}[0], [x3]
+ st1 {v0.h}[4], [x16]
+ add x3, x3, #2*5
+ ret
+40:
+ AARCH64_VALID_CALL_TARGET
+ st1 {v0.16b}, [x3]
+ str s1, [x3, #16]
+ add x3, x3, #4*5
+ ret
+80:
+ AARCH64_VALID_CALL_TARGET
+ // This writes 6 full entries plus 2 extra bytes
+ st1 {v0.16b, v1.16b}, [x3]
+ // Write the last few, overlapping with the first write.
+ stur q2, [x3, #(8*5-16)]
+ add x3, x3, #8*5
+ ret
+160:
+ AARCH64_VALID_CALL_TARGET
+ add x16, x3, #6*5
+ add x17, x3, #12*5
+ // This writes 6 full entries plus 2 extra bytes
+ st1 {v0.16b, v1.16b}, [x3]
+ // Write another 6 full entries, slightly overlapping with the first set
+ st1 {v0.16b, v1.16b}, [x16]
+ // Write 8 bytes (one full entry) after the first 12
+ st1 {v0.8b}, [x17]
+ // Write the last 3 entries
+ str q2, [x3, #(16*5-16)]
+ add x3, x3, #16*5
+ ret
+
+L(save_tmvs_tbl):
+ .hword 16 * 12
+ .hword L(save_tmvs_tbl) - 160b
+ .hword 16 * 12
+ .hword L(save_tmvs_tbl) - 160b
+ .hword 8 * 12
+ .hword L(save_tmvs_tbl) - 80b
+ .hword 8 * 12
+ .hword L(save_tmvs_tbl) - 80b
+ .hword 8 * 12
+ .hword L(save_tmvs_tbl) - 80b
+ .hword 8 * 12
+ .hword L(save_tmvs_tbl) - 80b
+ .hword 4 * 12
+ .hword L(save_tmvs_tbl) - 40b
+ .hword 4 * 12
+ .hword L(save_tmvs_tbl) - 40b
+ .hword 4 * 12
+ .hword L(save_tmvs_tbl) - 40b
+ .hword 4 * 12
+ .hword L(save_tmvs_tbl) - 40b
+ .hword 2 * 12
+ .hword L(save_tmvs_tbl) - 20b
+ .hword 2 * 12
+ .hword L(save_tmvs_tbl) - 20b
+ .hword 2 * 12
+ .hword L(save_tmvs_tbl) - 20b
+ .hword 2 * 12
+ .hword L(save_tmvs_tbl) - 20b
+ .hword 2 * 12
+ .hword L(save_tmvs_tbl) - 20b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+endfunc
diff --git a/third_party/dav1d/src/arm/64/util.S b/third_party/dav1d/src/arm/64/util.S
new file mode 100644
index 0000000000..9013fd4b1e
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/util.S
@@ -0,0 +1,229 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2015 Martin Storsjo
+ * Copyright © 2015 Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#ifndef DAV1D_SRC_ARM_64_UTIL_S
+#define DAV1D_SRC_ARM_64_UTIL_S
+
+#include "config.h"
+#include "src/arm/asm.S"
+
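+// Load the address of \val (+ \offset) into \rd, using the PIC-safe sequence
+// appropriate for the target platform (Mach-O, COFF or ELF).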
+.macro movrel rd, val, offset=0
+#if defined(__APPLE__)
+ .if \offset < 0
+ adrp \rd, \val@PAGE
+ add \rd, \rd, \val@PAGEOFF
+ sub \rd, \rd, -(\offset)
+ .else
+ adrp \rd, \val+(\offset)@PAGE
+ add \rd, \rd, \val+(\offset)@PAGEOFF
+ .endif
+#elif defined(PIC) && defined(_WIN32)
+ .if \offset < 0
+ adrp \rd, \val
+ add \rd, \rd, :lo12:\val
+ sub \rd, \rd, -(\offset)
+ .else
+ adrp \rd, \val+(\offset)
+ add \rd, \rd, :lo12:\val+(\offset)
+ .endif
+#elif defined(PIC)
+ adrp \rd, \val+(\offset)
+ add \rd, \rd, :lo12:\val+(\offset)
+#else
+ ldr \rd, =\val+\offset
+#endif
+.endm
+
+.macro sub_sp space
+#ifdef _WIN32
+.if \space > 8192
+ // Here, we'd need to touch two (or more) pages while decrementing
+ // the stack pointer.
+ .error "sub_sp_align doesn't support values over 8K at the moment"
+.elseif \space > 4096
+ sub x16, sp, #4096
+ ldr xzr, [x16]
+ sub sp, x16, #(\space - 4096)
+.else
+ sub sp, sp, #\space
+.endif
+#else
+.if \space >= 4096
+ sub sp, sp, #(\space)/4096*4096
+.endif
+.if (\space % 4096) != 0
+ sub sp, sp, #(\space)%4096
+.endif
+#endif
+.endm
+
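+// Transpose an 8x8 block of bytes held in the low halves of \r0-\r7 and widen
+// each resulting row to 8 halfwords with \xtl (sxtl or uxtl).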
+.macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl
+ // a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7
+ zip1 \r0\().16b, \r0\().16b, \r1\().16b
+ // c0 d0 c1 d1 c2 d2 c3 d3 c4 d4 c5 d5 c6 d6 c7 d7
+ zip1 \r2\().16b, \r2\().16b, \r3\().16b
+ // e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7
+ zip1 \r4\().16b, \r4\().16b, \r5\().16b
+ // g0 h0 g1 h1 g2 h2 g3 h3 g4 h4 g5 h5 g6 h6 g7 h7
+ zip1 \r6\().16b, \r6\().16b, \r7\().16b
+
+ // a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6
+ trn1 \r1\().8h, \r0\().8h, \r2\().8h
+ // a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7
+ trn2 \r3\().8h, \r0\().8h, \r2\().8h
+ // e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6
+ trn1 \r5\().8h, \r4\().8h, \r6\().8h
+ // e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7
+ trn2 \r7\().8h, \r4\().8h, \r6\().8h
+
+ // a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4
+ trn1 \r0\().4s, \r1\().4s, \r5\().4s
+ // a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6
+ trn2 \r2\().4s, \r1\().4s, \r5\().4s
+ // a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5
+ trn1 \r1\().4s, \r3\().4s, \r7\().4s
+ // a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7
+ trn2 \r3\().4s, \r3\().4s, \r7\().4s
+
+ \xtl\()2 \r4\().8h, \r0\().16b
+ \xtl \r0\().8h, \r0\().8b
+ \xtl\()2 \r6\().8h, \r2\().16b
+ \xtl \r2\().8h, \r2\().8b
+ \xtl\()2 \r5\().8h, \r1\().16b
+ \xtl \r1\().8h, \r1\().8b
+ \xtl\()2 \r7\().8h, \r3\().16b
+ \xtl \r3\().8h, \r3\().8b
+.endm
+
+.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
+ trn1 \t8\().8h, \r0\().8h, \r1\().8h
+ trn2 \t9\().8h, \r0\().8h, \r1\().8h
+ trn1 \r1\().8h, \r2\().8h, \r3\().8h
+ trn2 \r3\().8h, \r2\().8h, \r3\().8h
+ trn1 \r0\().8h, \r4\().8h, \r5\().8h
+ trn2 \r5\().8h, \r4\().8h, \r5\().8h
+ trn1 \r2\().8h, \r6\().8h, \r7\().8h
+ trn2 \r7\().8h, \r6\().8h, \r7\().8h
+
+ trn1 \r4\().4s, \r0\().4s, \r2\().4s
+ trn2 \r2\().4s, \r0\().4s, \r2\().4s
+ trn1 \r6\().4s, \r5\().4s, \r7\().4s
+ trn2 \r7\().4s, \r5\().4s, \r7\().4s
+ trn1 \r5\().4s, \t9\().4s, \r3\().4s
+ trn2 \t9\().4s, \t9\().4s, \r3\().4s
+ trn1 \r3\().4s, \t8\().4s, \r1\().4s
+ trn2 \t8\().4s, \t8\().4s, \r1\().4s
+
+ trn1 \r0\().2d, \r3\().2d, \r4\().2d
+ trn2 \r4\().2d, \r3\().2d, \r4\().2d
+ trn1 \r1\().2d, \r5\().2d, \r6\().2d
+ trn2 \r5\().2d, \r5\().2d, \r6\().2d
+ trn2 \r6\().2d, \t8\().2d, \r2\().2d
+ trn1 \r2\().2d, \t8\().2d, \r2\().2d
+ trn1 \r3\().2d, \t9\().2d, \r7\().2d
+ trn2 \r7\().2d, \t9\().2d, \r7\().2d
+.endm
+
+.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
+ trn1 \t8\().16b, \r0\().16b, \r1\().16b
+ trn2 \t9\().16b, \r0\().16b, \r1\().16b
+ trn1 \r1\().16b, \r2\().16b, \r3\().16b
+ trn2 \r3\().16b, \r2\().16b, \r3\().16b
+ trn1 \r0\().16b, \r4\().16b, \r5\().16b
+ trn2 \r5\().16b, \r4\().16b, \r5\().16b
+ trn1 \r2\().16b, \r6\().16b, \r7\().16b
+ trn2 \r7\().16b, \r6\().16b, \r7\().16b
+
+ trn1 \r4\().8h, \r0\().8h, \r2\().8h
+ trn2 \r2\().8h, \r0\().8h, \r2\().8h
+ trn1 \r6\().8h, \r5\().8h, \r7\().8h
+ trn2 \r7\().8h, \r5\().8h, \r7\().8h
+ trn1 \r5\().8h, \t9\().8h, \r3\().8h
+ trn2 \t9\().8h, \t9\().8h, \r3\().8h
+ trn1 \r3\().8h, \t8\().8h, \r1\().8h
+ trn2 \t8\().8h, \t8\().8h, \r1\().8h
+
+ trn1 \r0\().4s, \r3\().4s, \r4\().4s
+ trn2 \r4\().4s, \r3\().4s, \r4\().4s
+ trn1 \r1\().4s, \r5\().4s, \r6\().4s
+ trn2 \r5\().4s, \r5\().4s, \r6\().4s
+ trn2 \r6\().4s, \t8\().4s, \r2\().4s
+ trn1 \r2\().4s, \t8\().4s, \r2\().4s
+ trn1 \r3\().4s, \t9\().4s, \r7\().4s
+ trn2 \r7\().4s, \t9\().4s, \r7\().4s
+.endm
+
+.macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
+ trn1 \t4\().16b, \r0\().16b, \r1\().16b
+ trn2 \t5\().16b, \r0\().16b, \r1\().16b
+ trn1 \t6\().16b, \r2\().16b, \r3\().16b
+ trn2 \t7\().16b, \r2\().16b, \r3\().16b
+
+ trn1 \r0\().8h, \t4\().8h, \t6\().8h
+ trn2 \r2\().8h, \t4\().8h, \t6\().8h
+ trn1 \r1\().8h, \t5\().8h, \t7\().8h
+ trn2 \r3\().8h, \t5\().8h, \t7\().8h
+.endm
+
+.macro transpose_4x4h r0, r1, r2, r3, t4, t5, t6, t7
+ trn1 \t4\().4h, \r0\().4h, \r1\().4h
+ trn2 \t5\().4h, \r0\().4h, \r1\().4h
+ trn1 \t6\().4h, \r2\().4h, \r3\().4h
+ trn2 \t7\().4h, \r2\().4h, \r3\().4h
+
+ trn1 \r0\().2s, \t4\().2s, \t6\().2s
+ trn2 \r2\().2s, \t4\().2s, \t6\().2s
+ trn1 \r1\().2s, \t5\().2s, \t7\().2s
+ trn2 \r3\().2s, \t5\().2s, \t7\().2s
+.endm
+
+.macro transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7
+ trn1 \t4\().4s, \r0\().4s, \r1\().4s
+ trn2 \t5\().4s, \r0\().4s, \r1\().4s
+ trn1 \t6\().4s, \r2\().4s, \r3\().4s
+ trn2 \t7\().4s, \r2\().4s, \r3\().4s
+
+ trn1 \r0\().2d, \t4\().2d, \t6\().2d
+ trn2 \r2\().2d, \t4\().2d, \t6\().2d
+ trn1 \r1\().2d, \t5\().2d, \t7\().2d
+ trn2 \r3\().2d, \t5\().2d, \t7\().2d
+.endm
+
+.macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7
+ trn1 \t4\().8h, \r0\().8h, \r1\().8h
+ trn2 \t5\().8h, \r0\().8h, \r1\().8h
+ trn1 \t6\().8h, \r2\().8h, \r3\().8h
+ trn2 \t7\().8h, \r2\().8h, \r3\().8h
+
+ trn1 \r0\().4s, \t4\().4s, \t6\().4s
+ trn2 \r2\().4s, \t4\().4s, \t6\().4s
+ trn1 \r1\().4s, \t5\().4s, \t7\().4s
+ trn2 \r3\().4s, \t5\().4s, \t7\().4s
+.endm
+
+#endif /* DAV1D_SRC_ARM_64_UTIL_S */
diff --git a/third_party/dav1d/src/arm/asm-offsets.h b/third_party/dav1d/src/arm/asm-offsets.h
new file mode 100644
index 0000000000..2f3c3caa1f
--- /dev/null
+++ b/third_party/dav1d/src/arm/asm-offsets.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ARM_ASM_OFFSETS_H
+#define ARM_ASM_OFFSETS_H
+
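+/* Byte offsets of the film grain parameter fields (Dav1dFilmGrainData) as
+ * consumed by the ARM film grain assembly; they must stay in sync with the
+ * C struct layout. */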
+#define FGD_SEED 0
+#define FGD_AR_COEFF_LAG 92
+#define FGD_AR_COEFFS_Y 96
+#define FGD_AR_COEFFS_UV 120
+#define FGD_AR_COEFF_SHIFT 176
+#define FGD_GRAIN_SCALE_SHIFT 184
+
+#define FGD_SCALING_SHIFT 88
+#define FGD_UV_MULT 188
+#define FGD_UV_LUMA_MULT 196
+#define FGD_UV_OFFSET 204
+#define FGD_CLIP_TO_RESTRICTED_RANGE 216
+
+#endif /* ARM_ASM_OFFSETS_H */
diff --git a/third_party/dav1d/src/arm/asm.S b/third_party/dav1d/src/arm/asm.S
new file mode 100644
index 0000000000..dc50415f1f
--- /dev/null
+++ b/third_party/dav1d/src/arm/asm.S
@@ -0,0 +1,291 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ARM_ASM_S
+#define DAV1D_SRC_ARM_ASM_S
+
+#include "config.h"
+
+#if ARCH_AARCH64
+#define x18 do_not_use_x18
+#define w18 do_not_use_w18
+
+/* Support macros for
+ * - Armv8.3-A Pointer Authentication and
+ * - Armv8.5-A Branch Target Identification
+ * features which require emitting a .note.gnu.property section with the
+ * appropriate architecture-dependent feature bits set.
+ *
+ * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to
+ * PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be
+ * used immediately before saving the LR register (x30) to the stack.
+ * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring
+ * it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone
+ * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also
+ * have the same value at the two points. For example:
+ *
+ * .global f
+ * f:
+ * AARCH64_SIGN_LINK_REGISTER
+ * stp x29, x30, [sp, #-96]!
+ * mov x29, sp
+ * ...
+ * ldp x29, x30, [sp], #96
+ * AARCH64_VALIDATE_LINK_REGISTER
+ * ret
+ *
+ * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or
+ * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an
+ * indirect call target. In particular, all symbols exported from a file must
+ * begin with one of these macros. For example, a leaf function that does not
+ * save LR can instead use |AARCH64_VALID_CALL_TARGET|:
+ *
+ * .globl return_zero
+ * return_zero:
+ * AARCH64_VALID_CALL_TARGET
+ * mov x0, #0
+ * ret
+ *
+ * A non-leaf function which does not immediately save LR may need both macros
+ * because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function
+ * may jump to an alternate implementation before setting up the stack:
+ *
+ * .globl with_early_jump
+ * with_early_jump:
+ * AARCH64_VALID_CALL_TARGET
+ * cmp x0, #128
+ * b.lt .Lwith_early_jump_128
+ * AARCH64_SIGN_LINK_REGISTER
+ * stp x29, x30, [sp, #-96]!
+ * mov x29, sp
+ * ...
+ * ldp x29, x30, [sp], #96
+ * AARCH64_VALIDATE_LINK_REGISTER
+ * ret
+ *
+ * .Lwith_early_jump_128:
+ * ...
+ * ret
+ *
+ * These annotations are only required with indirect calls. Private symbols that
+ * are only the target of direct calls do not require annotations. Also note
+ * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not
+ * indirect jumps (BR). Indirect jumps in assembly are supported through
+ * |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and
+ * calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|.
+ *
+ * Although not necessary, it is safe to use these macros in 32-bit ARM
+ * assembly. This may be used to simplify dual 32-bit and 64-bit files.
+ *
+ * References:
+ * - "ELF for the Arm® 64-bit Architecture"
+ * https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst
+ * - "Providing protection for complex software"
+ * https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software
+ */
+#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1)
+#define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification
+#define AARCH64_VALID_JUMP_CALL_TARGET hint #38 // BTI 'jc'
+#define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c'
+#define AARCH64_VALID_JUMP_TARGET hint #36 // BTI 'j'
+#else
+#define GNU_PROPERTY_AARCH64_BTI 0 // No Branch Target Identification
+#define AARCH64_VALID_JUMP_CALL_TARGET
+#define AARCH64_VALID_CALL_TARGET
+#define AARCH64_VALID_JUMP_TARGET
+#endif
+
+#if defined(__ARM_FEATURE_PAC_DEFAULT)
+
+#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A
+#define AARCH64_SIGN_LINK_REGISTER paciasp
+#define AARCH64_VALIDATE_LINK_REGISTER autiasp
+#elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B
+#define AARCH64_SIGN_LINK_REGISTER pacibsp
+#define AARCH64_VALIDATE_LINK_REGISTER autibsp
+#else
+#error Pointer authentication defines no valid key!
+#endif
+#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) // authentication of leaf functions
+#error Authentication of leaf functions is enabled but not supported in dav1d!
+#endif
+#define GNU_PROPERTY_AARCH64_PAC (1 << 1)
+
+#elif defined(__APPLE__) && defined(__arm64e__)
+
+#define GNU_PROPERTY_AARCH64_PAC 0
+#define AARCH64_SIGN_LINK_REGISTER pacibsp
+#define AARCH64_VALIDATE_LINK_REGISTER autibsp
+
+#else /* __ARM_FEATURE_PAC_DEFAULT */
+
+#define GNU_PROPERTY_AARCH64_PAC 0
+#define AARCH64_SIGN_LINK_REGISTER
+#define AARCH64_VALIDATE_LINK_REGISTER
+
+#endif /* !__ARM_FEATURE_PAC_DEFAULT */
+
+
+#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__)
+ .pushsection .note.gnu.property, "a"
+ .balign 8
+ .long 4
+ .long 0x10
+ .long 0x5
+ .asciz "GNU"
+ .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
+ .long 4
+ .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC)
+ .long 0
+ .popsection
+#endif /* (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) */
+#endif /* ARCH_AARCH64 */
+
+#if ARCH_ARM
+ .syntax unified
+#ifdef __ELF__
+ .arch armv7-a
+ .fpu neon
+ .eabi_attribute 10, 0 // suppress Tag_FP_arch
+ .eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch
+ .section .note.GNU-stack,"",%progbits // Mark stack as non-executable
+#endif /* __ELF__ */
+
+#ifdef _WIN32
+#define CONFIG_THUMB 1
+#else
+#define CONFIG_THUMB 0
+#endif
+
+#if CONFIG_THUMB
+ .thumb
+#define A @
+#define T
+#else
+#define A
+#define T @
+#endif /* CONFIG_THUMB */
+#endif /* ARCH_ARM */
+
+#if !defined(PIC)
+#if defined(__PIC__)
+#define PIC __PIC__
+#elif defined(__pic__)
+#define PIC __pic__
+#endif
+#endif
+
+#ifndef PRIVATE_PREFIX
+#define PRIVATE_PREFIX dav1d_
+#endif
+
+#define PASTE(a,b) a ## b
+#define CONCAT(a,b) PASTE(a,b)
+
+#ifdef PREFIX
+#define EXTERN CONCAT(_,PRIVATE_PREFIX)
+#else
+#define EXTERN PRIVATE_PREFIX
+#endif
+
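+// Define a function symbol; export=1 additionally makes it global and hidden
+// (or private_extern on Mach-O), prefixed through EXTERN, and, on AArch64,
+// emits the BTI landing pad for exported symbols.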
+.macro function name, export=0, align=2
+ .macro endfunc
+#ifdef __ELF__
+ .size \name, . - \name
+#endif
+#if HAVE_AS_FUNC
+ .endfunc
+#endif
+ .purgem endfunc
+ .endm
+ .text
+ .align \align
+ .if \export
+ .global EXTERN\name
+#ifdef __ELF__
+ .type EXTERN\name, %function
+ .hidden EXTERN\name
+#elif defined(__MACH__)
+ .private_extern EXTERN\name
+#endif
+#if HAVE_AS_FUNC
+ .func EXTERN\name
+#endif
+EXTERN\name:
+ .else
+#ifdef __ELF__
+ .type \name, %function
+#endif
+#if HAVE_AS_FUNC
+ .func \name
+#endif
+ .endif
+\name:
+#if ARCH_AARCH64
+ .if \export
+ AARCH64_VALID_CALL_TARGET
+ .endif
+#endif
+.endm
+
+.macro const name, export=0, align=2
+ .macro endconst
+#ifdef __ELF__
+ .size \name, . - \name
+#endif
+ .purgem endconst
+ .endm
+#if defined(_WIN32)
+ .section .rdata
+#elif !defined(__MACH__)
+ .section .rodata
+#else
+ .const_data
+#endif
+ .align \align
+ .if \export
+ .global EXTERN\name
+#ifdef __ELF__
+ .hidden EXTERN\name
+#elif defined(__MACH__)
+ .private_extern EXTERN\name
+#endif
+EXTERN\name:
+ .endif
+\name:
+.endm
+
+#ifdef __APPLE__
+#define L(x) L ## x
+#else
+#define L(x) .L ## x
+#endif
+
+#define X(x) CONCAT(EXTERN, x)
+
+
+#endif /* DAV1D_SRC_ARM_ASM_S */
diff --git a/third_party/dav1d/src/arm/cdef.h b/third_party/dav1d/src/arm/cdef.h
new file mode 100644
index 0000000000..2e8c8ab6fb
--- /dev/null
+++ b/third_party/dav1d/src/arm/cdef.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/cdef.h"
+
+decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon));
+
+void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src,
+ ptrdiff_t src_stride, const pixel (*left)[2],
+ const pixel *const top,
+ const pixel *const bottom, int h,
+ enum CdefEdgeFlags edges);
+void BF(dav1d_cdef_padding8, neon)(uint16_t *tmp, const pixel *src,
+ ptrdiff_t src_stride, const pixel (*left)[2],
+ const pixel *const top,
+ const pixel *const bottom, int h,
+ enum CdefEdgeFlags edges);
+
+// Edges are passed to this function so that it can switch to a more
+// optimized version for the fully-edged case. size_t is used for edges
+// to avoid ABI differences when passing more than one argument on the stack.
+void BF(dav1d_cdef_filter4, neon)(pixel *dst, ptrdiff_t dst_stride,
+ const uint16_t *tmp, int pri_strength,
+ int sec_strength, int dir, int damping, int h,
+ size_t edges HIGHBD_DECL_SUFFIX);
+void BF(dav1d_cdef_filter8, neon)(pixel *dst, ptrdiff_t dst_stride,
+ const uint16_t *tmp, int pri_strength,
+ int sec_strength, int dir, int damping, int h,
+ size_t edges HIGHBD_DECL_SUFFIX);
+
+#define DEFINE_FILTER(w, h, tmp_stride) \
+static void \
+cdef_filter_##w##x##h##_neon(pixel *dst, const ptrdiff_t stride, \
+ const pixel (*left)[2], \
+ const pixel *const top, \
+ const pixel *const bottom, \
+ const int pri_strength, const int sec_strength, \
+ const int dir, const int damping, \
+ const enum CdefEdgeFlags edges \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
+ uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8; \
+ BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride, \
+ left, top, bottom, h, edges); \
+ BF(dav1d_cdef_filter##w, neon)(dst, stride, tmp, pri_strength, \
+ sec_strength, dir, damping, h, edges \
+ HIGHBD_TAIL_SUFFIX); \
+}
+
+DEFINE_FILTER(8, 8, 16)
+DEFINE_FILTER(4, 8, 8)
+DEFINE_FILTER(4, 4, 8)
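+
+// For reference, DEFINE_FILTER(8, 8, 16) above expands to roughly:
+//
+//   static void
+//   cdef_filter_8x8_neon(pixel *dst, const ptrdiff_t stride,
+//                        const pixel (*left)[2], const pixel *const top,
+//                        const pixel *const bottom, const int pri_strength,
+//                        const int sec_strength, const int dir,
+//                        const int damping, const enum CdefEdgeFlags edges
+//                        HIGHBD_DECL_SUFFIX)
+//   {
+//       ALIGN_STK_16(uint16_t, tmp_buf, 12 * 16 + 8,);
+//       uint16_t *tmp = tmp_buf + 2 * 16 + 8;
+//       BF(dav1d_cdef_padding8, neon)(tmp, dst, stride, left, top, bottom, 8, edges);
+//       BF(dav1d_cdef_filter8, neon)(dst, stride, tmp, pri_strength,
+//                                    sec_strength, dir, damping, 8, edges HIGHBD_TAIL_SUFFIX);
+//   }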
+
+static ALWAYS_INLINE void cdef_dsp_init_arm(Dav1dCdefDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->dir = BF(dav1d_cdef_find_dir, neon);
+ c->fb[0] = cdef_filter_8x8_neon;
+ c->fb[1] = cdef_filter_4x8_neon;
+ c->fb[2] = cdef_filter_4x4_neon;
+}
diff --git a/third_party/dav1d/src/arm/cpu.c b/third_party/dav1d/src/arm/cpu.c
new file mode 100644
index 0000000000..b7a0d3adbc
--- /dev/null
+++ b/third_party/dav1d/src/arm/cpu.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/attributes.h"
+
+#include "src/arm/cpu.h"
+
+#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
+// NEON is always available; runtime tests are not needed.
+#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
+#include <sys/auxv.h>
+
+#ifndef HWCAP_ARM_NEON
+#define HWCAP_ARM_NEON (1 << 12)
+#endif
+#define NEON_HWCAP HWCAP_ARM_NEON
+
+#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
+#include <sys/auxv.h>
+
+#define NEON_HWCAP HWCAP_NEON
+
+#elif defined(__ANDROID__)
+#include <stdio.h>
+#include <string.h>
+
+static unsigned parse_proc_cpuinfo(const char *flag) {
+ FILE *file = fopen("/proc/cpuinfo", "r");
+ if (!file)
+ return 0;
+
+ char line_buffer[120];
+ const char *line;
+
+ while ((line = fgets(line_buffer, sizeof(line_buffer), file))) {
+ if (strstr(line, flag)) {
+ fclose(file);
+ return 1;
+ }
+ // if the line is incomplete, seek back to avoid splitting the
+ // search string across two buffers
+ if (!strchr(line, '\n') && strlen(line) > strlen(flag)) {
+ // use fseek, since the 64-bit fseeko is only available since
+ // Android API level 24 and meson defines _FILE_OFFSET_BITS
+ // to 64 by default
+ if (fseek(file, -strlen(flag), SEEK_CUR))
+ break;
+ }
+ }
+
+ fclose(file);
+
+ return 0;
+}
+#endif
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+ unsigned flags = 0;
+#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
+ flags |= DAV1D_ARM_CPU_FLAG_NEON;
+#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
+ unsigned long hw_cap = getauxval(AT_HWCAP);
+ flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
+ unsigned long hw_cap = 0;
+ elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+ flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+#elif defined(__ANDROID__)
+ flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+#endif
+
+ return flags;
+}
diff --git a/third_party/dav1d/src/arm/cpu.h b/third_party/dav1d/src/arm/cpu.h
new file mode 100644
index 0000000000..8c10a1b6b0
--- /dev/null
+++ b/third_party/dav1d/src/arm/cpu.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ARM_CPU_H
+#define DAV1D_SRC_ARM_CPU_H
+
+enum CpuFlags {
+ DAV1D_ARM_CPU_FLAG_NEON = 1 << 0,
+};
+
+unsigned dav1d_get_cpu_flags_arm(void);
+
+#endif /* DAV1D_SRC_ARM_CPU_H */
diff --git a/third_party/dav1d/src/arm/filmgrain.h b/third_party/dav1d/src/arm/filmgrain.h
new file mode 100644
index 0000000000..9f51b0310f
--- /dev/null
+++ b/third_party/dav1d/src/arm/filmgrain.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright © 2018, Niklas Haas
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/filmgrain.h"
+#include "asm-offsets.h"
+
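+// These compile-time checks keep the hand-written constants in asm-offsets.h
+// in sync with the Dav1dFilmGrainData layout that the NEON asm indexes
+// directly.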
+CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_lag, FGD_AR_COEFF_LAG);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_y, FGD_AR_COEFFS_Y);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_uv, FGD_AR_COEFFS_UV);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_shift, FGD_AR_COEFF_SHIFT);
+CHECK_OFFSET(Dav1dFilmGrainData, grain_scale_shift, FGD_GRAIN_SCALE_SHIFT);
+
+CHECK_OFFSET(Dav1dFilmGrainData, scaling_shift, FGD_SCALING_SHIFT);
+CHECK_OFFSET(Dav1dFilmGrainData, uv_mult, FGD_UV_MULT);
+CHECK_OFFSET(Dav1dFilmGrainData, uv_luma_mult, FGD_UV_LUMA_MULT);
+CHECK_OFFSET(Dav1dFilmGrainData, uv_offset, FGD_UV_OFFSET);
+CHECK_OFFSET(Dav1dFilmGrainData, clip_to_restricted_range, FGD_CLIP_TO_RESTRICTED_RANGE);
+
+void BF(dav1d_generate_grain_y, neon)(entry buf[][GRAIN_WIDTH],
+ const Dav1dFilmGrainData *const data
+ HIGHBD_DECL_SUFFIX);
+
+#define GEN_GRAIN_UV(suff) \
+void BF(dav1d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \
+ const entry buf_y[][GRAIN_WIDTH], \
+ const Dav1dFilmGrainData *const data, \
+ const intptr_t uv \
+ HIGHBD_DECL_SUFFIX)
+
+GEN_GRAIN_UV(420);
+GEN_GRAIN_UV(422);
+GEN_GRAIN_UV(444);
+
+// Use ptrdiff_t instead of int for the last few parameters, to get the
+// same layout of parameters on the stack across platforms.
+void BF(dav1d_fgy_32x32, neon)(pixel *const dst,
+ const pixel *const src,
+ const ptrdiff_t stride,
+ const uint8_t scaling[SCALING_SIZE],
+ const int scaling_shift,
+ const entry grain_lut[][GRAIN_WIDTH],
+ const int offsets[][2],
+ const int h, const ptrdiff_t clip,
+ const ptrdiff_t type
+ HIGHBD_DECL_SUFFIX);
+
+static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row,
+ const ptrdiff_t stride,
+ const Dav1dFilmGrainData *const data, const size_t pw,
+ const uint8_t scaling[SCALING_SIZE],
+ const entry grain_lut[][GRAIN_WIDTH],
+ const int bh, const int row_num HIGHBD_DECL_SUFFIX)
+{
+ const int rows = 1 + (data->overlap_flag && row_num > 0);
+
+ // seed[0] contains the current row, seed[1] contains the previous
+ unsigned seed[2];
+ for (int i = 0; i < rows; i++) {
+ seed[i] = data->seed;
+ seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8;
+ seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
+ }
+
+ int offsets[2 /* col offset */][2 /* row offset */];
+
+ // process this row in FG_BLOCK_SIZE^2 blocks
+ for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE) {
+
+ if (data->overlap_flag && bx) {
+ // shift previous offsets left
+ for (int i = 0; i < rows; i++)
+ offsets[1][i] = offsets[0][i];
+ }
+
+ // update current offsets
+ for (int i = 0; i < rows; i++)
+ offsets[0][i] = get_random_number(8, &seed[i]);
+
+ int type = 0;
+ if (data->overlap_flag && row_num)
+ type |= 1; /* overlap y */
+ if (data->overlap_flag && bx)
+ type |= 2; /* overlap x */
+
+ BF(dav1d_fgy_32x32, neon)(dst_row + bx, src_row + bx, stride,
+ scaling, data->scaling_shift,
+ grain_lut, offsets, bh,
+ data->clip_to_restricted_range, type
+ HIGHBD_TAIL_SUFFIX);
+ }
+}
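+
+// Illustrative sketch (not part of the dav1d API): the per-row grain seed
+// computed in the loop above, written out as a standalone helper.
+//
+//   static unsigned fg_row_seed(const unsigned frame_seed, const int row) {
+//       unsigned seed = frame_seed;
+//       seed ^= (((row * 37) + 178) & 0xFF) << 8;
+//       seed ^= ((row * 173) + 105) & 0xFF;
+//       return seed; // seed[i] above == fg_row_seed(data->seed, row_num - i)
+//   }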
+
+// Use ptrdiff_t instead of int for the last few parameters, to get the
+// parameters on the stack with the same layout across platforms.
+#define FGUV(nm, sx, sy) \
+void BF(dav1d_fguv_32x32_##nm, neon)(pixel *const dst, \
+ const pixel *const src, \
+ const ptrdiff_t stride, \
+ const uint8_t scaling[SCALING_SIZE], \
+ const Dav1dFilmGrainData *const data, \
+ const entry grain_lut[][GRAIN_WIDTH], \
+ const pixel *const luma_row, \
+ const ptrdiff_t luma_stride, \
+ const int offsets[][2], \
+ const ptrdiff_t h, const ptrdiff_t uv, \
+ const ptrdiff_t is_id, \
+ const ptrdiff_t type \
+ HIGHBD_DECL_SUFFIX); \
+static void \
+fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
+ const ptrdiff_t stride, const Dav1dFilmGrainData *const data, \
+ const size_t pw, const uint8_t scaling[SCALING_SIZE], \
+ const entry grain_lut[][GRAIN_WIDTH], const int bh, \
+ const int row_num, const pixel *const luma_row, \
+ const ptrdiff_t luma_stride, const int uv, const int is_id \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ const int rows = 1 + (data->overlap_flag && row_num > 0); \
+ \
+ /* seed[0] contains the current row, seed[1] contains the previous */ \
+ unsigned seed[2]; \
+ for (int i = 0; i < rows; i++) { \
+ seed[i] = data->seed; \
+ seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; \
+ seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); \
+ } \
+ \
+ int offsets[2 /* col offset */][2 /* row offset */]; \
+ \
+ /* process this row in FG_BLOCK_SIZE^2 blocks (subsampled) */ \
+ for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) { \
+ if (data->overlap_flag && bx) { \
+ /* shift previous offsets left */ \
+ for (int i = 0; i < rows; i++) \
+ offsets[1][i] = offsets[0][i]; \
+ } \
+ \
+ /* update current offsets */ \
+ for (int i = 0; i < rows; i++) \
+ offsets[0][i] = get_random_number(8, &seed[i]); \
+ \
+ int type = 0; \
+ if (data->overlap_flag && row_num) \
+ type |= 1; /* overlap y */ \
+ if (data->overlap_flag && bx) \
+ type |= 2; /* overlap x */ \
+ if (data->chroma_scaling_from_luma) \
+ type |= 4; \
+ \
+ BF(dav1d_fguv_32x32_##nm, neon)(dst_row + bx, src_row + bx, stride, \
+ scaling, data, grain_lut, \
+ luma_row + (bx << sx), luma_stride, \
+ offsets, bh, uv, is_id, type \
+ HIGHBD_TAIL_SUFFIX); \
+ } \
+}
+
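+// Instantiate the chroma wrappers; the sx/sy arguments are the horizontal and
+// vertical chroma subsampling shifts for each layout.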
+FGUV(420, 1, 1);
+FGUV(422, 1, 0);
+FGUV(444, 0, 0);
+
+static ALWAYS_INLINE void film_grain_dsp_init_arm(Dav1dFilmGrainDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->generate_grain_y = BF(dav1d_generate_grain_y, neon);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, neon);
+
+ c->fgy_32x32xn = fgy_32x32xn_neon;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_neon;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_neon;
+}
diff --git a/third_party/dav1d/src/arm/ipred.h b/third_party/dav1d/src/arm/ipred.h
new file mode 100644
index 0000000000..9c2aae748d
--- /dev/null
+++ b/third_party/dav1d/src/arm/ipred.h
@@ -0,0 +1,326 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+decl_angular_ipred_fn(BF(dav1d_ipred_dc, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_128, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_top, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_left, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_h, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_v, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_paeth, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth_v, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth_h, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_filter, neon));
+
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_128, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_top, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_left, neon));
+
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon));
+
+decl_pal_pred_fn(BF(dav1d_pal_pred, neon));
+
+#if ARCH_AARCH64
+void BF(dav1d_ipred_z1_upsample_edge, neon)(pixel *out, const int hsz,
+ const pixel *const in,
+ const int end HIGHBD_DECL_SUFFIX);
+void BF(dav1d_ipred_z1_filter_edge, neon)(pixel *out, const int sz,
+ const pixel *const in,
+ const int end, const int strength);
+void BF(dav1d_ipred_pixel_set, neon)(pixel *out, const pixel px,
+ const int n);
+void BF(dav1d_ipred_z1_fill1, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const top, const int width,
+ const int height, const int dx,
+ const int max_base_x);
+void BF(dav1d_ipred_z1_fill2, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const top, const int width,
+ const int height, const int dx,
+ const int max_base_x);
+
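+// In ipred_z1/z2/z3 below, the 'angle' argument arrives packed: bits 0..8 hold
+// the prediction angle in degrees, bit 9 the is_sm flag and bit 10 the
+// intra-edge-filter enable, hence the unpacking at the top of each function.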
+static void ipred_z1_neon(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const int is_sm = (angle >> 9) & 0x1;
+ const int enable_intra_edge_filter = angle >> 10;
+ angle &= 511;
+ int dx = dav1d_dr_intra_derivative[angle >> 1];
+ pixel top_out[64 + 64 + (64+15)*2 + 16];
+ int max_base_x;
+ const int upsample_above = enable_intra_edge_filter ?
+ get_upsample(width + height, 90 - angle, is_sm) : 0;
+ if (upsample_above) {
+ BF(dav1d_ipred_z1_upsample_edge, neon)(top_out, width + height,
+ topleft_in,
+ width + imin(width, height)
+ HIGHBD_TAIL_SUFFIX);
+ max_base_x = 2 * (width + height) - 2;
+ dx <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, 90 - angle, is_sm) : 0;
+ if (filter_strength) {
+ BF(dav1d_ipred_z1_filter_edge, neon)(top_out, width + height,
+ topleft_in,
+ width + imin(width, height),
+ filter_strength);
+ max_base_x = width + height - 1;
+ } else {
+ max_base_x = width + imin(width, height) - 1;
+ memcpy(top_out, &topleft_in[1], (max_base_x + 1) * sizeof(pixel));
+ }
+ }
+ const int base_inc = 1 + upsample_above;
+ int pad_pixels = width + 15; // max(dx >> 6) == 15
+ BF(dav1d_ipred_pixel_set, neon)(&top_out[max_base_x + 1],
+ top_out[max_base_x], pad_pixels * base_inc);
+ if (upsample_above)
+ BF(dav1d_ipred_z1_fill2, neon)(dst, stride, top_out, width, height,
+ dx, max_base_x);
+ else
+ BF(dav1d_ipred_z1_fill1, neon)(dst, stride, top_out, width, height,
+ dx, max_base_x);
+}
+
+void BF(dav1d_ipred_reverse, neon)(pixel *dst, const pixel *const src,
+ const int n);
+
+void BF(dav1d_ipred_z2_upsample_edge, neon)(pixel *out, const int sz,
+ const pixel *const in
+ HIGHBD_DECL_SUFFIX);
+
+void BF(dav1d_ipred_z2_fill1, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const top,
+ const pixel *const left,
+ const int width, const int height,
+ const int dx, const int dy);
+void BF(dav1d_ipred_z2_fill2, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const top,
+ const pixel *const left,
+ const int width, const int height,
+ const int dx, const int dy);
+void BF(dav1d_ipred_z2_fill3, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const top,
+ const pixel *const left,
+ const int width, const int height,
+ const int dx, const int dy);
+
+static void ipred_z2_neon(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const int is_sm = (angle >> 9) & 0x1;
+ const int enable_intra_edge_filter = angle >> 10;
+ angle &= 511;
+ assert(angle > 90 && angle < 180);
+ int dy = dav1d_dr_intra_derivative[(angle - 90) >> 1];
+ int dx = dav1d_dr_intra_derivative[(180 - angle) >> 1];
+ const int upsample_left = enable_intra_edge_filter ?
+ get_upsample(width + height, 180 - angle, is_sm) : 0;
+ const int upsample_above = enable_intra_edge_filter ?
+ get_upsample(width + height, angle - 90, is_sm) : 0;
+ pixel buf[3*(64+1)];
+ pixel *left = &buf[2*(64+1)];
+ // The asm can underread below the start of top[] and left[]; to avoid
+ // surprising behaviour, make sure this is within the allocated stack space.
+ pixel *top = &buf[1*(64+1)];
+ pixel *flipped = &buf[0*(64+1)];
+
+ if (upsample_above) {
+ BF(dav1d_ipred_z2_upsample_edge, neon)(top, width, topleft_in
+ HIGHBD_TAIL_SUFFIX);
+ dx <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, angle - 90, is_sm) : 0;
+
+ if (filter_strength) {
+ BF(dav1d_ipred_z1_filter_edge, neon)(&top[1], imin(max_width, width),
+ topleft_in, width,
+ filter_strength);
+ if (max_width < width)
+ memcpy(&top[1 + max_width], &topleft_in[1 + max_width],
+ (width - max_width) * sizeof(pixel));
+ } else {
+ pixel_copy(&top[1], &topleft_in[1], width);
+ }
+ }
+ if (upsample_left) {
+ flipped[0] = topleft_in[0];
+ BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
+ height);
+ BF(dav1d_ipred_z2_upsample_edge, neon)(left, height, flipped
+ HIGHBD_TAIL_SUFFIX);
+ dy <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, 180 - angle, is_sm) : 0;
+
+ if (filter_strength) {
+ flipped[0] = topleft_in[0];
+ BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
+ height);
+ BF(dav1d_ipred_z1_filter_edge, neon)(&left[1], imin(max_height, height),
+ flipped, height,
+ filter_strength);
+ if (max_height < height)
+ memcpy(&left[1 + max_height], &flipped[1 + max_height],
+ (height - max_height) * sizeof(pixel));
+ } else {
+ BF(dav1d_ipred_reverse, neon)(&left[1], &topleft_in[0],
+ height);
+ }
+ }
+ top[0] = left[0] = *topleft_in;
+
+ assert(!(upsample_above && upsample_left));
+ if (!upsample_above && !upsample_left) {
+ BF(dav1d_ipred_z2_fill1, neon)(dst, stride, top, left, width, height,
+ dx, dy);
+ } else if (upsample_above) {
+ BF(dav1d_ipred_z2_fill2, neon)(dst, stride, top, left, width, height,
+ dx, dy);
+ } else /*if (upsample_left)*/ {
+ BF(dav1d_ipred_z2_fill3, neon)(dst, stride, top, left, width, height,
+ dx, dy);
+ }
+}
+
+void BF(dav1d_ipred_z3_fill1, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const left, const int width,
+ const int height, const int dy,
+ const int max_base_y);
+void BF(dav1d_ipred_z3_fill2, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const left, const int width,
+ const int height, const int dy,
+ const int max_base_y);
+
+static void ipred_z3_neon(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const int is_sm = (angle >> 9) & 0x1;
+ const int enable_intra_edge_filter = angle >> 10;
+ angle &= 511;
+ assert(angle > 180);
+ int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1];
+ pixel flipped[64 + 64 + 16];
+ pixel left_out[64 + 64 + (64+15)*2];
+ int max_base_y;
+ const int upsample_left = enable_intra_edge_filter ?
+ get_upsample(width + height, angle - 180, is_sm) : 0;
+ if (upsample_left) {
+ flipped[0] = topleft_in[0];
+ BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
+ height + imax(width, height));
+ BF(dav1d_ipred_z1_upsample_edge, neon)(left_out, width + height,
+ flipped,
+ height + imin(width, height)
+ HIGHBD_TAIL_SUFFIX);
+ max_base_y = 2 * (width + height) - 2;
+ dy <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, angle - 180, is_sm) : 0;
+
+ if (filter_strength) {
+ flipped[0] = topleft_in[0];
+ BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
+ height + imax(width, height));
+ BF(dav1d_ipred_z1_filter_edge, neon)(left_out, width + height,
+ flipped,
+ height + imin(width, height),
+ filter_strength);
+ max_base_y = width + height - 1;
+ } else {
+ BF(dav1d_ipred_reverse, neon)(left_out, &topleft_in[0],
+ height + imin(width, height));
+ max_base_y = height + imin(width, height) - 1;
+ }
+ }
+ const int base_inc = 1 + upsample_left;
+ // The tbx-based implementation needs left[] to have 64 bytes initialized;
+ // the other implementation can read height + max(dy >> 6) past the end.
+ int pad_pixels = imax(64 - max_base_y - 1, height + 15);
+
+ BF(dav1d_ipred_pixel_set, neon)(&left_out[max_base_y + 1],
+ left_out[max_base_y], pad_pixels * base_inc);
+ if (upsample_left)
+ BF(dav1d_ipred_z3_fill2, neon)(dst, stride, left_out, width, height,
+ dy, max_base_y);
+ else
+ BF(dav1d_ipred_z3_fill1, neon)(dst, stride, left_out, width, height,
+ dy, max_base_y);
+}
+#endif
+
+static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, neon);
+ c->intra_pred[DC_128_PRED] = BF(dav1d_ipred_dc_128, neon);
+ c->intra_pred[TOP_DC_PRED] = BF(dav1d_ipred_dc_top, neon);
+ c->intra_pred[LEFT_DC_PRED] = BF(dav1d_ipred_dc_left, neon);
+ c->intra_pred[HOR_PRED] = BF(dav1d_ipred_h, neon);
+ c->intra_pred[VERT_PRED] = BF(dav1d_ipred_v, neon);
+ c->intra_pred[PAETH_PRED] = BF(dav1d_ipred_paeth, neon);
+ c->intra_pred[SMOOTH_PRED] = BF(dav1d_ipred_smooth, neon);
+ c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon);
+ c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon);
+#if ARCH_AARCH64
+ c->intra_pred[Z1_PRED] = ipred_z1_neon;
+ c->intra_pred[Z2_PRED] = ipred_z2_neon;
+ c->intra_pred[Z3_PRED] = ipred_z3_neon;
+#endif
+ c->intra_pred[FILTER_PRED] = BF(dav1d_ipred_filter, neon);
+
+ c->cfl_pred[DC_PRED] = BF(dav1d_ipred_cfl, neon);
+ c->cfl_pred[DC_128_PRED] = BF(dav1d_ipred_cfl_128, neon);
+ c->cfl_pred[TOP_DC_PRED] = BF(dav1d_ipred_cfl_top, neon);
+ c->cfl_pred[LEFT_DC_PRED] = BF(dav1d_ipred_cfl_left, neon);
+
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon);
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon);
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon);
+
+ c->pal_pred = BF(dav1d_pal_pred, neon);
+}
diff --git a/third_party/dav1d/src/arm/itx.h b/third_party/dav1d/src/arm/itx.h
new file mode 100644
index 0000000000..2ecd086b3b
--- /dev/null
+++ b/third_party/dav1d/src/arm/itx.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
+decl_itx17_fns( 4, 4, neon);
+decl_itx16_fns( 4, 8, neon);
+decl_itx16_fns( 4, 16, neon);
+decl_itx16_fns( 8, 4, neon);
+decl_itx16_fns( 8, 8, neon);
+decl_itx16_fns( 8, 16, neon);
+decl_itx2_fns ( 8, 32, neon);
+decl_itx16_fns(16, 4, neon);
+decl_itx16_fns(16, 8, neon);
+decl_itx12_fns(16, 16, neon);
+decl_itx2_fns (16, 32, neon);
+decl_itx2_fns (32, 8, neon);
+decl_itx2_fns (32, 16, neon);
+decl_itx2_fns (32, 32, neon);
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
+
+static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+ assign_itx1_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+ assign_itx2_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
+ assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+ assign_itx12_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+ assign_itx16_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
+
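+// As an example, in an 8 bpc build, assign_itx_fn(R, 4, 8, dct_dct, DCT_DCT, neon)
+// expands to roughly:
+//   c->itxfm_add[RTX_4X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x8_8bpc_neon;
+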
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
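+ // The 16 bpc NEON transforms only support 10-bit content; other bit
+ // depths keep the C implementations.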
+ if (BITDEPTH == 16 && bpc != 10) return;
+
+ assign_itx17_fn( , 4, 4, neon);
+ assign_itx16_fn(R, 4, 8, neon);
+ assign_itx16_fn(R, 4, 16, neon);
+ assign_itx16_fn(R, 8, 4, neon);
+ assign_itx16_fn( , 8, 8, neon);
+ assign_itx16_fn(R, 8, 16, neon);
+ assign_itx2_fn (R, 8, 32, neon);
+ assign_itx16_fn(R, 16, 4, neon);
+ assign_itx16_fn(R, 16, 8, neon);
+ assign_itx12_fn( , 16, 16, neon);
+ assign_itx2_fn (R, 16, 32, neon);
+ assign_itx1_fn (R, 16, 64, neon);
+ assign_itx2_fn (R, 32, 8, neon);
+ assign_itx2_fn (R, 32, 16, neon);
+ assign_itx2_fn ( , 32, 32, neon);
+ assign_itx1_fn (R, 32, 64, neon);
+ assign_itx1_fn (R, 64, 16, neon);
+ assign_itx1_fn (R, 64, 32, neon);
+ assign_itx1_fn ( , 64, 64, neon);
+}
diff --git a/third_party/dav1d/src/arm/loopfilter.h b/third_party/dav1d/src/arm/loopfilter.h
new file mode 100644
index 0000000000..9ac08d94d2
--- /dev/null
+++ b/third_party/dav1d/src/arm/loopfilter.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/loopfilter.h"
+
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, neon));
+
+static ALWAYS_INLINE void loop_filter_dsp_init_arm(Dav1dLoopFilterDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, neon);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, neon);
+}
diff --git a/third_party/dav1d/src/arm/looprestoration.h b/third_party/dav1d/src/arm/looprestoration.h
new file mode 100644
index 0000000000..1ac6d5fb5e
--- /dev/null
+++ b/third_party/dav1d/src/arm/looprestoration.h
@@ -0,0 +1,1113 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#if ARCH_AARCH64
+void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX);
+void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX);
+#else
+
+// The 8bpc version calculates things slightly differently than the reference
+// C version. That version calculates roughly this:
+// int16_t sum = 0;
+// for (int i = 0; i < 7; i++)
+// sum += src[idx] * fh[i];
+// int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h;
+// sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
+// sum += 1 << (bitdepth + 6 - round_bits_h);
+// Compared to the reference C version, this is the output of the first pass
+// _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e.
+// with round_offset precompensated.
+// The 16bpc version calculates things pretty much the same way as the
+// reference C version, but with the end result subtracted by
+// 1 << (bitdepth + 6 - round_bits_h).
+void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4],
+ const pixel *src, ptrdiff_t stride,
+ const int16_t fh[8], intptr_t w,
+ int h, enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX);
+// This calculates things slightly differently than the reference C version.
+// This version calculates roughly this:
+// int32_t sum = 0;
+// for (int i = 0; i < 7; i++)
+// sum += mid[idx] * fv[i];
+// sum = (sum + rounding_off_v) >> round_bits_v;
+// This function assumes that the width is a multiple of 8.
+void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
+ const int16_t *mid, int w, int h,
+ const int16_t fv[8], enum LrEdgeFlags edges,
+ ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
+
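+// The wrapper below runs both passes through a 68-row intermediate buffer:
+// rows 0..1 hold the two rows above (from lpf, when LR_HAVE_TOP), rows 2..h+1
+// the filtered body, and rows h+2..h+3 the two rows below (when
+// LR_HAVE_BOTTOM), giving the 7-tap vertical pass its extra rows of context.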
+static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ const int16_t (*const filter)[8] = params->filter;
+ ALIGN_STK_16(int16_t, mid, 68 * 384,);
+ int mid_stride = (w + 7) & ~7;
+
+ // Horizontal filter
+ BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, stride,
+ filter[0], w, h, edges HIGHBD_TAIL_SUFFIX);
+ if (edges & LR_HAVE_TOP)
+ BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, stride,
+ filter[0], w, 2, edges
+ HIGHBD_TAIL_SUFFIX);
+ if (edges & LR_HAVE_BOTTOM)
+ BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
+ lpf + 6 * PXSTRIDE(stride),
+ stride, filter[0], w, 2, edges
+ HIGHBD_TAIL_SUFFIX);
+
+ // Vertical filter
+ BF(dav1d_wiener_filter_v, neon)(dst, stride, &mid[2*mid_stride],
+ w, h, filter[1], edges,
+ mid_stride * sizeof(*mid)
+ HIGHBD_TAIL_SUFFIX);
+}
+#endif
+
+#if ARCH_ARM
+void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
+ const pixel (*left)[4],
+ const pixel *src, const ptrdiff_t stride,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+ const int w, const int h, const int strength,
+ const int bitdepth_max);
+void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const int32_t *a, const int16_t *b,
+ const int w, const int h);
+
+/* filter with a 3x3 box (radius=1) */
+static void dav1d_sgr_filter1_neon(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, const int h, const int strength,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
+ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
+ ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
+ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
+
+ BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
+ if (edges & LR_HAVE_TOP)
+ BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
+ NULL, lpf, stride, w, 2, edges);
+
+ if (edges & LR_HAVE_BOTTOM)
+ BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
+ NULL, lpf + 6 * PXSTRIDE(stride),
+ stride, w, 2, edges);
+
+ dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
+ dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX);
+ BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h);
+}
+
+void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum,
+ const pixel (*left)[4],
+ const pixel *src, const ptrdiff_t stride,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+ const int w, const int h, const int strength,
+ const int bitdepth_max);
+void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const int32_t *a, const int16_t *b,
+ const int w, const int h);
+
+/* filter with a 5x5 box (radius=2) */
+static void dav1d_sgr_filter2_neon(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, const int h, const int strength,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
+ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
+ ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
+ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
+
+ BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
+ if (edges & LR_HAVE_TOP)
+ BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
+ NULL, lpf, stride, w, 2, edges);
+
+ if (edges & LR_HAVE_BOTTOM)
+ BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
+ NULL, lpf + 6 * PXSTRIDE(stride),
+ stride, w, 2, edges);
+
+ dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
+ dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX);
+ BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h);
+}
+
+void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *t1, const int w, const int h,
+ const int wt HIGHBD_DECL_SUFFIX);
+void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *t1, const int16_t *t2,
+ const int w, const int h,
+ const int16_t wt[2] HIGHBD_DECL_SUFFIX);
+
+static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int16_t, tmp, 64 * 384,);
+ dav1d_sgr_filter2_neon(tmp, dst, stride, left, lpf,
+ w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
+ BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
+ tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
+}
+
+static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int16_t, tmp, 64 * 384,);
+ dav1d_sgr_filter1_neon(tmp, dst, stride, left, lpf,
+ w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
+ BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
+ tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
+}
+
+static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
+ ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
+ dav1d_sgr_filter2_neon(tmp1, dst, stride, left, lpf,
+ w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
+ dav1d_sgr_filter1_neon(tmp2, dst, stride, left, lpf,
+ w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
+ const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 };
+ BF(dav1d_sgr_weighted2, neon)(dst, stride, dst, stride,
+ tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX);
+}
+
+#else
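+// Ring-buffer helpers for the row-by-row SGR path below: rotate() shifts the
+// n row pointers left by one and recycles the oldest row's storage at the
+// end; rotate5_x2() does the same by two rows at a time, matching how the
+// 5x5 box sums are consumed.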
+static void rotate(int32_t **sumsq_ptrs, int16_t **sum_ptrs, int n) {
+ int32_t *tmp32 = sumsq_ptrs[0];
+ int16_t *tmp16 = sum_ptrs[0];
+ for (int i = 0; i < n - 1; i++) {
+ sumsq_ptrs[i] = sumsq_ptrs[i+1];
+ sum_ptrs[i] = sum_ptrs[i+1];
+ }
+ sumsq_ptrs[n - 1] = tmp32;
+ sum_ptrs[n - 1] = tmp16;
+}
+static void rotate5_x2(int32_t **sumsq_ptrs, int16_t **sum_ptrs) {
+ int32_t *tmp32[2];
+ int16_t *tmp16[2];
+ for (int i = 0; i < 2; i++) {
+ tmp32[i] = sumsq_ptrs[i];
+ tmp16[i] = sum_ptrs[i];
+ }
+ for (int i = 0; i < 3; i++) {
+ sumsq_ptrs[i] = sumsq_ptrs[i+2];
+ sum_ptrs[i] = sum_ptrs[i+2];
+ }
+ for (int i = 0; i < 2; i++) {
+ sumsq_ptrs[3 + i] = tmp32[i];
+ sum_ptrs[3 + i] = tmp16[i];
+ }
+}
+
+static void rotate_ab_3(int32_t **A_ptrs, int16_t **B_ptrs) {
+ rotate(A_ptrs, B_ptrs, 3);
+}
+
+static void rotate_ab_2(int32_t **A_ptrs, int16_t **B_ptrs) {
+ rotate(A_ptrs, B_ptrs, 2);
+}
+
+static void rotate_ab_4(int32_t **A_ptrs, int16_t **B_ptrs) {
+ rotate(A_ptrs, B_ptrs, 4);
+}
+
+void BF(dav1d_sgr_box3_row_h, neon)(int32_t *sumsq, int16_t *sum,
+ const pixel (*left)[4],
+ const pixel *src, const int w,
+ const enum LrEdgeFlags edges);
+void BF(dav1d_sgr_box5_row_h, neon)(int32_t *sumsq, int16_t *sum,
+ const pixel (*left)[4],
+ const pixel *src, const int w,
+ const enum LrEdgeFlags edges);
+void BF(dav1d_sgr_box35_row_h, neon)(int32_t *sumsq3, int16_t *sum3,
+ int32_t *sumsq5, int16_t *sum5,
+ const pixel (*left)[4],
+ const pixel *src, const int w,
+ const enum LrEdgeFlags edges);
+
+void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
+ int32_t *AA, int16_t *BB,
+ const int w, const int s,
+ const int bitdepth_max);
+void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
+ int32_t *AA, int16_t *BB,
+ const int w, const int s,
+ const int bitdepth_max);
+
+void BF(dav1d_sgr_finish_weighted1, neon)(pixel *dst,
+ int32_t **A_ptrs, int16_t **B_ptrs,
+ const int w, const int w1
+ HIGHBD_DECL_SUFFIX);
+void BF(dav1d_sgr_finish_weighted2, neon)(pixel *dst, const ptrdiff_t stride,
+ int32_t **A_ptrs, int16_t **B_ptrs,
+ const int w, const int h,
+ const int w1 HIGHBD_DECL_SUFFIX);
+
+void BF(dav1d_sgr_finish_filter1_2rows, neon)(int16_t *tmp, const pixel *src,
+ const ptrdiff_t src_stride,
+ int32_t **A_ptrs,
+ int16_t **B_ptrs,
+ const int w, const int h);
+void BF(dav1d_sgr_finish_filter2_2rows, neon)(int16_t *tmp, const pixel *src,
+ const ptrdiff_t src_stride,
+ int32_t **A_ptrs, int16_t **B_ptrs,
+ const int w, const int h);
+void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *t1, const int16_t *t2,
+ const int w, const int h,
+ const int16_t wt[2] HIGHBD_DECL_SUFFIX);
+
+static void sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
+ int32_t *sumsq_out, int16_t *sum_out,
+ const int w, int s, int bitdepth_max) {
+ // box3_v + calc_ab1
+ dav1d_sgr_box3_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max);
+ rotate(sumsq, sum, 3);
+}
+
+static void sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
+ int32_t *sumsq_out, int16_t *sum_out,
+ const int w, int s, int bitdepth_max) {
+ // box5_v + calc_ab2
+ dav1d_sgr_box5_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max);
+ rotate5_x2(sumsq, sum);
+}
+
+static void sgr_box3_hv_neon(int32_t **sumsq, int16_t **sum,
+ int32_t *AA, int16_t *BB,
+ const pixel (*left)[4],
+ const pixel *src, const int w,
+ const int s,
+ const enum LrEdgeFlags edges,
+ const int bitdepth_max) {
+ BF(dav1d_sgr_box3_row_h, neon)(sumsq[2], sum[2], left, src, w, edges);
+ sgr_box3_vert_neon(sumsq, sum, AA, BB, w, s, bitdepth_max);
+}
+
+
+static void sgr_finish1_neon(pixel **dst, const ptrdiff_t stride,
+ int32_t **A_ptrs, int16_t **B_ptrs, const int w,
+ const int w1 HIGHBD_DECL_SUFFIX) {
+ BF(dav1d_sgr_finish_weighted1, neon)(*dst, A_ptrs, B_ptrs,
+ w, w1 HIGHBD_TAIL_SUFFIX);
+ *dst += PXSTRIDE(stride);
+ rotate_ab_3(A_ptrs, B_ptrs);
+}
+
+static void sgr_finish2_neon(pixel **dst, const ptrdiff_t stride,
+ int32_t **A_ptrs, int16_t **B_ptrs,
+ const int w, const int h, const int w1
+ HIGHBD_DECL_SUFFIX) {
+ BF(dav1d_sgr_finish_weighted2, neon)(*dst, stride, A_ptrs, B_ptrs,
+ w, h, w1 HIGHBD_TAIL_SUFFIX);
+ *dst += 2*PXSTRIDE(stride);
+ rotate_ab_2(A_ptrs, B_ptrs);
+}
+
+static void sgr_finish_mix_neon(pixel **dst, const ptrdiff_t stride,
+ int32_t **A5_ptrs, int16_t **B5_ptrs,
+ int32_t **A3_ptrs, int16_t **B3_ptrs,
+ const int w, const int h,
+ const int w0, const int w1 HIGHBD_DECL_SUFFIX) {
+#define FILTER_OUT_STRIDE 384
+ ALIGN_STK_16(int16_t, tmp5, 2*FILTER_OUT_STRIDE,);
+ ALIGN_STK_16(int16_t, tmp3, 2*FILTER_OUT_STRIDE,);
+
+ BF(dav1d_sgr_finish_filter2_2rows, neon)(tmp5, *dst, stride,
+ A5_ptrs, B5_ptrs, w, h);
+ BF(dav1d_sgr_finish_filter1_2rows, neon)(tmp3, *dst, stride,
+ A3_ptrs, B3_ptrs, w, h);
+ const int16_t wt[2] = { w0, w1 };
+ BF(dav1d_sgr_weighted2, neon)(*dst, stride, *dst, stride,
+ tmp5, tmp3, w, h, wt HIGHBD_TAIL_SUFFIX);
+ *dst += h*PXSTRIDE(stride);
+ rotate_ab_2(A5_ptrs, B5_ptrs);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+}
+
+
+static void sgr_filter_3x3_neon(pixel *dst, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+#define BUF_STRIDE (384 + 16)
+ ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,);
+ ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 3 + 16,);
+ int32_t *sumsq_ptrs[3], *sumsq_rows[3];
+ int16_t *sum_ptrs[3], *sum_rows[3];
+ for (int i = 0; i < 3; i++) {
+ sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
+ sum_rows[i] = &sum_buf[i * BUF_STRIDE];
+ }
+
+ ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,);
+ ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 3 + 16,);
+ int32_t *A_ptrs[3];
+ int16_t *B_ptrs[3];
+ for (int i = 0; i < 3; i++) {
+ A_ptrs[i] = &A_buf[i * BUF_STRIDE];
+ B_ptrs[i] = &B_buf[i * BUF_STRIDE];
+ }
+ const pixel *src = dst;
+ const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
+
+ if (edges & LR_HAVE_TOP) {
+ sumsq_ptrs[0] = sumsq_rows[0];
+ sumsq_ptrs[1] = sumsq_rows[1];
+ sumsq_ptrs[2] = sumsq_rows[2];
+ sum_ptrs[0] = sum_rows[0];
+ sum_ptrs[1] = sum_rows[1];
+ sum_ptrs[2] = sum_rows[2];
+
+ BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
+ NULL, lpf, w, edges);
+ lpf += PXSTRIDE(stride);
+ BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[1], sum_rows[1],
+ NULL, lpf, w, edges);
+
+ sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
+ left++;
+ src += PXSTRIDE(stride);
+ rotate_ab_3(A_ptrs, B_ptrs);
+
+ if (--h <= 0)
+ goto vert_1;
+
+ sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
+ left++;
+ src += PXSTRIDE(stride);
+ rotate_ab_3(A_ptrs, B_ptrs);
+
+ if (--h <= 0)
+ goto vert_2;
+ } else {
+ sumsq_ptrs[0] = sumsq_rows[0];
+ sumsq_ptrs[1] = sumsq_rows[0];
+ sumsq_ptrs[2] = sumsq_rows[0];
+ sum_ptrs[0] = sum_rows[0];
+ sum_ptrs[1] = sum_rows[0];
+ sum_ptrs[2] = sum_rows[0];
+
+ BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_3(A_ptrs, B_ptrs);
+
+ if (--h <= 0)
+ goto vert_1;
+
+ sumsq_ptrs[2] = sumsq_rows[1];
+ sum_ptrs[2] = sum_rows[1];
+
+ sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
+ left++;
+ src += PXSTRIDE(stride);
+ rotate_ab_3(A_ptrs, B_ptrs);
+
+ if (--h <= 0)
+ goto vert_2;
+
+ sumsq_ptrs[2] = sumsq_rows[2];
+ sum_ptrs[2] = sum_rows[2];
+ }
+
+ do {
+ sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
+ } while (--h > 0);
+
+ if (!(edges & LR_HAVE_BOTTOM))
+ goto vert_2;
+
+ sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
+ lpf_bottom += PXSTRIDE(stride);
+
+ sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
+
+ sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
+
+ sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
+ return;
+
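+// Tails: vert_2 emits the last two output rows and vert_1 the last one, once
+// no further input rows are available; both duplicate the last row of sums
+// for the remaining vertical passes, and vert_1 additionally primes the A/B
+// ring before emitting via output_1.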
+vert_2:
+ sumsq_ptrs[2] = sumsq_ptrs[1];
+ sum_ptrs[2] = sum_ptrs[1];
+ sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ w, params->sgr.s1, BITDEPTH_MAX);
+
+ sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
+
+output_1:
+ sumsq_ptrs[2] = sumsq_ptrs[1];
+ sum_ptrs[2] = sum_ptrs[1];
+ sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ w, params->sgr.s1, BITDEPTH_MAX);
+
+ sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
+ return;
+
+vert_1:
+ sumsq_ptrs[2] = sumsq_ptrs[1];
+ sum_ptrs[2] = sum_ptrs[1];
+ sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_3(A_ptrs, B_ptrs);
+ goto output_1;
+}
+
+static void sgr_filter_5x5_neon(pixel *dst, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,);
+ ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 5 + 16,);
+ int32_t *sumsq_ptrs[5], *sumsq_rows[5];
+ int16_t *sum_ptrs[5], *sum_rows[5];
+ for (int i = 0; i < 5; i++) {
+ sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
+ sum_rows[i] = &sum_buf[i * BUF_STRIDE];
+ }
+
+ ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,);
+ ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 2 + 16,);
+ int32_t *A_ptrs[2];
+ int16_t *B_ptrs[2];
+ for (int i = 0; i < 2; i++) {
+ A_ptrs[i] = &A_buf[i * BUF_STRIDE];
+ B_ptrs[i] = &B_buf[i * BUF_STRIDE];
+ }
+ const pixel *src = dst;
+ const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
+
+ if (edges & LR_HAVE_TOP) {
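+        // The two rows above the unit come from the lpf buffer; [0] and [1]
+        // both point at rows[0], duplicating the upper one to fill the 5-row window.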
+ sumsq_ptrs[0] = sumsq_rows[0];
+ sumsq_ptrs[1] = sumsq_rows[0];
+ sumsq_ptrs[2] = sumsq_rows[1];
+ sumsq_ptrs[3] = sumsq_rows[2];
+ sumsq_ptrs[4] = sumsq_rows[3];
+ sum_ptrs[0] = sum_rows[0];
+ sum_ptrs[1] = sum_rows[0];
+ sum_ptrs[2] = sum_rows[1];
+ sum_ptrs[3] = sum_rows[2];
+ sum_ptrs[4] = sum_rows[3];
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
+ NULL, lpf, w, edges);
+ lpf += PXSTRIDE(stride);
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
+ NULL, lpf, w, edges);
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ if (--h <= 0)
+ goto vert_1;
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+ sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ rotate_ab_2(A_ptrs, B_ptrs);
+
+ if (--h <= 0)
+ goto vert_2;
+
+ // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
+ // one of them to point at the previously unused rows[4].
+ sumsq_ptrs[3] = sumsq_rows[4];
+ sum_ptrs[3] = sum_rows[4];
+ } else {
+ sumsq_ptrs[0] = sumsq_rows[0];
+ sumsq_ptrs[1] = sumsq_rows[0];
+ sumsq_ptrs[2] = sumsq_rows[0];
+ sumsq_ptrs[3] = sumsq_rows[0];
+ sumsq_ptrs[4] = sumsq_rows[0];
+ sum_ptrs[0] = sum_rows[0];
+ sum_ptrs[1] = sum_rows[0];
+ sum_ptrs[2] = sum_rows[0];
+ sum_ptrs[3] = sum_rows[0];
+ sum_ptrs[4] = sum_rows[0];
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ if (--h <= 0)
+ goto vert_1;
+
+ sumsq_ptrs[4] = sumsq_rows[1];
+ sum_ptrs[4] = sum_rows[1];
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ rotate_ab_2(A_ptrs, B_ptrs);
+
+ if (--h <= 0)
+ goto vert_2;
+
+ sumsq_ptrs[3] = sumsq_rows[2];
+ sumsq_ptrs[4] = sumsq_rows[3];
+ sum_ptrs[3] = sum_rows[2];
+ sum_ptrs[4] = sum_rows[3];
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ if (--h <= 0)
+ goto odd;
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
+
+ if (--h <= 0)
+ goto vert_2;
+
+ // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
+ // one of them to point at the previously unused rows[4].
+ sumsq_ptrs[3] = sumsq_rows[4];
+ sum_ptrs[3] = sum_rows[4];
+ }
+
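+    // Main loop: sum two new input rows, run one 5x5 vertical pass and emit
+    // two output rows per iteration.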
+ do {
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ if (--h <= 0)
+ goto odd;
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
+ } while (--h > 0);
+
+ if (!(edges & LR_HAVE_BOTTOM))
+ goto vert_2;
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
+ NULL, lpf_bottom, w, edges);
+ lpf_bottom += PXSTRIDE(stride);
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
+ NULL, lpf_bottom, w, edges);
+
+output_2:
+ sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
+ return;
+
+vert_2:
+ // Duplicate the last row twice more
+ sumsq_ptrs[3] = sumsq_ptrs[2];
+ sumsq_ptrs[4] = sumsq_ptrs[2];
+ sum_ptrs[3] = sum_ptrs[2];
+ sum_ptrs[4] = sum_ptrs[2];
+ goto output_2;
+
+odd:
+ // Copy the last row as padding once
+ sumsq_ptrs[4] = sumsq_ptrs[3];
+ sum_ptrs[4] = sum_ptrs[3];
+
+ sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
+
+output_1:
+ // Duplicate the last row twice more
+ sumsq_ptrs[3] = sumsq_ptrs[2];
+ sumsq_ptrs[4] = sumsq_ptrs[2];
+ sum_ptrs[3] = sum_ptrs[2];
+ sum_ptrs[4] = sum_ptrs[2];
+
+ sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ // Output only one row
+ sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
+ return;
+
+vert_1:
+ // Copy the last row as padding once
+ sumsq_ptrs[4] = sumsq_ptrs[3];
+ sum_ptrs[4] = sum_ptrs[3];
+
+ sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ rotate_ab_2(A_ptrs, B_ptrs);
+
+ goto output_1;
+}
+
+static void sgr_filter_mix_neon(pixel *dst, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,);
+ ALIGN_STK_16(int16_t, sum5_buf, BUF_STRIDE * 5 + 16,);
+ int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
+ int16_t *sum5_ptrs[5], *sum5_rows[5];
+ for (int i = 0; i < 5; i++) {
+ sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
+ sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
+ }
+ ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,);
+ ALIGN_STK_16(int16_t, sum3_buf, BUF_STRIDE * 3 + 16,);
+ int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
+ int16_t *sum3_ptrs[3], *sum3_rows[3];
+ for (int i = 0; i < 3; i++) {
+ sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
+ sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
+ }
+
+ ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,);
+ ALIGN_STK_16(int16_t, B5_buf, BUF_STRIDE * 2 + 16,);
+ int32_t *A5_ptrs[2];
+ int16_t *B5_ptrs[2];
+ for (int i = 0; i < 2; i++) {
+ A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
+ B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
+ }
+ ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,);
+ ALIGN_STK_16(int16_t, B3_buf, BUF_STRIDE * 4 + 16,);
+ int32_t *A3_ptrs[4];
+ int16_t *B3_ptrs[4];
+ for (int i = 0; i < 4; i++) {
+ A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
+ B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
+ }
+ const pixel *src = dst;
+ const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
+
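+    // The mix filter runs both box sizes at once: box35_row_h fills the 3x3
+    // and 5x5 row sums in a single pass over each source row.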
+ if (edges & LR_HAVE_TOP) {
+ sumsq5_ptrs[0] = sumsq5_rows[0];
+ sumsq5_ptrs[1] = sumsq5_rows[0];
+ sumsq5_ptrs[2] = sumsq5_rows[1];
+ sumsq5_ptrs[3] = sumsq5_rows[2];
+ sumsq5_ptrs[4] = sumsq5_rows[3];
+ sum5_ptrs[0] = sum5_rows[0];
+ sum5_ptrs[1] = sum5_rows[0];
+ sum5_ptrs[2] = sum5_rows[1];
+ sum5_ptrs[3] = sum5_rows[2];
+ sum5_ptrs[4] = sum5_rows[3];
+
+ sumsq3_ptrs[0] = sumsq3_rows[0];
+ sumsq3_ptrs[1] = sumsq3_rows[1];
+ sumsq3_ptrs[2] = sumsq3_rows[2];
+ sum3_ptrs[0] = sum3_rows[0];
+ sum3_ptrs[1] = sum3_rows[1];
+ sum3_ptrs[2] = sum3_rows[2];
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
+ sumsq5_rows[0], sum5_rows[0],
+ NULL, lpf, w, edges);
+ lpf += PXSTRIDE(stride);
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
+ sumsq5_rows[1], sum5_rows[1],
+ NULL, lpf, w, edges);
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
+ sumsq5_rows[2], sum5_rows[2],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ if (--h <= 0)
+ goto vert_1;
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
+ sumsq5_rows[3], sum5_rows[3],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+ sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ rotate_ab_2(A5_ptrs, B5_ptrs);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ if (--h <= 0)
+ goto vert_2;
+
+ // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
+ // one of them to point at the previously unused rows[4].
+ sumsq5_ptrs[3] = sumsq5_rows[4];
+ sum5_ptrs[3] = sum5_rows[4];
+ } else {
+ sumsq5_ptrs[0] = sumsq5_rows[0];
+ sumsq5_ptrs[1] = sumsq5_rows[0];
+ sumsq5_ptrs[2] = sumsq5_rows[0];
+ sumsq5_ptrs[3] = sumsq5_rows[0];
+ sumsq5_ptrs[4] = sumsq5_rows[0];
+ sum5_ptrs[0] = sum5_rows[0];
+ sum5_ptrs[1] = sum5_rows[0];
+ sum5_ptrs[2] = sum5_rows[0];
+ sum5_ptrs[3] = sum5_rows[0];
+ sum5_ptrs[4] = sum5_rows[0];
+
+ sumsq3_ptrs[0] = sumsq3_rows[0];
+ sumsq3_ptrs[1] = sumsq3_rows[0];
+ sumsq3_ptrs[2] = sumsq3_rows[0];
+ sum3_ptrs[0] = sum3_rows[0];
+ sum3_ptrs[1] = sum3_rows[0];
+ sum3_ptrs[2] = sum3_rows[0];
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
+ sumsq5_rows[0], sum5_rows[0],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ if (--h <= 0)
+ goto vert_1;
+
+ sumsq5_ptrs[4] = sumsq5_rows[1];
+ sum5_ptrs[4] = sum5_rows[1];
+
+ sumsq3_ptrs[2] = sumsq3_rows[1];
+ sum3_ptrs[2] = sum3_rows[1];
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
+ sumsq5_rows[1], sum5_rows[1],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ rotate_ab_2(A5_ptrs, B5_ptrs);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ if (--h <= 0)
+ goto vert_2;
+
+ sumsq5_ptrs[3] = sumsq5_rows[2];
+ sumsq5_ptrs[4] = sumsq5_rows[3];
+ sum5_ptrs[3] = sum5_rows[2];
+ sum5_ptrs[4] = sum5_rows[3];
+
+ sumsq3_ptrs[2] = sumsq3_rows[2];
+ sum3_ptrs[2] = sum3_rows[2];
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
+ sumsq5_rows[2], sum5_rows[2],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ if (--h <= 0)
+ goto odd;
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
+ sumsq5_rows[3], sum5_rows[3],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
+ w, 2, params->sgr.w0, params->sgr.w1
+ HIGHBD_TAIL_SUFFIX);
+
+ if (--h <= 0)
+ goto vert_2;
+
+ // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
+ // one of them to point at the previously unused rows[4].
+ sumsq5_ptrs[3] = sumsq5_rows[4];
+ sum5_ptrs[3] = sum5_rows[4];
+ }
+
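+    // Main loop: two input rows per iteration; the 3x3 vertical pass runs on
+    // every row, the 5x5 pass on every second row, producing two output rows.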
+ do {
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
+ sumsq5_ptrs[3], sum5_ptrs[3],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ if (--h <= 0)
+ goto odd;
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
+ sumsq5_ptrs[4], sum5_ptrs[4],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
+ w, 2, params->sgr.w0, params->sgr.w1
+ HIGHBD_TAIL_SUFFIX);
+ } while (--h > 0);
+
+ if (!(edges & LR_HAVE_BOTTOM))
+ goto vert_2;
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
+ sumsq5_ptrs[3], sum5_ptrs[3],
+ NULL, lpf_bottom, w, edges);
+ lpf_bottom += PXSTRIDE(stride);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
+ sumsq5_ptrs[4], sum5_ptrs[4],
+ NULL, lpf_bottom, w, edges);
+
+output_2:
+ sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
+ w, 2, params->sgr.w0, params->sgr.w1
+ HIGHBD_TAIL_SUFFIX);
+ return;
+
+vert_2:
+ // Duplicate the last row twice more
+ sumsq5_ptrs[3] = sumsq5_ptrs[2];
+ sumsq5_ptrs[4] = sumsq5_ptrs[2];
+ sum5_ptrs[3] = sum5_ptrs[2];
+ sum5_ptrs[4] = sum5_ptrs[2];
+
+ sumsq3_ptrs[2] = sumsq3_ptrs[1];
+ sum3_ptrs[2] = sum3_ptrs[1];
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ sumsq3_ptrs[2] = sumsq3_ptrs[1];
+ sum3_ptrs[2] = sum3_ptrs[1];
+
+ goto output_2;
+
+odd:
+ // Copy the last row as padding once
+ sumsq5_ptrs[4] = sumsq5_ptrs[3];
+ sum5_ptrs[4] = sum5_ptrs[3];
+
+ sumsq3_ptrs[2] = sumsq3_ptrs[1];
+ sum3_ptrs[2] = sum3_ptrs[1];
+
+ sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
+ w, 2, params->sgr.w0, params->sgr.w1
+ HIGHBD_TAIL_SUFFIX);
+
+output_1:
+ // Duplicate the last row twice more
+ sumsq5_ptrs[3] = sumsq5_ptrs[2];
+ sumsq5_ptrs[4] = sumsq5_ptrs[2];
+ sum5_ptrs[3] = sum5_ptrs[2];
+ sum5_ptrs[4] = sum5_ptrs[2];
+
+ sumsq3_ptrs[2] = sumsq3_ptrs[1];
+ sum3_ptrs[2] = sum3_ptrs[1];
+
+ sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+ // Output only one row
+ sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
+ w, 1, params->sgr.w0, params->sgr.w1
+ HIGHBD_TAIL_SUFFIX);
+ return;
+
+vert_1:
+ // Copy the last row as padding once
+ sumsq5_ptrs[4] = sumsq5_ptrs[3];
+ sum5_ptrs[4] = sum5_ptrs[3];
+
+ sumsq3_ptrs[2] = sumsq3_ptrs[1];
+ sum3_ptrs[2] = sum3_ptrs[1];
+
+ sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ rotate_ab_2(A5_ptrs, B5_ptrs);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ goto output_1;
+}
+
+#endif
+
+
+static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+#if ARCH_AARCH64
+ c->wiener[0] = BF(dav1d_wiener_filter7, neon);
+ c->wiener[1] = BF(dav1d_wiener_filter5, neon);
+#else
+ c->wiener[0] = c->wiener[1] = wiener_filter_neon;
+#endif
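+    // The NEON SGR paths only cover 8 and 10 bpc; 12 bpc keeps the C implementations.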
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = sgr_filter_5x5_neon;
+ c->sgr[1] = sgr_filter_3x3_neon;
+ c->sgr[2] = sgr_filter_mix_neon;
+ }
+}
diff --git a/third_party/dav1d/src/arm/mc.h b/third_party/dav1d/src/arm/mc.h
new file mode 100644
index 0000000000..06cd533a9b
--- /dev/null
+++ b/third_party/dav1d/src/arm/mc.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "src/mc.h"
+#include "src/cpu.h"
+
+decl_mc_fn(BF(dav1d_put_8tap_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon));
+decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon));
+decl_mc_fn(BF(dav1d_put_bilin, neon));
+
+decl_mct_fn(BF(dav1d_prep_8tap_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_bilin, neon));
+
+decl_avg_fn(BF(dav1d_avg, neon));
+decl_w_avg_fn(BF(dav1d_w_avg, neon));
+decl_mask_fn(BF(dav1d_mask, neon));
+decl_blend_fn(BF(dav1d_blend, neon));
+decl_blend_dir_fn(BF(dav1d_blend_h, neon));
+decl_blend_dir_fn(BF(dav1d_blend_v, neon));
+
+decl_w_mask_fn(BF(dav1d_w_mask_444, neon));
+decl_w_mask_fn(BF(dav1d_w_mask_422, neon));
+decl_w_mask_fn(BF(dav1d_w_mask_420, neon));
+
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon));
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon));
+
+decl_emu_edge_fn(BF(dav1d_emu_edge, neon));
+
+static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) {
+#define init_mc_fn(type, name, suffix) \
+ c->mc[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_fn(type, name, suffix) \
+ c->mct[type] = BF(dav1d_prep_##name, suffix)
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
+ init_mc_fn (FILTER_2D_BILINEAR, bilin, neon);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
+
+ c->avg = BF(dav1d_avg, neon);
+ c->w_avg = BF(dav1d_w_avg, neon);
+ c->mask = BF(dav1d_mask, neon);
+ c->blend = BF(dav1d_blend, neon);
+ c->blend_h = BF(dav1d_blend_h, neon);
+ c->blend_v = BF(dav1d_blend_v, neon);
+ c->w_mask[0] = BF(dav1d_w_mask_444, neon);
+ c->w_mask[1] = BF(dav1d_w_mask_422, neon);
+ c->w_mask[2] = BF(dav1d_w_mask_420, neon);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
+ c->emu_edge = BF(dav1d_emu_edge, neon);
+}
diff --git a/third_party/dav1d/src/arm/msac.h b/third_party/dav1d/src/arm/msac.h
new file mode 100644
index 0000000000..9db0bf86ae
--- /dev/null
+++ b/third_party/dav1d/src/arm/msac.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ARM_MSAC_H
+#define DAV1D_SRC_ARM_MSAC_H
+
+unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_hi_tok_neon(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
+unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
+
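+// These overrides are resolved at compile time, so on 32-bit ARM the NEON
+// versions are only used when NEON is unconditionally available (__ARM_NEON);
+// AArch64 always has NEON.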
+#if ARCH_AARCH64 || defined(__ARM_NEON)
+#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon
+#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
+#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_neon
+#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_neon
+#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_neon
+#define dav1d_msac_decode_bool dav1d_msac_decode_bool_neon
+#endif
+
+#endif /* DAV1D_SRC_ARM_MSAC_H */
diff --git a/third_party/dav1d/src/arm/refmvs.h b/third_party/dav1d/src/arm/refmvs.h
new file mode 100644
index 0000000000..1c2dc704cf
--- /dev/null
+++ b/third_party/dav1d/src/arm/refmvs.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/refmvs.h"
+
+decl_save_tmvs_fn(dav1d_save_tmvs_neon);
+decl_splat_mv_fn(dav1d_splat_mv_neon);
+
+static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->save_tmvs = dav1d_save_tmvs_neon;
+ c->splat_mv = dav1d_splat_mv_neon;
+}
diff --git a/third_party/dav1d/src/cdef.h b/third_party/dav1d/src/cdef.h
new file mode 100644
index 0000000000..07c84d9ff5
--- /dev/null
+++ b/third_party/dav1d/src/cdef.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_CDEF_H
+#define DAV1D_SRC_CDEF_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "common/bitdepth.h"
+
+enum CdefEdgeFlags {
+ CDEF_HAVE_LEFT = 1 << 0,
+ CDEF_HAVE_RIGHT = 1 << 1,
+ CDEF_HAVE_TOP = 1 << 2,
+ CDEF_HAVE_BOTTOM = 1 << 3,
+};
+
+#ifdef BITDEPTH
+typedef const pixel (*const_left_pixel_row_2px)[2];
+#else
+typedef const void *const_left_pixel_row_2px;
+#endif
+
+// CDEF operates entirely on pre-filter data; if bottom/right edges are
+// present (according to $edges), then the pre-filter data is located in
+// $dst. However, the edge pixels above $dst may be post-filter, so in
+// order to get access to pre-filter top pixels, use $top.
+#define decl_cdef_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const_left_pixel_row_2px left, \
+ const pixel *top, const pixel *bottom, \
+ int pri_strength, int sec_strength, \
+ int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
+typedef decl_cdef_fn(*cdef_fn);
+
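+// Finds the dominant filter direction (0..7) of an 8x8 block and writes a
+// measure of its directional variance to *var.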
+#define decl_cdef_dir_fn(name) \
+int (name)(const pixel *dst, ptrdiff_t dst_stride, unsigned *var HIGHBD_DECL_SUFFIX)
+typedef decl_cdef_dir_fn(*cdef_dir_fn);
+
+typedef struct Dav1dCdefDSPContext {
+ cdef_dir_fn dir;
+ cdef_fn fb[3 /* 444/luma, 422, 420 */];
+} Dav1dCdefDSPContext;
+
+bitfn_decls(void dav1d_cdef_dsp_init, Dav1dCdefDSPContext *c);
+
+#endif /* DAV1D_SRC_CDEF_H */
diff --git a/third_party/dav1d/src/cdef_apply.h b/third_party/dav1d/src/cdef_apply.h
new file mode 100644
index 0000000000..a9748ee4f6
--- /dev/null
+++ b/third_party/dav1d/src/cdef_apply.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_CDEF_APPLY_H
+#define DAV1D_SRC_CDEF_APPLY_H
+
+#include "common/bitdepth.h"
+
+#include "src/internal.h"
+
+void bytefn(dav1d_cdef_brow)(Dav1dTaskContext *tc, pixel *const p[3],
+ const Av1Filter *lflvl, int by_start, int by_end,
+ int sbrow_start, int sby);
+
+#endif /* DAV1D_SRC_CDEF_APPLY_H */
diff --git a/third_party/dav1d/src/cdef_apply_tmpl.c b/third_party/dav1d/src/cdef_apply_tmpl.c
new file mode 100644
index 0000000000..e2d8b83fc7
--- /dev/null
+++ b/third_party/dav1d/src/cdef_apply_tmpl.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/cdef_apply.h"
+
+enum Backup2x8Flags {
+ BACKUP_2X8_Y = 1 << 0,
+ BACKUP_2X8_UV = 1 << 1,
+};
+
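+// Back up the bottom two pre-filter lines of each plane in the current block
+// row (8 luma pixels tall); the next block row reads them back as its
+// pre-filter top edge.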
+static void backup2lines(pixel *const dst[3], /*const*/ pixel *const src[3],
+ const ptrdiff_t stride[2],
+ const enum Dav1dPixelLayout layout)
+{
+ const ptrdiff_t y_stride = PXSTRIDE(stride[0]);
+ if (y_stride < 0)
+ pixel_copy(dst[0] + y_stride, src[0] + 7 * y_stride, -2 * y_stride);
+ else
+ pixel_copy(dst[0], src[0] + 6 * y_stride, 2 * y_stride);
+
+ if (layout != DAV1D_PIXEL_LAYOUT_I400) {
+ const ptrdiff_t uv_stride = PXSTRIDE(stride[1]);
+ if (uv_stride < 0) {
+ const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 3 : 7;
+ pixel_copy(dst[1] + uv_stride, src[1] + uv_off * uv_stride, -2 * uv_stride);
+ pixel_copy(dst[2] + uv_stride, src[2] + uv_off * uv_stride, -2 * uv_stride);
+ } else {
+ const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 2 : 6;
+ pixel_copy(dst[1], src[1] + uv_off * uv_stride, 2 * uv_stride);
+ pixel_copy(dst[2], src[2] + uv_off * uv_stride, 2 * uv_stride);
+ }
+ }
+}
+
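+// Save a 2-pixel-wide strip of pre-filter pixels just left of x_off; CDEF
+// filters in place, so left-edge pixels must be captured before they are
+// overwritten.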
+static void backup2x8(pixel dst[3][8][2],
+ /*const*/ pixel *const src[3],
+ const ptrdiff_t src_stride[2], int x_off,
+ const enum Dav1dPixelLayout layout,
+ const enum Backup2x8Flags flag)
+{
+ ptrdiff_t y_off = 0;
+ if (flag & BACKUP_2X8_Y) {
+ for (int y = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0]))
+ pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2);
+ }
+
+ if (layout == DAV1D_PIXEL_LAYOUT_I400 || !(flag & BACKUP_2X8_UV))
+ return;
+
+ const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+
+ x_off >>= ss_hor;
+ y_off = 0;
+ for (int y = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) {
+ pixel_copy(dst[1][y], &src[1][y_off + x_off - 2], 2);
+ pixel_copy(dst[2][y], &src[2][y_off + x_off - 2], 2);
+ }
+}
+
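+// Scale the primary strength by the directional variance: 0 for flat blocks,
+// otherwise strength * (4 + min(ulog2(var >> 6), 12)) / 16, rounded
+// (e.g. var == 2048 gives a 9/16 scale).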
+static int adjust_strength(const int strength, const unsigned var) {
+ if (!var) return 0;
+ const int i = var >> 6 ? imin(ulog2(var >> 6), 12) : 0;
+ return (strength * (4 + i) + 8) >> 4;
+}
+
+void bytefn(dav1d_cdef_brow)(Dav1dTaskContext *const tc,
+ pixel *const p[3],
+ const Av1Filter *const lflvl,
+ const int by_start, const int by_end,
+ const int sbrow_start, const int sby)
+{
+ Dav1dFrameContext *const f = (Dav1dFrameContext *)tc->f;
+ const int bitdepth_min_8 = BITDEPTH == 8 ? 0 : f->cur.p.bpc - 8;
+ const Dav1dDSPContext *const dsp = f->dsp;
+ enum CdefEdgeFlags edges = CDEF_HAVE_BOTTOM | (by_start > 0 ? CDEF_HAVE_TOP : 0);
+ pixel *ptrs[3] = { p[0], p[1], p[2] };
+ const int sbsz = 16;
+ const int sb64w = f->sb128w << 1;
+ const int damping = f->frame_hdr->cdef.damping + bitdepth_min_8;
+ const enum Dav1dPixelLayout layout = f->cur.p.layout;
+ const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
+ const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+ static const uint8_t uv_dirs[2][8] = { { 0, 1, 2, 3, 4, 5, 6, 7 },
+ { 7, 0, 2, 4, 5, 6, 6, 6 } };
+ const uint8_t *uv_dir = uv_dirs[layout == DAV1D_PIXEL_LAYOUT_I422];
+ const int have_tt = f->c->n_tc > 1;
+ const int sb128 = f->seq_hdr->sb128;
+ const int resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
+ const ptrdiff_t y_stride = PXSTRIDE(f->cur.stride[0]);
+ const ptrdiff_t uv_stride = PXSTRIDE(f->cur.stride[1]);
+
+ for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) {
+ const int tf = tc->top_pre_cdef_toggle;
+ const int by_idx = (by & 30) >> 1;
+ if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
+
+ if ((!have_tt || sbrow_start || by + 2 < by_end) &&
+ edges & CDEF_HAVE_BOTTOM)
+ {
+ // backup pre-filter data for next iteration
+ pixel *const cdef_top_bak[3] = {
+ f->lf.cdef_line[!tf][0] + have_tt * sby * 4 * y_stride,
+ f->lf.cdef_line[!tf][1] + have_tt * sby * 8 * uv_stride,
+ f->lf.cdef_line[!tf][2] + have_tt * sby * 8 * uv_stride
+ };
+ backup2lines(cdef_top_bak, ptrs, f->cur.stride, layout);
+ }
+
+ ALIGN_STK_16(pixel, lr_bak, 2 /* idx */, [3 /* plane */][8 /* y */][2 /* x */]);
+ pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
+ edges &= ~CDEF_HAVE_LEFT;
+ edges |= CDEF_HAVE_RIGHT;
+ enum Backup2x8Flags prev_flag = 0;
+ for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) {
+ const int sb128x = sbx >> 1;
+ const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
+ const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
+ if (cdef_idx == -1 ||
+ (!f->frame_hdr->cdef.y_strength[cdef_idx] &&
+ !f->frame_hdr->cdef.uv_strength[cdef_idx]))
+ {
+ last_skip = 1;
+ goto next_sb;
+ }
+
+ // Create a complete 32-bit mask for the sb row ahead of time.
+ const uint16_t (*noskip_row)[2] = &lflvl[sb128x].noskip_mask[by_idx];
+ const unsigned noskip_mask = (unsigned) noskip_row[0][1] << 16 |
+ noskip_row[0][0];
+
+ const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx];
+ const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
+ const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1);
+
+ const int y_pri_lvl = (y_lvl >> 2) << bitdepth_min_8;
+ int y_sec_lvl = y_lvl & 3;
+ y_sec_lvl += y_sec_lvl == 3;
+ y_sec_lvl <<= bitdepth_min_8;
+
+ const int uv_pri_lvl = (uv_lvl >> 2) << bitdepth_min_8;
+ int uv_sec_lvl = uv_lvl & 3;
+ uv_sec_lvl += uv_sec_lvl == 3;
+ uv_sec_lvl <<= bitdepth_min_8;
+
+ pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
+ for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
+ bx += 2, edges |= CDEF_HAVE_LEFT)
+ {
+ if (bx + 2 >= f->bw) edges &= ~CDEF_HAVE_RIGHT;
+
+ // check if this 8x8 block had any coded coefficients; if not,
+ // go to the next block
+ const uint32_t bx_mask = 3U << (bx & 30);
+ if (!(noskip_mask & bx_mask)) {
+ last_skip = 1;
+ goto next_b;
+ }
+ const int do_left = last_skip ? flag : (prev_flag ^ flag) & flag;
+ prev_flag = flag;
+ if (do_left && edges & CDEF_HAVE_LEFT) {
+ // we didn't backup the prefilter data because it wasn't
+ // there, so do it here instead
+ backup2x8(lr_bak[bit], bptrs, f->cur.stride, 0, layout, do_left);
+ }
+ if (edges & CDEF_HAVE_RIGHT) {
+ // backup pre-filter data for next iteration
+ backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout, flag);
+ }
+
+ int dir;
+ unsigned variance;
+ if (y_pri_lvl || uv_pri_lvl)
+ dir = dsp->cdef.dir(bptrs[0], f->cur.stride[0],
+ &variance HIGHBD_CALL_SUFFIX);
+
+ const pixel *top, *bot;
+ ptrdiff_t offset;
+
+ if (!have_tt) goto st_y;
+ if (sbrow_start && by == by_start) {
+ if (resize) {
+ offset = (sby - 1) * 4 * y_stride + bx * 4;
+ top = &f->lf.cdef_lpf_line[0][offset];
+ } else {
+ offset = (sby * (4 << sb128) - 4) * y_stride + bx * 4;
+ top = &f->lf.lr_lpf_line[0][offset];
+ }
+ bot = bptrs[0] + 8 * y_stride;
+ } else if (!sbrow_start && by + 2 >= by_end) {
+ top = &f->lf.cdef_line[tf][0][sby * 4 * y_stride + bx * 4];
+ if (resize) {
+ offset = (sby * 4 + 2) * y_stride + bx * 4;
+ bot = &f->lf.cdef_lpf_line[0][offset];
+ } else {
+ const int line = sby * (4 << sb128) + 4 * sb128 + 2;
+ offset = line * y_stride + bx * 4;
+ bot = &f->lf.lr_lpf_line[0][offset];
+ }
+ } else {
+ st_y:;
+ offset = sby * 4 * y_stride;
+ top = &f->lf.cdef_line[tf][0][have_tt * offset + bx * 4];
+ bot = bptrs[0] + 8 * y_stride;
+ }
+ if (y_pri_lvl) {
+ const int adj_y_pri_lvl = adjust_strength(y_pri_lvl, variance);
+ if (adj_y_pri_lvl || y_sec_lvl)
+ dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
+ top, bot, adj_y_pri_lvl, y_sec_lvl,
+ dir, damping, edges HIGHBD_CALL_SUFFIX);
+ } else if (y_sec_lvl)
+ dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
+ top, bot, 0, y_sec_lvl, 0, damping,
+ edges HIGHBD_CALL_SUFFIX);
+
+ if (!uv_lvl) goto skip_uv;
+ assert(layout != DAV1D_PIXEL_LAYOUT_I400);
+
+ const int uvdir = uv_pri_lvl ? uv_dir[dir] : 0;
+ for (int pl = 1; pl <= 2; pl++) {
+ if (!have_tt) goto st_uv;
+ if (sbrow_start && by == by_start) {
+ if (resize) {
+ offset = (sby - 1) * 4 * uv_stride + (bx * 4 >> ss_hor);
+ top = &f->lf.cdef_lpf_line[pl][offset];
+ } else {
+ const int line = sby * (4 << sb128) - 4;
+ offset = line * uv_stride + (bx * 4 >> ss_hor);
+ top = &f->lf.lr_lpf_line[pl][offset];
+ }
+ bot = bptrs[pl] + (8 >> ss_ver) * uv_stride;
+ } else if (!sbrow_start && by + 2 >= by_end) {
+ const ptrdiff_t top_offset = sby * 8 * uv_stride +
+ (bx * 4 >> ss_hor);
+ top = &f->lf.cdef_line[tf][pl][top_offset];
+ if (resize) {
+ offset = (sby * 4 + 2) * uv_stride + (bx * 4 >> ss_hor);
+ bot = &f->lf.cdef_lpf_line[pl][offset];
+ } else {
+ const int line = sby * (4 << sb128) + 4 * sb128 + 2;
+ offset = line * uv_stride + (bx * 4 >> ss_hor);
+ bot = &f->lf.lr_lpf_line[pl][offset];
+ }
+ } else {
+ st_uv:;
+ const ptrdiff_t offset = sby * 8 * uv_stride;
+ top = &f->lf.cdef_line[tf][pl][have_tt * offset + (bx * 4 >> ss_hor)];
+ bot = bptrs[pl] + (8 >> ss_ver) * uv_stride;
+ }
+ dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1],
+ lr_bak[bit][pl], top, bot,
+ uv_pri_lvl, uv_sec_lvl, uvdir,
+ damping - 1, edges HIGHBD_CALL_SUFFIX);
+ }
+
+ skip_uv:
+ bit ^= 1;
+ last_skip = 0;
+
+ next_b:
+ bptrs[0] += 8;
+ bptrs[1] += 8 >> ss_hor;
+ bptrs[2] += 8 >> ss_hor;
+ }
+
+ next_sb:
+ iptrs[0] += sbsz * 4;
+ iptrs[1] += sbsz * 4 >> ss_hor;
+ iptrs[2] += sbsz * 4 >> ss_hor;
+ }
+
+ ptrs[0] += 8 * PXSTRIDE(f->cur.stride[0]);
+ ptrs[1] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
+ ptrs[2] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
+ tc->top_pre_cdef_toggle ^= 1;
+ }
+}
diff --git a/third_party/dav1d/src/cdef_tmpl.c b/third_party/dav1d/src/cdef_tmpl.c
new file mode 100644
index 0000000000..59439457a1
--- /dev/null
+++ b/third_party/dav1d/src/cdef_tmpl.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+#include "src/cdef.h"
+#include "src/tables.h"
+
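+// Limit a tap difference to at most max(0, threshold - (|diff| >> shift)) in
+// magnitude, preserving its sign.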
+static inline int constrain(const int diff, const int threshold,
+ const int shift)
+{
+ const int adiff = abs(diff);
+ return apply_sign(imin(adiff, imax(0, threshold - (adiff >> shift))), diff);
+}
+
+static inline void fill(int16_t *tmp, const ptrdiff_t stride,
+ const int w, const int h)
+{
+ /* Use a value that's a large positive number when interpreted as unsigned,
+ * and a large negative number when interpreted as signed. */
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++)
+ tmp[x] = INT16_MIN;
+ tmp += stride;
+ }
+}
+
+static void padding(int16_t *tmp, const ptrdiff_t tmp_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const pixel (*left)[2],
+ const pixel *top, const pixel *bottom,
+ const int w, const int h, const enum CdefEdgeFlags edges)
+{
+ // fill extended input buffer
+ int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2;
+ if (!(edges & CDEF_HAVE_TOP)) {
+ fill(tmp - 2 - 2 * tmp_stride, tmp_stride, w + 4, 2);
+ y_start = 0;
+ }
+ if (!(edges & CDEF_HAVE_BOTTOM)) {
+ fill(tmp + h * tmp_stride - 2, tmp_stride, w + 4, 2);
+ y_end -= 2;
+ }
+ if (!(edges & CDEF_HAVE_LEFT)) {
+ fill(tmp + y_start * tmp_stride - 2, tmp_stride, 2, y_end - y_start);
+ x_start = 0;
+ }
+ if (!(edges & CDEF_HAVE_RIGHT)) {
+ fill(tmp + y_start * tmp_stride + w, tmp_stride, 2, y_end - y_start);
+ x_end -= 2;
+ }
+
+ for (int y = y_start; y < 0; y++) {
+ for (int x = x_start; x < x_end; x++)
+ tmp[x + y * tmp_stride] = top[x];
+ top += PXSTRIDE(src_stride);
+ }
+ for (int y = 0; y < h; y++)
+ for (int x = x_start; x < 0; x++)
+ tmp[x + y * tmp_stride] = left[y][2 + x];
+ for (int y = 0; y < h; y++) {
+ for (int x = (y < h) ? 0 : x_start; x < x_end; x++)
+ tmp[x] = src[x];
+ src += PXSTRIDE(src_stride);
+ tmp += tmp_stride;
+ }
+ for (int y = h; y < y_end; y++) {
+ for (int x = x_start; x < x_end; x++)
+ tmp[x] = bottom[x];
+ bottom += PXSTRIDE(src_stride);
+ tmp += tmp_stride;
+ }
+}
+
+static NOINLINE void
+cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel (*left)[2],
+ const pixel *const top, const pixel *const bottom,
+ const int pri_strength, const int sec_strength,
+ const int dir, const int damping, const int w, int h,
+ const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ const ptrdiff_t tmp_stride = 12;
+ assert((w == 4 || w == 8) && (h == 4 || h == 8));
+ int16_t tmp_buf[144]; // 12*12 is the maximum value of tmp_stride * (h + 4)
+ int16_t *tmp = tmp_buf + 2 * tmp_stride + 2;
+
+ padding(tmp, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+
+ if (pri_strength) {
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1);
+ const int pri_shift = imax(0, damping - ulog2(pri_strength));
+ if (sec_strength) {
+ const int sec_shift = damping - ulog2(sec_strength);
+ do {
+ for (int x = 0; x < w; x++) {
+ const int px = dst[x];
+ int sum = 0;
+ int max = px, min = px;
+ int pri_tap_k = pri_tap;
+ for (int k = 0; k < 2; k++) {
+ const int off1 = dav1d_cdef_directions[dir + 2][k]; // dir
+ const int p0 = tmp[x + off1];
+ const int p1 = tmp[x - off1];
+ sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
+ sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
+ // if pri_tap_k == 4 then it becomes 2 else it remains 3
+ pri_tap_k = (pri_tap_k & 3) | 2;
+ min = umin(p0, min);
+ max = imax(p0, max);
+ min = umin(p1, min);
+ max = imax(p1, max);
+ const int off2 = dav1d_cdef_directions[dir + 4][k]; // dir + 2
+ const int off3 = dav1d_cdef_directions[dir + 0][k]; // dir - 2
+ const int s0 = tmp[x + off2];
+ const int s1 = tmp[x - off2];
+ const int s2 = tmp[x + off3];
+ const int s3 = tmp[x - off3];
+ // sec_tap starts at 2 and becomes 1
+ const int sec_tap = 2 - k;
+ sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
+ sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
+ sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
+ sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);
+ min = umin(s0, min);
+ max = imax(s0, max);
+ min = umin(s1, min);
+ max = imax(s1, max);
+ min = umin(s2, min);
+ max = imax(s2, max);
+ min = umin(s3, min);
+ max = imax(s3, max);
+ }
+ dst[x] = iclip(px + ((sum - (sum < 0) + 8) >> 4), min, max);
+ }
+ dst += PXSTRIDE(dst_stride);
+ tmp += tmp_stride;
+ } while (--h);
+ } else { // pri_strength only
+ do {
+ for (int x = 0; x < w; x++) {
+ const int px = dst[x];
+ int sum = 0;
+ int pri_tap_k = pri_tap;
+ for (int k = 0; k < 2; k++) {
+ const int off = dav1d_cdef_directions[dir + 2][k]; // dir
+ const int p0 = tmp[x + off];
+ const int p1 = tmp[x - off];
+ sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
+ sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
+ pri_tap_k = (pri_tap_k & 3) | 2;
+ }
+ dst[x] = px + ((sum - (sum < 0) + 8) >> 4);
+ }
+ dst += PXSTRIDE(dst_stride);
+ tmp += tmp_stride;
+ } while (--h);
+ }
+ } else { // sec_strength only
+ assert(sec_strength);
+ const int sec_shift = damping - ulog2(sec_strength);
+ do {
+ for (int x = 0; x < w; x++) {
+ const int px = dst[x];
+ int sum = 0;
+ for (int k = 0; k < 2; k++) {
+ const int off1 = dav1d_cdef_directions[dir + 4][k]; // dir + 2
+ const int off2 = dav1d_cdef_directions[dir + 0][k]; // dir - 2
+ const int s0 = tmp[x + off1];
+ const int s1 = tmp[x - off1];
+ const int s2 = tmp[x + off2];
+ const int s3 = tmp[x - off2];
+ const int sec_tap = 2 - k;
+ sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
+ sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
+ sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
+ sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);
+ }
+ dst[x] = px + ((sum - (sum < 0) + 8) >> 4);
+ }
+ dst += PXSTRIDE(dst_stride);
+ tmp += tmp_stride;
+ } while (--h);
+ }
+}
+
+#define cdef_fn(w, h) \
+static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
+ const ptrdiff_t stride, \
+ const pixel (*left)[2], \
+ const pixel *const top, \
+ const pixel *const bottom, \
+ const int pri_strength, \
+ const int sec_strength, \
+ const int dir, \
+ const int damping, \
+ const enum CdefEdgeFlags edges \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ cdef_filter_block_c(dst, stride, left, top, bottom, \
+ pri_strength, sec_strength, dir, damping, w, h, edges HIGHBD_TAIL_SUFFIX); \
+}
+
+cdef_fn(4, 4);
+cdef_fn(4, 8);
+cdef_fn(8, 8);
+
+static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride,
+ unsigned *const var HIGHBD_DECL_SUFFIX)
+{
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ int partial_sum_hv[2][8] = { { 0 } };
+ int partial_sum_diag[2][15] = { { 0 } };
+ int partial_sum_alt[4][11] = { { 0 } };
+
+ for (int y = 0; y < 8; y++) {
+ for (int x = 0; x < 8; x++) {
+ const int px = (img[x] >> bitdepth_min_8) - 128;
+
+ partial_sum_diag[0][ y + x ] += px;
+ partial_sum_alt [0][ y + (x >> 1)] += px;
+ partial_sum_hv [0][ y ] += px;
+ partial_sum_alt [1][3 + y - (x >> 1)] += px;
+ partial_sum_diag[1][7 + y - x ] += px;
+ partial_sum_alt [2][3 - (y >> 1) + x ] += px;
+ partial_sum_hv [1][ x ] += px;
+ partial_sum_alt [3][ (y >> 1) + x ] += px;
+ }
+ img += PXSTRIDE(stride);
+ }
+
+ unsigned cost[8] = { 0 };
+ for (int n = 0; n < 8; n++) {
+ cost[2] += partial_sum_hv[0][n] * partial_sum_hv[0][n];
+ cost[6] += partial_sum_hv[1][n] * partial_sum_hv[1][n];
+ }
+ cost[2] *= 105;
+ cost[6] *= 105;
+
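+    // Each partial sum over an L-pixel line is weighted by 840 / L so that all
+    // directions are compared at the same scale; 105 == 840 / 8 is used for the
+    // full 8-pixel lines.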
+ static const uint16_t div_table[7] = { 840, 420, 280, 210, 168, 140, 120 };
+ for (int n = 0; n < 7; n++) {
+ const int d = div_table[n];
+ cost[0] += (partial_sum_diag[0][n] * partial_sum_diag[0][n] +
+ partial_sum_diag[0][14 - n] * partial_sum_diag[0][14 - n]) * d;
+ cost[4] += (partial_sum_diag[1][n] * partial_sum_diag[1][n] +
+ partial_sum_diag[1][14 - n] * partial_sum_diag[1][14 - n]) * d;
+ }
+ cost[0] += partial_sum_diag[0][7] * partial_sum_diag[0][7] * 105;
+ cost[4] += partial_sum_diag[1][7] * partial_sum_diag[1][7] * 105;
+
+ for (int n = 0; n < 4; n++) {
+ unsigned *const cost_ptr = &cost[n * 2 + 1];
+ for (int m = 0; m < 5; m++)
+ *cost_ptr += partial_sum_alt[n][3 + m] * partial_sum_alt[n][3 + m];
+ *cost_ptr *= 105;
+ for (int m = 0; m < 3; m++) {
+ const int d = div_table[2 * m + 1];
+ *cost_ptr += (partial_sum_alt[n][m] * partial_sum_alt[n][m] +
+ partial_sum_alt[n][10 - m] * partial_sum_alt[n][10 - m]) * d;
+ }
+ }
+
+ int best_dir = 0;
+ unsigned best_cost = cost[0];
+ for (int n = 1; n < 8; n++) {
+ if (cost[n] > best_cost) {
+ best_cost = cost[n];
+ best_dir = n;
+ }
+ }
+
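+    // The variance is the gap between the best direction's cost and that of
+    // the orthogonal direction (best_dir ^ 4).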
+ *var = (best_cost - (cost[best_dir ^ 4])) >> 10;
+ return best_dir;
+}
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/cdef.h"
+#elif ARCH_PPC64LE
+#include "src/ppc/cdef.h"
+#elif ARCH_X86
+#include "src/x86/cdef.h"
+#endif
+#endif
+
+COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
+ c->dir = cdef_find_dir_c;
+ c->fb[0] = cdef_filter_block_8x8_c;
+ c->fb[1] = cdef_filter_block_4x8_c;
+ c->fb[2] = cdef_filter_block_4x4_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ cdef_dsp_init_arm(c);
+#elif ARCH_PPC64LE
+ cdef_dsp_init_ppc(c);
+#elif ARCH_X86
+ cdef_dsp_init_x86(c);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/cdf.c b/third_party/dav1d/src/cdf.c
new file mode 100644
index 0000000000..e0f2132e00
--- /dev/null
+++ b/third_party/dav1d/src/cdf.c
@@ -0,0 +1,4123 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/frame.h"
+
+#include "src/internal.h"
+#include "src/tables.h"
+
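+// CDF values are stored inverted, as 32768 minus the cumulative probability.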
+#define CDF1(x) (32768-(x))
+
+#define CDF2(a,b) \
+ CDF1(a), CDF1(b)
+#define CDF3(a,b,c) \
+ CDF1(a), CDF2(b,c)
+#define CDF4(a,b,c,d) \
+ CDF1(a), CDF3(b,c,d)
+#define CDF5(a,b,c,d,e) \
+ CDF1(a), CDF4(b,c,d,e)
+#define CDF6(a,b,c,d,e,f) \
+ CDF1(a), CDF5(b,c,d,e,f)
+#define CDF7(a,b,c,d,e,f,g) \
+ CDF1(a), CDF6(b,c,d,e,f,g)
+#define CDF8(a,b,c,d,e,f,g,h) \
+ CDF1(a), CDF7(b,c,d,e,f,g,h)
+#define CDF9(a,b,c,d,e,f,g,h,i) \
+ CDF1(a), CDF8(b,c,d,e,f,g,h,i)
+#define CDF10(a,b,c,d,e,f,g,h,i,j) \
+ CDF1(a), CDF9(b,c,d,e,f,g,h,i,j)
+#define CDF11(a,b,c,d,e,f,g,h,i,j,k) \
+ CDF1(a), CDF10(b,c,d,e,f,g,h,i,j,k)
+#define CDF12(a,b,c,d,e,f,g,h,i,j,k,l) \
+ CDF1(a), CDF11(b,c,d,e,f,g,h,i,j,k,l)
+#define CDF13(a,b,c,d,e,f,g,h,i,j,k,l,m) \
+ CDF1(a), CDF12(b,c,d,e,f,g,h,i,j,k,l,m)
+#define CDF14(a,b,c,d,e,f,g,h,i,j,k,l,m,n) \
+ CDF1(a), CDF13(b,c,d,e,f,g,h,i,j,k,l,m,n)
+#define CDF15(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) \
+ CDF1(a), CDF14(b,c,d,e,f,g,h,i,j,k,l,m,n,o)
+
+static const CdfModeContext av1_default_cdf = {
+ .y_mode = {
+ { CDF12(22801, 23489, 24293, 24756, 25601, 26123,
+ 26606, 27418, 27945, 29228, 29685, 30349) },
+ { CDF12(18673, 19845, 22631, 23318, 23950, 24649,
+ 25527, 27364, 28152, 29701, 29984, 30852) },
+ { CDF12(19770, 20979, 23396, 23939, 24241, 24654,
+ 25136, 27073, 27830, 29360, 29730, 30659) },
+ { CDF12(20155, 21301, 22838, 23178, 23261, 23533,
+ 23703, 24804, 25352, 26575, 27016, 28049) },
+ }, .use_filter_intra = {
+ [BS_4x4] = { CDF1( 4621) },
+ [BS_4x8] = { CDF1( 6743) },
+ [BS_8x4] = { CDF1( 5893) },
+ [BS_8x8] = { CDF1( 7866) },
+ [BS_8x16] = { CDF1(12551) },
+ [BS_16x8] = { CDF1( 9394) },
+ [BS_16x16] = { CDF1(12408) },
+ [BS_16x32] = { CDF1(14301) },
+ [BS_32x16] = { CDF1(12756) },
+ [BS_32x32] = { CDF1(22343) },
+ [BS_32x64] = { CDF1(16384) },
+ [BS_64x32] = { CDF1(16384) },
+ [BS_64x64] = { CDF1(16384) },
+ [BS_64x128] = { CDF1(16384) },
+ [BS_128x64] = { CDF1(16384) },
+ [BS_128x128] = { CDF1(16384) },
+ [BS_4x16] = { CDF1(12770) },
+ [BS_16x4] = { CDF1(10368) },
+ [BS_8x32] = { CDF1(20229) },
+ [BS_32x8] = { CDF1(18101) },
+ [BS_16x64] = { CDF1(16384) },
+ [BS_64x16] = { CDF1(16384) },
+ }, .filter_intra = {
+ CDF4(8949, 12776, 17211, 29558),
+ }, .uv_mode = {
+ {
+ { CDF12(22631, 24152, 25378, 25661, 25986, 26520,
+ 27055, 27923, 28244, 30059, 30941, 31961) },
+ { CDF12( 9513, 26881, 26973, 27046, 27118, 27664,
+ 27739, 27824, 28359, 29505, 29800, 31796) },
+ { CDF12( 9845, 9915, 28663, 28704, 28757, 28780,
+ 29198, 29822, 29854, 30764, 31777, 32029) },
+ { CDF12(13639, 13897, 14171, 25331, 25606, 25727,
+ 25953, 27148, 28577, 30612, 31355, 32493) },
+ { CDF12( 9764, 9835, 9930, 9954, 25386, 27053,
+ 27958, 28148, 28243, 31101, 31744, 32363) },
+ { CDF12(11825, 13589, 13677, 13720, 15048, 29213,
+ 29301, 29458, 29711, 31161, 31441, 32550) },
+ { CDF12(14175, 14399, 16608, 16821, 17718, 17775,
+ 28551, 30200, 30245, 31837, 32342, 32667) },
+ { CDF12(12885, 13038, 14978, 15590, 15673, 15748,
+ 16176, 29128, 29267, 30643, 31961, 32461) },
+ { CDF12(12026, 13661, 13874, 15305, 15490, 15726,
+ 15995, 16273, 28443, 30388, 30767, 32416) },
+ { CDF12(19052, 19840, 20579, 20916, 21150, 21467,
+ 21885, 22719, 23174, 28861, 30379, 32175) },
+ { CDF12(18627, 19649, 20974, 21219, 21492, 21816,
+ 22199, 23119, 23527, 27053, 31397, 32148) },
+ { CDF12(17026, 19004, 19997, 20339, 20586, 21103,
+ 21349, 21907, 22482, 25896, 26541, 31819) },
+ { CDF12(12124, 13759, 14959, 14992, 15007, 15051,
+ 15078, 15166, 15255, 15753, 16039, 16606) },
+ }, {
+ { CDF13(10407, 11208, 12900, 13181, 13823, 14175, 14899,
+ 15656, 15986, 20086, 20995, 22455, 24212) },
+ { CDF13( 4532, 19780, 20057, 20215, 20428, 21071, 21199,
+ 21451, 22099, 24228, 24693, 27032, 29472) },
+ { CDF13( 5273, 5379, 20177, 20270, 20385, 20439, 20949,
+ 21695, 21774, 23138, 24256, 24703, 26679) },
+ { CDF13( 6740, 7167, 7662, 14152, 14536, 14785, 15034,
+ 16741, 18371, 21520, 22206, 23389, 24182) },
+ { CDF13( 4987, 5368, 5928, 6068, 19114, 20315, 21857,
+ 22253, 22411, 24911, 25380, 26027, 26376) },
+ { CDF13( 5370, 6889, 7247, 7393, 9498, 21114, 21402,
+ 21753, 21981, 24780, 25386, 26517, 27176) },
+ { CDF13( 4816, 4961, 7204, 7326, 8765, 8930, 20169,
+ 20682, 20803, 23188, 23763, 24455, 24940) },
+ { CDF13( 6608, 6740, 8529, 9049, 9257, 9356, 9735,
+ 18827, 19059, 22336, 23204, 23964, 24793) },
+ { CDF13( 5998, 7419, 7781, 8933, 9255, 9549, 9753,
+ 10417, 18898, 22494, 23139, 24764, 25989) },
+ { CDF13(10660, 11298, 12550, 12957, 13322, 13624, 14040,
+ 15004, 15534, 20714, 21789, 23443, 24861) },
+ { CDF13(10522, 11530, 12552, 12963, 13378, 13779, 14245,
+ 15235, 15902, 20102, 22696, 23774, 25838) },
+ { CDF13(10099, 10691, 12639, 13049, 13386, 13665, 14125,
+ 15163, 15636, 19676, 20474, 23519, 25208) },
+ { CDF13( 3144, 5087, 7382, 7504, 7593, 7690, 7801,
+ 8064, 8232, 9248, 9875, 10521, 29048) },
+ },
+ }, .angle_delta = {
+ { CDF6( 2180, 5032, 7567, 22776, 26989, 30217) },
+ { CDF6( 2301, 5608, 8801, 23487, 26974, 30330) },
+ { CDF6( 3780, 11018, 13699, 19354, 23083, 31286) },
+ { CDF6( 4581, 11226, 15147, 17138, 21834, 28397) },
+ { CDF6( 1737, 10927, 14509, 19588, 22745, 28823) },
+ { CDF6( 2664, 10176, 12485, 17650, 21600, 30495) },
+ { CDF6( 2240, 11096, 15453, 20341, 22561, 28917) },
+ { CDF6( 3605, 10428, 12459, 17676, 21244, 30655) },
+ }, .filter = {
+ {
+ { CDF2(31935, 32720) }, { CDF2( 5568, 32719) },
+ { CDF2( 422, 2938) }, { CDF2(28244, 32608) },
+ { CDF2(31206, 31953) }, { CDF2( 4862, 32121) },
+ { CDF2( 770, 1152) }, { CDF2(20889, 25637) },
+ }, {
+ { CDF2(31910, 32724) }, { CDF2( 4120, 32712) },
+ { CDF2( 305, 2247) }, { CDF2(27403, 32636) },
+ { CDF2(31022, 32009) }, { CDF2( 2963, 32093) },
+ { CDF2( 601, 943) }, { CDF2(14969, 21398) },
+ },
+ }, .newmv_mode = {
+ { CDF1(24035) }, { CDF1(16630) }, { CDF1(15339) },
+ { CDF1( 8386) }, { CDF1(12222) }, { CDF1( 4676) },
+ }, .globalmv_mode = {
+ { CDF1( 2175) }, { CDF1( 1054) },
+ }, .refmv_mode = {
+ { CDF1(23974) }, { CDF1(24188) }, { CDF1(17848) },
+ { CDF1(28622) }, { CDF1(24312) }, { CDF1(19923) },
+ }, .drl_bit = {
+ { CDF1(13104) }, { CDF1(24560) }, { CDF1(18945) },
+ }, .comp_inter_mode = {
+ { CDF7( 7760, 13823, 15808, 17641, 19156, 20666, 26891) },
+ { CDF7(10730, 19452, 21145, 22749, 24039, 25131, 28724) },
+ { CDF7(10664, 20221, 21588, 22906, 24295, 25387, 28436) },
+ { CDF7(13298, 16984, 20471, 24182, 25067, 25736, 26422) },
+ { CDF7(18904, 23325, 25242, 27432, 27898, 28258, 30758) },
+ { CDF7(10725, 17454, 20124, 22820, 24195, 25168, 26046) },
+ { CDF7(17125, 24273, 25814, 27492, 28214, 28704, 30592) },
+ { CDF7(13046, 23214, 24505, 25942, 27435, 28442, 29330) },
+ }, .intra = {
+ { CDF1( 806) }, { CDF1(16662) }, { CDF1(20186) },
+ { CDF1(26538) },
+ }, .comp = {
+ { CDF1(26828) }, { CDF1(24035) }, { CDF1(12031) },
+ { CDF1(10640) }, { CDF1( 2901) },
+ }, .comp_dir = {
+ { CDF1( 1198) }, { CDF1( 2070) }, { CDF1( 9166) },
+ { CDF1( 7499) }, { CDF1(22475) },
+ }, .jnt_comp = {
+ { CDF1(18244) }, { CDF1(12865) }, { CDF1( 7053) },
+ { CDF1(13259) }, { CDF1( 9334) }, { CDF1( 4644) },
+ }, .mask_comp = {
+ { CDF1(26607) }, { CDF1(22891) }, { CDF1(18840) },
+ { CDF1(24594) }, { CDF1(19934) }, { CDF1(22674) },
+ }, .wedge_comp = {
+ { CDF1(23431) }, { CDF1(13171) }, { CDF1(11470) },
+ { CDF1( 9770) }, { CDF1( 9100) }, { CDF1( 8233) },
+ { CDF1( 6172) }, { CDF1(11820) }, { CDF1( 7701) },
+ }, .wedge_idx = {
+ { CDF15( 2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094,
+ 20359, 22362, 24127, 25702, 27752, 29450, 31171) },
+ { CDF15( 806, 3266, 6005, 6738, 7218, 7367, 7771, 14588,
+ 16323, 17367, 18452, 19422, 22839, 26127, 29629) },
+ { CDF15( 2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357,
+ 17939, 21332, 24520, 27470, 29456, 30529, 31656) },
+ { CDF15( 1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144,
+ 19163, 20961, 22884, 24471, 26719, 28714, 30877) },
+ { CDF15( 1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624,
+ 15369, 16730, 18114, 19313, 22521, 26012, 29550) },
+ { CDF15( 2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124,
+ 17270, 20533, 23434, 25972, 27944, 29570, 31416) },
+ { CDF15( 1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944,
+ 20638, 22038, 23963, 25311, 26988, 28766, 31012) },
+ { CDF15( 154, 987, 1925, 2051, 2088, 2111, 2151, 23033,
+ 23703, 24284, 24985, 25684, 27259, 28883, 30911) },
+ { CDF15( 1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016,
+ 22935, 25057, 27251, 29173, 30089, 30960, 31933) },
+ }, .interintra = {
+ { CDF1(16384) }, { CDF1(26887) }, { CDF1(27597) },
+ { CDF1(30237) },
+ }, .interintra_mode = {
+ { CDF3(8192, 16384, 24576) },
+ { CDF3(1875, 11082, 27332) },
+ { CDF3(2473, 9996, 26388) },
+ { CDF3(4238, 11537, 25926) },
+ }, .interintra_wedge = {
+ { CDF1(20036) }, { CDF1(24957) }, { CDF1(26704) },
+ { CDF1(27530) }, { CDF1(29564) }, { CDF1(29444) },
+ { CDF1(26872) },
+ }, .ref = {
+ { { CDF1( 4897) }, { CDF1(16973) }, { CDF1(29744) } },
+ { { CDF1( 1555) }, { CDF1(16751) }, { CDF1(30279) } },
+ { { CDF1( 4236) }, { CDF1(19647) }, { CDF1(31194) } },
+ { { CDF1( 8650) }, { CDF1(24773) }, { CDF1(31895) } },
+ { { CDF1( 904) }, { CDF1(11014) }, { CDF1(26875) } },
+ { { CDF1( 1444) }, { CDF1(15087) }, { CDF1(30304) } },
+ }, .comp_fwd_ref = {
+ { { CDF1( 4946) }, { CDF1(19891) }, { CDF1(30731) } },
+ { { CDF1( 9468) }, { CDF1(22441) }, { CDF1(31059) } },
+ { { CDF1( 1503) }, { CDF1(15160) }, { CDF1(27544) } },
+ }, .comp_bwd_ref = {
+ { { CDF1( 2235) }, { CDF1(17182) }, { CDF1(30606) } },
+ { { CDF1( 1423) }, { CDF1(15175) }, { CDF1(30489) } },
+ }, .comp_uni_ref = {
+ { { CDF1( 5284) }, { CDF1(23152) }, { CDF1(31774) } },
+ { { CDF1( 3865) }, { CDF1(14173) }, { CDF1(25120) } },
+ { { CDF1( 3128) }, { CDF1(15270) }, { CDF1(26710) } },
+ }, .txsz = {
+ {
+ { CDF1(19968) }, { CDF1(19968) }, { CDF1(24320) },
+ }, {
+ { CDF2(12272, 30172) }, { CDF2(12272, 30172) },
+ { CDF2(18677, 30848) },
+ }, {
+ { CDF2(12986, 15180) }, { CDF2(12986, 15180) },
+ { CDF2(24302, 25602) },
+ }, {
+ { CDF2( 5782, 11475) }, { CDF2( 5782, 11475) },
+ { CDF2(16803, 22759) },
+ },
+ }, .txpart = {
+ { { CDF1(28581) }, { CDF1(23846) }, { CDF1(20847) } },
+ { { CDF1(24315) }, { CDF1(18196) }, { CDF1(12133) } },
+ { { CDF1(18791) }, { CDF1(10887) }, { CDF1(11005) } },
+ { { CDF1(27179) }, { CDF1(20004) }, { CDF1(11281) } },
+ { { CDF1(26549) }, { CDF1(19308) }, { CDF1(14224) } },
+ { { CDF1(28015) }, { CDF1(21546) }, { CDF1(14400) } },
+ { { CDF1(28165) }, { CDF1(22401) }, { CDF1(16088) } },
+ }, .txtp_inter1 = {
+ { CDF15( 4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266,
+ 21504, 22848, 23934, 25474, 27727, 28915, 30631) },
+ { CDF15( 1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357,
+ 17674, 20408, 22517, 25010, 27116, 28856, 30749) },
+ }, .txtp_inter2 = {
+ CDF11( 770, 2421, 5225, 12907, 15819, 18927,
+ 21561, 24089, 26595, 28526, 30529)
+ }, .txtp_inter3 = {
+ { CDF1(16384) }, { CDF1( 4167) }, { CDF1( 1998) }, { CDF1( 748) },
+ }, .txtp_intra1 = {
+ {
+ { CDF6( 1535, 8035, 9461, 12751, 23467, 27825) },
+ { CDF6( 564, 3335, 9709, 10870, 18143, 28094) },
+ { CDF6( 672, 3247, 3676, 11982, 19415, 23127) },
+ { CDF6( 5279, 13885, 15487, 18044, 23527, 30252) },
+ { CDF6( 4423, 6074, 7985, 10416, 25693, 29298) },
+ { CDF6( 1486, 4241, 9460, 10662, 16456, 27694) },
+ { CDF6( 439, 2838, 3522, 6737, 18058, 23754) },
+ { CDF6( 1190, 4233, 4855, 11670, 20281, 24377) },
+ { CDF6( 1045, 4312, 8647, 10159, 18644, 29335) },
+ { CDF6( 202, 3734, 4747, 7298, 17127, 24016) },
+ { CDF6( 447, 4312, 6819, 8884, 16010, 23858) },
+ { CDF6( 277, 4369, 5255, 8905, 16465, 22271) },
+ { CDF6( 3409, 5436, 10599, 15599, 19687, 24040) },
+ }, {
+ { CDF6( 1870, 13742, 14530, 16498, 23770, 27698) },
+ { CDF6( 326, 8796, 14632, 15079, 19272, 27486) },
+ { CDF6( 484, 7576, 7712, 14443, 19159, 22591) },
+ { CDF6( 1126, 15340, 15895, 17023, 20896, 30279) },
+ { CDF6( 655, 4854, 5249, 5913, 22099, 27138) },
+ { CDF6( 1299, 6458, 8885, 9290, 14851, 25497) },
+ { CDF6( 311, 5295, 5552, 6885, 16107, 22672) },
+ { CDF6( 883, 8059, 8270, 11258, 17289, 21549) },
+ { CDF6( 741, 7580, 9318, 10345, 16688, 29046) },
+ { CDF6( 110, 7406, 7915, 9195, 16041, 23329) },
+ { CDF6( 363, 7974, 9357, 10673, 15629, 24474) },
+ { CDF6( 153, 7647, 8112, 9936, 15307, 19996) },
+ { CDF6( 3511, 6332, 11165, 15335, 19323, 23594) },
+ },
+ }, .txtp_intra2 = {
+ {
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ }, {
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ }, {
+ { CDF4( 1127, 12814, 22772, 27483) },
+ { CDF4( 145, 6761, 11980, 26667) },
+ { CDF4( 362, 5887, 11678, 16725) },
+ { CDF4( 385, 15213, 18587, 30693) },
+ { CDF4( 25, 2914, 23134, 27903) },
+ { CDF4( 60, 4470, 11749, 23991) },
+ { CDF4( 37, 3332, 14511, 21448) },
+ { CDF4( 157, 6320, 13036, 17439) },
+ { CDF4( 119, 6719, 12906, 29396) },
+ { CDF4( 47, 5537, 12576, 21499) },
+ { CDF4( 269, 6076, 11258, 23115) },
+ { CDF4( 83, 5615, 12001, 17228) },
+ { CDF4( 1968, 5556, 12023, 18547) },
+ },
+ }, .skip = {
+ { CDF1(31671) }, { CDF1(16515) }, { CDF1( 4576) },
+ }, .skip_mode = {
+ { CDF1(32621) }, { CDF1(20708) }, { CDF1( 8127) },
+ }, .partition = {
+ {
+ // 128x128 -> 64x64
+ { CDF7(27899, 28219, 28529, 32484, 32539, 32619, 32639) },
+ { CDF7( 6607, 6990, 8268, 32060, 32219, 32338, 32371) },
+ { CDF7( 5429, 6676, 7122, 32027, 32227, 32531, 32582) },
+ { CDF7( 711, 966, 1172, 32448, 32538, 32617, 32664) },
+ }, {
+ // 64x64 -> 32x32
+ { CDF9(20137, 21547, 23078, 29566, 29837,
+ 30261, 30524, 30892, 31724) },
+ { CDF9( 6732, 7490, 9497, 27944, 28250,
+ 28515, 28969, 29630, 30104) },
+ { CDF9( 5945, 7663, 8348, 28683, 29117,
+ 29749, 30064, 30298, 32238) },
+ { CDF9( 870, 1212, 1487, 31198, 31394,
+ 31574, 31743, 31881, 32332) },
+ }, {
+ // 32x32 -> 16x16
+ { CDF9(18462, 20920, 23124, 27647, 28227,
+ 29049, 29519, 30178, 31544) },
+ { CDF9( 7689, 9060, 12056, 24992, 25660,
+ 26182, 26951, 28041, 29052) },
+ { CDF9( 6015, 9009, 10062, 24544, 25409,
+ 26545, 27071, 27526, 32047) },
+ { CDF9( 1394, 2208, 2796, 28614, 29061,
+ 29466, 29840, 30185, 31899) },
+ }, {
+ // 16x16 -> 8x8
+ { CDF9(15597, 20929, 24571, 26706, 27664,
+ 28821, 29601, 30571, 31902) },
+ { CDF9( 7925, 11043, 16785, 22470, 23971,
+ 25043, 26651, 28701, 29834) },
+ { CDF9( 5414, 13269, 15111, 20488, 22360,
+ 24500, 25537, 26336, 32117) },
+ { CDF9( 2662, 6362, 8614, 20860, 23053,
+ 24778, 26436, 27829, 31171) },
+ }, {
+ // 8x8 -> 4x4 only supports the four legacy partition types
+ { CDF3(19132, 25510, 30392) },
+ { CDF3(13928, 19855, 28540) },
+ { CDF3(12522, 23679, 28629) },
+ { CDF3( 9896, 18783, 25853) },
+ },
+ }, .seg_pred = {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ }, .seg_id = {
+ { CDF7( 5622, 7893, 16093, 18233, 27809, 28373, 32533) },
+ { CDF7(14274, 18230, 22557, 24935, 29980, 30851, 32344) },
+ { CDF7(27527, 28487, 28723, 28890, 32397, 32647, 32679) },
+ }, .cfl_sign = {
+ CDF7( 1418, 2123, 13340, 18405, 26972, 28343, 32294)
+ }, .cfl_alpha = {
+ { CDF15( 7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696,
+ 32700, 32704, 32708, 32712, 32716, 32720, 32724) },
+ { CDF15(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573,
+ 32620, 32647, 32668, 32672, 32676, 32680, 32684) },
+ { CDF15(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649,
+ 32673, 32677, 32681, 32685, 32689, 32693, 32697) },
+ { CDF15(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704,
+ 32708, 32712, 32716, 32720, 32724, 32728, 32732) },
+ { CDF15(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321,
+ 32394, 32464, 32516, 32560, 32576, 32593, 32622) },
+ { CDF15(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843,
+ 32144, 32413, 32520, 32594, 32622, 32656, 32660) },
+ }, .restore_wiener = {
+ CDF1(11570)
+ }, .restore_sgrproj = {
+ CDF1(16855)
+ }, .restore_switchable = {
+ CDF2( 9413, 22581)
+ }, .delta_q = {
+ CDF3(28160, 32120, 32677)
+ }, .delta_lf = {
+ { CDF3(28160, 32120, 32677) },
+ { CDF3(28160, 32120, 32677) },
+ { CDF3(28160, 32120, 32677) },
+ { CDF3(28160, 32120, 32677) },
+ { CDF3(28160, 32120, 32677) },
+ }, .motion_mode = {
+ [BS_8x8] = { CDF2( 7651, 24760) },
+ [BS_8x16] = { CDF2( 4738, 24765) },
+ [BS_8x32] = { CDF2(28799, 31390) },
+ [BS_16x8] = { CDF2( 5391, 25528) },
+ [BS_16x16] = { CDF2(19419, 26810) },
+ [BS_16x32] = { CDF2( 5123, 23606) },
+ [BS_16x64] = { CDF2(28973, 31594) },
+ [BS_32x8] = { CDF2(26431, 30774) },
+ [BS_32x16] = { CDF2(11606, 24308) },
+ [BS_32x32] = { CDF2(26260, 29116) },
+ [BS_32x64] = { CDF2(20360, 28062) },
+ [BS_64x16] = { CDF2(29742, 31203) },
+ [BS_64x32] = { CDF2(21679, 26830) },
+ [BS_64x64] = { CDF2(29516, 30701) },
+ [BS_64x128] = { CDF2(28898, 30397) },
+ [BS_128x64] = { CDF2(30878, 31335) },
+ [BS_128x128] = { CDF2(32507, 32558) },
+ }, .obmc = {
+ [BS_8x8] = { CDF1(10437) },
+ [BS_8x16] = { CDF1( 9371) },
+ [BS_8x32] = { CDF1(23664) },
+ [BS_16x8] = { CDF1( 9301) },
+ [BS_16x16] = { CDF1(17432) },
+ [BS_16x32] = { CDF1(14423) },
+ [BS_16x64] = { CDF1(24008) },
+ [BS_32x8] = { CDF1(20901) },
+ [BS_32x16] = { CDF1(15142) },
+ [BS_32x32] = { CDF1(25817) },
+ [BS_32x64] = { CDF1(22823) },
+ [BS_64x16] = { CDF1(26879) },
+ [BS_64x32] = { CDF1(22083) },
+ [BS_64x64] = { CDF1(30128) },
+ [BS_64x128] = { CDF1(31014) },
+ [BS_128x64] = { CDF1(31560) },
+ [BS_128x128] = { CDF1(32638) },
+ }, .pal_y = {
+ { { CDF1(31676) }, { CDF1( 3419) }, { CDF1( 1261) } },
+ { { CDF1(31912) }, { CDF1( 2859) }, { CDF1( 980) } },
+ { { CDF1(31823) }, { CDF1( 3400) }, { CDF1( 781) } },
+ { { CDF1(32030) }, { CDF1( 3561) }, { CDF1( 904) } },
+ { { CDF1(32309) }, { CDF1( 7337) }, { CDF1( 1462) } },
+ { { CDF1(32265) }, { CDF1( 4015) }, { CDF1( 1521) } },
+ { { CDF1(32450) }, { CDF1( 7946) }, { CDF1( 129) } },
+ }, .pal_sz = {
+ {
+ { CDF6( 7952, 13000, 18149, 21478, 25527, 29241) },
+ { CDF6( 7139, 11421, 16195, 19544, 23666, 28073) },
+ { CDF6( 7788, 12741, 17325, 20500, 24315, 28530) },
+ { CDF6( 8271, 14064, 18246, 21564, 25071, 28533) },
+ { CDF6(12725, 19180, 21863, 24839, 27535, 30120) },
+ { CDF6( 9711, 14888, 16923, 21052, 25661, 27875) },
+ { CDF6(14940, 20797, 21678, 24186, 27033, 28999) },
+ }, {
+ { CDF6( 8713, 19979, 27128, 29609, 31331, 32272) },
+ { CDF6( 5839, 15573, 23581, 26947, 29848, 31700) },
+ { CDF6( 4426, 11260, 17999, 21483, 25863, 29430) },
+ { CDF6( 3228, 9464, 14993, 18089, 22523, 27420) },
+ { CDF6( 3768, 8886, 13091, 17852, 22495, 27207) },
+ { CDF6( 2464, 8451, 12861, 21632, 25525, 28555) },
+ { CDF6( 1269, 5435, 10433, 18963, 21700, 25865) },
+ },
+ }, .pal_uv = {
+ { CDF1(32461) }, { CDF1(21488) },
+ }, .color_map = {
+ { /* y */
+ {
+ { CDF1(28710) }, { CDF1(16384) }, { CDF1(10553) },
+ { CDF1(27036) }, { CDF1(31603) },
+ }, {
+ { CDF2(27877, 30490) }, { CDF2(11532, 25697) },
+ { CDF2( 6544, 30234) }, { CDF2(23018, 28072) },
+ { CDF2(31915, 32385) },
+ }, {
+ { CDF3(25572, 28046, 30045) },
+ { CDF3( 9478, 21590, 27256) },
+ { CDF3( 7248, 26837, 29824) },
+ { CDF3(19167, 24486, 28349) },
+ { CDF3(31400, 31825, 32250) },
+ }, {
+ { CDF4(24779, 26955, 28576, 30282) },
+ { CDF4( 8669, 20364, 24073, 28093) },
+ { CDF4( 4255, 27565, 29377, 31067) },
+ { CDF4(19864, 23674, 26716, 29530) },
+ { CDF4(31646, 31893, 32147, 32426) },
+ }, {
+ { CDF5(23132, 25407, 26970, 28435, 30073) },
+ { CDF5( 7443, 17242, 20717, 24762, 27982) },
+ { CDF5( 6300, 24862, 26944, 28784, 30671) },
+ { CDF5(18916, 22895, 25267, 27435, 29652) },
+ { CDF5(31270, 31550, 31808, 32059, 32353) },
+ }, {
+ { CDF6(23105, 25199, 26464, 27684, 28931, 30318) },
+ { CDF6( 6950, 15447, 18952, 22681, 25567, 28563) },
+ { CDF6( 7560, 23474, 25490, 27203, 28921, 30708) },
+ { CDF6(18544, 22373, 24457, 26195, 28119, 30045) },
+ { CDF6(31198, 31451, 31670, 31882, 32123, 32391) },
+ }, {
+ { CDF7(21689, 23883, 25163, 26352, 27506, 28827, 30195) },
+ { CDF7( 6892, 15385, 17840, 21606, 24287, 26753, 29204) },
+ { CDF7( 5651, 23182, 25042, 26518, 27982, 29392, 30900) },
+ { CDF7(19349, 22578, 24418, 25994, 27524, 29031, 30448) },
+ { CDF7(31028, 31270, 31504, 31705, 31927, 32153, 32392) },
+ },
+ }, { /* uv */
+ {
+ { CDF1(29089) }, { CDF1(16384) }, { CDF1( 8713) },
+ { CDF1(29257) }, { CDF1(31610) },
+ }, {
+ { CDF2(25257, 29145) }, { CDF2(12287, 27293) },
+ { CDF2( 7033, 27960) }, { CDF2(20145, 25405) },
+ { CDF2(30608, 31639) },
+ }, {
+ { CDF3(24210, 27175, 29903) },
+ { CDF3( 9888, 22386, 27214) },
+ { CDF3( 5901, 26053, 29293) },
+ { CDF3(18318, 22152, 28333) },
+ { CDF3(30459, 31136, 31926) },
+ }, {
+ { CDF4(22980, 25479, 27781, 29986) },
+ { CDF4( 8413, 21408, 24859, 28874) },
+ { CDF4( 2257, 29449, 30594, 31598) },
+ { CDF4(19189, 21202, 25915, 28620) },
+ { CDF4(31844, 32044, 32281, 32518) },
+ }, {
+ { CDF5(22217, 24567, 26637, 28683, 30548) },
+ { CDF5( 7307, 16406, 19636, 24632, 28424) },
+ { CDF5( 4441, 25064, 26879, 28942, 30919) },
+ { CDF5(17210, 20528, 23319, 26750, 29582) },
+ { CDF5(30674, 30953, 31396, 31735, 32207) },
+ }, {
+ { CDF6(21239, 23168, 25044, 26962, 28705, 30506) },
+ { CDF6( 6545, 15012, 18004, 21817, 25503, 28701) },
+ { CDF6( 3448, 26295, 27437, 28704, 30126, 31442) },
+ { CDF6(15889, 18323, 21704, 24698, 26976, 29690) },
+ { CDF6(30988, 31204, 31479, 31734, 31983, 32325) },
+ }, {
+ { CDF7(21442, 23288, 24758, 26246, 27649, 28980, 30563) },
+ { CDF7( 5863, 14933, 17552, 20668, 23683, 26411, 29273) },
+ { CDF7( 3415, 25810, 26877, 27990, 29223, 30394, 31618) },
+ { CDF7(17965, 20084, 22232, 23974, 26274, 28402, 30390) },
+ { CDF7(31190, 31329, 31516, 31679, 31825, 32026, 32322) },
+ },
+ },
+ }, .intrabc = {
+ CDF1(30531)
+ },
+};
+
+static const CdfMvComponent default_mv_component_cdf = {
+ .classes = {
+ CDF10(28672, 30976, 31858, 32320, 32551,
+ 32656, 32740, 32757, 32762, 32767)
+ }, .class0 = {
+ CDF1(27648)
+ }, .classN = {
+ { CDF1(17408) }, { CDF1(17920) }, { CDF1(18944) },
+ { CDF1(20480) }, { CDF1(22528) }, { CDF1(24576) },
+ { CDF1(28672) }, { CDF1(29952) }, { CDF1(29952) },
+ { CDF1(30720) },
+ }, .class0_fp = {
+ { CDF3(16384, 24576, 26624) },
+ { CDF3(12288, 21248, 24128) },
+ }, .classN_fp = {
+ CDF3( 8192, 17408, 21248)
+ }, .class0_hp = {
+ CDF1(20480)
+ }, .classN_hp = {
+ CDF1(16384)
+ }, .sign = {
+ CDF1(16384)
+ },
+};
+
+static const uint16_t ALIGN(default_mv_joint_cdf[N_MV_JOINTS], 8) = {
+ CDF3( 4096, 11264, 19328)
+};
+
+static const uint16_t ALIGN(default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 3], 32) = {
+ {
+ { CDF12(15588, 17027, 19338, 20218, 20682, 21110,
+ 21825, 23244, 24189, 28165, 29093, 30466) },
+ { CDF12(12016, 18066, 19516, 20303, 20719, 21444,
+ 21888, 23032, 24434, 28658, 30172, 31409) },
+ { CDF12(10052, 10771, 22296, 22788, 23055, 23239,
+ 24133, 25620, 26160, 29336, 29929, 31567) },
+ { CDF12(14091, 15406, 16442, 18808, 19136, 19546,
+ 19998, 22096, 24746, 29585, 30958, 32462) },
+ { CDF12(12122, 13265, 15603, 16501, 18609, 20033,
+ 22391, 25583, 26437, 30261, 31073, 32475) },
+ }, {
+ { CDF12(10023, 19585, 20848, 21440, 21832, 22760,
+ 23089, 24023, 25381, 29014, 30482, 31436) },
+ { CDF12( 5983, 24099, 24560, 24886, 25066, 25795,
+ 25913, 26423, 27610, 29905, 31276, 31794) },
+ { CDF12( 7444, 12781, 20177, 20728, 21077, 21607,
+ 22170, 23405, 24469, 27915, 29090, 30492) },
+ { CDF12( 8537, 14689, 15432, 17087, 17408, 18172,
+ 18408, 19825, 24649, 29153, 31096, 32210) },
+ { CDF12( 7543, 14231, 15496, 16195, 17905, 20717,
+ 21984, 24516, 26001, 29675, 30981, 31994) },
+ }, {
+ { CDF12(12613, 13591, 21383, 22004, 22312, 22577,
+ 23401, 25055, 25729, 29538, 30305, 32077) },
+ { CDF12( 9687, 13470, 18506, 19230, 19604, 20147,
+ 20695, 22062, 23219, 27743, 29211, 30907) },
+ { CDF12( 6183, 6505, 26024, 26252, 26366, 26434,
+ 27082, 28354, 28555, 30467, 30794, 32086) },
+ { CDF12(10718, 11734, 14954, 17224, 17565, 17924,
+ 18561, 21523, 23878, 28975, 30287, 32252) },
+ { CDF12( 9194, 9858, 16501, 17263, 18424, 19171,
+ 21563, 25961, 26561, 30072, 30737, 32463) },
+ }, {
+ { CDF12(12602, 14399, 15488, 18381, 18778, 19315,
+ 19724, 21419, 25060, 29696, 30917, 32409) },
+ { CDF12( 8203, 13821, 14524, 17105, 17439, 18131,
+ 18404, 19468, 25225, 29485, 31158, 32342) },
+ { CDF12( 8451, 9731, 15004, 17643, 18012, 18425,
+ 19070, 21538, 24605, 29118, 30078, 32018) },
+ { CDF12( 7714, 9048, 9516, 16667, 16817, 16994,
+ 17153, 18767, 26743, 30389, 31536, 32528) },
+ { CDF12( 8843, 10280, 11496, 15317, 16652, 17943,
+ 19108, 22718, 25769, 29953, 30983, 32485) },
+ }, {
+ { CDF12(12578, 13671, 15979, 16834, 19075, 20913,
+ 22989, 25449, 26219, 30214, 31150, 32477) },
+ { CDF12( 9563, 13626, 15080, 15892, 17756, 20863,
+ 22207, 24236, 25380, 29653, 31143, 32277) },
+ { CDF12( 8356, 8901, 17616, 18256, 19350, 20106,
+ 22598, 25947, 26466, 29900, 30523, 32261) },
+ { CDF12(10835, 11815, 13124, 16042, 17018, 18039,
+ 18947, 22753, 24615, 29489, 30883, 32482) },
+ { CDF12( 7618, 8288, 9859, 10509, 15386, 18657,
+ 22903, 28776, 29180, 31355, 31802, 32593) },
+ },
+};
+
+static const CdfCoefContext av1_default_coef_cdf[4] = {
+ [0] = {
+ .skip = {
+ {
+ { CDF1(31849) }, { CDF1( 5892) }, { CDF1(12112) },
+ { CDF1(21935) }, { CDF1(20289) }, { CDF1(27473) },
+ { CDF1(32487) }, { CDF1( 7654) }, { CDF1(19473) },
+ { CDF1(29984) }, { CDF1( 9961) }, { CDF1(30242) },
+ { CDF1(32117) },
+ }, {
+ { CDF1(31548) }, { CDF1( 1549) }, { CDF1(10130) },
+ { CDF1(16656) }, { CDF1(18591) }, { CDF1(26308) },
+ { CDF1(32537) }, { CDF1( 5403) }, { CDF1(18096) },
+ { CDF1(30003) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(29957) }, { CDF1( 5391) }, { CDF1(18039) },
+ { CDF1(23566) }, { CDF1(22431) }, { CDF1(25822) },
+ { CDF1(32197) }, { CDF1( 3778) }, { CDF1(15336) },
+ { CDF1(28981) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(17920) }, { CDF1( 1818) }, { CDF1( 7282) },
+ { CDF1(25273) }, { CDF1(10923) }, { CDF1(31554) },
+ { CDF1(32624) }, { CDF1( 1366) }, { CDF1(15628) },
+ { CDF1(30462) }, { CDF1( 146) }, { CDF1( 5132) },
+ { CDF1(31657) },
+ }, {
+ { CDF1( 6308) }, { CDF1( 117) }, { CDF1( 1638) },
+ { CDF1( 2161) }, { CDF1(16384) }, { CDF1(10923) },
+ { CDF1(30247) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ },
+ }, .eob_bin_16 = {
+ {
+ { CDF4( 840, 1039, 1980, 4895) },
+ { CDF4( 370, 671, 1883, 4471) },
+ }, {
+ { CDF4( 3247, 4950, 9688, 14563) },
+ { CDF4( 1904, 3354, 7763, 14647) },
+ },
+ }, .eob_bin_32 = {
+ {
+ { CDF5( 400, 520, 977, 2102, 6542) },
+ { CDF5( 210, 405, 1315, 3326, 7537) },
+ }, {
+ { CDF5( 2636, 4273, 7588, 11794, 20401) },
+ { CDF5( 1786, 3179, 6902, 11357, 19054) },
+ },
+ }, .eob_bin_64 = {
+ {
+ { CDF6( 329, 498, 1101, 1784, 3265, 7758) },
+ { CDF6( 335, 730, 1459, 5494, 8755, 12997) },
+ }, {
+ { CDF6( 3505, 5304, 10086, 13814, 17684, 23370) },
+ { CDF6( 1563, 2700, 4876, 10911, 14706, 22480) },
+ },
+ }, .eob_bin_128 = {
+ {
+ { CDF7( 219, 482, 1140, 2091, 3680, 6028, 12586) },
+ { CDF7( 371, 699, 1254, 4830, 9479, 12562, 17497) },
+ }, {
+ { CDF7( 5245, 7456, 12880, 15852, 20033, 23932, 27608) },
+ { CDF7( 2054, 3472, 5869, 14232, 18242, 20590, 26752) },
+ },
+ }, .eob_bin_256 = {
+ {
+ { CDF8( 310, 584, 1887, 3589,
+ 6168, 8611, 11352, 15652) },
+ { CDF8( 998, 1850, 2998, 5604,
+ 17341, 19888, 22899, 25583) },
+ }, {
+ { CDF8( 2520, 3240, 5952, 8870,
+ 12577, 17558, 19954, 24168) },
+ { CDF8( 2203, 4130, 7435, 10739,
+ 20652, 23681, 25609, 27261) },
+ },
+ }, .eob_bin_512 = {
+ { CDF9( 641, 983, 3707, 5430, 10234,
+ 14958, 18788, 23412, 26061) },
+ { CDF9( 5095, 6446, 9996, 13354, 16017,
+ 17986, 20919, 26129, 29140) },
+ }, .eob_bin_1024 = {
+ { CDF10( 393, 421, 751, 1623, 3160,
+ 6352, 13345, 18047, 22571, 25830) },
+ { CDF10( 1865, 1988, 2930, 4242, 10533,
+ 16538, 21354, 27255, 28546, 31784) },
+ }, .eob_hi_bit = {
+ {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16961) },
+ { CDF1(17223) }, { CDF1( 7621) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(19069) },
+ { CDF1(22525) }, { CDF1(13377) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20401) },
+ { CDF1(17025) }, { CDF1(12845) }, { CDF1(12873) },
+ { CDF1(14094) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20681) },
+ { CDF1(20701) }, { CDF1(15250) }, { CDF1(15017) },
+ { CDF1(14928) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(23905) },
+ { CDF1(17194) }, { CDF1(16170) }, { CDF1(17695) },
+ { CDF1(13826) }, { CDF1(15810) }, { CDF1(12036) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(23959) },
+ { CDF1(20799) }, { CDF1(19021) }, { CDF1(16203) },
+ { CDF1(17886) }, { CDF1(14144) }, { CDF1(12010) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(27399) },
+ { CDF1(16327) }, { CDF1(18071) }, { CDF1(19584) },
+ { CDF1(20721) }, { CDF1(18432) }, { CDF1(19560) },
+ { CDF1(10150) }, { CDF1( 8805) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(24932) },
+ { CDF1(20833) }, { CDF1(12027) }, { CDF1(16670) },
+ { CDF1(19914) }, { CDF1(15106) }, { CDF1(17662) },
+ { CDF1(13783) }, { CDF1(28756) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(23406) },
+ { CDF1(21845) }, { CDF1(18432) }, { CDF1(16384) },
+ { CDF1(17096) }, { CDF1(12561) }, { CDF1(17320) },
+ { CDF1(22395) }, { CDF1(21370) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ },
+ }, .eob_base_tok = {
+ {
+ {
+ { CDF2(17837, 29055) }, { CDF2(29600, 31446) },
+ { CDF2(30844, 31878) }, { CDF2(24926, 28948) },
+ }, {
+ { CDF2(21365, 30026) }, { CDF2(30512, 32423) },
+ { CDF2(31658, 32621) }, { CDF2(29630, 31881) },
+ },
+ }, {
+ {
+ { CDF2( 5717, 26477) }, { CDF2(30491, 31703) },
+ { CDF2(31550, 32158) }, { CDF2(29648, 31491) },
+ }, {
+ { CDF2(12608, 27820) }, { CDF2(30680, 32225) },
+ { CDF2(30809, 32335) }, { CDF2(31299, 32423) },
+ },
+ }, {
+ {
+ { CDF2( 1786, 12612) }, { CDF2(30663, 31625) },
+ { CDF2(32339, 32468) }, { CDF2(31148, 31833) },
+ }, {
+ { CDF2(18857, 23865) }, { CDF2(31428, 32428) },
+ { CDF2(31744, 32373) }, { CDF2(31775, 32526) },
+ },
+ }, {
+ {
+ { CDF2( 1787, 2532) }, { CDF2(30832, 31662) },
+ { CDF2(31824, 32682) }, { CDF2(32133, 32569) },
+ }, {
+ { CDF2(13751, 22235) }, { CDF2(32089, 32409) },
+ { CDF2(27084, 27920) }, { CDF2(29291, 32594) },
+ },
+ }, {
+ {
+ { CDF2( 1725, 3449) }, { CDF2(31102, 31935) },
+ { CDF2(32457, 32613) }, { CDF2(32412, 32649) },
+ }, {
+ { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+ { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+ },
+ },
+ }, .base_tok = {
+ {
+ {
+ { CDF3( 4034, 8930, 12727) },
+ { CDF3(18082, 29741, 31877) },
+ { CDF3(12596, 26124, 30493) },
+ { CDF3( 9446, 21118, 27005) },
+ { CDF3( 6308, 15141, 21279) },
+ { CDF3( 2463, 6357, 9783) },
+ { CDF3(20667, 30546, 31929) },
+ { CDF3(13043, 26123, 30134) },
+ { CDF3( 8151, 18757, 24778) },
+ { CDF3( 5255, 12839, 18632) },
+ { CDF3( 2820, 7206, 11161) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3(15736, 27553, 30604) },
+ { CDF3(11210, 23794, 28787) },
+ { CDF3( 5947, 13874, 19701) },
+ { CDF3( 4215, 9323, 13891) },
+ { CDF3( 2833, 6462, 10059) },
+ { CDF3(19605, 30393, 31582) },
+ { CDF3(13523, 26252, 30248) },
+ { CDF3( 8446, 18622, 24512) },
+ { CDF3( 3818, 10343, 15974) },
+ { CDF3( 1481, 4117, 6796) },
+ { CDF3(22649, 31302, 32190) },
+ { CDF3(14829, 27127, 30449) },
+ { CDF3( 8313, 17702, 23304) },
+ { CDF3( 3022, 8301, 12786) },
+ { CDF3( 1536, 4412, 7184) },
+ { CDF3(22354, 29774, 31372) },
+ { CDF3(14723, 25472, 29214) },
+ { CDF3( 6673, 13745, 18662) },
+ { CDF3( 2068, 5766, 9322) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 6302, 16444, 21761) },
+ { CDF3(23040, 31538, 32475) },
+ { CDF3(15196, 28452, 31496) },
+ { CDF3(10020, 22946, 28514) },
+ { CDF3( 6533, 16862, 23501) },
+ { CDF3( 3538, 9816, 15076) },
+ { CDF3(24444, 31875, 32525) },
+ { CDF3(15881, 28924, 31635) },
+ { CDF3( 9922, 22873, 28466) },
+ { CDF3( 6527, 16966, 23691) },
+ { CDF3( 4114, 11303, 17220) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3(20201, 30770, 32209) },
+ { CDF3(14754, 28071, 31258) },
+ { CDF3( 8378, 20186, 26517) },
+ { CDF3( 5916, 15299, 21978) },
+ { CDF3( 4268, 11583, 17901) },
+ { CDF3(24361, 32025, 32581) },
+ { CDF3(18673, 30105, 31943) },
+ { CDF3(10196, 22244, 27576) },
+ { CDF3( 5495, 14349, 20417) },
+ { CDF3( 2676, 7415, 11498) },
+ { CDF3(24678, 31958, 32585) },
+ { CDF3(18629, 29906, 31831) },
+ { CDF3( 9364, 20724, 26315) },
+ { CDF3( 4641, 12318, 18094) },
+ { CDF3( 2758, 7387, 11579) },
+ { CDF3(25433, 31842, 32469) },
+ { CDF3(18795, 29289, 31411) },
+ { CDF3( 7644, 17584, 23592) },
+ { CDF3( 3408, 9014, 15047) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ }, {
+ {
+ { CDF3( 4536, 10072, 14001) },
+ { CDF3(25459, 31416, 32206) },
+ { CDF3(16605, 28048, 30818) },
+ { CDF3(11008, 22857, 27719) },
+ { CDF3( 6915, 16268, 22315) },
+ { CDF3( 2625, 6812, 10537) },
+ { CDF3(24257, 31788, 32499) },
+ { CDF3(16880, 29454, 31879) },
+ { CDF3(11958, 25054, 29778) },
+ { CDF3( 7916, 18718, 25084) },
+ { CDF3( 3383, 8777, 13446) },
+ { CDF3(22720, 31603, 32393) },
+ { CDF3(14960, 28125, 31335) },
+ { CDF3( 9731, 22210, 27928) },
+ { CDF3( 6304, 15832, 22277) },
+ { CDF3( 2910, 7818, 12166) },
+ { CDF3(20375, 30627, 32131) },
+ { CDF3(13904, 27284, 30887) },
+ { CDF3( 9368, 21558, 27144) },
+ { CDF3( 5937, 14966, 21119) },
+ { CDF3( 2667, 7225, 11319) },
+ { CDF3(23970, 31470, 32378) },
+ { CDF3(17173, 29734, 32018) },
+ { CDF3(12795, 25441, 29965) },
+ { CDF3( 8981, 19680, 25893) },
+ { CDF3( 4728, 11372, 16902) },
+ { CDF3(24287, 31797, 32439) },
+ { CDF3(16703, 29145, 31696) },
+ { CDF3(10833, 23554, 28725) },
+ { CDF3( 6468, 16566, 23057) },
+ { CDF3( 2415, 6562, 10278) },
+ { CDF3(26610, 32395, 32659) },
+ { CDF3(18590, 30498, 32117) },
+ { CDF3(12420, 25756, 29950) },
+ { CDF3( 7639, 18746, 24710) },
+ { CDF3( 3001, 8086, 12347) },
+ { CDF3(25076, 32064, 32580) },
+ { CDF3(17946, 30128, 32028) },
+ { CDF3(12024, 24985, 29378) },
+ { CDF3( 7517, 18390, 24304) },
+ { CDF3( 3243, 8781, 13331) },
+ }, {
+ { CDF3( 6037, 16771, 21957) },
+ { CDF3(24774, 31704, 32426) },
+ { CDF3(16830, 28589, 31056) },
+ { CDF3(10602, 22828, 27760) },
+ { CDF3( 6733, 16829, 23071) },
+ { CDF3( 3250, 8914, 13556) },
+ { CDF3(25582, 32220, 32668) },
+ { CDF3(18659, 30342, 32223) },
+ { CDF3(12546, 26149, 30515) },
+ { CDF3( 8420, 20451, 26801) },
+ { CDF3( 4636, 12420, 18344) },
+ { CDF3(27581, 32362, 32639) },
+ { CDF3(18987, 30083, 31978) },
+ { CDF3(11327, 24248, 29084) },
+ { CDF3( 7264, 17719, 24120) },
+ { CDF3( 3995, 10768, 16169) },
+ { CDF3(25893, 31831, 32487) },
+ { CDF3(16577, 28587, 31379) },
+ { CDF3(10189, 22748, 28182) },
+ { CDF3( 6832, 17094, 23556) },
+ { CDF3( 3708, 10110, 15334) },
+ { CDF3(25904, 32282, 32656) },
+ { CDF3(19721, 30792, 32276) },
+ { CDF3(12819, 26243, 30411) },
+ { CDF3( 8572, 20614, 26891) },
+ { CDF3( 5364, 14059, 20467) },
+ { CDF3(26580, 32438, 32677) },
+ { CDF3(20852, 31225, 32340) },
+ { CDF3(12435, 25700, 29967) },
+ { CDF3( 8691, 20825, 26976) },
+ { CDF3( 4446, 12209, 17269) },
+ { CDF3(27350, 32429, 32696) },
+ { CDF3(21372, 30977, 32272) },
+ { CDF3(12673, 25270, 29853) },
+ { CDF3( 9208, 20925, 26640) },
+ { CDF3( 5018, 13351, 18732) },
+ { CDF3(27351, 32479, 32713) },
+ { CDF3(21398, 31209, 32387) },
+ { CDF3(12162, 25047, 29842) },
+ { CDF3( 7896, 18691, 25319) },
+ { CDF3( 4670, 12882, 18881) },
+ },
+ }, {
+ {
+ { CDF3( 5487, 10460, 13708) },
+ { CDF3(21597, 28303, 30674) },
+ { CDF3(11037, 21953, 26476) },
+ { CDF3( 8147, 17962, 22952) },
+ { CDF3( 5242, 13061, 18532) },
+ { CDF3( 1889, 5208, 8182) },
+ { CDF3(26774, 32133, 32590) },
+ { CDF3(17844, 29564, 31767) },
+ { CDF3(11690, 24438, 29171) },
+ { CDF3( 7542, 18215, 24459) },
+ { CDF3( 2993, 8050, 12319) },
+ { CDF3(28023, 32328, 32591) },
+ { CDF3(18651, 30126, 31954) },
+ { CDF3(12164, 25146, 29589) },
+ { CDF3( 7762, 18530, 24771) },
+ { CDF3( 3492, 9183, 13920) },
+ { CDF3(27591, 32008, 32491) },
+ { CDF3(17149, 28853, 31510) },
+ { CDF3(11485, 24003, 28860) },
+ { CDF3( 7697, 18086, 24210) },
+ { CDF3( 3075, 7999, 12218) },
+ { CDF3(28268, 32482, 32654) },
+ { CDF3(19631, 31051, 32404) },
+ { CDF3(13860, 27260, 31020) },
+ { CDF3( 9605, 21613, 27594) },
+ { CDF3( 4876, 12162, 17908) },
+ { CDF3(27248, 32316, 32576) },
+ { CDF3(18955, 30457, 32075) },
+ { CDF3(11824, 23997, 28795) },
+ { CDF3( 7346, 18196, 24647) },
+ { CDF3( 3403, 9247, 14111) },
+ { CDF3(29711, 32655, 32735) },
+ { CDF3(21169, 31394, 32417) },
+ { CDF3(13487, 27198, 30957) },
+ { CDF3( 8828, 21683, 27614) },
+ { CDF3( 4270, 11451, 17038) },
+ { CDF3(28708, 32578, 32731) },
+ { CDF3(20120, 31241, 32482) },
+ { CDF3(13692, 27550, 31321) },
+ { CDF3( 9418, 22514, 28439) },
+ { CDF3( 4999, 13283, 19462) },
+ }, {
+ { CDF3( 5673, 14302, 19711) },
+ { CDF3(26251, 30701, 31834) },
+ { CDF3(12782, 23783, 27803) },
+ { CDF3( 9127, 20657, 25808) },
+ { CDF3( 6368, 16208, 21462) },
+ { CDF3( 2465, 7177, 10822) },
+ { CDF3(29961, 32563, 32719) },
+ { CDF3(18318, 29891, 31949) },
+ { CDF3(11361, 24514, 29357) },
+ { CDF3( 7900, 19603, 25607) },
+ { CDF3( 4002, 10590, 15546) },
+ { CDF3(29637, 32310, 32595) },
+ { CDF3(18296, 29913, 31809) },
+ { CDF3(10144, 21515, 26871) },
+ { CDF3( 5358, 14322, 20394) },
+ { CDF3( 3067, 8362, 13346) },
+ { CDF3(28652, 32470, 32676) },
+ { CDF3(17538, 30771, 32209) },
+ { CDF3(13924, 26882, 30494) },
+ { CDF3(10496, 22837, 27869) },
+ { CDF3( 7236, 16396, 21621) },
+ { CDF3(30743, 32687, 32746) },
+ { CDF3(23006, 31676, 32489) },
+ { CDF3(14494, 27828, 31120) },
+ { CDF3(10174, 22801, 28352) },
+ { CDF3( 6242, 15281, 21043) },
+ { CDF3(25817, 32243, 32720) },
+ { CDF3(18618, 31367, 32325) },
+ { CDF3(13997, 28318, 31878) },
+ { CDF3(12255, 26534, 31383) },
+ { CDF3( 9561, 21588, 28450) },
+ { CDF3(28188, 32635, 32724) },
+ { CDF3(22060, 32365, 32728) },
+ { CDF3(18102, 30690, 32528) },
+ { CDF3(14196, 28864, 31999) },
+ { CDF3(12262, 25792, 30865) },
+ { CDF3(24176, 32109, 32628) },
+ { CDF3(18280, 29681, 31963) },
+ { CDF3(10205, 23703, 29664) },
+ { CDF3( 7889, 20025, 27676) },
+ { CDF3( 6060, 16743, 23970) },
+ },
+ }, {
+ {
+ { CDF3( 5141, 7096, 8260) },
+ { CDF3(27186, 29022, 29789) },
+ { CDF3( 6668, 12568, 15682) },
+ { CDF3( 2172, 6181, 8638) },
+ { CDF3( 1126, 3379, 4531) },
+ { CDF3( 443, 1361, 2254) },
+ { CDF3(26083, 31153, 32436) },
+ { CDF3(13486, 24603, 28483) },
+ { CDF3( 6508, 14840, 19910) },
+ { CDF3( 3386, 8800, 13286) },
+ { CDF3( 1530, 4322, 7054) },
+ { CDF3(29639, 32080, 32548) },
+ { CDF3(15897, 27552, 30290) },
+ { CDF3( 8588, 20047, 25383) },
+ { CDF3( 4889, 13339, 19269) },
+ { CDF3( 2240, 6871, 10498) },
+ { CDF3(28165, 32197, 32517) },
+ { CDF3(20735, 30427, 31568) },
+ { CDF3(14325, 24671, 27692) },
+ { CDF3( 5119, 12554, 17805) },
+ { CDF3( 1810, 5441, 8261) },
+ { CDF3(31212, 32724, 32748) },
+ { CDF3(23352, 31766, 32545) },
+ { CDF3(14669, 27570, 31059) },
+ { CDF3( 8492, 20894, 27272) },
+ { CDF3( 3644, 10194, 15204) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 2461, 7013, 9371) },
+ { CDF3(24749, 29600, 30986) },
+ { CDF3( 9466, 19037, 22417) },
+ { CDF3( 3584, 9280, 14400) },
+ { CDF3( 1505, 3929, 5433) },
+ { CDF3( 677, 1500, 2736) },
+ { CDF3(23987, 30702, 32117) },
+ { CDF3(13554, 24571, 29263) },
+ { CDF3( 6211, 14556, 21155) },
+ { CDF3( 3135, 10972, 15625) },
+ { CDF3( 2435, 7127, 11427) },
+ { CDF3(31300, 32532, 32550) },
+ { CDF3(14757, 30365, 31954) },
+ { CDF3( 4405, 11612, 18553) },
+ { CDF3( 580, 4132, 7322) },
+ { CDF3( 1695, 10169, 14124) },
+ { CDF3(30008, 32282, 32591) },
+ { CDF3(19244, 30108, 31748) },
+ { CDF3(11180, 24158, 29555) },
+ { CDF3( 5650, 14972, 19209) },
+ { CDF3( 2114, 5109, 8456) },
+ { CDF3(31856, 32716, 32748) },
+ { CDF3(23012, 31664, 32572) },
+ { CDF3(13694, 26656, 30636) },
+ { CDF3( 8142, 19508, 26093) },
+ { CDF3( 4253, 10955, 16724) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ }, {
+ {
+ { CDF3( 601, 983, 1311) },
+ { CDF3(18725, 23406, 28087) },
+ { CDF3( 5461, 8192, 10923) },
+ { CDF3( 3781, 15124, 21425) },
+ { CDF3( 2587, 7761, 12072) },
+ { CDF3( 106, 458, 810) },
+ { CDF3(22282, 29710, 31894) },
+ { CDF3( 8508, 20926, 25984) },
+ { CDF3( 3726, 12713, 18083) },
+ { CDF3( 1620, 7112, 10893) },
+ { CDF3( 729, 2236, 3495) },
+ { CDF3(30163, 32474, 32684) },
+ { CDF3(18304, 30464, 32000) },
+ { CDF3(11443, 26526, 29647) },
+ { CDF3( 6007, 15292, 21299) },
+ { CDF3( 2234, 6703, 8937) },
+ { CDF3(30954, 32177, 32571) },
+ { CDF3(17363, 29562, 31076) },
+ { CDF3( 9686, 22464, 27410) },
+ { CDF3( 8192, 16384, 21390) },
+ { CDF3( 1755, 8046, 11264) },
+ { CDF3(31168, 32734, 32748) },
+ { CDF3(22486, 31441, 32471) },
+ { CDF3(12833, 25627, 29738) },
+ { CDF3( 6980, 17379, 23122) },
+ { CDF3( 3111, 8887, 13479) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ },
+ }, .dc_sign = {
+ { { CDF1(16000) }, { CDF1(13056) }, { CDF1(18816) } },
+ { { CDF1(15232) }, { CDF1(12928) }, { CDF1(17280) } },
+ }, .br_tok = {
+ {
+ {
+ { CDF3(14298, 20718, 24174) },
+ { CDF3(12536, 19601, 23789) },
+ { CDF3( 8712, 15051, 19503) },
+ { CDF3( 6170, 11327, 15434) },
+ { CDF3( 4742, 8926, 12538) },
+ { CDF3( 3803, 7317, 10546) },
+ { CDF3( 1696, 3317, 4871) },
+ { CDF3(14392, 19951, 22756) },
+ { CDF3(15978, 23218, 26818) },
+ { CDF3(12187, 19474, 23889) },
+ { CDF3( 9176, 15640, 20259) },
+ { CDF3( 7068, 12655, 17028) },
+ { CDF3( 5656, 10442, 14472) },
+ { CDF3( 2580, 4992, 7244) },
+ { CDF3(12136, 18049, 21426) },
+ { CDF3(13784, 20721, 24481) },
+ { CDF3(10836, 17621, 21900) },
+ { CDF3( 8372, 14444, 18847) },
+ { CDF3( 6523, 11779, 16000) },
+ { CDF3( 5337, 9898, 13760) },
+ { CDF3( 3034, 5860, 8462) },
+ }, {
+ { CDF3(15967, 22905, 26286) },
+ { CDF3(13534, 20654, 24579) },
+ { CDF3( 9504, 16092, 20535) },
+ { CDF3( 6975, 12568, 16903) },
+ { CDF3( 5364, 10091, 14020) },
+ { CDF3( 4357, 8370, 11857) },
+ { CDF3( 2506, 4934, 7218) },
+ { CDF3(23032, 28815, 30936) },
+ { CDF3(19540, 26704, 29719) },
+ { CDF3(15158, 22969, 27097) },
+ { CDF3(11408, 18865, 23650) },
+ { CDF3( 8885, 15448, 20250) },
+ { CDF3( 7108, 12853, 17416) },
+ { CDF3( 4231, 8041, 11480) },
+ { CDF3(19823, 26490, 29156) },
+ { CDF3(18890, 25929, 28932) },
+ { CDF3(15660, 23491, 27433) },
+ { CDF3(12147, 19776, 24488) },
+ { CDF3( 9728, 16774, 21649) },
+ { CDF3( 7919, 14277, 19066) },
+ { CDF3( 5440, 10170, 14185) },
+ },
+ }, {
+ {
+ { CDF3(14406, 20862, 24414) },
+ { CDF3(11824, 18907, 23109) },
+ { CDF3( 8257, 14393, 18803) },
+ { CDF3( 5860, 10747, 14778) },
+ { CDF3( 4475, 8486, 11984) },
+ { CDF3( 3606, 6954, 10043) },
+ { CDF3( 1736, 3410, 5048) },
+ { CDF3(14430, 20046, 22882) },
+ { CDF3(15593, 22899, 26709) },
+ { CDF3(12102, 19368, 23811) },
+ { CDF3( 9059, 15584, 20262) },
+ { CDF3( 6999, 12603, 17048) },
+ { CDF3( 5684, 10497, 14553) },
+ { CDF3( 2822, 5438, 7862) },
+ { CDF3(15785, 21585, 24359) },
+ { CDF3(18347, 25229, 28266) },
+ { CDF3(14974, 22487, 26389) },
+ { CDF3(11423, 18681, 23271) },
+ { CDF3( 8863, 15350, 20008) },
+ { CDF3( 7153, 12852, 17278) },
+ { CDF3( 3707, 7036, 9982) },
+ }, {
+ { CDF3(15460, 21696, 25469) },
+ { CDF3(12170, 19249, 23191) },
+ { CDF3( 8723, 15027, 19332) },
+ { CDF3( 6428, 11704, 15874) },
+ { CDF3( 4922, 9292, 13052) },
+ { CDF3( 4139, 7695, 11010) },
+ { CDF3( 2291, 4508, 6598) },
+ { CDF3(19856, 26920, 29828) },
+ { CDF3(17923, 25289, 28792) },
+ { CDF3(14278, 21968, 26297) },
+ { CDF3(10910, 18136, 22950) },
+ { CDF3( 8423, 14815, 19627) },
+ { CDF3( 6771, 12283, 16774) },
+ { CDF3( 4074, 7750, 11081) },
+ { CDF3(19852, 26074, 28672) },
+ { CDF3(19371, 26110, 28989) },
+ { CDF3(16265, 23873, 27663) },
+ { CDF3(12758, 20378, 24952) },
+ { CDF3(10095, 17098, 21961) },
+ { CDF3( 8250, 14628, 19451) },
+ { CDF3( 5205, 9745, 13622) },
+ },
+ }, {
+ {
+ { CDF3(10563, 16233, 19763) },
+ { CDF3( 9794, 16022, 19804) },
+ { CDF3( 6750, 11945, 15759) },
+ { CDF3( 4963, 9186, 12752) },
+ { CDF3( 3845, 7435, 10627) },
+ { CDF3( 3051, 6085, 8834) },
+ { CDF3( 1311, 2596, 3830) },
+ { CDF3(11246, 16404, 19689) },
+ { CDF3(12315, 18911, 22731) },
+ { CDF3(10557, 17095, 21289) },
+ { CDF3( 8136, 14006, 18249) },
+ { CDF3( 6348, 11474, 15565) },
+ { CDF3( 5196, 9655, 13400) },
+ { CDF3( 2349, 4526, 6587) },
+ { CDF3(13337, 18730, 21569) },
+ { CDF3(19306, 26071, 28882) },
+ { CDF3(15952, 23540, 27254) },
+ { CDF3(12409, 19934, 24430) },
+ { CDF3( 9760, 16706, 21389) },
+ { CDF3( 8004, 14220, 18818) },
+ { CDF3( 4138, 7794, 10961) },
+ }, {
+ { CDF3(10870, 16684, 20949) },
+ { CDF3( 9664, 15230, 18680) },
+ { CDF3( 6886, 12109, 15408) },
+ { CDF3( 4825, 8900, 12305) },
+ { CDF3( 3630, 7162, 10314) },
+ { CDF3( 3036, 6429, 9387) },
+ { CDF3( 1671, 3296, 4940) },
+ { CDF3(13819, 19159, 23026) },
+ { CDF3(11984, 19108, 23120) },
+ { CDF3(10690, 17210, 21663) },
+ { CDF3( 7984, 14154, 18333) },
+ { CDF3( 6868, 12294, 16124) },
+ { CDF3( 5274, 8994, 12868) },
+ { CDF3( 2988, 5771, 8424) },
+ { CDF3(19736, 26647, 29141) },
+ { CDF3(18933, 26070, 28984) },
+ { CDF3(15779, 23048, 27200) },
+ { CDF3(12638, 20061, 24532) },
+ { CDF3(10692, 17545, 22220) },
+ { CDF3( 9217, 15251, 20054) },
+ { CDF3( 5078, 9284, 12594) },
+ },
+ }, {
+ {
+ { CDF3( 2331, 3662, 5244) },
+ { CDF3( 2891, 4771, 6145) },
+ { CDF3( 4598, 7623, 9729) },
+ { CDF3( 3520, 6845, 9199) },
+ { CDF3( 3417, 6119, 9324) },
+ { CDF3( 2601, 5412, 7385) },
+ { CDF3( 600, 1173, 1744) },
+ { CDF3( 7672, 13286, 17469) },
+ { CDF3( 4232, 7792, 10793) },
+ { CDF3( 2915, 5317, 7397) },
+ { CDF3( 2318, 4356, 6152) },
+ { CDF3( 2127, 4000, 5554) },
+ { CDF3( 1850, 3478, 5275) },
+ { CDF3( 977, 1933, 2843) },
+ { CDF3(18280, 24387, 27989) },
+ { CDF3(15852, 22671, 26185) },
+ { CDF3(13845, 20951, 24789) },
+ { CDF3(11055, 17966, 22129) },
+ { CDF3( 9138, 15422, 19801) },
+ { CDF3( 7454, 13145, 17456) },
+ { CDF3( 3370, 6393, 9013) },
+ }, {
+ { CDF3( 5842, 9229, 10838) },
+ { CDF3( 2313, 3491, 4276) },
+ { CDF3( 2998, 6104, 7496) },
+ { CDF3( 2420, 7447, 9868) },
+ { CDF3( 3034, 8495, 10923) },
+ { CDF3( 4076, 8937, 10975) },
+ { CDF3( 1086, 2370, 3299) },
+ { CDF3( 9714, 17254, 20444) },
+ { CDF3( 8543, 13698, 17123) },
+ { CDF3( 4918, 9007, 11910) },
+ { CDF3( 4129, 7532, 10553) },
+ { CDF3( 2364, 5533, 8058) },
+ { CDF3( 1834, 3546, 5563) },
+ { CDF3( 1473, 2908, 4133) },
+ { CDF3(15405, 21193, 25619) },
+ { CDF3(15691, 21952, 26561) },
+ { CDF3(12962, 19194, 24165) },
+ { CDF3(10272, 17855, 22129) },
+ { CDF3( 8588, 15270, 20718) },
+ { CDF3( 8682, 14669, 19500) },
+ { CDF3( 4870, 9636, 13205) },
+ },
+ },
+ },
+ }, [1] = {
+ .skip = {
+ {
+ { CDF1(30371) }, { CDF1( 7570) }, { CDF1(13155) },
+ { CDF1(20751) }, { CDF1(20969) }, { CDF1(27067) },
+ { CDF1(32013) }, { CDF1( 5495) }, { CDF1(17942) },
+ { CDF1(28280) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(31782) }, { CDF1( 1836) }, { CDF1(10689) },
+ { CDF1(17604) }, { CDF1(21622) }, { CDF1(27518) },
+ { CDF1(32399) }, { CDF1( 4419) }, { CDF1(16294) },
+ { CDF1(28345) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(31901) }, { CDF1(10311) }, { CDF1(18047) },
+ { CDF1(24806) }, { CDF1(23288) }, { CDF1(27914) },
+ { CDF1(32296) }, { CDF1( 4215) }, { CDF1(15756) },
+ { CDF1(28341) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(26726) }, { CDF1( 1045) }, { CDF1(11703) },
+ { CDF1(20590) }, { CDF1(18554) }, { CDF1(25970) },
+ { CDF1(31938) }, { CDF1( 5583) }, { CDF1(21313) },
+ { CDF1(29390) }, { CDF1( 641) }, { CDF1(22265) },
+ { CDF1(31452) },
+ }, {
+ { CDF1(26584) }, { CDF1( 188) }, { CDF1( 8847) },
+ { CDF1(24519) }, { CDF1(22938) }, { CDF1(30583) },
+ { CDF1(32608) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ },
+ }, .eob_bin_16 = {
+ {
+ { CDF4( 2125, 2551, 5165, 8946) },
+ { CDF4( 513, 765, 1859, 6339) },
+ }, {
+ { CDF4( 7637, 9498, 14259, 19108) },
+ { CDF4( 2497, 4096, 8866, 16993) },
+ },
+ }, .eob_bin_32 = {
+ {
+ { CDF5( 989, 1249, 2019, 4151, 10785) },
+ { CDF5( 313, 441, 1099, 2917, 8562) },
+ }, {
+ { CDF5( 8394, 10352, 13932, 18855, 26014) },
+ { CDF5( 2578, 4124, 8181, 13670, 24234) },
+ },
+ }, .eob_bin_64 = {
+ {
+ { CDF6( 1260, 1446, 2253, 3712, 6652, 13369) },
+ { CDF6( 401, 605, 1029, 2563, 5845, 12626) },
+ }, {
+ { CDF6( 8609, 10612, 14624, 18714, 22614, 29024) },
+ { CDF6( 1923, 3127, 5867, 9703, 14277, 27100) },
+ },
+ }, .eob_bin_128 = {
+ {
+ { CDF7( 685, 933, 1488, 2714, 4766, 8562, 19254) },
+ { CDF7( 217, 352, 618, 2303, 5261, 9969, 17472) },
+ }, {
+ { CDF7( 8045, 11200, 15497, 19595, 23948, 27408, 30938) },
+ { CDF7( 2310, 4160, 7471, 14997, 17931, 20768, 30240) },
+ },
+ }, .eob_bin_256 = {
+ {
+ { CDF8( 1448, 2109, 4151, 6263,
+ 9329, 13260, 17944, 23300) },
+ { CDF8( 399, 1019, 1749, 3038,
+ 10444, 15546, 22739, 27294) },
+ }, {
+ { CDF8( 6402, 8148, 12623, 15072,
+ 18728, 22847, 26447, 29377) },
+ { CDF8( 1674, 3252, 5734, 10159,
+ 22397, 23802, 24821, 30940) },
+ },
+ }, .eob_bin_512 = {
+ { CDF9( 1230, 2278, 5035, 7776, 11871,
+ 15346, 19590, 24584, 28749) },
+ { CDF9( 7265, 9979, 15819, 19250, 21780,
+ 23846, 26478, 28396, 31811) },
+ }, .eob_bin_1024 = {
+ { CDF10( 696, 948, 3145, 5702, 9706,
+ 13217, 17851, 21856, 25692, 28034) },
+ { CDF10( 2672, 3591, 9330, 17084, 22725,
+ 24284, 26527, 28027, 28377, 30876) },
+ }, .eob_hi_bit = {
+ {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(17471) },
+ { CDF1(20223) }, { CDF1(11357) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20335) },
+ { CDF1(21667) }, { CDF1(14818) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20430) },
+ { CDF1(20662) }, { CDF1(15367) }, { CDF1(16970) },
+ { CDF1(14657) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(22117) },
+ { CDF1(22028) }, { CDF1(18650) }, { CDF1(16042) },
+ { CDF1(15885) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(22409) },
+ { CDF1(21012) }, { CDF1(15650) }, { CDF1(17395) },
+ { CDF1(15469) }, { CDF1(20205) }, { CDF1(19511) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(24220) },
+ { CDF1(22480) }, { CDF1(17737) }, { CDF1(18916) },
+ { CDF1(19268) }, { CDF1(18412) }, { CDF1(18844) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(25991) },
+ { CDF1(20314) }, { CDF1(17731) }, { CDF1(19678) },
+ { CDF1(18649) }, { CDF1(17307) }, { CDF1(21798) },
+ { CDF1(17549) }, { CDF1(15630) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(26585) },
+ { CDF1(21469) }, { CDF1(20432) }, { CDF1(17735) },
+ { CDF1(19280) }, { CDF1(15235) }, { CDF1(20297) },
+ { CDF1(22471) }, { CDF1(28997) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(26605) },
+ { CDF1(11304) }, { CDF1(16726) }, { CDF1(16560) },
+ { CDF1(20866) }, { CDF1(23524) }, { CDF1(19878) },
+ { CDF1(13469) }, { CDF1(23084) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ },
+ }, .eob_base_tok = {
+ {
+ {
+ { CDF2(17560, 29888) }, { CDF2(29671, 31549) },
+ { CDF2(31007, 32056) }, { CDF2(27286, 30006) },
+ }, {
+ { CDF2(26594, 31212) }, { CDF2(31208, 32582) },
+ { CDF2(31835, 32637) }, { CDF2(30595, 32206) },
+ },
+ }, {
+ {
+ { CDF2(15239, 29932) }, { CDF2(31315, 32095) },
+ { CDF2(32130, 32434) }, { CDF2(30864, 31996) },
+ }, {
+ { CDF2(26279, 30968) }, { CDF2(31142, 32495) },
+ { CDF2(31713, 32540) }, { CDF2(31929, 32594) },
+ },
+ }, {
+ {
+ { CDF2( 2644, 25198) }, { CDF2(32038, 32451) },
+ { CDF2(32639, 32695) }, { CDF2(32166, 32518) },
+ }, {
+ { CDF2(17187, 27668) }, { CDF2(31714, 32550) },
+ { CDF2(32283, 32678) }, { CDF2(31930, 32563) },
+ },
+ }, {
+ {
+ { CDF2( 1044, 2257) }, { CDF2(30755, 31923) },
+ { CDF2(32208, 32693) }, { CDF2(32244, 32615) },
+ }, {
+ { CDF2(21317, 26207) }, { CDF2(29133, 30868) },
+ { CDF2(29311, 31231) }, { CDF2(29657, 31087) },
+ },
+ }, {
+ {
+ { CDF2( 478, 1834) }, { CDF2(31005, 31987) },
+ { CDF2(32317, 32724) }, { CDF2(30865, 32648) },
+ }, {
+ { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+ { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+ },
+ },
+ }, .base_tok = {
+ {
+ {
+ { CDF3( 6041, 11854, 15927) },
+ { CDF3(20326, 30905, 32251) },
+ { CDF3(14164, 26831, 30725) },
+ { CDF3( 9760, 20647, 26585) },
+ { CDF3( 6416, 14953, 21219) },
+ { CDF3( 2966, 7151, 10891) },
+ { CDF3(23567, 31374, 32254) },
+ { CDF3(14978, 27416, 30946) },
+ { CDF3( 9434, 20225, 26254) },
+ { CDF3( 6658, 14558, 20535) },
+ { CDF3( 3916, 8677, 12989) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3(18088, 29545, 31587) },
+ { CDF3(13062, 25843, 30073) },
+ { CDF3( 8940, 16827, 22251) },
+ { CDF3( 7654, 13220, 17973) },
+ { CDF3( 5733, 10316, 14456) },
+ { CDF3(22879, 31388, 32114) },
+ { CDF3(15215, 27993, 30955) },
+ { CDF3( 9397, 19445, 24978) },
+ { CDF3( 3442, 9813, 15344) },
+ { CDF3( 1368, 3936, 6532) },
+ { CDF3(25494, 32033, 32406) },
+ { CDF3(16772, 27963, 30718) },
+ { CDF3( 9419, 18165, 23260) },
+ { CDF3( 2677, 7501, 11797) },
+ { CDF3( 1516, 4344, 7170) },
+ { CDF3(26556, 31454, 32101) },
+ { CDF3(17128, 27035, 30108) },
+ { CDF3( 8324, 15344, 20249) },
+ { CDF3( 1903, 5696, 9469) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 8455, 19003, 24368) },
+ { CDF3(23563, 32021, 32604) },
+ { CDF3(16237, 29446, 31935) },
+ { CDF3(10724, 23999, 29358) },
+ { CDF3( 6725, 17528, 24416) },
+ { CDF3( 3927, 10927, 16825) },
+ { CDF3(26313, 32288, 32634) },
+ { CDF3(17430, 30095, 32095) },
+ { CDF3(11116, 24606, 29679) },
+ { CDF3( 7195, 18384, 25269) },
+ { CDF3( 4726, 12852, 19315) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3(22822, 31648, 32483) },
+ { CDF3(16724, 29633, 31929) },
+ { CDF3(10261, 23033, 28725) },
+ { CDF3( 7029, 17840, 24528) },
+ { CDF3( 4867, 13886, 21502) },
+ { CDF3(25298, 31892, 32491) },
+ { CDF3(17809, 29330, 31512) },
+ { CDF3( 9668, 21329, 26579) },
+ { CDF3( 4774, 12956, 18976) },
+ { CDF3( 2322, 7030, 11540) },
+ { CDF3(25472, 31920, 32543) },
+ { CDF3(17957, 29387, 31632) },
+ { CDF3( 9196, 20593, 26400) },
+ { CDF3( 4680, 12705, 19202) },
+ { CDF3( 2917, 8456, 13436) },
+ { CDF3(26471, 32059, 32574) },
+ { CDF3(18458, 29783, 31909) },
+ { CDF3( 8400, 19464, 25956) },
+ { CDF3( 3812, 10973, 17206) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ }, {
+ {
+ { CDF3( 6779, 13743, 17678) },
+ { CDF3(24806, 31797, 32457) },
+ { CDF3(17616, 29047, 31372) },
+ { CDF3(11063, 23175, 28003) },
+ { CDF3( 6521, 16110, 22324) },
+ { CDF3( 2764, 7504, 11654) },
+ { CDF3(25266, 32367, 32637) },
+ { CDF3(19054, 30553, 32175) },
+ { CDF3(12139, 25212, 29807) },
+ { CDF3( 7311, 18162, 24704) },
+ { CDF3( 3397, 9164, 14074) },
+ { CDF3(25988, 32208, 32522) },
+ { CDF3(16253, 28912, 31526) },
+ { CDF3( 9151, 21387, 27372) },
+ { CDF3( 5688, 14915, 21496) },
+ { CDF3( 2717, 7627, 12004) },
+ { CDF3(23144, 31855, 32443) },
+ { CDF3(16070, 28491, 31325) },
+ { CDF3( 8702, 20467, 26517) },
+ { CDF3( 5243, 13956, 20367) },
+ { CDF3( 2621, 7335, 11567) },
+ { CDF3(26636, 32340, 32630) },
+ { CDF3(19990, 31050, 32341) },
+ { CDF3(13243, 26105, 30315) },
+ { CDF3( 8588, 19521, 25918) },
+ { CDF3( 4717, 11585, 17304) },
+ { CDF3(25844, 32292, 32582) },
+ { CDF3(19090, 30635, 32097) },
+ { CDF3(11963, 24546, 28939) },
+ { CDF3( 6218, 16087, 22354) },
+ { CDF3( 2340, 6608, 10426) },
+ { CDF3(28046, 32576, 32694) },
+ { CDF3(21178, 31313, 32296) },
+ { CDF3(13486, 26184, 29870) },
+ { CDF3( 7149, 17871, 23723) },
+ { CDF3( 2833, 7958, 12259) },
+ { CDF3(27710, 32528, 32686) },
+ { CDF3(20674, 31076, 32268) },
+ { CDF3(12413, 24955, 29243) },
+ { CDF3( 6676, 16927, 23097) },
+ { CDF3( 2966, 8333, 12919) },
+ }, {
+ { CDF3( 8639, 19339, 24429) },
+ { CDF3(24404, 31837, 32525) },
+ { CDF3(16997, 29425, 31784) },
+ { CDF3(11253, 24234, 29149) },
+ { CDF3( 6751, 17394, 24028) },
+ { CDF3( 3490, 9830, 15191) },
+ { CDF3(26283, 32471, 32714) },
+ { CDF3(19599, 31168, 32442) },
+ { CDF3(13146, 26954, 30893) },
+ { CDF3( 8214, 20588, 26890) },
+ { CDF3( 4699, 13081, 19300) },
+ { CDF3(28212, 32458, 32669) },
+ { CDF3(18594, 30316, 32100) },
+ { CDF3(11219, 24408, 29234) },
+ { CDF3( 6865, 17656, 24149) },
+ { CDF3( 3678, 10362, 16006) },
+ { CDF3(25825, 32136, 32616) },
+ { CDF3(17313, 29853, 32021) },
+ { CDF3(11197, 24471, 29472) },
+ { CDF3( 6947, 17781, 24405) },
+ { CDF3( 3768, 10660, 16261) },
+ { CDF3(27352, 32500, 32706) },
+ { CDF3(20850, 31468, 32469) },
+ { CDF3(14021, 27707, 31133) },
+ { CDF3( 8964, 21748, 27838) },
+ { CDF3( 5437, 14665, 21187) },
+ { CDF3(26304, 32492, 32698) },
+ { CDF3(20409, 31380, 32385) },
+ { CDF3(13682, 27222, 30632) },
+ { CDF3( 8974, 21236, 26685) },
+ { CDF3( 4234, 11665, 16934) },
+ { CDF3(26273, 32357, 32711) },
+ { CDF3(20672, 31242, 32441) },
+ { CDF3(14172, 27254, 30902) },
+ { CDF3( 9870, 21898, 27275) },
+ { CDF3( 5164, 13506, 19270) },
+ { CDF3(26725, 32459, 32728) },
+ { CDF3(20991, 31442, 32527) },
+ { CDF3(13071, 26434, 30811) },
+ { CDF3( 8184, 20090, 26742) },
+ { CDF3( 4803, 13255, 19895) },
+ },
+ }, {
+ {
+ { CDF3( 7555, 14942, 18501) },
+ { CDF3(24410, 31178, 32287) },
+ { CDF3(14394, 26738, 30253) },
+ { CDF3( 8413, 19554, 25195) },
+ { CDF3( 4766, 12924, 18785) },
+ { CDF3( 2029, 5806, 9207) },
+ { CDF3(26776, 32364, 32663) },
+ { CDF3(18732, 29967, 31931) },
+ { CDF3(11005, 23786, 28852) },
+ { CDF3( 6466, 16909, 23510) },
+ { CDF3( 3044, 8638, 13419) },
+ { CDF3(29208, 32582, 32704) },
+ { CDF3(20068, 30857, 32208) },
+ { CDF3(12003, 25085, 29595) },
+ { CDF3( 6947, 17750, 24189) },
+ { CDF3( 3245, 9103, 14007) },
+ { CDF3(27359, 32465, 32669) },
+ { CDF3(19421, 30614, 32174) },
+ { CDF3(11915, 25010, 29579) },
+ { CDF3( 6950, 17676, 24074) },
+ { CDF3( 3007, 8473, 13096) },
+ { CDF3(29002, 32676, 32735) },
+ { CDF3(22102, 31849, 32576) },
+ { CDF3(14408, 28009, 31405) },
+ { CDF3( 9027, 21679, 27931) },
+ { CDF3( 4694, 12678, 18748) },
+ { CDF3(28216, 32528, 32682) },
+ { CDF3(20849, 31264, 32318) },
+ { CDF3(12756, 25815, 29751) },
+ { CDF3( 7565, 18801, 24923) },
+ { CDF3( 3509, 9533, 14477) },
+ { CDF3(30133, 32687, 32739) },
+ { CDF3(23063, 31910, 32515) },
+ { CDF3(14588, 28051, 31132) },
+ { CDF3( 9085, 21649, 27457) },
+ { CDF3( 4261, 11654, 17264) },
+ { CDF3(29518, 32691, 32748) },
+ { CDF3(22451, 31959, 32613) },
+ { CDF3(14864, 28722, 31700) },
+ { CDF3( 9695, 22964, 28716) },
+ { CDF3( 4932, 13358, 19502) },
+ }, {
+ { CDF3( 6465, 16958, 21688) },
+ { CDF3(25199, 31514, 32360) },
+ { CDF3(14774, 27149, 30607) },
+ { CDF3( 9257, 21438, 26972) },
+ { CDF3( 5723, 15183, 21882) },
+ { CDF3( 3150, 8879, 13731) },
+ { CDF3(26989, 32262, 32682) },
+ { CDF3(17396, 29937, 32085) },
+ { CDF3(11387, 24901, 29784) },
+ { CDF3( 7289, 18821, 25548) },
+ { CDF3( 3734, 10577, 16086) },
+ { CDF3(29728, 32501, 32695) },
+ { CDF3(17431, 29701, 31903) },
+ { CDF3( 9921, 22826, 28300) },
+ { CDF3( 5896, 15434, 22068) },
+ { CDF3( 3430, 9646, 14757) },
+ { CDF3(28614, 32511, 32705) },
+ { CDF3(19364, 30638, 32263) },
+ { CDF3(13129, 26254, 30402) },
+ { CDF3( 8754, 20484, 26440) },
+ { CDF3( 4378, 11607, 17110) },
+ { CDF3(30292, 32671, 32744) },
+ { CDF3(21780, 31603, 32501) },
+ { CDF3(14314, 27829, 31291) },
+ { CDF3( 9611, 22327, 28263) },
+ { CDF3( 4890, 13087, 19065) },
+ { CDF3(25862, 32567, 32733) },
+ { CDF3(20794, 32050, 32567) },
+ { CDF3(17243, 30625, 32254) },
+ { CDF3(13283, 27628, 31474) },
+ { CDF3( 9669, 22532, 28918) },
+ { CDF3(27435, 32697, 32748) },
+ { CDF3(24922, 32390, 32714) },
+ { CDF3(21449, 31504, 32536) },
+ { CDF3(16392, 29729, 31832) },
+ { CDF3(11692, 24884, 29076) },
+ { CDF3(24193, 32290, 32735) },
+ { CDF3(18909, 31104, 32563) },
+ { CDF3(12236, 26841, 31403) },
+ { CDF3( 8171, 21840, 29082) },
+ { CDF3( 7224, 17280, 25275) },
+ },
+ }, {
+ {
+ { CDF3( 3078, 6839, 9890) },
+ { CDF3(13837, 20450, 24479) },
+ { CDF3( 5914, 14222, 19328) },
+ { CDF3( 3866, 10267, 14762) },
+ { CDF3( 2612, 7208, 11042) },
+ { CDF3( 1067, 2991, 4776) },
+ { CDF3(25817, 31646, 32529) },
+ { CDF3(13708, 26338, 30385) },
+ { CDF3( 7328, 18585, 24870) },
+ { CDF3( 4691, 13080, 19276) },
+ { CDF3( 1825, 5253, 8352) },
+ { CDF3(29386, 32315, 32624) },
+ { CDF3(17160, 29001, 31360) },
+ { CDF3( 9602, 21862, 27396) },
+ { CDF3( 5915, 15772, 22148) },
+ { CDF3( 2786, 7779, 12047) },
+ { CDF3(29246, 32450, 32663) },
+ { CDF3(18696, 29929, 31818) },
+ { CDF3(10510, 23369, 28560) },
+ { CDF3( 6229, 16499, 23125) },
+ { CDF3( 2608, 7448, 11705) },
+ { CDF3(30753, 32710, 32748) },
+ { CDF3(21638, 31487, 32503) },
+ { CDF3(12937, 26854, 30870) },
+ { CDF3( 8182, 20596, 26970) },
+ { CDF3( 3637, 10269, 15497) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 5244, 12150, 16906) },
+ { CDF3(20486, 26858, 29701) },
+ { CDF3( 7756, 18317, 23735) },
+ { CDF3( 3452, 9256, 13146) },
+ { CDF3( 2020, 5206, 8229) },
+ { CDF3( 1801, 4993, 7903) },
+ { CDF3(27051, 31858, 32531) },
+ { CDF3(15988, 27531, 30619) },
+ { CDF3( 9188, 21484, 26719) },
+ { CDF3( 6273, 17186, 23800) },
+ { CDF3( 3108, 9355, 14764) },
+ { CDF3(31076, 32520, 32680) },
+ { CDF3(18119, 30037, 31850) },
+ { CDF3(10244, 22969, 27472) },
+ { CDF3( 4692, 14077, 19273) },
+ { CDF3( 3694, 11677, 17556) },
+ { CDF3(30060, 32581, 32720) },
+ { CDF3(21011, 30775, 32120) },
+ { CDF3(11931, 24820, 29289) },
+ { CDF3( 7119, 17662, 24356) },
+ { CDF3( 3833, 10706, 16304) },
+ { CDF3(31954, 32731, 32748) },
+ { CDF3(23913, 31724, 32489) },
+ { CDF3(15520, 28060, 31286) },
+ { CDF3(11517, 23008, 28571) },
+ { CDF3( 6193, 14508, 20629) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ }, {
+ {
+ { CDF3( 1035, 2807, 4156) },
+ { CDF3(13162, 18138, 20939) },
+ { CDF3( 2696, 6633, 8755) },
+ { CDF3( 1373, 4161, 6853) },
+ { CDF3( 1099, 2746, 4716) },
+ { CDF3( 340, 1021, 1599) },
+ { CDF3(22826, 30419, 32135) },
+ { CDF3(10395, 21762, 26942) },
+ { CDF3( 4726, 12407, 17361) },
+ { CDF3( 2447, 7080, 10593) },
+ { CDF3( 1227, 3717, 6011) },
+ { CDF3(28156, 31424, 31934) },
+ { CDF3(16915, 27754, 30373) },
+ { CDF3( 9148, 20990, 26431) },
+ { CDF3( 5950, 15515, 21148) },
+ { CDF3( 2492, 7327, 11526) },
+ { CDF3(30602, 32477, 32670) },
+ { CDF3(20026, 29955, 31568) },
+ { CDF3(11220, 23628, 28105) },
+ { CDF3( 6652, 17019, 22973) },
+ { CDF3( 3064, 8536, 13043) },
+ { CDF3(31769, 32724, 32748) },
+ { CDF3(22230, 30887, 32373) },
+ { CDF3(12234, 25079, 29731) },
+ { CDF3( 7326, 18816, 25353) },
+ { CDF3( 3933, 10907, 16616) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ },
+ }, .dc_sign = {
+ { { CDF1(16000) }, { CDF1(13056) }, { CDF1(18816) } },
+ { { CDF1(15232) }, { CDF1(12928) }, { CDF1(17280) } },
+ }, .br_tok = {
+ {
+ {
+ { CDF3(14995, 21341, 24749) },
+ { CDF3(13158, 20289, 24601) },
+ { CDF3( 8941, 15326, 19876) },
+ { CDF3( 6297, 11541, 15807) },
+ { CDF3( 4817, 9029, 12776) },
+ { CDF3( 3731, 7273, 10627) },
+ { CDF3( 1847, 3617, 5354) },
+ { CDF3(14472, 19659, 22343) },
+ { CDF3(16806, 24162, 27533) },
+ { CDF3(12900, 20404, 24713) },
+ { CDF3( 9411, 16112, 20797) },
+ { CDF3( 7056, 12697, 17148) },
+ { CDF3( 5544, 10339, 14460) },
+ { CDF3( 2954, 5704, 8319) },
+ { CDF3(12464, 18071, 21354) },
+ { CDF3(15482, 22528, 26034) },
+ { CDF3(12070, 19269, 23624) },
+ { CDF3( 8953, 15406, 20106) },
+ { CDF3( 7027, 12730, 17220) },
+ { CDF3( 5887, 10913, 15140) },
+ { CDF3( 3793, 7278, 10447) },
+ }, {
+ { CDF3(15571, 22232, 25749) },
+ { CDF3(14506, 21575, 25374) },
+ { CDF3(10189, 17089, 21569) },
+ { CDF3( 7316, 13301, 17915) },
+ { CDF3( 5783, 10912, 15190) },
+ { CDF3( 4760, 9155, 13088) },
+ { CDF3( 2993, 5966, 8774) },
+ { CDF3(23424, 28903, 30778) },
+ { CDF3(20775, 27666, 30290) },
+ { CDF3(16474, 24410, 28299) },
+ { CDF3(12471, 20180, 24987) },
+ { CDF3( 9410, 16487, 21439) },
+ { CDF3( 7536, 13614, 18529) },
+ { CDF3( 5048, 9586, 13549) },
+ { CDF3(21090, 27290, 29756) },
+ { CDF3(20796, 27402, 30026) },
+ { CDF3(17819, 25485, 28969) },
+ { CDF3(13860, 21909, 26462) },
+ { CDF3(11002, 18494, 23529) },
+ { CDF3( 8953, 15929, 20897) },
+ { CDF3( 6448, 11918, 16454) },
+ },
+ }, {
+ {
+ { CDF3(15999, 22208, 25449) },
+ { CDF3(13050, 19988, 24122) },
+ { CDF3( 8594, 14864, 19378) },
+ { CDF3( 6033, 11079, 15238) },
+ { CDF3( 4554, 8683, 12347) },
+ { CDF3( 3672, 7139, 10337) },
+ { CDF3( 1900, 3771, 5576) },
+ { CDF3(15788, 21340, 23949) },
+ { CDF3(16825, 24235, 27758) },
+ { CDF3(12873, 20402, 24810) },
+ { CDF3( 9590, 16363, 21094) },
+ { CDF3( 7352, 13209, 17733) },
+ { CDF3( 5960, 10989, 15184) },
+ { CDF3( 3232, 6234, 9007) },
+ { CDF3(15761, 20716, 23224) },
+ { CDF3(19318, 25989, 28759) },
+ { CDF3(15529, 23094, 26929) },
+ { CDF3(11662, 18989, 23641) },
+ { CDF3( 8955, 15568, 20366) },
+ { CDF3( 7281, 13106, 17708) },
+ { CDF3( 4248, 8059, 11440) },
+ }, {
+ { CDF3(14899, 21217, 24503) },
+ { CDF3(13519, 20283, 24047) },
+ { CDF3( 9429, 15966, 20365) },
+ { CDF3( 6700, 12355, 16652) },
+ { CDF3( 5088, 9704, 13716) },
+ { CDF3( 4243, 8154, 11731) },
+ { CDF3( 2702, 5364, 7861) },
+ { CDF3(22745, 28388, 30454) },
+ { CDF3(20235, 27146, 29922) },
+ { CDF3(15896, 23715, 27637) },
+ { CDF3(11840, 19350, 24131) },
+ { CDF3( 9122, 15932, 20880) },
+ { CDF3( 7488, 13581, 18362) },
+ { CDF3( 5114, 9568, 13370) },
+ { CDF3(20845, 26553, 28932) },
+ { CDF3(20981, 27372, 29884) },
+ { CDF3(17781, 25335, 28785) },
+ { CDF3(13760, 21708, 26297) },
+ { CDF3(10975, 18415, 23365) },
+ { CDF3( 9045, 15789, 20686) },
+ { CDF3( 6130, 11199, 15423) },
+ },
+ }, {
+ {
+ { CDF3(13549, 19724, 23158) },
+ { CDF3(11844, 18382, 22246) },
+ { CDF3( 7919, 13619, 17773) },
+ { CDF3( 5486, 10143, 13946) },
+ { CDF3( 4166, 7983, 11324) },
+ { CDF3( 3364, 6506, 9427) },
+ { CDF3( 1598, 3160, 4674) },
+ { CDF3(15281, 20979, 23781) },
+ { CDF3(14939, 22119, 25952) },
+ { CDF3(11363, 18407, 22812) },
+ { CDF3( 8609, 14857, 19370) },
+ { CDF3( 6737, 12184, 16480) },
+ { CDF3( 5506, 10263, 14262) },
+ { CDF3( 2990, 5786, 8380) },
+ { CDF3(20249, 25253, 27417) },
+ { CDF3(21070, 27518, 30001) },
+ { CDF3(16854, 24469, 28074) },
+ { CDF3(12864, 20486, 25000) },
+ { CDF3( 9962, 16978, 21778) },
+ { CDF3( 8074, 14338, 19048) },
+ { CDF3( 4494, 8479, 11906) },
+ }, {
+ { CDF3(13960, 19617, 22829) },
+ { CDF3(11150, 17341, 21228) },
+ { CDF3( 7150, 12964, 17190) },
+ { CDF3( 5331, 10002, 13867) },
+ { CDF3( 4167, 7744, 11057) },
+ { CDF3( 3480, 6629, 9646) },
+ { CDF3( 1883, 3784, 5686) },
+ { CDF3(18752, 25660, 28912) },
+ { CDF3(16968, 24586, 28030) },
+ { CDF3(13520, 21055, 25313) },
+ { CDF3(10453, 17626, 22280) },
+ { CDF3( 8386, 14505, 19116) },
+ { CDF3( 6742, 12595, 17008) },
+ { CDF3( 4273, 8140, 11499) },
+ { CDF3(22120, 27827, 30233) },
+ { CDF3(20563, 27358, 29895) },
+ { CDF3(17076, 24644, 28153) },
+ { CDF3(13362, 20942, 25309) },
+ { CDF3(10794, 17965, 22695) },
+ { CDF3( 9014, 15652, 20319) },
+ { CDF3( 5708, 10512, 14497) },
+ },
+ }, {
+ {
+ { CDF3( 5705, 10930, 15725) },
+ { CDF3( 7946, 12765, 16115) },
+ { CDF3( 6801, 12123, 16226) },
+ { CDF3( 5462, 10135, 14200) },
+ { CDF3( 4189, 8011, 11507) },
+ { CDF3( 3191, 6229, 9408) },
+ { CDF3( 1057, 2137, 3212) },
+ { CDF3(10018, 17067, 21491) },
+ { CDF3( 7380, 12582, 16453) },
+ { CDF3( 6068, 10845, 14339) },
+ { CDF3( 5098, 9198, 12555) },
+ { CDF3( 4312, 8010, 11119) },
+ { CDF3( 3700, 6966, 9781) },
+ { CDF3( 1693, 3326, 4887) },
+ { CDF3(18757, 24930, 27774) },
+ { CDF3(17648, 24596, 27817) },
+ { CDF3(14707, 22052, 26026) },
+ { CDF3(11720, 18852, 23292) },
+ { CDF3( 9357, 15952, 20525) },
+ { CDF3( 7810, 13753, 18210) },
+ { CDF3( 3879, 7333, 10328) },
+ }, {
+ { CDF3( 8278, 13242, 15922) },
+ { CDF3(10547, 15867, 18919) },
+ { CDF3( 9106, 15842, 20609) },
+ { CDF3( 6833, 13007, 17218) },
+ { CDF3( 4811, 9712, 13923) },
+ { CDF3( 3985, 7352, 11128) },
+ { CDF3( 1688, 3458, 5262) },
+ { CDF3(12951, 21861, 26510) },
+ { CDF3( 9788, 16044, 20276) },
+ { CDF3( 6309, 11244, 14870) },
+ { CDF3( 5183, 9349, 12566) },
+ { CDF3( 4389, 8229, 11492) },
+ { CDF3( 3633, 6945, 10620) },
+ { CDF3( 3600, 6847, 9907) },
+ { CDF3(21748, 28137, 30255) },
+ { CDF3(19436, 26581, 29560) },
+ { CDF3(16359, 24201, 27953) },
+ { CDF3(13961, 21693, 25871) },
+ { CDF3(11544, 18686, 23322) },
+ { CDF3( 9372, 16462, 20952) },
+ { CDF3( 6138, 11210, 15390) },
+ },
+ },
+ },
+ }, [2] = {
+ .skip = {
+ {
+ { CDF1(29614) }, { CDF1( 9068) }, { CDF1(12924) },
+ { CDF1(19538) }, { CDF1(17737) }, { CDF1(24619) },
+ { CDF1(30642) }, { CDF1( 4119) }, { CDF1(16026) },
+ { CDF1(25657) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(31957) }, { CDF1( 3230) }, { CDF1(11153) },
+ { CDF1(18123) }, { CDF1(20143) }, { CDF1(26536) },
+ { CDF1(31986) }, { CDF1( 3050) }, { CDF1(14603) },
+ { CDF1(25155) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(32363) }, { CDF1(10692) }, { CDF1(19090) },
+ { CDF1(24357) }, { CDF1(24442) }, { CDF1(28312) },
+ { CDF1(32169) }, { CDF1( 3648) }, { CDF1(15690) },
+ { CDF1(26815) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(30669) }, { CDF1( 3832) }, { CDF1(11663) },
+ { CDF1(18889) }, { CDF1(19782) }, { CDF1(23313) },
+ { CDF1(31330) }, { CDF1( 5124) }, { CDF1(18719) },
+ { CDF1(28468) }, { CDF1( 3082) }, { CDF1(20982) },
+ { CDF1(29443) },
+ }, {
+ { CDF1(28573) }, { CDF1( 3183) }, { CDF1(17802) },
+ { CDF1(25977) }, { CDF1(26677) }, { CDF1(27832) },
+ { CDF1(32387) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ },
+ }, .eob_bin_16 = {
+ {
+ { CDF4( 4016, 4897, 8881, 14968) },
+ { CDF4( 716, 1105, 2646, 10056) },
+ }, {
+ { CDF4(11139, 13270, 18241, 23566) },
+ { CDF4( 3192, 5032, 10297, 19755) },
+ },
+ }, .eob_bin_32 = {
+ {
+ { CDF5( 2515, 3003, 4452, 8162, 16041) },
+ { CDF5( 574, 821, 1836, 5089, 13128) },
+ }, {
+ { CDF5(13468, 16303, 20361, 25105, 29281) },
+ { CDF5( 3542, 5502, 10415, 16760, 25644) },
+ },
+ }, .eob_bin_64 = {
+ {
+ { CDF6( 2374, 2772, 4583, 7276, 12288, 19706) },
+ { CDF6( 497, 810, 1315, 3000, 7004, 15641) },
+ }, {
+ { CDF6(15050, 17126, 21410, 24886, 28156, 30726) },
+ { CDF6( 4034, 6290, 10235, 14982, 21214, 28491) },
+ },
+ }, .eob_bin_128 = {
+ {
+ { CDF7( 1366, 1738, 2527, 5016, 9355, 15797, 24643) },
+ { CDF7( 354, 558, 944, 2760, 7287, 14037, 21779) },
+ }, {
+ { CDF7(13627, 16246, 20173, 24429, 27948, 30415, 31863) },
+ { CDF7( 6275, 9889, 14769, 23164, 27988, 30493, 32272) },
+ },
+ }, .eob_bin_256 = {
+ {
+ { CDF8( 3089, 3920, 6038, 9460,
+ 14266, 19881, 25766, 29176) },
+ { CDF8( 1084, 2358, 3488, 5122,
+ 11483, 18103, 26023, 29799) },
+ }, {
+ { CDF8(11514, 13794, 17480, 20754,
+ 24361, 27378, 29492, 31277) },
+ { CDF8( 6571, 9610, 15516, 21826,
+ 29092, 30829, 31842, 32708) },
+ },
+ }, .eob_bin_512 = {
+ { CDF9( 2624, 3936, 6480, 9686, 13979,
+ 17726, 23267, 28410, 31078) },
+ { CDF9(12015, 14769, 19588, 22052, 24222,
+ 25812, 27300, 29219, 32114) },
+ }, .eob_bin_1024 = {
+ { CDF10( 2784, 3831, 7041, 10521, 14847,
+ 18844, 23155, 26682, 29229, 31045) },
+ { CDF10( 9577, 12466, 17739, 20750, 22061,
+ 23215, 24601, 25483, 25843, 32056) },
+ }, .eob_hi_bit = {
+ {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(18983) },
+ { CDF1(20512) }, { CDF1(14885) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20090) },
+ { CDF1(19444) }, { CDF1(17286) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(19139) },
+ { CDF1(21487) }, { CDF1(18959) }, { CDF1(20910) },
+ { CDF1(19089) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20536) },
+ { CDF1(20664) }, { CDF1(20625) }, { CDF1(19123) },
+ { CDF1(14862) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(19833) },
+ { CDF1(21502) }, { CDF1(17485) }, { CDF1(20267) },
+ { CDF1(18353) }, { CDF1(23329) }, { CDF1(21478) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(22041) },
+ { CDF1(23434) }, { CDF1(20001) }, { CDF1(20554) },
+ { CDF1(20951) }, { CDF1(20145) }, { CDF1(15562) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(23312) },
+ { CDF1(21607) }, { CDF1(16526) }, { CDF1(18957) },
+ { CDF1(18034) }, { CDF1(18934) }, { CDF1(24247) },
+ { CDF1(16921) }, { CDF1(17080) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(26579) },
+ { CDF1(24910) }, { CDF1(18637) }, { CDF1(19800) },
+ { CDF1(20388) }, { CDF1( 9887) }, { CDF1(15642) },
+ { CDF1(30198) }, { CDF1(24721) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(26998) },
+ { CDF1(16737) }, { CDF1(17838) }, { CDF1(18922) },
+ { CDF1(19515) }, { CDF1(18636) }, { CDF1(17333) },
+ { CDF1(15776) }, { CDF1(22658) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ },
+ }, .eob_base_tok = {
+ {
+ {
+ { CDF2(20092, 30774) }, { CDF2(30695, 32020) },
+ { CDF2(31131, 32103) }, { CDF2(28666, 30870) },
+ }, {
+ { CDF2(27258, 31095) }, { CDF2(31804, 32623) },
+ { CDF2(31763, 32528) }, { CDF2(31438, 32506) },
+ },
+ }, {
+ {
+ { CDF2(18049, 30489) }, { CDF2(31706, 32286) },
+ { CDF2(32163, 32473) }, { CDF2(31550, 32184) },
+ }, {
+ { CDF2(27116, 30842) }, { CDF2(31971, 32598) },
+ { CDF2(32088, 32576) }, { CDF2(32067, 32664) },
+ },
+ }, {
+ {
+ { CDF2(12854, 29093) }, { CDF2(32272, 32558) },
+ { CDF2(32667, 32729) }, { CDF2(32306, 32585) },
+ }, {
+ { CDF2(25476, 30366) }, { CDF2(32169, 32687) },
+ { CDF2(32479, 32689) }, { CDF2(31673, 32634) },
+ },
+ }, {
+ {
+ { CDF2( 2809, 19301) }, { CDF2(32205, 32622) },
+ { CDF2(32338, 32730) }, { CDF2(31786, 32616) },
+ }, {
+ { CDF2(22737, 29105) }, { CDF2(30810, 32362) },
+ { CDF2(30014, 32627) }, { CDF2(30528, 32574) },
+ },
+ }, {
+ {
+ { CDF2( 935, 3382) }, { CDF2(30789, 31909) },
+ { CDF2(32466, 32756) }, { CDF2(30860, 32513) },
+ }, {
+ { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+ { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+ },
+ },
+ }, .base_tok = {
+ {
+ {
+ { CDF3( 8896, 16227, 20630) },
+ { CDF3(23629, 31782, 32527) },
+ { CDF3(15173, 27755, 31321) },
+ { CDF3(10158, 21233, 27382) },
+ { CDF3( 6420, 14857, 21558) },
+ { CDF3( 3269, 8155, 12646) },
+ { CDF3(24835, 32009, 32496) },
+ { CDF3(16509, 28421, 31579) },
+ { CDF3(10957, 21514, 27418) },
+ { CDF3( 7881, 15930, 22096) },
+ { CDF3( 5388, 10960, 15918) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3(20745, 30773, 32093) },
+ { CDF3(15200, 27221, 30861) },
+ { CDF3(13032, 20873, 25667) },
+ { CDF3(12285, 18663, 23494) },
+ { CDF3(11563, 17481, 21489) },
+ { CDF3(26260, 31982, 32320) },
+ { CDF3(15397, 28083, 31100) },
+ { CDF3( 9742, 19217, 24824) },
+ { CDF3( 3261, 9629, 15362) },
+ { CDF3( 1480, 4322, 7499) },
+ { CDF3(27599, 32256, 32460) },
+ { CDF3(16857, 27659, 30774) },
+ { CDF3( 9551, 18290, 23748) },
+ { CDF3( 3052, 8933, 14103) },
+ { CDF3( 2021, 5910, 9787) },
+ { CDF3(29005, 32015, 32392) },
+ { CDF3(17677, 27694, 30863) },
+ { CDF3( 9204, 17356, 23219) },
+ { CDF3( 2403, 7516, 12814) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3(10808, 22056, 26896) },
+ { CDF3(25739, 32313, 32676) },
+ { CDF3(17288, 30203, 32221) },
+ { CDF3(11359, 24878, 29896) },
+ { CDF3( 6949, 17767, 24893) },
+ { CDF3( 4287, 11796, 18071) },
+ { CDF3(27880, 32521, 32705) },
+ { CDF3(19038, 31004, 32414) },
+ { CDF3(12564, 26345, 30768) },
+ { CDF3( 8269, 19947, 26779) },
+ { CDF3( 5674, 14657, 21674) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3(25742, 32319, 32671) },
+ { CDF3(19557, 31164, 32454) },
+ { CDF3(13381, 26381, 30755) },
+ { CDF3(10101, 21466, 26722) },
+ { CDF3( 9209, 19650, 26825) },
+ { CDF3(27107, 31917, 32432) },
+ { CDF3(18056, 28893, 31203) },
+ { CDF3(10200, 21434, 26764) },
+ { CDF3( 4660, 12913, 19502) },
+ { CDF3( 2368, 6930, 12504) },
+ { CDF3(26960, 32158, 32613) },
+ { CDF3(18628, 30005, 32031) },
+ { CDF3(10233, 22442, 28232) },
+ { CDF3( 5471, 14630, 21516) },
+ { CDF3( 3235, 10767, 17109) },
+ { CDF3(27696, 32440, 32692) },
+ { CDF3(20032, 31167, 32438) },
+ { CDF3( 8700, 21341, 28442) },
+ { CDF3( 5662, 14831, 21795) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ }, {
+ {
+ { CDF3( 9704, 17294, 21132) },
+ { CDF3(26762, 32278, 32633) },
+ { CDF3(18382, 29620, 31819) },
+ { CDF3(10891, 23475, 28723) },
+ { CDF3( 6358, 16583, 23309) },
+ { CDF3( 3248, 9118, 14141) },
+ { CDF3(27204, 32573, 32699) },
+ { CDF3(19818, 30824, 32329) },
+ { CDF3(11772, 25120, 30041) },
+ { CDF3( 6995, 18033, 25039) },
+ { CDF3( 3752, 10442, 16098) },
+ { CDF3(27222, 32256, 32559) },
+ { CDF3(15356, 28399, 31475) },
+ { CDF3( 8821, 20635, 27057) },
+ { CDF3( 5511, 14404, 21239) },
+ { CDF3( 2935, 8222, 13051) },
+ { CDF3(24875, 32120, 32529) },
+ { CDF3(15233, 28265, 31445) },
+ { CDF3( 8605, 20570, 26932) },
+ { CDF3( 5431, 14413, 21196) },
+ { CDF3( 2994, 8341, 13223) },
+ { CDF3(28201, 32604, 32700) },
+ { CDF3(21041, 31446, 32456) },
+ { CDF3(13221, 26213, 30475) },
+ { CDF3( 8255, 19385, 26037) },
+ { CDF3( 4930, 12585, 18830) },
+ { CDF3(28768, 32448, 32627) },
+ { CDF3(19705, 30561, 32021) },
+ { CDF3(11572, 23589, 28220) },
+ { CDF3( 5532, 15034, 21446) },
+ { CDF3( 2460, 7150, 11456) },
+ { CDF3(29874, 32619, 32699) },
+ { CDF3(21621, 31071, 32201) },
+ { CDF3(12511, 24747, 28992) },
+ { CDF3( 6281, 16395, 22748) },
+ { CDF3( 3246, 9278, 14497) },
+ { CDF3(29715, 32625, 32712) },
+ { CDF3(20958, 31011, 32283) },
+ { CDF3(11233, 23671, 28806) },
+ { CDF3( 6012, 16128, 22868) },
+ { CDF3( 3427, 9851, 15414) },
+ }, {
+ { CDF3(11016, 22111, 26794) },
+ { CDF3(25946, 32357, 32677) },
+ { CDF3(17890, 30452, 32252) },
+ { CDF3(11678, 25142, 29816) },
+ { CDF3( 6720, 17534, 24584) },
+ { CDF3( 4230, 11665, 17820) },
+ { CDF3(28400, 32623, 32747) },
+ { CDF3(21164, 31668, 32575) },
+ { CDF3(13572, 27388, 31182) },
+ { CDF3( 8234, 20750, 27358) },
+ { CDF3( 5065, 14055, 20897) },
+ { CDF3(28981, 32547, 32705) },
+ { CDF3(18681, 30543, 32239) },
+ { CDF3(10919, 24075, 29286) },
+ { CDF3( 6431, 17199, 24077) },
+ { CDF3( 3819, 10464, 16618) },
+ { CDF3(26870, 32467, 32693) },
+ { CDF3(19041, 30831, 32347) },
+ { CDF3(11794, 25211, 30016) },
+ { CDF3( 6888, 18019, 24970) },
+ { CDF3( 4370, 12363, 18992) },
+ { CDF3(29578, 32670, 32744) },
+ { CDF3(23159, 32007, 32613) },
+ { CDF3(15315, 28669, 31676) },
+ { CDF3( 9298, 22607, 28782) },
+ { CDF3( 6144, 15913, 22968) },
+ { CDF3(28110, 32499, 32669) },
+ { CDF3(21574, 30937, 32015) },
+ { CDF3(12759, 24818, 28727) },
+ { CDF3( 6545, 16761, 23042) },
+ { CDF3( 3649, 10597, 16833) },
+ { CDF3(28163, 32552, 32728) },
+ { CDF3(22101, 31469, 32464) },
+ { CDF3(13160, 25472, 30143) },
+ { CDF3( 7303, 18684, 25468) },
+ { CDF3( 5241, 13975, 20955) },
+ { CDF3(28400, 32631, 32744) },
+ { CDF3(22104, 31793, 32603) },
+ { CDF3(13557, 26571, 30846) },
+ { CDF3( 7749, 19861, 26675) },
+ { CDF3( 4873, 14030, 21234) },
+ },
+ }, {
+ {
+ { CDF3( 9800, 17635, 21073) },
+ { CDF3(26153, 31885, 32527) },
+ { CDF3(15038, 27852, 31006) },
+ { CDF3( 8718, 20564, 26486) },
+ { CDF3( 5128, 14076, 20514) },
+ { CDF3( 2636, 7566, 11925) },
+ { CDF3(27551, 32504, 32701) },
+ { CDF3(18310, 30054, 32100) },
+ { CDF3(10211, 23420, 29082) },
+ { CDF3( 6222, 16876, 23916) },
+ { CDF3( 3462, 9954, 15498) },
+ { CDF3(29991, 32633, 32721) },
+ { CDF3(19883, 30751, 32201) },
+ { CDF3(11141, 24184, 29285) },
+ { CDF3( 6420, 16940, 23774) },
+ { CDF3( 3392, 9753, 15118) },
+ { CDF3(28465, 32616, 32712) },
+ { CDF3(19850, 30702, 32244) },
+ { CDF3(10983, 24024, 29223) },
+ { CDF3( 6294, 16770, 23582) },
+ { CDF3( 3244, 9283, 14509) },
+ { CDF3(30023, 32717, 32748) },
+ { CDF3(22940, 32032, 32626) },
+ { CDF3(14282, 27928, 31473) },
+ { CDF3( 8562, 21327, 27914) },
+ { CDF3( 4846, 13393, 19919) },
+ { CDF3(29981, 32590, 32695) },
+ { CDF3(20465, 30963, 32166) },
+ { CDF3(11479, 23579, 28195) },
+ { CDF3( 5916, 15648, 22073) },
+ { CDF3( 3031, 8605, 13398) },
+ { CDF3(31146, 32691, 32739) },
+ { CDF3(23106, 31724, 32444) },
+ { CDF3(13783, 26738, 30439) },
+ { CDF3( 7852, 19468, 25807) },
+ { CDF3( 3860, 11124, 16853) },
+ { CDF3(31014, 32724, 32748) },
+ { CDF3(23629, 32109, 32628) },
+ { CDF3(14747, 28115, 31403) },
+ { CDF3( 8545, 21242, 27478) },
+ { CDF3( 4574, 12781, 19067) },
+ }, {
+ { CDF3( 9185, 19694, 24688) },
+ { CDF3(26081, 31985, 32621) },
+ { CDF3(16015, 29000, 31787) },
+ { CDF3(10542, 23690, 29206) },
+ { CDF3( 6732, 17945, 24677) },
+ { CDF3( 3916, 11039, 16722) },
+ { CDF3(28224, 32566, 32744) },
+ { CDF3(19100, 31138, 32485) },
+ { CDF3(12528, 26620, 30879) },
+ { CDF3( 7741, 20277, 26885) },
+ { CDF3( 4566, 12845, 18990) },
+ { CDF3(29933, 32593, 32718) },
+ { CDF3(17670, 30333, 32155) },
+ { CDF3(10385, 23600, 28909) },
+ { CDF3( 6243, 16236, 22407) },
+ { CDF3( 3976, 10389, 16017) },
+ { CDF3(28377, 32561, 32738) },
+ { CDF3(19366, 31175, 32482) },
+ { CDF3(13327, 27175, 31094) },
+ { CDF3( 8258, 20769, 27143) },
+ { CDF3( 4703, 13198, 19527) },
+ { CDF3(31086, 32706, 32748) },
+ { CDF3(22853, 31902, 32583) },
+ { CDF3(14759, 28186, 31419) },
+ { CDF3( 9284, 22382, 28348) },
+ { CDF3( 5585, 15192, 21868) },
+ { CDF3(28291, 32652, 32746) },
+ { CDF3(19849, 32107, 32571) },
+ { CDF3(14834, 26818, 29214) },
+ { CDF3(10306, 22594, 28672) },
+ { CDF3( 6615, 17384, 23384) },
+ { CDF3(28947, 32604, 32745) },
+ { CDF3(25625, 32289, 32646) },
+ { CDF3(18758, 28672, 31403) },
+ { CDF3(10017, 23430, 28523) },
+ { CDF3( 6862, 15269, 22131) },
+ { CDF3(23933, 32509, 32739) },
+ { CDF3(19927, 31495, 32631) },
+ { CDF3(11903, 26023, 30621) },
+ { CDF3( 7026, 20094, 27252) },
+ { CDF3( 5998, 18106, 24437) },
+ },
+ }, {
+ {
+ { CDF3( 4456, 11274, 15533) },
+ { CDF3(21219, 29079, 31616) },
+ { CDF3(11173, 23774, 28567) },
+ { CDF3( 7282, 18293, 24263) },
+ { CDF3( 4890, 13286, 19115) },
+ { CDF3( 1890, 5508, 8659) },
+ { CDF3(26651, 32136, 32647) },
+ { CDF3(14630, 28254, 31455) },
+ { CDF3( 8716, 21287, 27395) },
+ { CDF3( 5615, 15331, 22008) },
+ { CDF3( 2675, 7700, 12150) },
+ { CDF3(29954, 32526, 32690) },
+ { CDF3(16126, 28982, 31633) },
+ { CDF3( 9030, 21361, 27352) },
+ { CDF3( 5411, 14793, 21271) },
+ { CDF3( 2943, 8422, 13163) },
+ { CDF3(29539, 32601, 32730) },
+ { CDF3(18125, 30385, 32201) },
+ { CDF3(10422, 24090, 29468) },
+ { CDF3( 6468, 17487, 24438) },
+ { CDF3( 2970, 8653, 13531) },
+ { CDF3(30912, 32715, 32748) },
+ { CDF3(20666, 31373, 32497) },
+ { CDF3(12509, 26640, 30917) },
+ { CDF3( 8058, 20629, 27290) },
+ { CDF3( 4231, 12006, 18052) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3(10202, 20633, 25484) },
+ { CDF3(27336, 31445, 32352) },
+ { CDF3(12420, 24384, 28552) },
+ { CDF3( 7648, 18115, 23856) },
+ { CDF3( 5662, 14341, 19902) },
+ { CDF3( 3611, 10328, 15390) },
+ { CDF3(30945, 32616, 32736) },
+ { CDF3(18682, 30505, 32253) },
+ { CDF3(11513, 25336, 30203) },
+ { CDF3( 7449, 19452, 26148) },
+ { CDF3( 4482, 13051, 18886) },
+ { CDF3(32022, 32690, 32747) },
+ { CDF3(18578, 30501, 32146) },
+ { CDF3(11249, 23368, 28631) },
+ { CDF3( 5645, 16958, 22158) },
+ { CDF3( 5009, 11444, 16637) },
+ { CDF3(31357, 32710, 32748) },
+ { CDF3(21552, 31494, 32504) },
+ { CDF3(13891, 27677, 31340) },
+ { CDF3( 9051, 22098, 28172) },
+ { CDF3( 5190, 13377, 19486) },
+ { CDF3(32364, 32740, 32748) },
+ { CDF3(24839, 31907, 32551) },
+ { CDF3(17160, 28779, 31696) },
+ { CDF3(12452, 24137, 29602) },
+ { CDF3( 6165, 15389, 22477) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ }, {
+ {
+ { CDF3( 2575, 7281, 11077) },
+ { CDF3(14002, 20866, 25402) },
+ { CDF3( 6343, 15056, 19658) },
+ { CDF3( 4474, 11858, 17041) },
+ { CDF3( 2865, 8299, 12534) },
+ { CDF3( 1344, 3949, 6391) },
+ { CDF3(24720, 31239, 32459) },
+ { CDF3(12585, 25356, 29968) },
+ { CDF3( 7181, 18246, 24444) },
+ { CDF3( 5025, 13667, 19885) },
+ { CDF3( 2521, 7304, 11605) },
+ { CDF3(29908, 32252, 32584) },
+ { CDF3(17421, 29156, 31575) },
+ { CDF3( 9889, 22188, 27782) },
+ { CDF3( 5878, 15647, 22123) },
+ { CDF3( 2814, 8665, 13323) },
+ { CDF3(30183, 32568, 32713) },
+ { CDF3(18528, 30195, 32049) },
+ { CDF3(10982, 24606, 29657) },
+ { CDF3( 6957, 18165, 25231) },
+ { CDF3( 3508, 10118, 15468) },
+ { CDF3(31761, 32736, 32748) },
+ { CDF3(21041, 31328, 32546) },
+ { CDF3(12568, 26732, 31166) },
+ { CDF3( 8052, 20720, 27733) },
+ { CDF3( 4336, 12192, 18396) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ },
+ }, .dc_sign = {
+ { { CDF1(16000) }, { CDF1(13056) }, { CDF1(18816) } },
+ { { CDF1(15232) }, { CDF1(12928) }, { CDF1(17280) } },
+ }, .br_tok = {
+ {
+ {
+ { CDF3(16138, 22223, 25509) },
+ { CDF3(15347, 22430, 26332) },
+ { CDF3( 9614, 16736, 21332) },
+ { CDF3( 6600, 12275, 16907) },
+ { CDF3( 4811, 9424, 13547) },
+ { CDF3( 3748, 7809, 11420) },
+ { CDF3( 2254, 4587, 6890) },
+ { CDF3(15196, 20284, 23177) },
+ { CDF3(18317, 25469, 28451) },
+ { CDF3(13918, 21651, 25842) },
+ { CDF3(10052, 17150, 21995) },
+ { CDF3( 7499, 13630, 18587) },
+ { CDF3( 6158, 11417, 16003) },
+ { CDF3( 4014, 7785, 11252) },
+ { CDF3(15048, 21067, 24384) },
+ { CDF3(18202, 25346, 28553) },
+ { CDF3(14302, 22019, 26356) },
+ { CDF3(10839, 18139, 23166) },
+ { CDF3( 8715, 15744, 20806) },
+ { CDF3( 7536, 13576, 18544) },
+ { CDF3( 5413, 10335, 14498) },
+ }, {
+ { CDF3(17394, 24501, 27895) },
+ { CDF3(15889, 23420, 27185) },
+ { CDF3(11561, 19133, 23870) },
+ { CDF3( 8285, 14812, 19844) },
+ { CDF3( 6496, 12043, 16550) },
+ { CDF3( 4771, 9574, 13677) },
+ { CDF3( 3603, 6830, 10144) },
+ { CDF3(21656, 27704, 30200) },
+ { CDF3(21324, 27915, 30511) },
+ { CDF3(17327, 25336, 28997) },
+ { CDF3(13417, 21381, 26033) },
+ { CDF3(10132, 17425, 22338) },
+ { CDF3( 8580, 15016, 19633) },
+ { CDF3( 5694, 11477, 16411) },
+ { CDF3(24116, 29780, 31450) },
+ { CDF3(23853, 29695, 31591) },
+ { CDF3(20085, 27614, 30428) },
+ { CDF3(15326, 24335, 28575) },
+ { CDF3(11814, 19472, 24810) },
+ { CDF3(10221, 18611, 24767) },
+ { CDF3( 7689, 14558, 20321) },
+ },
+ }, {
+ {
+ { CDF3(16214, 22380, 25770) },
+ { CDF3(14213, 21304, 25295) },
+ { CDF3( 9213, 15823, 20455) },
+ { CDF3( 6395, 11758, 16139) },
+ { CDF3( 4779, 9187, 13066) },
+ { CDF3( 3821, 7501, 10953) },
+ { CDF3( 2293, 4567, 6795) },
+ { CDF3(15859, 21283, 23820) },
+ { CDF3(18404, 25602, 28726) },
+ { CDF3(14325, 21980, 26206) },
+ { CDF3(10669, 17937, 22720) },
+ { CDF3( 8297, 14642, 19447) },
+ { CDF3( 6746, 12389, 16893) },
+ { CDF3( 4324, 8251, 11770) },
+ { CDF3(16532, 21631, 24475) },
+ { CDF3(20667, 27150, 29668) },
+ { CDF3(16728, 24510, 28175) },
+ { CDF3(12861, 20645, 25332) },
+ { CDF3(10076, 17361, 22417) },
+ { CDF3( 8395, 14940, 19963) },
+ { CDF3( 5731, 10683, 14912) },
+ }, {
+ { CDF3(14433, 21155, 24938) },
+ { CDF3(14658, 21716, 25545) },
+ { CDF3( 9923, 16824, 21557) },
+ { CDF3( 6982, 13052, 17721) },
+ { CDF3( 5419, 10503, 15050) },
+ { CDF3( 4852, 9162, 13014) },
+ { CDF3( 3271, 6395, 9630) },
+ { CDF3(22210, 27833, 30109) },
+ { CDF3(20750, 27368, 29821) },
+ { CDF3(16894, 24828, 28573) },
+ { CDF3(13247, 21276, 25757) },
+ { CDF3(10038, 17265, 22563) },
+ { CDF3( 8587, 14947, 20327) },
+ { CDF3( 5645, 11371, 15252) },
+ { CDF3(22027, 27526, 29714) },
+ { CDF3(23098, 29146, 31221) },
+ { CDF3(19886, 27341, 30272) },
+ { CDF3(15609, 23747, 28046) },
+ { CDF3(11993, 20065, 24939) },
+ { CDF3( 9637, 18267, 23671) },
+ { CDF3( 7625, 13801, 19144) },
+ },
+ }, {
+ {
+ { CDF3(14438, 20798, 24089) },
+ { CDF3(12621, 19203, 23097) },
+ { CDF3( 8177, 14125, 18402) },
+ { CDF3( 5674, 10501, 14456) },
+ { CDF3( 4236, 8239, 11733) },
+ { CDF3( 3447, 6750, 9806) },
+ { CDF3( 1986, 3950, 5864) },
+ { CDF3(16208, 22099, 24930) },
+ { CDF3(16537, 24025, 27585) },
+ { CDF3(12780, 20381, 24867) },
+ { CDF3( 9767, 16612, 21416) },
+ { CDF3( 7686, 13738, 18398) },
+ { CDF3( 6333, 11614, 15964) },
+ { CDF3( 3941, 7571, 10836) },
+ { CDF3(22819, 27422, 29202) },
+ { CDF3(22224, 28514, 30721) },
+ { CDF3(17660, 25433, 28913) },
+ { CDF3(13574, 21482, 26002) },
+ { CDF3(10629, 17977, 22938) },
+ { CDF3( 8612, 15298, 20265) },
+ { CDF3( 5607, 10491, 14596) },
+ }, {
+ { CDF3(13569, 19800, 23206) },
+ { CDF3(13128, 19924, 23869) },
+ { CDF3( 8329, 14841, 19403) },
+ { CDF3( 6130, 10976, 15057) },
+ { CDF3( 4682, 8839, 12518) },
+ { CDF3( 3656, 7409, 10588) },
+ { CDF3( 2577, 5099, 7412) },
+ { CDF3(22427, 28684, 30585) },
+ { CDF3(20913, 27750, 30139) },
+ { CDF3(15840, 24109, 27834) },
+ { CDF3(12308, 20029, 24569) },
+ { CDF3(10216, 16785, 21458) },
+ { CDF3( 8309, 14203, 19113) },
+ { CDF3( 6043, 11168, 15307) },
+ { CDF3(23166, 28901, 30998) },
+ { CDF3(21899, 28405, 30751) },
+ { CDF3(18413, 26091, 29443) },
+ { CDF3(15233, 23114, 27352) },
+ { CDF3(12683, 20472, 25288) },
+ { CDF3(10702, 18259, 23409) },
+ { CDF3( 8125, 14464, 19226) },
+ },
+ }, {
+ {
+ { CDF3( 9040, 14786, 18360) },
+ { CDF3( 9979, 15718, 19415) },
+ { CDF3( 7913, 13918, 18311) },
+ { CDF3( 5859, 10889, 15184) },
+ { CDF3( 4593, 8677, 12510) },
+ { CDF3( 3820, 7396, 10791) },
+ { CDF3( 1730, 3471, 5192) },
+ { CDF3(11803, 18365, 22709) },
+ { CDF3(11419, 18058, 22225) },
+ { CDF3( 9418, 15774, 20243) },
+ { CDF3( 7539, 13325, 17657) },
+ { CDF3( 6233, 11317, 15384) },
+ { CDF3( 5137, 9656, 13545) },
+ { CDF3( 2977, 5774, 8349) },
+ { CDF3(21207, 27246, 29640) },
+ { CDF3(19547, 26578, 29497) },
+ { CDF3(16169, 23871, 27690) },
+ { CDF3(12820, 20458, 25018) },
+ { CDF3(10224, 17332, 22214) },
+ { CDF3( 8526, 15048, 19884) },
+ { CDF3( 5037, 9410, 13118) },
+ }, {
+ { CDF3(12339, 17329, 20140) },
+ { CDF3(13505, 19895, 23225) },
+ { CDF3( 9847, 16944, 21564) },
+ { CDF3( 7280, 13256, 18348) },
+ { CDF3( 4712, 10009, 14454) },
+ { CDF3( 4361, 7914, 12477) },
+ { CDF3( 2870, 5628, 7995) },
+ { CDF3(20061, 25504, 28526) },
+ { CDF3(15235, 22878, 26145) },
+ { CDF3(12985, 19958, 24155) },
+ { CDF3( 9782, 16641, 21403) },
+ { CDF3( 9456, 16360, 20760) },
+ { CDF3( 6855, 12940, 18557) },
+ { CDF3( 5661, 10564, 15002) },
+ { CDF3(25656, 30602, 31894) },
+ { CDF3(22570, 29107, 31092) },
+ { CDF3(18917, 26423, 29541) },
+ { CDF3(15940, 23649, 27754) },
+ { CDF3(12803, 20581, 25219) },
+ { CDF3(11082, 18695, 23376) },
+ { CDF3( 7939, 14373, 19005) },
+ },
+ },
+ },
+ }, [3] = {
+ .skip = {
+ {
+ { CDF1(26887) }, { CDF1( 6729) }, { CDF1(10361) },
+ { CDF1(17442) }, { CDF1(15045) }, { CDF1(22478) },
+ { CDF1(29072) }, { CDF1( 2713) }, { CDF1(11861) },
+ { CDF1(20773) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(31903) }, { CDF1( 2044) }, { CDF1( 7528) },
+ { CDF1(14618) }, { CDF1(16182) }, { CDF1(24168) },
+ { CDF1(31037) }, { CDF1( 2786) }, { CDF1(11194) },
+ { CDF1(20155) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(32510) }, { CDF1( 8430) }, { CDF1(17318) },
+ { CDF1(24154) }, { CDF1(23674) }, { CDF1(28789) },
+ { CDF1(32139) }, { CDF1( 3440) }, { CDF1(13117) },
+ { CDF1(22702) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(31671) }, { CDF1( 2056) }, { CDF1(11746) },
+ { CDF1(16852) }, { CDF1(18635) }, { CDF1(24715) },
+ { CDF1(31484) }, { CDF1( 4656) }, { CDF1(16074) },
+ { CDF1(24704) }, { CDF1( 1806) }, { CDF1(14645) },
+ { CDF1(25336) },
+ }, {
+ { CDF1(31539) }, { CDF1( 8433) }, { CDF1(20576) },
+ { CDF1(27904) }, { CDF1(27852) }, { CDF1(30026) },
+ { CDF1(32441) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ },
+ }, .eob_bin_16 = {
+ {
+ { CDF4( 6708, 8958, 14746, 22133) },
+ { CDF4( 1222, 2074, 4783, 15410) },
+ }, {
+ { CDF4(19575, 21766, 26044, 29709) },
+ { CDF4( 7297, 10767, 19273, 28194) },
+ },
+ }, .eob_bin_32 = {
+ {
+ { CDF5( 4617, 5709, 8446, 13584, 23135) },
+ { CDF5( 1156, 1702, 3675, 9274, 20539) },
+ }, {
+ { CDF5(22086, 24282, 27010, 29770, 31743) },
+ { CDF5( 7699, 10897, 20891, 26926, 31628) },
+ },
+ }, .eob_bin_64 = {
+ {
+ { CDF6( 6307, 7541, 12060, 16358, 22553, 27865) },
+ { CDF6( 1289, 2320, 3971, 7926, 14153, 24291) },
+ }, {
+ { CDF6(24212, 25708, 28268, 30035, 31307, 32049) },
+ { CDF6( 8726, 12378, 19409, 26450, 30038, 32462) },
+ },
+ }, .eob_bin_128 = {
+ {
+ { CDF7( 3472, 4885, 7489, 12481, 18517, 24536, 29635) },
+ { CDF7( 886, 1731, 3271, 8469, 15569, 22126, 28383) },
+ }, {
+ { CDF7(24313, 26062, 28385, 30107, 31217, 31898, 32345) },
+ { CDF7( 9165, 13282, 21150, 30286, 31894, 32571, 32712) },
+ },
+ }, .eob_bin_256 = {
+ {
+ { CDF8( 5348, 7113, 11820, 15924,
+ 22106, 26777, 30334, 31757) },
+ { CDF8( 2453, 4474, 6307, 8777,
+ 16474, 22975, 29000, 31547) },
+ }, {
+ { CDF8(23110, 24597, 27140, 28894,
+ 30167, 30927, 31392, 32094) },
+ { CDF8( 9998, 17661, 25178, 28097,
+ 31308, 32038, 32403, 32695) },
+ },
+ }, .eob_bin_512 = {
+ { CDF9( 5927, 7809, 10923, 14597, 19439,
+ 24135, 28456, 31142, 32060) },
+ { CDF9(21093, 23043, 25742, 27658, 29097,
+ 29716, 30073, 30820, 31956) },
+ }, .eob_bin_1024 = {
+ { CDF10( 6698, 8334, 11961, 15762, 20186,
+ 23862, 27434, 29326, 31082, 32050) },
+ { CDF10(20569, 22426, 25569, 26859, 28053,
+ 28913, 29486, 29724, 29807, 32570) },
+ }, .eob_hi_bit = {
+ {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20177) },
+ { CDF1(20789) }, { CDF1(20262) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(21416) },
+ { CDF1(20855) }, { CDF1(23410) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20238) },
+ { CDF1(21057) }, { CDF1(19159) }, { CDF1(22337) },
+ { CDF1(20159) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20125) },
+ { CDF1(20559) }, { CDF1(21707) }, { CDF1(22296) },
+ { CDF1(17333) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(19941) },
+ { CDF1(20527) }, { CDF1(21470) }, { CDF1(22487) },
+ { CDF1(19558) }, { CDF1(22354) }, { CDF1(20331) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(22752) },
+ { CDF1(25006) }, { CDF1(22075) }, { CDF1(21576) },
+ { CDF1(17740) }, { CDF1(21690) }, { CDF1(19211) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(21442) },
+ { CDF1(22358) }, { CDF1(18503) }, { CDF1(20291) },
+ { CDF1(19945) }, { CDF1(21294) }, { CDF1(21178) },
+ { CDF1(19400) }, { CDF1(10556) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(24648) },
+ { CDF1(24949) }, { CDF1(20708) }, { CDF1(23905) },
+ { CDF1(20501) }, { CDF1( 9558) }, { CDF1( 9423) },
+ { CDF1(30365) }, { CDF1(19253) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(26064) },
+ { CDF1(22098) }, { CDF1(19613) }, { CDF1(20525) },
+ { CDF1(17595) }, { CDF1(16618) }, { CDF1(20497) },
+ { CDF1(18989) }, { CDF1(15513) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ },
+ }, .eob_base_tok = {
+ {
+ {
+ { CDF2(22497, 31198) }, { CDF2(31715, 32495) },
+ { CDF2(31606, 32337) }, { CDF2(30388, 31990) },
+ }, {
+ { CDF2(27877, 31584) }, { CDF2(32170, 32728) },
+ { CDF2(32155, 32688) }, { CDF2(32219, 32702) },
+ },
+ }, {
+ {
+ { CDF2(21457, 31043) }, { CDF2(31951, 32483) },
+ { CDF2(32153, 32562) }, { CDF2(31473, 32215) },
+ }, {
+ { CDF2(27558, 31151) }, { CDF2(32020, 32640) },
+ { CDF2(32097, 32575) }, { CDF2(32242, 32719) },
+ },
+ }, {
+ {
+ { CDF2(19980, 30591) }, { CDF2(32219, 32597) },
+ { CDF2(32581, 32706) }, { CDF2(31803, 32287) },
+ }, {
+ { CDF2(26473, 30507) }, { CDF2(32431, 32723) },
+ { CDF2(32196, 32611) }, { CDF2(31588, 32528) },
+ },
+ }, {
+ {
+ { CDF2(24647, 30463) }, { CDF2(32412, 32695) },
+ { CDF2(32468, 32720) }, { CDF2(31269, 32523) },
+ }, {
+ { CDF2(28482, 31505) }, { CDF2(32152, 32701) },
+ { CDF2(31732, 32598) }, { CDF2(31767, 32712) },
+ },
+ }, {
+ {
+ { CDF2(12358, 24977) }, { CDF2(31331, 32385) },
+ { CDF2(32634, 32756) }, { CDF2(30411, 32548) },
+ }, {
+ { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+ { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+ },
+ },
+ }, .base_tok = {
+ {
+ {
+ { CDF3( 7062, 16472, 22319) },
+ { CDF3(24538, 32261, 32674) },
+ { CDF3(13675, 28041, 31779) },
+ { CDF3( 8590, 20674, 27631) },
+ { CDF3( 5685, 14675, 22013) },
+ { CDF3( 3655, 9898, 15731) },
+ { CDF3(26493, 32418, 32658) },
+ { CDF3(16376, 29342, 32090) },
+ { CDF3(10594, 22649, 28970) },
+ { CDF3( 8176, 17170, 24303) },
+ { CDF3( 5605, 12694, 19139) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3(23888, 31902, 32542) },
+ { CDF3(18612, 29687, 31987) },
+ { CDF3(16245, 24852, 29249) },
+ { CDF3(15765, 22608, 27559) },
+ { CDF3(19895, 24699, 27510) },
+ { CDF3(28401, 32212, 32457) },
+ { CDF3(15274, 27825, 30980) },
+ { CDF3( 9364, 18128, 24332) },
+ { CDF3( 2283, 8193, 15082) },
+ { CDF3( 1228, 3972, 7881) },
+ { CDF3(29455, 32469, 32620) },
+ { CDF3(17981, 28245, 31388) },
+ { CDF3(10921, 20098, 26240) },
+ { CDF3( 3743, 11829, 18657) },
+ { CDF3( 2374, 9593, 15715) },
+ { CDF3(31068, 32466, 32635) },
+ { CDF3(20321, 29572, 31971) },
+ { CDF3(10771, 20255, 27119) },
+ { CDF3( 2795, 10410, 17361) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 9320, 22102, 27840) },
+ { CDF3(27057, 32464, 32724) },
+ { CDF3(16331, 30268, 32309) },
+ { CDF3(10319, 23935, 29720) },
+ { CDF3( 6189, 16448, 24106) },
+ { CDF3( 3589, 10884, 18808) },
+ { CDF3(29026, 32624, 32748) },
+ { CDF3(19226, 31507, 32587) },
+ { CDF3(12692, 26921, 31203) },
+ { CDF3( 7049, 19532, 27635) },
+ { CDF3( 7727, 15669, 23252) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3(28056, 32625, 32748) },
+ { CDF3(22383, 32075, 32669) },
+ { CDF3(15417, 27098, 31749) },
+ { CDF3(18127, 26493, 27190) },
+ { CDF3( 5461, 16384, 21845) },
+ { CDF3(27982, 32091, 32584) },
+ { CDF3(19045, 29868, 31972) },
+ { CDF3(10397, 22266, 27932) },
+ { CDF3( 5990, 13697, 21500) },
+ { CDF3( 1792, 6912, 15104) },
+ { CDF3(28198, 32501, 32718) },
+ { CDF3(21534, 31521, 32569) },
+ { CDF3(11109, 25217, 30017) },
+ { CDF3( 5671, 15124, 26151) },
+ { CDF3( 4681, 14043, 18725) },
+ { CDF3(28688, 32580, 32741) },
+ { CDF3(22576, 32079, 32661) },
+ { CDF3(10627, 22141, 28340) },
+ { CDF3( 9362, 14043, 28087) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ }, {
+ {
+ { CDF3( 7754, 16948, 22142) },
+ { CDF3(25670, 32330, 32691) },
+ { CDF3(15663, 29225, 31994) },
+ { CDF3( 9878, 23288, 29158) },
+ { CDF3( 6419, 17088, 24336) },
+ { CDF3( 3859, 11003, 17039) },
+ { CDF3(27562, 32595, 32725) },
+ { CDF3(17575, 30588, 32399) },
+ { CDF3(10819, 24838, 30309) },
+ { CDF3( 7124, 18686, 25916) },
+ { CDF3( 4479, 12688, 19340) },
+ { CDF3(28385, 32476, 32673) },
+ { CDF3(15306, 29005, 31938) },
+ { CDF3( 8937, 21615, 28322) },
+ { CDF3( 5982, 15603, 22786) },
+ { CDF3( 3620, 10267, 16136) },
+ { CDF3(27280, 32464, 32667) },
+ { CDF3(15607, 29160, 32004) },
+ { CDF3( 9091, 22135, 28740) },
+ { CDF3( 6232, 16632, 24020) },
+ { CDF3( 4047, 11377, 17672) },
+ { CDF3(29220, 32630, 32718) },
+ { CDF3(19650, 31220, 32462) },
+ { CDF3(13050, 26312, 30827) },
+ { CDF3( 9228, 20870, 27468) },
+ { CDF3( 6146, 15149, 21971) },
+ { CDF3(30169, 32481, 32623) },
+ { CDF3(17212, 29311, 31554) },
+ { CDF3( 9911, 21311, 26882) },
+ { CDF3( 4487, 13314, 20372) },
+ { CDF3( 2570, 7772, 12889) },
+ { CDF3(30924, 32613, 32708) },
+ { CDF3(19490, 30206, 32107) },
+ { CDF3(11232, 23998, 29276) },
+ { CDF3( 6769, 17955, 25035) },
+ { CDF3( 4398, 12623, 19214) },
+ { CDF3(30609, 32627, 32722) },
+ { CDF3(19370, 30582, 32287) },
+ { CDF3(10457, 23619, 29409) },
+ { CDF3( 6443, 17637, 24834) },
+ { CDF3( 4645, 13236, 20106) },
+ }, {
+ { CDF3( 8626, 20271, 26216) },
+ { CDF3(26707, 32406, 32711) },
+ { CDF3(16999, 30329, 32286) },
+ { CDF3(11445, 25123, 30286) },
+ { CDF3( 6411, 18828, 25601) },
+ { CDF3( 6801, 12458, 20248) },
+ { CDF3(29918, 32682, 32748) },
+ { CDF3(20649, 31739, 32618) },
+ { CDF3(12879, 27773, 31581) },
+ { CDF3( 7896, 21751, 28244) },
+ { CDF3( 5260, 14870, 23698) },
+ { CDF3(29252, 32593, 32731) },
+ { CDF3(17072, 30460, 32294) },
+ { CDF3(10653, 24143, 29365) },
+ { CDF3( 6536, 17490, 23983) },
+ { CDF3( 4929, 13170, 20085) },
+ { CDF3(28137, 32518, 32715) },
+ { CDF3(18171, 30784, 32407) },
+ { CDF3(11437, 25436, 30459) },
+ { CDF3( 7252, 18534, 26176) },
+ { CDF3( 4126, 13353, 20978) },
+ { CDF3(31162, 32726, 32748) },
+ { CDF3(23017, 32222, 32701) },
+ { CDF3(15629, 29233, 32046) },
+ { CDF3( 9387, 22621, 29480) },
+ { CDF3( 6922, 17616, 25010) },
+ { CDF3(28838, 32265, 32614) },
+ { CDF3(19701, 30206, 31920) },
+ { CDF3(11214, 22410, 27933) },
+ { CDF3( 5320, 14177, 23034) },
+ { CDF3( 5049, 12881, 17827) },
+ { CDF3(27484, 32471, 32734) },
+ { CDF3(21076, 31526, 32561) },
+ { CDF3(12707, 26303, 31211) },
+ { CDF3( 8169, 21722, 28219) },
+ { CDF3( 6045, 19406, 27042) },
+ { CDF3(27753, 32572, 32745) },
+ { CDF3(20832, 31878, 32653) },
+ { CDF3(13250, 27356, 31674) },
+ { CDF3( 7718, 21508, 29858) },
+ { CDF3( 7209, 18350, 25559) },
+ },
+ }, {
+ {
+ { CDF3( 7876, 16901, 21741) },
+ { CDF3(24001, 31898, 32625) },
+ { CDF3(14529, 27959, 31451) },
+ { CDF3( 8273, 20818, 27258) },
+ { CDF3( 5278, 14673, 21510) },
+ { CDF3( 2983, 8843, 14039) },
+ { CDF3(28016, 32574, 32732) },
+ { CDF3(17471, 30306, 32301) },
+ { CDF3(10224, 24063, 29728) },
+ { CDF3( 6602, 17954, 25052) },
+ { CDF3( 4002, 11585, 17759) },
+ { CDF3(30190, 32634, 32739) },
+ { CDF3(17497, 30282, 32270) },
+ { CDF3(10229, 23729, 29538) },
+ { CDF3( 6344, 17211, 24440) },
+ { CDF3( 3849, 11189, 17108) },
+ { CDF3(28570, 32583, 32726) },
+ { CDF3(17521, 30161, 32238) },
+ { CDF3(10153, 23565, 29378) },
+ { CDF3( 6455, 17341, 24443) },
+ { CDF3( 3907, 11042, 17024) },
+ { CDF3(30689, 32715, 32748) },
+ { CDF3(21546, 31840, 32610) },
+ { CDF3(13547, 27581, 31459) },
+ { CDF3( 8912, 21757, 28309) },
+ { CDF3( 5548, 15080, 22046) },
+ { CDF3(30783, 32540, 32685) },
+ { CDF3(17540, 29528, 31668) },
+ { CDF3(10160, 21468, 26783) },
+ { CDF3( 4724, 13393, 20054) },
+ { CDF3( 2702, 8174, 13102) },
+ { CDF3(31648, 32686, 32742) },
+ { CDF3(20954, 31094, 32337) },
+ { CDF3(12420, 25698, 30179) },
+ { CDF3( 7304, 19320, 26248) },
+ { CDF3( 4366, 12261, 18864) },
+ { CDF3(31581, 32723, 32748) },
+ { CDF3(21373, 31586, 32525) },
+ { CDF3(12744, 26625, 30885) },
+ { CDF3( 7431, 20322, 26950) },
+ { CDF3( 4692, 13323, 20111) },
+ }, {
+ { CDF3( 7833, 18369, 24095) },
+ { CDF3(26650, 32273, 32702) },
+ { CDF3(16371, 29961, 32191) },
+ { CDF3(11055, 24082, 29629) },
+ { CDF3( 6892, 18644, 25400) },
+ { CDF3( 5006, 13057, 19240) },
+ { CDF3(29834, 32666, 32748) },
+ { CDF3(19577, 31335, 32570) },
+ { CDF3(12253, 26509, 31122) },
+ { CDF3( 7991, 20772, 27711) },
+ { CDF3( 5677, 15910, 23059) },
+ { CDF3(30109, 32532, 32720) },
+ { CDF3(16747, 30166, 32252) },
+ { CDF3(10134, 23542, 29184) },
+ { CDF3( 5791, 16176, 23556) },
+ { CDF3( 4362, 10414, 17284) },
+ { CDF3(29492, 32626, 32748) },
+ { CDF3(19894, 31402, 32525) },
+ { CDF3(12942, 27071, 30869) },
+ { CDF3( 8346, 21216, 27405) },
+ { CDF3( 6572, 17087, 23859) },
+ { CDF3(32035, 32735, 32748) },
+ { CDF3(22957, 31838, 32618) },
+ { CDF3(14724, 28572, 31772) },
+ { CDF3(10364, 23999, 29553) },
+ { CDF3( 7004, 18433, 25655) },
+ { CDF3(27528, 32277, 32681) },
+ { CDF3(16959, 31171, 32096) },
+ { CDF3(10486, 23593, 27962) },
+ { CDF3( 8192, 16384, 23211) },
+ { CDF3( 8937, 17873, 20852) },
+ { CDF3(27715, 32002, 32615) },
+ { CDF3(15073, 29491, 31676) },
+ { CDF3(11264, 24576, 28672) },
+ { CDF3( 2341, 18725, 23406) },
+ { CDF3( 7282, 18204, 25486) },
+ { CDF3(28547, 32213, 32657) },
+ { CDF3(20788, 29773, 32239) },
+ { CDF3( 6780, 21469, 30508) },
+ { CDF3( 5958, 14895, 23831) },
+ { CDF3(16384, 21845, 27307) },
+ },
+ }, {
+ {
+ { CDF3( 5992, 14304, 19765) },
+ { CDF3(22612, 31238, 32456) },
+ { CDF3(13456, 27162, 31087) },
+ { CDF3( 8001, 20062, 26504) },
+ { CDF3( 5168, 14105, 20764) },
+ { CDF3( 2632, 7771, 12385) },
+ { CDF3(27034, 32344, 32709) },
+ { CDF3(15850, 29415, 31997) },
+ { CDF3( 9494, 22776, 28841) },
+ { CDF3( 6151, 16830, 23969) },
+ { CDF3( 3461, 10039, 15722) },
+ { CDF3(30134, 32569, 32731) },
+ { CDF3(15638, 29422, 31945) },
+ { CDF3( 9150, 21865, 28218) },
+ { CDF3( 5647, 15719, 22676) },
+ { CDF3( 3402, 9772, 15477) },
+ { CDF3(28530, 32586, 32735) },
+ { CDF3(17139, 30298, 32292) },
+ { CDF3(10200, 24039, 29685) },
+ { CDF3( 6419, 17674, 24786) },
+ { CDF3( 3544, 10225, 15824) },
+ { CDF3(31333, 32726, 32748) },
+ { CDF3(20618, 31487, 32544) },
+ { CDF3(12901, 27217, 31232) },
+ { CDF3( 8624, 21734, 28171) },
+ { CDF3( 5104, 14191, 20748) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3(11206, 21090, 26561) },
+ { CDF3(28759, 32279, 32671) },
+ { CDF3(14171, 27952, 31569) },
+ { CDF3( 9743, 22907, 29141) },
+ { CDF3( 6871, 17886, 24868) },
+ { CDF3( 4960, 13152, 19315) },
+ { CDF3(31077, 32661, 32748) },
+ { CDF3(19400, 31195, 32515) },
+ { CDF3(12752, 26858, 31040) },
+ { CDF3( 8370, 22098, 28591) },
+ { CDF3( 5457, 15373, 22298) },
+ { CDF3(31697, 32706, 32748) },
+ { CDF3(17860, 30657, 32333) },
+ { CDF3(12510, 24812, 29261) },
+ { CDF3( 6180, 19124, 24722) },
+ { CDF3( 5041, 13548, 17959) },
+ { CDF3(31552, 32716, 32748) },
+ { CDF3(21908, 31769, 32623) },
+ { CDF3(14470, 28201, 31565) },
+ { CDF3( 9493, 22982, 28608) },
+ { CDF3( 6858, 17240, 24137) },
+ { CDF3(32543, 32752, 32756) },
+ { CDF3(24286, 32097, 32666) },
+ { CDF3(15958, 29217, 32024) },
+ { CDF3(10207, 24234, 29958) },
+ { CDF3( 6929, 18305, 25652) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ }, {
+ {
+ { CDF3( 4137, 10847, 15682) },
+ { CDF3(17824, 27001, 30058) },
+ { CDF3(10204, 22796, 28291) },
+ { CDF3( 6076, 15935, 22125) },
+ { CDF3( 3852, 10937, 16816) },
+ { CDF3( 2252, 6324, 10131) },
+ { CDF3(25840, 32016, 32662) },
+ { CDF3(15109, 28268, 31531) },
+ { CDF3( 9385, 22231, 28340) },
+ { CDF3( 6082, 16672, 23479) },
+ { CDF3( 3318, 9427, 14681) },
+ { CDF3(30594, 32574, 32718) },
+ { CDF3(16836, 29552, 31859) },
+ { CDF3( 9556, 22542, 28356) },
+ { CDF3( 6305, 16725, 23540) },
+ { CDF3( 3376, 9895, 15184) },
+ { CDF3(29383, 32617, 32745) },
+ { CDF3(18891, 30809, 32401) },
+ { CDF3(11688, 25942, 30687) },
+ { CDF3( 7468, 19469, 26651) },
+ { CDF3( 3909, 11358, 17012) },
+ { CDF3(31564, 32736, 32748) },
+ { CDF3(20906, 31611, 32600) },
+ { CDF3(13191, 27621, 31537) },
+ { CDF3( 8768, 22029, 28676) },
+ { CDF3( 5079, 14109, 20906) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ },
+ }, .dc_sign = {
+ { { CDF1(16000) }, { CDF1(13056) }, { CDF1(18816) } },
+ { { CDF1(15232) }, { CDF1(12928) }, { CDF1(17280) } },
+ }, .br_tok = {
+ {
+ {
+ { CDF3(18315, 24289, 27551) },
+ { CDF3(16854, 24068, 27835) },
+ { CDF3(10140, 17927, 23173) },
+ { CDF3( 6722, 12982, 18267) },
+ { CDF3( 4661, 9826, 14706) },
+ { CDF3( 3832, 8165, 12294) },
+ { CDF3( 2795, 6098, 9245) },
+ { CDF3(17145, 23326, 26672) },
+ { CDF3(20733, 27680, 30308) },
+ { CDF3(16032, 24461, 28546) },
+ { CDF3(11653, 20093, 25081) },
+ { CDF3( 9290, 16429, 22086) },
+ { CDF3( 7796, 14598, 19982) },
+ { CDF3( 6502, 12378, 17441) },
+ { CDF3(21681, 27732, 30320) },
+ { CDF3(22389, 29044, 31261) },
+ { CDF3(19027, 26731, 30087) },
+ { CDF3(14739, 23755, 28624) },
+ { CDF3(11358, 20778, 25511) },
+ { CDF3(10995, 18073, 24190) },
+ { CDF3( 9162, 14990, 20617) },
+ }, {
+ { CDF3(21425, 27952, 30388) },
+ { CDF3(18062, 25838, 29034) },
+ { CDF3(11956, 19881, 24808) },
+ { CDF3( 7718, 15000, 20980) },
+ { CDF3( 5702, 11254, 16143) },
+ { CDF3( 4898, 9088, 16864) },
+ { CDF3( 3679, 6776, 11907) },
+ { CDF3(23294, 30160, 31663) },
+ { CDF3(24397, 29896, 31836) },
+ { CDF3(19245, 27128, 30593) },
+ { CDF3(13202, 19825, 26404) },
+ { CDF3(11578, 19297, 23957) },
+ { CDF3( 8073, 13297, 21370) },
+ { CDF3( 5461, 10923, 19745) },
+ { CDF3(27367, 30521, 31934) },
+ { CDF3(24904, 30671, 31940) },
+ { CDF3(23075, 28460, 31299) },
+ { CDF3(14400, 23658, 30417) },
+ { CDF3(13885, 23882, 28325) },
+ { CDF3(14746, 22938, 27853) },
+ { CDF3( 5461, 16384, 27307) },
+ },
+ }, {
+ {
+ { CDF3(18274, 24813, 27890) },
+ { CDF3(15537, 23149, 27003) },
+ { CDF3( 9449, 16740, 21827) },
+ { CDF3( 6700, 12498, 17261) },
+ { CDF3( 4988, 9866, 14198) },
+ { CDF3( 4236, 8147, 11902) },
+ { CDF3( 2867, 5860, 8654) },
+ { CDF3(17124, 23171, 26101) },
+ { CDF3(20396, 27477, 30148) },
+ { CDF3(16573, 24629, 28492) },
+ { CDF3(12749, 20846, 25674) },
+ { CDF3(10233, 17878, 22818) },
+ { CDF3( 8525, 15332, 20363) },
+ { CDF3( 6283, 11632, 16255) },
+ { CDF3(20466, 26511, 29286) },
+ { CDF3(23059, 29174, 31191) },
+ { CDF3(19481, 27263, 30241) },
+ { CDF3(15458, 23631, 28137) },
+ { CDF3(12416, 20608, 25693) },
+ { CDF3(10261, 18011, 23261) },
+ { CDF3( 8016, 14655, 19666) },
+ }, {
+ { CDF3(17616, 24586, 28112) },
+ { CDF3(15809, 23299, 27155) },
+ { CDF3(10767, 18890, 23793) },
+ { CDF3( 7727, 14255, 18865) },
+ { CDF3( 6129, 11926, 16882) },
+ { CDF3( 4482, 9704, 14861) },
+ { CDF3( 3277, 7452, 11522) },
+ { CDF3(22956, 28551, 30730) },
+ { CDF3(22724, 28937, 30961) },
+ { CDF3(18467, 26324, 29580) },
+ { CDF3(13234, 20713, 25649) },
+ { CDF3(11181, 17592, 22481) },
+ { CDF3( 8291, 18358, 24576) },
+ { CDF3( 7568, 11881, 14984) },
+ { CDF3(24948, 29001, 31147) },
+ { CDF3(25674, 30619, 32151) },
+ { CDF3(20841, 26793, 29603) },
+ { CDF3(14669, 24356, 28666) },
+ { CDF3(11334, 23593, 28219) },
+ { CDF3( 8922, 14762, 22873) },
+ { CDF3( 8301, 13544, 20535) },
+ },
+ }, {
+ {
+ { CDF3(17113, 23733, 27081) },
+ { CDF3(14139, 21406, 25452) },
+ { CDF3( 8552, 15002, 19776) },
+ { CDF3( 5871, 11120, 15378) },
+ { CDF3( 4455, 8616, 12253) },
+ { CDF3( 3469, 6910, 10386) },
+ { CDF3( 2255, 4553, 6782) },
+ { CDF3(18224, 24376, 27053) },
+ { CDF3(19290, 26710, 29614) },
+ { CDF3(14936, 22991, 27184) },
+ { CDF3(11238, 18951, 23762) },
+ { CDF3( 8786, 15617, 20588) },
+ { CDF3( 7317, 13228, 18003) },
+ { CDF3( 5101, 9512, 13493) },
+ { CDF3(22639, 28222, 30210) },
+ { CDF3(23216, 29331, 31307) },
+ { CDF3(19075, 26762, 29895) },
+ { CDF3(15014, 23113, 27457) },
+ { CDF3(11938, 19857, 24752) },
+ { CDF3( 9942, 17280, 22282) },
+ { CDF3( 7167, 13144, 17752) },
+ }, {
+ { CDF3(15820, 22738, 26488) },
+ { CDF3(13530, 20885, 25216) },
+ { CDF3( 8395, 15530, 20452) },
+ { CDF3( 6574, 12321, 16380) },
+ { CDF3( 5353, 10419, 14568) },
+ { CDF3( 4613, 8446, 12381) },
+ { CDF3( 3440, 7158, 9903) },
+ { CDF3(24247, 29051, 31224) },
+ { CDF3(22118, 28058, 30369) },
+ { CDF3(16498, 24768, 28389) },
+ { CDF3(12920, 21175, 26137) },
+ { CDF3(10730, 18619, 25352) },
+ { CDF3(10187, 16279, 22791) },
+ { CDF3( 9310, 14631, 22127) },
+ { CDF3(24970, 30558, 32057) },
+ { CDF3(24801, 29942, 31698) },
+ { CDF3(22432, 28453, 30855) },
+ { CDF3(19054, 25680, 29580) },
+ { CDF3(14392, 23036, 28109) },
+ { CDF3(12495, 20947, 26650) },
+ { CDF3(12442, 20326, 26214) },
+ },
+ }, {
+ {
+ { CDF3(12162, 18785, 22648) },
+ { CDF3(12749, 19697, 23806) },
+ { CDF3( 8580, 15297, 20346) },
+ { CDF3( 6169, 11749, 16543) },
+ { CDF3( 4836, 9391, 13448) },
+ { CDF3( 3821, 7711, 11613) },
+ { CDF3( 2228, 4601, 7070) },
+ { CDF3(16319, 24725, 28280) },
+ { CDF3(15698, 23277, 27168) },
+ { CDF3(12726, 20368, 25047) },
+ { CDF3( 9912, 17015, 21976) },
+ { CDF3( 7888, 14220, 19179) },
+ { CDF3( 6777, 12284, 17018) },
+ { CDF3( 4492, 8590, 12252) },
+ { CDF3(23249, 28904, 30947) },
+ { CDF3(21050, 27908, 30512) },
+ { CDF3(17440, 25340, 28949) },
+ { CDF3(14059, 22018, 26541) },
+ { CDF3(11288, 18903, 23898) },
+ { CDF3( 9411, 16342, 21428) },
+ { CDF3( 6278, 11588, 15944) },
+ }, {
+ { CDF3(13981, 20067, 23226) },
+ { CDF3(16922, 23580, 26783) },
+ { CDF3(11005, 19039, 24487) },
+ { CDF3( 7389, 14218, 19798) },
+ { CDF3( 5598, 11505, 17206) },
+ { CDF3( 6090, 11213, 15659) },
+ { CDF3( 3820, 7371, 10119) },
+ { CDF3(21082, 26925, 29675) },
+ { CDF3(21262, 28627, 31128) },
+ { CDF3(18392, 26454, 30437) },
+ { CDF3(14870, 22910, 27096) },
+ { CDF3(12620, 19484, 24908) },
+ { CDF3( 9290, 16553, 22802) },
+ { CDF3( 6668, 14288, 20004) },
+ { CDF3(27704, 31055, 31949) },
+ { CDF3(24709, 29978, 31788) },
+ { CDF3(21668, 29264, 31657) },
+ { CDF3(18295, 26968, 30074) },
+ { CDF3(16399, 24422, 29313) },
+ { CDF3(14347, 23026, 28104) },
+ { CDF3(12370, 19806, 24477) },
+ },
+ },
+ },
+ }
+};
+
+void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr,
+ CdfContext *const dst,
+ const CdfContext *const src)
+{
+#define update_cdf_1d(n1d, name) \
+ do { \
+ memcpy(dst->name, src->name, sizeof(dst->name)); \
+ dst->name[n1d] = 0; \
+ } while (0)
+
+#define update_cdf_2d(n1d, n2d, name) \
+ for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
+#define update_cdf_3d(n1d, n2d, n3d, name) \
+ for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
+#define update_cdf_4d(n1d, n2d, n3d, n4d, name) \
+ for (int l = 0; l < (n1d); l++) update_cdf_3d(n2d, n3d, n4d, name[l])
+
+#define update_bit_0d(name) \
+ do { \
+ dst->name[0] = src->name[0]; \
+ dst->name[1] = 0; \
+ } while (0)
+
+#define update_bit_1d(n1d, name) \
+ for (int i = 0; i < (n1d); i++) update_bit_0d(name[i])
+#define update_bit_2d(n1d, n2d, name) \
+ for (int j = 0; j < (n1d); j++) update_bit_1d(n2d, name[j])
+#define update_bit_3d(n1d, n2d, n3d, name) \
+ for (int k = 0; k < (n1d); k++) update_bit_2d(n2d, n3d, name[k])
+
+ update_bit_1d(N_BS_SIZES, m.use_filter_intra);
+ update_cdf_1d(4, m.filter_intra);
+ update_cdf_3d(2, N_INTRA_PRED_MODES, N_UV_INTRA_PRED_MODES - 1 - !k, m.uv_mode);
+ update_cdf_2d(8, 6, m.angle_delta);
+ update_cdf_3d(N_TX_SIZES - 1, 3, imin(k + 1, 2), m.txsz);
+ update_cdf_3d(2, N_INTRA_PRED_MODES, 6, m.txtp_intra1);
+ update_cdf_3d(3, N_INTRA_PRED_MODES, 4, m.txtp_intra2);
+ update_bit_1d(3, m.skip);
+ update_cdf_3d(N_BL_LEVELS, 4, dav1d_partition_type_count[k], m.partition);
+ update_bit_2d(N_TX_SIZES, 13, coef.skip);
+ update_cdf_3d(2, 2, 4, coef.eob_bin_16);
+ update_cdf_3d(2, 2, 5, coef.eob_bin_32);
+ update_cdf_3d(2, 2, 6, coef.eob_bin_64);
+ update_cdf_3d(2, 2, 7, coef.eob_bin_128);
+ update_cdf_3d(2, 2, 8, coef.eob_bin_256);
+ update_cdf_2d(2, 9, coef.eob_bin_512);
+ update_cdf_2d(2, 10, coef.eob_bin_1024);
+ update_bit_3d(N_TX_SIZES, 2, 11 /*22*/, coef.eob_hi_bit);
+ update_cdf_4d(N_TX_SIZES, 2, 4, 2, coef.eob_base_tok);
+ update_cdf_4d(N_TX_SIZES, 2, 41 /*42*/, 3, coef.base_tok);
+ update_bit_2d(2, 3, coef.dc_sign);
+ update_cdf_4d(4, 2, 21, 3, coef.br_tok);
+ update_cdf_2d(3, DAV1D_MAX_SEGMENTS - 1, m.seg_id);
+ update_cdf_1d(7, m.cfl_sign);
+ update_cdf_2d(6, 15, m.cfl_alpha);
+ update_bit_0d(m.restore_wiener);
+ update_bit_0d(m.restore_sgrproj);
+ update_cdf_1d(2, m.restore_switchable);
+ update_cdf_1d(3, m.delta_q);
+ update_cdf_2d(5, 3, m.delta_lf);
+ update_bit_2d(7, 3, m.pal_y);
+ update_bit_1d(2, m.pal_uv);
+ update_cdf_3d(2, 7, 6, m.pal_sz);
+ update_cdf_4d(2, 7, 5, k + 1, m.color_map);
+ update_bit_2d(7, 3, m.txpart);
+ update_cdf_2d(2, 15, m.txtp_inter1);
+ update_cdf_1d(11, m.txtp_inter2);
+ update_bit_1d(4, m.txtp_inter3);
+
+ if (IS_KEY_OR_INTRA(hdr)) {
+ update_bit_0d(m.intrabc);
+
+ update_cdf_1d(N_MV_JOINTS - 1, dmv.joint);
+ for (int k = 0; k < 2; k++) {
+ update_cdf_1d(10, dmv.comp[k].classes);
+ update_bit_0d(dmv.comp[k].class0);
+ update_bit_1d(10, dmv.comp[k].classN);
+ update_bit_0d(dmv.comp[k].sign);
+ }
+ return;
+ }
+
+ update_bit_1d(3, m.skip_mode);
+ update_cdf_2d(4, N_INTRA_PRED_MODES - 1, m.y_mode);
+ update_cdf_3d(2, 8, DAV1D_N_SWITCHABLE_FILTERS - 1, m.filter);
+ update_bit_1d(6, m.newmv_mode);
+ update_bit_1d(2, m.globalmv_mode);
+ update_bit_1d(6, m.refmv_mode);
+ update_bit_1d(3, m.drl_bit);
+ update_cdf_2d(8, N_COMP_INTER_PRED_MODES - 1, m.comp_inter_mode);
+ update_bit_1d(4, m.intra);
+ update_bit_1d(5, m.comp);
+ update_bit_1d(5, m.comp_dir);
+ update_bit_1d(6, m.jnt_comp);
+ update_bit_1d(6, m.mask_comp);
+ update_bit_1d(9, m.wedge_comp);
+ update_cdf_2d(9, 15, m.wedge_idx);
+ update_bit_2d(6, 3, m.ref);
+ update_bit_2d(3, 3, m.comp_fwd_ref);
+ update_bit_2d(2, 3, m.comp_bwd_ref);
+ update_bit_2d(3, 3, m.comp_uni_ref);
+ update_bit_1d(3, m.seg_pred);
+ update_bit_1d(4, m.interintra);
+ update_bit_1d(7, m.interintra_wedge);
+ update_cdf_2d(4, 3, m.interintra_mode);
+ update_cdf_2d(N_BS_SIZES, 2, m.motion_mode);
+ update_bit_1d(N_BS_SIZES, m.obmc);
+
+ update_cdf_1d(N_MV_JOINTS - 1, mv.joint);
+ for (int k = 0; k < 2; k++) {
+ update_cdf_1d(10, mv.comp[k].classes);
+ update_bit_0d(mv.comp[k].class0);
+ update_bit_1d(10, mv.comp[k].classN);
+ update_cdf_2d(2, 3, mv.comp[k].class0_fp);
+ update_cdf_1d(3, mv.comp[k].classN_fp);
+ update_bit_0d(mv.comp[k].class0_hp);
+ update_bit_0d(mv.comp[k].classN_hp);
+ update_bit_0d(mv.comp[k].sign);
+ }
+}
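As a concrete reading aid for the macros above (not part of the patch itself): each
CDF array keeps its adapted probability values in the leading slots, and the slot
immediately after them holds the adaptation counter used by the entropy decoder, so
"copy the array, zero the counter" is all an update needs. For example,
update_cdf_1d(4, m.filter_intra) expands (modulo the do/while wrapper) to:

    memcpy(dst->m.filter_intra, src->m.filter_intra, sizeof(dst->m.filter_intra));
    dst->m.filter_intra[4] = 0;   /* keep the adapted CDF, reset its counter */

and update_bit_0d() does the same for boolean CDFs, copying element [0] and zeroing
the counter in element [1].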
+
+/*
+ * CDF threading wrappers.
+ */
+static inline int get_qcat_idx(const int q) {
+ if (q <= 20) return 0;
+ if (q <= 60) return 1;
+ if (q <= 120) return 2;
+ return 3;
+}
+
+void dav1d_cdf_thread_init_static(CdfThreadContext *const cdf, const int qidx) {
+ cdf->ref = NULL;
+ cdf->data.qcat = get_qcat_idx(qidx);
+}
+
+void dav1d_cdf_thread_copy(CdfContext *const dst, const CdfThreadContext *const src) {
+ if (src->ref) {
+ memcpy(dst, src->data.cdf, sizeof(*dst));
+ } else {
+ dst->m = av1_default_cdf;
+ memcpy(dst->kfym, default_kf_y_mode_cdf, sizeof(default_kf_y_mode_cdf));
+ dst->coef = av1_default_coef_cdf[src->data.qcat];
+ memcpy(dst->mv.joint, default_mv_joint_cdf, sizeof(default_mv_joint_cdf));
+ memcpy(dst->dmv.joint, default_mv_joint_cdf, sizeof(default_mv_joint_cdf));
+ dst->mv.comp[0] = dst->mv.comp[1] = dst->dmv.comp[0] = dst->dmv.comp[1] =
+ default_mv_component_cdf;
+ }
+}
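A worked example of the qidx-to-category mapping used here: a base quantizer index
of 100 satisfies neither q <= 20 nor q <= 60 but does satisfy q <= 120, so
get_qcat_idx() returns 2, and a context initialized from the static tables starts
its coefficient CDFs from av1_default_coef_cdf[2].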
+
+int dav1d_cdf_thread_alloc(Dav1dContext *const c, CdfThreadContext *const cdf,
+ const int have_frame_mt)
+{
+ cdf->ref = dav1d_ref_create_using_pool(c->cdf_pool,
+ sizeof(CdfContext) + sizeof(atomic_uint));
+ if (!cdf->ref) return DAV1D_ERR(ENOMEM);
+ cdf->data.cdf = cdf->ref->data;
+ if (have_frame_mt) {
+ cdf->progress = (atomic_uint *) &cdf->data.cdf[1];
+ atomic_init(cdf->progress, 0);
+ }
+ return 0;
+}
+
+void dav1d_cdf_thread_ref(CdfThreadContext *const dst,
+ CdfThreadContext *const src)
+{
+ *dst = *src;
+ if (src->ref)
+ dav1d_ref_inc(src->ref);
+}
+
+void dav1d_cdf_thread_unref(CdfThreadContext *const cdf) {
+ memset(&cdf->data, 0, sizeof(*cdf) - offsetof(CdfThreadContext, data));
+ dav1d_ref_dec(&cdf->ref);
+}
diff --git a/third_party/dav1d/src/cdf.h b/third_party/dav1d/src/cdf.h
new file mode 100644
index 0000000000..4b30474baa
--- /dev/null
+++ b/third_party/dav1d/src/cdf.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_CDF_H
+#define DAV1D_SRC_CDF_H
+
+#include <stdint.h>
+
+#include "src/levels.h"
+#include "src/ref.h"
+#include "src/thread_data.h"
+
+/* Buffers padded to [8] or [16] for SIMD where needed. */
+
+typedef struct CdfModeContext {
+ ALIGN(uint16_t y_mode[4][N_INTRA_PRED_MODES + 3], 32);
+ ALIGN(uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 2], 32);
+ ALIGN(uint16_t wedge_idx[9][16], 32);
+ ALIGN(uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 6], 32);
+ ALIGN(uint16_t cfl_alpha[6][16], 32);
+ ALIGN(uint16_t txtp_inter1[2][16], 32);
+ ALIGN(uint16_t txtp_inter2[12 + 4], 32);
+ ALIGN(uint16_t txtp_intra1[2][N_INTRA_PRED_MODES][7 + 1], 16);
+ ALIGN(uint16_t txtp_intra2[3][N_INTRA_PRED_MODES][5 + 3], 16);
+ ALIGN(uint16_t cfl_sign[8], 16);
+ ALIGN(uint16_t angle_delta[8][8], 16);
+ ALIGN(uint16_t filter_intra[5 + 3], 16);
+ ALIGN(uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES], 16);
+ ALIGN(uint16_t seg_id[3][DAV1D_MAX_SEGMENTS], 16);
+ ALIGN(uint16_t pal_sz[2][7][7 + 1], 16);
+ ALIGN(uint16_t color_map[2][7][5][8], 16);
+ ALIGN(uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1], 8);
+ ALIGN(uint16_t txsz[N_TX_SIZES - 1][3][4], 8);
+ ALIGN(uint16_t motion_mode[N_BS_SIZES][3 + 1], 8);
+ ALIGN(uint16_t delta_q[4], 8);
+ ALIGN(uint16_t delta_lf[5][4], 8);
+ ALIGN(uint16_t interintra_mode[4][4], 8);
+ ALIGN(uint16_t restore_switchable[3 + 1], 8);
+ ALIGN(uint16_t restore_wiener[2], 4);
+ ALIGN(uint16_t restore_sgrproj[2], 4);
+ ALIGN(uint16_t interintra[7][2], 4);
+ ALIGN(uint16_t interintra_wedge[7][2], 4);
+ ALIGN(uint16_t txtp_inter3[4][2], 4);
+ ALIGN(uint16_t use_filter_intra[N_BS_SIZES][2], 4);
+ ALIGN(uint16_t newmv_mode[6][2], 4);
+ ALIGN(uint16_t globalmv_mode[2][2], 4);
+ ALIGN(uint16_t refmv_mode[6][2], 4);
+ ALIGN(uint16_t drl_bit[3][2], 4);
+ ALIGN(uint16_t intra[4][2], 4);
+ ALIGN(uint16_t comp[5][2], 4);
+ ALIGN(uint16_t comp_dir[5][2], 4);
+ ALIGN(uint16_t jnt_comp[6][2], 4);
+ ALIGN(uint16_t mask_comp[6][2], 4);
+ ALIGN(uint16_t wedge_comp[9][2], 4);
+ ALIGN(uint16_t ref[6][3][2], 4);
+ ALIGN(uint16_t comp_fwd_ref[3][3][2], 4);
+ ALIGN(uint16_t comp_bwd_ref[2][3][2], 4);
+ ALIGN(uint16_t comp_uni_ref[3][3][2], 4);
+ ALIGN(uint16_t txpart[7][3][2], 4);
+ ALIGN(uint16_t skip[3][2], 4);
+ ALIGN(uint16_t skip_mode[3][2], 4);
+ ALIGN(uint16_t seg_pred[3][2], 4);
+ ALIGN(uint16_t obmc[N_BS_SIZES][2], 4);
+ ALIGN(uint16_t pal_y[7][3][2], 4);
+ ALIGN(uint16_t pal_uv[2][2], 4);
+ ALIGN(uint16_t intrabc[2], 4);
+} CdfModeContext;
+
+typedef struct CdfCoefContext {
+ ALIGN(uint16_t eob_bin_16[2][2][5 + 3], 16);
+ ALIGN(uint16_t eob_bin_32[2][2][6 + 2], 16);
+ ALIGN(uint16_t eob_bin_64[2][2][7 + 1], 16);
+ ALIGN(uint16_t eob_bin_128[2][2][8 + 0], 16);
+ ALIGN(uint16_t eob_bin_256[2][2][9 + 7], 32);
+ ALIGN(uint16_t eob_bin_512[2][10 + 6], 32);
+ ALIGN(uint16_t eob_bin_1024[2][11 + 5], 32);
+ ALIGN(uint16_t eob_base_tok[N_TX_SIZES][2][4][4], 8);
+ ALIGN(uint16_t base_tok[N_TX_SIZES][2][41][4], 8);
+ ALIGN(uint16_t br_tok[4 /*5*/][2][21][4], 8);
+ ALIGN(uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2], 4);
+ ALIGN(uint16_t skip[N_TX_SIZES][13][2], 4);
+ ALIGN(uint16_t dc_sign[2][3][2], 4);
+} CdfCoefContext;
+
+typedef struct CdfMvComponent {
+ ALIGN(uint16_t classes[11 + 5], 32);
+ ALIGN(uint16_t class0_fp[2][4], 8);
+ ALIGN(uint16_t classN_fp[4], 8);
+ ALIGN(uint16_t class0_hp[2], 4);
+ ALIGN(uint16_t classN_hp[2], 4);
+ ALIGN(uint16_t class0[2], 4);
+ ALIGN(uint16_t classN[10][2], 4);
+ ALIGN(uint16_t sign[2], 4);
+} CdfMvComponent;
+
+typedef struct CdfMvContext {
+ CdfMvComponent comp[2];
+ ALIGN(uint16_t joint[N_MV_JOINTS], 8);
+} CdfMvContext;
+
+typedef struct CdfContext {
+ CdfModeContext m;
+ ALIGN(uint16_t kfym[5][5][N_INTRA_PRED_MODES + 3], 32);
+ CdfCoefContext coef;
+ CdfMvContext mv, dmv;
+} CdfContext;
+
+typedef struct CdfThreadContext {
+ Dav1dRef *ref; ///< allocation origin
+ union {
+ CdfContext *cdf; // if ref != NULL
+ unsigned qcat; // if ref == NULL, from static CDF tables
+ } data;
+ atomic_uint *progress;
+} CdfThreadContext;
+
+void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, int qidx);
+int dav1d_cdf_thread_alloc(Dav1dContext *c, CdfThreadContext *cdf,
+ const int have_frame_mt);
+void dav1d_cdf_thread_copy(CdfContext *dst, const CdfThreadContext *src);
+void dav1d_cdf_thread_ref(CdfThreadContext *dst, CdfThreadContext *src);
+void dav1d_cdf_thread_unref(CdfThreadContext *cdf);
+void dav1d_cdf_thread_update(const Dav1dFrameHeader *hdr, CdfContext *dst,
+ const CdfContext *src);
+
+#endif /* DAV1D_SRC_CDF_H */
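A rough sketch of how these entry points compose, written as hypothetical driver
code (the names c, prev, frame_hdr, qidx and have_frame_mt, as well as the exact
sequencing, are assumptions simplified from the real frame-submission path):

    CdfThreadContext in_cdf = { 0 }, out_cdf = { 0 };
    CdfContext cdf;   /* large; in the real decoder this lives in the tile state */

    if (prev)                                  /* adapted CDFs from an earlier frame */
        dav1d_cdf_thread_ref(&in_cdf, prev);
    else                                       /* or the built-in defaults for qidx */
        dav1d_cdf_thread_init_static(&in_cdf, qidx);

    dav1d_cdf_thread_copy(&cdf, &in_cdf);      /* materialize a mutable working copy */
    /* ... decode symbols, adapting cdf as they are read ... */

    if (dav1d_cdf_thread_alloc(c, &out_cdf, have_frame_mt) == 0)
        dav1d_cdf_thread_update(frame_hdr, out_cdf.data.cdf, &cdf);

    dav1d_cdf_thread_unref(&in_cdf);
    dav1d_cdf_thread_unref(&out_cdf);          /* real code keeps this as the next
                                                * frame's input instead */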
diff --git a/third_party/dav1d/src/cpu.c b/third_party/dav1d/src/cpu.c
new file mode 100644
index 0000000000..9bb85f151b
--- /dev/null
+++ b/third_party/dav1d/src/cpu.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "config.h"
+
+#include <stdint.h>
+
+#include "src/cpu.h"
+#include "src/log.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#else
+#include <pthread.h>
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_PTHREAD_NP_H
+#include <pthread_np.h>
+#endif
+#if defined(__FreeBSD__)
+#define cpu_set_t cpuset_t
+#endif
+
+unsigned dav1d_cpu_flags = 0U;
+unsigned dav1d_cpu_flags_mask = ~0U;
+
+COLD void dav1d_init_cpu(void) {
+#if HAVE_ASM && !__has_feature(memory_sanitizer)
+// memory sanitizer is inherently incompatible with asm
+#if ARCH_AARCH64 || ARCH_ARM
+ dav1d_cpu_flags = dav1d_get_cpu_flags_arm();
+#elif ARCH_LOONGARCH
+ dav1d_cpu_flags = dav1d_get_cpu_flags_loongarch();
+#elif ARCH_PPC64LE
+ dav1d_cpu_flags = dav1d_get_cpu_flags_ppc();
+#elif ARCH_RISCV
+ dav1d_cpu_flags = dav1d_get_cpu_flags_riscv();
+#elif ARCH_X86
+ dav1d_cpu_flags = dav1d_get_cpu_flags_x86();
+#endif
+#endif
+}
+
+COLD void dav1d_set_cpu_flags_mask(const unsigned mask) {
+ dav1d_cpu_flags_mask = mask;
+}
+
+COLD int dav1d_num_logical_processors(Dav1dContext *const c) {
+#ifdef _WIN32
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+ GROUP_AFFINITY affinity;
+ if (GetThreadGroupAffinity(GetCurrentThread(), &affinity)) {
+ int num_processors = 1;
+ while (affinity.Mask &= affinity.Mask - 1)
+ num_processors++;
+ return num_processors;
+ }
+#else
+ SYSTEM_INFO system_info;
+ GetNativeSystemInfo(&system_info);
+ return system_info.dwNumberOfProcessors;
+#endif
+#elif defined(HAVE_PTHREAD_GETAFFINITY_NP) && defined(CPU_COUNT)
+ cpu_set_t affinity;
+ if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity))
+ return CPU_COUNT(&affinity);
+#elif defined(__APPLE__)
+ int num_processors;
+ size_t length = sizeof(num_processors);
+ if (!sysctlbyname("hw.logicalcpu", &num_processors, &length, NULL, 0))
+ return num_processors;
+#elif defined(_SC_NPROCESSORS_ONLN)
+ return (int)sysconf(_SC_NPROCESSORS_ONLN);
+#endif
+ if (c)
+ dav1d_log(c, "Unable to detect thread count, defaulting to single-threaded mode\n");
+ return 1;
+}
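The Windows desktop branch above counts the processors in the thread's group
affinity with the classic clear-lowest-set-bit loop: num_processors starts at 1 and
each pass of affinity.Mask &= affinity.Mask - 1 removes one set bit, so a mask of
0b101100 shrinks to 0b101000, then 0b100000, then 0, yielding 3.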
diff --git a/third_party/dav1d/src/cpu.h b/third_party/dav1d/src/cpu.h
new file mode 100644
index 0000000000..c9009c7778
--- /dev/null
+++ b/third_party/dav1d/src/cpu.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright © 2018-2022, VideoLAN and dav1d authors
+ * Copyright © 2018-2022, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_CPU_H
+#define DAV1D_SRC_CPU_H
+
+#include "config.h"
+
+#include "common/attributes.h"
+
+#include "dav1d/common.h"
+#include "dav1d/dav1d.h"
+
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/cpu.h"
+#elif ARCH_LOONGARCH
+#include "src/loongarch/cpu.h"
+#elif ARCH_PPC64LE
+#include "src/ppc/cpu.h"
+#elif ARCH_RISCV
+#include "src/riscv/cpu.h"
+#elif ARCH_X86
+#include "src/x86/cpu.h"
+#endif
+
+EXTERN unsigned dav1d_cpu_flags;
+EXTERN unsigned dav1d_cpu_flags_mask;
+
+void dav1d_init_cpu(void);
+DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask);
+int dav1d_num_logical_processors(Dav1dContext *c);
+
+static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
+ unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
+
+#if TRIM_DSP_FUNCTIONS
+/* Since this function is inlined, unconditionally setting a flag here will
+ * enable dead code elimination in the calling function. */
+#if ARCH_AARCH64 || ARCH_ARM
+#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
+ flags |= DAV1D_ARM_CPU_FLAG_NEON;
+#endif
+#elif ARCH_PPC64LE
+#if defined(__VSX__)
+ flags |= DAV1D_PPC_CPU_FLAG_VSX;
+#endif
+#elif ARCH_RISCV
+#if defined(__riscv_v)
+ flags |= DAV1D_RISCV_CPU_FLAG_V;
+#endif
+#elif ARCH_X86
+#if defined(__AVX512F__) && defined(__AVX512CD__) && \
+ defined(__AVX512BW__) && defined(__AVX512DQ__) && \
+ defined(__AVX512VL__) && defined(__AVX512VNNI__) && \
+ defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \
+ defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
+ defined(__AVX512BITALG__) && defined(__GFNI__) && \
+ defined(__VAES__) && defined(__VPCLMULQDQ__)
+ flags |= DAV1D_X86_CPU_FLAG_AVX512ICL |
+ DAV1D_X86_CPU_FLAG_AVX2 |
+ DAV1D_X86_CPU_FLAG_SSE41 |
+ DAV1D_X86_CPU_FLAG_SSSE3 |
+ DAV1D_X86_CPU_FLAG_SSE2;
+#elif defined(__AVX2__)
+ flags |= DAV1D_X86_CPU_FLAG_AVX2 |
+ DAV1D_X86_CPU_FLAG_SSE41 |
+ DAV1D_X86_CPU_FLAG_SSSE3 |
+ DAV1D_X86_CPU_FLAG_SSE2;
+#elif defined(__SSE4_1__) || defined(__AVX__)
+ flags |= DAV1D_X86_CPU_FLAG_SSE41 |
+ DAV1D_X86_CPU_FLAG_SSSE3 |
+ DAV1D_X86_CPU_FLAG_SSE2;
+#elif defined(__SSSE3__)
+ flags |= DAV1D_X86_CPU_FLAG_SSSE3 |
+ DAV1D_X86_CPU_FLAG_SSE2;
+#elif ARCH_X86_64 || defined(__SSE2__) || \
+ (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+ flags |= DAV1D_X86_CPU_FLAG_SSE2;
+#endif
+#endif
+#endif
+
+ return flags;
+}
+
+#endif /* DAV1D_SRC_CPU_H */
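A minimal sketch of the dispatch pattern this header supports (the type
ExampleDSPContext and the functions example_dsp_init(), some_fn_c() and
some_fn_avx2() are made up, not dav1d symbols): a module selects function pointers
at runtime from dav1d_get_cpu_flags(), and because the flag check can collapse to a
compile-time constant under TRIM_DSP_FUNCTIONS, unreferenced fallbacks can be
removed by dead code elimination.

    typedef struct ExampleDSPContext {
        void (*some_fn)(void);
    } ExampleDSPContext;

    void some_fn_c(void);      /* portable C implementation */
    void some_fn_avx2(void);   /* hand-written SIMD implementation */

    static void example_dsp_init(ExampleDSPContext *const c) {
        c->some_fn = some_fn_c;               /* default: C fallback */
    #if ARCH_X86
        const unsigned flags = dav1d_get_cpu_flags();
        if (flags & DAV1D_X86_CPU_FLAG_AVX2)
            c->some_fn = some_fn_avx2;        /* override when AVX2 is usable */
    #endif
    }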
diff --git a/third_party/dav1d/src/ctx.h b/third_party/dav1d/src/ctx.h
new file mode 100644
index 0000000000..d0e1f310ae
--- /dev/null
+++ b/third_party/dav1d/src/ctx.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_CTX_H
+#define DAV1D_SRC_CTX_H
+
+#include <stdint.h>
+
+#include "common/attributes.h"
+
+union alias64 { uint64_t u64; uint8_t u8[8]; } ATTR_ALIAS;
+union alias32 { uint32_t u32; uint8_t u8[4]; } ATTR_ALIAS;
+union alias16 { uint16_t u16; uint8_t u8[2]; } ATTR_ALIAS;
+union alias8 { uint8_t u8; } ATTR_ALIAS;
+
+#define set_ctx_rep4(type, var, off, val) do { \
+ const uint64_t const_val = val; \
+ ((union alias64 *) &var[off + 0])->u64 = const_val; \
+ ((union alias64 *) &var[off + 8])->u64 = const_val; \
+ ((union alias64 *) &var[off + 16])->u64 = const_val; \
+ ((union alias64 *) &var[off + 24])->u64 = const_val; \
+ } while (0)
+#define set_ctx_rep2(type, var, off, val) do { \
+ const uint64_t const_val = val; \
+ ((union alias64 *) &var[off + 0])->u64 = const_val; \
+ ((union alias64 *) &var[off + 8])->u64 = const_val; \
+ } while (0)
+#define set_ctx_rep1(typesz, var, off, val) \
+ ((union alias##typesz *) &var[off])->u##typesz = val
+#define case_set(var, dir, diridx, off) \
+ switch (var) { \
+ case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+ case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+ case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+ case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+ case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+ case 32: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
+ }
+#define case_set_upto16(var, dir, diridx, off) \
+ switch (var) { \
+ case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+ case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+ case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+ case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+ case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+ }
+#define case_set_upto32_with_default(var, dir, diridx, off) \
+ switch (var) { \
+ case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+ case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+ case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+ case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+ case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+ case 32: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
+ default: default_memset(dir, diridx, off, var); break; \
+ }
+#define case_set_upto16_with_default(var, dir, diridx, off) \
+ switch (var) { \
+ case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+ case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+ case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+ case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+ case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+ default: default_memset(dir, diridx, off, var); break; \
+ }
+
+#endif /* DAV1D_SRC_CTX_H */
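To illustrate how callers are expected to use these macros (a hypothetical,
simplified caller; ctx, off and tx are made-up names, though the real call sites in
decode.c follow the same define/dispatch/undef shape): set_ctx() is defined to name
the destination and value, one of the case_set*() dispatchers is invoked with the
run length, and set_ctx() is undefined again.

    uint8_t ctx[32];
    const int off = 8, tx = 3;   /* hypothetical offset and value */

    #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
        rep_macro(type, ctx, off, mul * tx)
    case_set(4, none, 0, off);
    #undef set_ctx

For the run length 4 this selects the 32-bit case, which expands to

    ((union alias32 *) &ctx[off])->u32 = 0x01010101U * tx;

i.e. four context bytes are written with the value tx in a single store.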
diff --git a/third_party/dav1d/src/data.c b/third_party/dav1d/src/data.c
new file mode 100644
index 0000000000..bbbe02e8d1
--- /dev/null
+++ b/third_party/dav1d/src/data.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "dav1d/data.h"
+
+#include "common/attributes.h"
+#include "common/validate.h"
+
+#include "src/data.h"
+#include "src/ref.h"
+
+uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) {
+ validate_input_or_ret(buf != NULL, NULL);
+
+ if (sz > SIZE_MAX / 2) return NULL;
+ buf->ref = dav1d_ref_create(ALLOC_DAV1DDATA, sz);
+ if (!buf->ref) return NULL;
+ buf->data = buf->ref->const_data;
+ buf->sz = sz;
+ dav1d_data_props_set_defaults(&buf->m);
+ buf->m.size = sz;
+
+ return buf->ref->data;
+}
+
+int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr,
+ const size_t sz,
+ void (*const free_callback)(const uint8_t *data,
+ void *cookie),
+ void *const cookie)
+{
+ validate_input_or_ret(buf != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(ptr != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL));
+
+ if (sz > SIZE_MAX / 2) return DAV1D_ERR(EINVAL);
+ Dav1dRef *const ref = dav1d_malloc(ALLOC_DAV1DDATA, sizeof(Dav1dRef));
+ if (!ref) return DAV1D_ERR(ENOMEM);
+
+ buf->ref = dav1d_ref_init(ref, ptr, free_callback, cookie, 1);
+ buf->data = ptr;
+ buf->sz = sz;
+ dav1d_data_props_set_defaults(&buf->m);
+ buf->m.size = sz;
+
+ return 0;
+}
+
+int dav1d_data_wrap_user_data_internal(Dav1dData *const buf,
+ const uint8_t *const user_data,
+ void (*const free_callback)(const uint8_t *user_data,
+ void *cookie),
+ void *const cookie)
+{
+ validate_input_or_ret(buf != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL));
+
+ Dav1dRef *const ref = dav1d_malloc(ALLOC_DAV1DDATA, sizeof(Dav1dRef));
+ if (!ref) return DAV1D_ERR(ENOMEM);
+
+ buf->m.user_data.ref = dav1d_ref_init(ref, user_data, free_callback, cookie, 1);
+ buf->m.user_data.data = user_data;
+
+ return 0;
+}
+
+void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) {
+ assert(dst != NULL);
+ assert(dst->data == NULL);
+ assert(src != NULL);
+
+ if (src->ref) {
+ assert(src->data != NULL);
+ dav1d_ref_inc(src->ref);
+ }
+ if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
+ *dst = *src;
+}
+
+void dav1d_data_props_copy(Dav1dDataProps *const dst,
+ const Dav1dDataProps *const src)
+{
+ assert(dst != NULL);
+ assert(src != NULL);
+
+ dav1d_ref_dec(&dst->user_data.ref);
+ *dst = *src;
+ if (dst->user_data.ref) dav1d_ref_inc(dst->user_data.ref);
+}
+
+void dav1d_data_props_set_defaults(Dav1dDataProps *const props) {
+ assert(props != NULL);
+
+ memset(props, 0, sizeof(*props));
+ props->timestamp = INT64_MIN;
+ props->offset = -1;
+}
+
+void dav1d_data_props_unref_internal(Dav1dDataProps *const props) {
+ validate_input(props != NULL);
+
+ struct Dav1dRef *user_data_ref = props->user_data.ref;
+ dav1d_data_props_set_defaults(props);
+ dav1d_ref_dec(&user_data_ref);
+}
+
+void dav1d_data_unref_internal(Dav1dData *const buf) {
+ validate_input(buf != NULL);
+
+ struct Dav1dRef *user_data_ref = buf->m.user_data.ref;
+ if (buf->ref) {
+ validate_input(buf->data != NULL);
+ dav1d_ref_dec(&buf->ref);
+ }
+ memset(buf, 0, sizeof(*buf));
+ dav1d_data_props_set_defaults(&buf->m);
+ dav1d_ref_dec(&user_data_ref);
+}
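For orientation, a sketch of wrapping an externally owned buffer through this path
(hypothetical caller; buf, wrap_free_nop() and wrap_example() are assumptions, and
the public dav1d_data_wrap() entry point is expected to forward to the internal
helper shown above):

    static void wrap_free_nop(const uint8_t *const data, void *const cookie) {
        (void) data; (void) cookie;      /* buffer outlives the Dav1dData */
    }

    static int wrap_example(void) {
        static uint8_t buf[1024];        /* hypothetical compressed payload */
        Dav1dData in = { 0 };
        const int ret = dav1d_data_wrap_internal(&in, buf, sizeof(buf),
                                                 wrap_free_nop, NULL);
        if (ret < 0) return ret;         /* DAV1D_ERR(EINVAL) or DAV1D_ERR(ENOMEM) */
        /* in.data/in.sz now alias buf; in.ref carries the callback, which is
         * expected to fire once the last reference is dropped. */
        dav1d_data_unref_internal(&in);
        return 0;
    }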
diff --git a/third_party/dav1d/src/data.h b/third_party/dav1d/src/data.h
new file mode 100644
index 0000000000..b34c1db702
--- /dev/null
+++ b/third_party/dav1d/src/data.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_DATA_H
+#define DAV1D_SRC_DATA_H
+
+#include "dav1d/data.h"
+
+void dav1d_data_ref(Dav1dData *dst, const Dav1dData *src);
+
+/**
+ * Copy the source properties to the destination and increase the
+ * user_data's reference count (if it's not NULL).
+ */
+void dav1d_data_props_copy(Dav1dDataProps *dst, const Dav1dDataProps *src);
+
+void dav1d_data_props_set_defaults(Dav1dDataProps *props);
+
+uint8_t *dav1d_data_create_internal(Dav1dData *buf, size_t sz);
+int dav1d_data_wrap_internal(Dav1dData *buf, const uint8_t *ptr, size_t sz,
+ void (*free_callback)(const uint8_t *data,
+ void *user_data),
+ void *user_data);
+int dav1d_data_wrap_user_data_internal(Dav1dData *buf,
+ const uint8_t *user_data,
+ void (*free_callback)(const uint8_t *user_data,
+ void *cookie),
+ void *cookie);
+void dav1d_data_unref_internal(Dav1dData *buf);
+void dav1d_data_props_unref_internal(Dav1dDataProps *props);
+
+#endif /* DAV1D_SRC_DATA_H */
diff --git a/third_party/dav1d/src/dav1d.rc.in b/third_party/dav1d/src/dav1d.rc.in
new file mode 100644
index 0000000000..ce5a33ad4d
--- /dev/null
+++ b/third_party/dav1d/src/dav1d.rc.in
@@ -0,0 +1,32 @@
+#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0
+#define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@"
+#define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0
+#define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@"
+
+#include <windows.h>
+
+1 VERSIONINFO
+FILETYPE VFT_DLL
+FILEOS VOS_NT_WINDOWS32
+PRODUCTVERSION PROJECT_VERSION_NUMBER
+FILEVERSION API_VERSION_NUMBER
+BEGIN
+ BLOCK "StringFileInfo"
+ BEGIN
+ BLOCK "040904E4"
+ BEGIN
+ VALUE "CompanyName", "VideoLAN"
+ VALUE "ProductName", "dav1d"
+ VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR
+ VALUE "FileVersion", API_VERSION_NUMBER_STR
+ VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder"
+ VALUE "InternalName", "dav1d"
+ VALUE "OriginalFilename", "libdav1d.dll"
+ VALUE "LegalCopyright", L"Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors"
+ END
+ END
+ BLOCK "VarFileInfo"
+ BEGIN
+ VALUE "Translation", 0x409, 1252
+ END
+END
diff --git a/third_party/dav1d/src/decode.c b/third_party/dav1d/src/decode.c
new file mode 100644
index 0000000000..97d15ca1c6
--- /dev/null
+++ b/third_party/dav1d/src/decode.c
@@ -0,0 +1,3760 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <string.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "dav1d/data.h"
+
+#include "common/frame.h"
+#include "common/intops.h"
+
+#include "src/ctx.h"
+#include "src/decode.h"
+#include "src/dequant_tables.h"
+#include "src/env.h"
+#include "src/filmgrain.h"
+#include "src/log.h"
+#include "src/qm.h"
+#include "src/recon.h"
+#include "src/ref.h"
+#include "src/tables.h"
+#include "src/thread_task.h"
+#include "src/warpmv.h"
+
+static void init_quant_tables(const Dav1dSequenceHeader *const seq_hdr,
+ const Dav1dFrameHeader *const frame_hdr,
+ const int qidx, uint16_t (*dq)[3][2])
+{
+ for (int i = 0; i < (frame_hdr->segmentation.enabled ? 8 : 1); i++) {
+ const int yac = frame_hdr->segmentation.enabled ?
+ iclip_u8(qidx + frame_hdr->segmentation.seg_data.d[i].delta_q) : qidx;
+ const int ydc = iclip_u8(yac + frame_hdr->quant.ydc_delta);
+ const int uac = iclip_u8(yac + frame_hdr->quant.uac_delta);
+ const int udc = iclip_u8(yac + frame_hdr->quant.udc_delta);
+ const int vac = iclip_u8(yac + frame_hdr->quant.vac_delta);
+ const int vdc = iclip_u8(yac + frame_hdr->quant.vdc_delta);
+
+ dq[i][0][0] = dav1d_dq_tbl[seq_hdr->hbd][ydc][0];
+ dq[i][0][1] = dav1d_dq_tbl[seq_hdr->hbd][yac][1];
+ dq[i][1][0] = dav1d_dq_tbl[seq_hdr->hbd][udc][0];
+ dq[i][1][1] = dav1d_dq_tbl[seq_hdr->hbd][uac][1];
+ dq[i][2][0] = dav1d_dq_tbl[seq_hdr->hbd][vdc][0];
+ dq[i][2][1] = dav1d_dq_tbl[seq_hdr->hbd][vac][1];
+ }
+}
+
+static int read_mv_component_diff(Dav1dTaskContext *const t,
+ CdfMvComponent *const mv_comp,
+ const int have_fp)
+{
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dFrameContext *const f = t->f;
+ const int have_hp = f->frame_hdr->hp;
+ const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign);
+ const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+ mv_comp->classes, 10);
+ int up, fp, hp;
+
+ if (!cl) {
+ up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0);
+ if (have_fp) {
+ fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ mv_comp->class0_fp[up], 3);
+ hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
+ mv_comp->class0_hp) : 1;
+ } else {
+ fp = 3;
+ hp = 1;
+ }
+ } else {
+ up = 1 << cl;
+ for (int n = 0; n < cl; n++)
+ up |= dav1d_msac_decode_bool_adapt(&ts->msac,
+ mv_comp->classN[n]) << n;
+ if (have_fp) {
+ fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ mv_comp->classN_fp, 3);
+ hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
+ mv_comp->classN_hp) : 1;
+ } else {
+ fp = 3;
+ hp = 1;
+ }
+ }
+
+ const int diff = ((up << 3) | (fp << 1) | hp) + 1;
+
+ return sign ? -diff : diff;
+}
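To make the packed return value concrete: with class 0, the class0 bit set, fp = 2
and hp = 1, the function returns ((1 << 3) | (2 << 1) | 1) + 1 = 14 (negated if the
sign bit was set); and when have_fp is 0, fp and hp are pinned to 3 and 1, so the
result ((up << 3) | 7) + 1 is always a multiple of 8.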
+
+static void read_mv_residual(Dav1dTaskContext *const t, mv *const ref_mv,
+ CdfMvContext *const mv_cdf, const int have_fp)
+{
+ switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint,
+ N_MV_JOINTS - 1))
+ {
+ case MV_JOINT_HV:
+ ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
+ ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp);
+ break;
+ case MV_JOINT_H:
+ ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp);
+ break;
+ case MV_JOINT_V:
+ ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
+ break;
+ default:
+ break;
+ }
+}
+
+static void read_tx_tree(Dav1dTaskContext *const t,
+ const enum RectTxfmSize from,
+ const int depth, uint16_t *const masks,
+ const int x_off, const int y_off)
+{
+ const Dav1dFrameContext *const f = t->f;
+ const int bx4 = t->bx & 31, by4 = t->by & 31;
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[from];
+ const int txw = t_dim->lw, txh = t_dim->lh;
+ int is_split;
+
+ if (depth < 2 && from > (int) TX_4X4) {
+ const int cat = 2 * (TX_64X64 - t_dim->max) - depth;
+ const int a = t->a->tx[bx4] < txw;
+ const int l = t->l.tx[by4] < txh;
+
+ is_split = dav1d_msac_decode_bool_adapt(&t->ts->msac,
+ t->ts->cdf.m.txpart[cat][a + l]);
+ if (is_split)
+ masks[depth] |= 1 << (y_off * 4 + x_off);
+ } else {
+ is_split = 0;
+ }
+
+ if (is_split && t_dim->max > TX_8X8) {
+ const enum RectTxfmSize sub = t_dim->sub;
+ const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
+ const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
+
+ read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 0, y_off * 2 + 0);
+ t->bx += txsw;
+ if (txw >= txh && t->bx < f->bw)
+ read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 1, y_off * 2 + 0);
+ t->bx -= txsw;
+ t->by += txsh;
+ if (txh >= txw && t->by < f->bh) {
+ read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 0, y_off * 2 + 1);
+ t->bx += txsw;
+ if (txw >= txh && t->bx < f->bw)
+ read_tx_tree(t, sub, depth + 1, masks,
+ x_off * 2 + 1, y_off * 2 + 1);
+ t->bx -= txsw;
+ }
+ t->by -= txsh;
+ } else {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txh)
+ case_set_upto16(t_dim->h, l., 1, by4);
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txw)
+ case_set_upto16(t_dim->w, a->, 0, bx4);
+#undef set_ctx
+ }
+}
+
+static int neg_deinterleave(int diff, int ref, int max) {
+ if (!ref) return diff;
+ if (ref >= (max - 1)) return max - diff - 1;
+ if (2 * ref < max) {
+ if (diff <= 2 * ref) {
+ if (diff & 1)
+ return ref + ((diff + 1) >> 1);
+ else
+ return ref - (diff >> 1);
+ }
+ return diff;
+ } else {
+ if (diff <= 2 * (max - ref - 1)) {
+ if (diff & 1)
+ return ref + ((diff + 1) >> 1);
+ else
+ return ref - (diff >> 1);
+ }
+ return max - (diff + 1);
+ }
+}
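A quick trace of the inverse mapping above: with ref = 2 and max = 8, decoded diff
values 0, 1, 2, 3, 4 map to 2, 3, 1, 4, 0 respectively (alternating around the
prediction), and anything larger maps to itself, e.g. diff = 5 gives 5.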
+
+static void find_matching_ref(const Dav1dTaskContext *const t,
+ const enum EdgeFlags intra_edge_flags,
+ const int bw4, const int bh4,
+ const int w4, const int h4,
+ const int have_left, const int have_top,
+ const int ref, uint64_t masks[2])
+{
+ /*const*/ refmvs_block *const *r = &t->rt.r[(t->by & 31) + 5];
+ int count = 0;
+ int have_topleft = have_top && have_left;
+ int have_topright = imax(bw4, bh4) < 32 &&
+ have_top && t->bx + bw4 < t->ts->tiling.col_end &&
+ (intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT);
+
+#define bs(rp) dav1d_block_dimensions[(rp)->bs]
+#define matches(rp) ((rp)->ref.ref[0] == ref + 1 && (rp)->ref.ref[1] == -1)
+
+ if (have_top) {
+ const refmvs_block *r2 = &r[-1][t->bx];
+ if (matches(r2)) {
+ masks[0] |= 1;
+ count = 1;
+ }
+ int aw4 = bs(r2)[0];
+ if (aw4 >= bw4) {
+ const int off = t->bx & (aw4 - 1);
+ if (off) have_topleft = 0;
+ if (aw4 - off > bw4) have_topright = 0;
+ } else {
+ unsigned mask = 1 << aw4;
+ for (int x = aw4; x < w4; x += aw4) {
+ r2 += aw4;
+ if (matches(r2)) {
+ masks[0] |= mask;
+ if (++count >= 8) return;
+ }
+ aw4 = bs(r2)[0];
+ mask <<= aw4;
+ }
+ }
+ }
+ if (have_left) {
+ /*const*/ refmvs_block *const *r2 = r;
+ if (matches(&r2[0][t->bx - 1])) {
+ masks[1] |= 1;
+ if (++count >= 8) return;
+ }
+ int lh4 = bs(&r2[0][t->bx - 1])[1];
+ if (lh4 >= bh4) {
+ if (t->by & (lh4 - 1)) have_topleft = 0;
+ } else {
+ unsigned mask = 1 << lh4;
+ for (int y = lh4; y < h4; y += lh4) {
+ r2 += lh4;
+ if (matches(&r2[0][t->bx - 1])) {
+ masks[1] |= mask;
+ if (++count >= 8) return;
+ }
+ lh4 = bs(&r2[0][t->bx - 1])[1];
+ mask <<= lh4;
+ }
+ }
+ }
+ if (have_topleft && matches(&r[-1][t->bx - 1])) {
+ masks[1] |= 1ULL << 32;
+ if (++count >= 8) return;
+ }
+ if (have_topright && matches(&r[-1][t->bx + bw4])) {
+ masks[0] |= 1ULL << 32;
+ }
+#undef matches
+}
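Reading the masks[] layout being built here: in the low 32 bits of masks[0], bit x
is set when the top neighbour starting at column offset x (in 4-pixel units) is a
single-reference block using the candidate reference, and bit 32 marks the
top-right candidate; masks[1] does the same per row along the left edge, with bit
32 marking the top-left candidate. For instance, a 32x32 block whose top edge is
covered by neighbours 8, 16 and 8 pixels wide can only ever set bits 0, 2 and 6 of
masks[0], one per neighbour start.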
+
+static void derive_warpmv(const Dav1dTaskContext *const t,
+ const int bw4, const int bh4,
+ const uint64_t masks[2], const union mv mv,
+ Dav1dWarpedMotionParams *const wmp)
+{
+ int pts[8][2 /* in, out */][2 /* x, y */], np = 0;
+ /*const*/ refmvs_block *const *r = &t->rt.r[(t->by & 31) + 5];
+
+#define add_sample(dx, dy, sx, sy, rp) do { \
+ pts[np][0][0] = 16 * (2 * dx + sx * bs(rp)[0]) - 8; \
+ pts[np][0][1] = 16 * (2 * dy + sy * bs(rp)[1]) - 8; \
+ pts[np][1][0] = pts[np][0][0] + (rp)->mv.mv[0].x; \
+ pts[np][1][1] = pts[np][0][1] + (rp)->mv.mv[0].y; \
+ np++; \
+} while (0)
+
+ // use masks[] to find the projectable motion vectors in the edges
+ if ((unsigned) masks[0] == 1 && !(masks[1] >> 32)) {
+ const int off = t->bx & (bs(&r[-1][t->bx])[0] - 1);
+ add_sample(-off, 0, 1, -1, &r[-1][t->bx]);
+ } else for (unsigned off = 0, xmask = (uint32_t) masks[0]; np < 8 && xmask;) { // top
+ const int tz = ctz(xmask);
+ off += tz;
+ xmask >>= tz;
+ add_sample(off, 0, 1, -1, &r[-1][t->bx + off]);
+ xmask &= ~1;
+ }
+ if (np < 8 && masks[1] == 1) {
+ const int off = t->by & (bs(&r[0][t->bx - 1])[1] - 1);
+ add_sample(0, -off, -1, 1, &r[-off][t->bx - 1]);
+ } else for (unsigned off = 0, ymask = (uint32_t) masks[1]; np < 8 && ymask;) { // left
+ const int tz = ctz(ymask);
+ off += tz;
+ ymask >>= tz;
+ add_sample(0, off, -1, 1, &r[off][t->bx - 1]);
+ ymask &= ~1;
+ }
+ if (np < 8 && masks[1] >> 32) // top/left
+ add_sample(0, 0, -1, -1, &r[-1][t->bx - 1]);
+ if (np < 8 && masks[0] >> 32) // top/right
+ add_sample(bw4, 0, 1, -1, &r[-1][t->bx + bw4]);
+ assert(np > 0 && np <= 8);
+#undef bs
+
+ // select according to motion vector difference against a threshold
+ int mvd[8], ret = 0;
+ const int thresh = 4 * iclip(imax(bw4, bh4), 4, 28);
+ for (int i = 0; i < np; i++) {
+ mvd[i] = abs(pts[i][1][0] - pts[i][0][0] - mv.x) +
+ abs(pts[i][1][1] - pts[i][0][1] - mv.y);
+ if (mvd[i] > thresh)
+ mvd[i] = -1;
+ else
+ ret++;
+ }
+ if (!ret) {
+ ret = 1;
+ } else for (int i = 0, j = np - 1, k = 0; k < np - ret; k++, i++, j--) {
+ while (mvd[i] != -1) i++;
+ while (mvd[j] == -1) j--;
+ assert(i != j);
+ if (i > j) break;
+ // replace the discarded samples;
+ mvd[i] = mvd[j];
+ memcpy(pts[i], pts[j], sizeof(*pts));
+ }
+
+ if (!dav1d_find_affine_int(pts, ret, bw4, bh4, mv, wmp, t->bx, t->by) &&
+ !dav1d_get_shear_params(wmp))
+ {
+ wmp->type = DAV1D_WM_TYPE_AFFINE;
+ } else
+ wmp->type = DAV1D_WM_TYPE_IDENTITY;
+}
+
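+// scans every second entry of buf (len entries in total) and returns 1 if any
+// of them is zero; used on the intra[] edge contexts to check whether at least
+// one top/left neighbour is an inter block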
+static inline int findoddzero(const uint8_t *buf, int len) {
+ for (int n = 0; n < len; n++)
+ if (!buf[n * 2]) return 1;
+ return 0;
+}
+
+// meant to be SIMD'able, so that theoretical complexity of this function
+// times block size goes from w4*h4 to w4+h4-1
+// pal_idx points into the block's palette index map. For anti-diagonal i
+// (columns first down to last), derive for each element the symbol ordering
+// order[] and the entropy context ctx from its already-decoded top, left and
+// top-left neighbours.
+static void order_palette(const uint8_t *pal_idx, const ptrdiff_t stride,
+ const int i, const int first, const int last,
+ uint8_t (*const order)[8], uint8_t *const ctx)
+{
+ int have_top = i > first;
+
+ assert(pal_idx);
+ pal_idx += first + (i - first) * stride;
+ for (int j = first, n = 0; j >= last; have_top = 1, j--, n++, pal_idx += stride - 1) {
+ const int have_left = j > 0;
+
+ assert(have_left || have_top);
+
+#define add(v_in) do { \
+ const int v = v_in; \
+ assert((unsigned)v < 8U); \
+ order[n][o_idx++] = v; \
+ mask |= 1 << v; \
+ } while (0)
+
+ unsigned mask = 0;
+ int o_idx = 0;
+ if (!have_left) {
+ ctx[n] = 0;
+ add(pal_idx[-stride]);
+ } else if (!have_top) {
+ ctx[n] = 0;
+ add(pal_idx[-1]);
+ } else {
+ const int l = pal_idx[-1], t = pal_idx[-stride], tl = pal_idx[-(stride + 1)];
+ const int same_t_l = t == l;
+ const int same_t_tl = t == tl;
+ const int same_l_tl = l == tl;
+ const int same_all = same_t_l & same_t_tl & same_l_tl;
+
+ if (same_all) {
+ ctx[n] = 4;
+ add(t);
+ } else if (same_t_l) {
+ ctx[n] = 3;
+ add(t);
+ add(tl);
+ } else if (same_t_tl | same_l_tl) {
+ ctx[n] = 2;
+ add(tl);
+ add(same_t_tl ? l : t);
+ } else {
+ ctx[n] = 1;
+ add(imin(t, l));
+ add(imax(t, l));
+ add(tl);
+ }
+ }
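+        // append the unused palette indices in ascending order so that
+        // order[n] is a permutation of 0..7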
+ for (unsigned m = 1, bit = 0; m < 0x100; m <<= 1, bit++)
+ if (!(mask & m))
+ order[n][o_idx++] = bit;
+ assert(o_idx == 8);
+#undef add
+ }
+}
+
+static void read_pal_indices(Dav1dTaskContext *const t,
+ uint8_t *const pal_idx,
+ const Av1Block *const b, const int pl,
+ const int w4, const int h4,
+ const int bw4, const int bh4)
+{
+ Dav1dTileState *const ts = t->ts;
+ const ptrdiff_t stride = bw4 * 4;
+ assert(pal_idx);
+    uint8_t *const pal_tmp = t->scratch.pal_idx_uv;
+ pal_tmp[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
+ uint16_t (*const color_map_cdf)[8] =
+ ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2];
+ uint8_t (*const order)[8] = t->scratch.pal_order;
+ uint8_t *const ctx = t->scratch.pal_ctx;
+ for (int i = 1; i < 4 * (w4 + h4) - 1; i++) {
+ // top/left-to-bottom/right diagonals ("wave-front")
+ const int first = imin(i, w4 * 4 - 1);
+ const int last = imax(0, i - h4 * 4 + 1);
+ order_palette(pal_tmp, stride, i, first, last, order, ctx);
+ for (int j = first, m = 0; j >= last; j--, m++) {
+ const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+ color_map_cdf[ctx[m]], b->pal_sz[pl] - 1);
+ pal_tmp[(i - j) * stride + j] = order[m][color_idx];
+ }
+ }
+
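+    // finalize the index map: pad the visible w4*4 x h4*4 region out to the
+    // full bw4*4 x bh4*4 block and pack it into its storage layout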
+ t->c->pal_dsp.pal_idx_finish(pal_idx, pal_tmp, bw4 * 4, bh4 * 4,
+ w4 * 4, h4 * 4);
+}
+
+static void read_vartx_tree(Dav1dTaskContext *const t,
+ Av1Block *const b, const enum BlockSize bs,
+ const int bx4, const int by4)
+{
+ const Dav1dFrameContext *const f = t->f;
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = b_dim[0], bh4 = b_dim[1];
+
+ // var-tx tree coding
+ uint16_t tx_split[2] = { 0 };
+ b->max_ytx = dav1d_max_txfm_size_for_bs[bs][0];
+ if (!b->skip && (f->frame_hdr->segmentation.lossless[b->seg_id] ||
+ b->max_ytx == TX_4X4))
+ {
+ b->max_ytx = b->uvtx = TX_4X4;
+ if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx, off, TX_4X4)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+ }
+ } else if (f->frame_hdr->txfm_mode != DAV1D_TX_SWITCHABLE || b->skip) {
+ if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx, off, mul * b_dim[2 + diridx])
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+ }
+ b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
+ } else {
+ assert(bw4 <= 16 || bh4 <= 16 || b->max_ytx == TX_64X64);
+ int y, x, y_off, x_off;
+ const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
+ for (y = 0, y_off = 0; y < bh4; y += ytx->h, y_off++) {
+ for (x = 0, x_off = 0; x < bw4; x += ytx->w, x_off++) {
+ read_tx_tree(t, b->max_ytx, 0, tx_split, x_off, y_off);
+ // contexts are updated inside read_tx_tree()
+ t->bx += ytx->w;
+ }
+ t->bx -= x;
+ t->by += ytx->h;
+ }
+ t->by -= y;
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-vartxtree[%x/%x]: r=%d\n",
+ tx_split[0], tx_split[1], t->ts->msac.rng);
+ b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
+ }
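+    // tx_split[0] only ever uses bits 0-1 and 4-5, so it fits into 8 bits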
+ assert(!(tx_split[0] & ~0x33));
+ b->tx_split0 = (uint8_t)tx_split[0];
+ b->tx_split1 = tx_split[1];
+}
+
+static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f,
+ const int by, const int bx,
+ const int w4, int h4,
+ const uint8_t *ref_seg_map,
+ const ptrdiff_t stride)
+{
+ assert(f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
+
+ unsigned seg_id = 8;
+ ref_seg_map += by * stride + bx;
+ do {
+ for (int x = 0; x < w4; x++)
+ seg_id = imin(seg_id, ref_seg_map[x]);
+ ref_seg_map += stride;
+ } while (--h4 > 0 && seg_id);
+ assert(seg_id < 8);
+
+ return seg_id;
+}
+
+static inline void splat_oneref_mv(const Dav1dContext *const c,
+ Dav1dTaskContext *const t,
+ const enum BlockSize bs,
+ const Av1Block *const b,
+ const int bw4, const int bh4)
+{
+ const enum InterPredMode mode = b->inter_mode;
+ const refmvs_block ALIGN(tmpl, 16) = (refmvs_block) {
+ .ref.ref = { b->ref[0] + 1, b->interintra_type ? 0 : -1 },
+ .mv.mv[0] = b->mv[0],
+ .bs = bs,
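+        // mf bit 0: global-motion candidate (>=8x8 blocks only),
+        // mf bit 1: block is coded with NEWMV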
+ .mf = (mode == GLOBALMV && imin(bw4, bh4) >= 2) | ((mode == NEWMV) * 2),
+ };
+ c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
+}
+
+static inline void splat_intrabc_mv(const Dav1dContext *const c,
+ Dav1dTaskContext *const t,
+ const enum BlockSize bs,
+ const Av1Block *const b,
+ const int bw4, const int bh4)
+{
+ const refmvs_block ALIGN(tmpl, 16) = (refmvs_block) {
+ .ref.ref = { 0, -1 },
+ .mv.mv[0] = b->mv[0],
+ .bs = bs,
+ .mf = 0,
+ };
+ c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
+}
+
+static inline void splat_tworef_mv(const Dav1dContext *const c,
+ Dav1dTaskContext *const t,
+ const enum BlockSize bs,
+ const Av1Block *const b,
+ const int bw4, const int bh4)
+{
+ assert(bw4 >= 2 && bh4 >= 2);
+ const enum CompInterPredMode mode = b->inter_mode;
+ const refmvs_block ALIGN(tmpl, 16) = (refmvs_block) {
+ .ref.ref = { b->ref[0] + 1, b->ref[1] + 1 },
+ .mv.mv = { b->mv[0], b->mv[1] },
+ .bs = bs,
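+        // 0xbc selects the compound modes that contain a NEWMV component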
+ .mf = (mode == GLOBALMV_GLOBALMV) | !!((1 << mode) & (0xbc)) * 2,
+ };
+ c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
+}
+
+static inline void splat_intraref(const Dav1dContext *const c,
+ Dav1dTaskContext *const t,
+ const enum BlockSize bs,
+ const int bw4, const int bh4)
+{
+ const refmvs_block ALIGN(tmpl, 16) = (refmvs_block) {
+ .ref.ref = { 0, -1 },
+ .mv.mv[0].n = INVALID_MV,
+ .bs = bs,
+ .mf = 0,
+ };
+ c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
+}
+
+static void mc_lowest_px(int *const dst, const int by4, const int bh4,
+ const int mvy, const int ss_ver,
+ const struct ScalableMotionParams *const smp)
+{
+ const int v_mul = 4 >> ss_ver;
+ if (!smp->scale) {
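+        // mvy is in 1/8-pel luma units; a non-zero subpel phase requires 4
+        // extra rows below the block for the subpel MC filter taps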
+ const int my = mvy >> (3 + ss_ver), dy = mvy & (15 >> !ss_ver);
+ *dst = imax(*dst, (by4 + bh4) * v_mul + my + 4 * !!dy);
+ } else {
+ int y = (by4 * v_mul << 4) + mvy * (1 << !ss_ver);
+ const int64_t tmp = (int64_t)(y) * smp->scale + (smp->scale - 0x4000) * 8;
+ y = apply_sign64((int)((llabs(tmp) + 128) >> 8), tmp) + 32;
+ const int bottom = ((y + (bh4 * v_mul - 1) * smp->step) >> 10) + 1 + 4;
+ *dst = imax(*dst, bottom);
+ }
+}
+
+static ALWAYS_INLINE void affine_lowest_px(Dav1dTaskContext *const t, int *const dst,
+ const uint8_t *const b_dim,
+ const Dav1dWarpedMotionParams *const wmp,
+ const int ss_ver, const int ss_hor)
+{
+ const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+ assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
+ const int32_t *const mat = wmp->matrix;
+ const int y = b_dim[1] * v_mul - 8; // lowest y
+
+ const int src_y = t->by * 4 + ((y + 4) << ss_ver);
+ const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
+ // check left- and right-most blocks
+ for (int x = 0; x < b_dim[0] * h_mul; x += imax(8, b_dim[0] * h_mul - 8)) {
+ // calculate transformation relative to center of 8x8 block in
+ // luma pixel units
+ const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
+ const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
+ const int dy = (int) (mvy >> 16) - 4;
+ *dst = imax(*dst, dy + 4 + 8);
+ }
+}
+
+static NOINLINE void affine_lowest_px_luma(Dav1dTaskContext *const t, int *const dst,
+ const uint8_t *const b_dim,
+ const Dav1dWarpedMotionParams *const wmp)
+{
+ affine_lowest_px(t, dst, b_dim, wmp, 0, 0);
+}
+
+static NOINLINE void affine_lowest_px_chroma(Dav1dTaskContext *const t, int *const dst,
+ const uint8_t *const b_dim,
+ const Dav1dWarpedMotionParams *const wmp)
+{
+ const Dav1dFrameContext *const f = t->f;
+ assert(f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400);
+ if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I444)
+ affine_lowest_px_luma(t, dst, b_dim, wmp);
+ else
+ affine_lowest_px(t, dst, b_dim, wmp, f->cur.p.layout & DAV1D_PIXEL_LAYOUT_I420, 1);
+}
+
+static void obmc_lowest_px(Dav1dTaskContext *const t,
+ int (*const dst)[2], const int is_chroma,
+ const uint8_t *const b_dim,
+ const int bx4, const int by4, const int w4, const int h4)
+{
+ assert(!(t->bx & 1) && !(t->by & 1));
+ const Dav1dFrameContext *const f = t->f;
+ /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5];
+ const int ss_ver = is_chroma && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = is_chroma && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+
+ if (t->by > t->ts->tiling.row_start &&
+ (!is_chroma || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
+ {
+ for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
+ // only odd blocks are considered for overlap handling, hence +1
+ const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
+ const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
+
+ if (a_r->ref.ref[0] > 0) {
+ const int oh4 = imin(b_dim[1], 16) >> 1;
+ mc_lowest_px(&dst[a_r->ref.ref[0] - 1][is_chroma], t->by,
+ (oh4 * 3 + 3) >> 2, a_r->mv.mv[0].y, ss_ver,
+ &f->svc[a_r->ref.ref[0] - 1][1]);
+ i++;
+ }
+ x += imax(a_b_dim[0], 2);
+ }
+ }
+
+ if (t->bx > t->ts->tiling.col_start)
+ for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
+ // only odd blocks are considered for overlap handling, hence +1
+ const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
+ const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
+
+ if (l_r->ref.ref[0] > 0) {
+ const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]);
+ mc_lowest_px(&dst[l_r->ref.ref[0] - 1][is_chroma],
+ t->by + y, oh4, l_r->mv.mv[0].y, ss_ver,
+ &f->svc[l_r->ref.ref[0] - 1][1]);
+ i++;
+ }
+ y += imax(l_b_dim[1], 2);
+ }
+}
+
+static int decode_b(Dav1dTaskContext *const t,
+ const enum BlockLevel bl,
+ const enum BlockSize bs,
+ const enum BlockPartition bp,
+ const enum EdgeFlags intra_edge_flags)
+{
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dFrameContext *const f = t->f;
+ Av1Block b_mem, *const b = t->frame_thread.pass ?
+ &f->frame_thread.b[t->by * f->b4_stride + t->bx] : &b_mem;
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bx4 = t->bx & 31, by4 = t->by & 31;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+ const int bw4 = b_dim[0], bh4 = b_dim[1];
+ const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+ const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
+ const int have_left = t->bx > ts->tiling.col_start;
+ const int have_top = t->by > ts->tiling.row_start;
+ const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+ (bw4 > ss_hor || t->bx & 1) &&
+ (bh4 > ss_ver || t->by & 1);
+
+ if (t->frame_thread.pass == 2) {
+ if (b->intra) {
+ f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
+
+ const enum IntraPredMode y_mode_nofilt =
+ b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
+ rep_macro(type, t->dir intra, off, mul)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+ if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
+ for (int x = 0; x < bw4; x++) {
+ r[x].ref.ref[0] = 0;
+ r[x].bs = bs;
+ }
+ refmvs_block *const *rr = &t->rt.r[(t->by & 31) + 5];
+ for (int y = 0; y < bh4 - 1; y++) {
+ rr[y][t->bx + bw4 - 1].ref.ref[0] = 0;
+ rr[y][t->bx + bw4 - 1].bs = bs;
+ }
+ }
+
+ if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+ }
+ } else {
+ if (IS_INTER_OR_SWITCH(f->frame_hdr) /* not intrabc */ &&
+ b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP)
+ {
+ if (b->matrix[0] == SHRT_MIN) {
+ t->warpmv.type = DAV1D_WM_TYPE_IDENTITY;
+ } else {
+ t->warpmv.type = DAV1D_WM_TYPE_AFFINE;
+ t->warpmv.matrix[2] = b->matrix[0] + 0x10000;
+ t->warpmv.matrix[3] = b->matrix[1];
+ t->warpmv.matrix[4] = b->matrix[2];
+ t->warpmv.matrix[5] = b->matrix[3] + 0x10000;
+ dav1d_set_affine_mv2d(bw4, bh4, b->mv2d, &t->warpmv,
+ t->bx, t->by);
+ dav1d_get_shear_params(&t->warpmv);
+#define signabs(v) v < 0 ? '-' : ' ', abs(v)
+ if (DEBUG_BLOCK_INFO)
+ printf("[ %c%x %c%x %c%x\n %c%x %c%x %c%x ]\n"
+ "alpha=%c%x, beta=%c%x, gamma=%c%x, delta=%c%x, mv=y:%d,x:%d\n",
+ signabs(t->warpmv.matrix[0]),
+ signabs(t->warpmv.matrix[1]),
+ signabs(t->warpmv.matrix[2]),
+ signabs(t->warpmv.matrix[3]),
+ signabs(t->warpmv.matrix[4]),
+ signabs(t->warpmv.matrix[5]),
+ signabs(t->warpmv.u.p.alpha),
+ signabs(t->warpmv.u.p.beta),
+ signabs(t->warpmv.u.p.gamma),
+ signabs(t->warpmv.u.p.delta),
+ b->mv2d.y, b->mv2d.x);
+#undef signabs
+ }
+ }
+ if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
+
+ const uint8_t *const filter = dav1d_filter_dir[b->filter2d];
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir filter[0], off, mul * filter[0]); \
+ rep_macro(type, t->dir filter[1], off, mul * filter[1]); \
+ rep_macro(type, t->dir intra, off, 0)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+
+ if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
+ for (int x = 0; x < bw4; x++) {
+ r[x].ref.ref[0] = b->ref[0] + 1;
+ r[x].mv.mv[0] = b->mv[0];
+ r[x].bs = bs;
+ }
+ refmvs_block *const *rr = &t->rt.r[(t->by & 31) + 5];
+ for (int y = 0; y < bh4 - 1; y++) {
+ rr[y][t->bx + bw4 - 1].ref.ref[0] = b->ref[0] + 1;
+ rr[y][t->bx + bw4 - 1].mv.mv[0] = b->mv[0];
+ rr[y][t->bx + bw4 - 1].bs = bs;
+ }
+ }
+
+ if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+ }
+ }
+ return 0;
+ }
+
+ const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+
+ b->bl = bl;
+ b->bp = bp;
+ b->bs = bs;
+
+ const Dav1dSegmentationData *seg = NULL;
+
+ // segment_id (if seg_feature for skip/ref/gmv is enabled)
+ int seg_pred = 0;
+ if (f->frame_hdr->segmentation.enabled) {
+ if (!f->frame_hdr->segmentation.update_map) {
+ if (f->prev_segmap) {
+ unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4,
+ f->prev_segmap,
+ f->b4_stride);
+ if (seg_id >= 8) return -1;
+ b->seg_id = seg_id;
+ } else {
+ b->seg_id = 0;
+ }
+ seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
+ } else if (f->frame_hdr->segmentation.seg_data.preskip) {
+ if (f->frame_hdr->segmentation.temporal &&
+ (seg_pred = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
+ t->l.seg_pred[by4]])))
+ {
+ // temporal predicted seg_id
+ if (f->prev_segmap) {
+ unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx,
+ w4, h4,
+ f->prev_segmap,
+ f->b4_stride);
+ if (seg_id >= 8) return -1;
+ b->seg_id = seg_id;
+ } else {
+ b->seg_id = 0;
+ }
+ } else {
+ int seg_ctx;
+ const unsigned pred_seg_id =
+ get_cur_frame_segid(t->by, t->bx, have_top, have_left,
+ &seg_ctx, f->cur_segmap, f->b4_stride);
+ const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+ ts->cdf.m.seg_id[seg_ctx],
+ DAV1D_MAX_SEGMENTS - 1);
+ const unsigned last_active_seg_id =
+ f->frame_hdr->segmentation.seg_data.last_active_segid;
+ b->seg_id = neg_deinterleave(diff, pred_seg_id,
+ last_active_seg_id + 1);
+ if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error?
+ if (b->seg_id >= DAV1D_MAX_SEGMENTS) b->seg_id = 0; // error?
+ }
+
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-segid[preskip;%d]: r=%d\n",
+ b->seg_id, ts->msac.rng);
+
+ seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
+ }
+ } else {
+ b->seg_id = 0;
+ }
+
+ // skip_mode
+ if ((!seg || (!seg->globalmv && seg->ref == -1 && !seg->skip)) &&
+ f->frame_hdr->skip_mode_enabled && imin(bw4, bh4) > 1)
+ {
+ const int smctx = t->a->skip_mode[bx4] + t->l.skip_mode[by4];
+ b->skip_mode = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.skip_mode[smctx]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-skipmode[%d]: r=%d\n", b->skip_mode, ts->msac.rng);
+ } else {
+ b->skip_mode = 0;
+ }
+
+ // skip
+ if (b->skip_mode || (seg && seg->skip)) {
+ b->skip = 1;
+ } else {
+ const int sctx = t->a->skip[bx4] + t->l.skip[by4];
+ b->skip = dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.skip[sctx]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-skip[%d]: r=%d\n", b->skip, ts->msac.rng);
+ }
+
+ // segment_id
+ if (f->frame_hdr->segmentation.enabled &&
+ f->frame_hdr->segmentation.update_map &&
+ !f->frame_hdr->segmentation.seg_data.preskip)
+ {
+ if (!b->skip && f->frame_hdr->segmentation.temporal &&
+ (seg_pred = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
+ t->l.seg_pred[by4]])))
+ {
+ // temporal predicted seg_id
+ if (f->prev_segmap) {
+ unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4,
+ f->prev_segmap,
+ f->b4_stride);
+ if (seg_id >= 8) return -1;
+ b->seg_id = seg_id;
+ } else {
+ b->seg_id = 0;
+ }
+ } else {
+ int seg_ctx;
+ const unsigned pred_seg_id =
+ get_cur_frame_segid(t->by, t->bx, have_top, have_left,
+ &seg_ctx, f->cur_segmap, f->b4_stride);
+ if (b->skip) {
+ b->seg_id = pred_seg_id;
+ } else {
+ const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+ ts->cdf.m.seg_id[seg_ctx],
+ DAV1D_MAX_SEGMENTS - 1);
+ const unsigned last_active_seg_id =
+ f->frame_hdr->segmentation.seg_data.last_active_segid;
+ b->seg_id = neg_deinterleave(diff, pred_seg_id,
+ last_active_seg_id + 1);
+ if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error?
+ }
+ if (b->seg_id >= DAV1D_MAX_SEGMENTS) b->seg_id = 0; // error?
+ }
+
+ seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
+
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-segid[postskip;%d]: r=%d\n",
+ b->seg_id, ts->msac.rng);
+ }
+
+ // cdef index
+ if (!b->skip) {
+ const int idx = f->seq_hdr->sb128 ? ((t->bx & 16) >> 4) +
+ ((t->by & 16) >> 3) : 0;
+ if (t->cur_sb_cdef_idx_ptr[idx] == -1) {
+ const int v = dav1d_msac_decode_bools(&ts->msac,
+ f->frame_hdr->cdef.n_bits);
+ t->cur_sb_cdef_idx_ptr[idx] = v;
+ if (bw4 > 16) t->cur_sb_cdef_idx_ptr[idx + 1] = v;
+ if (bh4 > 16) t->cur_sb_cdef_idx_ptr[idx + 2] = v;
+ if (bw4 == 32 && bh4 == 32) t->cur_sb_cdef_idx_ptr[idx + 3] = v;
+
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-cdef_idx[%d]: r=%d\n",
+ *t->cur_sb_cdef_idx_ptr, ts->msac.rng);
+ }
+ }
+
+ // delta-q/lf
+ if (!(t->bx & (31 >> !f->seq_hdr->sb128)) &&
+ !(t->by & (31 >> !f->seq_hdr->sb128)))
+ {
+ const int prev_qidx = ts->last_qidx;
+ const int have_delta_q = f->frame_hdr->delta.q.present &&
+ (bs != (f->seq_hdr->sb128 ? BS_128x128 : BS_64x64) || !b->skip);
+
+ int8_t prev_delta_lf[4];
+ memcpy(prev_delta_lf, ts->last_delta_lf, 4);
+
+ if (have_delta_q) {
+ int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.delta_q, 3);
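+            // 3 is an escape code: the actual magnitude is coded with 1-8
+            // extra bits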
+ if (delta_q == 3) {
+ const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
+ delta_q = dav1d_msac_decode_bools(&ts->msac, n_bits) +
+ 1 + (1 << n_bits);
+ }
+ if (delta_q) {
+ if (dav1d_msac_decode_bool_equi(&ts->msac)) delta_q = -delta_q;
+ delta_q *= 1 << f->frame_hdr->delta.q.res_log2;
+ }
+ ts->last_qidx = iclip(ts->last_qidx + delta_q, 1, 255);
+ if (have_delta_q && DEBUG_BLOCK_INFO)
+ printf("Post-delta_q[%d->%d]: r=%d\n",
+ delta_q, ts->last_qidx, ts->msac.rng);
+
+ if (f->frame_hdr->delta.lf.present) {
+ const int n_lfs = f->frame_hdr->delta.lf.multi ?
+ f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 4 : 2 : 1;
+
+ for (int i = 0; i < n_lfs; i++) {
+ int delta_lf = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 3);
+ if (delta_lf == 3) {
+ const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
+ delta_lf = dav1d_msac_decode_bools(&ts->msac, n_bits) +
+ 1 + (1 << n_bits);
+ }
+ if (delta_lf) {
+ if (dav1d_msac_decode_bool_equi(&ts->msac))
+ delta_lf = -delta_lf;
+ delta_lf *= 1 << f->frame_hdr->delta.lf.res_log2;
+ }
+ ts->last_delta_lf[i] =
+ iclip(ts->last_delta_lf[i] + delta_lf, -63, 63);
+ if (have_delta_q && DEBUG_BLOCK_INFO)
+ printf("Post-delta_lf[%d:%d]: r=%d\n", i, delta_lf,
+ ts->msac.rng);
+ }
+ }
+ }
+ if (ts->last_qidx == f->frame_hdr->quant.yac) {
+ // assign frame-wide q values to this sb
+ ts->dq = f->dq;
+ } else if (ts->last_qidx != prev_qidx) {
+ // find sb-specific quant parameters
+ init_quant_tables(f->seq_hdr, f->frame_hdr, ts->last_qidx, ts->dqmem);
+ ts->dq = ts->dqmem;
+ }
+ if (!memcmp(ts->last_delta_lf, (int8_t[4]) { 0, 0, 0, 0 }, 4)) {
+ // assign frame-wide lf values to this sb
+ ts->lflvl = f->lf.lvl;
+ } else if (memcmp(ts->last_delta_lf, prev_delta_lf, 4)) {
+ // find sb-specific lf lvl parameters
+ dav1d_calc_lf_values(ts->lflvlmem, f->frame_hdr, ts->last_delta_lf);
+ ts->lflvl = ts->lflvlmem;
+ }
+ }
+
+ if (b->skip_mode) {
+ b->intra = 0;
+ } else if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ if (seg && (seg->ref >= 0 || seg->globalmv)) {
+ b->intra = !seg->ref;
+ } else {
+ const int ictx = get_intra_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->intra = !dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.intra[ictx]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-intra[%d]: r=%d\n", b->intra, ts->msac.rng);
+ }
+ } else if (f->frame_hdr->allow_intrabc) {
+ b->intra = !dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intrabc);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-intrabcflag[%d]: r=%d\n", b->intra, ts->msac.rng);
+ } else {
+ b->intra = 1;
+ }
+
+ // intra/inter-specific stuff
+ if (b->intra) {
+ uint16_t *const ymode_cdf = IS_INTER_OR_SWITCH(f->frame_hdr) ?
+ ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] :
+ ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
+ [dav1d_intra_mode_context[t->l.mode[by4]]];
+ b->y_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, ymode_cdf,
+ N_INTRA_PRED_MODES - 1);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng);
+
+ // angle delta
+ if (b_dim[2] + b_dim[3] >= 2 && b->y_mode >= VERT_PRED &&
+ b->y_mode <= VERT_LEFT_PRED)
+ {
+ uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED];
+ const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
+ b->y_angle = angle - 3;
+ } else {
+ b->y_angle = 0;
+ }
+
+ if (has_chroma) {
+ const int cfl_allowed = f->frame_hdr->segmentation.lossless[b->seg_id] ?
+ cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs));
+ uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode];
+ b->uv_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, uvmode_cdf,
+ N_UV_INTRA_PRED_MODES - 1 - !cfl_allowed);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
+
+ b->uv_angle = 0;
+ if (b->uv_mode == CFL_PRED) {
+#define SIGN(a) (!!(a) + ((a) > 0))
+ const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+ ts->cdf.m.cfl_sign, 7) + 1;
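+                // split the joint sign (1..8) into per-plane signs: sign_u =
+                // sign / 3 (via fixed-point multiply), sign_v = sign % 3;
+                // 0 means the alpha is zero, 1 negative, 2 positive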
+ const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3;
+ assert(sign_u == sign / 3);
+ if (sign_u) {
+ const int ctx = (sign_u == 2) * 3 + sign_v;
+ b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+ ts->cdf.m.cfl_alpha[ctx], 15) + 1;
+ if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0];
+ } else {
+ b->cfl_alpha[0] = 0;
+ }
+ if (sign_v) {
+ const int ctx = (sign_v == 2) * 3 + sign_u;
+ b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+ ts->cdf.m.cfl_alpha[ctx], 15) + 1;
+ if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1];
+ } else {
+ b->cfl_alpha[1] = 0;
+ }
+#undef SIGN
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-uvalphas[%d/%d]: r=%d\n",
+ b->cfl_alpha[0], b->cfl_alpha[1], ts->msac.rng);
+ } else if (b_dim[2] + b_dim[3] >= 2 && b->uv_mode >= VERT_PRED &&
+ b->uv_mode <= VERT_LEFT_PRED)
+ {
+ uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED];
+ const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
+ b->uv_angle = angle - 3;
+ }
+ }
+
+ b->pal_sz[0] = b->pal_sz[1] = 0;
+ if (f->frame_hdr->allow_screen_content_tools &&
+ imax(bw4, bh4) <= 16 && bw4 + bh4 >= 4)
+ {
+ const int sz_ctx = b_dim[2] + b_dim[3] - 2;
+ if (b->y_mode == DC_PRED) {
+ const int pal_ctx = (t->a->pal_sz[bx4] > 0) + (t->l.pal_sz[by4] > 0);
+ const int use_y_pal = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.pal_y[sz_ctx][pal_ctx]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-y_pal[%d]: r=%d\n", use_y_pal, ts->msac.rng);
+ if (use_y_pal)
+ f->bd_fn.read_pal_plane(t, b, 0, sz_ctx, bx4, by4);
+ }
+
+ if (has_chroma && b->uv_mode == DC_PRED) {
+ const int pal_ctx = b->pal_sz[0] > 0;
+ const int use_uv_pal = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.pal_uv[pal_ctx]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-uv_pal[%d]: r=%d\n", use_uv_pal, ts->msac.rng);
+ if (use_uv_pal) // see aomedia bug 2183 for why we use luma coordinates
+ f->bd_fn.read_pal_uv(t, b, sz_ctx, bx4, by4);
+ }
+ }
+
+ if (b->y_mode == DC_PRED && !b->pal_sz[0] &&
+ imax(b_dim[2], b_dim[3]) <= 3 && f->seq_hdr->filter_intra)
+ {
+ const int is_filter = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.use_filter_intra[bs]);
+ if (is_filter) {
+ b->y_mode = FILTER_PRED;
+ b->y_angle = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.filter_intra, 4);
+ }
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-filterintramode[%d/%d]: r=%d\n",
+ b->y_mode, b->y_angle, ts->msac.rng);
+ }
+
+ if (b->pal_sz[0]) {
+ uint8_t *pal_idx;
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ assert(ts->frame_thread[p].pal_idx);
+ pal_idx = ts->frame_thread[p].pal_idx;
+ ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
+ } else
+ pal_idx = t->scratch.pal_idx_y;
+ read_pal_indices(t, pal_idx, b, 0, w4, h4, bw4, bh4);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-y-pal-indices: r=%d\n", ts->msac.rng);
+ }
+
+ if (has_chroma && b->pal_sz[1]) {
+ uint8_t *pal_idx;
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ assert(ts->frame_thread[p].pal_idx);
+ pal_idx = ts->frame_thread[p].pal_idx;
+ ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
+ } else
+ pal_idx = t->scratch.pal_idx_uv;
+ read_pal_indices(t, pal_idx, b, 1, cw4, ch4, cbw4, cbh4);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-uv-pal-indices: r=%d\n", ts->msac.rng);
+ }
+
+ const TxfmInfo *t_dim;
+ if (f->frame_hdr->segmentation.lossless[b->seg_id]) {
+ b->tx = b->uvtx = (int) TX_4X4;
+ t_dim = &dav1d_txfm_dimensions[TX_4X4];
+ } else {
+ b->tx = dav1d_max_txfm_size_for_bs[bs][0];
+ b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
+ t_dim = &dav1d_txfm_dimensions[b->tx];
+ if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE && t_dim->max > TX_4X4) {
+ const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4);
+ uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx];
+ int depth = dav1d_msac_decode_symbol_adapt4(&ts->msac, tx_cdf,
+ imin(t_dim->max, 2));
+
+ while (depth--) {
+ b->tx = t_dim->sub;
+ t_dim = &dav1d_txfm_dimensions[b->tx];
+ }
+ }
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-tx[%d]: r=%d\n", b->tx, ts->msac.rng);
+ }
+
+ // reconstruction
+ if (t->frame_thread.pass == 1) {
+ f->bd_fn.read_coef_blocks(t, bs, b);
+ } else {
+ f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
+ }
+
+ if (f->frame_hdr->loopfilter.level_y[0] ||
+ f->frame_hdr->loopfilter.level_y[1])
+ {
+ dav1d_create_lf_mask_intra(t->lf_mask, f->lf.level, f->b4_stride,
+ (const uint8_t (*)[8][2])
+ &ts->lflvl[b->seg_id][0][0][0],
+ t->bx, t->by, f->w4, f->h4, bs,
+ b->tx, b->uvtx, f->cur.p.layout,
+ &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
+ has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
+ has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
+ }
+
+ // update contexts
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx_intra, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
+ rep_macro(type, t->dir tx, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
+ rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
+ rep_macro(type, t->dir pal_sz, off, mul * b->pal_sz[0]); \
+ rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+ rep_macro(type, t->dir skip_mode, off, 0); \
+ rep_macro(type, t->dir intra, off, mul); \
+ rep_macro(type, t->dir skip, off, mul * b->skip); \
+ /* see aomedia bug 2183 for why we use luma coordinates here */ \
+ rep_macro(type, t->pal_sz_uv[diridx], off, mul * (has_chroma ? b->pal_sz[1] : 0)); \
+ if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \
+ rep_macro(type, t->dir comp_type, off, mul * COMP_INTER_NONE); \
+ rep_macro(type, t->dir ref[0], off, mul * ((uint8_t) -1)); \
+ rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) -1)); \
+ rep_macro(type, t->dir filter[0], off, mul * DAV1D_N_SWITCHABLE_FILTERS); \
+ rep_macro(type, t->dir filter[1], off, mul * DAV1D_N_SWITCHABLE_FILTERS); \
+ }
+ const enum IntraPredMode y_mode_nofilt =
+ b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+ if (b->pal_sz[0])
+ f->bd_fn.copy_pal_block_y(t, bx4, by4, bw4, bh4);
+ if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+ if (b->pal_sz[1])
+ f->bd_fn.copy_pal_block_uv(t, bx4, by4, bw4, bh4);
+ }
+ if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc)
+ splat_intraref(f->c, t, bs, bw4, bh4);
+ } else if (IS_KEY_OR_INTRA(f->frame_hdr)) {
+ // intra block copy
+ refmvs_candidate mvstack[8];
+ int n_mvs, ctx;
+ dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
+ (union refmvs_refpair) { .ref = { 0, -1 }},
+ bs, intra_edge_flags, t->by, t->bx);
+
+ if (mvstack[0].mv.mv[0].n)
+ b->mv[0] = mvstack[0].mv.mv[0];
+ else if (mvstack[1].mv.mv[0].n)
+ b->mv[0] = mvstack[1].mv.mv[0];
+ else {
+ if (t->by - (16 << f->seq_hdr->sb128) < ts->tiling.row_start) {
+ b->mv[0].y = 0;
+ b->mv[0].x = -(512 << f->seq_hdr->sb128) - 2048;
+ } else {
+ b->mv[0].y = -(512 << f->seq_hdr->sb128);
+ b->mv[0].x = 0;
+ }
+ }
+
+ const union mv ref = b->mv[0];
+ read_mv_residual(t, &b->mv[0], &ts->cdf.dmv, 0);
+
+ // clip intrabc motion vector to decoded parts of current tile
+ int border_left = ts->tiling.col_start * 4;
+ int border_top = ts->tiling.row_start * 4;
+ if (has_chroma) {
+ if (bw4 < 2 && ss_hor)
+ border_left += 4;
+ if (bh4 < 2 && ss_ver)
+ border_top += 4;
+ }
+ int src_left = t->bx * 4 + (b->mv[0].x >> 3);
+ int src_top = t->by * 4 + (b->mv[0].y >> 3);
+ int src_right = src_left + bw4 * 4;
+ int src_bottom = src_top + bh4 * 4;
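+        // the tile's right edge rounded up to a multiple of the block width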
+ const int border_right = ((ts->tiling.col_end + (bw4 - 1)) & ~(bw4 - 1)) * 4;
+
+ // check against left or right tile boundary and adjust if necessary
+ if (src_left < border_left) {
+ src_right += border_left - src_left;
+ src_left += border_left - src_left;
+ } else if (src_right > border_right) {
+ src_left -= src_right - border_right;
+ src_right -= src_right - border_right;
+ }
+ // check against top tile boundary and adjust if necessary
+ if (src_top < border_top) {
+ src_bottom += border_top - src_top;
+ src_top += border_top - src_top;
+ }
+
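+        // position and size, in pixels, of the superblock containing this block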
+ const int sbx = (t->bx >> (4 + f->seq_hdr->sb128)) << (6 + f->seq_hdr->sb128);
+ const int sby = (t->by >> (4 + f->seq_hdr->sb128)) << (6 + f->seq_hdr->sb128);
+ const int sb_size = 1 << (6 + f->seq_hdr->sb128);
+ // check for overlap with current superblock
+ if (src_bottom > sby && src_right > sbx) {
+ if (src_top - border_top >= src_bottom - sby) {
+                // if possible, move src up into the previous superblock row
+ src_top -= src_bottom - sby;
+ src_bottom -= src_bottom - sby;
+ } else if (src_left - border_left >= src_right - sbx) {
+                // if possible, move src left into the previous superblock
+ src_left -= src_right - sbx;
+ src_right -= src_right - sbx;
+ }
+ }
+ // move src up if it is below current superblock row
+ if (src_bottom > sby + sb_size) {
+ src_top -= src_bottom - (sby + sb_size);
+ src_bottom -= src_bottom - (sby + sb_size);
+ }
+ // error out if mv still overlaps with the current superblock
+ if (src_bottom > sby && src_right > sbx)
+ return -1;
+
+ b->mv[0].x = (src_left - t->bx * 4) * 8;
+ b->mv[0].y = (src_top - t->by * 4) * 8;
+
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-dmv[%d/%d,ref=%d/%d|%d/%d]: r=%d\n",
+ b->mv[0].y, b->mv[0].x, ref.y, ref.x,
+ mvstack[0].mv.mv[0].y, mvstack[0].mv.mv[0].x, ts->msac.rng);
+ read_vartx_tree(t, b, bs, bx4, by4);
+
+ // reconstruction
+ if (t->frame_thread.pass == 1) {
+ f->bd_fn.read_coef_blocks(t, bs, b);
+ b->filter2d = FILTER_2D_BILINEAR;
+ } else {
+ if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
+ }
+
+ splat_intrabc_mv(f->c, t, bs, b, bw4, bh4);
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
+ rep_macro(type, t->dir mode, off, mul * DC_PRED); \
+ rep_macro(type, t->dir pal_sz, off, 0); \
+ /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
+ rep_macro(type, t->pal_sz_uv[diridx], off, 0); \
+ rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+ rep_macro(type, t->dir skip_mode, off, 0); \
+ rep_macro(type, t->dir intra, off, 0); \
+ rep_macro(type, t->dir skip, off, mul * b->skip)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+ if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+ }
+ } else {
+ // inter-specific mode/mv coding
+ int is_comp, has_subpel_filter;
+
+ if (b->skip_mode) {
+ is_comp = 1;
+ } else if ((!seg || (seg->ref == -1 && !seg->globalmv && !seg->skip)) &&
+ f->frame_hdr->switchable_comp_refs && imin(bw4, bh4) > 1)
+ {
+ const int ctx = get_comp_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ is_comp = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp[ctx]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-compflag[%d]: r=%d\n", is_comp, ts->msac.rng);
+ } else {
+ is_comp = 0;
+ }
+
+ if (b->skip_mode) {
+ b->ref[0] = f->frame_hdr->skip_mode_refs[0];
+ b->ref[1] = f->frame_hdr->skip_mode_refs[1];
+ b->comp_type = COMP_INTER_AVG;
+ b->inter_mode = NEARESTMV_NEARESTMV;
+ b->drl_idx = NEAREST_DRL;
+ has_subpel_filter = 0;
+
+ refmvs_candidate mvstack[8];
+ int n_mvs, ctx;
+ dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
+ (union refmvs_refpair) { .ref = {
+ b->ref[0] + 1, b->ref[1] + 1 }},
+ bs, intra_edge_flags, t->by, t->bx);
+
+ b->mv[0] = mvstack[0].mv.mv[0];
+ b->mv[1] = mvstack[0].mv.mv[1];
+ fix_mv_precision(f->frame_hdr, &b->mv[0]);
+ fix_mv_precision(f->frame_hdr, &b->mv[1]);
+ if (DEBUG_BLOCK_INFO)
+                printf("Post-skipmodeblock[mv=1:y=%d,x=%d,2:y=%d,x=%d,refs=%d+%d]\n",
+ b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x,
+ b->ref[0], b->ref[1]);
+ } else if (is_comp) {
+ const int dir_ctx = get_comp_dir_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ if (dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_dir[dir_ctx]))
+ {
+ // bidir - first reference (fw)
+ const int ctx1 = av1_get_fwd_ref_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ if (dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_fwd_ref[0][ctx1]))
+ {
+ const int ctx2 = av1_get_fwd_ref_2_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->ref[0] = 2 + dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_fwd_ref[2][ctx2]);
+ } else {
+ const int ctx2 = av1_get_fwd_ref_1_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->ref[0] = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_fwd_ref[1][ctx2]);
+ }
+
+ // second reference (bw)
+ const int ctx3 = av1_get_bwd_ref_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ if (dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_bwd_ref[0][ctx3]))
+ {
+ b->ref[1] = 6;
+ } else {
+ const int ctx4 = av1_get_bwd_ref_1_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->ref[1] = 4 + dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_bwd_ref[1][ctx4]);
+ }
+ } else {
+ // unidir
+ const int uctx_p = av1_get_uni_p_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ if (dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_uni_ref[0][uctx_p]))
+ {
+ b->ref[0] = 4;
+ b->ref[1] = 6;
+ } else {
+ const int uctx_p1 = av1_get_uni_p1_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->ref[0] = 0;
+ b->ref[1] = 1 + dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_uni_ref[1][uctx_p1]);
+ if (b->ref[1] == 2) {
+ const int uctx_p2 = av1_get_uni_p2_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->ref[1] += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_uni_ref[2][uctx_p2]);
+ }
+ }
+ }
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-refs[%d/%d]: r=%d\n",
+ b->ref[0], b->ref[1], ts->msac.rng);
+
+ refmvs_candidate mvstack[8];
+ int n_mvs, ctx;
+ dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
+ (union refmvs_refpair) { .ref = {
+ b->ref[0] + 1, b->ref[1] + 1 }},
+ bs, intra_edge_flags, t->by, t->bx);
+
+ b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+ ts->cdf.m.comp_inter_mode[ctx],
+ N_COMP_INTER_PRED_MODES - 1);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-compintermode[%d,ctx=%d,n_mvs=%d]: r=%d\n",
+ b->inter_mode, ctx, n_mvs, ts->msac.rng);
+
+ const uint8_t *const im = dav1d_comp_inter_pred_modes[b->inter_mode];
+ b->drl_idx = NEAREST_DRL;
+ if (b->inter_mode == NEWMV_NEWMV) {
+ if (n_mvs > 1) { // NEARER, NEAR or NEARISH
+ const int drl_ctx_v1 = get_drl_context(mvstack, 0);
+ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.drl_bit[drl_ctx_v1]);
+ if (b->drl_idx == NEARER_DRL && n_mvs > 2) {
+ const int drl_ctx_v2 = get_drl_context(mvstack, 1);
+ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.drl_bit[drl_ctx_v2]);
+ }
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-drlidx[%d,n_mvs=%d]: r=%d\n",
+ b->drl_idx, n_mvs, ts->msac.rng);
+ }
+ } else if (im[0] == NEARMV || im[1] == NEARMV) {
+ b->drl_idx = NEARER_DRL;
+ if (n_mvs > 2) { // NEAR or NEARISH
+ const int drl_ctx_v2 = get_drl_context(mvstack, 1);
+ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.drl_bit[drl_ctx_v2]);
+ if (b->drl_idx == NEAR_DRL && n_mvs > 3) {
+ const int drl_ctx_v3 = get_drl_context(mvstack, 2);
+ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.drl_bit[drl_ctx_v3]);
+ }
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-drlidx[%d,n_mvs=%d]: r=%d\n",
+ b->drl_idx, n_mvs, ts->msac.rng);
+ }
+ }
+ assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
+
+#define assign_comp_mv(idx) \
+ switch (im[idx]) { \
+ case NEARMV: \
+ case NEARESTMV: \
+ b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \
+ fix_mv_precision(f->frame_hdr, &b->mv[idx]); \
+ break; \
+ case GLOBALMV: \
+ has_subpel_filter |= \
+ f->frame_hdr->gmv[b->ref[idx]].type == DAV1D_WM_TYPE_TRANSLATION; \
+ b->mv[idx] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[idx]], \
+ t->bx, t->by, bw4, bh4, f->frame_hdr); \
+ break; \
+ case NEWMV: \
+ b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \
+ read_mv_residual(t, &b->mv[idx], &ts->cdf.mv, \
+ !f->frame_hdr->force_integer_mv); \
+ break; \
+ }
+ has_subpel_filter = imin(bw4, bh4) == 1 ||
+ b->inter_mode != GLOBALMV_GLOBALMV;
+ assign_comp_mv(0);
+ assign_comp_mv(1);
+#undef assign_comp_mv
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-residual_mv[1:y=%d,x=%d,2:y=%d,x=%d]: r=%d\n",
+ b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x,
+ ts->msac.rng);
+
+ // jnt_comp vs. seg vs. wedge
+ int is_segwedge = 0;
+ if (f->seq_hdr->masked_compound) {
+ const int mask_ctx = get_mask_comp_ctx(t->a, &t->l, by4, bx4);
+
+ is_segwedge = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.mask_comp[mask_ctx]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-segwedge_vs_jntavg[%d,ctx=%d]: r=%d\n",
+ is_segwedge, mask_ctx, ts->msac.rng);
+ }
+
+ if (!is_segwedge) {
+ if (f->seq_hdr->jnt_comp) {
+ const int jnt_ctx =
+ get_jnt_comp_ctx(f->seq_hdr->order_hint_n_bits,
+ f->cur.frame_hdr->frame_offset,
+ f->refp[b->ref[0]].p.frame_hdr->frame_offset,
+ f->refp[b->ref[1]].p.frame_hdr->frame_offset,
+ t->a, &t->l, by4, bx4);
+ b->comp_type = COMP_INTER_WEIGHTED_AVG +
+ dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.jnt_comp[jnt_ctx]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-jnt_comp[%d,ctx=%d[ac:%d,ar:%d,lc:%d,lr:%d]]: r=%d\n",
+ b->comp_type == COMP_INTER_AVG,
+ jnt_ctx, t->a->comp_type[bx4], t->a->ref[0][bx4],
+ t->l.comp_type[by4], t->l.ref[0][by4],
+ ts->msac.rng);
+ } else {
+ b->comp_type = COMP_INTER_AVG;
+ }
+ } else {
+ if (wedge_allowed_mask & (1 << bs)) {
+ const int ctx = dav1d_wedge_ctx_lut[bs];
+ b->comp_type = COMP_INTER_WEDGE -
+ dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.wedge_comp[ctx]);
+ if (b->comp_type == COMP_INTER_WEDGE)
+ b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+ ts->cdf.m.wedge_idx[ctx], 15);
+ } else {
+ b->comp_type = COMP_INTER_SEG;
+ }
+ b->mask_sign = dav1d_msac_decode_bool_equi(&ts->msac);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-seg/wedge[%d,wedge_idx=%d,sign=%d]: r=%d\n",
+ b->comp_type == COMP_INTER_WEDGE,
+ b->wedge_idx, b->mask_sign, ts->msac.rng);
+ }
+ } else {
+ b->comp_type = COMP_INTER_NONE;
+
+ // ref
+ if (seg && seg->ref > 0) {
+ b->ref[0] = seg->ref - 1;
+ } else if (seg && (seg->globalmv || seg->skip)) {
+ b->ref[0] = 0;
+ } else {
+ const int ctx1 = av1_get_ref_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ if (dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.ref[0][ctx1]))
+ {
+ const int ctx2 = av1_get_ref_2_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ if (dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.ref[1][ctx2]))
+ {
+ b->ref[0] = 6;
+ } else {
+ const int ctx3 = av1_get_ref_6_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->ref[0] = 4 + dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.ref[5][ctx3]);
+ }
+ } else {
+ const int ctx2 = av1_get_ref_3_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ if (dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.ref[2][ctx2]))
+ {
+ const int ctx3 = av1_get_ref_5_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->ref[0] = 2 + dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.ref[4][ctx3]);
+ } else {
+ const int ctx3 = av1_get_ref_4_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->ref[0] = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.ref[3][ctx3]);
+ }
+ }
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-ref[%d]: r=%d\n", b->ref[0], ts->msac.rng);
+ }
+ b->ref[1] = -1;
+
+ refmvs_candidate mvstack[8];
+ int n_mvs, ctx;
+ dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
+ (union refmvs_refpair) { .ref = { b->ref[0] + 1, -1 }},
+ bs, intra_edge_flags, t->by, t->bx);
+
+ // mode parsing and mv derivation from ref_mvs
+ if ((seg && (seg->skip || seg->globalmv)) ||
+ dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.newmv_mode[ctx & 7]))
+ {
+ if ((seg && (seg->skip || seg->globalmv)) ||
+ !dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.globalmv_mode[(ctx >> 3) & 1]))
+ {
+ b->inter_mode = GLOBALMV;
+ b->mv[0] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[0]],
+ t->bx, t->by, bw4, bh4, f->frame_hdr);
+ has_subpel_filter = imin(bw4, bh4) == 1 ||
+ f->frame_hdr->gmv[b->ref[0]].type == DAV1D_WM_TYPE_TRANSLATION;
+ } else {
+ has_subpel_filter = 1;
+ if (dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.refmv_mode[(ctx >> 4) & 15]))
+ { // NEAREST, NEARER, NEAR or NEARISH
+ b->inter_mode = NEARMV;
+ b->drl_idx = NEARER_DRL;
+ if (n_mvs > 2) { // NEARER, NEAR or NEARISH
+ const int drl_ctx_v2 = get_drl_context(mvstack, 1);
+ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.drl_bit[drl_ctx_v2]);
+ if (b->drl_idx == NEAR_DRL && n_mvs > 3) { // NEAR or NEARISH
+ const int drl_ctx_v3 =
+ get_drl_context(mvstack, 2);
+ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.drl_bit[drl_ctx_v3]);
+ }
+ }
+ } else {
+ b->inter_mode = NEARESTMV;
+ b->drl_idx = NEAREST_DRL;
+ }
+ assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
+ b->mv[0] = mvstack[b->drl_idx].mv.mv[0];
+ if (b->drl_idx < NEAR_DRL)
+ fix_mv_precision(f->frame_hdr, &b->mv[0]);
+ }
+
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-intermode[%d,drl=%d,mv=y:%d,x:%d,n_mvs=%d]: r=%d\n",
+ b->inter_mode, b->drl_idx, b->mv[0].y, b->mv[0].x, n_mvs,
+ ts->msac.rng);
+ } else {
+ has_subpel_filter = 1;
+ b->inter_mode = NEWMV;
+ b->drl_idx = NEAREST_DRL;
+ if (n_mvs > 1) { // NEARER, NEAR or NEARISH
+ const int drl_ctx_v1 = get_drl_context(mvstack, 0);
+ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.drl_bit[drl_ctx_v1]);
+ if (b->drl_idx == NEARER_DRL && n_mvs > 2) { // NEAR or NEARISH
+ const int drl_ctx_v2 = get_drl_context(mvstack, 1);
+ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.drl_bit[drl_ctx_v2]);
+ }
+ }
+ assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
+ if (n_mvs > 1) {
+ b->mv[0] = mvstack[b->drl_idx].mv.mv[0];
+ } else {
+ assert(!b->drl_idx);
+ b->mv[0] = mvstack[0].mv.mv[0];
+ fix_mv_precision(f->frame_hdr, &b->mv[0]);
+ }
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-intermode[%d,drl=%d]: r=%d\n",
+ b->inter_mode, b->drl_idx, ts->msac.rng);
+ read_mv_residual(t, &b->mv[0], &ts->cdf.mv,
+ !f->frame_hdr->force_integer_mv);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-residualmv[mv=y:%d,x:%d]: r=%d\n",
+ b->mv[0].y, b->mv[0].x, ts->msac.rng);
+ }
+
+ // interintra flags
+ const int ii_sz_grp = dav1d_ymode_size_context[bs];
+ if (f->seq_hdr->inter_intra &&
+ interintra_allowed_mask & (1 << bs) &&
+ dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.interintra[ii_sz_grp]))
+ {
+ b->interintra_mode = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.interintra_mode[ii_sz_grp],
+ N_INTER_INTRA_PRED_MODES - 1);
+ const int wedge_ctx = dav1d_wedge_ctx_lut[bs];
+ b->interintra_type = INTER_INTRA_BLEND +
+ dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.interintra_wedge[wedge_ctx]);
+ if (b->interintra_type == INTER_INTRA_WEDGE)
+ b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+ ts->cdf.m.wedge_idx[wedge_ctx], 15);
+ } else {
+ b->interintra_type = INTER_INTRA_NONE;
+ }
+ if (DEBUG_BLOCK_INFO && f->seq_hdr->inter_intra &&
+ interintra_allowed_mask & (1 << bs))
+ {
+ printf("Post-interintra[t=%d,m=%d,w=%d]: r=%d\n",
+ b->interintra_type, b->interintra_mode,
+ b->wedge_idx, ts->msac.rng);
+ }
+
+ // motion variation
+ if (f->frame_hdr->switchable_motion_mode &&
+ b->interintra_type == INTER_INTRA_NONE && imin(bw4, bh4) >= 2 &&
+ // is not warped global motion
+ !(!f->frame_hdr->force_integer_mv && b->inter_mode == GLOBALMV &&
+ f->frame_hdr->gmv[b->ref[0]].type > DAV1D_WM_TYPE_TRANSLATION) &&
+ // has overlappable neighbours
+ ((have_left && findoddzero(&t->l.intra[by4 + 1], h4 >> 1)) ||
+ (have_top && findoddzero(&t->a->intra[bx4 + 1], w4 >> 1))))
+ {
+ // reaching here means the block allows obmc - check warp by
+ // finding matching-ref blocks in top/left edges
+ uint64_t mask[2] = { 0, 0 };
+ find_matching_ref(t, intra_edge_flags, bw4, bh4, w4, h4,
+ have_left, have_top, b->ref[0], mask);
+ const int allow_warp = !f->svc[b->ref[0]][0].scale &&
+ !f->frame_hdr->force_integer_mv &&
+ f->frame_hdr->warp_motion && (mask[0] | mask[1]);
+
+ b->motion_mode = allow_warp ?
+ dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.motion_mode[bs], 2) :
+ dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
+ if (b->motion_mode == MM_WARP) {
+ has_subpel_filter = 0;
+ derive_warpmv(t, bw4, bh4, mask, b->mv[0], &t->warpmv);
+#define signabs(v) v < 0 ? '-' : ' ', abs(v)
+ if (DEBUG_BLOCK_INFO)
+ printf("[ %c%x %c%x %c%x\n %c%x %c%x %c%x ]\n"
+ "alpha=%c%x, beta=%c%x, gamma=%c%x, delta=%c%x, "
+ "mv=y:%d,x:%d\n",
+ signabs(t->warpmv.matrix[0]),
+ signabs(t->warpmv.matrix[1]),
+ signabs(t->warpmv.matrix[2]),
+ signabs(t->warpmv.matrix[3]),
+ signabs(t->warpmv.matrix[4]),
+ signabs(t->warpmv.matrix[5]),
+ signabs(t->warpmv.u.p.alpha),
+ signabs(t->warpmv.u.p.beta),
+ signabs(t->warpmv.u.p.gamma),
+ signabs(t->warpmv.u.p.delta),
+ b->mv[0].y, b->mv[0].x);
+#undef signabs
+ if (t->frame_thread.pass) {
+ if (t->warpmv.type == DAV1D_WM_TYPE_AFFINE) {
+ b->matrix[0] = t->warpmv.matrix[2] - 0x10000;
+ b->matrix[1] = t->warpmv.matrix[3];
+ b->matrix[2] = t->warpmv.matrix[4];
+ b->matrix[3] = t->warpmv.matrix[5] - 0x10000;
+ } else {
+ b->matrix[0] = SHRT_MIN;
+ }
+ }
+ }
+
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-motionmode[%d]: r=%d [mask: 0x%" PRIx64 "/0x%"
+ PRIx64 "]\n", b->motion_mode, ts->msac.rng, mask[0],
+ mask[1]);
+ } else {
+ b->motion_mode = MM_TRANSLATION;
+ }
+ }
+
+ // subpel filter
+ enum Dav1dFilterMode filter[2];
+ if (f->frame_hdr->subpel_filter_mode == DAV1D_FILTER_SWITCHABLE) {
+ if (has_subpel_filter) {
+ const int comp = b->comp_type != COMP_INTER_NONE;
+ const int ctx1 = get_filter_ctx(t->a, &t->l, comp, 0, b->ref[0],
+ by4, bx4);
+ filter[0] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.filter[0][ctx1],
+ DAV1D_N_SWITCHABLE_FILTERS - 1);
+ if (f->seq_hdr->dual_filter) {
+ const int ctx2 = get_filter_ctx(t->a, &t->l, comp, 1,
+ b->ref[0], by4, bx4);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-subpel_filter1[%d,ctx=%d]: r=%d\n",
+ filter[0], ctx1, ts->msac.rng);
+ filter[1] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.filter[1][ctx2],
+ DAV1D_N_SWITCHABLE_FILTERS - 1);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-subpel_filter2[%d,ctx=%d]: r=%d\n",
+ filter[1], ctx2, ts->msac.rng);
+ } else {
+ filter[1] = filter[0];
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-subpel_filter[%d,ctx=%d]: r=%d\n",
+ filter[0], ctx1, ts->msac.rng);
+ }
+ } else {
+ filter[0] = filter[1] = DAV1D_FILTER_8TAP_REGULAR;
+ }
+ } else {
+ filter[0] = filter[1] = f->frame_hdr->subpel_filter_mode;
+ }
+ b->filter2d = dav1d_filter_2d[filter[1]][filter[0]];
+
+ read_vartx_tree(t, b, bs, bx4, by4);
+
+ // reconstruction
+ if (t->frame_thread.pass == 1) {
+ f->bd_fn.read_coef_blocks(t, bs, b);
+ } else {
+ if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
+ }
+
+ if (f->frame_hdr->loopfilter.level_y[0] ||
+ f->frame_hdr->loopfilter.level_y[1])
+ {
+ const int is_globalmv =
+ b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
+ const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
+ &ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
+ const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
+ enum RectTxfmSize ytx = b->max_ytx, uvtx = b->uvtx;
+ if (f->frame_hdr->segmentation.lossless[b->seg_id]) {
+ ytx = (enum RectTxfmSize) TX_4X4;
+ uvtx = (enum RectTxfmSize) TX_4X4;
+ }
+ dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride, lf_lvls,
+ t->bx, t->by, f->w4, f->h4, b->skip, bs,
+ ytx, tx_split, uvtx, f->cur.p.layout,
+ &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
+ has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
+ has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
+ }
+
+ // context updates
+ if (is_comp)
+ splat_tworef_mv(f->c, t, bs, b, bw4, bh4);
+ else
+ splat_oneref_mv(f->c, t, bs, b, bw4, bh4);
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+ rep_macro(type, t->dir skip_mode, off, mul * b->skip_mode); \
+ rep_macro(type, t->dir intra, off, 0); \
+ rep_macro(type, t->dir skip, off, mul * b->skip); \
+ rep_macro(type, t->dir pal_sz, off, 0); \
+ /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
+ rep_macro(type, t->pal_sz_uv[diridx], off, 0); \
+ rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
+ rep_macro(type, t->dir comp_type, off, mul * b->comp_type); \
+ rep_macro(type, t->dir filter[0], off, mul * filter[0]); \
+ rep_macro(type, t->dir filter[1], off, mul * filter[1]); \
+ rep_macro(type, t->dir mode, off, mul * b->inter_mode); \
+ rep_macro(type, t->dir ref[0], off, mul * b->ref[0]); \
+ rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) b->ref[1]))
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+
+ if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+ }
+ }
+
+ // update contexts
+ if (f->frame_hdr->segmentation.enabled &&
+ f->frame_hdr->segmentation.update_map)
+ {
+ uint8_t *seg_ptr = &f->cur_segmap[t->by * f->b4_stride + t->bx];
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ for (int y = 0; y < bh4; y++) { \
+ rep_macro(type, seg_ptr, 0, mul * b->seg_id); \
+ seg_ptr += f->b4_stride; \
+ }
+ case_set(bw4, NULL, 0, 0);
+#undef set_ctx
+ }
+ if (!b->skip) {
+ uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4 >> 1];
+ const unsigned mask = (~0U >> (32 - bw4)) << (bx4 & 15);
+ const int bx_idx = (bx4 & 16) >> 4;
+ for (int y = 0; y < bh4; y += 2, noskip_mask++) {
+ (*noskip_mask)[bx_idx] |= mask;
+ if (bw4 == 32) // this should be mask >> 16, but it's 0xffffffff anyway
+ (*noskip_mask)[1] |= mask;
+ }
+ }
+
+ if (t->frame_thread.pass == 1 && !b->intra && IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ const int sby = (t->by - ts->tiling.row_start) >> f->sb_shift;
+ int (*const lowest_px)[2] = ts->lowest_pixel[sby];
+
+ // keep track of motion vectors for each reference
+ if (b->comp_type == COMP_INTER_NONE) {
+ // y
+ if (imin(bw4, bh4) > 1 &&
+ ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
+ (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
+ {
+ affine_lowest_px_luma(t, &lowest_px[b->ref[0]][0], b_dim,
+ b->motion_mode == MM_WARP ? &t->warpmv :
+ &f->frame_hdr->gmv[b->ref[0]]);
+ } else {
+ mc_lowest_px(&lowest_px[b->ref[0]][0], t->by, bh4, b->mv[0].y,
+ 0, &f->svc[b->ref[0]][1]);
+ if (b->motion_mode == MM_OBMC) {
+ obmc_lowest_px(t, lowest_px, 0, b_dim, bx4, by4, w4, h4);
+ }
+ }
+
+ // uv
+ if (has_chroma) {
+ // sub8x8 derivation
+ int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
+ refmvs_block *const *r;
+ if (is_sub8x8) {
+ assert(ss_hor == 1);
+ r = &t->rt.r[(t->by & 31) + 5];
+ if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
+ if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
+ if (bw4 == 1 && bh4 == ss_ver)
+ is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
+ }
+
+ // chroma prediction
+ if (is_sub8x8) {
+ assert(ss_hor == 1);
+ if (bw4 == 1 && bh4 == ss_ver) {
+ const refmvs_block *const rr = &r[-1][t->bx - 1];
+ mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1],
+ t->by - 1, bh4, rr->mv.mv[0].y, ss_ver,
+ &f->svc[rr->ref.ref[0] - 1][1]);
+ }
+ if (bw4 == 1) {
+ const refmvs_block *const rr = &r[0][t->bx - 1];
+ mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1],
+ t->by, bh4, rr->mv.mv[0].y, ss_ver,
+ &f->svc[rr->ref.ref[0] - 1][1]);
+ }
+ if (bh4 == ss_ver) {
+ const refmvs_block *const rr = &r[-1][t->bx];
+ mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1],
+ t->by - 1, bh4, rr->mv.mv[0].y, ss_ver,
+ &f->svc[rr->ref.ref[0] - 1][1]);
+ }
+ mc_lowest_px(&lowest_px[b->ref[0]][1], t->by, bh4,
+ b->mv[0].y, ss_ver, &f->svc[b->ref[0]][1]);
+ } else {
+ if (imin(cbw4, cbh4) > 1 &&
+ ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
+ (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
+ {
+ affine_lowest_px_chroma(t, &lowest_px[b->ref[0]][1], b_dim,
+ b->motion_mode == MM_WARP ? &t->warpmv :
+ &f->frame_hdr->gmv[b->ref[0]]);
+ } else {
+ mc_lowest_px(&lowest_px[b->ref[0]][1],
+ t->by & ~ss_ver, bh4 << (bh4 == ss_ver),
+ b->mv[0].y, ss_ver, &f->svc[b->ref[0]][1]);
+ if (b->motion_mode == MM_OBMC) {
+ obmc_lowest_px(t, lowest_px, 1, b_dim, bx4, by4, w4, h4);
+ }
+ }
+ }
+ }
+ } else {
+ // y
+ for (int i = 0; i < 2; i++) {
+ if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
+ affine_lowest_px_luma(t, &lowest_px[b->ref[i]][0], b_dim,
+ &f->frame_hdr->gmv[b->ref[i]]);
+ } else {
+ mc_lowest_px(&lowest_px[b->ref[i]][0], t->by, bh4,
+ b->mv[i].y, 0, &f->svc[b->ref[i]][1]);
+ }
+ }
+
+ // uv
+ if (has_chroma) for (int i = 0; i < 2; i++) {
+ if (b->inter_mode == GLOBALMV_GLOBALMV &&
+ imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
+ {
+ affine_lowest_px_chroma(t, &lowest_px[b->ref[i]][1], b_dim,
+ &f->frame_hdr->gmv[b->ref[i]]);
+ } else {
+ mc_lowest_px(&lowest_px[b->ref[i]][1], t->by, bh4,
+ b->mv[i].y, ss_ver, &f->svc[b->ref[i]][1]);
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+#if __has_feature(memory_sanitizer)
+
+#include <sanitizer/msan_interface.h>
+
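+ /* MSan-only wrapper around decode_b(): after passes that reconstruct pixels
+ * (i.e. not the parse-only pass 1), verify that every pixel of the block is
+ * initialized memory and report the first offending row. */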
+static int checked_decode_b(Dav1dTaskContext *const t,
+ const enum BlockLevel bl,
+ const enum BlockSize bs,
+ const enum BlockPartition bp,
+ const enum EdgeFlags intra_edge_flags)
+{
+ const Dav1dFrameContext *const f = t->f;
+ const int err = decode_b(t, bl, bs, bp, intra_edge_flags);
+
+ if (err == 0 && !(t->frame_thread.pass & 1)) {
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = b_dim[0], bh4 = b_dim[1];
+ const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+ const int has_chroma = f->seq_hdr->layout != DAV1D_PIXEL_LAYOUT_I400 &&
+ (bw4 > ss_hor || t->bx & 1) &&
+ (bh4 > ss_ver || t->by & 1);
+
+ for (int p = 0; p < 1 + 2 * has_chroma; p++) {
+ const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const ptrdiff_t stride = f->cur.stride[!!p];
+ const int bx = t->bx & ~ss_hor;
+ const int by = t->by & ~ss_ver;
+ const int width = w4 << (2 - ss_hor + (bw4 == ss_hor));
+ const int height = h4 << (2 - ss_ver + (bh4 == ss_ver));
+
+ const uint8_t *data = f->cur.data[p] + (by << (2 - ss_ver)) * stride +
+ (bx << (2 - ss_hor + !!f->seq_hdr->hbd));
+
+ for (int y = 0; y < height; data += stride, y++) {
+ const size_t line_sz = width << !!f->seq_hdr->hbd;
+ if (__msan_test_shadow(data, line_sz) != -1) {
+ fprintf(stderr, "B[%d](%d, %d) w4:%d, h4:%d, row:%d\n",
+ p, bx, by, w4, h4, y);
+ __msan_check_mem_is_initialized(data, line_sz);
+ }
+ }
+ }
+ }
+
+ return err;
+}
+
+#define decode_b checked_decode_b
+
+#endif /* defined(__has_feature) */
+
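+ /* Recursively decode one level of the superblock partition tree: read (or,
+ * in pass 2, replay) the partition type, then decode the resulting blocks or
+ * recurse into the sub-partitions. */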
+static int decode_sb(Dav1dTaskContext *const t, const enum BlockLevel bl,
+ const EdgeNode *const node)
+{
+ const Dav1dFrameContext *const f = t->f;
+ Dav1dTileState *const ts = t->ts;
+ const int hsz = 16 >> bl;
+ const int have_h_split = f->bw > t->bx + hsz;
+ const int have_v_split = f->bh > t->by + hsz;
+
+ if (!have_h_split && !have_v_split) {
+ assert(bl < BL_8X8);
+ return decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 0));
+ }
+
+ uint16_t *pc;
+ enum BlockPartition bp;
+ int ctx, bx8, by8;
+ if (t->frame_thread.pass != 2) {
+ if (0 && bl == BL_64X64)
+ printf("poc=%d,y=%d,x=%d,bl=%d,r=%d\n",
+ f->frame_hdr->frame_offset, t->by, t->bx, bl, ts->msac.rng);
+ bx8 = (t->bx & 31) >> 1;
+ by8 = (t->by & 31) >> 1;
+ ctx = get_partition_ctx(t->a, &t->l, bl, by8, bx8);
+ pc = ts->cdf.m.partition[bl][ctx];
+ }
+
+ if (have_h_split && have_v_split) {
+ if (t->frame_thread.pass == 2) {
+ const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
+ bp = b->bl == bl ? b->bp : PARTITION_SPLIT;
+ } else {
+ bp = dav1d_msac_decode_symbol_adapt16(&ts->msac, pc,
+ dav1d_partition_type_count[bl]);
+ if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
+ (bp == PARTITION_V || bp == PARTITION_V4 ||
+ bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT))
+ {
+ return 1;
+ }
+ if (DEBUG_BLOCK_INFO)
+ printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
+ f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx, bp,
+ ts->msac.rng);
+ }
+ const uint8_t *const b = dav1d_block_sizes[bl][bp];
+
+ switch (bp) {
+ case PARTITION_NONE:
+ if (decode_b(t, bl, b[0], PARTITION_NONE, node->o))
+ return -1;
+ break;
+ case PARTITION_H:
+ if (decode_b(t, bl, b[0], PARTITION_H, node->h[0]))
+ return -1;
+ t->by += hsz;
+ if (decode_b(t, bl, b[0], PARTITION_H, node->h[1]))
+ return -1;
+ t->by -= hsz;
+ break;
+ case PARTITION_V:
+ if (decode_b(t, bl, b[0], PARTITION_V, node->v[0]))
+ return -1;
+ t->bx += hsz;
+ if (decode_b(t, bl, b[0], PARTITION_V, node->v[1]))
+ return -1;
+ t->bx -= hsz;
+ break;
+ case PARTITION_SPLIT:
+ if (bl == BL_8X8) {
+ const EdgeTip *const tip = (const EdgeTip *) node;
+ assert(hsz == 1);
+ if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, EDGE_ALL_TR_AND_BL))
+ return -1;
+ const enum Filter2d tl_filter = t->tl_4x4_filter;
+ t->bx++;
+ if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[0]))
+ return -1;
+ t->bx--;
+ t->by++;
+ if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[1]))
+ return -1;
+ t->bx++;
+ t->tl_4x4_filter = tl_filter;
+ if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[2]))
+ return -1;
+ t->bx--;
+ t->by--;
+#if ARCH_X86_64
+ if (t->frame_thread.pass) {
+ /* In 8-bit mode with 2-pass decoding the coefficient buffer
+ * can end up misaligned due to skips here. Work around
+ * the issue by explicitly realigning the buffer. */
+ const int p = t->frame_thread.pass & 1;
+ ts->frame_thread[p].cf =
+ (void*)(((uintptr_t)ts->frame_thread[p].cf + 63) & ~63);
+ }
+#endif
+ } else {
+ if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 0)))
+ return 1;
+ t->bx += hsz;
+ if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 1)))
+ return 1;
+ t->bx -= hsz;
+ t->by += hsz;
+ if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 2)))
+ return 1;
+ t->bx += hsz;
+ if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 3)))
+ return 1;
+ t->bx -= hsz;
+ t->by -= hsz;
+ }
+ break;
+ case PARTITION_T_TOP_SPLIT: {
+ if (decode_b(t, bl, b[0], PARTITION_T_TOP_SPLIT, EDGE_ALL_TR_AND_BL))
+ return -1;
+ t->bx += hsz;
+ if (decode_b(t, bl, b[0], PARTITION_T_TOP_SPLIT, node->v[1]))
+ return -1;
+ t->bx -= hsz;
+ t->by += hsz;
+ if (decode_b(t, bl, b[1], PARTITION_T_TOP_SPLIT, node->h[1]))
+ return -1;
+ t->by -= hsz;
+ break;
+ }
+ case PARTITION_T_BOTTOM_SPLIT: {
+ if (decode_b(t, bl, b[0], PARTITION_T_BOTTOM_SPLIT, node->h[0]))
+ return -1;
+ t->by += hsz;
+ if (decode_b(t, bl, b[1], PARTITION_T_BOTTOM_SPLIT, node->v[0]))
+ return -1;
+ t->bx += hsz;
+ if (decode_b(t, bl, b[1], PARTITION_T_BOTTOM_SPLIT, 0))
+ return -1;
+ t->bx -= hsz;
+ t->by -= hsz;
+ break;
+ }
+ case PARTITION_T_LEFT_SPLIT: {
+ if (decode_b(t, bl, b[0], PARTITION_T_LEFT_SPLIT, EDGE_ALL_TR_AND_BL))
+ return -1;
+ t->by += hsz;
+ if (decode_b(t, bl, b[0], PARTITION_T_LEFT_SPLIT, node->h[1]))
+ return -1;
+ t->by -= hsz;
+ t->bx += hsz;
+ if (decode_b(t, bl, b[1], PARTITION_T_LEFT_SPLIT, node->v[1]))
+ return -1;
+ t->bx -= hsz;
+ break;
+ }
+ case PARTITION_T_RIGHT_SPLIT: {
+ if (decode_b(t, bl, b[0], PARTITION_T_RIGHT_SPLIT, node->v[0]))
+ return -1;
+ t->bx += hsz;
+ if (decode_b(t, bl, b[1], PARTITION_T_RIGHT_SPLIT, node->h[0]))
+ return -1;
+ t->by += hsz;
+ if (decode_b(t, bl, b[1], PARTITION_T_RIGHT_SPLIT, 0))
+ return -1;
+ t->by -= hsz;
+ t->bx -= hsz;
+ break;
+ }
+ case PARTITION_H4: {
+ const EdgeBranch *const branch = (const EdgeBranch *) node;
+ if (decode_b(t, bl, b[0], PARTITION_H4, node->h[0]))
+ return -1;
+ t->by += hsz >> 1;
+ if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4))
+ return -1;
+ t->by += hsz >> 1;
+ if (decode_b(t, bl, b[0], PARTITION_H4, EDGE_ALL_LEFT_HAS_BOTTOM))
+ return -1;
+ t->by += hsz >> 1;
+ if (t->by < f->bh)
+ if (decode_b(t, bl, b[0], PARTITION_H4, node->h[1]))
+ return -1;
+ t->by -= hsz * 3 >> 1;
+ break;
+ }
+ case PARTITION_V4: {
+ const EdgeBranch *const branch = (const EdgeBranch *) node;
+ if (decode_b(t, bl, b[0], PARTITION_V4, node->v[0]))
+ return -1;
+ t->bx += hsz >> 1;
+ if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4))
+ return -1;
+ t->bx += hsz >> 1;
+ if (decode_b(t, bl, b[0], PARTITION_V4, EDGE_ALL_TOP_HAS_RIGHT))
+ return -1;
+ t->bx += hsz >> 1;
+ if (t->bx < f->bw)
+ if (decode_b(t, bl, b[0], PARTITION_V4, node->v[1]))
+ return -1;
+ t->bx -= hsz * 3 >> 1;
+ break;
+ }
+ default: assert(0);
+ }
+ } else if (have_h_split) {
+ unsigned is_split;
+ if (t->frame_thread.pass == 2) {
+ const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
+ is_split = b->bl != bl;
+ } else {
+ is_split = dav1d_msac_decode_bool(&ts->msac,
+ gather_top_partition_prob(pc, bl));
+ if (DEBUG_BLOCK_INFO)
+ printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
+ f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx,
+ is_split ? PARTITION_SPLIT : PARTITION_H, ts->msac.rng);
+ }
+
+ assert(bl < BL_8X8);
+ if (is_split) {
+ bp = PARTITION_SPLIT;
+ if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 0))) return 1;
+ t->bx += hsz;
+ if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 1))) return 1;
+ t->bx -= hsz;
+ } else {
+ bp = PARTITION_H;
+ if (decode_b(t, bl, dav1d_block_sizes[bl][PARTITION_H][0],
+ PARTITION_H, node->h[0]))
+ return -1;
+ }
+ } else {
+ assert(have_v_split);
+ unsigned is_split;
+ if (t->frame_thread.pass == 2) {
+ const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
+ is_split = b->bl != bl;
+ } else {
+ is_split = dav1d_msac_decode_bool(&ts->msac,
+ gather_left_partition_prob(pc, bl));
+ if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 && !is_split)
+ return 1;
+ if (DEBUG_BLOCK_INFO)
+ printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
+ f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx,
+ is_split ? PARTITION_SPLIT : PARTITION_V, ts->msac.rng);
+ }
+
+ assert(bl < BL_8X8);
+ if (is_split) {
+ bp = PARTITION_SPLIT;
+ if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 0))) return 1;
+ t->by += hsz;
+ if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 2))) return 1;
+ t->by -= hsz;
+ } else {
+ bp = PARTITION_V;
+ if (decode_b(t, bl, dav1d_block_sizes[bl][PARTITION_V][0],
+ PARTITION_V, node->v[0]))
+ return -1;
+ }
+ }
+
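+ // update the above/left partition contexts with this block's decision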
+ if (t->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->a->partition, bx8, mul * dav1d_al_part_ctx[0][bl][bp]); \
+ rep_macro(type, t->l.partition, by8, mul * dav1d_al_part_ctx[1][bl][bp])
+ case_set_upto16(hsz,,,);
+#undef set_ctx
+ }
+
+ return 0;
+}
+
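+ /* Reset an above/left block context to its default state; the reconstruction
+ * pass (2) only needs the intra/mode-related fields. */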
+static void reset_context(BlockContext *const ctx, const int keyframe, const int pass) {
+ memset(ctx->intra, keyframe, sizeof(ctx->intra));
+ memset(ctx->uvmode, DC_PRED, sizeof(ctx->uvmode));
+ if (keyframe)
+ memset(ctx->mode, DC_PRED, sizeof(ctx->mode));
+
+ if (pass == 2) return;
+
+ memset(ctx->partition, 0, sizeof(ctx->partition));
+ memset(ctx->skip, 0, sizeof(ctx->skip));
+ memset(ctx->skip_mode, 0, sizeof(ctx->skip_mode));
+ memset(ctx->tx_lpf_y, 2, sizeof(ctx->tx_lpf_y));
+ memset(ctx->tx_lpf_uv, 1, sizeof(ctx->tx_lpf_uv));
+ memset(ctx->tx_intra, -1, sizeof(ctx->tx_intra));
+ memset(ctx->tx, TX_64X64, sizeof(ctx->tx));
+ if (!keyframe) {
+ memset(ctx->ref, -1, sizeof(ctx->ref));
+ memset(ctx->comp_type, 0, sizeof(ctx->comp_type));
+ memset(ctx->mode, NEARESTMV, sizeof(ctx->mode));
+ }
+ memset(ctx->lcoef, 0x40, sizeof(ctx->lcoef));
+ memset(ctx->ccoef, 0x40, sizeof(ctx->ccoef));
+ memset(ctx->filter, DAV1D_N_SWITCHABLE_FILTERS, sizeof(ctx->filter));
+ memset(ctx->seg_pred, 0, sizeof(ctx->seg_pred));
+ memset(ctx->pal_sz, 0, sizeof(ctx->pal_sz));
+}
+
+// { Y+U+V, Y+U } * 4
+static const uint8_t ss_size_mul[4][2] = {
+ [DAV1D_PIXEL_LAYOUT_I400] = { 4, 4 },
+ [DAV1D_PIXEL_LAYOUT_I420] = { 6, 5 },
+ [DAV1D_PIXEL_LAYOUT_I422] = { 8, 6 },
+ [DAV1D_PIXEL_LAYOUT_I444] = { 12, 8 },
+};
+
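+ /* Initialize per-tile state: frame-thread buffer pointers, CDF copy, MSAC
+ * entropy decoder, tile extents and the default loop restoration reference. */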
+static void setup_tile(Dav1dTileState *const ts,
+ const Dav1dFrameContext *const f,
+ const uint8_t *const data, const size_t sz,
+ const int tile_row, const int tile_col,
+ const unsigned tile_start_off)
+{
+ const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
+ const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;
+ const int col_sb_end = f->frame_hdr->tiling.col_start_sb[tile_col + 1];
+ const int row_sb_start = f->frame_hdr->tiling.row_start_sb[tile_row];
+ const int row_sb_end = f->frame_hdr->tiling.row_start_sb[tile_row + 1];
+ const int sb_shift = f->sb_shift;
+
+ const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
+ for (int p = 0; p < 2; p++) {
+ ts->frame_thread[p].pal_idx = f->frame_thread.pal_idx ?
+ &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 8] :
+ NULL;
+ ts->frame_thread[p].cbi = f->frame_thread.cbi ?
+ &f->frame_thread.cbi[(size_t)tile_start_off * size_mul[0] / 64] :
+ NULL;
+ ts->frame_thread[p].cf = f->frame_thread.cf ?
+ (uint8_t*)f->frame_thread.cf +
+ (((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
+ NULL;
+ }
+
+ dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf);
+ ts->last_qidx = f->frame_hdr->quant.yac;
+ memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf));
+
+ dav1d_msac_init(&ts->msac, data, sz, f->frame_hdr->disable_cdf_update);
+
+ ts->tiling.row = tile_row;
+ ts->tiling.col = tile_col;
+ ts->tiling.col_start = col_sb_start << sb_shift;
+ ts->tiling.col_end = imin(col_sb_end << sb_shift, f->bw);
+ ts->tiling.row_start = row_sb_start << sb_shift;
+ ts->tiling.row_end = imin(row_sb_end << sb_shift, f->bh);
+
+ // Reference Restoration Unit (used for exp coding)
+ int sb_idx, unit_idx;
+ if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+ // vertical components only
+ sb_idx = (ts->tiling.row_start >> 5) * f->sr_sb128w;
+ unit_idx = (ts->tiling.row_start & 16) >> 3;
+ } else {
+ sb_idx = (ts->tiling.row_start >> 5) * f->sb128w + col_sb128_start;
+ unit_idx = ((ts->tiling.row_start & 16) >> 3) +
+ ((ts->tiling.col_start & 16) >> 4);
+ }
+ for (int p = 0; p < 3; p++) {
+ if (!((f->lf.restore_planes >> p) & 1U))
+ continue;
+
+ if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+ const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int d = f->frame_hdr->super_res.width_scale_denominator;
+ const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p];
+ const int rnd = (8 << unit_size_log2) - 1, shift = unit_size_log2 + 3;
+ const int x = ((4 * ts->tiling.col_start * d >> ss_hor) + rnd) >> shift;
+ const int px_x = x << (unit_size_log2 + ss_hor);
+ const int u_idx = unit_idx + ((px_x & 64) >> 6);
+ const int sb128x = px_x >> 7;
+ if (sb128x >= f->sr_sb128w) continue;
+ ts->lr_ref[p] = &f->lf.lr_mask[sb_idx + sb128x].lr[p][u_idx];
+ } else {
+ ts->lr_ref[p] = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];
+ }
+
+ ts->lr_ref[p]->filter_v[0] = 3;
+ ts->lr_ref[p]->filter_v[1] = -7;
+ ts->lr_ref[p]->filter_v[2] = 15;
+ ts->lr_ref[p]->filter_h[0] = 3;
+ ts->lr_ref[p]->filter_h[1] = -7;
+ ts->lr_ref[p]->filter_h[2] = 15;
+ ts->lr_ref[p]->sgr_weights[0] = -32;
+ ts->lr_ref[p]->sgr_weights[1] = 31;
+ }
+
+ if (f->c->n_tc > 1) {
+ for (int p = 0; p < 2; p++)
+ atomic_init(&ts->progress[p], row_sb_start);
+ }
+}
+
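+ /* Read the loop restoration type and coefficients for one unit of plane p,
+ * predicted from the previous unit (ts->lr_ref) via subexponential coding. */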
+static void read_restoration_info(Dav1dTaskContext *const t,
+ Av1RestorationUnit *const lr, const int p,
+ const enum Dav1dRestorationType frame_type)
+{
+ const Dav1dFrameContext *const f = t->f;
+ Dav1dTileState *const ts = t->ts;
+
+ if (frame_type == DAV1D_RESTORATION_SWITCHABLE) {
+ const int filter = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.restore_switchable, 2);
+ lr->type = filter + !!filter; /* NONE/WIENER/SGRPROJ */
+ } else {
+ const unsigned type =
+ dav1d_msac_decode_bool_adapt(&ts->msac,
+ frame_type == DAV1D_RESTORATION_WIENER ?
+ ts->cdf.m.restore_wiener : ts->cdf.m.restore_sgrproj);
+ lr->type = type ? frame_type : DAV1D_RESTORATION_NONE;
+ }
+
+ if (lr->type == DAV1D_RESTORATION_WIENER) {
+ lr->filter_v[0] = p ? 0 :
+ dav1d_msac_decode_subexp(&ts->msac,
+ ts->lr_ref[p]->filter_v[0] + 5, 16, 1) - 5;
+ lr->filter_v[1] =
+ dav1d_msac_decode_subexp(&ts->msac,
+ ts->lr_ref[p]->filter_v[1] + 23, 32, 2) - 23;
+ lr->filter_v[2] =
+ dav1d_msac_decode_subexp(&ts->msac,
+ ts->lr_ref[p]->filter_v[2] + 17, 64, 3) - 17;
+
+ lr->filter_h[0] = p ? 0 :
+ dav1d_msac_decode_subexp(&ts->msac,
+ ts->lr_ref[p]->filter_h[0] + 5, 16, 1) - 5;
+ lr->filter_h[1] =
+ dav1d_msac_decode_subexp(&ts->msac,
+ ts->lr_ref[p]->filter_h[1] + 23, 32, 2) - 23;
+ lr->filter_h[2] =
+ dav1d_msac_decode_subexp(&ts->msac,
+ ts->lr_ref[p]->filter_h[2] + 17, 64, 3) - 17;
+ memcpy(lr->sgr_weights, ts->lr_ref[p]->sgr_weights, sizeof(lr->sgr_weights));
+ ts->lr_ref[p] = lr;
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-lr_wiener[pl=%d,v[%d,%d,%d],h[%d,%d,%d]]: r=%d\n",
+ p, lr->filter_v[0], lr->filter_v[1],
+ lr->filter_v[2], lr->filter_h[0],
+ lr->filter_h[1], lr->filter_h[2], ts->msac.rng);
+ } else if (lr->type == DAV1D_RESTORATION_SGRPROJ) {
+ const unsigned idx = dav1d_msac_decode_bools(&ts->msac, 4);
+ const uint16_t *const sgr_params = dav1d_sgr_params[idx];
+ lr->type += idx;
+ lr->sgr_weights[0] = sgr_params[0] ? dav1d_msac_decode_subexp(&ts->msac,
+ ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 : 0;
+ lr->sgr_weights[1] = sgr_params[1] ? dav1d_msac_decode_subexp(&ts->msac,
+ ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 : 95;
+ memcpy(lr->filter_v, ts->lr_ref[p]->filter_v, sizeof(lr->filter_v));
+ memcpy(lr->filter_h, ts->lr_ref[p]->filter_h, sizeof(lr->filter_h));
+ ts->lr_ref[p] = lr;
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-lr_sgrproj[pl=%d,idx=%d,w[%d,%d]]: r=%d\n",
+ p, idx, lr->sgr_weights[0],
+ lr->sgr_weights[1], ts->msac.rng);
+ }
+}
+
+int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) {
+ const Dav1dFrameContext *const f = t->f;
+ const enum BlockLevel root_bl = f->seq_hdr->sb128 ? BL_128X128 : BL_64X64;
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dContext *const c = f->c;
+ const int sb_step = f->sb_step;
+ const int tile_row = ts->tiling.row, tile_col = ts->tiling.col;
+ const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
+ const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;
+
+ if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
+ dav1d_refmvs_tile_sbrow_init(&t->rt, &f->rf, ts->tiling.col_start,
+ ts->tiling.col_end, ts->tiling.row_start,
+ ts->tiling.row_end, t->by >> f->sb_shift,
+ ts->tiling.row, t->frame_thread.pass);
+ }
+
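+ // reset per-reference lowest-pixel tracking for this sbrow before decoding
+ // (only needed with frame threading)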
+ if (IS_INTER_OR_SWITCH(f->frame_hdr) && c->n_fc > 1) {
+ const int sby = (t->by - ts->tiling.row_start) >> f->sb_shift;
+ int (*const lowest_px)[2] = ts->lowest_pixel[sby];
+ for (int n = 0; n < 7; n++)
+ for (int m = 0; m < 2; m++)
+ lowest_px[n][m] = INT_MIN;
+ }
+
+ reset_context(&t->l, IS_KEY_OR_INTRA(f->frame_hdr), t->frame_thread.pass);
+ if (t->frame_thread.pass == 2) {
+ const int off_2pass = c->n_tc > 1 ? f->sb128w * f->frame_hdr->tiling.rows : 0;
+ for (t->bx = ts->tiling.col_start,
+ t->a = f->a + off_2pass + col_sb128_start + tile_row * f->sb128w;
+ t->bx < ts->tiling.col_end; t->bx += sb_step)
+ {
+ if (atomic_load_explicit(c->flush, memory_order_acquire))
+ return 1;
+ if (decode_sb(t, root_bl, dav1d_intra_edge_tree[root_bl]))
+ return 1;
+ if (t->bx & 16 || f->seq_hdr->sb128)
+ t->a++;
+ }
+ f->bd_fn.backup_ipred_edge(t);
+ return 0;
+ }
+
+ // error out on symbol decoder overread
+ if (ts->msac.cnt < -15) return 1;
+
+ if (f->c->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) {
+ f->c->refmvs_dsp.load_tmvs(&f->rf, ts->tiling.row,
+ ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
+ t->by >> 1, (t->by + sb_step) >> 1);
+ }
+ memset(t->pal_sz_uv[1], 0, sizeof(*t->pal_sz_uv));
+ const int sb128y = t->by >> 5;
+ for (t->bx = ts->tiling.col_start, t->a = f->a + col_sb128_start + tile_row * f->sb128w,
+ t->lf_mask = f->lf.mask + sb128y * f->sb128w + col_sb128_start;
+ t->bx < ts->tiling.col_end; t->bx += sb_step)
+ {
+ if (atomic_load_explicit(c->flush, memory_order_acquire))
+ return 1;
+ if (root_bl == BL_128X128) {
+ t->cur_sb_cdef_idx_ptr = t->lf_mask->cdef_idx;
+ t->cur_sb_cdef_idx_ptr[0] = -1;
+ t->cur_sb_cdef_idx_ptr[1] = -1;
+ t->cur_sb_cdef_idx_ptr[2] = -1;
+ t->cur_sb_cdef_idx_ptr[3] = -1;
+ } else {
+ t->cur_sb_cdef_idx_ptr =
+ &t->lf_mask->cdef_idx[((t->bx & 16) >> 4) +
+ ((t->by & 16) >> 3)];
+ t->cur_sb_cdef_idx_ptr[0] = -1;
+ }
+ // Restoration filter
+ for (int p = 0; p < 3; p++) {
+ if (!((f->lf.restore_planes >> p) & 1U))
+ continue;
+
+ const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p];
+ const int y = t->by * 4 >> ss_ver;
+ const int h = (f->cur.p.h + ss_ver) >> ss_ver;
+
+ const int unit_size = 1 << unit_size_log2;
+ const unsigned mask = unit_size - 1;
+ if (y & mask) continue;
+ const int half_unit = unit_size >> 1;
+ // Round half up at frame boundaries, if there's more than one
+ // restoration unit
+ if (y && y + half_unit > h) continue;
+
+ const enum Dav1dRestorationType frame_type = f->frame_hdr->restoration.type[p];
+
+ if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+ const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
+ const int n_units = imax(1, (w + half_unit) >> unit_size_log2);
+
+ const int d = f->frame_hdr->super_res.width_scale_denominator;
+ const int rnd = unit_size * 8 - 1, shift = unit_size_log2 + 3;
+ const int x0 = ((4 * t->bx * d >> ss_hor) + rnd) >> shift;
+ const int x1 = ((4 * (t->bx + sb_step) * d >> ss_hor) + rnd) >> shift;
+
+ for (int x = x0; x < imin(x1, n_units); x++) {
+ const int px_x = x << (unit_size_log2 + ss_hor);
+ const int sb_idx = (t->by >> 5) * f->sr_sb128w + (px_x >> 7);
+ const int unit_idx = ((t->by & 16) >> 3) + ((px_x & 64) >> 6);
+ Av1RestorationUnit *const lr = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];
+
+ read_restoration_info(t, lr, p, frame_type);
+ }
+ } else {
+ const int x = 4 * t->bx >> ss_hor;
+ if (x & mask) continue;
+ const int w = (f->cur.p.w + ss_hor) >> ss_hor;
+ // Round half up at frame boundaries, if there's more than one
+ // restoration unit
+ if (x && x + half_unit > w) continue;
+ const int sb_idx = (t->by >> 5) * f->sr_sb128w + (t->bx >> 5);
+ const int unit_idx = ((t->by & 16) >> 3) + ((t->bx & 16) >> 4);
+ Av1RestorationUnit *const lr = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];
+
+ read_restoration_info(t, lr, p, frame_type);
+ }
+ }
+ if (decode_sb(t, root_bl, dav1d_intra_edge_tree[root_bl]))
+ return 1;
+ if (t->bx & 16 || f->seq_hdr->sb128) {
+ t->a++;
+ t->lf_mask++;
+ }
+ }
+
+ if (f->seq_hdr->ref_frame_mvs && f->c->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ dav1d_refmvs_save_tmvs(&f->c->refmvs_dsp, &t->rt,
+ ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
+ t->by >> 1, (t->by + sb_step) >> 1);
+ }
+
+ // backup pre-loopfilter pixels for intra prediction of the next sbrow
+ if (t->frame_thread.pass != 1)
+ f->bd_fn.backup_ipred_edge(t);
+
+ // backup t->a/l.tx_lpf_y/uv at tile boundaries to use them to "fix"
+ // up the initial value in neighbour tiles when running the loopfilter
+ int align_h = (f->bh + 31) & ~31;
+ memcpy(&f->lf.tx_lpf_right_edge[0][align_h * tile_col + t->by],
+ &t->l.tx_lpf_y[t->by & 16], sb_step);
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ align_h >>= ss_ver;
+ memcpy(&f->lf.tx_lpf_right_edge[1][align_h * tile_col + (t->by >> ss_ver)],
+ &t->l.tx_lpf_uv[(t->by & 16) >> ss_ver], sb_step >> ss_ver);
+
+ return 0;
+}
+
+int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
+ const Dav1dContext *const c = f->c;
+ int retval = DAV1D_ERR(ENOMEM);
+
+ if (f->sbh > f->lf.start_of_tile_row_sz) {
+ dav1d_free(f->lf.start_of_tile_row);
+ f->lf.start_of_tile_row = dav1d_malloc(ALLOC_TILE, f->sbh * sizeof(uint8_t));
+ if (!f->lf.start_of_tile_row) {
+ f->lf.start_of_tile_row_sz = 0;
+ goto error;
+ }
+ f->lf.start_of_tile_row_sz = f->sbh;
+ }
+ int sby = 0;
+ for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
+ f->lf.start_of_tile_row[sby++] = tile_row;
+ while (sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1])
+ f->lf.start_of_tile_row[sby++] = 0;
+ }
+
+ const int n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
+ if (n_ts != f->n_ts) {
+ if (c->n_fc > 1) {
+ dav1d_free(f->frame_thread.tile_start_off);
+ f->frame_thread.tile_start_off =
+ dav1d_malloc(ALLOC_TILE, sizeof(*f->frame_thread.tile_start_off) * n_ts);
+ if (!f->frame_thread.tile_start_off) {
+ f->n_ts = 0;
+ goto error;
+ }
+ }
+ dav1d_free_aligned(f->ts);
+ f->ts = dav1d_alloc_aligned(ALLOC_TILE, sizeof(*f->ts) * n_ts, 32);
+ if (!f->ts) goto error;
+ f->n_ts = n_ts;
+ }
+
+ const int a_sz = f->sb128w * f->frame_hdr->tiling.rows * (1 + (c->n_fc > 1 && c->n_tc > 1));
+ if (a_sz != f->a_sz) {
+ dav1d_free(f->a);
+ f->a = dav1d_malloc(ALLOC_TILE, sizeof(*f->a) * a_sz);
+ if (!f->a) {
+ f->a_sz = 0;
+ goto error;
+ }
+ f->a_sz = a_sz;
+ }
+
+ const int num_sb128 = f->sb128w * f->sb128h;
+ const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
+ const int hbd = !!f->seq_hdr->hbd;
+ if (c->n_fc > 1) {
+ const unsigned sb_step4 = f->sb_step * 4;
+ int tile_idx = 0;
+ for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
+ const unsigned row_off = f->frame_hdr->tiling.row_start_sb[tile_row] *
+ sb_step4 * f->sb128w * 128;
+ const unsigned b_diff = (f->frame_hdr->tiling.row_start_sb[tile_row + 1] -
+ f->frame_hdr->tiling.row_start_sb[tile_row]) * sb_step4;
+ for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
+ f->frame_thread.tile_start_off[tile_idx++] = row_off + b_diff *
+ f->frame_hdr->tiling.col_start_sb[tile_col] * sb_step4;
+ }
+ }
+
+ const int lowest_pixel_mem_sz = f->frame_hdr->tiling.cols * f->sbh;
+ if (lowest_pixel_mem_sz != f->tile_thread.lowest_pixel_mem_sz) {
+ dav1d_free(f->tile_thread.lowest_pixel_mem);
+ f->tile_thread.lowest_pixel_mem =
+ dav1d_malloc(ALLOC_TILE, lowest_pixel_mem_sz *
+ sizeof(*f->tile_thread.lowest_pixel_mem));
+ if (!f->tile_thread.lowest_pixel_mem) {
+ f->tile_thread.lowest_pixel_mem_sz = 0;
+ goto error;
+ }
+ f->tile_thread.lowest_pixel_mem_sz = lowest_pixel_mem_sz;
+ }
+ int (*lowest_pixel_ptr)[7][2] = f->tile_thread.lowest_pixel_mem;
+ for (int tile_row = 0, tile_row_base = 0; tile_row < f->frame_hdr->tiling.rows;
+ tile_row++, tile_row_base += f->frame_hdr->tiling.cols)
+ {
+ const int tile_row_sb_h = f->frame_hdr->tiling.row_start_sb[tile_row + 1] -
+ f->frame_hdr->tiling.row_start_sb[tile_row];
+ for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
+ f->ts[tile_row_base + tile_col].lowest_pixel = lowest_pixel_ptr;
+ lowest_pixel_ptr += tile_row_sb_h;
+ }
+ }
+
+ const int cbi_sz = num_sb128 * size_mul[0];
+ if (cbi_sz != f->frame_thread.cbi_sz) {
+ dav1d_free_aligned(f->frame_thread.cbi);
+ f->frame_thread.cbi =
+ dav1d_alloc_aligned(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) *
+ cbi_sz * 32 * 32 / 4, 64);
+ if (!f->frame_thread.cbi) {
+ f->frame_thread.cbi_sz = 0;
+ goto error;
+ }
+ f->frame_thread.cbi_sz = cbi_sz;
+ }
+
+ const int cf_sz = (num_sb128 * size_mul[0]) << hbd;
+ if (cf_sz != f->frame_thread.cf_sz) {
+ dav1d_free_aligned(f->frame_thread.cf);
+ f->frame_thread.cf =
+ dav1d_alloc_aligned(ALLOC_COEF, (size_t)cf_sz * 128 * 128 / 2, 64);
+ if (!f->frame_thread.cf) {
+ f->frame_thread.cf_sz = 0;
+ goto error;
+ }
+ memset(f->frame_thread.cf, 0, (size_t)cf_sz * 128 * 128 / 2);
+ f->frame_thread.cf_sz = cf_sz;
+ }
+
+ if (f->frame_hdr->allow_screen_content_tools) {
+ const int pal_sz = num_sb128 << hbd;
+ if (pal_sz != f->frame_thread.pal_sz) {
+ dav1d_free_aligned(f->frame_thread.pal);
+ f->frame_thread.pal =
+ dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal) *
+ pal_sz * 16 * 16, 64);
+ if (!f->frame_thread.pal) {
+ f->frame_thread.pal_sz = 0;
+ goto error;
+ }
+ f->frame_thread.pal_sz = pal_sz;
+ }
+
+ const int pal_idx_sz = num_sb128 * size_mul[1];
+ if (pal_idx_sz != f->frame_thread.pal_idx_sz) {
+ dav1d_free_aligned(f->frame_thread.pal_idx);
+ f->frame_thread.pal_idx =
+ dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal_idx) *
+ pal_idx_sz * 128 * 128 / 8, 64);
+ if (!f->frame_thread.pal_idx) {
+ f->frame_thread.pal_idx_sz = 0;
+ goto error;
+ }
+ f->frame_thread.pal_idx_sz = pal_idx_sz;
+ }
+ } else if (f->frame_thread.pal) {
+ dav1d_freep_aligned(&f->frame_thread.pal);
+ dav1d_freep_aligned(&f->frame_thread.pal_idx);
+ f->frame_thread.pal_sz = f->frame_thread.pal_idx_sz = 0;
+ }
+ }
+
+ // update allocation of cdef line buffers (plus lpf copies when resizing
+ // with multiple threads)
+ ptrdiff_t y_stride = f->cur.stride[0], uv_stride = f->cur.stride[1];
+ const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
+ const int need_cdef_lpf_copy = c->n_tc > 1 && has_resize;
+ if (y_stride * f->sbh * 4 != f->lf.cdef_buf_plane_sz[0] ||
+ uv_stride * f->sbh * 8 != f->lf.cdef_buf_plane_sz[1] ||
+ need_cdef_lpf_copy != f->lf.need_cdef_lpf_copy ||
+ f->sbh != f->lf.cdef_buf_sbh)
+ {
+ dav1d_free_aligned(f->lf.cdef_line_buf);
+ size_t alloc_sz = 64;
+ alloc_sz += (size_t)llabs(y_stride) * 4 * f->sbh << need_cdef_lpf_copy;
+ alloc_sz += (size_t)llabs(uv_stride) * 8 * f->sbh << need_cdef_lpf_copy;
+ uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(ALLOC_CDEF, alloc_sz, 32);
+ if (!ptr) {
+ f->lf.cdef_buf_plane_sz[0] = f->lf.cdef_buf_plane_sz[1] = 0;
+ goto error;
+ }
+
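+ // carve the buffer into per-sbrow cdef line sets (4 luma + 8 chroma rows);
+ // with negative strides, start at the end of the allocation so that adding
+ // the stride stays within it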
+ ptr += 32;
+ if (y_stride < 0) {
+ f->lf.cdef_line[0][0] = ptr - y_stride * (f->sbh * 4 - 1);
+ f->lf.cdef_line[1][0] = ptr - y_stride * (f->sbh * 4 - 3);
+ } else {
+ f->lf.cdef_line[0][0] = ptr + y_stride * 0;
+ f->lf.cdef_line[1][0] = ptr + y_stride * 2;
+ }
+ ptr += llabs(y_stride) * f->sbh * 4;
+ if (uv_stride < 0) {
+ f->lf.cdef_line[0][1] = ptr - uv_stride * (f->sbh * 8 - 1);
+ f->lf.cdef_line[0][2] = ptr - uv_stride * (f->sbh * 8 - 3);
+ f->lf.cdef_line[1][1] = ptr - uv_stride * (f->sbh * 8 - 5);
+ f->lf.cdef_line[1][2] = ptr - uv_stride * (f->sbh * 8 - 7);
+ } else {
+ f->lf.cdef_line[0][1] = ptr + uv_stride * 0;
+ f->lf.cdef_line[0][2] = ptr + uv_stride * 2;
+ f->lf.cdef_line[1][1] = ptr + uv_stride * 4;
+ f->lf.cdef_line[1][2] = ptr + uv_stride * 6;
+ }
+
+ if (need_cdef_lpf_copy) {
+ ptr += llabs(uv_stride) * f->sbh * 8;
+ if (y_stride < 0)
+ f->lf.cdef_lpf_line[0] = ptr - y_stride * (f->sbh * 4 - 1);
+ else
+ f->lf.cdef_lpf_line[0] = ptr;
+ ptr += llabs(y_stride) * f->sbh * 4;
+ if (uv_stride < 0) {
+ f->lf.cdef_lpf_line[1] = ptr - uv_stride * (f->sbh * 4 - 1);
+ f->lf.cdef_lpf_line[2] = ptr - uv_stride * (f->sbh * 8 - 1);
+ } else {
+ f->lf.cdef_lpf_line[1] = ptr;
+ f->lf.cdef_lpf_line[2] = ptr + uv_stride * f->sbh * 4;
+ }
+ }
+
+ f->lf.cdef_buf_plane_sz[0] = (int) y_stride * f->sbh * 4;
+ f->lf.cdef_buf_plane_sz[1] = (int) uv_stride * f->sbh * 8;
+ f->lf.need_cdef_lpf_copy = need_cdef_lpf_copy;
+ f->lf.cdef_buf_sbh = f->sbh;
+ }
+
+ const int sb128 = f->seq_hdr->sb128;
+ const int num_lines = c->n_tc > 1 ? f->sbh * 4 << sb128 : 12;
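+ // single-threaded decoding reuses a small 12-line rolling buffer; with
+ // worker threads, every superblock row gets its own set of lines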
+ y_stride = f->sr_cur.p.stride[0], uv_stride = f->sr_cur.p.stride[1];
+ if (y_stride * num_lines != f->lf.lr_buf_plane_sz[0] ||
+ uv_stride * num_lines * 2 != f->lf.lr_buf_plane_sz[1])
+ {
+ dav1d_free_aligned(f->lf.lr_line_buf);
+ // lr simd may overread the input, so slightly over-allocate the lpf buffer
+ size_t alloc_sz = 128;
+ alloc_sz += (size_t)llabs(y_stride) * num_lines;
+ alloc_sz += (size_t)llabs(uv_stride) * num_lines * 2;
+ uint8_t *ptr = f->lf.lr_line_buf = dav1d_alloc_aligned(ALLOC_LR, alloc_sz, 64);
+ if (!ptr) {
+ f->lf.lr_buf_plane_sz[0] = f->lf.lr_buf_plane_sz[1] = 0;
+ goto error;
+ }
+
+ ptr += 64;
+ if (y_stride < 0)
+ f->lf.lr_lpf_line[0] = ptr - y_stride * (num_lines - 1);
+ else
+ f->lf.lr_lpf_line[0] = ptr;
+ ptr += llabs(y_stride) * num_lines;
+ if (uv_stride < 0) {
+ f->lf.lr_lpf_line[1] = ptr - uv_stride * (num_lines * 1 - 1);
+ f->lf.lr_lpf_line[2] = ptr - uv_stride * (num_lines * 2 - 1);
+ } else {
+ f->lf.lr_lpf_line[1] = ptr;
+ f->lf.lr_lpf_line[2] = ptr + uv_stride * num_lines;
+ }
+
+ f->lf.lr_buf_plane_sz[0] = (int) y_stride * num_lines;
+ f->lf.lr_buf_plane_sz[1] = (int) uv_stride * num_lines * 2;
+ }
+
+ // update allocation for loopfilter masks
+ if (num_sb128 != f->lf.mask_sz) {
+ dav1d_free(f->lf.mask);
+ dav1d_free(f->lf.level);
+ f->lf.mask = dav1d_malloc(ALLOC_LF, sizeof(*f->lf.mask) * num_sb128);
+ // over-allocate by 3 bytes since some of the SIMD implementations
+ // index this from the level type and can thus over-read by up to 3
+ f->lf.level = dav1d_malloc(ALLOC_LF, sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3);
+ if (!f->lf.mask || !f->lf.level) {
+ f->lf.mask_sz = 0;
+ goto error;
+ }
+ if (c->n_fc > 1) {
+ dav1d_free(f->frame_thread.b);
+ f->frame_thread.b = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.b) *
+ num_sb128 * 32 * 32);
+ if (!f->frame_thread.b) {
+ f->lf.mask_sz = 0;
+ goto error;
+ }
+ }
+ f->lf.mask_sz = num_sb128;
+ }
+
+ f->sr_sb128w = (f->sr_cur.p.p.w + 127) >> 7;
+ const int lr_mask_sz = f->sr_sb128w * f->sb128h;
+ if (lr_mask_sz != f->lf.lr_mask_sz) {
+ dav1d_free(f->lf.lr_mask);
+ f->lf.lr_mask = dav1d_malloc(ALLOC_LR, sizeof(*f->lf.lr_mask) * lr_mask_sz);
+ if (!f->lf.lr_mask) {
+ f->lf.lr_mask_sz = 0;
+ goto error;
+ }
+ f->lf.lr_mask_sz = lr_mask_sz;
+ }
+ f->lf.restore_planes =
+ ((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
+ ((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
+ ((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
+ if (f->frame_hdr->loopfilter.sharpness != f->lf.last_sharpness) {
+ dav1d_calc_eih(&f->lf.lim_lut, f->frame_hdr->loopfilter.sharpness);
+ f->lf.last_sharpness = f->frame_hdr->loopfilter.sharpness;
+ }
+ dav1d_calc_lf_values(f->lf.lvl, f->frame_hdr, (int8_t[4]) { 0, 0, 0, 0 });
+ memset(f->lf.mask, 0, sizeof(*f->lf.mask) * num_sb128);
+
+ const int ipred_edge_sz = f->sbh * f->sb128w << hbd;
+ if (ipred_edge_sz != f->ipred_edge_sz) {
+ dav1d_free_aligned(f->ipred_edge[0]);
+ uint8_t *ptr = f->ipred_edge[0] =
+ dav1d_alloc_aligned(ALLOC_IPRED, ipred_edge_sz * 128 * 3, 64);
+ if (!ptr) {
+ f->ipred_edge_sz = 0;
+ goto error;
+ }
+ f->ipred_edge[1] = ptr + ipred_edge_sz * 128 * 1;
+ f->ipred_edge[2] = ptr + ipred_edge_sz * 128 * 2;
+ f->ipred_edge_sz = ipred_edge_sz;
+ }
+
+ const int re_sz = f->sb128h * f->frame_hdr->tiling.cols;
+ if (re_sz != f->lf.re_sz) {
+ dav1d_free(f->lf.tx_lpf_right_edge[0]);
+ f->lf.tx_lpf_right_edge[0] = dav1d_malloc(ALLOC_LF, re_sz * 32 * 2);
+ if (!f->lf.tx_lpf_right_edge[0]) {
+ f->lf.re_sz = 0;
+ goto error;
+ }
+ f->lf.tx_lpf_right_edge[1] = f->lf.tx_lpf_right_edge[0] + re_sz * 32;
+ f->lf.re_sz = re_sz;
+ }
+
+ // init ref mvs
+ if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
+ const int ret =
+ dav1d_refmvs_init_frame(&f->rf, f->seq_hdr, f->frame_hdr,
+ f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs,
+ f->c->n_tc, f->c->n_fc);
+ if (ret < 0) goto error;
+ }
+
+ // setup dequant tables
+ init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq);
+ if (f->frame_hdr->quant.qm)
+ for (int i = 0; i < N_RECT_TX_SIZES; i++) {
+ f->qm[i][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][i];
+ f->qm[i][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][i];
+ f->qm[i][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][i];
+ }
+ else
+ memset(f->qm, 0, sizeof(f->qm));
+
+ // setup jnt_comp weights
+ if (f->frame_hdr->switchable_comp_refs) {
+ for (int i = 0; i < 7; i++) {
+ const unsigned ref0poc = f->refp[i].p.frame_hdr->frame_offset;
+
+ for (int j = i + 1; j < 7; j++) {
+ const unsigned ref1poc = f->refp[j].p.frame_hdr->frame_offset;
+
+ const unsigned d1 =
+ imin(abs(get_poc_diff(f->seq_hdr->order_hint_n_bits, ref0poc,
+ f->cur.frame_hdr->frame_offset)), 31);
+ const unsigned d0 =
+ imin(abs(get_poc_diff(f->seq_hdr->order_hint_n_bits, ref1poc,
+ f->cur.frame_hdr->frame_offset)), 31);
+ const int order = d0 <= d1;
+
+ static const uint8_t quant_dist_weight[3][2] = {
+ { 2, 3 }, { 2, 5 }, { 2, 7 }
+ };
+ static const uint8_t quant_dist_lookup_table[4][2] = {
+ { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 }
+ };
+
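+ // compare the scaled distances against each weight bracket; the first
+ // crossover (or k == 3 if none) selects the jnt_comp weight pair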
+ int k;
+ for (k = 0; k < 3; k++) {
+ const int c0 = quant_dist_weight[k][order];
+ const int c1 = quant_dist_weight[k][!order];
+ const int d0_c0 = d0 * c0;
+ const int d1_c1 = d1 * c1;
+ if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break;
+ }
+
+ f->jnt_weights[i][j] = quant_dist_lookup_table[k][order];
+ }
+ }
+ }
+
+ /* Init loopfilter pointers. Increasing NULL pointers is technically UB,
+ * so just point the chroma pointers in 4:0:0 to the luma plane here to
+ * avoid having additional in-loop branches in various places. We never
+ * dereference those pointers so it doesn't really matter what they
+ * point at, as long as the pointers are valid. */
+ const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
+ f->lf.p[0] = f->cur.data[0];
+ f->lf.p[1] = f->cur.data[has_chroma ? 1 : 0];
+ f->lf.p[2] = f->cur.data[has_chroma ? 2 : 0];
+ f->lf.sr_p[0] = f->sr_cur.p.data[0];
+ f->lf.sr_p[1] = f->sr_cur.p.data[has_chroma ? 1 : 0];
+ f->lf.sr_p[2] = f->sr_cur.p.data[has_chroma ? 2 : 0];
+
+ retval = 0;
+error:
+ return retval;
+}
+
+int dav1d_decode_frame_init_cdf(Dav1dFrameContext *const f) {
+ const Dav1dContext *const c = f->c;
+ int retval = DAV1D_ERR(EINVAL);
+
+ if (f->frame_hdr->refresh_context)
+ dav1d_cdf_thread_copy(f->out_cdf.data.cdf, &f->in_cdf);
+
+ // parse individual tiles per tile group
+ int tile_row = 0, tile_col = 0;
+ f->task_thread.update_set = 0;
+ for (int i = 0; i < f->n_tile_data; i++) {
+ const uint8_t *data = f->tile[i].data.data;
+ size_t size = f->tile[i].data.sz;
+
+ for (int j = f->tile[i].start; j <= f->tile[i].end; j++) {
+ size_t tile_sz;
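+ // all but the last tile of a tile group are prefixed with an n_bytes-sized
+ // little-endian length field (stored as size - 1)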
+ if (j == f->tile[i].end) {
+ tile_sz = size;
+ } else {
+ if (f->frame_hdr->tiling.n_bytes > size) goto error;
+ tile_sz = 0;
+ for (unsigned k = 0; k < f->frame_hdr->tiling.n_bytes; k++)
+ tile_sz |= (unsigned)*data++ << (k * 8);
+ tile_sz++;
+ size -= f->frame_hdr->tiling.n_bytes;
+ if (tile_sz > size) goto error;
+ }
+
+ setup_tile(&f->ts[j], f, data, tile_sz, tile_row, tile_col++,
+ c->n_fc > 1 ? f->frame_thread.tile_start_off[j] : 0);
+
+ if (tile_col == f->frame_hdr->tiling.cols) {
+ tile_col = 0;
+ tile_row++;
+ }
+ if (j == f->frame_hdr->tiling.update && f->frame_hdr->refresh_context)
+ f->task_thread.update_set = 1;
+ data += tile_sz;
+ size -= tile_sz;
+ }
+ }
+
+ if (c->n_tc > 1) {
+ const int uses_2pass = c->n_fc > 1;
+ for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows * (1 + uses_2pass); n++)
+ reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr),
+ uses_2pass ? 1 + (n >= f->sb128w * f->frame_hdr->tiling.rows) : 0);
+ }
+
+ retval = 0;
+error:
+ return retval;
+}
+
+int dav1d_decode_frame_main(Dav1dFrameContext *const f) {
+ const Dav1dContext *const c = f->c;
+ int retval = DAV1D_ERR(EINVAL);
+
+ assert(f->c->n_tc == 1);
+
+ Dav1dTaskContext *const t = &c->tc[f - c->fc];
+ t->f = f;
+ t->frame_thread.pass = 0;
+
+ for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++)
+ reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), 0);
+
+ // no threading - we explicitly interleave tile/sbrow decoding
+ // and post-filtering, so that the full process runs in-line
+ for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
+ const int sbh_end =
+ imin(f->frame_hdr->tiling.row_start_sb[tile_row + 1], f->sbh);
+ for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
+ sby < sbh_end; sby++)
+ {
+ t->by = sby << (4 + f->seq_hdr->sb128);
+ const int by_end = (t->by + f->sb_step) >> 1;
+ if (f->frame_hdr->use_ref_frame_mvs) {
+ f->c->refmvs_dsp.load_tmvs(&f->rf, tile_row,
+ 0, f->bw >> 1, t->by >> 1, by_end);
+ }
+ for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
+ t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
+ if (dav1d_decode_tile_sbrow(t)) goto error;
+ }
+ if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ dav1d_refmvs_save_tmvs(&f->c->refmvs_dsp, &t->rt,
+ 0, f->bw >> 1, t->by >> 1, by_end);
+ }
+
+ // loopfilter + cdef + restoration
+ f->bd_fn.filter_sbrow(f, sby);
+ }
+ }
+
+ retval = 0;
+error:
+ return retval;
+}
+
+void dav1d_decode_frame_exit(Dav1dFrameContext *const f, const int retval) {
+ const Dav1dContext *const c = f->c;
+
+ if (f->sr_cur.p.data[0])
+ atomic_init(&f->task_thread.error, 0);
+
+ if (c->n_fc > 1 && retval && f->frame_thread.cf) {
+ memset(f->frame_thread.cf, 0,
+ (size_t)f->frame_thread.cf_sz * 128 * 128 / 2);
+ }
+ for (int i = 0; i < 7; i++) {
+ if (f->refp[i].p.frame_hdr)
+ dav1d_thread_picture_unref(&f->refp[i]);
+ dav1d_ref_dec(&f->ref_mvs_ref[i]);
+ }
+
+ dav1d_picture_unref_internal(&f->cur);
+ dav1d_thread_picture_unref(&f->sr_cur);
+ dav1d_cdf_thread_unref(&f->in_cdf);
+ if (f->frame_hdr && f->frame_hdr->refresh_context) {
+ if (f->out_cdf.progress)
+ atomic_store(f->out_cdf.progress, retval == 0 ? 1 : TILE_ERROR);
+ dav1d_cdf_thread_unref(&f->out_cdf);
+ }
+ dav1d_ref_dec(&f->cur_segmap_ref);
+ dav1d_ref_dec(&f->prev_segmap_ref);
+ dav1d_ref_dec(&f->mvs_ref);
+ dav1d_ref_dec(&f->seq_hdr_ref);
+ dav1d_ref_dec(&f->frame_hdr_ref);
+
+ for (int i = 0; i < f->n_tile_data; i++)
+ dav1d_data_unref_internal(&f->tile[i].data);
+ f->task_thread.retval = retval;
+}
+
+int dav1d_decode_frame(Dav1dFrameContext *const f) {
+ assert(f->c->n_fc == 1);
+ // if n_tc > 1 (but n_fc == 1), we could run init/exit in the task
+ // threads also. Not sure it makes a measurable difference.
+ int res = dav1d_decode_frame_init(f);
+ if (!res) res = dav1d_decode_frame_init_cdf(f);
+ // wait until all threads have completed
+ if (!res) {
+ if (f->c->n_tc > 1) {
+ res = dav1d_task_create_tile_sbrow(f, 0, 1);
+ pthread_mutex_lock(&f->task_thread.ttd->lock);
+ pthread_cond_signal(&f->task_thread.ttd->cond);
+ if (!res) {
+ while (!f->task_thread.done[0] ||
+ atomic_load(&f->task_thread.task_counter) > 0)
+ {
+ pthread_cond_wait(&f->task_thread.cond,
+ &f->task_thread.ttd->lock);
+ }
+ }
+ pthread_mutex_unlock(&f->task_thread.ttd->lock);
+ res = f->task_thread.retval;
+ } else {
+ res = dav1d_decode_frame_main(f);
+ if (!res && f->frame_hdr->refresh_context && f->task_thread.update_set) {
+ dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf,
+ &f->ts[f->frame_hdr->tiling.update].cdf);
+ }
+ }
+ }
+ dav1d_decode_frame_exit(f, res);
+ f->n_tile_data = 0;
+ return res;
+}
+
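+ /* Compute the initial 14-bit fractional x position for the horizontal
+ * super-resolution upscale, splitting the accumulated scale error (err)
+ * evenly around the centre of the output width. */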
+static int get_upscale_x0(const int in_w, const int out_w, const int step) {
+ const int err = out_w * step - (in_w << 14);
+ const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err / 2);
+ return x0 & 0x3fff;
+}
+
+int dav1d_submit_frame(Dav1dContext *const c) {
+ Dav1dFrameContext *f;
+ int res = -1;
+
+ // wait for c->out_delayed[next] and move into c->out if visible
+ Dav1dThreadPicture *out_delayed;
+ if (c->n_fc > 1) {
+ pthread_mutex_lock(&c->task_thread.lock);
+ const unsigned next = c->frame_thread.next++;
+ if (c->frame_thread.next == c->n_fc)
+ c->frame_thread.next = 0;
+
+ f = &c->fc[next];
+ while (f->n_tile_data > 0)
+ pthread_cond_wait(&f->task_thread.cond,
+ &c->task_thread.lock);
+ out_delayed = &c->frame_thread.out_delayed[next];
+ if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
+ unsigned first = atomic_load(&c->task_thread.first);
+ if (first + 1U < c->n_fc)
+ atomic_fetch_add(&c->task_thread.first, 1U);
+ else
+ atomic_store(&c->task_thread.first, 0);
+ atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
+ &first, UINT_MAX);
+ if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
+ c->task_thread.cur--;
+ }
+ const int error = f->task_thread.retval;
+ if (error) {
+ f->task_thread.retval = 0;
+ c->cached_error = error;
+ dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m);
+ dav1d_thread_picture_unref(out_delayed);
+ } else if (out_delayed->p.data[0]) {
+ const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
+ memory_order_relaxed);
+ if ((out_delayed->visible || c->output_invisible_frames) &&
+ progress != FRAME_ERROR)
+ {
+ dav1d_thread_picture_ref(&c->out, out_delayed);
+ c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
+ }
+ dav1d_thread_picture_unref(out_delayed);
+ }
+ } else {
+ f = c->fc;
+ }
+
+ f->seq_hdr = c->seq_hdr;
+ f->seq_hdr_ref = c->seq_hdr_ref;
+ dav1d_ref_inc(f->seq_hdr_ref);
+ f->frame_hdr = c->frame_hdr;
+ f->frame_hdr_ref = c->frame_hdr_ref;
+ c->frame_hdr = NULL;
+ c->frame_hdr_ref = NULL;
+ f->dsp = &c->dsp[f->seq_hdr->hbd];
+
+ const int bpc = 8 + 2 * f->seq_hdr->hbd;
+
+ if (!f->dsp->ipred.intra_pred[DC_PRED]) {
+ Dav1dDSPContext *const dsp = &c->dsp[f->seq_hdr->hbd];
+
+ switch (bpc) {
+#define assign_bitdepth_case(bd) \
+ dav1d_cdef_dsp_init_##bd##bpc(&dsp->cdef); \
+ dav1d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \
+ dav1d_itx_dsp_init_##bd##bpc(&dsp->itx, bpc); \
+ dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \
+ dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr, bpc); \
+ dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \
+ dav1d_film_grain_dsp_init_##bd##bpc(&dsp->fg); \
+ break
+#if CONFIG_8BPC
+ case 8:
+ assign_bitdepth_case(8);
+#endif
+#if CONFIG_16BPC
+ case 10:
+ case 12:
+ assign_bitdepth_case(16);
+#endif
+#undef assign_bitdepth_case
+ default:
+ dav1d_log(c, "Compiled without support for %d-bit decoding\n",
+ 8 + 2 * f->seq_hdr->hbd);
+ res = DAV1D_ERR(ENOPROTOOPT);
+ goto error;
+ }
+ }
+
+#define assign_bitdepth_case(bd) \
+ f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \
+ f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \
+ f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \
+ f->bd_fn.filter_sbrow_deblock_cols = dav1d_filter_sbrow_deblock_cols_##bd##bpc; \
+ f->bd_fn.filter_sbrow_deblock_rows = dav1d_filter_sbrow_deblock_rows_##bd##bpc; \
+ f->bd_fn.filter_sbrow_cdef = dav1d_filter_sbrow_cdef_##bd##bpc; \
+ f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \
+ f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \
+ f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \
+ f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc; \
+ f->bd_fn.copy_pal_block_y = dav1d_copy_pal_block_y_##bd##bpc; \
+ f->bd_fn.copy_pal_block_uv = dav1d_copy_pal_block_uv_##bd##bpc; \
+ f->bd_fn.read_pal_plane = dav1d_read_pal_plane_##bd##bpc; \
+ f->bd_fn.read_pal_uv = dav1d_read_pal_uv_##bd##bpc
+ if (!f->seq_hdr->hbd) {
+#if CONFIG_8BPC
+ assign_bitdepth_case(8);
+#endif
+ } else {
+#if CONFIG_16BPC
+ assign_bitdepth_case(16);
+#endif
+ }
+#undef assign_bitdepth_case
+
+ int ref_coded_width[7];
+ if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ if (f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE) {
+ const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame];
+ if (!c->refs[pri_ref].p.p.data[0]) {
+ res = DAV1D_ERR(EINVAL);
+ goto error;
+ }
+ }
+ for (int i = 0; i < 7; i++) {
+ const int refidx = f->frame_hdr->refidx[i];
+ if (!c->refs[refidx].p.p.data[0] ||
+ f->frame_hdr->width[0] * 2 < c->refs[refidx].p.p.p.w ||
+ f->frame_hdr->height * 2 < c->refs[refidx].p.p.p.h ||
+ f->frame_hdr->width[0] > c->refs[refidx].p.p.p.w * 16 ||
+ f->frame_hdr->height > c->refs[refidx].p.p.p.h * 16 ||
+ f->seq_hdr->layout != c->refs[refidx].p.p.p.layout ||
+ bpc != c->refs[refidx].p.p.p.bpc)
+ {
+ for (int j = 0; j < i; j++)
+ dav1d_thread_picture_unref(&f->refp[j]);
+ res = DAV1D_ERR(EINVAL);
+ goto error;
+ }
+ dav1d_thread_picture_ref(&f->refp[i], &c->refs[refidx].p);
+ ref_coded_width[i] = c->refs[refidx].p.p.frame_hdr->width[0];
+ if (f->frame_hdr->width[0] != c->refs[refidx].p.p.p.w ||
+ f->frame_hdr->height != c->refs[refidx].p.p.p.h)
+ {
+#define scale_fac(ref_sz, this_sz) \
+ ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz))
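+ // 14-bit fixed-point ratio of reference size to current size;
+ // .step is the coarser (>> 4) per-sample increment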
+ f->svc[i][0].scale = scale_fac(c->refs[refidx].p.p.p.w,
+ f->frame_hdr->width[0]);
+ f->svc[i][1].scale = scale_fac(c->refs[refidx].p.p.p.h,
+ f->frame_hdr->height);
+ f->svc[i][0].step = (f->svc[i][0].scale + 8) >> 4;
+ f->svc[i][1].step = (f->svc[i][1].scale + 8) >> 4;
+ } else {
+ f->svc[i][0].scale = f->svc[i][1].scale = 0;
+ }
+ f->gmv_warp_allowed[i] = f->frame_hdr->gmv[i].type > DAV1D_WM_TYPE_TRANSLATION &&
+ !f->frame_hdr->force_integer_mv &&
+ !dav1d_get_shear_params(&f->frame_hdr->gmv[i]) &&
+ !f->svc[i][0].scale;
+ }
+ }
+
+ // setup entropy
+ if (f->frame_hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
+ dav1d_cdf_thread_init_static(&f->in_cdf, f->frame_hdr->quant.yac);
+ } else {
+ const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame];
+ dav1d_cdf_thread_ref(&f->in_cdf, &c->cdf[pri_ref]);
+ }
+ if (f->frame_hdr->refresh_context) {
+ res = dav1d_cdf_thread_alloc(c, &f->out_cdf, c->n_fc > 1);
+ if (res < 0) goto error;
+ }
+
+ // FIXME qsort so tiles are in order (for frame threading)
+ if (f->n_tile_data_alloc < c->n_tile_data) {
+ dav1d_free(f->tile);
+ assert(c->n_tile_data < INT_MAX / (int)sizeof(*f->tile));
+ f->tile = dav1d_malloc(ALLOC_TILE, c->n_tile_data * sizeof(*f->tile));
+ if (!f->tile) {
+ f->n_tile_data_alloc = f->n_tile_data = 0;
+ res = DAV1D_ERR(ENOMEM);
+ goto error;
+ }
+ f->n_tile_data_alloc = c->n_tile_data;
+ }
+ memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile));
+ memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile));
+ f->n_tile_data = c->n_tile_data;
+ c->n_tile_data = 0;
+
+ // allocate frame
+ res = dav1d_thread_picture_alloc(c, f, bpc);
+ if (res < 0) goto error;
+
+ if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+ res = dav1d_picture_alloc_copy(c, &f->cur, f->frame_hdr->width[0], &f->sr_cur.p);
+ if (res < 0) goto error;
+ } else {
+ dav1d_picture_ref(&f->cur, &f->sr_cur.p);
+ }
+
+ if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+ f->resize_step[0] = scale_fac(f->cur.p.w, f->sr_cur.p.p.w);
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int in_cw = (f->cur.p.w + ss_hor) >> ss_hor;
+ const int out_cw = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
+ f->resize_step[1] = scale_fac(in_cw, out_cw);
+#undef scale_fac
+ f->resize_start[0] = get_upscale_x0(f->cur.p.w, f->sr_cur.p.p.w, f->resize_step[0]);
+ f->resize_start[1] = get_upscale_x0(in_cw, out_cw, f->resize_step[1]);
+ }
+
+ // move f->cur into output queue
+ if (c->n_fc == 1) {
+ if (f->frame_hdr->show_frame || c->output_invisible_frames) {
+ dav1d_thread_picture_ref(&c->out, &f->sr_cur);
+ c->event_flags |= dav1d_picture_get_event_flags(&f->sr_cur);
+ }
+ } else {
+ dav1d_thread_picture_ref(out_delayed, &f->sr_cur);
+ }
+
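+ // derive frame dimensions in 4x4 block units and 128x128 superblock units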
+ f->w4 = (f->frame_hdr->width[0] + 3) >> 2;
+ f->h4 = (f->frame_hdr->height + 3) >> 2;
+ f->bw = ((f->frame_hdr->width[0] + 7) >> 3) << 1;
+ f->bh = ((f->frame_hdr->height + 7) >> 3) << 1;
+ f->sb128w = (f->bw + 31) >> 5;
+ f->sb128h = (f->bh + 31) >> 5;
+ f->sb_shift = 4 + f->seq_hdr->sb128;
+ f->sb_step = 16 << f->seq_hdr->sb128;
+ f->sbh = (f->bh + f->sb_step - 1) >> f->sb_shift;
+ f->b4_stride = (f->bw + 31) & ~31;
+ f->bitdepth_max = (1 << f->cur.p.bpc) - 1;
+ atomic_init(&f->task_thread.error, 0);
+ const int uses_2pass = c->n_fc > 1;
+ const int cols = f->frame_hdr->tiling.cols;
+ const int rows = f->frame_hdr->tiling.rows;
+ atomic_store(&f->task_thread.task_counter,
+ (cols * rows + f->sbh) << uses_2pass);
+
+ // ref_mvs
+ if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
+ f->mvs_ref = dav1d_ref_create_using_pool(c->refmvs_pool,
+ sizeof(*f->mvs) * f->sb128h * 16 * (f->b4_stride >> 1));
+ if (!f->mvs_ref) {
+ res = DAV1D_ERR(ENOMEM);
+ goto error;
+ }
+ f->mvs = f->mvs_ref->data;
+ if (!f->frame_hdr->allow_intrabc) {
+ for (int i = 0; i < 7; i++)
+ f->refpoc[i] = f->refp[i].p.frame_hdr->frame_offset;
+ } else {
+ memset(f->refpoc, 0, sizeof(f->refpoc));
+ }
+ if (f->frame_hdr->use_ref_frame_mvs) {
+ for (int i = 0; i < 7; i++) {
+ const int refidx = f->frame_hdr->refidx[i];
+ const int ref_w = ((ref_coded_width[i] + 7) >> 3) << 1;
+ const int ref_h = ((f->refp[i].p.p.h + 7) >> 3) << 1;
+ if (c->refs[refidx].refmvs != NULL &&
+ ref_w == f->bw && ref_h == f->bh)
+ {
+ f->ref_mvs_ref[i] = c->refs[refidx].refmvs;
+ dav1d_ref_inc(f->ref_mvs_ref[i]);
+ f->ref_mvs[i] = c->refs[refidx].refmvs->data;
+ } else {
+ f->ref_mvs[i] = NULL;
+ f->ref_mvs_ref[i] = NULL;
+ }
+ memcpy(f->refrefpoc[i], c->refs[refidx].refpoc,
+ sizeof(*f->refrefpoc));
+ }
+ } else {
+ memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref));
+ }
+ } else {
+ f->mvs_ref = NULL;
+ memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref));
+ }
+
+ // segmap
+ if (f->frame_hdr->segmentation.enabled) {
+ // By default, the previous segmentation map is not initialised.
+ f->prev_segmap_ref = NULL;
+ f->prev_segmap = NULL;
+
+ // We might need a previous frame's segmentation map. This
+ // happens if there is either no update or a temporal update.
+ if (f->frame_hdr->segmentation.temporal || !f->frame_hdr->segmentation.update_map) {
+ const int pri_ref = f->frame_hdr->primary_ref_frame;
+ assert(pri_ref != DAV1D_PRIMARY_REF_NONE);
+ const int ref_w = ((ref_coded_width[pri_ref] + 7) >> 3) << 1;
+ const int ref_h = ((f->refp[pri_ref].p.p.h + 7) >> 3) << 1;
+ if (ref_w == f->bw && ref_h == f->bh) {
+ f->prev_segmap_ref = c->refs[f->frame_hdr->refidx[pri_ref]].segmap;
+ if (f->prev_segmap_ref) {
+ dav1d_ref_inc(f->prev_segmap_ref);
+ f->prev_segmap = f->prev_segmap_ref->data;
+ }
+ }
+ }
+
+ if (f->frame_hdr->segmentation.update_map) {
+ // We're updating an existing map, but need somewhere to
+ // put the new values. Allocate them here (the data
+ // actually gets set elsewhere)
+ f->cur_segmap_ref = dav1d_ref_create_using_pool(c->segmap_pool,
+ sizeof(*f->cur_segmap) * f->b4_stride * 32 * f->sb128h);
+ if (!f->cur_segmap_ref) {
+ dav1d_ref_dec(&f->prev_segmap_ref);
+ res = DAV1D_ERR(ENOMEM);
+ goto error;
+ }
+ f->cur_segmap = f->cur_segmap_ref->data;
+ } else if (f->prev_segmap_ref) {
+ // We're not updating an existing map, and we have a valid
+ // reference. Use that.
+ f->cur_segmap_ref = f->prev_segmap_ref;
+ dav1d_ref_inc(f->cur_segmap_ref);
+ f->cur_segmap = f->prev_segmap_ref->data;
+ } else {
+ // We need to make a new map. Allocate one here and zero it out.
+ const size_t segmap_size = sizeof(*f->cur_segmap) * f->b4_stride * 32 * f->sb128h;
+ f->cur_segmap_ref = dav1d_ref_create_using_pool(c->segmap_pool, segmap_size);
+ if (!f->cur_segmap_ref) {
+ res = DAV1D_ERR(ENOMEM);
+ goto error;
+ }
+ f->cur_segmap = f->cur_segmap_ref->data;
+ memset(f->cur_segmap, 0, segmap_size);
+ }
+ } else {
+ f->cur_segmap = NULL;
+ f->cur_segmap_ref = NULL;
+ f->prev_segmap_ref = NULL;
+ }
+
+ // update references etc.
+ const unsigned refresh_frame_flags = f->frame_hdr->refresh_frame_flags;
+ for (int i = 0; i < 8; i++) {
+ if (refresh_frame_flags & (1 << i)) {
+ if (c->refs[i].p.p.frame_hdr)
+ dav1d_thread_picture_unref(&c->refs[i].p);
+ dav1d_thread_picture_ref(&c->refs[i].p, &f->sr_cur);
+
+ dav1d_cdf_thread_unref(&c->cdf[i]);
+ if (f->frame_hdr->refresh_context) {
+ dav1d_cdf_thread_ref(&c->cdf[i], &f->out_cdf);
+ } else {
+ dav1d_cdf_thread_ref(&c->cdf[i], &f->in_cdf);
+ }
+
+ dav1d_ref_dec(&c->refs[i].segmap);
+ c->refs[i].segmap = f->cur_segmap_ref;
+ if (f->cur_segmap_ref)
+ dav1d_ref_inc(f->cur_segmap_ref);
+ dav1d_ref_dec(&c->refs[i].refmvs);
+ if (!f->frame_hdr->allow_intrabc) {
+ c->refs[i].refmvs = f->mvs_ref;
+ if (f->mvs_ref)
+ dav1d_ref_inc(f->mvs_ref);
+ }
+ memcpy(c->refs[i].refpoc, f->refpoc, sizeof(f->refpoc));
+ }
+ }
+
+ if (c->n_fc == 1) {
+ if ((res = dav1d_decode_frame(f)) < 0) {
+ dav1d_thread_picture_unref(&c->out);
+ for (int i = 0; i < 8; i++) {
+ if (refresh_frame_flags & (1 << i)) {
+ if (c->refs[i].p.p.frame_hdr)
+ dav1d_thread_picture_unref(&c->refs[i].p);
+ dav1d_cdf_thread_unref(&c->cdf[i]);
+ dav1d_ref_dec(&c->refs[i].segmap);
+ dav1d_ref_dec(&c->refs[i].refmvs);
+ }
+ }
+ goto error;
+ }
+ } else {
+ dav1d_task_frame_init(f);
+ pthread_mutex_unlock(&c->task_thread.lock);
+ }
+
+ return 0;
+error:
+ atomic_init(&f->task_thread.error, 1);
+ dav1d_cdf_thread_unref(&f->in_cdf);
+ if (f->frame_hdr->refresh_context)
+ dav1d_cdf_thread_unref(&f->out_cdf);
+ for (int i = 0; i < 7; i++) {
+ if (f->refp[i].p.frame_hdr)
+ dav1d_thread_picture_unref(&f->refp[i]);
+ dav1d_ref_dec(&f->ref_mvs_ref[i]);
+ }
+ if (c->n_fc == 1)
+ dav1d_thread_picture_unref(&c->out);
+ else
+ dav1d_thread_picture_unref(out_delayed);
+ dav1d_picture_unref_internal(&f->cur);
+ dav1d_thread_picture_unref(&f->sr_cur);
+ dav1d_ref_dec(&f->mvs_ref);
+ dav1d_ref_dec(&f->seq_hdr_ref);
+ dav1d_ref_dec(&f->frame_hdr_ref);
+ dav1d_data_props_copy(&c->cached_error_props, &c->in.m);
+
+ for (int i = 0; i < f->n_tile_data; i++)
+ dav1d_data_unref_internal(&f->tile[i].data);
+ f->n_tile_data = 0;
+
+ if (c->n_fc > 1)
+ pthread_mutex_unlock(&c->task_thread.lock);
+
+ return res;
+}
diff --git a/third_party/dav1d/src/decode.h b/third_party/dav1d/src/decode.h
new file mode 100644
index 0000000000..1eae5850a5
--- /dev/null
+++ b/third_party/dav1d/src/decode.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_DECODE_H
+#define DAV1D_SRC_DECODE_H
+
+#include "src/internal.h"
+
+int dav1d_submit_frame(Dav1dContext *c);
+
+#endif /* DAV1D_SRC_DECODE_H */
diff --git a/third_party/dav1d/src/dequant_tables.c b/third_party/dav1d/src/dequant_tables.c
new file mode 100644
index 0000000000..520d727b03
--- /dev/null
+++ b/third_party/dav1d/src/dequant_tables.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "src/dequant_tables.h"
+
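+// Layout sketch (based on how the values line up with the AV1 specification's
+// dc_qlookup/ac_qlookup tables): the three outer tables cover 8-, 10- and
+// 12-bit content, the middle index is the frame quantizer index, and each
+// { dc, ac } pair gives the DC and AC dequantizer step size for that index.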
+const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2] = {
+ {
+ { 4, 4, }, { 8, 8, }, { 8, 9, }, { 9, 10, },
+ { 10, 11, }, { 11, 12, }, { 12, 13, }, { 12, 14, },
+ { 13, 15, }, { 14, 16, }, { 15, 17, }, { 16, 18, },
+ { 17, 19, }, { 18, 20, }, { 19, 21, }, { 19, 22, },
+ { 20, 23, }, { 21, 24, }, { 22, 25, }, { 23, 26, },
+ { 24, 27, }, { 25, 28, }, { 26, 29, }, { 26, 30, },
+ { 27, 31, }, { 28, 32, }, { 29, 33, }, { 30, 34, },
+ { 31, 35, }, { 32, 36, }, { 32, 37, }, { 33, 38, },
+ { 34, 39, }, { 35, 40, }, { 36, 41, }, { 37, 42, },
+ { 38, 43, }, { 38, 44, }, { 39, 45, }, { 40, 46, },
+ { 41, 47, }, { 42, 48, }, { 43, 49, }, { 43, 50, },
+ { 44, 51, }, { 45, 52, }, { 46, 53, }, { 47, 54, },
+ { 48, 55, }, { 48, 56, }, { 49, 57, }, { 50, 58, },
+ { 51, 59, }, { 52, 60, }, { 53, 61, }, { 53, 62, },
+ { 54, 63, }, { 55, 64, }, { 56, 65, }, { 57, 66, },
+ { 57, 67, }, { 58, 68, }, { 59, 69, }, { 60, 70, },
+ { 61, 71, }, { 62, 72, }, { 62, 73, }, { 63, 74, },
+ { 64, 75, }, { 65, 76, }, { 66, 77, }, { 66, 78, },
+ { 67, 79, }, { 68, 80, }, { 69, 81, }, { 70, 82, },
+ { 70, 83, }, { 71, 84, }, { 72, 85, }, { 73, 86, },
+ { 74, 87, }, { 74, 88, }, { 75, 89, }, { 76, 90, },
+ { 77, 91, }, { 78, 92, }, { 78, 93, }, { 79, 94, },
+ { 80, 95, }, { 81, 96, }, { 81, 97, }, { 82, 98, },
+ { 83, 99, }, { 84, 100, }, { 85, 101, }, { 85, 102, },
+ { 87, 104, }, { 88, 106, }, { 90, 108, }, { 92, 110, },
+ { 93, 112, }, { 95, 114, }, { 96, 116, }, { 98, 118, },
+ { 99, 120, }, { 101, 122, }, { 102, 124, }, { 104, 126, },
+ { 105, 128, }, { 107, 130, }, { 108, 132, }, { 110, 134, },
+ { 111, 136, }, { 113, 138, }, { 114, 140, }, { 116, 142, },
+ { 117, 144, }, { 118, 146, }, { 120, 148, }, { 121, 150, },
+ { 123, 152, }, { 125, 155, }, { 127, 158, }, { 129, 161, },
+ { 131, 164, }, { 134, 167, }, { 136, 170, }, { 138, 173, },
+ { 140, 176, }, { 142, 179, }, { 144, 182, }, { 146, 185, },
+ { 148, 188, }, { 150, 191, }, { 152, 194, }, { 154, 197, },
+ { 156, 200, }, { 158, 203, }, { 161, 207, }, { 164, 211, },
+ { 166, 215, }, { 169, 219, }, { 172, 223, }, { 174, 227, },
+ { 177, 231, }, { 180, 235, }, { 182, 239, }, { 185, 243, },
+ { 187, 247, }, { 190, 251, }, { 192, 255, }, { 195, 260, },
+ { 199, 265, }, { 202, 270, }, { 205, 275, }, { 208, 280, },
+ { 211, 285, }, { 214, 290, }, { 217, 295, }, { 220, 300, },
+ { 223, 305, }, { 226, 311, }, { 230, 317, }, { 233, 323, },
+ { 237, 329, }, { 240, 335, }, { 243, 341, }, { 247, 347, },
+ { 250, 353, }, { 253, 359, }, { 257, 366, }, { 261, 373, },
+ { 265, 380, }, { 269, 387, }, { 272, 394, }, { 276, 401, },
+ { 280, 408, }, { 284, 416, }, { 288, 424, }, { 292, 432, },
+ { 296, 440, }, { 300, 448, }, { 304, 456, }, { 309, 465, },
+ { 313, 474, }, { 317, 483, }, { 322, 492, }, { 326, 501, },
+ { 330, 510, }, { 335, 520, }, { 340, 530, }, { 344, 540, },
+ { 349, 550, }, { 354, 560, }, { 359, 571, }, { 364, 582, },
+ { 369, 593, }, { 374, 604, }, { 379, 615, }, { 384, 627, },
+ { 389, 639, }, { 395, 651, }, { 400, 663, }, { 406, 676, },
+ { 411, 689, }, { 417, 702, }, { 423, 715, }, { 429, 729, },
+ { 435, 743, }, { 441, 757, }, { 447, 771, }, { 454, 786, },
+ { 461, 801, }, { 467, 816, }, { 475, 832, }, { 482, 848, },
+ { 489, 864, }, { 497, 881, }, { 505, 898, }, { 513, 915, },
+ { 522, 933, }, { 530, 951, }, { 539, 969, }, { 549, 988, },
+ { 559, 1007, }, { 569, 1026, }, { 579, 1046, }, { 590, 1066, },
+ { 602, 1087, }, { 614, 1108, }, { 626, 1129, }, { 640, 1151, },
+ { 654, 1173, }, { 668, 1196, }, { 684, 1219, }, { 700, 1243, },
+ { 717, 1267, }, { 736, 1292, }, { 755, 1317, }, { 775, 1343, },
+ { 796, 1369, }, { 819, 1396, }, { 843, 1423, }, { 869, 1451, },
+ { 896, 1479, }, { 925, 1508, }, { 955, 1537, }, { 988, 1567, },
+ { 1022, 1597, }, { 1058, 1628, }, { 1098, 1660, }, { 1139, 1692, },
+ { 1184, 1725, }, { 1232, 1759, }, { 1282, 1793, }, { 1336, 1828, },
+ }, {
+ { 4, 4, }, { 9, 9, }, { 10, 11, }, { 13, 13, },
+ { 15, 16, }, { 17, 18, }, { 20, 21, }, { 22, 24, },
+ { 25, 27, }, { 28, 30, }, { 31, 33, }, { 34, 37, },
+ { 37, 40, }, { 40, 44, }, { 43, 48, }, { 47, 51, },
+ { 50, 55, }, { 53, 59, }, { 57, 63, }, { 60, 67, },
+ { 64, 71, }, { 68, 75, }, { 71, 79, }, { 75, 83, },
+ { 78, 88, }, { 82, 92, }, { 86, 96, }, { 90, 100, },
+ { 93, 105, }, { 97, 109, }, { 101, 114, }, { 105, 118, },
+ { 109, 122, }, { 113, 127, }, { 116, 131, }, { 120, 136, },
+ { 124, 140, }, { 128, 145, }, { 132, 149, }, { 136, 154, },
+ { 140, 158, }, { 143, 163, }, { 147, 168, }, { 151, 172, },
+ { 155, 177, }, { 159, 181, }, { 163, 186, }, { 166, 190, },
+ { 170, 195, }, { 174, 199, }, { 178, 204, }, { 182, 208, },
+ { 185, 213, }, { 189, 217, }, { 193, 222, }, { 197, 226, },
+ { 200, 231, }, { 204, 235, }, { 208, 240, }, { 212, 244, },
+ { 215, 249, }, { 219, 253, }, { 223, 258, }, { 226, 262, },
+ { 230, 267, }, { 233, 271, }, { 237, 275, }, { 241, 280, },
+ { 244, 284, }, { 248, 289, }, { 251, 293, }, { 255, 297, },
+ { 259, 302, }, { 262, 306, }, { 266, 311, }, { 269, 315, },
+ { 273, 319, }, { 276, 324, }, { 280, 328, }, { 283, 332, },
+ { 287, 337, }, { 290, 341, }, { 293, 345, }, { 297, 349, },
+ { 300, 354, }, { 304, 358, }, { 307, 362, }, { 310, 367, },
+ { 314, 371, }, { 317, 375, }, { 321, 379, }, { 324, 384, },
+ { 327, 388, }, { 331, 392, }, { 334, 396, }, { 337, 401, },
+ { 343, 409, }, { 350, 417, }, { 356, 425, }, { 362, 433, },
+ { 369, 441, }, { 375, 449, }, { 381, 458, }, { 387, 466, },
+ { 394, 474, }, { 400, 482, }, { 406, 490, }, { 412, 498, },
+ { 418, 506, }, { 424, 514, }, { 430, 523, }, { 436, 531, },
+ { 442, 539, }, { 448, 547, }, { 454, 555, }, { 460, 563, },
+ { 466, 571, }, { 472, 579, }, { 478, 588, }, { 484, 596, },
+ { 490, 604, }, { 499, 616, }, { 507, 628, }, { 516, 640, },
+ { 525, 652, }, { 533, 664, }, { 542, 676, }, { 550, 688, },
+ { 559, 700, }, { 567, 713, }, { 576, 725, }, { 584, 737, },
+ { 592, 749, }, { 601, 761, }, { 609, 773, }, { 617, 785, },
+ { 625, 797, }, { 634, 809, }, { 644, 825, }, { 655, 841, },
+ { 666, 857, }, { 676, 873, }, { 687, 889, }, { 698, 905, },
+ { 708, 922, }, { 718, 938, }, { 729, 954, }, { 739, 970, },
+ { 749, 986, }, { 759, 1002, }, { 770, 1018, }, { 782, 1038, },
+ { 795, 1058, }, { 807, 1078, }, { 819, 1098, }, { 831, 1118, },
+ { 844, 1138, }, { 856, 1158, }, { 868, 1178, }, { 880, 1198, },
+ { 891, 1218, }, { 906, 1242, }, { 920, 1266, }, { 933, 1290, },
+ { 947, 1314, }, { 961, 1338, }, { 975, 1362, }, { 988, 1386, },
+ { 1001, 1411, }, { 1015, 1435, }, { 1030, 1463, }, { 1045, 1491, },
+ { 1061, 1519, }, { 1076, 1547, }, { 1090, 1575, }, { 1105, 1603, },
+ { 1120, 1631, }, { 1137, 1663, }, { 1153, 1695, }, { 1170, 1727, },
+ { 1186, 1759, }, { 1202, 1791, }, { 1218, 1823, }, { 1236, 1859, },
+ { 1253, 1895, }, { 1271, 1931, }, { 1288, 1967, }, { 1306, 2003, },
+ { 1323, 2039, }, { 1342, 2079, }, { 1361, 2119, }, { 1379, 2159, },
+ { 1398, 2199, }, { 1416, 2239, }, { 1436, 2283, }, { 1456, 2327, },
+ { 1476, 2371, }, { 1496, 2415, }, { 1516, 2459, }, { 1537, 2507, },
+ { 1559, 2555, }, { 1580, 2603, }, { 1601, 2651, }, { 1624, 2703, },
+ { 1647, 2755, }, { 1670, 2807, }, { 1692, 2859, }, { 1717, 2915, },
+ { 1741, 2971, }, { 1766, 3027, }, { 1791, 3083, }, { 1817, 3143, },
+ { 1844, 3203, }, { 1871, 3263, }, { 1900, 3327, }, { 1929, 3391, },
+ { 1958, 3455, }, { 1990, 3523, }, { 2021, 3591, }, { 2054, 3659, },
+ { 2088, 3731, }, { 2123, 3803, }, { 2159, 3876, }, { 2197, 3952, },
+ { 2236, 4028, }, { 2276, 4104, }, { 2319, 4184, }, { 2363, 4264, },
+ { 2410, 4348, }, { 2458, 4432, }, { 2508, 4516, }, { 2561, 4604, },
+ { 2616, 4692, }, { 2675, 4784, }, { 2737, 4876, }, { 2802, 4972, },
+ { 2871, 5068, }, { 2944, 5168, }, { 3020, 5268, }, { 3102, 5372, },
+ { 3188, 5476, }, { 3280, 5584, }, { 3375, 5692, }, { 3478, 5804, },
+ { 3586, 5916, }, { 3702, 6032, }, { 3823, 6148, }, { 3953, 6268, },
+ { 4089, 6388, }, { 4236, 6512, }, { 4394, 6640, }, { 4559, 6768, },
+ { 4737, 6900, }, { 4929, 7036, }, { 5130, 7172, }, { 5347, 7312, },
+ }, {
+ { 4, 4 }, { 12, 13 }, { 18, 19 }, { 25, 27 },
+ { 33, 35 }, { 41, 44 }, { 50, 54 }, { 60, 64 },
+ { 70, 75 }, { 80, 87 }, { 91, 99 }, { 103, 112 },
+ { 115, 126 }, { 127, 139 }, { 140, 154 }, { 153, 168 },
+ { 166, 183 }, { 180, 199 }, { 194, 214 }, { 208, 230 },
+ { 222, 247 }, { 237, 263 }, { 251, 280 }, { 266, 297 },
+ { 281, 314 }, { 296, 331 }, { 312, 349 }, { 327, 366 },
+ { 343, 384 }, { 358, 402 }, { 374, 420 }, { 390, 438 },
+ { 405, 456 }, { 421, 475 }, { 437, 493 }, { 453, 511 },
+ { 469, 530 }, { 484, 548 }, { 500, 567 }, { 516, 586 },
+ { 532, 604 }, { 548, 623 }, { 564, 642 }, { 580, 660 },
+ { 596, 679 }, { 611, 698 }, { 627, 716 }, { 643, 735 },
+ { 659, 753 }, { 674, 772 }, { 690, 791 }, { 706, 809 },
+ { 721, 828 }, { 737, 846 }, { 752, 865 }, { 768, 884 },
+ { 783, 902 }, { 798, 920 }, { 814, 939 }, { 829, 957 },
+ { 844, 976 }, { 859, 994 }, { 874, 1012 }, { 889, 1030 },
+ { 904, 1049 }, { 919, 1067 }, { 934, 1085 }, { 949, 1103 },
+ { 964, 1121 }, { 978, 1139 }, { 993, 1157 }, { 1008, 1175 },
+ { 1022, 1193 }, { 1037, 1211 }, { 1051, 1229 }, { 1065, 1246 },
+ { 1080, 1264 }, { 1094, 1282 }, { 1108, 1299 }, { 1122, 1317 },
+ { 1136, 1335 }, { 1151, 1352 }, { 1165, 1370 }, { 1179, 1387 },
+ { 1192, 1405 }, { 1206, 1422 }, { 1220, 1440 }, { 1234, 1457 },
+ { 1248, 1474 }, { 1261, 1491 }, { 1275, 1509 }, { 1288, 1526 },
+ { 1302, 1543 }, { 1315, 1560 }, { 1329, 1577 }, { 1342, 1595 },
+ { 1368, 1627 }, { 1393, 1660 }, { 1419, 1693 }, { 1444, 1725 },
+ { 1469, 1758 }, { 1494, 1791 }, { 1519, 1824 }, { 1544, 1856 },
+ { 1569, 1889 }, { 1594, 1922 }, { 1618, 1954 }, { 1643, 1987 },
+ { 1668, 2020 }, { 1692, 2052 }, { 1717, 2085 }, { 1741, 2118 },
+ { 1765, 2150 }, { 1789, 2183 }, { 1814, 2216 }, { 1838, 2248 },
+ { 1862, 2281 }, { 1885, 2313 }, { 1909, 2346 }, { 1933, 2378 },
+ { 1957, 2411 }, { 1992, 2459 }, { 2027, 2508 }, { 2061, 2556 },
+ { 2096, 2605 }, { 2130, 2653 }, { 2165, 2701 }, { 2199, 2750 },
+ { 2233, 2798 }, { 2267, 2847 }, { 2300, 2895 }, { 2334, 2943 },
+ { 2367, 2992 }, { 2400, 3040 }, { 2434, 3088 }, { 2467, 3137 },
+ { 2499, 3185 }, { 2532, 3234 }, { 2575, 3298 }, { 2618, 3362 },
+ { 2661, 3426 }, { 2704, 3491 }, { 2746, 3555 }, { 2788, 3619 },
+ { 2830, 3684 }, { 2872, 3748 }, { 2913, 3812 }, { 2954, 3876 },
+ { 2995, 3941 }, { 3036, 4005 }, { 3076, 4069 }, { 3127, 4149 },
+ { 3177, 4230 }, { 3226, 4310 }, { 3275, 4390 }, { 3324, 4470 },
+ { 3373, 4550 }, { 3421, 4631 }, { 3469, 4711 }, { 3517, 4791 },
+ { 3565, 4871 }, { 3621, 4967 }, { 3677, 5064 }, { 3733, 5160 },
+ { 3788, 5256 }, { 3843, 5352 }, { 3897, 5448 }, { 3951, 5544 },
+ { 4005, 5641 }, { 4058, 5737 }, { 4119, 5849 }, { 4181, 5961 },
+ { 4241, 6073 }, { 4301, 6185 }, { 4361, 6297 }, { 4420, 6410 },
+ { 4479, 6522 }, { 4546, 6650 }, { 4612, 6778 }, { 4677, 6906 },
+ { 4742, 7034 }, { 4807, 7162 }, { 4871, 7290 }, { 4942, 7435 },
+ { 5013, 7579 }, { 5083, 7723 }, { 5153, 7867 }, { 5222, 8011 },
+ { 5291, 8155 }, { 5367, 8315 }, { 5442, 8475 }, { 5517, 8635 },
+ { 5591, 8795 }, { 5665, 8956 }, { 5745, 9132 }, { 5825, 9308 },
+ { 5905, 9484 }, { 5984, 9660 }, { 6063, 9836 }, { 6149, 10028 },
+ { 6234, 10220 }, { 6319, 10412 }, { 6404, 10604 }, { 6495, 10812 },
+ { 6587, 11020 }, { 6678, 11228 }, { 6769, 11437 }, { 6867, 11661 },
+ { 6966, 11885 }, { 7064, 12109 }, { 7163, 12333 }, { 7269, 12573 },
+ { 7376, 12813 }, { 7483, 13053 }, { 7599, 13309 }, { 7715, 13565 },
+ { 7832, 13821 }, { 7958, 14093 }, { 8085, 14365 }, { 8214, 14637 },
+ { 8352, 14925 }, { 8492, 15213 }, { 8635, 15502 }, { 8788, 15806 },
+ { 8945, 16110 }, { 9104, 16414 }, { 9275, 16734 }, { 9450, 17054 },
+ { 9639, 17390 }, { 9832, 17726 }, { 10031, 18062 }, { 10245, 18414 },
+ { 10465, 18766 }, { 10702, 19134 }, { 10946, 19502 }, { 11210, 19886 },
+ { 11482, 20270 }, { 11776, 20670 }, { 12081, 21070 }, { 12409, 21486 },
+ { 12750, 21902 }, { 13118, 22334 }, { 13501, 22766 }, { 13913, 23214 },
+ { 14343, 23662 }, { 14807, 24126 }, { 15290, 24590 }, { 15812, 25070 },
+ { 16356, 25551 }, { 16943, 26047 }, { 17575, 26559 }, { 18237, 27071 },
+ { 18949, 27599 }, { 19718, 28143 }, { 20521, 28687 }, { 21387, 29247 },
+ }
+};
diff --git a/third_party/dav1d/src/dequant_tables.h b/third_party/dav1d/src/dequant_tables.h
new file mode 100644
index 0000000000..17763377bc
--- /dev/null
+++ b/third_party/dav1d/src/dequant_tables.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_DEQUANT_TABLES_H
+#define DAV1D_SRC_DEQUANT_TABLES_H
+
+#include <stdint.h>
+
+#include "src/levels.h"
+
+EXTERN const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2];
+
+#endif /* DAV1D_SRC_DEQUANT_TABLES_H */
diff --git a/third_party/dav1d/src/env.h b/third_party/dav1d/src/env.h
new file mode 100644
index 0000000000..7b91c4cab6
--- /dev/null
+++ b/third_party/dav1d/src/env.h
@@ -0,0 +1,521 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ENV_H
+#define DAV1D_SRC_ENV_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "src/levels.h"
+#include "src/refmvs.h"
+#include "src/tables.h"
+
+typedef struct BlockContext {
+ uint8_t ALIGN(mode[32], 8);
+ uint8_t ALIGN(lcoef[32], 8);
+ uint8_t ALIGN(ccoef[2][32], 8);
+ uint8_t ALIGN(seg_pred[32], 8);
+ uint8_t ALIGN(skip[32], 8);
+ uint8_t ALIGN(skip_mode[32], 8);
+ uint8_t ALIGN(intra[32], 8);
+ uint8_t ALIGN(comp_type[32], 8);
+ int8_t ALIGN(ref[2][32], 8); // -1 means intra
+ uint8_t ALIGN(filter[2][32], 8); // 3 means unset
+ int8_t ALIGN(tx_intra[32], 8);
+ int8_t ALIGN(tx[32], 8);
+ uint8_t ALIGN(tx_lpf_y[32], 8);
+ uint8_t ALIGN(tx_lpf_uv[32], 8);
+ uint8_t ALIGN(partition[16], 8);
+ uint8_t ALIGN(uvmode[32], 8);
+ uint8_t ALIGN(pal_sz[32], 8);
+} BlockContext;
+
+static inline int get_intra_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+ if (have_left) {
+ if (have_top) {
+ const int ctx = l->intra[yb4] + a->intra[xb4];
+ return ctx + (ctx == 2);
+ } else
+ return l->intra[yb4] * 2;
+ } else {
+ return have_top ? a->intra[xb4] * 2 : 0;
+ }
+}
+
+static inline int get_tx_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const TxfmInfo *const max_tx,
+ const int yb4, const int xb4)
+{
+ return (l->tx_intra[yb4] >= max_tx->lh) + (a->tx_intra[xb4] >= max_tx->lw);
+}
+
+static inline int get_partition_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const enum BlockLevel bl,
+ const int yb8, const int xb8)
+{
+ return ((a->partition[xb8] >> (4 - bl)) & 1) +
+ (((l->partition[yb8] >> (4 - bl)) & 1) << 1);
+}
+
+static inline unsigned gather_left_partition_prob(const uint16_t *const in,
+ const enum BlockLevel bl)
+{
+ unsigned out = in[PARTITION_H - 1] - in[PARTITION_H];
+ // Exploit the fact that cdfs for PARTITION_SPLIT, PARTITION_T_TOP_SPLIT,
+ // PARTITION_T_BOTTOM_SPLIT and PARTITION_T_LEFT_SPLIT are neighbors.
+ out += in[PARTITION_SPLIT - 1] - in[PARTITION_T_LEFT_SPLIT];
+ if (bl != BL_128X128)
+ out += in[PARTITION_H4 - 1] - in[PARTITION_H4];
+ return out;
+}
+
+static inline unsigned gather_top_partition_prob(const uint16_t *const in,
+ const enum BlockLevel bl)
+{
+ // Exploit the fact that cdfs for PARTITION_V, PARTITION_SPLIT and
+ // PARTITION_T_TOP_SPLIT are neighbors.
+ unsigned out = in[PARTITION_V - 1] - in[PARTITION_T_TOP_SPLIT];
+ // Exploit the facts that cdfs for PARTITION_T_LEFT_SPLIT and
+ // PARTITION_T_RIGHT_SPLIT are neighbors, the probability for
+ // PARTITION_V4 is always zero, and the probability for
+ // PARTITION_T_RIGHT_SPLIT is zero in 128x128 blocks.
+ out += in[PARTITION_T_LEFT_SPLIT - 1];
+ if (bl != BL_128X128)
+ out += in[PARTITION_V4 - 1] - in[PARTITION_T_RIGHT_SPLIT];
+ return out;
+}
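+
+// A sketch of the arithmetic the two gather helpers above rely on, assuming
+// the top-down CDF layout where in[k - 1] - in[k] is the probability of
+// symbol k: probabilities of adjacent symbols telescope, e.g.
+//   P(SPLIT) + P(T_TOP_SPLIT) + P(T_BOTTOM_SPLIT) + P(T_LEFT_SPLIT)
+//     = in[PARTITION_SPLIT - 1] - in[PARTITION_T_LEFT_SPLIT],
+// which is the second term accumulated in gather_left_partition_prob().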
+
+static inline enum TxfmType get_uv_inter_txtp(const TxfmInfo *const uvt_dim,
+ const enum TxfmType ytxtp)
+{
+ if (uvt_dim->max == TX_32X32)
+ return ytxtp == IDTX ? IDTX : DCT_DCT;
+ if (uvt_dim->min == TX_16X16 &&
+ ((1 << ytxtp) & ((1 << H_FLIPADST) | (1 << V_FLIPADST) |
+ (1 << H_ADST) | (1 << V_ADST))))
+ {
+ return DCT_DCT;
+ }
+
+ return ytxtp;
+}
+
+static inline int get_filter_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int comp, const int dir, const int ref,
+ const int yb4, const int xb4)
+{
+ const int a_filter = (a->ref[0][xb4] == ref || a->ref[1][xb4] == ref) ?
+ a->filter[dir][xb4] : DAV1D_N_SWITCHABLE_FILTERS;
+ const int l_filter = (l->ref[0][yb4] == ref || l->ref[1][yb4] == ref) ?
+ l->filter[dir][yb4] : DAV1D_N_SWITCHABLE_FILTERS;
+
+ if (a_filter == l_filter) {
+ return comp * 4 + a_filter;
+ } else if (a_filter == DAV1D_N_SWITCHABLE_FILTERS) {
+ return comp * 4 + l_filter;
+ } else if (l_filter == DAV1D_N_SWITCHABLE_FILTERS) {
+ return comp * 4 + a_filter;
+ } else {
+ return comp * 4 + DAV1D_N_SWITCHABLE_FILTERS;
+ }
+}
+
+static inline int get_comp_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+ if (have_top) {
+ if (have_left) {
+ if (a->comp_type[xb4]) {
+ if (l->comp_type[yb4]) {
+ return 4;
+ } else {
+ // 4U means intra (-1) or bwd (>= 4)
+ return 2 + ((unsigned)l->ref[0][yb4] >= 4U);
+ }
+ } else if (l->comp_type[yb4]) {
+ // 4U means intra (-1) or bwd (>= 4)
+ return 2 + ((unsigned)a->ref[0][xb4] >= 4U);
+ } else {
+ return (l->ref[0][yb4] >= 4) ^ (a->ref[0][xb4] >= 4);
+ }
+ } else {
+ return a->comp_type[xb4] ? 3 : a->ref[0][xb4] >= 4;
+ }
+ } else if (have_left) {
+ return l->comp_type[yb4] ? 3 : l->ref[0][yb4] >= 4;
+ } else {
+ return 1;
+ }
+}
+
+static inline int get_comp_dir_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+#define has_uni_comp(edge, off) \
+ ((edge->ref[0][off] < 4) == (edge->ref[1][off] < 4))
+
+ if (have_top && have_left) {
+ const int a_intra = a->intra[xb4], l_intra = l->intra[yb4];
+
+ if (a_intra && l_intra) return 2;
+ if (a_intra || l_intra) {
+ const BlockContext *const edge = a_intra ? l : a;
+ const int off = a_intra ? yb4 : xb4;
+
+ if (edge->comp_type[off] == COMP_INTER_NONE) return 2;
+ return 1 + 2 * has_uni_comp(edge, off);
+ }
+
+ const int a_comp = a->comp_type[xb4] != COMP_INTER_NONE;
+ const int l_comp = l->comp_type[yb4] != COMP_INTER_NONE;
+ const int a_ref0 = a->ref[0][xb4], l_ref0 = l->ref[0][yb4];
+
+ if (!a_comp && !l_comp) {
+ return 1 + 2 * ((a_ref0 >= 4) == (l_ref0 >= 4));
+ } else if (!a_comp || !l_comp) {
+ const BlockContext *const edge = a_comp ? a : l;
+ const int off = a_comp ? xb4 : yb4;
+
+ if (!has_uni_comp(edge, off)) return 1;
+ return 3 + ((a_ref0 >= 4) == (l_ref0 >= 4));
+ } else {
+ const int a_uni = has_uni_comp(a, xb4), l_uni = has_uni_comp(l, yb4);
+
+ if (!a_uni && !l_uni) return 0;
+ if (!a_uni || !l_uni) return 2;
+ return 3 + ((a_ref0 == 4) == (l_ref0 == 4));
+ }
+ } else if (have_top || have_left) {
+ const BlockContext *const edge = have_left ? l : a;
+ const int off = have_left ? yb4 : xb4;
+
+ if (edge->intra[off]) return 2;
+ if (edge->comp_type[off] == COMP_INTER_NONE) return 2;
+ return 4 * has_uni_comp(edge, off);
+ } else {
+ return 2;
+ }
+}
+
+static inline int get_poc_diff(const int order_hint_n_bits,
+ const int poc0, const int poc1)
+{
+ if (!order_hint_n_bits) return 0;
+ const int mask = 1 << (order_hint_n_bits - 1);
+ const int diff = poc0 - poc1;
+ return (diff & (mask - 1)) - (diff & mask);
+}
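+
+// A worked example with hypothetical values: with order_hint_n_bits == 7 the
+// order hints wrap modulo 128, so for poc0 == 2 and poc1 == 126 the raw
+// difference is -124; (diff & 63) is 4 and (diff & 64) is 0, so the result is
+// +4, i.e. poc 2 is treated as four frames after poc 126 rather than 124
+// frames before it.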
+
+static inline int get_jnt_comp_ctx(const int order_hint_n_bits,
+ const unsigned poc, const unsigned ref0poc,
+ const unsigned ref1poc,
+ const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4)
+{
+ const unsigned d0 = abs(get_poc_diff(order_hint_n_bits, ref0poc, poc));
+ const unsigned d1 = abs(get_poc_diff(order_hint_n_bits, poc, ref1poc));
+ const int offset = d0 == d1;
+ const int a_ctx = a->comp_type[xb4] >= COMP_INTER_AVG ||
+ a->ref[0][xb4] == 6;
+ const int l_ctx = l->comp_type[yb4] >= COMP_INTER_AVG ||
+ l->ref[0][yb4] == 6;
+
+ return 3 * offset + a_ctx + l_ctx;
+}
+
+static inline int get_mask_comp_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4)
+{
+ const int a_ctx = a->comp_type[xb4] >= COMP_INTER_SEG ? 1 :
+ a->ref[0][xb4] == 6 ? 3 : 0;
+ const int l_ctx = l->comp_type[yb4] >= COMP_INTER_SEG ? 1 :
+ l->ref[0][yb4] == 6 ? 3 : 0;
+
+ return imin(a_ctx + l_ctx, 5);
+}
+
+#define av1_get_ref_2_ctx av1_get_bwd_ref_ctx
+#define av1_get_ref_3_ctx av1_get_fwd_ref_ctx
+#define av1_get_ref_4_ctx av1_get_fwd_ref_1_ctx
+#define av1_get_ref_5_ctx av1_get_fwd_ref_2_ctx
+#define av1_get_ref_6_ctx av1_get_bwd_ref_1_ctx
+#define av1_get_uni_p_ctx av1_get_ref_ctx
+#define av1_get_uni_p2_ctx av1_get_fwd_ref_2_ctx
+
+static inline int av1_get_ref_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ int have_top, int have_left)
+{
+ int cnt[2] = { 0 };
+
+ if (have_top && !a->intra[xb4]) {
+ cnt[a->ref[0][xb4] >= 4]++;
+ if (a->comp_type[xb4]) cnt[a->ref[1][xb4] >= 4]++;
+ }
+
+ if (have_left && !l->intra[yb4]) {
+ cnt[l->ref[0][yb4] >= 4]++;
+ if (l->comp_type[yb4]) cnt[l->ref[1][yb4] >= 4]++;
+ }
+
+ return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
+}
+
+static inline int av1_get_fwd_ref_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+ int cnt[4] = { 0 };
+
+ if (have_top && !a->intra[xb4]) {
+ if (a->ref[0][xb4] < 4) cnt[a->ref[0][xb4]]++;
+ if (a->comp_type[xb4] && a->ref[1][xb4] < 4) cnt[a->ref[1][xb4]]++;
+ }
+
+ if (have_left && !l->intra[yb4]) {
+ if (l->ref[0][yb4] < 4) cnt[l->ref[0][yb4]]++;
+ if (l->comp_type[yb4] && l->ref[1][yb4] < 4) cnt[l->ref[1][yb4]]++;
+ }
+
+ cnt[0] += cnt[1];
+ cnt[2] += cnt[3];
+
+ return cnt[0] == cnt[2] ? 1 : cnt[0] < cnt[2] ? 0 : 2;
+}
+
+static inline int av1_get_fwd_ref_1_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+ int cnt[2] = { 0 };
+
+ if (have_top && !a->intra[xb4]) {
+ if (a->ref[0][xb4] < 2) cnt[a->ref[0][xb4]]++;
+ if (a->comp_type[xb4] && a->ref[1][xb4] < 2) cnt[a->ref[1][xb4]]++;
+ }
+
+ if (have_left && !l->intra[yb4]) {
+ if (l->ref[0][yb4] < 2) cnt[l->ref[0][yb4]]++;
+ if (l->comp_type[yb4] && l->ref[1][yb4] < 2) cnt[l->ref[1][yb4]]++;
+ }
+
+ return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
+}
+
+static inline int av1_get_fwd_ref_2_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+ int cnt[2] = { 0 };
+
+ if (have_top && !a->intra[xb4]) {
+ if ((a->ref[0][xb4] ^ 2U) < 2) cnt[a->ref[0][xb4] - 2]++;
+ if (a->comp_type[xb4] && (a->ref[1][xb4] ^ 2U) < 2) cnt[a->ref[1][xb4] - 2]++;
+ }
+
+ if (have_left && !l->intra[yb4]) {
+ if ((l->ref[0][yb4] ^ 2U) < 2) cnt[l->ref[0][yb4] - 2]++;
+ if (l->comp_type[yb4] && (l->ref[1][yb4] ^ 2U) < 2) cnt[l->ref[1][yb4] - 2]++;
+ }
+
+ return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
+}
+
+static inline int av1_get_bwd_ref_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+ int cnt[3] = { 0 };
+
+ if (have_top && !a->intra[xb4]) {
+ if (a->ref[0][xb4] >= 4) cnt[a->ref[0][xb4] - 4]++;
+ if (a->comp_type[xb4] && a->ref[1][xb4] >= 4) cnt[a->ref[1][xb4] - 4]++;
+ }
+
+ if (have_left && !l->intra[yb4]) {
+ if (l->ref[0][yb4] >= 4) cnt[l->ref[0][yb4] - 4]++;
+ if (l->comp_type[yb4] && l->ref[1][yb4] >= 4) cnt[l->ref[1][yb4] - 4]++;
+ }
+
+ cnt[1] += cnt[0];
+
+ return cnt[2] == cnt[1] ? 1 : cnt[1] < cnt[2] ? 0 : 2;
+}
+
+static inline int av1_get_bwd_ref_1_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+ int cnt[3] = { 0 };
+
+ if (have_top && !a->intra[xb4]) {
+ if (a->ref[0][xb4] >= 4) cnt[a->ref[0][xb4] - 4]++;
+ if (a->comp_type[xb4] && a->ref[1][xb4] >= 4) cnt[a->ref[1][xb4] - 4]++;
+ }
+
+ if (have_left && !l->intra[yb4]) {
+ if (l->ref[0][yb4] >= 4) cnt[l->ref[0][yb4] - 4]++;
+ if (l->comp_type[yb4] && l->ref[1][yb4] >= 4) cnt[l->ref[1][yb4] - 4]++;
+ }
+
+ return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
+}
+
+static inline int av1_get_uni_p1_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+ int cnt[3] = { 0 };
+
+ if (have_top && !a->intra[xb4]) {
+ if (a->ref[0][xb4] - 1U < 3) cnt[a->ref[0][xb4] - 1]++;
+ if (a->comp_type[xb4] && a->ref[1][xb4] - 1U < 3) cnt[a->ref[1][xb4] - 1]++;
+ }
+
+ if (have_left && !l->intra[yb4]) {
+ if (l->ref[0][yb4] - 1U < 3) cnt[l->ref[0][yb4] - 1]++;
+ if (l->comp_type[yb4] && l->ref[1][yb4] - 1U < 3) cnt[l->ref[1][yb4] - 1]++;
+ }
+
+ cnt[1] += cnt[2];
+
+ return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
+}
+
+static inline int get_drl_context(const refmvs_candidate *const ref_mv_stack,
+ const int ref_idx)
+{
+ if (ref_mv_stack[ref_idx].weight >= 640)
+ return ref_mv_stack[ref_idx + 1].weight < 640;
+
+ return ref_mv_stack[ref_idx + 1].weight < 640 ? 2 : 0;
+}
+
+static inline unsigned get_cur_frame_segid(const int by, const int bx,
+ const int have_top,
+ const int have_left,
+ int *const seg_ctx,
+ const uint8_t *cur_seg_map,
+ const ptrdiff_t stride)
+{
+ cur_seg_map += bx + by * stride;
+ if (have_left && have_top) {
+ const int l = cur_seg_map[-1];
+ const int a = cur_seg_map[-stride];
+ const int al = cur_seg_map[-(stride + 1)];
+
+ if (l == a && al == l) *seg_ctx = 2;
+ else if (l == a || al == l || a == al) *seg_ctx = 1;
+ else *seg_ctx = 0;
+ return a == al ? a : l;
+ } else {
+ *seg_ctx = 0;
+ return have_left ? cur_seg_map[-1] : have_top ? cur_seg_map[-stride] : 0;
+ }
+}
+
+static inline void fix_int_mv_precision(mv *const mv) {
+ mv->x = (mv->x - (mv->x >> 15) + 3) & ~7U;
+ mv->y = (mv->y - (mv->y >> 15) + 3) & ~7U;
+}
+
+static inline void fix_mv_precision(const Dav1dFrameHeader *const hdr,
+ mv *const mv)
+{
+ if (hdr->force_integer_mv) {
+ fix_int_mv_precision(mv);
+ } else if (!hdr->hp) {
+ mv->x = (mv->x - (mv->x >> 15)) & ~1U;
+ mv->y = (mv->y - (mv->y >> 15)) & ~1U;
+ }
+}
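+
+// A few worked examples in 1/8-pel units (hypothetical values):
+// fix_int_mv_precision() rounds to the nearest full pel with ties toward
+// zero: 5 -> 8, 4 -> 0, -4 -> 0, -5 -> -8. With hp disabled,
+// fix_mv_precision() drops the high-precision bit, rounding toward zero to
+// 1/4-pel: 5 -> 4, -5 -> -4.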
+
+static inline mv get_gmv_2d(const Dav1dWarpedMotionParams *const gmv,
+ const int bx4, const int by4,
+ const int bw4, const int bh4,
+ const Dav1dFrameHeader *const hdr)
+{
+ switch (gmv->type) {
+ case DAV1D_WM_TYPE_ROT_ZOOM:
+ assert(gmv->matrix[5] == gmv->matrix[2]);
+ assert(gmv->matrix[4] == -gmv->matrix[3]);
+ // fall-through
+ default:
+ case DAV1D_WM_TYPE_AFFINE: {
+ const int x = bx4 * 4 + bw4 * 2 - 1;
+ const int y = by4 * 4 + bh4 * 2 - 1;
+ const int xc = (gmv->matrix[2] - (1 << 16)) * x +
+ gmv->matrix[3] * y + gmv->matrix[0];
+ const int yc = (gmv->matrix[5] - (1 << 16)) * y +
+ gmv->matrix[4] * x + gmv->matrix[1];
+ const int shift = 16 - (3 - !hdr->hp);
+ const int round = (1 << shift) >> 1;
+ mv res = (mv) {
+ .y = apply_sign(((abs(yc) + round) >> shift) << !hdr->hp, yc),
+ .x = apply_sign(((abs(xc) + round) >> shift) << !hdr->hp, xc),
+ };
+ if (hdr->force_integer_mv)
+ fix_int_mv_precision(&res);
+ return res;
+ }
+ case DAV1D_WM_TYPE_TRANSLATION: {
+ mv res = (mv) {
+ .y = gmv->matrix[0] >> 13,
+ .x = gmv->matrix[1] >> 13,
+ };
+ if (hdr->force_integer_mv)
+ fix_int_mv_precision(&res);
+ return res;
+ }
+ case DAV1D_WM_TYPE_IDENTITY:
+ return (mv) { .x = 0, .y = 0 };
+ }
+}
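+
+// A note on the fixed-point arithmetic above: the warp matrix uses 16
+// fractional bits (matrix[2] and matrix[5] carry an implicit 1 << 16 unity
+// term, which is why it is subtracted), and shifting by 16 - (3 - !hdr->hp)
+// followed by << !hdr->hp yields a 1/8-pel motion vector whose low bit is
+// forced to zero when high-precision MVs are disabled.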
+
+#endif /* DAV1D_SRC_ENV_H */
diff --git a/third_party/dav1d/src/ext/x86/x86inc.asm b/third_party/dav1d/src/ext/x86/x86inc.asm
new file mode 100644
index 0000000000..68b1f74f4b
--- /dev/null
+++ b/third_party/dav1d/src/ext/x86/x86inc.asm
@@ -0,0 +1,1902 @@
+;*****************************************************************************
+;* x86inc.asm: x86 abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2022 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Henrik Gramner <henrik@gramner.com>
+;* Anton Mitrofanov <BugMaster@narod.ru>
+;* Fiona Glaser <fiona@x264.com>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x86inc.asm assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used.
+
+%ifndef private_prefix
+ %error private_prefix not defined
+%endif
+
+%ifndef public_prefix
+ %define public_prefix private_prefix
+%endif
+
+%ifndef STACK_ALIGNMENT
+ %if ARCH_X86_64
+ %define STACK_ALIGNMENT 16
+ %else
+ %define STACK_ALIGNMENT 4
+ %endif
+%endif
+
+%define WIN64 0
+%define UNIX64 0
+%if ARCH_X86_64
+ %ifidn __OUTPUT_FORMAT__,win32
+ %define WIN64 1
+ %elifidn __OUTPUT_FORMAT__,win64
+ %define WIN64 1
+ %elifidn __OUTPUT_FORMAT__,x64
+ %define WIN64 1
+ %else
+ %define UNIX64 1
+ %endif
+%endif
+
+%define FORMAT_ELF 0
+%define FORMAT_MACHO 0
+%ifidn __OUTPUT_FORMAT__,elf
+ %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf32
+ %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf64
+ %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,macho
+ %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho32
+ %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho64
+ %define FORMAT_MACHO 1
+%endif
+
+%ifdef PREFIX
+ %define mangle(x) _ %+ x
+%else
+ %define mangle(x) x
+%endif
+
+; Use VEX-encoding even in non-AVX functions
+%ifndef FORCE_VEX_ENCODING
+ %define FORCE_VEX_ENCODING 0
+%endif
+
+%macro SECTION_RODATA 0-1 16
+ %ifidn __OUTPUT_FORMAT__,win32
+ SECTION .rdata align=%1
+ %elif WIN64
+ SECTION .rdata align=%1
+ %else
+ SECTION .rodata align=%1
+ %endif
+%endmacro
+
+%if ARCH_X86_64
+ %define PIC 1 ; always use PIC on x86-64
+ default rel
+%elifidn __OUTPUT_FORMAT__,win32
+ %define PIC 0 ; PIC isn't used on 32-bit Windows
+%elifndef PIC
+ %define PIC 0
+%endif
+
+%define HAVE_PRIVATE_EXTERN 1
+%ifdef __NASM_VER__
+ %use smartalign
+ %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
+ %define HAVE_PRIVATE_EXTERN 0
+ %endif
+%endif
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most use cases.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = (optional) stack size to be allocated. The stack will be aligned before
+; allocating the specified stack size. If the required stack alignment is
+; larger than the known stack alignment the stack will be manually aligned
+; and an extra register will be allocated to hold the original stack
+; pointer (to not invalidate r0m etc.). To prevent the use of an extra
+; register as stack pointer, request a negative stack size.
+; %4+/%5+ = list of names to define to registers
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,7,0x40, dst, src, tmp
+; declares a function (foo) that automatically loads two arguments (dst and
+; src) into registers, uses one additional register (tmp) plus 7 vector
+; registers (m0-m6) and allocates 0x40 bytes of stack space.
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need a more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE, and returns.
+
+; REP_RET:
+; Use this instead of RET if it's a branch target.
+
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
+
+%macro DECLARE_REG 2-3
+ %define r%1q %2
+ %define r%1d %2d
+ %define r%1w %2w
+ %define r%1b %2b
+ %define r%1h %2h
+ %define %2q %2
+ %if %0 == 2
+ %define r%1m %2d
+ %define r%1mp %2
+ %elif ARCH_X86_64 ; memory
+ %define r%1m [rstk + stack_offset + %3]
+ %define r%1mp qword r %+ %1 %+ m
+ %else
+ %define r%1m [rstk + stack_offset + %3]
+ %define r%1mp dword r %+ %1 %+ m
+ %endif
+ %define r%1 %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 3
+ %define r%1q r%1
+ %define e%1q r%1
+ %define r%1d e%1
+ %define e%1d e%1
+ %define r%1w %1
+ %define e%1w %1
+ %define r%1h %3
+ %define e%1h %3
+ %define r%1b %2
+ %define e%1b %2
+ %if ARCH_X86_64 == 0
+ %define r%1 e%1
+ %endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
+
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-*
+ %assign %%i 0
+ %rep %0
+ CAT_XDEFINE t, %%i, r%1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-*
+ %rep %0
+ %define t%1q t%1 %+ q
+ %define t%1d t%1 %+ d
+ %define t%1w t%1 %+ w
+ %define t%1h t%1 %+ h
+ %define t%1b t%1 %+ b
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+
+%if ARCH_X86_64
+ %define gprsize 8
+%else
+ %define gprsize 4
+%endif
+
+%macro LEA 2
+%if ARCH_X86_64
+ lea %1, [%2]
+%elif PIC
+    call $+5 ; special-cased to not affect the RSB on most CPUs
+ pop %1
+ add %1, (%2)-$+1
+%else
+ mov %1, %2
+%endif
+%endmacro
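+
+; A note on the PIC branch above: the 5-byte call targets the instruction
+; directly after it, so the popped return address is the address of the pop
+; itself, i.e. $-1 as seen from the add; adding (%2)-$+1 therefore leaves the
+; absolute run-time address of %2 in %1 using only position-independent
+; arithmetic.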
+
+; Repeats an instruction/operation for multiple arguments.
+; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3"
+%macro REPX 2-* ; operation, args
+ %xdefine %%f(x) %1
+ %rep %0 - 1
+ %rotate 1
+ %%f(%1)
+ %endrep
+%endmacro
+
+%macro PUSH 1
+ push %1
+ %ifidn rstk, rsp
+ %assign stack_offset stack_offset+gprsize
+ %endif
+%endmacro
+
+%macro POP 1
+ pop %1
+ %ifidn rstk, rsp
+ %assign stack_offset stack_offset-gprsize
+ %endif
+%endmacro
+
+%macro PUSH_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ PUSH r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro POP_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ pop r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-*
+ %rep %0
+ %if %1 < num_args
+ mov r%1, r %+ %1 %+ mp
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SUB 2
+ sub %1, %2
+ %ifidn %1, rstk
+ %assign stack_offset stack_offset+(%2)
+ %endif
+%endmacro
+
+%macro ADD 2
+ add %1, %2
+ %ifidn %1, rstk
+ %assign stack_offset stack_offset-(%2)
+ %endif
+%endmacro
+
+%macro movifnidn 2
+ %ifnidn %1, %2
+ mov %1, %2
+ %endif
+%endmacro
+
+%if ARCH_X86_64 == 0
+ %define movsxd movifnidn
+%endif
+
+%macro movsxdifnidn 2
+ %ifnidn %1, %2
+ movsxd %1, %2
+ %endif
+%endmacro
+
+%macro ASSERT 1
+ %if (%1) == 0
+ %error assertion ``%1'' failed
+ %endif
+%endmacro
+
+%macro DEFINE_ARGS 0-*
+ %ifdef n_arg_names
+ %assign %%i 0
+ %rep n_arg_names
+ CAT_UNDEF arg_name %+ %%i, q
+ CAT_UNDEF arg_name %+ %%i, d
+ CAT_UNDEF arg_name %+ %%i, w
+ CAT_UNDEF arg_name %+ %%i, h
+ CAT_UNDEF arg_name %+ %%i, b
+ CAT_UNDEF arg_name %+ %%i, m
+ CAT_UNDEF arg_name %+ %%i, mp
+ CAT_UNDEF arg_name, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+
+ %xdefine %%stack_offset stack_offset
+ %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
+ %assign %%i 0
+ %rep %0
+ %xdefine %1q r %+ %%i %+ q
+ %xdefine %1d r %+ %%i %+ d
+ %xdefine %1w r %+ %%i %+ w
+ %xdefine %1h r %+ %%i %+ h
+ %xdefine %1b r %+ %%i %+ b
+ %xdefine %1m r %+ %%i %+ m
+ %xdefine %1mp r %+ %%i %+ mp
+ CAT_XDEFINE arg_name, %%i, %1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+ %xdefine stack_offset %%stack_offset
+ %assign n_arg_names %0
+%endmacro
+
+%define required_stack_alignment ((mmsize + 15) & ~15)
+%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
+%define high_mm_regs (16*cpuflag(avx512))
+
+; Large stack allocations on Windows need to use stack probing in order
+; to guarantee that all stack memory is committed before accessing it.
+; This is done by ensuring that the guard page(s) at the end of the
+; currently committed pages are touched prior to any pages beyond that.
+%if WIN64
+ %assign STACK_PROBE_SIZE 8192
+%elifidn __OUTPUT_FORMAT__, win32
+ %assign STACK_PROBE_SIZE 4096
+%else
+ %assign STACK_PROBE_SIZE 0
+%endif
+
+%macro PROBE_STACK 1 ; stack_size
+ %if STACK_PROBE_SIZE
+ %assign %%i STACK_PROBE_SIZE
+ %rep %1 / STACK_PROBE_SIZE
+ mov eax, [rsp-%%i]
+ %assign %%i %%i+STACK_PROBE_SIZE
+ %endrep
+ %endif
+%endmacro
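+
+; For example (ignoring alignment padding), a 20 KiB allocation on WIN64
+; (STACK_PROBE_SIZE == 8192) expands to two probe loads ahead of the actual
+; stack adjustment done later by ALLOC_STACK:
+;     mov eax, [rsp-8192]
+;     mov eax, [rsp-16384]
+;     sub rsp, 20480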
+
+%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only)
+ %ifnum %1
+ %if %1 != 0
+ %assign %%pad 0
+ %assign stack_size %1
+ %if stack_size < 0
+ %assign stack_size -stack_size
+ %endif
+ %if WIN64
+ %assign %%pad %%pad + 32 ; shadow space
+ %if mmsize != 8
+ %assign xmm_regs_used %2
+ %if xmm_regs_used > 8
+ %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
+ %endif
+ %endif
+ %endif
+ %if required_stack_alignment <= STACK_ALIGNMENT
+ ; maintain the current stack alignment
+ %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+ PROBE_STACK stack_size_padded
+ SUB rsp, stack_size_padded
+ %else
+ %assign %%reg_num (regs_used - 1)
+ %xdefine rstk r %+ %%reg_num
+ ; align stack, and save original stack location directly above
+ ; it, i.e. in [rsp+stack_size_padded], so we can restore the
+ ; stack in a single instruction (i.e. mov rsp, rstk or mov
+ ; rsp, [rsp+stack_size_padded])
+ %if %1 < 0 ; need to store rsp on stack
+ %xdefine rstkm [rsp + stack_size + %%pad]
+ %assign %%pad %%pad + gprsize
+ %else ; can keep rsp in rstk during whole function
+ %xdefine rstkm rstk
+ %endif
+ %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
+ PROBE_STACK stack_size_padded
+ mov rstk, rsp
+ and rsp, ~(required_stack_alignment-1)
+ sub rsp, stack_size_padded
+ movifnidn rstkm, rstk
+ %endif
+ WIN64_PUSH_XMM
+ %endif
+ %endif
+%endmacro
+
+%macro SETUP_STACK_POINTER 0-1 0
+ %ifnum %1
+ %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
+ %if %1 > 0
+ ; Reserve an additional register for storing the original stack pointer, but avoid using
+ ; eax/rax for this purpose since it can potentially get overwritten as a return value.
+ %assign regs_used (regs_used + 1)
+ %if ARCH_X86_64 && regs_used == 7
+ %assign regs_used 8
+ %elif ARCH_X86_64 == 0 && regs_used == 1
+ %assign regs_used 2
+ %endif
+ %endif
+ %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
+ ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
+ ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
+ %assign regs_used 5 + UNIX64 * 3
+ %endif
+ %endif
+ %endif
+%endmacro
+
+%if WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0, rcx
+DECLARE_REG 1, rdx
+DECLARE_REG 2, R8
+DECLARE_REG 3, R9
+DECLARE_REG 4, R10, 40
+DECLARE_REG 5, R11, 48
+DECLARE_REG 6, rax, 56
+DECLARE_REG 7, rdi, 64
+DECLARE_REG 8, rsi, 72
+DECLARE_REG 9, rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R14, 96
+DECLARE_REG 12, R15, 104
+DECLARE_REG 13, R12, 112
+DECLARE_REG 14, R13, 120
+
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ SETUP_STACK_POINTER %4
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
+ ALLOC_STACK %4, %3
+ %if mmsize != 8 && stack_size == 0
+ WIN64_SPILL_XMM %3
+ %endif
+ LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+ %if %0 > 4
+ %ifnum %4
+ DEFINE_ARGS %5
+ %else
+ DEFINE_ARGS %4, %5
+ %endif
+ %elifnnum %4
+ DEFINE_ARGS %4
+ %endif
+%endmacro
+
+%macro WIN64_PUSH_XMM 0
+    ; Use the shadow space to store XMM6 and XMM7; the rest needs stack space allocated.
+ %if xmm_regs_used > 6 + high_mm_regs
+ movaps [rstk + stack_offset + 8], xmm6
+ %endif
+ %if xmm_regs_used > 7 + high_mm_regs
+ movaps [rstk + stack_offset + 24], xmm7
+ %endif
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
+ %assign %%i 8
+ %rep %%xmm_regs_on_stack
+ movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
+%macro WIN64_SPILL_XMM 1
+ %assign xmm_regs_used %1
+ ASSERT xmm_regs_used <= 16 + high_mm_regs
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
+ ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
+ %assign %%pad %%xmm_regs_on_stack*16 + 32
+ %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+ SUB rsp, stack_size_padded
+ %endif
+ WIN64_PUSH_XMM
+%endmacro
+
+%macro WIN64_RESTORE_XMM_INTERNAL 0
+ %assign %%pad_size 0
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
+ %assign %%i xmm_regs_used - high_mm_regs
+ %rep %%xmm_regs_on_stack
+ %assign %%i %%i-1
+ movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
+ %endrep
+ %endif
+ %if stack_size_padded > 0
+ %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
+ mov rsp, rstkm
+ %else
+ add rsp, stack_size_padded
+ %assign %%pad_size stack_size_padded
+ %endif
+ %endif
+ %if xmm_regs_used > 7 + high_mm_regs
+ movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
+ %endif
+ %if xmm_regs_used > 6 + high_mm_regs
+ movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
+ %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM 0
+ WIN64_RESTORE_XMM_INTERNAL
+ %assign stack_offset (stack_offset-stack_size_padded)
+ %assign stack_size_padded 0
+ %assign xmm_regs_used 0
+%endmacro
+
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
+
+%macro RET 0
+ WIN64_RESTORE_XMM_INTERNAL
+ POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+ %if vzeroupper_required
+ vzeroupper
+ %endif
+ AUTO_REP_RET
+%endmacro
+
+%elif ARCH_X86_64 ; *nix x64 ;=============================================
+
+DECLARE_REG 0, rdi
+DECLARE_REG 1, rsi
+DECLARE_REG 2, rdx
+DECLARE_REG 3, rcx
+DECLARE_REG 4, R8
+DECLARE_REG 5, R9
+DECLARE_REG 6, rax, 8
+DECLARE_REG 7, R10, 16
+DECLARE_REG 8, R11, 24
+DECLARE_REG 9, rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R14, 48
+DECLARE_REG 12, R15, 56
+DECLARE_REG 13, R12, 64
+DECLARE_REG 14, R13, 72
+
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ %assign xmm_regs_used %3
+ ASSERT regs_used >= num_args
+ SETUP_STACK_POINTER %4
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 9, 10, 11, 12, 13, 14
+ ALLOC_STACK %4
+ LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
+ %if %0 > 4
+ %ifnum %4
+ DEFINE_ARGS %5
+ %else
+ DEFINE_ARGS %4, %5
+ %endif
+ %elifnnum %4
+ DEFINE_ARGS %4
+ %endif
+%endmacro
+
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
+
+%macro RET 0
+ %if stack_size_padded > 0
+ %if required_stack_alignment > STACK_ALIGNMENT
+ mov rsp, rstkm
+ %else
+ add rsp, stack_size_padded
+ %endif
+ %endif
+ POP_IF_USED 14, 13, 12, 11, 10, 9
+ %if vzeroupper_required
+ vzeroupper
+ %endif
+ AUTO_REP_RET
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, 4
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
+%define rsp esp
+
+%macro DECLARE_ARG 1-*
+ %rep %0
+ %define r%1m [rstk + stack_offset + 4*%1 + 4]
+ %define r%1mp dword r%1m
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
+
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ %if num_args > 7
+ %assign num_args 7
+ %endif
+ %if regs_used > 7
+ %assign regs_used 7
+ %endif
+ SETUP_STACK_POINTER %4
+ ASSERT regs_used <= 7
+ PUSH_IF_USED 3, 4, 5, 6
+ ALLOC_STACK %4
+ LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
+ %if %0 > 4
+ %ifnum %4
+ DEFINE_ARGS %5
+ %else
+ DEFINE_ARGS %4, %5
+ %endif
+ %elifnnum %4
+ DEFINE_ARGS %4
+ %endif
+%endmacro
+
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
+
+%macro RET 0
+ %if stack_size_padded > 0
+ %if required_stack_alignment > STACK_ALIGNMENT
+ mov rsp, rstkm
+ %else
+ add rsp, stack_size_padded
+ %endif
+ %endif
+ POP_IF_USED 6, 5, 4, 3
+ %if vzeroupper_required
+ vzeroupper
+ %endif
+ AUTO_REP_RET
+%endmacro
+
+%endif ;======================================================================
+
+%if WIN64 == 0
+ %macro WIN64_SPILL_XMM 1
+ %assign xmm_regs_used %1
+ %endmacro
+ %macro WIN64_RESTORE_XMM 0
+ %assign xmm_regs_used 0
+ %endmacro
+ %macro WIN64_PUSH_XMM 0
+ %endmacro
+%endif
+
+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
+%macro REP_RET 0
+ %if has_epilogue || cpuflag(ssse3)
+ RET
+ %else
+ rep ret
+ %endif
+ annotate_function_size
+%endmacro
+
+%define last_branch_adr $$
+%macro AUTO_REP_RET 0
+ %if notcpuflag(ssse3)
+ times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
+ %endif
+ ret
+ annotate_function_size
+%endmacro
+
+%macro BRANCH_INSTR 0-*
+ %rep %0
+ %macro %1 1-2 %1
+ %2 %1
+ %if notcpuflag(ssse3)
+ %%branch_instr equ $
+ %xdefine last_branch_adr %%branch_instr
+ %endif
+ %endmacro
+ %rotate 1
+ %endrep
+%endmacro
+
+BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
+
+%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
+ %if has_epilogue
+ call %1
+ RET
+ %elif %2
+ jmp %1
+ %endif
+ annotate_function_size
+%endmacro
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
+; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
+ cglobal_internal 1, %1 %+ SUFFIX, %2
+%endmacro
+%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
+ cglobal_internal 0, %1 %+ SUFFIX, %2
+%endmacro
+%macro cglobal_internal 2-3+
+ annotate_function_size
+ %ifndef cglobaled_%2
+ %if %1
+ %xdefine %2 mangle(private_prefix %+ _ %+ %2)
+ %else
+ %xdefine %2 mangle(public_prefix %+ _ %+ %2)
+ %endif
+ %xdefine %2.skip_prologue %2 %+ .skip_prologue
+ CAT_XDEFINE cglobaled_, %2, 1
+ %endif
+ %xdefine current_function %2
+ %xdefine current_function_section __SECT__
+ %if FORMAT_ELF
+ %if %1
+ global %2:function hidden
+ %else
+ global %2:function
+ %endif
+ %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
+ global %2:private_extern
+ %else
+ global %2
+ %endif
+ align function_align
+ %2:
+ RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer
+ %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+ %assign stack_offset 0 ; stack pointer offset relative to the return address
+ %assign stack_size 0 ; amount of stack space that can be freely used inside a function
+ %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+ %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
+ %ifnidn %3, ""
+ PROLOGUE %3
+ %endif
+%endmacro
+
+; Create a global symbol from a local label with the correct name mangling and type
+%macro cglobal_label 1
+ %if FORMAT_ELF
+ global current_function %+ %1:function hidden
+ %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
+ global current_function %+ %1:private_extern
+ %else
+ global current_function %+ %1
+ %endif
+ %1:
+%endmacro
+
+%macro cextern 1
+ %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+; like cextern, but without the prefix
+%macro cextern_naked 1
+ %ifdef PREFIX
+ %xdefine %1 mangle(%1)
+ %endif
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+%macro const 1-2+
+ %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+ %if FORMAT_ELF
+ global %1:data hidden
+ %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
+ global %1:private_extern
+ %else
+ global %1
+ %endif
+ %1: %2
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
+%if FORMAT_ELF
+ [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
+%endif
+
+; Tell debuggers how large the function was.
+; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
+; RET and similar macros invoke this, and cglobal also invokes it for the previous
+; function; but if the last function in a source file doesn't end with any of the
+; standard epilogue macros, its size may be left unspecified.
+%macro annotate_function_size 0
+ %ifdef __YASM_VER__
+ %ifdef current_function
+ %if FORMAT_ELF
+ current_function_section
+ %%ecf equ $
+ size current_function %%ecf - current_function
+ __SECT__
+ %endif
+ %endif
+ %endif
+%endmacro
+
+; cpuflags
+
+%assign cpuflags_mmx (1<<0)
+%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
+%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
+%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
+%assign cpuflags_sse (1<<4) | cpuflags_mmx2
+%assign cpuflags_sse2 (1<<5) | cpuflags_sse
+%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
+%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2
+%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
+%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3
+%assign cpuflags_sse4 (1<<10) | cpuflags_ssse3
+%assign cpuflags_sse42 (1<<11) | cpuflags_sse4
+%assign cpuflags_aesni (1<<12) | cpuflags_sse42
+%assign cpuflags_gfni (1<<13) | cpuflags_sse42
+%assign cpuflags_avx (1<<14) | cpuflags_sse42
+%assign cpuflags_xop (1<<15) | cpuflags_avx
+%assign cpuflags_fma4 (1<<16) | cpuflags_avx
+%assign cpuflags_fma3 (1<<17) | cpuflags_avx
+%assign cpuflags_bmi1 (1<<18) | cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2 (1<<19) | cpuflags_bmi1
+%assign cpuflags_avx2 (1<<20) | cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512 (1<<21) | cpuflags_avx2 ; F, CD, BW, DQ, VL
+%assign cpuflags_avx512icl (1<<22) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ
+
+%assign cpuflags_cache32 (1<<23)
+%assign cpuflags_cache64 (1<<24)
+%assign cpuflags_aligned (1<<25) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<26)
+
+; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
+%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
+%define notcpuflag(x) (cpuflag(x) ^ 1)
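+; The mask-subtract-shift trick yields 1 when every bit required by the flag is set
+; in cpuflags and 0 otherwise, e.g. cpuflag(ssse3) is 1 inside an avx2 function
+; while cpuflag(avx512) is 0.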
+
+; Takes an arbitrary number of cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
+%macro INIT_CPUFLAGS 0-*
+ %xdefine SUFFIX
+ %undef cpuname
+ %assign cpuflags 0
+
+ %if %0 >= 1
+ %rep %0
+ %ifdef cpuname
+ %xdefine cpuname cpuname %+ _%1
+ %else
+ %xdefine cpuname %1
+ %endif
+ %assign cpuflags cpuflags | cpuflags_%1
+ %rotate 1
+ %endrep
+ %xdefine SUFFIX _ %+ cpuname
+
+ %if cpuflag(avx)
+ %assign avx_enabled 1
+ %endif
+ %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
+ %define mova movaps
+ %define movu movups
+ %define movnta movntps
+ %endif
+ %if cpuflag(aligned)
+ %define movu mova
+ %elif cpuflag(sse3) && notcpuflag(ssse3)
+ %define movu lddqu
+ %endif
+ %endif
+
+ %if ARCH_X86_64 || cpuflag(sse2)
+ %ifdef __NASM_VER__
+ ALIGNMODE p6
+ %else
+ CPU amdnop
+ %endif
+ %else
+ %ifdef __NASM_VER__
+ ALIGNMODE nop
+ %else
+ CPU basicnop
+ %endif
+ %endif
+%endmacro
+
+; Merge mmx, sse*, and avx*
+; m# is a simd register of the currently selected size
+; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
+; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
+; (All 4 remain in sync through SWAP.)
+
+%macro CAT_XDEFINE 3
+ %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2
+ %undef %1%2
+%endmacro
+
+%macro DEFINE_MMREGS 1 ; mmtype
+ %assign %%prev_mmregs 0
+ %ifdef num_mmregs
+ %assign %%prev_mmregs num_mmregs
+ %endif
+
+ %assign num_mmregs 8
+ %if ARCH_X86_64 && mmsize >= 16
+ %assign num_mmregs 16
+ %if cpuflag(avx512) || mmsize == 64
+ %assign num_mmregs 32
+ %endif
+ %endif
+
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, %1 %+ %%i
+ CAT_XDEFINE nn%1, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %if %%prev_mmregs > num_mmregs
+ %rep %%prev_mmregs - num_mmregs
+ CAT_UNDEF m, %%i
+ CAT_UNDEF nn %+ mmtype, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+ %xdefine mmtype %1
+%endmacro
+
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
+ %if ARCH_X86_64 && cpuflag(avx512)
+ %assign %%i %1
+ %rep 16-%1
+ %assign %%i_high %%i+16
+ SWAP %%i, %%i_high
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
+%macro INIT_MMX 0-1+
+ %assign avx_enabled 0
+ %define RESET_MM_PERMUTATION INIT_MMX %1
+ %define mmsize 8
+ %define mova movq
+ %define movu movq
+ %define movh movd
+ %define movnta movntq
+ INIT_CPUFLAGS %1
+ DEFINE_MMREGS mm
+%endmacro
+
+%macro INIT_XMM 0-1+
+ %assign avx_enabled FORCE_VEX_ENCODING
+ %define RESET_MM_PERMUTATION INIT_XMM %1
+ %define mmsize 16
+ %define mova movdqa
+ %define movu movdqu
+ %define movh movq
+ %define movnta movntdq
+ INIT_CPUFLAGS %1
+ DEFINE_MMREGS xmm
+ %if WIN64
+ AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
+ %endif
+ %xdefine bcstd 1to4
+ %xdefine bcstq 1to2
+%endmacro
+
+%macro INIT_YMM 0-1+
+ %assign avx_enabled 1
+ %define RESET_MM_PERMUTATION INIT_YMM %1
+ %define mmsize 32
+ %define mova movdqa
+ %define movu movdqu
+ %undef movh
+ %define movnta movntdq
+ INIT_CPUFLAGS %1
+ DEFINE_MMREGS ymm
+ AVX512_MM_PERMUTATION
+ %xdefine bcstd 1to8
+ %xdefine bcstq 1to4
+%endmacro
+
+%macro INIT_ZMM 0-1+
+ %assign avx_enabled 1
+ %define RESET_MM_PERMUTATION INIT_ZMM %1
+ %define mmsize 64
+ %define mova movdqa
+ %define movu movdqu
+ %undef movh
+ %define movnta movntdq
+ INIT_CPUFLAGS %1
+ DEFINE_MMREGS zmm
+ AVX512_MM_PERMUTATION
+ %xdefine bcstd 1to16
+ %xdefine bcstq 1to8
+%endmacro
+
+INIT_XMM
+
+%macro DECLARE_MMCAST 1
+ %define mmmm%1 mm%1
+ %define mmxmm%1 mm%1
+ %define mmymm%1 mm%1
+ %define mmzmm%1 mm%1
+ %define xmmmm%1 mm%1
+ %define xmmxmm%1 xmm%1
+ %define xmmymm%1 xmm%1
+ %define xmmzmm%1 xmm%1
+ %define ymmmm%1 mm%1
+ %define ymmxmm%1 xmm%1
+ %define ymmymm%1 ymm%1
+ %define ymmzmm%1 ymm%1
+ %define zmmmm%1 mm%1
+ %define zmmxmm%1 xmm%1
+ %define zmmymm%1 ymm%1
+ %define zmmzmm%1 zmm%1
+ %define xm%1 xmm %+ m%1
+ %define ym%1 ymm %+ m%1
+ %define zm%1 zmm %+ m%1
+%endmacro
+
+%assign i 0
+%rep 32
+ DECLARE_MMCAST i
+ %assign i i+1
+%endrep
+
+; I often want to use macros that permute their arguments. e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
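+;
+; e.g. after "SWAP 0, 1", any subsequent reference to m0 resolves to the register
+; previously named m1 (and vice versa); no instructions are emitted.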
+
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+ %rep %0/2
+ %xdefine %%tmp%2 m%2
+ %rotate 2
+ %endrep
+ %rep %0/2
+ %xdefine m%1 %%tmp%2
+ CAT_XDEFINE nn, m%1, %1
+ %rotate 2
+ %endrep
+%endmacro
+
+%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
+ %ifnum %1 ; SWAP 0, 1, ...
+ SWAP_INTERNAL_NUM %1, %2
+ %else ; SWAP m0, m1, ...
+ SWAP_INTERNAL_NAME %1, %2
+ %endif
+%endmacro
+
+%macro SWAP_INTERNAL_NUM 2-*
+ %rep %0-1
+ %xdefine %%tmp m%1
+ %xdefine m%1 m%2
+ %xdefine m%2 %%tmp
+ CAT_XDEFINE nn, m%1, %1
+ CAT_XDEFINE nn, m%2, %2
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SWAP_INTERNAL_NAME 2-*
+ %xdefine %%args nn %+ %1
+ %rep %0-1
+ %xdefine %%args %%args, nn %+ %2
+ %rotate 1
+ %endrep
+ SWAP_INTERNAL_NUM %%args
+%endmacro
+
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1
+ %if %0
+ %xdefine %%f %1_m
+ %else
+ %xdefine %%f current_function %+ _m
+ %endif
+ %assign %%i 0
+ %rep num_mmregs
+ %xdefine %%tmp m %+ %%i
+ CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 0-1 ; name to load from
+ %if %0
+ %xdefine %%f %1_m
+ %else
+ %xdefine %%f current_function %+ _m
+ %endif
+ %xdefine %%tmp %%f %+ 0
+ %ifnum %%tmp
+ DEFINE_MMREGS mmtype
+ %assign %%i 0
+ %rep num_mmregs
+ %xdefine %%tmp %%f %+ %%i
+ CAT_XDEFINE %%m, %%i, m %+ %%tmp
+ %assign %%i %%i+1
+ %endrep
+ %rep num_mmregs
+ %assign %%i %%i-1
+ CAT_XDEFINE m, %%i, %%m %+ %%i
+ CAT_XDEFINE nn, m %+ %%i, %%i
+ %endrep
+ %endif
+%endmacro
+
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
+%macro call 1
+ %ifid %1
+ call_internal %1 %+ SUFFIX, %1
+ %else
+ call %1
+ %endif
+%endmacro
+%macro call_internal 2
+ %xdefine %%i %2
+ %ifndef cglobaled_%2
+ %ifdef cglobaled_%1
+ %xdefine %%i %1
+ %endif
+ %endif
+ call %%i
+ LOAD_MM_PERMUTATION %%i
+%endmacro
+
+; Substitutions that reduce instruction size but are functionally equivalent
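+; e.g. +128 does not fit in a sign-extended 8-bit immediate but -128 does,
+; so "add reg, 128" is encoded more compactly as "sub reg, -128".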
+%macro add 2
+ %ifnum %2
+ %if %2==128
+ sub %1, -128
+ %else
+ add %1, %2
+ %endif
+ %else
+ add %1, %2
+ %endif
+%endmacro
+
+%macro sub 2
+ %ifnum %2
+ %if %2==128
+ add %1, -128
+ %else
+ sub %1, %2
+ %endif
+ %else
+ sub %1, %2
+ %endif
+%endmacro
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%assign i 0
+%rep 32
+ %if i < 8
+ CAT_XDEFINE sizeofmm, i, 8
+ CAT_XDEFINE regnumofmm, i, i
+ %endif
+ CAT_XDEFINE sizeofxmm, i, 16
+ CAT_XDEFINE sizeofymm, i, 32
+ CAT_XDEFINE sizeofzmm, i, 64
+ CAT_XDEFINE regnumofxmm, i, i
+ CAT_XDEFINE regnumofymm, i, i
+ CAT_XDEFINE regnumofzmm, i, i
+ %assign i i+1
+%endrep
+%undef i
+
+%macro CHECK_AVX_INSTR_EMU 3-*
+ %xdefine %%opcode %1
+ %xdefine %%dst %2
+ %rep %0-2
+ %ifidn %%dst, %3
+ %error non-avx emulation of ``%%opcode'' is not supported
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+;%6+: operands
+%macro RUN_AVX_INSTR 6-9+
+ %ifnum sizeof%7
+ %assign __sizeofreg sizeof%7
+ %elifnum sizeof%6
+ %assign __sizeofreg sizeof%6
+ %else
+ %assign __sizeofreg mmsize
+ %endif
+ %assign __emulate_avx 0
+ %if avx_enabled && __sizeofreg >= 16
+ %xdefine __instr v%1
+ %else
+ %xdefine __instr %1
+ %if %0 >= 8+%4
+ %assign __emulate_avx 1
+ %endif
+ %endif
+ %ifnidn %2, fnord
+ %ifdef cpuname
+ %if notcpuflag(%2)
+ %error use of ``%1'' %2 instruction in cpuname function: current_function
+ %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
+ %error use of ``%1'' sse2 instruction in cpuname function: current_function
+ %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
+ %error use of ``%1'' avx2 instruction in cpuname function: current_function
+ %elif __sizeofreg == 16 && notcpuflag(sse)
+ %error use of ``%1'' sse instruction in cpuname function: current_function
+ %elif __sizeofreg == 32 && notcpuflag(avx)
+ %error use of ``%1'' avx instruction in cpuname function: current_function
+ %elif __sizeofreg == 64 && notcpuflag(avx512)
+ %error use of ``%1'' avx512 instruction in cpuname function: current_function
+ %elifidn %1, pextrw ; special case because the base instruction is mmx2,
+ %ifnid %6 ; but sse4 is required for memory operands
+ %if notcpuflag(sse4)
+ %error use of ``%1'' sse4 instruction in cpuname function: current_function
+ %endif
+ %endif
+ %endif
+ %endif
+ %endif
+
+ %if __emulate_avx
+ %xdefine __src1 %7
+ %xdefine __src2 %8
+ %if %5 && %4 == 0
+ %ifnidn %6, %7
+ %ifidn %6, %8
+ %xdefine __src1 %8
+ %xdefine __src2 %7
+ %elifnnum sizeof%8
+ ; 3-operand AVX instructions with a memory arg can only have it in src2,
+ ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
+ ; So, if the instruction is commutative with a memory arg, swap them.
+ %xdefine __src1 %8
+ %xdefine __src2 %7
+ %endif
+ %endif
+ %endif
+ %ifnidn %6, __src1
+ %if %0 >= 9
+ CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
+ %else
+ CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
+ %endif
+ %if __sizeofreg == 8
+ MOVQ %6, __src1
+ %elif %3
+ MOVAPS %6, __src1
+ %else
+ MOVDQA %6, __src1
+ %endif
+ %endif
+ %if %0 >= 9
+ %1 %6, __src2, %9
+ %else
+ %1 %6, __src2
+ %endif
+ %elif %0 >= 9
+ %if avx_enabled && __sizeofreg >= 16 && %4 == 1
+ %ifnnum regnumof%7
+ %if %3
+ vmovaps %6, %7
+ %else
+ vmovdqa %6, %7
+ %endif
+ __instr %6, %6, %8, %9
+ %else
+ __instr %6, %7, %8, %9
+ %endif
+ %else
+ __instr %6, %7, %8, %9
+ %endif
+ %elif %0 == 8
+ %if avx_enabled && __sizeofreg >= 16 && %4 == 0
+ %xdefine __src1 %7
+ %xdefine __src2 %8
+ %if %5
+ %ifnum regnumof%7
+ %ifnum regnumof%8
+ %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
+ ; Most VEX-encoded instructions require an additional byte to encode when
+ ; src2 is a high register (e.g. m8..15). If the instruction is commutative
+ ; we can swap src1 and src2 when doing so reduces the instruction length.
+ %xdefine __src1 %8
+ %xdefine __src2 %7
+ %endif
+ %endif
+ %elifnum regnumof%8 ; put memory operands in src2 when possible
+ %xdefine __src1 %8
+ %xdefine __src2 %7
+ %else
+ %assign __emulate_avx 1
+ %endif
+ %elifnnum regnumof%7
+ ; EVEX allows imm8 shift instructions to be used with memory operands,
+ ; but VEX does not. This handles those special cases.
+ %ifnnum %8
+ %assign __emulate_avx 1
+ %elif notcpuflag(avx512)
+ %assign __emulate_avx 1
+ %endif
+ %endif
+ %if __emulate_avx ; a separate load is required
+ %if %3
+ vmovaps %6, %7
+ %else
+ vmovdqa %6, %7
+ %endif
+ __instr %6, %6, %8
+ %else
+ __instr %6, __src1, __src2
+ %endif
+ %else
+ __instr %6, %7, %8
+ %endif
+ %elif %0 == 7
+ %if avx_enabled && __sizeofreg >= 16 && %5
+ %xdefine __src1 %6
+ %xdefine __src2 %7
+ %ifnum regnumof%6
+ %ifnum regnumof%7
+ %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
+ %xdefine __src1 %7
+ %xdefine __src2 %6
+ %endif
+ %endif
+ %endif
+ __instr %6, __src1, __src2
+ %else
+ __instr %6, %7
+ %endif
+ %else
+ __instr %6
+ %endif
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
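+;
+; e.g. after "AVX_INSTR paddw, mmx, 0, 0, 1", writing "paddw m0, m1, m2" emits
+; vpaddw when AVX is enabled and otherwise falls back to a movdqa into m0
+; followed by a two-operand paddw (the mov is skipped if m0 already equals m1).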
+%macro AVX_INSTR 1-5 fnord, 0, 255, 0
+ %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
+ %ifidn %2, fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
+ %elifidn %3, fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
+ %elifidn %4, fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
+ %elifidn %5, fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
+ %else
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
+ %endif
+ %endmacro
+%endmacro
+
+; Instructions with both VEX/EVEX and legacy encodings
+; Non-destructive instructions are written without parameters
+AVX_INSTR addpd, sse2, 1, 0, 1
+AVX_INSTR addps, sse, 1, 0, 1
+AVX_INSTR addsd, sse2, 1, 0, 0
+AVX_INSTR addss, sse, 1, 0, 0
+AVX_INSTR addsubpd, sse3, 1, 0, 0
+AVX_INSTR addsubps, sse3, 1, 0, 0
+AVX_INSTR aesdec, aesni, 0, 0, 0
+AVX_INSTR aesdeclast, aesni, 0, 0, 0
+AVX_INSTR aesenc, aesni, 0, 0, 0
+AVX_INSTR aesenclast, aesni, 0, 0, 0
+AVX_INSTR aesimc, aesni
+AVX_INSTR aeskeygenassist, aesni
+AVX_INSTR andnpd, sse2, 1, 0, 0
+AVX_INSTR andnps, sse, 1, 0, 0
+AVX_INSTR andpd, sse2, 1, 0, 1
+AVX_INSTR andps, sse, 1, 0, 1
+AVX_INSTR blendpd, sse4, 1, 1, 0
+AVX_INSTR blendps, sse4, 1, 1, 0
+AVX_INSTR blendvpd, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
+AVX_INSTR blendvps, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
+AVX_INSTR cmpeqpd, sse2, 1, 0, 1
+AVX_INSTR cmpeqps, sse, 1, 0, 1
+AVX_INSTR cmpeqsd, sse2, 1, 0, 0
+AVX_INSTR cmpeqss, sse, 1, 0, 0
+AVX_INSTR cmplepd, sse2, 1, 0, 0
+AVX_INSTR cmpleps, sse, 1, 0, 0
+AVX_INSTR cmplesd, sse2, 1, 0, 0
+AVX_INSTR cmpless, sse, 1, 0, 0
+AVX_INSTR cmpltpd, sse2, 1, 0, 0
+AVX_INSTR cmpltps, sse, 1, 0, 0
+AVX_INSTR cmpltsd, sse2, 1, 0, 0
+AVX_INSTR cmpltss, sse, 1, 0, 0
+AVX_INSTR cmpneqpd, sse2, 1, 0, 1
+AVX_INSTR cmpneqps, sse, 1, 0, 1
+AVX_INSTR cmpneqsd, sse2, 1, 0, 0
+AVX_INSTR cmpneqss, sse, 1, 0, 0
+AVX_INSTR cmpnlepd, sse2, 1, 0, 0
+AVX_INSTR cmpnleps, sse, 1, 0, 0
+AVX_INSTR cmpnlesd, sse2, 1, 0, 0
+AVX_INSTR cmpnless, sse, 1, 0, 0
+AVX_INSTR cmpnltpd, sse2, 1, 0, 0
+AVX_INSTR cmpnltps, sse, 1, 0, 0
+AVX_INSTR cmpnltsd, sse2, 1, 0, 0
+AVX_INSTR cmpnltss, sse, 1, 0, 0
+AVX_INSTR cmpordpd, sse2, 1, 0, 1
+AVX_INSTR cmpordps, sse, 1, 0, 1
+AVX_INSTR cmpordsd, sse2, 1, 0, 0
+AVX_INSTR cmpordss, sse, 1, 0, 0
+AVX_INSTR cmppd, sse2, 1, 1, 0
+AVX_INSTR cmpps, sse, 1, 1, 0
+AVX_INSTR cmpsd, sse2, 1, 1, 0
+AVX_INSTR cmpss, sse, 1, 1, 0
+AVX_INSTR cmpunordpd, sse2, 1, 0, 1
+AVX_INSTR cmpunordps, sse, 1, 0, 1
+AVX_INSTR cmpunordsd, sse2, 1, 0, 0
+AVX_INSTR cmpunordss, sse, 1, 0, 0
+AVX_INSTR comisd, sse2, 1
+AVX_INSTR comiss, sse, 1
+AVX_INSTR cvtdq2pd, sse2, 1
+AVX_INSTR cvtdq2ps, sse2, 1
+AVX_INSTR cvtpd2dq, sse2, 1
+AVX_INSTR cvtpd2ps, sse2, 1
+AVX_INSTR cvtps2dq, sse2, 1
+AVX_INSTR cvtps2pd, sse2, 1
+AVX_INSTR cvtsd2si, sse2, 1
+AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
+AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
+AVX_INSTR cvtsi2ss, sse, 1, 0, 0
+AVX_INSTR cvtss2sd, sse2, 1, 0, 0
+AVX_INSTR cvtss2si, sse, 1
+AVX_INSTR cvttpd2dq, sse2, 1
+AVX_INSTR cvttps2dq, sse2, 1
+AVX_INSTR cvttsd2si, sse2, 1
+AVX_INSTR cvttss2si, sse, 1
+AVX_INSTR divpd, sse2, 1, 0, 0
+AVX_INSTR divps, sse, 1, 0, 0
+AVX_INSTR divsd, sse2, 1, 0, 0
+AVX_INSTR divss, sse, 1, 0, 0
+AVX_INSTR dppd, sse4, 1, 1, 0
+AVX_INSTR dpps, sse4, 1, 1, 0
+AVX_INSTR extractps, sse4, 1
+AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
+AVX_INSTR haddpd, sse3, 1, 0, 0
+AVX_INSTR haddps, sse3, 1, 0, 0
+AVX_INSTR hsubpd, sse3, 1, 0, 0
+AVX_INSTR hsubps, sse3, 1, 0, 0
+AVX_INSTR insertps, sse4, 1, 1, 0
+AVX_INSTR lddqu, sse3
+AVX_INSTR ldmxcsr, sse, 1
+AVX_INSTR maskmovdqu, sse2
+AVX_INSTR maxpd, sse2, 1, 0, 1
+AVX_INSTR maxps, sse, 1, 0, 1
+AVX_INSTR maxsd, sse2, 1, 0, 0
+AVX_INSTR maxss, sse, 1, 0, 0
+AVX_INSTR minpd, sse2, 1, 0, 1
+AVX_INSTR minps, sse, 1, 0, 1
+AVX_INSTR minsd, sse2, 1, 0, 0
+AVX_INSTR minss, sse, 1, 0, 0
+AVX_INSTR movapd, sse2, 1
+AVX_INSTR movaps, sse, 1
+AVX_INSTR movd, mmx
+AVX_INSTR movddup, sse3, 1
+AVX_INSTR movdqa, sse2
+AVX_INSTR movdqu, sse2
+AVX_INSTR movhlps, sse, 1, 0, 0
+AVX_INSTR movhpd, sse2, 1, 0, 0
+AVX_INSTR movhps, sse, 1, 0, 0
+AVX_INSTR movlhps, sse, 1, 0, 0
+AVX_INSTR movlpd, sse2, 1, 0, 0
+AVX_INSTR movlps, sse, 1, 0, 0
+AVX_INSTR movmskpd, sse2, 1
+AVX_INSTR movmskps, sse, 1
+AVX_INSTR movntdq, sse2
+AVX_INSTR movntdqa, sse4
+AVX_INSTR movntpd, sse2, 1
+AVX_INSTR movntps, sse, 1
+AVX_INSTR movq, mmx
+AVX_INSTR movsd, sse2, 1, 0, 0
+AVX_INSTR movshdup, sse3, 1
+AVX_INSTR movsldup, sse3, 1
+AVX_INSTR movss, sse, 1, 0, 0
+AVX_INSTR movupd, sse2, 1
+AVX_INSTR movups, sse, 1
+AVX_INSTR mpsadbw, sse4, 0, 1, 0
+AVX_INSTR mulpd, sse2, 1, 0, 1
+AVX_INSTR mulps, sse, 1, 0, 1
+AVX_INSTR mulsd, sse2, 1, 0, 0
+AVX_INSTR mulss, sse, 1, 0, 0
+AVX_INSTR orpd, sse2, 1, 0, 1
+AVX_INSTR orps, sse, 1, 0, 1
+AVX_INSTR pabsb, ssse3
+AVX_INSTR pabsd, ssse3
+AVX_INSTR pabsw, ssse3
+AVX_INSTR packssdw, mmx, 0, 0, 0
+AVX_INSTR packsswb, mmx, 0, 0, 0
+AVX_INSTR packusdw, sse4, 0, 0, 0
+AVX_INSTR packuswb, mmx, 0, 0, 0
+AVX_INSTR paddb, mmx, 0, 0, 1
+AVX_INSTR paddd, mmx, 0, 0, 1
+AVX_INSTR paddq, sse2, 0, 0, 1
+AVX_INSTR paddsb, mmx, 0, 0, 1
+AVX_INSTR paddsw, mmx, 0, 0, 1
+AVX_INSTR paddusb, mmx, 0, 0, 1
+AVX_INSTR paddusw, mmx, 0, 0, 1
+AVX_INSTR paddw, mmx, 0, 0, 1
+AVX_INSTR palignr, ssse3, 0, 1, 0
+AVX_INSTR pand, mmx, 0, 0, 1
+AVX_INSTR pandn, mmx, 0, 0, 0
+AVX_INSTR pavgb, mmx2, 0, 0, 1
+AVX_INSTR pavgw, mmx2, 0, 0, 1
+AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding
+AVX_INSTR pblendw, sse4, 0, 1, 0
+AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
+AVX_INSTR pclmulqdq, fnord, 0, 1, 0
+AVX_INSTR pcmpeqb, mmx, 0, 0, 1
+AVX_INSTR pcmpeqd, mmx, 0, 0, 1
+AVX_INSTR pcmpeqq, sse4, 0, 0, 1
+AVX_INSTR pcmpeqw, mmx, 0, 0, 1
+AVX_INSTR pcmpestri, sse42
+AVX_INSTR pcmpestrm, sse42
+AVX_INSTR pcmpgtb, mmx, 0, 0, 0
+AVX_INSTR pcmpgtd, mmx, 0, 0, 0
+AVX_INSTR pcmpgtq, sse42, 0, 0, 0
+AVX_INSTR pcmpgtw, mmx, 0, 0, 0
+AVX_INSTR pcmpistri, sse42
+AVX_INSTR pcmpistrm, sse42
+AVX_INSTR pextrb, sse4
+AVX_INSTR pextrd, sse4
+AVX_INSTR pextrq, sse4
+AVX_INSTR pextrw, mmx2
+AVX_INSTR phaddd, ssse3, 0, 0, 0
+AVX_INSTR phaddsw, ssse3, 0, 0, 0
+AVX_INSTR phaddw, ssse3, 0, 0, 0
+AVX_INSTR phminposuw, sse4
+AVX_INSTR phsubd, ssse3, 0, 0, 0
+AVX_INSTR phsubsw, ssse3, 0, 0, 0
+AVX_INSTR phsubw, ssse3, 0, 0, 0
+AVX_INSTR pinsrb, sse4, 0, 1, 0
+AVX_INSTR pinsrd, sse4, 0, 1, 0
+AVX_INSTR pinsrq, sse4, 0, 1, 0
+AVX_INSTR pinsrw, mmx2, 0, 1, 0
+AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
+AVX_INSTR pmaddwd, mmx, 0, 0, 1
+AVX_INSTR pmaxsb, sse4, 0, 0, 1
+AVX_INSTR pmaxsd, sse4, 0, 0, 1
+AVX_INSTR pmaxsw, mmx2, 0, 0, 1
+AVX_INSTR pmaxub, mmx2, 0, 0, 1
+AVX_INSTR pmaxud, sse4, 0, 0, 1
+AVX_INSTR pmaxuw, sse4, 0, 0, 1
+AVX_INSTR pminsb, sse4, 0, 0, 1
+AVX_INSTR pminsd, sse4, 0, 0, 1
+AVX_INSTR pminsw, mmx2, 0, 0, 1
+AVX_INSTR pminub, mmx2, 0, 0, 1
+AVX_INSTR pminud, sse4, 0, 0, 1
+AVX_INSTR pminuw, sse4, 0, 0, 1
+AVX_INSTR pmovmskb, mmx2
+AVX_INSTR pmovsxbd, sse4
+AVX_INSTR pmovsxbq, sse4
+AVX_INSTR pmovsxbw, sse4
+AVX_INSTR pmovsxdq, sse4
+AVX_INSTR pmovsxwd, sse4
+AVX_INSTR pmovsxwq, sse4
+AVX_INSTR pmovzxbd, sse4
+AVX_INSTR pmovzxbq, sse4
+AVX_INSTR pmovzxbw, sse4
+AVX_INSTR pmovzxdq, sse4
+AVX_INSTR pmovzxwd, sse4
+AVX_INSTR pmovzxwq, sse4
+AVX_INSTR pmuldq, sse4, 0, 0, 1
+AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
+AVX_INSTR pmulhuw, mmx2, 0, 0, 1
+AVX_INSTR pmulhw, mmx, 0, 0, 1
+AVX_INSTR pmulld, sse4, 0, 0, 1
+AVX_INSTR pmullw, mmx, 0, 0, 1
+AVX_INSTR pmuludq, sse2, 0, 0, 1
+AVX_INSTR por, mmx, 0, 0, 1
+AVX_INSTR psadbw, mmx2, 0, 0, 1
+AVX_INSTR pshufb, ssse3, 0, 0, 0
+AVX_INSTR pshufd, sse2
+AVX_INSTR pshufhw, sse2
+AVX_INSTR pshuflw, sse2
+AVX_INSTR psignb, ssse3, 0, 0, 0
+AVX_INSTR psignd, ssse3, 0, 0, 0
+AVX_INSTR psignw, ssse3, 0, 0, 0
+AVX_INSTR pslld, mmx, 0, 0, 0
+AVX_INSTR pslldq, sse2, 0, 0, 0
+AVX_INSTR psllq, mmx, 0, 0, 0
+AVX_INSTR psllw, mmx, 0, 0, 0
+AVX_INSTR psrad, mmx, 0, 0, 0
+AVX_INSTR psraw, mmx, 0, 0, 0
+AVX_INSTR psrld, mmx, 0, 0, 0
+AVX_INSTR psrldq, sse2, 0, 0, 0
+AVX_INSTR psrlq, mmx, 0, 0, 0
+AVX_INSTR psrlw, mmx, 0, 0, 0
+AVX_INSTR psubb, mmx, 0, 0, 0
+AVX_INSTR psubd, mmx, 0, 0, 0
+AVX_INSTR psubq, sse2, 0, 0, 0
+AVX_INSTR psubsb, mmx, 0, 0, 0
+AVX_INSTR psubsw, mmx, 0, 0, 0
+AVX_INSTR psubusb, mmx, 0, 0, 0
+AVX_INSTR psubusw, mmx, 0, 0, 0
+AVX_INSTR psubw, mmx, 0, 0, 0
+AVX_INSTR ptest, sse4
+AVX_INSTR punpckhbw, mmx, 0, 0, 0
+AVX_INSTR punpckhdq, mmx, 0, 0, 0
+AVX_INSTR punpckhqdq, sse2, 0, 0, 0
+AVX_INSTR punpckhwd, mmx, 0, 0, 0
+AVX_INSTR punpcklbw, mmx, 0, 0, 0
+AVX_INSTR punpckldq, mmx, 0, 0, 0
+AVX_INSTR punpcklqdq, sse2, 0, 0, 0
+AVX_INSTR punpcklwd, mmx, 0, 0, 0
+AVX_INSTR pxor, mmx, 0, 0, 1
+AVX_INSTR rcpps, sse, 1
+AVX_INSTR rcpss, sse, 1, 0, 0
+AVX_INSTR roundpd, sse4, 1
+AVX_INSTR roundps, sse4, 1
+AVX_INSTR roundsd, sse4, 1, 1, 0
+AVX_INSTR roundss, sse4, 1, 1, 0
+AVX_INSTR rsqrtps, sse, 1
+AVX_INSTR rsqrtss, sse, 1, 0, 0
+AVX_INSTR shufpd, sse2, 1, 1, 0
+AVX_INSTR shufps, sse, 1, 1, 0
+AVX_INSTR sqrtpd, sse2, 1
+AVX_INSTR sqrtps, sse, 1
+AVX_INSTR sqrtsd, sse2, 1, 0, 0
+AVX_INSTR sqrtss, sse, 1, 0, 0
+AVX_INSTR stmxcsr, sse, 1
+AVX_INSTR subpd, sse2, 1, 0, 0
+AVX_INSTR subps, sse, 1, 0, 0
+AVX_INSTR subsd, sse2, 1, 0, 0
+AVX_INSTR subss, sse, 1, 0, 0
+AVX_INSTR ucomisd, sse2, 1
+AVX_INSTR ucomiss, sse, 1
+AVX_INSTR unpckhpd, sse2, 1, 0, 0
+AVX_INSTR unpckhps, sse, 1, 0, 0
+AVX_INSTR unpcklpd, sse2, 1, 0, 0
+AVX_INSTR unpcklps, sse, 1, 0, 0
+AVX_INSTR xorpd, sse2, 1, 0, 1
+AVX_INSTR xorps, sse, 1, 0, 1
+
+; 3DNow! instructions, for sharing code between the AVX, SSE and 3DNow! code paths
+AVX_INSTR pfadd, 3dnow, 1, 0, 1
+AVX_INSTR pfmul, 3dnow, 1, 0, 1
+AVX_INSTR pfsub, 3dnow, 1, 0, 0
+
+;%1 == instruction
+;%2 == minimal instruction set
+%macro GPR_INSTR 2
+ %macro %1 2-5 fnord, %1, %2
+ %ifdef cpuname
+ %if notcpuflag(%5)
+ %error use of ``%4'' %5 instruction in cpuname function: current_function
+ %endif
+ %endif
+ %ifidn %3, fnord
+ %4 %1, %2
+ %else
+ %4 %1, %2, %3
+ %endif
+ %endmacro
+%endmacro
+
+GPR_INSTR andn, bmi1
+GPR_INSTR bextr, bmi1
+GPR_INSTR blsi, bmi1
+GPR_INSTR blsmsk, bmi1
+GPR_INSTR blsr, bmi1
+GPR_INSTR bzhi, bmi2
+GPR_INSTR mulx, bmi2
+GPR_INSTR pdep, bmi2
+GPR_INSTR pext, bmi2
+GPR_INSTR popcnt, sse42
+GPR_INSTR rorx, bmi2
+GPR_INSTR sarx, bmi2
+GPR_INSTR shlx, bmi2
+GPR_INSTR shrx, bmi2
+
+; base-4 constants for shuffles
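+; e.g. q3120 expands to 0xD8; the digits give the source element for each
+; destination lane, highest lane first, so a shuffle with this immediate picks
+; source elements 0, 2, 1, 3 (listed from lowest to highest lane).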
+%assign i 0
+%rep 256
+ %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+ %if j < 10
+ CAT_XDEFINE q000, j, i
+ %elif j < 100
+ CAT_XDEFINE q00, j, i
+ %elif j < 1000
+ CAT_XDEFINE q0, j, i
+ %else
+ CAT_XDEFINE q, j, i
+ %endif
+ %assign i i+1
+%endrep
+%undef i
+%undef j
+
+%macro FMA_INSTR 3
+ %macro %1 4-7 %1, %2, %3
+ %if cpuflag(xop)
+ v%5 %1, %2, %3, %4
+ %elifnidn %1, %4
+ %6 %1, %2, %3
+ %7 %1, %4
+ %else
+ %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
+ %endif
+ %endmacro
+%endmacro
+
+FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
+FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmacsww, pmullw, paddw
+FMA_INSTR pmadcswd, pmaddwd, paddd
+
+; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
+; FMA3 is only possible if dst is the same as one of the src registers.
+; Either src2 or src3 can be a memory operand.
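+; e.g. "fmaddps m0, m0, m1, m2" assembles to vfmaddps on FMA4, and to
+; "vfmadd213ps m0, m1, m2" on FMA3 since dst aliases src1.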
+%macro FMA4_INSTR 2-*
+ %push fma4_instr
+ %xdefine %$prefix %1
+ %rep %0 - 1
+ %macro %$prefix%2 4-6 %$prefix, %2
+ %if notcpuflag(fma3) && notcpuflag(fma4)
+ %error use of ``%5%6'' fma instruction in cpuname function: current_function
+ %elif cpuflag(fma4)
+ v%5%6 %1, %2, %3, %4
+ %elifidn %1, %2
+ ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
+ %ifnum sizeof%3
+ v%{5}213%6 %2, %3, %4
+ %else
+ v%{5}132%6 %2, %4, %3
+ %endif
+ %elifidn %1, %3
+ v%{5}213%6 %3, %2, %4
+ %elifidn %1, %4
+ v%{5}231%6 %4, %2, %3
+ %else
+ %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
+ %endif
+ %endmacro
+ %rotate 1
+ %endrep
+ %pop
+%endmacro
+
+FMA4_INSTR fmadd, pd, ps, sd, ss
+FMA4_INSTR fmaddsub, pd, ps
+FMA4_INSTR fmsub, pd, ps, sd, ss
+FMA4_INSTR fmsubadd, pd, ps
+FMA4_INSTR fnmadd, pd, ps, sd, ss
+FMA4_INSTR fnmsub, pd, ps, sd, ss
+
+; Macros for converting VEX instructions to equivalent EVEX ones.
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
+ %macro %1 2-7 fnord, fnord, %1, %2, %3
+ %ifidn %3, fnord
+ %define %%args %1, %2
+ %elifidn %4, fnord
+ %define %%args %1, %2, %3
+ %else
+ %define %%args %1, %2, %3, %4
+ %endif
+ %assign %%evex_required cpuflag(avx512) & %7
+ %ifnum regnumof%1
+ %if regnumof%1 >= 16 || sizeof%1 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %ifnum regnumof%2
+ %if regnumof%2 >= 16 || sizeof%2 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %ifnum regnumof%3
+ %if regnumof%3 >= 16 || sizeof%3 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %if %%evex_required
+ %6 %%args
+ %else
+ %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
+ %endif
+ %endmacro
+%endmacro
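+; e.g. "vmovdqa xm16, xm17" is forced to vmovdqa32 because registers 16-31 (and
+; 512-bit vectors) have no VEX encoding, while "vmovdqa xm0, xm1" keeps the
+; shorter VEX form.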
+
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
+EVEX_INSTR vextractf128, vextractf32x4
+EVEX_INSTR vextracti128, vextracti32x4
+EVEX_INSTR vinsertf128, vinsertf32x4
+EVEX_INSTR vinserti128, vinserti32x4
+EVEX_INSTR vmovdqa, vmovdqa32
+EVEX_INSTR vmovdqu, vmovdqu32
+EVEX_INSTR vpand, vpandd
+EVEX_INSTR vpandn, vpandnd
+EVEX_INSTR vpor, vpord
+EVEX_INSTR vpxor, vpxord
+EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
+EVEX_INSTR vrcpss, vrcp14ss, 1
+EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
+EVEX_INSTR vrsqrtss, vrsqrt14ss, 1
diff --git a/third_party/dav1d/src/fg_apply.h b/third_party/dav1d/src/fg_apply.h
new file mode 100644
index 0000000000..be6685d801
--- /dev/null
+++ b/third_party/dav1d/src/fg_apply.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_FG_APPLY_H
+#define DAV1D_SRC_FG_APPLY_H
+
+#include "dav1d/picture.h"
+
+#include "common/bitdepth.h"
+
+#include "src/filmgrain.h"
+
+#ifdef BITDEPTH
+# define array_decl(type, name, sz) type name sz
+#else
+# define array_decl(type, name, sz) void *name
+#endif
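+// When this header is used outside a per-bitdepth template (BITDEPTH undefined),
+// the element types of the scaling/grain_lut arrays are unknown, so the
+// prototypes below degrade to opaque pointers.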
+
+bitfn_decls(void dav1d_apply_grain,
+ const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out, const Dav1dPicture *const in);
+bitfn_decls(void dav1d_prep_grain,
+ const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out, const Dav1dPicture *const in,
+ array_decl(uint8_t, scaling, [3][SCALING_SIZE]),
+ array_decl(entry, grain_lut, [3][GRAIN_HEIGHT+1][GRAIN_WIDTH]));
+bitfn_decls(void dav1d_apply_grain_row,
+ const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out, const Dav1dPicture *const in,
+ array_decl(const uint8_t, scaling, [3][SCALING_SIZE]),
+ array_decl(const entry, grain_lut, [3][GRAIN_HEIGHT+1][GRAIN_WIDTH]),
+ const int row);
+
+#endif /* DAV1D_SRC_FG_APPLY_H */
diff --git a/third_party/dav1d/src/fg_apply_tmpl.c b/third_party/dav1d/src/fg_apply_tmpl.c
new file mode 100644
index 0000000000..044e257de9
--- /dev/null
+++ b/third_party/dav1d/src/fg_apply_tmpl.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright © 2018, Niklas Haas
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+
+#include "dav1d/common.h"
+#include "dav1d/picture.h"
+
+#include "common/intops.h"
+#include "common/bitdepth.h"
+
+#include "src/fg_apply.h"
+
+static void generate_scaling(const int bitdepth,
+ const uint8_t points[][2], const int num,
+ uint8_t scaling[SCALING_SIZE])
+{
+#if BITDEPTH == 8
+ const int shift_x = 0;
+ const int scaling_size = SCALING_SIZE;
+#else
+ assert(bitdepth > 8);
+ const int shift_x = bitdepth - 8;
+ const int scaling_size = 1 << bitdepth;
+#endif
+
+ if (num == 0) {
+ memset(scaling, 0, scaling_size);
+ return;
+ }
+
+ // Fill up the preceding entries with the initial value
+ memset(scaling, points[0][1], points[0][0] << shift_x);
+
+ // Linearly interpolate the values in the middle
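+ // (delta is the slope dy/dx in 16.16 fixed point, with rounding; d starts at
+ // 0x8000 so that the >> 16 below rounds to nearest)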
+ for (int i = 0; i < num - 1; i++) {
+ const int bx = points[i][0];
+ const int by = points[i][1];
+ const int ex = points[i+1][0];
+ const int ey = points[i+1][1];
+ const int dx = ex - bx;
+ const int dy = ey - by;
+ assert(dx > 0);
+ const int delta = dy * ((0x10000 + (dx >> 1)) / dx);
+ for (int x = 0, d = 0x8000; x < dx; x++) {
+ scaling[(bx + x) << shift_x] = by + (d >> 16);
+ d += delta;
+ }
+ }
+
+ // Fill up the remaining entries with the final value
+ const int n = points[num - 1][0] << shift_x;
+ memset(&scaling[n], points[num - 1][1], scaling_size - n);
+
+#if BITDEPTH != 8
+ const int pad = 1 << shift_x, rnd = pad >> 1;
+ for (int i = 0; i < num - 1; i++) {
+ const int bx = points[i][0] << shift_x;
+ const int ex = points[i+1][0] << shift_x;
+ const int dx = ex - bx;
+ for (int x = 0; x < dx; x += pad) {
+ const int range = scaling[bx + x + pad] - scaling[bx + x];
+ for (int n = 1, r = rnd; n < pad; n++) {
+ r += range;
+ scaling[bx + x + n] = scaling[bx + x] + (r >> shift_x);
+ }
+ }
+ }
+#endif
+}
+
+#ifndef UNIT_TEST
+void bitfn(dav1d_prep_grain)(const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out,
+ const Dav1dPicture *const in,
+ uint8_t scaling[3][SCALING_SIZE],
+ entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH])
+{
+ const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
+#if BITDEPTH != 8
+ const int bitdepth_max = (1 << out->p.bpc) - 1;
+#endif
+
+ // Generate grain LUTs as needed
+ dsp->generate_grain_y(grain_lut[0], data HIGHBD_TAIL_SUFFIX); // always needed
+ if (data->num_uv_points[0] || data->chroma_scaling_from_luma)
+ dsp->generate_grain_uv[in->p.layout - 1](grain_lut[1], grain_lut[0],
+ data, 0 HIGHBD_TAIL_SUFFIX);
+ if (data->num_uv_points[1] || data->chroma_scaling_from_luma)
+ dsp->generate_grain_uv[in->p.layout - 1](grain_lut[2], grain_lut[0],
+ data, 1 HIGHBD_TAIL_SUFFIX);
+
+ // Generate scaling LUTs as needed
+ if (data->num_y_points || data->chroma_scaling_from_luma)
+ generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);
+ if (data->num_uv_points[0])
+ generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);
+ if (data->num_uv_points[1])
+ generate_scaling(in->p.bpc, data->uv_points[1], data->num_uv_points[1], scaling[2]);
+
+ // Copy over the non-modified planes
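+ // (a negative stride means the rows are stored bottom-up, so the plane's
+ // lowest address is data[] + (h - 1) * stride; copy from there in that case)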
+ assert(out->stride[0] == in->stride[0]);
+ if (!data->num_y_points) {
+ const ptrdiff_t stride = out->stride[0];
+ const ptrdiff_t sz = out->p.h * stride;
+ if (sz < 0)
+ memcpy((uint8_t*) out->data[0] + sz - stride,
+ (uint8_t*) in->data[0] + sz - stride, -sz);
+ else
+ memcpy(out->data[0], in->data[0], sz);
+ }
+
+ if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400 && !data->chroma_scaling_from_luma) {
+ assert(out->stride[1] == in->stride[1]);
+ const int ss_ver = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const ptrdiff_t stride = out->stride[1];
+ const ptrdiff_t sz = ((out->p.h + ss_ver) >> ss_ver) * stride;
+ if (sz < 0) {
+ if (!data->num_uv_points[0])
+ memcpy((uint8_t*) out->data[1] + sz - stride,
+ (uint8_t*) in->data[1] + sz - stride, -sz);
+ if (!data->num_uv_points[1])
+ memcpy((uint8_t*) out->data[2] + sz - stride,
+ (uint8_t*) in->data[2] + sz - stride, -sz);
+ } else {
+ if (!data->num_uv_points[0])
+ memcpy(out->data[1], in->data[1], sz);
+ if (!data->num_uv_points[1])
+ memcpy(out->data[2], in->data[2], sz);
+ }
+ }
+}
+
+void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out,
+ const Dav1dPicture *const in,
+ const uint8_t scaling[3][SCALING_SIZE],
+ const entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH],
+ const int row)
+{
+ // Synthesize grain for the affected planes
+ const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
+ const int ss_y = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_x = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int cpw = (out->p.w + ss_x) >> ss_x;
+ const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
+ pixel *const luma_src =
+ ((pixel *) in->data[0]) + row * FG_BLOCK_SIZE * PXSTRIDE(in->stride[0]);
+#if BITDEPTH != 8
+ const int bitdepth_max = (1 << out->p.bpc) - 1;
+#endif
+
+ if (data->num_y_points) {
+ const int bh = imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE);
+ dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * FG_BLOCK_SIZE * PXSTRIDE(out->stride[0]),
+ luma_src, out->stride[0], data,
+ out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
+ }
+
+ if (!data->num_uv_points[0] && !data->num_uv_points[1] &&
+ !data->chroma_scaling_from_luma)
+ {
+ return;
+ }
+
+ const int bh = (imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE) + ss_y) >> ss_y;
+
+ // extend padding pixels
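+ // (with an odd frame width and horizontal subsampling, duplicate the last luma
+ // column so the 2-tap luma average in the chroma path reads initialized data)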
+ if (out->p.w & ss_x) {
+ pixel *ptr = luma_src;
+ for (int y = 0; y < bh; y++) {
+ ptr[out->p.w] = ptr[out->p.w - 1];
+ ptr += PXSTRIDE(in->stride[0]) << ss_y;
+ }
+ }
+
+ const ptrdiff_t uv_off = row * FG_BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
+ if (data->chroma_scaling_from_luma) {
+ for (int pl = 0; pl < 2; pl++)
+ dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
+ ((const pixel *) in->data[1 + pl]) + uv_off,
+ in->stride[1], data, cpw,
+ scaling[0], grain_lut[1 + pl],
+ bh, row, luma_src, in->stride[0],
+ pl, is_id HIGHBD_TAIL_SUFFIX);
+ } else {
+ for (int pl = 0; pl < 2; pl++)
+ if (data->num_uv_points[pl])
+ dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
+ ((const pixel *) in->data[1 + pl]) + uv_off,
+ in->stride[1], data, cpw,
+ scaling[1 + pl], grain_lut[1 + pl],
+ bh, row, luma_src, in->stride[0],
+ pl, is_id HIGHBD_TAIL_SUFFIX);
+ }
+}
+
+void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out,
+ const Dav1dPicture *const in)
+{
+ ALIGN_STK_16(entry, grain_lut, 3,[GRAIN_HEIGHT + 1][GRAIN_WIDTH]);
+#if ARCH_X86_64 && BITDEPTH == 8
+ ALIGN_STK_64(uint8_t, scaling, 3,[SCALING_SIZE]);
+#else
+ uint8_t scaling[3][SCALING_SIZE];
+#endif
+ const int rows = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE;
+
+ bitfn(dav1d_prep_grain)(dsp, out, in, scaling, grain_lut);
+ for (int row = 0; row < rows; row++)
+ bitfn(dav1d_apply_grain_row)(dsp, out, in, scaling, grain_lut, row);
+}
+#endif
diff --git a/third_party/dav1d/src/filmgrain.h b/third_party/dav1d/src/filmgrain.h
new file mode 100644
index 0000000000..1509bb67e1
--- /dev/null
+++ b/third_party/dav1d/src/filmgrain.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_FILM_GRAIN_H
+#define DAV1D_SRC_FILM_GRAIN_H
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+#define FG_BLOCK_SIZE 32
+#if !defined(BITDEPTH) || BITDEPTH == 8
+#define SCALING_SIZE 256
+typedef int8_t entry;
+#else
+#define SCALING_SIZE 4096
+typedef int16_t entry;
+#endif
+
+#define decl_generate_grain_y_fn(name) \
+void (name)(entry buf[][GRAIN_WIDTH], \
+ const Dav1dFilmGrainData *const data HIGHBD_DECL_SUFFIX)
+typedef decl_generate_grain_y_fn(*generate_grain_y_fn);
+
+#define decl_generate_grain_uv_fn(name) \
+void (name)(entry buf[][GRAIN_WIDTH], \
+ const entry buf_y[][GRAIN_WIDTH], \
+ const Dav1dFilmGrainData *const data, const intptr_t uv HIGHBD_DECL_SUFFIX)
+typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn);
+
+#define decl_fgy_32x32xn_fn(name) \
+void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
+ const Dav1dFilmGrainData *data, \
+ size_t pw, const uint8_t scaling[SCALING_SIZE], \
+ const entry grain_lut[][GRAIN_WIDTH], \
+ int bh, int row_num HIGHBD_DECL_SUFFIX)
+typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn);
+
+#define decl_fguv_32x32xn_fn(name) \
+void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
+ const Dav1dFilmGrainData *data, size_t pw, \
+ const uint8_t scaling[SCALING_SIZE], \
+ const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \
+ const pixel *luma_row, ptrdiff_t luma_stride, \
+ int uv_pl, int is_id HIGHBD_DECL_SUFFIX)
+typedef decl_fguv_32x32xn_fn(*fguv_32x32xn_fn);
+
+typedef struct Dav1dFilmGrainDSPContext {
+ generate_grain_y_fn generate_grain_y;
+ generate_grain_uv_fn generate_grain_uv[3];
+
+ fgy_32x32xn_fn fgy_32x32xn;
+ fguv_32x32xn_fn fguv_32x32xn[3];
+} Dav1dFilmGrainDSPContext;
+
+bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c);
+
+#endif /* DAV1D_SRC_FILM_GRAIN_H */
diff --git a/third_party/dav1d/src/filmgrain_tmpl.c b/third_party/dav1d/src/filmgrain_tmpl.c
new file mode 100644
index 0000000000..12e91dd661
--- /dev/null
+++ b/third_party/dav1d/src/filmgrain_tmpl.c
@@ -0,0 +1,441 @@
+/*
+ * Copyright © 2018, Niklas Haas
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/filmgrain.h"
+#include "src/tables.h"
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
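+// 16-bit LFSR used for grain synthesis: the feedback bit is the XOR of state
+// bits 0, 1, 3 and 12 and is shifted in at the top; the topmost 'bits' bits of
+// the updated state are returned.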
+static inline int get_random_number(const int bits, unsigned *const state) {
+ const int r = *state;
+ unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
+ *state = (r >> 1) | (bit << 15);
+
+ return (*state >> (16 - bits)) & ((1 << bits) - 1);
+}
+
+static inline int round2(const int x, const uint64_t shift) {
+ return (x + ((1 << shift) >> 1)) >> shift;
+}
+
+static void generate_grain_y_c(entry buf[][GRAIN_WIDTH],
+ const Dav1dFilmGrainData *const data
+ HIGHBD_DECL_SUFFIX)
+{
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ unsigned seed = data->seed;
+ const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
+ const int grain_ctr = 128 << bitdepth_min_8;
+ const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+ for (int y = 0; y < GRAIN_HEIGHT; y++) {
+ for (int x = 0; x < GRAIN_WIDTH; x++) {
+ const int value = get_random_number(11, &seed);
+ buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift);
+ }
+ }
+
+ const int ar_pad = 3;
+ const int ar_lag = data->ar_coeff_lag;
+
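+ // Apply the auto-regressive filter: each sample adds a weighted sum of
+ // already-filtered neighbours (in raster order, up to ar_coeff_lag away)
+ // to the white noise generated above.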
+ for (int y = ar_pad; y < GRAIN_HEIGHT; y++) {
+ for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) {
+ const int8_t *coeff = data->ar_coeffs_y;
+ int sum = 0;
+ for (int dy = -ar_lag; dy <= 0; dy++) {
+ for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+ if (!dx && !dy)
+ break;
+ sum += *(coeff++) * buf[y + dy][x + dx];
+ }
+ }
+
+ const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+ buf[y][x] = iclip(grain, grain_min, grain_max);
+ }
+ }
+}
+
+static NOINLINE void
+generate_grain_uv_c(entry buf[][GRAIN_WIDTH],
+ const entry buf_y[][GRAIN_WIDTH],
+ const Dav1dFilmGrainData *const data, const intptr_t uv,
+ const int subx, const int suby HIGHBD_DECL_SUFFIX)
+{
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ unsigned seed = data->seed ^ (uv ? 0x49d8 : 0xb524);
+ const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
+ const int grain_ctr = 128 << bitdepth_min_8;
+ const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+ const int chromaW = subx ? SUB_GRAIN_WIDTH : GRAIN_WIDTH;
+ const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
+
+ for (int y = 0; y < chromaH; y++) {
+ for (int x = 0; x < chromaW; x++) {
+ const int value = get_random_number(11, &seed);
+ buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift);
+ }
+ }
+
+ const int ar_pad = 3;
+ const int ar_lag = data->ar_coeff_lag;
+
+ for (int y = ar_pad; y < chromaH; y++) {
+ for (int x = ar_pad; x < chromaW - ar_pad; x++) {
+ const int8_t *coeff = data->ar_coeffs_uv[uv];
+ int sum = 0;
+ for (int dy = -ar_lag; dy <= 0; dy++) {
+ for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+ // For the final (current) pixel, we need to add in the
+ // contribution from the luma grain texture
+ if (!dx && !dy) {
+ if (!data->num_y_points)
+ break;
+ int luma = 0;
+ const int lumaX = ((x - ar_pad) << subx) + ar_pad;
+ const int lumaY = ((y - ar_pad) << suby) + ar_pad;
+ for (int i = 0; i <= suby; i++) {
+ for (int j = 0; j <= subx; j++) {
+ luma += buf_y[lumaY + i][lumaX + j];
+ }
+ }
+ luma = round2(luma, subx + suby);
+ sum += luma * (*coeff);
+ break;
+ }
+
+ sum += *(coeff++) * buf[y + dy][x + dx];
+ }
+ }
+
+ const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+ buf[y][x] = iclip(grain, grain_min, grain_max);
+ }
+ }
+}
+
+#define gnuv_ss_fn(nm, ss_x, ss_y) \
+static decl_generate_grain_uv_fn(generate_grain_uv_##nm##_c) { \
+ generate_grain_uv_c(buf, buf_y, data, uv, ss_x, ss_y HIGHBD_TAIL_SUFFIX); \
+}
+
+gnuv_ss_fn(420, 1, 1);
+gnuv_ss_fn(422, 1, 0);
+gnuv_ss_fn(444, 0, 0);
+
+// samples from the correct block of a grain LUT, while taking into account the
+// offsets provided by the offsets cache
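+// The high nibble of the cached random value selects the horizontal offset and
+// the low nibble the vertical offset; the offset scale is halved for subsampled
+// planes.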
+static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH],
+ const int offsets[2][2], const int subx, const int suby,
+ const int bx, const int by, const int x, const int y)
+{
+ const int randval = offsets[bx][by];
+ const int offx = 3 + (2 >> subx) * (3 + (randval >> 4));
+ const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF));
+ return grain_lut[offy + y + (FG_BLOCK_SIZE >> suby) * by]
+ [offx + x + (FG_BLOCK_SIZE >> subx) * bx];
+}
+
+static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
+ const ptrdiff_t stride,
+ const Dav1dFilmGrainData *const data, const size_t pw,
+ const uint8_t scaling[SCALING_SIZE],
+ const entry grain_lut[][GRAIN_WIDTH],
+ const int bh, const int row_num HIGHBD_DECL_SUFFIX)
+{
+ const int rows = 1 + (data->overlap_flag && row_num > 0);
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const int grain_ctr = 128 << bitdepth_min_8;
+ const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+ int min_value, max_value;
+ if (data->clip_to_restricted_range) {
+ min_value = 16 << bitdepth_min_8;
+ max_value = 235 << bitdepth_min_8;
+ } else {
+ min_value = 0;
+ max_value = BITDEPTH_MAX;
+ }
+
+ // seed[0] contains the current row, seed[1] contains the previous
+ unsigned seed[2];
+ for (int i = 0; i < rows; i++) {
+ seed[i] = data->seed;
+ seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8;
+ seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
+ }
+
+ assert(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0);
+
+ int offsets[2 /* col offset */][2 /* row offset */];
+
+ // process this row in FG_BLOCK_SIZE^2 blocks
+ for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE) {
+ const int bw = imin(FG_BLOCK_SIZE, (int) pw - bx);
+
+ if (data->overlap_flag && bx) {
+ // shift previous offsets left
+ for (int i = 0; i < rows; i++)
+ offsets[1][i] = offsets[0][i];
+ }
+
+ // update current offsets
+ for (int i = 0; i < rows; i++)
+ offsets[0][i] = get_random_number(8, &seed[i]);
+
+ // x/y block offsets to compensate for overlapped regions
+ const int ystart = data->overlap_flag && row_num ? imin(2, bh) : 0;
+ const int xstart = data->overlap_flag && bx ? imin(2, bw) : 0;
+
+ static const int w[2][2] = { { 27, 17 }, { 17, 27 } };
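+ // 2-pixel overlap blending weights: the sample closer to the old block weights
+ // the old grain more heavily; the sum is scaled by 1/32 via round2(.., 5).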
+
+#define add_noise_y(x, y, grain) \
+ const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (x) + bx; \
+ pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (x) + bx; \
+ const int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \
+ *dst = iclip(*src + noise, min_value, max_value);
+
+ for (int y = ystart; y < bh; y++) {
+ // Non-overlapped image region (straightforward)
+ for (int x = xstart; x < bw; x++) {
+ int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+ add_noise_y(x, y, grain);
+ }
+
+ // Special case for overlapped column
+ for (int x = 0; x < xstart; x++) {
+ int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+ int old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
+ grain = round2(old * w[x][0] + grain * w[x][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_y(x, y, grain);
+ }
+ }
+
+ for (int y = 0; y < ystart; y++) {
+ // Special case for overlapped row (sans corner)
+ for (int x = xstart; x < bw; x++) {
+ int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+ int old = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
+ grain = round2(old * w[y][0] + grain * w[y][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_y(x, y, grain);
+ }
+
+ // Special case for doubly-overlapped corner
+ for (int x = 0; x < xstart; x++) {
+ // Blend the top pixel with the top left block
+ int top = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
+ int old = sample_lut(grain_lut, offsets, 0, 0, 1, 1, x, y);
+ top = round2(old * w[x][0] + top * w[x][1], 5);
+ top = iclip(top, grain_min, grain_max);
+
+ // Blend the current pixel with the left block
+ int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+ old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
+ grain = round2(old * w[x][0] + grain * w[x][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+
+ // Mix the two rows together and apply grain
+ grain = round2(top * w[y][0] + grain * w[y][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_y(x, y, grain);
+ }
+ }
+ }
+}
+
+static NOINLINE void
+fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
+ const ptrdiff_t stride, const Dav1dFilmGrainData *const data,
+ const size_t pw, const uint8_t scaling[SCALING_SIZE],
+ const entry grain_lut[][GRAIN_WIDTH], const int bh,
+ const int row_num, const pixel *const luma_row,
+ const ptrdiff_t luma_stride, const int uv, const int is_id,
+ const int sx, const int sy HIGHBD_DECL_SUFFIX)
+{
+ const int rows = 1 + (data->overlap_flag && row_num > 0);
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const int grain_ctr = 128 << bitdepth_min_8;
+ const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+ int min_value, max_value;
+ if (data->clip_to_restricted_range) {
+ min_value = 16 << bitdepth_min_8;
+ max_value = (is_id ? 235 : 240) << bitdepth_min_8;
+ } else {
+ min_value = 0;
+ max_value = BITDEPTH_MAX;
+ }
+
+ // seed[0] contains the current row, seed[1] contains the previous
+ unsigned seed[2];
+ for (int i = 0; i < rows; i++) {
+ seed[i] = data->seed;
+ seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8;
+ seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
+ }
+
+ assert(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0);
+
+ int offsets[2 /* col offset */][2 /* row offset */];
+
+ // process this row in FG_BLOCK_SIZE^2 blocks (subsampled)
+ for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) {
+ const int bw = imin(FG_BLOCK_SIZE >> sx, (int)(pw - bx));
+ if (data->overlap_flag && bx) {
+ // shift previous offsets left
+ for (int i = 0; i < rows; i++)
+ offsets[1][i] = offsets[0][i];
+ }
+
+ // update current offsets
+ for (int i = 0; i < rows; i++)
+ offsets[0][i] = get_random_number(8, &seed[i]);
+
+ // x/y block offsets to compensate for overlapped regions
+ const int ystart = data->overlap_flag && row_num ? imin(2 >> sy, bh) : 0;
+ const int xstart = data->overlap_flag && bx ? imin(2 >> sx, bw) : 0;
+
+ static const int w[2 /* sub */][2 /* off */][2] = {
+ { { 27, 17 }, { 17, 27 } },
+ { { 23, 22 } },
+ };
+
+#define add_noise_uv(x, y, grain) \
+ const int lx = (bx + x) << sx; \
+ const int ly = y << sy; \
+ const pixel *const luma = luma_row + ly * PXSTRIDE(luma_stride) + lx; \
+ pixel avg = luma[0]; \
+ if (sx) \
+ avg = (avg + luma[1] + 1) >> 1; \
+ const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
+ pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
+ int val = avg; \
+ if (!data->chroma_scaling_from_luma) { \
+ const int combined = avg * data->uv_luma_mult[uv] + \
+ *src * data->uv_mult[uv]; \
+ val = iclip_pixel( (combined >> 6) + \
+ (data->uv_offset[uv] * (1 << bitdepth_min_8)) ); \
+ } \
+ const int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \
+ *dst = iclip(*src + noise, min_value, max_value);
+
+ for (int y = ystart; y < bh; y++) {
+ // Non-overlapped image region (straightforward)
+ for (int x = xstart; x < bw; x++) {
+ int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+ add_noise_uv(x, y, grain);
+ }
+
+ // Special case for overlapped column
+ for (int x = 0; x < xstart; x++) {
+ int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+ int old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
+ grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_uv(x, y, grain);
+ }
+ }
+
+ for (int y = 0; y < ystart; y++) {
+ // Special case for overlapped row (sans corner)
+ for (int x = xstart; x < bw; x++) {
+ int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+ int old = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
+ grain = round2(old * w[sy][y][0] + grain * w[sy][y][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_uv(x, y, grain);
+ }
+
+ // Special case for doubly-overlapped corner
+ for (int x = 0; x < xstart; x++) {
+ // Blend the top pixel with the top left block
+ int top = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
+ int old = sample_lut(grain_lut, offsets, sx, sy, 1, 1, x, y);
+ top = round2(old * w[sx][x][0] + top * w[sx][x][1], 5);
+ top = iclip(top, grain_min, grain_max);
+
+ // Blend the current pixel with the left block
+ int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+ old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
+ grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+
+                // Mix the two rows together and apply to image
+ grain = round2(top * w[sy][y][0] + grain * w[sy][y][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_uv(x, y, grain);
+ }
+ }
+ }
+}
+
+#define fguv_ss_fn(nm, ss_x, ss_y) \
+static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \
+ fguv_32x32xn_c(dst_row, src_row, stride, data, pw, scaling, grain_lut, bh, \
+ row_num, luma_row, luma_stride, uv_pl, is_id, ss_x, ss_y \
+ HIGHBD_TAIL_SUFFIX); \
+}
+
+fguv_ss_fn(420, 1, 1);
+fguv_ss_fn(422, 1, 0);
+fguv_ss_fn(444, 0, 0);
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/filmgrain.h"
+#elif ARCH_X86
+#include "src/x86/filmgrain.h"
+#endif
+#endif
+
+COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) {
+ c->generate_grain_y = generate_grain_y_c;
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c;
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c;
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c;
+
+ c->fgy_32x32xn = fgy_32x32xn_c;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ film_grain_dsp_init_arm(c);
+#elif ARCH_X86
+ film_grain_dsp_init_x86(c);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/getbits.c b/third_party/dav1d/src/getbits.c
new file mode 100644
index 0000000000..03776285dd
--- /dev/null
+++ b/third_party/dav1d/src/getbits.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <limits.h>
+
+#include "common/intops.h"
+
+#include "src/getbits.h"
+
+void dav1d_init_get_bits(GetBits *const c, const uint8_t *const data,
+ const size_t sz)
+{
+ assert(sz);
+ c->ptr = c->ptr_start = data;
+ c->ptr_end = &c->ptr_start[sz];
+ c->state = 0;
+ c->bits_left = 0;
+ c->error = 0;
+}
+
+unsigned dav1d_get_bit(GetBits *const c) {
+ if (!c->bits_left) {
+ if (c->ptr >= c->ptr_end) {
+ c->error = 1;
+ } else {
+ const unsigned state = *c->ptr++;
+ c->bits_left = 7;
+ c->state = (uint64_t) state << 57;
+ return state >> 7;
+ }
+ }
+
+ const uint64_t state = c->state;
+ c->bits_left--;
+ c->state = state << 1;
+ return (unsigned) (state >> 63);
+}
+
+static inline void refill(GetBits *const c, const int n) {
+ assert(c->bits_left >= 0 && c->bits_left < 32);
+ unsigned state = 0;
+ do {
+ if (c->ptr >= c->ptr_end) {
+ c->error = 1;
+ if (state) break;
+ return;
+ }
+ state = (state << 8) | *c->ptr++;
+ c->bits_left += 8;
+ } while (n > c->bits_left);
+ c->state |= (uint64_t) state << (64 - c->bits_left);
+}
+
+#define GET_BITS(name, type, type64) \
+type name(GetBits *const c, const int n) { \
+ assert(n > 0 && n <= 32); \
+ /* Unsigned cast avoids refill after eob */ \
+ if ((unsigned) n > (unsigned) c->bits_left) \
+ refill(c, n); \
+ const uint64_t state = c->state; \
+ c->bits_left -= n; \
+ c->state = state << n; \
+ return (type) ((type64) state >> (64 - n)); \
+}
+
+GET_BITS(dav1d_get_bits, unsigned, uint64_t)
+GET_BITS(dav1d_get_sbits, int, int64_t)
+
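+/* Worked example (illustrative) of the MSB-aligned state: starting from an
+ * empty state and the input byte 0xB4, dav1d_get_bits(c, 3) refills
+ * state = (uint64_t)0xB4 << 56 with bits_left = 8, returns the top 3 bits
+ * (0b101 = 5), and leaves bits_left = 5 with the state shifted left by 3. */
+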
+unsigned dav1d_get_uleb128(GetBits *const c) {
+ uint64_t val = 0;
+ unsigned i = 0, more;
+
+ do {
+ const int v = dav1d_get_bits(c, 8);
+ more = v & 0x80;
+ val |= ((uint64_t) (v & 0x7F)) << i;
+ i += 7;
+ } while (more && i < 56);
+
+ if (val > UINT_MAX || more) {
+ c->error = 1;
+ return 0;
+ }
+
+ return (unsigned) val;
+}
+
+unsigned dav1d_get_uniform(GetBits *const c, const unsigned max) {
+ // Output in range [0..max-1]
+ // max must be > 1, or else nothing is read from the bitstream
+ assert(max > 1);
+ const int l = ulog2(max) + 1;
+ assert(l > 1);
+ const unsigned m = (1U << l) - max;
+ const unsigned v = dav1d_get_bits(c, l - 1);
+ return v < m ? v : (v << 1) - m + dav1d_get_bit(c);
+}
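+
+/* Worked example (illustrative): for max = 5, l = 3 and m = (1 << 3) - 5 = 3,
+ * so a 2-bit prefix v encodes 0..2 directly, while v = 3 consumes one extra
+ * bit and maps to (3 << 1) - 3 + bit, i.e. 3 or 4. Values 0..2 thus cost
+ * 2 bits and values 3..4 cost 3 bits. */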
+
+unsigned dav1d_get_vlc(GetBits *const c) {
+ if (dav1d_get_bit(c))
+ return 0;
+
+ int n_bits = 0;
+ do {
+ if (++n_bits == 32)
+ return 0xFFFFFFFFU;
+ } while (!dav1d_get_bit(c));
+
+ return ((1U << n_bits) - 1) + dav1d_get_bits(c, n_bits);
+}
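+
+/* Worked example (illustrative): this is an unsigned exp-Golomb code, so the
+ * bit string "1" decodes to 0, "010" and "011" decode to 1 and 2, and "00110"
+ * decodes to (1 << 2) - 1 + 0b10 = 5. */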
+
+static unsigned get_bits_subexp_u(GetBits *const c, const unsigned ref,
+ const unsigned n)
+{
+ unsigned v = 0;
+
+ for (int i = 0;; i++) {
+ const int b = i ? 3 + i - 1 : 3;
+
+ if (n < v + 3 * (1 << b)) {
+ v += dav1d_get_uniform(c, n - v + 1);
+ break;
+ }
+
+ if (!dav1d_get_bit(c)) {
+ v += dav1d_get_bits(c, b);
+ break;
+ }
+
+ v += 1 << b;
+ }
+
+ return ref * 2 <= n ? inv_recenter(ref, v) : n - inv_recenter(n - ref, v);
+}
+
+int dav1d_get_bits_subexp(GetBits *const c, const int ref, const unsigned n) {
+ return (int) get_bits_subexp_u(c, ref + (1 << n), 2 << n) - (1 << n);
+}
diff --git a/third_party/dav1d/src/getbits.h b/third_party/dav1d/src/getbits.h
new file mode 100644
index 0000000000..67925943cd
--- /dev/null
+++ b/third_party/dav1d/src/getbits.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_GETBITS_H
+#define DAV1D_SRC_GETBITS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct GetBits {
+ uint64_t state;
+ int bits_left, error;
+ const uint8_t *ptr, *ptr_start, *ptr_end;
+} GetBits;
+
+void dav1d_init_get_bits(GetBits *c, const uint8_t *data, size_t sz);
+unsigned dav1d_get_bit(GetBits *c);
+unsigned dav1d_get_bits(GetBits *c, int n);
+int dav1d_get_sbits(GetBits *c, int n);
+unsigned dav1d_get_uleb128(GetBits *c);
+
+// Output in range 0..max-1
+unsigned dav1d_get_uniform(GetBits *c, unsigned max);
+unsigned dav1d_get_vlc(GetBits *c);
+int dav1d_get_bits_subexp(GetBits *c, int ref, unsigned n);
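+
+/* Typical usage (an illustrative sketch; "payload" and the parsed field names
+ * are hypothetical, not part of dav1d):
+ *
+ *     GetBits gb;
+ *     dav1d_init_get_bits(&gb, payload, payload_sz);
+ *     const unsigned version = dav1d_get_bits(&gb, 3);
+ *     const unsigned has_ext = dav1d_get_bit(&gb);
+ *     const unsigned length  = dav1d_get_uleb128(&gb);
+ *     if (gb.error) // we read past the end of the buffer
+ *         return -1;
+ */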
+
+// Discard bits from the buffer until we're next byte-aligned.
+static inline void dav1d_bytealign_get_bits(GetBits *c) {
+ // bits_left is never more than 7, because it is only incremented
+    // by refill(), called by dav1d_get_bits(), which never reads more
+    // than 7 bits beyond what it needs.
+ //
+ // If this wasn't true, we would need to work out how many bits to
+ // discard (bits_left % 8), subtract that from bits_left and then
+ // shift state right by that amount.
+ assert(c->bits_left <= 7);
+
+ c->bits_left = 0;
+ c->state = 0;
+}
+
+// Return the current bit position relative to the start of the buffer.
+static inline unsigned dav1d_get_bits_pos(const GetBits *c) {
+ return (unsigned) (c->ptr - c->ptr_start) * 8 - c->bits_left;
+}
+
+#endif /* DAV1D_SRC_GETBITS_H */
diff --git a/third_party/dav1d/src/internal.h b/third_party/dav1d/src/internal.h
new file mode 100644
index 0000000000..72f65607ed
--- /dev/null
+++ b/third_party/dav1d/src/internal.h
@@ -0,0 +1,468 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_INTERNAL_H
+#define DAV1D_SRC_INTERNAL_H
+
+#include <stdatomic.h>
+
+#include "dav1d/data.h"
+
+typedef struct Dav1dFrameContext Dav1dFrameContext;
+typedef struct Dav1dTileState Dav1dTileState;
+typedef struct Dav1dTaskContext Dav1dTaskContext;
+typedef struct Dav1dTask Dav1dTask;
+
+#include "common/attributes.h"
+
+#include "src/cdef.h"
+#include "src/cdf.h"
+#include "src/data.h"
+#include "src/env.h"
+#include "src/filmgrain.h"
+#include "src/intra_edge.h"
+#include "src/ipred.h"
+#include "src/itx.h"
+#include "src/levels.h"
+#include "src/lf_mask.h"
+#include "src/loopfilter.h"
+#include "src/looprestoration.h"
+#include "src/mc.h"
+#include "src/msac.h"
+#include "src/pal.h"
+#include "src/picture.h"
+#include "src/recon.h"
+#include "src/refmvs.h"
+#include "src/thread.h"
+
+typedef struct Dav1dDSPContext {
+ Dav1dFilmGrainDSPContext fg;
+ Dav1dIntraPredDSPContext ipred;
+ Dav1dMCDSPContext mc;
+ Dav1dInvTxfmDSPContext itx;
+ Dav1dLoopFilterDSPContext lf;
+ Dav1dCdefDSPContext cdef;
+ Dav1dLoopRestorationDSPContext lr;
+} Dav1dDSPContext;
+
+struct Dav1dTileGroup {
+ Dav1dData data;
+ int start, end;
+};
+
+enum TaskType {
+ DAV1D_TASK_TYPE_INIT,
+ DAV1D_TASK_TYPE_INIT_CDF,
+ DAV1D_TASK_TYPE_TILE_ENTROPY,
+ DAV1D_TASK_TYPE_ENTROPY_PROGRESS,
+ DAV1D_TASK_TYPE_TILE_RECONSTRUCTION,
+ DAV1D_TASK_TYPE_DEBLOCK_COLS,
+ DAV1D_TASK_TYPE_DEBLOCK_ROWS,
+ DAV1D_TASK_TYPE_CDEF,
+ DAV1D_TASK_TYPE_SUPER_RESOLUTION,
+ DAV1D_TASK_TYPE_LOOP_RESTORATION,
+ DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS,
+ DAV1D_TASK_TYPE_FG_PREP,
+ DAV1D_TASK_TYPE_FG_APPLY,
+};
+
+struct Dav1dContext {
+ Dav1dFrameContext *fc;
+ unsigned n_fc;
+
+ Dav1dTaskContext *tc;
+ unsigned n_tc;
+
+ // cache of OBUs that make up a single frame before we submit them
+ // to a frame worker to be decoded
+ struct Dav1dTileGroup *tile;
+ int n_tile_data_alloc;
+ int n_tile_data;
+ int n_tiles;
+ Dav1dMemPool *seq_hdr_pool;
+ Dav1dRef *seq_hdr_ref;
+ Dav1dSequenceHeader *seq_hdr;
+ Dav1dMemPool *frame_hdr_pool;
+ Dav1dRef *frame_hdr_ref;
+ Dav1dFrameHeader *frame_hdr;
+
+ Dav1dRef *content_light_ref;
+ Dav1dContentLightLevel *content_light;
+ Dav1dRef *mastering_display_ref;
+ Dav1dMasteringDisplay *mastering_display;
+ Dav1dRef *itut_t35_ref;
+ Dav1dITUTT35 *itut_t35;
+ int n_itut_t35;
+
+ // decoded output picture queue
+ Dav1dData in;
+ Dav1dThreadPicture out, cache;
+    // flush is a pointer to prevent compiler errors about atomic_load()
+ // not taking const arguments
+ atomic_int flush_mem, *flush;
+ struct {
+ Dav1dThreadPicture *out_delayed;
+ unsigned next;
+ } frame_thread;
+
+    // task threading (refer to tc[] for per-thread state)
+ struct TaskThreadData {
+ pthread_mutex_t lock;
+ pthread_cond_t cond;
+ atomic_uint first;
+ unsigned cur;
+ // This is used for delayed reset of the task cur pointer when
+        // such an operation is needed but the thread doesn't enter a critical
+ // section (typically when executing the next sbrow task locklessly).
+ // See src/thread_task.c:reset_task_cur().
+ atomic_uint reset_task_cur;
+ atomic_int cond_signaled;
+ struct {
+ int exec, finished;
+ pthread_cond_t cond;
+ const Dav1dPicture *in;
+ Dav1dPicture *out;
+ enum TaskType type;
+ atomic_int progress[2]; /* [0]=started, [1]=completed */
+ union {
+ struct {
+ ALIGN(int8_t grain_lut_8bpc[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH], 16);
+ ALIGN(uint8_t scaling_8bpc[3][256], 64);
+ };
+ struct {
+ ALIGN(int16_t grain_lut_16bpc[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH], 16);
+ ALIGN(uint8_t scaling_16bpc[3][4096], 64);
+ };
+ };
+ } delayed_fg;
+ int inited;
+ } task_thread;
+
+ // reference/entropy state
+ Dav1dMemPool *segmap_pool;
+ Dav1dMemPool *refmvs_pool;
+ struct {
+ Dav1dThreadPicture p;
+ Dav1dRef *segmap;
+ Dav1dRef *refmvs;
+ unsigned refpoc[7];
+ } refs[8];
+ Dav1dMemPool *cdf_pool;
+ CdfThreadContext cdf[8];
+
+ Dav1dDSPContext dsp[3 /* 8, 10, 12 bits/component */];
+ Dav1dPalDSPContext pal_dsp;
+ Dav1dRefmvsDSPContext refmvs_dsp;
+
+ Dav1dPicAllocator allocator;
+ int apply_grain;
+ int operating_point;
+ unsigned operating_point_idc;
+ int all_layers;
+ int max_spatial_id;
+ unsigned frame_size_limit;
+ int strict_std_compliance;
+ int output_invisible_frames;
+ enum Dav1dInloopFilterType inloop_filters;
+ enum Dav1dDecodeFrameType decode_frame_type;
+ int drain;
+ enum PictureFlags frame_flags;
+ enum Dav1dEventFlags event_flags;
+ Dav1dDataProps cached_error_props;
+ int cached_error;
+
+ Dav1dLogger logger;
+
+ Dav1dMemPool *picture_pool;
+ Dav1dMemPool *pic_ctx_pool;
+};
+
+struct Dav1dTask {
+ unsigned frame_idx; // frame thread id
+ enum TaskType type; // task work
+ int sby; // sbrow
+
+ // task dependencies
+ int recon_progress, deblock_progress;
+ int deps_skip;
+ struct Dav1dTask *next; // only used in task queue
+};
+
+struct Dav1dFrameContext {
+ Dav1dRef *seq_hdr_ref;
+ Dav1dSequenceHeader *seq_hdr;
+ Dav1dRef *frame_hdr_ref;
+ Dav1dFrameHeader *frame_hdr;
+ Dav1dThreadPicture refp[7];
+ Dav1dPicture cur; // during block coding / reconstruction
+ Dav1dThreadPicture sr_cur; // after super-resolution upscaling
+ Dav1dRef *mvs_ref;
+ refmvs_temporal_block *mvs, *ref_mvs[7];
+ Dav1dRef *ref_mvs_ref[7];
+ Dav1dRef *cur_segmap_ref, *prev_segmap_ref;
+ uint8_t *cur_segmap;
+ const uint8_t *prev_segmap;
+ unsigned refpoc[7], refrefpoc[7][7];
+ uint8_t gmv_warp_allowed[7];
+ CdfThreadContext in_cdf, out_cdf;
+ struct Dav1dTileGroup *tile;
+ int n_tile_data_alloc;
+ int n_tile_data;
+
+ // for scalable references
+ struct ScalableMotionParams {
+ int scale; // if no scaling, this is 0
+ int step;
+ } svc[7][2 /* x, y */];
+ int resize_step[2 /* y, uv */], resize_start[2 /* y, uv */];
+
+ const Dav1dContext *c;
+ Dav1dTileState *ts;
+ int n_ts;
+ const Dav1dDSPContext *dsp;
+ struct {
+ recon_b_intra_fn recon_b_intra;
+ recon_b_inter_fn recon_b_inter;
+ filter_sbrow_fn filter_sbrow;
+ filter_sbrow_fn filter_sbrow_deblock_cols;
+ filter_sbrow_fn filter_sbrow_deblock_rows;
+ void (*filter_sbrow_cdef)(Dav1dTaskContext *tc, int sby);
+ filter_sbrow_fn filter_sbrow_resize;
+ filter_sbrow_fn filter_sbrow_lr;
+ backup_ipred_edge_fn backup_ipred_edge;
+ read_coef_blocks_fn read_coef_blocks;
+ copy_pal_block_fn copy_pal_block_y;
+ copy_pal_block_fn copy_pal_block_uv;
+ read_pal_plane_fn read_pal_plane;
+ read_pal_uv_fn read_pal_uv;
+ } bd_fn;
+
+ int ipred_edge_sz;
+ pixel *ipred_edge[3];
+ ptrdiff_t b4_stride;
+ int w4, h4, bw, bh, sb128w, sb128h, sbh, sb_shift, sb_step, sr_sb128w;
+ uint16_t dq[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
+ const uint8_t *qm[N_RECT_TX_SIZES][3 /* plane */];
+ BlockContext *a;
+ int a_sz /* w*tile_rows */;
+ refmvs_frame rf;
+ uint8_t jnt_weights[7][7];
+ int bitdepth_max;
+
+ struct {
+ int next_tile_row[2 /* 0: reconstruction, 1: entropy */];
+ atomic_int entropy_progress;
+ atomic_int deblock_progress; // in sby units
+ atomic_uint *frame_progress, *copy_lpf_progress;
+ // indexed using t->by * f->b4_stride + t->bx
+ Av1Block *b;
+ int16_t *cbi; /* bits 0-4: txtp, bits 5-15: eob */
+ // indexed using (t->by >> 1) * (f->b4_stride >> 1) + (t->bx >> 1)
+ pixel (*pal)[3 /* plane */][8 /* idx */];
+ // iterated over inside tile state
+ uint8_t *pal_idx;
+ coef *cf;
+ int prog_sz;
+ int cbi_sz, pal_sz, pal_idx_sz, cf_sz;
+ // start offsets per tile
+ unsigned *tile_start_off;
+ } frame_thread;
+
+ // loopfilter
+ struct {
+ uint8_t (*level)[4];
+ Av1Filter *mask;
+ Av1Restoration *lr_mask;
+ int mask_sz /* w*h */, lr_mask_sz;
+ int cdef_buf_plane_sz[2]; /* stride*sbh*4 */
+ int cdef_buf_sbh;
+ int lr_buf_plane_sz[2]; /* (stride*sbh*4) << sb128 if n_tc > 1, else stride*4 */
+ int re_sz /* h */;
+ ALIGN(Av1FilterLUT lim_lut, 16);
+ int last_sharpness;
+ uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
+ uint8_t *tx_lpf_right_edge[2];
+ uint8_t *cdef_line_buf, *lr_line_buf;
+ pixel *cdef_line[2 /* pre, post */][3 /* plane */];
+ pixel *cdef_lpf_line[3 /* plane */];
+ pixel *lr_lpf_line[3 /* plane */];
+
+ // in-loop filter per-frame state keeping
+ uint8_t *start_of_tile_row;
+ int start_of_tile_row_sz;
+ int need_cdef_lpf_copy;
+ pixel *p[3], *sr_p[3];
+ int restore_planes; // enum LrRestorePlanes
+ } lf;
+
+ struct {
+ pthread_mutex_t lock;
+ pthread_cond_t cond;
+ struct TaskThreadData *ttd;
+ struct Dav1dTask *tasks, *tile_tasks[2], init_task;
+ int num_tasks, num_tile_tasks;
+ atomic_int init_done;
+ atomic_int done[2];
+ int retval;
+ int update_set; // whether we need to update CDF reference
+ atomic_int error;
+ atomic_int task_counter;
+ struct Dav1dTask *task_head, *task_tail;
+ // Points to the task directly before the cur pointer in the queue.
+ // This cur pointer is theoretical here, we actually keep track of the
+        // "prev_t" variable. This is needed to not lose the tasks in
+ // [head;cur-1] when picking one for execution.
+ struct Dav1dTask *task_cur_prev;
+ struct { // async task insertion
+ atomic_int merge;
+ pthread_mutex_t lock;
+ Dav1dTask *head, *tail;
+ } pending_tasks;
+ } task_thread;
+
+ // threading (refer to tc[] for per-thread things)
+ struct FrameTileThreadData {
+ int (*lowest_pixel_mem)[7][2];
+ int lowest_pixel_mem_sz;
+ } tile_thread;
+};
+
+struct Dav1dTileState {
+ CdfContext cdf;
+ MsacContext msac;
+
+ struct {
+ int col_start, col_end, row_start, row_end; // in 4px units
+ int col, row; // in tile units
+ } tiling;
+
+ // in sby units, TILE_ERROR after a decoding error
+ atomic_int progress[2 /* 0: reconstruction, 1: entropy */];
+ struct {
+ uint8_t *pal_idx;
+ int16_t *cbi;
+ coef *cf;
+ } frame_thread[2 /* 0: reconstruction, 1: entropy */];
+
+ // in fullpel units, [0] = Y, [1] = UV, used for progress requirements
+ // each entry is one tile-sbrow; middle index is refidx
+ int (*lowest_pixel)[7][2];
+
+ uint16_t dqmem[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
+ const uint16_t (*dq)[3][2];
+ int last_qidx;
+
+ int8_t last_delta_lf[4];
+ uint8_t lflvlmem[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
+ const uint8_t (*lflvl)[4][8][2];
+
+ Av1RestorationUnit *lr_ref[3];
+};
+
+struct Dav1dTaskContext {
+ const Dav1dContext *c;
+ const Dav1dFrameContext *f;
+ Dav1dTileState *ts;
+ int bx, by;
+ BlockContext l, *a;
+ refmvs_tile rt;
+ ALIGN(union, 64) {
+ int16_t cf_8bpc [32 * 32];
+ int32_t cf_16bpc[32 * 32];
+ };
+ union {
+ uint8_t al_pal_8bpc [2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
+ uint16_t al_pal_16bpc[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
+ };
+ uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */];
+ ALIGN(union, 64) {
+ struct {
+ union {
+ uint8_t lap_8bpc [128 * 32];
+ uint16_t lap_16bpc[128 * 32];
+ struct {
+ int16_t compinter[2][128 * 128];
+ uint8_t seg_mask[128 * 128];
+ };
+ };
+ union {
+ // stride=192 for non-SVC, or 320 for SVC
+ uint8_t emu_edge_8bpc [320 * (256 + 7)];
+ uint16_t emu_edge_16bpc[320 * (256 + 7)];
+ };
+ };
+ struct {
+ union {
+ uint8_t levels[32 * 34];
+ struct {
+ uint8_t pal_order[64][8];
+ uint8_t pal_ctx[64];
+ };
+ };
+ union {
+ int16_t ac[32 * 32]; // intra-only
+ uint8_t txtp_map[32 * 32]; // inter-only
+ };
+ uint8_t pal_idx_y[32 * 64];
+ uint8_t pal_idx_uv[64 * 64]; /* also used as pre-pack scratch buffer */
+ union {
+ struct {
+ uint8_t interintra_8bpc[64 * 64];
+ uint8_t edge_8bpc[257];
+ ALIGN(uint8_t pal_8bpc[3 /* plane */][8 /* palette_idx */], 8);
+ };
+ struct {
+ uint16_t interintra_16bpc[64 * 64];
+ uint16_t edge_16bpc[257];
+ ALIGN(uint16_t pal_16bpc[3 /* plane */][8 /* palette_idx */], 16);
+ };
+ };
+ };
+ } scratch;
+
+ Dav1dWarpedMotionParams warpmv;
+ Av1Filter *lf_mask;
+ int top_pre_cdef_toggle;
+ int8_t *cur_sb_cdef_idx_ptr;
+ // for chroma sub8x8, we need to know the filter for all 4 subblocks in
+ // a 4x4 area, but the top/left one can go out of cache already, so this
+ // keeps it accessible
+ enum Filter2d tl_4x4_filter;
+
+ struct {
+ int pass;
+ } frame_thread;
+ struct {
+ struct thread_data td;
+ struct TaskThreadData *ttd;
+ struct FrameTileThreadData *fttd;
+ int flushed;
+ int die;
+ } task_thread;
+};
+
+#endif /* DAV1D_SRC_INTERNAL_H */
diff --git a/third_party/dav1d/src/intra_edge.c b/third_party/dav1d/src/intra_edge.c
new file mode 100644
index 0000000000..e9261e6cb8
--- /dev/null
+++ b/third_party/dav1d/src/intra_edge.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright © 2018-2023, VideoLAN and dav1d authors
+ * Copyright © 2018-2023, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/attributes.h"
+
+#include "src/intra_edge.h"
+#include "src/levels.h"
+
+struct ModeSelMem {
+ EdgeBranch *nwc[3 /* 64x64, 32x32, 16x16 */];
+ EdgeTip *nt;
+};
+
+/* Because we're using 16-bit offsets to refer to other nodes, those arrays
+ * are placed in a struct to ensure they're consecutive in memory. */
+static struct {
+ EdgeBranch branch_sb128[1 + 4 + 16 + 64];
+ EdgeTip tip_sb128[256];
+ EdgeBranch branch_sb64[1 + 4 + 16];
+ EdgeTip tip_sb64[64];
+} ALIGN(nodes, 16);
+
+const EdgeNode *dav1d_intra_edge_tree[2] = {
+ (EdgeNode*)nodes.branch_sb128, (EdgeNode*)nodes.branch_sb64
+};
+
+static COLD void init_edges(EdgeNode *const node,
+ const enum BlockLevel bl,
+ const enum EdgeFlags edge_flags)
+{
+ node->o = edge_flags;
+ node->h[0] = edge_flags | EDGE_ALL_LEFT_HAS_BOTTOM;
+ node->v[0] = edge_flags | EDGE_ALL_TOP_HAS_RIGHT;
+
+ if (bl == BL_8X8) {
+ EdgeTip *const nt = (EdgeTip *) node;
+
+ node->h[1] = edge_flags & (EDGE_ALL_LEFT_HAS_BOTTOM |
+ EDGE_I420_TOP_HAS_RIGHT);
+ node->v[1] = edge_flags & (EDGE_ALL_TOP_HAS_RIGHT |
+ EDGE_I420_LEFT_HAS_BOTTOM |
+ EDGE_I422_LEFT_HAS_BOTTOM);
+
+ nt->split[0] = (edge_flags & EDGE_ALL_TOP_HAS_RIGHT) |
+ EDGE_I422_LEFT_HAS_BOTTOM;
+ nt->split[1] = edge_flags | EDGE_I444_TOP_HAS_RIGHT;
+ nt->split[2] = edge_flags & (EDGE_I420_TOP_HAS_RIGHT |
+ EDGE_I420_LEFT_HAS_BOTTOM |
+ EDGE_I422_LEFT_HAS_BOTTOM);
+ } else {
+ EdgeBranch *const nwc = (EdgeBranch *) node;
+
+ node->h[1] = edge_flags & EDGE_ALL_LEFT_HAS_BOTTOM;
+ node->v[1] = edge_flags & EDGE_ALL_TOP_HAS_RIGHT;
+
+ nwc->h4 = EDGE_ALL_LEFT_HAS_BOTTOM;
+ nwc->v4 = EDGE_ALL_TOP_HAS_RIGHT;
+ if (bl == BL_16X16) {
+ nwc->h4 |= edge_flags & EDGE_I420_TOP_HAS_RIGHT;
+ nwc->v4 |= edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM |
+ EDGE_I422_LEFT_HAS_BOTTOM);
+ }
+ }
+}
+
+#define PTR_OFFSET(a, b) ((uint16_t)((uintptr_t)(b) - (uintptr_t)(a)))
+
+static COLD void init_mode_node(EdgeBranch *const nwc,
+ const enum BlockLevel bl,
+ struct ModeSelMem *const mem,
+ const int top_has_right,
+ const int left_has_bottom)
+{
+ init_edges(&nwc->node, bl,
+ (top_has_right ? EDGE_ALL_TOP_HAS_RIGHT : 0) |
+ (left_has_bottom ? EDGE_ALL_LEFT_HAS_BOTTOM : 0));
+ if (bl == BL_16X16) {
+ for (int n = 0; n < 4; n++) {
+ EdgeTip *const nt = mem->nt++;
+ nwc->split_offset[n] = PTR_OFFSET(nwc, nt);
+ init_edges(&nt->node, bl + 1,
+ ((n == 3 || (n == 1 && !top_has_right)) ? 0 :
+ EDGE_ALL_TOP_HAS_RIGHT) |
+ (!(n == 0 || (n == 2 && left_has_bottom)) ? 0 :
+ EDGE_ALL_LEFT_HAS_BOTTOM));
+ }
+ } else {
+ for (int n = 0; n < 4; n++) {
+ EdgeBranch *const nwc_child = mem->nwc[bl]++;
+ nwc->split_offset[n] = PTR_OFFSET(nwc, nwc_child);
+ init_mode_node(nwc_child, bl + 1, mem,
+ !(n == 3 || (n == 1 && !top_has_right)),
+ n == 0 || (n == 2 && left_has_bottom));
+ }
+ }
+}
+
+COLD void dav1d_init_intra_edge_tree(void) {
+ // This function is guaranteed to be called only once
+ struct ModeSelMem mem;
+
+ mem.nwc[BL_128X128] = &nodes.branch_sb128[1];
+ mem.nwc[BL_64X64] = &nodes.branch_sb128[1 + 4];
+ mem.nwc[BL_32X32] = &nodes.branch_sb128[1 + 4 + 16];
+ mem.nt = nodes.tip_sb128;
+ init_mode_node(nodes.branch_sb128, BL_128X128, &mem, 1, 0);
+ assert(mem.nwc[BL_128X128] == &nodes.branch_sb128[1 + 4]);
+ assert(mem.nwc[BL_64X64] == &nodes.branch_sb128[1 + 4 + 16]);
+ assert(mem.nwc[BL_32X32] == &nodes.branch_sb128[1 + 4 + 16 + 64]);
+ assert(mem.nt == &nodes.tip_sb128[256]);
+
+ mem.nwc[BL_128X128] = NULL;
+ mem.nwc[BL_64X64] = &nodes.branch_sb64[1];
+ mem.nwc[BL_32X32] = &nodes.branch_sb64[1 + 4];
+ mem.nt = nodes.tip_sb64;
+ init_mode_node(nodes.branch_sb64, BL_64X64, &mem, 1, 0);
+ assert(mem.nwc[BL_64X64] == &nodes.branch_sb64[1 + 4]);
+ assert(mem.nwc[BL_32X32] == &nodes.branch_sb64[1 + 4 + 16]);
+ assert(mem.nt == &nodes.tip_sb64[64]);
+}
diff --git a/third_party/dav1d/src/intra_edge.h b/third_party/dav1d/src/intra_edge.h
new file mode 100644
index 0000000000..ecfb3de564
--- /dev/null
+++ b/third_party/dav1d/src/intra_edge.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright © 2018-2023, VideoLAN and dav1d authors
+ * Copyright © 2018-2023, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_INTRA_EDGE_H
+#define DAV1D_SRC_INTRA_EDGE_H
+
+#include <stdint.h>
+
+enum EdgeFlags {
+ EDGE_I444_TOP_HAS_RIGHT = 1 << 0,
+ EDGE_I422_TOP_HAS_RIGHT = 1 << 1,
+ EDGE_I420_TOP_HAS_RIGHT = 1 << 2,
+ EDGE_I444_LEFT_HAS_BOTTOM = 1 << 3,
+ EDGE_I422_LEFT_HAS_BOTTOM = 1 << 4,
+ EDGE_I420_LEFT_HAS_BOTTOM = 1 << 5,
+ EDGE_ALL_TOP_HAS_RIGHT = EDGE_I444_TOP_HAS_RIGHT |
+ EDGE_I422_TOP_HAS_RIGHT |
+ EDGE_I420_TOP_HAS_RIGHT,
+ EDGE_ALL_LEFT_HAS_BOTTOM = EDGE_I444_LEFT_HAS_BOTTOM |
+ EDGE_I422_LEFT_HAS_BOTTOM |
+ EDGE_I420_LEFT_HAS_BOTTOM,
+ EDGE_ALL_TR_AND_BL = EDGE_ALL_TOP_HAS_RIGHT |
+ EDGE_ALL_LEFT_HAS_BOTTOM,
+};
+
+#define INTRA_EDGE_SPLIT(n, i) \
+ ((const EdgeNode*)((uintptr_t)(n) + ((const EdgeBranch*)(n))->split_offset[i]))
+
+typedef struct EdgeNode {
+ uint8_t /* enum EdgeFlags */ o, h[2], v[2];
+} EdgeNode;
+
+typedef struct EdgeTip {
+ EdgeNode node;
+ uint8_t /* enum EdgeFlags */ split[3];
+} EdgeTip;
+
+typedef struct EdgeBranch {
+ EdgeNode node;
+ uint8_t /* enum EdgeFlags */ h4, v4;
+ uint16_t split_offset[4]; /* relative to the address of this node */
+} EdgeBranch;
+
+/* Tree to keep track of which edges are available. */
+EXTERN const EdgeNode *dav1d_intra_edge_tree[2 /* BL_128X128, BL_64X64 */];
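+
+/* Illustrative traversal sketch: descending from a branch node to its i-th
+ * child resolves the 16-bit split_offset relative to the node's own address:
+ *
+ *     const EdgeNode *n = dav1d_intra_edge_tree[0]; // 128x128 superblock tree
+ *     n = INTRA_EDGE_SPLIT(n, i);                   // i-th quadrant
+ */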
+
+void dav1d_init_intra_edge_tree(void);
+
+#endif /* DAV1D_SRC_INTRA_EDGE_H */
diff --git a/third_party/dav1d/src/ipred.h b/third_party/dav1d/src/ipred.h
new file mode 100644
index 0000000000..35adb02edf
--- /dev/null
+++ b/third_party/dav1d/src/ipred.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_IPRED_H
+#define DAV1D_SRC_IPRED_H
+
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+
+/*
+ * Intra prediction.
+ * - a is the angle (in degrees) for directional intra predictors. For other
+ * modes, it is ignored;
+ * - topleft is the same as the argument given to dav1d_prepare_intra_edges(),
+ * see ipred_prepare.h for more detailed documentation.
+ */
+#define decl_angular_ipred_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const pixel *topleft, \
+ int width, int height, int angle, int max_width, int max_height \
+ HIGHBD_DECL_SUFFIX)
+typedef decl_angular_ipred_fn(*angular_ipred_fn);
+
+/*
+ * Create a subsampled Y plane with the DC subtracted.
+ * - w/h_pad is the edge of the width/height that extends outside the visible
+ * portion of the frame in 4px units;
+ * - ac has a stride of 16.
+ */
+#define decl_cfl_ac_fn(name) \
+void (name)(int16_t *ac, const pixel *y, ptrdiff_t stride, \
+ int w_pad, int h_pad, int cw, int ch)
+typedef decl_cfl_ac_fn(*cfl_ac_fn);
+
+/*
+ * dst[x,y] += alpha * ac[x,y]
+ * - alpha contains a q3 scalar in [-16,16] range;
+ */
+#define decl_cfl_pred_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const pixel *topleft, \
+ int width, int height, const int16_t *ac, int alpha \
+ HIGHBD_DECL_SUFFIX)
+typedef decl_cfl_pred_fn(*cfl_pred_fn);
+
+/*
+ * dst[x,y] = pal[idx[x,y]]
+ * - palette indices are [0-7]
+ * - only 16-byte alignment is guaranteed for idx.
+ */
+#define decl_pal_pred_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const pixel *pal, \
+ const uint8_t *idx, int w, int h)
+typedef decl_pal_pred_fn(*pal_pred_fn);
+
+typedef struct Dav1dIntraPredDSPContext {
+ angular_ipred_fn intra_pred[N_IMPL_INTRA_PRED_MODES];
+
+ // chroma-from-luma
+ cfl_ac_fn cfl_ac[3 /* 420, 422, 444 */];
+ cfl_pred_fn cfl_pred[DC_128_PRED + 1];
+
+ // palette
+ pal_pred_fn pal_pred;
+} Dav1dIntraPredDSPContext;
+
+bitfn_decls(void dav1d_intra_pred_dsp_init, Dav1dIntraPredDSPContext *c);
+
+#endif /* DAV1D_SRC_IPRED_H */
diff --git a/third_party/dav1d/src/ipred_prepare.h b/third_party/dav1d/src/ipred_prepare.h
new file mode 100644
index 0000000000..6a7efeb3d7
--- /dev/null
+++ b/third_party/dav1d/src/ipred_prepare.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_IPRED_PREPARE_H
+#define DAV1D_SRC_IPRED_PREPARE_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "common/bitdepth.h"
+
+#include "src/env.h"
+#include "src/intra_edge.h"
+#include "src/levels.h"
+
+/*
+ * Luma intra edge preparation.
+ *
+ * x/y/start/w/h are in luma block (4px) units:
+ * - x and y are the absolute block positions in the image;
+ * - start/w/h are the *dependent tile* boundary positions. In practice, start
+ * is the horizontal tile start, w is the horizontal tile end, the vertical
+ * tile start is assumed to be 0 and h is the vertical image end.
+ *
+ * edge_flags signals which edges are available for this transform-block inside
+ * the given partition, as well as for the partition inside the superblock
+ * structure.
+ *
+ * dst and stride are pointers to the top/left position of the current block,
+ * and can be used to locate the top, left, top/left, top/right and bottom/left
+ * edge pointers also.
+ *
+ * angle is the angle_delta [-3..3] on input, and the absolute angle on output.
+ *
+ * mode is the intra prediction mode as coded in the bitstream. The return value
+ * is this same mode, converted to an index in the DSP functions.
+ *
+ * tw/th are the size of the transform block in block (4px) units.
+ *
+ * topleft_out is a pointer to scratch memory that will be filled with the edge
+ * pixels. The memory array should have space to be indexed in the [-2*w,2*w]
+ * range, in the following order:
+ * - [0] will be the top/left edge pixel;
+ * - [1..w] will be the top edge pixels (1 being left-most, w being right-most);
+ * - [w+1..2*w] will be the top/right edge pixels;
+ * - [-1..-w] will be the left edge pixels (-1 being top-most, -w being bottom-
+ * most);
+ * - [-w-1..-2*w] will be the bottom/left edge pixels.
+ * Each edge may remain uninitialized if it is not used by the returned mode
+ * index. If edges are not available (because the edge position is outside the
+ * tile dimensions or because edge_flags indicates lack of edge availability),
+ * they will be extended from nearby edges as defined by the av1 spec.
+ */
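+/* Worked example of the topleft_out layout above: for a 4x4 block (w = h = 4),
+ * [0] is the top-left pixel, [1..4] the top edge, [5..8] the top-right edge,
+ * [-1..-4] the left edge and [-5..-8] the bottom-left edge, so the scratch
+ * buffer must be addressable over the index range [-8..8]. */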
+enum IntraPredMode
+ bytefn(dav1d_prepare_intra_edges)(int x, int have_left, int y, int have_top,
+ int w, int h, enum EdgeFlags edge_flags,
+ const pixel *dst, ptrdiff_t stride,
+ const pixel *prefilter_toplevel_sb_edge,
+ enum IntraPredMode mode, int *angle,
+ int tw, int th, int filter_edge,
+ pixel *topleft_out HIGHBD_DECL_SUFFIX);
+
+// These flags are OR'd with the angle argument into intra predictors.
+// ANGLE_USE_EDGE_FILTER_FLAG signals that edges should be convolved
+// with a filter before using them to predict values in a block.
+// ANGLE_SMOOTH_EDGE_FLAG means that edges are smooth and should use
+// reduced filter strength.
+#define ANGLE_USE_EDGE_FILTER_FLAG 1024
+#define ANGLE_SMOOTH_EDGE_FLAG 512
+
+static inline int sm_flag(const BlockContext *const b, const int idx) {
+ if (!b->intra[idx]) return 0;
+ const enum IntraPredMode m = b->mode[idx];
+ return (m == SMOOTH_PRED || m == SMOOTH_H_PRED ||
+ m == SMOOTH_V_PRED) ? ANGLE_SMOOTH_EDGE_FLAG : 0;
+}
+
+static inline int sm_uv_flag(const BlockContext *const b, const int idx) {
+ const enum IntraPredMode m = b->uvmode[idx];
+ return (m == SMOOTH_PRED || m == SMOOTH_H_PRED ||
+ m == SMOOTH_V_PRED) ? ANGLE_SMOOTH_EDGE_FLAG : 0;
+}
+
+#endif /* DAV1D_SRC_IPRED_PREPARE_H */
diff --git a/third_party/dav1d/src/ipred_prepare_tmpl.c b/third_party/dav1d/src/ipred_prepare_tmpl.c
new file mode 100644
index 0000000000..0bf9de9418
--- /dev/null
+++ b/third_party/dav1d/src/ipred_prepare_tmpl.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/ipred_prepare.h"
+
+static const uint8_t av1_mode_conv[N_INTRA_PRED_MODES]
+ [2 /* have_left */][2 /* have_top */] =
+{
+ [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
+ { LEFT_DC_PRED, DC_PRED } },
+ [PAETH_PRED] = { { DC_128_PRED, VERT_PRED },
+ { HOR_PRED, PAETH_PRED } },
+};
+
+static const uint8_t av1_mode_to_angle_map[8] = {
+ 90, 180, 45, 135, 113, 157, 203, 67
+};
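+
+// e.g. VERT_PRED (90 degrees) with an angle_delta of +2 becomes 90 + 3 * 2 =
+// 96 degrees via the "*angle = base + 3 * *angle" conversion further below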
+
+static const struct {
+ uint8_t needs_left:1;
+ uint8_t needs_top:1;
+ uint8_t needs_topleft:1;
+ uint8_t needs_topright:1;
+ uint8_t needs_bottomleft:1;
+} av1_intra_prediction_edges[N_IMPL_INTRA_PRED_MODES] = {
+ [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
+ [VERT_PRED] = { .needs_top = 1 },
+ [HOR_PRED] = { .needs_left = 1 },
+ [LEFT_DC_PRED] = { .needs_left = 1 },
+ [TOP_DC_PRED] = { .needs_top = 1 },
+ [DC_128_PRED] = { 0 },
+ [Z1_PRED] = { .needs_top = 1, .needs_topright = 1,
+ .needs_topleft = 1 },
+ [Z2_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+ [Z3_PRED] = { .needs_left = 1, .needs_bottomleft = 1,
+ .needs_topleft = 1 },
+ [SMOOTH_PRED] = { .needs_left = 1, .needs_top = 1 },
+ [SMOOTH_V_PRED] = { .needs_left = 1, .needs_top = 1 },
+ [SMOOTH_H_PRED] = { .needs_left = 1, .needs_top = 1 },
+ [PAETH_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+ [FILTER_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+};
+
+enum IntraPredMode
+bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
+ const int y, const int have_top,
+ const int w, const int h,
+ const enum EdgeFlags edge_flags,
+ const pixel *const dst,
+ const ptrdiff_t stride,
+ const pixel *prefilter_toplevel_sb_edge,
+ enum IntraPredMode mode, int *const angle,
+ const int tw, const int th, const int filter_edge,
+ pixel *const topleft_out HIGHBD_DECL_SUFFIX)
+{
+ const int bitdepth = bitdepth_from_max(bitdepth_max);
+ assert(y < h && x < w);
+
+ switch (mode) {
+ case VERT_PRED:
+ case HOR_PRED:
+ case DIAG_DOWN_LEFT_PRED:
+ case DIAG_DOWN_RIGHT_PRED:
+ case VERT_RIGHT_PRED:
+ case HOR_DOWN_PRED:
+ case HOR_UP_PRED:
+ case VERT_LEFT_PRED: {
+ *angle = av1_mode_to_angle_map[mode - VERT_PRED] + 3 * *angle;
+
+ if (*angle <= 90)
+ mode = *angle < 90 && have_top ? Z1_PRED : VERT_PRED;
+ else if (*angle < 180)
+ mode = Z2_PRED;
+ else
+ mode = *angle > 180 && have_left ? Z3_PRED : HOR_PRED;
+ break;
+ }
+ case DC_PRED:
+ case PAETH_PRED:
+ mode = av1_mode_conv[mode][have_left][have_top];
+ break;
+ default:
+ break;
+ }
+
+ const pixel *dst_top;
+ if (have_top &&
+ (av1_intra_prediction_edges[mode].needs_top ||
+ av1_intra_prediction_edges[mode].needs_topleft ||
+ (av1_intra_prediction_edges[mode].needs_left && !have_left)))
+ {
+ if (prefilter_toplevel_sb_edge) {
+ dst_top = &prefilter_toplevel_sb_edge[x * 4];
+ } else {
+ dst_top = &dst[-PXSTRIDE(stride)];
+ }
+ }
+
+ if (av1_intra_prediction_edges[mode].needs_left) {
+ const int sz = th << 2;
+ pixel *const left = &topleft_out[-sz];
+
+ if (have_left) {
+ const int px_have = imin(sz, (h - y) << 2);
+
+ for (int i = 0; i < px_have; i++)
+ left[sz - 1 - i] = dst[PXSTRIDE(stride) * i - 1];
+ if (px_have < sz)
+ pixel_set(left, left[sz - px_have], sz - px_have);
+ } else {
+ pixel_set(left, have_top ? *dst_top : ((1 << bitdepth) >> 1) + 1, sz);
+ }
+
+ if (av1_intra_prediction_edges[mode].needs_bottomleft) {
+ const int have_bottomleft = (!have_left || y + th >= h) ? 0 :
+ (edge_flags & EDGE_I444_LEFT_HAS_BOTTOM);
+
+ if (have_bottomleft) {
+ const int px_have = imin(sz, (h - y - th) << 2);
+
+ for (int i = 0; i < px_have; i++)
+ left[-(i + 1)] = dst[(sz + i) * PXSTRIDE(stride) - 1];
+ if (px_have < sz)
+ pixel_set(left - sz, left[-px_have], sz - px_have);
+ } else {
+ pixel_set(left - sz, left[0], sz);
+ }
+ }
+ }
+
+ if (av1_intra_prediction_edges[mode].needs_top) {
+ const int sz = tw << 2;
+ pixel *const top = &topleft_out[1];
+
+ if (have_top) {
+ const int px_have = imin(sz, (w - x) << 2);
+ pixel_copy(top, dst_top, px_have);
+ if (px_have < sz)
+ pixel_set(top + px_have, top[px_have - 1], sz - px_have);
+ } else {
+ pixel_set(top, have_left ? dst[-1] : ((1 << bitdepth) >> 1) - 1, sz);
+ }
+
+ if (av1_intra_prediction_edges[mode].needs_topright) {
+ const int have_topright = (!have_top || x + tw >= w) ? 0 :
+ (edge_flags & EDGE_I444_TOP_HAS_RIGHT);
+
+ if (have_topright) {
+ const int px_have = imin(sz, (w - x - tw) << 2);
+
+ pixel_copy(top + sz, &dst_top[sz], px_have);
+ if (px_have < sz)
+ pixel_set(top + sz + px_have, top[sz + px_have - 1],
+ sz - px_have);
+ } else {
+ pixel_set(top + sz, top[sz - 1], sz);
+ }
+ }
+ }
+
+ if (av1_intra_prediction_edges[mode].needs_topleft) {
+ if (have_left)
+ *topleft_out = have_top ? dst_top[-1] : dst[-1];
+ else
+ *topleft_out = have_top ? *dst_top : (1 << bitdepth) >> 1;
+
+ if (mode == Z2_PRED && tw + th >= 6 && filter_edge)
+ *topleft_out = ((topleft_out[-1] + topleft_out[1]) * 5 +
+ topleft_out[0] * 6 + 8) >> 4;
+ }
+
+ return mode;
+}
diff --git a/third_party/dav1d/src/ipred_tmpl.c b/third_party/dav1d/src/ipred_tmpl.c
new file mode 100644
index 0000000000..997581674d
--- /dev/null
+++ b/third_party/dav1d/src/ipred_tmpl.c
@@ -0,0 +1,774 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/ipred.h"
+#include "src/tables.h"
+
+static NOINLINE void
+splat_dc(pixel *dst, const ptrdiff_t stride,
+ const int width, const int height, const int dc HIGHBD_DECL_SUFFIX)
+{
+#if BITDEPTH == 8
+ assert(dc <= 0xff);
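+    // replicate the 8-bit dc into every byte of a word so that each store
+    // below writes 8 (or, for narrow blocks, 4) pixels at once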
+ if (width > 4) {
+ const uint64_t dcN = dc * 0x0101010101010101ULL;
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x += sizeof(dcN))
+ *((uint64_t *) &dst[x]) = dcN;
+ dst += PXSTRIDE(stride);
+ }
+ } else {
+ const unsigned dcN = dc * 0x01010101U;
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x += sizeof(dcN))
+ *((unsigned *) &dst[x]) = dcN;
+ dst += PXSTRIDE(stride);
+ }
+ }
+#else
+ assert(dc <= bitdepth_max);
+ const uint64_t dcN = dc * 0x0001000100010001ULL;
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x += sizeof(dcN) >> 1)
+ *((uint64_t *) &dst[x]) = dcN;
+ dst += PXSTRIDE(stride);
+ }
+#endif
+}
+
+static NOINLINE void
+cfl_pred(pixel *dst, const ptrdiff_t stride,
+ const int width, const int height, const int dc,
+ const int16_t *ac, const int alpha HIGHBD_DECL_SUFFIX)
+{
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x++) {
+ const int diff = alpha * ac[x];
+ dst[x] = iclip_pixel(dc + apply_sign((abs(diff) + 32) >> 6, diff));
+ }
+ ac += width;
+ dst += PXSTRIDE(stride);
+ }
+}
+
+static unsigned dc_gen_top(const pixel *const topleft, const int width) {
+ unsigned dc = width >> 1;
+ for (int i = 0; i < width; i++)
+ dc += topleft[1 + i];
+ return dc >> ctz(width);
+}
+
+static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ splat_dc(dst, stride, width, height, dc_gen_top(topleft, width)
+ HIGHBD_TAIL_SUFFIX);
+}
+
+static void ipred_cfl_top_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height,
+ const int16_t *ac, const int alpha
+ HIGHBD_DECL_SUFFIX)
+{
+ cfl_pred(dst, stride, width, height, dc_gen_top(topleft, width), ac, alpha
+ HIGHBD_TAIL_SUFFIX);
+}
+
+static unsigned dc_gen_left(const pixel *const topleft, const int height) {
+ unsigned dc = height >> 1;
+ for (int i = 0; i < height; i++)
+ dc += topleft[-(1 + i)];
+ return dc >> ctz(height);
+}
+
+static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ splat_dc(dst, stride, width, height, dc_gen_left(topleft, height)
+ HIGHBD_TAIL_SUFFIX);
+}
+
+static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height,
+ const int16_t *ac, const int alpha
+ HIGHBD_DECL_SUFFIX)
+{
+ const unsigned dc = dc_gen_left(topleft, height);
+ cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
+}
+
+#if BITDEPTH == 8
+#define MULTIPLIER_1x2 0x5556
+#define MULTIPLIER_1x4 0x3334
+#define BASE_SHIFT 16
+#else
+#define MULTIPLIER_1x2 0xAAAB
+#define MULTIPLIER_1x4 0x6667
+#define BASE_SHIFT 17
+#endif
+
+static unsigned dc_gen(const pixel *const topleft,
+ const int width, const int height)
+{
+ unsigned dc = (width + height) >> 1;
+ for (int i = 0; i < width; i++)
+ dc += topleft[i + 1];
+ for (int i = 0; i < height; i++)
+ dc += topleft[-(i + 1)];
+ dc >>= ctz(width + height);
+
+ if (width != height) {
+ dc *= (width > height * 2 || height > width * 2) ? MULTIPLIER_1x4 :
+ MULTIPLIER_1x2;
+ dc >>= BASE_SHIFT;
+ }
+ return dc;
+}
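+
+/* For rectangular blocks width + height is not a power of two, so the shift by
+ * ctz(width + height) above only divides by its power-of-two factor; the
+ * fixed-point multipliers then complete the division by the remaining factor
+ * of 3 (MULTIPLIER_1x2) or 5 (MULTIPLIER_1x4). E.g. a 16x4 block has
+ * width + height = 20: dc is shifted right by 2 and then scaled by ~1/5. */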
+
+static void ipred_dc_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ splat_dc(dst, stride, width, height, dc_gen(topleft, width, height)
+ HIGHBD_TAIL_SUFFIX);
+}
+
+static void ipred_cfl_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height,
+ const int16_t *ac, const int alpha
+ HIGHBD_DECL_SUFFIX)
+{
+ unsigned dc = dc_gen(topleft, width, height);
+ cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
+}
+
+#undef MULTIPLIER_1x2
+#undef MULTIPLIER_1x4
+#undef BASE_SHIFT
+
+static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+#if BITDEPTH == 16
+ const int dc = (bitdepth_max + 1) >> 1;
+#else
+ const int dc = 128;
+#endif
+ splat_dc(dst, stride, width, height, dc HIGHBD_TAIL_SUFFIX);
+}
+
+static void ipred_cfl_128_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height,
+ const int16_t *ac, const int alpha
+ HIGHBD_DECL_SUFFIX)
+{
+#if BITDEPTH == 16
+ const int dc = (bitdepth_max + 1) >> 1;
+#else
+ const int dc = 128;
+#endif
+ cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
+}
+
+static void ipred_v_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ for (int y = 0; y < height; y++) {
+ pixel_copy(dst, topleft + 1, width);
+ dst += PXSTRIDE(stride);
+ }
+}
+
+static void ipred_h_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ for (int y = 0; y < height; y++) {
+ pixel_set(dst, topleft[-(1 + y)], width);
+ dst += PXSTRIDE(stride);
+ }
+}
+
+static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const tl_ptr,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
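+    // Paeth: each pixel takes whichever of left/top/topleft is closest to the
+    // gradient estimate left + top - topleft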
+ const int topleft = tl_ptr[0];
+ for (int y = 0; y < height; y++) {
+ const int left = tl_ptr[-(y + 1)];
+ for (int x = 0; x < width; x++) {
+ const int top = tl_ptr[1 + x];
+ const int base = left + top - topleft;
+ const int ldiff = abs(left - base);
+ const int tdiff = abs(top - base);
+ const int tldiff = abs(topleft - base);
+
+ dst[x] = ldiff <= tdiff && ldiff <= tldiff ? left :
+ tdiff <= tldiff ? top : topleft;
+ }
+ dst += PXSTRIDE(stride);
+ }
+}
+
+static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const uint8_t *const weights_hor = &dav1d_sm_weights[width];
+ const uint8_t *const weights_ver = &dav1d_sm_weights[height];
+ const int right = topleft[width], bottom = topleft[-height];
+
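+    // the vertical and horizontal interpolations each use weights summing to
+    // 256, so the final (pred + 256) >> 9 averages the two blends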
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x++) {
+ const int pred = weights_ver[y] * topleft[1 + x] +
+ (256 - weights_ver[y]) * bottom +
+ weights_hor[x] * topleft[-(1 + y)] +
+ (256 - weights_hor[x]) * right;
+ dst[x] = (pred + 256) >> 9;
+ }
+ dst += PXSTRIDE(stride);
+ }
+}
+
+static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const uint8_t *const weights_ver = &dav1d_sm_weights[height];
+ const int bottom = topleft[-height];
+
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x++) {
+ const int pred = weights_ver[y] * topleft[1 + x] +
+ (256 - weights_ver[y]) * bottom;
+ dst[x] = (pred + 128) >> 8;
+ }
+ dst += PXSTRIDE(stride);
+ }
+}
+
+static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const uint8_t *const weights_hor = &dav1d_sm_weights[width];
+ const int right = topleft[width];
+
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x++) {
+ const int pred = weights_hor[x] * topleft[-(y + 1)] +
+ (256 - weights_hor[x]) * right;
+ dst[x] = (pred + 128) >> 8;
+ }
+ dst += PXSTRIDE(stride);
+ }
+}
+
+static NOINLINE int get_filter_strength(const int wh, const int angle,
+ const int is_sm)
+{
+ if (is_sm) {
+ if (wh <= 8) {
+ if (angle >= 64) return 2;
+ if (angle >= 40) return 1;
+ } else if (wh <= 16) {
+ if (angle >= 48) return 2;
+ if (angle >= 20) return 1;
+ } else if (wh <= 24) {
+ if (angle >= 4) return 3;
+ } else {
+ return 3;
+ }
+ } else {
+ if (wh <= 8) {
+ if (angle >= 56) return 1;
+ } else if (wh <= 16) {
+ if (angle >= 40) return 1;
+ } else if (wh <= 24) {
+ if (angle >= 32) return 3;
+ if (angle >= 16) return 2;
+ if (angle >= 8) return 1;
+ } else if (wh <= 32) {
+ if (angle >= 32) return 3;
+ if (angle >= 4) return 2;
+ return 1;
+ } else {
+ return 3;
+ }
+ }
+ return 0;
+}
+
+static NOINLINE void filter_edge(pixel *const out, const int sz,
+ const int lim_from, const int lim_to,
+ const pixel *const in, const int from,
+ const int to, const int strength)
+{
+ static const uint8_t kernel[3][5] = {
+ { 0, 4, 8, 4, 0 },
+ { 0, 5, 6, 5, 0 },
+ { 2, 4, 4, 4, 2 }
+ };
+
+ assert(strength > 0);
+ int i = 0;
+ for (; i < imin(sz, lim_from); i++)
+ out[i] = in[iclip(i, from, to - 1)];
+ for (; i < imin(lim_to, sz); i++) {
+ int s = 0;
+ for (int j = 0; j < 5; j++)
+ s += in[iclip(i - 2 + j, from, to - 1)] * kernel[strength - 1][j];
+ out[i] = (s + 8) >> 4;
+ }
+ for (; i < sz; i++)
+ out[i] = in[iclip(i, from, to - 1)];
+}
+
+static inline int get_upsample(const int wh, const int angle, const int is_sm) {
+ return angle < 40 && wh <= 16 >> is_sm;
+}
+
+static NOINLINE void upsample_edge(pixel *const out, const int hsz,
+ const pixel *const in, const int from,
+ const int to HIGHBD_DECL_SUFFIX)
+{
+ static const int8_t kernel[4] = { -1, 9, 9, -1 };
+ int i;
+ for (i = 0; i < hsz - 1; i++) {
+ out[i * 2] = in[iclip(i, from, to - 1)];
+
+ int s = 0;
+ for (int j = 0; j < 4; j++)
+ s += in[iclip(i + j - 1, from, to - 1)] * kernel[j];
+ out[i * 2 + 1] = iclip_pixel((s + 8) >> 4);
+ }
+ out[i * 2] = in[iclip(i, from, to - 1)];
+}
+
+static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const int is_sm = (angle >> 9) & 0x1;
+ const int enable_intra_edge_filter = angle >> 10;
+ angle &= 511;
+ assert(angle < 90);
+ int dx = dav1d_dr_intra_derivative[angle >> 1];
+ pixel top_out[64 + 64];
+ const pixel *top;
+ int max_base_x;
+ const int upsample_above = enable_intra_edge_filter ?
+ get_upsample(width + height, 90 - angle, is_sm) : 0;
+ if (upsample_above) {
+ upsample_edge(top_out, width + height, &topleft_in[1], -1,
+ width + imin(width, height) HIGHBD_TAIL_SUFFIX);
+ top = top_out;
+ max_base_x = 2 * (width + height) - 2;
+ dx <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, 90 - angle, is_sm) : 0;
+ if (filter_strength) {
+ filter_edge(top_out, width + height, 0, width + height,
+ &topleft_in[1], -1, width + imin(width, height),
+ filter_strength);
+ top = top_out;
+ max_base_x = width + height - 1;
+ } else {
+ top = &topleft_in[1];
+ max_base_x = width + imin(width, height) - 1;
+ }
+ }
+ const int base_inc = 1 + upsample_above;
+ for (int y = 0, xpos = dx; y < height;
+ y++, dst += PXSTRIDE(stride), xpos += dx)
+ {
+ const int frac = xpos & 0x3E;
+
+ for (int x = 0, base = xpos >> 6; x < width; x++, base += base_inc) {
+ if (base < max_base_x) {
+ const int v = top[base] * (64 - frac) + top[base + 1] * frac;
+ dst[x] = (v + 32) >> 6;
+ } else {
+ pixel_set(&dst[x], top[max_base_x], width - x);
+ break;
+ }
+ }
+ }
+}
+
+static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const int is_sm = (angle >> 9) & 0x1;
+ const int enable_intra_edge_filter = angle >> 10;
+ angle &= 511;
+ assert(angle > 90 && angle < 180);
+ int dy = dav1d_dr_intra_derivative[(angle - 90) >> 1];
+ int dx = dav1d_dr_intra_derivative[(180 - angle) >> 1];
+ const int upsample_left = enable_intra_edge_filter ?
+ get_upsample(width + height, 180 - angle, is_sm) : 0;
+ const int upsample_above = enable_intra_edge_filter ?
+ get_upsample(width + height, angle - 90, is_sm) : 0;
+ pixel edge[64 + 64 + 1];
+ pixel *const topleft = &edge[64];
+
+ if (upsample_above) {
+ upsample_edge(topleft, width + 1, topleft_in, 0, width + 1
+ HIGHBD_TAIL_SUFFIX);
+ dx <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, angle - 90, is_sm) : 0;
+
+ if (filter_strength) {
+ filter_edge(&topleft[1], width, 0, max_width,
+ &topleft_in[1], -1, width,
+ filter_strength);
+ } else {
+ pixel_copy(&topleft[1], &topleft_in[1], width);
+ }
+ }
+ if (upsample_left) {
+ upsample_edge(&topleft[-height * 2], height + 1, &topleft_in[-height],
+ 0, height + 1 HIGHBD_TAIL_SUFFIX);
+ dy <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, 180 - angle, is_sm) : 0;
+
+ if (filter_strength) {
+ filter_edge(&topleft[-height], height, height - max_height, height,
+ &topleft_in[-height],
+ 0, height + 1, filter_strength);
+ } else {
+ pixel_copy(&topleft[-height], &topleft_in[-height], height);
+ }
+ }
+ *topleft = *topleft_in;
+
+ const int base_inc_x = 1 + upsample_above;
+ const pixel *const left = &topleft[-(1 + upsample_left)];
+ for (int y = 0, xpos = ((1 + upsample_above) << 6) - dx; y < height;
+ y++, xpos -= dx, dst += PXSTRIDE(stride))
+ {
+ int base_x = xpos >> 6;
+ const int frac_x = xpos & 0x3E;
+
+ for (int x = 0, ypos = (y << (6 + upsample_left)) - dy; x < width;
+ x++, base_x += base_inc_x, ypos -= dy)
+ {
+ int v;
+ if (base_x >= 0) {
+ v = topleft[base_x] * (64 - frac_x) +
+ topleft[base_x + 1] * frac_x;
+ } else {
+ const int base_y = ypos >> 6;
+ assert(base_y >= -(1 + upsample_left));
+ const int frac_y = ypos & 0x3E;
+ v = left[-base_y] * (64 - frac_y) +
+ left[-(base_y + 1)] * frac_y;
+ }
+ dst[x] = (v + 32) >> 6;
+ }
+ }
+}
+
+static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const int is_sm = (angle >> 9) & 0x1;
+ const int enable_intra_edge_filter = angle >> 10;
+ angle &= 511;
+ assert(angle > 180);
+ int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1];
+ pixel left_out[64 + 64];
+ const pixel *left;
+ int max_base_y;
+ const int upsample_left = enable_intra_edge_filter ?
+ get_upsample(width + height, angle - 180, is_sm) : 0;
+ if (upsample_left) {
+ upsample_edge(left_out, width + height,
+ &topleft_in[-(width + height)],
+ imax(width - height, 0), width + height + 1
+ HIGHBD_TAIL_SUFFIX);
+ left = &left_out[2 * (width + height) - 2];
+ max_base_y = 2 * (width + height) - 2;
+ dy <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, angle - 180, is_sm) : 0;
+
+ if (filter_strength) {
+ filter_edge(left_out, width + height, 0, width + height,
+ &topleft_in[-(width + height)],
+ imax(width - height, 0), width + height + 1,
+ filter_strength);
+ left = &left_out[width + height - 1];
+ max_base_y = width + height - 1;
+ } else {
+ left = &topleft_in[-1];
+ max_base_y = height + imin(width, height) - 1;
+ }
+ }
+ const int base_inc = 1 + upsample_left;
+ for (int x = 0, ypos = dy; x < width; x++, ypos += dy) {
+ const int frac = ypos & 0x3E;
+
+ for (int y = 0, base = ypos >> 6; y < height; y++, base += base_inc) {
+ if (base < max_base_y) {
+ const int v = left[-base] * (64 - frac) +
+ left[-(base + 1)] * frac;
+ dst[y * PXSTRIDE(stride) + x] = (v + 32) >> 6;
+ } else {
+ do {
+ dst[y * PXSTRIDE(stride) + x] = left[-max_base_y];
+ } while (++y < height);
+ break;
+ }
+ }
+ }
+}
+
+#if ARCH_X86
+#define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6) \
+ flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 + \
+ flt_ptr[16] * p2 + flt_ptr[17] * p3 + \
+ flt_ptr[32] * p4 + flt_ptr[33] * p5 + \
+ flt_ptr[48] * p6
+#define FLT_INCR 2
+#else
+#define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6) \
+ flt_ptr[ 0] * p0 + flt_ptr[ 8] * p1 + \
+ flt_ptr[16] * p2 + flt_ptr[24] * p3 + \
+ flt_ptr[32] * p4 + flt_ptr[40] * p5 + \
+ flt_ptr[48] * p6
+#define FLT_INCR 1
+#endif
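+/* Note: FLT_INCR and the offsets above mirror the memory layout of
+ * dav1d_filter_intra_taps, which differs per target: on x86 the taps appear
+ * to be stored with two taps interleaved per pixel (presumably to match the
+ * SIMD loads), while other targets use a plain tap-major layout with 8
+ * entries per tap. */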
+
+/* Up to 32x32 only */
+static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int filt_idx,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ filt_idx &= 511;
+ assert(filt_idx < 5);
+
+ const int8_t *const filter = dav1d_filter_intra_taps[filt_idx];
+ const pixel *top = &topleft_in[1];
+ for (int y = 0; y < height; y += 2) {
+ const pixel *topleft = &topleft_in[-y];
+ const pixel *left = &topleft[-1];
+ ptrdiff_t left_stride = -1;
+ for (int x = 0; x < width; x += 4) {
+ const int p0 = *topleft;
+ const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3];
+ const int p5 = left[0 * left_stride], p6 = left[1 * left_stride];
+ pixel *ptr = &dst[x];
+ const int8_t *flt_ptr = filter;
+
+ for (int yy = 0; yy < 2; yy++) {
+ for (int xx = 0; xx < 4; xx++, flt_ptr += FLT_INCR) {
+ const int acc = FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6);
+ ptr[xx] = iclip_pixel((acc + 8) >> 4);
+ }
+ ptr += PXSTRIDE(stride);
+ }
+ left = &dst[x + 4 - 1];
+ left_stride = PXSTRIDE(stride);
+ top += 4;
+ topleft = &top[-1];
+ }
+ top = &dst[PXSTRIDE(stride)];
+ dst = &dst[PXSTRIDE(stride) * 2];
+ }
+}
+
+static NOINLINE void
+cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
+ const int w_pad, const int h_pad, const int width, const int height,
+ const int ss_hor, const int ss_ver)
+{
+ int y, x;
+ int16_t *const ac_orig = ac;
+
+ assert(w_pad >= 0 && w_pad * 4 < width);
+ assert(h_pad >= 0 && h_pad * 4 < height);
+
+ for (y = 0; y < height - 4 * h_pad; y++) {
+ for (x = 0; x < width - 4 * w_pad; x++) {
+ int ac_sum = ypx[x << ss_hor];
+ if (ss_hor) ac_sum += ypx[x * 2 + 1];
+ if (ss_ver) {
+ ac_sum += ypx[(x << ss_hor) + PXSTRIDE(stride)];
+ if (ss_hor) ac_sum += ypx[x * 2 + 1 + PXSTRIDE(stride)];
+ }
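+            // 4:2:0 sums 4 luma samples, 4:2:2 sums 2 and 4:4:4 sums 1; the
+            // shift below scales each case to 8x the average covered luma
+            // value (3 fractional bits)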
+ ac[x] = ac_sum << (1 + !ss_ver + !ss_hor);
+ }
+ for (; x < width; x++)
+ ac[x] = ac[x - 1];
+ ac += width;
+ ypx += PXSTRIDE(stride) << ss_ver;
+ }
+ for (; y < height; y++) {
+ memcpy(ac, &ac[-width], width * sizeof(*ac));
+ ac += width;
+ }
+
+ const int log2sz = ctz(width) + ctz(height);
+ int sum = (1 << log2sz) >> 1;
+ for (ac = ac_orig, y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ sum += ac[x];
+ ac += width;
+ }
+ sum >>= log2sz;
+
+ // subtract DC
+ for (ac = ac_orig, y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ ac[x] -= sum;
+ ac += width;
+ }
+}
+
+#define cfl_ac_fn(fmt, ss_hor, ss_ver) \
+static void cfl_ac_##fmt##_c(int16_t *const ac, const pixel *const ypx, \
+ const ptrdiff_t stride, const int w_pad, \
+ const int h_pad, const int cw, const int ch) \
+{ \
+ cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver); \
+}
+
+cfl_ac_fn(420, 1, 1)
+cfl_ac_fn(422, 1, 0)
+cfl_ac_fn(444, 0, 0)
+
+static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const pal, const uint8_t *idx,
+ const int w, const int h)
+{
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x += 2) {
+ const int i = *idx++;
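+            // each idx byte packs two 3-bit palette indices: the low nibble
+            // is the left pixel of the pair, the high nibble the right one;
+            // the assert checks that both stay below 8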
+ assert(!(i & 0x88));
+ dst[x + 0] = pal[i & 7];
+ dst[x + 1] = pal[i >> 4];
+ }
+ dst += PXSTRIDE(stride);
+ }
+}
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/ipred.h"
+#elif ARCH_X86
+#include "src/x86/ipred.h"
+#endif
+#endif
+
+COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
+ c->intra_pred[DC_PRED ] = ipred_dc_c;
+ c->intra_pred[DC_128_PRED ] = ipred_dc_128_c;
+ c->intra_pred[TOP_DC_PRED ] = ipred_dc_top_c;
+ c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c;
+ c->intra_pred[HOR_PRED ] = ipred_h_c;
+ c->intra_pred[VERT_PRED ] = ipred_v_c;
+ c->intra_pred[PAETH_PRED ] = ipred_paeth_c;
+ c->intra_pred[SMOOTH_PRED ] = ipred_smooth_c;
+ c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
+ c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
+ c->intra_pred[Z1_PRED ] = ipred_z1_c;
+ c->intra_pred[Z2_PRED ] = ipred_z2_c;
+ c->intra_pred[Z3_PRED ] = ipred_z3_c;
+ c->intra_pred[FILTER_PRED ] = ipred_filter_c;
+
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = cfl_ac_420_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = cfl_ac_422_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = cfl_ac_444_c;
+
+ c->cfl_pred[DC_PRED ] = ipred_cfl_c;
+ c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c;
+ c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c;
+ c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c;
+
+ c->pal_pred = pal_pred_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ intra_pred_dsp_init_arm(c);
+#elif ARCH_X86
+ intra_pred_dsp_init_x86(c);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/itx.h b/third_party/dav1d/src/itx.h
new file mode 100644
index 0000000000..d522079907
--- /dev/null
+++ b/third_party/dav1d/src/itx.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ITX_H
+#define DAV1D_SRC_ITX_H
+
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+
+#define decl_itx_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob \
+ HIGHBD_DECL_SUFFIX)
+typedef decl_itx_fn(*itxfm_fn);
+
+typedef struct Dav1dInvTxfmDSPContext {
+ itxfm_fn itxfm_add[N_RECT_TX_SIZES][N_TX_TYPES_PLUS_LL];
+} Dav1dInvTxfmDSPContext;
+
+bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc);
+
+#endif /* DAV1D_SRC_ITX_H */
diff --git a/third_party/dav1d/src/itx_1d.c b/third_party/dav1d/src/itx_1d.c
new file mode 100644
index 0000000000..ca14fc8c41
--- /dev/null
+++ b/third_party/dav1d/src/itx_1d.c
@@ -0,0 +1,1034 @@
+/*
+ * Copyright © 2018-2019, VideoLAN and dav1d authors
+ * Copyright © 2018-2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "common/intops.h"
+
+#include "src/itx_1d.h"
+
+#define CLIP(a) iclip(a, min, max)
+
+/*
+ * In some places, we use a pattern like this:
+ * t2 = ((in1 * 1567 - in3 * (3784 - 4096) + 2048) >> 12) - in3;
+ * even though the reference code might use something like:
+ * t2 = (in1 * 1567 - in3 * 3784 + 2048) >> 12;
+ *
+ * The reason is that for 12 bits/component bitstreams (corrupt/invalid
+ * ones, but codable nonetheless), each coefficient or input can be
+ * 19(+sign) bits, so if the two multipliers (each up to 12 bits) sum to
+ * 4096 or more, the add/sub after the pair of multiplies can exceed the
+ * 31+sign bit range. Signed integer overflow is UB in C, and we'd like
+ * to prevent that.
+ *
+ * To work around this, we subtract 4096 from one of the two multipliers
+ * and compensate with an extra add/sub of the input outside the shift
+ * (or, if both multipliers are even, we halve them and shift by one bit
+ * less). SIMD implementations do not have to follow this exact behaviour:
+ * the AV1 spec clearly states that the result of the multiply/add pairs
+ * must fit in 31+sign bit intermediates, and that streams violating this
+ * are not AV1-compliant. As long as we don't trigger UB (which some would
+ * consider a security vulnerability), we're fine, so SIMD can simply use
+ * the faster implementation even if it overflows on such streams; they
+ * are not valid AV1 anyway, and in e.g. x86 assembly integer overflow is
+ * not UB, it merely wraps around.
+ */
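+
+/* Concretely, the two forms above are bit-exact (given arithmetic right
+ * shifts): since in3 * 4096 is a multiple of the 2^12 divisor,
+ *   (in1 * 1567 - in3 * 3784 + 2048) >> 12
+ *     = (in1 * 1567 - in3 * (3784 - 4096) - in3 * 4096 + 2048) >> 12
+ *     = ((in1 * 1567 - in3 * (3784 - 4096) + 2048) >> 12) - in3,
+ * but the rewritten form multiplies in3 by |3784 - 4096| = 312 instead of
+ * 3784, keeping the intermediate sum within 31+sign bits. */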
+
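+/* The fixed-point constants below are rounded cosines: 181/256 ~= 1/sqrt(2),
+ * 1567/4096 ~= cos(3*pi/8), 3784/4096 ~= sin(3*pi/8), etc.; where both taps
+ * are even they are halved and the shift reduced to 11 (e.g. 1703/1138). */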
+static NOINLINE void
+inv_dct4_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max, const int tx64)
+{
+ assert(stride > 0);
+ const int in0 = c[0 * stride], in1 = c[1 * stride];
+
+ int t0, t1, t2, t3;
+ if (tx64) {
+ t0 = t1 = (in0 * 181 + 128) >> 8;
+ t2 = (in1 * 1567 + 2048) >> 12;
+ t3 = (in1 * 3784 + 2048) >> 12;
+ } else {
+ const int in2 = c[2 * stride], in3 = c[3 * stride];
+
+ t0 = ((in0 + in2) * 181 + 128) >> 8;
+ t1 = ((in0 - in2) * 181 + 128) >> 8;
+ t2 = ((in1 * 1567 - in3 * (3784 - 4096) + 2048) >> 12) - in3;
+ t3 = ((in1 * (3784 - 4096) + in3 * 1567 + 2048) >> 12) + in1;
+ }
+
+ c[0 * stride] = CLIP(t0 + t3);
+ c[1 * stride] = CLIP(t1 + t2);
+ c[2 * stride] = CLIP(t1 - t2);
+ c[3 * stride] = CLIP(t0 - t3);
+}
+
+void dav1d_inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ inv_dct4_1d_internal_c(c, stride, min, max, 0);
+}
+
+static NOINLINE void
+inv_dct8_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max, const int tx64)
+{
+ assert(stride > 0);
+ inv_dct4_1d_internal_c(c, stride << 1, min, max, tx64);
+
+ const int in1 = c[1 * stride], in3 = c[3 * stride];
+
+ int t4a, t5a, t6a, t7a;
+ if (tx64) {
+ t4a = (in1 * 799 + 2048) >> 12;
+ t5a = (in3 * -2276 + 2048) >> 12;
+ t6a = (in3 * 3406 + 2048) >> 12;
+ t7a = (in1 * 4017 + 2048) >> 12;
+ } else {
+ const int in5 = c[5 * stride], in7 = c[7 * stride];
+
+ t4a = ((in1 * 799 - in7 * (4017 - 4096) + 2048) >> 12) - in7;
+ t5a = (in5 * 1703 - in3 * 1138 + 1024) >> 11;
+ t6a = (in5 * 1138 + in3 * 1703 + 1024) >> 11;
+ t7a = ((in1 * (4017 - 4096) + in7 * 799 + 2048) >> 12) + in1;
+ }
+
+ const int t4 = CLIP(t4a + t5a);
+ t5a = CLIP(t4a - t5a);
+ const int t7 = CLIP(t7a + t6a);
+ t6a = CLIP(t7a - t6a);
+
+ const int t5 = ((t6a - t5a) * 181 + 128) >> 8;
+ const int t6 = ((t6a + t5a) * 181 + 128) >> 8;
+
+ const int t0 = c[0 * stride];
+ const int t1 = c[2 * stride];
+ const int t2 = c[4 * stride];
+ const int t3 = c[6 * stride];
+
+ c[0 * stride] = CLIP(t0 + t7);
+ c[1 * stride] = CLIP(t1 + t6);
+ c[2 * stride] = CLIP(t2 + t5);
+ c[3 * stride] = CLIP(t3 + t4);
+ c[4 * stride] = CLIP(t3 - t4);
+ c[5 * stride] = CLIP(t2 - t5);
+ c[6 * stride] = CLIP(t1 - t6);
+ c[7 * stride] = CLIP(t0 - t7);
+}
+
+void dav1d_inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ inv_dct8_1d_internal_c(c, stride, min, max, 0);
+}
+
+static NOINLINE void
+inv_dct16_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max, int tx64)
+{
+ assert(stride > 0);
+ inv_dct8_1d_internal_c(c, stride << 1, min, max, tx64);
+
+ const int in1 = c[1 * stride], in3 = c[3 * stride];
+ const int in5 = c[5 * stride], in7 = c[7 * stride];
+
+ int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
+ if (tx64) {
+ t8a = (in1 * 401 + 2048) >> 12;
+ t9a = (in7 * -2598 + 2048) >> 12;
+ t10a = (in5 * 1931 + 2048) >> 12;
+ t11a = (in3 * -1189 + 2048) >> 12;
+ t12a = (in3 * 3920 + 2048) >> 12;
+ t13a = (in5 * 3612 + 2048) >> 12;
+ t14a = (in7 * 3166 + 2048) >> 12;
+ t15a = (in1 * 4076 + 2048) >> 12;
+ } else {
+ const int in9 = c[ 9 * stride], in11 = c[11 * stride];
+ const int in13 = c[13 * stride], in15 = c[15 * stride];
+
+ t8a = ((in1 * 401 - in15 * (4076 - 4096) + 2048) >> 12) - in15;
+ t9a = (in9 * 1583 - in7 * 1299 + 1024) >> 11;
+ t10a = ((in5 * 1931 - in11 * (3612 - 4096) + 2048) >> 12) - in11;
+ t11a = ((in13 * (3920 - 4096) - in3 * 1189 + 2048) >> 12) + in13;
+ t12a = ((in13 * 1189 + in3 * (3920 - 4096) + 2048) >> 12) + in3;
+ t13a = ((in5 * (3612 - 4096) + in11 * 1931 + 2048) >> 12) + in5;
+ t14a = (in9 * 1299 + in7 * 1583 + 1024) >> 11;
+ t15a = ((in1 * (4076 - 4096) + in15 * 401 + 2048) >> 12) + in1;
+ }
+
+ int t8 = CLIP(t8a + t9a);
+ int t9 = CLIP(t8a - t9a);
+ int t10 = CLIP(t11a - t10a);
+ int t11 = CLIP(t11a + t10a);
+ int t12 = CLIP(t12a + t13a);
+ int t13 = CLIP(t12a - t13a);
+ int t14 = CLIP(t15a - t14a);
+ int t15 = CLIP(t15a + t14a);
+
+ t9a = (( t14 * 1567 - t9 * (3784 - 4096) + 2048) >> 12) - t9;
+ t14a = (( t14 * (3784 - 4096) + t9 * 1567 + 2048) >> 12) + t14;
+ t10a = ((-(t13 * (3784 - 4096) + t10 * 1567) + 2048) >> 12) - t13;
+ t13a = (( t13 * 1567 - t10 * (3784 - 4096) + 2048) >> 12) - t10;
+
+ t8a = CLIP(t8 + t11);
+ t9 = CLIP(t9a + t10a);
+ t10 = CLIP(t9a - t10a);
+ t11a = CLIP(t8 - t11);
+ t12a = CLIP(t15 - t12);
+ t13 = CLIP(t14a - t13a);
+ t14 = CLIP(t14a + t13a);
+ t15a = CLIP(t15 + t12);
+
+ t10a = ((t13 - t10) * 181 + 128) >> 8;
+ t13a = ((t13 + t10) * 181 + 128) >> 8;
+ t11 = ((t12a - t11a) * 181 + 128) >> 8;
+ t12 = ((t12a + t11a) * 181 + 128) >> 8;
+
+ const int t0 = c[ 0 * stride];
+ const int t1 = c[ 2 * stride];
+ const int t2 = c[ 4 * stride];
+ const int t3 = c[ 6 * stride];
+ const int t4 = c[ 8 * stride];
+ const int t5 = c[10 * stride];
+ const int t6 = c[12 * stride];
+ const int t7 = c[14 * stride];
+
+ c[ 0 * stride] = CLIP(t0 + t15a);
+ c[ 1 * stride] = CLIP(t1 + t14);
+ c[ 2 * stride] = CLIP(t2 + t13a);
+ c[ 3 * stride] = CLIP(t3 + t12);
+ c[ 4 * stride] = CLIP(t4 + t11);
+ c[ 5 * stride] = CLIP(t5 + t10a);
+ c[ 6 * stride] = CLIP(t6 + t9);
+ c[ 7 * stride] = CLIP(t7 + t8a);
+ c[ 8 * stride] = CLIP(t7 - t8a);
+ c[ 9 * stride] = CLIP(t6 - t9);
+ c[10 * stride] = CLIP(t5 - t10a);
+ c[11 * stride] = CLIP(t4 - t11);
+ c[12 * stride] = CLIP(t3 - t12);
+ c[13 * stride] = CLIP(t2 - t13a);
+ c[14 * stride] = CLIP(t1 - t14);
+ c[15 * stride] = CLIP(t0 - t15a);
+}
+
+void dav1d_inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ inv_dct16_1d_internal_c(c, stride, min, max, 0);
+}
+
+static NOINLINE void
+inv_dct32_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max, const int tx64)
+{
+ assert(stride > 0);
+ inv_dct16_1d_internal_c(c, stride << 1, min, max, tx64);
+
+ const int in1 = c[ 1 * stride], in3 = c[ 3 * stride];
+ const int in5 = c[ 5 * stride], in7 = c[ 7 * stride];
+ const int in9 = c[ 9 * stride], in11 = c[11 * stride];
+ const int in13 = c[13 * stride], in15 = c[15 * stride];
+
+ int t16a, t17a, t18a, t19a, t20a, t21a, t22a, t23a;
+ int t24a, t25a, t26a, t27a, t28a, t29a, t30a, t31a;
+ if (tx64) {
+ t16a = (in1 * 201 + 2048) >> 12;
+ t17a = (in15 * -2751 + 2048) >> 12;
+ t18a = (in9 * 1751 + 2048) >> 12;
+ t19a = (in7 * -1380 + 2048) >> 12;
+ t20a = (in5 * 995 + 2048) >> 12;
+ t21a = (in11 * -2106 + 2048) >> 12;
+ t22a = (in13 * 2440 + 2048) >> 12;
+ t23a = (in3 * -601 + 2048) >> 12;
+ t24a = (in3 * 4052 + 2048) >> 12;
+ t25a = (in13 * 3290 + 2048) >> 12;
+ t26a = (in11 * 3513 + 2048) >> 12;
+ t27a = (in5 * 3973 + 2048) >> 12;
+ t28a = (in7 * 3857 + 2048) >> 12;
+ t29a = (in9 * 3703 + 2048) >> 12;
+ t30a = (in15 * 3035 + 2048) >> 12;
+ t31a = (in1 * 4091 + 2048) >> 12;
+ } else {
+ const int in17 = c[17 * stride], in19 = c[19 * stride];
+ const int in21 = c[21 * stride], in23 = c[23 * stride];
+ const int in25 = c[25 * stride], in27 = c[27 * stride];
+ const int in29 = c[29 * stride], in31 = c[31 * stride];
+
+ t16a = ((in1 * 201 - in31 * (4091 - 4096) + 2048) >> 12) - in31;
+ t17a = ((in17 * (3035 - 4096) - in15 * 2751 + 2048) >> 12) + in17;
+ t18a = ((in9 * 1751 - in23 * (3703 - 4096) + 2048) >> 12) - in23;
+ t19a = ((in25 * (3857 - 4096) - in7 * 1380 + 2048) >> 12) + in25;
+ t20a = ((in5 * 995 - in27 * (3973 - 4096) + 2048) >> 12) - in27;
+ t21a = ((in21 * (3513 - 4096) - in11 * 2106 + 2048) >> 12) + in21;
+ t22a = (in13 * 1220 - in19 * 1645 + 1024) >> 11;
+ t23a = ((in29 * (4052 - 4096) - in3 * 601 + 2048) >> 12) + in29;
+ t24a = ((in29 * 601 + in3 * (4052 - 4096) + 2048) >> 12) + in3;
+ t25a = (in13 * 1645 + in19 * 1220 + 1024) >> 11;
+ t26a = ((in21 * 2106 + in11 * (3513 - 4096) + 2048) >> 12) + in11;
+ t27a = ((in5 * (3973 - 4096) + in27 * 995 + 2048) >> 12) + in5;
+ t28a = ((in25 * 1380 + in7 * (3857 - 4096) + 2048) >> 12) + in7;
+ t29a = ((in9 * (3703 - 4096) + in23 * 1751 + 2048) >> 12) + in9;
+ t30a = ((in17 * 2751 + in15 * (3035 - 4096) + 2048) >> 12) + in15;
+ t31a = ((in1 * (4091 - 4096) + in31 * 201 + 2048) >> 12) + in1;
+ }
+
+ int t16 = CLIP(t16a + t17a);
+ int t17 = CLIP(t16a - t17a);
+ int t18 = CLIP(t19a - t18a);
+ int t19 = CLIP(t19a + t18a);
+ int t20 = CLIP(t20a + t21a);
+ int t21 = CLIP(t20a - t21a);
+ int t22 = CLIP(t23a - t22a);
+ int t23 = CLIP(t23a + t22a);
+ int t24 = CLIP(t24a + t25a);
+ int t25 = CLIP(t24a - t25a);
+ int t26 = CLIP(t27a - t26a);
+ int t27 = CLIP(t27a + t26a);
+ int t28 = CLIP(t28a + t29a);
+ int t29 = CLIP(t28a - t29a);
+ int t30 = CLIP(t31a - t30a);
+ int t31 = CLIP(t31a + t30a);
+
+ t17a = (( t30 * 799 - t17 * (4017 - 4096) + 2048) >> 12) - t17;
+ t30a = (( t30 * (4017 - 4096) + t17 * 799 + 2048) >> 12) + t30;
+ t18a = ((-(t29 * (4017 - 4096) + t18 * 799) + 2048) >> 12) - t29;
+ t29a = (( t29 * 799 - t18 * (4017 - 4096) + 2048) >> 12) - t18;
+ t21a = ( t26 * 1703 - t21 * 1138 + 1024) >> 11;
+ t26a = ( t26 * 1138 + t21 * 1703 + 1024) >> 11;
+ t22a = (-(t25 * 1138 + t22 * 1703 ) + 1024) >> 11;
+ t25a = ( t25 * 1703 - t22 * 1138 + 1024) >> 11;
+
+ t16a = CLIP(t16 + t19);
+ t17 = CLIP(t17a + t18a);
+ t18 = CLIP(t17a - t18a);
+ t19a = CLIP(t16 - t19);
+ t20a = CLIP(t23 - t20);
+ t21 = CLIP(t22a - t21a);
+ t22 = CLIP(t22a + t21a);
+ t23a = CLIP(t23 + t20);
+ t24a = CLIP(t24 + t27);
+ t25 = CLIP(t25a + t26a);
+ t26 = CLIP(t25a - t26a);
+ t27a = CLIP(t24 - t27);
+ t28a = CLIP(t31 - t28);
+ t29 = CLIP(t30a - t29a);
+ t30 = CLIP(t30a + t29a);
+ t31a = CLIP(t31 + t28);
+
+ t18a = (( t29 * 1567 - t18 * (3784 - 4096) + 2048) >> 12) - t18;
+ t29a = (( t29 * (3784 - 4096) + t18 * 1567 + 2048) >> 12) + t29;
+ t19 = (( t28a * 1567 - t19a * (3784 - 4096) + 2048) >> 12) - t19a;
+ t28 = (( t28a * (3784 - 4096) + t19a * 1567 + 2048) >> 12) + t28a;
+ t20 = ((-(t27a * (3784 - 4096) + t20a * 1567) + 2048) >> 12) - t27a;
+ t27 = (( t27a * 1567 - t20a * (3784 - 4096) + 2048) >> 12) - t20a;
+ t21a = ((-(t26 * (3784 - 4096) + t21 * 1567) + 2048) >> 12) - t26;
+ t26a = (( t26 * 1567 - t21 * (3784 - 4096) + 2048) >> 12) - t21;
+
+ t16 = CLIP(t16a + t23a);
+ t17a = CLIP(t17 + t22);
+ t18 = CLIP(t18a + t21a);
+ t19a = CLIP(t19 + t20);
+ t20a = CLIP(t19 - t20);
+ t21 = CLIP(t18a - t21a);
+ t22a = CLIP(t17 - t22);
+ t23 = CLIP(t16a - t23a);
+ t24 = CLIP(t31a - t24a);
+ t25a = CLIP(t30 - t25);
+ t26 = CLIP(t29a - t26a);
+ t27a = CLIP(t28 - t27);
+ t28a = CLIP(t28 + t27);
+ t29 = CLIP(t29a + t26a);
+ t30a = CLIP(t30 + t25);
+ t31 = CLIP(t31a + t24a);
+
+ t20 = ((t27a - t20a) * 181 + 128) >> 8;
+ t27 = ((t27a + t20a) * 181 + 128) >> 8;
+ t21a = ((t26 - t21 ) * 181 + 128) >> 8;
+ t26a = ((t26 + t21 ) * 181 + 128) >> 8;
+ t22 = ((t25a - t22a) * 181 + 128) >> 8;
+ t25 = ((t25a + t22a) * 181 + 128) >> 8;
+ t23a = ((t24 - t23 ) * 181 + 128) >> 8;
+ t24a = ((t24 + t23 ) * 181 + 128) >> 8;
+
+ const int t0 = c[ 0 * stride];
+ const int t1 = c[ 2 * stride];
+ const int t2 = c[ 4 * stride];
+ const int t3 = c[ 6 * stride];
+ const int t4 = c[ 8 * stride];
+ const int t5 = c[10 * stride];
+ const int t6 = c[12 * stride];
+ const int t7 = c[14 * stride];
+ const int t8 = c[16 * stride];
+ const int t9 = c[18 * stride];
+ const int t10 = c[20 * stride];
+ const int t11 = c[22 * stride];
+ const int t12 = c[24 * stride];
+ const int t13 = c[26 * stride];
+ const int t14 = c[28 * stride];
+ const int t15 = c[30 * stride];
+
+ c[ 0 * stride] = CLIP(t0 + t31);
+ c[ 1 * stride] = CLIP(t1 + t30a);
+ c[ 2 * stride] = CLIP(t2 + t29);
+ c[ 3 * stride] = CLIP(t3 + t28a);
+ c[ 4 * stride] = CLIP(t4 + t27);
+ c[ 5 * stride] = CLIP(t5 + t26a);
+ c[ 6 * stride] = CLIP(t6 + t25);
+ c[ 7 * stride] = CLIP(t7 + t24a);
+ c[ 8 * stride] = CLIP(t8 + t23a);
+ c[ 9 * stride] = CLIP(t9 + t22);
+ c[10 * stride] = CLIP(t10 + t21a);
+ c[11 * stride] = CLIP(t11 + t20);
+ c[12 * stride] = CLIP(t12 + t19a);
+ c[13 * stride] = CLIP(t13 + t18);
+ c[14 * stride] = CLIP(t14 + t17a);
+ c[15 * stride] = CLIP(t15 + t16);
+ c[16 * stride] = CLIP(t15 - t16);
+ c[17 * stride] = CLIP(t14 - t17a);
+ c[18 * stride] = CLIP(t13 - t18);
+ c[19 * stride] = CLIP(t12 - t19a);
+ c[20 * stride] = CLIP(t11 - t20);
+ c[21 * stride] = CLIP(t10 - t21a);
+ c[22 * stride] = CLIP(t9 - t22);
+ c[23 * stride] = CLIP(t8 - t23a);
+ c[24 * stride] = CLIP(t7 - t24a);
+ c[25 * stride] = CLIP(t6 - t25);
+ c[26 * stride] = CLIP(t5 - t26a);
+ c[27 * stride] = CLIP(t4 - t27);
+ c[28 * stride] = CLIP(t3 - t28a);
+ c[29 * stride] = CLIP(t2 - t29);
+ c[30 * stride] = CLIP(t1 - t30a);
+ c[31 * stride] = CLIP(t0 - t31);
+}
+
+void dav1d_inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ inv_dct32_1d_internal_c(c, stride, min, max, 0);
+}
+
+void dav1d_inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ assert(stride > 0);
+ inv_dct32_1d_internal_c(c, stride << 1, min, max, 1);
+
+ const int in1 = c[ 1 * stride], in3 = c[ 3 * stride];
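+    // the vertical and horizontal blends each use complementary weights that
+    // sum to 256, so the total weight is 512 and (pred + 256) >> 9 is a
+    // rounded weighted average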
+ const int in5 = c[ 5 * stride], in7 = c[ 7 * stride];
+ const int in9 = c[ 9 * stride], in11 = c[11 * stride];
+ const int in13 = c[13 * stride], in15 = c[15 * stride];
+ const int in17 = c[17 * stride], in19 = c[19 * stride];
+ const int in21 = c[21 * stride], in23 = c[23 * stride];
+ const int in25 = c[25 * stride], in27 = c[27 * stride];
+ const int in29 = c[29 * stride], in31 = c[31 * stride];
+
+ int t32a = (in1 * 101 + 2048) >> 12;
+ int t33a = (in31 * -2824 + 2048) >> 12;
+ int t34a = (in17 * 1660 + 2048) >> 12;
+ int t35a = (in15 * -1474 + 2048) >> 12;
+ int t36a = (in9 * 897 + 2048) >> 12;
+ int t37a = (in23 * -2191 + 2048) >> 12;
+ int t38a = (in25 * 2359 + 2048) >> 12;
+ int t39a = (in7 * -700 + 2048) >> 12;
+ int t40a = (in5 * 501 + 2048) >> 12;
+ int t41a = (in27 * -2520 + 2048) >> 12;
+ int t42a = (in21 * 2019 + 2048) >> 12;
+ int t43a = (in11 * -1092 + 2048) >> 12;
+ int t44a = (in13 * 1285 + 2048) >> 12;
+ int t45a = (in19 * -1842 + 2048) >> 12;
+ int t46a = (in29 * 2675 + 2048) >> 12;
+ int t47a = (in3 * -301 + 2048) >> 12;
+ int t48a = (in3 * 4085 + 2048) >> 12;
+ int t49a = (in29 * 3102 + 2048) >> 12;
+ int t50a = (in19 * 3659 + 2048) >> 12;
+ int t51a = (in13 * 3889 + 2048) >> 12;
+ int t52a = (in11 * 3948 + 2048) >> 12;
+ int t53a = (in21 * 3564 + 2048) >> 12;
+ int t54a = (in27 * 3229 + 2048) >> 12;
+ int t55a = (in5 * 4065 + 2048) >> 12;
+ int t56a = (in7 * 4036 + 2048) >> 12;
+ int t57a = (in25 * 3349 + 2048) >> 12;
+ int t58a = (in23 * 3461 + 2048) >> 12;
+ int t59a = (in9 * 3996 + 2048) >> 12;
+ int t60a = (in15 * 3822 + 2048) >> 12;
+ int t61a = (in17 * 3745 + 2048) >> 12;
+ int t62a = (in31 * 2967 + 2048) >> 12;
+ int t63a = (in1 * 4095 + 2048) >> 12;
+
+ int t32 = CLIP(t32a + t33a);
+ int t33 = CLIP(t32a - t33a);
+ int t34 = CLIP(t35a - t34a);
+ int t35 = CLIP(t35a + t34a);
+ int t36 = CLIP(t36a + t37a);
+ int t37 = CLIP(t36a - t37a);
+ int t38 = CLIP(t39a - t38a);
+ int t39 = CLIP(t39a + t38a);
+ int t40 = CLIP(t40a + t41a);
+ int t41 = CLIP(t40a - t41a);
+ int t42 = CLIP(t43a - t42a);
+ int t43 = CLIP(t43a + t42a);
+ int t44 = CLIP(t44a + t45a);
+ int t45 = CLIP(t44a - t45a);
+ int t46 = CLIP(t47a - t46a);
+ int t47 = CLIP(t47a + t46a);
+ int t48 = CLIP(t48a + t49a);
+ int t49 = CLIP(t48a - t49a);
+ int t50 = CLIP(t51a - t50a);
+ int t51 = CLIP(t51a + t50a);
+ int t52 = CLIP(t52a + t53a);
+ int t53 = CLIP(t52a - t53a);
+ int t54 = CLIP(t55a - t54a);
+ int t55 = CLIP(t55a + t54a);
+ int t56 = CLIP(t56a + t57a);
+ int t57 = CLIP(t56a - t57a);
+ int t58 = CLIP(t59a - t58a);
+ int t59 = CLIP(t59a + t58a);
+ int t60 = CLIP(t60a + t61a);
+ int t61 = CLIP(t60a - t61a);
+ int t62 = CLIP(t63a - t62a);
+ int t63 = CLIP(t63a + t62a);
+
+ t33a = ((t33 * (4096 - 4076) + t62 * 401 + 2048) >> 12) - t33;
+ t34a = ((t34 * -401 + t61 * (4096 - 4076) + 2048) >> 12) - t61;
+ t37a = (t37 * -1299 + t58 * 1583 + 1024) >> 11;
+ t38a = (t38 * -1583 + t57 * -1299 + 1024) >> 11;
+ t41a = ((t41 * (4096 - 3612) + t54 * 1931 + 2048) >> 12) - t41;
+ t42a = ((t42 * -1931 + t53 * (4096 - 3612) + 2048) >> 12) - t53;
+ t45a = ((t45 * -1189 + t50 * (3920 - 4096) + 2048) >> 12) + t50;
+ t46a = ((t46 * (4096 - 3920) + t49 * -1189 + 2048) >> 12) - t46;
+ t49a = ((t46 * -1189 + t49 * (3920 - 4096) + 2048) >> 12) + t49;
+ t50a = ((t45 * (3920 - 4096) + t50 * 1189 + 2048) >> 12) + t45;
+ t53a = ((t42 * (4096 - 3612) + t53 * 1931 + 2048) >> 12) - t42;
+ t54a = ((t41 * 1931 + t54 * (3612 - 4096) + 2048) >> 12) + t54;
+ t57a = (t38 * -1299 + t57 * 1583 + 1024) >> 11;
+ t58a = (t37 * 1583 + t58 * 1299 + 1024) >> 11;
+ t61a = ((t34 * (4096 - 4076) + t61 * 401 + 2048) >> 12) - t34;
+ t62a = ((t33 * 401 + t62 * (4076 - 4096) + 2048) >> 12) + t62;
+
+ t32a = CLIP(t32 + t35);
+ t33 = CLIP(t33a + t34a);
+ t34 = CLIP(t33a - t34a);
+ t35a = CLIP(t32 - t35);
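+    // each kernel row sums to 16, so (s + 8) >> 4 below is a rounded
+    // normalization of the 5-tap smoothing filter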
+ t36a = CLIP(t39 - t36);
+ t37 = CLIP(t38a - t37a);
+ t38 = CLIP(t38a + t37a);
+ t39a = CLIP(t39 + t36);
+ t40a = CLIP(t40 + t43);
+ t41 = CLIP(t41a + t42a);
+ t42 = CLIP(t41a - t42a);
+ t43a = CLIP(t40 - t43);
+ t44a = CLIP(t47 - t44);
+ t45 = CLIP(t46a - t45a);
+ t46 = CLIP(t46a + t45a);
+ t47a = CLIP(t47 + t44);
+ t48a = CLIP(t48 + t51);
+ t49 = CLIP(t49a + t50a);
+ t50 = CLIP(t49a - t50a);
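+/* Note: '>>' binds tighter than '<=', so this tests wh <= (16 >> is_sm). */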
+ t51a = CLIP(t48 - t51);
+ t52a = CLIP(t55 - t52);
+ t53 = CLIP(t54a - t53a);
+ t54 = CLIP(t54a + t53a);
+ t55a = CLIP(t55 + t52);
+ t56a = CLIP(t56 + t59);
+ t57 = CLIP(t57a + t58a);
+ t58 = CLIP(t57a - t58a);
+ t59a = CLIP(t56 - t59);
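+    // the 4-tap kernel sums to 16: even outputs copy the input samples and
+    // odd outputs are the rounded half-sample interpolation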
+ t60a = CLIP(t63 - t60);
+ t61 = CLIP(t62a - t61a);
+ t62 = CLIP(t62a + t61a);
+ t63a = CLIP(t63 + t60);
+
+ t34a = ((t34 * (4096 - 4017) + t61 * 799 + 2048) >> 12) - t34;
+ t35 = ((t35a * (4096 - 4017) + t60a * 799 + 2048) >> 12) - t35a;
+ t36 = ((t36a * -799 + t59a * (4096 - 4017) + 2048) >> 12) - t59a;
+ t37a = ((t37 * -799 + t58 * (4096 - 4017) + 2048) >> 12) - t58;
+ t42a = (t42 * -1138 + t53 * 1703 + 1024) >> 11;
+ t43 = (t43a * -1138 + t52a * 1703 + 1024) >> 11;
+ t44 = (t44a * -1703 + t51a * -1138 + 1024) >> 11;
+ t45a = (t45 * -1703 + t50 * -1138 + 1024) >> 11;
+ t50a = (t45 * -1138 + t50 * 1703 + 1024) >> 11;
+ t51 = (t44a * -1138 + t51a * 1703 + 1024) >> 11;
+ t52 = (t43a * 1703 + t52a * 1138 + 1024) >> 11;
+ t53a = (t42 * 1703 + t53 * 1138 + 1024) >> 11;
+ t58a = ((t37 * (4096 - 4017) + t58 * 799 + 2048) >> 12) - t37;
+ t59 = ((t36a * (4096 - 4017) + t59a * 799 + 2048) >> 12) - t36a;
+ t60 = ((t35a * 799 + t60a * (4017 - 4096) + 2048) >> 12) + t60a;
+ t61a = ((t34 * 799 + t61 * (4017 - 4096) + 2048) >> 12) + t61;
+
+ t32 = CLIP(t32a + t39a);
+ t33a = CLIP(t33 + t38);
+ t34 = CLIP(t34a + t37a);
+ t35a = CLIP(t35 + t36);
+ t36a = CLIP(t35 - t36);
+ t37 = CLIP(t34a - t37a);
+ t38a = CLIP(t33 - t38);
+ t39 = CLIP(t32a - t39a);
+ t40 = CLIP(t47a - t40a);
+ t41a = CLIP(t46 - t41);
+ t42 = CLIP(t45a - t42a);
+ t43a = CLIP(t44 - t43);
+ t44a = CLIP(t44 + t43);
+ t45 = CLIP(t45a + t42a);
+ t46a = CLIP(t46 + t41);
+ t47 = CLIP(t47a + t40a);
+ t48 = CLIP(t48a + t55a);
+ t49a = CLIP(t49 + t54);
+ t50 = CLIP(t50a + t53a);
+ t51a = CLIP(t51 + t52);
+ t52a = CLIP(t51 - t52);
+ t53 = CLIP(t50a - t53a);
+ t54a = CLIP(t49 - t54);
+ t55 = CLIP(t48a - t55a);
+ t56 = CLIP(t63a - t56a);
+ t57a = CLIP(t62 - t57);
+ t58 = CLIP(t61a - t58a);
+ t59a = CLIP(t60 - t59);
+ t60a = CLIP(t60 + t59);
+ t61 = CLIP(t61a + t58a);
+ t62a = CLIP(t62 + t57);
+ t63 = CLIP(t63a + t56a);
+
+ t36 = ((t36a * (4096 - 3784) + t59a * 1567 + 2048) >> 12) - t36a;
+ t37a = ((t37 * (4096 - 3784) + t58 * 1567 + 2048) >> 12) - t37;
+ t38 = ((t38a * (4096 - 3784) + t57a * 1567 + 2048) >> 12) - t38a;
+ t39a = ((t39 * (4096 - 3784) + t56 * 1567 + 2048) >> 12) - t39;
+ t40a = ((t40 * -1567 + t55 * (4096 - 3784) + 2048) >> 12) - t55;
+ t41 = ((t41a * -1567 + t54a * (4096 - 3784) + 2048) >> 12) - t54a;
+ t42a = ((t42 * -1567 + t53 * (4096 - 3784) + 2048) >> 12) - t53;
+ t43 = ((t43a * -1567 + t52a * (4096 - 3784) + 2048) >> 12) - t52a;
+ t52 = ((t43a * (4096 - 3784) + t52a * 1567 + 2048) >> 12) - t43a;
+ t53a = ((t42 * (4096 - 3784) + t53 * 1567 + 2048) >> 12) - t42;
+ t54 = ((t41a * (4096 - 3784) + t54a * 1567 + 2048) >> 12) - t41a;
+ t55a = ((t40 * (4096 - 3784) + t55 * 1567 + 2048) >> 12) - t40;
+ t56a = ((t39 * 1567 + t56 * (3784 - 4096) + 2048) >> 12) + t56;
+ t57 = ((t38a * 1567 + t57a * (3784 - 4096) + 2048) >> 12) + t57a;
+ t58a = ((t37 * 1567 + t58 * (3784 - 4096) + 2048) >> 12) + t58;
+ t59 = ((t36a * 1567 + t59a * (3784 - 4096) + 2048) >> 12) + t59a;
+
+ t32a = CLIP(t32 + t47);
+ t33 = CLIP(t33a + t46a);
+ t34a = CLIP(t34 + t45);
+ t35 = CLIP(t35a + t44a);
+ t36a = CLIP(t36 + t43);
+ t37 = CLIP(t37a + t42a);
+ t38a = CLIP(t38 + t41);
+ t39 = CLIP(t39a + t40a);
+ t40 = CLIP(t39a - t40a);
+ t41a = CLIP(t38 - t41);
+ t42 = CLIP(t37a - t42a);
+ t43a = CLIP(t36 - t43);
+ t44 = CLIP(t35a - t44a);
+ t45a = CLIP(t34 - t45);
+ t46 = CLIP(t33a - t46a);
+ t47a = CLIP(t32 - t47);
+ t48a = CLIP(t63 - t48);
+ t49 = CLIP(t62a - t49a);
+ t50a = CLIP(t61 - t50);
+ t51 = CLIP(t60a - t51a);
+ t52a = CLIP(t59 - t52);
+ t53 = CLIP(t58a - t53a);
+ t54a = CLIP(t57 - t54);
+ t55 = CLIP(t56a - t55a);
+ t56 = CLIP(t56a + t55a);
+ t57a = CLIP(t57 + t54);
+ t58 = CLIP(t58a + t53a);
+ t59a = CLIP(t59 + t52);
+ t60 = CLIP(t60a + t51a);
+ t61a = CLIP(t61 + t50);
+ t62 = CLIP(t62a + t49a);
+ t63a = CLIP(t63 + t48);
+
+ t40a = ((t55 - t40 ) * 181 + 128) >> 8;
+ t41 = ((t54a - t41a) * 181 + 128) >> 8;
+ t42a = ((t53 - t42 ) * 181 + 128) >> 8;
+ t43 = ((t52a - t43a) * 181 + 128) >> 8;
+ t44a = ((t51 - t44 ) * 181 + 128) >> 8;
+ t45 = ((t50a - t45a) * 181 + 128) >> 8;
+ t46a = ((t49 - t46 ) * 181 + 128) >> 8;
+ t47 = ((t48a - t47a) * 181 + 128) >> 8;
+ t48 = ((t47a + t48a) * 181 + 128) >> 8;
+ t49a = ((t46 + t49 ) * 181 + 128) >> 8;
+ t50 = ((t45a + t50a) * 181 + 128) >> 8;
+ t51a = ((t44 + t51 ) * 181 + 128) >> 8;
+ t52 = ((t43a + t52a) * 181 + 128) >> 8;
+ t53a = ((t42 + t53 ) * 181 + 128) >> 8;
+ t54 = ((t41a + t54a) * 181 + 128) >> 8;
+ t55a = ((t40 + t55 ) * 181 + 128) >> 8;
+
+ const int t0 = c[ 0 * stride];
+ const int t1 = c[ 2 * stride];
+ const int t2 = c[ 4 * stride];
+ const int t3 = c[ 6 * stride];
+ const int t4 = c[ 8 * stride];
+ const int t5 = c[10 * stride];
+ const int t6 = c[12 * stride];
+ const int t7 = c[14 * stride];
+ const int t8 = c[16 * stride];
+ const int t9 = c[18 * stride];
+ const int t10 = c[20 * stride];
+ const int t11 = c[22 * stride];
+ const int t12 = c[24 * stride];
+ const int t13 = c[26 * stride];
+ const int t14 = c[28 * stride];
+ const int t15 = c[30 * stride];
+ const int t16 = c[32 * stride];
+ const int t17 = c[34 * stride];
+ const int t18 = c[36 * stride];
+ const int t19 = c[38 * stride];
+ const int t20 = c[40 * stride];
+ const int t21 = c[42 * stride];
+ const int t22 = c[44 * stride];
+ const int t23 = c[46 * stride];
+ const int t24 = c[48 * stride];
+ const int t25 = c[50 * stride];
+ const int t26 = c[52 * stride];
+ const int t27 = c[54 * stride];
+ const int t28 = c[56 * stride];
+ const int t29 = c[58 * stride];
+ const int t30 = c[60 * stride];
+ const int t31 = c[62 * stride];
+
+ c[ 0 * stride] = CLIP(t0 + t63a);
+ c[ 1 * stride] = CLIP(t1 + t62);
+ c[ 2 * stride] = CLIP(t2 + t61a);
+ c[ 3 * stride] = CLIP(t3 + t60);
+ c[ 4 * stride] = CLIP(t4 + t59a);
+ c[ 5 * stride] = CLIP(t5 + t58);
+ c[ 6 * stride] = CLIP(t6 + t57a);
+ c[ 7 * stride] = CLIP(t7 + t56);
+ c[ 8 * stride] = CLIP(t8 + t55a);
+ c[ 9 * stride] = CLIP(t9 + t54);
+ c[10 * stride] = CLIP(t10 + t53a);
+ c[11 * stride] = CLIP(t11 + t52);
+ c[12 * stride] = CLIP(t12 + t51a);
+ c[13 * stride] = CLIP(t13 + t50);
+ c[14 * stride] = CLIP(t14 + t49a);
+ c[15 * stride] = CLIP(t15 + t48);
+ c[16 * stride] = CLIP(t16 + t47);
+ c[17 * stride] = CLIP(t17 + t46a);
+ c[18 * stride] = CLIP(t18 + t45);
+ c[19 * stride] = CLIP(t19 + t44a);
+ c[20 * stride] = CLIP(t20 + t43);
+ c[21 * stride] = CLIP(t21 + t42a);
+ c[22 * stride] = CLIP(t22 + t41);
+ c[23 * stride] = CLIP(t23 + t40a);
+ c[24 * stride] = CLIP(t24 + t39);
+ c[25 * stride] = CLIP(t25 + t38a);
+ c[26 * stride] = CLIP(t26 + t37);
+ c[27 * stride] = CLIP(t27 + t36a);
+ c[28 * stride] = CLIP(t28 + t35);
+ c[29 * stride] = CLIP(t29 + t34a);
+ c[30 * stride] = CLIP(t30 + t33);
+ c[31 * stride] = CLIP(t31 + t32a);
+ c[32 * stride] = CLIP(t31 - t32a);
+ c[33 * stride] = CLIP(t30 - t33);
+ c[34 * stride] = CLIP(t29 - t34a);
+ c[35 * stride] = CLIP(t28 - t35);
+ c[36 * stride] = CLIP(t27 - t36a);
+ c[37 * stride] = CLIP(t26 - t37);
+ c[38 * stride] = CLIP(t25 - t38a);
+ c[39 * stride] = CLIP(t24 - t39);
+ c[40 * stride] = CLIP(t23 - t40a);
+ c[41 * stride] = CLIP(t22 - t41);
+ c[42 * stride] = CLIP(t21 - t42a);
+ c[43 * stride] = CLIP(t20 - t43);
+ c[44 * stride] = CLIP(t19 - t44a);
+ c[45 * stride] = CLIP(t18 - t45);
+ c[46 * stride] = CLIP(t17 - t46a);
+ c[47 * stride] = CLIP(t16 - t47);
+ c[48 * stride] = CLIP(t15 - t48);
+ c[49 * stride] = CLIP(t14 - t49a);
+ c[50 * stride] = CLIP(t13 - t50);
+ c[51 * stride] = CLIP(t12 - t51a);
+ c[52 * stride] = CLIP(t11 - t52);
+ c[53 * stride] = CLIP(t10 - t53a);
+ c[54 * stride] = CLIP(t9 - t54);
+ c[55 * stride] = CLIP(t8 - t55a);
+ c[56 * stride] = CLIP(t7 - t56);
+ c[57 * stride] = CLIP(t6 - t57a);
+ c[58 * stride] = CLIP(t5 - t58);
+ c[59 * stride] = CLIP(t4 - t59a);
+ c[60 * stride] = CLIP(t3 - t60);
+ c[61 * stride] = CLIP(t2 - t61a);
+ c[62 * stride] = CLIP(t1 - t62);
+ c[63 * stride] = CLIP(t0 - t63a);
+}
+
+static NOINLINE void
+inv_adst4_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
+ const int min, const int max,
+ int32_t *const out, const ptrdiff_t out_s)
+{
+ assert(in_s > 0 && out_s != 0);
+ const int in0 = in[0 * in_s], in1 = in[1 * in_s];
+ const int in2 = in[2 * in_s], in3 = in[3 * in_s];
+
+ out[0 * out_s] = (( 1321 * in0 + (3803 - 4096) * in2 +
+ (2482 - 4096) * in3 + (3344 - 4096) * in1 + 2048) >> 12) +
+ in2 + in3 + in1;
+ out[1 * out_s] = (((2482 - 4096) * in0 - 1321 * in2 -
+ (3803 - 4096) * in3 + (3344 - 4096) * in1 + 2048) >> 12) +
+ in0 - in3 + in1;
+ out[2 * out_s] = (209 * (in0 - in2 + in3) + 128) >> 8;
+ out[3 * out_s] = (((3803 - 4096) * in0 + (2482 - 4096) * in2 -
+ 1321 * in3 - (3344 - 4096) * in1 + 2048) >> 12) +
+ in0 + in2 - in1;
+}
+
+static NOINLINE void
+inv_adst8_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
+ const int min, const int max,
+ int32_t *const out, const ptrdiff_t out_s)
+{
+ assert(in_s > 0 && out_s != 0);
+ const int in0 = in[0 * in_s], in1 = in[1 * in_s];
+ const int in2 = in[2 * in_s], in3 = in[3 * in_s];
+ const int in4 = in[4 * in_s], in5 = in[5 * in_s];
+ const int in6 = in[6 * in_s], in7 = in[7 * in_s];
+
+ const int t0a = (((4076 - 4096) * in7 + 401 * in0 + 2048) >> 12) + in7;
+ const int t1a = (( 401 * in7 - (4076 - 4096) * in0 + 2048) >> 12) - in0;
+ const int t2a = (((3612 - 4096) * in5 + 1931 * in2 + 2048) >> 12) + in5;
+ const int t3a = (( 1931 * in5 - (3612 - 4096) * in2 + 2048) >> 12) - in2;
+ int t4a = ( 1299 * in3 + 1583 * in4 + 1024) >> 11;
+ int t5a = ( 1583 * in3 - 1299 * in4 + 1024) >> 11;
+ int t6a = (( 1189 * in1 + (3920 - 4096) * in6 + 2048) >> 12) + in6;
+ int t7a = (((3920 - 4096) * in1 - 1189 * in6 + 2048) >> 12) + in1;
+
+ const int t0 = CLIP(t0a + t4a);
+ const int t1 = CLIP(t1a + t5a);
+ int t2 = CLIP(t2a + t6a);
+ int t3 = CLIP(t3a + t7a);
+ const int t4 = CLIP(t0a - t4a);
+ const int t5 = CLIP(t1a - t5a);
+ int t6 = CLIP(t2a - t6a);
+ int t7 = CLIP(t3a - t7a);
+
+ t4a = (((3784 - 4096) * t4 + 1567 * t5 + 2048) >> 12) + t4;
+ t5a = (( 1567 * t4 - (3784 - 4096) * t5 + 2048) >> 12) - t5;
+ t6a = (((3784 - 4096) * t7 - 1567 * t6 + 2048) >> 12) + t7;
+ t7a = (( 1567 * t7 + (3784 - 4096) * t6 + 2048) >> 12) + t6;
+
+ out[0 * out_s] = CLIP(t0 + t2 );
+ out[7 * out_s] = -CLIP(t1 + t3 );
+ t2 = CLIP(t0 - t2 );
+ t3 = CLIP(t1 - t3 );
+ out[1 * out_s] = -CLIP(t4a + t6a);
+ out[6 * out_s] = CLIP(t5a + t7a);
+ t6 = CLIP(t4a - t6a);
+ t7 = CLIP(t5a - t7a);
+
+ out[3 * out_s] = -(((t2 + t3) * 181 + 128) >> 8);
+ out[4 * out_s] = ((t2 - t3) * 181 + 128) >> 8;
+ out[2 * out_s] = ((t6 + t7) * 181 + 128) >> 8;
+ out[5 * out_s] = -(((t6 - t7) * 181 + 128) >> 8);
+}
+
+static NOINLINE void
+inv_adst16_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
+ const int min, const int max,
+ int32_t *const out, const ptrdiff_t out_s)
+{
+ assert(in_s > 0 && out_s != 0);
+ const int in0 = in[ 0 * in_s], in1 = in[ 1 * in_s];
+ const int in2 = in[ 2 * in_s], in3 = in[ 3 * in_s];
+ const int in4 = in[ 4 * in_s], in5 = in[ 5 * in_s];
+ const int in6 = in[ 6 * in_s], in7 = in[ 7 * in_s];
+ const int in8 = in[ 8 * in_s], in9 = in[ 9 * in_s];
+ const int in10 = in[10 * in_s], in11 = in[11 * in_s];
+ const int in12 = in[12 * in_s], in13 = in[13 * in_s];
+ const int in14 = in[14 * in_s], in15 = in[15 * in_s];
+
+ int t0 = ((in15 * (4091 - 4096) + in0 * 201 + 2048) >> 12) + in15;
+ int t1 = ((in15 * 201 - in0 * (4091 - 4096) + 2048) >> 12) - in0;
+ int t2 = ((in13 * (3973 - 4096) + in2 * 995 + 2048) >> 12) + in13;
+ int t3 = ((in13 * 995 - in2 * (3973 - 4096) + 2048) >> 12) - in2;
+ int t4 = ((in11 * (3703 - 4096) + in4 * 1751 + 2048) >> 12) + in11;
+ int t5 = ((in11 * 1751 - in4 * (3703 - 4096) + 2048) >> 12) - in4;
+ int t6 = (in9 * 1645 + in6 * 1220 + 1024) >> 11;
+ int t7 = (in9 * 1220 - in6 * 1645 + 1024) >> 11;
+ int t8 = ((in7 * 2751 + in8 * (3035 - 4096) + 2048) >> 12) + in8;
+ int t9 = ((in7 * (3035 - 4096) - in8 * 2751 + 2048) >> 12) + in7;
+ int t10 = ((in5 * 2106 + in10 * (3513 - 4096) + 2048) >> 12) + in10;
+ int t11 = ((in5 * (3513 - 4096) - in10 * 2106 + 2048) >> 12) + in5;
+ int t12 = ((in3 * 1380 + in12 * (3857 - 4096) + 2048) >> 12) + in12;
+ int t13 = ((in3 * (3857 - 4096) - in12 * 1380 + 2048) >> 12) + in3;
+ int t14 = ((in1 * 601 + in14 * (4052 - 4096) + 2048) >> 12) + in14;
+ int t15 = ((in1 * (4052 - 4096) - in14 * 601 + 2048) >> 12) + in1;
+
+ int t0a = CLIP(t0 + t8 );
+ int t1a = CLIP(t1 + t9 );
+ int t2a = CLIP(t2 + t10);
+ int t3a = CLIP(t3 + t11);
+ int t4a = CLIP(t4 + t12);
+ int t5a = CLIP(t5 + t13);
+ int t6a = CLIP(t6 + t14);
+ int t7a = CLIP(t7 + t15);
+ int t8a = CLIP(t0 - t8 );
+ int t9a = CLIP(t1 - t9 );
+ int t10a = CLIP(t2 - t10);
+ int t11a = CLIP(t3 - t11);
+ int t12a = CLIP(t4 - t12);
+ int t13a = CLIP(t5 - t13);
+ int t14a = CLIP(t6 - t14);
+ int t15a = CLIP(t7 - t15);
+
+ t8 = ((t8a * (4017 - 4096) + t9a * 799 + 2048) >> 12) + t8a;
+ t9 = ((t8a * 799 - t9a * (4017 - 4096) + 2048) >> 12) - t9a;
+ t10 = ((t10a * 2276 + t11a * (3406 - 4096) + 2048) >> 12) + t11a;
+ t11 = ((t10a * (3406 - 4096) - t11a * 2276 + 2048) >> 12) + t10a;
+ t12 = ((t13a * (4017 - 4096) - t12a * 799 + 2048) >> 12) + t13a;
+ t13 = ((t13a * 799 + t12a * (4017 - 4096) + 2048) >> 12) + t12a;
+ t14 = ((t15a * 2276 - t14a * (3406 - 4096) + 2048) >> 12) - t14a;
+ t15 = ((t15a * (3406 - 4096) + t14a * 2276 + 2048) >> 12) + t15a;
+
+ t0 = CLIP(t0a + t4a);
+ t1 = CLIP(t1a + t5a);
+ t2 = CLIP(t2a + t6a);
+ t3 = CLIP(t3a + t7a);
+ t4 = CLIP(t0a - t4a);
+ t5 = CLIP(t1a - t5a);
+ t6 = CLIP(t2a - t6a);
+ t7 = CLIP(t3a - t7a);
+ t8a = CLIP(t8 + t12);
+ t9a = CLIP(t9 + t13);
+ t10a = CLIP(t10 + t14);
+ t11a = CLIP(t11 + t15);
+ t12a = CLIP(t8 - t12);
+ t13a = CLIP(t9 - t13);
+ t14a = CLIP(t10 - t14);
+ t15a = CLIP(t11 - t15);
+
+ t4a = ((t4 * (3784 - 4096) + t5 * 1567 + 2048) >> 12) + t4;
+ t5a = ((t4 * 1567 - t5 * (3784 - 4096) + 2048) >> 12) - t5;
+ t6a = ((t7 * (3784 - 4096) - t6 * 1567 + 2048) >> 12) + t7;
+ t7a = ((t7 * 1567 + t6 * (3784 - 4096) + 2048) >> 12) + t6;
+ t12 = ((t12a * (3784 - 4096) + t13a * 1567 + 2048) >> 12) + t12a;
+ t13 = ((t12a * 1567 - t13a * (3784 - 4096) + 2048) >> 12) - t13a;
+ t14 = ((t15a * (3784 - 4096) - t14a * 1567 + 2048) >> 12) + t15a;
+ t15 = ((t15a * 1567 + t14a * (3784 - 4096) + 2048) >> 12) + t14a;
+
+ out[ 0 * out_s] = CLIP(t0 + t2 );
+ out[15 * out_s] = -CLIP(t1 + t3 );
+ t2a = CLIP(t0 - t2 );
+ t3a = CLIP(t1 - t3 );
+ out[ 3 * out_s] = -CLIP(t4a + t6a );
+ out[12 * out_s] = CLIP(t5a + t7a );
+ t6 = CLIP(t4a - t6a );
+ t7 = CLIP(t5a - t7a );
+ out[ 1 * out_s] = -CLIP(t8a + t10a);
+ out[14 * out_s] = CLIP(t9a + t11a);
+ t10 = CLIP(t8a - t10a);
+ t11 = CLIP(t9a - t11a);
+ out[ 2 * out_s] = CLIP(t12 + t14 );
+ out[13 * out_s] = -CLIP(t13 + t15 );
+ t14a = CLIP(t12 - t14 );
+ t15a = CLIP(t13 - t15 );
+
+ out[ 7 * out_s] = -(((t2a + t3a) * 181 + 128) >> 8);
+ out[ 8 * out_s] = ((t2a - t3a) * 181 + 128) >> 8;
+ out[ 4 * out_s] = ((t6 + t7) * 181 + 128) >> 8;
+ out[11 * out_s] = -(((t6 - t7) * 181 + 128) >> 8);
+ out[ 6 * out_s] = ((t10 + t11) * 181 + 128) >> 8;
+ out[ 9 * out_s] = -(((t10 - t11) * 181 + 128) >> 8);
+ out[ 5 * out_s] = -(((t14a + t15a) * 181 + 128) >> 8);
+ out[10 * out_s] = ((t14a - t15a) * 181 + 128) >> 8;
+}
+
+#define inv_adst_1d(sz) \
+void dav1d_inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
+ const int min, const int max) \
+{ \
+ inv_adst##sz##_1d_internal_c(c, stride, min, max, c, stride); \
+} \
+void dav1d_inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
+ const int min, const int max) \
+{ \
+ inv_adst##sz##_1d_internal_c(c, stride, min, max, \
+ &c[(sz - 1) * stride], -stride); \
+}
+
+inv_adst_1d( 4)
+inv_adst_1d( 8)
+inv_adst_1d(16)
+
+#undef inv_adst_1d
+
+void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ assert(stride > 0);
+ for (int i = 0; i < 4; i++) {
+ const int in = c[stride * i];
+ c[stride * i] = in + ((in * 1697 + 2048) >> 12);
+ }
+}
+
+void dav1d_inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ assert(stride > 0);
+ for (int i = 0; i < 8; i++)
+ c[stride * i] *= 2;
+}
+
+void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ assert(stride > 0);
+ for (int i = 0; i < 16; i++) {
+ const int in = c[stride * i];
+ c[stride * i] = 2 * in + ((in * 1697 + 1024) >> 11);
+ }
+}
+
+void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ assert(stride > 0);
+ for (int i = 0; i < 32; i++)
+ c[stride * i] *= 4;
+}
+
+void dav1d_inv_wht4_1d_c(int32_t *const c, const ptrdiff_t stride) {
+ assert(stride > 0);
+ const int in0 = c[0 * stride], in1 = c[1 * stride];
+ const int in2 = c[2 * stride], in3 = c[3 * stride];
+
+ const int t0 = in0 + in1;
+ const int t2 = in2 - in3;
+ const int t4 = (t0 - t2) >> 1;
+ const int t3 = t4 - in3;
+ const int t1 = t4 - in1;
+
+ c[0 * stride] = t0 - t3;
+ c[1 * stride] = t3;
+ c[2 * stride] = t1;
+ c[3 * stride] = t2 + t1;
+}
diff --git a/third_party/dav1d/src/itx_1d.h b/third_party/dav1d/src/itx_1d.h
new file mode 100644
index 0000000000..b63d71b020
--- /dev/null
+++ b/third_party/dav1d/src/itx_1d.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright © 2018-2019, VideoLAN and dav1d authors
+ * Copyright © 2018-2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifndef DAV1D_SRC_ITX_1D_H
+#define DAV1D_SRC_ITX_1D_H
+
+#define decl_itx_1d_fn(name) \
+void (name)(int32_t *c, ptrdiff_t stride, int min, int max)
+typedef decl_itx_1d_fn(*itx_1d_fn);
+
+decl_itx_1d_fn(dav1d_inv_dct4_1d_c);
+decl_itx_1d_fn(dav1d_inv_dct8_1d_c);
+decl_itx_1d_fn(dav1d_inv_dct16_1d_c);
+decl_itx_1d_fn(dav1d_inv_dct32_1d_c);
+decl_itx_1d_fn(dav1d_inv_dct64_1d_c);
+
+decl_itx_1d_fn(dav1d_inv_adst4_1d_c);
+decl_itx_1d_fn(dav1d_inv_adst8_1d_c);
+decl_itx_1d_fn(dav1d_inv_adst16_1d_c);
+
+decl_itx_1d_fn(dav1d_inv_flipadst4_1d_c);
+decl_itx_1d_fn(dav1d_inv_flipadst8_1d_c);
+decl_itx_1d_fn(dav1d_inv_flipadst16_1d_c);
+
+decl_itx_1d_fn(dav1d_inv_identity4_1d_c);
+decl_itx_1d_fn(dav1d_inv_identity8_1d_c);
+decl_itx_1d_fn(dav1d_inv_identity16_1d_c);
+decl_itx_1d_fn(dav1d_inv_identity32_1d_c);
+
+void dav1d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride);
+
+#endif /* DAV1D_SRC_ITX_1D_H */
diff --git a/third_party/dav1d/src/itx_tmpl.c b/third_party/dav1d/src/itx_tmpl.c
new file mode 100644
index 0000000000..8ff245a0de
--- /dev/null
+++ b/third_party/dav1d/src/itx_tmpl.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright © 2018-2019, VideoLAN and dav1d authors
+ * Copyright © 2018-2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/itx.h"
+#include "src/itx_1d.h"
+
+static NOINLINE void
+inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
+ const int eob, const int w, const int h, const int shift,
+ const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,
+ const int has_dconly HIGHBD_DECL_SUFFIX)
+{
+ assert(w >= 4 && w <= 64);
+ assert(h >= 4 && h <= 64);
+ assert(eob >= 0);
+
+ const int is_rect2 = w * 2 == h || h * 2 == w;
+ const int rnd = (1 << shift) >> 1;
+
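+    /* DC-only fast path: with eob == 0 only the DC coefficient can be set, so
+     * the output of both 1D passes is a single constant added to every pixel.
+     * (x * 181 + 128) >> 8 scales by roughly 1/sqrt(2) (applied once more for
+     * 2:1 rectangular sizes), and the final (x * 181 + 128 + 2048) >> 12 folds
+     * the second scaling pass together with the (+ 8) >> 4 output rounding
+     * used by the general path. */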
+ if (eob < has_dconly) {
+ int dc = coeff[0];
+ coeff[0] = 0;
+ if (is_rect2)
+ dc = (dc * 181 + 128) >> 8;
+ dc = (dc * 181 + 128) >> 8;
+ dc = (dc + rnd) >> shift;
+ dc = (dc * 181 + 128 + 2048) >> 12;
+ for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
+ for (int x = 0; x < w; x++)
+ dst[x] = iclip_pixel(dst[x] + dc);
+ return;
+ }
+
+ const int sh = imin(h, 32), sw = imin(w, 32);
+#if BITDEPTH == 8
+ const int row_clip_min = INT16_MIN;
+ const int col_clip_min = INT16_MIN;
+#else
+ const int row_clip_min = (int) ((unsigned) ~bitdepth_max << 7);
+ const int col_clip_min = (int) ((unsigned) ~bitdepth_max << 5);
+#endif
+ const int row_clip_max = ~row_clip_min;
+ const int col_clip_max = ~col_clip_min;
+
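+    /* Two-pass transform: the first pass reads the coefficients transposed
+     * (coeff[y + x * sh]), applies the row transform and stores the result
+     * row-major in tmp; after a rounding shift and clip, the second pass
+     * transforms each column of tmp in place, and the result is added to dst
+     * with (+ 8) >> 4 rounding. For blocks larger than 32x32 only the
+     * top-left sw x sh coefficients are coded. */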
+ int32_t tmp[64 * 64], *c = tmp;
+ for (int y = 0; y < sh; y++, c += w) {
+ if (is_rect2)
+ for (int x = 0; x < sw; x++)
+ c[x] = (coeff[y + x * sh] * 181 + 128) >> 8;
+ else
+ for (int x = 0; x < sw; x++)
+ c[x] = coeff[y + x * sh];
+ first_1d_fn(c, 1, row_clip_min, row_clip_max);
+ }
+
+ memset(coeff, 0, sizeof(*coeff) * sw * sh);
+ for (int i = 0; i < w * sh; i++)
+ tmp[i] = iclip((tmp[i] + rnd) >> shift, col_clip_min, col_clip_max);
+
+ for (int x = 0; x < w; x++)
+ second_1d_fn(&tmp[x], w, col_clip_min, col_clip_max);
+
+ c = tmp;
+ for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
+ for (int x = 0; x < w; x++)
+ dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4));
+}
+
+#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \
+static void \
+inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
+ const ptrdiff_t stride, \
+ coef *const coeff, \
+ const int eob \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \
+ dav1d_inv_##type1##w##_1d_c, dav1d_inv_##type2##h##_1d_c, \
+ has_dconly HIGHBD_TAIL_SUFFIX); \
+}
+
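+/* Transform-type sets by block size: blocks with a 64-sample dimension only
+ * get DCT_DCT; blocks capped at 32 samples also get identity (IDTX); 16x16
+ * further adds the DCT/ADST/flipADST combinations and H/V DCT; blocks with
+ * both dimensions below 32 (other than 16x16) additionally allow the
+ * identity/ADST mixes. */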
+#define inv_txfm_fn64(w, h, shift) \
+inv_txfm_fn(dct, dct, w, h, shift, 1)
+
+#define inv_txfm_fn32(w, h, shift) \
+inv_txfm_fn64(w, h, shift) \
+inv_txfm_fn(identity, identity, w, h, shift, 0)
+
+#define inv_txfm_fn16(w, h, shift) \
+inv_txfm_fn32(w, h, shift) \
+inv_txfm_fn(adst, dct, w, h, shift, 0) \
+inv_txfm_fn(dct, adst, w, h, shift, 0) \
+inv_txfm_fn(adst, adst, w, h, shift, 0) \
+inv_txfm_fn(dct, flipadst, w, h, shift, 0) \
+inv_txfm_fn(flipadst, dct, w, h, shift, 0) \
+inv_txfm_fn(adst, flipadst, w, h, shift, 0) \
+inv_txfm_fn(flipadst, adst, w, h, shift, 0) \
+inv_txfm_fn(flipadst, flipadst, w, h, shift, 0) \
+inv_txfm_fn(identity, dct, w, h, shift, 0) \
+inv_txfm_fn(dct, identity, w, h, shift, 0) \
+
+#define inv_txfm_fn84(w, h, shift) \
+inv_txfm_fn16(w, h, shift) \
+inv_txfm_fn(identity, flipadst, w, h, shift, 0) \
+inv_txfm_fn(flipadst, identity, w, h, shift, 0) \
+inv_txfm_fn(identity, adst, w, h, shift, 0) \
+inv_txfm_fn(adst, identity, w, h, shift, 0) \
+
+inv_txfm_fn84( 4, 4, 0)
+inv_txfm_fn84( 4, 8, 0)
+inv_txfm_fn84( 4, 16, 1)
+inv_txfm_fn84( 8, 4, 0)
+inv_txfm_fn84( 8, 8, 1)
+inv_txfm_fn84( 8, 16, 1)
+inv_txfm_fn32( 8, 32, 2)
+inv_txfm_fn84(16, 4, 1)
+inv_txfm_fn84(16, 8, 1)
+inv_txfm_fn16(16, 16, 2)
+inv_txfm_fn32(16, 32, 1)
+inv_txfm_fn64(16, 64, 2)
+inv_txfm_fn32(32, 8, 2)
+inv_txfm_fn32(32, 16, 1)
+inv_txfm_fn32(32, 32, 2)
+inv_txfm_fn64(32, 64, 1)
+inv_txfm_fn64(64, 16, 2)
+inv_txfm_fn64(64, 32, 1)
+inv_txfm_fn64(64, 64, 2)
+
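+/* 4x4 Walsh-Hadamard transform, used for lossless coding (WHT_WHT). Input
+ * coefficients are scaled down by 4 (>> 2) before the transform, and the
+ * output is added to the destination without further rounding. */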
+static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
+ coef *const coeff, const int eob
+ HIGHBD_DECL_SUFFIX)
+{
+ int32_t tmp[4 * 4], *c = tmp;
+ for (int y = 0; y < 4; y++, c += 4) {
+ for (int x = 0; x < 4; x++)
+ c[x] = coeff[y + x * 4] >> 2;
+ dav1d_inv_wht4_1d_c(c, 1);
+ }
+ memset(coeff, 0, sizeof(*coeff) * 4 * 4);
+
+ for (int x = 0; x < 4; x++)
+ dav1d_inv_wht4_1d_c(&tmp[x], 4);
+
+ c = tmp;
+ for (int y = 0; y < 4; y++, dst += PXSTRIDE(stride))
+ for (int x = 0; x < 4; x++)
+ dst[x] = iclip_pixel(dst[x] + *c++);
+}
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/itx.h"
+#elif ARCH_LOONGARCH64
+#include "src/loongarch/itx.h"
+#elif ARCH_RISCV
+#include "src/riscv/itx.h"
+#elif ARCH_X86
+#include "src/x86/itx.h"
+#endif
+#endif
+
+COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
+#define assign_itx_all_fn64(w, h, pfx) \
+ c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT ] = \
+ inv_txfm_add_dct_dct_##w##x##h##_c
+
+#define assign_itx_all_fn32(w, h, pfx) \
+ assign_itx_all_fn64(w, h, pfx); \
+ c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
+ inv_txfm_add_identity_identity_##w##x##h##_c
+
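+/* Note that the enum names list the vertical transform first while the C
+ * function names list the row (horizontal, first-pass) transform first, hence
+ * the apparent swap in the assignments below (e.g. DCT_ADST maps to
+ * inv_txfm_add_adst_dct). */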
+#define assign_itx_all_fn16(w, h, pfx) \
+ assign_itx_all_fn32(w, h, pfx); \
+ c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
+ inv_txfm_add_adst_dct_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
+ inv_txfm_add_dct_adst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
+ inv_txfm_add_adst_adst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
+ inv_txfm_add_flipadst_adst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
+ inv_txfm_add_adst_flipadst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
+ inv_txfm_add_flipadst_dct_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
+ inv_txfm_add_dct_flipadst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
+ inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
+ inv_txfm_add_dct_identity_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
+ inv_txfm_add_identity_dct_##w##x##h##_c
+
+#define assign_itx_all_fn84(w, h, pfx) \
+ assign_itx_all_fn16(w, h, pfx); \
+ c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
+ inv_txfm_add_flipadst_identity_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
+ inv_txfm_add_identity_flipadst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
+ inv_txfm_add_adst_identity_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
+ inv_txfm_add_identity_adst_##w##x##h##_c; \
+
+ c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
+ assign_itx_all_fn84( 4, 4, );
+ assign_itx_all_fn84( 4, 8, R);
+ assign_itx_all_fn84( 4, 16, R);
+ assign_itx_all_fn84( 8, 4, R);
+ assign_itx_all_fn84( 8, 8, );
+ assign_itx_all_fn84( 8, 16, R);
+ assign_itx_all_fn32( 8, 32, R);
+ assign_itx_all_fn84(16, 4, R);
+ assign_itx_all_fn84(16, 8, R);
+ assign_itx_all_fn16(16, 16, );
+ assign_itx_all_fn32(16, 32, R);
+ assign_itx_all_fn64(16, 64, R);
+ assign_itx_all_fn32(32, 8, R);
+ assign_itx_all_fn32(32, 16, R);
+ assign_itx_all_fn32(32, 32, );
+ assign_itx_all_fn64(32, 64, R);
+ assign_itx_all_fn64(64, 16, R);
+ assign_itx_all_fn64(64, 32, R);
+ assign_itx_all_fn64(64, 64, );
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ itx_dsp_init_arm(c, bpc);
+#endif
+#if ARCH_LOONGARCH64
+ itx_dsp_init_loongarch(c, bpc);
+#endif
+#if ARCH_RISCV
+ itx_dsp_init_riscv(c, bpc);
+#endif
+#if ARCH_X86
+ itx_dsp_init_x86(c, bpc);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/levels.h b/third_party/dav1d/src/levels.h
new file mode 100644
index 0000000000..0f510e9f30
--- /dev/null
+++ b/third_party/dav1d/src/levels.h
@@ -0,0 +1,289 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LEVELS_H
+#define DAV1D_SRC_LEVELS_H
+
+#include <stdint.h>
+
+#include "dav1d/headers.h"
+#include "common/attributes.h"
+
+enum ObuMetaType {
+ OBU_META_HDR_CLL = 1,
+ OBU_META_HDR_MDCV = 2,
+ OBU_META_SCALABILITY = 3,
+ OBU_META_ITUT_T35 = 4,
+ OBU_META_TIMECODE = 5,
+};
+
+enum TxfmSize {
+ TX_4X4,
+ TX_8X8,
+ TX_16X16,
+ TX_32X32,
+ TX_64X64,
+ N_TX_SIZES,
+};
+
+enum BlockLevel {
+ BL_128X128,
+ BL_64X64,
+ BL_32X32,
+ BL_16X16,
+ BL_8X8,
+ N_BL_LEVELS,
+};
+
+enum RectTxfmSize {
+ RTX_4X8 = N_TX_SIZES,
+ RTX_8X4,
+ RTX_8X16,
+ RTX_16X8,
+ RTX_16X32,
+ RTX_32X16,
+ RTX_32X64,
+ RTX_64X32,
+ RTX_4X16,
+ RTX_16X4,
+ RTX_8X32,
+ RTX_32X8,
+ RTX_16X64,
+ RTX_64X16,
+ N_RECT_TX_SIZES
+};
+
+enum TxfmType {
+ DCT_DCT, // DCT in both horizontal and vertical
+ ADST_DCT, // ADST in vertical, DCT in horizontal
+ DCT_ADST, // DCT in vertical, ADST in horizontal
+ ADST_ADST, // ADST in both directions
+ FLIPADST_DCT,
+ DCT_FLIPADST,
+ FLIPADST_FLIPADST,
+ ADST_FLIPADST,
+ FLIPADST_ADST,
+ IDTX,
+ V_DCT,
+ H_DCT,
+ V_ADST,
+ H_ADST,
+ V_FLIPADST,
+ H_FLIPADST,
+ N_TX_TYPES,
+ WHT_WHT = N_TX_TYPES,
+ N_TX_TYPES_PLUS_LL,
+};
+
+enum TxClass {
+ TX_CLASS_2D,
+ TX_CLASS_H,
+ TX_CLASS_V,
+};
+
+enum IntraPredMode {
+ DC_PRED,
+ VERT_PRED,
+ HOR_PRED,
+ DIAG_DOWN_LEFT_PRED,
+ DIAG_DOWN_RIGHT_PRED,
+ VERT_RIGHT_PRED,
+ HOR_DOWN_PRED,
+ HOR_UP_PRED,
+ VERT_LEFT_PRED,
+ SMOOTH_PRED,
+ SMOOTH_V_PRED,
+ SMOOTH_H_PRED,
+ PAETH_PRED,
+ N_INTRA_PRED_MODES,
+ CFL_PRED = N_INTRA_PRED_MODES,
+ N_UV_INTRA_PRED_MODES,
+ N_IMPL_INTRA_PRED_MODES = N_UV_INTRA_PRED_MODES,
+ LEFT_DC_PRED = DIAG_DOWN_LEFT_PRED,
+ TOP_DC_PRED,
+ DC_128_PRED,
+ Z1_PRED,
+ Z2_PRED,
+ Z3_PRED,
+ FILTER_PRED = N_INTRA_PRED_MODES,
+};
+
+enum InterIntraPredMode {
+ II_DC_PRED,
+ II_VERT_PRED,
+ II_HOR_PRED,
+ II_SMOOTH_PRED,
+ N_INTER_INTRA_PRED_MODES,
+};
+
+enum BlockPartition {
+ PARTITION_NONE, // [ ] <-.
+ PARTITION_H, // [-] |
+ PARTITION_V, // [|] |
+ PARTITION_SPLIT, // [+] --'
+ PARTITION_T_TOP_SPLIT, // [⊥] i.e. split top, H bottom
+ PARTITION_T_BOTTOM_SPLIT, // [т] i.e. H top, split bottom
+ PARTITION_T_LEFT_SPLIT, // [-|] i.e. split left, V right
+ PARTITION_T_RIGHT_SPLIT, // [|-] i.e. V left, split right
+ PARTITION_H4, // [Ⲷ]
+ PARTITION_V4, // [Ⲽ]
+ N_PARTITIONS,
+ N_SUB8X8_PARTITIONS = PARTITION_T_TOP_SPLIT,
+};
+
+enum BlockSize {
+ BS_128x128,
+ BS_128x64,
+ BS_64x128,
+ BS_64x64,
+ BS_64x32,
+ BS_64x16,
+ BS_32x64,
+ BS_32x32,
+ BS_32x16,
+ BS_32x8,
+ BS_16x64,
+ BS_16x32,
+ BS_16x16,
+ BS_16x8,
+ BS_16x4,
+ BS_8x32,
+ BS_8x16,
+ BS_8x8,
+ BS_8x4,
+ BS_4x16,
+ BS_4x8,
+ BS_4x4,
+ N_BS_SIZES,
+};
+
+enum Filter2d { // order is horizontal, vertical
+ FILTER_2D_8TAP_REGULAR,
+ FILTER_2D_8TAP_REGULAR_SMOOTH,
+ FILTER_2D_8TAP_REGULAR_SHARP,
+ FILTER_2D_8TAP_SHARP_REGULAR,
+ FILTER_2D_8TAP_SHARP_SMOOTH,
+ FILTER_2D_8TAP_SHARP,
+ FILTER_2D_8TAP_SMOOTH_REGULAR,
+ FILTER_2D_8TAP_SMOOTH,
+ FILTER_2D_8TAP_SMOOTH_SHARP,
+ FILTER_2D_BILINEAR,
+ N_2D_FILTERS,
+};
+
+enum MVJoint {
+ MV_JOINT_ZERO,
+ MV_JOINT_H,
+ MV_JOINT_V,
+ MV_JOINT_HV,
+ N_MV_JOINTS,
+};
+
+enum InterPredMode {
+ NEARESTMV,
+ NEARMV,
+ GLOBALMV,
+ NEWMV,
+ N_INTER_PRED_MODES,
+};
+
+enum DRL_PROXIMITY {
+ NEAREST_DRL,
+ NEARER_DRL,
+ NEAR_DRL,
+ NEARISH_DRL
+};
+
+enum CompInterPredMode {
+ NEARESTMV_NEARESTMV,
+ NEARMV_NEARMV,
+ NEARESTMV_NEWMV,
+ NEWMV_NEARESTMV,
+ NEARMV_NEWMV,
+ NEWMV_NEARMV,
+ GLOBALMV_GLOBALMV,
+ NEWMV_NEWMV,
+ N_COMP_INTER_PRED_MODES,
+};
+
+enum CompInterType {
+ COMP_INTER_NONE,
+ COMP_INTER_WEIGHTED_AVG,
+ COMP_INTER_AVG,
+ COMP_INTER_SEG,
+ COMP_INTER_WEDGE,
+};
+
+enum InterIntraType {
+ INTER_INTRA_NONE,
+ INTER_INTRA_BLEND,
+ INTER_INTRA_WEDGE,
+};
+
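+/* Motion vector in 1/8-pel units, y component first; the 32-bit alias n lets
+ * both components be copied or compared as a single integer. */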
+typedef union mv {
+ struct {
+ int16_t y, x;
+ };
+ uint32_t n;
+} mv;
+
+enum MotionMode {
+ MM_TRANSLATION,
+ MM_OBMC,
+ MM_WARP,
+};
+
+#define QINDEX_RANGE 256
+
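+/* Per-block coding state; the union keeps intra- and inter-specific fields
+ * overlapped since a block is only ever one or the other. */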
+typedef struct Av1Block {
+ uint8_t bl, bs, bp;
+ uint8_t intra, seg_id, skip_mode, skip, uvtx;
+ union {
+ struct {
+ uint8_t y_mode, uv_mode, tx, pal_sz[2];
+ int8_t y_angle, uv_angle, cfl_alpha[2];
+ }; // intra
+ struct {
+ union {
+ struct {
+ union mv mv[2];
+ uint8_t wedge_idx, mask_sign, interintra_mode;
+ };
+ struct {
+ union mv mv2d;
+ int16_t matrix[4];
+ };
+ };
+ uint8_t comp_type, inter_mode, motion_mode, drl_idx;
+ int8_t ref[2];
+ uint8_t max_ytx, filter2d, interintra_type, tx_split0;
+ uint16_t tx_split1;
+ }; // inter
+ };
+} Av1Block;
+
+#endif /* DAV1D_SRC_LEVELS_H */
diff --git a/third_party/dav1d/src/lf_apply.h b/third_party/dav1d/src/lf_apply.h
new file mode 100644
index 0000000000..cf4c898550
--- /dev/null
+++ b/third_party/dav1d/src/lf_apply.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LF_APPLY_H
+#define DAV1D_SRC_LF_APPLY_H
+
+#include <stdint.h>
+
+#include "common/bitdepth.h"
+
+#include "src/internal.h"
+#include "src/levels.h"
+
+void bytefn(dav1d_loopfilter_sbrow_cols)(const Dav1dFrameContext *f,
+ pixel *const p[3], Av1Filter *lflvl,
+ int sby, int start_of_tile_row);
+void bytefn(dav1d_loopfilter_sbrow_rows)(const Dav1dFrameContext *f,
+ pixel *const p[3], Av1Filter *lflvl,
+ int sby);
+
+void bytefn(dav1d_copy_lpf)(Dav1dFrameContext *const f,
+ /*const*/ pixel *const src[3], int sby);
+
+#endif /* DAV1D_SRC_LF_APPLY_H */
diff --git a/third_party/dav1d/src/lf_apply_tmpl.c b/third_party/dav1d/src/lf_apply_tmpl.c
new file mode 100644
index 0000000000..4ef3becd82
--- /dev/null
+++ b/third_party/dav1d/src/lf_apply_tmpl.c
@@ -0,0 +1,466 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/lf_apply.h"
+#include "src/lr_apply.h"
+
+// The loop filter buffer stores 12 rows of pixels. A superblock will
+// contain at most 2 stripes. Each stripe requires 4 rows of pixels (2 above
+// and 2 below); the final 4 rows are used to swap the bottom of the last
+// stripe with the top of the next superblock row.
+static void backup_lpf(const Dav1dFrameContext *const f,
+ pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int ss_ver, const int sb128,
+ int row, const int row_h, const int src_w,
+ const int h, const int ss_hor, const int lr_backup)
+{
+ const int cdef_backup = !lr_backup;
+ const int dst_w = f->frame_hdr->super_res.enabled ?
+ (f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w;
+
+ // The first stripe of the frame is shorter by 8 luma pixel rows.
+ int stripe_h = ((64 << (cdef_backup & sb128)) - 8 * !row) >> ss_ver;
+ src += (stripe_h - 2) * PXSTRIDE(src_stride);
+
+ if (f->c->n_tc == 1) {
+ if (row) {
+ const int top = 4 << sb128;
+            // Copy the top part of the stored loop-filtered pixels from the
+            // previous sb row, which is needed above the first stripe of this sb row.
+ pixel_copy(&dst[PXSTRIDE(dst_stride) * 0],
+ &dst[PXSTRIDE(dst_stride) * top], dst_w);
+ pixel_copy(&dst[PXSTRIDE(dst_stride) * 1],
+ &dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w);
+ pixel_copy(&dst[PXSTRIDE(dst_stride) * 2],
+ &dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w);
+ pixel_copy(&dst[PXSTRIDE(dst_stride) * 3],
+ &dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w);
+ }
+ dst += 4 * PXSTRIDE(dst_stride);
+ }
+
+ if (lr_backup && (f->frame_hdr->width[0] != f->frame_hdr->width[1])) {
+ while (row + stripe_h <= row_h) {
+ const int n_lines = 4 - (row + stripe_h + 1 == h);
+ f->dsp->mc.resize(dst, dst_stride, src, src_stride,
+ dst_w, n_lines, src_w, f->resize_step[ss_hor],
+ f->resize_start[ss_hor] HIGHBD_CALL_SUFFIX);
+ row += stripe_h; // unmodified stripe_h for the 1st stripe
+ stripe_h = 64 >> ss_ver;
+ src += stripe_h * PXSTRIDE(src_stride);
+ dst += n_lines * PXSTRIDE(dst_stride);
+ if (n_lines == 3) {
+ pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], dst_w);
+ dst += PXSTRIDE(dst_stride);
+ }
+ }
+ } else {
+ while (row + stripe_h <= row_h) {
+ const int n_lines = 4 - (row + stripe_h + 1 == h);
+ for (int i = 0; i < 4; i++) {
+ pixel_copy(dst, i == n_lines ? &dst[-PXSTRIDE(dst_stride)] :
+ src, src_w);
+ dst += PXSTRIDE(dst_stride);
+ src += PXSTRIDE(src_stride);
+ }
+ row += stripe_h; // unmodified stripe_h for the 1st stripe
+ stripe_h = 64 >> ss_ver;
+ src += (stripe_h - 4) * PXSTRIDE(src_stride);
+ }
+ }
+}
+
+void bytefn(dav1d_copy_lpf)(Dav1dFrameContext *const f,
+ /*const*/ pixel *const src[3], const int sby)
+{
+ const int have_tt = f->c->n_tc > 1;
+ const int resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
+ const int offset = 8 * !!sby;
+ const ptrdiff_t *const src_stride = f->cur.stride;
+ const ptrdiff_t *const lr_stride = f->sr_cur.p.stride;
+ const int tt_off = have_tt * sby * (4 << f->seq_hdr->sb128);
+ pixel *const dst[3] = {
+ f->lf.lr_lpf_line[0] + tt_off * PXSTRIDE(lr_stride[0]),
+ f->lf.lr_lpf_line[1] + tt_off * PXSTRIDE(lr_stride[1]),
+ f->lf.lr_lpf_line[2] + tt_off * PXSTRIDE(lr_stride[1])
+ };
+
+ // TODO Also check block level restore type to reduce copying.
+ const int restore_planes = f->lf.restore_planes;
+
+ if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_Y) {
+ const int h = f->cur.p.h;
+ const int w = f->bw << 2;
+ const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1);
+ const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;
+ if (restore_planes & LR_RESTORE_Y || !resize)
+ backup_lpf(f, dst[0], lr_stride[0],
+ src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
+ 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, 1);
+ if (have_tt && resize) {
+ const ptrdiff_t cdef_off_y = sby * 4 * PXSTRIDE(src_stride[0]);
+ backup_lpf(f, f->lf.cdef_lpf_line[0] + cdef_off_y, src_stride[0],
+ src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
+ 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, 0);
+ }
+ }
+ if ((f->seq_hdr->cdef || restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) &&
+ f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400)
+ {
+ const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int h = (f->cur.p.h + ss_ver) >> ss_ver;
+ const int w = f->bw << (2 - ss_hor);
+ const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1);
+ const int offset_uv = offset >> ss_ver;
+ const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
+ const ptrdiff_t cdef_off_uv = sby * 4 * PXSTRIDE(src_stride[1]);
+ if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_U) {
+ if (restore_planes & LR_RESTORE_U || !resize)
+ backup_lpf(f, dst[1], lr_stride[1],
+ src[1] - offset_uv * PXSTRIDE(src_stride[1]),
+ src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe,
+ row_h, w, h, ss_hor, 1);
+ if (have_tt && resize)
+ backup_lpf(f, f->lf.cdef_lpf_line[1] + cdef_off_uv, src_stride[1],
+ src[1] - offset_uv * PXSTRIDE(src_stride[1]),
+ src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe,
+ row_h, w, h, ss_hor, 0);
+ }
+ if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_V) {
+ if (restore_planes & LR_RESTORE_V || !resize)
+ backup_lpf(f, dst[2], lr_stride[1],
+ src[2] - offset_uv * PXSTRIDE(src_stride[1]),
+ src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe,
+ row_h, w, h, ss_hor, 1);
+ if (have_tt && resize)
+ backup_lpf(f, f->lf.cdef_lpf_line[2] + cdef_off_uv, src_stride[1],
+ src[2] - offset_uv * PXSTRIDE(src_stride[1]),
+ src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe,
+ row_h, w, h, ss_hor, 0);
+ }
+ }
+}
+
+static inline void filter_plane_cols_y(const Dav1dFrameContext *const f,
+ const int have_left,
+ const uint8_t (*lvl)[4],
+ const ptrdiff_t b4_stride,
+ const uint16_t (*const mask)[3][2],
+ pixel *dst, const ptrdiff_t ls,
+ const int w,
+ const int starty4, const int endy4)
+{
+ const Dav1dDSPContext *const dsp = f->dsp;
+
+ // filter edges between columns (e.g. block1 | block2)
+ for (int x = 0; x < w; x++) {
+ if (!have_left && !x) continue;
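+        // Each 16-bit mask half covers 16 rows of 4px edges; if this call
+        // spans more than 16 rows (128px superblock), the two halves are
+        // merged into a single 32-bit mask.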
+ uint32_t hmask[4];
+ if (!starty4) {
+ hmask[0] = mask[x][0][0];
+ hmask[1] = mask[x][1][0];
+ hmask[2] = mask[x][2][0];
+ if (endy4 > 16) {
+ hmask[0] |= (unsigned) mask[x][0][1] << 16;
+ hmask[1] |= (unsigned) mask[x][1][1] << 16;
+ hmask[2] |= (unsigned) mask[x][2][1] << 16;
+ }
+ } else {
+ hmask[0] = mask[x][0][1];
+ hmask[1] = mask[x][1][1];
+ hmask[2] = mask[x][2][1];
+ }
+ hmask[3] = 0;
+ dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls, hmask,
+ (const uint8_t(*)[4]) &lvl[x][0], b4_stride,
+ &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX);
+ }
+}
+
+static inline void filter_plane_rows_y(const Dav1dFrameContext *const f,
+ const int have_top,
+ const uint8_t (*lvl)[4],
+ const ptrdiff_t b4_stride,
+ const uint16_t (*const mask)[3][2],
+ pixel *dst, const ptrdiff_t ls,
+ const int w,
+ const int starty4, const int endy4)
+{
+ const Dav1dDSPContext *const dsp = f->dsp;
+
+ // block1
+ // filter edges between rows (e.g. ------)
+ // block2
+ for (int y = starty4; y < endy4;
+ y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
+ {
+ if (!have_top && !y) continue;
+ const uint32_t vmask[4] = {
+ mask[y][0][0] | ((unsigned) mask[y][0][1] << 16),
+ mask[y][1][0] | ((unsigned) mask[y][1][1] << 16),
+ mask[y][2][0] | ((unsigned) mask[y][2][1] << 16),
+ 0,
+ };
+ dsp->lf.loop_filter_sb[0][1](dst, ls, vmask,
+ (const uint8_t(*)[4]) &lvl[0][1], b4_stride,
+ &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX);
+ }
+}
+
+static inline void filter_plane_cols_uv(const Dav1dFrameContext *const f,
+ const int have_left,
+ const uint8_t (*lvl)[4],
+ const ptrdiff_t b4_stride,
+ const uint16_t (*const mask)[2][2],
+ pixel *const u, pixel *const v,
+ const ptrdiff_t ls, const int w,
+ const int starty4, const int endy4,
+ const int ss_ver)
+{
+ const Dav1dDSPContext *const dsp = f->dsp;
+
+ // filter edges between columns (e.g. block1 | block2)
+ for (int x = 0; x < w; x++) {
+ if (!have_left && !x) continue;
+ uint32_t hmask[3];
+ if (!starty4) {
+ hmask[0] = mask[x][0][0];
+ hmask[1] = mask[x][1][0];
+ if (endy4 > (16 >> ss_ver)) {
+ hmask[0] |= (unsigned) mask[x][0][1] << (16 >> ss_ver);
+ hmask[1] |= (unsigned) mask[x][1][1] << (16 >> ss_ver);
+ }
+ } else {
+ hmask[0] = mask[x][0][1];
+ hmask[1] = mask[x][1][1];
+ }
+ hmask[2] = 0;
+ dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls, hmask,
+ (const uint8_t(*)[4]) &lvl[x][2], b4_stride,
+ &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX);
+ dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls, hmask,
+ (const uint8_t(*)[4]) &lvl[x][3], b4_stride,
+ &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX);
+ }
+}
+
+static inline void filter_plane_rows_uv(const Dav1dFrameContext *const f,
+ const int have_top,
+ const uint8_t (*lvl)[4],
+ const ptrdiff_t b4_stride,
+ const uint16_t (*const mask)[2][2],
+ pixel *const u, pixel *const v,
+ const ptrdiff_t ls, const int w,
+ const int starty4, const int endy4,
+ const int ss_hor)
+{
+ const Dav1dDSPContext *const dsp = f->dsp;
+ ptrdiff_t off_l = 0;
+
+ // block1
+ // filter edges between rows (e.g. ------)
+ // block2
+ for (int y = starty4; y < endy4;
+ y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
+ {
+ if (!have_top && !y) continue;
+ const uint32_t vmask[3] = {
+ mask[y][0][0] | ((unsigned) mask[y][0][1] << (16 >> ss_hor)),
+ mask[y][1][0] | ((unsigned) mask[y][1][1] << (16 >> ss_hor)),
+ 0,
+ };
+ dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, vmask,
+ (const uint8_t(*)[4]) &lvl[0][2], b4_stride,
+ &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX);
+ dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, vmask,
+ (const uint8_t(*)[4]) &lvl[0][3], b4_stride,
+ &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX);
+ }
+}
+
+void bytefn(dav1d_loopfilter_sbrow_cols)(const Dav1dFrameContext *const f,
+ pixel *const p[3], Av1Filter *const lflvl,
+ int sby, const int start_of_tile_row)
+{
+ int x, have_left;
+ // Don't filter outside the frame
+ const int is_sb64 = !f->seq_hdr->sb128;
+ const int starty4 = (sby & is_sb64) << 4;
+ const int sbsz = 32 >> is_sb64;
+ const int sbl2 = 5 - is_sb64;
+ const int halign = (f->bh + 31) & ~31;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
+ const unsigned vmax = 1U << vmask, hmax = 1U << hmask;
+ const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz);
+ const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;
+
+ // fix lpf strength at tile col boundaries
+ const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2];
+ const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)];
+ for (int tile_col = 1;; tile_col++) {
+ x = f->frame_hdr->tiling.col_start_sb[tile_col];
+ if ((x << sbl2) >= f->bw) break;
+ const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor;
+ x >>= is_sb64;
+
+ uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4];
+ for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) {
+ const int sidx = mask >= 0x10000U;
+ const unsigned smask = mask >> (sidx << 4);
+ const int idx = 2 * !!(y_hmask[2][sidx] & smask) +
+ !!(y_hmask[1][sidx] & smask);
+ y_hmask[2][sidx] &= ~smask;
+ y_hmask[1][sidx] &= ~smask;
+ y_hmask[0][sidx] &= ~smask;
+ y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask;
+ }
+
+ if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+ uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4];
+ for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;
+ y++, uv_mask <<= 1)
+ {
+ const int sidx = uv_mask >= vmax;
+ const unsigned smask = uv_mask >> (sidx << (4 - ss_ver));
+ const int idx = !!(uv_hmask[1][sidx] & smask);
+ uv_hmask[1][sidx] &= ~smask;
+ uv_hmask[0][sidx] &= ~smask;
+ uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])][sidx] |= smask;
+ }
+ }
+ lpf_y += halign;
+ lpf_uv += halign >> ss_ver;
+ }
+
+ // fix lpf strength at tile row boundaries
+ if (start_of_tile_row) {
+ const BlockContext *a;
+ for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)];
+ x < f->sb128w; x++, a++)
+ {
+ uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4];
+ const unsigned w = imin(32, f->w4 - (x << 5));
+ for (unsigned mask = 1, i = 0; i < w; mask <<= 1, i++) {
+ const int sidx = mask >= 0x10000U;
+ const unsigned smask = mask >> (sidx << 4);
+ const int idx = 2 * !!(y_vmask[2][sidx] & smask) +
+ !!(y_vmask[1][sidx] & smask);
+ y_vmask[2][sidx] &= ~smask;
+ y_vmask[1][sidx] &= ~smask;
+ y_vmask[0][sidx] &= ~smask;
+ y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask;
+ }
+
+ if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+ const unsigned cw = (w + ss_hor) >> ss_hor;
+ uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver];
+ for (unsigned uv_mask = 1, i = 0; i < cw; uv_mask <<= 1, i++) {
+ const int sidx = uv_mask >= hmax;
+ const unsigned smask = uv_mask >> (sidx << (4 - ss_hor));
+ const int idx = !!(uv_vmask[1][sidx] & smask);
+ uv_vmask[1][sidx] &= ~smask;
+ uv_vmask[0][sidx] &= ~smask;
+ uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask;
+ }
+ }
+ }
+ }
+
+ pixel *ptr;
+ uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
+ for (ptr = p[0], have_left = 0, x = 0; x < f->sb128w;
+ x++, have_left = 1, ptr += 128, level_ptr += 32)
+ {
+ filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,
+ lflvl[x].filter_y[0], ptr, f->cur.stride[0],
+ imin(32, f->w4 - x * 32), starty4, endy4);
+ }
+
+ if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v)
+ return;
+
+ ptrdiff_t uv_off;
+ level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
+ for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w;
+ x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
+ {
+ filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
+ lflvl[x].filter_uv[0],
+ &p[1][uv_off], &p[2][uv_off], f->cur.stride[1],
+ (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
+ starty4 >> ss_ver, uv_endy4, ss_ver);
+ }
+}
+
+void bytefn(dav1d_loopfilter_sbrow_rows)(const Dav1dFrameContext *const f,
+ pixel *const p[3], Av1Filter *const lflvl,
+ int sby)
+{
+ int x;
+ // Don't filter outside the frame
+ const int have_top = sby > 0;
+ const int is_sb64 = !f->seq_hdr->sb128;
+ const int starty4 = (sby & is_sb64) << 4;
+ const int sbsz = 32 >> is_sb64;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz);
+ const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;
+
+ pixel *ptr;
+ uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
+ for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {
+ filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
+ lflvl[x].filter_y[1], ptr, f->cur.stride[0],
+ imin(32, f->w4 - x * 32), starty4, endy4);
+ }
+
+ if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v)
+ return;
+
+ ptrdiff_t uv_off;
+ level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
+ for (uv_off = 0, x = 0; x < f->sb128w;
+ x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
+ {
+ filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride,
+ lflvl[x].filter_uv[1],
+ &p[1][uv_off], &p[2][uv_off], f->cur.stride[1],
+ (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
+ starty4 >> ss_ver, uv_endy4, ss_hor);
+ }
+}
diff --git a/third_party/dav1d/src/lf_mask.c b/third_party/dav1d/src/lf_mask.c
new file mode 100644
index 0000000000..062ba67371
--- /dev/null
+++ b/third_party/dav1d/src/lf_mask.c
@@ -0,0 +1,495 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/ctx.h"
+#include "src/levels.h"
+#include "src/lf_mask.h"
+#include "src/tables.h"
+
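+/* Recursively expand the transform split tree into per-4x4 maps of transform
+ * size (txa[..][0]) and step (txa[..][1]), kept separately for the two edge
+ * directions. */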
+static void decomp_tx(uint8_t (*const txa)[2 /* txsz, step */][32 /* y */][32 /* x */],
+ const enum RectTxfmSize from,
+ const int depth,
+ const int y_off, const int x_off,
+ const uint16_t *const tx_masks)
+{
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[from];
+ const int is_split = (from == (int) TX_4X4 || depth > 1) ? 0 :
+ (tx_masks[depth] >> (y_off * 4 + x_off)) & 1;
+
+ if (is_split) {
+ const enum RectTxfmSize sub = t_dim->sub;
+ const int htw4 = t_dim->w >> 1, hth4 = t_dim->h >> 1;
+
+ decomp_tx(txa, sub, depth + 1, y_off * 2 + 0, x_off * 2 + 0, tx_masks);
+ if (t_dim->w >= t_dim->h)
+ decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][0][htw4],
+ sub, depth + 1, y_off * 2 + 0, x_off * 2 + 1, tx_masks);
+ if (t_dim->h >= t_dim->w) {
+ decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][hth4][0],
+ sub, depth + 1, y_off * 2 + 1, x_off * 2 + 0, tx_masks);
+ if (t_dim->w >= t_dim->h)
+ decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][hth4][htw4],
+ sub, depth + 1, y_off * 2 + 1, x_off * 2 + 1, tx_masks);
+ }
+ } else {
+ const int lw = imin(2, t_dim->lw), lh = imin(2, t_dim->lh);
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ for (int y = 0; y < t_dim->h; y++) { \
+ rep_macro(type, txa[0][0][y], off, mul * lw); \
+ rep_macro(type, txa[1][0][y], off, mul * lh); \
+ txa[0][1][y][0] = t_dim->w; \
+ }
+ case_set_upto16(t_dim->w,,, 0);
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, txa[1][1][0], off, mul * t_dim->h)
+ case_set_upto16(t_dim->w,,, 0);
+#undef set_ctx
+ }
+}
+
+static inline void mask_edges_inter(uint16_t (*const masks)[32][3][2],
+ const int by4, const int bx4,
+ const int w4, const int h4, const int skip,
+ const enum RectTxfmSize max_tx,
+ const uint16_t *const tx_masks,
+ uint8_t *const a, uint8_t *const l)
+{
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[max_tx];
+ int y, x;
+
+ ALIGN_STK_16(uint8_t, txa, 2 /* edge */, [2 /* txsz, step */][32 /* y */][32 /* x */]);
+ for (int y_off = 0, y = 0; y < h4; y += t_dim->h, y_off++)
+ for (int x_off = 0, x = 0; x < w4; x += t_dim->w, x_off++)
+ decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][y][x],
+ max_tx, 0, y_off, x_off, tx_masks);
+
+ // left block edge
+ unsigned mask = 1U << by4;
+ for (y = 0; y < h4; y++, mask <<= 1) {
+ const int sidx = mask >= 0x10000;
+ const unsigned smask = mask >> (sidx << 4);
+ masks[0][bx4][imin(txa[0][0][y][0], l[y])][sidx] |= smask;
+ }
+
+ // top block edge
+ for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) {
+ const int sidx = mask >= 0x10000;
+ const unsigned smask = mask >> (sidx << 4);
+ masks[1][by4][imin(txa[1][0][0][x], a[x])][sidx] |= smask;
+ }
+
+ if (!skip) {
+ // inner (tx) left|right edges
+ for (y = 0, mask = 1U << by4; y < h4; y++, mask <<= 1) {
+ const int sidx = mask >= 0x10000U;
+ const unsigned smask = mask >> (sidx << 4);
+ int ltx = txa[0][0][y][0];
+ int step = txa[0][1][y][0];
+ for (x = step; x < w4; x += step) {
+ const int rtx = txa[0][0][y][x];
+ masks[0][bx4 + x][imin(rtx, ltx)][sidx] |= smask;
+ ltx = rtx;
+ step = txa[0][1][y][x];
+ }
+ }
+
+ // top
+ // inner (tx) --- edges
+ // bottom
+ for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) {
+ const int sidx = mask >= 0x10000U;
+ const unsigned smask = mask >> (sidx << 4);
+ int ttx = txa[1][0][0][x];
+ int step = txa[1][1][0][x];
+ for (y = step; y < h4; y += step) {
+ const int btx = txa[1][0][y][x];
+ masks[1][by4 + y][imin(ttx, btx)][sidx] |= smask;
+ ttx = btx;
+ step = txa[1][1][y][x];
+ }
+ }
+ }
+
+ for (y = 0; y < h4; y++)
+ l[y] = txa[0][0][y][w4 - 1];
+ memcpy(a, txa[1][0][h4 - 1], w4);
+}
+
+static inline void mask_edges_intra(uint16_t (*const masks)[32][3][2],
+ const int by4, const int bx4,
+ const int w4, const int h4,
+ const enum RectTxfmSize tx,
+ uint8_t *const a, uint8_t *const l)
+{
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
+ const int twl4 = t_dim->lw, thl4 = t_dim->lh;
+ const int twl4c = imin(2, twl4), thl4c = imin(2, thl4);
+ int y, x;
+
+ // left block edge
+ unsigned mask = 1U << by4;
+ for (y = 0; y < h4; y++, mask <<= 1) {
+ const int sidx = mask >= 0x10000;
+ const unsigned smask = mask >> (sidx << 4);
+ masks[0][bx4][imin(twl4c, l[y])][sidx] |= smask;
+ }
+
+ // top block edge
+ for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) {
+ const int sidx = mask >= 0x10000;
+ const unsigned smask = mask >> (sidx << 4);
+ masks[1][by4][imin(thl4c, a[x])][sidx] |= smask;
+ }
+
+ // inner (tx) left|right edges
+ const int hstep = t_dim->w;
+ unsigned t = 1U << by4;
+ unsigned inner = (unsigned) ((((uint64_t) t) << h4) - t);
+ unsigned inner1 = inner & 0xffff, inner2 = inner >> 16;
+ for (x = hstep; x < w4; x += hstep) {
+ if (inner1) masks[0][bx4 + x][twl4c][0] |= inner1;
+ if (inner2) masks[0][bx4 + x][twl4c][1] |= inner2;
+ }
+
+ // top
+ // inner (tx) --- edges
+ // bottom
+ const int vstep = t_dim->h;
+ t = 1U << bx4;
+ inner = (unsigned) ((((uint64_t) t) << w4) - t);
+ inner1 = inner & 0xffff;
+ inner2 = inner >> 16;
+ for (y = vstep; y < h4; y += vstep) {
+ if (inner1) masks[1][by4 + y][thl4c][0] |= inner1;
+ if (inner2) masks[1][by4 + y][thl4c][1] |= inner2;
+ }
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, a, off, mul * thl4c)
+#define default_memset(dir, diridx, off, var) \
+ memset(a, thl4c, var)
+ case_set_upto32_with_default(w4,,, 0);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, l, off, mul * twl4c)
+#define default_memset(dir, diridx, off, var) \
+ memset(l, twl4c, var)
+ case_set_upto32_with_default(h4,,, 0);
+#undef default_memset
+#undef set_ctx
+}
+
+static void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
+ const int cby4, const int cbx4,
+ const int cw4, const int ch4,
+ const int skip_inter,
+ const enum RectTxfmSize tx,
+ uint8_t *const a, uint8_t *const l,
+ const int ss_hor, const int ss_ver)
+{
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
+ const int twl4 = t_dim->lw, thl4 = t_dim->lh;
+ const int twl4c = !!twl4, thl4c = !!thl4;
+ int y, x;
+ const int vbits = 4 - ss_ver, hbits = 4 - ss_hor;
+ const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
+ const unsigned vmax = 1 << vmask, hmax = 1 << hmask;
+
+ // left block edge
+ unsigned mask = 1U << cby4;
+ for (y = 0; y < ch4; y++, mask <<= 1) {
+ const int sidx = mask >= vmax;
+ const unsigned smask = mask >> (sidx << vbits);
+ masks[0][cbx4][imin(twl4c, l[y])][sidx] |= smask;
+ }
+
+ // top block edge
+ for (x = 0, mask = 1U << cbx4; x < cw4; x++, mask <<= 1) {
+ const int sidx = mask >= hmax;
+ const unsigned smask = mask >> (sidx << hbits);
+ masks[1][cby4][imin(thl4c, a[x])][sidx] |= smask;
+ }
+
+ if (!skip_inter) {
+ // inner (tx) left|right edges
+ const int hstep = t_dim->w;
+ unsigned t = 1U << cby4;
+ unsigned inner = (unsigned) ((((uint64_t) t) << ch4) - t);
+ unsigned inner1 = inner & ((1 << vmask) - 1), inner2 = inner >> vmask;
+ for (x = hstep; x < cw4; x += hstep) {
+ if (inner1) masks[0][cbx4 + x][twl4c][0] |= inner1;
+ if (inner2) masks[0][cbx4 + x][twl4c][1] |= inner2;
+ }
+
+ // top
+ // inner (tx) --- edges
+ // bottom
+ const int vstep = t_dim->h;
+ t = 1U << cbx4;
+ inner = (unsigned) ((((uint64_t) t) << cw4) - t);
+ inner1 = inner & ((1 << hmask) - 1), inner2 = inner >> hmask;
+ for (y = vstep; y < ch4; y += vstep) {
+ if (inner1) masks[1][cby4 + y][thl4c][0] |= inner1;
+ if (inner2) masks[1][cby4 + y][thl4c][1] |= inner2;
+ }
+ }
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, a, off, mul * thl4c)
+#define default_memset(dir, diridx, off, var) \
+ memset(a, thl4c, var)
+ case_set_upto32_with_default(cw4,,, 0);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, l, off, mul * twl4c)
+#define default_memset(dir, diridx, off, var) \
+ memset(l, twl4c, var)
+ case_set_upto32_with_default(ch4,,, 0);
+#undef default_memset
+#undef set_ctx
+}
+
+void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
+ uint8_t (*const level_cache)[4],
+ const ptrdiff_t b4_stride,
+ const uint8_t (*filter_level)[8][2],
+ const int bx, const int by,
+ const int iw, const int ih,
+ const enum BlockSize bs,
+ const enum RectTxfmSize ytx,
+ const enum RectTxfmSize uvtx,
+ const enum Dav1dPixelLayout layout,
+ uint8_t *const ay, uint8_t *const ly,
+ uint8_t *const auv, uint8_t *const luv)
+{
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = imin(iw - bx, b_dim[0]);
+ const int bh4 = imin(ih - by, b_dim[1]);
+ const int bx4 = bx & 31;
+ const int by4 = by & 31;
+ assert(bw4 >= 0 && bh4 >= 0);
+
+ if (bw4 && bh4) {
+ uint8_t (*level_cache_ptr)[4] = level_cache + by * b4_stride + bx;
+ for (int y = 0; y < bh4; y++) {
+ for (int x = 0; x < bw4; x++) {
+ level_cache_ptr[x][0] = filter_level[0][0][0];
+ level_cache_ptr[x][1] = filter_level[1][0][0];
+ }
+ level_cache_ptr += b4_stride;
+ }
+
+ mask_edges_intra(lflvl->filter_y, by4, bx4, bw4, bh4, ytx, ay, ly);
+ }
+
+ if (!auv) return;
+
+ const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int cbw4 = imin(((iw + ss_hor) >> ss_hor) - (bx >> ss_hor),
+ (b_dim[0] + ss_hor) >> ss_hor);
+ const int cbh4 = imin(((ih + ss_ver) >> ss_ver) - (by >> ss_ver),
+ (b_dim[1] + ss_ver) >> ss_ver);
+ assert(cbw4 >= 0 && cbh4 >= 0);
+
+ if (!cbw4 || !cbh4) return;
+
+ const int cbx4 = bx4 >> ss_hor;
+ const int cby4 = by4 >> ss_ver;
+
+ uint8_t (*level_cache_ptr)[4] =
+ level_cache + (by >> ss_ver) * b4_stride + (bx >> ss_hor);
+ for (int y = 0; y < cbh4; y++) {
+ for (int x = 0; x < cbw4; x++) {
+ level_cache_ptr[x][2] = filter_level[2][0][0];
+ level_cache_ptr[x][3] = filter_level[3][0][0];
+ }
+ level_cache_ptr += b4_stride;
+ }
+
+ mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, 0, uvtx,
+ auv, luv, ss_hor, ss_ver);
+}
+
+void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
+ uint8_t (*const level_cache)[4],
+ const ptrdiff_t b4_stride,
+ const uint8_t (*filter_level)[8][2],
+ const int bx, const int by,
+ const int iw, const int ih,
+ const int skip, const enum BlockSize bs,
+ const enum RectTxfmSize max_ytx,
+ const uint16_t *const tx_masks,
+ const enum RectTxfmSize uvtx,
+ const enum Dav1dPixelLayout layout,
+ uint8_t *const ay, uint8_t *const ly,
+ uint8_t *const auv, uint8_t *const luv)
+{
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = imin(iw - bx, b_dim[0]);
+ const int bh4 = imin(ih - by, b_dim[1]);
+ const int bx4 = bx & 31;
+ const int by4 = by & 31;
+ assert(bw4 >= 0 && bh4 >= 0);
+
+ if (bw4 && bh4) {
+ uint8_t (*level_cache_ptr)[4] = level_cache + by * b4_stride + bx;
+ for (int y = 0; y < bh4; y++) {
+ for (int x = 0; x < bw4; x++) {
+ level_cache_ptr[x][0] = filter_level[0][0][0];
+ level_cache_ptr[x][1] = filter_level[1][0][0];
+ }
+ level_cache_ptr += b4_stride;
+ }
+
+ mask_edges_inter(lflvl->filter_y, by4, bx4, bw4, bh4, skip,
+ max_ytx, tx_masks, ay, ly);
+ }
+
+ if (!auv) return;
+
+ const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int cbw4 = imin(((iw + ss_hor) >> ss_hor) - (bx >> ss_hor),
+ (b_dim[0] + ss_hor) >> ss_hor);
+ const int cbh4 = imin(((ih + ss_ver) >> ss_ver) - (by >> ss_ver),
+ (b_dim[1] + ss_ver) >> ss_ver);
+ assert(cbw4 >= 0 && cbh4 >= 0);
+
+ if (!cbw4 || !cbh4) return;
+
+ const int cbx4 = bx4 >> ss_hor;
+ const int cby4 = by4 >> ss_ver;
+
+ uint8_t (*level_cache_ptr)[4] =
+ level_cache + (by >> ss_ver) * b4_stride + (bx >> ss_hor);
+ for (int y = 0; y < cbh4; y++) {
+ for (int x = 0; x < cbw4; x++) {
+ level_cache_ptr[x][2] = filter_level[2][0][0];
+ level_cache_ptr[x][3] = filter_level[3][0][0];
+ }
+ level_cache_ptr += b4_stride;
+ }
+
+ mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, skip, uvtx,
+ auv, luv, ss_hor, ss_ver);
+}
+
+void dav1d_calc_eih(Av1FilterLUT *const lim_lut, const int filter_sharpness) {
+ // set E/I/H values from loopfilter level
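+    // e.g. with filter_sharpness == 0: I = max(level, 1), E = 2 * (level + 2) + I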
+ const int sharp = filter_sharpness;
+ for (int level = 0; level < 64; level++) {
+ int limit = level;
+
+ if (sharp > 0) {
+ limit >>= (sharp + 3) >> 2;
+ limit = imin(limit, 9 - sharp);
+ }
+ limit = imax(limit, 1);
+
+ lim_lut->i[level] = limit;
+ lim_lut->e[level] = 2 * (level + 2) + limit;
+ }
+ lim_lut->sharp[0] = (sharp + 3) >> 2;
+ lim_lut->sharp[1] = sharp ? 9 - sharp : 0xff;
+}
+
+static void calc_lf_value(uint8_t (*const lflvl_values)[2],
+ const int base_lvl, const int lf_delta,
+ const int seg_delta,
+ const Dav1dLoopfilterModeRefDeltas *const mr_delta)
+{
+ const int base = iclip(iclip(base_lvl + lf_delta, 0, 63) + seg_delta, 0, 63);
+
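+    // Mode/reference deltas are doubled once the base level reaches 32.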
+ if (!mr_delta) {
+ memset(lflvl_values, base, 8 * 2);
+ } else {
+ const int sh = base >= 32;
+ lflvl_values[0][0] = lflvl_values[0][1] =
+ iclip(base + (mr_delta->ref_delta[0] * (1 << sh)), 0, 63);
+ for (int r = 1; r < 8; r++) {
+ for (int m = 0; m < 2; m++) {
+ const int delta =
+ mr_delta->mode_delta[m] + mr_delta->ref_delta[r];
+ lflvl_values[r][m] = iclip(base + (delta * (1 << sh)), 0, 63);
+ }
+ }
+ }
+}
+
+static inline void calc_lf_value_chroma(uint8_t (*const lflvl_values)[2],
+ const int base_lvl, const int lf_delta,
+ const int seg_delta,
+ const Dav1dLoopfilterModeRefDeltas *const mr_delta)
+{
+ if (!base_lvl)
+ memset(lflvl_values, 0, 8 * 2);
+ else
+ calc_lf_value(lflvl_values, base_lvl, lf_delta, seg_delta, mr_delta);
+}
+
+void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2],
+ const Dav1dFrameHeader *const hdr,
+ const int8_t lf_delta[4])
+{
+ const int n_seg = hdr->segmentation.enabled ? 8 : 1;
+
+ if (!hdr->loopfilter.level_y[0] && !hdr->loopfilter.level_y[1]) {
+ memset(lflvl_values, 0, 8 * 4 * 2 * n_seg);
+ return;
+ }
+
+ const Dav1dLoopfilterModeRefDeltas *const mr_deltas =
+ hdr->loopfilter.mode_ref_delta_enabled ?
+ &hdr->loopfilter.mode_ref_deltas : NULL;
+ for (int s = 0; s < n_seg; s++) {
+ const Dav1dSegmentationData *const segd =
+ hdr->segmentation.enabled ? &hdr->segmentation.seg_data.d[s] : NULL;
+
+ calc_lf_value(lflvl_values[s][0], hdr->loopfilter.level_y[0],
+ lf_delta[0], segd ? segd->delta_lf_y_v : 0, mr_deltas);
+ calc_lf_value(lflvl_values[s][1], hdr->loopfilter.level_y[1],
+ lf_delta[hdr->delta.lf.multi ? 1 : 0],
+ segd ? segd->delta_lf_y_h : 0, mr_deltas);
+ calc_lf_value_chroma(lflvl_values[s][2], hdr->loopfilter.level_u,
+ lf_delta[hdr->delta.lf.multi ? 2 : 0],
+ segd ? segd->delta_lf_u : 0, mr_deltas);
+ calc_lf_value_chroma(lflvl_values[s][3], hdr->loopfilter.level_v,
+ lf_delta[hdr->delta.lf.multi ? 3 : 0],
+ segd ? segd->delta_lf_v : 0, mr_deltas);
+ }
+}
diff --git a/third_party/dav1d/src/lf_mask.h b/third_party/dav1d/src/lf_mask.h
new file mode 100644
index 0000000000..8991ed4185
--- /dev/null
+++ b/third_party/dav1d/src/lf_mask.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LF_MASK_H
+#define DAV1D_SRC_LF_MASK_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "src/levels.h"
+
+typedef struct Av1FilterLUT {
+ uint8_t e[64];
+ uint8_t i[64];
+ uint64_t sharp[2];
+} Av1FilterLUT;
+
+typedef struct Av1RestorationUnit {
+ /* SGR: type = DAV1D_RESTORATION_SGRPROJ + sgr_idx */
+ uint8_t /* enum Dav1dRestorationType */ type;
+ int8_t filter_h[3];
+ int8_t filter_v[3];
+ int8_t sgr_weights[2];
+} Av1RestorationUnit;
+
+// each struct describes one 128x128 area (1 or 4 SBs), pre-superres-scaling
+typedef struct Av1Filter {
+ // each bit is 1 col
+ uint16_t filter_y[2 /* 0=col, 1=row */][32][3][2];
+ uint16_t filter_uv[2 /* 0=col, 1=row */][32][2][2];
+ int8_t cdef_idx[4]; // -1 means "unset"
+ uint16_t noskip_mask[16][2]; // for 8x8 blocks, but stored on a 4x8 basis
+} Av1Filter;
+
+// each struct describes one 128x128 area (1 or 4 SBs), post-superres-scaling
+typedef struct Av1Restoration {
+ Av1RestorationUnit lr[3][4];
+} Av1Restoration;
+
+void dav1d_create_lf_mask_intra(Av1Filter *lflvl, uint8_t (*level_cache)[4],
+ const ptrdiff_t b4_stride,
+ const uint8_t (*level)[8][2], int bx, int by,
+ int iw, int ih, enum BlockSize bs,
+ enum RectTxfmSize ytx, enum RectTxfmSize uvtx,
+ enum Dav1dPixelLayout layout, uint8_t *ay,
+ uint8_t *ly, uint8_t *auv, uint8_t *luv);
+void dav1d_create_lf_mask_inter(Av1Filter *lflvl, uint8_t (*level_cache)[4],
+ const ptrdiff_t b4_stride,
+ const uint8_t (*level)[8][2], int bx, int by,
+ int iw, int ih, int skip_inter,
+ enum BlockSize bs, enum RectTxfmSize max_ytx,
+ const uint16_t *tx_mask, enum RectTxfmSize uvtx,
+ enum Dav1dPixelLayout layout, uint8_t *ay,
+ uint8_t *ly, uint8_t *auv, uint8_t *luv);
+void dav1d_calc_eih(Av1FilterLUT *lim_lut, int filter_sharpness);
+void dav1d_calc_lf_values(uint8_t (*values)[4][8][2], const Dav1dFrameHeader *hdr,
+ const int8_t lf_delta[4]);
+
+#endif /* DAV1D_SRC_LF_MASK_H */
diff --git a/third_party/dav1d/src/lib.c b/third_party/dav1d/src/lib.c
new file mode 100644
index 0000000000..3807efdcce
--- /dev/null
+++ b/third_party/dav1d/src/lib.c
@@ -0,0 +1,761 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "vcs_version.h"
+
+#include <errno.h>
+#include <string.h>
+
+#if defined(__linux__) && defined(HAVE_DLSYM)
+#include <dlfcn.h>
+#endif
+
+#include "dav1d/dav1d.h"
+#include "dav1d/data.h"
+
+#include "common/validate.h"
+
+#include "src/cpu.h"
+#include "src/fg_apply.h"
+#include "src/internal.h"
+#include "src/log.h"
+#include "src/obu.h"
+#include "src/qm.h"
+#include "src/ref.h"
+#include "src/thread_task.h"
+#include "src/wedge.h"
+
+static COLD void init_internal(void) {
+ dav1d_init_cpu();
+ dav1d_init_ii_wedge_masks();
+ dav1d_init_intra_edge_tree();
+ dav1d_init_qm_tables();
+ dav1d_init_thread();
+}
+
+COLD const char *dav1d_version(void) {
+ return DAV1D_VERSION;
+}
+
+COLD unsigned dav1d_version_api(void) {
+ return (DAV1D_API_VERSION_MAJOR << 16) |
+ (DAV1D_API_VERSION_MINOR << 8) |
+ (DAV1D_API_VERSION_PATCH << 0);
+}
+
+COLD void dav1d_default_settings(Dav1dSettings *const s) {
+ s->n_threads = 0;
+ s->max_frame_delay = 0;
+ s->apply_grain = 1;
+ s->allocator.cookie = NULL;
+ s->allocator.alloc_picture_callback = dav1d_default_picture_alloc;
+ s->allocator.release_picture_callback = dav1d_default_picture_release;
+ s->logger.cookie = NULL;
+ s->logger.callback = dav1d_log_default_callback;
+ s->operating_point = 0;
+ s->all_layers = 1; // just until the tests are adjusted
+ s->frame_size_limit = 0;
+ s->strict_std_compliance = 0;
+ s->output_invisible_frames = 0;
+ s->inloop_filters = DAV1D_INLOOPFILTER_ALL;
+ s->decode_frame_type = DAV1D_DECODEFRAMETYPE_ALL;
+}
+
+static void close_internal(Dav1dContext **const c_out, int flush);
+
+NO_SANITIZE("cfi-icall") // CFI is broken with dlsym()
+static COLD size_t get_stack_size_internal(const pthread_attr_t *const thread_attr) {
+#if defined(__linux__) && defined(HAVE_DLSYM) && defined(__GLIBC__)
+ /* glibc has an issue where the size of the TLS is subtracted from the stack
+ * size instead of allocated separately. As a result the specified stack
+ * size may be insufficient when used in an application with large amounts
+ * of TLS data. The following is a workaround to compensate for that.
+ * See https://sourceware.org/bugzilla/show_bug.cgi?id=11787 */
+ size_t (*const get_minstack)(const pthread_attr_t*) =
+ dlsym(RTLD_DEFAULT, "__pthread_get_minstack");
+ if (get_minstack)
+ return get_minstack(thread_attr) - PTHREAD_STACK_MIN;
+#endif
+ return 0;
+}
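
The helper above only measures the extra head-room glibc reserves for TLS; dav1d_open() below adds that amount to a fixed 1 MiB base before creating worker threads. A minimal standalone sketch of the same pattern, assuming a glibc/Linux target (the tls_stack_overhead and spawn_worker names are illustrative, not part of dav1d):

    #define _GNU_SOURCE
    #include <dlfcn.h>
    #include <limits.h>
    #include <pthread.h>
    #include <stddef.h>

    /* Extra stack glibc silently carves out for TLS; 0 on other libcs. */
    static size_t tls_stack_overhead(const pthread_attr_t *attr) {
    #if defined(__GLIBC__)
        size_t (*get_minstack)(const pthread_attr_t *) =
            (size_t (*)(const pthread_attr_t *))dlsym(RTLD_DEFAULT, "__pthread_get_minstack");
        if (get_minstack)
            return get_minstack(attr) - PTHREAD_STACK_MIN;
    #endif
        (void)attr;
        return 0;
    }

    /* Illustrative wrapper: 1 MiB of usable stack plus the TLS overhead. */
    static int spawn_worker(pthread_t *th, void *(*fn)(void *), void *arg) {
        pthread_attr_t attr;
        if (pthread_attr_init(&attr)) return -1;
        pthread_attr_setstacksize(&attr, 1024 * 1024 + tls_stack_overhead(&attr));
        const int ret = pthread_create(th, &attr, fn, arg);
        pthread_attr_destroy(&attr);
        return ret;
    }
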
+
+static COLD void get_num_threads(Dav1dContext *const c, const Dav1dSettings *const s,
+ unsigned *n_tc, unsigned *n_fc)
+{
+ /* ceil(sqrt(n)) */
+ static const uint8_t fc_lut[49] = {
+ 1, /* 1 */
+ 2, 2, 2, /* 2- 4 */
+ 3, 3, 3, 3, 3, /* 5- 9 */
+ 4, 4, 4, 4, 4, 4, 4, /* 10-16 */
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 17-25 */
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 26-36 */
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 37-49 */
+ };
+ *n_tc = s->n_threads ? s->n_threads :
+ iclip(dav1d_num_logical_processors(c), 1, DAV1D_MAX_THREADS);
+ *n_fc = s->max_frame_delay ? umin(s->max_frame_delay, *n_tc) :
+ *n_tc < 50 ? fc_lut[*n_tc - 1] : 8; // min(8, ceil(sqrt(n)))
+}
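
A scalar reading of the lookup above, which the `// min(8, ceil(sqrt(n)))` comment already spells out: the default frame-thread count is the square root of the worker-thread count, rounded up and capped at 8. The helper name below is illustrative:

    #include <math.h>

    /* Default frame-thread count for n_tc worker threads:
     * min(8, ceil(sqrt(n_tc))), matching fc_lut for n_tc <= 49. */
    static unsigned default_frame_threads(unsigned n_tc) {
        const unsigned fc = (unsigned)ceil(sqrt((double)n_tc));
        return fc < 8 ? fc : 8;
    }
    /* e.g. 4 workers -> 2 frame threads, 16 -> 4, 49 -> 7, 64 -> 8 */
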
+
+COLD int dav1d_get_frame_delay(const Dav1dSettings *const s) {
+ unsigned n_tc, n_fc;
+ validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->n_threads >= 0 &&
+ s->n_threads <= DAV1D_MAX_THREADS, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->max_frame_delay >= 0 &&
+ s->max_frame_delay <= DAV1D_MAX_FRAME_DELAY, DAV1D_ERR(EINVAL));
+
+ get_num_threads(NULL, s, &n_tc, &n_fc);
+ return n_fc;
+}
+
+COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
+ static pthread_once_t initted = PTHREAD_ONCE_INIT;
+ pthread_once(&initted, init_internal);
+
+ validate_input_or_ret(c_out != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->n_threads >= 0 &&
+ s->n_threads <= DAV1D_MAX_THREADS, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->max_frame_delay >= 0 &&
+ s->max_frame_delay <= DAV1D_MAX_FRAME_DELAY, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->allocator.alloc_picture_callback != NULL,
+ DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->allocator.release_picture_callback != NULL,
+ DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->operating_point >= 0 &&
+ s->operating_point <= 31, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->decode_frame_type >= DAV1D_DECODEFRAMETYPE_ALL &&
+ s->decode_frame_type <= DAV1D_DECODEFRAMETYPE_KEY, DAV1D_ERR(EINVAL));
+
+ pthread_attr_t thread_attr;
+ if (pthread_attr_init(&thread_attr)) return DAV1D_ERR(ENOMEM);
+ size_t stack_size = 1024 * 1024 + get_stack_size_internal(&thread_attr);
+
+ pthread_attr_setstacksize(&thread_attr, stack_size);
+
+ Dav1dContext *const c = *c_out = dav1d_alloc_aligned(ALLOC_COMMON_CTX, sizeof(*c), 64);
+ if (!c) goto error;
+ memset(c, 0, sizeof(*c));
+
+ c->allocator = s->allocator;
+ c->logger = s->logger;
+ c->apply_grain = s->apply_grain;
+ c->operating_point = s->operating_point;
+ c->all_layers = s->all_layers;
+ c->frame_size_limit = s->frame_size_limit;
+ c->strict_std_compliance = s->strict_std_compliance;
+ c->output_invisible_frames = s->output_invisible_frames;
+ c->inloop_filters = s->inloop_filters;
+ c->decode_frame_type = s->decode_frame_type;
+
+ dav1d_data_props_set_defaults(&c->cached_error_props);
+
+ if (dav1d_mem_pool_init(ALLOC_OBU_HDR, &c->seq_hdr_pool) ||
+ dav1d_mem_pool_init(ALLOC_OBU_HDR, &c->frame_hdr_pool) ||
+ dav1d_mem_pool_init(ALLOC_SEGMAP, &c->segmap_pool) ||
+ dav1d_mem_pool_init(ALLOC_REFMVS, &c->refmvs_pool) ||
+ dav1d_mem_pool_init(ALLOC_PIC_CTX, &c->pic_ctx_pool) ||
+ dav1d_mem_pool_init(ALLOC_CDF, &c->cdf_pool))
+ {
+ goto error;
+ }
+
+ if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc &&
+ c->allocator.release_picture_callback == dav1d_default_picture_release)
+ {
+ if (c->allocator.cookie) goto error;
+ if (dav1d_mem_pool_init(ALLOC_PIC, &c->picture_pool)) goto error;
+ c->allocator.cookie = c->picture_pool;
+ } else if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc ||
+ c->allocator.release_picture_callback == dav1d_default_picture_release)
+ {
+ goto error;
+ }
+
+ /* On 32-bit systems extremely large frame sizes can cause overflows in
+ * dav1d_decode_frame() malloc size calculations. Prevent that from occurring
+ * by enforcing a maximum frame size limit, chosen to roughly correspond to
+ * the largest size possible to decode without exhausting virtual memory. */
+ if (sizeof(size_t) < 8 && s->frame_size_limit - 1 >= 8192 * 8192) {
+ c->frame_size_limit = 8192 * 8192;
+ if (s->frame_size_limit)
+ dav1d_log(c, "Frame size limit reduced from %u to %u.\n",
+ s->frame_size_limit, c->frame_size_limit);
+ }
+
+ c->flush = &c->flush_mem;
+ atomic_init(c->flush, 0);
+
+ get_num_threads(c, s, &c->n_tc, &c->n_fc);
+
+ c->fc = dav1d_alloc_aligned(ALLOC_THREAD_CTX, sizeof(*c->fc) * c->n_fc, 32);
+ if (!c->fc) goto error;
+ memset(c->fc, 0, sizeof(*c->fc) * c->n_fc);
+
+ c->tc = dav1d_alloc_aligned(ALLOC_THREAD_CTX, sizeof(*c->tc) * c->n_tc, 64);
+ if (!c->tc) goto error;
+ memset(c->tc, 0, sizeof(*c->tc) * c->n_tc);
+ if (c->n_tc > 1) {
+ if (pthread_mutex_init(&c->task_thread.lock, NULL)) goto error;
+ if (pthread_cond_init(&c->task_thread.cond, NULL)) {
+ pthread_mutex_destroy(&c->task_thread.lock);
+ goto error;
+ }
+ if (pthread_cond_init(&c->task_thread.delayed_fg.cond, NULL)) {
+ pthread_cond_destroy(&c->task_thread.cond);
+ pthread_mutex_destroy(&c->task_thread.lock);
+ goto error;
+ }
+ c->task_thread.cur = c->n_fc;
+ atomic_init(&c->task_thread.reset_task_cur, UINT_MAX);
+ atomic_init(&c->task_thread.cond_signaled, 0);
+ c->task_thread.inited = 1;
+ }
+
+ if (c->n_fc > 1) {
+ const size_t out_delayed_sz = sizeof(*c->frame_thread.out_delayed) * c->n_fc;
+ c->frame_thread.out_delayed =
+ dav1d_malloc(ALLOC_THREAD_CTX, out_delayed_sz);
+ if (!c->frame_thread.out_delayed) goto error;
+ memset(c->frame_thread.out_delayed, 0, out_delayed_sz);
+ }
+ for (unsigned n = 0; n < c->n_fc; n++) {
+ Dav1dFrameContext *const f = &c->fc[n];
+ if (c->n_tc > 1) {
+ if (pthread_mutex_init(&f->task_thread.lock, NULL)) goto error;
+ if (pthread_cond_init(&f->task_thread.cond, NULL)) {
+ pthread_mutex_destroy(&f->task_thread.lock);
+ goto error;
+ }
+ if (pthread_mutex_init(&f->task_thread.pending_tasks.lock, NULL)) {
+ pthread_cond_destroy(&f->task_thread.cond);
+ pthread_mutex_destroy(&f->task_thread.lock);
+ goto error;
+ }
+ }
+ f->c = c;
+ f->task_thread.ttd = &c->task_thread;
+ f->lf.last_sharpness = -1;
+ dav1d_refmvs_init(&f->rf);
+ }
+
+ for (unsigned m = 0; m < c->n_tc; m++) {
+ Dav1dTaskContext *const t = &c->tc[m];
+ t->f = &c->fc[0];
+ t->task_thread.ttd = &c->task_thread;
+ t->c = c;
+ memset(t->cf_16bpc, 0, sizeof(t->cf_16bpc));
+ if (c->n_tc > 1) {
+ if (pthread_mutex_init(&t->task_thread.td.lock, NULL)) goto error;
+ if (pthread_cond_init(&t->task_thread.td.cond, NULL)) {
+ pthread_mutex_destroy(&t->task_thread.td.lock);
+ goto error;
+ }
+ if (pthread_create(&t->task_thread.td.thread, &thread_attr, dav1d_worker_task, t)) {
+ pthread_cond_destroy(&t->task_thread.td.cond);
+ pthread_mutex_destroy(&t->task_thread.td.lock);
+ goto error;
+ }
+ t->task_thread.td.inited = 1;
+ }
+ }
+ dav1d_pal_dsp_init(&c->pal_dsp);
+ dav1d_refmvs_dsp_init(&c->refmvs_dsp);
+
+ pthread_attr_destroy(&thread_attr);
+
+ return 0;
+
+error:
+ if (c) close_internal(c_out, 0);
+ pthread_attr_destroy(&thread_attr);
+ return DAV1D_ERR(ENOMEM);
+}
+
+static int has_grain(const Dav1dPicture *const pic)
+{
+ const Dav1dFilmGrainData *fgdata = &pic->frame_hdr->film_grain.data;
+ return fgdata->num_y_points || fgdata->num_uv_points[0] ||
+ fgdata->num_uv_points[1] || (fgdata->clip_to_restricted_range &&
+ fgdata->chroma_scaling_from_luma);
+}
+
+static int output_image(Dav1dContext *const c, Dav1dPicture *const out)
+{
+ int res = 0;
+
+ Dav1dThreadPicture *const in = (c->all_layers || !c->max_spatial_id)
+ ? &c->out : &c->cache;
+ if (!c->apply_grain || !has_grain(&in->p)) {
+ dav1d_picture_move_ref(out, &in->p);
+ dav1d_thread_picture_unref(in);
+ goto end;
+ }
+
+ res = dav1d_apply_grain(c, out, &in->p);
+ dav1d_thread_picture_unref(in);
+end:
+ if (!c->all_layers && c->max_spatial_id && c->out.p.data[0]) {
+ dav1d_thread_picture_move_ref(in, &c->out);
+ }
+ return res;
+}
+
+static int output_picture_ready(Dav1dContext *const c, const int drain) {
+ if (c->cached_error) return 1;
+ if (!c->all_layers && c->max_spatial_id) {
+ if (c->out.p.data[0] && c->cache.p.data[0]) {
+ if (c->max_spatial_id == c->cache.p.frame_hdr->spatial_id ||
+ c->out.flags & PICTURE_FLAG_NEW_TEMPORAL_UNIT)
+ return 1;
+ dav1d_thread_picture_unref(&c->cache);
+ dav1d_thread_picture_move_ref(&c->cache, &c->out);
+ return 0;
+ } else if (c->cache.p.data[0] && drain) {
+ return 1;
+ } else if (c->out.p.data[0]) {
+ dav1d_thread_picture_move_ref(&c->cache, &c->out);
+ return 0;
+ }
+ }
+
+ return !!c->out.p.data[0];
+}
+
+static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
+ unsigned drain_count = 0;
+ int drained = 0;
+ do {
+ const unsigned next = c->frame_thread.next;
+ Dav1dFrameContext *const f = &c->fc[next];
+ pthread_mutex_lock(&c->task_thread.lock);
+ while (f->n_tile_data > 0)
+ pthread_cond_wait(&f->task_thread.cond,
+ &f->task_thread.ttd->lock);
+ Dav1dThreadPicture *const out_delayed =
+ &c->frame_thread.out_delayed[next];
+ if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
+ unsigned first = atomic_load(&c->task_thread.first);
+ if (first + 1U < c->n_fc)
+ atomic_fetch_add(&c->task_thread.first, 1U);
+ else
+ atomic_store(&c->task_thread.first, 0);
+ atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
+ &first, UINT_MAX);
+ if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
+ c->task_thread.cur--;
+ drained = 1;
+ } else if (drained) {
+ pthread_mutex_unlock(&c->task_thread.lock);
+ break;
+ }
+ if (++c->frame_thread.next == c->n_fc)
+ c->frame_thread.next = 0;
+ pthread_mutex_unlock(&c->task_thread.lock);
+ const int error = f->task_thread.retval;
+ if (error) {
+ f->task_thread.retval = 0;
+ dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m);
+ dav1d_thread_picture_unref(out_delayed);
+ return error;
+ }
+ if (out_delayed->p.data[0]) {
+ const unsigned progress =
+ atomic_load_explicit(&out_delayed->progress[1],
+ memory_order_relaxed);
+ if ((out_delayed->visible || c->output_invisible_frames) &&
+ progress != FRAME_ERROR)
+ {
+ dav1d_thread_picture_ref(&c->out, out_delayed);
+ c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
+ }
+ dav1d_thread_picture_unref(out_delayed);
+ if (output_picture_ready(c, 0))
+ return output_image(c, out);
+ }
+ } while (++drain_count < c->n_fc);
+
+ if (output_picture_ready(c, 1))
+ return output_image(c, out);
+
+ return DAV1D_ERR(EAGAIN);
+}
+
+static int gen_picture(Dav1dContext *const c)
+{
+ Dav1dData *const in = &c->in;
+
+ if (output_picture_ready(c, 0))
+ return 0;
+
+ while (in->sz > 0) {
+ const ptrdiff_t res = dav1d_parse_obus(c, in);
+ if (res < 0) {
+ dav1d_data_unref_internal(in);
+ } else {
+ assert((size_t)res <= in->sz);
+ in->sz -= res;
+ in->data += res;
+ if (!in->sz) dav1d_data_unref_internal(in);
+ }
+ if (output_picture_ready(c, 0))
+ break;
+ if (res < 0)
+ return (int)res;
+ }
+
+ return 0;
+}
+
+int dav1d_send_data(Dav1dContext *const c, Dav1dData *const in)
+{
+ validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(in != NULL, DAV1D_ERR(EINVAL));
+
+ if (in->data) {
+ validate_input_or_ret(in->sz > 0 && in->sz <= SIZE_MAX / 2, DAV1D_ERR(EINVAL));
+ c->drain = 0;
+ }
+ if (c->in.data)
+ return DAV1D_ERR(EAGAIN);
+ dav1d_data_ref(&c->in, in);
+
+ int res = gen_picture(c);
+ if (!res)
+ dav1d_data_unref_internal(in);
+
+ return res;
+}
+
+int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out)
+{
+ validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
+
+ const int drain = c->drain;
+ c->drain = 1;
+
+ int res = gen_picture(c);
+ if (res < 0)
+ return res;
+
+ if (c->cached_error) {
+ const int res = c->cached_error;
+ c->cached_error = 0;
+ return res;
+ }
+
+ if (output_picture_ready(c, c->n_fc == 1))
+ return output_image(c, out);
+
+ if (c->n_fc > 1 && drain)
+ return drain_picture(c, out);
+
+ return DAV1D_ERR(EAGAIN);
+}
+
+int dav1d_apply_grain(Dav1dContext *const c, Dav1dPicture *const out,
+ const Dav1dPicture *const in)
+{
+ validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(in != NULL, DAV1D_ERR(EINVAL));
+
+ if (!has_grain(in)) {
+ dav1d_picture_ref(out, in);
+ return 0;
+ }
+
+ int res = dav1d_picture_alloc_copy(c, out, in->p.w, in);
+ if (res < 0) goto error;
+
+ if (c->n_tc > 1) {
+ dav1d_task_delayed_fg(c, out, in);
+ } else {
+ switch (out->p.bpc) {
+#if CONFIG_8BPC
+ case 8:
+ dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in);
+ break;
+#endif
+#if CONFIG_16BPC
+ case 10:
+ case 12:
+ dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in);
+ break;
+#endif
+ default: abort();
+ }
+ }
+
+ return 0;
+
+error:
+ dav1d_picture_unref_internal(out);
+ return res;
+}
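
For reference, the DSP-context index used in the switch above follows directly from the bit depth: dsp[0] holds the 8 bpc functions, and (bpc >> 1) - 4 maps 10 bpc to dsp[1] and 12 bpc to dsp[2]. A one-line sketch (the helper name is illustrative):

    /* 8 -> 0, 10 -> 1, 12 -> 2; only the 10/12 bpc cases reach this in the
     * switch above, the 8 bpc case indexes dsp[0] directly. */
    static inline int dsp_idx_for_bpc(const int bpc) { return (bpc >> 1) - 4; }
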
+
+void dav1d_flush(Dav1dContext *const c) {
+ dav1d_data_unref_internal(&c->in);
+ if (c->out.p.frame_hdr)
+ dav1d_thread_picture_unref(&c->out);
+ if (c->cache.p.frame_hdr)
+ dav1d_thread_picture_unref(&c->cache);
+
+ c->drain = 0;
+ c->cached_error = 0;
+
+ for (int i = 0; i < 8; i++) {
+ if (c->refs[i].p.p.frame_hdr)
+ dav1d_thread_picture_unref(&c->refs[i].p);
+ dav1d_ref_dec(&c->refs[i].segmap);
+ dav1d_ref_dec(&c->refs[i].refmvs);
+ dav1d_cdf_thread_unref(&c->cdf[i]);
+ }
+ c->frame_hdr = NULL;
+ c->seq_hdr = NULL;
+ dav1d_ref_dec(&c->seq_hdr_ref);
+
+ c->mastering_display = NULL;
+ c->content_light = NULL;
+ c->itut_t35 = NULL;
+ c->n_itut_t35 = 0;
+ dav1d_ref_dec(&c->mastering_display_ref);
+ dav1d_ref_dec(&c->content_light_ref);
+ dav1d_ref_dec(&c->itut_t35_ref);
+
+ dav1d_data_props_unref_internal(&c->cached_error_props);
+
+ if (c->n_fc == 1 && c->n_tc == 1) return;
+ atomic_store(c->flush, 1);
+
+ // stop running tasks in worker threads
+ if (c->n_tc > 1) {
+ pthread_mutex_lock(&c->task_thread.lock);
+ for (unsigned i = 0; i < c->n_tc; i++) {
+ Dav1dTaskContext *const tc = &c->tc[i];
+ while (!tc->task_thread.flushed) {
+ pthread_cond_wait(&tc->task_thread.td.cond, &c->task_thread.lock);
+ }
+ }
+ for (unsigned i = 0; i < c->n_fc; i++) {
+ c->fc[i].task_thread.task_head = NULL;
+ c->fc[i].task_thread.task_tail = NULL;
+ c->fc[i].task_thread.task_cur_prev = NULL;
+ c->fc[i].task_thread.pending_tasks.head = NULL;
+ c->fc[i].task_thread.pending_tasks.tail = NULL;
+ atomic_init(&c->fc[i].task_thread.pending_tasks.merge, 0);
+ }
+ atomic_init(&c->task_thread.first, 0);
+ c->task_thread.cur = c->n_fc;
+ atomic_store(&c->task_thread.reset_task_cur, UINT_MAX);
+ atomic_store(&c->task_thread.cond_signaled, 0);
+ pthread_mutex_unlock(&c->task_thread.lock);
+ }
+
+ // wait for threads to complete flushing
+ if (c->n_fc > 1) {
+ for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) {
+ if (next == c->n_fc) next = 0;
+ Dav1dFrameContext *const f = &c->fc[next];
+ dav1d_decode_frame_exit(f, -1);
+ f->n_tile_data = 0;
+ f->task_thread.retval = 0;
+ Dav1dThreadPicture *out_delayed = &c->frame_thread.out_delayed[next];
+ if (out_delayed->p.frame_hdr) {
+ dav1d_thread_picture_unref(out_delayed);
+ }
+ }
+ c->frame_thread.next = 0;
+ }
+ atomic_store(c->flush, 0);
+}
+
+COLD void dav1d_close(Dav1dContext **const c_out) {
+ validate_input(c_out != NULL);
+#if TRACK_HEAP_ALLOCATIONS
+ dav1d_log_alloc_stats(*c_out);
+#endif
+ close_internal(c_out, 1);
+}
+
+static COLD void close_internal(Dav1dContext **const c_out, int flush) {
+ Dav1dContext *const c = *c_out;
+ if (!c) return;
+
+ if (flush) dav1d_flush(c);
+
+ if (c->tc) {
+ struct TaskThreadData *ttd = &c->task_thread;
+ if (ttd->inited) {
+ pthread_mutex_lock(&ttd->lock);
+ for (unsigned n = 0; n < c->n_tc && c->tc[n].task_thread.td.inited; n++)
+ c->tc[n].task_thread.die = 1;
+ pthread_cond_broadcast(&ttd->cond);
+ pthread_mutex_unlock(&ttd->lock);
+ for (unsigned n = 0; n < c->n_tc; n++) {
+ Dav1dTaskContext *const pf = &c->tc[n];
+ if (!pf->task_thread.td.inited) break;
+ pthread_join(pf->task_thread.td.thread, NULL);
+ pthread_cond_destroy(&pf->task_thread.td.cond);
+ pthread_mutex_destroy(&pf->task_thread.td.lock);
+ }
+ pthread_cond_destroy(&ttd->delayed_fg.cond);
+ pthread_cond_destroy(&ttd->cond);
+ pthread_mutex_destroy(&ttd->lock);
+ }
+ dav1d_free_aligned(c->tc);
+ }
+
+ for (unsigned n = 0; c->fc && n < c->n_fc; n++) {
+ Dav1dFrameContext *const f = &c->fc[n];
+
+ // clean-up threading stuff
+ if (c->n_fc > 1) {
+ dav1d_free(f->tile_thread.lowest_pixel_mem);
+ dav1d_free(f->frame_thread.b);
+ dav1d_free_aligned(f->frame_thread.cbi);
+ dav1d_free_aligned(f->frame_thread.pal_idx);
+ dav1d_free_aligned(f->frame_thread.cf);
+ dav1d_free(f->frame_thread.tile_start_off);
+ dav1d_free_aligned(f->frame_thread.pal);
+ }
+ if (c->n_tc > 1) {
+ pthread_mutex_destroy(&f->task_thread.pending_tasks.lock);
+ pthread_cond_destroy(&f->task_thread.cond);
+ pthread_mutex_destroy(&f->task_thread.lock);
+ }
+ dav1d_free(f->frame_thread.frame_progress);
+ dav1d_free(f->task_thread.tasks);
+ dav1d_free(f->task_thread.tile_tasks[0]);
+ dav1d_free_aligned(f->ts);
+ dav1d_free_aligned(f->ipred_edge[0]);
+ dav1d_free(f->a);
+ dav1d_free(f->tile);
+ dav1d_free(f->lf.mask);
+ dav1d_free(f->lf.level);
+ dav1d_free(f->lf.lr_mask);
+ dav1d_free(f->lf.tx_lpf_right_edge[0]);
+ dav1d_free(f->lf.start_of_tile_row);
+ dav1d_refmvs_clear(&f->rf);
+ dav1d_free_aligned(f->lf.cdef_line_buf);
+ dav1d_free_aligned(f->lf.lr_line_buf);
+ }
+ dav1d_free_aligned(c->fc);
+ if (c->n_fc > 1 && c->frame_thread.out_delayed) {
+ for (unsigned n = 0; n < c->n_fc; n++)
+ if (c->frame_thread.out_delayed[n].p.frame_hdr)
+ dav1d_thread_picture_unref(&c->frame_thread.out_delayed[n]);
+ dav1d_free(c->frame_thread.out_delayed);
+ }
+ for (int n = 0; n < c->n_tile_data; n++)
+ dav1d_data_unref_internal(&c->tile[n].data);
+ dav1d_free(c->tile);
+ for (int n = 0; n < 8; n++) {
+ dav1d_cdf_thread_unref(&c->cdf[n]);
+ if (c->refs[n].p.p.frame_hdr)
+ dav1d_thread_picture_unref(&c->refs[n].p);
+ dav1d_ref_dec(&c->refs[n].refmvs);
+ dav1d_ref_dec(&c->refs[n].segmap);
+ }
+ dav1d_ref_dec(&c->seq_hdr_ref);
+ dav1d_ref_dec(&c->frame_hdr_ref);
+
+ dav1d_ref_dec(&c->mastering_display_ref);
+ dav1d_ref_dec(&c->content_light_ref);
+ dav1d_ref_dec(&c->itut_t35_ref);
+
+ dav1d_mem_pool_end(c->seq_hdr_pool);
+ dav1d_mem_pool_end(c->frame_hdr_pool);
+ dav1d_mem_pool_end(c->segmap_pool);
+ dav1d_mem_pool_end(c->refmvs_pool);
+ dav1d_mem_pool_end(c->cdf_pool);
+ dav1d_mem_pool_end(c->picture_pool);
+ dav1d_mem_pool_end(c->pic_ctx_pool);
+
+ dav1d_freep_aligned(c_out);
+}
+
+int dav1d_get_event_flags(Dav1dContext *const c, enum Dav1dEventFlags *const flags) {
+ validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(flags != NULL, DAV1D_ERR(EINVAL));
+
+ *flags = c->event_flags;
+ c->event_flags = 0;
+ return 0;
+}
+
+int dav1d_get_decode_error_data_props(Dav1dContext *const c, Dav1dDataProps *const out) {
+ validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
+
+ dav1d_data_props_unref_internal(out);
+ *out = c->cached_error_props;
+ dav1d_data_props_set_defaults(&c->cached_error_props);
+
+ return 0;
+}
+
+void dav1d_picture_unref(Dav1dPicture *const p) {
+ dav1d_picture_unref_internal(p);
+}
+
+uint8_t *dav1d_data_create(Dav1dData *const buf, const size_t sz) {
+ return dav1d_data_create_internal(buf, sz);
+}
+
+int dav1d_data_wrap(Dav1dData *const buf, const uint8_t *const ptr,
+ const size_t sz,
+ void (*const free_callback)(const uint8_t *data,
+ void *user_data),
+ void *const user_data)
+{
+ return dav1d_data_wrap_internal(buf, ptr, sz, free_callback, user_data);
+}
+
+int dav1d_data_wrap_user_data(Dav1dData *const buf,
+ const uint8_t *const user_data,
+ void (*const free_callback)(const uint8_t *user_data,
+ void *cookie),
+ void *const cookie)
+{
+ return dav1d_data_wrap_user_data_internal(buf,
+ user_data,
+ free_callback,
+ cookie);
+}
+
+void dav1d_data_unref(Dav1dData *const buf) {
+ dav1d_data_unref_internal(buf);
+}
+
+void dav1d_data_props_unref(Dav1dDataProps *const props) {
+ dav1d_data_props_unref_internal(props);
+}
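
The thin wrappers above are the public entry points for handing caller-owned buffers to the decoder. A minimal usage sketch built only on the API shown in this file (feed_obu_buffer and my_free are illustrative names; the buffer is assumed to come from malloc):

    #include <errno.h>
    #include <stdlib.h>
    #include <dav1d/dav1d.h>

    /* Called by dav1d once the last reference to the wrapped buffer is gone. */
    static void my_free(const uint8_t *data, void *user_data) {
        (void)user_data;
        free((void *)data);
    }

    static int feed_obu_buffer(Dav1dContext *ctx, uint8_t *buf, size_t size) {
        Dav1dData data;
        int ret = dav1d_data_wrap(&data, buf, size, my_free, NULL);
        if (ret < 0) return ret;          /* wrap failed, caller still owns buf */

        /* On success the reference is consumed; on DAV1D_ERR(EAGAIN) a real
         * caller would keep `data` and retry after draining pictures with
         * dav1d_get_picture(). This sketch simply drops it on any error. */
        ret = dav1d_send_data(ctx, &data);
        if (ret < 0)
            dav1d_data_unref(&data);
        return ret;
    }
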
diff --git a/third_party/dav1d/src/log.c b/third_party/dav1d/src/log.c
new file mode 100644
index 0000000000..a08f6eb68d
--- /dev/null
+++ b/third_party/dav1d/src/log.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+
+#include "dav1d/dav1d.h"
+
+#include "common/validate.h"
+
+#include "src/internal.h"
+#include "src/log.h"
+
+#if CONFIG_LOG
+COLD void dav1d_log_default_callback(void *const cookie,
+ const char *const format, va_list ap)
+{
+ vfprintf(stderr, format, ap);
+}
+
+COLD void dav1d_log(Dav1dContext *const c, const char *const format, ...) {
+ assert(c != NULL);
+
+ if (!c->logger.callback)
+ return;
+
+ va_list ap;
+ va_start(ap, format);
+ c->logger.callback(c->logger.cookie, format, ap);
+ va_end(ap);
+}
+#endif
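
The callback-based design above means applications control all decoder logging through Dav1dSettings.logger, which dav1d_default_settings() points at dav1d_log_default_callback. A small usage sketch (my_log, open_with_logger and the "dav1d" tag are illustrative):

    #include <stdarg.h>
    #include <stdio.h>
    #include <dav1d/dav1d.h>

    /* Prefix every decoder message with a tag taken from the logger cookie. */
    static void my_log(void *cookie, const char *format, va_list ap) {
        fprintf(stderr, "[%s] ", (const char *)cookie);
        vfprintf(stderr, format, ap);
    }

    static int open_with_logger(Dav1dContext **ctx) {
        Dav1dSettings settings;
        dav1d_default_settings(&settings);
        settings.logger.cookie = (void *)"dav1d";
        settings.logger.callback = my_log;   /* NULL silences logging entirely */
        return dav1d_open(ctx, &settings);
    }
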
diff --git a/third_party/dav1d/src/log.h b/third_party/dav1d/src/log.h
new file mode 100644
index 0000000000..df32de7f25
--- /dev/null
+++ b/third_party/dav1d/src/log.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOG_H
+#define DAV1D_SRC_LOG_H
+
+#include "config.h"
+
+#include <stdarg.h>
+
+#include "dav1d/dav1d.h"
+
+#include "common/attributes.h"
+
+#if CONFIG_LOG
+#define dav1d_log dav1d_log
+void dav1d_log_default_callback(void *cookie, const char *format, va_list ap);
+void dav1d_log(Dav1dContext *c, const char *format, ...) ATTR_FORMAT_PRINTF(2, 3);
+#else
+#define dav1d_log_default_callback NULL
+#define dav1d_log(...) do { } while(0)
+#endif
+
+#endif /* DAV1D_SRC_LOG_H */
diff --git a/third_party/dav1d/src/loongarch/cpu.c b/third_party/dav1d/src/loongarch/cpu.c
new file mode 100644
index 0000000000..a79ade5472
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/cpu.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "common/attributes.h"
+#include "src/loongarch/cpu.h"
+
+#if defined(HAVE_GETAUXVAL)
+#include <sys/auxv.h>
+
+#define LA_HWCAP_LSX ( 1 << 4 )
+#define LA_HWCAP_LASX ( 1 << 5 )
+#endif
+
+COLD unsigned dav1d_get_cpu_flags_loongarch(void) {
+ unsigned flags = 0;
+#if defined(HAVE_GETAUXVAL)
+ unsigned long hw_cap = getauxval(AT_HWCAP);
+ flags |= (hw_cap & LA_HWCAP_LSX) ? DAV1D_LOONGARCH_CPU_FLAG_LSX : 0;
+ flags |= (hw_cap & LA_HWCAP_LASX) ? DAV1D_LOONGARCH_CPU_FLAG_LASX : 0;
+#endif
+
+ return flags;
+}
diff --git a/third_party/dav1d/src/loongarch/cpu.h b/third_party/dav1d/src/loongarch/cpu.h
new file mode 100644
index 0000000000..d00ff67dac
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/cpu.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOONGARCH_CPU_H
+#define DAV1D_SRC_LOONGARCH_CPU_H
+
+enum CpuFlags {
+ DAV1D_LOONGARCH_CPU_FLAG_LSX = 1 << 0,
+ DAV1D_LOONGARCH_CPU_FLAG_LASX = 1 << 1,
+};
+
+unsigned dav1d_get_cpu_flags_loongarch(void);
+
+#endif /* DAV1D_SRC_LOONGARCH_CPU_H */
diff --git a/third_party/dav1d/src/loongarch/itx.S b/third_party/dav1d/src/loongarch/itx.S
new file mode 100644
index 0000000000..fc0c79ea01
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/itx.S
@@ -0,0 +1,8104 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/loongarch/loongson_asm.S"
+
+/*
+void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
+ coef *const coeff, const int eob
+ HIGHBD_DECL_SUFFIX)
+*/
+function inv_txfm_add_wht_wht_4x4_8bpc_lsx
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+
+ vreplgr2vr.h vr20, zero
+
+ vsrai.h vr0, vr0, 2
+ vsrai.h vr2, vr2, 2
+
+ vst vr20, a2, 0
+
+ vpickod.d vr1, vr0, vr0
+ vpickod.d vr3, vr2, vr2
+
+ vadd.h vr4, vr0, vr1
+ vsub.h vr5, vr2, vr3
+ vsub.h vr6, vr4, vr5
+ vsrai.h vr6, vr6, 1
+ vsub.h vr0, vr6, vr3
+ vsub.h vr2, vr6, vr1
+ vsub.h vr1, vr4, vr0
+ vadd.h vr3, vr5, vr2
+
+ vst vr20, a2, 16
+
+ vilvl.h vr4, vr0, vr1
+ vilvl.h vr5, vr3, vr2
+ vilvl.w vr0, vr5, vr4
+ vilvh.w vr2, vr5, vr4
+ vilvh.d vr1, vr0, vr0
+ vilvh.d vr3, vr2, vr2
+
+ vadd.h vr4, vr0, vr1
+ vsub.h vr5, vr2, vr3
+ vsub.h vr6, vr4, vr5
+ vsrai.h vr6, vr6, 1
+ vsub.h vr0, vr6, vr3
+ vsub.h vr2, vr6, vr1
+ vsub.h vr1, vr4, vr0
+ vadd.h vr3, vr5, vr2
+
+ vld vr4, a0, 0
+ vldx vr5, a0, a1
+ alsl.d t0, a1, a0, 1
+ vld vr6, t0, 0
+ vldx vr7, t0, a1
+
+ vsllwil.hu.bu vr4, vr4, 0
+ vsllwil.hu.bu vr5, vr5, 0
+ vsllwil.hu.bu vr6, vr6, 0
+ vsllwil.hu.bu vr7, vr7, 0
+ vilvl.d vr1, vr0, vr1
+ vilvl.d vr2, vr3, vr2
+ vilvl.d vr4, vr5, vr4
+ vilvl.d vr6, vr7, vr6
+ vadd.h vr1, vr1, vr4
+ vadd.h vr2, vr2, vr6
+ vssrani.bu.h vr2, vr1, 0
+
+ vstelm.w vr2, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr2, a0, 0, 1
+ add.d a0, a0, a1
+ vstelm.w vr2, a0, 0, 2
+ add.d a0, a0, a1
+ vstelm.w vr2, a0, 0, 3
+endfunc
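
For readers not fluent in LSX, the butterfly above is the 4-point inverse Walsh-Hadamard transform used for lossless blocks. A scalar reading of one 1-D pass (the register names in the comments refer to the first pass above; inputs are already pre-shifted right by 2):

    /* Scalar sketch of one 1-D pass of the WHT butterfly above. */
    static void iwht4_1d(int c[4]) {
        const int t0 = c[0] + c[1];        /* vadd.h vr4, vr0, vr1 */
        const int t2 = c[2] - c[3];        /* vsub.h vr5, vr2, vr3 */
        const int t4 = (t0 - t2) >> 1;     /* vsub.h + vsrai.h vr6 */
        const int t3 = t4 - c[3];          /* vsub.h vr0, vr6, vr3 */
        const int t1 = t4 - c[1];          /* vsub.h vr2, vr6, vr1 */
        c[0] = t0 - t3;                    /* vsub.h vr1, vr4, vr0 */
        c[1] = t3;
        c[2] = t1;
        c[3] = t2 + t1;                    /* vadd.h vr3, vr5, vr2 */
    }
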
+
+const idct_coeffs, align=4
+ // idct4
+ .word 2896, 2896*8, 1567, 3784
+ // idct8
+ .word 799, 4017, 3406, 2276
+ // idct16
+ .word 401, 4076, 3166, 2598
+ .word 1931, 3612, 3920, 1189
+ // idct32
+ .word 201, 4091, 3035, 2751
+ .word 1751, 3703, 3857, 1380
+ .word 995, 3973, 3513, 2106
+ .word 2440, 3290, 4052, 601
+endconst
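
These constants appear to be the AV1 cos128() values, i.e. round(4096 * cos(k*pi/128)) for the angles each butterfly stage needs (2896 = cos128(32), 3784/1567 = cos128(16)/cos128(48), 4017/799 = cos128(8)/cos128(56), and so on). A small sketch that reproduces the idct4/idct8 rows under that assumption:

    #include <math.h>
    #include <stdio.h>

    #ifndef M_PI
    #define M_PI 3.14159265358979323846
    #endif

    /* Prints 2896, 1567, 3784, 799, 4017, 3406, 2276 if the cos128()
     * assumption about the table above holds. */
    int main(void) {
        const int k[] = { 32, 48, 16, 56, 8, 24, 40 };
        for (int i = 0; i < 7; i++)
            printf("cos128(%2d) = %ld\n", k[i],
                   lround(4096 * cos(k[i] * M_PI / 128)));
        return 0;
    }
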
+
+.macro vld_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7
+ vld \in0, \src, \start
+ vld \in1, \src, \start+(\stride*1)
+ vld \in2, \src, \start+(\stride*2)
+ vld \in3, \src, \start+(\stride*3)
+ vld \in4, \src, \start+(\stride*4)
+ vld \in5, \src, \start+(\stride*5)
+ vld \in6, \src, \start+(\stride*6)
+ vld \in7, \src, \start+(\stride*7)
+.endm
+
+.macro vst_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7
+ vst \in0, \src, \start
+ vst \in1, \src, \start+(\stride*1)
+ vst \in2, \src, \start+(\stride*2)
+ vst \in3, \src, \start+(\stride*3)
+ vst \in4, \src, \start+(\stride*4)
+ vst \in5, \src, \start+(\stride*5)
+ vst \in6, \src, \start+(\stride*6)
+ vst \in7, \src, \start+(\stride*7)
+.endm
+
+.macro vld_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15
+
+ vld_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+
+ vld \in8, \src, \start+(\stride*8)
+ vld \in9, \src, \start+(\stride*9)
+ vld \in10, \src, \start+(\stride*10)
+ vld \in11, \src, \start+(\stride*11)
+ vld \in12, \src, \start+(\stride*12)
+ vld \in13, \src, \start+(\stride*13)
+ vld \in14, \src, \start+(\stride*14)
+ vld \in15, \src, \start+(\stride*15)
+.endm
+
+.macro vst_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15
+
+ vst_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+
+ vst \in8, \src, \start+(\stride*8)
+ vst \in9, \src, \start+(\stride*9)
+ vst \in10, \src, \start+(\stride*10)
+ vst \in11, \src, \start+(\stride*11)
+ vst \in12, \src, \start+(\stride*12)
+ vst \in13, \src, \start+(\stride*13)
+ vst \in14, \src, \start+(\stride*14)
+ vst \in15, \src, \start+(\stride*15)
+.endm
+
+.macro DST_ADD_W4 in0, in1, in2, in3, in4, in5
+ vilvl.w vr10, \in1, \in0 // 0 1 2 3 4 5 6 7 x ...
+ vilvl.w vr12, \in3, \in2 // 8 9 10 11 12 13 14 15 x ...
+ vsllwil.hu.bu vr10, vr10, 0
+ vsllwil.hu.bu vr12, vr12, 0
+ vadd.h vr10, \in4, vr10
+ vadd.h vr12, \in5, vr12
+ vssrani.bu.h vr12, vr10, 0
+ vstelm.w vr12, a0, 0, 0
+ add.d t8, a0, a1
+ vstelm.w vr12, t8, 0, 1
+ vstelm.w vr12, t2, 0, 2
+ add.d t8, t2, a1
+ vstelm.w vr12, t8, 0, 3
+.endm
+
+.macro VLD_DST_ADD_W4 in0, in1
+ vld vr0, a0, 0
+ vldx vr1, a0, a1
+ vld vr2, t2, 0
+ vldx vr3, t2, a1
+
+ DST_ADD_W4 vr0, vr1, vr2, vr3, \in0, \in1
+.endm
+
+.macro dct_4x4_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1
+ vexth.w.h vr4, \in0 // in1
+ vexth.w.h vr5, \in1 // in3
+ vmul.w vr6, vr4, \in4
+ vmul.w vr7, vr4, \in5
+ vmadd.w vr6, vr5, \in5 // t3
+ vmsub.w vr7, vr5, \in4 // t2
+ vsllwil.w.h vr4, \in2, 0 // in0
+ vsllwil.w.h vr5, \in3, 0 // in2
+ vmul.w vr9, vr4, \in6
+ vmul.w vr10, vr4, \in7
+ vmadd.w vr9, vr5, \in7 // t0
+ vmsub.w vr10, vr5, \in6 // t1
+ vssrarni.h.w vr10, vr9, 12 // t0 t1
+ vssrarni.h.w vr7, vr6, 12 // t3 t2
+ vsadd.h \out0, vr10, vr7 // 0 4 8 12 1 5 9 13 c[0] c[1]
+ vssub.h \out1, vr10, vr7 // 3 7 11 15 2 6 10 14 c[3] c[2]
+.endm
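
A scalar reading of the macro above for one lane, as it is instantiated for idct4 (in4/in5 = 3784/1567, in6/in7 = 2896): the 2048 bias and >>12 come from vssrarni.h.w, and the final adds/subs are saturating in the vector code.

    /* Sketch of one 4-point inverse DCT stage as computed above. */
    static void idct4_1d(const int in[4], int out[4]) {
        const int t3 = (in[1] * 3784 + in[3] * 1567 + 2048) >> 12;
        const int t2 = (in[1] * 1567 - in[3] * 3784 + 2048) >> 12;
        const int t0 = (in[0] * 2896 + in[2] * 2896 + 2048) >> 12;
        const int t1 = (in[0] * 2896 - in[2] * 2896 + 2048) >> 12;
        out[0] = t0 + t3;   /* vsadd.h \out0 */
        out[1] = t1 + t2;
        out[2] = t1 - t2;   /* vssub.h \out1 */
        out[3] = t0 - t3;
    }
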
+
+.macro inv_dct_dct_4x4_lsx
+ la.local t0, idct_coeffs
+
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15
+
+ vldrepl.w vr2, t0, 8 // 1567
+ vldrepl.w vr3, t0, 12 // 3784
+ vldrepl.w vr8, t0, 0 // 2896
+
+ dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr11, vr12
+
+ vreplgr2vr.h vr15, zero
+ vshuf4i.d vr12, vr12, 0x01 // 2 6 10 14 3 7 11 15
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15
+ vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7
+ vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15
+
+ dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr13, vr14
+ vsrari.h vr13, vr13, 4
+ vsrari.h vr14, vr14, 4
+ vshuf4i.d vr14, vr14, 0x01
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W4 vr13, vr14
+.endm
+
+.macro identity_4x4_lsx in0, in1, in2, in3, out0
+ vsllwil.w.h vr2, \in0, 0
+ vexth.w.h vr3, \in1
+ vmul.w vr4, vr2, \in2
+ vmul.w vr5, vr3, \in2
+ vssrarni.h.w vr5, vr4, 12
+ vsadd.h \out0, vr5, \in3
+.endm
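
The identity transform is a plain scaling by sqrt(2) in Q12: 1697/4096 is approximately sqrt(2) - 1, so the multiply-shift plus the vsadd of the original input yields x * sqrt(2). A one-line scalar sketch:

    /* Scalar sketch of identity_4x4_lsx (the adds saturate in the vector code). */
    static inline int identity4(const int x) {
        return x + ((x * 1697 + 2048) >> 12);
    }
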
+
+.macro inv_identity_identity_4x4_lsx
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr0, vr0, vr20, vr0, vr0
+ identity_4x4_lsx vr1, vr1, vr20, vr1, vr1
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+ identity_4x4_lsx vr0, vr0, vr20, vr0, vr6
+ identity_4x4_lsx vr1, vr1, vr20, vr1, vr7
+
+ vsrari.h vr6, vr6, 4
+ vsrari.h vr7, vr7, 4
+ vilvh.d vr8, vr6, vr6
+ vilvh.d vr9, vr7, vr7
+ vilvl.h vr4, vr8, vr6
+ vilvl.h vr5, vr9, vr7
+ vilvl.w vr6, vr5, vr4
+ vilvh.w vr7, vr5, vr4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr6, vr7
+.endm
+
+const iadst4_coeffs, align=4
+ .word 1321, 3803, 2482, 3344
+endconst
+
+.macro adst4x4_1d_lsx in0, in1, in2, in3, out0, out1, out2, out3
+ vsub.w vr6, \in0, \in2 // in0-in2
+ vmul.w vr7, \in0, vr20 // in0*1321
+ vmadd.w vr7, \in2, vr21 // in0*1321+in2*3803
+ vmadd.w vr7, \in3, vr22 // in0*1321+in2*3803+in3*2482
+ vmul.w vr8, \in1, vr23 // in1*3344
+ vadd.w vr6, vr6, \in3 // in0-in2+in3
+ vmul.w vr9, \in0, vr22 // in0*2482
+ vmsub.w vr9, \in2, vr20 // in2*1321
+ vmsub.w vr9, \in3, vr21 // in0*2482-in2*1321-in3*3803
+ vadd.w vr5, vr7, vr9
+ vmul.w \out2, vr6, vr23 // out[2] 8 9 10 11
+ vadd.w \out0, vr7, vr8 // out[0] 0 1 2 3
+ vadd.w \out1, vr9, vr8 // out[1] 4 5 6 7
+ vsub.w \out3, vr5, vr8 // out[3] 12 13 14 15
+.endm
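
A scalar reading of the 4-point inverse ADST above, using the iadst4_coeffs just defined; the macro leaves the Q12 down-shift to its callers (vssrarni/vsrari by 12), so the sketch does too.

    /* Sketch of adst4x4_1d_lsx; outputs are still scaled by 4096. */
    static void iadst4_1d(const int in[4], int out[4]) {
        const int a = in[0] * 1321 + in[2] * 3803 + in[3] * 2482;  /* vr7 */
        const int b = in[0] * 2482 - in[2] * 1321 - in[3] * 3803;  /* vr9 */
        const int c = in[1] * 3344;                                /* vr8 */
        out[0] = a + c;
        out[1] = b + c;
        out[2] = (in[0] - in[2] + in[3]) * 3344;                   /* vr6 * vr23 */
        out[3] = a + b - c;                                        /* vr5 - vr8 */
    }
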
+
+.macro inv_adst_dct_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr0, 0 // in0
+ vexth.w.h vr3, vr0 // in1
+ vsllwil.w.h vr4, vr1, 0 // in2
+ vexth.w.h vr5, vr1 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+
+ LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7
+ vssrarni.h.w vr13, vr11, 12
+ vssrarni.h.w vr14, vr12, 12
+
+ vreplgr2vr.h vr15, zero
+ la.local t0, idct_coeffs
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_4x4_core_lsx vr13, vr14, vr13, vr14, vr21, vr20, vr22, vr22, vr13, vr14
+
+ vshuf4i.d vr14, vr14, 0x01
+ vsrari.h vr13, vr13, 4
+ vsrari.h vr14, vr14, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr14
+.endm
+
+.macro inv_adst_adst_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr0, 0 // in0
+ vexth.w.h vr3, vr0 // in1
+ vsllwil.w.h vr4, vr1, 0 // in2
+ vexth.w.h vr5, vr1 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+
+ LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7
+
+ vsrari.w vr11, vr11, 12
+ vsrari.w vr13, vr13, 12
+ vsrari.w vr12, vr12, 12
+ vsrari.w vr14, vr14, 12
+
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ adst4x4_1d_lsx vr11, vr13, vr12, vr14, vr11, vr13, vr12, vr14
+
+ vssrarni.h.w vr13, vr11, 12
+ vssrarni.h.w vr14, vr12, 12
+ vsrari.h vr13, vr13, 4
+ vsrari.h vr14, vr14, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr14
+.endm
+
+.macro inv_dct_adst_4x4_lsx
+ la.local t0, idct_coeffs
+
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr21, vr20, vr22, vr22, vr11, vr12
+
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ vshuf4i.d vr12, vr12, 0x01 // 3 7 11 15 2 6 10 14
+
+ vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15
+ vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7
+ vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15
+
+ vsllwil.w.h vr2, vr11, 0 // in0
+ vexth.w.h vr3, vr11 // in1
+ vsllwil.w.h vr4, vr12, 0 // in2
+ vexth.w.h vr5, vr12 // in3
+
+ la.local t0, iadst4_coeffs
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr11, vr13, vr12, vr14
+
+ vssrarni.h.w vr13, vr11, 12
+ vssrarni.h.w vr14, vr12, 12
+ vsrari.h vr13, vr13, 4
+ vsrari.h vr14, vr14, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr14
+.endm
+
+.macro inv_dct_flipadst_4x4_lsx
+ la.local t0, idct_coeffs
+
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr21, vr20, vr22, vr22, vr11, vr12
+
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ vshuf4i.d vr12, vr12, 0x01 // 3 7 11 15 2 6 10 14
+
+ vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15
+ vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7
+ vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15
+ vsllwil.w.h vr2, vr11, 0 // in0
+ vexth.w.h vr3, vr11 // in1
+ vsllwil.w.h vr4, vr12, 0 // in2
+ vexth.w.h vr5, vr12 // in3
+
+ la.local t0, iadst4_coeffs
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr11, vr12, vr13, vr14
+
+ vssrarni.h.w vr11, vr12, 12 // 0 1 2 3 4 5 6 7
+ vssrarni.h.w vr13, vr14, 12 // 8 9 10 11 12 13 14 15
+ vsrari.h vr11, vr11, 4
+ vsrari.h vr13, vr13, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr11
+.endm
+
+.macro inv_flipadst_adst_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr0, 0 // in0
+ vexth.w.h vr3, vr0 // in1
+ vsllwil.w.h vr4, vr1, 0 // in2
+ vexth.w.h vr5, vr1 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+
+ vsrari.w vr0, vr0, 12
+ vsrari.w vr1, vr1, 12
+ vsrari.w vr2, vr2, 12
+ vsrari.w vr3, vr3, 12
+
+ vilvl.w vr4, vr0, vr1
+ vilvh.w vr5, vr0, vr1
+ vilvl.w vr6, vr2, vr3
+ vilvh.w vr7, vr2, vr3
+ vilvl.d vr11, vr4, vr6
+ vilvh.d vr12, vr4, vr6
+ vilvl.d vr13, vr5, vr7
+ vilvh.d vr14, vr5, vr7
+
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ adst4x4_1d_lsx vr11, vr12, vr13, vr14, vr11, vr13, vr12, vr14
+
+ vssrarni.h.w vr13, vr11, 12
+ vssrarni.h.w vr14, vr12, 12
+ vsrari.h vr13, vr13, 4
+ vsrari.h vr14, vr14, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr14
+.endm
+
+.macro inv_adst_flipadst_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr0, 0 // in0
+ vexth.w.h vr3, vr0 // in1
+ vsllwil.w.h vr4, vr1, 0 // in2
+ vexth.w.h vr5, vr1 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+ LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7
+ vsrari.w vr11, vr11, 12
+ vsrari.w vr12, vr12, 12
+ vsrari.w vr13, vr13, 12
+ vsrari.w vr14, vr14, 12
+
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ adst4x4_1d_lsx vr11, vr13, vr12, vr14, vr11, vr12, vr13, vr14
+
+ vssrarni.h.w vr11, vr12, 12
+ vssrarni.h.w vr13, vr14, 12
+ vsrari.h vr11, vr11, 4
+ vsrari.h vr13, vr13, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr11
+.endm
+
+.macro inv_flipadst_dct_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr0, 0 // in0
+ vexth.w.h vr3, vr0 // in1
+ vsllwil.w.h vr4, vr1, 0 // in2
+ vexth.w.h vr5, vr1 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+
+ vilvl.w vr4, vr0, vr1
+ vilvh.w vr5, vr0, vr1
+ vilvl.w vr6, vr2, vr3
+ vilvh.w vr7, vr2, vr3
+
+ vilvl.d vr11, vr4, vr6
+ vilvh.d vr12, vr4, vr6
+ vilvl.d vr13, vr5, vr7
+ vilvh.d vr14, vr5, vr7
+
+ vssrarni.h.w vr12, vr11, 12
+ vssrarni.h.w vr14, vr13, 12
+
+ vreplgr2vr.h vr15, zero
+ la.local t0, idct_coeffs
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_4x4_core_lsx vr12, vr14, vr12, vr14, vr21, vr20, vr22, vr22, vr13, vr14
+
+ vshuf4i.d vr14, vr14, 0x01
+ vsrari.h vr13, vr13, 4
+ vsrari.h vr14, vr14, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr14
+.endm
+
+.macro inv_flipadst_flipadst_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr0, 0 // in0
+ vexth.w.h vr3, vr0 // in1
+ vsllwil.w.h vr4, vr1, 0 // in2
+ vexth.w.h vr5, vr1 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+
+ vilvl.w vr4, vr0, vr1
+ vilvh.w vr5, vr0, vr1
+ vilvl.w vr6, vr2, vr3
+ vilvh.w vr7, vr2, vr3
+ vilvl.d vr11, vr4, vr6
+ vilvh.d vr12, vr4, vr6
+ vilvl.d vr13, vr5, vr7
+ vilvh.d vr14, vr5, vr7
+
+ vsrari.w vr11, vr11, 12
+ vsrari.w vr12, vr12, 12
+ vsrari.w vr13, vr13, 12
+ vsrari.w vr14, vr14, 12
+
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ adst4x4_1d_lsx vr11, vr12, vr13, vr14, vr11, vr12, vr13, vr14
+
+ vssrarni.h.w vr11, vr12, 12
+ vssrarni.h.w vr13, vr14, 12
+ vsrari.h vr11, vr11, 4
+ vsrari.h vr13, vr13, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr11
+.endm
+
+.macro inv_dct_identity_4x4_lsx
+ la.local t0, idct_coeffs
+
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ vldrepl.w vr2, t0, 8 // 1567
+ vldrepl.w vr3, t0, 12 // 3784
+ vldrepl.w vr8, t0, 0 // 2896
+
+ dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr11, vr12
+ vshuf4i.d vr12, vr12, 0x01 // 2 6 10 14 3 7 11 15
+
+ vreplgr2vr.h vr15, zero
+ li.w t0, 1697
+
+ vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15
+ vilvl.h vr10, vr5, vr4 // 0 1 2 3 4 5 6 7
+ vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15
+
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr10, vr10, vr20, vr10, vr6
+ identity_4x4_lsx vr12, vr12, vr20, vr12, vr7
+ vsrari.h vr11, vr6, 4
+ vsrari.h vr13, vr7, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr11, vr13
+.endm
+
+.macro inv_identity_dct_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr0, vr0, vr20, vr0, vr0
+ identity_4x4_lsx vr1, vr1, vr20, vr1, vr1
+
+ vreplgr2vr.h vr15, zero
+
+ vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15
+ vilvl.h vr13, vr5, vr4 // 0 1 2 3 4 5 6 7
+ vilvh.h vr14, vr5, vr4 // 8 9 10 11 12 13 14 15
+
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_4x4_core_lsx vr13, vr14, vr13, vr14, vr21, vr20, vr22, vr22, vr13, vr14
+
+ vshuf4i.d vr14, vr14, 0x01
+ vsrari.h vr13, vr13, 4
+ vsrari.h vr14, vr14, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr14
+.endm
+
+.macro inv_flipadst_identity_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr0, 0 // in0
+ vexth.w.h vr3, vr0 // in1
+ vsllwil.w.h vr4, vr1, 0 // in2
+ vexth.w.h vr5, vr1 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr10, vr11, vr12, vr13
+
+ vssrarni.h.w vr12, vr13, 12
+ vssrarni.h.w vr10, vr11, 12
+
+ vilvl.h vr4, vr10, vr12 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr10, vr12 // 1 3 5 7 9 11 13 15
+ vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7
+ vilvh.h vr13, vr5, vr4 // 8 9 10 11 12 13 14 15
+
+ vreplgr2vr.h vr15, zero
+ li.w t0, 1697
+
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr11, vr11, vr20, vr11, vr6
+ identity_4x4_lsx vr13, vr13, vr20, vr13, vr7
+ vsrari.h vr11, vr6, 4
+ vsrari.h vr13, vr7, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr11, vr13
+.endm
+
+.macro inv_identity_flipadst_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr0, vr0, vr20, vr0, vr0
+ identity_4x4_lsx vr1, vr1, vr20, vr1, vr1
+
+ vilvl.h vr4, vr1, vr0
+ vilvh.h vr5, vr1, vr0
+ vilvl.h vr11, vr5, vr4
+ vilvh.h vr13, vr5, vr4
+
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr11, 0 // in0
+ vexth.w.h vr3, vr11 // in1
+ vsllwil.w.h vr4, vr13, 0 // in2
+ vexth.w.h vr5, vr13 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+
+ vssrarni.h.w vr0, vr1, 12 // 8 9 10 11 12 13 14 15
+ vssrarni.h.w vr2, vr3, 12 // 0 1 2 3 4 5 6 7
+ vsrari.h vr11, vr0, 4
+ vsrari.h vr13, vr2, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr13, vr11
+.endm
+
+.macro inv_identity_adst_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr0, vr0, vr20, vr0, vr0
+ identity_4x4_lsx vr1, vr1, vr20, vr1, vr1
+
+ vilvl.h vr4, vr1, vr0
+ vilvh.h vr5, vr1, vr0
+ vilvl.h vr11, vr5, vr4
+ vilvh.h vr13, vr5, vr4
+
+ vreplgr2vr.h vr15, zero
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr11, 0 // in0
+ vexth.w.h vr3, vr11 // in1
+ vsllwil.w.h vr4, vr13, 0 // in2
+ vexth.w.h vr5, vr13 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+
+ vssrarni.h.w vr1, vr0, 12
+ vssrarni.h.w vr3, vr2, 12
+ vsrari.h vr11, vr1, 4
+ vsrari.h vr13, vr3, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr11, vr13
+.endm
+
+.macro inv_adst_identity_4x4_lsx
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+
+ la.local t0, iadst4_coeffs
+ vsllwil.w.h vr2, vr0, 0 // in0
+ vexth.w.h vr3, vr0 // in1
+ vsllwil.w.h vr4, vr1, 0 // in2
+ vexth.w.h vr5, vr1 // in3
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
+
+ LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7
+
+ vssrarni.h.w vr13, vr11, 12
+ vssrarni.h.w vr14, vr12, 12
+
+ vreplgr2vr.h vr15, zero
+ li.w t0, 1697
+
+ vst vr15, a2, 0
+ vst vr15, a2, 16
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr13, vr13, vr20, vr13, vr6
+ identity_4x4_lsx vr14, vr14, vr20, vr14, vr7
+ vsrari.h vr11, vr6, 4
+ vsrari.h vr13, vr7, 4
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W4 vr11, vr13
+.endm
+
+.macro fun4x4 type1, type2
+function inv_txfm_add_\type1\()_\type2\()_4x4_8bpc_lsx
+.ifc \type1\()_\type2, dct_dct
+ bnez a3, .LLL
+
+ vldi vr0, 0x8b5 // 181
+ ld.h t2, a2, 0 // dc
+ st.h zero, a2, 0
+ vreplgr2vr.w vr1, t2
+ vldi vr3, 0x880 // 128
+ vmul.w vr2, vr0, vr1
+ vld vr10, a0, 0
+ vsrari.w vr2, vr2, 8
+ vldx vr11, a0, a1
+ vmadd.w vr3, vr2, vr0
+ alsl.d t2, a1, a0, 1
+ vssrarni.h.w vr3, vr3, 12
+ vld vr12, t2, 0
+ vldx vr13, t2, a1
+
+ DST_ADD_W4 vr10, vr11, vr12, vr13, vr3, vr3
+
+ b .IDST_\type1\()_\type2\()_4X4_END
+.LLL:
+.endif
+
+ inv_\type1\()_\type2\()_4x4_lsx
+.IDST_\type1\()_\type2\()_4X4_END:
+endfunc
+.endm
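
A scalar reading of the eob == 0 fast path inside the macro above (the branch taken before .LLL): the DC coefficient is scaled by 181/256 ~= 1/sqrt(2) twice, with the second step shifting by 12 instead of 8 so the transform's final >>4 output shift is folded into its rounding; the result is then added to all 16 pixels and clipped.

    /* Sketch of the dct_dct DC-only path: vmul + vsrari.w 8, then
     * vmadd with 128 and vssrarni.h.w 12 (which adds the 2048 bias). */
    static int dc_only_4x4(int dc) {
        dc = (dc * 181 + 128) >> 8;
        dc = (dc * 181 + 128 + 2048) >> 12;
        return dc;   /* added to every pixel of the 4x4 block, then clipped */
    }
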
+
+fun4x4 dct, dct
+fun4x4 identity, identity
+fun4x4 adst, dct
+fun4x4 dct, adst
+fun4x4 adst, adst
+fun4x4 dct, flipadst
+fun4x4 flipadst, adst
+fun4x4 adst, flipadst
+fun4x4 flipadst, dct
+fun4x4 flipadst, flipadst
+fun4x4 dct, identity
+fun4x4 identity, dct
+fun4x4 flipadst, identity
+fun4x4 identity, flipadst
+fun4x4 identity, adst
+fun4x4 adst, identity
+
+function inv_txfm_add_dct_dct_4x8_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_4x8
+
+ ld.h t2, a2, 0 // dc
+ vldi vr0, 0x8b5 // 181
+ vreplgr2vr.w vr1, t2
+ vldi vr5, 0x880 // 128
+ vmul.w vr2, vr0, vr1
+ st.h zero, a2, 0
+ vsrari.w vr2, vr2, 8
+ vld vr10, a0, 0
+ vmul.w vr2, vr2, vr0
+ vldx vr11, a0, a1
+ vsrari.w vr2, vr2, 8
+ alsl.d t2, a1, a0, 1
+ vmadd.w vr5, vr2, vr0
+ vld vr12, t2, 0
+ vssrarni.h.w vr5, vr5, 12
+ vldx vr13, t2, a1
+
+ DST_ADD_W4 vr10, vr11, vr12, vr13, vr5, vr5
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+
+ VLD_DST_ADD_W4 vr5, vr5
+ b .DCT_DCT_4x8_END
+
+.NO_HAS_DCONLY_4x8:
+ // sh=8 sw=4
+ la.local t0, idct_coeffs
+
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
+ vld vr20, a2, 32 // 16 17 18 19 20 21 22 23 in2
+ vld vr21, a2, 48 // 24 25 26 27 28 29 30 31 in3
+
+ vldrepl.w vr2, t0, 8 // 1567
+ vldrepl.w vr3, t0, 12 // 3784
+ vldrepl.w vr8, t0, 0 // 2896
+
+.macro DCT4_4Wx8H_1D_LSX
+ // in1 in3
+ vsllwil.w.h vr4, vr1, 0 // in1
+ vsllwil.w.h vr5, vr21, 0 // in3
+ vmul.w vr4, vr4, vr8
+ vmul.w vr5, vr5, vr8
+ vsrari.w vr4, vr4, 12
+ vsrari.w vr5, vr5, 12
+ vmul.w vr6, vr4, vr3
+ vmul.w vr7, vr4, vr2
+ vmadd.w vr6, vr5, vr2 // t3 0 1 2 3
+ vmsub.w vr7, vr5, vr3 // t2 0 1 2 3
+ vexth.w.h vr4, vr1 // in1
+ vexth.w.h vr5, vr21 // in3
+ vmul.w vr4, vr4, vr8
+ vmul.w vr5, vr5, vr8
+ vsrari.w vr4, vr4, 12
+ vsrari.w vr5, vr5, 12
+ vmul.w vr9, vr4, vr3
+ vmul.w vr10, vr4, vr2
+ vmadd.w vr9, vr5, vr2 // t3 4 5 6 7
+ vmsub.w vr10, vr5, vr3 // t2 4 5 6 7
+
+ // in0 in2
+ vsllwil.w.h vr4, vr0, 0 // in0
+ vsllwil.w.h vr5, vr20, 0 // in2
+ vmul.w vr4, vr4, vr8
+ vmul.w vr5, vr5, vr8
+ vsrari.w vr4, vr4, 12
+ vsrari.w vr5, vr5, 12
+ vmul.w vr11, vr4, vr8
+ vmul.w vr12, vr4, vr8
+ vmadd.w vr11, vr5, vr8 // t0 0 1 2 3
+ vmsub.w vr12, vr5, vr8 // t1 0 1 2 3
+ vexth.w.h vr4, vr0 // in0
+ vexth.w.h vr5, vr20 // in2
+ vmul.w vr4, vr4, vr8
+ vmul.w vr5, vr5, vr8
+ vsrari.w vr4, vr4, 12
+ vsrari.w vr5, vr5, 12
+ vmul.w vr13, vr4, vr8
+ vmul.w vr14, vr4, vr8
+ vmadd.w vr13, vr5, vr8 // t0 4 5 6 7
+ vmsub.w vr14, vr5, vr8 // t1 4 5 6 7
+ vssrarni.h.w vr9, vr6, 12 // t3
+ vssrarni.h.w vr10, vr7, 12 // t2
+ vssrarni.h.w vr14, vr12, 12 // t1
+ vssrarni.h.w vr13, vr11, 12 // t0
+ vsadd.h vr4, vr13, vr9 // c[0] 0 4 8 12 16 20 24 28
+ vsadd.h vr5, vr14, vr10 // c[1] 1 5 9 13 17 21 25 29
+ vssub.h vr20, vr14, vr10 // c[2] 2 6 10 14 18 22 26 30
+ vssub.h vr21, vr13, vr9 // c[3] 3 7 11 15 19 23 27 31
+.endm
+
+ DCT4_4Wx8H_1D_LSX
+
+ vreplgr2vr.h vr22, zero
+ vst vr22, a2, 0
+ vst vr22, a2, 16
+ vst vr22, a2, 32
+ vst vr22, a2, 48
+
+ vilvl.h vr0, vr5, vr4 // 0 1 4 5 8 9 12 13
+ vilvl.h vr1, vr21, vr20 // 2 3 6 7 10 11 14 15
+ vilvh.h vr6, vr5, vr4 // 16 17 20 21 24 25 28 29
+ vilvh.h vr7, vr21, vr20 // 18 19 22 23 26 27 30 31
+ vilvl.w vr9, vr1, vr0 // 0 1 2 3 4 5 6 7 in0
+ vilvh.w vr10, vr1, vr0 // 8 9 10 11 12 13 14 15 in1
+ vilvl.w vr11, vr7, vr6 // 16 17 18 19 20 21 22 23 in2
+ vilvh.w vr12, vr7, vr6 // 24 25 26 27 28 29 30 31 in3
+
+ vilvl.d vr0, vr10, vr9
+ vilvl.d vr1, vr12, vr11
+ vilvh.d vr20, vr9, vr11 // in5 in1
+ vilvh.d vr21, vr12, vr10 // in3 in7
+
+.macro DCT8_4Wx8H_1D_LSX
+ dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr13, vr14
+
+ vldrepl.w vr17, t0, 16 // 799
+ vldrepl.w vr18, t0, 20 // 4017
+ vldrepl.w vr11, t0, 24 // 3406
+ vldrepl.w vr12, t0, 28 // 2276
+
+ vexth.w.h vr4, vr20
+ vexth.w.h vr5, vr21
+ vmul.w vr6, vr4, vr18 // in1 * 4017
+ vmul.w vr7, vr4, vr17 // in1 * 799
+ vmadd.w vr6, vr5, vr17 // in7 * 799
+ vmsub.w vr7, vr5, vr18 // in7 * 4017
+ vsllwil.w.h vr4, vr20, 0
+ vsllwil.w.h vr5, vr21, 0
+ vmul.w vr9, vr4, vr12
+ vmul.w vr10, vr4, vr11
+ vmadd.w vr9, vr5, vr11
+ vmsub.w vr10, vr5, vr12
+ vssrarni.h.w vr10, vr9, 12 // t6a t5a
+ vssrarni.h.w vr7, vr6, 12 // t7a t4a
+ vsadd.h vr15, vr7, vr10 // t7 t4
+ vssub.h vr16, vr7, vr10 // t6a t5a
+
+ vexth.w.h vr4, vr16 // t5a
+ vsllwil.w.h vr5, vr16, 0 // t6a
+ vldi vr2, 0x8b5 // 181
+ vsub.w vr6, vr5, vr4
+ vadd.w vr7, vr5, vr4
+ vmul.w vr6, vr6, vr2
+ vmul.w vr7, vr7, vr2
+ vssrarni.h.w vr7, vr6, 8 // t5 t6
+ vaddi.hu vr18, vr7, 0
+ vshuf4i.d vr7, vr15, 0x06 // t7 t6
+ vshuf4i.d vr15, vr18, 0x09 // t4 t5
+
+ // vr17 -> vr7 vr18 -> vr15
+ vsadd.h vr4, vr13, vr7
+ vsadd.h vr5, vr14, vr15
+ vssub.h vr6, vr14, vr15
+ vssub.h vr7, vr13, vr7
+.endm
+
+ DCT8_4Wx8H_1D_LSX
+
+ vshuf4i.d vr5, vr5, 0x01
+ vshuf4i.d vr7, vr7, 0x01
+
+ vsrari.h vr4, vr4, 4
+ vsrari.h vr5, vr5, 4
+ vsrari.h vr6, vr6, 4
+ vsrari.h vr7, vr7, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W4 vr4, vr5
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+
+ VLD_DST_ADD_W4 vr6, vr7
+.DCT_DCT_4x8_END:
+endfunc
+
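+// rect2 pre-scale: widen the low .h half of in0 and the high half of in1 to
+// .w, multiply both by the 2896 constant in in2 and round by 12 (~1/sqrt(2)),
+// leaving the widened halves in out0/out1.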
+.macro rect2_w4_lsx in0, in1, in2, out0, out1
+ vsllwil.w.h vr22, \in0, 0
+ vexth.w.h vr23, \in1
+ vmul.w vr22, vr22, \in2
+ vmul.w vr23, vr23, \in2
+ vsrari.w \out0, vr22, 12
+ vsrari.w \out1, vr23, 12
+.endm
+
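+// 8-point DCT over eight widened .w vectors (vr18, vr19, vr6-vr11, four lanes
+// each): even part with 2896/1567/3784, odd part with 799/4017/3406/2276.
+// The results are narrowed back to .h and paired as c[0]c[1], c[3]c[2],
+// c[4]c[5], c[7]c[6] in out0-out3.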
+.macro dct_8x4_core_lsx1 out0, out1, out2, out3
+ // dct4 stride=1<<1
+ vmul.w vr0, vr6, vr21
+ vmul.w vr1, vr6, vr20
+ vmadd.w vr0, vr10, vr20 // t3
+ vmsub.w vr1, vr10, vr21 // t2
+ vmul.w vr2, vr18, vr22
+ vmul.w vr3, vr18, vr22
+ vmadd.w vr2, vr8, vr22 // t0
+ vmsub.w vr3, vr8, vr22 // t1
+ vssrarni.h.w vr1, vr0, 12 // t3 t2
+ vssrarni.h.w vr3, vr2, 12 // t0 t1
+ vsadd.h vr8, vr3, vr1 // t0 t1
+ vssub.h vr10, vr3, vr1 // t3 t2
+
+ vldrepl.w vr20, t0, 16 // 799
+ vldrepl.w vr21, t0, 20 // 4017
+ vldrepl.w vr22, t0, 24 // 3406
+ vldrepl.w vr23, t0, 28 // 2276
+
+ vmul.w vr0, vr19, vr21 // in1 * 4017
+ vmul.w vr1, vr19, vr20 // in1 * 799
+ vmadd.w vr0, vr11, vr20 // in7 * 799 // t7a
+ vmsub.w vr1, vr11, vr21 // in7 * 4017 // t4a
+ vmul.w vr2, vr9, vr23 // in5 * 2276
+ vmul.w vr3, vr9, vr22 // in5 * 3406
+ vmadd.w vr2, vr7, vr22 // in3 * 3406 // t6a
+ vmsub.w vr3, vr7, vr23 // in3 * 2276 // t5a
+ vssrarni.h.w vr0, vr1, 12 // t4a t7a
+ vssrarni.h.w vr2, vr3, 12 // t5a t6a
+ vsadd.h vr9, vr0, vr2 // t4 t7
+ vssub.h vr11, vr0, vr2 // t5a t6a
+
+ vldrepl.w vr22, t0, 0 // 2896
+ vexth.w.h vr18, vr11 // t6a
+ vsllwil.w.h vr19, vr11, 0 // t5a
+ vmul.w vr6, vr18, vr22
+ vmul.w vr7, vr18, vr22
+ vmadd.w vr6, vr19, vr22 // t6
+ vmsub.w vr7, vr19, vr22 // t5
+ vssrarni.h.w vr6, vr7, 12 // t5 t6
+
+ vilvh.d vr11, vr6, vr9 // t7 t6
+ vilvl.d vr9, vr6, vr9 // t4 t5
+
+ vsadd.h \out0, vr8, vr11 // c[0] c[1]
+ vsadd.h \out1, vr10, vr9 // c[3] c[2]
+ vssub.h \out2, vr10, vr9 // c[4] c[5]
+ vssub.h \out3, vr8, vr11 // c[7] c[6]
+.endm
+
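+// 4-point DCT applied independently to the low and high .h halves of in0-in3:
+// the odd inputs sit in the high halves and are rotated by in4/in5 (3784/1567
+// at the call sites), the even inputs sit in the low halves and are scaled by
+// in6/in7 (2896); the four outputs are packed back to .h in out0-out3.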
+.macro dct_8x4_core_lsx2 in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3
+ vexth.w.h vr4, \in0 // in1
+ vexth.w.h vr5, \in1 // in3
+ vmul.w vr6, vr4, \in4
+ vmul.w vr7, vr4, \in5
+ vmadd.w vr6, vr5, \in5 // t3
+ vmsub.w vr7, vr5, \in4 // t2
+ vexth.w.h vr4, \in2 // in1
+ vexth.w.h vr5, \in3 // in3
+ vmul.w vr8, vr4, \in4
+ vmul.w vr9, vr4, \in5
+ vmadd.w vr8, vr5, \in5 // t3
+ vmsub.w vr9, vr5, \in4 // t2
+ vssrarni.h.w vr8, vr6, 12 // t3
+ vssrarni.h.w vr9, vr7, 12 // t2
+
+ vsllwil.w.h vr4, \in0, 0
+ vsllwil.w.h vr5, \in1, 0
+ vmul.w vr11, vr4, \in6
+ vmul.w vr12, vr4, \in7
+ vmadd.w vr11, vr5, \in7 // t0
+ vmsub.w vr12, vr5, \in6 // t1
+ vsllwil.w.h vr4, \in2, 0
+ vsllwil.w.h vr5, \in3, 0
+ vmul.w vr13, vr4, \in6
+ vmul.w vr14, vr4, \in7
+ vmadd.w vr13, vr5, \in7 // t0
+ vmsub.w vr14, vr5, \in6 // t1
+ vssrarni.h.w vr13, vr11, 12 // t0
+ vssrarni.h.w vr14, vr12, 12 // t1
+
+ vsadd.h \out0, vr13, vr8
+ vsadd.h \out1, vr14, vr9
+ vssub.h \out2, vr14, vr9
+ vssub.h \out3, vr13, vr8
+.endm
+
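+// Add four 8-pixel residual rows (in4-in7, .h) to the destination rows in
+// in0-in3, saturate back to bytes and store them at a0, a0+a1, t2 and t2+a1.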
+.macro DST_ADD_W8 in0, in1, in2, in3, in4, in5, in6, in7
+ vsllwil.hu.bu vr10, \in0, 0
+ vsllwil.hu.bu vr11, \in1, 0
+ vsllwil.hu.bu vr12, \in2, 0
+ vsllwil.hu.bu vr13, \in3, 0
+ vadd.h vr10, \in4, vr10
+ vadd.h vr11, \in5, vr11
+ vadd.h vr12, \in6, vr12
+ vadd.h vr13, \in7, vr13
+ vssrani.bu.h vr11, vr10, 0
+ vssrani.bu.h vr13, vr12, 0
+ vstelm.d vr11, a0, 0, 0
+ add.d t8, a0, a1
+ vstelm.d vr11, t8, 0, 1
+ vstelm.d vr13, t2, 0, 0
+ add.d t8, t2, a1
+ vstelm.d vr13, t8, 0, 1
+.endm
+
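+// Load the four destination rows from a0, a0+a1, t2 and t2+a1 and pass them
+// to DST_ADD_W8 together with the residual rows in in0-in3.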
+.macro VLD_DST_ADD_W8 in0, in1, in2, in3
+ vld vr0, a0, 0
+ vldx vr1, a0, a1
+ vld vr2, t2, 0
+ vldx vr3, t2, a1
+
+ DST_ADD_W8 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3
+.endm
+
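+// 8x4 dct_dct. a3 is the eob argument: zero means only the DC coefficient is
+// present, so a short DC-only path scales it and adds it to the whole 8x4
+// block; otherwise the full path below runs rect2 scaling, an 8-point DCT
+// pass, a transpose and a 4-point DCT pass before adding to the destination.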
+function inv_txfm_add_dct_dct_8x4_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_8x4
+
+ ld.h t2, a2, 0 // dc
+ vldi vr0, 0x8b5 // 181
+ vreplgr2vr.w vr1, t2
+ vldi vr5, 0x880 // 128
+ vmul.w vr2, vr0, vr1
+ st.h zero, a2, 0
+ vsrari.w vr2, vr2, 8
+ vld vr10, a0, 0
+ vmul.w vr2, vr2, vr0
+ vldx vr11, a0, a1
+ vsrari.w vr2, vr2, 8
+ alsl.d t2, a1, a0, 1
+ vmadd.w vr5, vr2, vr0
+ vld vr12, t2, 0
+ vssrarni.h.w vr5, vr5, 12
+ vldx vr13, t2, a1
+
+ DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5
+
+ b .DCT_DCT_8X4_END
+
+.NO_HAS_DCONLY_8x4:
+ la.local t0, idct_coeffs
+
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ vld vr2, a2, 32
+ vld vr3, a2, 48
+
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr0, vr1, vr2, vr3
+
+ vshuf4i.d vr1, vr1, 0x01
+ vshuf4i.d vr3, vr3, 0x01
+
+ vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15
+ vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7 in0
+ vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15 in1
+ vilvl.h vr4, vr3, vr2 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr3, vr2 // 1 3 5 7 9 11 13 15
+ vilvl.h vr2, vr5, vr4 // 16 - 23 in2
+ vilvh.h vr3, vr5, vr4 // 24 - 31 in3
+
+ la.local t0, idct_coeffs
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+
+ dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \
+ vr22, vr15, vr16, vr17, vr18
+
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+ vsrari.h vr18, vr18, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
+
+.DCT_DCT_8X4_END:
+endfunc
+
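+// identity8 first pass: saturate-narrow the eight widened inputs back to .h
+// (pairing in0/in1, in2/in3, ...) and double them, i.e. the x2 scale of the
+// 8-point identity transform.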
+.macro identity8_lsx in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3
+ vssrarni.h.w \in1, \in0, 0
+ vssrarni.h.w \in3, \in2, 0
+ vssrarni.h.w \in5, \in4, 0
+ vssrarni.h.w \in7, \in6, 0
+ vsadd.h \out0, \in1, \in1
+ vsadd.h \out1, \in3, \in3
+ vsadd.h \out2, \in5, \in5
+ vsadd.h \out3, \in7, \in7
+.endm
+
+function inv_txfm_add_identity_identity_8x4_8bpc_lsx
+ la.local t0, idct_coeffs
+
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
+ vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2
+ vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3
+
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11
+
+ identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \
+ vr19, vr7, vr9, vr11
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+ identity_4x4_lsx vr19, vr19, vr20, vr19, vr19
+ identity_4x4_lsx vr7, vr7, vr20, vr7, vr7
+ identity_4x4_lsx vr9, vr9, vr20, vr9, vr9
+ identity_4x4_lsx vr11, vr11, vr20, vr11, vr11
+
+ vsrari.h vr15, vr19, 4
+ vsrari.h vr16, vr7, 4
+ vsrari.h vr17, vr9, 4
+ vsrari.h vr18, vr11, 4
+
+ vilvl.h vr4, vr16, vr15
+ vilvh.h vr5, vr16, vr15
+ vilvl.h vr11, vr5, vr4
+ vilvh.h vr12, vr5, vr4
+ vilvl.h vr4, vr18, vr17
+ vilvh.h vr5, vr18, vr17
+ vilvl.h vr13, vr5, vr4
+ vilvh.h vr14, vr5, vr4
+ vilvl.d vr15, vr13, vr11
+ vilvh.d vr16, vr13, vr11
+ vilvl.d vr17, vr14, vr12
+ vilvh.d vr18, vr14, vr12
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
+endfunc
+
+const iadst8_coeffs, align=4
+ .word 4076, 401, 3612, 1931
+ .word 2598, 3166, 1189, 3920
+ // idct_coeffs
+ .word 2896, 0, 1567, 3784, 0, 0, 0, 0
+endconst
+
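+// Two .w butterfly rotations: out0 = in0*in4 + in1*in6, out1 = in0*in5 -
+// in1*in7, and likewise out2/out3 from in2/in3 with in8-in11; each pair is
+// then packed to .h with a >>12 rounding narrow into out1 and out3.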
+.macro vmadd_vmsub_vssrarni_hw_12 in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, out0, out1, out2, out3
+ vmul.w \out0, \in0, \in4
+ vmul.w \out1, \in0, \in5
+ vmadd.w \out0, \in1, \in6 // t0a
+ vmsub.w \out1, \in1, \in7 // t1a
+ vmul.w \out2, \in2, \in8
+ vmul.w \out3, \in2, \in9
+ vmadd.w \out2, \in3, \in10 // t2a
+ vmsub.w \out3, \in3, \in11 // t3a
+ vssrarni.h.w \out1, \out0, 12 // t0a t1a
+ vssrarni.h.w \out3, \out2, 12 // t2a t3a
+.endm
+
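+// 8-point ADST on the widened inputs vr18/vr19, vr6-vr11, using iadst8_coeffs
+// and idct_coeffs. The results come back packed in pairs: vr13 = out0|out7,
+// vr17 = out1|out6, vr18 = out2|out5, vr15 = out3|out4, with out1, out3, out5
+// and out7 negated as the ADST requires.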
+.macro adst8x4_1d_lsx
+ la.local t0, iadst8_coeffs
+
+ vldrepl.w vr20, t0, 0 // 4076
+ vldrepl.w vr21, t0, 4 // 401
+ vldrepl.w vr22, t0, 8 // 3612
+ vldrepl.w vr23, t0, 12 // 1931
+
+ // vr13 t0a t1a vr15 t2a t3a
+ vmadd_vmsub_vssrarni_hw_12 vr11, vr18, vr9, vr6, vr20, vr21, vr21, vr20, \
+ vr22, vr23, vr23, vr22, vr12, vr13, vr14, vr15
+ vldrepl.w vr20, t0, 16 // 2598
+ vldrepl.w vr21, t0, 20 // 3166
+ vldrepl.w vr22, t0, 24 // 1189
+ vldrepl.w vr23, t0, 28 // 3920
+
+ // vr18 t4a t5a vr6 t6a t7a
+ vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr19, vr10, vr20, vr21, vr21, vr20, \
+ vr22, vr23, vr23, vr22, vr11, vr18, vr9, vr6
+
+ vsadd.h vr12, vr13, vr18 // t0 t1
+ vsadd.h vr14, vr15, vr6 // t2 t3
+ vssub.h vr16, vr13, vr18 // t4 t5
+ vssub.h vr18, vr15, vr6 // t6 t7
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ vsllwil.w.h vr7, vr16, 0 // t4
+ vexth.w.h vr8, vr16 // t5
+ vsllwil.w.h vr10, vr18, 0 // t6
+ vexth.w.h vr11, vr18 // t7
+
+ // vr13 out0 out7 vr17 out1 out6
+ vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr11, vr10, vr21, vr20, vr20, vr21, \
+ vr20, vr21, vr21, vr20, vr13, vr15, vr17, vr19
+ vshuf4i.d vr19, vr19, 0x01
+
+ vsadd.h vr13, vr12, vr14 // out0 out7
+ vssub.h vr16, vr12, vr14 // t2 t3
+ vsadd.h vr17, vr15, vr19 // out1 out6
+ vssub.h vr18, vr15, vr19 // t6 t7
+
+ vexth.w.h vr20, vr13 // out7
+ vsllwil.w.h vr21, vr17, 0 // out1
+ vneg.w vr20, vr20
+ vneg.w vr21, vr21
+ vssrarni.h.w vr21, vr20, 0 // out7 out1
+ vilvl.d vr13, vr21, vr13 // out0 out7
+ vilvh.d vr17, vr17, vr21 // out1 out6
+
+ vsllwil.w.h vr7, vr16, 0 // t2
+ vexth.w.h vr8, vr16 // t3
+ vsllwil.w.h vr10, vr18, 0 // t6
+ vexth.w.h vr11, vr18 // t7
+
+ // vr15 out[3] out[4] vr18 out[2] out[5]
+ vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr10, vr11, vr22, vr22, vr22, vr22, \
+ vr22, vr22, vr22, vr22, vr14, vr15, vr19, vr18
+
+ vexth.w.h vr20, vr18 // out5
+ vsllwil.w.h vr21, vr15, 0 // out3
+ vneg.w vr20, vr20
+ vneg.w vr21, vr21
+ vssrarni.h.w vr21, vr20, 0 // out5 out3
+ vilvl.d vr18, vr21, vr18 // out2 out5
+ vilvh.d vr15, vr15, vr21 // out3 out4
+.endm
+
+function inv_txfm_add_adst_dct_8x4_8bpc_lsx
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
+ vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2
+ vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11
+
+ adst8x4_1d_lsx
+
+ vilvl.h vr4, vr17, vr13
+ vilvl.h vr5, vr15, vr18
+ vilvl.w vr0, vr5, vr4
+ vilvh.w vr1, vr5, vr4
+ vilvh.h vr4, vr18, vr15
+ vilvh.h vr5, vr13, vr17
+ vilvl.w vr2, vr5, vr4
+ vilvh.w vr3, vr5, vr4
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \
+ vr22, vr15, vr16, vr17, vr18
+
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+ vsrari.h vr18, vr18, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
+endfunc
+
+function inv_txfm_add_dct_adst_8x4_8bpc_lsx
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
+ vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2
+ vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr0, vr1, vr2, vr3
+
+ vshuf4i.d vr1, vr1, 0x01
+ vshuf4i.d vr3, vr3, 0x01
+
+ vilvl.h vr4, vr1, vr0
+ vilvh.h vr5, vr1, vr0
+ vilvl.h vr0, vr5, vr4
+ vilvh.h vr1, vr5, vr4
+ vilvl.h vr4, vr3, vr2
+ vilvh.h vr5, vr3, vr2
+ vilvl.h vr2, vr5, vr4
+ vilvh.h vr3, vr5, vr4
+
+ la.local t0, iadst4_coeffs
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ vsllwil.w.h vr10, vr0, 0
+ vexth.w.h vr11, vr0
+ vsllwil.w.h vr12, vr1, 0
+ vexth.w.h vr13, vr1
+
+ adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr14, vr2, 0
+ vexth.w.h vr15, vr2
+ vsllwil.w.h vr16, vr3, 0
+ vexth.w.h vr17, vr3
+
+ adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
+
+ vssrarni.h.w vr14, vr10, 12
+ vssrarni.h.w vr15, vr11, 12
+ vssrarni.h.w vr16, vr12, 12
+ vssrarni.h.w vr17, vr13, 12
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+endfunc
+
+function inv_txfm_add_adst_adst_8x4_8bpc_lsx
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
+ vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2
+ vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11
+
+ adst8x4_1d_lsx
+
+ vilvl.h vr4, vr17, vr13
+ vilvl.h vr5, vr15, vr18
+ vilvl.w vr0, vr5, vr4
+ vilvh.w vr1, vr5, vr4
+ vilvh.h vr4, vr18, vr15
+ vilvh.h vr5, vr13, vr17
+ vilvl.w vr2, vr5, vr4
+ vilvh.w vr3, vr5, vr4
+
+ la.local t0, iadst4_coeffs
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ vsllwil.w.h vr10, vr0, 0
+ vexth.w.h vr11, vr0
+ vsllwil.w.h vr12, vr1, 0
+ vexth.w.h vr13, vr1
+
+ adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr14, vr2, 0
+ vexth.w.h vr15, vr2
+ vsllwil.w.h vr16, vr3, 0
+ vexth.w.h vr17, vr3
+
+ adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
+
+ vssrarni.h.w vr14, vr10, 12
+ vssrarni.h.w vr15, vr11, 12
+ vssrarni.h.w vr16, vr12, 12
+ vssrarni.h.w vr17, vr13, 12
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+endfunc
+
+function inv_txfm_add_flipadst_adst_8x4_8bpc_lsx
+ vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0
+ vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1
+ vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2
+ vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11
+
+ adst8x4_1d_lsx
+
+ vilvl.h vr20, vr15, vr13
+ vilvl.h vr21, vr18, vr17
+ vilvl.w vr0, vr21, vr20
+ vilvh.w vr1, vr21, vr20
+ vilvh.h vr20, vr15, vr13
+ vilvh.h vr21, vr18, vr17
+ vilvl.w vr2, vr21, vr20
+ vilvh.w vr3, vr21, vr20
+ vshuf4i.h vr0, vr0, 0x2d
+ vshuf4i.h vr1, vr1, 0x2d
+ vshuf4i.h vr2, vr2, 0x78
+ vshuf4i.h vr3, vr3, 0x78
+
+ la.local t0, iadst4_coeffs
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ vsllwil.w.h vr10, vr2, 0
+ vexth.w.h vr11, vr2
+ vsllwil.w.h vr12, vr3, 0
+ vexth.w.h vr13, vr3
+
+ adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr14, vr0, 0
+ vexth.w.h vr15, vr0
+ vsllwil.w.h vr16, vr1, 0
+ vexth.w.h vr17, vr1
+
+ adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
+
+ vssrarni.h.w vr14, vr10, 12
+ vssrarni.h.w vr15, vr11, 12
+ vssrarni.h.w vr16, vr12, 12
+ vssrarni.h.w vr17, vr13, 12
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+endfunc
+
+function inv_txfm_add_adst_flipadst_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7
+
+ adst8x4_1d_lsx
+
+ vilvl.h vr4, vr17, vr13
+ vilvl.h vr5, vr15, vr18
+ vilvl.w vr0, vr5, vr4
+ vilvh.w vr1, vr5, vr4
+ vilvh.h vr4, vr18, vr15
+ vilvh.h vr5, vr13, vr17
+ vilvl.w vr2, vr5, vr4
+ vilvh.w vr3, vr5, vr4
+
+ la.local t0, iadst4_coeffs
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ vsllwil.w.h vr10, vr0, 0
+ vexth.w.h vr11, vr0
+ vsllwil.w.h vr12, vr1, 0
+ vexth.w.h vr13, vr1
+
+ adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr14, vr2, 0
+ vexth.w.h vr15, vr2
+ vsllwil.w.h vr16, vr3, 0
+ vexth.w.h vr17, vr3
+
+ adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
+
+ vssrarni.h.w vr14, vr10, 12
+ vssrarni.h.w vr15, vr11, 12
+ vssrarni.h.w vr16, vr12, 12
+ vssrarni.h.w vr17, vr13, 12
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr17, vr16, vr15, vr14
+endfunc
+
+function inv_txfm_add_flipadst_dct_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7
+
+ adst8x4_1d_lsx
+
+ vilvl.h vr20, vr15, vr13
+ vilvl.h vr21, vr18, vr17
+ vilvl.w vr0, vr21, vr20
+ vilvh.w vr1, vr21, vr20
+ vilvh.h vr20, vr15, vr13
+ vilvh.h vr21, vr18, vr17
+ vilvl.w vr2, vr21, vr20
+ vilvh.w vr3, vr21, vr20
+ vshuf4i.h vr0, vr0, 0x2d
+ vshuf4i.h vr1, vr1, 0x2d
+ vshuf4i.h vr2, vr2, 0x78
+ vshuf4i.h vr3, vr3, 0x78
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx2 vr2, vr3, vr0, vr1, vr21, vr20, vr22, \
+ vr22, vr15, vr16, vr17, vr18
+
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+ vsrari.h vr18, vr18, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
+endfunc
+
+function inv_txfm_add_dct_flipadst_8x4_8bpc_lsx
+ la.local t0, idct_coeffs
+
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr0, vr1, vr2, vr3
+
+ vshuf4i.d vr1, vr1, 0x01
+ vshuf4i.d vr3, vr3, 0x01
+
+ vilvl.h vr4, vr1, vr0
+ vilvh.h vr5, vr1, vr0
+ vilvl.h vr0, vr5, vr4
+ vilvh.h vr1, vr5, vr4
+ vilvl.h vr4, vr3, vr2
+ vilvh.h vr5, vr3, vr2
+ vilvl.h vr2, vr5, vr4
+ vilvh.h vr3, vr5, vr4
+
+ la.local t0, iadst4_coeffs
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ vsllwil.w.h vr10, vr0, 0 // in0
+ vexth.w.h vr11, vr0 // in1
+ vsllwil.w.h vr12, vr1, 0 // in2
+ vexth.w.h vr13, vr1 // in3
+ adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr14, vr2, 0
+ vexth.w.h vr15, vr2
+ vsllwil.w.h vr16, vr3, 0
+ vexth.w.h vr17, vr3
+ adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
+
+ vssrarni.h.w vr14, vr10, 12
+ vssrarni.h.w vr15, vr11, 12
+ vssrarni.h.w vr16, vr12, 12
+ vssrarni.h.w vr17, vr13, 12
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr17, vr16, vr15, vr14
+endfunc
+
+function inv_txfm_add_flipadst_flipadst_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7
+
+ adst8x4_1d_lsx
+
+ vilvl.h vr20, vr15, vr13
+ vilvl.h vr21, vr18, vr17
+ vilvl.w vr0, vr21, vr20
+ vilvh.w vr1, vr21, vr20
+ vilvh.h vr20, vr15, vr13
+ vilvh.h vr21, vr18, vr17
+ vilvl.w vr2, vr21, vr20
+ vilvh.w vr3, vr21, vr20
+ vshuf4i.h vr0, vr0, 0x2d
+ vshuf4i.h vr1, vr1, 0x2d
+ vshuf4i.h vr2, vr2, 0x78
+ vshuf4i.h vr3, vr3, 0x78
+
+ la.local t0, iadst4_coeffs
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ vsllwil.w.h vr10, vr2, 0 // in0
+ vexth.w.h vr11, vr2 // in1
+ vsllwil.w.h vr12, vr3, 0 // in2
+ vexth.w.h vr13, vr3 // in3
+ adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr14, vr0, 0
+ vexth.w.h vr15, vr0
+ vsllwil.w.h vr16, vr1, 0
+ vexth.w.h vr17, vr1
+ adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
+
+ vssrarni.h.w vr14, vr10, 12
+ vssrarni.h.w vr15, vr11, 12
+ vssrarni.h.w vr16, vr12, 12
+ vssrarni.h.w vr17, vr13, 12
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr17, vr16, vr15, vr14
+endfunc
+
+function inv_txfm_add_dct_identity_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr0, vr1, vr2, vr3
+
+ vshuf4i.d vr1, vr1, 0x01
+ vshuf4i.d vr3, vr3, 0x01
+
+ vilvl.h vr4, vr1, vr0
+ vilvh.h vr5, vr1, vr0
+ vilvl.h vr0, vr5, vr4
+ vilvh.h vr1, vr5, vr4
+ vilvl.h vr4, vr3, vr2
+ vilvh.h vr5, vr3, vr2
+ vilvl.h vr2, vr5, vr4
+ vilvh.h vr3, vr5, vr4
+ vilvl.d vr14, vr2, vr0
+ vilvh.d vr15, vr2, vr0
+ vilvl.d vr16, vr3, vr1
+ vilvh.d vr17, vr3, vr1
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr14, vr14, vr20, vr14, vr14
+ identity_4x4_lsx vr15, vr15, vr20, vr15, vr15
+ identity_4x4_lsx vr16, vr16, vr20, vr16, vr16
+ identity_4x4_lsx vr17, vr17, vr20, vr17, vr17
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+endfunc
+
+function inv_txfm_add_identity_dct_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31
+
+ identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \
+ vr19, vr7, vr9, vr11
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vilvl.h vr4, vr7, vr19
+ vilvh.h vr5, vr7, vr19
+ vilvl.h vr0, vr5, vr4
+ vilvh.h vr1, vr5, vr4
+ vilvl.h vr4, vr11, vr9
+ vilvh.h vr5, vr11, vr9
+ vilvl.h vr2, vr5, vr4
+ vilvh.h vr3, vr5, vr4
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \
+ vr22, vr15, vr16, vr17, vr18
+
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+ vsrari.h vr18, vr18, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
+endfunc
+
+function inv_txfm_add_flipadst_identity_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7
+
+ adst8x4_1d_lsx
+
+ vilvl.h vr20, vr15, vr13
+ vilvl.h vr21, vr18, vr17
+ vilvl.w vr0, vr21, vr20
+ vilvh.w vr1, vr21, vr20
+ vilvh.h vr20, vr15, vr13
+ vilvh.h vr21, vr18, vr17
+ vilvl.w vr2, vr21, vr20
+ vilvh.w vr3, vr21, vr20
+ vshuf4i.h vr0, vr0, 0x2d
+ vshuf4i.h vr1, vr1, 0x2d
+ vshuf4i.h vr2, vr2, 0x78
+ vshuf4i.h vr3, vr3, 0x78
+ vilvl.d vr14, vr0, vr2 // in0
+ vilvh.d vr15, vr0, vr2 // in1
+ vilvl.d vr16, vr1, vr3 // in2
+ vilvh.d vr17, vr1, vr3 // in3
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr14, vr14, vr20, vr14, vr14
+ identity_4x4_lsx vr15, vr15, vr20, vr15, vr15
+ identity_4x4_lsx vr16, vr16, vr20, vr16, vr16
+ identity_4x4_lsx vr17, vr17, vr20, vr17, vr17
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+endfunc
+
+function inv_txfm_add_identity_flipadst_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31
+
+ identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \
+ vr19, vr7, vr9, vr11
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vilvl.h vr4, vr7, vr19
+ vilvh.h vr5, vr7, vr19
+ vilvl.h vr0, vr5, vr4
+ vilvh.h vr1, vr5, vr4
+ vilvl.h vr4, vr11, vr9
+ vilvh.h vr5, vr11, vr9
+ vilvl.h vr2, vr5, vr4
+ vilvh.h vr3, vr5, vr4
+
+ la.local t0, iadst4_coeffs
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ vsllwil.w.h vr10, vr0, 0 // in0
+ vexth.w.h vr11, vr0 // in1
+ vsllwil.w.h vr12, vr1, 0 // in2
+ vexth.w.h vr13, vr1 // in3
+ adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr14, vr2, 0
+ vexth.w.h vr15, vr2
+ vsllwil.w.h vr16, vr3, 0
+ vexth.w.h vr17, vr3
+ adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
+
+ vssrarni.h.w vr14, vr10, 12
+ vssrarni.h.w vr15, vr11, 12
+ vssrarni.h.w vr16, vr12, 12
+ vssrarni.h.w vr17, vr13, 12
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr17, vr16, vr15, vr14
+endfunc
+
+function inv_txfm_add_adst_identity_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7
+
+ adst8x4_1d_lsx
+
+ vilvl.h vr4, vr17, vr13
+ vilvl.h vr5, vr15, vr18
+ vilvl.w vr14, vr5, vr4 // in0 in1
+ vilvh.w vr16, vr5, vr4 // in2 in3
+ vilvh.h vr4, vr18, vr15
+ vilvh.h vr5, vr13, vr17
+ vilvl.w vr17, vr5, vr4
+ vilvh.w vr18, vr5, vr4
+ vilvl.d vr10, vr17, vr14 // in0
+ vilvh.d vr11, vr17, vr14 // in1
+ vilvl.d vr12, vr18, vr16 // in2
+ vilvh.d vr13, vr18, vr16 // in3
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+
+ identity_4x4_lsx vr10, vr10, vr20, vr10, vr15
+ identity_4x4_lsx vr11, vr11, vr20, vr11, vr16
+ identity_4x4_lsx vr12, vr12, vr20, vr12, vr17
+ identity_4x4_lsx vr13, vr13, vr20, vr13, vr18
+
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+ vsrari.h vr18, vr18, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
+endfunc
+
+function inv_txfm_add_identity_adst_8x4_8bpc_lsx
+ vld vr0, a2, 0 // in0
+ vld vr1, a2, 16 // in1
+ vld vr2, a2, 32 // in2
+ vld vr3, a2, 48 // in3
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 0 // 2896
+
+ rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
+ rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15
+ rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23
+ rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31
+
+ identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \
+ vr0, vr1, vr2, vr3
+
+ vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15
+ vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7
+ vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15
+ vilvl.h vr4, vr3, vr2 // 0 2 4 6 8 10 12 14
+ vilvh.h vr5, vr3, vr2 // 1 3 5 7 9 11 13 15
+ vilvl.h vr2, vr5, vr4 // 0 1 2 3 4 5 6 7
+ vilvh.h vr3, vr5, vr4 // 8 9 10 11 12 13 14 15
+
+ vreplgr2vr.h vr23, zero
+ vst vr23, a2, 0
+ vst vr23, a2, 16
+ vst vr23, a2, 32
+ vst vr23, a2, 48
+
+ la.local t0, iadst4_coeffs
+
+ vldrepl.w vr20, t0, 0 // 1321
+ vldrepl.w vr21, t0, 4 // 3803
+ vldrepl.w vr22, t0, 8 // 2482
+ vldrepl.w vr23, t0, 12 // 3344
+
+ vsllwil.w.h vr10, vr0, 0
+ vexth.w.h vr11, vr0
+ vsllwil.w.h vr12, vr1, 0
+ vexth.w.h vr13, vr1
+
+ adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr14, vr2, 0
+ vexth.w.h vr15, vr2
+ vsllwil.w.h vr16, vr3, 0
+ vexth.w.h vr17, vr3
+
+ adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
+
+ vssrarni.h.w vr14, vr10, 12
+ vssrarni.h.w vr15, vr11, 12
+ vssrarni.h.w vr16, vr12, 12
+ vssrarni.h.w vr17, vr13, 12
+
+ vsrari.h vr14, vr14, 4
+ vsrari.h vr15, vr15, 4
+ vsrari.h vr16, vr16, 4
+ vsrari.h vr17, vr17, 4
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+endfunc
+
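+// 8x8 identity_identity: both passes are just the identity8 x2 scale, done
+// inline as a widening shift-left-by-1 plus a rounding narrow (>>1 after the
+// first pass, >>4 after the second), with a transpose in between.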
+function inv_txfm_add_identity_identity_8x8_8bpc_lsx
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15
+
+ // identity8
+ vsllwil.w.h vr6, vr0, 1
+ vsllwil.w.h vr7, vr1, 1
+ vsllwil.w.h vr8, vr2, 1
+ vsllwil.w.h vr9, vr3, 1
+ vsllwil.w.h vr10, vr4, 1
+ vsllwil.w.h vr11, vr5, 1
+ vsllwil.w.h vr12, vr14, 1
+ vsllwil.w.h vr13, vr15, 1
+
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15
+ vexth.w.h \i, \i
+.endr
+
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15
+ vslli.w \i, \i, 1
+.endr
+
+ vssrarni.h.w vr0, vr6, 1 // in0
+ vssrarni.h.w vr1, vr7, 1 // in1
+ vssrarni.h.w vr2, vr8, 1 // in2
+ vssrarni.h.w vr3, vr9, 1 // in3
+ vssrarni.h.w vr4, vr10, 1 // in4
+ vssrarni.h.w vr5, vr11, 1 // in5
+ vssrarni.h.w vr14, vr12, 1 // in6
+ vssrarni.h.w vr15, vr13, 1 // in7
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \
+ vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13
+
+ vsllwil.w.h vr6, vr16, 1
+ vsllwil.w.h vr7, vr17, 1
+ vsllwil.w.h vr8, vr18, 1
+ vsllwil.w.h vr9, vr19, 1
+ vsllwil.w.h vr10, vr20, 1
+ vsllwil.w.h vr11, vr21, 1
+ vsllwil.w.h vr12, vr22, 1
+ vsllwil.w.h vr13, vr23, 1
+
+.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+ vexth.w.h \i, \i
+.endr
+
+.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+ vslli.w \i, \i, 1
+.endr
+
+ vssrarni.h.w vr16, vr6, 4 // in0
+ vssrarni.h.w vr17, vr7, 4 // in1
+ vssrarni.h.w vr18, vr8, 4 // in2
+ vssrarni.h.w vr19, vr9, 4 // in3
+ vssrarni.h.w vr20, vr10, 4 // in4
+ vssrarni.h.w vr21, vr11, 4 // in5
+ vssrarni.h.w vr22, vr12, 4 // in6
+ vssrarni.h.w vr23, vr13, 4 // in7
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
+
+endfunc
+
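+// Same 8-point ADST as adst8x4_1d_lsx, but the four packed output pairs
+// (out0|out7, out1|out6, out2|out5, out3|out4) are returned through the
+// out0-out3 macro arguments so both halves of an 8x8 block can stay live.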
+.macro adst8x8_1d_lsx out0, out1, out2, out3
+ la.local t0, iadst8_coeffs
+
+ vldrepl.w vr20, t0, 0 // 4076
+ vldrepl.w vr21, t0, 4 // 401
+ vldrepl.w vr22, t0, 8 // 3612
+ vldrepl.w vr23, t0, 12 // 1931
+
+ // vr13 t0a t1a vr15 t2a t3a
+ vmadd_vmsub_vssrarni_hw_12 vr11, vr18, vr9, vr6, vr20, vr21, vr21, vr20, \
+ vr22, vr23, vr23, vr22, vr12, vr13, vr14, vr15
+ vldrepl.w vr20, t0, 16 // 2598
+ vldrepl.w vr21, t0, 20 // 3166
+ vldrepl.w vr22, t0, 24 // 1189
+ vldrepl.w vr23, t0, 28 // 3920
+
+ // vr18 t4a t5a vr6 t6a t7a
+ vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr19, vr10, vr20, vr21, vr21, vr20, \
+ vr22, vr23, vr23, vr22, vr11, vr18, vr9, vr6
+
+ vsadd.h vr12, vr13, vr18 // t0 t1
+ vsadd.h vr14, vr15, vr6 // t2 t3
+ vssub.h vr9, vr13, vr18 // t4 t5
+ vssub.h vr18, vr15, vr6 // t6 t7
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ vsllwil.w.h vr7, vr9, 0 // t4
+ vexth.w.h vr8, vr9 // t5
+ vsllwil.w.h vr10, vr18, 0 // t6
+ vexth.w.h vr11, vr18 // t7
+
+ // vr13 out0 out7 vr7 out1 out6
+ vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr11, vr10, vr21, vr20, vr20, vr21, \
+ vr20, vr21, vr21, vr20, vr13, vr15, vr18, vr19
+ vshuf4i.d vr19, vr19, 0x01
+
+ vsadd.h vr13, vr12, vr14 // out0 out7
+ vssub.h vr6, vr12, vr14 // t2 t3
+ vsadd.h vr7, vr15, vr19 // out1 out6
+ vssub.h vr18, vr15, vr19 // t6 t7
+
+ vexth.w.h vr20, vr13 // out7
+ vsllwil.w.h vr21, vr7, 0 // out1
+ vneg.w vr20, vr20
+ vneg.w vr21, vr21
+ vssrarni.h.w vr21, vr20, 0 // out7 out1
+ vilvl.d \out0, vr21, vr13 // out0 out7
+ vilvh.d \out1, vr7, vr21 // out1 out6
+
+ vsllwil.w.h vr7, vr6, 0 // t2
+ vexth.w.h vr8, vr6 // t3
+ vsllwil.w.h vr10, vr18, 0 // t6
+ vexth.w.h vr11, vr18 // t7
+
+ // vr15 out[3] out[4] vr18 out[2] out[5]
+ vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr10, vr11, vr22, vr22, vr22, vr22, \
+ vr22, vr22, vr22, vr22, vr14, vr15, vr19, vr18
+
+ vexth.w.h vr20, vr18 // out5
+ vsllwil.w.h vr21, vr15, 0 // out3
+ vneg.w vr20, vr20
+ vneg.w vr21, vr21
+ vssrarni.h.w vr21, vr20, 0 // out5 out3
+ vilvl.d \out2, vr21, vr18 // out2 out5
+ vilvh.d \out3, vr15, vr21 // out3 out4
+.endm
+
+function inv_txfm_add_adst_dct_8x8_8bpc_lsx
+ addi.d sp, sp, -32
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr16, 0
+ vsllwil.w.h vr11, vr17, 0
+ adst8x8_1d_lsx vr24, vr25, vr26, vr27
+
+ vexth.w.h vr18, vr0
+ vexth.w.h vr19, vr1
+ vexth.w.h vr6, vr2
+ vexth.w.h vr7, vr3
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr10, vr16
+ vexth.w.h vr11, vr17
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
+ vsrari.h \i, \i, 1
+.endr
+
+ LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
+ vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25, \
+ vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17
+
+ vshuf4i.h vr14, vr14, 0x1b
+ vshuf4i.h vr15, vr15, 0x1b
+ vshuf4i.h vr24, vr24, 0x1b
+ vshuf4i.h vr25, vr25, 0x1b
+
+ vsllwil.w.h vr18, vr4, 0
+ vsllwil.w.h vr19, vr5, 0
+ vsllwil.w.h vr6, vr12, 0
+ vsllwil.w.h vr7, vr13, 0
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr10, vr12
+ vexth.w.h vr11, vr13
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr4, vr5, vr12, vr13
+
+ vshuf4i.d vr5, vr5, 0x01
+ vshuf4i.d vr13, vr13, 0x01
+
+ vsllwil.w.h vr18, vr14, 0
+ vsllwil.w.h vr19, vr15, 0
+ vsllwil.w.h vr6, vr24, 0
+ vsllwil.w.h vr7, vr25, 0
+ vexth.w.h vr8, vr14
+ vexth.w.h vr9, vr15
+ vexth.w.h vr10, vr24
+ vexth.w.h vr11, vr25
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr14, vr15, vr24, vr25
+
+ vshuf4i.d vr15, vr15, 0x01
+ vshuf4i.d vr25, vr25, 0x01
+
+ vilvl.d vr20, vr14, vr4
+ vilvh.d vr21, vr14, vr4
+ vilvl.d vr22, vr15, vr5
+ vilvh.d vr23, vr15, vr5
+ vilvl.d vr16, vr24, vr12
+ vilvh.d vr17, vr24, vr12
+ vilvl.d vr18, vr25, vr13
+ vilvh.d vr19, vr25, vr13
+
+.irp i, vr20, vr21, vr22, vr23, vr16, vr17, vr18, vr19
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ addi.d sp, sp, 32
+endfunc
+
+function inv_txfm_add_dct_adst_8x8_8bpc_lsx
+ addi.d sp, sp, -48
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+
+ vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ vsllwil.w.h vr18, vr4, 0
+ vsllwil.w.h vr19, vr5, 0
+ vsllwil.w.h vr6, vr12, 0
+ vsllwil.w.h vr7, vr13, 0
+ vsllwil.w.h vr8, vr14, 0
+ vsllwil.w.h vr9, vr15, 0
+ vsllwil.w.h vr10, vr24, 0
+ vsllwil.w.h vr11, vr25, 0
+
+ dct_8x4_core_lsx1 vr26, vr27, vr28, vr29
+
+ vshuf4i.d vr27, vr27, 0x01
+ vshuf4i.d vr29, vr29, 0x01
+
+ vilvl.h vr8, vr27, vr26 // 0 2 4 6 8 10 12 14
+ vilvh.h vr9, vr27, vr26 // 1 3 5 7 9 11 13 15
+ vilvl.h vr26, vr9, vr8 // 0 - 7 in0
+ vilvh.h vr27, vr9, vr8 // 8 - 15 in1
+ vilvl.h vr8, vr29, vr28 // 0 2 4 6 8 10 12 14
+ vilvh.h vr9, vr29, vr28 // 1 3 5 7 9 11 13 15
+ vilvl.h vr28, vr9, vr8 // 16 - 23 in2
+ vilvh.h vr29, vr9, vr8 // 24 - 31 in3
+
+ vsrari.h vr26, vr26, 1 // in0low in1low
+ vsrari.h vr27, vr27, 1 // in2low in3low
+ vsrari.h vr28, vr28, 1 // in0high in1high
+ vsrari.h vr29, vr29, 1 // in2high in3high
+
+ vexth.w.h vr18, vr4
+ vexth.w.h vr19, vr5
+ vexth.w.h vr6, vr12
+ vexth.w.h vr7, vr13
+ vexth.w.h vr8, vr14
+ vexth.w.h vr9, vr15
+ vexth.w.h vr10, vr24
+ vexth.w.h vr11, vr25
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr12, vr13, vr14, vr15
+
+ vshuf4i.d vr13, vr13, 0x01
+ vshuf4i.d vr15, vr15, 0x01
+
+ vilvl.h vr8, vr13, vr12 // 0 2 4 6 8 10 12 14
+ vilvh.h vr9, vr13, vr12 // 1 3 5 7 9 11 13 15
+ vilvl.h vr12, vr9, vr8 // 0 - 7 in0
+ vilvh.h vr13, vr9, vr8 // 8 - 15 in1
+ vilvl.h vr8, vr15, vr14 // 0 2 4 6 8 10 12 14
+ vilvh.h vr9, vr15, vr14 // 1 3 5 7 9 11 13 15
+ vilvl.h vr14, vr9, vr8 // 16 - 23 in2
+ vilvh.h vr15, vr9, vr8 // 24 - 31 in3
+
+ vsrari.h vr0, vr12, 1 // in4low in5low
+ vsrari.h vr1, vr13, 1 // in6low in7low
+ vsrari.h vr2, vr14, 1 // in4high in5high
+ vsrari.h vr3, vr15, 1 // in6high in7high
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ vsllwil.w.h vr18, vr26, 0 // in0
+ vexth.w.h vr19, vr26 // in1
+ vsllwil.w.h vr6, vr27, 0 // in2
+ vexth.w.h vr7, vr27 // in3
+ vsllwil.w.h vr8, vr0, 0 // in4
+ vexth.w.h vr9, vr0 // in5
+ vsllwil.w.h vr10, vr1, 0 // in6
+ vexth.w.h vr11, vr1 // in7
+ adst8x8_1d_lsx vr26, vr27, vr0, vr1
+
+ vsllwil.w.h vr18, vr28, 0 // in0
+ vexth.w.h vr19, vr28 // in1
+ vsllwil.w.h vr6, vr29, 0 // in2
+ vexth.w.h vr7, vr29 // in3
+ vsllwil.w.h vr8, vr2, 0 // in4
+ vexth.w.h vr9, vr2 // in5
+ vsllwil.w.h vr10, vr3, 0 // in6
+ vexth.w.h vr11, vr3 // in7
+ adst8x8_1d_lsx vr28, vr29, vr16, vr17
+
+ vilvl.d vr4, vr28, vr26 // 0 ... 7
+ vilvl.d vr5, vr29, vr27 // 8 ... 15
+ vilvl.d vr6, vr16, vr0 // 16 ... 23
+ vilvl.d vr7, vr17, vr1 // 24 ... 31
+ vilvh.d vr14, vr17, vr1 // 32 ... 39
+ vilvh.d vr15, vr16, vr0 // 40 ... 47
+ vilvh.d vr16, vr29, vr27 // 48 ... 55
+ vilvh.d vr17, vr28, vr26 // 56 ... 63
+
+.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ addi.d sp, sp, 48
+endfunc
+
+function inv_txfm_add_adst_adst_8x8_8bpc_lsx
+ addi.d sp, sp, -32
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr16, 0
+ vsllwil.w.h vr11, vr17, 0
+ adst8x8_1d_lsx vr24, vr25, vr26, vr27
+
+ vexth.w.h vr18, vr0 // in0
+ vexth.w.h vr19, vr1 // in1
+ vexth.w.h vr6, vr2 // in2
+ vexth.w.h vr7, vr3 // in3
+ vexth.w.h vr8, vr4 // in4
+ vexth.w.h vr9, vr5 // in5
+ vexth.w.h vr10, vr16 // in6
+ vexth.w.h vr11, vr17 // in7
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
+ vsrari.h \i, \i, 1
+.endr
+
+ LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
+ vr14, vr15, vr12, vr13, vr4, vr5, vr24, vr25, \
+ vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17
+
+ vshuf4i.h vr4, vr4, 0x1b
+ vshuf4i.h vr5, vr5, 0x1b
+ vshuf4i.h vr24, vr24, 0x1b
+ vshuf4i.h vr25, vr25, 0x1b
+
+ vsllwil.w.h vr18, vr14, 0
+ vsllwil.w.h vr19, vr15, 0
+ vsllwil.w.h vr6, vr12, 0
+ vsllwil.w.h vr7, vr13, 0
+ vexth.w.h vr8, vr14 // in4
+ vexth.w.h vr9, vr15 // in5
+ vexth.w.h vr10, vr12 // in6
+ vexth.w.h vr11, vr13 // in7
+
+ adst8x8_1d_lsx vr26, vr27, vr0, vr1
+
+ vsllwil.w.h vr18, vr4, 0
+ vsllwil.w.h vr19, vr5, 0
+ vsllwil.w.h vr6, vr24, 0
+ vsllwil.w.h vr7, vr25, 0
+ vexth.w.h vr8, vr4 // in4
+ vexth.w.h vr9, vr5 // in5
+ vexth.w.h vr10, vr24 // in6
+ vexth.w.h vr11, vr25 // in7
+
+ adst8x8_1d_lsx vr24, vr25, vr16, vr17
+
+ vilvl.d vr4, vr24, vr26 // 0 ... 7
+ vilvl.d vr5, vr25, vr27 // 8 ... 15
+ vilvl.d vr6, vr16, vr0 // 16 ... 23
+ vilvl.d vr7, vr17, vr1 // 24 ... 31
+ vilvh.d vr14, vr17, vr1 // 32 ... 39
+ vilvh.d vr15, vr16, vr0 // 40 ... 47
+ vilvh.d vr16, vr25, vr27 // 48 ... 55
+ vilvh.d vr17, vr24, vr26 // 56 ... 63
+
+.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ addi.d sp, sp, 32
+endfunc
+
+function inv_txfm_add_flipadst_adst_8x8_8bpc_lsx
+ addi.d sp, sp, -32
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr16, 0
+ vsllwil.w.h vr11, vr17, 0
+ adst8x8_1d_lsx vr12, vr13, vr14, vr15
+
+ vilvl.h vr20, vr12, vr13
+ vilvl.h vr21, vr14, vr15
+ vilvl.w vr24, vr20, vr21
+ vilvh.w vr25, vr20, vr21
+ vilvh.h vr20, vr12, vr13
+ vilvh.h vr21, vr14, vr15
+ vilvl.w vr26, vr20, vr21
+ vilvh.w vr27, vr20, vr21
+ vshuf4i.h vr26, vr26, 0x1b
+ vshuf4i.h vr27, vr27, 0x1b
+
+ vexth.w.h vr18, vr0
+ vexth.w.h vr19, vr1
+ vexth.w.h vr6, vr2
+ vexth.w.h vr7, vr3
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr10, vr16
+ vexth.w.h vr11, vr17
+ adst8x8_1d_lsx vr12, vr13, vr14, vr15
+
+ vilvl.h vr20, vr12, vr13
+ vilvl.h vr21, vr14, vr15
+ vilvl.w vr0, vr20, vr21
+ vilvh.w vr1, vr20, vr21
+ vilvh.h vr20, vr12, vr13
+ vilvh.h vr21, vr14, vr15
+ vilvl.w vr2, vr20, vr21
+ vilvh.w vr3, vr20, vr21
+ vshuf4i.h vr2, vr2, 0x1b
+ vshuf4i.h vr3, vr3, 0x1b
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
+ vsrari.h \i, \i, 1
+.endr
+
+ vsllwil.w.h vr18, vr26, 0 // in0
+ vexth.w.h vr19, vr26 // in1
+ vsllwil.w.h vr6, vr27, 0 // in2
+ vexth.w.h vr7, vr27 // in3
+ vsllwil.w.h vr8, vr2, 0 // in4
+ vexth.w.h vr9, vr2 // in5
+ vsllwil.w.h vr10, vr3, 0 // in6
+ vexth.w.h vr11, vr3 // in7
+ adst8x8_1d_lsx vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr24, 0 // in0
+ vexth.w.h vr19, vr24 // in1
+ vsllwil.w.h vr6, vr25, 0 // in2
+ vexth.w.h vr7, vr25 // in3
+ vsllwil.w.h vr8, vr0, 0 // in4
+ vexth.w.h vr9, vr0 // in5
+ vsllwil.w.h vr10, vr1, 0 // in6
+ vexth.w.h vr11, vr1 // in7
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vilvl.d vr20, vr0, vr4 // 0 ... 7
+ vilvl.d vr21, vr1, vr5 // 8 ... 15
+ vilvl.d vr22, vr2, vr16 // 16 ... 23
+ vilvl.d vr23, vr3, vr17 // 24 ... 31
+ vilvh.d vr14, vr3, vr17 // 32 ... 39
+ vilvh.d vr15, vr2, vr16 // 40 ... 47
+ vilvh.d vr16, vr1, vr5 // 48 ... 55
+ vilvh.d vr17, vr0, vr4 // 56 ... 63
+
+.irp i, vr20, vr21, vr22, vr23, vr14, vr15, vr16, vr17
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ addi.d sp, sp, 32
+endfunc
+
+function inv_txfm_add_adst_flipadst_8x8_8bpc_lsx
+ addi.d sp, sp, -32
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr16, 0
+ vsllwil.w.h vr11, vr17, 0
+ adst8x8_1d_lsx vr24, vr25, vr26, vr27
+
+ vexth.w.h vr18, vr0
+ vexth.w.h vr19, vr1
+ vexth.w.h vr6, vr2
+ vexth.w.h vr7, vr3
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr10, vr16
+ vexth.w.h vr11, vr17
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
+ vsrari.h \i, \i, 1
+.endr
+
+ LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
+ vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
+ vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17
+
+ vshuf4i.h vr0, vr0, 0x1b
+ vshuf4i.h vr1, vr1, 0x1b
+ vshuf4i.h vr2, vr2, 0x1b
+ vshuf4i.h vr3, vr3, 0x1b
+
+ vsllwil.w.h vr18, vr0, 0 // in0
+ vsllwil.w.h vr19, vr1, 0 // in1
+ vsllwil.w.h vr6, vr2, 0 // in2
+ vsllwil.w.h vr7, vr3, 0 // in3
+ vexth.w.h vr8, vr0 // in4
+ vexth.w.h vr9, vr1 // in5
+ vexth.w.h vr10, vr2 // in6
+ vexth.w.h vr11, vr3 // in7
+ adst8x8_1d_lsx vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr24, 0 // in0
+ vsllwil.w.h vr19, vr25, 0 // in1
+ vsllwil.w.h vr6, vr26, 0 // in2
+ vsllwil.w.h vr7, vr27, 0 // in3
+ vexth.w.h vr8, vr24 // in4
+ vexth.w.h vr9, vr25 // in5
+ vexth.w.h vr10, vr26 // in6
+ vexth.w.h vr11, vr27 // in7
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vilvh.d vr20, vr4, vr0
+ vilvh.d vr21, vr5, vr1
+ vilvh.d vr22, vr16, vr2
+ vilvh.d vr23, vr17, vr3
+ vilvl.d vr14, vr17, vr3
+ vilvl.d vr15, vr16, vr2
+ vilvl.d vr18, vr5, vr1
+ vilvl.d vr19, vr4, vr0
+
+.irp i, vr20, vr21, vr22, vr23, vr14, vr15, vr18, vr19
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr18, vr19
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ addi.d sp, sp, 32
+endfunc
+
+function inv_txfm_add_flipadst_dct_8x8_8bpc_lsx
+ addi.d sp, sp, -32
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr16, 0
+ vsllwil.w.h vr11, vr17, 0
+ adst8x8_1d_lsx vr12, vr13, vr14, vr15
+
+ vilvl.h vr20, vr12, vr13
+ vilvl.h vr21, vr14, vr15
+ vilvl.w vr24, vr20, vr21
+ vilvh.w vr25, vr20, vr21
+ vilvh.h vr20, vr12, vr13
+ vilvh.h vr21, vr14, vr15
+ vilvl.w vr26, vr20, vr21
+ vilvh.w vr27, vr20, vr21
+
+ vexth.w.h vr18, vr0
+ vexth.w.h vr19, vr1
+ vexth.w.h vr6, vr2
+ vexth.w.h vr7, vr3
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr10, vr16
+ vexth.w.h vr11, vr17
+ adst8x8_1d_lsx vr12, vr13, vr14, vr15
+
+ vilvl.h vr20, vr12, vr13
+ vilvl.h vr21, vr14, vr15
+ vilvl.w vr0, vr20, vr21
+ vilvh.w vr1, vr20, vr21
+ vilvh.h vr20, vr12, vr13
+ vilvh.h vr21, vr14, vr15
+ vilvl.w vr2, vr20, vr21
+ vilvh.w vr3, vr20, vr21
+
+ vreplgr2vr.h vr23, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ vsrari.h vr24, vr24, 1
+ vsrari.h vr25, vr25, 1
+ vsrari.h vr26, vr26, 1
+ vsrari.h vr27, vr27, 1
+ vsrari.h vr14, vr0, 1
+ vsrari.h vr15, vr1, 1
+ vsrari.h vr16, vr2, 1
+ vsrari.h vr17, vr3, 1
+
+ vsllwil.w.h vr18, vr26, 0
+ vexth.w.h vr19, vr26
+ vsllwil.w.h vr6, vr27, 0
+ vexth.w.h vr7, vr27
+ vsllwil.w.h vr8, vr16, 0
+ vexth.w.h vr9, vr16
+ vsllwil.w.h vr10, vr17, 0
+ vexth.w.h vr11, vr17
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr26, vr27, vr16, vr17
+
+ vshuf4i.h vr26, vr26, 0x1b
+ vshuf4i.h vr27, vr27, 0x1b
+ vshuf4i.h vr16, vr16, 0x1b
+ vshuf4i.h vr17, vr17, 0x1b
+
+ vsllwil.w.h vr18, vr24, 0
+ vexth.w.h vr19, vr24
+ vsllwil.w.h vr6, vr25, 0
+ vexth.w.h vr7, vr25
+ vsllwil.w.h vr8, vr14, 0
+ vexth.w.h vr9, vr14
+ vsllwil.w.h vr10, vr15, 0
+ vexth.w.h vr11, vr15
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr24, vr25, vr14, vr15
+
+ vilvl.d vr4, vr24, vr26
+ vilvh.d vr5, vr24, vr26
+ vilvh.d vr6, vr25, vr27
+ vilvl.d vr7, vr25, vr27
+ vilvl.d vr24, vr14, vr16
+ vilvh.d vr25, vr14, vr16
+ vilvh.d vr26, vr15, vr17
+ vilvl.d vr27, vr15, vr17
+
+.irp i, vr4, vr5, vr6, vr7, vr24, vr25, vr26, vr27
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr24, vr25, vr26, vr27
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ addi.d sp, sp, 32
+endfunc
+
+function inv_txfm_add_dct_flipadst_8x8_8bpc_lsx
+ addi.d sp, sp, -48
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+
+ vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ vsllwil.w.h vr18, vr4, 0
+ vsllwil.w.h vr19, vr5, 0
+ vsllwil.w.h vr6, vr12, 0
+ vsllwil.w.h vr7, vr13, 0
+ vsllwil.w.h vr8, vr14, 0
+ vsllwil.w.h vr9, vr15, 0
+ vsllwil.w.h vr10, vr24, 0
+ vsllwil.w.h vr11, vr25, 0
+ dct_8x4_core_lsx1 vr26, vr27, vr28, vr29
+ vshuf4i.d vr27, vr27, 0x01
+ vshuf4i.d vr29, vr29, 0x01
+
+ vilvl.h vr8, vr27, vr26
+ vilvh.h vr9, vr27, vr26
+ vilvl.h vr26, vr9, vr8
+ vilvh.h vr27, vr9, vr8
+ vilvl.h vr8, vr29, vr28
+ vilvh.h vr9, vr29, vr28
+ vilvl.h vr28, vr9, vr8
+ vilvh.h vr29, vr9, vr8
+
+ vsrari.h vr26, vr26, 1 // in0low in1low
+ vsrari.h vr27, vr27, 1 // in2low in3low
+ vsrari.h vr28, vr28, 1 // in0high in1high
+ vsrari.h vr29, vr29, 1 // in2high in3high
+
+ vexth.w.h vr18, vr4
+ vexth.w.h vr19, vr5
+ vexth.w.h vr6, vr12
+ vexth.w.h vr7, vr13
+ vexth.w.h vr8, vr14
+ vexth.w.h vr9, vr15
+ vexth.w.h vr10, vr24
+ vexth.w.h vr11, vr25
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+ dct_8x4_core_lsx1 vr12, vr13, vr14, vr15
+ vshuf4i.d vr13, vr13, 0x01
+ vshuf4i.d vr15, vr15, 0x01
+
+ vilvl.h vr8, vr13, vr12
+ vilvh.h vr9, vr13, vr12
+ vilvl.h vr12, vr9, vr8
+ vilvh.h vr13, vr9, vr8
+ vilvl.h vr8, vr15, vr14
+ vilvh.h vr9, vr15, vr14
+ vilvl.h vr14, vr9, vr8
+ vilvh.h vr15, vr9, vr8
+
+ vsrari.h vr0, vr12, 1
+ vsrari.h vr1, vr13, 1
+ vsrari.h vr2, vr14, 1
+ vsrari.h vr3, vr15, 1
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ vsllwil.w.h vr18, vr28, 0 // in0
+ vexth.w.h vr19, vr28 // in1
+ vsllwil.w.h vr6, vr29, 0 // in2
+ vexth.w.h vr7, vr29 // in3
+ vsllwil.w.h vr8, vr2, 0 // in4
+ vexth.w.h vr9, vr2 // in5
+ vsllwil.w.h vr10, vr3, 0 // in6
+ vexth.w.h vr11, vr3 // in7
+ adst8x8_1d_lsx vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr26, 0 // in0
+ vexth.w.h vr19, vr26 // in1
+ vsllwil.w.h vr6, vr27, 0 // in2
+ vexth.w.h vr7, vr27 // in3
+ vsllwil.w.h vr8, vr0, 0 // in4
+ vexth.w.h vr9, vr0 // in5
+ vsllwil.w.h vr10, vr1, 0 // in6
+ vexth.w.h vr11, vr1 // in7
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vilvh.d vr26, vr4, vr0
+ vilvh.d vr27, vr5, vr1
+ vilvh.d vr28, vr16, vr2
+ vilvh.d vr29, vr17, vr3
+ vilvl.d vr20, vr17, vr3
+ vilvl.d vr21, vr16, vr2
+ vilvl.d vr22, vr5, vr1
+ vilvl.d vr23, vr4, vr0
+
+.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr26, vr27, vr28, vr29
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ addi.d sp, sp, 48
+endfunc
+
+function inv_txfm_add_flipadst_flipadst_8x8_8bpc_lsx
+ addi.d sp, sp, -32
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr16, 0
+ vsllwil.w.h vr11, vr17, 0
+ adst8x8_1d_lsx vr12, vr13, vr14, vr15
+
+ vilvl.h vr20, vr12, vr13
+ vilvl.h vr21, vr14, vr15
+ vilvl.w vr24, vr20, vr21
+ vilvh.w vr25, vr20, vr21
+ vilvh.h vr20, vr12, vr13
+ vilvh.h vr21, vr14, vr15
+ vilvl.w vr26, vr20, vr21
+ vilvh.w vr27, vr20, vr21
+ vshuf4i.h vr26, vr26, 0x1b
+ vshuf4i.h vr27, vr27, 0x1b
+
+ vexth.w.h vr18, vr0
+ vexth.w.h vr19, vr1
+ vexth.w.h vr6, vr2
+ vexth.w.h vr7, vr3
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr10, vr16
+ vexth.w.h vr11, vr17
+ adst8x8_1d_lsx vr12, vr13, vr14, vr15
+
+ vilvl.h vr20, vr12, vr13
+ vilvl.h vr21, vr14, vr15
+ vilvl.w vr0, vr20, vr21
+ vilvh.w vr1, vr20, vr21
+ vilvh.h vr20, vr12, vr13
+ vilvh.h vr21, vr14, vr15
+ vilvl.w vr2, vr20, vr21
+ vilvh.w vr3, vr20, vr21
+ vshuf4i.h vr2, vr2, 0x1b
+ vshuf4i.h vr3, vr3, 0x1b
+
+.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
+ vsrari.h \i, \i, 1
+.endr
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ vsllwil.w.h vr18, vr26, 0 // in0
+ vexth.w.h vr19, vr26 // in1
+ vsllwil.w.h vr6, vr27, 0 // in2
+ vexth.w.h vr7, vr27 // in3
+ vsllwil.w.h vr8, vr2, 0 // in4
+ vexth.w.h vr9, vr2 // in5
+ vsllwil.w.h vr10, vr3, 0 // in6
+ vexth.w.h vr11, vr3 // in7
+ adst8x8_1d_lsx vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr24, 0 // in0
+ vexth.w.h vr19, vr24 // in1
+ vsllwil.w.h vr6, vr25, 0 // in2
+ vexth.w.h vr7, vr25 // in3
+ vsllwil.w.h vr8, vr0, 0 // in4
+ vexth.w.h vr9, vr0 // in5
+ vsllwil.w.h vr10, vr1, 0 // in6
+ vexth.w.h vr11, vr1 // in7
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vilvh.d vr24, vr0, vr4
+ vilvh.d vr25, vr1, vr5
+ vilvh.d vr26, vr2, vr16
+ vilvh.d vr27, vr3, vr17
+ vilvl.d vr20, vr3, vr17
+ vilvl.d vr21, vr2, vr16
+ vilvl.d vr22, vr1, vr5
+ vilvl.d vr23, vr0, vr4
+
+.irp i, vr24, vr25, vr26, vr27, vr20, vr21, vr22, vr23
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr24, vr25, vr26, vr27
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ addi.d sp, sp, 32
+endfunc
+
+function inv_txfm_add_dct_identity_8x8_8bpc_lsx
+ addi.d sp, sp, -48
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+
+ vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ vsllwil.w.h vr18, vr4, 0
+ vsllwil.w.h vr19, vr5, 0
+ vsllwil.w.h vr6, vr12, 0
+ vsllwil.w.h vr7, vr13, 0
+ vsllwil.w.h vr8, vr14, 0
+ vsllwil.w.h vr9, vr15, 0
+ vsllwil.w.h vr10, vr24, 0
+ vsllwil.w.h vr11, vr25, 0
+ dct_8x4_core_lsx1 vr26, vr27, vr28, vr29
+ vshuf4i.d vr27, vr27, 0x01
+ vshuf4i.d vr29, vr29, 0x01
+
+ vilvl.h vr8, vr27, vr26
+ vilvh.h vr9, vr27, vr26
+ vilvl.h vr26, vr9, vr8
+ vilvh.h vr27, vr9, vr8
+ vilvl.h vr8, vr29, vr28
+ vilvh.h vr9, vr29, vr28
+ vilvl.h vr28, vr9, vr8
+ vilvh.h vr29, vr9, vr8
+
+ vsrari.h vr26, vr26, 1 // in0low in1low
+ vsrari.h vr27, vr27, 1 // in2low in3low
+ vsrari.h vr28, vr28, 1 // in0high in1high
+ vsrari.h vr29, vr29, 1 // in2high in3high
+
+ vexth.w.h vr18, vr4
+ vexth.w.h vr19, vr5
+ vexth.w.h vr6, vr12
+ vexth.w.h vr7, vr13
+ vexth.w.h vr8, vr14
+ vexth.w.h vr9, vr15
+ vexth.w.h vr10, vr24
+ vexth.w.h vr11, vr25
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ dct_8x4_core_lsx1 vr12, vr13, vr14, vr15
+
+ vshuf4i.d vr13, vr13, 0x01
+ vshuf4i.d vr15, vr15, 0x01
+
+ vilvl.h vr8, vr13, vr12
+ vilvh.h vr9, vr13, vr12
+ vilvl.h vr12, vr9, vr8
+ vilvh.h vr13, vr9, vr8
+ vilvl.h vr8, vr15, vr14
+ vilvh.h vr9, vr15, vr14
+ vilvl.h vr14, vr9, vr8
+ vilvh.h vr15, vr9, vr8
+
+ vsrari.h vr20, vr12, 1
+ vsrari.h vr21, vr13, 1
+ vsrari.h vr22, vr14, 1
+ vsrari.h vr23, vr15, 1
+
+ vreplgr2vr.h vr19, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr19, a2, \i
+.endr
+ // identity8
+ vsllwil.w.h vr10, vr26, 1
+ vsllwil.w.h vr11, vr27, 1
+ vsllwil.w.h vr16, vr28, 1
+ vsllwil.w.h vr17, vr29, 1
+ vsllwil.w.h vr6, vr20, 1
+ vsllwil.w.h vr7, vr21, 1
+ vsllwil.w.h vr18, vr22, 1
+ vsllwil.w.h vr19, vr23, 1
+
+.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23
+ vexth.w.h \i, \i
+.endr
+
+.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23
+ vslli.w \i, \i, 1
+.endr
+
+ vssrarni.h.w vr16, vr10, 4 // in0
+ vssrarni.h.w vr28, vr26, 4 // in1
+ vssrarni.h.w vr17, vr11, 4 // in2
+ vssrarni.h.w vr29, vr27, 4 // in3
+ vssrarni.h.w vr18, vr6, 4 // in4
+ vssrarni.h.w vr22, vr20, 4 // in5
+ vssrarni.h.w vr19, vr7, 4 // in6
+ vssrarni.h.w vr23, vr21, 4 // in7
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr16, vr28, vr17, vr29
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr18, vr22, vr19, vr23
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ addi.d sp, sp, 48
+endfunc
+
+function inv_txfm_add_identity_dct_8x8_8bpc_lsx
+ addi.d sp, sp, -48
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+
+ // identity8
+ vsllwil.w.h vr6, vr0, 1
+ vsllwil.w.h vr7, vr1, 1
+ vsllwil.w.h vr8, vr2, 1
+ vsllwil.w.h vr9, vr3, 1
+ vsllwil.w.h vr10, vr4, 1
+ vsllwil.w.h vr11, vr5, 1
+ vsllwil.w.h vr12, vr24, 1
+ vsllwil.w.h vr13, vr25, 1
+
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+ vexth.w.h \i, \i
+.endr
+
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+ vslli.w \i, \i, 1
+.endr
+ vssrarni.h.w vr0, vr6, 1 // in0
+ vssrarni.h.w vr1, vr7, 1 // in1
+ vssrarni.h.w vr2, vr8, 1 // in2
+ vssrarni.h.w vr3, vr9, 1 // in3
+ vssrarni.h.w vr4, vr10, 1 // in4
+ vssrarni.h.w vr5, vr11, 1 // in5
+ vssrarni.h.w vr24, vr12, 1 // in6
+ vssrarni.h.w vr25, vr13, 1 // in7
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
+ vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25, \
+ vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+
+ // dct4 in0 in2 in4 in6
+ vsllwil.w.h vr18, vr4, 0
+ vsllwil.w.h vr19, vr5, 0
+ vsllwil.w.h vr6, vr12, 0
+ vsllwil.w.h vr7, vr13, 0
+ vsllwil.w.h vr8, vr14, 0
+ vsllwil.w.h vr9, vr15, 0
+ vsllwil.w.h vr10, vr24, 0
+ vsllwil.w.h vr11, vr25, 0
+ dct_8x4_core_lsx1 vr16, vr17, vr26, vr27
+
+ vexth.w.h vr18, vr4
+ vexth.w.h vr19, vr5
+ vexth.w.h vr6, vr12
+ vexth.w.h vr7, vr13
+ vexth.w.h vr8, vr14
+ vexth.w.h vr9, vr15
+ vexth.w.h vr10, vr24
+ vexth.w.h vr11, vr25
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vldrepl.w vr22, t0, 0 // 2896
+ dct_8x4_core_lsx1 vr4, vr5, vr24, vr25
+
+ vilvl.d vr8, vr4, vr16
+ vilvh.d vr9, vr4, vr16
+ vilvh.d vr6, vr5, vr17
+ vilvl.d vr7, vr5, vr17
+ vilvl.d vr16, vr24, vr26
+ vilvh.d vr17, vr24, vr26
+ vilvh.d vr18, vr25, vr27
+ vilvl.d vr19, vr25, vr27
+
+.irp i, vr8, vr9, vr6, vr7, vr16, vr17, vr18, vr19
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr8, vr9, vr6, vr7
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ addi.d sp, sp, 48
+endfunc
+
+function inv_txfm_add_flipadst_identity_8x8_8bpc_lsx
+ addi.d sp, sp, -32
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr16, 0
+ vsllwil.w.h vr11, vr17, 0
+ adst8x8_1d_lsx vr12, vr13, vr14, vr15
+
+ vilvl.h vr20, vr12, vr13
+ vilvl.h vr21, vr14, vr15
+ vilvl.w vr24, vr20, vr21
+ vilvh.w vr25, vr20, vr21
+ vilvh.h vr20, vr12, vr13
+ vilvh.h vr21, vr14, vr15
+ vilvl.w vr26, vr20, vr21
+ vilvh.w vr27, vr20, vr21
+ vshuf4i.h vr26, vr26, 0x1b
+ vshuf4i.h vr27, vr27, 0x1b
+
+ vexth.w.h vr18, vr0 // in0
+ vexth.w.h vr19, vr1 // in1
+ vexth.w.h vr6, vr2 // in2
+ vexth.w.h vr7, vr3 // in3
+ vexth.w.h vr8, vr4 // in4
+ vexth.w.h vr9, vr5 // in5
+ vexth.w.h vr10, vr16 // in6
+ vexth.w.h vr11, vr17 // in7
+ adst8x8_1d_lsx vr12, vr13, vr14, vr15
+
+ vilvl.h vr20, vr12, vr13
+ vilvl.h vr21, vr14, vr15
+ vilvl.w vr16, vr20, vr21
+ vilvh.w vr17, vr20, vr21
+ vilvh.h vr20, vr12, vr13
+ vilvh.h vr21, vr14, vr15
+ vilvl.w vr18, vr20, vr21
+ vilvh.w vr19, vr20, vr21
+ vshuf4i.h vr18, vr18, 0x1b
+ vshuf4i.h vr19, vr19, 0x1b
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19
+ vsrari.h \i, \i, 1
+.endr
+
+ // identity8
+ vsllwil.w.h vr20, vr24, 1
+ vsllwil.w.h vr21, vr25, 1
+ vsllwil.w.h vr12, vr26, 1
+ vsllwil.w.h vr13, vr27, 1
+ vsllwil.w.h vr22, vr16, 1
+ vsllwil.w.h vr23, vr17, 1
+ vsllwil.w.h vr14, vr18, 1
+ vsllwil.w.h vr15, vr19, 1
+
+.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19
+ vexth.w.h \i, \i
+.endr
+
+.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19
+ vslli.w \i, \i, 1
+.endr
+
+ vssrarni.h.w vr20, vr12, 4 // in0
+ vssrarni.h.w vr24, vr26, 4 // in1
+ vssrarni.h.w vr21, vr13, 4 // in2
+ vssrarni.h.w vr25, vr27, 4 // in3
+ vssrarni.h.w vr22, vr14, 4 // in4
+ vssrarni.h.w vr16, vr18, 4 // in5
+ vssrarni.h.w vr23, vr15, 4 // in6
+ vssrarni.h.w vr17, vr19, 4 // in7
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr24, vr21, vr25
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr22, vr16, vr23, vr17
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ addi.d sp, sp, 32
+endfunc
+
+function inv_txfm_add_identity_flipadst_8x8_8bpc_lsx
+ addi.d sp, sp, -48
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+
+ // identity8
+ vsllwil.w.h vr6, vr0, 1
+ vsllwil.w.h vr7, vr1, 1
+ vsllwil.w.h vr8, vr2, 1
+ vsllwil.w.h vr9, vr3, 1
+ vsllwil.w.h vr10, vr4, 1
+ vsllwil.w.h vr11, vr5, 1
+ vsllwil.w.h vr12, vr24, 1
+ vsllwil.w.h vr13, vr25, 1
+
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+ vexth.w.h \i, \i
+.endr
+
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+ vslli.w \i, \i, 1
+.endr
+
+ vssrarni.h.w vr0, vr6, 1 // in0
+ vssrarni.h.w vr1, vr7, 1 // in1
+ vssrarni.h.w vr2, vr8, 1 // in2
+ vssrarni.h.w vr3, vr9, 1 // in3
+ vssrarni.h.w vr4, vr10, 1 // in4
+ vssrarni.h.w vr5, vr11, 1 // in5
+ vssrarni.h.w vr24, vr12, 1 // in6
+ vssrarni.h.w vr25, vr13, 1 // in7
+
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
+ vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ vsllwil.w.h vr18, vr0, 0 // in0
+ vsllwil.w.h vr19, vr1, 0 // in1
+ vsllwil.w.h vr6, vr2, 0 // in2
+ vsllwil.w.h vr7, vr3, 0 // in3
+ vsllwil.w.h vr8, vr4, 0 // in4
+ vsllwil.w.h vr9, vr5, 0 // in5
+ vsllwil.w.h vr10, vr24, 0 // in6
+ vsllwil.w.h vr11, vr25, 0 // in7
+ adst8x8_1d_lsx vr26, vr27, vr28, vr29
+
+ vexth.w.h vr18, vr0 // in0
+ vexth.w.h vr19, vr1 // in1
+ vexth.w.h vr6, vr2 // in2
+ vexth.w.h vr7, vr3 // in3
+ vexth.w.h vr8, vr4 // in4
+ vexth.w.h vr9, vr5 // in5
+ vexth.w.h vr10, vr24 // in6
+ vexth.w.h vr11, vr25 // in7
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vilvh.d vr4, vr0, vr26
+ vilvh.d vr5, vr1, vr27
+ vilvh.d vr6, vr2, vr28
+ vilvh.d vr7, vr3, vr29
+ vilvl.d vr14, vr3, vr29
+ vilvl.d vr15, vr2, vr28
+ vilvl.d vr16, vr1, vr27
+ vilvl.d vr17, vr0, vr26
+
+.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ addi.d sp, sp, 48
+
+endfunc
+
+function inv_txfm_add_adst_identity_8x8_8bpc_lsx
+ addi.d sp, sp, -32
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr16, 0
+ vsllwil.w.h vr11, vr17, 0
+ adst8x8_1d_lsx vr24, vr25, vr26, vr27
+
+ vexth.w.h vr18, vr0
+ vexth.w.h vr19, vr1
+ vexth.w.h vr6, vr2
+ vexth.w.h vr7, vr3
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr10, vr16
+ vexth.w.h vr11, vr17
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
+ vsrari.h \i, \i, 1
+.endr
+
+ LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
+ vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23, \
+ vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17
+
+ vshuf4i.h vr26, vr26, 0x1b
+ vshuf4i.h vr27, vr27, 0x1b
+ vshuf4i.h vr22, vr22, 0x1b
+ vshuf4i.h vr23, vr23, 0x1b
+
+ // identity8
+ vsllwil.w.h vr16, vr24, 1
+ vsllwil.w.h vr17, vr25, 1
+ vsllwil.w.h vr10, vr20, 1
+ vsllwil.w.h vr11, vr21, 1
+ vsllwil.w.h vr18, vr26, 1
+ vsllwil.w.h vr19, vr27, 1
+ vsllwil.w.h vr14, vr22, 1
+ vsllwil.w.h vr15, vr23, 1
+
+.irp i, vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23
+ vexth.w.h \i, \i
+.endr
+
+.irp i, vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23
+ vslli.w \i, \i, 1
+.endr
+
+ vssrarni.h.w vr18, vr16, 4 // in0
+ vssrarni.h.w vr19, vr17, 4 // in1
+ vssrarni.h.w vr14, vr10, 4 // in2
+ vssrarni.h.w vr15, vr11, 4 // in3
+ vssrarni.h.w vr26, vr24, 4 // in4
+ vssrarni.h.w vr27, vr25, 4 // in5
+ vssrarni.h.w vr22, vr20, 4 // in6
+ vssrarni.h.w vr23, vr21, 4 // in7
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr18, vr19, vr14, vr15
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr26, vr27, vr22, vr23
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ addi.d sp, sp, 32
+endfunc
+
+function inv_txfm_add_identity_adst_8x8_8bpc_lsx
+ addi.d sp, sp, -48
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+
+ // identity8
+ vsllwil.w.h vr6, vr0, 1
+ vsllwil.w.h vr7, vr1, 1
+ vsllwil.w.h vr8, vr2, 1
+ vsllwil.w.h vr9, vr3, 1
+ vsllwil.w.h vr10, vr4, 1
+ vsllwil.w.h vr11, vr5, 1
+ vsllwil.w.h vr12, vr24, 1
+ vsllwil.w.h vr13, vr25, 1
+
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+ vexth.w.h \i, \i
+.endr
+
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
+ vslli.w \i, \i, 1
+.endr
+
+ vssrarni.h.w vr0, vr6, 1 // in0
+ vssrarni.h.w vr1, vr7, 1 // in1
+ vssrarni.h.w vr2, vr8, 1 // in2
+ vssrarni.h.w vr3, vr9, 1 // in3
+ vssrarni.h.w vr4, vr10, 1 // in4
+ vssrarni.h.w vr5, vr11, 1 // in5
+ vssrarni.h.w vr24, vr12, 1 // in6
+ vssrarni.h.w vr25, vr13, 1 // in7
+
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
+ vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13
+
+ vreplgr2vr.h vr23, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ vsllwil.w.h vr18, vr0, 0
+ vsllwil.w.h vr19, vr1, 0
+ vsllwil.w.h vr6, vr2, 0
+ vsllwil.w.h vr7, vr3, 0
+ vsllwil.w.h vr8, vr4, 0
+ vsllwil.w.h vr9, vr5, 0
+ vsllwil.w.h vr10, vr24, 0
+ vsllwil.w.h vr11, vr25, 0
+ adst8x8_1d_lsx vr26, vr27, vr28, vr29
+
+ vexth.w.h vr18, vr0
+ vexth.w.h vr19, vr1
+ vexth.w.h vr6, vr2
+ vexth.w.h vr7, vr3
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr10, vr24
+ vexth.w.h vr11, vr25
+
+ adst8x8_1d_lsx vr0, vr1, vr2, vr3
+
+ vilvl.d vr4, vr0, vr26 // 0 ... 7
+ vilvl.d vr5, vr1, vr27 // 8 ... 15
+ vilvl.d vr6, vr2, vr28 // 16 ... 23
+ vilvl.d vr7, vr3, vr29 // 24 ... 31
+ vilvh.d vr14, vr3, vr29 // 32 ... 39
+ vilvh.d vr15, vr2, vr28 // 40 ... 47
+ vilvh.d vr16, vr1, vr27 // 48 ... 55
+ vilvh.d vr17, vr0, vr26 // 56 ... 63
+
+.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ addi.d sp, sp, 48
+endfunc
+
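+// Widening butterfly helpers: split \in0 and \in1 into their low/high
+// 16->32-bit halves (clobbering vr22/vr23) and compute
+// out0/out1 = in0 * in2 +/- in1 * in3 as 32-bit products for the low and
+// high lanes respectively.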
+.macro vmul_vmadd_w in0, in1, in2, in3, out0, out1
+ vsllwil.w.h vr22, \in0, 0
+ vexth.w.h vr23, \in0
+ vmul.w \out0, vr22, \in2
+ vmul.w \out1, vr23, \in2
+ vsllwil.w.h vr22, \in1, 0
+ vexth.w.h vr23, \in1
+ vmadd.w \out0, vr22, \in3
+ vmadd.w \out1, vr23, \in3
+.endm
+
+.macro vmul_vmsub_w in0, in1, in2, in3, out0, out1
+ vsllwil.w.h vr22, \in0, 0
+ vexth.w.h vr23, \in0
+ vmul.w \out0, vr22, \in2
+ vmul.w \out1, vr23, \in2
+ vsllwil.w.h vr22, \in1, 0
+ vexth.w.h vr23, \in1
+ vmsub.w \out0, vr22, \in3
+ vmsub.w \out1, vr23, \in3
+.endm
+
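+// rect2 pre-scale: out0 = sat16((in0 * \in1 + 2048) >> 12). With \in1 = 2896
+// this is the ~1/sqrt(2) scaling used by rectangular transforms. Clobbers vr22.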
+.macro rect2_lsx in0, in1, out0
+ vsllwil.w.h vr22, \in0, 0 // in1
+ vexth.w.h \in0, \in0 // in1
+ vmul.w vr22, vr22, \in1
+ vmul.w \out0, \in0, \in1
+ vssrarni.h.w \out0, vr22, 12
+.endm
+
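+// One 8-point inverse DCT pass over eight vectors of 16-bit coefficients.
+// If \rect2 is rect2_lsx the inputs are pre-scaled by 2896/4096 first.
+// Uses t0 for the idct_coeffs address and clobbers vr8-vr10 and vr20-vr23.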
+.macro dct_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, \
+ out1, out2, out3, out4, out5, out6, out7, rect2
+
+ la.local t0, idct_coeffs
+
+.ifc \rect2, rect2_lsx
+ vldrepl.w vr23, t0, 0 // 2896
+.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+ rect2_lsx \i, vr23, \i
+.endr
+.endif
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+
+ vmul_vmadd_w \in2, \in6, vr21, vr20, vr8, vr9
+ vssrarni.h.w vr9, vr8, 12 // t3
+ vmul_vmsub_w \in2, \in6, vr20, vr21, vr8, vr10
+ vssrarni.h.w vr10, vr8, 12 // t2
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmadd_w \in0, \in4, vr20, vr20, vr8, \in2
+ vssrarni.h.w \in2, vr8, 12 // t0
+ vmul_vmsub_w \in0, \in4, vr20, vr20, vr8, \in6
+ vssrarni.h.w \in6, vr8, 12 // t1
+
+ vsadd.h vr8, \in2, vr9 // c[0]
+ vssub.h vr9, \in2, vr9 // c[3]
+ vsadd.h \in0, \in6, vr10 // c[1]
+ vssub.h vr10, \in6, vr10 // c[2]
+
+ vldrepl.w vr20, t0, 16 // 799
+ vldrepl.w vr21, t0, 20 // 4017
+ vmul_vmadd_w \in1, \in7, vr21, vr20, \in2, \in4
+ vssrarni.h.w \in4, \in2, 12 // t7a
+ vmul_vmsub_w \in1, \in7, vr20, vr21, \in2, \in6
+ vssrarni.h.w \in6, \in2, 12 // t4a
+
+ vldrepl.w vr20, t0, 24 // 3406
+ vldrepl.w vr21, t0, 28 // 2276
+ vmul_vmadd_w \in5, \in3, vr21, vr20, \in2, \in1
+ vssrarni.h.w \in1, \in2, 12 // t6a
+ vmul_vmsub_w \in5, \in3, vr20, vr21, \in2, \in7
+ vssrarni.h.w \in7, \in2, 12 // t5a
+
+ vsadd.h \in3, \in6, \in7 // t4
+ vssub.h \in6, \in6, \in7 // t5a
+ vsadd.h \in5, \in4, \in1 // t7
+ vssub.h \in4, \in4, \in1 // t6a
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmadd_w \in4, \in6, vr20, vr20, \in2, \in1
+ vssrarni.h.w \in1, \in2, 12 // t6
+ vmul_vmsub_w \in4, \in6, vr20, vr20, \in2, \in7
+ vssrarni.h.w \in7, \in2, 12 // t5
+
+ vsadd.h \out0, vr8, \in5 // c[0]
+ vssub.h \out7, vr8, \in5 // c[7]
+ vsadd.h \out1, \in0, \in1 // c[1]
+ vssub.h \out6, \in0, \in1 // c[6]
+ vsadd.h \out2, vr10, \in7 // c[2]
+ vssub.h \out5, vr10, \in7 // c[5]
+ vsadd.h \out3, vr9, \in3 // c[3]
+ vssub.h \out4, vr9, \in3 // c[4]
+.endm
+
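+// a0 = dst, a1 = stride, a2 = coeff, a3 = eob (same argument order as the
+// other dav1d itx ports). When a3 is zero only the DC coefficient is inverse
+// transformed and added to the whole 8x8 block; otherwise the full two-pass
+// 8x8 DCT is performed.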
+function inv_txfm_add_dct_dct_8x8_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_8x8
+
+ ld.h t2, a2, 0 // dc
+ vldi vr0, 0x8b5 // 181
+ vreplgr2vr.w vr1, t2
+ vldi vr5, 0x880 // 128
+ vmul.w vr2, vr0, vr1 // dc * 181
+ st.h zero, a2, 0
+ vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
+ vld vr10, a0, 0 // 0 1 2 3 4 5 6 7
+ vsrari.w vr2, vr2, 1 // (dc + rnd) >> shift
+ vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15
+ alsl.d t2, a1, a0, 1
+ vmadd.w vr5, vr2, vr0
+ vld vr12, t2, 0 // 16 17 18 19 20 21 22 23
+ vssrarni.h.w vr5, vr5, 12
+ vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31
+
+ DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr5, vr5, vr5, vr5
+
+ b .DCT_DCT_8X8_END
+
+.NO_HAS_DCONLY_8x8:
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ la.local t0, idct_coeffs
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2
+
+ LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+ vsrari.h \i, \i, 1
+.endr
+
+ vreplgr2vr.h vr23, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112
+ vst vr23, a2, \i
+.endr
+
+ dct_8x8_core_lsx vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr4, vr5, vr6, vr7, vr20, vr21, vr22, vr23, no_rect2
+
+.irp i, vr4, vr5, vr6, vr7, vr20, vr21, vr22, vr23
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
+
+.DCT_DCT_8X8_END:
+
+endfunc
+
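+// One 16-point inverse DCT pass: the even half reuses dct_8x8_core_lsx on
+// vr0/vr2/vr4/vr6/vr19/vr25/vr27/vr29, the odd half is computed inline.
+// Outputs: c[0..7] in vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16 and
+// c[8..15] in vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24.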
+.macro dct_8x16_core_lsx
+ dct_8x8_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 32 // 401
+ vldrepl.w vr21, t0, 36 // 4076
+ vmul_vmadd_w vr1, vr30, vr21, vr20, vr0, vr10
+ vssrarni.h.w vr10, vr0, 12 // t15a
+ vmul_vmsub_w vr1, vr30, vr20, vr21, vr0, vr29
+ vssrarni.h.w vr29, vr0, 12 // t8a
+
+ vldrepl.w vr20, t0, 40 // 3166 -> 1583
+ vldrepl.w vr21, t0, 44 // 2598 -> 1299
+ vmul_vmadd_w vr24, vr7, vr21, vr20, vr0, vr30
+ vssrarni.h.w vr30, vr0, 12 // t14a
+ vmul_vmsub_w vr24, vr7, vr20, vr21, vr0, vr31
+ vssrarni.h.w vr31, vr0, 12 // t9a
+
+ vldrepl.w vr20, t0, 48 // 1931
+ vldrepl.w vr21, t0, 52 // 3612
+ vmul_vmadd_w vr5, vr26, vr21, vr20, vr0, vr24
+ vssrarni.h.w vr24, vr0, 12 // t13a
+ vmul_vmsub_w vr5, vr26, vr20, vr21, vr0, vr25
+ vssrarni.h.w vr25, vr0, 12 // t10a
+
+ vldrepl.w vr20, t0, 56 // 3920
+ vldrepl.w vr21, t0, 60 // 1189
+ vmul_vmadd_w vr28, vr3, vr21, vr20, vr0, vr26
+ vssrarni.h.w vr26, vr0, 12 // t12a
+ vmul_vmsub_w vr28, vr3, vr20, vr21, vr0, vr27
+ vssrarni.h.w vr27, vr0, 12 // t11a
+
+ // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27
+ vsadd.h vr28, vr29, vr31 // t8
+ vssub.h vr19, vr29, vr31 // t9
+ vssub.h vr29, vr27, vr25 // t10
+ vsadd.h vr9, vr27, vr25 // t11
+ vsadd.h vr31, vr26, vr24 // t12
+ vssub.h vr25, vr26, vr24 // t13
+ vssub.h vr27, vr10, vr30 // t14
+ vsadd.h vr24, vr10, vr30 // t15
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26
+ vssrarni.h.w vr26, vr0, 12 // t14a
+ vmul_vmsub_w vr27, vr19, vr20, vr21, vr0, vr30
+ vssrarni.h.w vr30, vr0, 12 // t9a
+
+ vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19
+ vneg.w vr0, vr0
+ vneg.w vr19, vr19
+ vssrarni.h.w vr19, vr0, 12 // t10a
+ vmul_vmsub_w vr25, vr29, vr20, vr21, vr0, vr27
+ vssrarni.h.w vr27, vr0, 12 // t13a
+
+ vsadd.h vr25, vr28, vr9 // t8a
+ vssub.h vr29, vr28, vr9 // t11a
+ vssub.h vr28, vr24, vr31 // t12a
+ vsadd.h vr10, vr24, vr31 // t15a
+ vsadd.h vr9, vr30, vr19 // t9
+ vssub.h vr31, vr30, vr19 // t10
+ vssub.h vr30, vr26, vr27 // t13
+ vsadd.h vr24, vr26, vr27 // t14
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26
+ vssrarni.h.w vr26, vr0, 12 // t13a
+ vmul_vmsub_w vr30, vr31, vr20, vr20, vr0, vr27
+ vssrarni.h.w vr27, vr0, 12 // t10a
+
+ vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31
+ vssrarni.h.w vr31, vr0, 12 // t12
+ vmul_vmsub_w vr28, vr29, vr20, vr20, vr0, vr30
+ vssrarni.h.w vr30, vr0, 12 // t11
+
+ // vr11 vr12 ... vr18
+ vsadd.h vr28, vr14, vr31 // c[3]
+ vssub.h vr29, vr14, vr31 // c[12]
+ vsadd.h vr20, vr15, vr30 // c[4]
+ vssub.h vr21, vr15, vr30 // c[11]
+ vsadd.h vr14, vr16, vr27 // c[5]
+ vssub.h vr23, vr16, vr27 // c[10]
+ vsadd.h vr15, vr17, vr9 // c[6]
+ vssub.h vr30, vr17, vr9 // c[9]
+ vsadd.h vr16, vr18, vr25 // c[7]
+ vssub.h vr27, vr18, vr25 // c[8]
+ vsadd.h vr17, vr13, vr26 // c[2]
+ vssub.h vr26, vr13, vr26 // c[13]
+ vsadd.h vr18, vr12, vr24 // c[1]
+ vssub.h vr25, vr12, vr24 // c[14]
+ vsadd.h vr22, vr11, vr10 // c[0]
+ vssub.h vr24, vr11, vr10 // c[15]
+.endm
+
+function inv_txfm_add_dct_dct_8x16_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_8x16
+
+ ld.h t2, a2, 0 // dc
+ vldi vr0, 0x8b5 // 181
+ vreplgr2vr.w vr1, t2
+ vldi vr5, 0x880 // 128
+ vmul.w vr2, vr0, vr1 // dc * 181
+ st.h zero, a2, 0
+ vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
+ vld vr10, a0, 0 // 0 1 2 3 4 5 6 7
+ vmul.w vr2, vr0, vr2
+ vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
+ vsrari.w vr2, vr2, 1 // (dc + rnd) >> shift
+ vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15
+ alsl.d t2, a1, a0, 1
+ vmadd.w vr5, vr2, vr0
+ vld vr12, t2, 0 // 16 17 18 19 20 21 22 23
+ vssrarni.h.w vr5, vr5, 12
+ vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31
+
+ DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr5, vr5, vr5, vr5
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr5, vr5, vr5, vr5
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr5, vr5, vr5, vr5
+
+ b .DCT_DCT_8X16_END
+
+.NO_HAS_DCONLY_8x16:
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+
+ vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ la.local t0, idct_coeffs
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx
+
+ vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx
+
+.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+ vsrari.h \i, \i, 1
+.endr
+
+ vreplgr2vr.h vr23, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
+ vst vr23, a2, \i
+.endr
+
+ LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31
+
+ LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
+ vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31
+
+ dct_8x16_core_lsx
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr22, vr18, vr17, vr28
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr14, vr15, vr16
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr27, vr30, vr23, vr21
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr29, vr26, vr25, vr24
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+.DCT_DCT_8X16_END:
+endfunc
+
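+// identity8 pass (out = 2 * in) fused with the >>1 inter-pass rounding shift,
+// with optional rect2 pre-scaling. Clobbers vr8-vr15 (and vr23 for rect2).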
+.macro identity_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, rect2
+
+ la.local t0, idct_coeffs
+
+.ifc \rect2, rect2_lsx
+ vldrepl.w vr23, t0, 0 // 2896
+.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+ rect2_lsx \i, vr23, \i
+.endr
+.endif
+ vsllwil.w.h vr8, \in0, 1
+ vsllwil.w.h vr9, \in1, 1
+ vsllwil.w.h vr10, \in2, 1
+ vsllwil.w.h vr11, \in3, 1
+ vsllwil.w.h vr12, \in4, 1
+ vsllwil.w.h vr13, \in5, 1
+ vsllwil.w.h vr14, \in6, 1
+ vsllwil.w.h vr15, \in7, 1
+
+.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+ vexth.w.h \i, \i
+.endr
+
+.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+ vslli.w \i, \i, 1
+.endr
+
+ vssrarni.h.w \in0, vr8, 1
+ vssrarni.h.w \in1, vr9, 1
+ vssrarni.h.w \in2, vr10, 1
+ vssrarni.h.w \in3, vr11, 1
+ vssrarni.h.w \in4, vr12, 1
+ vssrarni.h.w \in5, vr13, 1
+ vssrarni.h.w \in6, vr14, 1
+ vssrarni.h.w \in7, vr15, 1
+.endm
+
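+// identity16 step for one vector: out = sat16(2*in + ((in * 1697 + 1024) >> 11)).
+// The caller must preload vr20 with 1697. Clobbers vr8 and vr10.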
+.macro identity_8x16_core_lsx in0, out0
+ vsadd.h vr10, \in0, \in0
+ vsllwil.w.h vr8, \in0, 0
+ vexth.w.h \out0, \in0
+ vmul.w vr8, vr8, vr20
+ vmul.w \out0, \out0, vr20
+ vssrarni.h.w \out0, vr8, 11
+ vsadd.h \out0, \out0, vr10
+.endm
+
+function inv_txfm_add_identity_identity_8x16_8bpc_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+
+ vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ identity_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, rect2_lsx
+
+ vld_x8 a2, 128, 16, vr16, vr17, vr18, vr19, vr24, vr25, vr26, vr27
+
+ identity_8x8_core_lsx vr16, vr17, vr18, vr19, vr24, vr25, vr26, vr27, rect2_lsx
+
+ vreplgr2vr.h vr23, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
+ vst vr23, a2, \i
+.endr
+

+ LSX_TRANSPOSE8x8_H vr0, vr2, vr4, vr6, vr16, vr18, vr24, vr26, \
+ vr14, vr15, vr22, vr23, vr16, vr18, vr24, vr26, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr20, vr21
+
+ LSX_TRANSPOSE8x8_H vr1, vr3, vr5, vr7, vr17, vr19, vr25, vr27, \
+ vr28, vr29, vr30, vr31, vr17, vr19, vr25, vr27, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr20, vr21
+
+ li.w t0, 1697
+ vreplgr2vr.w vr20, t0
+
+.irp i, vr14, vr15, vr22, vr23, vr16, vr18, vr24, vr26, \
+ vr28, vr29, vr30, vr31, vr17, vr19, vr25, vr27
+ identity_8x16_core_lsx \i, \i
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr14, vr15, vr22, vr23
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr16, vr18, vr24, vr26
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr28, vr29, vr30, vr31
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr17, vr19, vr25, vr27
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+
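+// One 8-point inverse ADST pass with optional rect2 pre-scaling; out1, out3,
+// out5 and out7 have their signs flipped before being returned. Uses t0 for
+// the iadst8_coeffs address and clobbers vr8-vr10 and vr20-vr23.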
+.macro adst_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7, rect2
+
+ la.local t0, iadst8_coeffs
+
+.ifc \rect2, rect2_lsx
+ vldrepl.w vr23, t0, 32 // 2896
+.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+ rect2_lsx \i, vr23, \i
+.endr
+.endif
+
+ vldrepl.w vr20, t0, 0 // 4076
+ vldrepl.w vr21, t0, 4 // 401
+
+ vmul_vmadd_w vr7, vr0, vr20, vr21, vr8, vr9
+ vssrarni.h.w vr9, vr8, 12 // t0a low
+ vmul_vmsub_w vr7, vr0, vr21, vr20, vr8, vr10
+ vssrarni.h.w vr10, vr8, 12 // t1a low
+
+ vldrepl.w vr20, t0, 8 // 3612
+ vldrepl.w vr21, t0, 12 // 1931
+ vmul_vmadd_w vr5, vr2, vr20, vr21, vr8, vr0
+ vssrarni.h.w vr0, vr8, 12 // t2a low
+ vmul_vmsub_w vr5, vr2, vr21, vr20, vr8, vr7
+ vssrarni.h.w vr7, vr8, 12 // t3a low
+
+ vldrepl.w vr20, t0, 16 // 2598 -> 1299
+ vldrepl.w vr21, t0, 20 // 3166 -> 1583
+ vmul_vmadd_w vr3, vr4, vr20, vr21, vr8, vr2
+ vssrarni.h.w vr2, vr8, 12 // t4a low
+ vmul_vmsub_w vr3, vr4, vr21, vr20, vr8, vr5
+ vssrarni.h.w vr5, vr8, 12 // t5a low
+
+ vldrepl.w vr20, t0, 24 // 1189
+ vldrepl.w vr21, t0, 28 // 3920
+ vmul_vmadd_w vr1, vr6, vr20, vr21, vr8, vr3
+ vssrarni.h.w vr3, vr8, 12 // t6a low
+ vmul_vmsub_w vr1, vr6, vr21, vr20, vr8, vr4
+ vssrarni.h.w vr4, vr8, 12 // t7a low
+
+ vsadd.h vr1, vr9, vr2 // t0
+ vssub.h vr6, vr9, vr2 // t4
+ vsadd.h vr8, vr10, vr5 // t1
+ vssub.h vr2, vr10, vr5 // t5
+ vsadd.h vr9, vr0, vr3 // t2
+ vssub.h vr5, vr0, vr3 // t6
+ vsadd.h vr10, vr7, vr4 // t3
+ vssub.h vr0, vr7, vr4 // t7
+
+ vldrepl.w vr20, t0, 40 // 1567
+ vldrepl.w vr21, t0, 44 // 3784
+ vmul_vmadd_w vr6, vr2, vr21, vr20, vr3, vr4
+ vssrarni.h.w vr4, vr3, 12 // t4a low
+ vmul_vmsub_w vr6, vr2, vr20, vr21, vr3, vr7
+ vssrarni.h.w vr7, vr3, 12 // t5a low
+
+ vmul_vmadd_w vr0, vr5, vr20, vr21, vr3, vr2
+ vssrarni.h.w vr2, vr3, 12 // t7a low
+ vmul_vmsub_w vr0, vr5, vr21, vr20, vr3, vr6
+ vssrarni.h.w vr6, vr3, 12 // t6a low
+
+ vsadd.h \out0, vr1, vr9 // out[0]
+ vssub.h vr5, vr1, vr9 // t2
+ vsadd.h vr3, vr8, vr10 // out[7]
+ vssub.h vr1, vr8, vr10 // t3
+ vexth.w.h vr9, vr3
+ vsllwil.w.h vr21, vr3, 0
+ vneg.w \out7, vr9
+ vneg.w vr21, vr21
+ vssrarni.h.w \out7, vr21, 0 // out[7]
+
+ vsadd.h vr8, vr4, vr6 // out[1]
+ vssub.h vr10, vr4, vr6 // t6
+ vexth.w.h vr20, vr8
+ vsllwil.w.h vr21, vr8, 0
+ vneg.w \out1, vr20
+ vneg.w vr21, vr21
+ vssrarni.h.w \out1, vr21, 0 // out[1]
+ vsadd.h \out6, vr7, vr2 // out[6]
+ vssub.h vr4, vr7, vr2 // t7
+
+ vldrepl.w vr20, t0, 32 // 2896
+ vmul_vmadd_w vr5, vr1, vr20, vr20, vr9, vr6
+ vssrarni.h.w vr6, vr9, 12 // out[3]
+ vmul_vmsub_w vr5, vr1, vr20, vr20, vr9, \out4
+ vssrarni.h.w \out4, vr9, 12 // out[4]
+
+ vmul_vmadd_w vr10, vr4, vr20, vr20, vr9, \out2
+ vssrarni.h.w \out2, vr9, 12 // out[2]
+ vmul_vmsub_w vr10, vr4, vr20, vr20, vr9, vr5
+ vssrarni.h.w vr5, vr9, 12 // out[5]
+
+ vexth.w.h vr20, vr6
+ vsllwil.w.h vr21, vr6, 0
+ vneg.w \out3, vr20
+ vneg.w vr21, vr21
+ vssrarni.h.w \out3, vr21, 0 // out[3]
+
+ vexth.w.h vr20, vr5
+ vsllwil.w.h vr21, vr5, 0
+ vneg.w \out5, vr20
+ vneg.w vr21, vr21
+ vssrarni.h.w \out5, vr21, 0 // out[5]
+.endm
+
+function inv_txfm_add_adst_dct_8x16_8bpc_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+
+ vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ adst_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx
+
+ vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ adst_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx
+
+.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+ vsrari.h \i, \i, 1
+.endr
+
+ vreplgr2vr.h vr23, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
+ vst vr23, a2, \i
+.endr
+
+ LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31
+
+ LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
+ vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31
+
+ dct_8x8_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 32 // 401
+ vldrepl.w vr21, t0, 36 // 4076
+ vmul_vmadd_w vr1, vr30, vr21, vr20, vr0, vr10
+ vssrarni.h.w vr10, vr0, 12 // t15a
+ vmul_vmsub_w vr1, vr30, vr20, vr21, vr0, vr29
+ vssrarni.h.w vr29, vr0, 12 // t8a
+
+ vldrepl.w vr20, t0, 40 // 3166 -> 1583
+ vldrepl.w vr21, t0, 44 // 2598 -> 1299
+ vmul_vmadd_w vr24, vr7, vr21, vr20, vr0, vr30
+ vssrarni.h.w vr30, vr0, 12 // t14a
+ vmul_vmsub_w vr24, vr7, vr20, vr21, vr0, vr31
+ vssrarni.h.w vr31, vr0, 12 // t9a
+
+ vldrepl.w vr20, t0, 48 // 1931
+ vldrepl.w vr21, t0, 52 // 3612
+ vmul_vmadd_w vr5, vr26, vr21, vr20, vr0, vr24
+ vssrarni.h.w vr24, vr0, 12 // t13a
+ vmul_vmsub_w vr5, vr26, vr20, vr21, vr0, vr25
+ vssrarni.h.w vr25, vr0, 12 // t10a
+
+ vldrepl.w vr20, t0, 56 // 3920
+ vldrepl.w vr21, t0, 60 // 1189
+ vmul_vmadd_w vr28, vr3, vr21, vr20, vr0, vr26
+ vssrarni.h.w vr26, vr0, 12 // t12a
+ vmul_vmsub_w vr28, vr3, vr20, vr21, vr0, vr27
+ vssrarni.h.w vr27, vr0, 12 // t11a
+
+ // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27
+ vsadd.h vr28, vr29, vr31 // t8
+ vssub.h vr19, vr29, vr31 // t9
+ vssub.h vr29, vr27, vr25 // t10
+ vsadd.h vr9, vr27, vr25 // t11
+ vsadd.h vr31, vr26, vr24 // t12
+ vssub.h vr25, vr26, vr24 // t13
+ vssub.h vr27, vr10, vr30 // t14
+ vsadd.h vr24, vr10, vr30 // t15
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26
+ vssrarni.h.w vr26, vr0, 12 // t14a
+ vmul_vmsub_w vr27, vr19, vr20, vr21, vr0, vr30
+ vssrarni.h.w vr30, vr0, 12 // t9a
+
+ vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19
+ vneg.w vr0, vr0
+ vneg.w vr19, vr19
+ vssrarni.h.w vr19, vr0, 12 // t10a
+ vmul_vmsub_w vr25, vr29, vr20, vr21, vr0, vr27
+ vssrarni.h.w vr27, vr0, 12 // t13a
+
+ vsadd.h vr25, vr28, vr9 // t8a
+ vssub.h vr29, vr28, vr9 // t11a
+ vssub.h vr28, vr24, vr31 // t12a
+ vsadd.h vr10, vr24, vr31 // t15a
+ vsadd.h vr9, vr30, vr19 // t9
+ vssub.h vr31, vr30, vr19 // t10
+ vssub.h vr30, vr26, vr27 // t13
+ vsadd.h vr24, vr26, vr27 // t14
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26
+ vssrarni.h.w vr26, vr0, 12 // t13a
+ vmul_vmsub_w vr30, vr31, vr20, vr20, vr0, vr27
+ vssrarni.h.w vr27, vr0, 12 // t10a
+
+ vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31
+ vssrarni.h.w vr31, vr0, 12 // t12
+ vmul_vmsub_w vr28, vr29, vr20, vr20, vr0, vr30
+ vssrarni.h.w vr30, vr0, 12 // t11
+
+ // vr11 vr12 ... vr18
+ vsadd.h vr28, vr14, vr31 // c[3]
+ vssub.h vr29, vr14, vr31 // c[12]
+ vsadd.h vr20, vr15, vr30 // c[4]
+ vssub.h vr21, vr15, vr30 // c[11]
+ vsadd.h vr14, vr16, vr27 // c[5]
+ vssub.h vr23, vr16, vr27 // c[10]
+ vsadd.h vr15, vr17, vr9 // c[6]
+ vssub.h vr30, vr17, vr9 // c[9]
+ vsadd.h vr16, vr18, vr25 // c[7]
+ vssub.h vr27, vr18, vr25 // c[8]
+ vsadd.h vr17, vr13, vr26 // c[2]
+ vssub.h vr26, vr13, vr26 // c[13]
+ vsadd.h vr18, vr12, vr24 // c[1]
+ vssub.h vr25, vr12, vr24 // c[14]
+ vsadd.h vr22, vr11, vr10 // c[0]
+ vssub.h vr24, vr11, vr10 // c[15]
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 4
+.endr
+
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr22, vr18, vr17, vr28
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr20, vr14, vr15, vr16
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr27, vr30, vr23, vr21
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr29, vr26, vr25, vr24
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+
+const iadst16_coeffs, align=4
+ .word 4091, 201, 3973, 995
+ .word 3703, 1751, 3290, 2440
+ .word 2751, 3035, 2106, 3513
+ .word 1380, 3857, 601, 4052
+endconst
+
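+// One 16-point inverse ADST pass on vr0-vr15. Optional parameters:
+// \transpose8x8 transposes the two 8x8 result halves, \shift applies a
+// rounding down-shift to all outputs, and \vst stores the 16 results at t1
+// with a stride of 16 bytes. The output register map is listed at the end of
+// the macro.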
+.macro adst16_core_lsx transpose8x8, shift, vst
+ la.local t0, iadst16_coeffs
+ vldrepl.w vr20, t0, 0 // 4091
+ vldrepl.w vr21, t0, 4 // 201
+
+ vmul_vmadd_w vr15, vr0, vr20, vr21, vr16, vr18
+ vmul_vmsub_w vr15, vr0, vr21, vr20, vr17, vr19
+ vssrarni.h.w vr18, vr16, 12 // t0
+ vssrarni.h.w vr19, vr17, 12 // t1
+
+ vldrepl.w vr20, t0, 8 // 3973
+ vldrepl.w vr21, t0, 12 // 995
+ vmul_vmadd_w vr13, vr2, vr20, vr21, vr16, vr0
+ vmul_vmsub_w vr13, vr2, vr21, vr20, vr17, vr15
+ vssrarni.h.w vr0, vr16, 12 // t2
+ vssrarni.h.w vr15, vr17, 12 // t3
+
+ vldrepl.w vr20, t0, 16 // 3703
+ vldrepl.w vr21, t0, 20 // 1751
+ vmul_vmadd_w vr11, vr4, vr20, vr21, vr16, vr2
+ vmul_vmsub_w vr11, vr4, vr21, vr20, vr17, vr13
+ vssrarni.h.w vr2, vr16, 12 // t4
+ vssrarni.h.w vr13, vr17, 12 // t5
+
+ vldrepl.w vr20, t0, 24 // 3290 -> 1645
+ vldrepl.w vr21, t0, 28 // 2440 -> 1220
+ vmul_vmadd_w vr9, vr6, vr20, vr21, vr16, vr4
+ vmul_vmsub_w vr9, vr6, vr21, vr20, vr17, vr11
+ vssrarni.h.w vr4, vr16, 12 // t6
+ vssrarni.h.w vr11, vr17, 12 // t7
+
+ vldrepl.w vr20, t0, 32 // 2751
+ vldrepl.w vr21, t0, 36 // 3035
+ vmul_vmadd_w vr7, vr8, vr20, vr21, vr16, vr6
+ vmul_vmsub_w vr7, vr8, vr21, vr20, vr17, vr9
+ vssrarni.h.w vr6, vr16, 12 // t8
+ vssrarni.h.w vr9, vr17, 12 // t9
+
+ vldrepl.w vr20, t0, 40 // 2106
+ vldrepl.w vr21, t0, 44 // 3513
+ vmul_vmadd_w vr5, vr10, vr20, vr21, vr16, vr7
+ vmul_vmsub_w vr5, vr10, vr21, vr20, vr17, vr8
+ vssrarni.h.w vr7, vr16, 12 // t10
+ vssrarni.h.w vr8, vr17, 12 // t11
+
+ vldrepl.w vr20, t0, 48 // 1380
+ vldrepl.w vr21, t0, 52 // 3857
+ vmul_vmadd_w vr3, vr12, vr20, vr21, vr16, vr5
+ vmul_vmsub_w vr3, vr12, vr21, vr20, vr17, vr10
+ vssrarni.h.w vr5, vr16, 12 // t12
+ vssrarni.h.w vr10, vr17, 12 // t13
+
+ vldrepl.w vr20, t0, 56 // 601
+ vldrepl.w vr21, t0, 60 // 4052
+ vmul_vmadd_w vr1, vr14, vr20, vr21, vr16, vr3
+ vmul_vmsub_w vr1, vr14, vr21, vr20, vr17, vr12
+ vssrarni.h.w vr3, vr16, 12 // t14
+ vssrarni.h.w vr12, vr17, 12 // t15
+
+ vsadd.h vr1, vr18, vr6 // t0a
+ vssub.h vr14, vr18, vr6 // t8a
+ vsadd.h vr16, vr19, vr9 // t1a
+ vssub.h vr17, vr19, vr9 // t9a
+ vsadd.h vr6, vr0, vr7 // t2a
+ vssub.h vr18, vr0, vr7 // t10a
+ vsadd.h vr9, vr15, vr8 // t3a
+ vssub.h vr19, vr15, vr8 // t11a
+ vsadd.h vr0, vr2, vr5 // t4a
+ vssub.h vr7, vr2, vr5 // t12a
+ vsadd.h vr8, vr13, vr10 // t5a
+ vssub.h vr15, vr13, vr10 // t13a
+ vsadd.h vr2, vr4, vr3 // t6a
+ vssub.h vr5, vr4, vr3 // t14a
+ vsadd.h vr10, vr11, vr12 // t7a
+ vssub.h vr13, vr11, vr12 // t15a
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 16 // 799
+ vldrepl.w vr21, t0, 20 // 4017
+ vmul_vmadd_w vr14, vr17, vr21, vr20, vr3, vr11
+ vmul_vmsub_w vr14, vr17, vr20, vr21, vr4, vr12
+ vssrarni.h.w vr11, vr3, 12 // t8
+ vssrarni.h.w vr12, vr4, 12 // t9
+
+ vmul_vmadd_w vr15, vr7, vr20, vr21, vr3, vr14
+ vmul_vmsub_w vr15, vr7, vr21, vr20, vr4, vr17
+ vssrarni.h.w vr14, vr3, 12 // t13
+ vssrarni.h.w vr17, vr4, 12 // t12
+
+ vldrepl.w vr20, t0, 24 // 3406
+ vldrepl.w vr21, t0, 28 // 2276
+ vmul_vmadd_w vr18, vr19, vr21, vr20, vr3, vr7
+ vmul_vmsub_w vr18, vr19, vr20, vr21, vr4, vr15
+ vssrarni.h.w vr7, vr3, 12 // t10
+ vssrarni.h.w vr15, vr4, 12 // t11
+
+ vmul_vmadd_w vr13, vr5, vr20, vr21, vr3, vr18
+ vmul_vmsub_w vr13, vr5, vr21, vr20, vr4, vr19
+ vssrarni.h.w vr18, vr3, 12 // t15
+ vssrarni.h.w vr19, vr4, 12 // t14
+
+ vsadd.h vr5, vr1, vr0 // t0
+ vssub.h vr13, vr1, vr0 // t4
+ vsadd.h vr3, vr16, vr8 // t1
+ vssub.h vr4, vr16, vr8 // t5
+ vsadd.h vr0, vr6, vr2 // t2
+ vssub.h vr1, vr6, vr2 // t6
+ vsadd.h vr8, vr9, vr10 // t3
+ vssub.h vr16, vr9, vr10 // t7
+ vsadd.h vr2, vr11, vr17 // t8a
+ vssub.h vr6, vr11, vr17 // t12a
+ vsadd.h vr9, vr12, vr14 // t9a
+ vssub.h vr10, vr12, vr14 // t13a
+ vsadd.h vr11, vr7, vr19 // t10a
+ vssub.h vr17, vr7, vr19 // t14a
+ vsadd.h vr12, vr15, vr18 // t11a
+ vssub.h vr14, vr15, vr18 // t15a
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vmul_vmadd_w vr13, vr4, vr21, vr20, vr7, vr18
+ vmul_vmsub_w vr13, vr4, vr20, vr21, vr15, vr19
+ vssrarni.h.w vr18, vr7, 12 // t4a
+ vssrarni.h.w vr19, vr15, 12 // t5a
+
+ vmul_vmadd_w vr16, vr1, vr20, vr21, vr7, vr4
+ vmul_vmsub_w vr16, vr1, vr21, vr20, vr15, vr13
+ vssrarni.h.w vr4, vr7, 12 // t7a
+ vssrarni.h.w vr13, vr15, 12 // t6a
+
+ vmul_vmadd_w vr6, vr10, vr21, vr20, vr7, vr1
+ vmul_vmsub_w vr6, vr10, vr20, vr21, vr15, vr16
+ vssrarni.h.w vr1, vr7, 12 // t12
+ vssrarni.h.w vr16, vr15, 12 // t13
+
+ vmul_vmadd_w vr14, vr17, vr20, vr21, vr7, vr6
+ vmul_vmsub_w vr14, vr17, vr21, vr20, vr15, vr10
+ vssrarni.h.w vr6, vr7, 12 // t15
+ vssrarni.h.w vr10, vr15, 12 // t14
+
+ vsadd.h vr14, vr5, vr0 // out[0]
+ vssub.h vr17, vr5, vr0 // t2a
+ vssub.h vr7, vr3, vr8 // t3a
+ vsadd.h vr15, vr3, vr8 // out[15]
+ vsllwil.w.h vr22, vr15, 0
+ vexth.w.h vr15, vr15
+ vneg.w vr22, vr22
+ vneg.w vr15, vr15
+ vssrarni.h.w vr15, vr22, 0 // out[15]
+
+ vsadd.h vr3, vr19, vr4 // out[12]
+ vssub.h vr8, vr19, vr4 // t7
+ vssub.h vr0, vr18, vr13 // t6
+ vsadd.h vr5, vr18, vr13 // out[3]
+ vsllwil.w.h vr22, vr5, 0
+ vexth.w.h vr5, vr5
+ vneg.w vr22, vr22
+ vneg.w vr5, vr5
+ vssrarni.h.w vr5, vr22, 0 // out[3]
+
+ vsadd.h vr13, vr9, vr12 // out[14]
+ vssub.h vr19, vr9, vr12 // t11
+ vssub.h vr4, vr2, vr11 // t10
+ vsadd.h vr18, vr2, vr11 // out[1]
+ vsllwil.w.h vr22, vr18, 0
+ vexth.w.h vr18, vr18
+ vneg.w vr22, vr22
+ vneg.w vr18, vr18
+ vssrarni.h.w vr18, vr22, 0 // out[1]
+
+ vsadd.h vr2, vr1, vr10 // out[2]
+ vssub.h vr11, vr1, vr10 // t14a
+ vssub.h vr12, vr16, vr6 // t15a
+ vsadd.h vr9, vr16, vr6 // out[13]
+ vsllwil.w.h vr22, vr9, 0
+ vexth.w.h vr9, vr9
+ vneg.w vr22, vr22
+ vneg.w vr9, vr9
+ vssrarni.h.w vr9, vr22, 0 // out[13]
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmadd_w vr17, vr7, vr20, vr20, vr6, vr10
+ vmul_vmsub_w vr17, vr7, vr20, vr20, vr16, vr1
+ vssrarni.h.w vr10, vr6, 12 // out[7]
+
+ vsllwil.w.h vr7, vr10, 0
+ vexth.w.h vr10, vr10
+ vneg.w vr7, vr7
+ vneg.w vr10, vr10
+ vssrarni.h.w vr10, vr7, 0
+ vssrarni.h.w vr1, vr16, 12 // out[8]
+
+ vmul_vmsub_w vr0, vr8, vr20, vr20, vr16, vr17
+ vmul_vmadd_w vr0, vr8, vr20, vr20, vr6, vr7
+ vssrarni.h.w vr17, vr16, 12 // out[11]
+
+ vsllwil.w.h vr0, vr17, 0
+ vexth.w.h vr17, vr17
+ vneg.w vr0, vr0
+ vneg.w vr17, vr17
+ vssrarni.h.w vr17, vr0, 0
+ vssrarni.h.w vr7, vr6, 12 // out[4]
+
+ vmul_vmsub_w vr4, vr19, vr20, vr20, vr16, vr0
+ vmul_vmadd_w vr4, vr19, vr20, vr20, vr6, vr8
+ vssrarni.h.w vr0, vr16, 12 // out[9]
+
+ vsllwil.w.h vr4, vr0, 0
+ vexth.w.h vr0, vr0
+ vneg.w vr4, vr4
+ vneg.w vr0, vr0
+ vssrarni.h.w vr0, vr4, 0
+ vssrarni.h.w vr8, vr6, 12 // out[6]
+
+ vmul_vmadd_w vr11, vr12, vr20, vr20, vr6, vr4
+ vmul_vmsub_w vr11, vr12, vr20, vr20, vr16, vr19
+ vssrarni.h.w vr4, vr6, 12 // out[5]
+
+ vsllwil.w.h vr24, vr4, 0
+ vexth.w.h vr4, vr4
+ vneg.w vr24, vr24
+ vneg.w vr4, vr4
+ vssrarni.h.w vr4, vr24, 0
+ vssrarni.h.w vr19, vr16, 12 // out[10]
+
+.ifnb \transpose8x8
+ LSX_TRANSPOSE8x8_H vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
+ vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
+ vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23
+
+ LSX_TRANSPOSE8x8_H vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \
+ vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \
+ vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23
+.endif
+
+.ifnb \shift
+.irp i, vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
+ vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
+ vsrari.h \i, \i, \shift
+.endr
+.endif
+
+.ifnb \vst
+ vst_x16 t1, 0, 16, vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
+ vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
+.endif
+// out0 out1 out2 out3 out4 out5 out6 out7
+// vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10
+// out8 out9 out10 out11 out12 out13 out14 out15
+// vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15
+.endm // adst16_core_lsx
+
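+// Add eight transformed rows (>>4) to the destination: load 8-byte rows via
+// t2/t3 (stride a1), widen them to 16 bits, add \in0-\in7, saturate back to
+// 8 bits and store through t4/t5, advancing t2-t5 as it goes.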
+.macro adst16_core_finish_lsx in0, in1, in2, in3, in4, in5, in6, in7
+ fld.d f20, t2, 0
+ fldx.d f21, t2, a1
+ fld.d f22, t3, 0
+ fldx.d f23, t3, a1
+
+ alsl.d t2, a1, t2, 2
+ alsl.d t3, a1, t3, 2
+
+ fld.d f24, t2, 0
+ fldx.d f25, t2, a1
+ fld.d f26, t3, 0
+ fldx.d f27, t3, a1
+
+.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
+ vsllwil.hu.bu \i, \i, 0
+.endr
+
+.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+ vsrari.h \i, \i, 4
+.endr
+
+ vadd.h vr20, vr20, \in0
+ vadd.h vr21, vr21, \in1
+ vadd.h vr22, vr22, \in2
+ vadd.h vr23, vr23, \in3
+ vadd.h vr24, vr24, \in4
+ vadd.h vr25, vr25, \in5
+ vadd.h vr26, vr26, \in6
+ vadd.h vr27, vr27, \in7
+
+ vssrani.bu.h vr21, vr20, 0
+ vssrani.bu.h vr23, vr22, 0
+ vssrani.bu.h vr25, vr24, 0
+ vssrani.bu.h vr27, vr26, 0
+
+ vstelm.d vr21, t4, 0, 0
+ vstelm.d vr21, t5, 0, 1
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+ vstelm.d vr23, t4, 0, 0
+ vstelm.d vr23, t5, 0, 1
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+ vstelm.d vr25, t4, 0, 0
+ vstelm.d vr25, t5, 0, 1
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+ vstelm.d vr27, t4, 0, 0
+ vstelm.d vr27, t5, 0, 1
+
+.endm // adst16_core_finish_lsx
+
+function inv_txfm_add_dct_adst_8x16_8bpc_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+
+ vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ la.local t0, idct_coeffs
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx
+
+ vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx
+
+.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+ vsrari.h \i, \i, 1
+.endr
+
+ vreplgr2vr.h vr23, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
+ vst vr23, a2, \i
+.endr
+
+ LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31
+
+ LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+ vr16, vr17, vr18, vr20, vr21, vr22, vr23, vr31
+
+ adst16_core_lsx , ,
+
+ addi.d t2, a0, 0
+ alsl.d t3, a1, a0, 1
+ addi.d t4, a0, 0
+ add.d t5, a1, a0
+
+ adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10
+
+ alsl.d t2, a1, t2, 2
+ alsl.d t3, a1, t3, 2
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+
+ adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+
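+// Reserve \number bytes of stack scratch plus 64 bytes for the callee-saved
+// FP registers f24-f31; free_space restores the registers and releases both.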
+.macro malloc_space number
+ li.w t0, \number
+ sub.d sp, sp, t0
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+.endm
+
+.macro free_space number
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ li.w t0, \number
+ add.d sp, sp, t0
+ addi.d sp, sp, 64
+.endm
+
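+// Add eight 16-bit residual half-vectors (\in4-\in11) to four 16-pixel
+// destination rows (\in0-\in3), saturate to 8 bits and store at a0, a0+a1,
+// t2 and t2+a1. Clobbers vr0-vr3 and vr10-vr13.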
+.macro DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11
+ vsllwil.hu.bu vr10, \in0, 0
+ vexth.hu.bu vr0, \in0
+ vsllwil.hu.bu vr11, \in1, 0
+ vexth.hu.bu vr1, \in1
+ vsllwil.hu.bu vr12, \in2, 0
+ vexth.hu.bu vr2, \in2
+ vsllwil.hu.bu vr13, \in3, 0
+ vexth.hu.bu vr3, \in3
+ vadd.h vr10, vr10, \in4
+ vadd.h vr0, vr0, \in5
+ vadd.h vr11, vr11, \in6
+ vadd.h vr1, vr1, \in7
+ vadd.h vr12, vr12, \in8
+ vadd.h vr2, vr2, \in9
+ vadd.h vr13, vr13, \in10
+ vadd.h vr3, vr3, \in11
+ vssrani.bu.h vr0, vr10, 0
+ vssrani.bu.h vr1, vr11, 0
+ vssrani.bu.h vr2, vr12, 0
+ vssrani.bu.h vr3, vr13, 0
+ vst vr0, a0, 0
+ vstx vr1, a0, a1
+ vst vr2, t2, 0
+ vstx vr3, t2, a1
+.endm
+
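+// Optionally shift the eight residual vectors right by \shift (rounding),
+// then load four destination rows and finish with DST_ADD_W16.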
+.macro VLD_DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, shift
+
+.ifnb \shift
+.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
+ vsrari.h \i, \i, \shift
+.endr
+.endif
+
+ vld vr0, a0, 0
+ vldx vr1, a0, a1
+ vld vr2, t2, 0
+ vldx vr3, t2, a1
+ DST_ADD_W16 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3, \
+ \in4, \in5, \in6, \in7
+.endm
+
+function inv_txfm_add_dct_dct_16x8_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_16x8
+
+ ld.h t2, a2, 0 // dc
+ vldi vr0, 0x8b5 // 181
+ vreplgr2vr.w vr1, t2
+ vldi vr5, 0x880 // 128
+ vmul.w vr2, vr0, vr1 // dc * 181
+ st.h zero, a2, 0
+ vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
+ alsl.d t2, a1, a0, 1
+ vmul.w vr2, vr2, vr0
+ vldx vr1, a0, a1
+ vsrari.w vr2, vr2, 8
+ vldx vr3, t2, a1
+ vsrari.w vr2, vr2, 1 // (dc + rnd) >> shift
+ vmadd.w vr5, vr2, vr0
+ vld vr0, a0, 0
+ vssrarni.h.w vr5, vr5, 12
+ vld vr2, t2, 0
+
+ DST_ADD_W16 vr0, vr1, vr2, vr3, vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5,
+
+ b .DCT_DCT_16x8_END
+
+.NO_HAS_DCONLY_16x8:
+ malloc_space 512
+
+ vld_x16 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr23, t0, 0 //2896
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+ rect2_lsx \i, vr23, \i
+.endr
+
+ dct_8x16_core_lsx
+
+ LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \
+ vr13, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24, \
+ vr13, vr31, vr2, vr3, vr4, vr5, vr6, vr7
+
+.irp i, vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \
+ vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 1
+.endr
+
+ vst_x16 sp, 64, 16, vr13, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr12, vr29, vr26, vr25, vr24
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
+ vst vr23, a2, \i
+.endr
+
+ dct_8x8_core_lsx vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \
+ vr4, vr5, vr6, vr16, vr7, vr18, vr19, vr31, no_rect2
+
+ dct_8x8_core_lsx vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24, \
+ vr14, vr15, vr17, vr20, vr21, vr22, vr23, vr28, no_rect2
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W16 vr4, vr14, vr5, vr15, vr6, vr17, vr16, vr20, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W16 vr7, vr21, vr18, vr22, vr19, vr23, vr31, vr28, 4
+
+ free_space 512
+
+.DCT_DCT_16x8_END:
+
+endfunc
+
+function inv_txfm_add_adst_dct_16x8_8bpc_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+
+ addi.d t1, sp, 64
+ addi.d t2, a2, 0
+
+ vld_x16 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr23, t0, 0 //2896
+.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+ rect2_lsx \i, vr23, \i
+.endr
+
+ adst16_core_lsx , 1,
+
+ // out0 out1 out2 out3 out4 out5 out6 out7
+ // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10
+ // out8 out9 out10 out11 out12 out13 out14 out15
+ // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15
+
+ LSX_TRANSPOSE8x8_H vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
+ vr14, vr18, vr2, vr5, vr7, vr4, vr24, vr25, \
+ vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23
+
+ LSX_TRANSPOSE8x8_H vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \
+ vr1, vr0, vr19, vr17, vr3, vr26, vr13, vr15, \
+ vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
+ vst vr23, a2, \i
+.endr
+
+ dct_8x8_core_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr24, vr25, \
+ vr27, vr28, vr29, vr25, vr30, vr31, vr6, vr16, no_rect2
+
+ dct_8x8_core_lsx vr1, vr0, vr19, vr17, vr3, vr26, vr13, vr15, \
+ vr5, vr7, vr18, vr20, vr21, vr22, vr23, vr24, no_rect2
+
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W16 vr27, vr5, vr28, vr7, vr29, vr18, vr25, vr20, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ VLD_DST_ADD_W16 vr30, vr21, vr31, vr22, vr6, vr23, vr16, vr24, 4
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+
+function inv_txfm_add_dct_dct_16x16_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_16x16
+
+ ld.h t2, a2, 0 // dc
+ vldi vr0, 0x8b5 // 181
+ vreplgr2vr.w vr1, t2
+ vldi vr5, 0x880 // 128
+ vmul.w vr2, vr0, vr1 // dc * 181
+ st.h zero, a2, 0
+ vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
+ alsl.d t2, a1, a0, 1
+ vsrari.w vr2, vr2, 2 // (dc + rnd) >> shift
+ vldx vr1, a0, a1
+ vmadd.w vr5, vr2, vr0
+ vldx vr3, t2, a1
+ vssrarni.h.w vr5, vr5, 12
+ vld vr0, a0, 0
+ vld vr2, t2, 0
+
+ DST_ADD_W16 vr0, vr1, vr2, vr3, vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5,
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5,
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5,
+
+ b .DCT_DCT_16x16_END
+
+.NO_HAS_DCONLY_16x16:
+
+ malloc_space 512
+
+ vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 2
+.endr
+
+ vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 2
+.endr
+
+ vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vreplgr2vr.h vr31, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
+ 464, 480, 496
+ vst vr31, a2, \i
+.endr
+
+ vld_x8 sp, 64, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 sp, 320, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ vst_x8 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16
+ vst_x8 sp, 320, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x8 sp, 192, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 sp, 448, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 64
+ vld vr5, sp, 80
+ vld vr6, sp, 96
+ vld vr7, sp, 112
+ VLD_DST_ADD_W16 vr4, vr22, vr5, vr18, vr6, vr17, vr7, vr28, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 128
+ vld vr5, sp, 144
+ vld vr6, sp, 160
+ vld vr7, sp, 176
+ VLD_DST_ADD_W16 vr4, vr20, vr5, vr14, vr6, vr15, vr7, vr16, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 320
+ vld vr5, sp, 336
+ vld vr6, sp, 352
+ vld vr7, sp, 368
+ VLD_DST_ADD_W16 vr4, vr27, vr5, vr30, vr6, vr23, vr7, vr21, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 384
+ vld vr5, sp, 400
+ vld vr6, sp, 416
+ vld vr7, sp, 432
+ VLD_DST_ADD_W16 vr4, vr29, vr5, vr26, vr6, vr25, vr7, vr24, 4
+
+ free_space 512
+
+.DCT_DCT_16x16_END:
+endfunc
+
+function inv_txfm_add_adst_adst_16x16_8bpc_lsx
+
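+ // Both passes use the 16-point ADST: the first pass writes the two
+ // transposed, rounded halves of the coefficient block to the stack; the
+ // second pass reads them back and adst16_core_finish_lsx adds the results
+ // to the destination, eight columns at a time.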
+ malloc_space 256+256
+
+ addi.d t1, sp, 64
+ addi.d t2, a2, 0
+
+ vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx transpose8x8, 2, vst_x16
+
+ addi.d t2, a2, 16
+ addi.d t1, t1, 256
+
+ vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx transpose8x8, 2, vst_x16
+
+ vreplgr2vr.h vr23, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
+ 464, 480, 496
+ vst vr23, a2, \i
+.endr
+
+ addi.d t2, sp, 64
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx , ,
+
+ // out0 out1 out2 out3 out4 out5 out6 out7
+ // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10
+ // out8 out9 out10 out11 out12 out13 out14 out15
+ // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15
+
+ addi.d t2, a0, 0
+ alsl.d t3, a1, a0, 1
+ addi.d t4, a0, 0
+ add.d t5, a1, a0
+
+ adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10
+
+ alsl.d t2, a1, t2, 2
+ alsl.d t3, a1, t3, 2
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+
+ adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
+
+ addi.d t2, sp, 64+128
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx , ,
+
+ addi.d a0, a0, 8
+
+ addi.d t2, a0, 0
+ alsl.d t3, a1, a0, 1
+ addi.d t4, a0, 0
+ add.d t5, a1, a0
+
+ adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10
+
+ alsl.d t2, a1, t2, 2
+ alsl.d t3, a1, t3, 2
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+
+ adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
+
+ free_space 256+256
+endfunc
+
+function inv_txfm_add_adst_dct_16x16_8bpc_lsx
+ malloc_space 256+256
+
+ addi.d t1, sp, 64
+ addi.d t2, a2, 0
+
+ vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx transpose8x8, 2, vst_x16
+
+ addi.d t2, a2, 16
+ addi.d t1, t1, 256
+
+ vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx transpose8x8, 2, vst_x16
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
+ 464, 480, 496
+ vst vr23, a2, \i
+.endr
+
+ addi.d t2, sp, 64
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ vst_x8 t2, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16
+ vst_x8 t2, 256, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ addi.d t2, sp, 64+128
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 64
+ vld vr5, sp, 80
+ vld vr6, sp, 96
+ vld vr7, sp, 112
+ VLD_DST_ADD_W16 vr4, vr22, vr5, vr18, vr6, vr17, vr7, vr28, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 128
+ vld vr5, sp, 144
+ vld vr6, sp, 160
+ vld vr7, sp, 176
+ VLD_DST_ADD_W16 vr4, vr20, vr5, vr14, vr6, vr15, vr7, vr16, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 320
+ vld vr5, sp, 336
+ vld vr6, sp, 352
+ vld vr7, sp, 368
+ VLD_DST_ADD_W16 vr4, vr27, vr5, vr30, vr6, vr23, vr7, vr21, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 384
+ vld vr5, sp, 400
+ vld vr6, sp, 416
+ vld vr7, sp, 432
+ VLD_DST_ADD_W16 vr4, vr29, vr5, vr26, vr6, vr25, vr7, vr24, 4
+
+ free_space 256+256
+endfunc
+
+function inv_txfm_add_dct_adst_16x16_8bpc_lsx
+ malloc_space 256+256
+
+ vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 2
+.endr
+
+ vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 2
+.endr
+
+ vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vreplgr2vr.h vr31, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
+ 464, 480, 496
+ vst vr31, a2, \i
+.endr
+
+ addi.d t2, sp, 64
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx , ,
+
+ // out0 out1 out2 out3 out4 out5 out6 out7
+ // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10
+ // out8 out9 out10 out11 out12 out13 out14 out15
+ // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15
+
+ addi.d t2, a0, 0
+ alsl.d t3, a1, a0, 1
+ addi.d t4, a0, 0
+ add.d t5, a1, a0
+
+ adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10
+
+ alsl.d t2, a1, t2, 2
+ alsl.d t3, a1, t3, 2
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+
+ adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
+
+ addi.d t2, sp, 64+128
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx , ,
+
+ addi.d a0, a0, 8
+
+ addi.d t2, a0, 0
+ alsl.d t3, a1, a0, 1
+ addi.d t4, a0, 0
+ add.d t5, a1, a0
+
+ adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10
+
+ alsl.d t2, a1, t2, 2
+ alsl.d t3, a1, t3, 2
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+
+ adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
+
+ free_space 256+256
+endfunc
+
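+// shufb reverses the order of the eight 16-bit lanes in a vector (each byte
+// pair stays intact); the flipadst variants below use it to mirror their
+// outputs.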
+const shufb
+ .byte 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
+endconst
+
+function inv_txfm_add_flipadst_dct_16x16_8bpc_lsx
+ malloc_space 256+256
+
+ addi.d t1, sp, 64
+ addi.d t2, a2, 0
+
+ vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx transpose8x8, 2, vst_x16
+
+ addi.d t2, a2, 16
+ addi.d t1, t1, 256
+
+ vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx transpose8x8, 2, vst_x16
+
+ vreplgr2vr.h vr23, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
+ 464, 480, 496
+ vst vr23, a2, \i
+.endr
+
+ addi.d t2, sp, 64
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ la.local t0, shufb
+ vld vr0, t0, 0
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vshuf.b \i, \i, \i, vr0
+.endr
+
+ vst_x8 t2, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16
+ vst_x8 t2, 256, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ addi.d t2, sp, 64+128
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ la.local t0, shufb
+ vld vr0, t0, 0
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vshuf.b \i, \i, \i, vr0
+.endr
+
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 64
+ vld vr5, sp, 80
+ vld vr6, sp, 96
+ vld vr7, sp, 112
+ VLD_DST_ADD_W16 vr22, vr4, vr18, vr5, vr17, vr6, vr28, vr7, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 128
+ vld vr5, sp, 144
+ vld vr6, sp, 160
+ vld vr7, sp, 176
+ VLD_DST_ADD_W16 vr20, vr4, vr14, vr5, vr15, vr6, vr16, vr7, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 320
+ vld vr5, sp, 336
+ vld vr6, sp, 352
+ vld vr7, sp, 368
+ VLD_DST_ADD_W16 vr27, vr4, vr30, vr5, vr23, vr6, vr21, vr7, 4
+
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+ vld vr4, sp, 384
+ vld vr5, sp, 400
+ vld vr6, sp, 416
+ vld vr7, sp, 432
+ VLD_DST_ADD_W16 vr29, vr4, vr26, vr5, vr25, vr6, vr24, vr7, 4
+
+ free_space 256+256
+endfunc
+
+function inv_txfm_add_dct_flipadst_16x16_8bpc_lsx
+ malloc_space 256+256
+
+ vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 2
+.endr
+
+ vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+ vsrari.h \i, \i, 2
+.endr
+
+ vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vreplgr2vr.h vr31, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
+ 464, 480, 496
+ vst vr31, a2, \i
+.endr
+
+ addi.d t2, sp, 64
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx , ,
+
+ // out0 out1 out2 out3 out4 out5 out6 out7
+ // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10
+ // out8 out9 out10 out11 out12 out13 out14 out15
+ // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15
+
+ la.local t0, shufb
+ vld vr31, t0, 0
+
+ addi.d t2, a0, 0
+ alsl.d t3, a1, a0, 1
+ addi.d t4, a0, 0
+ add.d t5, a1, a0
+
+ adst16_core_finish_lsx vr15, vr13, vr9, vr3, vr17, vr19, vr0, vr1
+
+ alsl.d t2, a1, t2, 2
+ alsl.d t3, a1, t3, 2
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+
+ adst16_core_finish_lsx vr10, vr8, vr4, vr7, vr5, vr2, vr18, vr14
+
+ addi.d t2, sp, 64+128
+
+ vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+
+ adst16_core_lsx , ,
+
+ addi.d a0, a0, 8
+
+ la.local t0, shufb
+ vld vr31, t0, 0
+
+ addi.d t2, a0, 0
+ alsl.d t3, a1, a0, 1
+ addi.d t4, a0, 0
+ add.d t5, a1, a0
+
+ adst16_core_finish_lsx vr15, vr13, vr9, vr3, vr17, vr19, vr0, vr1
+
+ alsl.d t2, a1, t2, 2
+ alsl.d t3, a1, t3, 2
+
+ alsl.d t4, a1, t4, 1
+ alsl.d t5, a1, t5, 1
+
+ adst16_core_finish_lsx vr10, vr8, vr4, vr7, vr5, vr2, vr18, vr14
+
+ free_space 256+256
+
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_8x32
+
+ ld.h t2, a2, 0 // dc
+ vldi vr0, 0x8b5 // 181
+ vreplgr2vr.w vr1, t2
+ vldi vr5, 0x880 // 128
+ vmul.w vr2, vr0, vr1 // dc * 181
+ st.h zero, a2, 0
+ vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
+ vld vr10, a0, 0 // 0 1 2 3 4 5 6 7
+ vsrari.w vr2, vr2, 2 // (dc + rnd) >> shift
+ vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15
+ alsl.d t2, a1, a0, 1
+ vmadd.w vr5, vr2, vr0
+ vld vr12, t2, 0 // 16 17 18 19 20 21 22 23
+ vssrarni.h.w vr5, vr5, 12
+ vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31
+
+ DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5
+
+.rept 7
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, a0, 1
+
+ VLD_DST_ADD_W8 vr5, vr5, vr5, vr5
+.endr
+
+ b .DCT_DCT_8X32_END
+
+.NO_HAS_DCONLY_8x32:
+ malloc_space 512
+
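+ // Full 8x32 path: four 8-point DCT passes over interleaved groups of eight
+ // coefficient rows are transposed, rounded by 2 and buffered on the stack;
+ // a single 32-point pass (even half via dct_8x16_core_lsx, odd half inline
+ // below) then produces the 32 output rows, which are rounded by 4 and
+ // added to the destination.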
+ vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ la.local t0, idct_coeffs
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2
+
+.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+ vsrari.h \i, \i, 2
+.endr
+
+ LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ vst_x8 sp, 64, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vld_x8 a2, 16, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2
+
+.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+ vsrari.h \i, \i, 2
+.endr
+
+ LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ vst_x8 sp, 192, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vld_x8 a2, 32, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ la.local t0, idct_coeffs
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2
+
+.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+ vsrari.h \i, \i, 2
+.endr
+
+ LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ vst_x8 sp, 320, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vld_x8 a2, 48, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2
+
+.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+ vsrari.h \i, \i, 2
+.endr
+
+ LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ vst_x8 sp, 448, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vreplgr2vr.h vr31, zero
+
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
+ 464, 480, 496
+ vst vr31, a2, \i
+.endr
+
+ addi.d t2, sp, 64
+ addi.d t3, sp, 64
+
+ vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ vst_x16 t3, 0, 32, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ // vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ // in1 in3 in5 in7 in9 in11 in13 in15
+ // vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+ // in17 in19 in21 in23 in25 in27 in29 in31
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 64 // 201
+ vldrepl.w vr21, t0, 68 // 4091
+
+ vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9
+ vssrarni.h.w vr9, vr8, 12 // t31a
+ vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10
+ vssrarni.h.w vr10, vr11, 12 // t16a
+
+ vldrepl.w vr20, t0, 72 // 3035
+ vldrepl.w vr21, t0, 76 // 2751
+ vmul_vmadd_w vr19, vr7, vr21, vr20, vr11, vr0
+ vssrarni.h.w vr0, vr11, 12 // t30a
+ vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30
+ vssrarni.h.w vr30, vr11, 12 // t17a
+
+ vldrepl.w vr20, t0, 80 // 1751
+ vldrepl.w vr21, t0, 84 // 3703
+ vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7
+ vssrarni.h.w vr7, vr8, 12 // t29a
+ vmul_vmsub_w vr4, vr26, vr20, vr21, vr8, vr19
+ vssrarni.h.w vr19, vr8, 12 // t18a
+
+ vldrepl.w vr20, t0, 88 // 3857
+ vldrepl.w vr21, t0, 92 // 1380
+ vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4
+ vssrarni.h.w vr4, vr8, 12 // t28a
+ vmul_vmsub_w vr27, vr3, vr20, vr21, vr8, vr26
+ vssrarni.h.w vr26, vr8, 12 // t19a
+
+ vldrepl.w vr20, t0, 96 // 995
+ vldrepl.w vr21, t0, 100 // 3973
+ vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3
+ vssrarni.h.w vr3, vr8, 12 // t27a
+ vmul_vmsub_w vr2, vr28, vr20, vr21, vr8, vr27
+ vssrarni.h.w vr27, vr8, 12 // t20a
+
+ vldrepl.w vr20, t0, 104 // 3513
+ vldrepl.w vr21, t0, 108 // 2106
+ vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2
+ vssrarni.h.w vr2, vr8, 12 // t26a
+ vmul_vmsub_w vr25, vr5, vr20, vr21, vr8, vr28
+ vssrarni.h.w vr28, vr8, 12 // t21a
+
+ vldrepl.w vr20, t0, 112 // 2440 -> 1220
+ vldrepl.w vr21, t0, 116 // 3290 -> 1645
+ vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5
+ vssrarni.h.w vr5, vr8, 12 // t25a
+ vmul_vmsub_w vr6, vr24, vr20, vr21, vr8, vr25
+ vssrarni.h.w vr25, vr8, 12 // t22a
+
+ vldrepl.w vr20, t0, 120 // 4052
+ vldrepl.w vr21, t0, 124 // 601
+ vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6
+ vssrarni.h.w vr6, vr8, 12 // t24a
+ vmul_vmsub_w vr29, vr1, vr20, vr21, vr8, vr24
+ vssrarni.h.w vr24, vr8, 12 // t23a
+
+ vsadd.h vr1, vr10, vr30 // t16
+ vssub.h vr29, vr10, vr30 // t17
+ vssub.h vr8, vr26, vr19 // t18
+ vsadd.h vr31, vr26, vr19 // t19
+ vsadd.h vr10, vr27, vr28 // t20
+ vssub.h vr30, vr27, vr28 // t21
+ vssub.h vr19, vr24, vr25 // t22
+ vsadd.h vr26, vr24, vr25 // t23
+ vsadd.h vr27, vr6, vr5 // t24
+ vssub.h vr28, vr6, vr5 // t25
+ vssub.h vr24, vr3, vr2 // t26
+ vsadd.h vr25, vr3, vr2 // t27
+ vsadd.h vr5, vr4, vr7 // t28
+ vssub.h vr6, vr4, vr7 // t29
+ vssub.h vr2, vr9, vr0 // t30
+ vsadd.h vr3, vr9, vr0 // t31
+
+ vldrepl.w vr20, t0, 16 // 799
+ vldrepl.w vr21, t0, 20 // 4017
+ vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7
+ vssrarni.h.w vr7, vr4, 12 // t30a
+ vmul_vmsub_w vr2, vr29, vr20, vr21, vr4, vr0
+ vssrarni.h.w vr0, vr4, 12 // t17a
+ vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9
+ vneg.w vr4, vr4
+ vneg.w vr9, vr9
+ vssrarni.h.w vr9, vr4, 12 // t18a
+ vmul_vmsub_w vr6, vr8, vr20, vr21, vr4, vr2
+ vssrarni.h.w vr2, vr4, 12 // t29a
+
+ vldrepl.w vr20, t0, 24 // 3406 -> 1703
+ vldrepl.w vr21, t0, 28 // 2276 -> 1138
+ vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29
+ vssrarni.h.w vr29, vr4, 12 // t26a
+ vmul_vmsub_w vr24, vr30, vr20, vr21, vr4, vr6
+ vssrarni.h.w vr6, vr4, 12 // t21a
+
+ vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8
+ vneg.w vr4, vr4
+ vneg.w vr8, vr8
+ vssrarni.h.w vr8, vr4, 12 // t22a
+ vmul_vmsub_w vr28, vr19, vr20, vr21, vr4, vr24
+ vssrarni.h.w vr24, vr4, 12 // t25a
+
+ vsadd.h vr4, vr1, vr31 // t16a
+ vssub.h vr30, vr1, vr31 // t19a
+ vsadd.h vr19, vr0, vr9 // t17
+ vssub.h vr28, vr0, vr9 // t18
+ vssub.h vr1, vr26, vr10 // t20a
+ vsadd.h vr31, vr26, vr10 // t23a
+ vssub.h vr0, vr8, vr6 // t21
+ vsadd.h vr9, vr8, vr6 // t22
+ vsadd.h vr10, vr27, vr25 // t24a
+ vssub.h vr26, vr27, vr25 // t27a
+ vsadd.h vr6, vr24, vr29 // t25
+ vssub.h vr8, vr24, vr29 // t26
+ vssub.h vr25, vr3, vr5 // t28a
+ vsadd.h vr27, vr3, vr5 // t31a
+ vssub.h vr24, vr7, vr2 // t29
+ vsadd.h vr29, vr7, vr2 // t30
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
+ vssrarni.h.w vr5, vr3, 12 // t29a
+ vmul_vmsub_w vr24, vr28, vr20, vr21, vr3, vr2
+ vssrarni.h.w vr2, vr3, 12 // t18a

+
+ vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
+ vssrarni.h.w vr7, vr3, 12 // t28
+ vmul_vmsub_w vr25, vr30, vr20, vr21, vr3, vr24
+ vssrarni.h.w vr24, vr3, 12 // t19
+
+ vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
+ vneg.w vr3, vr3
+ vneg.w vr28, vr28
+ vssrarni.h.w vr28, vr3, 12 // t20
+ vmul_vmsub_w vr26, vr1, vr20, vr21, vr3, vr25
+ vssrarni.h.w vr25, vr3, 12 // t27
+
+ vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
+ vneg.w vr3, vr3
+ vneg.w vr30, vr30
+ vssrarni.h.w vr30, vr3, 12 // t21a
+ vmul_vmsub_w vr8, vr0, vr20, vr21, vr3, vr1
+ vssrarni.h.w vr1, vr3, 12 // t26a
+
+ vsadd.h vr3, vr4, vr31 // t16
+ vssub.h vr26, vr4, vr31 // t23
+ vsadd.h vr0, vr19, vr9 // t17a
+ vssub.h vr8, vr19, vr9 // t22a
+ vsadd.h vr4, vr2, vr30 // t18
+ vssub.h vr31, vr2, vr30 // t21
+ vsadd.h vr9, vr24, vr28 // t19a
+ vssub.h vr19, vr24, vr28 // t20a
+ vssub.h vr2, vr27, vr10 // t24
+ vsadd.h vr30, vr27, vr10 // t31
+ vssub.h vr24, vr29, vr6 // t25a
+ vsadd.h vr28, vr29, vr6 // t30a
+ vssub.h vr10, vr5, vr1 // t26
+ vsadd.h vr27, vr5, vr1 // t29
+ vssub.h vr6, vr7, vr25 // t27a
+ vsadd.h vr29, vr7, vr25 // t28a
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
+ vssrarni.h.w vr5, vr1, 12 // t20
+ vmul_vmadd_w vr6, vr19, vr20, vr20, vr1, vr7
+ vssrarni.h.w vr7, vr1, 12 // t27
+
+ vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
+ vssrarni.h.w vr25, vr1, 12 // t21a
+ vmul_vmadd_w vr10, vr31, vr20, vr20, vr1, vr6
+ vssrarni.h.w vr6, vr1, 12 // t26a
+
+ vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
+ vssrarni.h.w vr19, vr1, 12 // t22
+ vmul_vmadd_w vr24, vr8, vr20, vr20, vr1, vr10
+ vssrarni.h.w vr10, vr1, 12 // t25
+
+ vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
+ vssrarni.h.w vr31, vr1, 12 // t23a
+ vmul_vmadd_w vr2, vr26, vr20, vr20, vr1, vr8
+ vssrarni.h.w vr8, vr1, 12 // t24a
+
+ // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
+ // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3
+
+ vld_x8 t3, 0, 32, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vsadd.h vr1, vr11, vr30 // c[0]
+ vssub.h vr2, vr11, vr30 // c[31]
+ vsadd.h vr24, vr12, vr28 // c[1]
+ vssub.h vr26, vr12, vr28 // c[30]
+ vsadd.h vr11, vr13, vr27 // c[2]
+ vssub.h vr30, vr13, vr27 // c[29]
+ vsadd.h vr12, vr14, vr29 // c[3]
+ vssub.h vr28, vr14, vr29 // c[28]
+ vsadd.h vr13, vr15, vr7 // c[4]
+ vssub.h vr27, vr15, vr7 // c[27]
+ vsadd.h vr14, vr16, vr6 // c[5]
+ vssub.h vr29, vr16, vr6 // c[26]
+ vsadd.h vr7, vr17, vr10 // c[6]
+ vssub.h vr15, vr17, vr10 // c[25]
+ vsadd.h vr6, vr18, vr8 // c[7]
+ vssub.h vr16, vr18, vr8 // c[24]
+
+.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
+ vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+ vsrari.h \i, \i, 4
+.endr
+
+ vst_x8 t2, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
+
+ vst_x8 t2, 128, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+
+ vld_x8 t3, 256, 32, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vsadd.h vr1, vr11, vr31 // c[8]
+ vssub.h vr2, vr11, vr31 // c[23]
+ vsadd.h vr24, vr12, vr19 // c[9]
+ vssub.h vr26, vr12, vr19 // c[22]
+ vsadd.h vr11, vr13, vr25 // c[10]
+ vssub.h vr30, vr13, vr25 // c[21]
+ vsadd.h vr12, vr14, vr5 // c[11]
+ vssub.h vr28, vr14, vr5 // c[20]
+ vsadd.h vr13, vr15, vr9 // c[12]
+ vssub.h vr27, vr15, vr9 // c[19]
+ vsadd.h vr14, vr16, vr4 // c[13]
+ vssub.h vr29, vr16, vr4 // c[18]
+ vsadd.h vr7, vr17, vr0 // c[14]
+ vssub.h vr15, vr17, vr0 // c[17]
+ vsadd.h vr6, vr18, vr3 // c[15]
+ vssub.h vr16, vr18, vr3 // c[16]
+
+.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
+ vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+ vsrari.h \i, \i, 4
+.endr
+
+ vst_x8 t2, 256, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
+
+ vst_x8 t2, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+
+ alsl.d t2, a1, a0, 1
+ addi.d t3, sp, 64
+
+ vld vr4, t3, 0
+ vld vr5, t3, 16
+ vld vr6, t3, 32
+ vld vr7, t3, 48
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ addi.d t3, sp, 64+64
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+ vld vr4, t3, 0
+ vld vr5, t3, 16
+ vld vr6, t3, 32
+ vld vr7, t3, 48
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ addi.d t3, sp, 64+256
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+ vld vr4, t3, 0
+ vld vr5, t3, 16
+ vld vr6, t3, 32
+ vld vr7, t3, 48
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ addi.d t3, t3, 64
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+ vld vr4, t3, 0
+ vld vr5, t3, 16
+ vld vr6, t3, 32
+ vld vr7, t3, 48
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ addi.d t3, sp, 64+384
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+ vld vr4, t3, 0
+ vld vr5, t3, 16
+ vld vr6, t3, 32
+ vld vr7, t3, 48
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ addi.d t3, t3, 64
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+ vld vr4, t3, 0
+ vld vr5, t3, 16
+ vld vr6, t3, 32
+ vld vr7, t3, 48
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ addi.d t3, sp, 64+128
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+ vld vr4, t3, 0
+ vld vr5, t3, 16
+ vld vr6, t3, 32
+ vld vr7, t3, 48
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ addi.d t3, t3, 64
+ alsl.d a0, a1, a0, 2
+ alsl.d t2, a1, t2, 2
+ vld vr4, t3, 0
+ vld vr5, t3, 16
+ vld vr6, t3, 32
+ vld vr7, t3, 48
+ VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
+
+ free_space 512
+.DCT_DCT_8X32_END:
+endfunc
+
+.macro dct_8x32_core_lsx in1, in2, vst_start0, vst_start1, vst_start2, \
+ vst_start3, transpose8x8, shift
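+ // \in1:  base address for the output stores (four 8-row groups written
+ //        with a 64-byte stride at \vst_start0..\vst_start3)
+ // \in2:  buffer holding the 16 even-half outputs already produced by
+ //        dct_8x16_core_lsx
+ // \transpose8x8, \shift: optional 8x8 transpose and rounding shift applied
+ //        to each group before it is stored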
+
+ // vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ // in1 in3 in5 in7 in9 in11 in13 in15
+ // vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+ // in17 in19 in21 in23 in25 in27 in29 in31
+
+ la.local t0, idct_coeffs
+ vldrepl.w vr20, t0, 64 // 201
+ vldrepl.w vr21, t0, 68 // 4091
+
+ vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9
+ vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10
+ vssrarni.h.w vr9, vr8, 12 // t31a
+ vssrarni.h.w vr10, vr11, 12 // t16a
+
+ vldrepl.w vr20, t0, 72 // 3035
+ vldrepl.w vr21, t0, 76 // 2751
+ vmul_vmadd_w vr19, vr7, vr21, vr20, vr8, vr0
+ vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30
+ vssrarni.h.w vr0, vr8, 12 // t30a
+ vssrarni.h.w vr30, vr11, 12 // t17a
+
+ vldrepl.w vr20, t0, 80 // 1751
+ vldrepl.w vr21, t0, 84 // 3703
+ vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7
+ vmul_vmsub_w vr4, vr26, vr20, vr21, vr11, vr19
+ vssrarni.h.w vr7, vr8, 12 // t29a
+ vssrarni.h.w vr19, vr11, 12 // t18a
+
+ vldrepl.w vr20, t0, 88 // 3857
+ vldrepl.w vr21, t0, 92 // 1380
+ vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4
+ vmul_vmsub_w vr27, vr3, vr20, vr21, vr11, vr26
+ vssrarni.h.w vr4, vr8, 12 // t28a
+ vssrarni.h.w vr26, vr11, 12 // t19a
+
+ vldrepl.w vr20, t0, 96 // 995
+ vldrepl.w vr21, t0, 100 // 3973
+ vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3
+ vmul_vmsub_w vr2, vr28, vr20, vr21, vr11, vr27
+ vssrarni.h.w vr3, vr8, 12 // t27a
+ vssrarni.h.w vr27, vr11, 12 // t20a
+
+ vldrepl.w vr20, t0, 104 // 3513
+ vldrepl.w vr21, t0, 108 // 2106
+ vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2
+ vmul_vmsub_w vr25, vr5, vr20, vr21, vr11, vr28
+ vssrarni.h.w vr2, vr8, 12 // t26a
+ vssrarni.h.w vr28, vr11, 12 // t21a
+
+ vldrepl.w vr20, t0, 112 // 2440 -> 1220
+ vldrepl.w vr21, t0, 116 // 3290 -> 1645
+ vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5
+ vmul_vmsub_w vr6, vr24, vr20, vr21, vr11, vr25
+ vssrarni.h.w vr5, vr8, 12 // t25a
+ vssrarni.h.w vr25, vr11, 12 // t22a
+
+ vldrepl.w vr20, t0, 120 // 4052
+ vldrepl.w vr21, t0, 124 // 601
+ vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6
+ vmul_vmsub_w vr29, vr1, vr20, vr21, vr11, vr24
+ vssrarni.h.w vr6, vr8, 12 // t24a
+ vssrarni.h.w vr24, vr11, 12 // t23a
+
+ vsadd.h vr1, vr10, vr30 // t16
+ vssub.h vr29, vr10, vr30 // t17
+ vssub.h vr8, vr26, vr19 // t18
+ vsadd.h vr31, vr26, vr19 // t19
+ vsadd.h vr10, vr27, vr28 // t20
+ vssub.h vr30, vr27, vr28 // t21
+ vssub.h vr19, vr24, vr25 // t22
+ vsadd.h vr26, vr24, vr25 // t23
+ vsadd.h vr27, vr6, vr5 // t24
+ vssub.h vr28, vr6, vr5 // t25
+ vssub.h vr24, vr3, vr2 // t26
+ vsadd.h vr25, vr3, vr2 // t27
+ vsadd.h vr5, vr4, vr7 // t28
+ vssub.h vr6, vr4, vr7 // t29
+ vssub.h vr2, vr9, vr0 // t30
+ vsadd.h vr3, vr9, vr0 // t31
+
+ vldrepl.w vr20, t0, 16 // 799
+ vldrepl.w vr21, t0, 20 // 4017
+ vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7
+ vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0
+ vssrarni.h.w vr7, vr4, 12 // t30a
+ vssrarni.h.w vr0, vr11, 12 // t17a
+ vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9
+ vneg.w vr4, vr4
+ vneg.w vr9, vr9
+ vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2
+ vssrarni.h.w vr9, vr4, 12 // t18a
+ vssrarni.h.w vr2, vr11, 12 // t29a
+
+ vldrepl.w vr20, t0, 24 // 3406 -> 1703
+ vldrepl.w vr21, t0, 28 // 2276 -> 1138
+ vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29
+ vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6
+ vssrarni.h.w vr29, vr4, 12 // t26a
+ vssrarni.h.w vr6, vr11, 12 // t21a
+
+ vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8
+ vneg.w vr4, vr4
+ vneg.w vr8, vr8
+ vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24
+ vssrarni.h.w vr8, vr4, 12 // t22a
+ vssrarni.h.w vr24, vr11, 12 // t25a
+
+ vsadd.h vr4, vr1, vr31 // t16a
+ vssub.h vr30, vr1, vr31 // t19a
+ vsadd.h vr19, vr0, vr9 // t17
+ vssub.h vr28, vr0, vr9 // t18
+ vssub.h vr1, vr26, vr10 // t20a
+ vsadd.h vr31, vr26, vr10 // t23a
+ vssub.h vr0, vr8, vr6 // t21
+ vsadd.h vr9, vr8, vr6 // t22
+ vsadd.h vr10, vr27, vr25 // t24a
+ vssub.h vr26, vr27, vr25 // t27a
+ vsadd.h vr6, vr24, vr29 // t25
+ vssub.h vr8, vr24, vr29 // t26
+ vssub.h vr25, vr3, vr5 // t28a
+ vsadd.h vr27, vr3, vr5 // t31a
+ vssub.h vr24, vr7, vr2 // t29
+ vsadd.h vr29, vr7, vr2 // t30
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
+ vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2
+ vssrarni.h.w vr5, vr3, 12 // t29a
+ vssrarni.h.w vr2, vr11, 12 // t18a
+
+ vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
+ vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24
+ vssrarni.h.w vr7, vr3, 12 // t28
+ vssrarni.h.w vr24, vr11, 12 // t19
+
+ vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
+ vneg.w vr3, vr3
+ vneg.w vr28, vr28
+ vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25
+ vssrarni.h.w vr28, vr3, 12 // t20
+ vssrarni.h.w vr25, vr11, 12 // t27
+
+ vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
+ vneg.w vr3, vr3
+ vneg.w vr30, vr30
+ vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1
+ vssrarni.h.w vr30, vr3, 12 // t21a
+ vssrarni.h.w vr1, vr11, 12 // t26a
+
+ vsadd.h vr3, vr4, vr31 // t16
+ vssub.h vr26, vr4, vr31 // t23
+ vsadd.h vr0, vr19, vr9 // t17a
+ vssub.h vr8, vr19, vr9 // t22a
+ vsadd.h vr4, vr2, vr30 // t18
+ vssub.h vr31, vr2, vr30 // t21
+ vsadd.h vr9, vr24, vr28 // t19a
+ vssub.h vr19, vr24, vr28 // t20a
+ vssub.h vr2, vr27, vr10 // t24
+ vsadd.h vr30, vr27, vr10 // t31
+ vssub.h vr24, vr29, vr6 // t25a
+ vsadd.h vr28, vr29, vr6 // t30a
+ vssub.h vr10, vr5, vr1 // t26
+ vsadd.h vr27, vr5, vr1 // t29
+ vssub.h vr6, vr7, vr25 // t27a
+ vsadd.h vr29, vr7, vr25 // t28a
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
+ vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7
+ vssrarni.h.w vr5, vr1, 12 // t20
+ vssrarni.h.w vr7, vr11, 12 // t27
+
+ vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
+ vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6
+ vssrarni.h.w vr25, vr1, 12 // t21a
+ vssrarni.h.w vr6, vr11, 12 // t26a
+
+ vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
+ vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10
+ vssrarni.h.w vr19, vr1, 12 // t22
+ vssrarni.h.w vr10, vr11, 12 // t25
+
+ vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
+ vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8
+ vssrarni.h.w vr31, vr1, 12 // t23a
+ vssrarni.h.w vr8, vr11, 12 // t24a
+
+ // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
+ // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3
+
+ vld_x8 \in2, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vsadd.h vr1, vr11, vr30 // c[0]
+ vssub.h vr2, vr11, vr30 // c[31]
+ vsadd.h vr24, vr12, vr28 // c[1]
+ vssub.h vr26, vr12, vr28 // c[30]
+ vsadd.h vr11, vr13, vr27 // c[2]
+ vssub.h vr30, vr13, vr27 // c[29]
+ vsadd.h vr12, vr14, vr29 // c[3]
+ vssub.h vr28, vr14, vr29 // c[28]
+ vsadd.h vr13, vr15, vr7 // c[4]
+ vssub.h vr27, vr15, vr7 // c[27]
+ vsadd.h vr14, vr16, vr6 // c[5]
+ vssub.h vr29, vr16, vr6 // c[26]
+ vsadd.h vr7, vr17, vr10 // c[6]
+ vssub.h vr15, vr17, vr10 // c[25]
+ vsadd.h vr6, vr18, vr8 // c[7]
+ vssub.h vr16, vr18, vr8 // c[24]
+
+.ifnb \transpose8x8
+ LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
+ vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
+ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
+.endif
+
+.ifnb \shift
+.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
+ vsrari.h \i, \i, \shift
+.endr
+.endif
+
+ vst_x8 \in1, \vst_start0, 64, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
+
+.ifnb \transpose8x8
+ LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
+ vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
+ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
+.endif
+
+.ifnb \shift
+.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+ vsrari.h \i, \i, \shift
+.endr
+.endif
+
+ vst_x8 \in1, \vst_start3, 64, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+
+ vld_x8 \in2, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vsadd.h vr1, vr11, vr31 // c[8]
+ vssub.h vr2, vr11, vr31 // c[23]
+ vsadd.h vr24, vr12, vr19 // c[9]
+ vssub.h vr26, vr12, vr19 // c[22]
+ vsadd.h vr11, vr13, vr25 // c[10]
+ vssub.h vr30, vr13, vr25 // c[21]
+ vsadd.h vr12, vr14, vr5 // c[11]
+ vssub.h vr28, vr14, vr5 // c[20]
+ vsadd.h vr13, vr15, vr9 // c[12]
+ vssub.h vr27, vr15, vr9 // c[19]
+ vsadd.h vr14, vr16, vr4 // c[13]
+ vssub.h vr29, vr16, vr4 // c[18]
+ vsadd.h vr7, vr17, vr0 // c[14]
+ vssub.h vr15, vr17, vr0 // c[17]
+ vsadd.h vr6, vr18, vr3 // c[15]
+ vssub.h vr16, vr18, vr3 // c[16]
+
+.ifnb \transpose8x8
+ LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
+ vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
+ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
+.endif
+
+.ifnb \shift
+.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
+ vsrari.h \i, \i, \shift
+.endr
+.endif
+
+ vst_x8 \in1, \vst_start1, 64, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
+
+.ifnb \transpose8x8
+ LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
+ vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
+ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
+.endif
+
+.ifnb \shift
+.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+ vsrari.h \i, \i, \shift
+.endr
+.endif
+
+ vst_x8 \in1, \vst_start2, 64, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+.endm
+
+function inv_txfm_add_dct_dct_32x32_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_32x32
+
+ ld.h t2, a2, 0 // dc
+ vldi vr0, 0x8b5 // 181
+ vreplgr2vr.w vr1, t2
+ vldi vr20, 0x880 // 128
+ vmul.w vr2, vr0, vr1 // dc * 181
+ st.h zero, a2, 0
+ add.d t0, a0, a1
+ vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
+ vld vr3, t0, 16
+ vsrari.w vr2, vr2, 2 // (dc + rnd) >> shift
+ vld vr1, a0, 16
+ vmadd.w vr20, vr2, vr0
+ vld vr2, t0, 0
+ vssrarni.h.w vr20, vr20, 12
+ vld vr0, a0, 0
+
+ vsllwil.hu.bu vr4, vr0, 0
+ vsllwil.hu.bu vr5, vr1, 0
+ vsllwil.hu.bu vr6, vr2, 0
+ vsllwil.hu.bu vr7, vr3, 0
+ vexth.hu.bu vr0, vr0
+ vexth.hu.bu vr1, vr1
+ vexth.hu.bu vr2, vr2
+ vexth.hu.bu vr3, vr3
+ vadd.h vr8, vr4, vr20
+ vadd.h vr9, vr0, vr20
+ vadd.h vr10, vr5, vr20
+ vadd.h vr11, vr1, vr20
+ vadd.h vr12, vr6, vr20
+ vadd.h vr13, vr2, vr20
+ vadd.h vr14, vr7, vr20
+ vadd.h vr15, vr3, vr20
+ vssrani.bu.h vr9, vr8, 0
+ vssrani.bu.h vr11, vr10, 0
+ vssrani.bu.h vr13, vr12, 0
+ vssrani.bu.h vr15, vr14, 0
+ vst vr9, a0, 0
+ vst vr11, a0, 16
+ vst vr13, t0, 0
+ vst vr15, t0, 16
+
+.rept 15
+ alsl.d a0, a1, a0, 1
+ add.d t0, a0, a1
+
+ vld vr0, a0, 0
+ vld vr1, a0, 16
+ vld vr2, t0, 0
+ vld vr3, t0, 16
+ vsllwil.hu.bu vr4, vr0, 0
+ vsllwil.hu.bu vr5, vr1, 0
+ vsllwil.hu.bu vr6, vr2, 0
+ vsllwil.hu.bu vr7, vr3, 0
+ vexth.hu.bu vr0, vr0
+ vexth.hu.bu vr1, vr1
+ vexth.hu.bu vr2, vr2
+ vexth.hu.bu vr3, vr3
+ vadd.h vr8, vr4, vr20
+ vadd.h vr9, vr0, vr20
+ vadd.h vr10, vr5, vr20
+ vadd.h vr11, vr1, vr20
+ vadd.h vr12, vr6, vr20
+ vadd.h vr13, vr2, vr20
+ vadd.h vr14, vr7, vr20
+ vadd.h vr15, vr3, vr20
+ vssrani.bu.h vr9, vr8, 0
+ vssrani.bu.h vr11, vr10, 0
+ vssrani.bu.h vr13, vr12, 0
+ vssrani.bu.h vr15, vr14, 0
+ vst vr9, a0, 0
+ vst vr11, a0, 16
+ vst vr13, t0, 0
+ vst vr15, t0, 16
+.endr
+
+ b .DCT_DCT_32X32_END
+.NO_HAS_DCONLY_32x32:
+
+ malloc_space 2560 // 32*32*2+512
+
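+ // Full 32x32 path: each 8-column strip is run through the 32-point DCT
+ // twice -- first with an 8x8 transpose and a rounding shift of 2 into the
+ // 32x32 stack buffer, then again with a shift of 4 -- with the even half
+ // computed by dct_8x16_core_lsx (staged at t3) and the odd half by
+ // dct_8x32_core_lsx, before the result is added to the destination.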
+ addi.d t1, sp, 64
+ addi.d t2, a2, 0
+ addi.d t3, sp, 1024
+ addi.d t3, t3, 1024
+ addi.d t3, t3, 64
+
+ vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x32_core_lsx t1, t3, 0, 16, 32, 48, transpose8x8, 2
+
+.rept 3
+ addi.d t2, t2, 16
+ addi.d t1, t1, 512
+
+ vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x32_core_lsx t1, t3, 0, 16, 32, 48, transpose8x8, 2
+.endr
+
+ vreplgr2vr.h vr31, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, 1040, 1056, 1072, 1088, 1104, 1120, 1136, 1152, 1168, 1184, 1200, 1216, 1232, 1248, 1264, 1280, 1296, 1312, 1328, 1344, 1360, 1376, 1392, 1408, 1424, 1440, 1456, 1472, 1488, 1504, 1520, 1536, 1552, 1568, 1584, 1600, 1616, 1632, 1648, 1664, 1680, 1696, 1712, 1728, 1744, 1760, 1776, 1792, 1808, 1824, 1840, 1856, 1872, 1888, 1904, 1920, 1936, 1952, 1968, 1984, 2000, 2016, 2032
+ vst vr31, a2, \i
+.endr
+
+ addi.d t2, sp, 64
+ addi.d t1, sp, 64
+
+ vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x32_core_lsx t1, t3, 0, 512, 1024, 1536, , 4
+
+.rept 3
+ addi.d t2, t2, 16
+ addi.d t1, t1, 16
+
+ vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x16_core_lsx
+
+ vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
+
+ dct_8x32_core_lsx t1, t3, 0, 512, 1024, 1536, , 4
+.endr
+
+ addi.d t2, sp, 64
+
+.rept 16
+ add.d t0, a0, a1
+ vld vr0, a0, 0
+ vld vr1, a0, 16
+ vld vr2, t0, 0
+ vld vr3, t0, 16
+ vsllwil.hu.bu vr4, vr0, 0
+ vsllwil.hu.bu vr5, vr1, 0
+ vsllwil.hu.bu vr6, vr2, 0
+ vsllwil.hu.bu vr7, vr3, 0
+ vexth.hu.bu vr0, vr0
+ vexth.hu.bu vr1, vr1
+ vexth.hu.bu vr2, vr2
+ vexth.hu.bu vr3, vr3
+ vld_x8 t2, 0, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+ vadd.h vr8, vr4, vr8
+ vadd.h vr9, vr0, vr9
+ vadd.h vr10, vr5, vr10
+ vadd.h vr11, vr1, vr11
+ vadd.h vr12, vr6, vr12
+ vadd.h vr13, vr2, vr13
+ vadd.h vr14, vr7, vr14
+ vadd.h vr15, vr3, vr15
+ vssrani.bu.h vr9, vr8, 0
+ vssrani.bu.h vr11, vr10, 0
+ vssrani.bu.h vr13, vr12, 0
+ vssrani.bu.h vr15, vr14, 0
+ vst vr9, a0, 0
+ vst vr11, a0, 16
+ vst vr13, t0, 0
+ vst vr15, t0, 16
+
+ alsl.d a0, a1, a0, 1
+ addi.d t2, t2, 128
+.endr
+
+ free_space 2560 // 32*32*2+512
+
+.DCT_DCT_32X32_END:
+endfunc
+
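+// tx64 helpers: the 64-point transforms only carry non-zero coefficients in
+// the low 32 positions of each dimension, so these cores treat the
+// upper-half inputs as zero and replace the affected butterflies with
+// single multiplies.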
+.macro dct_8x8_tx64_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7
+
+ // in0 in1 in2 in3
+ // dct4 in0 in2
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vsllwil.w.h vr22, \in2, 0
+ vexth.w.h vr23, \in2
+ vmul.w vr8, vr22, vr20
+ vmul.w vr10, vr23, vr20
+ vmul.w \in2, vr22, vr21
+ vmul.w vr9, vr23, vr21
+ vssrarni.h.w vr10, vr8, 12 // t2
+ vssrarni.h.w vr9, \in2, 12 // t3
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vsllwil.w.h vr22, \in0, 0
+ vexth.w.h vr23, \in0
+ vmul.w vr8, vr22, vr20
+ vmul.w \in2, vr23, vr20
+ vssrarni.h.w \in2, vr8, 12
+
+ vsadd.h vr8, \in2, vr9 // c[0]
+ vssub.h vr9, \in2, vr9 // c[3]
+ vsadd.h \in0, \in2, vr10 // c[1]
+ vssub.h vr10, \in2, vr10 // c[2]
+
+ // inv_dct8_1d_internal_c tx64
+ // in1 in3
+ vldrepl.w vr20, t0, 16 // 799
+ vldrepl.w vr21, t0, 20 // 4017
+
+ vsllwil.w.h vr22, \in1, 0
+ vexth.w.h vr23, \in1
+ vmul.w \in2, vr22, vr21
+ vmul.w \in4, vr23, vr21
+ vmul.w \in1, vr22, vr20
+ vmul.w \in6, vr23, vr20
+ vssrarni.h.w \in4, \in2, 12 // t7a
+ vssrarni.h.w \in6, \in1, 12 // t4a
+
+ vldrepl.w vr20, t0, 24 // 3406
+ vldrepl.w vr21, t0, 28 // 2276
+
+ vsllwil.w.h vr22, \in3, 0
+ vexth.w.h vr23, \in3
+ vneg.w vr21, vr21
+ vmul.w \in2, vr22, vr20
+ vmul.w \in1, vr23, vr20
+ vmul.w \in3, vr22, vr21
+ vmul.w \in7, vr23, vr21
+ vssrarni.h.w \in1, \in2, 12 // t6a
+ vssrarni.h.w \in7, \in3, 12 // t5a
+
+ vsadd.h \in3, \in6, \in7 // t4
+ vssub.h \in6, \in6, \in7 // t5a
+ vsadd.h \in5, \in4, \in1 // t7
+ vssub.h \in4, \in4, \in1 // t6a
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmadd_w \in4, \in6, vr20, vr20, vr21, \in1
+ vmul_vmsub_w \in4, \in6, vr20, vr20, \in2, \in7
+ vssrarni.h.w \in1, vr21, 12 // t6
+ vssrarni.h.w \in7, \in2, 12 // t5
+
+ vsadd.h \out0, vr8, \in5 // c[0]
+ vssub.h \out7, vr8, \in5 // c[7]
+ vsadd.h \out1, \in0, \in1 // c[1]
+ vssub.h \out6, \in0, \in1 // c[6]
+ vsadd.h \out2, vr10, \in7 // c[2]
+ vssub.h \out5, vr10, \in7 // c[5]
+ vsadd.h \out3, vr9, \in3 // c[3]
+ vssub.h \out4, vr9, \in3 // c[4]
+.endm
+
+.macro dct_8x16_tx64_core_lsx
+ dct_8x8_tx64_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, vr11, \
+ vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ // in1 in3 in5 in7 in9 in11 in13 in15
+ // vr1 vr3 vr5 vr7 vr24 vr26 vr28 vr30
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 32 // 401
+ vldrepl.w vr21, t0, 36 // 4076
+ vsllwil.w.h vr22, vr1, 0
+ vexth.w.h vr23, vr1
+ vmul.w vr0, vr22, vr21
+ vmul.w vr10, vr23, vr21
+ vmul.w vr1, vr22, vr20
+ vmul.w vr29, vr23, vr20
+ vssrarni.h.w vr10, vr0, 12 // t15a
+ vssrarni.h.w vr29, vr1, 12 // t8a
+
+ vldrepl.w vr20, t0, 40 // 3166 -> 1583
+ vldrepl.w vr21, t0, 44 // 2598 -> 1299
+ vsllwil.w.h vr22, vr7, 0
+ vexth.w.h vr23, vr7
+ vneg.w vr21, vr21
+ vmul.w vr0, vr22, vr20
+ vmul.w vr30, vr23, vr20
+ vmul.w vr7, vr22, vr21
+ vmul.w vr31, vr23, vr21
+ vssrarni.h.w vr30, vr0, 12 // t14a
+ vssrarni.h.w vr31, vr7, 12 // t9a
+
+ vldrepl.w vr20, t0, 48 // 1931
+ vldrepl.w vr21, t0, 52 // 3612
+ vsllwil.w.h vr22, vr5, 0
+ vexth.w.h vr23, vr5
+ vmul.w vr0, vr22, vr21
+ vmul.w vr24, vr23, vr21
+ vmul.w vr5, vr22, vr20
+ vmul.w vr25, vr23, vr20
+ vssrarni.h.w vr24, vr0, 12 // t13a
+ vssrarni.h.w vr25, vr5, 12 // t10a
+
+ vldrepl.w vr20, t0, 56 // 3920
+ vldrepl.w vr21, t0, 60 // 1189
+ vsllwil.w.h vr22, vr3, 0
+ vexth.w.h vr23, vr3
+ vneg.w vr21, vr21
+ vmul.w vr0, vr22, vr20
+ vmul.w vr26, vr23, vr20
+ vmul.w vr3, vr22, vr21
+ vmul.w vr27, vr23, vr21
+ vssrarni.h.w vr26, vr0, 12 // t12a
+ vssrarni.h.w vr27, vr3, 12 // t11a
+
+ // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27
+ vsadd.h vr28, vr29, vr31 // t8
+ vssub.h vr19, vr29, vr31 // t9
+ vssub.h vr29, vr27, vr25 // t10
+ vsadd.h vr9, vr27, vr25 // t11
+ vsadd.h vr31, vr26, vr24 // t12
+ vssub.h vr25, vr26, vr24 // t13
+ vssub.h vr27, vr10, vr30 // t14
+ vsadd.h vr24, vr10, vr30 // t15
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26
+ vmul_vmsub_w vr27, vr19, vr20, vr21, vr1, vr30
+ vssrarni.h.w vr26, vr0, 12 // t14a
+ vssrarni.h.w vr30, vr1, 12 // t9a
+
+ vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19
+ vneg.w vr0, vr0
+ vneg.w vr19, vr19
+ vmul_vmsub_w vr25, vr29, vr20, vr21, vr1, vr27
+ vssrarni.h.w vr19, vr0, 12 // t10a
+ vssrarni.h.w vr27, vr1, 12 // t13a
+
+ vsadd.h vr25, vr28, vr9 // t8a
+ vssub.h vr29, vr28, vr9 // t11a
+ vssub.h vr28, vr24, vr31 // t12a
+ vsadd.h vr10, vr24, vr31 // t15a
+ vsadd.h vr9, vr30, vr19 // t9
+ vssub.h vr31, vr30, vr19 // t10
+ vssub.h vr30, vr26, vr27 // t13
+ vsadd.h vr24, vr26, vr27 // t14
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26
+ vmul_vmsub_w vr30, vr31, vr20, vr20, vr1, vr27
+ vssrarni.h.w vr26, vr0, 12 // t13a
+ vssrarni.h.w vr27, vr1, 12 // t10a
+
+ vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31
+ vmul_vmsub_w vr28, vr29, vr20, vr20, vr1, vr30
+ vssrarni.h.w vr31, vr0, 12 // t12
+ vssrarni.h.w vr30, vr1, 12 // t11
+
+ // vr11 vr12 ... vr18
+ vsadd.h vr28, vr14, vr31 // c[3]
+ vssub.h vr29, vr14, vr31 // c[12]
+ vsadd.h vr20, vr15, vr30 // c[4]
+ vssub.h vr21, vr15, vr30 // c[11]
+ vsadd.h vr14, vr16, vr27 // c[5]
+ vssub.h vr23, vr16, vr27 // c[10]
+ vsadd.h vr15, vr17, vr9 // c[6]
+ vssub.h vr30, vr17, vr9 // c[9]
+ vsadd.h vr16, vr18, vr25 // c[7]
+ vssub.h vr27, vr18, vr25 // c[8]
+ vsadd.h vr17, vr13, vr26 // c[2]
+ vssub.h vr26, vr13, vr26 // c[13]
+ vsadd.h vr18, vr12, vr24 // c[1]
+ vssub.h vr25, vr12, vr24 // c[14]
+ vsadd.h vr22, vr11, vr10 // c[0]
+ vssub.h vr24, vr11, vr10 // c[15]
+.endm // dct_8x16_tx64_core_lsx
+
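+// Widen \in0 to 32 bits, multiply its low/high halves by the scalar vectors
+// \in1 and \in2, and narrow back with a rounding shift of 12 into
+// \out0 (\in0 * \in1) and \out1 (\in0 * \in2).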
+.macro vmul_vssrarni_hw in0, in1, in2, tmp0, tmp1, out0, out1
+ vsllwil.w.h vr22, \in0, 0
+ vexth.w.h vr23, \in0
+ vmul.w \tmp0, vr22, \in1
+ vmul.w \out0, vr23, \in1
+ vmul.w \tmp1, vr22, \in2
+ vmul.w \out1, vr23, \in2
+ vssrarni.h.w \out0, \tmp0, 12
+ vssrarni.h.w \out1, \tmp1, 12
+.endm
+
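+// Extra cosine constants for the 64-point DCT; dct64_step1_lsx reads one
+// 12-word group (offsets 0..44) through t0.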
+const idct64_coeffs, align=4
+ .word 101, 4095, 2967, -2824
+ .word 1660, 3745, 3822, -1474
+ .word 4076, 401, 4017, 799
+
+ .word 4036, -700, 2359, 3349
+ .word 3461, -2191, 897, 3996
+ .word -3166, -2598, -799, -4017
+
+ .word 501, 4065, 3229, -2520
+ .word 2019, 3564, 3948, -1092
+ .word 3612, 1931, 2276, 3406
+
+ .word 4085, -301, 2675, 3102
+ .word 3659, -1842, 1285, 3889
+ .word -3920, -1189, -3406, -2276
+endconst
+
+// in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+// in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+// in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+// in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+.macro dct64_step1_lsx
+
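+ // First butterfly stage of the 64-point odd half: vr0-vr3 hold four input
+ // rows, t0 points at the current idct64_coeffs group, and the eight
+ // resulting t32..t63 intermediates are stored to t6.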
+ vldrepl.w vr20, t0, 0 // 101
+ vldrepl.w vr21, t0, 4 // 4095
+ vmul_vssrarni_hw vr0, vr20, vr21, vr16, vr0, vr8, vr9 // vr8 t32a vr9 t63a
+
+ vldrepl.w vr20, t0, 8 // 2967
+ vldrepl.w vr21, t0, 12 // -2824
+ vmul_vssrarni_hw vr1, vr20, vr21, vr16, vr1, vr10, vr11 // vr10 t62a vr11 t33a
+
+ vldrepl.w vr20, t0, 16 // 1660
+ vldrepl.w vr21, t0, 20 // 3745
+ vmul_vssrarni_hw vr2, vr20, vr21, vr16, vr2, vr12, vr13 // vr12 t34a vr13 t61a
+
+ vldrepl.w vr20, t0, 24 // 3822
+ vldrepl.w vr21, t0, 28 // -1474
+ vmul_vssrarni_hw vr3, vr20, vr21, vr16, vr3, vr14, vr15 // vr14 t60a vr15 t35a
+
+ vsadd.h vr0, vr8, vr11 // t32
+ vssub.h vr1, vr8, vr11 // t33
+ vssub.h vr2, vr15, vr12 // t34
+ vsadd.h vr3, vr15, vr12 // t35
+ vsadd.h vr4, vr14, vr13 // t60
+ vssub.h vr5, vr14, vr13 // t61
+ vssub.h vr6, vr9, vr10 // t62
+ vsadd.h vr7, vr9, vr10 // t63
+
+ vldrepl.w vr20, t0, 32 // 4076
+ vldrepl.w vr21, t0, 36 // 401
+ vmul_vmadd_w vr6, vr1, vr20, vr21, vr9, vr10
+ vmul_vmsub_w vr6, vr1, vr21, vr20, vr13, vr11
+ vssrarni.h.w vr10, vr9, 12 // t62a
+ vssrarni.h.w vr11, vr13, 12 // t33a
+
+ vmul_vmadd_w vr5, vr2, vr20, vr21, vr9, vr1
+ vmul_vmsub_w vr5, vr2, vr21, vr20, vr13, vr6
+ vneg.w vr9, vr9
+ vneg.w vr1, vr1
+ vssrarni.h.w vr6, vr13, 12 // t61a
+ vssrarni.h.w vr1, vr9, 12 // t34a
+
+ vsadd.h vr2, vr0, vr3 // t32a
+ vssub.h vr5, vr0, vr3 // t35a
+ vsadd.h vr9, vr11, vr1 // t33
+ vssub.h vr13, vr11, vr1 // t34
+ vssub.h vr0, vr7, vr4 // t60a
+ vsadd.h vr3, vr7, vr4 // t63a
+ vssub.h vr1, vr10, vr6 // t61
+ vsadd.h vr11, vr10, vr6 // t62
+
+ vldrepl.w vr20, t0, 40 // 4017
+ vldrepl.w vr21, t0, 44 // 799
+
+ vmul_vmadd_w vr1, vr13, vr20, vr21, vr8, vr4
+ vmul_vmsub_w vr1, vr13, vr21, vr20, vr12, vr7
+ vssrarni.h.w vr4, vr8, 12 // t61a
+ vssrarni.h.w vr7, vr12, 12 // t34a
+
+ vmul_vmadd_w vr0, vr5, vr20, vr21, vr8, vr6
+ vmul_vmsub_w vr0, vr5, vr21, vr20, vr12, vr10
+ vssrarni.h.w vr6, vr8, 12 // t60
+ vssrarni.h.w vr10, vr12, 12 // t35
+
+ vst_x8 t6, 0, 16, vr2, vr9, vr7, vr10, vr6, vr4, vr11, vr3
+.endm // dct64_step1
+
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+.macro dct64_step2_lsx
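+ // Second stage: combine the partial t32..t63 values held at t5 and t4
+ // (spaced 16*8 bytes apart) and write the results back to the same slots.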
+ vld vr0, t5, 0 // t32a
+ vld vr2, t4, 0 // t63a
+ vld vr3, t5, 16*8 // t56a
+ vld vr1, t4, 16*8 // t39a
+ vld vr4, t5, 16*16 // t40a
+ vld vr6, t4, 16*16 // t55a
+ vld vr7, t5, 16*24 // t48a
+ vld vr5, t4, 16*24 // t47a
+
+ vsadd.h vr8, vr0, vr1 // t32
+ vssub.h vr9, vr0, vr1 // t39
+ vsadd.h vr10, vr2, vr3 // t63
+ vssub.h vr11, vr2, vr3 // t56
+ vssub.h vr12, vr5, vr4 // t40
+ vsadd.h vr13, vr5, vr4 // t47
+ vsadd.h vr14, vr7, vr6 // t48
+ vssub.h vr15, vr7, vr6 // t55
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vmul_vmadd_w vr11, vr9, vr21, vr20, vr0, vr2
+ vmul_vmsub_w vr11, vr9, vr20, vr21, vr1, vr3
+ vssrarni.h.w vr2, vr0, 12 // t56a
+ vssrarni.h.w vr3, vr1, 12 // t39a
+
+ vmul_vmadd_w vr15, vr12, vr21, vr20, vr0, vr4
+ vmul_vmsub_w vr15, vr12, vr20, vr21, vr1, vr5
+ vneg.w vr0, vr0
+ vneg.w vr4, vr4
+ vssrarni.h.w vr5, vr1, 12 // t55a
+ vssrarni.h.w vr4, vr0, 12 // t40a
+
+ vsadd.h vr9, vr8, vr13 // t32a
+ vssub.h vr11, vr8, vr13 // t47a
+ vsadd.h vr6, vr3, vr4 // t39
+ vssub.h vr7, vr3, vr4 // t40
+ vssub.h vr12, vr10, vr14 // t48a
+ vsadd.h vr15, vr10, vr14 // t63a
+ vssub.h vr0, vr2, vr5 // t55
+ vsadd.h vr1, vr2, vr5 // t56
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmsub_w vr0, vr7, vr20, vr20, vr8, vr13
+ vmul_vmadd_w vr0, vr7, vr20, vr20, vr3, vr4
+ vssrarni.h.w vr13, vr8, 12 // t40a
+ vssrarni.h.w vr4, vr3, 12 // t55a
+ vmul_vmsub_w vr12, vr11, vr20, vr20, vr8, vr10
+ vmul_vmadd_w vr12, vr11, vr20, vr20, vr3, vr14
+ vssrarni.h.w vr10, vr8, 12 // t47
+ vssrarni.h.w vr14, vr3, 12 // t48
+
+ // t32a t39 t40a t47 t48 t55a t56 t63a
+ // vr9 vr6 vr13 vr10 vr14 vr4 vr1 vr15
+ vst vr9, t5, 0 // t32a
+ vst vr6, t4, 0 // t39
+ vst vr13, t5, 16*8 // t40a
+ vst vr10, t4, 16*8 // t47
+ vst vr14, t5, 16*16 // t48
+ vst vr4, t4, 16*16 // t55a
+ vst vr1, t5, 16*24 // t56
+ vst vr15, t4, 16*24 // t63a
+.endm // dct64_step2_lsx
+
+.macro dct64_step3_lsx
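+ // Combine t0..t7 (loaded from t3) with t56..t63 (loaded from t5/t4) into
+ // output rows c[0..7] and c[56..63].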
+ // t0 t1 t2 t3 t4 t5 t6 t7
+ vld_x8 t3, 0, 16, vr2, vr3, vr7, vr8, vr11, vr12, vr16, vr17
+
+ vld vr9, t5, 16*24 // t56
+ vld vr6, t5, 16*24+16 // t57a
+ vld vr13, t5, 16*24+32 // t58
+ vld vr10, t5, 16*24+48 // t59a
+ vld vr14, t4, 16*24-48 // t60
+ vld vr4, t4, 16*24-32 // t61a
+ vld vr1, t4, 16*24-16 // t62
+ vld vr15, t4, 16*24 // t63a
+
+ vsadd.h vr20, vr2, vr15 // c[0]
+ vssub.h vr21, vr2, vr15 // c[63]
+ vsadd.h vr22, vr3, vr1 // c[1]
+ vssub.h vr23, vr3, vr1 // c[62]
+ vsadd.h vr24, vr7, vr4 // c[2]
+ vssub.h vr25, vr7, vr4 // c[61]
+ vsadd.h vr26, vr8, vr14 // c[3]
+ vssub.h vr27, vr8, vr14 // c[60]
+
+ vsadd.h vr28, vr11, vr10 // c[4]
+ vssub.h vr29, vr11, vr10 // c[59]
+ vsadd.h vr30, vr12, vr13 // c[5]
+ vssub.h vr31, vr12, vr13 // c[58]
+ vsadd.h vr2, vr16, vr6 // c[6]
+ vssub.h vr15, vr16, vr6 // c[57]
+ vsadd.h vr1, vr17, vr9 // c[7]
+ vssub.h vr3, vr17, vr9 // c[56]
+.endm // dct64_step3_lsx
+
+.macro dct64_step4_lsx transpose8x8, shift, start0, stride0, start1, stride1
+
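+ // Run dct64_step3_lsx, optionally transpose and round the two 8-row
+ // groups, then store them to t7 at \start0/\stride0 and \start1/\stride1.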
+ dct64_step3_lsx
+
+.ifnb \transpose8x8
+ LSX_TRANSPOSE8x8_H vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
+ vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
+ vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13
+
+ LSX_TRANSPOSE8x8_H vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \
+ vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \
+ vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13
+.endif
+
+.ifnb \shift
+.irp i, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
+ vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
+ vsrari.h \i, \i, \shift
+.endr
+.endif
+
+ vst_x8 t7, \start0, \stride0, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
+
+ vst_x8 t7, \start1, \stride1, vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
+
+.endm // dct64_step4_lsx
+
+.macro dct64_step5_lsx in0, in1, in2, in3, in4, in5, in6, in7
+
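+ // Final add: round \in0..\in7 by 4, add them to the destination pixels
+ // loaded through t0/t6, saturate to 8 bits and store eight bytes per row
+ // through t1/t2.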
+ fld.d f4, t0, 0
+ fldx.d f5, t0, a1
+ fld.d f6, t6, 0
+ fldx.d f7, t6, a1
+ alsl.d t0, a1, t0, 2
+ alsl.d t6, a1, t6, 2
+ fld.d f8, t0, 0
+ fldx.d f9, t0, a1
+ fld.d f10, t6, 0
+ fldx.d f11, t6, a1
+
+.irp i, vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11
+ vsllwil.hu.bu \i, \i, 0
+.endr
+
+ vsrari.h vr20, \in0, 4
+ vsrari.h vr22, \in1, 4
+ vsrari.h vr24, \in2, 4
+ vsrari.h vr26, \in3, 4
+ vsrari.h vr28, \in4, 4
+ vsrari.h vr30, \in5, 4
+ vsrari.h vr2, \in6, 4
+ vsrari.h vr1, \in7, 4
+
+ vadd.h vr4, vr4, vr20
+ vadd.h vr5, vr5, vr22
+ vadd.h vr6, vr6, vr24
+ vadd.h vr7, vr7, vr26
+ vadd.h vr8, vr8, vr28
+ vadd.h vr9, vr9, vr30
+ vadd.h vr10, vr10, vr2
+ vadd.h vr11, vr11, vr1
+
+ vssrani.bu.h vr5, vr4, 0
+ vssrani.bu.h vr7, vr6, 0
+ vssrani.bu.h vr9, vr8, 0
+ vssrani.bu.h vr11, vr10, 0
+
+ vstelm.d vr5, t1, 0, 0
+ vstelm.d vr5, t2, 0, 1
+
+ alsl.d t1, a1, t1, 1
+ alsl.d t2, a1, t2, 1
+ vstelm.d vr7, t1, 0, 0
+ vstelm.d vr7, t2, 0, 1
+
+ alsl.d t1, a1, t1, 1
+ alsl.d t2, a1, t2, 1
+ vstelm.d vr9, t1, 0, 0
+ vstelm.d vr9, t2, 0, 1
+
+ alsl.d t1, a1, t1, 1
+ alsl.d t2, a1, t2, 1
+ vstelm.d vr11, t1, 0, 0
+ vstelm.d vr11, t2, 0, 1
+.endm // dct64_step5_lsx
+
+.macro dct_8x32_tx64_new_lsx vld_loc0, stride0, vld_loc1, stride1
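+ // 8x32 DCT used by the 64-point transforms: the even half comes from
+ // dct_8x16_tx64_core_lsx on the rows at \vld_loc0 (staged at t3), the odd
+ // half is computed below from the rows at \vld_loc1; upper-half inputs are
+ // taken as zero, so the first-stage rotations are single multiplies.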
+ vld_x8 t2, \vld_loc0, \stride0, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ dct_8x16_tx64_core_lsx
+
+ vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
+ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
+
+ vld_x8 t2, \vld_loc1, \stride1, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ la.local t0, idct_coeffs
+
+ vldrepl.w vr20, t0, 64 // 201
+ vldrepl.w vr21, t0, 68 // 4091
+ vsllwil.w.h vr22, vr0, 0
+ vexth.w.h vr23, vr0
+ vmul.w vr8, vr22, vr21
+ vmul.w vr9, vr23, vr21
+ vmul.w vr0, vr22, vr20
+ vmul.w vr10, vr23, vr20
+ vssrarni.h.w vr9, vr8, 12 // t31a
+ vssrarni.h.w vr10, vr0, 12 // t16a
+
+ vldrepl.w vr20, t0, 72 // 3035
+ vldrepl.w vr21, t0, 76 // 2751
+ vsllwil.w.h vr22, vr7, 0
+ vexth.w.h vr23, vr7
+ vneg.w vr21, vr21
+ vmul.w vr8, vr22, vr20
+ vmul.w vr0, vr23, vr20
+ vmul.w vr7, vr22, vr21
+ vmul.w vr30, vr23, vr21
+ vssrarni.h.w vr0, vr8, 12 // t30a
+ vssrarni.h.w vr30, vr7, 12 // t17a
+
+ vldrepl.w vr20, t0, 80 // 1751
+ vldrepl.w vr21, t0, 84 // 3703
+ vsllwil.w.h vr22, vr4, 0
+ vexth.w.h vr23, vr4
+ vmul.w vr8, vr22, vr21
+ vmul.w vr7, vr23, vr21
+ vmul.w vr4, vr22, vr20
+ vmul.w vr19, vr23, vr20
+ vssrarni.h.w vr7, vr8, 12 // t29a
+ vssrarni.h.w vr19, vr4, 12 // t18a
+
+ vldrepl.w vr20, t0, 88 // 3857
+ vldrepl.w vr21, t0, 92 // 1380
+ vsllwil.w.h vr22, vr3, 0
+ vexth.w.h vr23, vr3
+ vneg.w vr21, vr21
+ vmul.w vr8, vr22, vr20
+ vmul.w vr4, vr23, vr20
+ vmul.w vr3, vr22, vr21
+ vmul.w vr26, vr23, vr21
+ vssrarni.h.w vr4, vr8, 12 // t28a
+ vssrarni.h.w vr26, vr3, 12 // t19a
+
+ vldrepl.w vr20, t0, 96 // 995
+ vldrepl.w vr21, t0, 100 // 3973
+ vsllwil.w.h vr22, vr2, 0
+ vexth.w.h vr23, vr2
+ vmul.w vr8, vr22, vr21
+ vmul.w vr3, vr23, vr21
+ vmul.w vr2, vr22, vr20
+ vmul.w vr27, vr23, vr20
+ vssrarni.h.w vr3, vr8, 12 // t27a
+ vssrarni.h.w vr27, vr2, 12 // t20a
+
+ vldrepl.w vr20, t0, 104 // 3513
+ vldrepl.w vr21, t0, 108 // 2106
+ vsllwil.w.h vr22, vr5, 0
+ vexth.w.h vr23, vr5
+ vneg.w vr21, vr21
+ vmul.w vr8, vr22, vr20
+ vmul.w vr2, vr23, vr20
+ vmul.w vr5, vr22, vr21
+ vmul.w vr28, vr23, vr21
+ vssrarni.h.w vr2, vr8, 12 // t26a
+ vssrarni.h.w vr28, vr5, 12 // t21a
+
+ vldrepl.w vr20, t0, 112 // 2440 -> 1220
+ vldrepl.w vr21, t0, 116 // 3290 -> 1645
+ vsllwil.w.h vr22, vr6, 0
+ vexth.w.h vr23, vr6
+ vmul.w vr8, vr22, vr21
+ vmul.w vr5, vr23, vr21
+ vmul.w vr6, vr22, vr20
+ vmul.w vr25, vr23, vr20
+ vssrarni.h.w vr5, vr8, 12 // t25a
+ vssrarni.h.w vr25, vr6, 12 // t22a
+
+ vldrepl.w vr20, t0, 120 // 4052
+ vldrepl.w vr21, t0, 124 // 601
+ vsllwil.w.h vr22, vr1, 0
+ vexth.w.h vr23, vr1
+ vneg.w vr21, vr21
+ vmul.w vr8, vr22, vr20
+ vmul.w vr6, vr23, vr20
+ vmul.w vr1, vr22, vr21
+ vmul.w vr24, vr23, vr21
+ vssrarni.h.w vr6, vr8, 12 // t24a
+ vssrarni.h.w vr24, vr1, 12 // t23a
+
+ vsadd.h vr1, vr10, vr30 // t16
+ vssub.h vr29, vr10, vr30 // t17
+ vssub.h vr8, vr26, vr19 // t18
+ vsadd.h vr31, vr26, vr19 // t19
+ vsadd.h vr10, vr27, vr28 // t20
+ vssub.h vr30, vr27, vr28 // t21
+ vssub.h vr19, vr24, vr25 // t22
+ vsadd.h vr26, vr24, vr25 // t23
+ vsadd.h vr27, vr6, vr5 // t24
+ vssub.h vr28, vr6, vr5 // t25
+ vssub.h vr24, vr3, vr2 // t26
+ vsadd.h vr25, vr3, vr2 // t27
+ vsadd.h vr5, vr4, vr7 // t28
+ vssub.h vr6, vr4, vr7 // t29
+ vssub.h vr2, vr9, vr0 // t30
+ vsadd.h vr3, vr9, vr0 // t31
+
+ vldrepl.w vr20, t0, 16 // 799
+ vldrepl.w vr21, t0, 20 // 4017
+ vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7
+ vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0
+ vssrarni.h.w vr7, vr4, 12 // t30a
+ vssrarni.h.w vr0, vr11, 12 // t17a
+ vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9
+ vneg.w vr4, vr4
+ vneg.w vr9, vr9
+ vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2
+ vssrarni.h.w vr9, vr4, 12 // t18a
+ vssrarni.h.w vr2, vr11, 12 // t29a
+
+ vldrepl.w vr20, t0, 24 // 3406 -> 1703
+ vldrepl.w vr21, t0, 28 // 2276 -> 1138
+ vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29
+ vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6
+ vssrarni.h.w vr29, vr4, 12 // t26a
+ vssrarni.h.w vr6, vr11, 12 // t21a
+
+ vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8
+ vneg.w vr4, vr4
+ vneg.w vr8, vr8
+ vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24
+ vssrarni.h.w vr8, vr4, 12 // t22a
+ vssrarni.h.w vr24, vr11, 12 // t25a
+
+ vsadd.h vr4, vr1, vr31 // t16a
+ vssub.h vr30, vr1, vr31 // t19a
+ vsadd.h vr19, vr0, vr9 // t17
+ vssub.h vr28, vr0, vr9 // t18
+ vssub.h vr1, vr26, vr10 // t20a
+ vsadd.h vr31, vr26, vr10 // t23a
+ vssub.h vr0, vr8, vr6 // t21
+ vsadd.h vr9, vr8, vr6 // t22
+ vsadd.h vr10, vr27, vr25 // t24a
+ vssub.h vr26, vr27, vr25 // t27a
+ vsadd.h vr6, vr24, vr29 // t25
+ vssub.h vr8, vr24, vr29 // t26
+ vssub.h vr25, vr3, vr5 // t28a
+ vsadd.h vr27, vr3, vr5 // t31a
+ vssub.h vr24, vr7, vr2 // t29
+ vsadd.h vr29, vr7, vr2 // t30
+
+ vldrepl.w vr20, t0, 8 // 1567
+ vldrepl.w vr21, t0, 12 // 3784
+ vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
+ vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2
+ vssrarni.h.w vr5, vr3, 12 // t29a
+ vssrarni.h.w vr2, vr11, 12 // t18a
+
+ vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
+ vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24
+ vssrarni.h.w vr7, vr3, 12 // t28
+ vssrarni.h.w vr24, vr11, 12 // t19
+
+ vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
+ vneg.w vr3, vr3
+ vneg.w vr28, vr28
+ vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25
+ vssrarni.h.w vr28, vr3, 12 // t20
+ vssrarni.h.w vr25, vr11, 12 // t27
+
+ vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
+ vneg.w vr3, vr3
+ vneg.w vr30, vr30
+ vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1
+ vssrarni.h.w vr30, vr3, 12 // t21a
+ vssrarni.h.w vr1, vr11, 12 // t26a
+
+ vsadd.h vr3, vr4, vr31 // t16
+ vssub.h vr26, vr4, vr31 // t23
+ vsadd.h vr0, vr19, vr9 // t17a
+ vssub.h vr8, vr19, vr9 // t22a
+ vsadd.h vr4, vr2, vr30 // t18
+ vssub.h vr31, vr2, vr30 // t21
+ vsadd.h vr9, vr24, vr28 // t19a
+ vssub.h vr19, vr24, vr28 // t20a
+ vssub.h vr2, vr27, vr10 // t24
+ vsadd.h vr30, vr27, vr10 // t31
+ vssub.h vr24, vr29, vr6 // t25a
+ vsadd.h vr28, vr29, vr6 // t30a
+ vssub.h vr10, vr5, vr1 // t26
+ vsadd.h vr27, vr5, vr1 // t29
+ vssub.h vr6, vr7, vr25 // t27a
+ vsadd.h vr29, vr7, vr25 // t28a
+
+ vldrepl.w vr20, t0, 0 // 2896
+ vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
+ vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7
+ vssrarni.h.w vr5, vr1, 12 // t20
+ vssrarni.h.w vr7, vr11, 12 // t27
+
+ vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
+ vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6
+ vssrarni.h.w vr25, vr1, 12 // t21a
+ vssrarni.h.w vr6, vr11, 12 // t26a
+
+ vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
+ vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10
+ vssrarni.h.w vr19, vr1, 12 // t22
+ vssrarni.h.w vr10, vr11, 12 // t25
+
+ vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
+ vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8
+ vssrarni.h.w vr31, vr1, 12 // t23a
+ vssrarni.h.w vr8, vr11, 12 // t24a
+
+ // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
+ // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3
+
+ vld_x8 t3, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vsadd.h vr1, vr11, vr30 // c[0]
+ vssub.h vr2, vr11, vr30 // c[31]
+ vsadd.h vr24, vr12, vr28 // c[1]
+ vssub.h vr26, vr12, vr28 // c[30]
+ vsadd.h vr11, vr13, vr27 // c[2]
+ vssub.h vr30, vr13, vr27 // c[29]
+ vsadd.h vr12, vr14, vr29 // c[3]
+ vssub.h vr28, vr14, vr29 // c[28]
+ vsadd.h vr13, vr15, vr7 // c[4]
+ vssub.h vr27, vr15, vr7 // c[27]
+ vsadd.h vr14, vr16, vr6 // c[5]
+ vssub.h vr29, vr16, vr6 // c[26]
+ vsadd.h vr7, vr17, vr10 // c[6]
+ vssub.h vr15, vr17, vr10 // c[25]
+ vsadd.h vr6, vr18, vr8 // c[7]
+ vssub.h vr16, vr18, vr8 // c[24]
+
+ vst_x8 t3, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
+
+ vst_x8 t3, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+
+ vld_x8 t3, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
+
+ vsadd.h vr1, vr11, vr31 // c[8]
+ vssub.h vr2, vr11, vr31 // c[23]
+ vsadd.h vr24, vr12, vr19 // c[9]
+ vssub.h vr26, vr12, vr19 // c[22]
+ vsadd.h vr11, vr13, vr25 // c[10]
+ vssub.h vr30, vr13, vr25 // c[21]
+ vsadd.h vr12, vr14, vr5 // c[11]
+ vssub.h vr28, vr14, vr5 // c[20]
+ vsadd.h vr13, vr15, vr9 // c[12]
+ vssub.h vr27, vr15, vr9 // c[19]
+ vsadd.h vr14, vr16, vr4 // c[13]
+ vssub.h vr29, vr16, vr4 // c[18]
+ vsadd.h vr7, vr17, vr0 // c[14]
+ vssub.h vr15, vr17, vr0 // c[17]
+ vsadd.h vr6, vr18, vr3 // c[15]
+ vssub.h vr16, vr18, vr3 // c[16]
+
+ vst_x8 t3, 128, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
+
+ vst_x8 t3, 256, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
+.endm // dct_8x32_tx64_new_lsx
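+
+// Hedged overview of dct_8x32_tx64_new_lsx: the first vld_x8 feeds the shared
+// 8x16 even-half core (results parked at t3); the second vld_x8 builds
+// t16a..t31a from idct_coeffs, runs the odd-half butterflies, and the two
+// halves are finally combined into c[0]..c[31], stored back to t3 in the
+// order shown by the vst_x8 calls above.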
+
+function inv_txfm_add_dct_dct_64x64_8bpc_lsx
+ bnez a3, .NO_HAS_DCONLY_64x64
+
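+ // DC-only fast path. The scalar rounding appears to follow the generic
+ // dav1d dconly sequence for 64x64 (sketch inferred from the vldi/vsrari
+ // constants below, so treat it as an approximation):
+ // dc = coeff[0];
+ // dc = (dc * 181 + 128) >> 8; // vmul.w + vsrari.w ..., 8
+ // dc = (dc + 2) >> 2; // vsrari.w ..., 2
+ // dc = (dc * 181 + 128 + 2048) >> 12; // vmadd.w + vssrarni.h.w ..., 12
+ // and dc is then added to every destination pixel with u8 saturation.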
+ ld.h t2, a2, 0
+ vldi vr0, 0x8b5
+ vreplgr2vr.w vr1, t2
+ vldi vr20, 0x880
+ vmul.w vr2, vr0, vr1
+ st.h zero, a2, 0
+ vsrari.w vr2, vr2, 8
+ vld vr3, a0, 48
+ vsrari.w vr2, vr2, 2
+ vld vr1, a0, 16
+ vmadd.w vr20, vr2, vr0
+ vld vr2, a0, 32
+ vssrarni.h.w vr20, vr20, 12
+ vld vr0, a0, 0
+
+ vsllwil.hu.bu vr4, vr0, 0
+ vsllwil.hu.bu vr5, vr1, 0
+ vsllwil.hu.bu vr6, vr2, 0
+ vsllwil.hu.bu vr7, vr3, 0
+ vexth.hu.bu vr0, vr0
+ vexth.hu.bu vr1, vr1
+ vexth.hu.bu vr2, vr2
+ vexth.hu.bu vr3, vr3
+ vadd.h vr8, vr4, vr20
+ vadd.h vr9, vr0, vr20
+ vadd.h vr10, vr5, vr20
+ vadd.h vr11, vr1, vr20
+ vadd.h vr12, vr6, vr20
+ vadd.h vr13, vr2, vr20
+ vadd.h vr14, vr7, vr20
+ vadd.h vr15, vr3, vr20
+ vssrani.bu.h vr9, vr8, 0
+ vssrani.bu.h vr11, vr10, 0
+ vssrani.bu.h vr13, vr12, 0
+ vssrani.bu.h vr15, vr14, 0
+ vst vr9, a0, 0
+ vst vr11, a0, 16
+ vst vr13, a0, 32
+ vst vr15, a0, 48
+
+.rept 63
+ add.d a0, a0, a1
+ vld vr0, a0, 0
+ vld vr1, a0, 16
+ vld vr2, a0, 32
+ vld vr3, a0, 48
+ vsllwil.hu.bu vr4, vr0, 0
+ vsllwil.hu.bu vr5, vr1, 0
+ vsllwil.hu.bu vr6, vr2, 0
+ vsllwil.hu.bu vr7, vr3, 0
+ vexth.hu.bu vr0, vr0
+ vexth.hu.bu vr1, vr1
+ vexth.hu.bu vr2, vr2
+ vexth.hu.bu vr3, vr3
+ vadd.h vr8, vr4, vr20
+ vadd.h vr9, vr0, vr20
+ vadd.h vr10, vr5, vr20
+ vadd.h vr11, vr1, vr20
+ vadd.h vr12, vr6, vr20
+ vadd.h vr13, vr2, vr20
+ vadd.h vr14, vr7, vr20
+ vadd.h vr15, vr3, vr20
+ vssrani.bu.h vr9, vr8, 0
+ vssrani.bu.h vr11, vr10, 0
+ vssrani.bu.h vr13, vr12, 0
+ vssrani.bu.h vr15, vr14, 0
+ vst vr9, a0, 0
+ vst vr11, a0, 16
+ vst vr13, a0, 32
+ vst vr15, a0, 48
+.endr
+ b .DCT_DCT_64X64_END
+.NO_HAS_DCONLY_64x64:
+
+ malloc_space 64*32*2+512+512
+
+ addi.d t7, sp, 64
+
+.macro dct64x64_core1_lsx in0, in1, in2
+ addi.d t2, a2, \in0
+ addi.d t7, t7, \in1
+ li.w t4, 64*32*2+64
+ add.d t3, sp, t4
+ addi.d t6, t3, 512
+ add.d t5, t6, zero
+
+ dct_8x32_tx64_new_lsx 0, 256, 128, 256
+
+ la.local t0, idct64_coeffs
+
+ addi.d t2, a2, \in2 // 32 ...
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ vld vr0, t2, 128*0 // in1
+ vld vr1, t2, 128*15 // in31
+ vld vr2, t2, 128*8 // in17
+ vld vr3, t2, 128*7 // in15
+ dct64_step1_lsx
+
+ addi.d t0, t0, 48
+ addi.d t6, t6, 128
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ vld vr0, t2, 128*3 // in7
+ vld vr1, t2, 128*12 // in25
+ vld vr2, t2, 128*11 // in23
+ vld vr3, t2, 128*4 // in9
+ dct64_step1_lsx
+
+ addi.d t0, t0, 48
+ addi.d t6, t6, 128
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ vld vr0, t2, 128*2 // in5
+ vld vr1, t2, 128*13 // in27
+ vld vr2, t2, 128*10 // in21
+ vld vr3, t2, 128*5 // in11
+ dct64_step1_lsx
+
+ addi.d t0, t0, 48
+ addi.d t6, t6, 128
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+ vld vr0, t2, 128*1 // in3
+ vld vr1, t2, 128*14 // in29
+ vld vr2, t2, 128*9 // in19
+ vld vr3, t2, 128*6 // in13
+ dct64_step1_lsx
+
+ la.local t0, idct_coeffs
+ addi.d t4, t5, 16*7
+ // t32a/t39/t40a/t47/t48/t55a/t56/t63a
+ dct64_step2_lsx
+
+ addi.d t5, t5, 16
+ addi.d t4, t4, -16
+ // t33/t38a/t41/t46a/t49a/t54/t57a/t62
+ dct64_step2_lsx
+
+ addi.d t5, t5, 16
+ addi.d t4, t4, -16
+ // t34a/t37/t42a/t45/t50/t53a/t58/t61a
+ dct64_step2_lsx
+
+ addi.d t5, t5, 16
+ addi.d t4, t4, -16
+ // t35/t36a/t43/t44a/t51a/t52/t59a/t60
+ dct64_step2_lsx
+
+ li.w t4, 64*32*2+64+512
+ add.d t5, t4, sp
+ addi.d t4, t5, 16*7
+ dct64_step4_lsx transpose8x8, 2, 0, 128, 112, 128
+
+ addi.d t3, t3, 128
+ addi.d t4, t4, -16*8
+ addi.d t5, t5, -16*8
+ dct64_step4_lsx transpose8x8, 2, 16, 128, 96, 128
+
+ addi.d t5, t5, -16*8
+ addi.d t4, t4, -16*8
+ addi.d t3, t3, 128
+ dct64_step4_lsx transpose8x8, 2, 32, 128, 80, 128
+
+ addi.d t5, t5, -16*8
+ addi.d t4, t4, -16*8
+ addi.d t3, t3, 128
+ dct64_step4_lsx transpose8x8, 2, 48, 128, 64, 128
+.endm
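+
+// Hedged overview of dct64x64_core1_lsx (first pass over one slice of input
+// columns): the even half goes through dct_8x32_tx64_new_lsx, the odd half
+// (t32a..t63a) through four dct64_step1_lsx calls fed from idct64_coeffs plus
+// the dct64_step2_lsx butterflies, and dct64_step4_lsx transposes the combined
+// rows into the scratch buffer set up above for the second (column) pass.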
+
+ dct64x64_core1_lsx 0, 0, 64
+
+ dct64x64_core1_lsx 16, 128*8, 64+16
+
+ dct64x64_core1_lsx 32, 128*8, 64+16*2
+
+ dct64x64_core1_lsx 48, 128*8, 64+16*3
+
+ vreplgr2vr.h vr31, zero
+.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, 1040, 1056, 1072, 1088, 1104, 1120, 1136, 1152, 1168, 1184, 1200, 1216, 1232, 1248, 1264, 1280, 1296, 1312, 1328, 1344, 1360, 1376, 1392, 1408, 1424, 1440, 1456, 1472, 1488, 1504, 1520, 1536, 1552, 1568, 1584, 1600, 1616, 1632, 1648, 1664, 1680, 1696, 1712, 1728, 1744, 1760, 1776, 1792, 1808, 1824, 1840, 1856, 1872, 1888, 1904, 1920, 1936, 1952, 1968, 1984, 2000, 2016, 2032
+ vst vr31, a2, \i
+.endr
+
+.macro dct64x64_core2_lsx in0, in1
+ addi.d t2, sp, 64+\in0
+ addi.d t7, sp, 64+\in0
+ li.w t4, 64*32*2+64
+ add.d t3, sp, t4
+ addi.d t6, t3, 512
+ add.d t5, t6, zero
+
+ addi.d t2, t2, 1024
+ addi.d t2, t2, 1024
+ dct_8x32_tx64_new_lsx -2048, 512, 256-2048, 512
+
+ la.local t0, idct64_coeffs
+
+ addi.d t2, sp, 64+64*2+\in0
+ addi.d t4, t2, 256*7
+ addi.d t4, t4, 256
+
+ vld vr0, t2, 256*0 // in1
+ vld vr1, t4, 256*7 // in31
+ vld vr2, t4, 256*0 // in17
+ vld vr3, t2, 256*7 // in15
+ dct64_step1_lsx
+
+ addi.d t0, t0, 48
+ addi.d t6, t6, 128
+ vld vr0, t2, 256*3 // in7
+ vld vr1, t4, 256*4 // in25
+ vld vr2, t4, 256*3 // in23
+ vld vr3, t2, 256*4 // in9
+ dct64_step1_lsx
+
+ addi.d t0, t0, 48
+ addi.d t6, t6, 128
+ vld vr0, t2, 256*2 // in5
+ vld vr1, t4, 256*5 // in27
+ vld vr2, t4, 256*2 // in21
+ vld vr3, t2, 256*5 // in11
+ dct64_step1_lsx
+
+ addi.d t0, t0, 48
+ addi.d t6, t6, 128
+ vld vr0, t2, 256*1 // in3
+ vld vr1, t4, 256*6 // in29
+ vld vr2, t4, 256*1 // in19
+ vld vr3, t2, 256*6 // in13
+ dct64_step1_lsx
+
+ la.local t0, idct_coeffs
+ addi.d t4, t5, 16*7
+ // t32a/t39/t40a/t47/t48/t55a/t56/t63a
+ dct64_step2_lsx
+
+ addi.d t5, t5, 16
+ addi.d t4, t4, -16
+ // t33/t38a/t41/t46a/t49a/t54/t57a/t62
+ dct64_step2_lsx
+
+ addi.d t5, t5, 16
+ addi.d t4, t4, -16
+ // t34a/t37/t42a/t45/t50/t53a/t58/t61a
+ dct64_step2_lsx
+
+ addi.d t5, t5, 16
+ addi.d t4, t4, -16
+ // t35/t36a/t43/t44a/t51a/t52/t59a/t60
+ dct64_step2_lsx
+
+ li.w t4, 64*32*2+64+512
+ add.d t5, t4, sp
+ addi.d t4, t5, 16*7
+ addi.d a0, a0, \in1
+ // 0 - 7, 56 -63
+ dct64_step3_lsx
+
+ li.w t8, 0
+ mul.w t0, t8, a1
+ add.d t0, a0, t0
+ alsl.d t6, a1, t0, 1
+ addi.d t1, t0, 0
+ add.d t2, t0, a1
+ dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
+
+ li.w t8, 56
+ mul.w t0, t8, a1
+ add.d t0, a0, t0
+ alsl.d t6, a1, t0, 1
+ addi.d t1, t0, 0
+ add.d t2, t0, a1
+ dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
+
+ // 8 - 15, 48 - 55
+ addi.d t3, t3, 128
+ addi.d t4, t4, -16*8
+ addi.d t5, t5, -16*8
+ dct64_step3_lsx
+
+ li.w t8, 8
+ mul.w t0, t8, a1
+ add.d t0, t0, a0
+ alsl.d t6, a1, t0, 1
+ addi.d t1, t0, 0
+ add.d t2, t0, a1
+ dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
+
+ li.w t8, 48
+ mul.w t0, t8, a1
+ add.d t0, t0, a0
+ alsl.d t6, a1, t0, 1
+ addi.d t1, t0, 0
+ add.d t2, t0, a1
+ dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
+
+ // 16 - 23, 40 - 47
+ addi.d t3, t3, 128
+ addi.d t4, t4, -16*8
+ addi.d t5, t5, -16*8
+ dct64_step3_lsx
+
+ li.w t8, 16
+ mul.w t0, t8, a1
+ add.d t0, t0, a0
+ alsl.d t6, a1, t0, 1
+ addi.d t1, t0, 0
+ add.d t2, t0, a1
+ dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
+
+ li.w t8, 40
+ mul.w t0, t8, a1
+ add.d t0, t0, a0
+ alsl.d t6, a1, t0, 1
+ addi.d t1, t0, 0
+ add.d t2, t0, a1
+ dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
+
+ // 24 - 31, 32 - 39
+ addi.d t3, t3, 128
+ addi.d t4, t4, -16*8
+ addi.d t5, t5, -16*8
+ dct64_step3_lsx
+
+ li.w t8, 24
+ mul.w t0, t8, a1
+ add.d t0, t0, a0
+ alsl.d t6, a1, t0, 1
+ addi.d t1, t0, 0
+ add.d t2, t0, a1
+ dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
+
+ li.w t8, 32
+ mul.w t0, t8, a1
+ add.d t0, t0, a0
+ alsl.d t6, a1, t0, 1
+ addi.d t1, t0, 0
+ add.d t2, t0, a1
+ dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
+.endm
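+
+// Hedged overview of dct64x64_core2_lsx (second pass over one 16-column slice
+// of the scratch buffer): the same even/odd 64-point decomposition is applied
+// to the transposed data, then dct64_step3_lsx + dct64_step5_lsx add the
+// rounded results to destination rows 0-7/56-63, 8-15/48-55, 16-23/40-47 and
+// 24-31/32-39, matching the row-offset comments above.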
+
+ dct64x64_core2_lsx 16*0, 0
+
+ dct64x64_core2_lsx 16*1, 8
+
+ dct64x64_core2_lsx 16*2, 8
+
+ dct64x64_core2_lsx 16*3, 8
+
+ dct64x64_core2_lsx 16*4, 8
+
+ dct64x64_core2_lsx 16*5, 8
+
+ dct64x64_core2_lsx 16*6, 8
+
+ dct64x64_core2_lsx 16*7, 8
+
+ free_space 64*32*2+512+512
+.DCT_DCT_64X64_END:
+endfunc
diff --git a/third_party/dav1d/src/loongarch/itx.h b/third_party/dav1d/src/loongarch/itx.h
new file mode 100644
index 0000000000..3ad444f534
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/itx.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOONGARCH_ITX_H
+#define DAV1D_SRC_LOONGARCH_ITX_H
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_4x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_4x4, lsx));
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_4x8, lsx));
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x4, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x4, lsx));
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x8, lsx));
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x16, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x16, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x16, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x16, lsx));
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x8, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x8, lsx));
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x16, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_16x16, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x16, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_16x16, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_16x16, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_16x16, lsx));
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x32, lsx));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x32, lsx));
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, lsx));
+
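+/* Rough usage sketch (assumed call site, mirroring the per-arch init helpers
+ * on other architectures; the generic init code is not part of this header):
+ *
+ *   Dav1dInvTxfmDSPContext c;
+ *   // ... fill c with the C fallbacks first ...
+ *   itx_dsp_init_loongarch(&c, 8); // overrides entries with LSX versions when supported
+ */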
+static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c, int bpc) {
+#if BITDEPTH == 8
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
+
+ if (BITDEPTH != 8) return;
+
+ c->itxfm_add[TX_4X4][WHT_WHT] = dav1d_inv_txfm_add_wht_wht_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][IDTX] = dav1d_inv_txfm_add_identity_identity_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][H_DCT] = dav1d_inv_txfm_add_dct_identity_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][V_DCT] = dav1d_inv_txfm_add_identity_dct_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_4x4_8bpc_lsx;
+ c->itxfm_add[TX_4X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_4x4_8bpc_lsx;
+
+ c->itxfm_add[RTX_4X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x8_8bpc_lsx;
+
+ c->itxfm_add[RTX_8X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][IDTX] = dav1d_inv_txfm_add_identity_identity_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][H_DCT] = dav1d_inv_txfm_add_dct_identity_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][V_DCT] = dav1d_inv_txfm_add_identity_dct_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x4_8bpc_lsx;
+ c->itxfm_add[RTX_8X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x4_8bpc_lsx;
+
+ c->itxfm_add[TX_8X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][IDTX] = dav1d_inv_txfm_add_identity_identity_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][H_DCT] = dav1d_inv_txfm_add_dct_identity_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][V_DCT] = dav1d_inv_txfm_add_identity_dct_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x8_8bpc_lsx;
+ c->itxfm_add[TX_8X8][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x8_8bpc_lsx;
+
+ c->itxfm_add[RTX_8X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x16_8bpc_lsx;
+ c->itxfm_add[RTX_8X16][IDTX] = dav1d_inv_txfm_add_identity_identity_8x16_8bpc_lsx;
+ c->itxfm_add[RTX_8X16][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x16_8bpc_lsx;
+ c->itxfm_add[RTX_8X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x16_8bpc_lsx;
+
+ c->itxfm_add[RTX_16X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x8_8bpc_lsx;
+ c->itxfm_add[RTX_16X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x8_8bpc_lsx;
+
+ c->itxfm_add[TX_16X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x16_8bpc_lsx;
+ c->itxfm_add[TX_16X16][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_16x16_8bpc_lsx;
+ c->itxfm_add[TX_16X16][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x16_8bpc_lsx;
+ c->itxfm_add[TX_16X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_16x16_8bpc_lsx;
+ c->itxfm_add[TX_16X16][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_16x16_8bpc_lsx;
+ c->itxfm_add[TX_16X16][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_16x16_8bpc_lsx;
+
+ c->itxfm_add[RTX_8X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x32_8bpc_lsx;
+
+ c->itxfm_add[TX_32X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_32x32_8bpc_lsx;
+
+ c->itxfm_add[TX_64X64][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_64x64_8bpc_lsx;
+#endif
+}
+
+#endif /* DAV1D_SRC_LOONGARCH_ITX_H */
diff --git a/third_party/dav1d/src/loongarch/loongson_asm.S b/third_party/dav1d/src/loongarch/loongson_asm.S
new file mode 100644
index 0000000000..a22072ba35
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/loongson_asm.S
@@ -0,0 +1,776 @@
+/*********************************************************************
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn)
+ * Shiyou Yin(yinshiyou-hf@loongson.cn)
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *********************************************************************/
+
+/*
+ * This file is a LoongArch assembly helper file, available under the ISC
+ * license. It provides a large number of macros and aliases to simplify
+ * writing assembly code, especially for LSX and LASX optimizations.
+ *
+ * Anyone can modify it or add new features for their own purposes.
+ * Contributing a patch will be appreciated, as it might be useful for
+ * others as well. Send patches to the Loongson contributors mentioned above.
+ *
+ * MAJOR version: Usage changes, incompatible with previous version.
+ * MINOR version: Add new macros/functions, or bug fixes.
+ * MICRO version: Comment changes or implementation changes.
+ */
+
+#define LML_VERSION_MAJOR 0
+#define LML_VERSION_MINOR 4
+#define LML_VERSION_MICRO 0
+
+#define DEFAULT_ALIGN 5
+
+/* Set prefix as needed. */
+#ifndef PRIVATE_PREFIX
+#define PRIVATE_PREFIX dav1d_
+#endif
+
+#define PASTE(a,b) a ## b
+#define CONCAT(a,b) PASTE(a,b)
+
+#ifdef PREFIX
+#define ASM_PREF CONCAT(_,PRIVATE_PREFIX)
+#else
+#define ASM_PREF PRIVATE_PREFIX
+#endif
+
+.macro function name, align=DEFAULT_ALIGN
+.macro endfunc
+ jirl $r0, $r1, 0x0
+ .size ASM_PREF\name, . - ASM_PREF\name
+ .purgem endfunc
+.endm
+.text ;
+.align \align ;
+.globl ASM_PREF\name ;
+.type ASM_PREF\name, @function ;
+ASM_PREF\name: ;
+.endm
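+
+/*
+ * Minimal usage sketch (illustrative; "my_lsx_kernel" is a made-up name):
+ * the function/endfunc pair emits the .text/.globl/.type boilerplate, and
+ * endfunc appends the return (jirl) and the .size directive.
+ *
+ *   function my_lsx_kernel
+ *       vld vr0, a0, 0
+ *       vst vr0, a1, 0
+ *   endfunc
+ */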
+
+.macro const name, align=DEFAULT_ALIGN
+ .macro endconst
+ .size \name, . - \name
+ .purgem endconst
+ .endm
+.section .rodata
+.align \align
+\name:
+.endm
+
+/*
+ *============================================================================
+ * LoongArch register alias
+ *============================================================================
+ */
+
+#define a0 $a0
+#define a1 $a1
+#define a2 $a2
+#define a3 $a3
+#define a4 $a4
+#define a5 $a5
+#define a6 $a6
+#define a7 $a7
+
+#define t0 $t0
+#define t1 $t1
+#define t2 $t2
+#define t3 $t3
+#define t4 $t4
+#define t5 $t5
+#define t6 $t6
+#define t7 $t7
+#define t8 $t8
+
+#define s0 $s0
+#define s1 $s1
+#define s2 $s2
+#define s3 $s3
+#define s4 $s4
+#define s5 $s5
+#define s6 $s6
+#define s7 $s7
+#define s8 $s8
+
+#define zero $zero
+#define sp $sp
+#define ra $ra
+
+#define fa0 $fa0
+#define fa1 $fa1
+#define fa2 $fa2
+#define fa3 $fa3
+#define fa4 $fa4
+#define fa5 $fa5
+#define fa6 $fa6
+#define fa7 $fa7
+#define ft0 $ft0
+#define ft1 $ft1
+#define ft2 $ft2
+#define ft3 $ft3
+#define ft4 $ft4
+#define ft5 $ft5
+#define ft6 $ft6
+#define ft7 $ft7
+#define ft8 $ft8
+#define ft9 $ft9
+#define ft10 $ft10
+#define ft11 $ft11
+#define ft12 $ft12
+#define ft13 $ft13
+#define ft14 $ft14
+#define ft15 $ft15
+#define fs0 $fs0
+#define fs1 $fs1
+#define fs2 $fs2
+#define fs3 $fs3
+#define fs4 $fs4
+#define fs5 $fs5
+#define fs6 $fs6
+#define fs7 $fs7
+
+#define f0 $f0
+#define f1 $f1
+#define f2 $f2
+#define f3 $f3
+#define f4 $f4
+#define f5 $f5
+#define f6 $f6
+#define f7 $f7
+#define f8 $f8
+#define f9 $f9
+#define f10 $f10
+#define f11 $f11
+#define f12 $f12
+#define f13 $f13
+#define f14 $f14
+#define f15 $f15
+#define f16 $f16
+#define f17 $f17
+#define f18 $f18
+#define f19 $f19
+#define f20 $f20
+#define f21 $f21
+#define f22 $f22
+#define f23 $f23
+#define f24 $f24
+#define f25 $f25
+#define f26 $f26
+#define f27 $f27
+#define f28 $f28
+#define f29 $f29
+#define f30 $f30
+#define f31 $f31
+
+#define vr0 $vr0
+#define vr1 $vr1
+#define vr2 $vr2
+#define vr3 $vr3
+#define vr4 $vr4
+#define vr5 $vr5
+#define vr6 $vr6
+#define vr7 $vr7
+#define vr8 $vr8
+#define vr9 $vr9
+#define vr10 $vr10
+#define vr11 $vr11
+#define vr12 $vr12
+#define vr13 $vr13
+#define vr14 $vr14
+#define vr15 $vr15
+#define vr16 $vr16
+#define vr17 $vr17
+#define vr18 $vr18
+#define vr19 $vr19
+#define vr20 $vr20
+#define vr21 $vr21
+#define vr22 $vr22
+#define vr23 $vr23
+#define vr24 $vr24
+#define vr25 $vr25
+#define vr26 $vr26
+#define vr27 $vr27
+#define vr28 $vr28
+#define vr29 $vr29
+#define vr30 $vr30
+#define vr31 $vr31
+
+#define xr0 $xr0
+#define xr1 $xr1
+#define xr2 $xr2
+#define xr3 $xr3
+#define xr4 $xr4
+#define xr5 $xr5
+#define xr6 $xr6
+#define xr7 $xr7
+#define xr8 $xr8
+#define xr9 $xr9
+#define xr10 $xr10
+#define xr11 $xr11
+#define xr12 $xr12
+#define xr13 $xr13
+#define xr14 $xr14
+#define xr15 $xr15
+#define xr16 $xr16
+#define xr17 $xr17
+#define xr18 $xr18
+#define xr19 $xr19
+#define xr20 $xr20
+#define xr21 $xr21
+#define xr22 $xr22
+#define xr23 $xr23
+#define xr24 $xr24
+#define xr25 $xr25
+#define xr26 $xr26
+#define xr27 $xr27
+#define xr28 $xr28
+#define xr29 $xr29
+#define xr30 $xr30
+#define xr31 $xr31
+
+/*
+ *============================================================================
+ * LSX/LASX synthesize instructions
+ *============================================================================
+ */
+
+/*
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - vj, vk
+ * Outputs - vd
+ * Return Type - halfword
+ */
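+/*
+ * Per-element sketch for vdp2.h.bu (the .h.bu.b and .w.h variants follow the
+ * same even/odd pattern at their respective element widths):
+ *   vd.h[i] = vj.bu[2*i] * vk.bu[2*i] + vj.bu[2*i+1] * vk.bu[2*i+1]
+ */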
+.macro vdp2.h.bu vd, vj, vk
+ vmulwev.h.bu \vd, \vj, \vk
+ vmaddwod.h.bu \vd, \vj, \vk
+.endm
+
+.macro vdp2.h.bu.b vd, vj, vk
+ vmulwev.h.bu.b \vd, \vj, \vk
+ vmaddwod.h.bu.b \vd, \vj, \vk
+.endm
+
+.macro vdp2.w.h vd, vj, vk
+ vmulwev.w.h \vd, \vj, \vk
+ vmaddwod.w.h \vd, \vj, \vk
+.endm
+
+.macro xvdp2.h.bu xd, xj, xk
+ xvmulwev.h.bu \xd, \xj, \xk
+ xvmaddwod.h.bu \xd, \xj, \xk
+.endm
+
+.macro xvdp2.h.bu.b xd, xj, xk
+ xvmulwev.h.bu.b \xd, \xj, \xk
+ xvmaddwod.h.bu.b \xd, \xj, \xk
+.endm
+
+.macro xvdp2.w.h xd, xj, xk
+ xvmulwev.w.h \xd, \xj, \xk
+ xvmaddwod.w.h \xd, \xj, \xk
+.endm
+
+/*
+ * Description : Dot product & addition of halfword vector elements
+ * Arguments : Inputs - vj, vk
+ * Outputs - vd
+ * Return Type - twice size of input
+ */
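+/*
+ * Per-element sketch for vdp2add.h.bu (other widths analogous); note that it
+ * accumulates into the existing contents of vd:
+ *   vd.h[i] += vj.bu[2*i] * vk.bu[2*i] + vj.bu[2*i+1] * vk.bu[2*i+1]
+ */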
+.macro vdp2add.h.bu vd, vj, vk
+ vmaddwev.h.bu \vd, \vj, \vk
+ vmaddwod.h.bu \vd, \vj, \vk
+.endm
+
+.macro vdp2add.h.bu.b vd, vj, vk
+ vmaddwev.h.bu.b \vd, \vj, \vk
+ vmaddwod.h.bu.b \vd, \vj, \vk
+.endm
+
+.macro vdp2add.w.h vd, vj, vk
+ vmaddwev.w.h \vd, \vj, \vk
+ vmaddwod.w.h \vd, \vj, \vk
+.endm
+
+.macro xvdp2add.h.bu.b xd, xj, xk
+ xvmaddwev.h.bu.b \xd, \xj, \xk
+ xvmaddwod.h.bu.b \xd, \xj, \xk
+.endm
+
+.macro xvdp2add.w.h xd, xj, xk
+ xvmaddwev.w.h \xd, \xj, \xk
+ xvmaddwod.w.h \xd, \xj, \xk
+.endm
+
+/*
+ * Description : Clamp each element vj[i] to the range [vk[i], va[i]]
+ * clip: vd[i] = min(max(vj[i], vk[i]), va[i])
+ */
+.macro vclip.h vd, vj, vk, va
+ vmax.h \vd, \vj, \vk
+ vmin.h \vd, \vd, \va
+.endm
+
+.macro vclip.w vd, vj, vk, va
+ vmax.w \vd, \vj, \vk
+ vmin.w \vd, \vd, \va
+.endm
+
+.macro xvclip.h xd, xj, xk, xa
+ xvmax.h \xd, \xj, \xk
+ xvmin.h \xd, \xd, \xa
+.endm
+
+.macro xvclip.w xd, xj, xk, xa
+ xvmax.w \xd, \xj, \xk
+ xvmin.w \xd, \xd, \xa
+.endm
+
+/*
+ * Description : Clamp each element vj[i] to the range [0, 255]
+ * clip255: vd[i] = min(max(vj[i], 0), 255)
+ */
+.macro vclip255.h vd, vj
+ vmaxi.h \vd, \vj, 0
+ vsat.hu \vd, \vd, 7
+.endm
+
+.macro vclip255.w vd, vj
+ vmaxi.w \vd, \vj, 0
+ vsat.wu \vd, \vd, 7
+.endm
+
+.macro xvclip255.h xd, xj
+ xvmaxi.h \xd, \xj, 0
+ xvsat.hu \xd, \xd, 7
+.endm
+
+.macro xvclip255.w xd, xj
+ xvmaxi.w \xd, \xj, 0
+ xvsat.wu \xd, \xd, 7
+.endm
+
+/*
+ * Description : Store elements of vector
+ * vd : Data vector to be stored
+ * rk : Address of data storage
+ * ra : Offset of address
+ * si : Index of data in vd
+ */
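+/*
+ * Effectively (sketch), e.g. for vstelmx.w:
+ *   rk += ra;  *(uint32_t *)rk = vd.w[si];
+ */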
+.macro vstelmx.b vd, rk, ra, si
+ add.d \rk, \rk, \ra
+ vstelm.b \vd, \rk, 0, \si
+.endm
+
+.macro vstelmx.h vd, rk, ra, si
+ add.d \rk, \rk, \ra
+ vstelm.h \vd, \rk, 0, \si
+.endm
+
+.macro vstelmx.w vd, rk, ra, si
+ add.d \rk, \rk, \ra
+ vstelm.w \vd, \rk, 0, \si
+.endm
+
+.macro vstelmx.d vd, rk, ra, si
+ add.d \rk, \rk, \ra
+ vstelm.d \vd, \rk, 0, \si
+.endm
+
+.macro vmov xd, xj
+ vor.v \xd, \xj, \xj
+.endm
+
+.macro xmov xd, xj
+ xvor.v \xd, \xj, \xj
+.endm
+
+.macro xvstelmx.d xd, rk, ra, si
+ add.d \rk, \rk, \ra
+ xvstelm.d \xd, \rk, 0, \si
+.endm
+
+/*
+ *============================================================================
+ * LSX/LASX custom macros
+ *============================================================================
+ */
+
+/*
+ * Load 4 float, double, V128, or V256 elements with stride.
+ */
+.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+ fld.s \out0, \src, 0
+ fldx.s \out1, \src, \stride
+ fldx.s \out2, \src, \stride2
+ fldx.s \out3, \src, \stride3
+.endm
+
+.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+ fld.d \out0, \src, 0
+ fldx.d \out1, \src, \stride
+ fldx.d \out2, \src, \stride2
+ fldx.d \out3, \src, \stride3
+.endm
+
+.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+ vld \out0, \src, 0
+ vldx \out1, \src, \stride
+ vldx \out2, \src, \stride2
+ vldx \out3, \src, \stride3
+.endm
+
+.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+ xvld \out0, \src, 0
+ xvldx \out1, \src, \stride
+ xvldx \out2, \src, \stride2
+ xvldx \out3, \src, \stride3
+.endm
+
+/*
+ * Description : Transpose 4x4 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ */
+.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+ vilvl.h \tmp0, \in1, \in0
+ vilvl.h \tmp1, \in3, \in2
+ vilvl.w \out0, \tmp1, \tmp0
+ vilvh.w \out2, \tmp1, \tmp0
+ vilvh.d \out1, \out0, \out0
+ vilvh.d \out3, \out0, \out2
+.endm
+
+/*
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ * Details :
+ * Example :
+ * 1, 2, 3, 4 1, 5, 9,13
+ * 5, 6, 7, 8 to 2, 6,10,14
+ * 9,10,11,12 =====> 3, 7,11,15
+ * 13,14,15,16 4, 8,12,16
+ */
+.macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+
+ vilvl.w \tmp0, \in1, \in0
+ vilvh.w \out1, \in1, \in0
+ vilvl.w \tmp1, \in3, \in2
+ vilvh.w \out3, \in3, \in2
+
+ vilvl.d \out0, \tmp1, \tmp0
+ vilvl.d \out2, \out3, \out1
+ vilvh.d \out3, \out3, \out1
+ vilvh.d \out1, \tmp1, \tmp0
+.endm
+
+/*
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
+ tmp3, tmp4, tmp5, tmp6, tmp7
+ vilvl.h \tmp0, \in6, \in4
+ vilvl.h \tmp1, \in7, \in5
+ vilvl.h \tmp2, \in2, \in0
+ vilvl.h \tmp3, \in3, \in1
+
+ vilvl.h \tmp4, \tmp1, \tmp0
+ vilvh.h \tmp5, \tmp1, \tmp0
+ vilvl.h \tmp6, \tmp3, \tmp2
+ vilvh.h \tmp7, \tmp3, \tmp2
+
+ vilvh.h \tmp0, \in6, \in4
+ vilvh.h \tmp1, \in7, \in5
+ vilvh.h \tmp2, \in2, \in0
+ vilvh.h \tmp3, \in3, \in1
+
+ vpickev.d \out0, \tmp4, \tmp6
+ vpickod.d \out1, \tmp4, \tmp6
+ vpickev.d \out2, \tmp5, \tmp7
+ vpickod.d \out3, \tmp5, \tmp7
+
+ vilvl.h \tmp4, \tmp1, \tmp0
+ vilvh.h \tmp5, \tmp1, \tmp0
+ vilvl.h \tmp6, \tmp3, \tmp2
+ vilvh.h \tmp7, \tmp3, \tmp2
+
+ vpickev.d \out4, \tmp4, \tmp6
+ vpickod.d \out5, \tmp4, \tmp6
+ vpickev.d \out6, \tmp5, \tmp7
+ vpickod.d \out7, \tmp5, \tmp7
+.endm
+
+/*
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15, \
+ out0, out1, out2, out3, out4, out5, out6, out7,\
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+ xvilvl.b \tmp0, \in2, \in0
+ xvilvl.b \tmp1, \in3, \in1
+ xvilvl.b \tmp2, \in6, \in4
+ xvilvl.b \tmp3, \in7, \in5
+ xvilvl.b \tmp4, \in10, \in8
+ xvilvl.b \tmp5, \in11, \in9
+ xvilvl.b \tmp6, \in14, \in12
+ xvilvl.b \tmp7, \in15, \in13
+ xvilvl.b \out0, \tmp1, \tmp0
+ xvilvh.b \out1, \tmp1, \tmp0
+ xvilvl.b \out2, \tmp3, \tmp2
+ xvilvh.b \out3, \tmp3, \tmp2
+ xvilvl.b \out4, \tmp5, \tmp4
+ xvilvh.b \out5, \tmp5, \tmp4
+ xvilvl.b \out6, \tmp7, \tmp6
+ xvilvh.b \out7, \tmp7, \tmp6
+ xvilvl.w \tmp0, \out2, \out0
+ xvilvh.w \tmp2, \out2, \out0
+ xvilvl.w \tmp4, \out3, \out1
+ xvilvh.w \tmp6, \out3, \out1
+ xvilvl.w \tmp1, \out6, \out4
+ xvilvh.w \tmp3, \out6, \out4
+ xvilvl.w \tmp5, \out7, \out5
+ xvilvh.w \tmp7, \out7, \out5
+ xvilvl.d \out0, \tmp1, \tmp0
+ xvilvh.d \out1, \tmp1, \tmp0
+ xvilvl.d \out2, \tmp3, \tmp2
+ xvilvh.d \out3, \tmp3, \tmp2
+ xvilvl.d \out4, \tmp5, \tmp4
+ xvilvh.d \out5, \tmp5, \tmp4
+ xvilvl.d \out6, \tmp7, \tmp6
+ xvilvh.d \out7, \tmp7, \tmp6
+.endm
+
+/*
+ * Description : Transpose 4x4 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+ xvilvl.h \tmp0, \in1, \in0
+ xvilvl.h \tmp1, \in3, \in2
+ xvilvl.w \out0, \tmp1, \tmp0
+ xvilvh.w \out2, \tmp1, \tmp0
+ xvilvh.d \out1, \out0, \out0
+ xvilvh.d \out3, \out0, \out2
+.endm
+
+/*
+ * Description : Transpose 4x8 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+ xvilvl.h \tmp0, \in2, \in0
+ xvilvl.h \tmp1, \in3, \in1
+ xvilvl.h \out2, \tmp1, \tmp0
+ xvilvh.h \out3, \tmp1, \tmp0
+
+ xvilvl.d \out0, \out2, \out2
+ xvilvh.d \out1, \out2, \out2
+ xvilvl.d \out2, \out3, \out3
+ xvilvh.d \out3, \out3, \out3
+.endm
+
+/*
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7, \
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+ xvilvl.h \tmp0, \in6, \in4
+ xvilvl.h \tmp1, \in7, \in5
+ xvilvl.h \tmp2, \in2, \in0
+ xvilvl.h \tmp3, \in3, \in1
+
+ xvilvl.h \tmp4, \tmp1, \tmp0
+ xvilvh.h \tmp5, \tmp1, \tmp0
+ xvilvl.h \tmp6, \tmp3, \tmp2
+ xvilvh.h \tmp7, \tmp3, \tmp2
+
+ xvilvh.h \tmp0, \in6, \in4
+ xvilvh.h \tmp1, \in7, \in5
+ xvilvh.h \tmp2, \in2, \in0
+ xvilvh.h \tmp3, \in3, \in1
+
+ xvpickev.d \out0, \tmp4, \tmp6
+ xvpickod.d \out1, \tmp4, \tmp6
+ xvpickev.d \out2, \tmp5, \tmp7
+ xvpickod.d \out3, \tmp5, \tmp7
+
+ xvilvl.h \tmp4, \tmp1, \tmp0
+ xvilvh.h \tmp5, \tmp1, \tmp0
+ xvilvl.h \tmp6, \tmp3, \tmp2
+ xvilvh.h \tmp7, \tmp3, \tmp2
+
+ xvpickev.d \out4, \tmp4, \tmp6
+ xvpickod.d \out5, \tmp4, \tmp6
+ xvpickev.d \out6, \tmp5, \tmp7
+ xvpickod.d \out7, \tmp5, \tmp7
+.endm
+
+/*
+ * Description : Transpose 2x4x4 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1, tmp2
+ xvilvh.h \tmp1, \in0, \in1
+ xvilvl.h \out1, \in0, \in1
+ xvilvh.h \tmp0, \in2, \in3
+ xvilvl.h \out3, \in2, \in3
+
+ xvilvh.w \tmp2, \out3, \out1
+ xvilvl.w \out3, \out3, \out1
+
+ xvilvl.w \out2, \tmp0, \tmp1
+ xvilvh.w \tmp1, \tmp0, \tmp1
+
+ xvilvh.d \out0, \out2, \out3
+ xvilvl.d \out2, \out2, \out3
+ xvilvh.d \out1, \tmp1, \tmp2
+ xvilvl.d \out3, \tmp1, \tmp2
+.endm
+
+/*
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ * Details :
+ * Example :
+ * 1, 2, 3, 4, 1, 2, 3, 4 1,5, 9,13, 1,5, 9,13
+ * 5, 6, 7, 8, 5, 6, 7, 8 to 2,6,10,14, 2,6,10,14
+ * 9,10,11,12, 9,10,11,12 =====> 3,7,11,15, 3,7,11,15
+ * 13,14,15,16, 13,14,15,16 4,8,12,16, 4,8,12,16
+ */
+.macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+
+ xvilvl.w \tmp0, \in1, \in0
+ xvilvh.w \out1, \in1, \in0
+ xvilvl.w \tmp1, \in3, \in2
+ xvilvh.w \out3, \in3, \in2
+
+ xvilvl.d \out0, \tmp1, \tmp0
+ xvilvl.d \out2, \out3, \out1
+ xvilvh.d \out3, \out3, \out1
+ xvilvh.d \out1, \tmp1, \tmp0
+.endm
+
+/*
+ * Description : Transpose 8x8 block with word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ * Example : LASX_TRANSPOSE8x8_W
+ * in0 : 1,2,3,4,5,6,7,8
+ * in1 : 2,2,3,4,5,6,7,8
+ * in2 : 3,2,3,4,5,6,7,8
+ * in3 : 4,2,3,4,5,6,7,8
+ * in4 : 5,2,3,4,5,6,7,8
+ * in5 : 6,2,3,4,5,6,7,8
+ * in6 : 7,2,3,4,5,6,7,8
+ * in7 : 8,2,3,4,5,6,7,8
+ *
+ * out0 : 1,2,3,4,5,6,7,8
+ * out1 : 2,2,2,2,2,2,2,2
+ * out2 : 3,3,3,3,3,3,3,3
+ * out3 : 4,4,4,4,4,4,4,4
+ * out4 : 5,5,5,5,5,5,5,5
+ * out5 : 6,6,6,6,6,6,6,6
+ * out6 : 7,7,7,7,7,7,7,7
+ * out7 : 8,8,8,8,8,8,8,8
+ */
+.macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\
+ out0, out1, out2, out3, out4, out5, out6, out7,\
+ tmp0, tmp1, tmp2, tmp3
+ xvilvl.w \tmp0, \in2, \in0
+ xvilvl.w \tmp1, \in3, \in1
+ xvilvh.w \tmp2, \in2, \in0
+ xvilvh.w \tmp3, \in3, \in1
+ xvilvl.w \out0, \tmp1, \tmp0
+ xvilvh.w \out1, \tmp1, \tmp0
+ xvilvl.w \out2, \tmp3, \tmp2
+ xvilvh.w \out3, \tmp3, \tmp2
+
+ xvilvl.w \tmp0, \in6, \in4
+ xvilvl.w \tmp1, \in7, \in5
+ xvilvh.w \tmp2, \in6, \in4
+ xvilvh.w \tmp3, \in7, \in5
+ xvilvl.w \out4, \tmp1, \tmp0
+ xvilvh.w \out5, \tmp1, \tmp0
+ xvilvl.w \out6, \tmp3, \tmp2
+ xvilvh.w \out7, \tmp3, \tmp2
+
+ xmov \tmp0, \out0
+ xmov \tmp1, \out1
+ xmov \tmp2, \out2
+ xmov \tmp3, \out3
+ xvpermi.q \out0, \out4, 0x02
+ xvpermi.q \out1, \out5, 0x02
+ xvpermi.q \out2, \out6, 0x02
+ xvpermi.q \out3, \out7, 0x02
+ xvpermi.q \out4, \tmp0, 0x31
+ xvpermi.q \out5, \tmp1, 0x31
+ xvpermi.q \out6, \tmp2, 0x31
+ xvpermi.q \out7, \tmp3, 0x31
+.endm
+
+/*
+ * Description : Transpose 4x4 block with double-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ * Example : LASX_TRANSPOSE4x4_D
+ * in0 : 1,2,3,4
+ * in1 : 1,2,3,4
+ * in2 : 1,2,3,4
+ * in3 : 1,2,3,4
+ *
+ * out0 : 1,1,1,1
+ * out1 : 2,2,2,2
+ * out2 : 3,3,3,3
+ * out3 : 4,4,4,4
+ */
+.macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+ xvilvl.d \tmp0, \in1, \in0
+ xvilvh.d \out1, \in1, \in0
+ xvilvh.d \tmp1, \in3, \in2
+ xvilvl.d \out2, \in3, \in2
+
+ xvor.v \out0, \tmp0, \tmp0
+ xvor.v \out3, \tmp1, \tmp1
+
+ xvpermi.q \out0, \out2, 0x02
+ xvpermi.q \out2, \tmp0, 0x31
+ xvpermi.q \out3, \out1, 0x31
+ xvpermi.q \out1, \tmp1, 0x02
+.endm
diff --git a/third_party/dav1d/src/loongarch/loopfilter.S b/third_party/dav1d/src/loongarch/loopfilter.S
new file mode 100644
index 0000000000..e71d5a712e
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/loopfilter.S
@@ -0,0 +1,1108 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/loongarch/loongson_asm.S"
+
+.macro FILTER_W4 DIR, TYPE
+.ifc \DIR, h
+ addi.d t5, a0, -2
+ fld.s f6, t5, 0 //p1 p0 q0 q1
+ fldx.s f7, t5, a1
+ alsl.d t5, a1, t5, 1
+ fld.s f8, t5, 0
+ fldx.s f9, t5, a1
+
+ vilvl.b vr6, vr7, vr6
+ vilvl.b vr7, vr9, vr8
+ vilvl.h vr6, vr7, vr6 //p1p1p1p1
+ vbsrl.v vr7, vr6, 4 //p0p0p0p0
+ vbsrl.v vr8, vr7, 4 //q0q0q0q0
+ vbsrl.v vr9, vr8, 4 //q1q1q1q1
+.else
+ sub.d t5, a0, a1
+ fld.s f7, t5, 0
+ sub.d t5, t5, a1
+ fld.s f6, t5, 0
+ fld.s f8, a0, 0
+ fldx.s f9, a0, a1
+.endif
+
+ vabsd.bu vr10, vr6, vr7 // (p1 - p0)
+ vabsd.bu vr11, vr9, vr8 // (q1 - q0)
+ vabsd.bu vr12, vr7, vr8 // (p0 - q0)
+ vabsd.bu vr13, vr6, vr9 // (p1 - q1)
+
+ vmax.bu vr14, vr10, vr11
+ vsle.bu vr15, vr14, vr4 //abs(p1 - p0) <= I && abs(q1 - q0) <= I
+ vsadd.bu vr16, vr12, vr12
+ vsrli.b vr17, vr13, 1
+ vsadd.bu vr16, vr16, vr17 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
+ vsle.bu vr16, vr16, vr3
+ vand.v vr20, vr15, vr16 //fm
+
+ vpickve2gr.wu t5, vr20, 0
+ beqz t5, .END_FILTER_\DIR\()\TYPE\()_W4
+
+ vslt.bu vr16, vr2, vr14 //hev
+
+ vsllwil.h.b vr30, vr20, 0 //expand fm to w
+ vsllwil.w.h vr30, vr30, 0
+
+ vsllwil.hu.bu vr17, vr6, 0
+ vsllwil.hu.bu vr18, vr9, 0
+ vsub.h vr17, vr17, vr18
+ vssrarni.b.h vr17, vr17, 0 //f = iclip_diff(p1 - q1)
+
+ vand.v vr17, vr17, vr16
+ vsllwil.h.b vr18, vr17, 0
+
+ vsllwil.hu.bu vr10, vr8, 0
+ vsllwil.hu.bu vr11, vr7, 0
+ vsub.h vr10, vr10, vr11
+
+ vsadd.h vr11, vr10, vr10
+ vsadd.h vr10, vr10, vr11 //3 * (q0 - p0)
+ vsadd.h vr10, vr10, vr18 //f = iclip_diff(3 * (q0 - p0) + f);
+ vssrani.b.h vr10, vr10, 0
+ vsllwil.h.b vr10, vr10, 0
+
+ vaddi.hu vr11, vr10, 4
+ vaddi.hu vr12, vr10, 3
+ li.w t5, 127
+ vreplgr2vr.h vr13, t5
+ vmin.h vr11, vr11, vr13
+ vmin.h vr12, vr12, vr13
+ vsrai.h vr11, vr11, 3 //f1
+ vsrai.h vr12, vr12, 3 //f2
+
+ vsllwil.hu.bu vr13, vr7, 0 //p0
+ vsllwil.hu.bu vr14, vr8, 0 //q0
+ vsadd.h vr13, vr13, vr12
+ vssub.h vr14, vr14, vr11
+ vssrani.bu.h vr13, vr13, 0 //dst-1
+ vssrani.bu.h vr14, vr14, 0 //dst+0
+
+ vsrari.h vr15, vr11, 1 //f
+ vsllwil.hu.bu vr18, vr6, 0 //p1
+ vsllwil.hu.bu vr19, vr9, 0 //q1
+ vsadd.h vr18, vr18, vr15
+ vssub.h vr19, vr19, vr15
+ vssrani.bu.h vr18, vr18, 0 //dst-2
+ vssrani.bu.h vr19, vr19, 0 //dst+1
+ vbitsel.v vr26, vr18, vr6, vr16
+ vbitsel.v vr29, vr19, vr9, vr16
+
+ vbitsel.v vr6, vr6, vr26, vr20
+ vbitsel.v vr7, vr7, vr13, vr20
+ vbitsel.v vr8, vr8, vr14, vr20
+ vbitsel.v vr9, vr9, vr29, vr20
+
+.ifc \DIR, h
+ vilvl.b vr6, vr7, vr6
+ vilvl.b vr9, vr9, vr8
+ vilvl.h vr6, vr9, vr6
+
+ addi.d t5, a0, -2
+ vstelm.w vr6, t5, 0, 0
+ add.d t5, t5, a1
+ vstelm.w vr6, t5, 0, 1
+ add.d t5, t5, a1
+ vstelm.w vr6, t5, 0, 2
+ add.d t5, t5, a1
+ vstelm.w vr6, t5, 0, 3
+.else
+ fst.s f8, a0, 0
+ fstx.s f9, a0, a1
+ sub.d t5, a0, a1
+ fst.s f7, t5, 0
+ sub.d t5, t5, a1
+ fst.s f6, t5, 0
+.endif
+.END_FILTER_\DIR\()\TYPE\()_W4:
+.endm
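+
+// Hedged pseudo-C of the narrow (wd=4) filter above, following the inline
+// comments (iclip_diff saturates to the int8 range):
+//     f  = hev ? iclip_diff(p1 - q1) : 0;
+//     f  = iclip_diff(3 * (q0 - p0) + f);
+//     f1 = min(f + 4, 127) >> 3;
+//     f2 = min(f + 3, 127) >> 3;
+//     p0 += f2;  q0 -= f1;
+//     if (!hev) { f = (f1 + 1) >> 1;  p1 += f;  q1 -= f; }
+// with every update gated by the fm mask and saturated to u8 on store.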
+
+.macro FILTER_W6 DIR, TYPE
+.ifc \DIR, h
+ addi.d t5, a0, -3
+ fld.d f6, t5, 0 //p2 p1 p0 q0 q1 q2
+ fldx.d f7, t5, a1
+ alsl.d t5, a1, t5, 1
+ fld.d f8, t5, 0
+ fldx.d f9, t5, a1
+
+ vilvl.b vr6, vr7, vr6
+ vilvl.b vr7, vr9, vr8
+ vilvh.h vr10, vr7, vr6
+ vilvl.h vr6, vr7, vr6
+
+ vbsrl.v vr7, vr6, 4 //p1
+ vbsrl.v vr8, vr7, 4 //p0
+ vbsrl.v vr9, vr8, 4 //q0
+ vbsrl.v vr11, vr10, 4 //q2
+.else
+ alsl.d t5, a1, a1, 1
+ sub.d t5, a0, t5
+ fld.d f6, t5, 0
+ fldx.d f7, t5, a1
+ alsl.d t5, a1, t5, 1
+ fld.d f8, t5, 0
+ fldx.d f9, t5, a1
+ alsl.d t5, a1, t5, 1
+ fld.d f10, t5, 0
+ fldx.d f11, t5, a1
+.endif
+
+ vabsd.bu vr12, vr7, vr8 //abs(p1-p0)
+ vabsd.bu vr13, vr10, vr9 //abs(q1-q0)
+ vmax.bu vr14, vr12, vr13
+ vslt.bu vr2, vr2, vr14 //hev
+ vabsd.bu vr12, vr6, vr7 //abs(p2-p1)
+ vmax.bu vr12, vr12, vr14
+ vabsd.bu vr13, vr11, vr10 //abs(q2-q1)
+ vmax.bu vr12, vr12, vr13
+ vsle.bu vr0, vr12, vr4 // <=I
+
+ vabsd.bu vr13, vr8, vr9 //abs(p0-q0)
+ vsadd.bu vr13, vr13, vr13
+ vabsd.bu vr15, vr7, vr10
+ vsrli.b vr15, vr15, 1
+ vsadd.bu vr13, vr13, vr15
+ vsle.bu vr13, vr13, vr3 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
+ vand.v vr0, vr0, vr13 //fm
+
+ vpickve2gr.wu t5, vr0, 0
+ beqz t5, .END_FILTER_\DIR\()\TYPE\()_W6
+
+ vabsd.bu vr12, vr6, vr8 //abs(p2-p0)
+ vabsd.bu vr13, vr11, vr9 //abs(q2-q0)
+ vmax.bu vr12, vr12, vr14
+ vmax.bu vr12, vr12, vr13
+ vxor.v vr13, vr13, vr13
+ vaddi.bu vr13, vr13, 1
+ vsle.bu vr1, vr12, vr13 //flat8in
+
+ //6789 10 11 --expand to h
+ vsllwil.hu.bu vr12, vr6, 0
+ vsllwil.hu.bu vr13, vr7, 0
+ vsllwil.hu.bu vr14, vr8, 0
+ vsllwil.hu.bu vr15, vr9, 0
+ vsllwil.hu.bu vr16, vr10, 0
+ vsllwil.hu.bu vr17, vr11, 0
+
+ //dst-2
+ vsadd.hu vr18, vr12, vr12
+ vsadd.hu vr18, vr18, vr12
+ vsadd.hu vr18, vr18, vr13
+ vsadd.hu vr18, vr18, vr13
+ vsadd.hu vr18, vr18, vr14
+ vsadd.hu vr18, vr18, vr14
+ vsadd.hu vr18, vr18, vr15
+
+ //dst-1
+ vsadd.hu vr19, vr18, vr15
+ vsadd.hu vr19, vr19, vr16
+ vssub.hu vr19, vr19, vr12
+ vssub.hu vr19, vr19, vr12
+
+ //dst+0
+ vsadd.hu vr20, vr19, vr17
+ vsadd.hu vr20, vr20, vr16
+ vssub.hu vr20, vr20, vr12
+ vssub.hu vr20, vr20, vr13
+
+ //dst+1
+ vsadd.hu vr21, vr20, vr17
+ vsadd.hu vr21, vr21, vr17
+ vssub.hu vr21, vr21, vr13
+ vssub.hu vr21, vr21, vr14
+
+ vsrari.h vr18, vr18, 3
+ vsrari.h vr19, vr19, 3
+ vsrari.h vr20, vr20, 3
+ vsrari.h vr21, vr21, 3
+
+ vsub.h vr22, vr13, vr16
+ vssrani.b.h vr22, vr22, 0
+ vand.v vr22, vr22, vr2
+ vsllwil.h.b vr22, vr22, 0 //f = iclip_diff(p1 - q1);
+
+ vsub.h vr23, vr15, vr14
+ vsadd.h vr24, vr23, vr23
+ vsadd.h vr23, vr23, vr24
+ vsadd.h vr23, vr23, vr22
+ vssrani.b.h vr23, vr23, 0
+ vsllwil.h.b vr23, vr23, 0 //f = iclip_diff(3 * (q0 - p0) + f);
+
+ vaddi.hu vr24, vr23, 4
+ vaddi.hu vr25, vr23, 3
+ li.w t5, 127
+ vreplgr2vr.h vr3, t5
+ vmin.h vr24, vr24, vr3
+ vmin.h vr25, vr25, vr3
+ vsrai.h vr24, vr24, 3 //f1
+ vsrai.h vr25, vr25, 3 //f2
+
+ vsadd.h vr26, vr14, vr25 //dst-1
+ vssub.h vr27, vr15, vr24 //dst+0
+
+ vsrari.h vr24, vr24, 1
+ vsadd.h vr28, vr13, vr24
+ vssub.h vr29, vr16, vr24
+ vsllwil.h.b vr2, vr2, 0
+ vbitsel.v vr28, vr28, vr13, vr2 //dst-2
+ vbitsel.v vr29, vr29, vr16, vr2 //dst+1
+
+ //flat8in
+ vsllwil.h.b vr1, vr1, 0
+ vbitsel.v vr18, vr28, vr18, vr1
+ vbitsel.v vr19, vr26, vr19, vr1
+ vbitsel.v vr20, vr27, vr20, vr1
+ vbitsel.v vr21, vr29, vr21, vr1
+
+ vssrani.bu.h vr18, vr18, 0
+ vssrani.bu.h vr19, vr19, 0
+ vssrani.bu.h vr20, vr20, 0
+ vssrani.bu.h vr21, vr21, 0
+
+ vbitsel.v vr7, vr7, vr18, vr0 //p1
+ vbitsel.v vr8, vr8, vr19, vr0 //p0
+ vbitsel.v vr9, vr9, vr20, vr0 //q0
+ vbitsel.v vr10, vr10, vr21, vr0 //q1
+
+.ifc \DIR, h
+ vilvl.b vr7, vr8, vr7
+ vilvl.b vr9, vr10, vr9
+ vilvl.h vr7, vr9, vr7
+
+ addi.d t5, a0, -2
+ vstelm.w vr7, t5, 0, 0
+ add.d t5, t5, a1
+ vstelm.w vr7, t5, 0, 1
+ add.d t5, t5, a1
+ vstelm.w vr7, t5, 0, 2
+ add.d t5, t5, a1
+ vstelm.w vr7, t5, 0, 3
+.else
+ fst.s f9, a0, 0
+ fstx.s f10, a0, a1
+ sub.d t5, a0, a1
+ fst.s f8, t5, 0
+ sub.d t5, t5, a1
+ fst.s f7, t5, 0
+.endif
+.END_FILTER_\DIR\()\TYPE\()_W6:
+.endm
+
+.macro FILTER_W8 DIR, TYPE
+.ifc \DIR, h
+ addi.d t5, a0, -4
+ fld.d f6, t5, 0 //p3 p2 p1 p0 q0 q1 q2 q3
+ fldx.d f7, t5, a1
+ alsl.d t5, a1, t5, 1
+ fld.d f8, t5, 0
+ fldx.d f9, t5, a1
+
+ vilvl.b vr6, vr7, vr6
+ vilvl.b vr7, vr9, vr8
+ vilvh.h vr10, vr7, vr6 //q0
+ vilvl.h vr6, vr7, vr6 //p3
+ vbsrl.v vr7, vr6, 4 //p2
+ vbsrl.v vr8, vr6, 8 //p1
+ vbsrl.v vr9, vr6, 12 //p0
+ vbsrl.v vr11, vr10, 4 //q1
+ vbsrl.v vr12, vr10, 8 //q2
+ vbsrl.v vr13, vr10, 12 //q3
+.else
+ fld.s f10, a0, 0
+ fldx.s f11, a0, a1
+ add.d t5, a0, a1
+ fldx.s f12, t5, a1
+ add.d t5, t5, a1
+ fldx.s f13, t5, a1
+ sub.d t5, a0, a1
+ fld.s f9, t5, 0
+ sub.d t5, t5, a1
+ fld.s f8, t5, 0
+ sub.d t5, t5, a1
+ fld.s f7, t5, 0
+ sub.d t5, t5, a1
+ fld.s f6, t5, 0
+.endif
+
+ vabsd.bu vr14, vr8, vr9 //p1-p0
+ vabsd.bu vr15, vr11, vr10 //q1-q0
+ vabsd.bu vr16, vr9, vr10 //p0-q0
+ vabsd.bu vr17, vr8, vr11 //p1-q1
+ vabsd.bu vr18, vr7, vr8 //p2-p1
+ vabsd.bu vr19, vr12, vr11 //q2-q1
+ vabsd.bu vr20, vr6, vr7 //p3-p2
+ vabsd.bu vr21, vr13, vr12 //q3-q2
+
+ vmax.bu vr22, vr14, vr15
+ vsle.bu vr23, vr22, vr4 //abs(p1 - p0) <= I && abs(q1 - q0) <= I
+ vsadd.bu vr16, vr16, vr16
+ vsrli.b vr17, vr17, 1
+ vsadd.bu vr16, vr16, vr17
+ vsle.bu vr16, vr16, vr3 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
+ vand.v vr16, vr16, vr23 //fm
+
+ vpickve2gr.wu t5, vr16, 0
+ beqz t5, .END_FILTER_\DIR\()\TYPE\()_W8
+
+ vmax.bu vr23, vr18, vr19
+ vmax.bu vr23, vr23, vr20
+ vmax.bu vr23, vr23, vr21
+ vsle.bu vr23, vr23, vr4
+ vand.v vr16, vr16, vr23 //fm
+
+ vabsd.bu vr17, vr7, vr9 //abs(p2-p0)
+ vabsd.bu vr18, vr12, vr10 //abs(q2-q0)
+ vmax.bu vr17, vr17, vr14
+ vmax.bu vr17, vr17, vr15
+ vmax.bu vr17, vr17, vr18
+ vabsd.bu vr18, vr6, vr9 //abs(p3 - p0)
+ vabsd.bu vr19, vr13, vr10 //abs(q3 - q0)
+ vmax.bu vr17, vr17, vr18
+ vmax.bu vr17, vr17, vr19
+
+ vxor.v vr5, vr5, vr5
+ vaddi.bu vr5, vr5, 1 //F
+ vsle.bu vr17, vr17, vr5 //flat8in
+
+ vsllwil.hu.bu vr0, vr6, 0 //p3
+ vsllwil.hu.bu vr1, vr7, 0 //p2
+ vsllwil.hu.bu vr27, vr8, 0 //p1
+ vsllwil.hu.bu vr3, vr9, 0 //p0
+ vsllwil.hu.bu vr4, vr10, 0 //q0
+ vsllwil.hu.bu vr5, vr11, 0 //q1
+ vsllwil.hu.bu vr14, vr12, 0 //q2
+ vsllwil.hu.bu vr15, vr13, 0 //q3
+
+ vsadd.hu vr18, vr0, vr0 //p3+p3
+ vsadd.hu vr19, vr15, vr15 //q3+q3
+ vsadd.hu vr20, vr0, vr1 //p3+p2
+ vsadd.hu vr21, vr1, vr27 //p2+p1
+ vsadd.hu vr28, vr27, vr3 //p1+p0
+ vsadd.hu vr23, vr3, vr4 //p0+q0
+ vsadd.hu vr24, vr4, vr5 //q0+q1
+ vsadd.hu vr25, vr5, vr14 //q1+q2
+ vsadd.hu vr26, vr14, vr15 //q2+q3
+
+ // dst-3
+ vsadd.hu vr29, vr18, vr20
+ vsadd.hu vr29, vr29, vr21
+ vsadd.hu vr29, vr29, vr23
+
+ // dst-2
+ vsadd.hu vr30, vr18, vr21
+ vsadd.hu vr30, vr30, vr28
+ vsadd.hu vr30, vr30, vr24
+
+ // dst-1
+ vsadd.hu vr31, vr20, vr28
+ vsadd.hu vr31, vr31, vr23
+ vsadd.hu vr31, vr31, vr25
+
+ // dst+0
+ vsadd.hu vr18, vr21, vr23
+ vsadd.hu vr18, vr18, vr24
+ vsadd.hu vr18, vr18, vr26
+
+ //dst+1
+ vsadd.hu vr20, vr28, vr24
+ vsadd.hu vr20, vr20, vr25
+ vsadd.hu vr20, vr20, vr19
+
+ //dst+2
+ vsadd.hu vr21, vr23, vr25
+ vsadd.hu vr21, vr21, vr26
+ vsadd.hu vr21, vr21, vr19
+
+ vssrarni.bu.h vr23, vr29, 3
+ vssrarni.bu.h vr24, vr30, 3
+ vssrarni.bu.h vr25, vr31, 3
+ vssrarni.bu.h vr19, vr18, 3
+ vssrarni.bu.h vr20, vr20, 3
+ vssrarni.bu.h vr21, vr21, 3
+
+ // !flat8in
+ vslt.bu vr2, vr2, vr22 //hev
+
+ vsub.h vr30, vr27, vr5 //p1-q1
+ vssrani.b.h vr30, vr30, 0
+ vand.v vr30, vr30, vr2
+ vsllwil.h.b vr30, vr30, 0
+
+ vsub.h vr31, vr4, vr3
+ vsadd.h vr0, vr31, vr31
+ vsadd.h vr31, vr31, vr0
+ vsadd.h vr31, vr31, vr30
+ vssrani.b.h vr31, vr31, 0
+ vsllwil.h.b vr31, vr31, 0 //f = iclip_diff(3 * (q0 - p0) + f);
+
+ vaddi.hu vr14, vr31, 4
+ vaddi.hu vr15, vr31, 3
+ li.w t5, 127
+ vreplgr2vr.h vr18, t5
+ vmin.h vr14, vr14, vr18
+ vmin.h vr15, vr15, vr18
+ vsrai.h vr14, vr14, 3 //f1
+ vsrai.h vr15, vr15, 3 //f2
+
+ vsadd.h vr3, vr3, vr15
+ vssub.h vr4, vr4, vr14
+ vssrani.bu.h vr3, vr3, 0 //dst-1
+ vssrani.bu.h vr4, vr4, 0 //dst+0
+
+ vsrari.h vr14, vr14, 1
+ vsadd.h vr18, vr27, vr14
+ vssub.h vr26, vr5, vr14
+ vssrani.bu.h vr18, vr18, 0 //dst-2
+ vssrani.bu.h vr26, vr26, 0 //dst+1
+
+ vbitsel.v vr27, vr18, vr8, vr2 //dst-2
+ vbitsel.v vr28, vr26, vr11, vr2 //dst+1
+
+ vbitsel.v vr23, vr7, vr23, vr17 //dst-3 (p2)
+ vbitsel.v vr24, vr27, vr24, vr17 //dst-2
+ vbitsel.v vr25, vr3, vr25, vr17 //dst-1
+ vbitsel.v vr19, vr4, vr19, vr17 //dst+0
+ vbitsel.v vr20, vr28, vr20, vr17 //dst+1
+ vbitsel.v vr21, vr12, vr21, vr17 //dst+2
+
+ vbitsel.v vr7, vr7, vr23, vr16 //-3
+ vbitsel.v vr8, vr8, vr24, vr16 //-2
+ vbitsel.v vr9, vr9, vr25, vr16 //-1
+ vbitsel.v vr10, vr10, vr19, vr16 //+0
+ vbitsel.v vr11, vr11, vr20, vr16 //+1
+ vbitsel.v vr12, vr12, vr21, vr16 //+2
+
+.ifc \DIR, h
+ vilvl.b vr6, vr7, vr6
+ vilvl.b vr8, vr9, vr8
+ vilvl.b vr10, vr11, vr10
+ vilvl.b vr12, vr13, vr12
+ vilvl.h vr6, vr8, vr6 //p3p2p1p0 -- -- --
+ vilvl.h vr10, vr12, vr10 //q0q1q2q3 -- -- --
+ vilvl.w vr0, vr10, vr6 //p3p2p1p0q0q1q2q3 --
+ vilvh.w vr1, vr10, vr6 //--
+
+ addi.d t5, a0, -4
+ vstelm.d vr0, t5, 0, 0
+ add.d t5, t5, a1
+ vstelm.d vr0, t5, 0, 1
+ add.d t5, t5, a1
+ vstelm.d vr1, t5, 0, 0
+ add.d t5, t5, a1
+ vstelm.d vr1, t5, 0, 1
+.else
+ alsl.d t5, a1, a1, 1
+ sub.d t5, a0, t5
+ fst.s f7, t5, 0
+ fstx.s f8, t5, a1
+ add.d t5, t5, a1
+ fstx.s f9, t5, a1
+
+ fst.s f10, a0, 0
+ add.d t5, a0, a1
+ fst.s f11, t5, 0
+ fstx.s f12, t5, a1
+.endif
+.END_FILTER_\DIR\()\TYPE\()_W8:
+.endm
+
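+// FILTER_W16: wd=16 luma filter. Loads p6..q6, builds the fm, flat8in and
+// flat8out masks, and per lane applies either the wide filter
+// (flat8out & flat8in), the flat 8-tap filter (flat8in only) or the narrow
+// hev-gated filter, writing p5..q5 back. The vertical path spills p5..q5 to
+// the stack because nearly all 32 LSX registers are live.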
+.macro FILTER_W16 DIR, TYPE
+.ifc \DIR, h
+ addi.d t5, a0, -7
+ vld vr6, t5, 0 //p6p5p4p3p2p1p0q0 q1q2q3q4q5q6
+ vldx vr7, t5, a1
+ add.d t5, t5, a1
+ vldx vr8, t5, a1
+ add.d t5, t5, a1
+ vldx vr9, t5, a1
+
+ vilvl.b vr10, vr7, vr6
+ vilvh.b vr11, vr7, vr6
+ vilvl.b vr12, vr9, vr8
+ vilvh.b vr13, vr9, vr8
+ vilvl.h vr6, vr12, vr10
+ vilvh.h vr10, vr12, vr10 //p2---
+ vilvl.h vr15, vr13, vr11 //q1---
+ vilvh.h vr19, vr13, vr11
+
+ vbsrl.v vr7, vr6, 4 //p5---
+ vbsrl.v vr8, vr6, 8 //p4---
+ vbsrl.v vr9, vr6, 12 //p3---
+ vbsrl.v vr12, vr10, 4 //p1---
+ vbsrl.v vr13, vr10, 8 //p0---
+ vbsrl.v vr14, vr10, 12 //q0---
+ vbsrl.v vr16, vr15, 4 //q2---
+ vbsrl.v vr17, vr15, 8 //q3---
+ vbsrl.v vr18, vr15, 12 //q4---
+ vbsrl.v vr20, vr19, 4 //q6---
+.else
+ slli.d t5, a1, 3
+ sub.d t5, a0, t5
+ fldx.s f6, t5, a1 //p6
+ alsl.d t5, a1, t5, 1
+ fld.s f7, t5, 0 //p5
+ fldx.s f8, t5, a1 //p4
+ alsl.d t5, a1, t5, 1
+ fld.s f9, t5, 0 //p3
+ fldx.s f10, t5, a1 //p2
+ alsl.d t5, a1, t5, 1
+ fld.s f12, t5, 0 //p1
+ fldx.s f13, t5, a1 //p0
+ alsl.d t5, a1, t5, 1
+ fld.s f14, t5, 0 //q0
+ fldx.s f15, t5, a1 //q1
+ alsl.d t5, a1, t5, 1
+ fld.s f16, t5, 0 //q2
+ fldx.s f17, t5, a1 //q3
+ alsl.d t5, a1, t5, 1
+ fld.s f18, t5, 0 //q4
+ fldx.s f19, t5, a1 //q5
+ add.d t5, t5, a1
+ fldx.s f20, t5, a1 //q6
+
+ //temp store
+ addi.d sp, sp, -96
+ fst.d f7, sp, 0
+ fst.d f8, sp, 8
+ fst.d f9, sp, 16
+ fst.d f10, sp, 24
+ fst.d f12, sp, 32
+ fst.d f13, sp, 40
+ fst.d f14, sp, 48
+ fst.d f15, sp, 56
+ fst.d f16, sp, 64
+ fst.d f17, sp, 72
+ fst.d f18, sp, 80
+ fst.d f19, sp, 88
+.endif
+
+ vabsd.bu vr21, vr12, vr13 //abs(p1-p0)
+ vabsd.bu vr22, vr15, vr14 //abs(q1-q0)
+ vmax.bu vr0, vr21, vr22
+ vslt.bu vr2, vr2, vr0 //hev
+ vabsd.bu vr1, vr10, vr12 //abs(p2-p1)
+ vmax.bu vr0, vr0, vr1
+ vabsd.bu vr1, vr16, vr15 //abs(q2-q1)
+ vmax.bu vr0, vr0, vr1
+ vabsd.bu vr1, vr9, vr10 //abs(p3-p2)
+ vmax.bu vr0, vr0, vr1
+ vabsd.bu vr1, vr17, vr16 //abs(q3-q2)
+ vmax.bu vr0, vr0, vr1
+ vsle.bu vr0, vr0, vr4 //vr4 released I
+ vabsd.bu vr1, vr13, vr14 //abs(p0-q0)
+ vsadd.bu vr1, vr1, vr1
+ vabsd.bu vr4, vr12, vr15 //abs(p1-q1)
+ vsrli.b vr4, vr4, 1
+ vsadd.bu vr1, vr1, vr4 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
+ vsle.bu vr1, vr1, vr3 //vr3 released E
+ vand.v vr0, vr0, vr1 //fm
+
+ vpickve2gr.wu t5, vr0, 0
+ beqz t5, .END_FILTER_\DIR\()\TYPE\()_W16
+
+ vabsd.bu vr1, vr6, vr13 //abs(p6-p0)
+ vabsd.bu vr4, vr7, vr13 //abs(p5-p0)
+ vmax.bu vr1, vr1, vr4
+ vabsd.bu vr4, vr8, vr13 //abs(p4-p0)
+ vmax.bu vr1, vr1, vr4
+ vabsd.bu vr4, vr18, vr14 //abs(q4-q0)
+ vmax.bu vr1, vr1, vr4
+ vabsd.bu vr4, vr19, vr14 //abs(q5-q0)
+ vmax.bu vr1, vr1, vr4
+ vabsd.bu vr4, vr20, vr14
+ vmax.bu vr1, vr1, vr4
+ vxor.v vr5, vr5, vr5
+ vaddi.bu vr5, vr5, 1 //F
+ vsle.bu vr1, vr1, vr5 //flat8out
+
+ vabsd.bu vr3, vr10, vr13 //abs(p2-p0)
+ vmax.bu vr3, vr3, vr21
+ vmax.bu vr3, vr3, vr22
+ vabsd.bu vr4, vr16, vr14 //abs(q2-q0)
+ vmax.bu vr3, vr3, vr4
+ vabsd.bu vr4, vr9, vr13 //abs(p3-p0)
+ vmax.bu vr3, vr3, vr4
+ vabsd.bu vr4, vr17, vr14 //abs(q3-q0)
+ vmax.bu vr3, vr3, vr4
+ vsle.bu vr3, vr3, vr5 //flatin released vr5
+
+ vsllwil.hu.bu vr6, vr6, 0 //p6
+ vsllwil.hu.bu vr7, vr7, 0 //p5
+ vsllwil.hu.bu vr8, vr8, 0 //p4
+ vsllwil.hu.bu vr9, vr9, 0 //p3
+ vsllwil.hu.bu vr10, vr10, 0 //p2
+ vsllwil.hu.bu vr12, vr12, 0 //p1
+ vsllwil.hu.bu vr13, vr13, 0 //p0
+ vsllwil.hu.bu vr14, vr14, 0 //q0
+ vsllwil.hu.bu vr15, vr15, 0 //q1
+ vsllwil.hu.bu vr16, vr16, 0 //q2
+ vsllwil.hu.bu vr17, vr17, 0 //q3
+ vsllwil.hu.bu vr18, vr18, 0 //q4
+ vsllwil.hu.bu vr19, vr19, 0 //q5
+ vsllwil.hu.bu vr20, vr20, 0 //q6
+
+ //dst-6
+ vslli.w vr21, vr6, 3
+ vssub.hu vr21, vr21, vr6
+ vsadd.hu vr21, vr21, vr7
+ vsadd.hu vr21, vr21, vr7
+ vsadd.hu vr21, vr21, vr8
+ vsadd.hu vr21, vr21, vr8
+ vsadd.hu vr21, vr21, vr9
+ vsadd.hu vr21, vr21, vr10
+ vsadd.hu vr21, vr21, vr12
+ vsadd.hu vr21, vr21, vr13
+ vsadd.hu vr21, vr21, vr14
+
+ //dst-5
+ vsadd.hu vr22, vr21, vr15
+ vsadd.hu vr22, vr22, vr9
+ vssub.hu vr22, vr22, vr6
+ vssub.hu vr22, vr22, vr6
+
+ //dst-4
+ vsadd.hu vr23, vr22, vr16
+ vsadd.hu vr23, vr23, vr10
+ vssub.hu vr23, vr23, vr7
+ vssub.hu vr23, vr23, vr6
+
+ //dst-3
+ vsadd.hu vr24, vr23, vr12
+ vsadd.hu vr24, vr24, vr17
+ vssub.hu vr24, vr24, vr6
+ vssub.hu vr24, vr24, vr8
+
+ //dst-2
+ vsadd.hu vr25, vr24, vr18
+ vsadd.hu vr25, vr25, vr13
+ vssub.hu vr25, vr25, vr6
+ vssub.hu vr25, vr25, vr9
+
+ //dst-1
+ vsadd.hu vr26, vr25, vr19
+ vsadd.hu vr26, vr26, vr14
+ vssub.hu vr26, vr26, vr6
+ vssub.hu vr26, vr26, vr10
+
+ //dst+0
+ vsadd.hu vr27, vr26, vr20
+ vsadd.hu vr27, vr27, vr15
+ vssub.hu vr27, vr27, vr6
+ vssub.hu vr27, vr27, vr12
+
+ //dst+1
+ vsadd.hu vr28, vr27, vr20
+ vsadd.hu vr28, vr28, vr16
+ vssub.hu vr28, vr28, vr7
+ vssub.hu vr28, vr28, vr13
+
+ //dst+2
+ vsadd.hu vr29, vr28, vr20
+ vsadd.hu vr29, vr29, vr17
+ vssub.hu vr29, vr29, vr8
+ vssub.hu vr29, vr29, vr14
+
+ //dst+3
+ vsadd.hu vr30, vr29, vr20
+ vsadd.hu vr30, vr30, vr18
+ vssub.hu vr30, vr30, vr9
+ vssub.hu vr30, vr30, vr15
+
+ //dst+4
+ vsadd.hu vr31, vr30, vr20
+ vsadd.hu vr31, vr31, vr19
+ vssub.hu vr31, vr31, vr10
+ vssub.hu vr31, vr31, vr16
+
+ //dst+5
+ vsadd.hu vr11, vr31, vr20
+ vsadd.hu vr11, vr11, vr20
+ vssub.hu vr11, vr11, vr12
+ vssub.hu vr11, vr11, vr17
+
+ vsrari.h vr21, vr21, 4
+ vsrari.h vr22, vr22, 4
+ vsrari.h vr23, vr23, 4
+ vsrari.h vr24, vr24, 4
+ vsrari.h vr25, vr25, 4
+ vsrari.h vr26, vr26, 4
+ vsrari.h vr27, vr27, 4
+ vsrari.h vr28, vr28, 4
+ vsrari.h vr29, vr29, 4
+ vsrari.h vr30, vr30, 4
+ vsrari.h vr31, vr31, 4
+ vsrari.h vr11, vr11, 4
+
+ vand.v vr1, vr1, vr3
+ vsllwil.h.b vr1, vr1, 0 //expand to h
+ //(flat8out & flat8in)
+ vbitsel.v vr21, vr7, vr21, vr1 //dst-6
+ vbitsel.v vr22, vr8, vr22, vr1 //dst-5
+ vbitsel.v vr23, vr9, vr23, vr1 //dst-4
+ vbitsel.v vr30, vr17, vr30, vr1 //dst+3
+ vbitsel.v vr31, vr18, vr31, vr1 //dst+4
+ vbitsel.v vr11, vr19, vr11, vr1 //dst+5
+
+ //flat8in
+ //dst-3
+ vslli.h vr4, vr9, 1
+ vsadd.hu vr4, vr4, vr9 //p3*3
+ vsadd.hu vr4, vr4, vr10
+ vsadd.hu vr4, vr4, vr10
+ vsadd.hu vr4, vr4, vr12
+ vsadd.hu vr4, vr4, vr13
+ vsadd.hu vr4, vr4, vr14
+
+ //dst-2
+ vsadd.hu vr5, vr4, vr12
+ vsadd.hu vr5, vr5, vr15
+ vssub.hu vr5, vr5, vr9
+ vssub.hu vr5, vr5, vr10
+
+ //dst-1
+ vsadd.hu vr18, vr5, vr13
+ vsadd.hu vr18, vr18, vr16
+ vssub.hu vr18, vr18, vr9
+ vssub.hu vr18, vr18, vr12
+
+ //dst+0
+ vsadd.hu vr7, vr18, vr14
+ vsadd.hu vr7, vr7, vr17
+ vssub.hu vr7, vr7, vr9
+ vssub.hu vr7, vr7, vr13
+
+ //dst+1
+ vsadd.hu vr8, vr7, vr15
+ vsadd.hu vr8, vr8, vr17
+ vssub.hu vr8, vr8, vr10
+ vssub.hu vr8, vr8, vr14
+
+ //dst+2
+ vsadd.hu vr9, vr8, vr16
+ vsadd.hu vr9, vr9, vr17
+ vssub.hu vr9, vr9, vr12
+ vssub.hu vr9, vr9, vr15
+
+ vsrari.h vr4, vr4, 3
+ vsrari.h vr5, vr5, 3
+ vsrari.h vr18, vr18, 3
+ vsrari.h vr7, vr7, 3
+ vsrari.h vr8, vr8, 3
+ vsrari.h vr9, vr9, 3
+
+ //flat8out & flat8in
+ vbitsel.v vr24, vr4, vr24, vr1 //dst-3
+ vbitsel.v vr25, vr5, vr25, vr1 //dst-2
+ vbitsel.v vr26, vr18, vr26, vr1 //dst-1
+ vbitsel.v vr27, vr7, vr27, vr1 //dst+0
+ vbitsel.v vr28, vr8, vr28, vr1 //dst+1
+ vbitsel.v vr29, vr9, vr29, vr1 //dst+2
+
+ //!flat8in
+ vsub.h vr17, vr12, vr15 //p1-q1
+ vsllwil.h.b vr2, vr2, 0
+ vand.v vr17, vr17, vr2 //&hev
+ vssrani.b.h vr17, vr17, 0
+ vsllwil.h.b vr17, vr17, 0
+
+ vsub.h vr7, vr14, vr13
+ vsadd.h vr8, vr7, vr7
+ vsadd.h vr7, vr7, vr8
+ vsadd.h vr7, vr7, vr17
+ vssrani.b.h vr7, vr7, 0
+ vsllwil.h.b vr17, vr7, 0 //f = iclip_diff(3 * (q0 - p0) + f);
+
+ vaddi.hu vr7, vr17, 4
+ vaddi.hu vr8, vr17, 3
+ li.w t5, 127
+ vreplgr2vr.h vr9, t5
+ vmin.h vr7, vr7, vr9
+ vmin.h vr8, vr8, vr9
+ vsrai.h vr7, vr7, 3 //f1
+ vsrai.h vr8, vr8, 3 //f2
+
+ vsadd.h vr4, vr13, vr8 //dst-1
+ vssub.h vr5, vr14, vr7 //dst+0
+
+ vsrari.h vr7, vr7, 1
+ vsadd.h vr17, vr12, vr7
+ vssub.h vr7, vr15, vr7
+ vbitsel.v vr17, vr17, vr12, vr2 //dst-2
+ vbitsel.v vr7, vr7, vr15, vr2 //dst+1
+
+ //flat8in or !flat8in
+ vsllwil.h.b vr3, vr3, 0
+ vbitsel.v vr24, vr10, vr24, vr3 //dst-3
+ vbitsel.v vr25, vr17, vr25, vr3 //dst-2
+ vbitsel.v vr26, vr4, vr26, vr3 //dst-1
+ vbitsel.v vr27, vr5, vr27, vr3 //dst+0
+ vbitsel.v vr28, vr7, vr28, vr3 //dst+1
+ vbitsel.v vr29, vr16, vr29, vr3 //dst+2
+
+.ifc \DIR, h
+ //dst-6,dst-2,dst-5,dst-1
+ vssrani.bu.h vr25, vr21, 0
+ vssrani.bu.h vr26, vr22, 0
+ vpermi.w vr25, vr25, 0xd8
+ vpermi.w vr26, vr26, 0xd8
+ vilvl.b vr6, vr26, vr25 //65656565 21212121
+
+ //dst-4,dst+0,dst-3,dst+1
+ vssrani.bu.h vr27, vr23, 0
+ vssrani.bu.h vr28, vr24, 0
+ vpermi.w vr27, vr27, 0xd8
+ vpermi.w vr28, vr28, 0xd8
+ vilvl.b vr26, vr28, vr27 //43434343 01010101
+
+ vilvl.h vr21, vr26, vr6 //6543 -- -- --
+ vilvh.h vr22, vr26, vr6 //2101 -- -- --
+ vilvl.w vr20, vr22, vr21 //65432101 --
+ vilvh.w vr22, vr22, vr21 //65432101 --
+ vreplvei.d vr21, vr20, 1
+ vreplvei.d vr23, vr22, 1
+
+ //dst+2,dst+4,dst+3,dst+5
+ vssrani.bu.h vr31, vr29, 0
+ vssrani.bu.h vr11, vr30, 0
+ vpermi.w vr31, vr31, 0xd8
+ vpermi.w vr11, vr11, 0xd8
+ vilvl.b vr11, vr11, vr31 //23232323 45454545
+ vshuf4i.w vr11, vr11, 0xd8
+ vshuf4i.h vr11, vr11, 0xd8 //2345 -- -- --
+
+ vextrins.w vr20, vr11, 0x20
+ vextrins.w vr21, vr11, 0x21
+ vextrins.w vr22, vr11, 0x22
+ vextrins.w vr23, vr11, 0x23
+
+ addi.d t5, a0, -6
+ vld vr6, t5, 0 //p6p5p4p3p2p1p0q0 q1q2q3q4q5q6
+ vldx vr7, t5, a1
+ add.d t5, t5, a1
+ vldx vr8, t5, a1
+ add.d t5, t5, a1
+ vldx vr9, t5, a1
+
+ //expand fm to 128
+ vreplvei.b vr10, vr0, 0
+ vreplvei.b vr11, vr0, 1
+ vreplvei.b vr12, vr0, 2
+ vreplvei.b vr13, vr0, 3
+
+ vbitsel.v vr20, vr6, vr20, vr10
+ vbitsel.v vr21, vr7, vr21, vr11
+ vbitsel.v vr22, vr8, vr22, vr12
+ vbitsel.v vr23, vr9, vr23, vr13
+
+ addi.d t5, a0, -6
+ vstelm.d vr20, t5, 0, 0
+ vstelm.w vr20, t5, 8, 2
+ add.d t5, t5, a1
+ vstelm.d vr21, t5, 0, 0
+ vstelm.w vr21, t5, 8, 2
+ add.d t5, t5, a1
+ vstelm.d vr22, t5, 0, 0
+ vstelm.w vr22, t5, 8, 2
+ add.d t5, t5, a1
+ vstelm.d vr23, t5, 0, 0
+ vstelm.w vr23, t5, 8, 2
+.else
+ //reload
+ fld.d f7, sp, 0
+ fld.d f8, sp, 8
+ fld.d f9, sp, 16
+ fld.d f10, sp, 24
+ fld.d f12, sp, 32
+ fld.d f13, sp, 40
+ fld.d f14, sp, 48
+ fld.d f15, sp, 56
+ fld.d f16, sp, 64
+ fld.d f17, sp, 72
+ fld.d f18, sp, 80
+ fld.d f19, sp, 88
+
+ vssrarni.bu.h vr21, vr21, 0
+ vssrarni.bu.h vr22, vr22, 0
+ vssrarni.bu.h vr23, vr23, 0
+ vssrarni.bu.h vr24, vr24, 0
+ vssrarni.bu.h vr25, vr25, 0
+ vssrarni.bu.h vr26, vr26, 0
+ vssrarni.bu.h vr27, vr27, 0
+ vssrarni.bu.h vr28, vr28, 0
+ vssrarni.bu.h vr29, vr29, 0
+ vssrarni.bu.h vr30, vr30, 0
+ vssrarni.bu.h vr31, vr31, 0
+ vssrarni.bu.h vr11, vr11, 0
+
+ vbitsel.v vr7, vr7, vr21, vr0 //p5
+ vbitsel.v vr8, vr8, vr22, vr0 //p4
+ vbitsel.v vr9, vr9, vr23, vr0 //p3
+ vbitsel.v vr10, vr10, vr24, vr0 //p2
+ vbitsel.v vr12, vr12, vr25, vr0 //p1
+ vbitsel.v vr13, vr13, vr26, vr0 //p0
+ vbitsel.v vr14, vr14, vr27, vr0 //q0
+ vbitsel.v vr15, vr15, vr28, vr0 //q1
+ vbitsel.v vr16, vr16, vr29, vr0 //q2
+ vbitsel.v vr17, vr17, vr30, vr0 //q3
+ vbitsel.v vr18, vr18, vr31, vr0 //q4
+ vbitsel.v vr19, vr19, vr11, vr0 //q5
+
+ fst.s f14, a0, 0
+ fstx.s f15, a0, a1
+ alsl.d t5, a1, a0, 1
+ fst.s f16, t5, 0
+ fstx.s f17, t5, a1
+ alsl.d t5, a1, t5, 1
+ fst.s f18, t5, 0
+ fstx.s f19, t5, a1
+
+ slli.w t5, a1, 2
+ alsl.d t5, a1, t5, 1
+ sub.d t5, a0, t5
+ fst.s f7, t5, 0
+ fstx.s f8, t5, a1
+ alsl.d t5, a1, t5, 1
+ fst.s f9, t5, 0
+ fstx.s f10, t5, a1
+ alsl.d t5, a1, t5, 1
+ fst.s f12, t5, 0
+ fstx.s f13, t5, a1
+.endif
+.END_FILTER_\DIR\()\TYPE\()_W16:
+.ifc \DIR, v
+ addi.d sp, sp, 96
+.endif
+.endm
+
+.macro PUSH_REG
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+.endm
+.macro POP_REG
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+.endm
+
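+// LPF_FUNC: superblock-edge entry point. Walks the vmask bits four pixels at
+// a time; for every set bit it takes L from l[0][0] (falling back to
+// l[-1][0] when it is zero), loads E and I from the level-indexed lut, and
+// dispatches to the widest allowed filter: W16/W8/W4 for luma, W6/W4 for
+// chroma.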
+.macro LPF_FUNC DIR, TYPE
+function lpf_\DIR\()_sb_\TYPE\()_8bpc_lsx
+ PUSH_REG
+ vld vr0, a2, 0 //vmask
+ vpickve2gr.wu t0, vr0, 0
+ vpickve2gr.wu t1, vr0, 1
+ vpickve2gr.wu t2, vr0, 2
+ li.w t3, 1 //y
+ or t0, t0, t1
+.ifc \TYPE, y
+ or t0, t0, t2 //vm
+.endif
+ addi.w t8, t3, -1
+ andn t8, t0, t8
+ beqz t0, .\DIR\()\TYPE\()_END
+.\DIR\()\TYPE\()_LOOP:
+ and t4, t0, t3 //vm & y
+ beqz t4, .\DIR\()\TYPE\()_LOOP_NEXT
+ vldrepl.b vr1, a3, 0 //l[0][0]
+.ifc \DIR, h
+ addi.d t5, a3, -4
+.else
+ slli.d t5, a4, 2
+ sub.d t5, a3, t5
+.endif
+ vldrepl.b vr2, t5, 0 //l[-1][0]
+ vseqi.b vr3, vr1, 0
+ vbitsel.v vr1, vr1, vr2, vr3 //L
+ vpickve2gr.b t5, vr1, 0
+ beqz t5, .\DIR\()\TYPE\()_LOOP_NEXT
+ vsrai.b vr2, vr1, 4 //H
+ add.d t6, a5, t5
+ vldrepl.b vr3, t6, 0 //E
+ addi.d t6, t6, 64
+ vldrepl.b vr4, t6, 0 //I
+.ifc \TYPE, y
+ and t5, t2, t3
+ bnez t5, .FILTER_\DIR\()\TYPE\()_16
+.endif
+ and t5, t1, t3
+.ifc \TYPE, y
+ bnez t5, .FILTER_\DIR\()\TYPE\()_8
+.else
+ bnez t5, .FILTER_\DIR\()\TYPE\()_6
+.endif
+ FILTER_W4 \DIR, \TYPE
+ b .\DIR\()\TYPE\()_LOOP_NEXT
+.ifc \TYPE, uv
+.FILTER_\DIR\()\TYPE\()_6:
+ FILTER_W6 \DIR, \TYPE
+.endif
+.ifc \TYPE, y
+.FILTER_\DIR\()\TYPE\()_8:
+ FILTER_W8 \DIR, \TYPE
+ b .\DIR\()\TYPE\()_LOOP_NEXT
+.FILTER_\DIR\()\TYPE\()_16:
+ FILTER_W16 \DIR, \TYPE
+.endif
+.\DIR\()\TYPE\()_LOOP_NEXT:
+ slli.w t3, t3, 1
+.ifc \DIR, h
+ alsl.d a0, a1, a0, 2
+ slli.w t8, a4, 2
+ add.d a3, a3, t8
+.else
+ addi.d a0, a0, 4
+ addi.d a3, a3, 4
+.endif
+ addi.w t8, t3, -1
+ andn t8, t0, t8
+ bnez t8, .\DIR\()\TYPE\()_LOOP
+.\DIR\()\TYPE\()_END:
+ POP_REG
+endfunc
+.endm
+
+LPF_FUNC h, y
+LPF_FUNC v, y
+LPF_FUNC h, uv
+LPF_FUNC v, uv
diff --git a/third_party/dav1d/src/loongarch/loopfilter.h b/third_party/dav1d/src/loongarch/loopfilter.h
new file mode 100644
index 0000000000..844faf0c30
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/loopfilter.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOONGARCH_LOOPFILTER_H
+#define DAV1D_SRC_LOONGARCH_LOOPFILTER_H
+
+#include "src/cpu.h"
+#include "src/loopfilter.h"
+
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, lsx));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, lsx));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, lsx));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, lsx));
+
+static ALWAYS_INLINE void loop_filter_dsp_init_loongarch(Dav1dLoopFilterDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
+
+#if BITDEPTH == 8
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, lsx);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, lsx);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, lsx);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, lsx);
+#endif
+}
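+
+/* Expected caller (a sketch only; the real guard macros live in the generic
+ * src/loopfilter_tmpl.c init, not in this header):
+ *
+ *   void dav1d_loop_filter_dsp_init_8bpc(Dav1dLoopFilterDSPContext *const c) {
+ *       ...
+ *   #if ARCH_LOONGARCH || ARCH_LOONGARCH64
+ *       loop_filter_dsp_init_loongarch(c);
+ *   #endif
+ *   }
+ */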
+
+#endif /* DAV1D_SRC_LOONGARCH_LOOPFILTER_H */
diff --git a/third_party/dav1d/src/loongarch/looprestoration.S b/third_party/dav1d/src/loongarch/looprestoration.S
new file mode 100644
index 0000000000..ab512d133c
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/looprestoration.S
@@ -0,0 +1,1407 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/loongarch/loongson_asm.S"
+
+#define REST_UNIT_STRIDE (400)
+
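+// MADD_HU_BU: widen the 16 unsigned bytes in \in0 to halfwords (low half via
+// vsllwil.hu.bu, high half via vexth.hu.bu, using vr12/vr13 as scratch) and
+// multiply-accumulate them against the replicated filter tap \in1 into the
+// accumulators \out0/\out1.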
+.macro MADD_HU_BU in0, in1, out0, out1
+ vsllwil.hu.bu vr12, \in0, 0
+ vexth.hu.bu vr13, \in0
+ vmadd.h \out0, vr12, \in1
+ vmadd.h \out1, vr13, \in1
+.endm
+
+const wiener_shuf
+.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
+endconst
+
+/*
+void wiener_filter_h_lsx(int32_t *hor_ptr,
+ uint8_t *tmp_ptr,
+ const int16_t filterh[8],
+ const int w, const int h)
+*/
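+/*
+ * Per-pixel sketch of the horizontal pass below (8 bpc; plain-C form for
+ * illustration, variable names are not taken from the C sources):
+ *
+ *   int32_t sum = (1 << 14) + (tmp_ptr[i + 3] << 7);
+ *   for (int k = 0; k < 7; k++)
+ *       sum += filterh[k] * tmp_ptr[i + k];
+ *   hor_ptr[i] = iclip((sum + 4) >> 3, 0, (1 << 13) - 1); // vsrari.w 3 + vclip.w
+ */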
+function wiener_filter_h_8bpc_lsx
+ addi.d sp, sp, -40
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ li.w t7, 1<<14 // clip_limit
+
+ la.local t1, wiener_shuf
+ vld vr4, t1, 0
+ vld vr14, a2, 0 // filter[0][k]
+ vreplvei.h vr21, vr14, 0
+ vreplvei.h vr22, vr14, 1
+ vreplvei.h vr23, vr14, 2
+ vreplvei.h vr24, vr14, 3
+ vreplvei.h vr25, vr14, 4
+ vreplvei.h vr26, vr14, 5
+ vreplvei.h vr27, vr14, 6
+ vreplgr2vr.w vr0, t7
+
+.WIENER_FILTER_H_H:
+ addi.w a4, a4, -1 // h
+ addi.w t0, a3, 0 // w
+ addi.d t1, a1, 0 // tmp_ptr
+ addi.d t2, a0, 0 // hor_ptr
+
+.WIENER_FILTER_H_W:
+ addi.w t0, t0, -16
+ vld vr5, t1, 0
+ vld vr13, t1, 16
+
+ vsubi.bu vr14, vr4, 2
+ vsubi.bu vr15, vr4, 1
+ vshuf.b vr6, vr13, vr5, vr14 // 1 ... 8, 9 ... 16
+ vshuf.b vr7, vr13, vr5, vr15 // 2 ... 9, 10 ... 17
+ vshuf.b vr8, vr13, vr5, vr4 // 3 ... 10, 11 ... 18
+ vaddi.bu vr14, vr4, 1
+ vaddi.bu vr15, vr4, 2
+ vshuf.b vr9, vr13, vr5, vr14 // 4 ... 11, 12 ... 19
+ vshuf.b vr10, vr13, vr5, vr15 // 5 ... 12, 13 ... 20
+ vaddi.bu vr14, vr4, 3
+ vshuf.b vr11, vr13, vr5, vr14 // 6 ... 13, 14 ... 21
+
+ vsllwil.hu.bu vr15, vr8, 0 // 3 4 5 6 7 8 9 10
+ vexth.hu.bu vr16, vr8 // 11 12 13 14 15 16 17 18
+ vsllwil.wu.hu vr17, vr15, 0 // 3 4 5 6
+ vexth.wu.hu vr18, vr15 // 7 8 9 10
+ vsllwil.wu.hu vr19, vr16, 0 // 11 12 13 14
+ vexth.wu.hu vr20, vr16 // 15 16 17 18
+ vslli.w vr17, vr17, 7
+ vslli.w vr18, vr18, 7
+ vslli.w vr19, vr19, 7
+ vslli.w vr20, vr20, 7
+ vxor.v vr15, vr15, vr15
+ vxor.v vr14, vr14, vr14
+
+ MADD_HU_BU vr5, vr21, vr14, vr15
+ MADD_HU_BU vr6, vr22, vr14, vr15
+ MADD_HU_BU vr7, vr23, vr14, vr15
+ MADD_HU_BU vr8, vr24, vr14, vr15
+ MADD_HU_BU vr9, vr25, vr14, vr15
+ MADD_HU_BU vr10, vr26, vr14, vr15
+ MADD_HU_BU vr11, vr27, vr14, vr15
+
+ vsllwil.w.h vr5, vr14, 0 // 0 1 2 3
+ vexth.w.h vr6, vr14 // 4 5 6 7
+ vsllwil.w.h vr7, vr15, 0 // 8 9 10 11
+ vexth.w.h vr8, vr15 // 12 13 14 15
+ vadd.w vr17, vr17, vr5
+ vadd.w vr18, vr18, vr6
+ vadd.w vr19, vr19, vr7
+ vadd.w vr20, vr20, vr8
+ vadd.w vr17, vr17, vr0
+ vadd.w vr18, vr18, vr0
+ vadd.w vr19, vr19, vr0
+ vadd.w vr20, vr20, vr0
+
+ vsrli.w vr1, vr0, 1
+ vsubi.wu vr1, vr1, 1
+ vxor.v vr3, vr3, vr3
+ vsrari.w vr17, vr17, 3
+ vsrari.w vr18, vr18, 3
+ vsrari.w vr19, vr19, 3
+ vsrari.w vr20, vr20, 3
+ vclip.w vr17, vr17, vr3, vr1
+ vclip.w vr18, vr18, vr3, vr1
+ vclip.w vr19, vr19, vr3, vr1
+ vclip.w vr20, vr20, vr3, vr1
+
+ vst vr17, t2, 0
+ vst vr18, t2, 16
+ vst vr19, t2, 32
+ vst vr20, t2, 48
+ addi.d t1, t1, 16
+ addi.d t2, t2, 64
+ blt zero, t0, .WIENER_FILTER_H_W
+
+ addi.d a1, a1, REST_UNIT_STRIDE
+ addi.d a0, a0, (REST_UNIT_STRIDE << 2)
+ bnez a4, .WIENER_FILTER_H_H
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ addi.d sp, sp, 40
+endfunc
+
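+// APPLY_FILTER: accumulate one vertical tap. \in1 + 4*\in0 addresses a row of
+// sixteen int32 intermediates in hor[]; they are multiplied by the replicated
+// tap \in2 and added into the running sums vr14-vr17.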
+.macro APPLY_FILTER in0, in1, in2
+ alsl.d t7, \in0, \in1, 2
+ vld vr10, t7, 0
+ vld vr11, t7, 16
+ vld vr12, t7, 32
+ vld vr13, t7, 48
+ vmadd.w vr14, vr10, \in2
+ vmadd.w vr15, vr11, \in2
+ vmadd.w vr16, vr12, \in2
+ vmadd.w vr17, vr13, \in2
+.endm
+
+.macro wiener_filter_v_8bpc_core_lsx
+ vreplgr2vr.w vr14, t6
+ vreplgr2vr.w vr15, t6
+ vreplgr2vr.w vr16, t6
+ vreplgr2vr.w vr17, t6
+
+ addi.w t7, t2, 0 // j + index k
+ mul.w t7, t7, t8 // (j + index) * REST_UNIT_STRIDE
+ add.w t7, t7, t4 // (j + index) * REST_UNIT_STRIDE + i
+
+ APPLY_FILTER t7, a2, vr2
+ APPLY_FILTER t8, t7, vr3
+ APPLY_FILTER t8, t7, vr4
+ APPLY_FILTER t8, t7, vr5
+ APPLY_FILTER t8, t7, vr6
+ APPLY_FILTER t8, t7, vr7
+ APPLY_FILTER t8, t7, vr8
+ vssrarni.hu.w vr15, vr14, 11
+ vssrarni.hu.w vr17, vr16, 11
+ vssrlni.bu.h vr17, vr15, 0
+.endm
+
+/*
+void wiener_filter_v_lsx(uint8_t *p,
+ const ptrdiff_t p_stride,
+ const int32_t *hor,
+ const int16_t filterv[8],
+ const int w, const int h)
+*/
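+/*
+ * Per-pixel sketch of the vertical pass (8 bpc; illustrative only):
+ *
+ *   int32_t sum = -(1 << 18);
+ *   for (int k = 0; k < 7; k++)
+ *       sum += filterv[k] * hor[(j + k) * REST_UNIT_STRIDE + i];
+ *   p[j * p_stride + i] = iclip((sum + (1 << 10)) >> 11, 0, 255);
+ */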
+function wiener_filter_v_8bpc_lsx
+ li.w t6, -(1 << 18)
+
+ li.w t8, REST_UNIT_STRIDE
+ ld.h t0, a3, 0
+ ld.h t1, a3, 2
+ vreplgr2vr.w vr2, t0
+ vreplgr2vr.w vr3, t1
+ ld.h t0, a3, 4
+ ld.h t1, a3, 6
+ vreplgr2vr.w vr4, t0
+ vreplgr2vr.w vr5, t1
+ ld.h t0, a3, 8
+ ld.h t1, a3, 10
+ vreplgr2vr.w vr6, t0
+ vreplgr2vr.w vr7, t1
+ ld.h t0, a3, 12
+ vreplgr2vr.w vr8, t0
+
+ andi t1, a4, 0xf
+ sub.w t0, a4, t1 // w-w%16
+ or t2, zero, zero // j
+ or t4, zero, zero
+ beqz t0, .WIENER_FILTER_V_W_LT16
+
+.WIENER_FILTER_V_H:
+ andi t1, a4, 0xf
+ add.d t3, zero, a0 // p
+ or t4, zero, zero // i
+
+.WIENER_FILTER_V_W:
+
+ wiener_filter_v_8bpc_core_lsx
+
+ mul.w t5, t2, a1 // j * stride
+ add.w t5, t5, t4 // j * stride + i
+ add.d t3, a0, t5
+ addi.w t4, t4, 16
+ vst vr17, t3, 0
+ bne t0, t4, .WIENER_FILTER_V_W
+
+ beqz t1, .WIENER_FILTER_V_W_EQ16
+
+ wiener_filter_v_8bpc_core_lsx
+
+ addi.d t3, t3, 16
+ andi t1, a4, 0xf
+
+.WIENER_FILTER_V_ST_REM:
+ vstelm.b vr17, t3, 0, 0
+ vbsrl.v vr17, vr17, 1
+ addi.d t3, t3, 1
+ addi.w t1, t1, -1
+ bnez t1, .WIENER_FILTER_V_ST_REM
+.WIENER_FILTER_V_W_EQ16:
+ addi.w t2, t2, 1
+ blt t2, a5, .WIENER_FILTER_V_H
+ b .WIENER_FILTER_V_END
+
+.WIENER_FILTER_V_W_LT16:
+ andi t1, a4, 0xf
+ add.d t3, zero, a0
+
+ wiener_filter_v_8bpc_core_lsx
+
+ mul.w t5, t2, a1 // j * stride
+ add.d t3, a0, t5
+
+.WIENER_FILTER_V_ST_REM_1:
+ vstelm.b vr17, t3, 0, 0
+ vbsrl.v vr17, vr17, 1
+ addi.d t3, t3, 1
+ addi.w t1, t1, -1
+ bnez t1, .WIENER_FILTER_V_ST_REM_1
+
+ addi.w t2, t2, 1
+ blt t2, a5, .WIENER_FILTER_V_W_LT16
+
+.WIENER_FILTER_V_END:
+endfunc
+
+/*
+void boxsum3_h(int32_t *sumsq, coef *sum, const pixel *src,
+ const int w, const int h)
+*/
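+// First pass of the 3x3 box sums: for every column x it produces the
+// vertical three-row sums
+//   sum[x]   = src[x] + src[x + stride] + src[x + 2*stride]        (16 bit)
+//   sumsq[x] = src[x]^2 + src[x + stride]^2 + src[x + 2*stride]^2  (32 bit)
+// with stride = REST_UNIT_STRIDE; boxsum3_v below then adds the three
+// neighbouring columns.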
+function boxsum3_h_8bpc_lsx
+ addi.d a2, a2, REST_UNIT_STRIDE
+ li.w t0, 1
+ addi.w a3, a3, -2
+ addi.w a4, a4, -4
+
+.LBS3_H_H:
+ alsl.d t1, t0, a1, 1 // sum_v *sum_v = sum + x
+ alsl.d t2, t0, a0, 2 // sumsq_v *sumsq_v = sumsq + x
+ add.d t3, t0, a2 // s
+ addi.w t5, a3, 0
+.LBS3_H_W:
+ vld vr0, t3, 0
+ vld vr1, t3, REST_UNIT_STRIDE
+ vld vr2, t3, (REST_UNIT_STRIDE<<1)
+
+ vilvl.b vr3, vr1, vr0
+ vhaddw.hu.bu vr4, vr3, vr3
+ vilvh.b vr5, vr1, vr0
+ vhaddw.hu.bu vr6, vr5, vr5
+ vsllwil.hu.bu vr7, vr2, 0
+ vexth.hu.bu vr8, vr2
+ // sum_v
+ vadd.h vr4, vr4, vr7
+ vadd.h vr6, vr6, vr8
+ vst vr4, t1, REST_UNIT_STRIDE<<1
+ vst vr6, t1, (REST_UNIT_STRIDE<<1)+16
+ addi.d t1, t1, 32
+ // sumsq
+ vmulwev.h.bu vr9, vr3, vr3
+ vmulwod.h.bu vr10, vr3, vr3
+ vmulwev.h.bu vr11, vr5, vr5
+ vmulwod.h.bu vr12, vr5, vr5
+ vmul.h vr7, vr7, vr7
+ vmul.h vr8, vr8, vr8
+ vaddwev.w.hu vr13, vr10, vr9
+ vaddwod.w.hu vr14, vr10, vr9
+ vilvl.w vr3, vr14, vr13
+ vilvh.w vr4, vr14, vr13
+ vaddwev.w.hu vr13, vr12, vr11
+ vaddwod.w.hu vr14, vr12, vr11
+ vilvl.w vr15, vr14, vr13
+ vilvh.w vr16, vr14, vr13
+ vsllwil.wu.hu vr9, vr7, 0
+ vexth.wu.hu vr10, vr7
+ vsllwil.wu.hu vr11, vr8, 0
+ vexth.wu.hu vr12, vr8
+ vadd.w vr9, vr9, vr3
+ vadd.w vr10, vr10, vr4
+ vadd.w vr11, vr11, vr15
+ vadd.w vr12, vr12, vr16
+ vst vr9, t2, REST_UNIT_STRIDE<<2
+ vst vr10, t2, (REST_UNIT_STRIDE<<2)+16
+ vst vr11, t2, (REST_UNIT_STRIDE<<2)+32
+ vst vr12, t2, (REST_UNIT_STRIDE<<2)+48
+ addi.d t2, t2, 64
+
+ addi.w t5, t5, -16
+ addi.d t3, t3, 16
+ blt zero, t5, .LBS3_H_W
+
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.d a1, a1, REST_UNIT_STRIDE<<1
+ addi.d a2, a2, REST_UNIT_STRIDE
+ addi.d a4, a4, -1
+ blt zero, a4, .LBS3_H_H
+
+.LBS3_H_END:
+endfunc
+
+/*
+void boxsum3_v(int32_t *sumsq, coef *sum,
+ const int w, const int h)
+*/
+function boxsum3_v_8bpc_lsx
+ addi.d a0, a0, (REST_UNIT_STRIDE<<2)
+ addi.d a1, a1, (REST_UNIT_STRIDE<<1)
+ addi.w a3, a3, -4
+ addi.w a2, a2, -4
+
+.LBS3_V_H:
+ sub.w t3, a2, zero
+ addi.d t0, a0, 4
+ addi.d t1, a1, 2
+ addi.d t5, a0, 8
+ addi.d t6, a1, 4
+
+ vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
+ vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
+ vld vr2, t1, 4 // c 2 3 4 5 6 7 8 9
+ vld vr3, t0, 0 // a2 0 1 2 3
+ vld vr4, t0, 4 // b2 1 2 3 4
+ vld vr5, t0, 8 // c2 2 3 4 5
+ vld vr6, t0, 16 // 3 4 5 6
+ vld vr7, t0, 20 // 4 5 6 7
+ vld vr8, t0, 24 // 5 6 7 8
+ vadd.h vr9, vr0, vr1
+ vadd.h vr9, vr9, vr2
+ vadd.w vr10, vr3, vr4
+ vadd.w vr10, vr10, vr5
+ vadd.w vr11, vr6, vr7
+ vadd.w vr11, vr11, vr8
+ vpickve2gr.h t7, vr2, 6
+ vpickve2gr.w t8, vr8, 2
+ vst vr9, t6, 0
+ vst vr10, t5, 0
+ vst vr11, t5, 16
+
+ addi.d t1, t1, 16
+ addi.d t0, t0, 32
+ addi.d t5, t5, 32
+ addi.d t6, t6, 16
+ addi.d t3, t3, -8
+ ble t3, zero, .LBS3_V_H0
+
+.LBS3_V_W8:
+ vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
+ vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
+ vld vr2, t1, 4 // c 2 3 4 5 6 7 8 9
+ vld vr3, t0, 0 // a2 0 1 2 3
+ vld vr4, t0, 4 // b2 1 2 3 4
+ vld vr5, t0, 8 // c2 2 3 4 5
+ vld vr6, t0, 16 // 3 4 5 6
+ vld vr7, t0, 20 // 4 5 6 7
+ vld vr8, t0, 24 // 5 6 7 8
+ vinsgr2vr.h vr0, t7, 0
+ vinsgr2vr.w vr3, t8, 0
+ vpickve2gr.h t7, vr2, 6
+ vpickve2gr.w t8, vr8, 2
+ vadd.h vr9, vr0, vr1
+ vadd.w vr10, vr3, vr4
+ vadd.w vr11, vr6, vr7
+ vadd.h vr9, vr9, vr2
+ vadd.w vr10, vr10, vr5
+ vadd.w vr11, vr11, vr8
+ vst vr9, t6, 0
+ vst vr10, t5, 0
+ vst vr11, t5, 16
+ addi.d t3, t3, -8
+ addi.d t1, t1, 16
+ addi.d t0, t0, 32
+ addi.d t5, t5, 32
+ addi.d t6, t6, 16
+ blt zero, t3, .LBS3_V_W8
+
+.LBS3_V_H0:
+ addi.d a1, a1, REST_UNIT_STRIDE<<1
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.w a3, a3, -1
+ bnez a3, .LBS3_V_H
+
+.LBS3_V_END:
+endfunc
+
+/*
+boxsum3_selfguided_filter(int32_t *sumsq, coef *sum,
+ const int w, const int h,
+ const unsigned s)
+*/
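+/*
+ * Per-element sketch of the a/b update below (n = 9 box, one_by_x = 455;
+ * illustrative only, with a = AA[i] and b = BB[i] on input):
+ *
+ *   const int p = imax(a * n - b * b, 0);
+ *   const int z = (p * s + (1 << 19)) >> 20;
+ *   const int x = dav1d_sgr_x_by_x[imin(z, 255)];
+ *   AA[i] = (x * b * 455 + (1 << 11)) >> 12;
+ *   BB[i] = 256 - x;
+ */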
+function boxsum3_sgf_h_8bpc_lsx
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.d a0, a0, 12 // AA
+ addi.d a1, a1, REST_UNIT_STRIDE<<1
+ addi.d a1, a1, 6 // BB
+ la.local t8, dav1d_sgr_x_by_x
+ li.w t6, 455
+ vreplgr2vr.w vr20, t6
+ li.w t6, 255
+ vreplgr2vr.w vr22, t6
+ vaddi.wu vr21, vr22, 1 // 256
+ vreplgr2vr.w vr6, a4
+ vldi vr19, 0x809
+ addi.w a2, a2, 2 // w + 2
+ addi.w a3, a3, 2 // h + 2
+
+.LBS3SGF_H_H:
+ addi.w t2, a2, 0
+ addi.d t0, a0, -4
+ addi.d t1, a1, -2
+
+.LBS3SGF_H_W:
+ addi.w t2, t2, -8
+ vld vr0, t0, 0 // AA[i]
+ vld vr1, t0, 16
+ vld vr2, t1, 0 // BB[i]
+
+ vmul.w vr4, vr0, vr19 // a * n
+ vmul.w vr5, vr1, vr19 // a * n
+ vsllwil.w.h vr9, vr2, 0
+ vexth.w.h vr10, vr2
+ vmsub.w vr4, vr9, vr9 // p
+ vmsub.w vr5, vr10, vr10 // p
+ vmaxi.w vr4, vr4, 0
+ vmaxi.w vr5, vr5, 0 // p
+ vmul.w vr4, vr4, vr6 // p * s
+ vmul.w vr5, vr5, vr6 // p * s
+ vsrlri.w vr4, vr4, 20
+ vsrlri.w vr5, vr5, 20 // z
+ vmin.w vr4, vr4, vr22
+ vmin.w vr5, vr5, vr22
+
+ vpickve2gr.w t6, vr4, 0
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr7, t7, 0
+ vpickve2gr.w t6, vr4, 1
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr7, t7, 1
+ vpickve2gr.w t6, vr4, 2
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr7, t7, 2
+ vpickve2gr.w t6, vr4, 3
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr7, t7, 3
+
+ vpickve2gr.w t6, vr5, 0
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr8, t7, 0
+ vpickve2gr.w t6, vr5, 1
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr8, t7, 1
+ vpickve2gr.w t6, vr5, 2
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr8, t7, 2
+ vpickve2gr.w t6, vr5, 3
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr8, t7, 3 // x
+
+ vmul.w vr9, vr7, vr9 // x * BB[i]
+ vmul.w vr10, vr8, vr10
+ vmul.w vr9, vr9, vr20 // x * BB[i] * sgr_one_by_x
+ vmul.w vr10, vr10, vr20
+ vsrlri.w vr9, vr9, 12
+ vsrlri.w vr10, vr10, 12
+ vsub.w vr7, vr21, vr7
+ vsub.w vr8, vr21, vr8
+ vpickev.h vr8, vr8, vr7
+
+ vst vr9, t0, 0
+ vst vr10, t0, 16
+ vst vr8, t1, 0
+ addi.d t0, t0, 32
+ addi.d t1, t1, 16
+ blt zero, t2, .LBS3SGF_H_W
+
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.d a1, a1, REST_UNIT_STRIDE<<1
+ addi.w a3, a3, -1
+ bnez a3, .LBS3SGF_H_H
+endfunc
+
+/*
+boxsum3_selfguided_filter(coef *dst, pixel *src,
+ int32_t *sumsq, coef *sum,
+ const int w, const int h)
+*/
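+/*
+ * Per-pixel sketch of the 8-neighbour weighting below (A = the int32 sumsq
+ * values, B = the coef sum values, stride = REST_UNIT_STRIDE; illustrative):
+ *
+ *   const int a = (B[i] + B[i-1] + B[i+1] + B[i-stride] + B[i+stride]) * 4
+ *               + (B[i-1-stride] + B[i+1-stride]
+ *                + B[i-1+stride] + B[i+1+stride]) * 3;
+ *   const int b = (A[i] + A[i-1] + A[i+1] + A[i-stride] + A[i+stride]) * 4
+ *               + (A[i-1-stride] + A[i+1-stride]
+ *                + A[i-1+stride] + A[i+1+stride]) * 3;
+ *   dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
+ */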
+function boxsum3_sgf_v_8bpc_lsx
+ addi.d a1, a1, (3*REST_UNIT_STRIDE+3) // src
+ addi.d a2, a2, REST_UNIT_STRIDE<<2
+ addi.d a2, a2, (REST_UNIT_STRIDE<<2)+12
+ addi.d a3, a3, REST_UNIT_STRIDE<<2
+ addi.d a3, a3, 6
+.LBS3SGF_V_H:
+ // A int32_t *sumsq
+ addi.d t0, a2, -(REST_UNIT_STRIDE<<2) // -stride
+ addi.d t1, a2, 0 // sumsq
+ addi.d t2, a2, REST_UNIT_STRIDE<<2 // +stride
+ addi.d t6, a1, 0
+ addi.w t7, a4, 0
+ addi.d t8, a0, 0
+ // B coef *sum
+ addi.d t3, a3, -(REST_UNIT_STRIDE<<1) // -stride
+ addi.d t4, a3, 0
+ addi.d t5, a3, REST_UNIT_STRIDE<<1
+
+.LBS3SGF_V_W:
+ vld vr0, t0, 0 // P[i - REST_UNIT_STRIDE]
+ vld vr1, t0, 16
+ vld vr2, t1, -4 // P[i-1]
+ vld vr3, t1, 12
+ vld vr4, t2, 0 // P[i + REST_UNIT_STRIDE]
+ vld vr5, t2, 16
+ vld vr6, t1, 0 // p[i]
+ vld vr7, t1, 16
+ vld vr8, t1, 4 // p[i+1]
+ vld vr9, t1, 20
+
+ vld vr10, t0, -4 // P[i - 1 - REST_UNIT_STRIDE]
+ vld vr11, t0, 12
+ vld vr12, t2, -4 // P[i - 1 + REST_UNIT_STRIDE]
+ vld vr13, t2, 12
+ vld vr14, t0, 4 // P[i + 1 - REST_UNIT_STRIDE]
+ vld vr15, t0, 20
+ vld vr16, t2, 4 // P[i + 1 + REST_UNIT_STRIDE]
+ vld vr17, t2, 20
+
+ vadd.w vr0, vr2, vr0
+ vadd.w vr4, vr6, vr4
+ vadd.w vr0, vr0, vr8
+ vadd.w vr20, vr0, vr4
+ vslli.w vr20, vr20, 2 // 0 1 2 3
+ vadd.w vr0, vr1, vr3
+ vadd.w vr4, vr5, vr7
+ vadd.w vr0, vr0, vr9
+ vadd.w vr21, vr0, vr4
+ vslli.w vr21, vr21, 2 // 4 5 6 7
+ vadd.w vr12, vr10, vr12
+ vadd.w vr16, vr14, vr16
+ vadd.w vr22, vr12, vr16
+ vslli.w vr23, vr22, 1
+ vadd.w vr22, vr23, vr22
+ vadd.w vr11, vr11, vr13
+ vadd.w vr15, vr15, vr17
+ vadd.w vr0, vr11, vr15
+ vslli.w vr23, vr0, 1
+ vadd.w vr23, vr23, vr0
+ vadd.w vr20, vr20, vr22 // b
+ vadd.w vr21, vr21, vr23
+
+ // B coef *sum
+ vld vr0, t3, 0 // P[i - REST_UNIT_STRIDE]
+ vld vr1, t4, -2 // p[i - 1]
+ vld vr2, t4, 0 // p[i]
+ vld vr3, t4, 2 // p[i + 1]
+ vld vr4, t5, 0 // P[i + REST_UNIT_STRIDE]
+ vld vr5, t3, -2 // P[i - 1 - REST_UNIT_STRIDE]
+ vld vr6, t5, -2 // P[i - 1 + REST_UNIT_STRIDE]
+ vld vr7, t3, 2 // P[i + 1 - REST_UNIT_STRIDE]
+ vld vr8, t5, 2 // P[i + 1 + REST_UNIT_STRIDE]
+ vaddwev.w.h vr9, vr0, vr1
+ vaddwod.w.h vr10, vr0, vr1
+ vaddwev.w.h vr11, vr2, vr3
+ vaddwod.w.h vr12, vr2, vr3
+ vadd.w vr9, vr11, vr9
+ vadd.w vr10, vr12, vr10
+ vilvl.w vr11, vr10, vr9 // 0 1 2 3
+ vilvh.w vr12, vr10, vr9 // 4 5 6 7
+ vsllwil.w.h vr0, vr4, 0
+ vexth.w.h vr1, vr4
+ vadd.w vr0, vr11, vr0
+ vadd.w vr1, vr12, vr1
+ vslli.w vr0, vr0, 2
+ vslli.w vr1, vr1, 2
+ vaddwev.w.h vr9, vr5, vr6
+ vaddwod.w.h vr10, vr5, vr6
+ vaddwev.w.h vr11, vr7, vr8
+ vaddwod.w.h vr12, vr7, vr8
+ vadd.w vr9, vr11, vr9
+ vadd.w vr10, vr12, vr10
+ vilvl.w vr13, vr10, vr9
+ vilvh.w vr14, vr10, vr9
+ vslli.w vr15, vr13, 1
+ vslli.w vr16, vr14, 1
+ vadd.w vr15, vr13, vr15 // a
+ vadd.w vr16, vr14, vr16
+ vadd.w vr22, vr0, vr15
+ vadd.w vr23, vr1, vr16
+ vld vr0, t6, 0 // src
+ vsllwil.hu.bu vr0, vr0, 0
+ vsllwil.wu.hu vr1, vr0, 0
+ vexth.wu.hu vr2, vr0
+ vmadd.w vr20, vr22, vr1
+ vmadd.w vr21, vr23, vr2
+ vssrlrni.h.w vr21, vr20, 9
+ vst vr21, t8, 0
+ addi.d t8, t8, 16
+
+ addi.d t0, t0, 32
+ addi.d t1, t1, 32
+ addi.d t2, t2, 32
+ addi.d t3, t3, 16
+ addi.d t4, t4, 16
+ addi.d t5, t5, 16
+ addi.d t6, t6, 8
+ addi.w t7, t7, -8
+ blt zero, t7, .LBS3SGF_V_W
+
+ addi.w a5, a5, -1
+ addi.d a0, a0, 384*2
+ addi.d a1, a1, REST_UNIT_STRIDE
+ addi.d a3, a3, REST_UNIT_STRIDE<<1
+ addi.d a2, a2, REST_UNIT_STRIDE<<2
+ bnez a5, .LBS3SGF_V_H
+endfunc
+
+#define FILTER_OUT_STRIDE (384)
+
+/*
+sgr_3x3_finish_c(const pixel *p, const ptrdiff_t stride,
+ const int16_t *dst, const int w1,
+ const int w, const int h);
+*/
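+/*
+ * Per-pixel sketch of the weighted blend below (8 bpc; illustrative):
+ *
+ *   const int u = p[i] << 4;
+ *   const int v = (u << 7) + w1 * (dst[i] - u);
+ *   p[i] = iclip((v + (1 << 10)) >> 11, 0, 255);
+ */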
+function sgr_3x3_finish_8bpc_lsx
+ vreplgr2vr.w vr3, a3 // w1
+ andi t4, a4, 0x7
+ sub.w t5, a4, t4
+
+ beq zero, t5, .LSGR3X3_REM
+
+.LSGR3X3_H:
+ addi.d t0, a0, 0
+ addi.d t1, a2, 0
+ addi.w t2, t5, 0
+ andi t4, a4, 0x7
+.LSGR3X3_W:
+ vld vr0, t0, 0
+ vld vr1, t1, 0
+ vsllwil.hu.bu vr2, vr0, 4 // u 8 h
+ vsllwil.wu.hu vr4, vr2, 0 // p
+ vexth.wu.hu vr5, vr2 // p
+ vslli.w vr6, vr4, 7
+ vslli.w vr7, vr5, 7
+ vsllwil.w.h vr8, vr1, 0 // dst
+ vexth.w.h vr9, vr1 // dst
+ vsub.w vr8, vr8, vr4
+ vsub.w vr9, vr9, vr5
+ vmadd.w vr6, vr8, vr3 // v 0 - 3
+ vmadd.w vr7, vr9, vr3 // v 4 - 7
+ vssrarni.hu.w vr7, vr6, 11
+ vssrlni.bu.h vr7, vr7, 0
+ vstelm.d vr7, t0, 0, 0
+ addi.d t0, t0, 8
+ addi.d t1, t1, 16
+ addi.d t2, t2, -8
+ bne zero, t2, .LSGR3X3_W
+
+ beq t4, zero, .LSGR3X3_NOREM
+
+ vld vr0, t0, 0
+ vld vr1, t1, 0
+ vsllwil.hu.bu vr2, vr0, 4 // u 8 h
+ vsllwil.wu.hu vr4, vr2, 0 // p
+ vexth.wu.hu vr5, vr2 // p
+ vslli.w vr6, vr4, 7
+ vslli.w vr7, vr5, 7
+ vsllwil.w.h vr8, vr1, 0 // dst
+ vexth.w.h vr9, vr1 // dst
+ vsub.w vr8, vr8, vr4
+ vsub.w vr9, vr9, vr5
+ vmadd.w vr6, vr8, vr3 // v 0 - 3
+ vmadd.w vr7, vr9, vr3 // v 4 - 7
+ vssrarni.hu.w vr7, vr6, 11
+ vssrlni.bu.h vr7, vr7, 0
+
+.LSGR3X3_ST:
+ vstelm.b vr7, t0, 0, 0
+ addi.d t0, t0, 1
+ vbsrl.v vr7, vr7, 1
+ addi.w t4, t4, -1
+ bnez t4, .LSGR3X3_ST
+
+.LSGR3X3_NOREM:
+ addi.w a5, a5, -1
+ add.d a0, a0, a1
+ addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
+ bnez a5, .LSGR3X3_H
+ b .LSGR3X3_END
+
+.LSGR3X3_REM:
+ andi t4, a4, 0x7
+ addi.d t0, a0, 0
+ vld vr0, t0, 0
+ vld vr1, a2, 0
+ vsllwil.hu.bu vr2, vr0, 4 // u 8 h
+ vsllwil.wu.hu vr4, vr2, 0 // p
+ vexth.wu.hu vr5, vr2 // p
+ vslli.w vr6, vr4, 7
+ vslli.w vr7, vr5, 7
+ vsllwil.w.h vr8, vr1, 0 // dst
+ vexth.w.h vr9, vr1 // dst
+ vsub.w vr8, vr8, vr4
+ vsub.w vr9, vr9, vr5
+ vmadd.w vr6, vr8, vr3 // v 0 - 3
+ vmadd.w vr7, vr9, vr3 // v 4 - 7
+ vssrarni.hu.w vr7, vr6, 11
+ vssrlni.bu.h vr7, vr7, 0
+
+.LSGR3X3_REM_ST:
+ vstelm.b vr7, t0, 0, 0
+ addi.d t0, t0, 1
+ vbsrl.v vr7, vr7, 1
+ addi.w t4, t4, -1
+ bnez t4, .LSGR3X3_REM_ST
+ addi.w a5, a5, -1
+ add.d a0, a0, a1
+ addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
+ bnez a5, .LSGR3X3_REM
+
+.LSGR3X3_END:
+endfunc
+
+/*
+void boxsum5_h(int32_t *sumsq, coef *sum,
+ const pixel *const src,
+ const int w, const int h)
+*/
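+// First pass of the 5x5 box sums: per column it adds five consecutive rows
+// of pixels (and of squared pixels) into sum/sumsq; boxsum5_v below then
+// adds five neighbouring columns.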
+function boxsum5_h_8bpc_lsx
+ addi.w a4, a4, -4
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.d a1, a1, REST_UNIT_STRIDE<<1
+ li.w t6, 1
+.LBOXSUM5_H_H:
+ addi.w t3, a3, 0
+ addi.d t2, a2, 0
+ addi.d t0, a0, 0
+ addi.d t1, a1, 0
+
+.LBOXSUM5_H_W:
+ vld vr0, t2, 0 // a
+ vld vr1, t2, REST_UNIT_STRIDE // b
+ vld vr2, t2, REST_UNIT_STRIDE<<1 // c
+ vld vr3, t2, REST_UNIT_STRIDE*3 // d
+ vld vr4, t2, REST_UNIT_STRIDE<<2 // e
+
+ vilvl.b vr5, vr1, vr0
+ vilvh.b vr6, vr1, vr0
+ vilvl.b vr7, vr3, vr2
+ vilvh.b vr8, vr3, vr2
+ //sum_v
+ vhaddw.hu.bu vr9, vr5, vr5 // 0 1 2 3 4 5 6 7
+ vhaddw.hu.bu vr10, vr6, vr6 // 8 9 10 11 12 13 14 15 a+b
+ vhaddw.hu.bu vr11, vr7, vr7
+ vhaddw.hu.bu vr12, vr8, vr8
+ vadd.h vr9, vr9, vr11
+ vadd.h vr10, vr10, vr12 // a + b + c + d
+ vsllwil.hu.bu vr11, vr4, 0
+ vexth.hu.bu vr12, vr4
+ vadd.h vr9, vr9, vr11
+ vadd.h vr10, vr10, vr12
+ vst vr9, t1, 0
+ vst vr10, t1, 16
+ addi.d t1, t1, 32
+
+ // sumsq
+ vmulwev.h.bu vr9, vr5, vr5 // a*a 0 1 2 3 4 5 6 7
+ vmulwev.h.bu vr10, vr6, vr6 // a*a 8 9 10 11 12 13 14 15
+ vmulwod.h.bu vr13, vr5, vr5 // b*b 0 1 2 3 4 5 6 7
+ vmulwod.h.bu vr14, vr6, vr6 // b*b 8 9 10 11 12 13 14 15
+ vmulwev.h.bu vr15, vr7, vr7 // c*c 0 1 2 3 4 5 6 7
+ vmulwev.h.bu vr16, vr8, vr8 // c*c 8 9 10 11 12 13 14 15
+ vmulwod.h.bu vr17, vr7, vr7 // d*d 0 1 2 3 4 5 6 7
+ vmulwod.h.bu vr18, vr8, vr8 // d*d 8 9 10 11 12 13 14 15
+ vaddwev.w.hu vr5, vr9, vr13 // 0 2 4 6
+ vaddwod.w.hu vr6, vr9, vr13 // 1 3 5 7
+ vaddwev.w.hu vr7, vr10, vr14 // 8 10 12 14
+ vaddwod.w.hu vr8, vr10, vr14 // 9 11 13 15 a + b
+ vaddwev.w.hu vr19, vr15, vr17 // 0 2 4 6
+ vaddwod.w.hu vr20, vr15, vr17 // 1 3 5 7
+ vaddwev.w.hu vr21, vr16, vr18 // 8 10 12 14
+ vaddwod.w.hu vr22, vr16, vr18 // 9 11 13 15 c + d
+ vadd.w vr5, vr5, vr19
+ vadd.w vr6, vr6, vr20
+ vadd.w vr7, vr7, vr21
+ vadd.w vr8, vr8, vr22
+ vilvl.w vr19, vr6, vr5
+ vilvh.w vr20, vr6, vr5
+ vilvl.w vr21, vr8, vr7
+ vilvh.w vr22, vr8, vr7
+ vmul.h vr11, vr11, vr11
+ vmul.h vr12, vr12, vr12
+ vsllwil.wu.hu vr0, vr11, 0
+ vexth.wu.hu vr1, vr11
+ vsllwil.wu.hu vr2, vr12, 0
+ vexth.wu.hu vr3, vr12
+ vadd.w vr19, vr19, vr0
+ vadd.w vr20, vr20, vr1
+ vadd.w vr21, vr21, vr2
+ vadd.w vr22, vr22, vr3
+ vst vr19, t0, 0
+ vst vr20, t0, 16
+ vst vr21, t0, 32
+ vst vr22, t0, 48
+ addi.d t0, t0, 64
+ addi.d t2, t2, 16
+ addi.w t3, t3, -16
+ blt zero, t3, .LBOXSUM5_H_W
+
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.d a1, a1, REST_UNIT_STRIDE<<1
+ addi.d a2, a2, REST_UNIT_STRIDE
+ addi.d a4, a4, -1
+ bnez a4, .LBOXSUM5_H_H
+endfunc
+
+/*
+void boxsum5_h(int32_t *sumsq, coef *sum,
+ const int w, const int h)
+*/
+function boxsum5_v_8bpc_lsx
+ addi.d a0, a0, (REST_UNIT_STRIDE<<2)
+ addi.d a1, a1, (REST_UNIT_STRIDE<<1)
+ addi.w a3, a3, -4
+ addi.w a2, a2, -4
+
+.LBOXSUM5_V_H:
+ addi.w t3, a2, 0
+ addi.d t0, a0, 0
+ addi.d t1, a1, 0
+ addi.d t2, a0, 8
+ addi.d t3, a1, 4
+ addi.d t4, a2, 0
+
+ vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
+ vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
+ vld vr2, t1, 4 // c 2
+ vld vr3, t1, 6 // d 3
+ vld vr4, t1, 8 // e 4 5 6 7 8 9 10 11
+ vadd.h vr5, vr0, vr1
+ vadd.h vr6, vr2, vr3
+ vpickve2gr.w t5, vr4, 2
+ vadd.h vr5, vr5, vr6
+ vadd.h vr5, vr5, vr4
+ vst vr5, t3, 0
+
+ vld vr0, t0, 0 // 0 1 2 3 a
+ vld vr1, t0, 4 // 1 2 3 4 b
+ vld vr2, t0, 8 // 2 3 4 5 c
+ vld vr3, t0, 12 // 3 4 5 6 d
+ vld vr4, t0, 16 // 4 5 6 7 e a
+ vld vr5, t0, 20 // 5 6 7 8 b
+ vld vr6, t0, 24 // 6 7 8 9 c
+ vld vr7, t0, 28 // 7 8 9 10 d
+ vld vr8, t0, 32 // 8 9 10 11 e
+
+ vadd.w vr9, vr0, vr1
+ vadd.w vr10, vr2, vr3
+ vadd.w vr9, vr9, vr10
+ vadd.w vr9, vr9, vr4
+ vadd.w vr10, vr4, vr5
+ vadd.w vr11, vr6, vr7
+ vadd.w vr10, vr10, vr8
+ vadd.w vr10, vr10, vr11
+ vst vr9, t2, 0
+ vst vr10, t2, 16
+
+ addi.d t3, t3, 16
+ addi.d t1, t1, 16
+ addi.d t0, t0, 32
+ addi.d t2, t2, 32
+ addi.w t4, t4, -8
+ ble t4, zero, .LBOXSUM5_V_H1
+
+.LBOXSUM5_V_W:
+ vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
+ vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
+ vld vr2, t1, 4 // c 2
+ vld vr3, t1, 6 // d 3
+ vld vr4, t1, 8 // e 4 5 6 7 8 9 10 11
+ vinsgr2vr.w vr0, t5, 0
+ vpickve2gr.w t5, vr4, 2
+ vextrins.h vr1, vr0, 0x01
+ vadd.h vr5, vr0, vr1
+ vadd.h vr6, vr2, vr3
+ vadd.h vr5, vr5, vr6
+ vadd.h vr5, vr5, vr4
+ vst vr5, t3, 0
+
+ vaddi.hu vr0, vr8, 0 // 8 9 10 11 a
+ vld vr1, t0, 4 // 9 10 11 12 b
+ vld vr2, t0, 8 // 10 11 12 13 c
+ vld vr3, t0, 12 // 14 15 16 17 d
+ vld vr4, t0, 16 // 15 16 17 18 e a
+ vld vr5, t0, 20 // 16 17 18 19 b
+ vld vr6, t0, 24 // 17 18 19 20 c
+ vld vr7, t0, 28 // 18 19 20 21 d
+ vld vr8, t0, 32 // 19 20 21 22 e
+ vextrins.w vr1, vr0, 0x01
+ vadd.w vr9, vr0, vr1
+ vadd.w vr10, vr2, vr3
+ vadd.w vr9, vr9, vr10
+ vadd.w vr9, vr9, vr4
+ vadd.w vr10, vr4, vr5
+ vadd.w vr11, vr6, vr7
+ vadd.w vr10, vr10, vr8
+ vadd.w vr10, vr10, vr11
+ vst vr9, t2, 0
+ vst vr10, t2, 16
+
+ addi.d t3, t3, 16
+ addi.d t1, t1, 16
+ addi.d t0, t0, 32
+ addi.d t2, t2, 32
+ addi.w t4, t4, -8
+ blt zero, t4, .LBOXSUM5_V_W
+
+.LBOXSUM5_V_H1:
+ addi.d a1, a1, REST_UNIT_STRIDE<<1
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.w a3, a3, -1
+ bnez a3, .LBOXSUM5_V_H
+endfunc
+
+/*
+selfguided_filter(int32_t *sumsq, coef *sum,
+ const int w, const int h,
+ const unsigned s)
+*/
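+// Same a/b update as in boxsum3_sgf_h above, but for the 5x5 box: n = 25
+// (vldi 0x819) and one_by_x = 164, and only every other row is processed
+// (the 5x5 filter consumes a/b with a vertical step of two).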
+function boxsum5_sgf_h_8bpc_lsx
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.d a0, a0, 12 // AA
+ addi.d a1, a1, REST_UNIT_STRIDE<<1
+ addi.d a1, a1, 6 // BB
+ la.local t8, dav1d_sgr_x_by_x
+ li.w t6, 164
+ vreplgr2vr.w vr20, t6
+ li.w t6, 255
+ vreplgr2vr.w vr22, t6
+ vaddi.wu vr21, vr22, 1 // 256
+ vreplgr2vr.w vr6, a4
+ vldi vr19, 0x819
+ addi.w a2, a2, 2 // w + 2
+ addi.w a3, a3, 2 // h + 2
+
+.LBS5SGF_H_H:
+ addi.w t2, a2, 0
+ addi.d t0, a0, -4
+ addi.d t1, a1, -2
+
+.LBS5SGF_H_W:
+ vld vr0, t0, 0 // AA[i]
+ vld vr1, t0, 16
+ vld vr2, t1, 0 // BB[i]
+
+ vmul.w vr4, vr0, vr19 // a * n
+ vmul.w vr5, vr1, vr19 // a * n
+ vsllwil.w.h vr9, vr2, 0
+ vexth.w.h vr10, vr2
+ vmsub.w vr4, vr9, vr9 // p
+ vmsub.w vr5, vr10, vr10 // p
+ vmaxi.w vr4, vr4, 0
+ vmaxi.w vr5, vr5, 0 // p
+ vmul.w vr4, vr4, vr6 // p * s
+ vmul.w vr5, vr5, vr6 // p * s
+ vsrlri.w vr4, vr4, 20
+ vsrlri.w vr5, vr5, 20 // z
+ vmin.w vr4, vr4, vr22
+ vmin.w vr5, vr5, vr22
+
+ // load table data
+ vpickve2gr.w t6, vr4, 0
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr7, t7, 0
+ vpickve2gr.w t6, vr4, 1
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr7, t7, 1
+ vpickve2gr.w t6, vr4, 2
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr7, t7, 2
+ vpickve2gr.w t6, vr4, 3
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr7, t7, 3
+
+ vpickve2gr.w t6, vr5, 0
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr8, t7, 0
+ vpickve2gr.w t6, vr5, 1
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr8, t7, 1
+ vpickve2gr.w t6, vr5, 2
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr8, t7, 2
+ vpickve2gr.w t6, vr5, 3
+ ldx.bu t7, t8, t6
+ vinsgr2vr.w vr8, t7, 3 // x
+
+ vmul.w vr9, vr7, vr9 // x * BB[i]
+ vmul.w vr10, vr8, vr10
+ vmul.w vr9, vr9, vr20 // x * BB[i] * sgr_one_by_x
+ vmul.w vr10, vr10, vr20
+ vsrlri.w vr9, vr9, 12
+ vsrlri.w vr10, vr10, 12
+ vsub.w vr7, vr21, vr7
+ vsub.w vr8, vr21, vr8
+ vpickev.h vr8, vr8, vr7
+ vst vr9, t0, 0
+ vst vr10, t0, 16
+ vst vr8, t1, 0
+ addi.d t0, t0, 32
+ addi.d t1, t1, 16
+ addi.w t2, t2, -8
+ blt zero, t2, .LBS5SGF_H_W
+
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.d a0, a0, REST_UNIT_STRIDE<<2
+ addi.d a1, a1, REST_UNIT_STRIDE<<2
+ addi.w a3, a3, -2
+ blt zero, a3, .LBS5SGF_H_H
+endfunc
+
+/*
+selfguided_filter(coef *dst, pixel *src,
+ int32_t *sumsq, coef *sum,
+ const int w, const int h)
+*/
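+// Neighbourhood weighting for the 5x5 case: even rows use the 6/5-weighted
+// six-neighbour kernel over the rows above and below (output rounded with
+// >> 9), odd rows reuse the current row of a/b with weights 6 (centre) and
+// 5 (left/right) and are rounded with >> 8; the trailing .LBS5SGF_V_W1 loop
+// handles a leftover final row.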
+function boxsum5_sgf_v_8bpc_lsx
+ addi.d a1, a1, 3*REST_UNIT_STRIDE+3 // src
+ addi.d a2, a2, (2*REST_UNIT_STRIDE+3)<<1 // A
+ addi.d a2, a2, (2*REST_UNIT_STRIDE+3)<<1
+ addi.d a3, a3, (2*REST_UNIT_STRIDE+3)<<1 // B
+ addi.w a5, a5, -1
+ vldi vr10, 0x806
+ vldi vr11, 0x805
+ vldi vr22, 0x406
+
+.LBS5SGF_V_H:
+ addi.d t0, a0, 0
+ addi.d t1, a1, 0
+ addi.d t2, a2, 0
+ addi.d t3, a3, 0
+ addi.w t4, a4, 0
+
+ addi.d t5, a0, 384*2
+ addi.d t6, a1, REST_UNIT_STRIDE
+ addi.d t7, a2, REST_UNIT_STRIDE<<2
+ addi.d t8, a3, REST_UNIT_STRIDE<<1 // B
+.LBS5SGF_V_W:
+ // a
+ vld vr0, t3, -REST_UNIT_STRIDE*2
+ vld vr1, t3, REST_UNIT_STRIDE*2
+ vld vr2, t3, (-REST_UNIT_STRIDE-1)*2
+ vld vr3, t3, (REST_UNIT_STRIDE-1)*2
+ vld vr4, t3, (1-REST_UNIT_STRIDE)*2
+ vld vr5, t3, (1+REST_UNIT_STRIDE)*2
+ vaddwev.w.h vr6, vr0, vr1
+ vaddwod.w.h vr7, vr0, vr1
+ vmul.w vr6, vr6, vr10
+ vmul.w vr7, vr7, vr10
+ vaddwev.w.h vr8, vr2, vr3
+ vaddwod.w.h vr9, vr2, vr3
+ vaddwev.w.h vr12, vr4, vr5
+ vaddwod.w.h vr13, vr4, vr5
+ vadd.w vr8, vr8, vr12
+ vadd.w vr9, vr9, vr13
+ vmadd.w vr6, vr8, vr11
+ vmadd.w vr7, vr9, vr11
+ vilvl.w vr18, vr7, vr6
+ vilvh.w vr19, vr7, vr6
+ // b
+ vld vr0, t2, -REST_UNIT_STRIDE*4
+ vld vr1, t2, -REST_UNIT_STRIDE*4+16
+ vld vr2, t2, REST_UNIT_STRIDE*4
+ vld vr3, t2, REST_UNIT_STRIDE*4+16
+ vld vr4, t2, (-REST_UNIT_STRIDE-1)*4
+ vld vr5, t2, (-REST_UNIT_STRIDE-1)*4+16
+ vld vr8, t2, (REST_UNIT_STRIDE-1)*4
+ vld vr9, t2, (REST_UNIT_STRIDE-1)*4+16
+ vld vr12, t2, (1-REST_UNIT_STRIDE)*4
+ vld vr13, t2, (1-REST_UNIT_STRIDE)*4+16
+ vld vr14, t2, (1+REST_UNIT_STRIDE)*4
+ vld vr15, t2, (1+REST_UNIT_STRIDE)*4+16
+ vadd.w vr0, vr0, vr2 // 0 1 2 3
+ vadd.w vr1, vr1, vr3 // 4 5 6 7
+ vmul.w vr20, vr0, vr10
+ vmul.w vr21, vr1, vr10
+ vadd.w vr4, vr4, vr8 // 0 1 2 3
+ vadd.w vr5, vr5, vr9 // 4 5 6 7
+ vadd.w vr12, vr12, vr14
+ vadd.w vr13, vr13, vr15
+ vadd.w vr12, vr12, vr4
+ vadd.w vr13, vr13, vr5
+ vmadd.w vr20, vr12, vr11
+ vmadd.w vr21, vr13, vr11
+ vld vr2, t1, 0
+ vsllwil.hu.bu vr2, vr2, 0
+ vsllwil.wu.hu vr3, vr2, 0
+ vexth.wu.hu vr4, vr2
+ vmadd.w vr20, vr18, vr3
+ vmadd.w vr21, vr19, vr4
+ vssrlrni.h.w vr21, vr20, 9
+ vst vr21, t0, 0
+
+ addi.d t1, t1, 8
+ addi.d t2, t2, 32
+ addi.d t3, t3, 16
+
+ // a
+ vld vr0, t8, 0
+ vld vr1, t8, -2
+ vld vr2, t8, 2
+ vmulwev.w.h vr3, vr0, vr22
+ vmulwod.w.h vr4, vr0, vr22
+ vaddwev.w.h vr5, vr1, vr2
+ vaddwod.w.h vr6, vr1, vr2
+ vmadd.w vr3, vr5, vr11
+ vmadd.w vr4, vr6, vr11
+ vilvl.w vr19, vr4, vr3
+ vilvh.w vr20, vr4, vr3
+ // b
+ vld vr0, t7, 0
+ vld vr1, t7, -4
+ vld vr2, t7, 4
+ vld vr5, t7, 16
+ vld vr6, t7, 12
+ vld vr7, t7, 20
+ vmul.w vr8, vr0, vr10
+ vmul.w vr9, vr5, vr10
+ vadd.w vr12, vr1, vr2
+ vadd.w vr13, vr6, vr7
+ vmadd.w vr8, vr12, vr11
+ vmadd.w vr9, vr13, vr11
+ vld vr2, t6, 0
+ vsllwil.hu.bu vr2, vr2, 0
+ vsllwil.wu.hu vr3, vr2, 0
+ vexth.wu.hu vr4, vr2
+ vmadd.w vr8, vr19, vr3
+ vmadd.w vr9, vr20, vr4
+ vssrlrni.h.w vr9, vr8, 8
+ vst vr9, t0, 384*2
+
+ addi.d t0, t0, 16
+ addi.d t8, t8, 16
+ addi.d t7, t7, 32
+ addi.d t6, t6, 8
+ addi.w t4, t4, -8
+ blt zero, t4, .LBS5SGF_V_W
+
+ addi.w a5, a5, -2
+ addi.d a0, a0, 384*4 // dst
+ addi.d a1, a1, REST_UNIT_STRIDE<<1 // src
+ addi.d a2, a2, REST_UNIT_STRIDE<<2 //
+ addi.d a2, a2, REST_UNIT_STRIDE<<2
+ addi.d a3, a3, REST_UNIT_STRIDE<<2 //
+ blt zero, a5, .LBS5SGF_V_H
+ bnez a5, .LBS5SGF_END
+.LBS5SGF_V_W1:
+ // a
+ vld vr0, a3, -REST_UNIT_STRIDE*2
+ vld vr1, a3, REST_UNIT_STRIDE*2
+ vld vr2, a3, (-REST_UNIT_STRIDE-1)*2
+ vld vr3, a3, (REST_UNIT_STRIDE-1)*2
+ vld vr4, a3, (1-REST_UNIT_STRIDE)*2
+ vld vr5, a3, (1+REST_UNIT_STRIDE)*2
+ vaddwev.w.h vr6, vr0, vr1
+ vaddwod.w.h vr7, vr0, vr1
+ vmul.w vr6, vr6, vr10
+ vmul.w vr7, vr7, vr10
+ vaddwev.w.h vr8, vr2, vr3
+ vaddwod.w.h vr9, vr2, vr3
+ vaddwev.w.h vr12, vr4, vr5
+ vaddwod.w.h vr13, vr4, vr5
+ vadd.w vr8, vr8, vr12
+ vadd.w vr9, vr9, vr13
+ vmadd.w vr6, vr8, vr11
+ vmadd.w vr7, vr9, vr11
+ vilvl.w vr18, vr7, vr6
+ vilvh.w vr19, vr7, vr6
+ // b
+ vld vr0, a2, -REST_UNIT_STRIDE*4
+ vld vr1, a2, -REST_UNIT_STRIDE*4+16
+ vld vr2, a2, REST_UNIT_STRIDE*4
+ vld vr3, a2, REST_UNIT_STRIDE*4+16
+ vld vr4, a2, (-REST_UNIT_STRIDE-1)*4
+ vld vr5, a2, (-REST_UNIT_STRIDE-1)*4+16
+ vld vr8, a2, (REST_UNIT_STRIDE-1)*4
+ vld vr9, a2, (REST_UNIT_STRIDE-1)*4+16
+ vld vr12, a2, (1-REST_UNIT_STRIDE)*4
+ vld vr13, a2, (1-REST_UNIT_STRIDE)*4+16
+ vld vr14, a2, (1+REST_UNIT_STRIDE)*4
+ vld vr15, a2, (1+REST_UNIT_STRIDE)*4+16
+ vadd.w vr0, vr0, vr2 // 0 1 2 3
+ vadd.w vr1, vr1, vr3 // 4 5 6 7
+ vmul.w vr20, vr0, vr10
+ vmul.w vr21, vr1, vr10
+ vadd.w vr4, vr4, vr8 // 0 1 2 3
+ vadd.w vr5, vr5, vr9 // 4 5 6 7
+ vadd.w vr12, vr12, vr14
+ vadd.w vr13, vr13, vr15
+ vadd.w vr12, vr12, vr4
+ vadd.w vr13, vr13, vr5
+ vmadd.w vr20, vr12, vr11
+ vmadd.w vr21, vr13, vr11
+ vld vr2, a1, 0
+ vsllwil.hu.bu vr2, vr2, 0
+ vsllwil.wu.hu vr3, vr2, 0
+ vexth.wu.hu vr4, vr2
+ vmadd.w vr20, vr18, vr3
+ vmadd.w vr21, vr19, vr4
+ vssrlrni.h.w vr21, vr20, 9
+ vst vr21, a0, 0
+ addi.d a3, a3, 16
+ addi.d a2, a2, 32
+ addi.d a1, a1, 8
+ addi.d a0, a0, 16
+ addi.w a4, a4, -8
+ blt zero, a4, .LBS5SGF_V_W1
+.LBS5SGF_END:
+endfunc
+
+/*
+void dav1d_sgr_mix_finish_lsx(uint8_t *p, const ptrdiff_t stride,
+ const int16_t *dst0, const int16_t *dst1,
+ const int w0, const int w1,
+ const int w, const int h);
+*/
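+/*
+ * As in sgr_3x3_finish above, but blending both SGR passes (illustrative):
+ *
+ *   const int u = p[i] << 4;
+ *   const int v = (u << 7) + w0 * (dst0[i] - u) + w1 * (dst1[i] - u);
+ *   p[i] = iclip((v + (1 << 10)) >> 11, 0, 255);
+ */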
+function sgr_mix_finish_8bpc_lsx
+ vreplgr2vr.w vr3, a4 // w0
+ vreplgr2vr.w vr13, a5 // w1
+ andi t4, a6, 0x7
+ sub.w t5, a6, t4
+
+ beq zero, t5, .LSGRMIX_REM
+
+.LSGRMIX_H:
+ addi.d t0, a0, 0
+ addi.d t1, a2, 0 // dst0
+ addi.d t3, a3, 0 // dst1
+ addi.w t2, t5, 0
+ andi t4, a6, 0x7
+.LSGRMIX_W:
+ vld vr0, t0, 0
+ vld vr1, t1, 0
+ vld vr10, t3, 0
+ vsllwil.hu.bu vr2, vr0, 4 // u 8 h
+ vsllwil.wu.hu vr4, vr2, 0 // u 0 1 2 3
+ vexth.wu.hu vr5, vr2 // u 4 5 6 7
+ vslli.w vr6, vr4, 7
+ vslli.w vr7, vr5, 7
+ vsllwil.w.h vr8, vr1, 0 // dst0
+ vexth.w.h vr9, vr1 // dst0
+ vsub.w vr8, vr8, vr4
+ vsub.w vr9, vr9, vr5
+ vmadd.w vr6, vr8, vr3 // v 0 - 3
+ vmadd.w vr7, vr9, vr3 // v 4 - 7
+
+ vsllwil.w.h vr11, vr10, 0 // dst1
+ vexth.w.h vr12, vr10 // dst1
+ vsub.w vr11, vr11, vr4
+ vsub.w vr12, vr12, vr5
+ vmadd.w vr6, vr11, vr13
+ vmadd.w vr7, vr12, vr13
+
+ vssrarni.hu.w vr7, vr6, 11
+ vssrlni.bu.h vr7, vr7, 0
+ vstelm.d vr7, t0, 0, 0
+ addi.d t0, t0, 8
+ addi.d t1, t1, 16
+ addi.d t3, t3, 16
+ addi.d t2, t2, -8
+ bne zero, t2, .LSGRMIX_W
+
+ beq t4, zero, .LSGRMIX_W8
+
+ vld vr0, t0, 0
+ vld vr1, t1, 0
+ vld vr10, t3, 0
+ vsllwil.hu.bu vr2, vr0, 4 // u 8 h
+ vsllwil.wu.hu vr4, vr2, 0 // p
+ vexth.wu.hu vr5, vr2 // p
+ vslli.w vr6, vr4, 7
+ vslli.w vr7, vr5, 7
+ vsllwil.w.h vr8, vr1, 0 // dst
+ vexth.w.h vr9, vr1 // dst
+ vsub.w vr8, vr8, vr4
+ vsub.w vr9, vr9, vr5
+ vmadd.w vr6, vr8, vr3 // v 0 - 3
+ vmadd.w vr7, vr9, vr3 // v 4 - 7
+
+ vsllwil.w.h vr11, vr10, 0 // dst1
+ vexth.w.h vr12, vr10 // dst1
+ vsub.w vr11, vr11, vr4
+ vsub.w vr12, vr12, vr5
+ vmadd.w vr6, vr11, vr13
+ vmadd.w vr7, vr12, vr13
+
+ vssrarni.hu.w vr7, vr6, 11
+ vssrlni.bu.h vr7, vr7, 0
+
+.LSGRMIX_ST:
+ vstelm.b vr7, t0, 0, 0
+ addi.d t0, t0, 1
+ vbsrl.v vr7, vr7, 1
+ addi.w t4, t4, -1
+ bnez t4, .LSGRMIX_ST
+
+.LSGRMIX_W8:
+ addi.w a7, a7, -1
+ add.d a0, a0, a1
+ addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
+ addi.d a3, a3, (FILTER_OUT_STRIDE<<1)
+ bnez a7, .LSGRMIX_H
+ b .LSGR_MIX_END
+
+.LSGRMIX_REM:
+ andi t4, a6, 0x7
+ vld vr0, a0, 0
+ vld vr1, a2, 0
+ vld vr10, a3, 0
+ vsllwil.hu.bu vr2, vr0, 4 // u 8 h
+ vsllwil.wu.hu vr4, vr2, 0 // p
+ vexth.wu.hu vr5, vr2 // p
+ vslli.w vr6, vr4, 7
+ vslli.w vr7, vr5, 7
+ vsllwil.w.h vr8, vr1, 0 // dst
+ vexth.w.h vr9, vr1 // dst
+ vsub.w vr8, vr8, vr4
+ vsub.w vr9, vr9, vr5
+ vmadd.w vr6, vr8, vr3 // v 0 - 3
+ vmadd.w vr7, vr9, vr3 // v 4 - 7
+
+ vsllwil.w.h vr11, vr10, 0 // dst1
+ vexth.w.h vr12, vr10 // dst1
+ vsub.w vr11, vr11, vr4
+ vsub.w vr12, vr12, vr5
+ vmadd.w vr6, vr11, vr13
+ vmadd.w vr7, vr12, vr13
+
+ vssrarni.hu.w vr7, vr6, 11
+ vssrlni.bu.h vr7, vr7, 0
+ addi.d t0, a0, 0
+.LSGRMIX_REM_ST:
+ vstelm.b vr7, t0, 0, 0
+ addi.d t0, t0, 1
+ vbsrl.v vr7, vr7, 1
+ addi.w t4, t4, -1
+ bnez t4, .LSGRMIX_REM_ST
+
+ addi.w a7, a7, -1
+ add.d a0, a0, a1
+ addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
+ addi.d a3, a3, (FILTER_OUT_STRIDE<<1)
+ bnez a7, .LSGRMIX_REM
+
+.LSGR_MIX_END:
+endfunc
diff --git a/third_party/dav1d/src/loongarch/looprestoration.h b/third_party/dav1d/src/loongarch/looprestoration.h
new file mode 100644
index 0000000000..ac0cb065c8
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/looprestoration.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H
+#define DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H
+
+#include "common/intops.h"
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t stride,
+ const uint8_t (*const left)[4],
+ const uint8_t *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
+
+void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
+
+void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
+
+void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
+
+static ALWAYS_INLINE void loop_restoration_dsp_init_loongarch(Dav1dLoopRestorationDSPContext *const c, int bpc)
+{
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
+
+#if BITDEPTH == 8
+ c->wiener[0] = c->wiener[1] = dav1d_wiener_filter_lsx;
+
+ c->sgr[0] = dav1d_sgr_filter_5x5_lsx;
+ c->sgr[1] = dav1d_sgr_filter_3x3_lsx;
+ c->sgr[2] = dav1d_sgr_filter_mix_lsx;
+#endif
+}
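+
+// This initializer is expected to be invoked from the generic, bitdepth-
+// templated dav1d_loop_restoration_dsp_init(); a rough sketch of the wiring
+// (the exact architecture guard used in src/looprestoration_tmpl.c is an
+// assumption here, not taken from this patch):
+//
+//   #if HAVE_ASM
+//   #if ARCH_LOONGARCH || ARCH_LOONGARCH64
+//       loop_restoration_dsp_init_loongarch(c, bpc);
+//   #endif
+//   #endif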
+
+#endif /* DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H */
diff --git a/third_party/dav1d/src/loongarch/looprestoration_tmpl.c b/third_party/dav1d/src/loongarch/looprestoration_tmpl.c
new file mode 100644
index 0000000000..66d0d638f6
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/looprestoration_tmpl.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/loongarch/looprestoration.h"
+
+#if BITDEPTH == 8
+
+#define REST_UNIT_STRIDE (400)
+
+void BF(dav1d_wiener_filter_h, lsx)(int32_t *hor_ptr,
+ uint8_t *tmp_ptr,
+ const int16_t filterh[8],
+ const int w, const int h);
+
+void BF(dav1d_wiener_filter_v, lsx)(uint8_t *p,
+ const ptrdiff_t p_stride,
+ const int32_t *hor,
+ const int16_t filterv[8],
+ const int w, const int h);
+
+// This function is based on the corresponding function in ppc/looprestoration_init_tmpl.c.
+static inline void padding(uint8_t *dst, const uint8_t *p,
+ const ptrdiff_t stride, const uint8_t (*left)[4],
+ const uint8_t *lpf, int unit_w, const int stripe_h,
+ const enum LrEdgeFlags edges)
+{
+ const int have_left = !!(edges & LR_HAVE_LEFT);
+ const int have_right = !!(edges & LR_HAVE_RIGHT);
+
+ // Copy more pixels if we don't have to pad them
+ unit_w += 3 * have_left + 3 * have_right;
+ uint8_t *dst_l = dst + 3 * !have_left;
+ p -= 3 * have_left;
+ lpf -= 3 * have_left;
+
+ if (edges & LR_HAVE_TOP) {
+ // Copy previous loop filtered rows
+ const uint8_t *const above_1 = lpf;
+ const uint8_t *const above_2 = above_1 + PXSTRIDE(stride);
+ pixel_copy(dst_l, above_1, unit_w);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
+ } else {
+ // Pad with first row
+ pixel_copy(dst_l, p, unit_w);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
+ if (have_left) {
+ pixel_copy(dst_l, &left[0][1], 3);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
+ }
+ }
+
+ uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
+ if (edges & LR_HAVE_BOTTOM) {
+ // Copy next loop filtered rows
+ const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(stride);
+ const uint8_t *const below_2 = below_1 + PXSTRIDE(stride);
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
+ } else {
+ // Pad with last row
+ const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(stride);
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
+ if (have_left) {
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ }
+ }
+
+ // Inner UNIT_WxSTRIPE_H
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
+ dst_tl += REST_UNIT_STRIDE;
+ p += PXSTRIDE(stride);
+ }
+
+ if (!have_right) {
+ uint8_t *pad = dst_l + unit_w;
+ uint8_t *row_last = &dst_l[unit_w - 1];
+ // Pad 3x(STRIPE_H+6) with last column
+ for (int j = 0; j < stripe_h + 6; j++) {
+ pixel_set(pad, *row_last, 3);
+ pad += REST_UNIT_STRIDE;
+ row_last += REST_UNIT_STRIDE;
+ }
+ }
+
+ if (!have_left) {
+ // Pad 3x(STRIPE_H+6) with first column
+ for (int j = 0; j < stripe_h + 6; j++) {
+ pixel_set(dst, *dst_l, 3);
+ dst += REST_UNIT_STRIDE;
+ dst_l += REST_UNIT_STRIDE;
+ }
+ } else {
+ dst += 3 * REST_UNIT_STRIDE;
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst, &left[j][1], 3);
+ dst += REST_UNIT_STRIDE;
+ }
+ }
+}
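+
+// After padding(), the unpadded source pixel p[j * PXSTRIDE(stride) + i] lives
+// at dst[(j + 3) * REST_UNIT_STRIDE + (i + 3)], i.e. every stripe row is framed
+// by three rows/columns of loop-filtered or replicated border on each side.
+// Hypothetical accessor, shown only to illustrate the layout:
+//
+//   static inline uint8_t padded_px(const uint8_t *tmp, const int i, const int j)
+//   { return tmp[(j + 3) * REST_UNIT_STRIDE + (i + 3)]; }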
+
+// This function is based on the corresponding function in ppc/looprestoration_init_tmpl.c.
+
+// FIXME Could split into luma and chroma specific functions,
+// (since first and last tops are always 0 for chroma)
+// FIXME Could implement a version that requires less temporary memory
+// (should be possible to implement with only 6 rows of temp storage)
+void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t p_stride,
+ const uint8_t (*const left)[4],
+ const uint8_t *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ const int16_t (*const filter)[8] = params->filter;
+
+ // Wiener filtering is applied to a maximum stripe height of 64 rows, plus
+ // 3 rows of padding above and 3 below
+ ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
+ padding(tmp, p, p_stride, left, lpf, w, h, edges);
+ ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);
+
+ BF(dav1d_wiener_filter_h, lsx)(hor, tmp, filter[0], w, h + 6);
+ BF(dav1d_wiener_filter_v, lsx)(p, p_stride, hor, filter[1], w, h);
+}
+
+void BF(dav1d_boxsum3_h, lsx)(int32_t *sumsq, int16_t *sum, pixel *src,
+ const int w, const int h);
+void BF(dav1d_boxsum3_v, lsx)(int32_t *sumsq, int16_t *sum,
+ const int w, const int h);
+
+void BF(dav1d_boxsum3_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
+ const int w, const int h, const int w1);
+void BF(dav1d_boxsum3_sgf_v, lsx)(int16_t *dst, uint8_t *tmp,
+ int32_t *sumsq, int16_t *sum,
+ const int w, const int h);
+void BF(dav1d_sgr_3x3_finish, lsx)(pixel *p, const ptrdiff_t p_stride,
+ int16_t *dst, int w1,
+ const int w, const int h);
+
+
+static inline void boxsum3_lsx(int32_t *sumsq, coef *sum, pixel *src,
+ const int w, const int h)
+{
+ BF(dav1d_boxsum3_h, lsx)(sumsq, sum, src, w + 6, h + 6);
+ BF(dav1d_boxsum3_v, lsx)(sumsq, sum, w + 6, h + 6);
+}
+
+void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
+ padding(tmp, p, p_stride, left, lpf, w, h, edges);
+ coef dst[64 * 384];
+
+ ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
+ ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );
+
+ boxsum3_lsx(sumsq, sum, tmp, w, h);
+ BF(dav1d_boxsum3_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s1);
+ BF(dav1d_boxsum3_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);
+ BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w1, w, h);
+}
+
+void BF(dav1d_boxsum5_h, lsx)(int32_t *sumsq, int16_t *sum,
+ const uint8_t *const src,
+ const int w, const int h);
+
+void BF(dav1d_boxsum5_v, lsx)(int32_t *sumsq, int16_t *sum,
+ const int w, const int h);
+
+void BF(dav1d_boxsum5_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
+ const int w, const int h,
+ const unsigned s);
+
+void BF(dav1d_boxsum5_sgf_v, lsx)(int16_t *dst, uint8_t *src,
+ int32_t *sumsq, int16_t *sum,
+ const int w, const int h);
+
+void BF(dav1d_sgr_mix_finish, lsx)(uint8_t *p, const ptrdiff_t stride,
+ const int16_t *dst0, const int16_t *dst1,
+ const int w0, const int w1,
+ const int w, const int h);
+
+static inline void boxsum5_lsx(int32_t *sumsq, coef *sum, pixel *src,
+ const int w, const int h)
+{
+ BF(dav1d_boxsum5_h, lsx)(sumsq, sum, src, w + 6, h + 6);
+ BF(dav1d_boxsum5_v, lsx)(sumsq, sum, w + 6, h + 6);
+}
+
+void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
+ padding(tmp, p, p_stride, left, lpf, w, h, edges);
+ coef dst[64 * 384];
+
+ ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
+ ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );
+
+ boxsum5_lsx(sumsq, sum, tmp, w, h);
+ BF(dav1d_boxsum5_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s0);
+ BF(dav1d_boxsum5_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);
+ BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w0, w, h);
+}
+
+void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
+ padding(tmp, p, p_stride, left, lpf, w, h, edges);
+ coef dst0[64 * 384];
+ coef dst1[64 * 384];
+
+ ALIGN_STK_16(int32_t, sumsq0, 68 * REST_UNIT_STRIDE + 8, );
+ ALIGN_STK_16(int16_t, sum0, 68 * REST_UNIT_STRIDE + 16, );
+
+ boxsum5_lsx(sumsq0, sum0, tmp, w, h);
+ BF(dav1d_boxsum5_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s0);
+ BF(dav1d_boxsum5_sgf_v, lsx)(dst0, tmp, sumsq0, sum0, w, h);
+
+ boxsum3_lsx(sumsq0, sum0, tmp, w, h);
+ BF(dav1d_boxsum3_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s1);
+ BF(dav1d_boxsum3_sgf_v, lsx)(dst1, tmp, sumsq0, sum0, w, h);
+
+ BF(dav1d_sgr_mix_finish, lsx)(p, p_stride, dst0, dst1, params->sgr.w0,
+ params->sgr.w1, w, h);
+}
+#endif
diff --git a/third_party/dav1d/src/loongarch/mc.S b/third_party/dav1d/src/loongarch/mc.S
new file mode 100644
index 0000000000..97887de4a7
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/mc.S
@@ -0,0 +1,4758 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/loongarch/loongson_asm.S"
+
+/*
+static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *const abcd, int mx, int my
+ HIGHBD_DECL_SUFFIX)
+*/
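+
+/*
+Both macros below select the 8-tap warp filter for the current subpel position
+with the lookup sketched here (reconstructed from the address arithmetic in the
+macros; dav1d_mc_warp_filter stores 8 int8_t taps per entry):
+
+    const int8_t *const flt = (const int8_t *)dav1d_mc_warp_filter
+                            + ((64 + ((tmx + 512) >> 10)) << 3);
+    tmx += abcd[0]; // horizontal pass; the vertical pass steps my by abcd[2]
+*/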
+.macro FILTER_WARP_RND_P_LSX in0, in1, in2, in3, out0, out1, out2, out3
+ vbsrl.v vr2, \in0, \in1
+ vbsrl.v vr20, \in0, \in2
+ addi.w t4, \in3, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr1, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ addi.w t4, t3, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr29, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ vilvl.d vr2, vr20, vr2
+ vilvl.d vr1, vr29, vr1
+ vmulwev.h.bu.b vr3, vr2, vr1
+ vmulwod.h.bu.b vr20, vr2, vr1
+ vilvl.d vr2, vr20, vr3
+ vhaddw.w.h vr2, vr2, vr2
+ vhaddw.d.w vr2, vr2, vr2
+ vhaddw.q.d vr2, vr2, vr2
+ vilvh.d vr3, vr20, vr3
+ vhaddw.w.h vr3, vr3, vr3
+ vhaddw.d.w vr3, vr3, vr3
+ vhaddw.q.d vr3, vr3, vr3
+ vextrins.w \out0, vr2, \out1
+ vextrins.w \out2, vr3, \out3
+.endm
+
+.macro FILTER_WARP_CLIP_LSX in0, in1, in2, out0, out1
+ add.w \in0, \in0, \in1
+ addi.w t6, \in0, 512
+ srai.w t6, t6, 10
+ addi.w t6, t6, 64
+ slli.w t6, t6, 3
+ fldx.d f1, t5, t6
+ vsllwil.h.b vr1, vr1, 0
+ vmulwev.w.h vr3, \in2, vr1
+ vmaddwod.w.h vr3, \in2, vr1
+ vhaddw.d.w vr3, vr3, vr3
+ vhaddw.q.d vr3, vr3, vr3
+ vextrins.w \out0, vr3, \out1
+.endm
+
+const warp_sh
+.rept 2
+.byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+.endr
+.rept 2
+.byte 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.endr
+endconst
+
+.macro warp_lsx t, shift
+function warp_affine_8x8\t\()_8bpc_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+
+ la.local t4, warp_sh
+ ld.h t0, a4, 0 // abcd[0]
+ ld.h t1, a4, 2 // abcd[1]
+
+ alsl.w t2, a3, a3, 1
+ addi.w t3, a5, 0
+ la.local t5, dav1d_mc_warp_filter
+ sub.d a2, a2, t2
+ addi.d a2, a2, -3
+ vld vr0, a2, 0
+ vld vr30, t4, 0
+ vld vr31, t4, 32
+
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30
+
+ add.w a5, t1, a5
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x00, vr13, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x00, vr15, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x00, vr17, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x00, vr19, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x10, vr13, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x10, vr15, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x10, vr17, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x10, vr19, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x20, vr13, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x20, vr15, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x20, vr17, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x20, vr19, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x30, vr13, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x30, vr15, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x30, vr17, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x30, vr19, 0x30
+
+ vsrarni.h.w vr12, vr4, 3
+ vsrarni.h.w vr13, vr5, 3
+ vsrarni.h.w vr14, vr6, 3
+ vsrarni.h.w vr15, vr7, 3
+ vsrarni.h.w vr16, vr8, 3
+ vsrarni.h.w vr17, vr9, 3
+ vsrarni.h.w vr18, vr10, 3
+ vsrarni.h.w vr19, vr11, 3
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x00, vr22, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x00, vr24, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x00, vr26, 0x00
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x00, vr28, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x10, vr22, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x10, vr24, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x10, vr26, 0x10
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x10, vr28, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x20, vr22, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x20, vr24, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x20, vr26, 0x20
+ FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x20, vr28, 0x20
+
+ vsrarni.h.w vr21, vr4, 3
+ vsrarni.h.w vr22, vr5, 3
+ vsrarni.h.w vr23, vr6, 3
+ vsrarni.h.w vr24, vr7, 3
+ vsrarni.h.w vr25, vr8, 3
+ vsrarni.h.w vr26, vr9, 3
+ vsrarni.h.w vr27, vr10, 3
+ vsrarni.h.w vr28, vr11, 3
+
+ addi.w t2, a6, 0 // my
+ ld.h t7, a4, 4 // abcd[2]
+ ld.h t8, a4, 6 // abcd[3]
+
+.ifnb \t
+ slli.d a1, a1, 1
+.endif
+
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vst vr5, a0, 0
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fst.d f5, a0, 0
+.endif
+
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vstx vr5, a0, a1
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fstx.d f5, a0, a1
+.endif
+
+ vaddi.bu vr31, vr31, 2
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+ alsl.d a0, a1, a0, 1
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vst vr5, a0, 0
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fst.d f5, a0, 0
+.endif
+
+ vaddi.bu vr31, vr31, 2
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vstx vr5, a0, a1
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fstx.d f5, a0, a1
+.endif
+
+ vaddi.bu vr31, vr31, 2
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+ alsl.d a0, a1, a0, 1
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vst vr5, a0, 0
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fst.d f5, a0, 0
+.endif
+
+ vaddi.bu vr31, vr31, 2
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vstx vr5, a0, a1
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fstx.d f5, a0, a1
+.endif
+
+ vaddi.bu vr31, vr31, 2
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+ vextrins.h vr30, vr31, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+ alsl.d a0, a1, a0, 1
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vst vr5, a0, 0
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fst.d f5, a0, 0
+.endif
+
+ vshuf.b vr12, vr21, vr12, vr30
+ vshuf.b vr13, vr22, vr13, vr30
+ vshuf.b vr14, vr23, vr14, vr30
+ vshuf.b vr15, vr24, vr15, vr30
+ vshuf.b vr16, vr25, vr16, vr30
+ vshuf.b vr17, vr26, vr17, vr30
+ vshuf.b vr18, vr27, vr18, vr30
+ vshuf.b vr19, vr28, vr19, vr30
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30
+ FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00
+ FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10
+ FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20
+ FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30
+.ifnb \t
+ vssrarni.h.w vr5, vr4, \shift
+ vstx vr5, a0, a1
+.else
+ vssrarni.hu.w vr5, vr4, \shift
+ vssrlni.bu.h vr5, vr5, 0
+ fstx.d f5, a0, a1
+.endif
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+.endm
+
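+// The two expansions below are:
+//   warp_lsx  , 11 -> warp_affine_8x8_8bpc_lsx  (8-bit pixel output, final shift 11)
+//   warp_lsx t,  7 -> warp_affine_8x8t_8bpc_lsx (int16_t intermediate output, shift 7)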
+warp_lsx , 11
+warp_lsx t, 7
+
+.macro FILTER_WARP_RND_P_LASX in0, in1, in2, out0, out1, out2, out3
+ xvshuf.b xr2, \in0, \in0, \in2
+
+ addi.w t4, \in1, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr3, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ addi.w t4, t3, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr4, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ addi.w t4, t3, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr5, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ addi.w t4, t3, 512
+ srai.w t4, t4, 10
+ addi.w t4, t4, 64
+ slli.w t4, t4, 3
+ vldx vr6, t5, t4
+ add.w t3, t3, t0 // tmx += abcd[0]
+
+ xvinsve0.d xr3, xr5, 1
+ xvinsve0.d xr3, xr4, 2
+ xvinsve0.d xr3, xr6, 3
+
+ xvmulwev.h.bu.b xr4, xr2, xr3
+ xvmulwod.h.bu.b xr5, xr2, xr3
+ xvilvl.d xr2, xr5, xr4
+ xvilvh.d xr3, xr5, xr4
+ xvhaddw.w.h xr2, xr2, xr2
+ xvhaddw.w.h xr3, xr3, xr3
+ xvhaddw.d.w xr2, xr2, xr2
+ xvhaddw.d.w xr3, xr3, xr3
+ xvhaddw.q.d xr2, xr2, xr2
+ xvhaddw.q.d xr3, xr3, xr3
+
+ xvextrins.w \out0, xr2, \out1
+ xvextrins.w \out2, xr3, \out3
+.endm
+
+.macro FILTER_WARP_CLIP_LASX in0, in1, in2, out0, out1
+ add.w \in0, \in0, \in1
+ addi.w t6, \in0, 512
+ srai.w t6, t6, 10
+ addi.w t6, t6, 64
+ slli.w t6, t6, 3
+ fldx.d f1, t5, t6
+
+ add.w t2, t2, t7
+ addi.w t6, t2, 512
+ srai.w t6, t6, 10
+ addi.w t6, t6, 64
+ slli.w t6, t6, 3
+ fldx.d f2, t5, t6
+
+ vilvl.d vr0, vr2, vr1
+ vext2xv.h.b xr0, xr0
+ xvmulwev.w.h xr3, \in2, xr0
+ xvmaddwod.w.h xr3, \in2, xr0
+ xvhaddw.d.w xr3, xr3, xr3
+ xvhaddw.q.d xr3, xr3, xr3
+ xvextrins.w \out0, xr3, \out1
+.endm
+
+const shuf0
+.byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+.byte 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10
+endconst
+
+.macro warp_lasx t, shift
+function warp_affine_8x8\t\()_8bpc_lasx
+ addi.d sp, sp, -16
+ ld.h t0, a4, 0 // abcd[0]
+ ld.h t1, a4, 2 // abcd[1]
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+
+ alsl.w t2, a3, a3, 1
+ addi.w t3, a5, 0
+ la.local t4, warp_sh
+ la.local t5, dav1d_mc_warp_filter
+ sub.d a2, a2, t2
+ addi.d a2, a2, -3
+ vld vr0, a2, 0
+ xvld xr24, t4, 0
+ xvld xr25, t4, 32
+ la.local t2, shuf0
+ xvld xr1, t2, 0
+ xvpermi.q xr0, xr0, 0x00
+ xvaddi.bu xr9, xr1, 4
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x00, xr13, 0x00
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x00, xr15, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x10, xr13, 0x10
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x10, xr15, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x20, xr13, 0x20
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x20, xr15, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x30, xr13, 0x30
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x30, xr15, 0x30
+
+ xvsrarni.h.w xr12, xr7, 3
+ xvsrarni.h.w xr13, xr8, 3
+ xvsrarni.h.w xr14, xr10, 3
+ xvsrarni.h.w xr15, xr11, 3
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x00, xr17, 0x00
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x00, xr19, 0x00
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x10, xr17, 0x10
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x10, xr19, 0x10
+
+ add.w a5, a5, t1
+ or t3, a5, a5
+ add.d a2, a2, a3
+ vld vr0, a2, 0
+ xvpermi.q xr0, xr0, 0x00
+ FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x20, xr17, 0x20
+ FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x20, xr19, 0x20
+
+ xvsrarni.h.w xr16, xr7, 3
+ xvsrarni.h.w xr17, xr8, 3
+ xvsrarni.h.w xr18, xr10, 3
+ xvsrarni.h.w xr19, xr11, 3
+
+ addi.w t2, a6, 0 // my
+ ld.h t7, a4, 4 // abcd[2]
+ ld.h t8, a4, 6 // abcd[3]
+
+.ifnb \t
+ slli.d a1, a1, 1
+.endif
+
+ // y = 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
+
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
+
+.ifnb \t
+ xvssrarni.h.w xr21, xr20, \shift
+ xvpermi.q xr22, xr21, 0x01
+ vilvl.h vr23, vr22, vr21
+ vilvh.h vr21, vr22, vr21
+ vst vr23, a0, 0
+ vstx vr21, a0, a1
+.else
+ xvssrarni.hu.w xr21, xr20, \shift
+ xvssrlni.bu.h xr22, xr21, 0
+ xvpermi.q xr23, xr22, 0x01
+ vilvl.b vr21, vr23, vr22
+ fst.d f21, a0, 0
+ add.d a0, a0, a1
+ vstelm.d vr21, a0, 0, 1
+.endif
+
+ xvaddi.bu xr25, xr25, 2
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
+
+ xvaddi.bu xr25, xr25, 2
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
+
+.ifnb \t
+ xvssrarni.h.w xr21, xr20, \shift
+ alsl.d a0, a1, a0, 1
+ xvpermi.q xr22, xr21, 0x01
+ vilvl.h vr23, vr22, vr21
+ vilvh.h vr21, vr22, vr21
+ vst vr23, a0, 0
+ vstx vr21, a0, a1
+.else
+ xvssrarni.hu.w xr21, xr20, 11
+ xvssrlni.bu.h xr22, xr21, 0
+ xvpermi.q xr23, xr22, 0x01
+ vilvl.b vr21, vr23, vr22
+ add.d a0, a0, a1
+ fst.d f21, a0, 0
+ add.d a0, a0, a1
+ vstelm.d vr21, a0, 0, 1
+.endif
+
+ xvaddi.bu xr25, xr25, 2
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
+
+ xvaddi.bu xr25, xr25, 2
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
+
+.ifnb \t
+ xvssrarni.h.w xr21, xr20, \shift
+ alsl.d a0, a1, a0, 1
+ xvpermi.q xr22, xr21, 0x01
+ vilvl.h vr23, vr22, vr21
+ vilvh.h vr21, vr22, vr21
+ vst vr23, a0, 0
+ vstx vr21, a0, a1
+.else
+ xvssrarni.hu.w xr21, xr20, 11
+ xvssrlni.bu.h xr22, xr21, 0
+ xvpermi.q xr23, xr22, 0x01
+ vilvl.b vr21, vr23, vr22
+ add.d a0, a0, a1
+ fst.d f21, a0, 0
+ add.d a0, a0, a1
+ vstelm.d vr21, a0, 0, 1
+.endif
+
+ xvaddi.bu xr25, xr25, 2
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+ xvextrins.h xr24, xr25, 0x70
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
+
+ xvshuf.b xr12, xr16, xr12, xr24
+ xvshuf.b xr13, xr17, xr13, xr24
+ xvshuf.b xr14, xr18, xr14, xr24
+ xvshuf.b xr15, xr19, xr15, xr24
+
+ add.w a6, a6, t8
+ addi.w t2, a6, 0
+ FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
+ FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
+ FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
+ FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
+
+.ifnb \t
+ xvssrarni.h.w xr21, xr20, \shift
+ alsl.d a0, a1, a0, 1
+ xvpermi.q xr22, xr21, 0x01
+ vilvl.h vr23, vr22, vr21
+ vilvh.h vr21, vr22, vr21
+ vst vr23, a0, 0
+ vstx vr21, a0, a1
+.else
+ xvssrarni.hu.w xr21, xr20, 11
+ xvssrlni.bu.h xr22, xr21, 0
+ xvpermi.q xr23, xr22, 0x01
+ vilvl.b vr21, vr23, vr22
+ add.d a0, a0, a1
+ fst.d f21, a0, 0
+ add.d a0, a0, a1
+ vstelm.d vr21, a0, 0, 1
+.endif
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ addi.d sp, sp, 16
+endfunc
+.endm
+
+warp_lasx , 11
+warp_lasx t, 7
+
+/*
+static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2,
+ const int w, int h,
+ const int weight HIGHBD_DECL_SUFFIX)
+*/
+
+#define bpc8_sh 5 // sh = intermediate_bits + 1
+#define bpcw8_sh 8 // sh = intermediate_bits + 4
+
+#define bpc_sh bpc8_sh
+#define bpcw_sh bpcw8_sh
+
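+/*
+ * Scalar reference for the 8bpc paths below (a sketch; with 8bpc the
+ * intermediate precision is 4 bits, hence bpc_sh == 5 and bpcw_sh == 8;
+ * iclip_u8() stands for clipping to [0, 255]):
+ *
+ *   avg:   dst[x] = iclip_u8((tmp1[x] + tmp2[x] + (1 << 4)) >> bpc_sh);
+ *   w_avg: dst[x] = iclip_u8((tmp1[x] * weight
+ *                            + tmp2[x] * (16 - weight) + (1 << 7)) >> bpcw_sh);
+ */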
+function avg_8bpc_lsx
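+ // w is a power of two in [4, 128]; t0 = clz(w) - 24 indexes the jump table
+ // below: w = 128 -> 0, 64 -> 1, 32 -> 2, 16 -> 3, 8 -> 4, 4 -> 5
+ // (entries are stored widest first)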
+ addi.d t8, a0, 0
+
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .AVG_LSX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0 // The jump addresses are relative to AVG_LSX_JRTABLE
+ add.d t1, t1, t2 // Get absolute address
+ jirl $r0, t1, 0
+
+ .align 3
+.AVG_LSX_JRTABLE:
+ .hword .AVG_W128_LSX - .AVG_LSX_JRTABLE
+ .hword .AVG_W64_LSX - .AVG_LSX_JRTABLE
+ .hword .AVG_W32_LSX - .AVG_LSX_JRTABLE
+ .hword .AVG_W16_LSX - .AVG_LSX_JRTABLE
+ .hword .AVG_W8_LSX - .AVG_LSX_JRTABLE
+ .hword .AVG_W4_LSX - .AVG_LSX_JRTABLE
+
+.AVG_W4_LSX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ vadd.h vr2, vr0, vr1
+ vssrarni.bu.h vr3, vr2, bpc_sh
+ vstelm.w vr3, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr3, a0, 0, 1
+ addi.w a5, a5, -2
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W4_LSX
+ b .AVG_END_LSX
+
+.AVG_W8_LSX:
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vadd.h vr4, vr0, vr1
+ vadd.h vr5, vr2, vr3
+ vssrarni.bu.h vr5, vr4, bpc_sh
+ addi.w a5, a5, -2
+ addi.d a2, a2, 32
+ vstelm.d vr5, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.d vr5, a0, 0, 1
+ addi.d a3, a3, 32
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W8_LSX
+ b .AVG_END_LSX
+
+.AVG_W16_LSX:
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vadd.h vr4, vr0, vr1
+ vadd.h vr5, vr2, vr3
+ vssrarni.bu.h vr5, vr4, bpc_sh
+ addi.w a5, a5, -1
+ addi.d a2, a2, 32
+ vst vr5, a0, 0
+ addi.d a3, a3, 32
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W16_LSX
+ b .AVG_END_LSX
+
+.AVG_W32_LSX:
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr4, a2, 32
+ vld vr6, a2, 48
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vld vr5, a3, 32
+ vld vr7, a3, 48
+ vadd.h vr0, vr0, vr1
+ vadd.h vr2, vr2, vr3
+ vadd.h vr4, vr4, vr5
+ vadd.h vr6, vr6, vr7
+ vssrarni.bu.h vr2, vr0, bpc_sh
+ vssrarni.bu.h vr6, vr4, bpc_sh
+ addi.w a5, a5, -1
+ addi.d a2, a2, 64
+ vst vr2, a0, 0
+ vst vr6, a0, 16
+ addi.d a3, a3, 64
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W32_LSX
+ b .AVG_END_LSX
+
+.AVG_W64_LSX:
+.rept 4
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vadd.h vr0, vr0, vr1
+ vadd.h vr2, vr2, vr3
+ vssrarni.bu.h vr2, vr0, bpc_sh
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ vst vr2, a0, 0
+ addi.d a0, a0, 16
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .AVG_W64_LSX
+ b .AVG_END_LSX
+
+.AVG_W128_LSX:
+.rept 8
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vadd.h vr0, vr0, vr1
+ vadd.h vr2, vr2, vr3
+ vssrarni.bu.h vr2, vr0, bpc_sh
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ vst vr2, a0, 0
+ addi.d a0, a0, 16
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .AVG_W128_LSX
+.AVG_END_LSX:
+endfunc
+
+function avg_8bpc_lasx
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .AVG_LASX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0
+ add.d t1, t1, t2
+ jirl $r0, t1, 0
+
+ .align 3
+.AVG_LASX_JRTABLE:
+ .hword .AVG_W128_LASX - .AVG_LASX_JRTABLE
+ .hword .AVG_W64_LASX - .AVG_LASX_JRTABLE
+ .hword .AVG_W32_LASX - .AVG_LASX_JRTABLE
+ .hword .AVG_W16_LASX - .AVG_LASX_JRTABLE
+ .hword .AVG_W8_LASX - .AVG_LASX_JRTABLE
+ .hword .AVG_W4_LASX - .AVG_LASX_JRTABLE
+
+.AVG_W4_LASX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ vadd.h vr0, vr0, vr1
+ vssrarni.bu.h vr1, vr0, bpc_sh
+ vstelm.w vr1, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr1, a0, 0, 1
+ addi.w a5, a5, -2
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W4_LASX
+ b .AVG_END_LASX
+.AVG_W8_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ xvadd.h xr2, xr0, xr1
+ xvssrarni.bu.h xr1, xr2, bpc_sh
+ xvstelm.d xr1, a0, 0, 0
+ add.d a0, a0, a1
+ xvstelm.d xr1, a0, 0, 2
+ addi.w a5, a5, -2
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ add.d a0, a1, a0
+ blt zero, a5, .AVG_W8_LASX
+ b .AVG_END_LASX
+.AVG_W16_LASX:
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvadd.h xr4, xr0, xr1
+ xvadd.h xr5, xr2, xr3
+ xvssrarni.bu.h xr5, xr4, bpc_sh
+ xvpermi.d xr2, xr5, 0xd8
+ xvpermi.d xr3, xr5, 0x8d
+ vst vr2, a0, 0
+ vstx vr3, a0, a1
+ addi.w a5, a5, -2
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ alsl.d a0, a1, a0, 1
+ blt zero, a5, .AVG_W16_LASX
+ b .AVG_END_LASX
+.AVG_W32_LASX:
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvadd.h xr4, xr0, xr1
+ xvadd.h xr5, xr2, xr3
+ xvssrarni.bu.h xr5, xr4, bpc_sh
+ xvpermi.d xr6, xr5, 0xd8
+ xvst xr6, a0, 0
+ addi.w a5, a5, -1
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W32_LASX
+ b .AVG_END_LASX
+.AVG_W64_LASX:
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr4, a2, 64
+ xvld xr6, a2, 96
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvld xr5, a3, 64
+ xvld xr7, a3, 96
+ xvadd.h xr0, xr0, xr1
+ xvadd.h xr2, xr2, xr3
+ xvadd.h xr4, xr4, xr5
+ xvadd.h xr6, xr6, xr7
+ xvssrarni.bu.h xr2, xr0, bpc_sh
+ xvssrarni.bu.h xr6, xr4, bpc_sh
+ xvpermi.d xr1, xr2, 0xd8
+ xvpermi.d xr3, xr6, 0xd8
+ xvst xr1, a0, 0
+ xvst xr3, a0, 32
+ addi.w a5, a5, -1
+ addi.d a2, a2, 128
+ addi.d a3, a3, 128
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W64_LASX
+ b .AVG_END_LASX
+.AVG_W128_LASX:
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr4, a2, 64
+ xvld xr6, a2, 96
+ xvld xr8, a2, 128
+ xvld xr10, a2, 160
+ xvld xr12, a2, 192
+ xvld xr14, a2, 224
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvld xr5, a3, 64
+ xvld xr7, a3, 96
+ xvld xr9, a3, 128
+ xvld xr11, a3, 160
+ xvld xr13, a3, 192
+ xvld xr15, a3, 224
+ xvadd.h xr0, xr0, xr1
+ xvadd.h xr2, xr2, xr3
+ xvadd.h xr4, xr4, xr5
+ xvadd.h xr6, xr6, xr7
+ xvadd.h xr8, xr8, xr9
+ xvadd.h xr10, xr10, xr11
+ xvadd.h xr12, xr12, xr13
+ xvadd.h xr14, xr14, xr15
+ xvssrarni.bu.h xr2, xr0, bpc_sh
+ xvssrarni.bu.h xr6, xr4, bpc_sh
+ xvssrarni.bu.h xr10, xr8, bpc_sh
+ xvssrarni.bu.h xr14, xr12, bpc_sh
+ xvpermi.d xr1, xr2, 0xd8
+ xvpermi.d xr3, xr6, 0xd8
+ xvpermi.d xr5, xr10, 0xd8
+ xvpermi.d xr7, xr14, 0xd8
+ xvst xr1, a0, 0
+ xvst xr3, a0, 32
+ xvst xr5, a0, 64
+ xvst xr7, a0, 96
+ addi.w a5, a5, -1
+ addi.d a2, a2, 256
+ addi.d a3, a3, 256
+ add.d a0, a0, a1
+ blt zero, a5, .AVG_W128_LASX
+.AVG_END_LASX:
+endfunc
+
+function w_avg_8bpc_lsx
+ addi.d t8, a0, 0
+ li.w t2, 16
+ sub.w t2, t2, a6 // 16 - weight
+ vreplgr2vr.h vr21, a6
+ vreplgr2vr.h vr22, t2
+
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .W_AVG_LSX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0
+ add.d t1, t1, t2
+ jirl $r0, t1, 0
+
+ .align 3
+.W_AVG_LSX_JRTABLE:
+ .hword .W_AVG_W128_LSX - .W_AVG_LSX_JRTABLE
+ .hword .W_AVG_W64_LSX - .W_AVG_LSX_JRTABLE
+ .hword .W_AVG_W32_LSX - .W_AVG_LSX_JRTABLE
+ .hword .W_AVG_W16_LSX - .W_AVG_LSX_JRTABLE
+ .hword .W_AVG_W8_LSX - .W_AVG_LSX_JRTABLE
+ .hword .W_AVG_W4_LSX - .W_AVG_LSX_JRTABLE
+
+.W_AVG_W4_LSX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ vmulwev.w.h vr2, vr0, vr21
+ vmulwod.w.h vr3, vr0, vr21
+ vmaddwev.w.h vr2, vr1, vr22
+ vmaddwod.w.h vr3, vr1, vr22
+ vssrarni.hu.w vr3, vr2, bpcw_sh
+ vssrlni.bu.h vr1, vr3, 0
+ vpickod.w vr4, vr2, vr1
+ vilvl.b vr0, vr4, vr1
+ fst.s f0, a0, 0
+ add.d a0, a0, a1
+ vstelm.w vr0, a0, 0, 1
+ addi.w a5, a5, -2
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ add.d a0, a1, a0
+ blt zero, a5, .W_AVG_W4_LSX
+ b .W_AVG_END_LSX
+.W_AVG_W8_LSX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ vmulwev.w.h vr2, vr0, vr21
+ vmulwod.w.h vr3, vr0, vr21
+ vmaddwev.w.h vr2, vr1, vr22
+ vmaddwod.w.h vr3, vr1, vr22
+ vssrarni.hu.w vr3, vr2, bpcw_sh
+ vssrlni.bu.h vr1, vr3, 0
+ vpickod.w vr4, vr2, vr1
+ vilvl.b vr0, vr4, vr1
+ fst.d f0, a0, 0
+ addi.w a5, a5, -1
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ add.d a0, a0, a1
+ blt zero, a5, .W_AVG_W8_LSX
+ b .W_AVG_END_LSX
+.W_AVG_W16_LSX:
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vmulwev.w.h vr4, vr0, vr21
+ vmulwod.w.h vr5, vr0, vr21
+ vmulwev.w.h vr6, vr2, vr21
+ vmulwod.w.h vr7, vr2, vr21
+ vmaddwev.w.h vr4, vr1, vr22
+ vmaddwod.w.h vr5, vr1, vr22
+ vmaddwev.w.h vr6, vr3, vr22
+ vmaddwod.w.h vr7, vr3, vr22
+ vssrarni.hu.w vr6, vr4, bpcw_sh
+ vssrarni.hu.w vr7, vr5, bpcw_sh
+ vssrlrni.bu.h vr7, vr6, 0
+ vshuf4i.w vr8, vr7, 0x4E
+ vilvl.b vr0, vr8, vr7
+ vst vr0, a0, 0
+ addi.w a5, a5, -1
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ add.d a0, a0, a1
+ blt zero, a5, .W_AVG_W16_LSX
+ b .W_AVG_END_LSX
+.W_AVG_W32_LSX:
+.rept 2
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vmulwev.w.h vr4, vr0, vr21
+ vmulwod.w.h vr5, vr0, vr21
+ vmulwev.w.h vr6, vr2, vr21
+ vmulwod.w.h vr7, vr2, vr21
+ vmaddwev.w.h vr4, vr1, vr22
+ vmaddwod.w.h vr5, vr1, vr22
+ vmaddwev.w.h vr6, vr3, vr22
+ vmaddwod.w.h vr7, vr3, vr22
+ vssrarni.hu.w vr6, vr4, bpcw_sh
+ vssrarni.hu.w vr7, vr5, bpcw_sh
+ vssrlrni.bu.h vr7, vr6, 0
+ vshuf4i.w vr8, vr7, 0x4E
+ vilvl.b vr0, vr8, vr7
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a0, a0, 16
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .W_AVG_W32_LSX
+ b .W_AVG_END_LSX
+
+.W_AVG_W64_LSX:
+.rept 4
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vmulwev.w.h vr4, vr0, vr21
+ vmulwod.w.h vr5, vr0, vr21
+ vmulwev.w.h vr6, vr2, vr21
+ vmulwod.w.h vr7, vr2, vr21
+ vmaddwev.w.h vr4, vr1, vr22
+ vmaddwod.w.h vr5, vr1, vr22
+ vmaddwev.w.h vr6, vr3, vr22
+ vmaddwod.w.h vr7, vr3, vr22
+ vssrarni.hu.w vr6, vr4, bpcw_sh
+ vssrarni.hu.w vr7, vr5, bpcw_sh
+ vssrlrni.bu.h vr7, vr6, 0
+ vshuf4i.w vr8, vr7, 0x4E
+ vilvl.b vr0, vr8, vr7
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a0, a0, 16
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .W_AVG_W64_LSX
+ b .W_AVG_END_LSX
+
+.W_AVG_W128_LSX:
+.rept 8
+ vld vr0, a2, 0
+ vld vr2, a2, 16
+ vld vr1, a3, 0
+ vld vr3, a3, 16
+ vmulwev.w.h vr4, vr0, vr21
+ vmulwod.w.h vr5, vr0, vr21
+ vmulwev.w.h vr6, vr2, vr21
+ vmulwod.w.h vr7, vr2, vr21
+ vmaddwev.w.h vr4, vr1, vr22
+ vmaddwod.w.h vr5, vr1, vr22
+ vmaddwev.w.h vr6, vr3, vr22
+ vmaddwod.w.h vr7, vr3, vr22
+ vssrarni.hu.w vr6, vr4, bpcw_sh
+ vssrarni.hu.w vr7, vr5, bpcw_sh
+ vssrlrni.bu.h vr7, vr6, 0
+ vshuf4i.w vr8, vr7, 0x4E
+ vilvl.b vr0, vr8, vr7
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a0, a0, 16
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .W_AVG_W128_LSX
+.W_AVG_END_LSX:
+endfunc
+
+function w_avg_8bpc_lasx
+ addi.d t8, a0, 0
+ li.w t2, 16
+ sub.w t2, t2, a6 // 16 - weight
+ xvreplgr2vr.h xr21, a6
+ xvreplgr2vr.h xr22, t2
+
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .W_AVG_LASX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0
+ add.d t1, t1, t2
+ jirl $r0, t1, 0
+
+ .align 3
+.W_AVG_LASX_JRTABLE:
+ .hword .W_AVG_W128_LASX - .W_AVG_LASX_JRTABLE
+ .hword .W_AVG_W64_LASX - .W_AVG_LASX_JRTABLE
+ .hword .W_AVG_W32_LASX - .W_AVG_LASX_JRTABLE
+ .hword .W_AVG_W16_LASX - .W_AVG_LASX_JRTABLE
+ .hword .W_AVG_W8_LASX - .W_AVG_LASX_JRTABLE
+ .hword .W_AVG_W4_LASX - .W_AVG_LASX_JRTABLE
+
+.W_AVG_W4_LASX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ xvpermi.d xr2, xr0, 0xD8
+ xvpermi.d xr3, xr1, 0xD8
+ xvilvl.h xr4, xr3, xr2
+ xvmulwev.w.h xr0, xr4, xr21
+ xvmaddwod.w.h xr0, xr4, xr22
+ xvssrarni.hu.w xr1, xr0, bpcw_sh
+ xvssrlni.bu.h xr0, xr1, 0
+ fst.s f0, a0, 0
+ add.d a0, a0, a1
+ xvstelm.w xr0, a0, 0, 4
+ addi.w a5, a5, -2
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ add.d a0, a1, a0
+ blt zero, a5, .W_AVG_W4_LASX
+ b .W_AVG_END_LASX
+
+.W_AVG_W8_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ xvmulwev.w.h xr2, xr0, xr21
+ xvmulwod.w.h xr3, xr0, xr21
+ xvmaddwev.w.h xr2, xr1, xr22
+ xvmaddwod.w.h xr3, xr1, xr22
+ xvssrarni.hu.w xr3, xr2, bpcw_sh
+ xvssrlni.bu.h xr1, xr3, 0
+ xvpickod.w xr4, xr2, xr1
+ xvilvl.b xr0, xr4, xr1
+ xvstelm.d xr0, a0, 0, 0
+ add.d a0, a0, a1
+ xvstelm.d xr0, a0, 0, 2
+ addi.w a5, a5, -2
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ add.d a0, a0, a1
+ blt zero, a5, .W_AVG_W8_LASX
+ b .W_AVG_END_LASX
+
+.W_AVG_W16_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ xvmulwev.w.h xr2, xr0, xr21
+ xvmulwod.w.h xr3, xr0, xr21
+ xvmaddwev.w.h xr2, xr1, xr22
+ xvmaddwod.w.h xr3, xr1, xr22
+ xvssrarni.hu.w xr3, xr2, bpcw_sh
+ xvssrlni.bu.h xr1, xr3, 0
+ xvpickod.w xr4, xr2, xr1
+ xvilvl.b xr0, xr4, xr1
+ xvpermi.d xr1, xr0, 0xD8
+ vst vr1, a0, 0
+ addi.w a5, a5, -1
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ add.d a0, a0, a1
+ blt zero, a5, .W_AVG_W16_LASX
+ b .W_AVG_END_LASX
+
+.W_AVG_W32_LASX:
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvmulwev.w.h xr4, xr0, xr21
+ xvmulwod.w.h xr5, xr0, xr21
+ xvmulwev.w.h xr6, xr2, xr21
+ xvmulwod.w.h xr7, xr2, xr21
+ xvmaddwev.w.h xr4, xr1, xr22
+ xvmaddwod.w.h xr5, xr1, xr22
+ xvmaddwev.w.h xr6, xr3, xr22
+ xvmaddwod.w.h xr7, xr3, xr22
+ xvssrarni.hu.w xr6, xr4, bpcw_sh
+ xvssrarni.hu.w xr7, xr5, bpcw_sh
+ xvssrlni.bu.h xr7, xr6, 0
+ xvshuf4i.w xr8, xr7, 0x4E
+ xvilvl.b xr9, xr8, xr7
+ xvpermi.d xr0, xr9, 0xD8
+ xvst xr0, a0, 0
+ addi.w a5, a5, -1
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ add.d a0, a0, a1
+ blt zero, a5, .W_AVG_W32_LASX
+ b .W_AVG_END_LASX
+
+.W_AVG_W64_LASX:
+.rept 2
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvmulwev.w.h xr4, xr0, xr21
+ xvmulwod.w.h xr5, xr0, xr21
+ xvmulwev.w.h xr6, xr2, xr21
+ xvmulwod.w.h xr7, xr2, xr21
+ xvmaddwev.w.h xr4, xr1, xr22
+ xvmaddwod.w.h xr5, xr1, xr22
+ xvmaddwev.w.h xr6, xr3, xr22
+ xvmaddwod.w.h xr7, xr3, xr22
+ xvssrarni.hu.w xr6, xr4, bpcw_sh
+ xvssrarni.hu.w xr7, xr5, bpcw_sh
+ xvssrlni.bu.h xr7, xr6, 0
+ xvshuf4i.w xr8, xr7, 0x4E
+ xvilvl.b xr9, xr8, xr7
+ xvpermi.d xr0, xr9, 0xD8
+ xvst xr0, a0, 0
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a0, a0, 32
+.endr
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .W_AVG_W64_LASX
+ b .W_AVG_END_LASX
+
+.W_AVG_W128_LASX:
+.rept 4
+ xvld xr0, a2, 0
+ xvld xr2, a2, 32
+ xvld xr1, a3, 0
+ xvld xr3, a3, 32
+ xvmulwev.w.h xr4, xr0, xr21
+ xvmulwod.w.h xr5, xr0, xr21
+ xvmulwev.w.h xr6, xr2, xr21
+ xvmulwod.w.h xr7, xr2, xr21
+ xvmaddwev.w.h xr4, xr1, xr22
+ xvmaddwod.w.h xr5, xr1, xr22
+ xvmaddwev.w.h xr6, xr3, xr22
+ xvmaddwod.w.h xr7, xr3, xr22
+ xvssrarni.hu.w xr6, xr4, bpcw_sh
+ xvssrarni.hu.w xr7, xr5, bpcw_sh
+ xvssrlni.bu.h xr7, xr6, 0
+ xvshuf4i.w xr8, xr7, 0x4E
+ xvilvl.b xr9, xr8, xr7
+ xvpermi.d xr0, xr9, 0xD8
+ xvst xr0, a0, 0
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a0, a0, 32
+.endr
+
+ addi.w a5, a5, -1
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ blt zero, a5, .W_AVG_W128_LASX
+.W_AVG_END_LASX:
+endfunc
+
+#undef bpc_sh
+#undef bpcw_sh
+
+#define mask_sh 10
+/*
+static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
+ const uint8_t *mask HIGHBD_DECL_SUFFIX)
+*/
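+
+/*
+ * Scalar reference for the 8bpc path below (a sketch; with 8bpc intermediate
+ * precision of 4 bits the shift is mask_sh == 10 and the rounding term is
+ * 1 << 9; iclip_u8() stands for clipping to [0, 255]):
+ *
+ *   dst[x] = iclip_u8((tmp1[x] * mask[x]
+ *                     + tmp2[x] * (64 - mask[x]) + (1 << 9)) >> mask_sh);
+ */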
+function mask_8bpc_lsx
+ vldi vr21, 0x440 // 64
+ vxor.v vr19, vr19, vr19
+ addi.d t8, a0, 0
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .MASK_LSX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0
+ add.d t1, t1, t2
+ jirl $r0, t1, 0
+
+ .align 3
+.MASK_LSX_JRTABLE:
+ .hword .MASK_W128_LSX - .MASK_LSX_JRTABLE
+ .hword .MASK_W64_LSX - .MASK_LSX_JRTABLE
+ .hword .MASK_W32_LSX - .MASK_LSX_JRTABLE
+ .hword .MASK_W16_LSX - .MASK_LSX_JRTABLE
+ .hword .MASK_W8_LSX - .MASK_LSX_JRTABLE
+ .hword .MASK_W4_LSX - .MASK_LSX_JRTABLE
+
+.MASK_W4_LSX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ fld.d f22, a6, 0
+
+ vilvl.b vr2, vr19, vr22
+ vsub.h vr3, vr21, vr2
+
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vssrarni.hu.w vr5, vr4, mask_sh
+ vssrlrni.bu.h vr1, vr5, 0
+ vpickod.w vr4, vr2, vr1
+ vilvl.b vr0, vr4, vr1
+ fst.s f0, a0, 0
+ add.d a0, a0, a1
+ vstelm.w vr0, a0, 0, 1
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ addi.d a6, a6, 8
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ blt zero, a5, .MASK_W4_LSX
+ b .MASK_END_LSX
+.MASK_W8_LSX:
+ vld vr0, a2, 0
+ vld vr10, a2, 16
+ vld vr1, a3, 0
+ vld vr11, a3, 16
+ vld vr22, a6, 0
+
+ vilvl.b vr2, vr19, vr22
+ vilvh.b vr12, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ vsub.h vr13, vr21, vr12
+
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmulwev.w.h vr14, vr10, vr12
+ vmulwod.w.h vr15, vr10, vr12
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vmaddwev.w.h vr14, vr11, vr13
+ vmaddwod.w.h vr15, vr11, vr13
+ vssrarni.hu.w vr14, vr4, mask_sh
+ vssrarni.hu.w vr15, vr5, mask_sh
+ vssrlrni.bu.h vr15, vr14, 0
+ vshuf4i.w vr6, vr15, 0x4E
+ vilvl.b vr0, vr6, vr15
+ fst.d f0, a0, 0
+ add.d a0, a0, a1
+ vstelm.d vr0, a0, 0, 1
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ blt zero, a5, .MASK_W8_LSX
+ b .MASK_END_LSX
+
+.MASK_W16_LSX:
+ vld vr0, a2, 0
+ vld vr10, a2, 16
+ vld vr1, a3, 0
+ vld vr11, a3, 16
+ vld vr22, a6, 0
+
+ vilvl.b vr2, vr19, vr22
+ vilvh.b vr12, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ vsub.h vr13, vr21, vr12
+
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmulwev.w.h vr14, vr10, vr12
+ vmulwod.w.h vr15, vr10, vr12
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vmaddwev.w.h vr14, vr11, vr13
+ vmaddwod.w.h vr15, vr11, vr13
+ vssrarni.hu.w vr14, vr4, mask_sh
+ vssrarni.hu.w vr15, vr5, mask_sh
+ vssrlrni.bu.h vr15, vr14, 0
+ vshuf4i.w vr6, vr15, 0x4E
+ vilvl.b vr0, vr6, vr15
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ add.d a0, a0, a1
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W16_LSX
+ b .MASK_END_LSX
+.MASK_W32_LSX:
+.rept 2
+ vld vr0, a2, 0
+ vld vr10, a2, 16
+ vld vr1, a3, 0
+ vld vr11, a3, 16
+ vld vr22, a6, 0
+ vilvl.b vr2, vr19, vr22
+ vilvh.b vr12, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ vsub.h vr13, vr21, vr12
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmulwev.w.h vr14, vr10, vr12
+ vmulwod.w.h vr15, vr10, vr12
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vmaddwev.w.h vr14, vr11, vr13
+ vmaddwod.w.h vr15, vr11, vr13
+ vssrarni.hu.w vr14, vr4, mask_sh
+ vssrarni.hu.w vr15, vr5, mask_sh
+ vssrlrni.bu.h vr15, vr14, 0
+ vshuf4i.w vr6, vr15, 0x4E
+ vilvl.b vr0, vr6, vr15
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ addi.d a0, a0, 16
+.endr
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W32_LSX
+ b .MASK_END_LSX
+.MASK_W64_LSX:
+.rept 4
+ vld vr0, a2, 0
+ vld vr10, a2, 16
+ vld vr1, a3, 0
+ vld vr11, a3, 16
+ vld vr22, a6, 0
+ vilvl.b vr2, vr19, vr22
+ vilvh.b vr12, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ vsub.h vr13, vr21, vr12
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmulwev.w.h vr14, vr10, vr12
+ vmulwod.w.h vr15, vr10, vr12
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vmaddwev.w.h vr14, vr11, vr13
+ vmaddwod.w.h vr15, vr11, vr13
+ vssrarni.hu.w vr14, vr4, mask_sh
+ vssrarni.hu.w vr15, vr5, mask_sh
+ vssrlrni.bu.h vr15, vr14, 0
+ vshuf4i.w vr6, vr15, 0x4E
+ vilvl.b vr0, vr6, vr15
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ addi.d a0, a0, 16
+.endr
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W64_LSX
+ b .MASK_END_LSX
+.MASK_W128_LSX:
+.rept 8
+ vld vr0, a2, 0
+ vld vr10, a2, 16
+ vld vr1, a3, 0
+ vld vr11, a3, 16
+ vld vr22, a6, 0
+ vilvl.b vr2, vr19, vr22
+ vilvh.b vr12, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ vsub.h vr13, vr21, vr12
+ vmulwev.w.h vr4, vr0, vr2
+ vmulwod.w.h vr5, vr0, vr2
+ vmulwev.w.h vr14, vr10, vr12
+ vmulwod.w.h vr15, vr10, vr12
+ vmaddwev.w.h vr4, vr1, vr3
+ vmaddwod.w.h vr5, vr1, vr3
+ vmaddwev.w.h vr14, vr11, vr13
+ vmaddwod.w.h vr15, vr11, vr13
+ vssrarni.hu.w vr14, vr4, mask_sh
+ vssrarni.hu.w vr15, vr5, mask_sh
+ vssrlrni.bu.h vr15, vr14, 0
+ vshuf4i.w vr6, vr15, 0x4E
+ vilvl.b vr0, vr6, vr15
+ vst vr0, a0, 0
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ addi.d a0, a0, 16
+.endr
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W128_LSX
+.MASK_END_LSX:
+endfunc
+
+function mask_8bpc_lasx
+ xvldi xr21, 0x440 // 64
+ xvxor.v xr19, xr19, xr19
+ addi.d t8, a0, 0
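+    // jump-table dispatch: clz(w) - 24 maps w = 128,64,...,4 to entries 0..5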
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .MASK_LASX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t2, t0, 0
+ add.d t1, t1, t2
+ jirl $r0, t1, 0
+
+ .align 3
+.MASK_LASX_JRTABLE:
+ .hword .MASK_W128_LASX - .MASK_LASX_JRTABLE
+ .hword .MASK_W64_LASX - .MASK_LASX_JRTABLE
+ .hword .MASK_W32_LASX - .MASK_LASX_JRTABLE
+ .hword .MASK_W16_LASX - .MASK_LASX_JRTABLE
+ .hword .MASK_W8_LASX - .MASK_LASX_JRTABLE
+ .hword .MASK_W4_LASX - .MASK_LASX_JRTABLE
+
+.MASK_W4_LASX:
+ vld vr0, a2, 0
+ vld vr1, a3, 0
+ fld.d f22, a6, 0
+
+ vilvl.h vr4, vr1, vr0
+ vilvh.h vr14, vr1, vr0
+ vilvl.b vr2, vr19, vr22
+ vsub.h vr3, vr21, vr2
+ xvpermi.q xr14, xr4, 0x20
+ vilvl.h vr5, vr3, vr2
+ vilvh.h vr15, vr3, vr2
+ xvpermi.q xr15, xr5, 0x20
+ xvmulwev.w.h xr0, xr14, xr15
+ xvmaddwod.w.h xr0, xr14, xr15
+ xvssrarni.hu.w xr1, xr0, mask_sh
+ xvssrlni.bu.h xr2, xr1, 0
+ fst.s f2, a0, 0
+ add.d a0, a0, a1
+ xvstelm.w xr2, a0, 0, 4
+
+ addi.d a2, a2, 16
+ addi.d a3, a3, 16
+ addi.d a6, a6, 8
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ blt zero, a5, .MASK_W4_LASX
+ b .MASK_END_LASX
+
+.MASK_W8_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ vld vr22, a6, 0
+
+ vext2xv.hu.bu xr2, xr22
+ xvsub.h xr3, xr21, xr2
+ xvmulwev.w.h xr4, xr0, xr2
+ xvmulwod.w.h xr5, xr0, xr2
+ xvmaddwev.w.h xr4, xr1, xr3
+ xvmaddwod.w.h xr5, xr1, xr3
+ xvssrarni.hu.w xr5, xr4, mask_sh
+ xvssrlni.bu.h xr1, xr5, 0
+ xvpickod.w xr4, xr2, xr1
+ xvilvl.b xr0, xr4, xr1
+ fst.d f0, a0, 0
+ add.d a0, a0, a1
+ xvstelm.d xr0, a0, 0, 2
+
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ blt zero, a5, .MASK_W8_LASX
+ b .MASK_END_LASX
+
+.MASK_W16_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ vld vr22, a6, 0
+
+ vext2xv.hu.bu xr2, xr22
+ xvsub.h xr3, xr21, xr2
+ xvmulwev.w.h xr4, xr0, xr2
+ xvmulwod.w.h xr5, xr0, xr2
+ xvmaddwev.w.h xr4, xr1, xr3
+ xvmaddwod.w.h xr5, xr1, xr3
+ xvssrarni.hu.w xr5, xr4, mask_sh
+ xvssrlni.bu.h xr1, xr5, 0
+ xvpickod.w xr4, xr2, xr1
+ xvilvl.b xr0, xr4, xr1
+ xvpermi.d xr1, xr0, 0xD8
+ vst vr1, a0, 0
+
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 16
+ add.d a0, a0, a1
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W16_LASX
+ b .MASK_END_LASX
+.MASK_W32_LASX:
+ xvld xr0, a2, 0
+ xvld xr10, a2, 32
+ xvld xr1, a3, 0
+ xvld xr11, a3, 32
+ xvld xr22, a6, 0
+ vext2xv.hu.bu xr2, xr22
+ xvpermi.q xr4, xr22, 0x01
+ vext2xv.hu.bu xr12, xr4
+ xvsub.h xr3, xr21, xr2
+ xvsub.h xr13, xr21, xr12
+
+ xvmulwev.w.h xr4, xr0, xr2
+ xvmulwod.w.h xr5, xr0, xr2
+ xvmulwev.w.h xr14, xr10, xr12
+ xvmulwod.w.h xr15, xr10, xr12
+ xvmaddwev.w.h xr4, xr1, xr3
+ xvmaddwod.w.h xr5, xr1, xr3
+ xvmaddwev.w.h xr14, xr11, xr13
+ xvmaddwod.w.h xr15, xr11, xr13
+ xvssrarni.hu.w xr14, xr4, mask_sh
+ xvssrarni.hu.w xr15, xr5, mask_sh
+ xvssrlni.bu.h xr15, xr14, 0
+ xvshuf4i.w xr6, xr15, 0x4E
+ xvilvl.b xr1, xr6, xr15
+ xvpermi.d xr0, xr1, 0xD8
+ xvst xr0, a0, 0
+
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a6, a6, 32
+ add.d a0, a0, a1
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W32_LASX
+ b .MASK_END_LASX
+
+.MASK_W64_LASX:
+.rept 2
+ xvld xr0, a2, 0
+ xvld xr10, a2, 32
+ xvld xr1, a3, 0
+ xvld xr11, a3, 32
+ xvld xr22, a6, 0
+ vext2xv.hu.bu xr2, xr22
+ xvpermi.q xr4, xr22, 0x01
+ vext2xv.hu.bu xr12, xr4
+ xvsub.h xr3, xr21, xr2
+ xvsub.h xr13, xr21, xr12
+
+ xvmulwev.w.h xr4, xr0, xr2
+ xvmulwod.w.h xr5, xr0, xr2
+ xvmulwev.w.h xr14, xr10, xr12
+ xvmulwod.w.h xr15, xr10, xr12
+ xvmaddwev.w.h xr4, xr1, xr3
+ xvmaddwod.w.h xr5, xr1, xr3
+ xvmaddwev.w.h xr14, xr11, xr13
+ xvmaddwod.w.h xr15, xr11, xr13
+ xvssrarni.hu.w xr14, xr4, mask_sh
+ xvssrarni.hu.w xr15, xr5, mask_sh
+ xvssrlni.bu.h xr15, xr14, 0
+ xvshuf4i.w xr6, xr15, 0x4E
+ xvilvl.b xr1, xr6, xr15
+ xvpermi.d xr0, xr1, 0xD8
+ xvst xr0, a0, 0
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a6, a6, 32
+ addi.d a0, a0, 32
+.endr
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W64_LASX
+ b .MASK_END_LASX
+
+.MASK_W128_LASX:
+.rept 4
+ xvld xr0, a2, 0
+ xvld xr10, a2, 32
+ xvld xr1, a3, 0
+ xvld xr11, a3, 32
+ xvld xr22, a6, 0
+ vext2xv.hu.bu xr2, xr22
+ xvpermi.q xr4, xr22, 0x01
+ vext2xv.hu.bu xr12, xr4
+ xvsub.h xr3, xr21, xr2
+ xvsub.h xr13, xr21, xr12
+
+ xvmulwev.w.h xr4, xr0, xr2
+ xvmulwod.w.h xr5, xr0, xr2
+ xvmulwev.w.h xr14, xr10, xr12
+ xvmulwod.w.h xr15, xr10, xr12
+ xvmaddwev.w.h xr4, xr1, xr3
+ xvmaddwod.w.h xr5, xr1, xr3
+ xvmaddwev.w.h xr14, xr11, xr13
+ xvmaddwod.w.h xr15, xr11, xr13
+ xvssrarni.hu.w xr14, xr4, mask_sh
+ xvssrarni.hu.w xr15, xr5, mask_sh
+ xvssrlni.bu.h xr15, xr14, 0
+ xvshuf4i.w xr6, xr15, 0x4E
+ xvilvl.b xr1, xr6, xr15
+ xvpermi.d xr0, xr1, 0xD8
+ xvst xr0, a0, 0
+
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a6, a6, 32
+ addi.d a0, a0, 32
+.endr
+ add.d t8, t8, a1
+ add.d a0, t8, zero
+ addi.w a5, a5, -1
+ blt zero, a5, .MASK_W128_LASX
+.MASK_END_LASX:
+endfunc
+
+/*
+static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
+ uint8_t *mask, const int sign,
+ const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX)
+*/
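+/*
+ Rough, untested C model of what the 8 bpc 4:2:0 kernels below compute (this is
+ not dav1d's reference w_mask_c; imin/iclip_u8 are local helpers, and the
+ constants 8, 38, 64, 512, 10 and 2 mirror the immediates used in the assembly):
+
+ static inline int imin(int a, int b) { return a < b ? a : b; }
+ static inline int iclip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
+
+ static void w_mask_420_8bpc_model(uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *tmp1, const int16_t *tmp2,
+                                   int w, int h, uint8_t *mask, int sign)
+ {
+     for (int y = 0; y < h; y += 2) {
+         for (int x = 0; x < w; x += 2) {
+             int msum = 0;
+             for (int dy = 0; dy < 2; dy++)
+                 for (int dx = 0; dx < 2; dx++) {
+                     const int i = (y + dy) * w + (x + dx);
+                     const int d = tmp1[i] - tmp2[i];
+                     // per-pixel blend weight from the prediction difference
+                     const int m = imin(38 + (((d < 0 ? -d : d) + 8) >> 8), 64);
+                     dst[(y + dy) * dst_stride + (x + dx)] =
+                         iclip_u8((tmp1[i] * m + tmp2[i] * (64 - m) + 512) >> 10);
+                     msum += m;
+                 }
+             // one 4:2:0 mask byte per 2x2 block of weights
+             mask[(y >> 1) * (w >> 1) + (x >> 1)] = (msum + 2 - sign) >> 2;
+         }
+     }
+ }
+*/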
+function w_mask_420_8bpc_lsx
+ addi.d sp, sp, -24
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ vldi vr20, 0x440
+ vreplgr2vr.h vr21, a7
+ vldi vr22, 0x426
+
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .WMASK420_LSX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t8, t0, 0
+ add.d t1, t1, t8
+ jirl $r0, t1, 0
+
+ .align 3
+.WMASK420_LSX_JRTABLE:
+ .hword .WMASK420_W128_LSX - .WMASK420_LSX_JRTABLE
+ .hword .WMASK420_W64_LSX - .WMASK420_LSX_JRTABLE
+ .hword .WMASK420_W32_LSX - .WMASK420_LSX_JRTABLE
+ .hword .WMASK420_W16_LSX - .WMASK420_LSX_JRTABLE
+ .hword .WMASK420_W8_LSX - .WMASK420_LSX_JRTABLE
+ .hword .WMASK420_W4_LSX - .WMASK420_LSX_JRTABLE
+
+.WMASK420_W4_LSX:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ vld vr2, a3, 0
+ vld vr3, a3, 16
+ addi.w a5, a5, -4
+
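+    // m = min(38 + ((|tmp1 - tmp2| + 8) >> 8), 64); dst = (tmp1*m + tmp2*(64-m) + 512) >> 10;
+    // the 4:2:0 mask byte is (sum of the four m of a 2x2 block + 2 - sign) >> 2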
+ vabsd.h vr4, vr0, vr2
+ vabsd.h vr5, vr1, vr3
+ vaddi.hu vr4, vr4, 8
+ vaddi.hu vr5, vr5, 8
+ vsrli.h vr4, vr4, 8
+ vsrli.h vr5, vr5, 8
+ vadd.h vr4, vr4, vr22
+ vadd.h vr5, vr5, vr22
+ vmin.hu vr6, vr4, vr20
+ vmin.hu vr7, vr5, vr20
+ vsub.h vr8, vr20, vr6
+ vsub.h vr9, vr20, vr7
+ vmulwev.w.h vr4, vr6, vr0
+ vmulwod.w.h vr5, vr6, vr0
+ vmulwev.w.h vr10, vr7, vr1
+ vmulwod.w.h vr11, vr7, vr1
+ vmaddwev.w.h vr4, vr8, vr2
+ vmaddwod.w.h vr5, vr8, vr2
+ vmaddwev.w.h vr10, vr9, vr3
+ vmaddwod.w.h vr11, vr9, vr3
+ vilvl.w vr0, vr5, vr4
+ vilvh.w vr1, vr5, vr4
+ vilvl.w vr2, vr11, vr10
+ vilvh.w vr3, vr11, vr10
+ vssrarni.hu.w vr1, vr0, 10
+ vssrarni.hu.w vr3, vr2, 10
+ vssrlni.bu.h vr3, vr1, 0
+ vstelm.w vr3, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr3, a0, 0, 1
+ add.d a0, a0, a1
+ vstelm.w vr3, a0, 0, 2
+ add.d a0, a0, a1
+ vstelm.w vr3, a0, 0, 3
+ add.d a0, a0, a1
+ vpickev.h vr0, vr7, vr6
+ vpickod.h vr1, vr7, vr6
+ vadd.h vr0, vr0, vr1
+ vshuf4i.h vr0, vr0, 0xd8
+ vhaddw.w.h vr2, vr0, vr0
+ vpickev.h vr2, vr2, vr2
+ vsub.h vr2, vr2, vr21
+ vaddi.hu vr2, vr2, 2
+ vssrani.bu.h vr2, vr2, 2
+ vstelm.w vr2, a6, 0, 0
+
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 4
+ blt zero, a5, .WMASK420_W4_LSX
+ b .END_W420
+
+.WMASK420_W8_LSX:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ vld vr2, a3, 0
+ vld vr3, a3, 16
+ addi.w a5, a5, -2
+
+ vabsd.h vr4, vr0, vr2
+ vabsd.h vr5, vr1, vr3
+ vaddi.hu vr4, vr4, 8
+ vaddi.hu vr5, vr5, 8
+ vsrli.h vr4, vr4, 8
+ vsrli.h vr5, vr5, 8
+ vadd.h vr4, vr4, vr22
+ vadd.h vr5, vr5, vr22
+ vmin.hu vr6, vr4, vr20
+ vmin.hu vr7, vr5, vr20
+ vsub.h vr8, vr20, vr6
+ vsub.h vr9, vr20, vr7
+ vmulwev.w.h vr4, vr6, vr0
+ vmulwod.w.h vr5, vr6, vr0
+ vmulwev.w.h vr10, vr7, vr1
+ vmulwod.w.h vr11, vr7, vr1
+ vmaddwev.w.h vr4, vr8, vr2
+ vmaddwod.w.h vr5, vr8, vr2
+ vmaddwev.w.h vr10, vr9, vr3
+ vmaddwod.w.h vr11, vr9, vr3
+ vssrarni.hu.w vr10, vr4, 10
+ vssrarni.hu.w vr11, vr5, 10
+ vssrlni.bu.h vr11, vr10, 0
+ vshuf4i.w vr0, vr11, 0x4E
+ vilvl.b vr3, vr0, vr11
+ vstelm.d vr3, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.d vr3, a0, 0, 1
+ add.d a0, a0, a1
+ vpickev.h vr0, vr7, vr6
+ vpickod.h vr1, vr7, vr6
+ vadd.h vr0, vr0, vr1
+ vilvh.d vr2, vr0, vr0
+ vadd.h vr2, vr2, vr0
+ vsub.h vr2, vr2, vr21
+ vaddi.hu vr2, vr2, 2
+ vssrani.bu.h vr2, vr2, 2
+ vstelm.w vr2, a6, 0, 0
+
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 4
+ blt zero, a5, .WMASK420_W8_LSX
+ b .END_W420
+
+.WMASK420_W16_LSX:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ alsl.d a2, a4, a2, 1
+ vld vr2, a2, 0
+ vld vr3, a2, 16
+ vld vr4, a3, 0
+ vld vr5, a3, 16
+ alsl.d a3, a4, a3, 1
+ vld vr6, a3, 0
+ vld vr7, a3, 16
+
+ vabsd.h vr8, vr0, vr4
+ vabsd.h vr9, vr1, vr5
+ vabsd.h vr10, vr2, vr6
+ vabsd.h vr11, vr3, vr7
+ vaddi.hu vr8, vr8, 8
+ vaddi.hu vr9, vr9, 8
+ vaddi.hu vr10, vr10, 8
+ vaddi.hu vr11, vr11, 8
+ vsrli.h vr8, vr8, 8
+ vsrli.h vr9, vr9, 8
+ vsrli.h vr10, vr10, 8
+ vsrli.h vr11, vr11, 8
+ vadd.h vr8, vr8, vr22
+ vadd.h vr9, vr9, vr22
+ vadd.h vr10, vr10, vr22
+ vadd.h vr11, vr11, vr22
+ vmin.hu vr12, vr8, vr20
+ vmin.hu vr13, vr9, vr20
+ vmin.hu vr14, vr10, vr20
+ vmin.hu vr15, vr11, vr20
+ vsub.h vr16, vr20, vr12
+ vsub.h vr17, vr20, vr13
+ vsub.h vr18, vr20, vr14
+ vsub.h vr19, vr20, vr15
+ vmulwev.w.h vr8, vr12, vr0
+ vmulwod.w.h vr9, vr12, vr0
+ vmulwev.w.h vr10, vr13, vr1
+ vmulwod.w.h vr11, vr13, vr1
+ vmulwev.w.h vr23, vr14, vr2
+ vmulwod.w.h vr24, vr14, vr2
+ vmulwev.w.h vr25, vr15, vr3
+ vmulwod.w.h vr26, vr15, vr3
+ vmaddwev.w.h vr8, vr16, vr4
+ vmaddwod.w.h vr9, vr16, vr4
+ vmaddwev.w.h vr10, vr17, vr5
+ vmaddwod.w.h vr11, vr17, vr5
+ vmaddwev.w.h vr23, vr18, vr6
+ vmaddwod.w.h vr24, vr18, vr6
+ vmaddwev.w.h vr25, vr19, vr7
+ vmaddwod.w.h vr26, vr19, vr7
+ vssrarni.hu.w vr10, vr8, 10
+ vssrarni.hu.w vr11, vr9, 10
+ vssrarni.hu.w vr25, vr23, 10
+ vssrarni.hu.w vr26, vr24, 10
+ vssrlni.bu.h vr11, vr10, 0
+ vssrlni.bu.h vr26, vr25, 0
+ vshuf4i.w vr0, vr11, 0x4E
+ vshuf4i.w vr1, vr26, 0x4E
+ vilvl.b vr3, vr0, vr11
+ vilvl.b vr7, vr1, vr26
+ vst vr3, a0, 0
+ vstx vr7, a0, a1
+ vpickev.h vr0, vr13, vr12
+ vpickod.h vr1, vr13, vr12
+ vpickev.h vr2, vr15, vr14
+ vpickod.h vr3, vr15, vr14
+ vadd.h vr4, vr0, vr1
+ vadd.h vr5, vr2, vr3
+ vadd.h vr4, vr4, vr5
+ vsub.h vr4, vr4, vr21
+ vssrarni.bu.h vr4, vr4, 2
+ vstelm.d vr4, a6, 0, 0
+
+ alsl.d a2, a4, a2, 1
+ alsl.d a3, a4, a3, 1
+ alsl.d a0, a1, a0, 1
+ addi.d a6, a6, 8
+ addi.w a5, a5, -2
+ blt zero, a5, .WMASK420_W16_LSX
+ b .END_W420
+
+.WMASK420_W32_LSX:
+.WMASK420_W64_LSX:
+.WMASK420_W128_LSX:
+
+.LOOP_W32_420_LSX:
+ add.d t1, a2, zero
+ add.d t2, a3, zero
+ add.d t3, a0, zero
+ add.d t4, a6, zero
+ alsl.d t5, a4, t1, 1
+ alsl.d t6, a4, t2, 1
+ or t7, a4, a4
+
+.W32_420_LSX:
+ vld vr0, t1, 0
+ vld vr1, t1, 16
+ vld vr2, t2, 0
+ vld vr3, t2, 16
+ vld vr4, t5, 0
+ vld vr5, t5, 16
+ vld vr6, t6, 0
+ vld vr7, t6, 16
+ addi.d t1, t1, 32
+ addi.d t2, t2, 32
+ addi.d t5, t5, 32
+ addi.d t6, t6, 32
+ addi.w t7, t7, -16
+ vabsd.h vr8, vr0, vr2
+ vabsd.h vr9, vr1, vr3
+ vabsd.h vr10, vr4, vr6
+ vabsd.h vr11, vr5, vr7
+ vaddi.hu vr8, vr8, 8
+ vaddi.hu vr9, vr9, 8
+ vaddi.hu vr10, vr10, 8
+ vaddi.hu vr11, vr11, 8
+ vsrli.h vr8, vr8, 8
+ vsrli.h vr9, vr9, 8
+ vsrli.h vr10, vr10, 8
+ vsrli.h vr11, vr11, 8
+ vadd.h vr8, vr8, vr22
+ vadd.h vr9, vr9, vr22
+ vadd.h vr10, vr10, vr22
+ vadd.h vr11, vr11, vr22
+ vmin.hu vr12, vr8, vr20
+ vmin.hu vr13, vr9, vr20
+ vmin.hu vr14, vr10, vr20
+ vmin.hu vr15, vr11, vr20
+ vsub.h vr16, vr20, vr12
+ vsub.h vr17, vr20, vr13
+ vsub.h vr18, vr20, vr14
+ vsub.h vr19, vr20, vr15
+ vmulwev.w.h vr8, vr12, vr0
+ vmulwod.w.h vr9, vr12, vr0
+ vmulwev.w.h vr10, vr13, vr1
+ vmulwod.w.h vr11, vr13, vr1
+ vmulwev.w.h vr23, vr14, vr4
+ vmulwod.w.h vr24, vr14, vr4
+ vmulwev.w.h vr25, vr15, vr5
+ vmulwod.w.h vr26, vr15, vr5
+ vmaddwev.w.h vr8, vr16, vr2
+ vmaddwod.w.h vr9, vr16, vr2
+ vmaddwev.w.h vr10, vr17, vr3
+ vmaddwod.w.h vr11, vr17, vr3
+ vmaddwev.w.h vr23, vr18, vr6
+ vmaddwod.w.h vr24, vr18, vr6
+ vmaddwev.w.h vr25, vr19, vr7
+ vmaddwod.w.h vr26, vr19, vr7
+ vssrarni.hu.w vr10, vr8, 10
+ vssrarni.hu.w vr11, vr9, 10
+ vssrarni.hu.w vr25, vr23, 10
+ vssrarni.hu.w vr26, vr24, 10
+ vssrlni.bu.h vr11, vr10, 0
+ vssrlni.bu.h vr26, vr25, 0
+ vshuf4i.w vr8, vr11, 0x4E
+ vshuf4i.w vr9, vr26, 0x4E
+ vilvl.b vr3, vr8, vr11
+ vilvl.b vr7, vr9, vr26
+ vst vr3, t3, 0
+ vstx vr7, a1, t3
+ addi.d t3, t3, 16
+ vpickev.h vr8, vr13, vr12
+ vpickod.h vr9, vr13, vr12
+ vpickev.h vr10, vr15, vr14
+ vpickod.h vr11, vr15, vr14
+ vadd.h vr8, vr8, vr9
+ vadd.h vr10, vr10, vr11
+ vadd.h vr12, vr8, vr10
+ vsub.h vr12, vr12, vr21
+ vssrarni.bu.h vr12, vr12, 2
+ vstelm.d vr12, t4, 0, 0
+ addi.d t4, t4, 8
+ bne t7, zero, .W32_420_LSX
+
+ alsl.d a2, a4, a2, 2
+ alsl.d a3, a4, a3, 2
+ alsl.d a0, a1, a0, 1
+ srai.w t8, a4, 1
+ add.d a6, a6, t8
+ addi.w a5, a5, -2
+ blt zero, a5, .LOOP_W32_420_LSX
+
+.END_W420:
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ addi.d sp, sp, 24
+endfunc
+
+function w_mask_420_8bpc_lasx
+ xvldi xr20, 0x440
+ xvreplgr2vr.h xr21, a7
+ xvldi xr22, 0x426
+
+ clz.w t0, a4
+ li.w t1, 24
+ sub.w t0, t0, t1
+ la.local t1, .WMASK420_LASX_JRTABLE
+ alsl.d t0, t0, t1, 1
+ ld.h t8, t0, 0
+ add.d t1, t1, t8
+ jirl $r0, t1, 0
+
+ .align 3
+.WMASK420_LASX_JRTABLE:
+ .hword .WMASK420_W128_LASX - .WMASK420_LASX_JRTABLE
+ .hword .WMASK420_W64_LASX - .WMASK420_LASX_JRTABLE
+ .hword .WMASK420_W32_LASX - .WMASK420_LASX_JRTABLE
+ .hword .WMASK420_W16_LASX - .WMASK420_LASX_JRTABLE
+ .hword .WMASK420_W8_LASX - .WMASK420_LASX_JRTABLE
+ .hword .WMASK420_W4_LASX - .WMASK420_LASX_JRTABLE
+
+.WMASK420_W4_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a3, 0
+ addi.w a5, a5, -4
+
+ xvabsd.h xr2, xr0, xr1
+ xvaddi.hu xr2, xr2, 8
+ xvsrli.h xr2, xr2, 8
+ xvadd.h xr2, xr2, xr22
+ xvmin.hu xr3, xr2, xr20
+ xvsub.h xr4, xr20, xr3
+ xvmulwev.w.h xr5, xr3, xr0
+ xvmulwod.w.h xr6, xr3, xr0
+ xvmaddwev.w.h xr5, xr4, xr1
+ xvmaddwod.w.h xr6, xr4, xr1
+ xvilvl.w xr7, xr6, xr5
+ xvilvh.w xr8, xr6, xr5
+ xvssrarni.hu.w xr8, xr7, 10
+ xvssrlni.bu.h xr9, xr8, 0
+ vstelm.w vr9, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr9, a0, 0, 1
+ add.d a0, a0, a1
+ xvstelm.w xr9, a0, 0, 4
+ add.d a0, a0, a1
+ xvstelm.w xr9, a0, 0, 5
+ add.d a0, a0, a1
+
+ xvhaddw.w.h xr3, xr3, xr3
+ xvpermi.d xr4, xr3, 0xb1
+ xvadd.h xr3, xr3, xr4
+ xvpickev.h xr3, xr3, xr3
+ xvsub.h xr3, xr3, xr21
+ xvssrarni.bu.h xr3, xr3, 2
+ vstelm.h vr3, a6, 0, 0
+ xvstelm.h xr3, a6, 2, 8
+
+ addi.d a2, a2, 32
+ addi.d a3, a3, 32
+ addi.d a6, a6, 4
+ blt zero, a5, .WMASK420_W4_LASX
+ b .END_W420_LASX
+
+.WMASK420_W8_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a2, 32
+ xvld xr2, a3, 0
+ xvld xr3, a3, 32
+ addi.w a5, a5, -4
+
+ xvabsd.h xr4, xr0, xr2
+ xvabsd.h xr5, xr1, xr3
+ xvaddi.hu xr4, xr4, 8
+ xvaddi.hu xr5, xr5, 8
+ xvsrli.h xr4, xr4, 8
+ xvsrli.h xr5, xr5, 8
+ xvadd.h xr4, xr4, xr22
+ xvadd.h xr5, xr5, xr22
+ xvmin.hu xr6, xr4, xr20
+ xvmin.hu xr7, xr5, xr20
+ xvsub.h xr8, xr20, xr6
+ xvsub.h xr9, xr20, xr7
+ xvmulwev.w.h xr10, xr6, xr0
+ xvmulwod.w.h xr11, xr6, xr0
+ xvmulwev.w.h xr12, xr7, xr1
+ xvmulwod.w.h xr13, xr7, xr1
+ xvmaddwev.w.h xr10, xr8, xr2
+ xvmaddwod.w.h xr11, xr8, xr2
+ xvmaddwev.w.h xr12, xr9, xr3
+ xvmaddwod.w.h xr13, xr9, xr3
+ xvssrarni.hu.w xr12, xr10, 10
+ xvssrarni.hu.w xr13, xr11, 10
+ xvssrlni.bu.h xr13, xr12, 0
+ xvshuf4i.w xr1, xr13, 0x4E
+ xvilvl.b xr17, xr1, xr13
+ vstelm.d vr17, a0, 0, 0
+ add.d a0, a0, a1
+ xvstelm.d xr17, a0, 0, 2
+ add.d a0, a0, a1
+ xvstelm.d xr17, a0, 0, 1
+ add.d a0, a0, a1
+ xvstelm.d xr17, a0, 0, 3
+ add.d a0, a0, a1
+
+ xvhaddw.w.h xr6, xr6, xr6
+ xvhaddw.w.h xr7, xr7, xr7
+ xvpickev.h xr8, xr7, xr6
+ xvpermi.q xr9, xr8, 0x01
+ vadd.h vr8, vr8, vr9
+ vsub.h vr8, vr8, vr21
+ vssrarni.bu.h vr8, vr8, 2
+ vstelm.d vr8, a6, 0, 0
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a6, a6, 8
+ blt zero, a5, .WMASK420_W8_LASX
+ b .END_W420_LASX
+
+.WMASK420_W16_LASX:
+ xvld xr0, a2, 0
+ xvld xr1, a2, 32
+ xvld xr2, a3, 0
+ xvld xr3, a3, 32
+ addi.w a5, a5, -2
+
+ xvabsd.h xr4, xr0, xr2
+ xvabsd.h xr5, xr1, xr3
+ xvaddi.hu xr4, xr4, 8
+ xvaddi.hu xr5, xr5, 8
+ xvsrli.h xr4, xr4, 8
+ xvsrli.h xr5, xr5, 8
+ xvadd.h xr4, xr4, xr22
+ xvadd.h xr5, xr5, xr22
+ xvmin.hu xr4, xr4, xr20
+ xvmin.hu xr5, xr5, xr20
+ xvsub.h xr6, xr20, xr4
+ xvsub.h xr7, xr20, xr5
+ xvmulwev.w.h xr8, xr4, xr0
+ xvmulwod.w.h xr9, xr4, xr0
+ xvmulwev.w.h xr10, xr5, xr1
+ xvmulwod.w.h xr11, xr5, xr1
+ xvmaddwev.w.h xr8, xr6, xr2
+ xvmaddwod.w.h xr9, xr6, xr2
+ xvmaddwev.w.h xr10, xr7, xr3
+ xvmaddwod.w.h xr11, xr7, xr3
+ xvssrarni.hu.w xr10, xr8, 10
+ xvssrarni.hu.w xr11, xr9, 10
+ xvssrlni.bu.h xr11, xr10, 0
+ xvshuf4i.w xr8, xr11, 0x4E
+ xvilvl.b xr15, xr8, xr11
+ xvpermi.d xr16, xr15, 0xd8
+ vst vr16, a0, 0
+ add.d a0, a0, a1
+ xvpermi.q xr16, xr16, 0x01
+ vst vr16, a0, 0
+ add.d a0, a0, a1
+
+ xvhaddw.w.h xr4, xr4, xr4
+ xvhaddw.w.h xr5, xr5, xr5
+ xvadd.h xr4, xr5, xr4
+ xvpickev.h xr6, xr4, xr4
+ xvpermi.d xr7, xr6, 0x08
+ vsub.h vr7, vr7, vr21
+ vssrarni.bu.h vr7, vr7, 2
+ vstelm.d vr7, a6, 0, 0
+
+ addi.d a2, a2, 64
+ addi.d a3, a3, 64
+ addi.d a6, a6, 8
+ blt zero, a5, .WMASK420_W16_LASX
+ b .END_W420_LASX
+
+.WMASK420_W32_LASX:
+.WMASK420_W64_LASX:
+.WMASK420_W128_LASX:
+
+.LOOP_W32_420_LASX:
+ add.d t1, a2, zero
+ add.d t2, a3, zero
+ add.d t3, a0, zero
+ add.d t4, a6, zero
+ alsl.d t5, a4, t1, 1
+ alsl.d t6, a4, t2, 1
+ or t7, a4, a4
+.W32_420_LASX:
+ xvld xr0, t1, 0
+ xvld xr1, t2, 0
+ xvld xr2, t5, 0
+ xvld xr3, t6, 0
+ addi.d t1, t1, 32
+ addi.d t2, t2, 32
+ addi.d t5, t5, 32
+ addi.d t6, t6, 32
+ addi.w t7, t7, -16
+ xvabsd.h xr4, xr0, xr1
+ xvabsd.h xr5, xr2, xr3
+ xvaddi.hu xr4, xr4, 8
+ xvaddi.hu xr5, xr5, 8
+ xvsrli.h xr4, xr4, 8
+ xvsrli.h xr5, xr5, 8
+ xvadd.h xr4, xr4, xr22
+ xvadd.h xr5, xr5, xr22
+ xvmin.hu xr6, xr4, xr20
+ xvmin.hu xr7, xr5, xr20
+ xvsub.h xr8, xr20, xr6
+ xvsub.h xr9, xr20, xr7
+ xvmulwev.w.h xr10, xr6, xr0
+ xvmulwod.w.h xr11, xr6, xr0
+ xvmulwev.w.h xr12, xr7, xr2
+ xvmulwod.w.h xr13, xr7, xr2
+ xvmaddwev.w.h xr10, xr8, xr1
+ xvmaddwod.w.h xr11, xr8, xr1
+ xvmaddwev.w.h xr12, xr9, xr3
+ xvmaddwod.w.h xr13, xr9, xr3
+ xvssrarni.hu.w xr12, xr10, 10
+ xvssrarni.hu.w xr13, xr11, 10
+ xvssrlni.bu.h xr13, xr12, 0
+ xvshuf4i.w xr10, xr13, 0x4E
+ xvilvl.b xr17, xr10, xr13
+ xvpermi.d xr18, xr17, 0x08
+ xvpermi.d xr19, xr17, 0x0d
+ vst vr18, t3, 0
+ vstx vr19, t3, a1
+ addi.d t3, t3, 16
+
+ xvhaddw.w.h xr6, xr6, xr6
+ xvhaddw.w.h xr7, xr7, xr7
+ xvadd.h xr6, xr7, xr6
+ xvpickev.h xr7, xr6, xr6
+ xvpermi.d xr8, xr7, 0x08
+ vsub.h vr9, vr8, vr21
+ vssrarni.bu.h vr9, vr9, 2
+ vstelm.d vr9, t4, 0, 0
+ addi.d t4, t4, 8
+ bne t7, zero, .W32_420_LASX
+
+ alsl.d a2, a4, a2, 2
+ alsl.d a3, a4, a3, 2
+ alsl.d a0, a1, a0, 1
+ srai.w t8, a4, 1
+ add.d a6, a6, t8
+ addi.w a5, a5, -2
+ blt zero, a5, .LOOP_W32_420_LASX
+
+.END_W420_LASX:
+endfunc
+
+#undef bpc_sh
+#undef bpcw_sh
+
+.macro vhaddw.d.h in0
+ vhaddw.w.h \in0, \in0, \in0
+ vhaddw.d.w \in0, \in0, \in0
+.endm
+.macro vhaddw.q.w in0
+ vhaddw.d.w \in0, \in0, \in0
+ vhaddw.q.d \in0, \in0, \in0
+.endm
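+// PUT_H_8W: horizontal 8-tap filter of 8 output pixels from \in0: build the 8
+// overlapping byte windows, dot-product each with the taps in vr8, then narrow
+// and add the bias in vr9 (the caller applies the final >> 6)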
+.macro PUT_H_8W in0
+ vbsrl.v vr2, \in0, 1
+ vbsrl.v vr3, \in0, 2
+ vbsrl.v vr4, \in0, 3
+ vbsrl.v vr5, \in0, 4
+ vbsrl.v vr6, \in0, 5
+ vbsrl.v vr7, \in0, 6
+ vbsrl.v vr10, \in0, 7
+ vilvl.d vr2, vr2, \in0
+ vilvl.d vr3, vr4, vr3
+ vilvl.d vr4, vr6, vr5
+ vilvl.d vr5, vr10, vr7
+ vdp2.h.bu.b \in0, vr2, vr8
+ vdp2.h.bu.b vr2, vr3, vr8
+ vdp2.h.bu.b vr3, vr4, vr8
+ vdp2.h.bu.b vr4, vr5, vr8
+ vhaddw.d.h \in0
+ vhaddw.d.h vr2
+ vhaddw.d.h vr3
+ vhaddw.d.h vr4
+ vpickev.w \in0, vr2, \in0
+ vpickev.w vr2, vr4, vr3
+ vpickev.h \in0, vr2, \in0
+ vadd.h \in0, \in0, vr9
+.endm
+.macro FILTER_8TAP_4W in0
+ vbsrl.v vr10, \in0, 1
+ vbsrl.v vr11, \in0, 2
+ vbsrl.v vr12, \in0, 3
+ vilvl.d vr10, vr10, \in0
+ vilvl.d vr11, vr12, vr11
+ vdp2.h.bu.b vr7, vr10, vr8
+ vdp2.h.bu.b vr10, vr11, vr8
+ vhaddw.d.h vr7
+ vhaddw.d.h vr10
+ vpickev.w \in0, vr10, vr7
+.endm
+.macro FILTER_8TAP_8W in0
+ vbsrl.v vr10, \in0, 1
+ vbsrl.v vr11, \in0, 2
+ vbsrl.v vr12, \in0, 3
+ vbsrl.v vr13, \in0, 4
+ vbsrl.v vr14, \in0, 5
+ vbsrl.v vr15, \in0, 6
+ vbsrl.v vr16, \in0, 7
+ vilvl.d vr10, vr10, \in0
+ vilvl.d vr11, vr12, vr11
+ vilvl.d vr12, vr14, vr13
+ vilvl.d vr13, vr16, vr15
+ vdp2.h.bu.b vr14, vr10, vr8
+ vdp2.h.bu.b vr15, vr11, vr8
+ vdp2.h.bu.b vr16, vr12, vr8
+ vdp2.h.bu.b vr17, vr13, vr8
+ vhaddw.d.h vr14
+ vhaddw.d.h vr15
+ vhaddw.d.h vr16
+ vhaddw.d.h vr17
+ vpickev.w vr13, vr15, vr14
+ vpickev.w vr14, vr17, vr16
+ vpickev.h \in0, vr14, vr13 //x0 ... x7
+ vsrari.h \in0, \in0, 2
+.endm
+.macro FILTER_8TAP_8W_CLIP_STORE
+ vdp2.w.h vr12, vr0, vr9
+ vdp2.w.h vr13, vr1, vr9
+ vdp2.w.h vr14, vr2, vr9
+ vdp2.w.h vr15, vr3, vr9
+ vdp2.w.h vr16, vr4, vr9
+ vdp2.w.h vr17, vr5, vr9
+ vdp2.w.h vr18, vr6, vr9
+ vdp2.w.h vr19, vr7, vr9
+ vhaddw.q.w vr12
+ vhaddw.q.w vr13
+ vhaddw.q.w vr14
+ vhaddw.q.w vr15
+ vhaddw.q.w vr16
+ vhaddw.q.w vr17
+ vhaddw.q.w vr18
+ vhaddw.q.w vr19
+ vpackev.w vr12, vr13, vr12
+ vpackev.w vr13, vr15, vr14
+ vpackev.d vr12, vr13, vr12
+ vpackev.w vr14, vr17, vr16
+ vpackev.w vr15, vr19, vr18
+ vpackev.d vr13, vr15, vr14
+ vssrarni.hu.w vr13, vr12, 10
+ vssrani.bu.h vr13, vr13, 0
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a1
+.endm
+.macro VEXTRINS_Hx8 in0
+ vextrins.h vr0, \in0, 0x70
+ vextrins.h vr1, \in0, 0x71
+ vextrins.h vr2, \in0, 0x72
+ vextrins.h vr3, \in0, 0x73
+ vextrins.h vr4, \in0, 0x74
+ vextrins.h vr5, \in0, 0x75
+ vextrins.h vr6, \in0, 0x76
+ vextrins.h vr7, \in0, 0x77
+.endm
+.macro VBSRL_Vx8
+ vbsrl.v vr0, vr0, 2
+ vbsrl.v vr1, vr1, 2
+ vbsrl.v vr2, vr2, 2
+ vbsrl.v vr3, vr3, 2
+ vbsrl.v vr4, vr4, 2
+ vbsrl.v vr5, vr5, 2
+ vbsrl.v vr6, vr6, 2
+ vbsrl.v vr7, vr7, 2
+.endm
+
+.macro PUT_8TAP_8BPC_LSX lable
+ li.w t0, 4
+ la.local t6, dav1d_mc_subpel_filters
+ slli.d t2, a3, 1 //src_stride*2
+ add.d t3, t2, a3 //src_stride*3
+ slli.d t4, t2, 1 //src_stride*4
+
+ bnez a6, .l_\lable\()put_h //mx
+ bnez a7, .l_\lable\()put_v //my
+
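+    // neither mx nor my: plain pixel copy, dispatched on block width below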
+ clz.w t1, a4
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()put_hv0_jtable
+ alsl.d t1, t1, t5, 3
+ ld.d t6, t1, 0
+ add.d t5, t5, t6
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()put_hv0_jtable:
+ .dword .l_\lable\()put_hv0_128w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_64w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_32w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_16w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_8w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_4w - .l_\lable\()put_hv0_jtable
+ .dword .l_\lable\()put_hv0_2w - .l_\lable\()put_hv0_jtable
+
+.l_\lable\()put_hv0_2w:
+ vldrepl.h vr0, a2, 0
+ add.d a2, a2, a3
+ vldrepl.h vr1, a2, 0
+ vstelm.h vr0, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.h vr1, a0, 0, 0
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_2w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_4w:
+ fld.s f0, a2, 0
+ fldx.s f1, a2, a3
+ fst.s f0, a0, 0
+ fstx.s f1, a0, a1
+ alsl.d a2, a3, a2, 1
+ alsl.d a0, a1, a0, 1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_4w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_8w:
+ fld.d f0, a2, 0
+ fldx.d f1, a2, a3
+ fst.d f0, a0, 0
+ fstx.d f1, a0, a1
+ alsl.d a2, a3, a2, 1
+ alsl.d a0, a1, a0, 1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_8w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_16w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ vst vr0, a0, 0
+ vstx vr1, a0, a1
+ alsl.d a2, a3, a2, 1
+ alsl.d a0, a1, a0, 1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_16w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_32w:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ add.d a2, a2, a3
+ vld vr2, a2, 0
+ vld vr3, a2, 16
+ vst vr0, a0, 0
+ vst vr1, a0, 16
+ add.d a0, a0, a1
+ vst vr2, a0, 0
+ vst vr3, a0, 16
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_32w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_64w:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ vld vr2, a2, 32
+ vld vr3, a2, 48
+ add.d a2, a2, a3
+ vld vr4, a2, 0
+ vld vr5, a2, 16
+ vld vr6, a2, 32
+ vld vr7, a2, 48
+ add.d a2, a2, a3
+ vst vr0, a0, 0
+ vst vr1, a0, 16
+ vst vr2, a0, 32
+ vst vr3, a0, 48
+ add.d a0, a0, a1
+ vst vr4, a0, 0
+ vst vr5, a0, 16
+ vst vr6, a0, 32
+ vst vr7, a0, 48
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_64w
+ b .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_128w:
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ vld vr2, a2, 32
+ vld vr3, a2, 48
+ vld vr4, a2, 64
+ vld vr5, a2, 80
+ vld vr6, a2, 96
+ vld vr7, a2, 112
+ add.d a2, a2, a3
+ vld vr8, a2, 0
+ vld vr9, a2, 16
+ vld vr10, a2, 32
+ vld vr11, a2, 48
+ vld vr12, a2, 64
+ vld vr13, a2, 80
+ vld vr14, a2, 96
+ vld vr15, a2, 112
+ add.d a2, a2, a3
+ vst vr0, a0, 0
+ vst vr1, a0, 16
+ vst vr2, a0, 32
+ vst vr3, a0, 48
+ vst vr4, a0, 64
+ vst vr5, a0, 80
+ vst vr6, a0, 96
+ vst vr7, a0, 112
+ add.d a0, a0, a1
+ vst vr8, a0, 0
+ vst vr9, a0, 16
+ vst vr10, a0, 32
+ vst vr11, a0, 48
+ vst vr12, a0, 64
+ vst vr13, a0, 80
+ vst vr14, a0, 96
+ vst vr15, a0, 112
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv0_128w
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_h:
+    bnez a7, .l_\lable\()put_hv // if (fh && fv)
+ ld.d t5, sp, 0 //filter_type
+ andi t1, t5, 3
+ blt t0, a4, .l_\lable\()put_h_idx_fh
+ andi t1, t5, 1
+ addi.w t1, t1, 3
+
+.l_\lable\()put_h_idx_fh:
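+    // fh = dav1d_mc_subpel_filters[t1] + (mx - 1) * 8; each filter set holds
+    // 15 subpel positions x 8 taps = 120 bytes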
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a6, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fh's offset
+ vldrepl.d vr8, t1, 0
+ addi.d a2, a2, -3
+ li.w t1, 34
+ vreplgr2vr.h vr9, t1
+
+ clz.w t1, a4
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()put_h_jtable
+ alsl.d t1, t1, t5, 3
+ ld.d t6, t1, 0
+ add.d t5, t5, t6
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()put_h_jtable:
+ .dword .l_\lable\()put_h_128w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_64w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_32w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_16w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_8w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_4w - .l_\lable\()put_h_jtable
+ .dword .l_\lable\()put_h_2w - .l_\lable\()put_h_jtable
+
+.l_\lable\()put_h_2w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ add.d a2, a2, t2
+
+ vbsrl.v vr2, vr0, 1
+ vilvl.d vr0, vr2, vr0
+ vdp2.h.bu.b vr2, vr0, vr8
+ vhaddw.w.h vr0, vr2, vr2
+ vhaddw.d.w vr0, vr0, vr0
+ vbsrl.v vr2, vr1, 1
+ vilvl.d vr1, vr2, vr1
+ vdp2.h.bu.b vr2, vr1, vr8
+ vhaddw.w.h vr1, vr2, vr2
+ vhaddw.d.w vr1, vr1, vr1
+ vpickev.w vr0, vr1, vr0
+ vpickev.h vr0, vr0, vr0
+ vadd.h vr0, vr0, vr9
+ vssrani.bu.h vr0, vr0, 6
+
+ vstelm.h vr0, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.h vr0, a0, 0, 1
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_h_2w
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_h_4w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ add.d a2, a2, t2
+
+ vbsrl.v vr2, vr0, 1
+ vbsrl.v vr3, vr0, 2
+ vbsrl.v vr4, vr0, 3
+ vilvl.d vr0, vr2, vr0 //x0 x1
+ vilvl.d vr2, vr4, vr3 //x2 x3
+ vdp2.h.bu.b vr3, vr0, vr8
+ vdp2.h.bu.b vr4, vr2, vr8
+ vhaddw.w.h vr0, vr3, vr3
+ vhaddw.d.w vr0, vr0, vr0
+ vhaddw.w.h vr2, vr4, vr4
+ vhaddw.d.w vr2, vr2, vr2
+ vpickev.w vr5, vr2, vr0
+ vbsrl.v vr2, vr1, 1
+ vbsrl.v vr3, vr1, 2
+ vbsrl.v vr4, vr1, 3
+ vilvl.d vr0, vr2, vr1 //x0 x1
+ vilvl.d vr2, vr4, vr3 //x2 x3
+ vdp2.h.bu.b vr3, vr0, vr8
+ vdp2.h.bu.b vr4, vr2, vr8
+ vhaddw.w.h vr0, vr3, vr3
+ vhaddw.d.w vr0, vr0, vr0
+ vhaddw.w.h vr2, vr4, vr4
+ vhaddw.d.w vr2, vr2, vr2
+ vpickev.w vr6, vr2, vr0
+ vpickev.h vr0, vr6, vr5
+ vadd.h vr0, vr0, vr9
+ vssrani.bu.h vr0, vr0, 6
+
+ vstelm.w vr0, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr0, a0, 0, 1
+ add.d a0, a0, a1
+ addi.d a5, a5, -2
+ bnez a5, .l_\lable\()put_h_4w
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_h_8w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ add.d a2, a2, t2
+ PUT_H_8W vr0
+ PUT_H_8W vr1
+ vssrani.bu.h vr1, vr0, 6
+ vstelm.d vr1, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.d vr1, a0, 0, 1
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_h_8w
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_h_16w:
+.l_\lable\()put_h_32w:
+.l_\lable\()put_h_64w:
+.l_\lable\()put_h_128w:
+ addi.d t0, a2, 0 //src
+ addi.w t5, a5, 0 //h
+ addi.d t8, a0, 0 //dst
+.l_\lable\()put_h_16w_loop:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ add.d a2, a2, t2
+ PUT_H_8W vr0
+ PUT_H_8W vr1
+ vssrani.bu.h vr1, vr0, 6
+ vstelm.d vr1, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.d vr1, a0, 0, 1
+ add.d a0, a0, a1
+ addi.d a5, a5, -2
+ bnez a5, .l_\lable\()put_h_16w_loop
+ addi.d a2, t0, 8
+ addi.d t0, t0, 8
+ addi.d a0, t8, 8
+ addi.d t8, t8, 8
+ addi.w a5, t5, 0
+ addi.w a4, a4, -8
+ bnez a4, .l_\lable\()put_h_16w_loop
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_v:
+ ld.d t1, sp, 0 //filter_type
+ srli.w t1, t1, 2
+ blt t0, a5, .l_\lable\()put_v_idx_fv
+ andi t1, t1, 1
+ addi.w t1, t1, 3
+
+.l_\lable\()put_v_idx_fv:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a7, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fv's offset
+ vldrepl.d vr8, t1, 0
+ sub.d a2, a2, t3
+
+ clz.w t1, a4
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()put_v_jtable
+ alsl.d t1, t1, t5, 3
+ ld.d t6, t1, 0
+ add.d t5, t5, t6
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()put_v_jtable:
+ .dword .l_\lable\()put_v_128w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_64w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_32w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_16w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_8w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_4w - .l_\lable\()put_v_jtable
+ .dword .l_\lable\()put_v_2w - .l_\lable\()put_v_jtable
+
+.l_\lable\()put_v_2w:
+ fld.s f0, a2, 0
+ fldx.s f1, a2, a3
+ fldx.s f2, a2, t2
+ add.d a2, a2, t3
+ fld.s f3, a2, 0
+ fldx.s f4, a2, a3
+ fldx.s f5, a2, t2
+ fldx.s f6, a2, t3
+ add.d a2, a2, t4
+ vilvl.b vr0, vr1, vr0
+ vilvl.b vr1, vr3, vr2
+ vilvl.b vr2, vr5, vr4
+ vilvl.b vr3, vr7, vr6
+ vilvl.h vr0, vr1, vr0
+ vilvl.h vr1, vr3, vr2
+ vilvl.w vr0, vr1, vr0
+
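+    // rows 0-6 are now transposed into vr0 (one 8-byte column per output pixel);
+    // each iteration shifts in two new rows and does the vertical 8-tap dot product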
+.l_\lable\()put_v_2w_loop:
+ fld.s f7, a2, 0 //h0
+ fldx.s f10, a2, a3 //h1
+ add.d a2, a2, t2
+
+ vextrins.b vr0, vr7, 0x70
+ vextrins.b vr0, vr7, 0xf1
+ vbsrl.v vr1, vr0, 1
+ vextrins.b vr1, vr10, 0x70
+ vextrins.b vr1, vr10, 0xf1
+ vdp2.h.bu.b vr10, vr0, vr8
+ vdp2.h.bu.b vr11, vr1, vr8
+ vbsrl.v vr0, vr1, 1
+ vhaddw.d.h vr10
+ vhaddw.d.h vr11
+ vpickev.w vr10, vr11, vr10
+ vssrarni.hu.w vr10, vr10, 6
+ vssrani.bu.h vr10, vr10, 0
+
+ vstelm.h vr10, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.h vr10, a0, 0, 1
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_v_2w_loop
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_v_4w:
+ fld.s f0, a2, 0
+ fldx.s f1, a2, a3
+ fldx.s f2, a2, t2
+ add.d a2, a2, t3
+ fld.s f3, a2, 0
+ fldx.s f4, a2, a3
+ fldx.s f5, a2, t2
+ fldx.s f6, a2, t3
+ add.d a2, a2, t4
+
+ vilvl.b vr0, vr1, vr0
+ vilvl.b vr1, vr3, vr2
+ vilvl.b vr2, vr5, vr4
+ vilvl.b vr3, vr7, vr6
+ vilvl.h vr0, vr1, vr0
+ vilvl.h vr1, vr3, vr2
+ vilvl.w vr2, vr1, vr0
+ vilvh.w vr3, vr1, vr0
+
+.l_\lable\()put_v_4w_loop:
+ fld.s f7, a2, 0
+ fldx.s f10, a2, a3
+ add.d a2, a2, t2
+
+ vextrins.b vr2, vr7, 0x70
+ vextrins.b vr2, vr7, 0xf1 //x0x1(h0)
+ vbsrl.v vr4, vr2, 1
+ vextrins.b vr4, vr10, 0x70
+ vextrins.b vr4, vr10, 0xf1 //x0x1(h1)
+ vdp2.h.bu.b vr11, vr2, vr8
+ vdp2.h.bu.b vr12, vr4, vr8
+ vbsrl.v vr2, vr4, 1
+
+ vextrins.b vr3, vr7, 0x72
+ vextrins.b vr3, vr7, 0xf3 //x2x3(h0)
+ vbsrl.v vr4, vr3, 1
+ vextrins.b vr4, vr10, 0x72
+ vextrins.b vr4, vr10, 0xf3 //x2x3(h1)
+ vdp2.h.bu.b vr13, vr3, vr8
+ vdp2.h.bu.b vr14, vr4, vr8
+ vbsrl.v vr3, vr4, 1
+
+ vhaddw.d.h vr11
+ vhaddw.d.h vr12
+ vhaddw.d.h vr13
+ vhaddw.d.h vr14
+
+ vpickev.w vr11, vr13, vr11
+ vpickev.w vr12, vr14, vr12
+ vpickev.h vr11, vr12, vr11
+ vssrarni.bu.h vr11, vr11, 6
+ vstelm.w vr11, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr11, a0, 0, 1
+ add.d a0, a0, a1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_v_4w_loop
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_v_8w:
+.l_\lable\()put_v_16w:
+.l_\lable\()put_v_32w:
+.l_\lable\()put_v_64w:
+.l_\lable\()put_v_128w:
+ addi.d t0, a2, 0 //src
+ addi.d t5, a5, 0 //h
+ addi.d t8, a0, 0 //dst
+.l_\lable\()put_v_8w_loop0:
+ fld.d f0, a2, 0
+ fldx.d f1, a2, a3
+ fldx.d f2, a2, t2
+ add.d a2, a2, t3
+ fld.d f3, a2, 0
+ fldx.d f4, a2, a3
+ fldx.d f5, a2, t2
+ fldx.d f6, a2, t3
+ add.d a2, a2, t4
+
+ vilvl.b vr0, vr1, vr0
+ vilvl.b vr1, vr3, vr2
+ vilvl.b vr2, vr5, vr4
+ vilvl.b vr3, vr7, vr6
+ vilvl.h vr4, vr1, vr0
+ vilvh.h vr5, vr1, vr0
+ vilvl.h vr6, vr3, vr2
+ vilvh.h vr7, vr3, vr2
+ vilvl.w vr0, vr6, vr4 // x0x1
+ vilvh.w vr1, vr6, vr4 // x2x3
+ vilvl.w vr2, vr7, vr5 // x4x5
+ vilvh.w vr3, vr7, vr5 // x6x7
+.l_\lable\()put_v_8w_loop:
+ fld.d f7, a2, 0
+ fldx.d f10, a2, a3
+ add.d a2, a2, t2
+ //h0
+ vextrins.b vr0, vr7, 0x70
+ vextrins.b vr0, vr7, 0xf1
+ vextrins.b vr1, vr7, 0x72
+ vextrins.b vr1, vr7, 0xf3
+ vextrins.b vr2, vr7, 0x74
+ vextrins.b vr2, vr7, 0xf5
+ vextrins.b vr3, vr7, 0x76
+ vextrins.b vr3, vr7, 0xf7
+ vdp2.h.bu.b vr11, vr0, vr8
+ vdp2.h.bu.b vr12, vr1, vr8
+ vdp2.h.bu.b vr13, vr2, vr8
+ vdp2.h.bu.b vr14, vr3, vr8
+ vhaddw.d.h vr11
+ vhaddw.d.h vr12
+ vhaddw.d.h vr13
+ vhaddw.d.h vr14
+ vpickev.w vr11, vr12, vr11
+ vpickev.w vr12, vr14, vr13
+ vpickev.h vr11, vr12, vr11
+ vssrarni.bu.h vr11, vr11, 6
+ fst.d f11, a0, 0
+ add.d a0, a0, a1
+ //h1
+ vbsrl.v vr0, vr0, 1
+ vbsrl.v vr1, vr1, 1
+ vbsrl.v vr2, vr2, 1
+ vbsrl.v vr3, vr3, 1
+ vextrins.b vr0, vr10, 0x70
+ vextrins.b vr0, vr10, 0xf1
+ vextrins.b vr1, vr10, 0x72
+ vextrins.b vr1, vr10, 0xf3
+ vextrins.b vr2, vr10, 0x74
+ vextrins.b vr2, vr10, 0xf5
+ vextrins.b vr3, vr10, 0x76
+ vextrins.b vr3, vr10, 0xf7
+ vdp2.h.bu.b vr11, vr0, vr8
+ vdp2.h.bu.b vr12, vr1, vr8
+ vdp2.h.bu.b vr13, vr2, vr8
+ vdp2.h.bu.b vr14, vr3, vr8
+ vhaddw.d.h vr11
+ vhaddw.d.h vr12
+ vhaddw.d.h vr13
+ vhaddw.d.h vr14
+ vpickev.w vr11, vr12, vr11
+ vpickev.w vr12, vr14, vr13
+ vpickev.h vr11, vr12, vr11
+ vssrarni.bu.h vr11, vr11, 6
+ fst.d f11, a0, 0
+ add.d a0, a0, a1
+ vbsrl.v vr0, vr0, 1
+ vbsrl.v vr1, vr1, 1
+ vbsrl.v vr2, vr2, 1
+ vbsrl.v vr3, vr3, 1
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_v_8w_loop
+ addi.d a2, t0, 8
+ addi.d t0, t0, 8
+ addi.d a0, t8, 8
+ addi.d t8, t8, 8
+ addi.d a5, t5, 0
+ addi.w a4, a4, -8
+ bnez a4, .l_\lable\()put_v_8w_loop0
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_hv:
+ ld.d t5, sp, 0 //filter_type
+ andi t1, t5, 3
+ blt t0, a4, .l_\lable\()put_hv_idx_fh
+ andi t1, t5, 1
+ addi.w t1, t1, 3
+.l_\lable\()put_hv_idx_fh:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a6, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fh's offset
+ vldrepl.d vr8, t1, 0
+ ld.d t1, sp, 0 //filter_type
+ srli.w t1, t1, 2
+ blt t0, a5, .l_\lable\()put_hv_idx_fv
+ andi t1, t1, 1
+ addi.w t1, t1, 3
+.l_\lable\()put_hv_idx_fv:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a7, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fv's offset
+ vldrepl.d vr9, t1, 0
+ vexth.h.b vr9, vr9
+
+ sub.d a2, a2, t3
+ addi.d a2, a2, -3
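+    // back up src by 3 rows and 3 columns for the 8-tap support; the horizontal
+    // pass keeps 2 extra fractional bits (>> 2), the vertical pass rounds by >> 10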
+
+ clz.w t1, a4
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()put_hv_jtable
+ alsl.d t1, t1, t5, 3
+ ld.d t6, t1, 0
+ add.d t5, t5, t6
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()put_hv_jtable:
+ .dword .l_\lable\()put_hv_128w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_64w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_32w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_16w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_8w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_4w - .l_\lable\()put_hv_jtable
+ .dword .l_\lable\()put_hv_2w - .l_\lable\()put_hv_jtable
+
+.l_\lable\()put_hv_2w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ vldx vr2, a2, t2
+ add.d a2, a2, t3
+ vld vr3, a2, 0
+ vldx vr4, a2, a3
+ vldx vr5, a2, t2
+ vldx vr6, a2, t3
+ add.d a2, a2, t4
+
+ vbsrl.v vr10, vr0, 1
+ vbsrl.v vr11, vr1, 1
+ vbsrl.v vr12, vr2, 1
+ vbsrl.v vr13, vr3, 1
+ vbsrl.v vr14, vr4, 1
+ vbsrl.v vr15, vr5, 1
+ vbsrl.v vr16, vr6, 1
+ vilvl.d vr0, vr10, vr0
+ vilvl.d vr1, vr11, vr1
+ vilvl.d vr2, vr12, vr2
+ vilvl.d vr3, vr13, vr3
+ vilvl.d vr4, vr14, vr4
+ vilvl.d vr5, vr15, vr5
+ vilvl.d vr6, vr16, vr6
+ vdp2.h.bu.b vr10, vr0, vr8
+ vdp2.h.bu.b vr11, vr1, vr8
+ vdp2.h.bu.b vr12, vr2, vr8
+ vdp2.h.bu.b vr13, vr3, vr8
+ vdp2.h.bu.b vr14, vr4, vr8
+ vdp2.h.bu.b vr15, vr5, vr8
+ vdp2.h.bu.b vr16, vr6, vr8
+ vhaddw.d.h vr10
+ vhaddw.d.h vr11
+ vhaddw.d.h vr12
+ vhaddw.d.h vr13
+ vhaddw.d.h vr14
+ vhaddw.d.h vr15
+ vhaddw.d.h vr16
+
+ vpackev.w vr10, vr11, vr10
+ vpackev.w vr12, vr13, vr12
+ vpackod.d vr11, vr12, vr10
+ vpackev.d vr10, vr12, vr10
+
+ vpackev.w vr12, vr15, vr14
+ vpackev.w vr16, vr17, vr16
+ vpackod.d vr13, vr16, vr12
+ vpackev.d vr12, vr16, vr12
+
+ vpickev.h vr10, vr12, vr10 //0 1 2 3 4 5 6 * (h0)
+ vpickev.h vr11, vr13, vr11 //8 9 10 11 12 13 14 * (h1)
+ vsrari.h vr10, vr10, 2
+ vsrari.h vr11, vr11, 2
+.l_\lable\()put_hv_2w_loop:
+ vld vr7, a2, 0
+ vldx vr12, a2, a3
+ add.d a2, a2, t2
+
+ vbsrl.v vr1, vr7, 1
+ vbsrl.v vr2, vr12, 1
+ vilvl.d vr0, vr1, vr7
+ vilvl.d vr1, vr2, vr12
+ vdp2.h.bu.b vr2, vr0, vr8
+ vdp2.h.bu.b vr3, vr1, vr8
+ vhaddw.d.h vr2
+ vhaddw.d.h vr3
+ vpickev.w vr2, vr3, vr2
+ vpickev.h vr2, vr2, vr2
+ vsrari.h vr2, vr2, 2
+ vextrins.h vr10, vr2, 0x70 //0 1 2 3 4 5 6 7
+ vextrins.h vr11, vr2, 0x71
+ vbsrl.v vr12, vr10, 2
+ vbsrl.v vr13, vr11, 2
+ vextrins.h vr12, vr2, 0x72 //1 2 3 4 5 6 7 8
+ vextrins.h vr13, vr2, 0x73
+ vdp2.w.h vr0, vr10, vr9
+ vdp2.w.h vr1, vr11, vr9
+ vdp2.w.h vr2, vr12, vr9
+ vdp2.w.h vr3, vr13, vr9
+ vhaddw.q.w vr0
+ vhaddw.q.w vr1
+ vhaddw.q.w vr2
+ vhaddw.q.w vr3
+ vpackev.w vr0, vr1, vr0
+ vpackev.w vr1, vr3, vr2
+ vpackev.d vr0, vr1, vr0
+ vssrarni.hu.w vr0, vr0, 10
+ vssrani.bu.h vr0, vr0, 0
+ vbsrl.v vr10, vr12, 2
+ vbsrl.v vr11, vr13, 2
+ vstelm.h vr0, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.h vr0, a0, 0, 1
+ add.d a0, a0, a1
+ addi.d a5, a5, -2
+ bnez a5, .l_\lable\()put_hv_2w_loop
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_hv_4w:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ vldx vr2, a2, t2
+ add.d a2, a2, t3
+ vld vr3, a2, 0
+ vldx vr4, a2, a3
+ vldx vr5, a2, t2
+ vldx vr6, a2, t3
+ add.d a2, a2, t4
+ FILTER_8TAP_4W vr0 //x0 x1 x2 x3
+ FILTER_8TAP_4W vr1
+ FILTER_8TAP_4W vr2
+ FILTER_8TAP_4W vr3
+ FILTER_8TAP_4W vr4
+ FILTER_8TAP_4W vr5
+ FILTER_8TAP_4W vr6
+ vpackev.h vr0, vr1, vr0
+ vpackev.h vr1, vr3, vr2
+ vpackev.h vr2, vr5, vr4
+ vpackev.h vr3, vr7, vr6
+ vilvl.w vr4, vr1, vr0
+ vilvh.w vr5, vr1, vr0
+ vilvl.w vr6, vr3, vr2
+ vilvh.w vr7, vr3, vr2
+ vilvl.d vr0, vr6, vr4 //0 1 2 3 4 5 6 *
+ vilvh.d vr1, vr6, vr4
+ vilvl.d vr2, vr7, vr5
+ vilvh.d vr3, vr7, vr5
+ vsrari.h vr0, vr0, 2
+ vsrari.h vr1, vr1, 2
+ vsrari.h vr2, vr2, 2
+ vsrari.h vr3, vr3, 2
+.l_\lable\()put_hv_4w_loop:
+ vld vr4, a2, 0
+ vldx vr5, a2, a3
+ add.d a2, a2, t2
+ FILTER_8TAP_4W vr4
+ FILTER_8TAP_4W vr5
+ vpickev.h vr4, vr5, vr4
+ vsrari.h vr4, vr4, 2
+ vextrins.h vr0, vr4, 0x70
+ vextrins.h vr1, vr4, 0x71
+ vextrins.h vr2, vr4, 0x72
+ vextrins.h vr3, vr4, 0x73
+ vbsrl.v vr5, vr0, 2
+ vbsrl.v vr6, vr1, 2
+ vbsrl.v vr7, vr2, 2
+ vbsrl.v vr10, vr3, 2
+ vextrins.h vr5, vr4, 0x74
+ vextrins.h vr6, vr4, 0x75
+ vextrins.h vr7, vr4, 0x76
+ vextrins.h vr10, vr4, 0x77
+ vdp2.w.h vr11, vr0, vr9
+ vdp2.w.h vr12, vr1, vr9
+ vdp2.w.h vr13, vr2, vr9
+ vdp2.w.h vr14, vr3, vr9
+ vhaddw.q.w vr11
+ vhaddw.q.w vr12
+ vhaddw.q.w vr13
+ vhaddw.q.w vr14
+ vpackev.w vr0, vr12, vr11
+ vpackev.w vr1, vr14, vr13
+ vpackev.d vr0, vr1, vr0
+ vdp2.w.h vr11, vr5, vr9
+ vdp2.w.h vr12, vr6, vr9
+ vdp2.w.h vr13, vr7, vr9
+ vdp2.w.h vr14, vr10, vr9
+ vhaddw.q.w vr11
+ vhaddw.q.w vr12
+ vhaddw.q.w vr13
+ vhaddw.q.w vr14
+ vpackev.w vr1, vr12, vr11
+ vpackev.w vr2, vr14, vr13
+ vpackev.d vr1, vr2, vr1
+ vssrarni.hu.w vr1, vr0, 10
+ vssrani.bu.h vr1, vr1, 0
+ vstelm.w vr1, a0, 0, 0
+ add.d a0, a0, a1
+ vstelm.w vr1, a0, 0, 1
+ add.d a0, a0, a1
+ vbsrl.v vr0, vr5, 2
+ vbsrl.v vr1, vr6, 2
+ vbsrl.v vr2, vr7, 2
+ vbsrl.v vr3, vr10, 2
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv_4w_loop
+ b .l_\lable\()end_put_8tap
+
+.l_\lable\()put_hv_8w:
+.l_\lable\()put_hv_16w:
+.l_\lable\()put_hv_32w:
+.l_\lable\()put_hv_64w:
+.l_\lable\()put_hv_128w:
+ addi.d t0, a2, 0 //src
+ addi.d t5, a5, 0 //h
+ addi.d t8, a0, 0 //dst
+.l_\lable\()put_hv_8w_loop0:
+ vld vr0, a2, 0
+ vldx vr1, a2, a3
+ vldx vr2, a2, t2
+ add.d a2, a2, t3
+ vld vr3, a2, 0
+ vldx vr4, a2, a3
+ vldx vr5, a2, t2
+ vldx vr6, a2, t3
+ add.d a2, a2, t4
+ FILTER_8TAP_8W vr0
+ FILTER_8TAP_8W vr1
+ FILTER_8TAP_8W vr2
+ FILTER_8TAP_8W vr3
+ FILTER_8TAP_8W vr4
+ FILTER_8TAP_8W vr5
+ FILTER_8TAP_8W vr6
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\
+ vr10,vr11,vr12,vr13,vr14,vr15,vr16,vr17
+.l_\lable\()put_hv_8w_loop:
+ vld vr20, a2, 0
+ vldx vr21, a2, a3
+ add.d a2, a2, t2
+ FILTER_8TAP_8W vr20
+ FILTER_8TAP_8W vr21
+ VEXTRINS_Hx8 vr20
+ FILTER_8TAP_8W_CLIP_STORE
+ VBSRL_Vx8
+ VEXTRINS_Hx8 vr21
+ FILTER_8TAP_8W_CLIP_STORE
+ VBSRL_Vx8
+ addi.w a5, a5, -2
+ bnez a5, .l_\lable\()put_hv_8w_loop
+ addi.d a2, t0, 8
+ addi.d t0, t0, 8
+ addi.d a0, t8, 8
+ addi.d t8, t8, 8
+ addi.d a5, t5, 0
+ addi.w a4, a4, -8
+ bnez a4, .l_\lable\()put_hv_8w_loop0
+.l_\lable\()end_put_8tap:
+.endm
+
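+// each put_8tap_* wrapper passes its filter-type word on the stack to the macro:
+// bits 0-1 select the horizontal filter set, bits 2-3 the vertical one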
+function put_8tap_regular_8bpc_lsx
+ addi.d sp, sp, -16
+ st.d zero, sp, 0
+ PUT_8TAP_8BPC_LSX 0
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_smooth_regular_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 1
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 1
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_sharp_regular_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 2
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 2
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_regular_smooth_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 4
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 4
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_smooth_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 5
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 5
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_sharp_smooth_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 6
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 6
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_regular_sharp_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 8
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 8
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_smooth_sharp_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 9
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 9
+ addi.d sp, sp, 16
+endfunc
+
+function put_8tap_sharp_8bpc_lsx
+ addi.d sp, sp, -16
+ li.w t0, 10
+ st.d t0, sp, 0
+ PUT_8TAP_8BPC_LSX 10
+ addi.d sp, sp, 16
+endfunc
+
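+// shufb1: per-128-bit-lane shuffle pattern selecting bytes 0..7 and 1..8, i.e.
+// two overlapping 8-tap input windows per lane for the horizontal filter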
+const shufb1
+.byte 0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8,0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8
+endconst
+
+.macro SHUFB in0, in1, tmp, out
+ xvbsrl.v \tmp, \in0, 2
+ xvpermi.q \tmp, \in0, 0x20
+ xvshuf.b \out, \tmp, \tmp, \in1
+.endm
+
+.macro HADDWDH in0
+ xvhaddw.w.h \in0, \in0, \in0
+ xvhaddw.d.w \in0, \in0, \in0
+.endm
+
+.macro HADDWQW in0
+ xvhaddw.d.w \in0, \in0, \in0
+ xvhaddw.q.d \in0, \in0, \in0
+.endm
+
+.macro PREP_W16_H in0
+ xvbsrl.v xr4, \in0, 4
+ xvbsrl.v xr5, \in0, 8
+ xvpermi.q xr9, \in0, 0x31
+ xvpackev.d xr5, xr9, xr5
+ xvbsrl.v xr6, xr5, 4
+ SHUFB \in0, xr23, xr9, \in0
+ SHUFB xr4, xr23, xr9, xr4
+ SHUFB xr5, xr23, xr9, xr5
+ SHUFB xr6, xr23, xr9, xr6
+ xvdp2.h.bu.b xr10, \in0, xr22
+ xvdp2.h.bu.b xr11, xr4, xr22
+ xvdp2.h.bu.b xr12, xr5, xr22
+ xvdp2.h.bu.b xr13, xr6, xr22
+ HADDWDH xr10
+ HADDWDH xr11
+ HADDWDH xr12
+ HADDWDH xr13
+ xvpickev.w xr10, xr11, xr10
+ xvpickev.w xr11, xr13, xr12
+ xvpermi.d xr10, xr10, 0xd8
+ xvpermi.d xr11, xr11, 0xd8
+ xvpickev.h xr10, xr11, xr10
+ xvpermi.d xr10, xr10, 0xd8
+ xvsrari.h \in0, xr10, 2
+.endm
+
+.macro PREP_8TAP_8BPC_LASX lable
+ li.w t0, 4
+ la.local t6, dav1d_mc_subpel_filters
+ la.local t7, shufb1
+ xvld xr23, t7, 0
+ slli.d t2, a2, 1 //src_stride*2
+ add.d t3, t2, a2 //src_stride*3
+ slli.d t4, t2, 1
+
+ bnez a5, .l_\lable\()h //mx
+ bnez a6, .l_\lable\()v
+
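+    // neither mx nor my: prep just widens the pixels to 16 bit and shifts left
+    // by 4 (the 8 bpc intermediate bits)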
+ clz.w t1, a3
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()prep_hv0_jtable
+ alsl.d t1, t1, t5, 1
+ ld.h t8, t1, 0
+ add.d t5, t5, t8
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()prep_hv0_jtable:
+ .hword .l_\lable\()hv0_128w - .l_\lable\()prep_hv0_jtable
+ .hword .l_\lable\()hv0_64w - .l_\lable\()prep_hv0_jtable
+ .hword .l_\lable\()hv0_32w - .l_\lable\()prep_hv0_jtable
+ .hword .l_\lable\()hv0_16w - .l_\lable\()prep_hv0_jtable
+ .hword .l_\lable\()hv0_8w - .l_\lable\()prep_hv0_jtable
+ .hword .l_\lable\()hv0_4w - .l_\lable\()prep_hv0_jtable
+
+.l_\lable\()hv0_4w:
+ fld.s f0, a1, 0
+ fldx.s f1, a1, a2
+ fldx.s f2, a1, t2
+ fldx.s f3, a1, t3
+ add.d a1, a1, t4
+ xvpackev.w xr0, xr1, xr0
+ xvpackev.w xr1, xr3, xr2
+ xvpermi.q xr0, xr1, 0x02
+ xvsllwil.hu.bu xr0, xr0, 4
+ xvst xr0, a0, 0
+ addi.d a0, a0, 32
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv0_4w
+ b .l_\lable\()end_pre_8tap
+.l_\lable\()hv0_8w:
+ fld.d f0, a1, 0
+ fldx.d f1, a1, a2
+ fldx.d f2, a1, t2
+ fldx.d f3, a1, t3
+ add.d a1, a1, t4
+ xvpermi.q xr0, xr1, 0x02
+ xvpermi.q xr2, xr3, 0x02
+ xvsllwil.hu.bu xr0, xr0, 4
+ xvsllwil.hu.bu xr2, xr2, 4
+ xvst xr0, a0, 0
+ xvst xr2, a0, 32
+ addi.d a0, a0, 64
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv0_8w
+ b .l_\lable\()end_pre_8tap
+.l_\lable\()hv0_16w:
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vldx vr2, a1, t2
+ vldx vr3, a1, t3
+ add.d a1, a1, t4
+ vext2xv.hu.bu xr0, xr0
+ vext2xv.hu.bu xr1, xr1
+ vext2xv.hu.bu xr2, xr2
+ vext2xv.hu.bu xr3, xr3
+ xvslli.h xr0, xr0, 4
+ xvslli.h xr1, xr1, 4
+ xvslli.h xr2, xr2, 4
+ xvslli.h xr3, xr3, 4
+ xvst xr0, a0, 0
+ xvst xr1, a0, 32
+ xvst xr2, a0, 64
+ xvst xr3, a0, 96
+ addi.d a0, a0, 128
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv0_16w
+ b .l_\lable\()end_pre_8tap
+.l_\lable\()hv0_32w:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+ xvpermi.d xr4, xr0, 0xD8
+ xvpermi.d xr5, xr1, 0xD8
+ xvpermi.d xr6, xr2, 0xD8
+ xvpermi.d xr7, xr3, 0xD8
+ xvpermi.d xr10, xr0, 0x32
+ xvpermi.d xr11, xr1, 0x32
+ xvpermi.d xr12, xr2, 0x32
+ xvpermi.d xr13, xr3, 0x32
+ xvsllwil.hu.bu xr0, xr4, 4
+ xvsllwil.hu.bu xr1, xr5, 4
+ xvsllwil.hu.bu xr2, xr6, 4
+ xvsllwil.hu.bu xr3, xr7, 4
+ xvsllwil.hu.bu xr4, xr10, 4
+ xvsllwil.hu.bu xr5, xr11, 4
+ xvsllwil.hu.bu xr6, xr12, 4
+ xvsllwil.hu.bu xr7, xr13, 4
+ xvst xr0, a0, 0
+ xvst xr4, a0, 32
+ xvst xr1, a0, 64
+ xvst xr5, a0, 96
+ xvst xr2, a0, 128
+ xvst xr6, a0, 160
+ xvst xr3, a0, 192
+ xvst xr7, a0, 224
+ addi.d a0, a0, 256
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv0_32w
+ b .l_\lable\()end_pre_8tap
+.l_\lable\()hv0_64w:
+.l_\lable\()hv0_128w:
+ addi.d t0, a1, 0
+ addi.d t5, a4, 0
+ srli.w t7, a3, 5
+ slli.w t7, t7, 6
+ addi.d t8, a0, 0
+.l_\lable\()hv0_32_loop:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+ xvpermi.d xr4, xr0, 0xD8
+ xvpermi.d xr5, xr1, 0xD8
+ xvpermi.d xr6, xr2, 0xD8
+ xvpermi.d xr7, xr3, 0xD8
+ xvpermi.d xr10, xr0, 0x32
+ xvpermi.d xr11, xr1, 0x32
+ xvpermi.d xr12, xr2, 0x32
+ xvpermi.d xr13, xr3, 0x32
+ xvsllwil.hu.bu xr0, xr4, 4
+ xvsllwil.hu.bu xr1, xr5, 4
+ xvsllwil.hu.bu xr2, xr6, 4
+ xvsllwil.hu.bu xr3, xr7, 4
+ xvsllwil.hu.bu xr4, xr10, 4
+ xvsllwil.hu.bu xr5, xr11, 4
+ xvsllwil.hu.bu xr6, xr12, 4
+ xvsllwil.hu.bu xr7, xr13, 4
+ xvst xr0, a0, 0
+ xvst xr4, a0, 32
+ add.d t1, a0, t7
+ xvst xr1, t1, 0
+ xvst xr5, t1, 32
+ add.d t1, t1, t7
+ xvst xr2, t1, 0
+ xvst xr6, t1, 32
+ add.d t1, t1, t7
+ xvst xr3, t1, 0
+ xvst xr7, t1, 32
+ add.d a0, t1, t7
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv0_32_loop
+ addi.d a1, t0, 32
+ addi.d t0, t0, 32
+ addi.d a0, t8, 64
+ addi.d t8, t8, 64
+ addi.d a4, t5, 0
+ addi.d a3, a3, -32
+ bnez a3, .l_\lable\()hv0_32_loop
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()h:
+    bnez a6, .l_\lable\()hv // if (fh && fv)
+
+ andi t1, a7, 3
+ blt t0, a3, .l_\lable\()h_idx_fh
+ andi t1, a7, 1
+ addi.w t1, t1, 3
+.l_\lable\()h_idx_fh:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a5, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fh's offset
+ xvldrepl.d xr22, t1, 0
+
+ addi.d a1, a1, -3
+ clz.w t1, a3
+ li.w t5, 24
+ sub.w t1, t1, t5
+ la.local t5, .l_\lable\()prep_h_jtable
+ alsl.d t1, t1, t5, 1
+ ld.h t8, t1, 0
+ add.d t5, t5, t8
+ jirl $r0, t5, 0
+
+ .align 3
+.l_\lable\()prep_h_jtable:
+ .hword .l_\lable\()h_128w - .l_\lable\()prep_h_jtable
+ .hword .l_\lable\()h_64w - .l_\lable\()prep_h_jtable
+ .hword .l_\lable\()h_32w - .l_\lable\()prep_h_jtable
+ .hword .l_\lable\()h_16w - .l_\lable\()prep_h_jtable
+ .hword .l_\lable\()h_8w - .l_\lable\()prep_h_jtable
+ .hword .l_\lable\()h_4w - .l_\lable\()prep_h_jtable
+
+.l_\lable\()h_4w:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+
+ SHUFB xr0, xr23, xr9, xr0
+ SHUFB xr1, xr23, xr9, xr1
+ SHUFB xr2, xr23, xr9, xr2
+ SHUFB xr3, xr23, xr9, xr3
+
+ xvdp2.h.bu.b xr10, xr0, xr22
+ xvdp2.h.bu.b xr12, xr1, xr22
+ xvdp2.h.bu.b xr14, xr2, xr22
+ xvdp2.h.bu.b xr16, xr3, xr22
+
+ HADDWDH xr10 //h0 mid0 mid1 mid2 mid3
+ HADDWDH xr12 //h1 mid4 mid5 mid6 mid7
+ HADDWDH xr14 //h2
+ HADDWDH xr16 //h3
+
+ xvpickev.w xr10, xr12, xr10
+ xvpickev.w xr14, xr16, xr14
+ xvpermi.d xr10, xr10, 0xd8
+ xvpermi.d xr14, xr14, 0xd8
+ xvpickev.h xr10, xr14, xr10
+ xvpermi.d xr10, xr10, 0xd8
+ xvsrari.h xr10, xr10, 2
+
+ xvst xr10, a0, 0
+ addi.d a0, a0, 32
+ addi.w a4, a4, -4
+ bnez a4, .l_\lable\()h_4w
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()h_8w:
+ xvld xr0, a1, 0
+ xvldx xr2, a1, a2
+ xvldx xr4, a1, t2
+ xvldx xr6, a1, t3
+ add.d a1, a1, t4
+
+ xvbsrl.v xr1, xr0, 4
+ xvbsrl.v xr3, xr2, 4
+ xvbsrl.v xr5, xr4, 4
+ xvbsrl.v xr7, xr6, 4
+
+ SHUFB xr0, xr23, xr9, xr10
+ SHUFB xr1, xr23, xr9, xr11
+ SHUFB xr2, xr23, xr9, xr12
+ SHUFB xr3, xr23, xr9, xr13
+ SHUFB xr4, xr23, xr9, xr14
+ SHUFB xr5, xr23, xr9, xr15
+ SHUFB xr6, xr23, xr9, xr16
+ SHUFB xr7, xr23, xr9, xr17
+
+ xvdp2.h.bu.b xr0, xr10, xr22
+ xvdp2.h.bu.b xr1, xr11, xr22
+ xvdp2.h.bu.b xr2, xr12, xr22
+ xvdp2.h.bu.b xr3, xr13, xr22
+ xvdp2.h.bu.b xr4, xr14, xr22
+ xvdp2.h.bu.b xr5, xr15, xr22
+ xvdp2.h.bu.b xr6, xr16, xr22
+ xvdp2.h.bu.b xr7, xr17, xr22
+
+ HADDWDH xr0
+ HADDWDH xr1
+ HADDWDH xr2
+ HADDWDH xr3
+ HADDWDH xr4
+ HADDWDH xr5
+ HADDWDH xr6
+ HADDWDH xr7
+
+ xvpickev.w xr0, xr1, xr0
+ xvpickev.w xr2, xr3, xr2
+ xvpermi.d xr0, xr0, 0xd8
+ xvpermi.d xr2, xr2, 0xd8
+ xvpickev.h xr0, xr2, xr0
+ xvpermi.d xr0, xr0, 0xd8
+ xvsrari.h xr0, xr0, 2
+
+ xvpickev.w xr4, xr5, xr4
+ xvpickev.w xr6, xr7, xr6
+ xvpermi.d xr4, xr4, 0xd8
+ xvpermi.d xr6, xr6, 0xd8
+ xvpickev.h xr4, xr6, xr4
+ xvpermi.d xr4, xr4, 0xd8
+ xvsrari.h xr4, xr4, 2
+
+ xvst xr0, a0, 0
+ xvst xr4, a0, 32
+ addi.d a0, a0, 64
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()h_8w
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()h_16w:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+
+ PREP_W16_H xr0
+ PREP_W16_H xr1
+ PREP_W16_H xr2
+ PREP_W16_H xr3
+
+ xvst xr0, a0, 0
+ xvst xr1, a0, 32
+ xvst xr2, a0, 64
+ xvst xr3, a0, 96
+
+ addi.d a0, a0, 128
+ addi.w a4, a4, -4
+ bnez a4, .l_\lable\()h_16w
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()h_32w:
+.l_\lable\()h_64w:
+.l_\lable\()h_128w:
+ addi.d t0, a1, 0 //src
+ addi.d t5, a4, 0 //h
+ srli.w t7, a3, 4 //w
+ slli.w t7, t7, 5 //store offset
+ addi.d t8, a0, 0 //dst
+.l_\lable\()h_16_loop:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+
+ PREP_W16_H xr0
+ PREP_W16_H xr1
+ PREP_W16_H xr2
+ PREP_W16_H xr3
+
+ xvst xr0, a0, 0
+ xvstx xr1, a0, t7
+ slli.w t1, t7, 1
+ xvstx xr2, a0, t1
+ add.w t1, t1, t7
+ xvstx xr3, a0, t1
+ slli.w t1, t7, 2
+ add.d a0, a0, t1
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()h_16_loop
+
+ addi.d a1, t0, 16
+ addi.d t0, t0, 16
+ addi.d a0, t8, 32
+ addi.d t8, t8, 32
+ addi.d a4, t5, 0
+ addi.d a3, a3, -16
+ bnez a3, .l_\lable\()h_16_loop
+ b .l_\lable\()end_pre_8tap
+.l_\lable\()hv:
+ andi t1, a7, 3
+ blt t0, a3, .l_\lable\()hv_idx_fh
+ andi t1, a7, 1
+ addi.w t1, t1, 3
+.l_\lable\()hv_idx_fh:
+ addi.w t5, zero, 120
+ mul.w t1, t1, t5
+ addi.w t5, a5, -1
+ slli.w t5, t5, 3
+ add.w t1, t1, t5
+ add.d t1, t6, t1 //fh's offset
+ xvldrepl.d xr22, t1, 0
+ srli.w a7, a7, 2
+ blt t0, a4, .l_\lable\()hv_idx_fv
+ andi a7, a7, 1
+ addi.w a7, a7, 3
+.l_\lable\()hv_idx_fv:
+ addi.w t5, zero, 120
+ mul.w a7, a7, t5
+ addi.w t5, a6, -1
+ slli.w t5, t5, 3
+ add.w a7, a7, t5
+ add.d a7, t6, a7 //fv's offset
+ xvldrepl.d xr8, a7, 0
+ xvsllwil.h.b xr8, xr8, 0
+
+ sub.d a1, a1, t3
+ addi.d a1, a1, -3
+ beq a3, t0, .l_\lable\()hv_4w
+ b .l_\lable\()hv_8w
+.l_\lable\()hv_4w:
+ xvld xr0, a1, 0
+ xvldx xr1, a1, a2
+ xvldx xr2, a1, t2
+ xvldx xr3, a1, t3
+ add.d a1, a1, t4
+ xvld xr4, a1, 0
+ xvldx xr5, a1, a2
+ xvldx xr6, a1, t2
+
+ SHUFB xr0, xr23, xr9, xr0
+ SHUFB xr1, xr23, xr9, xr1
+ SHUFB xr2, xr23, xr9, xr2
+ SHUFB xr3, xr23, xr9, xr3
+
+ SHUFB xr4, xr23, xr9, xr4
+ SHUFB xr5, xr23, xr9, xr5
+ SHUFB xr6, xr23, xr9, xr6
+
+ xvdp2.h.bu.b xr10, xr0, xr22
+ xvdp2.h.bu.b xr11, xr1, xr22
+ xvdp2.h.bu.b xr12, xr2, xr22
+ xvdp2.h.bu.b xr13, xr3, xr22
+
+ xvdp2.h.bu.b xr14, xr4, xr22
+ xvdp2.h.bu.b xr15, xr5, xr22
+ xvdp2.h.bu.b xr16, xr6, xr22
+
+ HADDWDH xr10 //h0 mid0 mid1 mid2 mid3
+ HADDWDH xr11 //h1 mid4 mid5 mid6 mid7
+ HADDWDH xr12 //h2
+ HADDWDH xr13 //h3
+
+ xvpackev.w xr10, xr11, xr10
+ xvpackev.w xr12, xr13, xr12
+ xvpackev.d xr11, xr12, xr10
+ xvpackod.d xr10, xr12, xr10
+ xvpickev.h xr11, xr10, xr11
+ xvsrari.h xr11, xr11, 2
+
+ HADDWDH xr14 //h4
+ HADDWDH xr15 //h5
+ HADDWDH xr16 //h6
+
+ xvpackev.w xr14, xr15, xr14
+ xvpackev.w xr16, xr17, xr16
+ xvpackev.d xr17, xr16, xr14
+ xvpackod.d xr14, xr16, xr14
+ xvpickev.h xr13, xr14, xr17
+ xvsrari.h xr13, xr13, 2
+
+ xvpackev.d xr18, xr13, xr11 //0 4 8 12 16 20 24 * 2 6 10 14 18 22 26 *
+ xvpackod.d xr19, xr13, xr11 //1 5 9 13 17 21 25 * 3 7 11 15 19 23 27 *
+.l_\lable\()hv_w4_loop:
+ xvldx xr0, a1, t3
+ add.d a1, a1, t4
+ xvld xr1, a1, 0
+ xvldx xr2, a1, a2
+ xvldx xr3, a1, t2
+
+ SHUFB xr0, xr23, xr9, xr0
+ SHUFB xr1, xr23, xr9, xr1
+ SHUFB xr2, xr23, xr9, xr2
+ SHUFB xr3, xr23, xr9, xr3
+
+ xvdp2.h.bu.b xr10, xr0, xr22
+ xvdp2.h.bu.b xr12, xr1, xr22
+ xvdp2.h.bu.b xr14, xr2, xr22
+ xvdp2.h.bu.b xr16, xr3, xr22
+
+ HADDWDH xr10 //h0 mid0 mid1 mid2 mid3
+ HADDWDH xr12 //h1 mid4 mid5 mid6 mid7
+ HADDWDH xr14 //h2
+ HADDWDH xr16 //h3
+
+ xvpackev.w xr10, xr12, xr10
+ xvpackev.w xr14, xr16, xr14
+ xvpackev.d xr12, xr14, xr10
+ xvpackod.d xr10, xr14, xr10
+ xvpickev.h xr12, xr10, xr12
+ xvsrari.h xr12, xr12, 2
+
+ xvextrins.h xr18, xr12, 0x70 //0 4 8 12 16 20 24 0(x0) 2 6 10 14 18 22 26 2(x2)
+ xvextrins.h xr19, xr12, 0x74 //1 5 9 13 17 21 25 0(x1) 3 7 11 15 19 23 27 2(x3)
+
+ xvdp2.w.h xr0, xr18, xr8
+ xvdp2.w.h xr2, xr19, xr8
+ HADDWQW xr0
+ HADDWQW xr2
+ xvpackev.w xr0, xr2, xr0
+
+ xvbsrl.v xr18, xr18, 2
+ xvbsrl.v xr19, xr19, 2
+ xvextrins.h xr18, xr12, 0x71
+ xvextrins.h xr19, xr12, 0x75
+ xvdp2.w.h xr2, xr18, xr8
+ xvdp2.w.h xr4, xr19, xr8
+ HADDWQW xr2
+ HADDWQW xr4
+ xvpackev.w xr2, xr4, xr2
+
+ xvbsrl.v xr18, xr18, 2
+ xvbsrl.v xr19, xr19, 2
+ xvextrins.h xr18, xr12, 0x72
+ xvextrins.h xr19, xr12, 0x76
+ xvdp2.w.h xr4, xr18, xr8
+ xvdp2.w.h xr9, xr19, xr8
+ HADDWQW xr4
+ HADDWQW xr9
+ xvpackev.w xr4, xr9, xr4
+
+ xvbsrl.v xr18, xr18, 2
+ xvbsrl.v xr19, xr19, 2
+ xvextrins.h xr18, xr12, 0x73
+ xvextrins.h xr19, xr12, 0x77
+ xvdp2.w.h xr9, xr18, xr8
+ xvdp2.w.h xr11, xr19, xr8
+ HADDWQW xr9
+ HADDWQW xr11
+ xvpackev.w xr9, xr11, xr9
+
+ xvpackev.d xr0, xr2, xr0
+ xvpackev.d xr4, xr9, xr4
+ xvsrari.w xr0, xr0, 6
+ xvsrari.w xr4, xr4, 6
+ xvpermi.d xr0, xr0, 0xd8
+ xvpermi.d xr4, xr4, 0xd8
+ xvpickev.h xr0, xr4, xr0
+ xvpermi.d xr0, xr0, 0xd8
+ xvst xr0, a0, 0
+ addi.d a0, a0, 32
+
+ xvbsrl.v xr18, xr18, 2
+ xvbsrl.v xr19, xr19, 2
+
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv_w4_loop
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()hv_8w:
+ addi.d t0, a1, 0
+ addi.d t5, a4, 0
+ srli.w t7, a3, 3
+ slli.w t7, t7, 4 // store offset
+ addi.d t8, a0, 0
+.l_\lable\()hv_8w_loop0:
+ xvld xr0, a1, 0
+ xvldx xr2, a1, a2
+ xvldx xr4, a1, t2
+ xvldx xr6, a1, t3
+
+ add.d a1, a1, t4
+ xvld xr10, a1, 0
+ xvldx xr11, a1, a2
+ xvldx xr12, a1, t2
+
+ xvbsrl.v xr1, xr0, 4
+ xvbsrl.v xr3, xr2, 4
+ xvbsrl.v xr5, xr4, 4
+ xvbsrl.v xr7, xr6, 4
+
+ SHUFB xr0, xr23, xr9, xr13
+ SHUFB xr1, xr23, xr9, xr14
+ SHUFB xr2, xr23, xr9, xr15
+ SHUFB xr3, xr23, xr9, xr16
+ SHUFB xr4, xr23, xr9, xr17
+ SHUFB xr5, xr23, xr9, xr18
+ SHUFB xr6, xr23, xr9, xr19
+ SHUFB xr7, xr23, xr9, xr20
+
+ xvdp2.h.bu.b xr0, xr13, xr22
+ xvdp2.h.bu.b xr1, xr14, xr22
+ xvdp2.h.bu.b xr2, xr15, xr22
+ xvdp2.h.bu.b xr3, xr16, xr22
+ xvdp2.h.bu.b xr4, xr17, xr22
+ xvdp2.h.bu.b xr5, xr18, xr22
+ xvdp2.h.bu.b xr6, xr19, xr22
+ xvdp2.h.bu.b xr7, xr20, xr22
+
+ HADDWDH xr0
+ HADDWDH xr1
+ HADDWDH xr2
+ HADDWDH xr3
+ HADDWDH xr4
+ HADDWDH xr5
+ HADDWDH xr6
+ HADDWDH xr7
+
+ xvpackev.w xr0, xr2, xr0
+ xvpackev.w xr2, xr6, xr4
+ xvpackev.d xr16, xr2, xr0
+ xvpackod.d xr0, xr2, xr0
+ xvpickev.h xr0, xr0, xr16
+ xvsrari.h xr0, xr0, 2 // 0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27
+
+ xvpackev.w xr1, xr3, xr1
+ xvpackev.w xr3, xr7, xr5
+ xvpackev.d xr16, xr3, xr1
+ xvpackod.d xr1, xr3, xr1
+ xvpickev.h xr1, xr1, xr16
+ xvsrari.h xr1, xr1, 2 // 4 12 20 28 5 13 21 29 6 14 22 30 7 15 23 31
+
+ xvbsrl.v xr13, xr10, 4
+ xvbsrl.v xr14, xr11, 4
+ xvbsrl.v xr15, xr12, 4
+
+ SHUFB xr10, xr23, xr9, xr10
+ SHUFB xr13, xr23, xr9, xr13
+ SHUFB xr11, xr23, xr9, xr11
+ SHUFB xr14, xr23, xr9, xr14
+ SHUFB xr12, xr23, xr9, xr12
+ SHUFB xr15, xr23, xr9, xr15
+
+ xvdp2.h.bu.b xr4, xr10, xr22
+ xvdp2.h.bu.b xr5, xr13, xr22
+ xvdp2.h.bu.b xr6, xr11, xr22
+ xvdp2.h.bu.b xr7, xr14, xr22
+ xvdp2.h.bu.b xr9, xr12, xr22
+ xvdp2.h.bu.b xr10, xr15, xr22
+
+ HADDWDH xr4
+ HADDWDH xr5
+ HADDWDH xr6
+ HADDWDH xr7
+ HADDWDH xr9
+ HADDWDH xr10
+
+ xvpackev.w xr4, xr6, xr4
+ xvpackev.w xr9, xr12, xr9
+ xvpackev.d xr16, xr9, xr4
+ xvpackod.d xr11, xr9, xr4
+ xvpickev.h xr2, xr11, xr16
+ xvsrari.h xr2, xr2, 2 // 32 40 48 * 33 41 49 * 34 42 50 * 35 43 51 *
+
+ xvpackev.w xr5, xr7, xr5
+ xvpackev.w xr10, xr12, xr10
+ xvpackev.d xr16, xr10, xr5
+ xvpackod.d xr11, xr10, xr5
+ xvpickev.h xr3, xr11, xr16
+ xvsrari.h xr3, xr3, 2 // 36 44 52 * 37 45 53 * 38 46 54 * 39 47 55 *
+
+ xvpackev.d xr18, xr2, xr0 // 0 8 16 24 32 40 48 * 2 10 18 26 34 42 50 *
+ xvpackod.d xr19, xr2, xr0 // 1 9 17 25 33 41 49 * 3 11 19 27 35 43 51 *
+ xvpackev.d xr20, xr3, xr1 // 4 12 20 28 36 44 52 * 6 14 22 30 38 46 54 *
+ xvpackod.d xr21, xr3, xr1 // 5 13 21 29 37 45 53 * 7 15 23 31 39 47 55 *
+
+.l_\lable\()hv_8w_loop:
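+ // each iteration horizontally filters 4 new input rows (the 4-byte shifted
+ // copies cover columns 4-7), appends the results to the column histories
+ // xr18-xr21, then runs the vertical 8-tap filter xr8 to emit 4 rows of
+ // 8 int16 outputs.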
+ xvldx xr0, a1, t3
+ add.d a1, a1, t4
+ xvld xr2, a1, 0
+ xvldx xr4, a1, a2
+ xvldx xr6, a1, t2
+
+ xvbsrl.v xr1, xr0, 4
+ xvbsrl.v xr3, xr2, 4
+ xvbsrl.v xr5, xr4, 4
+ xvbsrl.v xr7, xr6, 4
+
+ SHUFB xr0, xr23, xr9, xr0
+ SHUFB xr1, xr23, xr9, xr1
+ SHUFB xr2, xr23, xr9, xr2
+ SHUFB xr3, xr23, xr9, xr3
+ SHUFB xr4, xr23, xr9, xr4
+ SHUFB xr5, xr23, xr9, xr5
+ SHUFB xr6, xr23, xr9, xr6
+ SHUFB xr7, xr23, xr9, xr7
+
+ xvdp2.h.bu.b xr10, xr0, xr22
+ xvdp2.h.bu.b xr11, xr1, xr22
+ xvdp2.h.bu.b xr12, xr2, xr22
+ xvdp2.h.bu.b xr13, xr3, xr22
+ xvdp2.h.bu.b xr14, xr4, xr22
+ xvdp2.h.bu.b xr15, xr5, xr22
+ xvdp2.h.bu.b xr16, xr6, xr22
+ xvdp2.h.bu.b xr17, xr7, xr22
+
+ HADDWDH xr10
+ HADDWDH xr11
+ HADDWDH xr12
+ HADDWDH xr13
+ HADDWDH xr14
+ HADDWDH xr15
+ HADDWDH xr16
+ HADDWDH xr17
+
+ xvpackev.w xr0, xr12, xr10
+ xvpackev.w xr2, xr16, xr14
+ xvpackev.d xr9, xr2, xr0
+ xvpackod.d xr0, xr2, xr0
+ xvpickev.h xr0, xr0, xr9
+ xvsrari.h xr0, xr0, 2 // 56 64 72 80 57 65 73 81 58 66 74 82 59 67 75 83
+
+ xvpackev.w xr1, xr13, xr11
+ xvpackev.w xr3, xr17, xr15
+ xvpackev.d xr9, xr3, xr1
+ xvpackod.d xr1, xr3, xr1
+ xvpickev.h xr1, xr1, xr9
+ xvsrari.h xr1, xr1, 2 // 60 68 76 84 61 69 77 85 62 70 78 86 63 71 79 87
+
+ xvextrins.h xr18, xr0, 0x70 // 0 8 16 24 32 40 48 (56) 2 10 18 26 34 42 50 (58)
+ xvextrins.h xr19, xr0, 0x74 // 1 9 17 25 33 41 49 (57) 3 11 19 27 35 43 51 (59)
+ xvextrins.h xr20, xr1, 0x70
+ xvextrins.h xr21, xr1, 0x74
+
+ //h - 1
+ xvdp2.w.h xr10, xr18, xr8
+ xvdp2.w.h xr11, xr19, xr8
+ xvdp2.w.h xr12, xr20, xr8
+ xvdp2.w.h xr13, xr21, xr8
+
+ HADDWQW xr10
+ HADDWQW xr11
+ HADDWQW xr12
+ HADDWQW xr13
+
+ xvpackev.w xr2, xr11, xr10 //0 1 * * 2 3 * *
+ xvpackev.w xr3, xr13, xr12 //4 5 * * 6 7 * *
+ xvpackev.d xr2, xr3, xr2 //0 1 4 5 2 3 6 7
+ //h - 2
+ xvbsrl.v xr4, xr18, 2
+ xvbsrl.v xr5, xr19, 2
+ xvbsrl.v xr6, xr20, 2
+ xvbsrl.v xr7, xr21, 2
+ xvextrins.h xr4, xr0, 0x71
+ xvextrins.h xr5, xr0, 0x75
+ xvextrins.h xr6, xr1, 0x71
+ xvextrins.h xr7, xr1, 0x75
+
+ xvdp2.w.h xr10, xr4, xr8
+ xvdp2.w.h xr11, xr5, xr8
+ xvdp2.w.h xr12, xr6, xr8
+ xvdp2.w.h xr13, xr7, xr8
+
+ HADDWQW xr10
+ HADDWQW xr11
+ HADDWQW xr12
+ HADDWQW xr13
+
+ xvpackev.w xr14, xr11, xr10
+ xvpackev.w xr15, xr13, xr12
+ xvpackev.d xr14, xr15, xr14 //8 9 12 13 10 11 14 15
+ //h - 3
+ xvbsrl.v xr4, xr4, 2
+ xvbsrl.v xr5, xr5, 2
+ xvbsrl.v xr6, xr6, 2
+ xvbsrl.v xr7, xr7, 2
+ xvextrins.h xr4, xr0, 0x72
+ xvextrins.h xr5, xr0, 0x76
+ xvextrins.h xr6, xr1, 0x72
+ xvextrins.h xr7, xr1, 0x76
+
+ xvdp2.w.h xr10, xr4, xr8
+ xvdp2.w.h xr11, xr5, xr8
+ xvdp2.w.h xr12, xr6, xr8
+ xvdp2.w.h xr13, xr7, xr8
+
+ HADDWQW xr10
+ HADDWQW xr11
+ HADDWQW xr12
+ HADDWQW xr13
+
+ xvpackev.w xr15, xr11, xr10
+ xvpackev.w xr16, xr13, xr12
+ xvpackev.d xr15, xr16, xr15 //16 17 20 21 18 19 22 23
+ //h - 4
+ xvbsrl.v xr4, xr4, 2
+ xvbsrl.v xr5, xr5, 2
+ xvbsrl.v xr6, xr6, 2
+ xvbsrl.v xr7, xr7, 2
+ xvextrins.h xr4, xr0, 0x73
+ xvextrins.h xr5, xr0, 0x77
+ xvextrins.h xr6, xr1, 0x73
+ xvextrins.h xr7, xr1, 0x77
+
+ xvdp2.w.h xr10, xr4, xr8
+ xvdp2.w.h xr11, xr5, xr8
+ xvdp2.w.h xr12, xr6, xr8
+ xvdp2.w.h xr13, xr7, xr8
+
+ HADDWQW xr10
+ HADDWQW xr11
+ HADDWQW xr12
+ HADDWQW xr13
+
+ xvpackev.w xr16, xr11, xr10
+ xvpackev.w xr17, xr13, xr12
+ xvpackev.d xr16, xr17, xr16 //24 25 28 29 26 27 30 31
+
+ xvsrari.w xr2, xr2, 6
+ xvsrari.w xr14, xr14, 6
+ xvsrari.w xr15, xr15, 6
+ xvsrari.w xr16, xr16, 6
+
+ xvpermi.d xr2, xr2, 0xd8
+ xvpermi.d xr14, xr14, 0xd8
+ xvpermi.d xr15, xr15, 0xd8
+ xvpermi.d xr16, xr16, 0xd8
+ xvpickev.h xr2, xr14, xr2
+ xvpickev.h xr3, xr16, xr15
+ xvpermi.d xr2, xr2, 0xd8
+ xvpermi.d xr3, xr3, 0xd8
+
+ xvpermi.q xr10, xr2, 0x31
+ xvpermi.q xr11, xr3, 0x31
+
+ vst vr2, a0, 0
+ vstx vr10, a0, t7 //32
+ slli.w t1, t7, 1 //64
+ vstx vr3, a0, t1
+ add.w t1, t1, t7 //96
+ vstx vr11, a0, t1
+ slli.w t1, t7, 2 //128
+ add.d a0, a0, t1
+
+ xvbsrl.v xr18, xr4, 2
+ xvbsrl.v xr19, xr5, 2
+ xvbsrl.v xr20, xr6, 2
+ xvbsrl.v xr21, xr7, 2
+
+ addi.d a4, a4, -4
+ bnez a4, .l_\lable\()hv_8w_loop
+
+ addi.d a1, t0, 8
+ addi.d t0, t0, 8
+ addi.d a0, t8, 16
+ addi.d t8, t8, 16
+ addi.d a4, t5, 0
+ addi.d a3, a3, -8
+ bnez a3, .l_\lable\()hv_8w_loop0
+ b .l_\lable\()end_pre_8tap
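+// vertical-only filtering: compute the vertical filter (fv) address, load its
+// 8 taps into xr8, then dispatch on block width (4 vs. 8 and wider).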
+.l_\lable\()v:
+
+ srli.w a7, a7, 2
+ blt t0, a4, .l_\lable\()v_idx_fv
+ andi a7, a7, 1
+ addi.w a7, a7, 3
+.l_\lable\()v_idx_fv:
+ addi.w t5, zero, 120
+ mul.w a7, a7, t5
+ addi.w t5, a6, -1
+ slli.w t5, t5, 3
+ add.w a7, a7, t5
+ add.d a7, t6, a7 //fv's offset
+ xvldrepl.d xr8, a7, 0
+
+ sub.d a1, a1, t3
+ beq a3, t0, .l_\lable\()v_4w
+ blt t0, a3, .l_\lable\()v_8w
+.l_\lable\()v_4w:
+ fld.s f0, a1, 0
+ fldx.s f1, a1, a2
+ fldx.s f2, a1, t2
+ add.d a1, a1, t3
+ fld.s f3, a1, 0
+ fldx.s f4, a1, a2
+ fldx.s f5, a1, t2
+ fldx.s f6, a1, t3
+
+ xvilvl.b xr0, xr1, xr0 // 0 1 8 9 16 17 24 25
+ xvilvl.b xr1, xr3, xr2 // 2 3 10 11 18 19 26 27
+ xvilvl.b xr2, xr5, xr4 // 4 5 12 13 20 21 28 29
+ xvilvl.b xr3, xr7, xr6 // 6 7 14 15 22 23 30 31
+ xvilvl.h xr0, xr1, xr0 // 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27
+ xvilvl.h xr1, xr3, xr2 // 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
+ xvilvl.w xr2, xr1, xr0
+ xvilvh.w xr0, xr1, xr0
+ xvpermi.q xr0, xr2, 0x20
+
+.l_\lable\()v_4w_loop:
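+ // each iteration loads 4 new 4-pixel rows, shifts them into the
+ // byte-interleaved 8-row window held in xr0, applies the vertical 8-tap
+ // filter xr8 and stores 4 rows of int16 output.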
+ add.d a1, a1, t4
+ fld.s f7, a1, 0 //h0
+ fldx.s f10, a1, a2 //h1
+ fldx.s f11, a1, t2 //h2
+ fldx.s f12, a1, t3 //h3
+
+ xvbsrl.v xr9, xr7, 2
+ xvpermi.q xr9, xr7, 0x20
+ xvextrins.b xr0, xr9, 0x70
+ xvextrins.b xr0, xr9, 0xf1
+
+ xvbsrl.v xr1, xr0, 1
+ xvbsrl.v xr7, xr10, 2
+ xvpermi.q xr7, xr10, 0x20
+ xvextrins.b xr1, xr7, 0x70
+ xvextrins.b xr1, xr7, 0xf1
+
+ xvbsrl.v xr2, xr1, 1
+ xvbsrl.v xr7, xr11, 2
+ xvpermi.q xr7, xr11, 0x20
+ xvextrins.b xr2, xr7, 0x70
+ xvextrins.b xr2, xr7, 0xf1
+
+ xvbsrl.v xr3, xr2, 1
+ xvbsrl.v xr7, xr12, 2
+ xvpermi.q xr7, xr12, 0x20
+ xvextrins.b xr3, xr7, 0x70
+ xvextrins.b xr3, xr7, 0xf1
+ xvbsrl.v xr4, xr3, 1
+
+ xvdp2.h.bu.b xr10, xr0, xr8
+ xvdp2.h.bu.b xr11, xr1, xr8
+ xvdp2.h.bu.b xr12, xr2, xr8
+ xvdp2.h.bu.b xr13, xr3, xr8
+ HADDWDH xr10
+ HADDWDH xr11
+ HADDWDH xr12
+ HADDWDH xr13
+ xvpickev.w xr10, xr11, xr10
+ xvpickev.w xr11, xr13, xr12
+ xvpermi.d xr10, xr10, 0xd8
+ xvpermi.d xr11, xr11, 0xd8
+ xvpickev.h xr10, xr11, xr10
+ xvpermi.d xr10, xr10, 0xd8
+ xvsrari.h xr10, xr10, 2
+
+ xvaddi.bu xr0, xr4, 0
+
+ xvst xr10, a0, 0
+ addi.d a0, a0, 32
+ addi.w a4, a4, -4
+ bnez a4, .l_\lable\()v_4w_loop
+ b .l_\lable\()end_pre_8tap
+
+.l_\lable\()v_8w:
+ addi.d t0, a1, 0
+ addi.d t5, a4, 0
+ srli.w t7, a3, 2
+ slli.w t7, t7, 3
+ addi.d t8, a0, 0
+.l_\lable\()v_8w_loop0:
+ fld.s f0, a1, 0
+ fldx.s f1, a1, a2
+ fldx.s f2, a1, t2
+ add.d a1, a1, t3
+ fld.s f3, a1, 0
+ fldx.s f4, a1, a2
+ fldx.s f5, a1, t2
+ fldx.s f6, a1, t3
+
+ xvilvl.b xr0, xr1, xr0 // 0 1 8 9 16 17 24 25
+ xvilvl.b xr1, xr3, xr2 // 2 3 10 11 18 19 26 27
+ xvilvl.b xr2, xr5, xr4 // 4 5 12 13 20 21 28 29
+ xvilvl.b xr3, xr7, xr6 // 6 7 14 15 22 23 30 31
+ xvilvl.h xr0, xr1, xr0 // 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27
+ xvilvl.h xr1, xr3, xr2 // 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
+ xvilvl.w xr2, xr1, xr0
+ xvilvh.w xr0, xr1, xr0
+ xvpermi.q xr0, xr2, 0x20
+
+.l_\lable\()v_8w_loop:
+ add.d a1, a1, t4
+ fld.s f7, a1, 0 //h0
+ fldx.s f10, a1, a2 //h1
+ fldx.s f11, a1, t2 //h2
+ fldx.s f12, a1, t3 //h3
+
+ xvbsrl.v xr9, xr7, 2
+ xvpermi.q xr9, xr7, 0x20
+ xvextrins.b xr0, xr9, 0x70
+ xvextrins.b xr0, xr9, 0xf1
+
+ xvbsrl.v xr1, xr0, 1
+ xvbsrl.v xr7, xr10, 2
+ xvpermi.q xr7, xr10, 0x20
+ xvextrins.b xr1, xr7, 0x70
+ xvextrins.b xr1, xr7, 0xf1
+
+ xvbsrl.v xr2, xr1, 1
+ xvbsrl.v xr7, xr11, 2
+ xvpermi.q xr7, xr11, 0x20
+ xvextrins.b xr2, xr7, 0x70
+ xvextrins.b xr2, xr7, 0xf1
+
+ xvbsrl.v xr3, xr2, 1
+ xvbsrl.v xr7, xr12, 2
+ xvpermi.q xr7, xr12, 0x20
+ xvextrins.b xr3, xr7, 0x70
+ xvextrins.b xr3, xr7, 0xf1
+ xvbsrl.v xr4, xr3, 1
+
+ xvdp2.h.bu.b xr10, xr0, xr8
+ xvdp2.h.bu.b xr11, xr1, xr8
+ xvdp2.h.bu.b xr12, xr2, xr8
+ xvdp2.h.bu.b xr13, xr3, xr8
+ HADDWDH xr10
+ HADDWDH xr11
+ HADDWDH xr12
+ HADDWDH xr13
+ xvpickev.w xr10, xr11, xr10
+ xvpickev.w xr11, xr13, xr12
+ xvpermi.d xr10, xr10, 0xd8
+ xvpermi.d xr11, xr11, 0xd8
+ xvpickev.h xr10, xr11, xr10
+ xvpermi.d xr10, xr10, 0xd8
+ xvsrari.h xr10, xr10, 2
+
+ xvaddi.bu xr0, xr4, 0
+
+ xvstelm.d xr10, a0, 0, 0
+ add.d a0, a0, t7
+ xvstelm.d xr10, a0, 0, 1
+ add.d a0, a0, t7
+ xvstelm.d xr10, a0, 0, 2
+ add.d a0, a0, t7
+ xvstelm.d xr10, a0, 0, 3
+ add.d a0, a0, t7
+ addi.w a4, a4, -4
+ bnez a4, .l_\lable\()v_8w_loop
+
+ addi.d a1, t0, 4
+ addi.d t0, t0, 4
+ addi.d a0, t8, 8
+ addi.d t8, t8, 8
+ addi.d a4, t5, 0
+ addi.d a3, a3, -4
+ bnez a3, .l_\lable\()v_8w_loop0
+
+.l_\lable\()end_pre_8tap:
+.endm
+
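+// a7 encodes the filter pair as (vertical type << 2) | horizontal type,
+// with regular = 0, smooth = 1, sharp = 2.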
+function prep_8tap_regular_8bpc_lasx
+ addi.w a7, zero, 0
+ PREP_8TAP_8BPC_LASX 0
+endfunc
+
+function prep_8tap_smooth_regular_8bpc_lasx
+ addi.w a7, zero, 1
+ PREP_8TAP_8BPC_LASX 1
+endfunc
+
+function prep_8tap_sharp_regular_8bpc_lasx
+ addi.w a7, zero, 2
+ PREP_8TAP_8BPC_LASX 2
+endfunc
+
+function prep_8tap_regular_smooth_8bpc_lasx
+ addi.w a7, zero, 4
+ PREP_8TAP_8BPC_LASX 4
+endfunc
+
+function prep_8tap_smooth_8bpc_lasx
+ addi.w a7, zero, 5
+ PREP_8TAP_8BPC_LASX 5
+endfunc
+
+function prep_8tap_sharp_smooth_8bpc_lasx
+ addi.w a7, zero, 6
+ PREP_8TAP_8BPC_LASX 6
+endfunc
+
+function prep_8tap_regular_sharp_8bpc_lasx
+ addi.w a7, zero, 8
+ PREP_8TAP_8BPC_LASX 8
+endfunc
+
+function prep_8tap_smooth_sharp_8bpc_lasx
+ addi.w a7, zero, 9
+ PREP_8TAP_8BPC_LASX 9
+endfunc
+
+function prep_8tap_sharp_8bpc_lasx
+ addi.w a7, zero, 10
+ PREP_8TAP_8BPC_LASX 10
+endfunc
diff --git a/third_party/dav1d/src/loongarch/mc.h b/third_party/dav1d/src/loongarch/mc.h
new file mode 100644
index 0000000000..c64b7efc2b
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/mc.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOONGARCH_MC_H
+#define DAV1D_SRC_LOONGARCH_MC_H
+
+#include "config.h"
+#include "src/mc.h"
+#include "src/cpu.h"
+
+#define init_mc_fn(type, name, suffix) \
+ c->mc[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_fn(type, name, suffix) \
+ c->mct[type] = BF(dav1d_prep_##name, suffix)
+
+decl_avg_fn(BF(dav1d_avg, lsx));
+decl_w_avg_fn(BF(dav1d_w_avg, lsx));
+decl_mask_fn(BF(dav1d_mask, lsx));
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lsx));
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lsx));
+decl_w_mask_fn(BF(dav1d_w_mask_420, lsx));
+
+decl_mc_fn(BF(dav1d_put_8tap_regular, lsx));
+decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, lsx));
+decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, lsx));
+decl_mc_fn(BF(dav1d_put_8tap_smooth, lsx));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, lsx));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, lsx));
+decl_mc_fn(BF(dav1d_put_8tap_sharp, lsx));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, lsx));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, lsx));
+
+decl_avg_fn(BF(dav1d_avg, lasx));
+decl_w_avg_fn(BF(dav1d_w_avg, lasx));
+decl_mask_fn(BF(dav1d_mask, lasx));
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lasx));
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lasx));
+decl_w_mask_fn(BF(dav1d_w_mask_420, lasx));
+
+decl_mct_fn(BF(dav1d_prep_8tap_regular, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, lasx));
+
+static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) {
+#if BITDEPTH == 8
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
+
+ c->avg = BF(dav1d_avg, lsx);
+ c->w_avg = BF(dav1d_w_avg, lsx);
+ c->mask = BF(dav1d_mask, lsx);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, lsx);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, lsx);
+ c->w_mask[2] = BF(dav1d_w_mask_420, lsx);
+
+ init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lsx);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lsx);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, lsx);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lsx);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, lsx);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, lsx);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, lsx);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lsx);
+ init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lsx);
+
+ if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LASX)) return;
+
+ c->avg = BF(dav1d_avg, lasx);
+ c->w_avg = BF(dav1d_w_avg, lasx);
+ c->mask = BF(dav1d_mask, lasx);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, lasx);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, lasx);
+ c->w_mask[2] = BF(dav1d_w_mask_420, lasx);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lasx);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lasx);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, lasx);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lasx);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, lasx);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, lasx);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, lasx);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lasx);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lasx);
+#endif
+}
+
+#endif /* DAV1D_SRC_LOONGARCH_MC_H */
diff --git a/third_party/dav1d/src/loongarch/msac.S b/third_party/dav1d/src/loongarch/msac.S
new file mode 100644
index 0000000000..c371eba4de
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/msac.S
@@ -0,0 +1,368 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "loongson_asm.S"
+
+const min_prob
+ .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+endconst
+
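+// Vectorized msac_decode_symbol_adapt: all candidate v values are computed at
+// once from rng, the CDF and the min_prob table, compared against
+// c = dif >> 48, and the decoded symbol is the ctz of the resulting mask.
+// CDF adaptation and the renorm/refill steps mirror the scalar C version.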
+.macro decode_symbol_adapt w
+ addi.d sp, sp, -48
+ addi.d a4, a0, 24
+ vldrepl.h vr0, a4, 0 //rng
+ fst.s f0, sp, 0 //val==0
+ vld vr1, a1, 0 //cdf
+.if \w == 16
+ li.w t4, 16
+ vldx vr11, a1, t4
+.endif
+ addi.d a6, a0, 16
+ vldrepl.d vr2, a6, 0 //dif
+ addi.d t0, a0, 32
+ ld.w t1, t0, 0 //allow_update_cdf
+ la.local t2, min_prob
+ addi.d t2, t2, 32
+ addi.w t3, a2, 1
+ slli.w t3, t3, 1
+ sub.d t2, t2, t3
+ vld vr3, t2, 0 //min_prob
+.if \w == 16
+ vldx vr13, t2, t4
+.endif
+ vsrli.h vr4, vr0, 8 //r = s->rng >> 8
+ vslli.h vr4, vr4, 8 //r << 8
+ vsrli.h vr5, vr1, 6
+ vslli.h vr5, vr5, 7
+.if \w == 16
+ vsrli.h vr15, vr11, 6
+ vslli.h vr15, vr15, 7
+.endif
+ vmuh.hu vr5, vr4, vr5
+ vadd.h vr5, vr5, vr3 //v
+.if \w == 16
+ vmuh.hu vr15, vr4, vr15
+ vadd.h vr15, vr15, vr13
+.endif
+ addi.d t8, sp, 4
+ vst vr5, t8, 0 //store v
+.if \w == 16
+ vstx vr15, t8, t4
+.endif
+ vreplvei.h vr20, vr2, 3 //c
+ vssub.hu vr6, vr5, vr20 //c >=v
+ vseqi.h vr6, vr6, 0
+.if \w == 16
+ vssub.hu vr16, vr15, vr20 //c >=v
+ vseqi.h vr16, vr16, 0
+ vpickev.b vr21, vr16, vr6
+.endif
+.if \w <= 8
+ vmskltz.h vr10, vr6
+.else
+ vmskltz.b vr10, vr21
+.endif
+ beqz t1, .renorm\()\w
+
+ // update_cdf
+ alsl.d t1, a2, a1, 1
+ ld.h t2, t1, 0 //count
+ srli.w t3, t2, 4 //count >> 4
+ addi.w t3, t3, 4
+ li.w t5, 2
+ sltu t5, t5, a2
+ add.w t3, t3, t5 //rate
+ sltui t5, t2, 32
+ add.w t2, t2, t5 //count + (count < 32)
+ vreplgr2vr.h vr9, t3
+ vseq.h vr7, vr7, vr7
+ vavgr.hu vr5, vr6, vr7 //i >= val ? -1 : 32768
+ vsub.h vr5, vr5, vr1
+ vsub.h vr8, vr1, vr6
+.if \w == 16
+ vavgr.hu vr15, vr16, vr7
+ vsub.h vr15, vr15, vr11
+ vsub.h vr18, vr11, vr16
+.endif
+ vsra.h vr5, vr5, vr9
+ vadd.h vr8, vr8, vr5
+.if \w == 4
+ fst.d f8, a1, 0
+.else
+ vst vr8, a1, 0
+.endif
+.if \w == 16
+ vsra.h vr15, vr15, vr9
+ vadd.h vr18, vr18, vr15
+ vstx vr18, a1, t4
+.endif
+ st.h t2, t1, 0
+
+.renorm\()\w:
+ vpickve2gr.h t3, vr10, 0
+ ctz.w a7, t3 // ret
+ alsl.d t3, a7, t8, 1
+ ld.hu t4, t3, 0 // v
+ addi.d t3, t3, -2
+ ld.hu t5, t3, 0 // u
+ sub.w t5, t5, t4 // rng
+ slli.d t4, t4, 48
+ vpickve2gr.d t6, vr2, 0
+ sub.d t6, t6, t4 // dif
+ addi.d t6, t6, 1
+ clz.w t4, t5 // d
+ xori t4, t4, 16 // d
+ sll.d t6, t6, t4
+ addi.d t6, t6, -1 // dif
+ addi.d a5, a0, 28 // cnt
+ ld.w t7, a5, 0
+ sub.w t7, t7, t4 // cnt-d
+ sll.w t5, t5, t4
+ st.w t5, a4, 0 // store rng
+ bge t7, zero, 9f
+
+ // refill
+ ld.d t0, a0, 0 // buf_pos
+ addi.d t1, a0, 8
+ ld.d t1, t1, 0 // buf_end
+ addi.d t2, t0, 8
+ blt t1, t2, 1f
+
+ ld.d t0, t0, 0 // next_bits
+ addi.w t3, t7, 23 // shift_bits = cnt + 23
+ addi.w t7, t7, 16 // cnt += 16
+ revb.d t0, t0 // next_bits = bswap(next_bits)
+ srli.w t4, t3, 3
+ sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3
+ st.d t2, a0, 0
+ andi t3, t3, 24 // shift_bits &= 24
+ srl.d t0, t0, t3 // next_bits >>= shift_bits
+ sub.w t3, t3, t7 // shift_bits -= 16 + cnt
+ sll.d t0, t0, t3 // next_bits <<= shift_bits
+ li.w t5, 48
+ sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits
+ xor t6, t6, t0 // dif ^= next_bits
+ b 9f
+1:
+ li.w t4, 40
+ sub.w t5, t4, t7 // c = 40 - cnt
+2:
+ bge t0, t1, 3f
+ ld.bu t2, t0, 0
+ addi.d t0, t0, 1
+ sll.d t2, t2, t5
+ xor t6, t6, t2
+ addi.w t5, t5, -8
+ bge t5, zero, 2b
+ // refill_eob_end
+3:
+ st.d t0, a0, 0 // s->buf_pos = buf_pos
+ sub.w t7, t4, t5 // cnt = 40 - c
+9:
+ st.w t7, a5, 0 // store cnt
+ st.d t6, a6, 0 // store dif
+ move a0, a7
+ addi.d sp, sp, 48
+.endm
+
+function msac_decode_symbol_adapt4_lsx
+ decode_symbol_adapt 4
+endfunc
+
+function msac_decode_symbol_adapt8_lsx
+ decode_symbol_adapt 8
+endfunc
+
+function msac_decode_symbol_adapt16_lsx
+ decode_symbol_adapt 16
+endfunc
+
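+// Bool decode with a fixed probability f: v = ((rng >> 8) * (f >> 6) >> 1) + 4,
+// the bit is decided by comparing dif against v << 48, then rng/dif are
+// renormalized and the buffer refilled on underflow.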
+function msac_decode_bool_lsx
+ ld.w t0, a0, 24 // rng
+ srli.w a1, a1, 6
+ ld.d t1, a0, 16 // dif
+ srli.w t2, t0, 8 // r >> 8
+ mul.w t2, t2, a1
+ ld.w a5, a0, 28 // cnt
+ addi.d t1, t1, 1 // dif + 1
+ srli.w t2, t2, 1
+ addi.w t2, t2, 4 // v
+ slli.d t3, t2, 48 // vw
+ sltu t4, t1, t3
+ move t8, t4 // ret
+ xori t4, t4, 1
+ maskeqz t6, t3, t4 // if (ret) vw
+ sub.d t6, t1, t6 // dif
+ slli.w t5, t2, 1
+ sub.w t5, t0, t5 // r - 2v
+ maskeqz t7, t5, t4 // if (ret) r - 2v
+ add.w t5, t2, t7 // v(rng)
+
+ // renorm
+ clz.w t4, t5 // d
+ xori t4, t4, 16 // d
+ sll.d t6, t6, t4
+ addi.d t6, t6, -1 // dif
+ sub.w t7, a5, t4 // cnt-d
+ sll.w t5, t5, t4
+ st.w t5, a0, 24 // store rng
+ bge t7, zero, 9f
+
+ // refill
+ ld.d t0, a0, 0 // buf_pos
+ addi.d t1, a0, 8
+ ld.d t1, t1, 0 // buf_end
+ addi.d t2, t0, 8
+ blt t1, t2, 1f
+
+ ld.d t0, t0, 0 // next_bits
+ addi.w t3, t7, 23 // shift_bits = cnt + 23
+ addi.w t7, t7, 16 // cnt += 16
+ revb.d t0, t0 // next_bits = bswap(next_bits)
+ srli.w t4, t3, 3
+ sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3
+ st.d t2, a0, 0
+ andi t3, t3, 24 // shift_bits &= 24
+ srl.d t0, t0, t3 // next_bits >>= shift_bits
+ sub.w t3, t3, t7 // shift_bits -= 16 + cnt
+ sll.d t0, t0, t3 // next_bits <<= shift_bits
+ li.w t5, 48
+ sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits
+ xor t6, t6, t0 // dif ^= next_bits
+ b 9f
+1:
+ li.w t4, 40
+ sub.w t5, t4, t7 // c = 40 - cnt
+2:
+ bge t0, t1, 3f
+ ld.bu t2, t0, 0
+ addi.d t0, t0, 1
+ sll.d t2, t2, t5
+ xor t6, t6, t2
+ addi.w t5, t5, -8
+ bge t5, zero, 2b
+ // refill_eob_end
+3:
+ st.d t0, a0, 0 // s->buf_pos = buf_pos
+ sub.w t7, t4, t5 // cnt = 40 - c
+9:
+ st.w t7, a0, 28 // store cnt
+ st.d t6, a0, 16 // store dif
+ move a0, t8
+endfunc
+
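+// Same flow as msac_decode_bool, but the probability comes from cdf[0] and,
+// when allow_update_cdf is set, cdf[0]/cdf[1] are adapted before renorm.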
+function msac_decode_bool_adapt_lsx
+ ld.hu a3, a1, 0 // cdf[0], used as the probability f
+ ld.w t0, a0, 24 // rng
+ ld.d t1, a0, 16 // dif
+ srli.w t2, t0, 8 // r >> 8
+ srli.w a7, a3, 6
+ mul.w t2, t2, a7
+ ld.w a4, a0, 32 // allow_update_cdf
+ ld.w a5, a0, 28 // cnt
+ srli.w t2, t2, 1
+ addi.w t2, t2, 4 // v
+ slli.d t3, t2, 48 // vw
+ sltu t4, t1, t3
+ move t8, t4 // bit
+ xori t4, t4, 1
+ maskeqz t6, t3, t4 // if (ret) vw
+ sub.d t6, t1, t6 // dif
+ slli.w t5, t2, 1
+ sub.w t5, t0, t5 // r - 2v
+ maskeqz t7, t5, t4 // if (ret) r - 2v
+ add.w t5, t2, t7 // v(rng)
+ beqz a4, .renorm
+
+ // update_cdf
+ ld.hu t0, a1, 2 // cdf[1]
+ srli.w t1, t0, 4
+ addi.w t1, t1, 4 // rate
+ sltui t2, t0, 32 // count < 32
+ add.w t0, t0, t2 // count + (count < 32)
+ sub.w a3, a3, t8 // cdf[0] -= bit
+ slli.w t4, t8, 15
+ sub.w t7, a3, t4 // cdf[0] - bit - 32768
+ sra.w t7, t7, t1 // (cdf[0] - bit - 32768) >> rate
+ sub.w t7, a3, t7 // cdf[0]
+ st.h t7, a1, 0
+ st.h t0, a1, 2
+
+.renorm:
+ // renorm
+ addi.d t6, t6, 1
+ clz.w t4, t5 // d
+ xori t4, t4, 16 // d
+ sll.d t6, t6, t4
+ addi.d t6, t6, -1 // dif
+ sub.w t7, a5, t4 // cnt-d
+ sll.w t5, t5, t4
+ st.w t5, a0, 24 // store rng
+ bge t7, zero, 9f
+
+ // refill
+ ld.d t0, a0, 0 // buf_pos
+ addi.d t1, a0, 8
+ ld.d t1, t1, 0 // buf_end
+ addi.d t2, t0, 8
+ blt t1, t2, 1f
+
+ ld.d t0, t0, 0 // next_bits
+ addi.w t3, t7, 23 // shift_bits = cnt + 23
+ addi.w t7, t7, 16 // cnt += 16
+ revb.d t0, t0 // next_bits = bswap(next_bits)
+ srli.w t4, t3, 3
+ sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3
+ st.d t2, a0, 0
+ andi t3, t3, 24 // shift_bits &= 24
+ srl.d t0, t0, t3 // next_bits >>= shift_bits
+ sub.w t3, t3, t7 // shift_bits -= 16 + cnt
+ sll.d t0, t0, t3 // next_bits <<= shift_bits
+ li.w t5, 48
+ sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits
+ xor t6, t6, t0 // dif ^= next_bits
+ b 9f
+1:
+ li.w t4, 40
+ sub.w t5, t4, t7 // c = 40 - cnt
+2:
+ bge t0, t1, 3f
+ ld.bu t2, t0, 0
+ addi.d t0, t0, 1
+ sll.d t2, t2, t5
+ xor t6, t6, t2
+ addi.w t5, t5, -8
+ bge t5, zero, 2b
+ // refill_eob_end
+3:
+ st.d t0, a0, 0 // s->buf_pos = buf_pos
+ sub.w t7, t4, t5 // cnt = 40 - c
+9:
+ st.w t7, a0, 28 // store cnt
+ st.d t6, a0, 16 // store dif
+ move a0, t8
+endfunc
diff --git a/third_party/dav1d/src/loongarch/msac.h b/third_party/dav1d/src/loongarch/msac.h
new file mode 100644
index 0000000000..fdcff838bb
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/msac.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOONGARCH_MSAC_H
+#define DAV1D_SRC_LOONGARCH_MSAC_H
+
+unsigned dav1d_msac_decode_symbol_adapt4_lsx(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt8_lsx(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_lsx(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_bool_adapt_lsx(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_lsx(MsacContext *s, unsigned f);
+
+#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_lsx
+#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_lsx
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_lsx
+#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_lsx
+#define dav1d_msac_decode_bool dav1d_msac_decode_bool_lsx
+
+#endif /* DAV1D_SRC_LOONGARCH_MSAC_H */
diff --git a/third_party/dav1d/src/loongarch/refmvs.S b/third_party/dav1d/src/loongarch/refmvs.S
new file mode 100644
index 0000000000..63a83d3ce7
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/refmvs.S
@@ -0,0 +1,152 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/loongarch/loongson_asm.S"
+
+/*
+static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
+ const int bx4, const int bw4, int bh4)
+*/
+
+function splat_mv_lsx
+ vld vr0, a1, 0 // 0 1 ... 11 ...
+ clz.w t4, a3
+ vaddi.bu vr1, vr0, 0
+ addi.w t4, t4, -26
+ vextrins.w vr1, vr0, 0x30 // 0 1 2 ... 11 0 1 2 3
+ la.local t5, .SPLAT_LSX_JRTABLE
+ vbsrl.v vr2, vr1, 4 // 4 5 6 7...11 0 1 2 3 0 0 0 0
+ alsl.d t6, t4, t5, 1
+ vextrins.w vr2, vr0, 0x31 // 4 5 6 7...11 0 1 2 3 4 5 6 7
+ ld.h t7, t6, 0
+ vbsrl.v vr3, vr2, 4 // 8 9 10 11 0 1 2 3 4 5 6 7 0 0 0 0
+ add.d t8, t5, t7
+ alsl.d a2, a2, a2, 1
+ vextrins.w vr3, vr0, 0x32 // 8 9 10 11 0 1 2 3 4 5 6 7 8 9 10 11
+ slli.w a2, a2, 2
+ jirl $r0, t8, 0
+
+.SPLAT_LSX_JRTABLE:
+ .hword .SPLAT_W32_LSX - .SPLAT_LSX_JRTABLE
+ .hword .SPLAT_W16_LSX - .SPLAT_LSX_JRTABLE
+ .hword .SPLAT_W8_LSX - .SPLAT_LSX_JRTABLE
+ .hword .SPLAT_W4_LSX - .SPLAT_LSX_JRTABLE
+ .hword .SPLAT_W2_LSX - .SPLAT_LSX_JRTABLE
+ .hword .SPLAT_W1_LSX - .SPLAT_LSX_JRTABLE
+
+.SPLAT_W1_LSX:
+ ld.d t3, a0, 0
+ addi.d a0, a0, 8
+ addi.d a4, a4, -1
+ add.d t3, t3, a2
+
+ fst.d f1, t3, 0
+ fst.s f3, t3, 8
+ blt zero, a4, .SPLAT_W1_LSX
+ b .splat_end
+.SPLAT_W2_LSX:
+ ld.d t3, a0, 0
+ addi.d a0, a0, 8
+ addi.d a4, a4, -1
+ add.d t3, t3, a2
+
+ vst vr1, t3, 0
+ fst.d f2, t3, 16
+ blt zero, a4, .SPLAT_W2_LSX
+ b .splat_end
+
+.SPLAT_W4_LSX:
+ ld.d t3, a0, 0
+ addi.d a0, a0, 8
+ addi.d a4, a4, -1
+ add.d t3, t3, a2
+
+ vst vr1, t3, 0
+ vst vr2, t3, 16
+ vst vr3, t3, 32
+ blt zero, a4, .SPLAT_W4_LSX
+ b .splat_end
+
+.SPLAT_W8_LSX:
+ ld.d t3, a0, 0
+ addi.d a0, a0, 8
+ addi.d a4, a4, -1
+ add.d t3, t3, a2
+
+ vst vr1, t3, 0
+ vst vr2, t3, 16
+ vst vr3, t3, 32
+
+ vst vr1, t3, 48
+ vst vr2, t3, 64
+ vst vr3, t3, 80
+ blt zero, a4, .SPLAT_W8_LSX
+ b .splat_end
+
+.SPLAT_W16_LSX:
+ ld.d t3, a0, 0
+ addi.d a0, a0, 8
+ addi.d a4, a4, -1
+ add.d t3, t3, a2
+
+.rept 2
+ vst vr1, t3, 0
+ vst vr2, t3, 16
+ vst vr3, t3, 32
+
+ vst vr1, t3, 48
+ vst vr2, t3, 64
+ vst vr3, t3, 80
+
+ addi.d t3, t3, 96
+.endr
+
+ blt zero, a4, .SPLAT_W16_LSX
+ b .splat_end
+
+.SPLAT_W32_LSX:
+ ld.d t3, a0, 0
+ addi.d a0, a0, 8
+ addi.d a4, a4, -1
+ add.d t3, t3, a2
+
+.rept 4
+ vst vr1, t3, 0
+ vst vr2, t3, 16
+ vst vr3, t3, 32
+
+ vst vr1, t3, 48
+ vst vr2, t3, 64
+ vst vr3, t3, 80
+
+ addi.d t3, t3, 96
+.endr
+
+ blt zero, a4, .SPLAT_W32_LSX
+
+.splat_end:
+endfunc
diff --git a/third_party/dav1d/src/loongarch/refmvs.h b/third_party/dav1d/src/loongarch/refmvs.h
new file mode 100644
index 0000000000..60ff435c81
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/refmvs.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOONGARCH_REFMVS_H
+#define DAV1D_SRC_LOONGARCH_REFMVS_H
+
+#include "src/cpu.h"
+#include "src/refmvs.h"
+
+decl_splat_mv_fn(dav1d_splat_mv_lsx);
+
+static ALWAYS_INLINE void refmvs_dsp_init_loongarch(Dav1dRefmvsDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
+
+ c->splat_mv = dav1d_splat_mv_lsx;
+}
+
+#endif /* DAV1D_SRC_LOONGARCH_REFMVS_H */
diff --git a/third_party/dav1d/src/loopfilter.h b/third_party/dav1d/src/loopfilter.h
new file mode 100644
index 0000000000..a0f78c9657
--- /dev/null
+++ b/third_party/dav1d/src/loopfilter.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOOPFILTER_H
+#define DAV1D_SRC_LOOPFILTER_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+#include "src/lf_mask.h"
+
+#define decl_loopfilter_sb_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const uint32_t *mask, \
+ const uint8_t (*lvl)[4], ptrdiff_t lvl_stride, \
+ const Av1FilterLUT *lut, int w HIGHBD_DECL_SUFFIX)
+typedef decl_loopfilter_sb_fn(*loopfilter_sb_fn);
+
+typedef struct Dav1dLoopFilterDSPContext {
+ /*
+ * dimension 1: plane (0=luma, 1=chroma)
+ * dimension 2: 0=col-edge filter (h), 1=row-edge filter (v)
+ *
+ * dst/stride are aligned by 32
+ */
+ loopfilter_sb_fn loop_filter_sb[2][2];
+} Dav1dLoopFilterDSPContext;
+
+bitfn_decls(void dav1d_loop_filter_dsp_init, Dav1dLoopFilterDSPContext *c);
+
+#endif /* DAV1D_SRC_LOOPFILTER_H */
diff --git a/third_party/dav1d/src/loopfilter_tmpl.c b/third_party/dav1d/src/loopfilter_tmpl.c
new file mode 100644
index 0000000000..7cc89643e4
--- /dev/null
+++ b/third_party/dav1d/src/loopfilter_tmpl.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/loopfilter.h"
+
+static NOINLINE void
+loop_filter(pixel *dst, int E, int I, int H,
+ const ptrdiff_t stridea, const ptrdiff_t strideb, const int wd
+ HIGHBD_DECL_SUFFIX)
+{
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const int F = 1 << bitdepth_min_8;
+ E <<= bitdepth_min_8;
+ I <<= bitdepth_min_8;
+ H <<= bitdepth_min_8;
+
+ for (int i = 0; i < 4; i++, dst += stridea) {
+ int p6, p5, p4, p3, p2;
+ int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
+ int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
+ int q2, q3, q4, q5, q6;
+ int fm, flat8out, flat8in;
+
+ fm = abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
+ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
+
+ if (wd > 4) {
+ p2 = dst[strideb * -3];
+ q2 = dst[strideb * +2];
+
+ fm &= abs(p2 - p1) <= I && abs(q2 - q1) <= I;
+
+ if (wd > 6) {
+ p3 = dst[strideb * -4];
+ q3 = dst[strideb * +3];
+
+ fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I;
+ }
+ }
+ if (!fm) continue;
+
+ if (wd >= 16) {
+ p6 = dst[strideb * -7];
+ p5 = dst[strideb * -6];
+ p4 = dst[strideb * -5];
+ q4 = dst[strideb * +4];
+ q5 = dst[strideb * +5];
+ q6 = dst[strideb * +6];
+
+ flat8out = abs(p6 - p0) <= F && abs(p5 - p0) <= F &&
+ abs(p4 - p0) <= F && abs(q4 - q0) <= F &&
+ abs(q5 - q0) <= F && abs(q6 - q0) <= F;
+ }
+
+ if (wd >= 6)
+ flat8in = abs(p2 - p0) <= F && abs(p1 - p0) <= F &&
+ abs(q1 - q0) <= F && abs(q2 - q0) <= F;
+
+ if (wd >= 8)
+ flat8in &= abs(p3 - p0) <= F && abs(q3 - q0) <= F;
+
+ if (wd >= 16 && (flat8out & flat8in)) {
+ dst[strideb * -6] = (p6 + p6 + p6 + p6 + p6 + p6 * 2 + p5 * 2 +
+ p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
+ dst[strideb * -5] = (p6 + p6 + p6 + p6 + p6 + p5 * 2 + p4 * 2 +
+ p3 * 2 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
+ dst[strideb * -4] = (p6 + p6 + p6 + p6 + p5 + p4 * 2 + p3 * 2 +
+ p2 * 2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
+ dst[strideb * -3] = (p6 + p6 + p6 + p5 + p4 + p3 * 2 + p2 * 2 +
+ p1 * 2 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
+ dst[strideb * -2] = (p6 + p6 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
+ p0 * 2 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
+ dst[strideb * -1] = (p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+ q0 * 2 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
+ dst[strideb * +0] = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+ q1 * 2 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
+ dst[strideb * +1] = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+ q2 * 2 + q3 + q4 + q5 + q6 + q6 + 8) >> 4;
+ dst[strideb * +2] = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
+ q3 * 2 + q4 + q5 + q6 + q6 + q6 + 8) >> 4;
+ dst[strideb * +3] = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
+ q4 * 2 + q5 + q6 + q6 + q6 + q6 + 8) >> 4;
+ dst[strideb * +4] = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
+ q5 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
+ dst[strideb * +5] = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 +
+ q6 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
+ } else if (wd >= 8 && flat8in) {
+ dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
+ dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
+ dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
+ dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
+ dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
+ dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
+ } else if (wd == 6 && flat8in) {
+ dst[strideb * -2] = (p2 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3;
+ dst[strideb * -1] = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
+ dst[strideb * +0] = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3;
+ dst[strideb * +1] = (p0 + 2 * q0 + 2 * q1 + 2 * q2 + q2 + 4) >> 3;
+ } else {
+ const int hev = abs(p1 - p0) > H || abs(q1 - q0) > H;
+
+#define iclip_diff(v) iclip(v, -128 * (1 << bitdepth_min_8), \
+ 128 * (1 << bitdepth_min_8) - 1)
+
+ if (hev) {
+ int f = iclip_diff(p1 - q1), f1, f2;
+ f = iclip_diff(3 * (q0 - p0) + f);
+
+ f1 = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3;
+ f2 = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3;
+
+ dst[strideb * -1] = iclip_pixel(p0 + f2);
+ dst[strideb * +0] = iclip_pixel(q0 - f1);
+ } else {
+ int f = iclip_diff(3 * (q0 - p0)), f1, f2;
+
+ f1 = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3;
+ f2 = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3;
+
+ dst[strideb * -1] = iclip_pixel(p0 + f2);
+ dst[strideb * +0] = iclip_pixel(q0 - f1);
+
+ f = (f1 + 1) >> 1;
+ dst[strideb * -2] = iclip_pixel(p1 + f);
+ dst[strideb * +1] = iclip_pixel(q1 - f);
+ }
+#undef iclip_diff
+ }
+ }
+}
+
+static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride,
+ const uint32_t *const vmask,
+ const uint8_t (*l)[4], ptrdiff_t b4_stride,
+ const Av1FilterLUT *lut, const int h
+ HIGHBD_DECL_SUFFIX)
+{
+ const unsigned vm = vmask[0] | vmask[1] | vmask[2];
+ for (unsigned y = 1; vm & ~(y - 1);
+ y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
+ {
+ if (vm & y) {
+ const int L = l[0][0] ? l[0][0] : l[-1][0];
+ if (!L) continue;
+ const int H = L >> 4;
+ const int E = lut->e[L], I = lut->i[L];
+ const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y);
+ loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 << idx
+ HIGHBD_TAIL_SUFFIX);
+ }
+ }
+}
+
+static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,
+ const uint32_t *const vmask,
+ const uint8_t (*l)[4], ptrdiff_t b4_stride,
+ const Av1FilterLUT *lut, const int w
+ HIGHBD_DECL_SUFFIX)
+{
+ const unsigned vm = vmask[0] | vmask[1] | vmask[2];
+ for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
+ if (vm & x) {
+ const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
+ if (!L) continue;
+ const int H = L >> 4;
+ const int E = lut->e[L], I = lut->i[L];
+ const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x);
+ loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 << idx
+ HIGHBD_TAIL_SUFFIX);
+ }
+ }
+}
+
+static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride,
+ const uint32_t *const vmask,
+ const uint8_t (*l)[4], ptrdiff_t b4_stride,
+ const Av1FilterLUT *lut, const int h
+ HIGHBD_DECL_SUFFIX)
+{
+ const unsigned vm = vmask[0] | vmask[1];
+ for (unsigned y = 1; vm & ~(y - 1);
+ y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
+ {
+ if (vm & y) {
+ const int L = l[0][0] ? l[0][0] : l[-1][0];
+ if (!L) continue;
+ const int H = L >> 4;
+ const int E = lut->e[L], I = lut->i[L];
+ const int idx = !!(vmask[1] & y);
+ loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 + 2 * idx
+ HIGHBD_TAIL_SUFFIX);
+ }
+ }
+}
+
+static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
+ const uint32_t *const vmask,
+ const uint8_t (*l)[4], ptrdiff_t b4_stride,
+ const Av1FilterLUT *lut, const int w
+ HIGHBD_DECL_SUFFIX)
+{
+ const unsigned vm = vmask[0] | vmask[1];
+ for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
+ if (vm & x) {
+ const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
+ if (!L) continue;
+ const int H = L >> 4;
+ const int E = lut->e[L], I = lut->i[L];
+ const int idx = !!(vmask[1] & x);
+ loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 + 2 * idx
+ HIGHBD_TAIL_SUFFIX);
+ }
+ }
+}
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/loopfilter.h"
+#elif ARCH_LOONGARCH64
+#include "src/loongarch/loopfilter.h"
+#elif ARCH_X86
+#include "src/x86/loopfilter.h"
+#endif
+#endif
+
+COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
+ c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;
+ c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;
+ c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
+ c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ loop_filter_dsp_init_arm(c);
+#elif ARCH_LOONGARCH64
+ loop_filter_dsp_init_loongarch(c);
+#elif ARCH_X86
+ loop_filter_dsp_init_x86(c);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/looprestoration.h b/third_party/dav1d/src/looprestoration.h
new file mode 100644
index 0000000000..f55dd31947
--- /dev/null
+++ b/third_party/dav1d/src/looprestoration.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOOPRESTORATION_H
+#define DAV1D_SRC_LOOPRESTORATION_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+enum LrEdgeFlags {
+ LR_HAVE_LEFT = 1 << 0,
+ LR_HAVE_RIGHT = 1 << 1,
+ LR_HAVE_TOP = 1 << 2,
+ LR_HAVE_BOTTOM = 1 << 3,
+};
+
+#ifdef BITDEPTH
+typedef const pixel (*const_left_pixel_row)[4];
+#else
+typedef const void *const_left_pixel_row;
+#endif
+
+typedef union LooprestorationParams {
+ ALIGN(int16_t filter[2][8], 16);
+ struct {
+ uint32_t s0, s1;
+ int16_t w0, w1;
+ } sgr;
+} LooprestorationParams;
+
+// Although the spec applies restoration filters over 4x4 blocks,
+// they can be applied to a bigger surface.
+// * w is constrained by the restoration unit size (w <= 256)
+// * h is constrained by the stripe height (h <= 64)
+// The filter functions are allowed to do aligned writes past the right
+// edge of the buffer, aligned up to the minimum loop restoration unit size
+// (which is 32 pixels for subsampled chroma and 64 pixels for luma).
+#define decl_lr_filter_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const_left_pixel_row left, \
+ const pixel *lpf, int w, int h, \
+ const LooprestorationParams *params, \
+ enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+typedef decl_lr_filter_fn(*looprestorationfilter_fn);
+
+typedef struct Dav1dLoopRestorationDSPContext {
+ looprestorationfilter_fn wiener[2]; /* 7-tap, 5-tap */
+ looprestorationfilter_fn sgr[3]; /* 5x5, 3x3, mix */
+} Dav1dLoopRestorationDSPContext;
+
+bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc);
+
+#endif /* DAV1D_SRC_LOOPRESTORATION_H */
diff --git a/third_party/dav1d/src/looprestoration_tmpl.c b/third_party/dav1d/src/looprestoration_tmpl.c
new file mode 100644
index 0000000000..992290864a
--- /dev/null
+++ b/third_party/dav1d/src/looprestoration_tmpl.c
@@ -0,0 +1,558 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+#include "src/looprestoration.h"
+#include "src/tables.h"
+
+// 256 * 1.5 + 3 + 3 = 390
+#define REST_UNIT_STRIDE (390)
+
+// TODO Reuse p when no padding is needed (add and remove lpf pixels in p)
+// TODO Chroma only requires 2 rows of padding.
+static NOINLINE void
+padding(pixel *dst, const pixel *p, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf, int unit_w,
+ const int stripe_h, const enum LrEdgeFlags edges)
+{
+ const int have_left = !!(edges & LR_HAVE_LEFT);
+ const int have_right = !!(edges & LR_HAVE_RIGHT);
+
+ // Copy more pixels if we don't have to pad them
+ unit_w += 3 * have_left + 3 * have_right;
+ pixel *dst_l = dst + 3 * !have_left;
+ p -= 3 * have_left;
+ lpf -= 3 * have_left;
+
+ if (edges & LR_HAVE_TOP) {
+ // Copy previous loop filtered rows
+ const pixel *const above_1 = lpf;
+ const pixel *const above_2 = above_1 + PXSTRIDE(stride);
+ pixel_copy(dst_l, above_1, unit_w);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
+ } else {
+ // Pad with first row
+ pixel_copy(dst_l, p, unit_w);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
+ if (have_left) {
+ pixel_copy(dst_l, &left[0][1], 3);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
+ }
+ }
+
+ pixel *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
+ if (edges & LR_HAVE_BOTTOM) {
+ // Copy next loop filtered rows
+ const pixel *const below_1 = lpf + 6 * PXSTRIDE(stride);
+ const pixel *const below_2 = below_1 + PXSTRIDE(stride);
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
+ } else {
+ // Pad with last row
+ const pixel *const src = p + (stripe_h - 1) * PXSTRIDE(stride);
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
+ if (have_left) {
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ }
+ }
+
+ // Inner UNIT_WxSTRIPE_H
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
+ dst_tl += REST_UNIT_STRIDE;
+ p += PXSTRIDE(stride);
+ }
+
+ if (!have_right) {
+ pixel *pad = dst_l + unit_w;
+ pixel *row_last = &dst_l[unit_w - 1];
+ // Pad 3x(STRIPE_H+6) with last column
+ for (int j = 0; j < stripe_h + 6; j++) {
+ pixel_set(pad, *row_last, 3);
+ pad += REST_UNIT_STRIDE;
+ row_last += REST_UNIT_STRIDE;
+ }
+ }
+
+ if (!have_left) {
+ // Pad 3x(STRIPE_H+6) with first column
+ for (int j = 0; j < stripe_h + 6; j++) {
+ pixel_set(dst, *dst_l, 3);
+ dst += REST_UNIT_STRIDE;
+ dst_l += REST_UNIT_STRIDE;
+ }
+ } else {
+ dst += 3 * REST_UNIT_STRIDE;
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst, &left[j][1], 3);
+ dst += REST_UNIT_STRIDE;
+ }
+ }
+}
+
+// FIXME Could split into luma and chroma specific functions,
+// (since the first and last taps are always 0 for chroma)
+// FIXME Could implement a version that requires less temporary memory
+// (should be possible to implement with only 6 rows of temp storage)
+static void wiener_c(pixel *p, const ptrdiff_t stride,
+ const pixel (*const left)[4],
+ const pixel *lpf, const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
+ // of padding above and below
+ pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+ pixel *tmp_ptr = tmp;
+
+ padding(tmp, p, stride, left, lpf, w, h, edges);
+
+ // Values stored between horizontal and vertical filtering don't
+ // fit in a uint8_t.
+ uint16_t hor[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+ uint16_t *hor_ptr = hor;
+
+ const int16_t (*const filter)[8] = params->filter;
+ const int bitdepth = bitdepth_from_max(bitdepth_max);
+ const int round_bits_h = 3 + (bitdepth == 12) * 2;
+ const int rounding_off_h = 1 << (round_bits_h - 1);
+ const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);
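+    // e.g. for bitdepth == 8: round_bits_h == 3 and clip_limit == 1 << 13,
+    // so the horizontal outputs lie in [0, 8191] and need 13 bits (hence the
+    // uint16_t hor[] buffer above).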
+ for (int j = 0; j < h + 6; j++) {
+ for (int i = 0; i < w; i++) {
+ int sum = (1 << (bitdepth + 6));
+#if BITDEPTH == 8
+ sum += tmp_ptr[i + 3] * 128;
+#endif
+
+ for (int k = 0; k < 7; k++) {
+ sum += tmp_ptr[i + k] * filter[0][k];
+ }
+
+ hor_ptr[i] =
+ iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
+ }
+ tmp_ptr += REST_UNIT_STRIDE;
+ hor_ptr += REST_UNIT_STRIDE;
+ }
+
+ const int round_bits_v = 11 - (bitdepth == 12) * 2;
+ const int rounding_off_v = 1 << (round_bits_v - 1);
+ const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
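+    // Subtracting round_offset cancels the 1 << (bitdepth + 6) bias added to
+    // each horizontal sum above: that bias survives the horizontal shift as
+    // 1 << (bitdepth + 6 - round_bits_h), and the 7 vertical taps sum to 128,
+    // which gives exactly 1 << (bitdepth + round_bits_v - 1) in total.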
+ for (int j = 0; j < h; j++) {
+ for (int i = 0; i < w; i++) {
+ int sum = -round_offset;
+
+ for (int k = 0; k < 7; k++) {
+ sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filter[1][k];
+ }
+
+ p[j * PXSTRIDE(stride) + i] =
+ iclip_pixel((sum + rounding_off_v) >> round_bits_v);
+ }
+ }
+}
+
+// Sum over a 3x3 area
+// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
+// left of the top left corner. However, the self guided filter only needs 1
+// pixel above and one pixel to the left. The pixels below and to the
+// right must still be included in the sums, but don't need to be stored.
+//
+// Example for a 4x4 block:
+// x x x x x x x x x x
+// x c c c c c c c c x
+// x i s s s s s s i x
+// x i s s s s s s i x
+// x i s s s s s s i x
+// x i s s s s s s i x
+// x i s s s s s s i x
+// x i s s s s s s i x
+// x c c c c c c c c x
+// x x x x x x x x x x
+//
+// s: Pixel summed and stored
+// i: Pixel summed and stored (between loops)
+// c: Pixel summed not stored
+// x: Pixel not summed not stored
+static void boxsum3(int32_t *sumsq, coef *sum, const pixel *src,
+ const int w, const int h)
+{
+ // We skip the first row, as it is never used
+ src += REST_UNIT_STRIDE;
+
+ // We skip the first and last columns, as they are never used
+ for (int x = 1; x < w - 1; x++) {
+ coef *sum_v = sum + x;
+ int32_t *sumsq_v = sumsq + x;
+ const pixel *s = src + x;
+ int a = s[0], a2 = a * a;
+ int b = s[REST_UNIT_STRIDE], b2 = b * b;
+
+        // We skip the first 2 rows, as they are skipped in the next loop, and
+        // we don't need the last 2 rows, as they are also skipped in the next loop
+ for (int y = 2; y < h - 2; y++) {
+ s += REST_UNIT_STRIDE;
+ const int c = s[REST_UNIT_STRIDE];
+ const int c2 = c * c;
+ sum_v += REST_UNIT_STRIDE;
+ sumsq_v += REST_UNIT_STRIDE;
+ *sum_v = a + b + c;
+ *sumsq_v = a2 + b2 + c2;
+ a = b;
+ a2 = b2;
+ b = c;
+ b2 = c2;
+ }
+ }
+
+ // We skip the first row as it is never read
+ sum += REST_UNIT_STRIDE;
+ sumsq += REST_UNIT_STRIDE;
+    // We skip the last 2 rows, as they are never read
+ for (int y = 2; y < h - 2; y++) {
+ int a = sum[1], a2 = sumsq[1];
+ int b = sum[2], b2 = sumsq[2];
+
+ // We don't store the first column as it is never read and
+ // we don't store the last 2 columns as they are never read
+ for (int x = 2; x < w - 2; x++) {
+ const int c = sum[x + 1], c2 = sumsq[x + 1];
+ sum[x] = a + b + c;
+ sumsq[x] = a2 + b2 + c2;
+ a = b;
+ a2 = b2;
+ b = c;
+ b2 = c2;
+ }
+ sum += REST_UNIT_STRIDE;
+ sumsq += REST_UNIT_STRIDE;
+ }
+}
+
+// Sum over a 5x5 area
+// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
+// left of the top left corner. However, the self guided filter only needs 1
+// pixel above and one pixel to the left. The pixels below and to the
+// right must still be included in the sums, but don't need to be stored.
+//
+// Example for a 4x4 block:
+// c c c c c c c c c c
+// c c c c c c c c c c
+// i i s s s s s s i i
+// i i s s s s s s i i
+// i i s s s s s s i i
+// i i s s s s s s i i
+// i i s s s s s s i i
+// i i s s s s s s i i
+// c c c c c c c c c c
+// c c c c c c c c c c
+//
+// s: Pixel summed and stored
+// i: Pixel summed and stored (between loops)
+// c: Pixel summed not stored
+// x: Pixel not summed not stored
+static void boxsum5(int32_t *sumsq, coef *sum, const pixel *const src,
+ const int w, const int h)
+{
+ for (int x = 0; x < w; x++) {
+ coef *sum_v = sum + x;
+ int32_t *sumsq_v = sumsq + x;
+ const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
+ int a = s[-3 * REST_UNIT_STRIDE], a2 = a * a;
+ int b = s[-2 * REST_UNIT_STRIDE], b2 = b * b;
+ int c = s[-1 * REST_UNIT_STRIDE], c2 = c * c;
+ int d = s[0], d2 = d * d;
+
+        // We skip the first 2 rows, as they are skipped in the next loop, and
+        // we don't need the last 2 rows, as they are also skipped in the next loop
+ for (int y = 2; y < h - 2; y++) {
+ s += REST_UNIT_STRIDE;
+ const int e = *s, e2 = e * e;
+ sum_v += REST_UNIT_STRIDE;
+ sumsq_v += REST_UNIT_STRIDE;
+ *sum_v = a + b + c + d + e;
+ *sumsq_v = a2 + b2 + c2 + d2 + e2;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ a2 = b2;
+ b2 = c2;
+ c2 = d2;
+ d2 = e2;
+ }
+ }
+
+ // We skip the first row as it is never read
+ sum += REST_UNIT_STRIDE;
+ sumsq += REST_UNIT_STRIDE;
+ for (int y = 2; y < h - 2; y++) {
+ int a = sum[0], a2 = sumsq[0];
+ int b = sum[1], b2 = sumsq[1];
+ int c = sum[2], c2 = sumsq[2];
+ int d = sum[3], d2 = sumsq[3];
+
+ for (int x = 2; x < w - 2; x++) {
+ const int e = sum[x + 2], e2 = sumsq[x + 2];
+ sum[x] = a + b + c + d + e;
+ sumsq[x] = a2 + b2 + c2 + d2 + e2;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ a2 = b2;
+ b2 = c2;
+ c2 = d2;
+ d2 = e2;
+ }
+ sum += REST_UNIT_STRIDE;
+ sumsq += REST_UNIT_STRIDE;
+ }
+}
+
+static NOINLINE void
+selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,
+ const int w, const int h, const int n, const unsigned s
+ HIGHBD_DECL_SUFFIX)
+{
+ const unsigned sgr_one_by_x = n == 25 ? 164 : 455;
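+    // 164 == round(2^12 / 25) and 455 == round(2^12 / 9), i.e. the reciprocal
+    // of the box area n in Q12.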
+
+    // The selfguided filter is applied to a maximum stripe height of 64 rows;
+    // the box sums need 2 rows of padding above and below (64 + 2 + 2)
+ int32_t sumsq[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE];
+ int32_t *A = sumsq + 2 * REST_UNIT_STRIDE + 3;
+    // By swapping the contents of A and B after the boxsums, B can be of size
+    // coef instead of int32_t
+ coef sum[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE];
+ coef *B = sum + 2 * REST_UNIT_STRIDE + 3;
+
+ const int step = (n == 25) + 1;
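+    // For the 5x5 filter (step == 2), A and B are only computed on every other
+    // row; the skipped rows are reconstructed in the output loops below by
+    // weighting the rows above and below (see SIX_NEIGHBORS).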
+ if (n == 25)
+ boxsum5(sumsq, sum, src, w + 6, h + 6);
+ else
+ boxsum3(sumsq, sum, src, w + 6, h + 6);
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+
+ int32_t *AA = A - REST_UNIT_STRIDE;
+ coef *BB = B - REST_UNIT_STRIDE;
+ for (int j = -1; j < h + 1; j+= step) {
+ for (int i = -1; i < w + 1; i++) {
+ const int a =
+ (AA[i] + ((1 << (2 * bitdepth_min_8)) >> 1)) >> (2 * bitdepth_min_8);
+ const int b =
+ (BB[i] + ((1 << bitdepth_min_8) >> 1)) >> bitdepth_min_8;
+
+ const unsigned p = imax(a * n - b * b, 0);
+ const unsigned z = (p * s + (1 << 19)) >> 20;
+ const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];
+
+            // This is where we swap A and B, so that B is of size coef.
+ AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
+ BB[i] = x;
+ }
+ AA += step * REST_UNIT_STRIDE;
+ BB += step * REST_UNIT_STRIDE;
+ }
+
+ src += 3 * REST_UNIT_STRIDE + 3;
+ if (n == 25) {
+ int j = 0;
+#define SIX_NEIGHBORS(P, i)\
+ ((P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 6 + \
+ (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] + \
+ P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 5)
+ for (; j < h - 1; j+=2) {
+ for (int i = 0; i < w; i++) {
+ const int a = SIX_NEIGHBORS(B, i);
+ const int b = SIX_NEIGHBORS(A, i);
+ dst[i] = (b - a * src[i] + (1 << 8)) >> 9;
+ }
+ dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
+ src += REST_UNIT_STRIDE;
+ B += REST_UNIT_STRIDE;
+ A += REST_UNIT_STRIDE;
+ for (int i = 0; i < w; i++) {
+ const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
+ const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
+ dst[i] = (b - a * src[i] + (1 << 7)) >> 8;
+ }
+ dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
+ src += REST_UNIT_STRIDE;
+ B += REST_UNIT_STRIDE;
+ A += REST_UNIT_STRIDE;
+ }
+ if (j + 1 == h) { // Last row, when number of rows is odd
+ for (int i = 0; i < w; i++) {
+ const int a = SIX_NEIGHBORS(B, i);
+ const int b = SIX_NEIGHBORS(A, i);
+ dst[i] = (b - a * src[i] + (1 << 8)) >> 9;
+ }
+ }
+#undef SIX_NEIGHBORS
+ } else {
+#define EIGHT_NEIGHBORS(P, i)\
+ ((P[i] + P[i - 1] + P[i + 1] + P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 4 + \
+ (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] + \
+ P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 3)
+ for (int j = 0; j < h; j++) {
+ for (int i = 0; i < w; i++) {
+ const int a = EIGHT_NEIGHBORS(B, i);
+ const int b = EIGHT_NEIGHBORS(A, i);
+ dst[i] = (b - a * src[i] + (1 << 8)) >> 9;
+ }
+ dst += 384;
+ src += REST_UNIT_STRIDE;
+ B += REST_UNIT_STRIDE;
+ A += REST_UNIT_STRIDE;
+ }
+ }
+#undef EIGHT_NEIGHBORS
+}
+
+static void sgr_5x5_c(pixel *p, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+    // The selfguided filter is applied to a maximum stripe height of 64 rows,
+    // plus 3 rows of padding above and below
+ pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+
+ // Selfguided filter outputs to a maximum stripe height of 64 and a
+ // maximum restoration width of 384 (256 * 1.5)
+ coef dst[64 * 384];
+
+ padding(tmp, p, stride, left, lpf, w, h, edges);
+ selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25,
+ params->sgr.s0 HIGHBD_TAIL_SUFFIX);
+
+ const int w0 = params->sgr.w0;
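+    // dst holds the filter output minus the source, scaled by 1 << 4, and w0
+    // is a 7-bit projection weight, so the combined >> 11 returns the
+    // correction to pixel scale.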
+ for (int j = 0; j < h; j++) {
+ for (int i = 0; i < w; i++) {
+ const int v = w0 * dst[j * 384 + i];
+ p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11));
+ }
+ p += PXSTRIDE(stride);
+ }
+}
+
+static void sgr_3x3_c(pixel *p, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+ coef dst[64 * 384];
+
+ padding(tmp, p, stride, left, lpf, w, h, edges);
+ selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9,
+ params->sgr.s1 HIGHBD_TAIL_SUFFIX);
+
+ const int w1 = params->sgr.w1;
+ for (int j = 0; j < h; j++) {
+ for (int i = 0; i < w; i++) {
+ const int v = w1 * dst[j * 384 + i];
+ p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11));
+ }
+ p += PXSTRIDE(stride);
+ }
+}
+
+static void sgr_mix_c(pixel *p, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+ coef dst0[64 * 384];
+ coef dst1[64 * 384];
+
+ padding(tmp, p, stride, left, lpf, w, h, edges);
+ selfguided_filter(dst0, tmp, REST_UNIT_STRIDE, w, h, 25,
+ params->sgr.s0 HIGHBD_TAIL_SUFFIX);
+ selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9,
+ params->sgr.s1 HIGHBD_TAIL_SUFFIX);
+
+ const int w0 = params->sgr.w0;
+ const int w1 = params->sgr.w1;
+ for (int j = 0; j < h; j++) {
+ for (int i = 0; i < w; i++) {
+ const int v = w0 * dst0[j * 384 + i] + w1 * dst1[j * 384 + i];
+ p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11));
+ }
+ p += PXSTRIDE(stride);
+ }
+}
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/looprestoration.h"
+#elif ARCH_LOONGARCH64
+#include "src/loongarch/looprestoration.h"
+#elif ARCH_PPC64LE
+#include "src/ppc/looprestoration.h"
+#elif ARCH_X86
+#include "src/x86/looprestoration.h"
+#endif
+#endif
+
+COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c,
+ const int bpc)
+{
+ c->wiener[0] = c->wiener[1] = wiener_c;
+ c->sgr[0] = sgr_5x5_c;
+ c->sgr[1] = sgr_3x3_c;
+ c->sgr[2] = sgr_mix_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ loop_restoration_dsp_init_arm(c, bpc);
+#elif ARCH_LOONGARCH64
+ loop_restoration_dsp_init_loongarch(c, bpc);
+#elif ARCH_PPC64LE
+ loop_restoration_dsp_init_ppc(c, bpc);
+#elif ARCH_X86
+ loop_restoration_dsp_init_x86(c, bpc);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/lr_apply.h b/third_party/dav1d/src/lr_apply.h
new file mode 100644
index 0000000000..2815367534
--- /dev/null
+++ b/third_party/dav1d/src/lr_apply.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LR_APPLY_H
+#define DAV1D_SRC_LR_APPLY_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+#include "src/internal.h"
+
+enum LrRestorePlanes {
+ LR_RESTORE_Y = 1 << 0,
+ LR_RESTORE_U = 1 << 1,
+ LR_RESTORE_V = 1 << 2,
+};
+
+void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
+ int sby);
+
+#endif /* DAV1D_SRC_LR_APPLY_H */
diff --git a/third_party/dav1d/src/lr_apply_tmpl.c b/third_party/dav1d/src/lr_apply_tmpl.c
new file mode 100644
index 0000000000..ec0acdf605
--- /dev/null
+++ b/third_party/dav1d/src/lr_apply_tmpl.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+
+#include "common/intops.h"
+
+#include "src/lr_apply.h"
+
+static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
+ const pixel (*left)[4], int x, int y,
+ const int plane, const int unit_w, const int row_h,
+ const Av1RestorationUnit *const lr, enum LrEdgeFlags edges)
+{
+ const Dav1dDSPContext *const dsp = f->dsp;
+ const int chroma = !!plane;
+ const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
+ const ptrdiff_t stride = f->sr_cur.p.stride[chroma];
+ const int sby = (y + (y ? 8 << ss_ver : 0)) >> (6 - ss_ver + f->seq_hdr->sb128);
+ const int have_tt = f->c->n_tc > 1;
+ const pixel *lpf = f->lf.lr_lpf_line[plane] +
+ have_tt * (sby * (4 << f->seq_hdr->sb128) - 4) * PXSTRIDE(stride) + x;
+
+ // The first stripe of the frame is shorter by 8 luma pixel rows.
+ int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
+
+ looprestorationfilter_fn lr_fn;
+ LooprestorationParams params;
+ if (lr->type == DAV1D_RESTORATION_WIENER) {
+ int16_t (*const filter)[8] = params.filter;
+ filter[0][0] = filter[0][6] = lr->filter_h[0];
+ filter[0][1] = filter[0][5] = lr->filter_h[1];
+ filter[0][2] = filter[0][4] = lr->filter_h[2];
+ filter[0][3] = -(filter[0][0] + filter[0][1] + filter[0][2]) * 2;
+#if BITDEPTH != 8
+ /* For 8-bit SIMD it's beneficial to handle the +128 separately
+ * in order to avoid overflows. */
+ filter[0][3] += 128;
+#endif
+
+ filter[1][0] = filter[1][6] = lr->filter_v[0];
+ filter[1][1] = filter[1][5] = lr->filter_v[1];
+ filter[1][2] = filter[1][4] = lr->filter_v[2];
+ filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;
+
+ lr_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])];
+ } else {
+ assert(lr->type >= DAV1D_RESTORATION_SGRPROJ);
+ const int sgr_idx = lr->type - DAV1D_RESTORATION_SGRPROJ;
+ const uint16_t *const sgr_params = dav1d_sgr_params[sgr_idx];
+ params.sgr.s0 = sgr_params[0];
+ params.sgr.s1 = sgr_params[1];
+ params.sgr.w0 = lr->sgr_weights[0];
+ params.sgr.w1 = 128 - (lr->sgr_weights[0] + lr->sgr_weights[1]);
+
+ lr_fn = dsp->lr.sgr[!!sgr_params[0] + !!sgr_params[1] * 2 - 1];
+ }
+
+ while (y + stripe_h <= row_h) {
+ // Change the HAVE_BOTTOM bit in edges to (sby + 1 != f->sbh || y + stripe_h != row_h)
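+        // (branchless: the XOR-and-mask below sets the bit when the condition
+        // is true and clears it otherwise)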
+ edges ^= (-(sby + 1 != f->sbh || y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
+ lr_fn(p, stride, left, lpf, unit_w, stripe_h, &params, edges HIGHBD_CALL_SUFFIX);
+
+ left += stripe_h;
+ y += stripe_h;
+ p += stripe_h * PXSTRIDE(stride);
+ edges |= LR_HAVE_TOP;
+ stripe_h = imin(64 >> ss_ver, row_h - y);
+ if (stripe_h == 0) break;
+ lpf += 4 * PXSTRIDE(stride);
+ }
+}
+
+static void backup4xU(pixel (*dst)[4], const pixel *src, const ptrdiff_t src_stride,
+ int u)
+{
+ for (; u > 0; u--, dst++, src += PXSTRIDE(src_stride))
+ pixel_copy(dst, src, 4);
+}
+
+static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
+ const int w, const int h, const int row_h, const int plane)
+{
+ const int chroma = !!plane;
+ const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
+ const int ss_hor = chroma & (f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444);
+ const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma];
+
+ const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!plane];
+ const int unit_size = 1 << unit_size_log2;
+ const int half_unit_size = unit_size >> 1;
+ const int max_unit_size = unit_size + half_unit_size;
+
+ // Y coordinate of the sbrow (y is 8 luma pixel rows above row_y)
+ const int row_y = y + ((8 >> ss_ver) * !!y);
+
+    // FIXME This is an ugly hack to look up the proper AV1Filter unit for
+    // chroma planes. Question: for multithreaded decoding, is it better
+    // to store the chroma LR information with the collocated luma information?
+    // In other words, for a chroma restoration unit located at (128,128)
+    // with 4:2:0 chroma subsampling, do we store the filter information at
+    // the AV1Filter unit located at (128,128) or at (256,256)?
+ // TODO Support chroma subsampling.
+ const int shift_hor = 7 - ss_hor;
+
+ /* maximum sbrow height is 128 + 8 rows offset */
+ ALIGN_STK_16(pixel, pre_lr_border, 2, [128 + 8][4]);
+ const Av1RestorationUnit *lr[2];
+
+ enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT;
+
+ int aligned_unit_pos = row_y & ~(unit_size - 1);
+ if (aligned_unit_pos && aligned_unit_pos + half_unit_size > h)
+ aligned_unit_pos -= unit_size;
+ aligned_unit_pos <<= ss_ver;
+ const int sb_idx = (aligned_unit_pos >> 7) * f->sr_sb128w;
+ const int unit_idx = ((aligned_unit_pos >> 6) & 1) << 1;
+ lr[0] = &f->lf.lr_mask[sb_idx].lr[plane][unit_idx];
+ int restore = lr[0]->type != DAV1D_RESTORATION_NONE;
+ int x = 0, bit = 0;
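+    // pre_lr_border is double-buffered: the rightmost 4 columns of a unit are
+    // saved before that unit is filtered, so the unit to its right still sees
+    // the pre-restoration pixels on its left edge.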
+ for (; x + max_unit_size <= w; p += unit_size, edges |= LR_HAVE_LEFT, bit ^= 1) {
+ const int next_x = x + unit_size;
+ const int next_u_idx = unit_idx + ((next_x >> (shift_hor - 1)) & 1);
+ lr[!bit] =
+ &f->lf.lr_mask[sb_idx + (next_x >> shift_hor)].lr[plane][next_u_idx];
+ const int restore_next = lr[!bit]->type != DAV1D_RESTORATION_NONE;
+ if (restore_next)
+ backup4xU(pre_lr_border[bit], p + unit_size - 4, p_stride, row_h - y);
+ if (restore)
+ lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_size, row_h,
+ lr[bit], edges);
+ x = next_x;
+ restore = restore_next;
+ }
+ if (restore) {
+ edges &= ~LR_HAVE_RIGHT;
+ const int unit_w = w - x;
+ lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr[bit], edges);
+ }
+}
+
+void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
+ const int sby)
+{
+ const int offset_y = 8 * !!sby;
+ const ptrdiff_t *const dst_stride = f->sr_cur.p.stride;
+ const int restore_planes = f->lf.restore_planes;
+ const int not_last = sby + 1 < f->sbh;
+
+ if (restore_planes & LR_RESTORE_Y) {
+ const int h = f->sr_cur.p.p.h;
+ const int w = f->sr_cur.p.p.w;
+ const int next_row_y = (sby + 1) << (6 + f->seq_hdr->sb128);
+ const int row_h = imin(next_row_y - 8 * not_last, h);
+ const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset_y;
+ lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w,
+ h, row_h, 0);
+ }
+ if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
+ const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int h = (f->sr_cur.p.p.h + ss_ver) >> ss_ver;
+ const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
+ const int next_row_y = (sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128);
+ const int row_h = imin(next_row_y - (8 >> ss_ver) * not_last, h);
+ const int offset_uv = offset_y >> ss_ver;
+ const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
+ if (restore_planes & LR_RESTORE_U)
+ lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
+ w, h, row_h, 1);
+
+ if (restore_planes & LR_RESTORE_V)
+ lr_sbrow(f, dst[2] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
+ w, h, row_h, 2);
+ }
+}
diff --git a/third_party/dav1d/src/mc.h b/third_party/dav1d/src/mc.h
new file mode 100644
index 0000000000..59ba2d9a5a
--- /dev/null
+++ b/third_party/dav1d/src/mc.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_MC_H
+#define DAV1D_SRC_MC_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+
+#define decl_mc_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const pixel *src, ptrdiff_t src_stride, \
+ int w, int h, int mx, int my HIGHBD_DECL_SUFFIX)
+typedef decl_mc_fn(*mc_fn);
+
+#define decl_mc_scaled_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const pixel *src, ptrdiff_t src_stride, \
+ int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX)
+typedef decl_mc_scaled_fn(*mc_scaled_fn);
+
+#define decl_warp8x8_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const pixel *src, ptrdiff_t src_stride, \
+ const int16_t *abcd, int mx, int my HIGHBD_DECL_SUFFIX)
+typedef decl_warp8x8_fn(*warp8x8_fn);
+
+#define decl_mct_fn(name) \
+void (name)(int16_t *tmp, const pixel *src, ptrdiff_t src_stride, \
+ int w, int h, int mx, int my HIGHBD_DECL_SUFFIX)
+typedef decl_mct_fn(*mct_fn);
+
+#define decl_mct_scaled_fn(name) \
+void (name)(int16_t *tmp, const pixel *src, ptrdiff_t src_stride, \
+ int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX)
+typedef decl_mct_scaled_fn(*mct_scaled_fn);
+
+#define decl_warp8x8t_fn(name) \
+void (name)(int16_t *tmp, const ptrdiff_t tmp_stride, \
+ const pixel *src, ptrdiff_t src_stride, \
+ const int16_t *abcd, int mx, int my HIGHBD_DECL_SUFFIX)
+typedef decl_warp8x8t_fn(*warp8x8t_fn);
+
+#define decl_avg_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const int16_t *tmp1, const int16_t *tmp2, int w, int h \
+ HIGHBD_DECL_SUFFIX)
+typedef decl_avg_fn(*avg_fn);
+
+#define decl_w_avg_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const int16_t *tmp1, const int16_t *tmp2, int w, int h, int weight \
+ HIGHBD_DECL_SUFFIX)
+typedef decl_w_avg_fn(*w_avg_fn);
+
+#define decl_mask_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const int16_t *tmp1, const int16_t *tmp2, int w, int h, \
+ const uint8_t *mask HIGHBD_DECL_SUFFIX)
+typedef decl_mask_fn(*mask_fn);
+
+#define decl_w_mask_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const int16_t *tmp1, const int16_t *tmp2, int w, int h, \
+ uint8_t *mask, int sign HIGHBD_DECL_SUFFIX)
+typedef decl_w_mask_fn(*w_mask_fn);
+
+#define decl_blend_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, \
+ int w, int h, const uint8_t *mask)
+typedef decl_blend_fn(*blend_fn);
+
+#define decl_blend_dir_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, int w, int h)
+typedef decl_blend_dir_fn(*blend_dir_fn);
+
+#define decl_emu_edge_fn(name) \
+void (name)(intptr_t bw, intptr_t bh, intptr_t iw, intptr_t ih, intptr_t x, intptr_t y, \
+ pixel *dst, ptrdiff_t dst_stride, const pixel *src, ptrdiff_t src_stride)
+typedef decl_emu_edge_fn(*emu_edge_fn);
+
+#define decl_resize_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const pixel *src, ptrdiff_t src_stride, \
+ int dst_w, int h, int src_w, int dx, int mx HIGHBD_DECL_SUFFIX)
+typedef decl_resize_fn(*resize_fn);
+
+typedef struct Dav1dMCDSPContext {
+ mc_fn mc[N_2D_FILTERS];
+ mc_scaled_fn mc_scaled[N_2D_FILTERS];
+ mct_fn mct[N_2D_FILTERS];
+ mct_scaled_fn mct_scaled[N_2D_FILTERS];
+ avg_fn avg;
+ w_avg_fn w_avg;
+ mask_fn mask;
+ w_mask_fn w_mask[3 /* 444, 422, 420 */];
+ blend_fn blend;
+ blend_dir_fn blend_v;
+ blend_dir_fn blend_h;
+ warp8x8_fn warp8x8;
+ warp8x8t_fn warp8x8t;
+ emu_edge_fn emu_edge;
+ resize_fn resize;
+} Dav1dMCDSPContext;
+
+bitfn_decls(void dav1d_mc_dsp_init, Dav1dMCDSPContext *c);
+
+#endif /* DAV1D_SRC_MC_H */
diff --git a/third_party/dav1d/src/mc_tmpl.c b/third_party/dav1d/src/mc_tmpl.c
new file mode 100644
index 0000000000..469fc5fa26
--- /dev/null
+++ b/third_party/dav1d/src/mc_tmpl.c
@@ -0,0 +1,957 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/mc.h"
+#include "src/tables.h"
+
+#if BITDEPTH == 8
+#define get_intermediate_bits(bitdepth_max) 4
+// Output in interval [-5132, 9212], fits in int16_t as is
+#define PREP_BIAS 0
+#else
+// 4 for 10 bits/component, 2 for 12 bits/component
+#define get_intermediate_bits(bitdepth_max) (14 - bitdepth_from_max(bitdepth_max))
+// Output in interval [-20588, 36956] (10-bit), [-20602, 36983] (12-bit)
+// Subtract a bias to ensure the output fits in int16_t
+#define PREP_BIAS 8192
+#endif
+
+static NOINLINE void
+put_c(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride, const int w, int h)
+{
+ do {
+ pixel_copy(dst, src, w);
+
+ dst += dst_stride;
+ src += src_stride;
+ } while (--h);
+}
+
+static NOINLINE void
+prep_c(int16_t *tmp, const pixel *src, const ptrdiff_t src_stride,
+ const int w, int h HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ do {
+ for (int x = 0; x < w; x++)
+ tmp[x] = (src[x] << intermediate_bits) - PREP_BIAS;
+
+ tmp += w;
+ src += src_stride;
+ } while (--h);
+}
+
+#define FILTER_8TAP(src, x, F, stride) \
+ (F[0] * src[x + -3 * stride] + \
+ F[1] * src[x + -2 * stride] + \
+ F[2] * src[x + -1 * stride] + \
+ F[3] * src[x + +0 * stride] + \
+ F[4] * src[x + +1 * stride] + \
+ F[5] * src[x + +2 * stride] + \
+ F[6] * src[x + +3 * stride] + \
+ F[7] * src[x + +4 * stride])
+
+#define DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh) \
+ ((FILTER_8TAP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh))
+
+#define DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh) \
+ ((FILTER_8TAP(src, x, F, stride) + (rnd)) >> (sh))
+
+#define DAV1D_FILTER_8TAP_CLIP(src, x, F, stride, sh) \
+ iclip_pixel(DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh))
+
+#define DAV1D_FILTER_8TAP_CLIP2(src, x, F, stride, rnd, sh) \
+ iclip_pixel(DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh))
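+// The subpel filter taps sum to 64 (6-bit precision), so e.g. the 2-D path
+// below shifts by (6 - intermediate_bits) + (6 + intermediate_bits) == 12 bits
+// in total, giving unit gain.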
+
+#define GET_H_FILTER(mx) \
+ const int8_t *const fh = !(mx) ? NULL : w > 4 ? \
+ dav1d_mc_subpel_filters[filter_type & 3][(mx) - 1] : \
+ dav1d_mc_subpel_filters[3 + (filter_type & 1)][(mx) - 1]
+
+#define GET_V_FILTER(my) \
+ const int8_t *const fv = !(my) ? NULL : h > 4 ? \
+ dav1d_mc_subpel_filters[filter_type >> 2][(my) - 1] : \
+ dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][(my) - 1]
+
+#define GET_FILTERS() \
+ GET_H_FILTER(mx); \
+ GET_V_FILTER(my)
+
+static NOINLINE void
+put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
+ const pixel *src, ptrdiff_t src_stride,
+ const int w, int h, const int mx, const int my,
+ const int filter_type HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ const int intermediate_rnd = 32 + ((1 << (6 - intermediate_bits)) >> 1);
+
+ GET_FILTERS();
+ dst_stride = PXSTRIDE(dst_stride);
+ src_stride = PXSTRIDE(src_stride);
+
+ if (fh) {
+ if (fv) {
+ int tmp_h = h + 7;
+ int16_t mid[128 * 135], *mid_ptr = mid;
+
+ src -= src_stride * 3;
+ do {
+ for (int x = 0; x < w; x++)
+ mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
+ 6 - intermediate_bits);
+
+ mid_ptr += 128;
+ src += src_stride;
+ } while (--tmp_h);
+
+ mid_ptr = mid + 128 * 3;
+ do {
+ for (int x = 0; x < w; x++)
+ dst[x] = DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128,
+ 6 + intermediate_bits);
+
+ mid_ptr += 128;
+ dst += dst_stride;
+ } while (--h);
+ } else {
+ do {
+ for (int x = 0; x < w; x++) {
+ dst[x] = DAV1D_FILTER_8TAP_CLIP2(src, x, fh, 1,
+ intermediate_rnd, 6);
+ }
+
+ dst += dst_stride;
+ src += src_stride;
+ } while (--h);
+ }
+ } else if (fv) {
+ do {
+ for (int x = 0; x < w; x++)
+ dst[x] = DAV1D_FILTER_8TAP_CLIP(src, x, fv, src_stride, 6);
+
+ dst += dst_stride;
+ src += src_stride;
+ } while (--h);
+ } else
+ put_c(dst, dst_stride, src, src_stride, w, h);
+}
+
+static NOINLINE void
+put_8tap_scaled_c(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, ptrdiff_t src_stride,
+ const int w, int h, const int mx, int my,
+ const int dx, const int dy, const int filter_type
+ HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ const int intermediate_rnd = (1 << intermediate_bits) >> 1;
+ int tmp_h = (((h - 1) * dy + my) >> 10) + 8;
+ int16_t mid[128 * (256 + 7)], *mid_ptr = mid;
+ src_stride = PXSTRIDE(src_stride);
+
+ src -= src_stride * 3;
+ do {
+ int x;
+ int imx = mx, ioff = 0;
+
+ for (x = 0; x < w; x++) {
+ GET_H_FILTER(imx >> 6);
+ mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1,
+ 6 - intermediate_bits) :
+ src[ioff] << intermediate_bits;
+ imx += dx;
+ ioff += imx >> 10;
+ imx &= 0x3ff;
+ }
+
+ mid_ptr += 128;
+ src += src_stride;
+ } while (--tmp_h);
+
+ mid_ptr = mid + 128 * 3;
+ for (int y = 0; y < h; y++) {
+ int x;
+ GET_V_FILTER(my >> 6);
+
+ for (x = 0; x < w; x++)
+ dst[x] = fv ? DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128,
+ 6 + intermediate_bits) :
+ iclip_pixel((mid_ptr[x] + intermediate_rnd) >>
+ intermediate_bits);
+
+ my += dy;
+ mid_ptr += (my >> 10) * 128;
+ my &= 0x3ff;
+ dst += PXSTRIDE(dst_stride);
+ }
+}
+
+static NOINLINE void
+prep_8tap_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
+ const int w, int h, const int mx, const int my,
+ const int filter_type HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ GET_FILTERS();
+ src_stride = PXSTRIDE(src_stride);
+
+ if (fh) {
+ if (fv) {
+ int tmp_h = h + 7;
+ int16_t mid[128 * 135], *mid_ptr = mid;
+
+ src -= src_stride * 3;
+ do {
+ for (int x = 0; x < w; x++)
+ mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
+ 6 - intermediate_bits);
+
+ mid_ptr += 128;
+ src += src_stride;
+ } while (--tmp_h);
+
+ mid_ptr = mid + 128 * 3;
+ do {
+ for (int x = 0; x < w; x++) {
+ int t = DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6) -
+ PREP_BIAS;
+ assert(t >= INT16_MIN && t <= INT16_MAX);
+ tmp[x] = t;
+ }
+
+ mid_ptr += 128;
+ tmp += w;
+ } while (--h);
+ } else {
+ do {
+ for (int x = 0; x < w; x++)
+ tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
+ 6 - intermediate_bits) -
+ PREP_BIAS;
+
+ tmp += w;
+ src += src_stride;
+ } while (--h);
+ }
+ } else if (fv) {
+ do {
+ for (int x = 0; x < w; x++)
+ tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fv, src_stride,
+ 6 - intermediate_bits) -
+ PREP_BIAS;
+
+ tmp += w;
+ src += src_stride;
+ } while (--h);
+ } else
+ prep_c(tmp, src, src_stride, w, h HIGHBD_TAIL_SUFFIX);
+}
+
+static NOINLINE void
+prep_8tap_scaled_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
+ const int w, int h, const int mx, int my,
+ const int dx, const int dy, const int filter_type
+ HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ int tmp_h = (((h - 1) * dy + my) >> 10) + 8;
+ int16_t mid[128 * (256 + 7)], *mid_ptr = mid;
+ src_stride = PXSTRIDE(src_stride);
+
+ src -= src_stride * 3;
+ do {
+ int x;
+ int imx = mx, ioff = 0;
+
+ for (x = 0; x < w; x++) {
+ GET_H_FILTER(imx >> 6);
+ mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1,
+ 6 - intermediate_bits) :
+ src[ioff] << intermediate_bits;
+ imx += dx;
+ ioff += imx >> 10;
+ imx &= 0x3ff;
+ }
+
+ mid_ptr += 128;
+ src += src_stride;
+ } while (--tmp_h);
+
+ mid_ptr = mid + 128 * 3;
+ for (int y = 0; y < h; y++) {
+ int x;
+ GET_V_FILTER(my >> 6);
+
+ for (x = 0; x < w; x++)
+ tmp[x] = (fv ? DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6)
+ : mid_ptr[x]) - PREP_BIAS;
+
+ my += dy;
+ mid_ptr += (my >> 10) * 128;
+ my &= 0x3ff;
+ tmp += w;
+ }
+}
+
+#define filter_fns(type, type_h, type_v) \
+static void put_8tap_##type##_c(pixel *const dst, \
+ const ptrdiff_t dst_stride, \
+ const pixel *const src, \
+ const ptrdiff_t src_stride, \
+ const int w, const int h, \
+ const int mx, const int my \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \
+ type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
+} \
+static void put_8tap_##type##_scaled_c(pixel *const dst, \
+ const ptrdiff_t dst_stride, \
+ const pixel *const src, \
+ const ptrdiff_t src_stride, \
+ const int w, const int h, \
+ const int mx, const int my, \
+ const int dx, const int dy \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ put_8tap_scaled_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
+ type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
+} \
+static void prep_8tap_##type##_c(int16_t *const tmp, \
+ const pixel *const src, \
+ const ptrdiff_t src_stride, \
+ const int w, const int h, \
+ const int mx, const int my \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ prep_8tap_c(tmp, src, src_stride, w, h, mx, my, \
+ type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
+} \
+static void prep_8tap_##type##_scaled_c(int16_t *const tmp, \
+ const pixel *const src, \
+ const ptrdiff_t src_stride, \
+ const int w, const int h, \
+ const int mx, const int my, \
+ const int dx, const int dy \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ prep_8tap_scaled_c(tmp, src, src_stride, w, h, mx, my, dx, dy, \
+ type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
+}
+
+filter_fns(regular, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_REGULAR)
+filter_fns(regular_sharp, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SHARP)
+filter_fns(regular_smooth, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SMOOTH)
+filter_fns(smooth, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SMOOTH)
+filter_fns(smooth_regular, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_REGULAR)
+filter_fns(smooth_sharp, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SHARP)
+filter_fns(sharp, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SHARP)
+filter_fns(sharp_regular, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_REGULAR)
+filter_fns(sharp_smooth, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SMOOTH)
+
+#define FILTER_BILIN(src, x, mxy, stride) \
+ (16 * src[x] + ((mxy) * (src[x + stride] - src[x])))
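+// == (16 - mxy) * src[x] + mxy * src[x + stride], i.e. a linear interpolation
+// with 4-bit (Q4) weights.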
+
+#define FILTER_BILIN_RND(src, x, mxy, stride, sh) \
+ ((FILTER_BILIN(src, x, mxy, stride) + ((1 << (sh)) >> 1)) >> (sh))
+
+#define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \
+ iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh))
+
+static void put_bilin_c(pixel *dst, ptrdiff_t dst_stride,
+ const pixel *src, ptrdiff_t src_stride,
+ const int w, int h, const int mx, const int my
+ HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ const int intermediate_rnd = (1 << intermediate_bits) >> 1;
+ dst_stride = PXSTRIDE(dst_stride);
+ src_stride = PXSTRIDE(src_stride);
+
+ if (mx) {
+ if (my) {
+ int16_t mid[128 * 129], *mid_ptr = mid;
+ int tmp_h = h + 1;
+
+ do {
+ for (int x = 0; x < w; x++)
+ mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1,
+ 4 - intermediate_bits);
+
+ mid_ptr += 128;
+ src += src_stride;
+ } while (--tmp_h);
+
+ mid_ptr = mid;
+ do {
+ for (int x = 0; x < w; x++)
+ dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my, 128,
+ 4 + intermediate_bits);
+
+ mid_ptr += 128;
+ dst += dst_stride;
+ } while (--h);
+ } else {
+ do {
+ for (int x = 0; x < w; x++) {
+ const int px = FILTER_BILIN_RND(src, x, mx, 1,
+ 4 - intermediate_bits);
+ dst[x] = iclip_pixel((px + intermediate_rnd) >> intermediate_bits);
+ }
+
+ dst += dst_stride;
+ src += src_stride;
+ } while (--h);
+ }
+ } else if (my) {
+ do {
+ for (int x = 0; x < w; x++)
+ dst[x] = FILTER_BILIN_CLIP(src, x, my, src_stride, 4);
+
+ dst += dst_stride;
+ src += src_stride;
+ } while (--h);
+ } else
+ put_c(dst, dst_stride, src, src_stride, w, h);
+}
+
+static void put_bilin_scaled_c(pixel *dst, ptrdiff_t dst_stride,
+ const pixel *src, ptrdiff_t src_stride,
+ const int w, int h, const int mx, int my,
+ const int dx, const int dy
+ HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ int tmp_h = (((h - 1) * dy + my) >> 10) + 2;
+ int16_t mid[128 * (256 + 1)], *mid_ptr = mid;
+
+ do {
+ int x;
+ int imx = mx, ioff = 0;
+
+ for (x = 0; x < w; x++) {
+ mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1,
+ 4 - intermediate_bits);
+ imx += dx;
+ ioff += imx >> 10;
+ imx &= 0x3ff;
+ }
+
+ mid_ptr += 128;
+ src += PXSTRIDE(src_stride);
+ } while (--tmp_h);
+
+ mid_ptr = mid;
+ do {
+ int x;
+
+ for (x = 0; x < w; x++)
+ dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my >> 6, 128,
+ 4 + intermediate_bits);
+
+ my += dy;
+ mid_ptr += (my >> 10) * 128;
+ my &= 0x3ff;
+ dst += PXSTRIDE(dst_stride);
+ } while (--h);
+}
+
+static void prep_bilin_c(int16_t *tmp,
+ const pixel *src, ptrdiff_t src_stride,
+ const int w, int h, const int mx, const int my
+ HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ src_stride = PXSTRIDE(src_stride);
+
+ if (mx) {
+ if (my) {
+ int16_t mid[128 * 129], *mid_ptr = mid;
+ int tmp_h = h + 1;
+
+ do {
+ for (int x = 0; x < w; x++)
+ mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1,
+ 4 - intermediate_bits);
+
+ mid_ptr += 128;
+ src += src_stride;
+ } while (--tmp_h);
+
+ mid_ptr = mid;
+ do {
+ for (int x = 0; x < w; x++)
+ tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my, 128, 4) -
+ PREP_BIAS;
+
+ mid_ptr += 128;
+ tmp += w;
+ } while (--h);
+ } else {
+ do {
+ for (int x = 0; x < w; x++)
+ tmp[x] = FILTER_BILIN_RND(src, x, mx, 1,
+ 4 - intermediate_bits) -
+ PREP_BIAS;
+
+ tmp += w;
+ src += src_stride;
+ } while (--h);
+ }
+ } else if (my) {
+ do {
+ for (int x = 0; x < w; x++)
+ tmp[x] = FILTER_BILIN_RND(src, x, my, src_stride,
+ 4 - intermediate_bits) - PREP_BIAS;
+
+ tmp += w;
+ src += src_stride;
+ } while (--h);
+ } else
+ prep_c(tmp, src, src_stride, w, h HIGHBD_TAIL_SUFFIX);
+}
+
+static void prep_bilin_scaled_c(int16_t *tmp,
+ const pixel *src, ptrdiff_t src_stride,
+ const int w, int h, const int mx, int my,
+ const int dx, const int dy HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ int tmp_h = (((h - 1) * dy + my) >> 10) + 2;
+ int16_t mid[128 * (256 + 1)], *mid_ptr = mid;
+
+ do {
+ int x;
+ int imx = mx, ioff = 0;
+
+ for (x = 0; x < w; x++) {
+ mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1,
+ 4 - intermediate_bits);
+ imx += dx;
+ ioff += imx >> 10;
+ imx &= 0x3ff;
+ }
+
+ mid_ptr += 128;
+ src += PXSTRIDE(src_stride);
+ } while (--tmp_h);
+
+ mid_ptr = mid;
+ do {
+ int x;
+
+ for (x = 0; x < w; x++)
+ tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my >> 6, 128, 4) - PREP_BIAS;
+
+ my += dy;
+ mid_ptr += (my >> 10) * 128;
+ my &= 0x3ff;
+ tmp += w;
+ } while (--h);
+}
+
+static void avg_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2, const int w, int h
+ HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ const int sh = intermediate_bits + 1;
+ const int rnd = (1 << intermediate_bits) + PREP_BIAS * 2;
+ do {
+ for (int x = 0; x < w; x++)
+ dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + rnd) >> sh);
+
+ tmp1 += w;
+ tmp2 += w;
+ dst += PXSTRIDE(dst_stride);
+ } while (--h);
+}
+
+static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
+ const int weight HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ const int sh = intermediate_bits + 4;
+ const int rnd = (8 << intermediate_bits) + PREP_BIAS * 16;
+ do {
+ for (int x = 0; x < w; x++)
+ dst[x] = iclip_pixel((tmp1[x] * weight +
+ tmp2[x] * (16 - weight) + rnd) >> sh);
+
+ tmp1 += w;
+ tmp2 += w;
+ dst += PXSTRIDE(dst_stride);
+ } while (--h);
+}
+
+static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
+ const uint8_t *mask HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ const int sh = intermediate_bits + 6;
+ const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64;
+ do {
+ for (int x = 0; x < w; x++)
+ dst[x] = iclip_pixel((tmp1[x] * mask[x] +
+ tmp2[x] * (64 - mask[x]) + rnd) >> sh);
+
+ tmp1 += w;
+ tmp2 += w;
+ mask += w;
+ dst += PXSTRIDE(dst_stride);
+ } while (--h);
+}
+
+#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
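+// The blend masks are 6-bit (values in 0..64), hence the +32 rounding and
+// the >> 6.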
+static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+ const int w, int h, const uint8_t *mask)
+{
+ do {
+ for (int x = 0; x < w; x++) {
+ dst[x] = blend_px(dst[x], tmp[x], mask[x]);
+ }
+ dst += PXSTRIDE(dst_stride);
+ tmp += w;
+ mask += w;
+ } while (--h);
+}
+
+static void blend_v_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+ const int w, int h)
+{
+ const uint8_t *const mask = &dav1d_obmc_masks[w];
+ do {
+ for (int x = 0; x < (w * 3) >> 2; x++) {
+ dst[x] = blend_px(dst[x], tmp[x], mask[x]);
+ }
+ dst += PXSTRIDE(dst_stride);
+ tmp += w;
+ } while (--h);
+}
+
+static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+ const int w, int h)
+{
+ const uint8_t *mask = &dav1d_obmc_masks[h];
+ h = (h * 3) >> 2;
+ do {
+ const int m = *mask++;
+ for (int x = 0; x < w; x++) {
+ dst[x] = blend_px(dst[x], tmp[x], m);
+ }
+ dst += PXSTRIDE(dst_stride);
+ tmp += w;
+ } while (--h);
+}
+
+static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
+ uint8_t *mask, const int sign,
+ const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX)
+{
+    // For 4:2:0, the mask is stored at 2x2 resolution: even rows store the
+    // 2x1 horizontal sum, and odd rows load that intermediate to calculate
+    // the final value
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ const int bitdepth = bitdepth_from_max(bitdepth_max);
+ const int sh = intermediate_bits + 6;
+ const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64;
+ const int mask_sh = bitdepth + intermediate_bits - 4;
+ const int mask_rnd = 1 << (mask_sh - 5);
+ do {
+ for (int x = 0; x < w; x++) {
+ const int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + mask_rnd) >> mask_sh), 64);
+ dst[x] = iclip_pixel((tmp1[x] * m +
+ tmp2[x] * (64 - m) + rnd) >> sh);
+
+ if (ss_hor) {
+ x++;
+
+ const int n = imin(38 + ((abs(tmp1[x] - tmp2[x]) + mask_rnd) >> mask_sh), 64);
+ dst[x] = iclip_pixel((tmp1[x] * n +
+ tmp2[x] * (64 - n) + rnd) >> sh);
+
+ if (h & ss_ver) {
+ mask[x >> 1] = (m + n + mask[x >> 1] + 2 - sign) >> 2;
+ } else if (ss_ver) {
+ mask[x >> 1] = m + n;
+ } else {
+ mask[x >> 1] = (m + n + 1 - sign) >> 1;
+ }
+ } else {
+ mask[x] = m;
+ }
+ }
+
+ tmp1 += w;
+ tmp2 += w;
+ dst += PXSTRIDE(dst_stride);
+ if (!ss_ver || (h & 1)) mask += w >> ss_hor;
+ } while (--h);
+}
+
+#define w_mask_fns(ssn, ss_hor, ss_ver) \
+static void w_mask_##ssn##_c(pixel *const dst, const ptrdiff_t dst_stride, \
+ const int16_t *const tmp1, const int16_t *const tmp2, \
+ const int w, const int h, uint8_t *mask, \
+ const int sign HIGHBD_DECL_SUFFIX) \
+{ \
+ w_mask_c(dst, dst_stride, tmp1, tmp2, w, h, mask, sign, ss_hor, ss_ver \
+ HIGHBD_TAIL_SUFFIX); \
+}
+
+w_mask_fns(444, 0, 0);
+w_mask_fns(422, 1, 0);
+w_mask_fns(420, 1, 1);
+
+#undef w_mask_fns
+
+#define FILTER_WARP_RND(src, x, F, stride, sh) \
+ ((F[0] * src[x - 3 * stride] + \
+ F[1] * src[x - 2 * stride] + \
+ F[2] * src[x - 1 * stride] + \
+ F[3] * src[x + 0 * stride] + \
+ F[4] * src[x + 1 * stride] + \
+ F[5] * src[x + 2 * stride] + \
+ F[6] * src[x + 3 * stride] + \
+ F[7] * src[x + 4 * stride] + \
+ ((1 << (sh)) >> 1)) >> (sh))
+
+#define FILTER_WARP_CLIP(src, x, F, stride, sh) \
+ iclip_pixel(FILTER_WARP_RND(src, x, F, stride, sh))
+
+static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *const abcd, int mx, int my
+ HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ int16_t mid[15 * 8], *mid_ptr = mid;
+
+ src -= 3 * PXSTRIDE(src_stride);
+ for (int y = 0; y < 15; y++, mx += abcd[1]) {
+ for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
+ const int8_t *const filter =
+ dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
+
+ mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1,
+ 7 - intermediate_bits);
+ }
+ src += PXSTRIDE(src_stride);
+ mid_ptr += 8;
+ }
+
+ mid_ptr = &mid[3 * 8];
+ for (int y = 0; y < 8; y++, my += abcd[3]) {
+ for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
+ const int8_t *const filter =
+ dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
+
+ dst[x] = FILTER_WARP_CLIP(mid_ptr, x, filter, 8,
+ 7 + intermediate_bits);
+ }
+ mid_ptr += 8;
+ dst += PXSTRIDE(dst_stride);
+ }
+}
+
+static void warp_affine_8x8t_c(int16_t *tmp, const ptrdiff_t tmp_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *const abcd, int mx, int my
+ HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ int16_t mid[15 * 8], *mid_ptr = mid;
+
+ src -= 3 * PXSTRIDE(src_stride);
+ for (int y = 0; y < 15; y++, mx += abcd[1]) {
+ for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
+ const int8_t *const filter =
+ dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
+
+ mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1,
+ 7 - intermediate_bits);
+ }
+ src += PXSTRIDE(src_stride);
+ mid_ptr += 8;
+ }
+
+ mid_ptr = &mid[3 * 8];
+ for (int y = 0; y < 8; y++, my += abcd[3]) {
+ for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
+ const int8_t *const filter =
+ dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
+
+ tmp[x] = FILTER_WARP_RND(mid_ptr, x, filter, 8, 7) - PREP_BIAS;
+ }
+ mid_ptr += 8;
+ tmp += tmp_stride;
+ }
+}
+
+static void emu_edge_c(const intptr_t bw, const intptr_t bh,
+ const intptr_t iw, const intptr_t ih,
+ const intptr_t x, const intptr_t y,
+ pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *ref, const ptrdiff_t ref_stride)
+{
+ // find offset in reference of visible block to copy
+ ref += iclip((int) y, 0, (int) ih - 1) * PXSTRIDE(ref_stride) +
+ iclip((int) x, 0, (int) iw - 1);
+
+ // number of pixels to extend (left, right, top, bottom)
+ const int left_ext = iclip((int) -x, 0, (int) bw - 1);
+ const int right_ext = iclip((int) (x + bw - iw), 0, (int) bw - 1);
+ assert(left_ext + right_ext < bw);
+ const int top_ext = iclip((int) -y, 0, (int) bh - 1);
+ const int bottom_ext = iclip((int) (y + bh - ih), 0, (int) bh - 1);
+ assert(top_ext + bottom_ext < bh);
+
+ // copy visible portion first
+ pixel *blk = dst + top_ext * PXSTRIDE(dst_stride);
+ const int center_w = (int) (bw - left_ext - right_ext);
+ const int center_h = (int) (bh - top_ext - bottom_ext);
+ for (int y = 0; y < center_h; y++) {
+ pixel_copy(blk + left_ext, ref, center_w);
+ // extend left edge for this line
+ if (left_ext)
+ pixel_set(blk, blk[left_ext], left_ext);
+ // extend right edge for this line
+ if (right_ext)
+ pixel_set(blk + left_ext + center_w, blk[left_ext + center_w - 1],
+ right_ext);
+ ref += PXSTRIDE(ref_stride);
+ blk += PXSTRIDE(dst_stride);
+ }
+
+ // copy top
+ blk = dst + top_ext * PXSTRIDE(dst_stride);
+ for (int y = 0; y < top_ext; y++) {
+ pixel_copy(dst, blk, bw);
+ dst += PXSTRIDE(dst_stride);
+ }
+
+ // copy bottom
+ dst += center_h * PXSTRIDE(dst_stride);
+ for (int y = 0; y < bottom_ext; y++) {
+ pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], bw);
+ dst += PXSTRIDE(dst_stride);
+ }
+}
+
+static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int dst_w, int h, const int src_w,
+ const int dx, const int mx0 HIGHBD_DECL_SUFFIX)
+{
+ do {
+ int mx = mx0, src_x = -1;
+ for (int x = 0; x < dst_w; x++) {
+ const int8_t *const F = dav1d_resize_filter[mx >> 8];
+ dst[x] = iclip_pixel((-(F[0] * src[iclip(src_x - 3, 0, src_w - 1)] +
+ F[1] * src[iclip(src_x - 2, 0, src_w - 1)] +
+ F[2] * src[iclip(src_x - 1, 0, src_w - 1)] +
+ F[3] * src[iclip(src_x + 0, 0, src_w - 1)] +
+ F[4] * src[iclip(src_x + 1, 0, src_w - 1)] +
+ F[5] * src[iclip(src_x + 2, 0, src_w - 1)] +
+ F[6] * src[iclip(src_x + 3, 0, src_w - 1)] +
+ F[7] * src[iclip(src_x + 4, 0, src_w - 1)]) +
+ 64) >> 7);
+ mx += dx;
+ src_x += mx >> 14;
+ mx &= 0x3fff;
+ }
+
+ dst += PXSTRIDE(dst_stride);
+ src += PXSTRIDE(src_stride);
+ } while (--h);
+}
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/mc.h"
+#elif ARCH_LOONGARCH64
+#include "src/loongarch/mc.h"
+#elif ARCH_X86
+#include "src/x86/mc.h"
+#endif
+#endif
+
+COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
+#define init_mc_fns(type, name) do { \
+ c->mc [type] = put_##name##_c; \
+ c->mc_scaled [type] = put_##name##_scaled_c; \
+ c->mct [type] = prep_##name##_c; \
+ c->mct_scaled[type] = prep_##name##_scaled_c; \
+} while (0)
+
+ init_mc_fns(FILTER_2D_8TAP_REGULAR, 8tap_regular);
+ init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth);
+ init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp);
+ init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular);
+ init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth);
+ init_mc_fns(FILTER_2D_8TAP_SHARP, 8tap_sharp);
+ init_mc_fns(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular);
+ init_mc_fns(FILTER_2D_8TAP_SMOOTH, 8tap_smooth);
+ init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp);
+ init_mc_fns(FILTER_2D_BILINEAR, bilin);
+
+ c->avg = avg_c;
+ c->w_avg = w_avg_c;
+ c->mask = mask_c;
+ c->blend = blend_c;
+ c->blend_v = blend_v_c;
+ c->blend_h = blend_h_c;
+ c->w_mask[0] = w_mask_444_c;
+ c->w_mask[1] = w_mask_422_c;
+ c->w_mask[2] = w_mask_420_c;
+ c->warp8x8 = warp_affine_8x8_c;
+ c->warp8x8t = warp_affine_8x8t_c;
+ c->emu_edge = emu_edge_c;
+ c->resize = resize_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ mc_dsp_init_arm(c);
+#elif ARCH_LOONGARCH64
+ mc_dsp_init_loongarch(c);
+#elif ARCH_X86
+ mc_dsp_init_x86(c);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/mem.c b/third_party/dav1d/src/mem.c
new file mode 100644
index 0000000000..7e6eb4c066
--- /dev/null
+++ b/third_party/dav1d/src/mem.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * Copyright © 2020, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+
+#include "src/internal.h"
+
+#if TRACK_HEAP_ALLOCATIONS
+#include <stdio.h>
+
+#include "src/log.h"
+
+#define DEFAULT_ALIGN 16
+
+typedef struct {
+ size_t sz;
+ unsigned align;
+ enum AllocationType type;
+} Dav1dAllocationData;
+
+typedef struct {
+ size_t curr_sz;
+ size_t peak_sz;
+ unsigned num_allocs;
+ unsigned num_reuses;
+} AllocStats;
+
+static AllocStats tracked_allocs[N_ALLOC_TYPES];
+static size_t curr_total_sz;
+static size_t peak_total_sz;
+static pthread_mutex_t track_alloc_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static void *track_alloc(const enum AllocationType type, char *ptr,
+ const size_t sz, const size_t align)
+{
+ assert(align >= sizeof(Dav1dAllocationData));
+ if (ptr) {
+ ptr += align;
+ Dav1dAllocationData *const d = &((Dav1dAllocationData*)ptr)[-1];
+ AllocStats *const s = &tracked_allocs[type];
+
+ d->sz = sz;
+ d->align = (unsigned)align;
+ d->type = type;
+
+ pthread_mutex_lock(&track_alloc_mutex);
+ s->num_allocs++;
+ s->curr_sz += sz;
+ if (s->curr_sz > s->peak_sz)
+ s->peak_sz = s->curr_sz;
+
+ curr_total_sz += sz;
+ if (curr_total_sz > peak_total_sz)
+ peak_total_sz = curr_total_sz;
+ pthread_mutex_unlock(&track_alloc_mutex);
+ }
+ return ptr;
+}
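+/* Layout produced by track_alloc() (requires align >= sizeof(Dav1dAllocationData)):
+ *
+ *   base                               base + align (returned to the caller)
+ *   |<------------- align ------------->|<---------- sz ---------->|
+ *   [ padding ... | Dav1dAllocationData ][ user data                ]
+ *
+ * track_free() reads the header stored just below the returned pointer and
+ * hands ptr - d->align, i.e. the original base pointer, back to free(). */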
+
+static void *track_free(char *const ptr) {
+ const Dav1dAllocationData *const d = &((Dav1dAllocationData*)ptr)[-1];
+ const size_t sz = d->sz;
+
+ pthread_mutex_lock(&track_alloc_mutex);
+ tracked_allocs[d->type].curr_sz -= sz;
+ curr_total_sz -= sz;
+ pthread_mutex_unlock(&track_alloc_mutex);
+
+ return ptr - d->align;
+}
+
+static void dav1d_track_reuse(const enum AllocationType type) {
+ pthread_mutex_lock(&track_alloc_mutex);
+ tracked_allocs[type].num_reuses++;
+ pthread_mutex_unlock(&track_alloc_mutex);
+}
+
+void *dav1d_malloc(const enum AllocationType type, const size_t sz) {
+ void *const ptr = malloc(sz + DEFAULT_ALIGN);
+ return track_alloc(type, ptr, sz, DEFAULT_ALIGN);
+}
+
+void *dav1d_alloc_aligned(const enum AllocationType type,
+ const size_t sz, const size_t align)
+{
+ assert(!(align & (align - 1)));
+ void *ptr;
+#ifdef _WIN32
+ ptr = _aligned_malloc(sz + align, align);
+#elif defined(HAVE_POSIX_MEMALIGN)
+ if (posix_memalign(&ptr, align, sz + align)) return NULL;
+#else
+ ptr = memalign(align, sz + align);
+#endif
+
+ return track_alloc(type, ptr, sz, align);
+}
+
+void *dav1d_realloc(const enum AllocationType type,
+ void *ptr, const size_t sz)
+{
+ if (!ptr)
+ return dav1d_malloc(type, sz);
+ ptr = realloc((char*)ptr - DEFAULT_ALIGN, sz + DEFAULT_ALIGN);
+ if (ptr)
+ ptr = track_free((char*)ptr + DEFAULT_ALIGN);
+ return track_alloc(type, ptr, sz, DEFAULT_ALIGN);
+}
+
+void dav1d_free(void *ptr) {
+ if (ptr)
+ free(track_free(ptr));
+}
+
+void dav1d_free_aligned(void *ptr) {
+ if (ptr) {
+ ptr = track_free(ptr);
+#ifdef _WIN32
+ _aligned_free(ptr);
+#else
+ free(ptr);
+#endif
+ }
+}
+
+static COLD int cmp_stats(const void *const a, const void *const b) {
+ const size_t a_sz = ((const AllocStats*)a)->peak_sz;
+ const size_t b_sz = ((const AllocStats*)b)->peak_sz;
+ return a_sz < b_sz ? -1 : a_sz > b_sz;
+}
+
+/* Insert spaces as thousands separators for better readability */
+static COLD int format_tsep(char *const s, const size_t n, const size_t value) {
+ if (value < 1000)
+ return snprintf(s, n, "%u", (unsigned)value);
+
+ const int len = format_tsep(s, n, value / 1000);
+ assert((size_t)len < n);
+ return len + snprintf(s + len, n - len, " %03u", (unsigned)(value % 1000));
+}
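+/* e.g. format_tsep(buf, sizeof(buf), 1234567) recurses twice, writes
+ * "1 234 567" and returns 9. */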
+
+COLD void dav1d_log_alloc_stats(Dav1dContext *const c) {
+ static const char *const type_names[N_ALLOC_TYPES] = {
+ [ALLOC_BLOCK ] = "Block data",
+ [ALLOC_CDEF ] = "CDEF line buffers",
+ [ALLOC_CDF ] = "CDF contexts",
+ [ALLOC_COEF ] = "Coefficient data",
+ [ALLOC_COMMON_CTX] = "Common context data",
+ [ALLOC_DAV1DDATA ] = "Dav1dData",
+ [ALLOC_IPRED ] = "Intra pred edges",
+ [ALLOC_LF ] = "Loopfilter data",
+ [ALLOC_LR ] = "Looprestoration data",
+ [ALLOC_OBU_HDR ] = "OBU headers",
+ [ALLOC_OBU_META ] = "OBU metadata",
+ [ALLOC_PAL ] = "Palette data",
+ [ALLOC_PIC ] = "Picture buffers",
+ [ALLOC_PIC_CTX ] = "Picture context data",
+ [ALLOC_REFMVS ] = "Reference mv data",
+ [ALLOC_SEGMAP ] = "Segmentation maps",
+ [ALLOC_THREAD_CTX] = "Thread context data",
+ [ALLOC_TILE ] = "Tile data",
+ };
+
+ struct {
+ AllocStats stats;
+ enum AllocationType type;
+ } data[N_ALLOC_TYPES];
+ unsigned total_allocs = 0;
+ unsigned total_reuses = 0;
+
+ pthread_mutex_lock(&track_alloc_mutex);
+ for (int i = 0; i < N_ALLOC_TYPES; i++) {
+ AllocStats *const s = &data[i].stats;
+ *s = tracked_allocs[i];
+ data[i].type = i;
+ total_allocs += s->num_allocs;
+ total_reuses += s->num_reuses;
+ }
+ size_t total_sz = peak_total_sz;
+ pthread_mutex_unlock(&track_alloc_mutex);
+
+ /* Sort types by memory usage */
+ qsort(&data, N_ALLOC_TYPES, sizeof(*data), cmp_stats);
+
+ const double inv_total_share = 100.0 / total_sz;
+ char total_sz_buf[32];
+ const int sz_len = 4 + format_tsep(total_sz_buf, sizeof(total_sz_buf), total_sz);
+
+ dav1d_log(c, "\n Type Allocs Reuses Share Peak size\n"
+ "---------------------------------------------------------------------\n");
+ for (int i = N_ALLOC_TYPES - 1; i >= 0; i--) {
+ const AllocStats *const s = &data[i].stats;
+ if (s->num_allocs) {
+ const double share = s->peak_sz * inv_total_share;
+ char sz_buf[32];
+ format_tsep(sz_buf, sizeof(sz_buf), s->peak_sz);
+ dav1d_log(c, " %-20s%10u%10u%8.1f%%%*s\n", type_names[data[i].type],
+ s->num_allocs, s->num_reuses, share, sz_len, sz_buf);
+ }
+ }
+ dav1d_log(c, "---------------------------------------------------------------------\n"
+ "%31u%10u %s\n",
+ total_allocs, total_reuses, total_sz_buf);
+}
+#endif /* TRACK_HEAP_ALLOCATIONS */
+
+static COLD void mem_pool_destroy(Dav1dMemPool *const pool) {
+ pthread_mutex_destroy(&pool->lock);
+ dav1d_free(pool);
+}
+
+void dav1d_mem_pool_push(Dav1dMemPool *const pool, Dav1dMemPoolBuffer *const buf) {
+ pthread_mutex_lock(&pool->lock);
+ const int ref_cnt = --pool->ref_cnt;
+ if (!pool->end) {
+ buf->next = pool->buf;
+ pool->buf = buf;
+ pthread_mutex_unlock(&pool->lock);
+ assert(ref_cnt > 0);
+ } else {
+ pthread_mutex_unlock(&pool->lock);
+ dav1d_free_aligned(buf->data);
+ if (!ref_cnt) mem_pool_destroy(pool);
+ }
+}
+
+Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *const pool, const size_t size) {
+ assert(!(size & (sizeof(void*) - 1)));
+ pthread_mutex_lock(&pool->lock);
+ Dav1dMemPoolBuffer *buf = pool->buf;
+ pool->ref_cnt++;
+ uint8_t *data;
+ if (buf) {
+ pool->buf = buf->next;
+ pthread_mutex_unlock(&pool->lock);
+ data = buf->data;
+ if ((uintptr_t)buf - (uintptr_t)data != size) {
+ /* Reallocate if the size has changed */
+ dav1d_free_aligned(data);
+ goto alloc;
+ }
+#if TRACK_HEAP_ALLOCATIONS
+ dav1d_track_reuse(pool->type);
+#endif
+ } else {
+ pthread_mutex_unlock(&pool->lock);
+alloc:
+ data = dav1d_alloc_aligned(pool->type,
+ size + sizeof(Dav1dMemPoolBuffer), 64);
+ if (!data) {
+ pthread_mutex_lock(&pool->lock);
+ const int ref_cnt = --pool->ref_cnt;
+ pthread_mutex_unlock(&pool->lock);
+ if (!ref_cnt) mem_pool_destroy(pool);
+ return NULL;
+ }
+ buf = (Dav1dMemPoolBuffer*)(data + size);
+ buf->data = data;
+ }
+
+ return buf;
+}
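+/* A pool buffer is a single 64-byte-aligned allocation of
+ * size + sizeof(Dav1dMemPoolBuffer) bytes, with the header placed at the end:
+ *
+ *   data                           data + size
+ *   [ payload (size bytes)        ][ Dav1dMemPoolBuffer { data, next } ]
+ *
+ * which is why the distance between buf and buf->data above doubles as a
+ * cheap check that a recycled buffer still matches the requested size. */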
+
+COLD int dav1d_mem_pool_init(const enum AllocationType type,
+ Dav1dMemPool **const ppool)
+{
+ Dav1dMemPool *const pool = dav1d_malloc(ALLOC_COMMON_CTX,
+ sizeof(Dav1dMemPool));
+ if (pool) {
+ if (!pthread_mutex_init(&pool->lock, NULL)) {
+ pool->buf = NULL;
+ pool->ref_cnt = 1;
+ pool->end = 0;
+#if TRACK_HEAP_ALLOCATIONS
+ pool->type = type;
+#endif
+ *ppool = pool;
+ return 0;
+ }
+ dav1d_free(pool);
+ }
+ *ppool = NULL;
+ return DAV1D_ERR(ENOMEM);
+}
+
+COLD void dav1d_mem_pool_end(Dav1dMemPool *const pool) {
+ if (pool) {
+ pthread_mutex_lock(&pool->lock);
+ Dav1dMemPoolBuffer *buf = pool->buf;
+ const int ref_cnt = --pool->ref_cnt;
+ pool->buf = NULL;
+ pool->end = 1;
+ pthread_mutex_unlock(&pool->lock);
+
+ while (buf) {
+ void *const data = buf->data;
+ buf = buf->next;
+ dav1d_free_aligned(data);
+ }
+ if (!ref_cnt) mem_pool_destroy(pool);
+ }
+}
diff --git a/third_party/dav1d/src/mem.h b/third_party/dav1d/src/mem.h
new file mode 100644
index 0000000000..0a8c18d709
--- /dev/null
+++ b/third_party/dav1d/src/mem.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_MEM_H
+#define DAV1D_SRC_MEM_H
+
+#define TRACK_HEAP_ALLOCATIONS 0
+
+#include <stdlib.h>
+
+#if defined(_WIN32) || !defined(HAVE_POSIX_MEMALIGN)
+#include <malloc.h>
+#endif
+
+#include "dav1d/dav1d.h"
+
+#include "common/attributes.h"
+
+#include "src/thread.h"
+
+enum AllocationType {
+ ALLOC_BLOCK,
+ ALLOC_CDEF,
+ ALLOC_CDF,
+ ALLOC_COEF,
+ ALLOC_COMMON_CTX,
+ ALLOC_DAV1DDATA,
+ ALLOC_IPRED,
+ ALLOC_LF,
+ ALLOC_LR,
+ ALLOC_OBU_HDR,
+ ALLOC_OBU_META,
+ ALLOC_PAL,
+ ALLOC_PIC,
+ ALLOC_PIC_CTX,
+ ALLOC_REFMVS,
+ ALLOC_SEGMAP,
+ ALLOC_THREAD_CTX,
+ ALLOC_TILE,
+ N_ALLOC_TYPES,
+};
+
+typedef struct Dav1dMemPoolBuffer {
+ void *data;
+ struct Dav1dMemPoolBuffer *next;
+} Dav1dMemPoolBuffer;
+
+typedef struct Dav1dMemPool {
+ pthread_mutex_t lock;
+ Dav1dMemPoolBuffer *buf;
+ int ref_cnt;
+ int end;
+#if TRACK_HEAP_ALLOCATIONS
+ enum AllocationType type;
+#endif
+} Dav1dMemPool;
+
+
+#if TRACK_HEAP_ALLOCATIONS
+void *dav1d_malloc(enum AllocationType type, size_t sz);
+void *dav1d_realloc(enum AllocationType type, void *ptr, size_t sz);
+void *dav1d_alloc_aligned(enum AllocationType type, size_t sz, size_t align);
+void dav1d_free(void *ptr);
+void dav1d_free_aligned(void *ptr);
+void dav1d_log_alloc_stats(Dav1dContext *c);
+#else
+#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
+#define dav1d_malloc(type, sz) malloc(sz)
+#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz)
+#define dav1d_free(ptr) free(ptr)
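+/* With tracking disabled, the wrappers above reduce to plain allocation calls
+ * and the (unused) AllocationType argument is discarded, so call sites look
+ * identical in both configurations. */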
+
+/*
+ * Allocate align-byte aligned memory. The return value can be released
+ * by calling the dav1d_free_aligned() function.
+ */
+static inline void *dav1d_alloc_aligned(const size_t sz, const size_t align) {
+ assert(!(align & (align - 1)));
+#ifdef _WIN32
+ return _aligned_malloc(sz, align);
+#elif defined(HAVE_POSIX_MEMALIGN)
+ void *ptr;
+ if (posix_memalign(&ptr, align, sz)) return NULL;
+ return ptr;
+#else
+ return memalign(align, sz);
+#endif
+}
+#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned(sz, align)
+
+static inline void dav1d_free_aligned(void *ptr) {
+#ifdef _WIN32
+ _aligned_free(ptr);
+#else
+ free(ptr);
+#endif
+}
+
+#endif /* TRACK_HEAP_ALLOCATIONS */
+
+void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf);
+Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *pool, size_t size);
+int dav1d_mem_pool_init(enum AllocationType type, Dav1dMemPool **pool);
+void dav1d_mem_pool_end(Dav1dMemPool *pool);
+
+static inline void dav1d_freep_aligned(void *ptr) {
+ void **mem = (void **) ptr;
+ if (*mem) {
+ dav1d_free_aligned(*mem);
+ *mem = NULL;
+ }
+}
+
+#endif /* DAV1D_SRC_MEM_H */
diff --git a/third_party/dav1d/src/meson.build b/third_party/dav1d/src/meson.build
new file mode 100644
index 0000000000..dc4be5fd6f
--- /dev/null
+++ b/third_party/dav1d/src/meson.build
@@ -0,0 +1,377 @@
+# Copyright © 2018-2019, VideoLAN and dav1d authors
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#
+# Build definition for the dav1d library
+#
+
+# libdav1d source files
+libdav1d_sources = files(
+ 'cdf.c',
+ 'cpu.c',
+ 'data.c',
+ 'decode.c',
+ 'dequant_tables.c',
+ 'getbits.c',
+ 'intra_edge.c',
+ 'itx_1d.c',
+ 'lf_mask.c',
+ 'lib.c',
+ 'log.c',
+ 'mem.c',
+ 'msac.c',
+ 'obu.c',
+ 'pal.c',
+ 'picture.c',
+ 'qm.c',
+ 'ref.c',
+ 'refmvs.c',
+ 'scan.c',
+ 'tables.c',
+ 'thread_task.c',
+ 'warpmv.c',
+ 'wedge.c',
+)
+
+# libdav1d bitdepth source files
+# These files are compiled for each bitdepth with
+# `BITDEPTH` defined to the currently built bitdepth.
+libdav1d_tmpl_sources = files(
+ 'cdef_apply_tmpl.c',
+ 'cdef_tmpl.c',
+ 'fg_apply_tmpl.c',
+ 'filmgrain_tmpl.c',
+ 'ipred_prepare_tmpl.c',
+ 'ipred_tmpl.c',
+ 'itx_tmpl.c',
+ 'lf_apply_tmpl.c',
+ 'loopfilter_tmpl.c',
+ 'looprestoration_tmpl.c',
+ 'lr_apply_tmpl.c',
+ 'mc_tmpl.c',
+ 'recon_tmpl.c',
+)
+
+libdav1d_arch_tmpl_sources = []
+
+libdav1d_bitdepth_objs = []
+
+# ASM specific sources
+libdav1d_asm_objs = []
+# Arch-specific flags
+arch_flags = []
+if is_asm_enabled
+ if (host_machine.cpu_family() == 'aarch64' or
+ host_machine.cpu_family().startswith('arm'))
+
+ libdav1d_sources += files(
+ 'arm/cpu.c',
+ )
+ if (host_machine.cpu_family() == 'aarch64' or
+ host_machine.cpu() == 'arm64')
+ libdav1d_sources_asm = files(
+ # itx.S is used for both 8 and 16 bpc.
+ 'arm/64/itx.S',
+ 'arm/64/looprestoration_common.S',
+ 'arm/64/msac.S',
+ 'arm/64/refmvs.S',
+ )
+
+ if dav1d_bitdepths.contains('8')
+ libdav1d_sources_asm += files(
+ 'arm/64/cdef.S',
+ 'arm/64/filmgrain.S',
+ 'arm/64/ipred.S',
+ 'arm/64/loopfilter.S',
+ 'arm/64/looprestoration.S',
+ 'arm/64/mc.S',
+ )
+ endif
+
+ if dav1d_bitdepths.contains('16')
+ libdav1d_sources_asm += files(
+ 'arm/64/cdef16.S',
+ 'arm/64/filmgrain16.S',
+ 'arm/64/ipred16.S',
+ 'arm/64/itx16.S',
+ 'arm/64/loopfilter16.S',
+ 'arm/64/looprestoration16.S',
+ 'arm/64/mc16.S',
+ )
+ endif
+ elif host_machine.cpu_family().startswith('arm')
+ libdav1d_sources_asm = files(
+ # itx.S is used for both 8 and 16 bpc.
+ 'arm/32/itx.S',
+ 'arm/32/looprestoration_common.S',
+ 'arm/32/msac.S',
+ 'arm/32/refmvs.S',
+ )
+
+ if dav1d_bitdepths.contains('8')
+ libdav1d_sources_asm += files(
+ 'arm/32/cdef.S',
+ 'arm/32/filmgrain.S',
+ 'arm/32/ipred.S',
+ 'arm/32/loopfilter.S',
+ 'arm/32/looprestoration.S',
+ 'arm/32/mc.S',
+ )
+ endif
+
+ if dav1d_bitdepths.contains('16')
+ libdav1d_sources_asm += files(
+ 'arm/32/cdef16.S',
+ 'arm/32/filmgrain16.S',
+ 'arm/32/ipred16.S',
+ 'arm/32/itx16.S',
+ 'arm/32/loopfilter16.S',
+ 'arm/32/looprestoration16.S',
+ 'arm/32/mc16.S',
+ )
+ endif
+ endif
+
+ if use_gaspp
+ libdav1d_asm_objs = gaspp_gen.process(libdav1d_sources_asm)
+ else
+ libdav1d_sources += libdav1d_sources_asm
+ endif
+ elif host_machine.cpu_family().startswith('x86')
+
+ libdav1d_sources += files(
+ 'x86/cpu.c',
+ )
+
+ # NASM source files
+ libdav1d_sources_asm = files(
+ 'x86/cpuid.asm',
+ 'x86/msac.asm',
+ 'x86/pal.asm',
+ 'x86/refmvs.asm',
+ 'x86/itx_avx512.asm',
+ 'x86/cdef_avx2.asm',
+ 'x86/itx_avx2.asm',
+ 'x86/looprestoration_avx2.asm',
+ 'x86/cdef_sse.asm',
+ 'x86/itx_sse.asm',
+ )
+
+ if dav1d_bitdepths.contains('8')
+ libdav1d_sources_asm += files(
+ 'x86/cdef_avx512.asm',
+ 'x86/filmgrain_avx512.asm',
+ 'x86/ipred_avx512.asm',
+ 'x86/loopfilter_avx512.asm',
+ 'x86/looprestoration_avx512.asm',
+ 'x86/mc_avx512.asm',
+ 'x86/filmgrain_avx2.asm',
+ 'x86/ipred_avx2.asm',
+ 'x86/loopfilter_avx2.asm',
+ 'x86/mc_avx2.asm',
+ 'x86/filmgrain_sse.asm',
+ 'x86/ipred_sse.asm',
+ 'x86/loopfilter_sse.asm',
+ 'x86/looprestoration_sse.asm',
+ 'x86/mc_sse.asm',
+ )
+ endif
+
+ if dav1d_bitdepths.contains('16')
+ libdav1d_sources_asm += files(
+ 'x86/cdef16_avx512.asm',
+ 'x86/filmgrain16_avx512.asm',
+ 'x86/ipred16_avx512.asm',
+ 'x86/itx16_avx512.asm',
+ 'x86/loopfilter16_avx512.asm',
+ 'x86/looprestoration16_avx512.asm',
+ 'x86/mc16_avx512.asm',
+ 'x86/cdef16_avx2.asm',
+ 'x86/filmgrain16_avx2.asm',
+ 'x86/ipred16_avx2.asm',
+ 'x86/itx16_avx2.asm',
+ 'x86/loopfilter16_avx2.asm',
+ 'x86/looprestoration16_avx2.asm',
+ 'x86/mc16_avx2.asm',
+ 'x86/cdef16_sse.asm',
+ 'x86/filmgrain16_sse.asm',
+ 'x86/ipred16_sse.asm',
+ 'x86/itx16_sse.asm',
+ 'x86/loopfilter16_sse.asm',
+ 'x86/looprestoration16_sse.asm',
+ 'x86/mc16_sse.asm',
+ )
+ endif
+
+ # Compile the ASM sources with NASM
+ libdav1d_asm_objs = nasm_gen.process(libdav1d_sources_asm)
+ elif host_machine.cpu_family().startswith('loongarch')
+ libdav1d_sources += files(
+ 'loongarch/cpu.c',
+ )
+
+ libdav1d_arch_tmpl_sources += files(
+ 'loongarch/looprestoration_tmpl.c',
+ )
+
+ libdav1d_sources_asm = files(
+ 'loongarch/mc.S',
+ 'loongarch/loopfilter.S',
+ 'loongarch/looprestoration.S',
+ 'loongarch/msac.S',
+ 'loongarch/refmvs.S',
+ 'loongarch/itx.S',
+ )
+ libdav1d_asm_objs += libdav1d_sources_asm
+ elif host_machine.cpu() == 'ppc64le'
+ arch_flags = ['-maltivec', '-mvsx']
+ libdav1d_sources += files(
+ 'ppc/cpu.c',
+ )
+ libdav1d_arch_tmpl_sources += files(
+ 'ppc/cdef_tmpl.c',
+ 'ppc/looprestoration_tmpl.c',
+ )
+ elif host_machine.cpu_family().startswith('riscv')
+ libdav1d_sources += files(
+ 'riscv/cpu.c',
+ )
+ if host_machine.cpu_family() == 'riscv64'
+ libdav1d_sources += files(
+ 'riscv/64/itx.S',
+ )
+ endif
+ endif
+endif
+
+
+
+libdav1d_rc_obj = []
+libdav1d_flags = []
+api_export_flags = []
+
+#
+# Windows .rc file and API export flags
+#
+
+if host_machine.system() == 'windows'
+ if get_option('default_library') != 'static'
+ rc_file = configure_file(
+ input : 'dav1d.rc.in',
+ output : 'dav1d.rc',
+ configuration : rc_data
+ )
+
+ libdav1d_rc_obj = winmod.compile_resources(rc_file)
+
+ api_export_flags = ['-DDAV1D_BUILDING_DLL']
+ endif
+
+ if (host_machine.cpu_family() == 'x86_64' and cc.get_id() == 'gcc')
+ # We don't expect to reference data members from other DLLs without
+ # dllimport attributes. Set the -mcmodel=small flag, which avoids
+ # generating indirection via .refptr.<symname> for all potentially
+ # dllimported variable references.
+ libdav1d_flags += '-mcmodel=small'
+ endif
+endif
+
+
+
+#
+# Library definitions
+#
+
+# Helper library for each bitdepth
+libdav1d_bitdepth_objs = []
+foreach bitdepth : dav1d_bitdepths
+ libdav1d_bitdepth_objs += static_library(
+ 'dav1d_bitdepth_@0@'.format(bitdepth),
+ libdav1d_tmpl_sources, config_h_target,
+ include_directories: dav1d_inc_dirs,
+ dependencies : [stdatomic_dependencies],
+ c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags,
+ install : false,
+ build_by_default : false,
+ ).extract_all_objects(recursive: true)
+endforeach
+
+# Helper library for each bitdepth and architecture-specific flags
+foreach bitdepth : dav1d_bitdepths
+ libdav1d_bitdepth_objs += static_library(
+ 'dav1d_arch_bitdepth_@0@'.format(bitdepth),
+ libdav1d_arch_tmpl_sources, config_h_target,
+ include_directories: dav1d_inc_dirs,
+ dependencies : [stdatomic_dependencies],
+ c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags + arch_flags,
+ install : false,
+ build_by_default : false,
+ ).extract_all_objects(recursive: true)
+endforeach
+
+# The final dav1d library
+if host_machine.system() == 'windows'
+ dav1d_soversion = ''
+else
+ dav1d_soversion = dav1d_api_version_major
+endif
+
+libdav1d = library('dav1d',
+ libdav1d_sources,
+ libdav1d_asm_objs,
+ libdav1d_rc_obj,
+ rev_target,
+ config_h_target,
+
+ objects : [
+ libdav1d_bitdepth_objs,
+ ],
+
+ include_directories : dav1d_inc_dirs,
+ dependencies : [
+ stdatomic_dependencies,
+ thread_dependency,
+ thread_compat_dep,
+ libdl_dependency,
+ ],
+ c_args : [libdav1d_flags, api_export_flags],
+ version : dav1d_soname_version,
+ soversion : dav1d_soversion,
+ install : true,
+)
+
+dav1d_dep = declare_dependency(link_with: libdav1d,
+ include_directories : include_directories('../include/dav1d')
+)
+
+#
+# Generate pkg-config .pc file
+#
+pkg_mod = import('pkgconfig')
+pkg_mod.generate(libraries: libdav1d,
+ version: meson.project_version(),
+ name: 'libdav1d',
+ filebase: 'dav1d',
+ description: 'AV1 decoding library'
+)
diff --git a/third_party/dav1d/src/msac.c b/third_party/dav1d/src/msac.c
new file mode 100644
index 0000000000..43d8ae5d07
--- /dev/null
+++ b/third_party/dav1d/src/msac.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <limits.h>
+
+#include "common/intops.h"
+
+#include "src/msac.h"
+
+#define EC_PROB_SHIFT 6
+#define EC_MIN_PROB 4 // must be <= (1<<EC_PROB_SHIFT)/16
+
+#define EC_WIN_SIZE (sizeof(ec_win) << 3)
+
+static inline void ctx_refill(MsacContext *const s) {
+ const uint8_t *buf_pos = s->buf_pos;
+ const uint8_t *buf_end = s->buf_end;
+ int c = EC_WIN_SIZE - s->cnt - 24;
+ ec_win dif = s->dif;
+ while (c >= 0 && buf_pos < buf_end) {
+ dif ^= ((ec_win)*buf_pos++) << c;
+ c -= 8;
+ }
+ s->dif = dif;
+ s->cnt = EC_WIN_SIZE - c - 24;
+ s->buf_pos = buf_pos;
+}
+
+/* Takes updated dif and range values, renormalizes them so that
+ * 32768 <= rng < 65536 (reading more bytes from the stream into dif if
+ * necessary), and stores them back in the decoder context.
+ * dif: The new value of dif.
+ * rng: The new value of the range. */
+static inline void ctx_norm(MsacContext *const s, const ec_win dif,
+ const unsigned rng)
+{
+ const int d = 15 ^ (31 ^ clz(rng));
+ assert(rng <= 65535U);
+ s->cnt -= d;
+ s->dif = ((dif + 1) << d) - 1; /* Shift in 1s in the LSBs */
+ s->rng = rng << d;
+ if (s->cnt < 0)
+ ctx_refill(s);
+}
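+/* The shift d computed above equals 15 - floor(log2(rng)), i.e. the number of
+ * left-shifts needed to bring rng back into [32768, 65536). For example
+ * rng = 0x2800 gives d = 2: rng becomes 0xa000 and two window bits are
+ * consumed (s->cnt -= 2), triggering a refill once cnt goes negative. */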
+
+unsigned dav1d_msac_decode_bool_equi_c(MsacContext *const s) {
+ const unsigned r = s->rng;
+ ec_win dif = s->dif;
+ assert((dif >> (EC_WIN_SIZE - 16)) < r);
+ // When the probability is 1/2, f = 16384 >> EC_PROB_SHIFT = 256 and we can
+ // replace the multiply with a simple shift.
+ unsigned v = ((r >> 8) << 7) + EC_MIN_PROB;
+ const ec_win vw = (ec_win)v << (EC_WIN_SIZE - 16);
+ const unsigned ret = dif >= vw;
+ dif -= ret * vw;
+ v += ret * (r - 2 * v);
+ ctx_norm(s, dif, v);
+ return !ret;
+}
+
+/* Decode a single binary value.
+ * f: The probability that the bit is one
+ * Return: The value decoded (0 or 1). */
+unsigned dav1d_msac_decode_bool_c(MsacContext *const s, const unsigned f) {
+ const unsigned r = s->rng;
+ ec_win dif = s->dif;
+ assert((dif >> (EC_WIN_SIZE - 16)) < r);
+ unsigned v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB;
+ const ec_win vw = (ec_win)v << (EC_WIN_SIZE - 16);
+ const unsigned ret = dif >= vw;
+ dif -= ret * vw;
+ v += ret * (r - 2 * v);
+ ctx_norm(s, dif, v);
+ return !ret;
+}
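+/* Sanity check against the equiprobable case: with r = 0x8000 and
+ * f = 1 << 14 (probability 1/2 in Q15), ((r >> 8) * (f >> 6) >> 1) + 4 == 16388,
+ * the same v that dav1d_msac_decode_bool_equi_c() computes via its shift. */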
+
+int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
+ const int n, unsigned k)
+{
+ assert(n >> k == 8);
+
+ unsigned a = 0;
+ if (dav1d_msac_decode_bool_equi(s)) {
+ if (dav1d_msac_decode_bool_equi(s))
+ k += dav1d_msac_decode_bool_equi(s) + 1;
+ a = 1 << k;
+ }
+ const unsigned v = dav1d_msac_decode_bools(s, k) + a;
+ return ref * 2 <= n ? inv_recenter(ref, v) :
+ n - 1 - inv_recenter(n - 1 - ref, v);
+}
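+/* For example, with n = 256 (so k = 5), the prefix bits read above select:
+ *   0       -> v =       5 raw bits  (  0.. 31)
+ *   1,0     -> v =  32 + 5 raw bits  ( 32.. 63)
+ *   1,1,0   -> v =  64 + 6 raw bits  ( 64..127)
+ *   1,1,1   -> v = 128 + 7 raw bits  (128..255)
+ * after which v is recentered around ref with inv_recenter(). */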
+
+/* Decodes a symbol given an inverse cumulative distribution function (CDF)
+ * table in Q15. */
+unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
+ uint16_t *const cdf,
+ const size_t n_symbols)
+{
+ const unsigned c = s->dif >> (EC_WIN_SIZE - 16), r = s->rng >> 8;
+ unsigned u, v = s->rng, val = -1;
+
+ assert(n_symbols <= 15);
+ assert(cdf[n_symbols] <= 32);
+
+ do {
+ val++;
+ u = v;
+ v = r * (cdf[val] >> EC_PROB_SHIFT);
+ v >>= 7 - EC_PROB_SHIFT;
+ v += EC_MIN_PROB * ((unsigned)n_symbols - val);
+ } while (c < v);
+
+ assert(u <= s->rng);
+
+ ctx_norm(s, s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)), u - v);
+
+ if (s->allow_update_cdf) {
+ const unsigned count = cdf[n_symbols];
+ const unsigned rate = 4 + (count >> 4) + (n_symbols > 2);
+ unsigned i;
+ for (i = 0; i < val; i++)
+ cdf[i] += (32768 - cdf[i]) >> rate;
+ for (; i < n_symbols; i++)
+ cdf[i] -= cdf[i] >> rate;
+ cdf[n_symbols] = count + (count < 32);
+ }
+
+ return val;
+}
+
+unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *const s,
+ uint16_t *const cdf)
+{
+ const unsigned bit = dav1d_msac_decode_bool(s, *cdf);
+
+ if (s->allow_update_cdf) {
+ // update_cdf() specialized for boolean CDFs
+ const unsigned count = cdf[1];
+ const int rate = 4 + (count >> 4);
+ if (bit)
+ cdf[0] += (32768 - cdf[0]) >> rate;
+ else
+ cdf[0] -= cdf[0] >> rate;
+ cdf[1] = count + (count < 32);
+ }
+
+ return bit;
+}
+
+unsigned dav1d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf) {
+ unsigned tok_br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
+ unsigned tok = 3 + tok_br;
+ if (tok_br == 3) {
+ tok_br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
+ tok = 6 + tok_br;
+ if (tok_br == 3) {
+ tok_br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
+ tok = 9 + tok_br;
+ if (tok_br == 3)
+ tok = 12 + dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
+ }
+ }
+ return tok;
+}
+
+void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
+ const size_t sz, const int disable_cdf_update_flag)
+{
+ s->buf_pos = data;
+ s->buf_end = data + sz;
+ s->dif = ((ec_win)1 << (EC_WIN_SIZE - 1)) - 1;
+ s->rng = 0x8000;
+ s->cnt = -15;
+ s->allow_update_cdf = !disable_cdf_update_flag;
+ ctx_refill(s);
+
+#if ARCH_X86_64 && HAVE_ASM
+ s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
+
+ msac_init_x86(s);
+#endif
+}
diff --git a/third_party/dav1d/src/msac.h b/third_party/dav1d/src/msac.h
new file mode 100644
index 0000000000..c3e07e1c70
--- /dev/null
+++ b/third_party/dav1d/src/msac.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_MSAC_H
+#define DAV1D_SRC_MSAC_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+typedef size_t ec_win;
+
+typedef struct MsacContext {
+ const uint8_t *buf_pos;
+ const uint8_t *buf_end;
+ ec_win dif;
+ unsigned rng;
+ int cnt;
+ int allow_update_cdf;
+
+#if ARCH_X86_64 && HAVE_ASM
+ unsigned (*symbol_adapt16)(struct MsacContext *s, uint16_t *cdf, size_t n_symbols);
+#endif
+} MsacContext;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/msac.h"
+#elif ARCH_LOONGARCH64
+#include "src/loongarch/msac.h"
+#elif ARCH_X86
+#include "src/x86/msac.h"
+#endif
+#endif
+
+void dav1d_msac_init(MsacContext *s, const uint8_t *data, size_t sz,
+ int disable_cdf_update_flag);
+unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_equi_c(MsacContext *s);
+unsigned dav1d_msac_decode_bool_c(MsacContext *s, unsigned f);
+unsigned dav1d_msac_decode_hi_tok_c(MsacContext *s, uint16_t *cdf);
+int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
+
+/* Supported n_symbols ranges: adapt4: 1-4, adapt8: 1-7, adapt16: 3-15 */
+#ifndef dav1d_msac_decode_symbol_adapt4
+#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c
+#endif
+#ifndef dav1d_msac_decode_symbol_adapt8
+#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt_c
+#endif
+#ifndef dav1d_msac_decode_symbol_adapt16
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt_c
+#endif
+#ifndef dav1d_msac_decode_bool_adapt
+#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_c
+#endif
+#ifndef dav1d_msac_decode_bool_equi
+#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_c
+#endif
+#ifndef dav1d_msac_decode_bool
+#define dav1d_msac_decode_bool dav1d_msac_decode_bool_c
+#endif
+#ifndef dav1d_msac_decode_hi_tok
+#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_c
+#endif
+
+static inline unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) {
+ unsigned v = 0;
+ while (n--)
+ v = (v << 1) | dav1d_msac_decode_bool_equi(s);
+ return v;
+}
+
+static inline int dav1d_msac_decode_uniform(MsacContext *const s, const unsigned n) {
+ assert(n > 0);
+ const int l = ulog2(n) + 1;
+ assert(l > 1);
+ const unsigned m = (1 << l) - n;
+ const unsigned v = dav1d_msac_decode_bools(s, l - 1);
+ return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(s);
+}
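+/* Example: n = 5 gives l = 3 and m = 3: raw values 0..2 are returned after
+ * two equiprobable bits, while v == 3 reads a third bit and maps to 3 or 4,
+ * so all five values are representable with at most three bits. */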
+
+#endif /* DAV1D_SRC_MSAC_H */
diff --git a/third_party/dav1d/src/obu.c b/third_party/dav1d/src/obu.c
new file mode 100644
index 0000000000..78d652b4c5
--- /dev/null
+++ b/third_party/dav1d/src/obu.c
@@ -0,0 +1,1738 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "dav1d/data.h"
+
+#include "common/frame.h"
+#include "common/intops.h"
+#include "common/validate.h"
+
+#include "src/decode.h"
+#include "src/getbits.h"
+#include "src/levels.h"
+#include "src/log.h"
+#include "src/obu.h"
+#include "src/ref.h"
+#include "src/thread_task.h"
+
+static int check_trailing_bits(GetBits *const gb,
+ const int strict_std_compliance)
+{
+ const int trailing_one_bit = dav1d_get_bit(gb);
+
+ if (gb->error)
+ return DAV1D_ERR(EINVAL);
+
+ if (!strict_std_compliance)
+ return 0;
+
+ if (!trailing_one_bit || gb->state)
+ return DAV1D_ERR(EINVAL);
+
+ ptrdiff_t size = gb->ptr_end - gb->ptr;
+ while (size > 0 && gb->ptr[size - 1] == 0)
+ size--;
+
+ if (size)
+ return DAV1D_ERR(EINVAL);
+
+ return 0;
+}
+
+static NOINLINE int parse_seq_hdr(Dav1dSequenceHeader *const hdr,
+ GetBits *const gb,
+ const int strict_std_compliance)
+{
+#define DEBUG_SEQ_HDR 0
+
+#if DEBUG_SEQ_HDR
+ const unsigned init_bit_pos = dav1d_get_bits_pos(gb);
+#endif
+
+ memset(hdr, 0, sizeof(*hdr));
+ hdr->profile = dav1d_get_bits(gb, 3);
+ if (hdr->profile > 2) goto error;
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-profile: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+ hdr->still_picture = dav1d_get_bit(gb);
+ hdr->reduced_still_picture_header = dav1d_get_bit(gb);
+ if (hdr->reduced_still_picture_header && !hdr->still_picture) goto error;
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-stillpicture_flags: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+ if (hdr->reduced_still_picture_header) {
+ hdr->num_operating_points = 1;
+ hdr->operating_points[0].major_level = dav1d_get_bits(gb, 3);
+ hdr->operating_points[0].minor_level = dav1d_get_bits(gb, 2);
+ hdr->operating_points[0].initial_display_delay = 10;
+ } else {
+ hdr->timing_info_present = dav1d_get_bit(gb);
+ if (hdr->timing_info_present) {
+ hdr->num_units_in_tick = dav1d_get_bits(gb, 32);
+ hdr->time_scale = dav1d_get_bits(gb, 32);
+ if (strict_std_compliance && (!hdr->num_units_in_tick || !hdr->time_scale))
+ goto error;
+ hdr->equal_picture_interval = dav1d_get_bit(gb);
+ if (hdr->equal_picture_interval) {
+ const unsigned num_ticks_per_picture = dav1d_get_vlc(gb);
+ if (num_ticks_per_picture == 0xFFFFFFFFU)
+ goto error;
+ hdr->num_ticks_per_picture = num_ticks_per_picture + 1;
+ }
+
+ hdr->decoder_model_info_present = dav1d_get_bit(gb);
+ if (hdr->decoder_model_info_present) {
+ hdr->encoder_decoder_buffer_delay_length = dav1d_get_bits(gb, 5) + 1;
+ hdr->num_units_in_decoding_tick = dav1d_get_bits(gb, 32);
+ if (strict_std_compliance && !hdr->num_units_in_decoding_tick)
+ goto error;
+ hdr->buffer_removal_delay_length = dav1d_get_bits(gb, 5) + 1;
+ hdr->frame_presentation_delay_length = dav1d_get_bits(gb, 5) + 1;
+ }
+ }
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-timinginfo: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+ hdr->display_model_info_present = dav1d_get_bit(gb);
+ hdr->num_operating_points = dav1d_get_bits(gb, 5) + 1;
+ for (int i = 0; i < hdr->num_operating_points; i++) {
+ struct Dav1dSequenceHeaderOperatingPoint *const op =
+ &hdr->operating_points[i];
+ op->idc = dav1d_get_bits(gb, 12);
+ if (op->idc && (!(op->idc & 0xff) || !(op->idc & 0xf00)))
+ goto error;
+ op->major_level = 2 + dav1d_get_bits(gb, 3);
+ op->minor_level = dav1d_get_bits(gb, 2);
+ if (op->major_level > 3)
+ op->tier = dav1d_get_bit(gb);
+ if (hdr->decoder_model_info_present) {
+ op->decoder_model_param_present = dav1d_get_bit(gb);
+ if (op->decoder_model_param_present) {
+ struct Dav1dSequenceHeaderOperatingParameterInfo *const opi =
+ &hdr->operating_parameter_info[i];
+ opi->decoder_buffer_delay =
+ dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length);
+ opi->encoder_buffer_delay =
+ dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length);
+ opi->low_delay_mode = dav1d_get_bit(gb);
+ }
+ }
+ if (hdr->display_model_info_present)
+ op->display_model_param_present = dav1d_get_bit(gb);
+ op->initial_display_delay =
+ op->display_model_param_present ? dav1d_get_bits(gb, 4) + 1 : 10;
+ }
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-operating-points: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+ }
+
+ hdr->width_n_bits = dav1d_get_bits(gb, 4) + 1;
+ hdr->height_n_bits = dav1d_get_bits(gb, 4) + 1;
+ hdr->max_width = dav1d_get_bits(gb, hdr->width_n_bits) + 1;
+ hdr->max_height = dav1d_get_bits(gb, hdr->height_n_bits) + 1;
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-size: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+ if (!hdr->reduced_still_picture_header) {
+ hdr->frame_id_numbers_present = dav1d_get_bit(gb);
+ if (hdr->frame_id_numbers_present) {
+ hdr->delta_frame_id_n_bits = dav1d_get_bits(gb, 4) + 2;
+ hdr->frame_id_n_bits = dav1d_get_bits(gb, 3) + hdr->delta_frame_id_n_bits + 1;
+ }
+ }
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-frame-id-numbers-present: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+ hdr->sb128 = dav1d_get_bit(gb);
+ hdr->filter_intra = dav1d_get_bit(gb);
+ hdr->intra_edge_filter = dav1d_get_bit(gb);
+ if (hdr->reduced_still_picture_header) {
+ hdr->screen_content_tools = DAV1D_ADAPTIVE;
+ hdr->force_integer_mv = DAV1D_ADAPTIVE;
+ } else {
+ hdr->inter_intra = dav1d_get_bit(gb);
+ hdr->masked_compound = dav1d_get_bit(gb);
+ hdr->warped_motion = dav1d_get_bit(gb);
+ hdr->dual_filter = dav1d_get_bit(gb);
+ hdr->order_hint = dav1d_get_bit(gb);
+ if (hdr->order_hint) {
+ hdr->jnt_comp = dav1d_get_bit(gb);
+ hdr->ref_frame_mvs = dav1d_get_bit(gb);
+ }
+ hdr->screen_content_tools = dav1d_get_bit(gb) ? DAV1D_ADAPTIVE : dav1d_get_bit(gb);
+ #if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-screentools: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+ #endif
+ hdr->force_integer_mv = hdr->screen_content_tools ?
+ dav1d_get_bit(gb) ? DAV1D_ADAPTIVE : dav1d_get_bit(gb) : 2;
+ if (hdr->order_hint)
+ hdr->order_hint_n_bits = dav1d_get_bits(gb, 3) + 1;
+ }
+ hdr->super_res = dav1d_get_bit(gb);
+ hdr->cdef = dav1d_get_bit(gb);
+ hdr->restoration = dav1d_get_bit(gb);
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-featurebits: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+ hdr->hbd = dav1d_get_bit(gb);
+ if (hdr->profile == 2 && hdr->hbd)
+ hdr->hbd += dav1d_get_bit(gb);
+ if (hdr->profile != 1)
+ hdr->monochrome = dav1d_get_bit(gb);
+ hdr->color_description_present = dav1d_get_bit(gb);
+ if (hdr->color_description_present) {
+ hdr->pri = dav1d_get_bits(gb, 8);
+ hdr->trc = dav1d_get_bits(gb, 8);
+ hdr->mtrx = dav1d_get_bits(gb, 8);
+ } else {
+ hdr->pri = DAV1D_COLOR_PRI_UNKNOWN;
+ hdr->trc = DAV1D_TRC_UNKNOWN;
+ hdr->mtrx = DAV1D_MC_UNKNOWN;
+ }
+ if (hdr->monochrome) {
+ hdr->color_range = dav1d_get_bit(gb);
+ hdr->layout = DAV1D_PIXEL_LAYOUT_I400;
+ hdr->ss_hor = hdr->ss_ver = 1;
+ hdr->chr = DAV1D_CHR_UNKNOWN;
+ } else if (hdr->pri == DAV1D_COLOR_PRI_BT709 &&
+ hdr->trc == DAV1D_TRC_SRGB &&
+ hdr->mtrx == DAV1D_MC_IDENTITY)
+ {
+ hdr->layout = DAV1D_PIXEL_LAYOUT_I444;
+ hdr->color_range = 1;
+ if (hdr->profile != 1 && !(hdr->profile == 2 && hdr->hbd == 2))
+ goto error;
+ } else {
+ hdr->color_range = dav1d_get_bit(gb);
+ switch (hdr->profile) {
+ case 0: hdr->layout = DAV1D_PIXEL_LAYOUT_I420;
+ hdr->ss_hor = hdr->ss_ver = 1;
+ break;
+ case 1: hdr->layout = DAV1D_PIXEL_LAYOUT_I444;
+ break;
+ case 2:
+ if (hdr->hbd == 2) {
+ hdr->ss_hor = dav1d_get_bit(gb);
+ if (hdr->ss_hor)
+ hdr->ss_ver = dav1d_get_bit(gb);
+ } else
+ hdr->ss_hor = 1;
+ hdr->layout = hdr->ss_hor ?
+ hdr->ss_ver ? DAV1D_PIXEL_LAYOUT_I420 :
+ DAV1D_PIXEL_LAYOUT_I422 :
+ DAV1D_PIXEL_LAYOUT_I444;
+ break;
+ }
+ hdr->chr = (hdr->ss_hor & hdr->ss_ver) ?
+ dav1d_get_bits(gb, 2) : DAV1D_CHR_UNKNOWN;
+ }
+ if (strict_std_compliance &&
+ hdr->mtrx == DAV1D_MC_IDENTITY && hdr->layout != DAV1D_PIXEL_LAYOUT_I444)
+ {
+ goto error;
+ }
+ if (!hdr->monochrome)
+ hdr->separate_uv_delta_q = dav1d_get_bit(gb);
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-colorinfo: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+ hdr->film_grain_present = dav1d_get_bit(gb);
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-filmgrain: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+ // We needn't bother flushing the OBU here: we'll check we didn't
+ // overrun in the caller and will then discard gb, so there's no
+ // point in setting its position properly.
+
+ return check_trailing_bits(gb, strict_std_compliance);
+
+error:
+ return DAV1D_ERR(EINVAL);
+}
+
+int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out,
+ const uint8_t *const ptr, const size_t sz)
+{
+ validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(ptr != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(sz > 0 && sz <= SIZE_MAX / 2, DAV1D_ERR(EINVAL));
+
+ GetBits gb;
+ dav1d_init_get_bits(&gb, ptr, sz);
+ int res = DAV1D_ERR(ENOENT);
+
+ do {
+ dav1d_get_bit(&gb); // obu_forbidden_bit
+ const enum Dav1dObuType type = dav1d_get_bits(&gb, 4);
+ const int has_extension = dav1d_get_bit(&gb);
+ const int has_length_field = dav1d_get_bit(&gb);
+ dav1d_get_bits(&gb, 1 + 8 * has_extension); // ignore
+
+ const uint8_t *obu_end = gb.ptr_end;
+ if (has_length_field) {
+ const size_t len = dav1d_get_uleb128(&gb);
+ if (len > (size_t)(obu_end - gb.ptr)) return DAV1D_ERR(EINVAL);
+ obu_end = gb.ptr + len;
+ }
+
+ if (type == DAV1D_OBU_SEQ_HDR) {
+ if ((res = parse_seq_hdr(out, &gb, 0)) < 0) return res;
+ if (gb.ptr > obu_end) return DAV1D_ERR(EINVAL);
+ dav1d_bytealign_get_bits(&gb);
+ }
+
+ if (gb.error) return DAV1D_ERR(EINVAL);
+ assert(gb.state == 0 && gb.bits_left == 0);
+ gb.ptr = obu_end;
+ } while (gb.ptr < gb.ptr_end);
+
+ return res;
+}
+
+static int read_frame_size(Dav1dContext *const c, GetBits *const gb,
+ const int use_ref)
+{
+ const Dav1dSequenceHeader *const seqhdr = c->seq_hdr;
+ Dav1dFrameHeader *const hdr = c->frame_hdr;
+
+ if (use_ref) {
+ for (int i = 0; i < 7; i++) {
+ if (dav1d_get_bit(gb)) {
+ const Dav1dThreadPicture *const ref =
+ &c->refs[c->frame_hdr->refidx[i]].p;
+ if (!ref->p.frame_hdr) return -1;
+ hdr->width[1] = ref->p.frame_hdr->width[1];
+ hdr->height = ref->p.frame_hdr->height;
+ hdr->render_width = ref->p.frame_hdr->render_width;
+ hdr->render_height = ref->p.frame_hdr->render_height;
+ hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bit(gb);
+ if (hdr->super_res.enabled) {
+ const int d = hdr->super_res.width_scale_denominator =
+ 9 + dav1d_get_bits(gb, 3);
+ hdr->width[0] = imax((hdr->width[1] * 8 + (d >> 1)) / d,
+ imin(16, hdr->width[1]));
+ } else {
+ hdr->super_res.width_scale_denominator = 8;
+ hdr->width[0] = hdr->width[1];
+ }
+ return 0;
+ }
+ }
+ }
+
+ if (hdr->frame_size_override) {
+ hdr->width[1] = dav1d_get_bits(gb, seqhdr->width_n_bits) + 1;
+ hdr->height = dav1d_get_bits(gb, seqhdr->height_n_bits) + 1;
+ } else {
+ hdr->width[1] = seqhdr->max_width;
+ hdr->height = seqhdr->max_height;
+ }
+ hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bit(gb);
+ if (hdr->super_res.enabled) {
+ const int d = hdr->super_res.width_scale_denominator = 9 + dav1d_get_bits(gb, 3);
+ hdr->width[0] = imax((hdr->width[1] * 8 + (d >> 1)) / d, imin(16, hdr->width[1]));
+ } else {
+ hdr->super_res.width_scale_denominator = 8;
+ hdr->width[0] = hdr->width[1];
+ }
+ hdr->have_render_size = dav1d_get_bit(gb);
+ if (hdr->have_render_size) {
+ hdr->render_width = dav1d_get_bits(gb, 16) + 1;
+ hdr->render_height = dav1d_get_bits(gb, 16) + 1;
+ } else {
+ hdr->render_width = hdr->width[1];
+ hdr->render_height = hdr->height;
+ }
+ return 0;
+}
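+/* With super-res enabled, d is in the range 9..16, so e.g. an upscaled width
+ * of 1920 with d = 16 gives a coded width of (1920 * 8 + 8) / 16 = 960: the
+ * frame is coded at (at most 2x) reduced width and scaled back up to
+ * width[1] afterwards. */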
+
+static inline int tile_log2(const int sz, const int tgt) {
+ int k;
+ for (k = 0; (sz << k) < tgt; k++) ;
+ return k;
+}
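+/* tile_log2(sz, tgt) returns the smallest k for which (sz << k) >= tgt,
+ * e.g. tile_log2(1, 5) == 3; tile_log2(max_tile_width_sb, sbw) below is thus
+ * the smallest log2 column count at which every tile column fits within the
+ * maximum tile width. */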
+
+static const Dav1dLoopfilterModeRefDeltas default_mode_ref_deltas = {
+ .mode_delta = { 0, 0 },
+ .ref_delta = { 1, 0, 0, 0, -1, 0, -1, -1 },
+};
+
+static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
+#define DEBUG_FRAME_HDR 0
+
+#if DEBUG_FRAME_HDR
+ const uint8_t *const init_ptr = gb->ptr;
+#endif
+ const Dav1dSequenceHeader *const seqhdr = c->seq_hdr;
+ Dav1dFrameHeader *const hdr = c->frame_hdr;
+
+ hdr->show_existing_frame =
+ !seqhdr->reduced_still_picture_header && dav1d_get_bit(gb);
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-show_existing_frame: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+ if (hdr->show_existing_frame) {
+ hdr->existing_frame_idx = dav1d_get_bits(gb, 3);
+ if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval)
+ hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length);
+ if (seqhdr->frame_id_numbers_present) {
+ hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits);
+ Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->existing_frame_idx].p.p.frame_hdr;
+ if (!ref_frame_hdr || ref_frame_hdr->frame_id != hdr->frame_id) goto error;
+ }
+ return 0;
+ }
+
+ hdr->frame_type = seqhdr->reduced_still_picture_header ? DAV1D_FRAME_TYPE_KEY : dav1d_get_bits(gb, 2);
+ hdr->show_frame = seqhdr->reduced_still_picture_header || dav1d_get_bit(gb);
+ if (hdr->show_frame) {
+ if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval)
+ hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length);
+ hdr->showable_frame = hdr->frame_type != DAV1D_FRAME_TYPE_KEY;
+ } else
+ hdr->showable_frame = dav1d_get_bit(gb);
+ hdr->error_resilient_mode =
+ (hdr->frame_type == DAV1D_FRAME_TYPE_KEY && hdr->show_frame) ||
+ hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ||
+ seqhdr->reduced_still_picture_header || dav1d_get_bit(gb);
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-frametype_bits: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+ hdr->disable_cdf_update = dav1d_get_bit(gb);
+ hdr->allow_screen_content_tools = seqhdr->screen_content_tools == DAV1D_ADAPTIVE ?
+ dav1d_get_bit(gb) : seqhdr->screen_content_tools;
+ if (hdr->allow_screen_content_tools)
+ hdr->force_integer_mv = seqhdr->force_integer_mv == DAV1D_ADAPTIVE ?
+ dav1d_get_bit(gb) : seqhdr->force_integer_mv;
+ else
+ hdr->force_integer_mv = 0;
+
+ if (IS_KEY_OR_INTRA(hdr))
+ hdr->force_integer_mv = 1;
+
+ if (seqhdr->frame_id_numbers_present)
+ hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits);
+
+ hdr->frame_size_override = seqhdr->reduced_still_picture_header ? 0 :
+ hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 1 : dav1d_get_bit(gb);
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-frame_size_override_flag: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+ hdr->frame_offset = seqhdr->order_hint ?
+ dav1d_get_bits(gb, seqhdr->order_hint_n_bits) : 0;
+ hdr->primary_ref_frame = !hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) ?
+ dav1d_get_bits(gb, 3) : DAV1D_PRIMARY_REF_NONE;
+
+ if (seqhdr->decoder_model_info_present) {
+ hdr->buffer_removal_time_present = dav1d_get_bit(gb);
+ if (hdr->buffer_removal_time_present) {
+ for (int i = 0; i < c->seq_hdr->num_operating_points; i++) {
+ const struct Dav1dSequenceHeaderOperatingPoint *const seqop = &seqhdr->operating_points[i];
+ struct Dav1dFrameHeaderOperatingPoint *const op = &hdr->operating_points[i];
+ if (seqop->decoder_model_param_present) {
+ int in_temporal_layer = (seqop->idc >> hdr->temporal_id) & 1;
+ int in_spatial_layer = (seqop->idc >> (hdr->spatial_id + 8)) & 1;
+ if (!seqop->idc || (in_temporal_layer && in_spatial_layer))
+ op->buffer_removal_time = dav1d_get_bits(gb, seqhdr->buffer_removal_delay_length);
+ }
+ }
+ }
+ }
+
+ if (IS_KEY_OR_INTRA(hdr)) {
+ hdr->refresh_frame_flags = (hdr->frame_type == DAV1D_FRAME_TYPE_KEY &&
+ hdr->show_frame) ? 0xff : dav1d_get_bits(gb, 8);
+ if (hdr->refresh_frame_flags != 0xff && hdr->error_resilient_mode && seqhdr->order_hint)
+ for (int i = 0; i < 8; i++)
+ dav1d_get_bits(gb, seqhdr->order_hint_n_bits);
+ if (c->strict_std_compliance &&
+ hdr->frame_type == DAV1D_FRAME_TYPE_INTRA && hdr->refresh_frame_flags == 0xff)
+ {
+ goto error;
+ }
+ if (read_frame_size(c, gb, 0) < 0) goto error;
+ hdr->allow_intrabc = hdr->allow_screen_content_tools &&
+ !hdr->super_res.enabled && dav1d_get_bit(gb);
+ hdr->use_ref_frame_mvs = 0;
+ } else {
+ hdr->allow_intrabc = 0;
+ hdr->refresh_frame_flags = hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 0xff :
+ dav1d_get_bits(gb, 8);
+ if (hdr->error_resilient_mode && seqhdr->order_hint)
+ for (int i = 0; i < 8; i++)
+ dav1d_get_bits(gb, seqhdr->order_hint_n_bits);
+ hdr->frame_ref_short_signaling =
+ seqhdr->order_hint && dav1d_get_bit(gb);
+ if (hdr->frame_ref_short_signaling) { // FIXME: Nearly verbatim copy from section 7.8
+ hdr->refidx[0] = dav1d_get_bits(gb, 3);
+ hdr->refidx[1] = hdr->refidx[2] = -1;
+ hdr->refidx[3] = dav1d_get_bits(gb, 3);
+ hdr->refidx[4] = hdr->refidx[5] = hdr->refidx[6] = -1;
+
+ int shifted_frame_offset[8];
+ const int current_frame_offset = 1 << (seqhdr->order_hint_n_bits - 1);
+ for (int i = 0; i < 8; i++) {
+ if (!c->refs[i].p.p.frame_hdr) goto error;
+ shifted_frame_offset[i] = current_frame_offset +
+ get_poc_diff(seqhdr->order_hint_n_bits,
+ c->refs[i].p.p.frame_hdr->frame_offset,
+ hdr->frame_offset);
+ }
+
+ int used_frame[8] = { 0 };
+ used_frame[hdr->refidx[0]] = 1;
+ used_frame[hdr->refidx[3]] = 1;
+
+ int latest_frame_offset = -1;
+ for (int i = 0; i < 8; i++) {
+ const int hint = shifted_frame_offset[i];
+ if (!used_frame[i] && hint >= current_frame_offset &&
+ hint >= latest_frame_offset)
+ {
+ hdr->refidx[6] = i;
+ latest_frame_offset = hint;
+ }
+ }
+ if (latest_frame_offset != -1)
+ used_frame[hdr->refidx[6]] = 1;
+
+ int earliest_frame_offset = INT_MAX;
+ for (int i = 0; i < 8; i++) {
+ const int hint = shifted_frame_offset[i];
+ if (!used_frame[i] && hint >= current_frame_offset &&
+ hint < earliest_frame_offset)
+ {
+ hdr->refidx[4] = i;
+ earliest_frame_offset = hint;
+ }
+ }
+ if (earliest_frame_offset != INT_MAX)
+ used_frame[hdr->refidx[4]] = 1;
+
+ earliest_frame_offset = INT_MAX;
+ for (int i = 0; i < 8; i++) {
+ const int hint = shifted_frame_offset[i];
+ if (!used_frame[i] && hint >= current_frame_offset &&
+ (hint < earliest_frame_offset))
+ {
+ hdr->refidx[5] = i;
+ earliest_frame_offset = hint;
+ }
+ }
+ if (earliest_frame_offset != INT_MAX)
+ used_frame[hdr->refidx[5]] = 1;
+
+ for (int i = 1; i < 7; i++) {
+ if (hdr->refidx[i] < 0) {
+ latest_frame_offset = -1;
+ for (int j = 0; j < 8; j++) {
+ const int hint = shifted_frame_offset[j];
+ if (!used_frame[j] && hint < current_frame_offset &&
+ hint >= latest_frame_offset)
+ {
+ hdr->refidx[i] = j;
+ latest_frame_offset = hint;
+ }
+ }
+ if (latest_frame_offset != -1)
+ used_frame[hdr->refidx[i]] = 1;
+ }
+ }
+
+ earliest_frame_offset = INT_MAX;
+ int ref = -1;
+ for (int i = 0; i < 8; i++) {
+ const int hint = shifted_frame_offset[i];
+ if (hint < earliest_frame_offset) {
+ ref = i;
+ earliest_frame_offset = hint;
+ }
+ }
+ for (int i = 0; i < 7; i++) {
+ if (hdr->refidx[i] < 0)
+ hdr->refidx[i] = ref;
+ }
+ }
+ for (int i = 0; i < 7; i++) {
+ if (!hdr->frame_ref_short_signaling)
+ hdr->refidx[i] = dav1d_get_bits(gb, 3);
+ if (seqhdr->frame_id_numbers_present) {
+ const unsigned delta_ref_frame_id = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits) + 1;
+ const unsigned ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id) & ((1 << seqhdr->frame_id_n_bits) - 1);
+ Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->refidx[i]].p.p.frame_hdr;
+ if (!ref_frame_hdr || ref_frame_hdr->frame_id != ref_frame_id) goto error;
+ }
+ }
+ const int use_ref = !hdr->error_resilient_mode &&
+ hdr->frame_size_override;
+ if (read_frame_size(c, gb, use_ref) < 0) goto error;
+ hdr->hp = !hdr->force_integer_mv && dav1d_get_bit(gb);
+ hdr->subpel_filter_mode = dav1d_get_bit(gb) ? DAV1D_FILTER_SWITCHABLE :
+ dav1d_get_bits(gb, 2);
+ hdr->switchable_motion_mode = dav1d_get_bit(gb);
+ hdr->use_ref_frame_mvs = !hdr->error_resilient_mode &&
+ seqhdr->ref_frame_mvs && seqhdr->order_hint &&
+ IS_INTER_OR_SWITCH(hdr) && dav1d_get_bit(gb);
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-frametype-specific-bits: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ hdr->refresh_context = !seqhdr->reduced_still_picture_header &&
+ !hdr->disable_cdf_update && !dav1d_get_bit(gb);
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-refresh_context: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ // tile data
+ hdr->tiling.uniform = dav1d_get_bit(gb);
+ const int sbsz_min1 = (64 << seqhdr->sb128) - 1;
+ const int sbsz_log2 = 6 + seqhdr->sb128;
+ const int sbw = (hdr->width[0] + sbsz_min1) >> sbsz_log2;
+ const int sbh = (hdr->height + sbsz_min1) >> sbsz_log2;
+ const int max_tile_width_sb = 4096 >> sbsz_log2;
+ const int max_tile_area_sb = 4096 * 2304 >> (2 * sbsz_log2);
+ hdr->tiling.min_log2_cols = tile_log2(max_tile_width_sb, sbw);
+ hdr->tiling.max_log2_cols = tile_log2(1, imin(sbw, DAV1D_MAX_TILE_COLS));
+ hdr->tiling.max_log2_rows = tile_log2(1, imin(sbh, DAV1D_MAX_TILE_ROWS));
+ const int min_log2_tiles = imax(tile_log2(max_tile_area_sb, sbw * sbh),
+ hdr->tiling.min_log2_cols);
+ if (hdr->tiling.uniform) {
+ for (hdr->tiling.log2_cols = hdr->tiling.min_log2_cols;
+ hdr->tiling.log2_cols < hdr->tiling.max_log2_cols && dav1d_get_bit(gb);
+ hdr->tiling.log2_cols++) ;
+ const int tile_w = 1 + ((sbw - 1) >> hdr->tiling.log2_cols);
+ hdr->tiling.cols = 0;
+ for (int sbx = 0; sbx < sbw; sbx += tile_w, hdr->tiling.cols++)
+ hdr->tiling.col_start_sb[hdr->tiling.cols] = sbx;
+ hdr->tiling.min_log2_rows =
+ imax(min_log2_tiles - hdr->tiling.log2_cols, 0);
+
+ for (hdr->tiling.log2_rows = hdr->tiling.min_log2_rows;
+ hdr->tiling.log2_rows < hdr->tiling.max_log2_rows && dav1d_get_bit(gb);
+ hdr->tiling.log2_rows++) ;
+ const int tile_h = 1 + ((sbh - 1) >> hdr->tiling.log2_rows);
+ hdr->tiling.rows = 0;
+ for (int sby = 0; sby < sbh; sby += tile_h, hdr->tiling.rows++)
+ hdr->tiling.row_start_sb[hdr->tiling.rows] = sby;
+ } else {
+ hdr->tiling.cols = 0;
+ int widest_tile = 0, max_tile_area_sb = sbw * sbh;
+ for (int sbx = 0; sbx < sbw && hdr->tiling.cols < DAV1D_MAX_TILE_COLS; hdr->tiling.cols++) {
+ const int tile_width_sb = imin(sbw - sbx, max_tile_width_sb);
+ const int tile_w = (tile_width_sb > 1) ?
+ 1 + dav1d_get_uniform(gb, tile_width_sb) :
+ 1;
+ hdr->tiling.col_start_sb[hdr->tiling.cols] = sbx;
+ sbx += tile_w;
+ widest_tile = imax(widest_tile, tile_w);
+ }
+ hdr->tiling.log2_cols = tile_log2(1, hdr->tiling.cols);
+ if (min_log2_tiles) max_tile_area_sb >>= min_log2_tiles + 1;
+ const int max_tile_height_sb = imax(max_tile_area_sb / widest_tile, 1);
+
+ hdr->tiling.rows = 0;
+ for (int sby = 0; sby < sbh && hdr->tiling.rows < DAV1D_MAX_TILE_ROWS; hdr->tiling.rows++) {
+ const int tile_height_sb = imin(sbh - sby, max_tile_height_sb);
+ const int tile_h = (tile_height_sb > 1) ?
+ 1 + dav1d_get_uniform(gb, tile_height_sb) :
+ 1;
+ hdr->tiling.row_start_sb[hdr->tiling.rows] = sby;
+ sby += tile_h;
+ }
+ hdr->tiling.log2_rows = tile_log2(1, hdr->tiling.rows);
+ }
+ hdr->tiling.col_start_sb[hdr->tiling.cols] = sbw;
+ hdr->tiling.row_start_sb[hdr->tiling.rows] = sbh;
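+    // context update tile id, plus the number of bytes used to code each
+    // tile's size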
+ if (hdr->tiling.log2_cols || hdr->tiling.log2_rows) {
+ hdr->tiling.update = dav1d_get_bits(gb, hdr->tiling.log2_cols +
+ hdr->tiling.log2_rows);
+ if (hdr->tiling.update >= hdr->tiling.cols * hdr->tiling.rows)
+ goto error;
+ hdr->tiling.n_bytes = dav1d_get_bits(gb, 2) + 1;
+ } else {
+ hdr->tiling.n_bytes = 0;
+ hdr->tiling.update = 0;
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-tiling: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ // quant data
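+    // yac is the base quantizer index (base_q_idx); the DC/AC deltas below
+    // are optional signed 7-bit offsets per plane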
+ hdr->quant.yac = dav1d_get_bits(gb, 8);
+ hdr->quant.ydc_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0;
+ if (!seqhdr->monochrome) {
+ // If the sequence header says that delta_q might be different
+ // for U, V, we must check whether it actually is for this
+ // frame.
+ const int diff_uv_delta = seqhdr->separate_uv_delta_q ? dav1d_get_bit(gb) : 0;
+ hdr->quant.udc_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0;
+ hdr->quant.uac_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0;
+ if (diff_uv_delta) {
+ hdr->quant.vdc_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0;
+ hdr->quant.vac_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0;
+ } else {
+ hdr->quant.vdc_delta = hdr->quant.udc_delta;
+ hdr->quant.vac_delta = hdr->quant.uac_delta;
+ }
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-quant: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+ hdr->quant.qm = dav1d_get_bit(gb);
+ if (hdr->quant.qm) {
+ hdr->quant.qm_y = dav1d_get_bits(gb, 4);
+ hdr->quant.qm_u = dav1d_get_bits(gb, 4);
+ hdr->quant.qm_v =
+ seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 4) :
+ hdr->quant.qm_u;
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-qm: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ // segmentation data
+ hdr->segmentation.enabled = dav1d_get_bit(gb);
+ if (hdr->segmentation.enabled) {
+ if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
+ hdr->segmentation.update_map = 1;
+ hdr->segmentation.temporal = 0;
+ hdr->segmentation.update_data = 1;
+ } else {
+ hdr->segmentation.update_map = dav1d_get_bit(gb);
+ hdr->segmentation.temporal =
+ hdr->segmentation.update_map ? dav1d_get_bit(gb) : 0;
+ hdr->segmentation.update_data = dav1d_get_bit(gb);
+ }
+
+ if (hdr->segmentation.update_data) {
+ hdr->segmentation.seg_data.preskip = 0;
+ hdr->segmentation.seg_data.last_active_segid = -1;
+ for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) {
+ Dav1dSegmentationData *const seg =
+ &hdr->segmentation.seg_data.d[i];
+ if (dav1d_get_bit(gb)) {
+ seg->delta_q = dav1d_get_sbits(gb, 9);
+ hdr->segmentation.seg_data.last_active_segid = i;
+ } else {
+ seg->delta_q = 0;
+ }
+ if (dav1d_get_bit(gb)) {
+ seg->delta_lf_y_v = dav1d_get_sbits(gb, 7);
+ hdr->segmentation.seg_data.last_active_segid = i;
+ } else {
+ seg->delta_lf_y_v = 0;
+ }
+ if (dav1d_get_bit(gb)) {
+ seg->delta_lf_y_h = dav1d_get_sbits(gb, 7);
+ hdr->segmentation.seg_data.last_active_segid = i;
+ } else {
+ seg->delta_lf_y_h = 0;
+ }
+ if (dav1d_get_bit(gb)) {
+ seg->delta_lf_u = dav1d_get_sbits(gb, 7);
+ hdr->segmentation.seg_data.last_active_segid = i;
+ } else {
+ seg->delta_lf_u = 0;
+ }
+ if (dav1d_get_bit(gb)) {
+ seg->delta_lf_v = dav1d_get_sbits(gb, 7);
+ hdr->segmentation.seg_data.last_active_segid = i;
+ } else {
+ seg->delta_lf_v = 0;
+ }
+ if (dav1d_get_bit(gb)) {
+ seg->ref = dav1d_get_bits(gb, 3);
+ hdr->segmentation.seg_data.last_active_segid = i;
+ hdr->segmentation.seg_data.preskip = 1;
+ } else {
+ seg->ref = -1;
+ }
+ if ((seg->skip = dav1d_get_bit(gb))) {
+ hdr->segmentation.seg_data.last_active_segid = i;
+ hdr->segmentation.seg_data.preskip = 1;
+ }
+ if ((seg->globalmv = dav1d_get_bit(gb))) {
+ hdr->segmentation.seg_data.last_active_segid = i;
+ hdr->segmentation.seg_data.preskip = 1;
+ }
+ }
+ } else {
+ // segmentation.update_data was false so we should copy
+ // segmentation data from the reference frame.
+ assert(hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
+ const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
+ if (!c->refs[pri_ref].p.p.frame_hdr) goto error;
+ hdr->segmentation.seg_data =
+ c->refs[pri_ref].p.p.frame_hdr->segmentation.seg_data;
+ }
+ } else {
+ memset(&hdr->segmentation.seg_data, 0, sizeof(Dav1dSegmentationDataSet));
+ for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++)
+ hdr->segmentation.seg_data.d[i].ref = -1;
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-segmentation: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ // delta q
+ hdr->delta.q.present = hdr->quant.yac ? dav1d_get_bit(gb) : 0;
+ hdr->delta.q.res_log2 = hdr->delta.q.present ? dav1d_get_bits(gb, 2) : 0;
+ hdr->delta.lf.present = hdr->delta.q.present && !hdr->allow_intrabc &&
+ dav1d_get_bit(gb);
+ hdr->delta.lf.res_log2 = hdr->delta.lf.present ? dav1d_get_bits(gb, 2) : 0;
+ hdr->delta.lf.multi = hdr->delta.lf.present ? dav1d_get_bit(gb) : 0;
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-delta_q_lf_flags: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ // derive lossless flags
+ const int delta_lossless = !hdr->quant.ydc_delta && !hdr->quant.udc_delta &&
+ !hdr->quant.uac_delta && !hdr->quant.vdc_delta && !hdr->quant.vac_delta;
+ hdr->all_lossless = 1;
+ for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) {
+ hdr->segmentation.qidx[i] = hdr->segmentation.enabled ?
+ iclip_u8(hdr->quant.yac + hdr->segmentation.seg_data.d[i].delta_q) :
+ hdr->quant.yac;
+ hdr->segmentation.lossless[i] =
+ !hdr->segmentation.qidx[i] && delta_lossless;
+ hdr->all_lossless &= hdr->segmentation.lossless[i];
+ }
+
+ // loopfilter
+ if (hdr->all_lossless || hdr->allow_intrabc) {
+ hdr->loopfilter.level_y[0] = hdr->loopfilter.level_y[1] = 0;
+ hdr->loopfilter.level_u = hdr->loopfilter.level_v = 0;
+ hdr->loopfilter.sharpness = 0;
+ hdr->loopfilter.mode_ref_delta_enabled = 1;
+ hdr->loopfilter.mode_ref_delta_update = 1;
+ hdr->loopfilter.mode_ref_deltas = default_mode_ref_deltas;
+ } else {
+ hdr->loopfilter.level_y[0] = dav1d_get_bits(gb, 6);
+ hdr->loopfilter.level_y[1] = dav1d_get_bits(gb, 6);
+ if (!seqhdr->monochrome &&
+ (hdr->loopfilter.level_y[0] || hdr->loopfilter.level_y[1]))
+ {
+ hdr->loopfilter.level_u = dav1d_get_bits(gb, 6);
+ hdr->loopfilter.level_v = dav1d_get_bits(gb, 6);
+ }
+ hdr->loopfilter.sharpness = dav1d_get_bits(gb, 3);
+
+ if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
+ hdr->loopfilter.mode_ref_deltas = default_mode_ref_deltas;
+ } else {
+ const int ref = hdr->refidx[hdr->primary_ref_frame];
+ if (!c->refs[ref].p.p.frame_hdr) goto error;
+ hdr->loopfilter.mode_ref_deltas =
+ c->refs[ref].p.p.frame_hdr->loopfilter.mode_ref_deltas;
+ }
+ hdr->loopfilter.mode_ref_delta_enabled = dav1d_get_bit(gb);
+ if (hdr->loopfilter.mode_ref_delta_enabled) {
+ hdr->loopfilter.mode_ref_delta_update = dav1d_get_bit(gb);
+ if (hdr->loopfilter.mode_ref_delta_update) {
+ for (int i = 0; i < 8; i++)
+ if (dav1d_get_bit(gb))
+ hdr->loopfilter.mode_ref_deltas.ref_delta[i] =
+ dav1d_get_sbits(gb, 7);
+ for (int i = 0; i < 2; i++)
+ if (dav1d_get_bit(gb))
+ hdr->loopfilter.mode_ref_deltas.mode_delta[i] =
+ dav1d_get_sbits(gb, 7);
+ }
+ }
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-lpf: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ // cdef
+ if (!hdr->all_lossless && seqhdr->cdef && !hdr->allow_intrabc) {
+ hdr->cdef.damping = dav1d_get_bits(gb, 2) + 3;
+ hdr->cdef.n_bits = dav1d_get_bits(gb, 2);
+ for (int i = 0; i < (1 << hdr->cdef.n_bits); i++) {
+ hdr->cdef.y_strength[i] = dav1d_get_bits(gb, 6);
+ if (!seqhdr->monochrome)
+ hdr->cdef.uv_strength[i] = dav1d_get_bits(gb, 6);
+ }
+ } else {
+ hdr->cdef.n_bits = 0;
+ hdr->cdef.y_strength[0] = 0;
+ hdr->cdef.uv_strength[0] = 0;
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-cdef: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ // restoration
+ if ((!hdr->all_lossless || hdr->super_res.enabled) &&
+ seqhdr->restoration && !hdr->allow_intrabc)
+ {
+ hdr->restoration.type[0] = dav1d_get_bits(gb, 2);
+ if (!seqhdr->monochrome) {
+ hdr->restoration.type[1] = dav1d_get_bits(gb, 2);
+ hdr->restoration.type[2] = dav1d_get_bits(gb, 2);
+ } else {
+ hdr->restoration.type[1] =
+ hdr->restoration.type[2] = DAV1D_RESTORATION_NONE;
+ }
+
+ if (hdr->restoration.type[0] || hdr->restoration.type[1] ||
+ hdr->restoration.type[2])
+ {
+ // Log2 of the restoration unit size.
+ hdr->restoration.unit_size[0] = 6 + seqhdr->sb128;
+ if (dav1d_get_bit(gb)) {
+ hdr->restoration.unit_size[0]++;
+ if (!seqhdr->sb128)
+ hdr->restoration.unit_size[0] += dav1d_get_bit(gb);
+ }
+ hdr->restoration.unit_size[1] = hdr->restoration.unit_size[0];
+ if ((hdr->restoration.type[1] || hdr->restoration.type[2]) &&
+ seqhdr->ss_hor == 1 && seqhdr->ss_ver == 1)
+ {
+ hdr->restoration.unit_size[1] -= dav1d_get_bit(gb);
+ }
+ } else {
+ hdr->restoration.unit_size[0] = 8;
+ }
+ } else {
+ hdr->restoration.type[0] = DAV1D_RESTORATION_NONE;
+ hdr->restoration.type[1] = DAV1D_RESTORATION_NONE;
+ hdr->restoration.type[2] = DAV1D_RESTORATION_NONE;
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-restoration: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ hdr->txfm_mode = hdr->all_lossless ? DAV1D_TX_4X4_ONLY :
+ dav1d_get_bit(gb) ? DAV1D_TX_SWITCHABLE : DAV1D_TX_LARGEST;
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-txfmmode: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+ hdr->switchable_comp_refs = IS_INTER_OR_SWITCH(hdr) ? dav1d_get_bit(gb) : 0;
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-refmode: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+ hdr->skip_mode_allowed = 0;
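+    // Skip mode needs a pair of references: the closest frame after the
+    // current one in display order and the closest one before it. If no
+    // forward reference exists, fall back to the two closest past frames.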
+ if (hdr->switchable_comp_refs && IS_INTER_OR_SWITCH(hdr) && seqhdr->order_hint) {
+ const unsigned poc = hdr->frame_offset;
+ unsigned off_before = 0xFFFFFFFFU;
+ int off_after = -1;
+ int off_before_idx, off_after_idx;
+ for (int i = 0; i < 7; i++) {
+ if (!c->refs[hdr->refidx[i]].p.p.frame_hdr) goto error;
+ const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
+
+ const int diff = get_poc_diff(seqhdr->order_hint_n_bits, refpoc, poc);
+ if (diff > 0) {
+ if (off_after == -1 || get_poc_diff(seqhdr->order_hint_n_bits,
+ off_after, refpoc) > 0)
+ {
+ off_after = refpoc;
+ off_after_idx = i;
+ }
+ } else if (diff < 0 && (off_before == 0xFFFFFFFFU ||
+ get_poc_diff(seqhdr->order_hint_n_bits,
+ refpoc, off_before) > 0))
+ {
+ off_before = refpoc;
+ off_before_idx = i;
+ }
+ }
+
+ if (off_before != 0xFFFFFFFFU && off_after != -1) {
+ hdr->skip_mode_refs[0] = imin(off_before_idx, off_after_idx);
+ hdr->skip_mode_refs[1] = imax(off_before_idx, off_after_idx);
+ hdr->skip_mode_allowed = 1;
+ } else if (off_before != 0xFFFFFFFFU) {
+ unsigned off_before2 = 0xFFFFFFFFU;
+ int off_before2_idx;
+ for (int i = 0; i < 7; i++) {
+ if (!c->refs[hdr->refidx[i]].p.p.frame_hdr) goto error;
+ const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
+ if (get_poc_diff(seqhdr->order_hint_n_bits,
+ refpoc, off_before) < 0) {
+ if (off_before2 == 0xFFFFFFFFU ||
+ get_poc_diff(seqhdr->order_hint_n_bits,
+ refpoc, off_before2) > 0)
+ {
+ off_before2 = refpoc;
+ off_before2_idx = i;
+ }
+ }
+ }
+
+ if (off_before2 != 0xFFFFFFFFU) {
+ hdr->skip_mode_refs[0] = imin(off_before_idx, off_before2_idx);
+ hdr->skip_mode_refs[1] = imax(off_before_idx, off_before2_idx);
+ hdr->skip_mode_allowed = 1;
+ }
+ }
+ }
+ hdr->skip_mode_enabled = hdr->skip_mode_allowed ? dav1d_get_bit(gb) : 0;
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-extskip: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+ hdr->warp_motion = !hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) &&
+ seqhdr->warped_motion && dav1d_get_bit(gb);
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-warpmotionbit: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+ hdr->reduced_txtp_set = dav1d_get_bit(gb);
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-reducedtxtpset: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ for (int i = 0; i < 7; i++)
+ hdr->gmv[i] = dav1d_default_wm_params;
+
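+    // Global motion parameters are coded as subexponential deltas relative
+    // to the corresponding parameters of the primary reference frame (or to
+    // the identity transform if there is no primary reference).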
+ if (IS_INTER_OR_SWITCH(hdr)) {
+ for (int i = 0; i < 7; i++) {
+ hdr->gmv[i].type = !dav1d_get_bit(gb) ? DAV1D_WM_TYPE_IDENTITY :
+ dav1d_get_bit(gb) ? DAV1D_WM_TYPE_ROT_ZOOM :
+ dav1d_get_bit(gb) ? DAV1D_WM_TYPE_TRANSLATION :
+ DAV1D_WM_TYPE_AFFINE;
+
+ if (hdr->gmv[i].type == DAV1D_WM_TYPE_IDENTITY) continue;
+
+ const Dav1dWarpedMotionParams *ref_gmv;
+ if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
+ ref_gmv = &dav1d_default_wm_params;
+ } else {
+ const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
+ if (!c->refs[pri_ref].p.p.frame_hdr) goto error;
+ ref_gmv = &c->refs[pri_ref].p.p.frame_hdr->gmv[i];
+ }
+ int32_t *const mat = hdr->gmv[i].matrix;
+ const int32_t *const ref_mat = ref_gmv->matrix;
+ int bits, shift;
+
+ if (hdr->gmv[i].type >= DAV1D_WM_TYPE_ROT_ZOOM) {
+ mat[2] = (1 << 16) + 2 *
+ dav1d_get_bits_subexp(gb, (ref_mat[2] - (1 << 16)) >> 1, 12);
+ mat[3] = 2 * dav1d_get_bits_subexp(gb, ref_mat[3] >> 1, 12);
+
+ bits = 12;
+ shift = 10;
+ } else {
+ bits = 9 - !hdr->hp;
+ shift = 13 + !hdr->hp;
+ }
+
+ if (hdr->gmv[i].type == DAV1D_WM_TYPE_AFFINE) {
+ mat[4] = 2 * dav1d_get_bits_subexp(gb, ref_mat[4] >> 1, 12);
+ mat[5] = (1 << 16) + 2 *
+ dav1d_get_bits_subexp(gb, (ref_mat[5] - (1 << 16)) >> 1, 12);
+ } else {
+ mat[4] = -mat[3];
+ mat[5] = mat[2];
+ }
+
+ mat[0] = dav1d_get_bits_subexp(gb, ref_mat[0] >> shift, bits) * (1 << shift);
+ mat[1] = dav1d_get_bits_subexp(gb, ref_mat[1] >> shift, bits) * (1 << shift);
+ }
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-gmv: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ hdr->film_grain.present = seqhdr->film_grain_present &&
+ (hdr->show_frame || hdr->showable_frame) &&
+ dav1d_get_bit(gb);
+ if (hdr->film_grain.present) {
+ const unsigned seed = dav1d_get_bits(gb, 16);
+ hdr->film_grain.update = hdr->frame_type != DAV1D_FRAME_TYPE_INTER || dav1d_get_bit(gb);
+ if (!hdr->film_grain.update) {
+ const int refidx = dav1d_get_bits(gb, 3);
+ int i;
+ for (i = 0; i < 7; i++)
+ if (hdr->refidx[i] == refidx)
+ break;
+ if (i == 7 || !c->refs[refidx].p.p.frame_hdr) goto error;
+ hdr->film_grain.data = c->refs[refidx].p.p.frame_hdr->film_grain.data;
+ hdr->film_grain.data.seed = seed;
+ } else {
+ Dav1dFilmGrainData *const fgd = &hdr->film_grain.data;
+ fgd->seed = seed;
+
+ fgd->num_y_points = dav1d_get_bits(gb, 4);
+ if (fgd->num_y_points > 14) goto error;
+ for (int i = 0; i < fgd->num_y_points; i++) {
+ fgd->y_points[i][0] = dav1d_get_bits(gb, 8);
+ if (i && fgd->y_points[i - 1][0] >= fgd->y_points[i][0])
+ goto error;
+ fgd->y_points[i][1] = dav1d_get_bits(gb, 8);
+ }
+
+ fgd->chroma_scaling_from_luma =
+ !seqhdr->monochrome && dav1d_get_bit(gb);
+ if (seqhdr->monochrome || fgd->chroma_scaling_from_luma ||
+ (seqhdr->ss_ver == 1 && seqhdr->ss_hor == 1 && !fgd->num_y_points))
+ {
+ fgd->num_uv_points[0] = fgd->num_uv_points[1] = 0;
+ } else for (int pl = 0; pl < 2; pl++) {
+ fgd->num_uv_points[pl] = dav1d_get_bits(gb, 4);
+ if (fgd->num_uv_points[pl] > 10) goto error;
+ for (int i = 0; i < fgd->num_uv_points[pl]; i++) {
+ fgd->uv_points[pl][i][0] = dav1d_get_bits(gb, 8);
+ if (i && fgd->uv_points[pl][i - 1][0] >= fgd->uv_points[pl][i][0])
+ goto error;
+ fgd->uv_points[pl][i][1] = dav1d_get_bits(gb, 8);
+ }
+ }
+
+ if (seqhdr->ss_hor == 1 && seqhdr->ss_ver == 1 &&
+ !!fgd->num_uv_points[0] != !!fgd->num_uv_points[1])
+ {
+ goto error;
+ }
+
+ fgd->scaling_shift = dav1d_get_bits(gb, 2) + 8;
+ fgd->ar_coeff_lag = dav1d_get_bits(gb, 2);
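+            // the luma AR filter has 2*lag*(lag+1) coefficients; each chroma
+            // plane uses one extra coefficient for the co-located luma sample
+            // when luma scaling points are present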
+ const int num_y_pos = 2 * fgd->ar_coeff_lag * (fgd->ar_coeff_lag + 1);
+ if (fgd->num_y_points)
+ for (int i = 0; i < num_y_pos; i++)
+ fgd->ar_coeffs_y[i] = dav1d_get_bits(gb, 8) - 128;
+ for (int pl = 0; pl < 2; pl++)
+ if (fgd->num_uv_points[pl] || fgd->chroma_scaling_from_luma) {
+ const int num_uv_pos = num_y_pos + !!fgd->num_y_points;
+ for (int i = 0; i < num_uv_pos; i++)
+ fgd->ar_coeffs_uv[pl][i] = dav1d_get_bits(gb, 8) - 128;
+ if (!fgd->num_y_points)
+ fgd->ar_coeffs_uv[pl][num_uv_pos] = 0;
+ }
+ fgd->ar_coeff_shift = dav1d_get_bits(gb, 2) + 6;
+ fgd->grain_scale_shift = dav1d_get_bits(gb, 2);
+ for (int pl = 0; pl < 2; pl++)
+ if (fgd->num_uv_points[pl]) {
+ fgd->uv_mult[pl] = dav1d_get_bits(gb, 8) - 128;
+ fgd->uv_luma_mult[pl] = dav1d_get_bits(gb, 8) - 128;
+ fgd->uv_offset[pl] = dav1d_get_bits(gb, 9) - 256;
+ }
+ fgd->overlap_flag = dav1d_get_bit(gb);
+ fgd->clip_to_restricted_range = dav1d_get_bit(gb);
+ }
+ } else {
+ memset(&hdr->film_grain.data, 0, sizeof(hdr->film_grain.data));
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-filmgrain: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ return 0;
+
+error:
+ dav1d_log(c, "Error parsing frame header\n");
+ return DAV1D_ERR(EINVAL);
+}
+
+static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) {
+ const int n_tiles = c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows;
+ const int have_tile_pos = n_tiles > 1 ? dav1d_get_bit(gb) : 0;
+
+ if (have_tile_pos) {
+ const int n_bits = c->frame_hdr->tiling.log2_cols +
+ c->frame_hdr->tiling.log2_rows;
+ c->tile[c->n_tile_data].start = dav1d_get_bits(gb, n_bits);
+ c->tile[c->n_tile_data].end = dav1d_get_bits(gb, n_bits);
+ } else {
+ c->tile[c->n_tile_data].start = 0;
+ c->tile[c->n_tile_data].end = n_tiles - 1;
+ }
+}
+
+ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
+ GetBits gb;
+ int res;
+
+ dav1d_init_get_bits(&gb, in->data, in->sz);
+
+ // obu header
+ const int obu_forbidden_bit = dav1d_get_bit(&gb);
+ if (c->strict_std_compliance && obu_forbidden_bit) goto error;
+ const enum Dav1dObuType type = dav1d_get_bits(&gb, 4);
+ const int has_extension = dav1d_get_bit(&gb);
+ const int has_length_field = dav1d_get_bit(&gb);
+ dav1d_get_bit(&gb); // reserved
+
+ int temporal_id = 0, spatial_id = 0;
+ if (has_extension) {
+ temporal_id = dav1d_get_bits(&gb, 3);
+ spatial_id = dav1d_get_bits(&gb, 2);
+ dav1d_get_bits(&gb, 3); // reserved
+ }
+
+ if (has_length_field) {
+ const size_t len = dav1d_get_uleb128(&gb);
+ if (len > (size_t)(gb.ptr_end - gb.ptr)) goto error;
+ gb.ptr_end = gb.ptr + len;
+ }
+ if (gb.error) goto error;
+
+ // We must have read a whole number of bytes at this point (1 byte
+ // for the header and whole bytes at a time when reading the
+ // leb128 length field).
+ assert(gb.bits_left == 0);
+
+ // skip obu not belonging to the selected temporal/spatial layer
+ if (type != DAV1D_OBU_SEQ_HDR && type != DAV1D_OBU_TD &&
+ has_extension && c->operating_point_idc != 0)
+ {
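+        // operating_point_idc is a bitmask: bits 0-7 select temporal layers,
+        // bits 8-11 select spatial layers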
+ const int in_temporal_layer = (c->operating_point_idc >> temporal_id) & 1;
+ const int in_spatial_layer = (c->operating_point_idc >> (spatial_id + 8)) & 1;
+ if (!in_temporal_layer || !in_spatial_layer)
+ return gb.ptr_end - gb.ptr_start;
+ }
+
+ switch (type) {
+ case DAV1D_OBU_SEQ_HDR: {
+ Dav1dRef *ref = dav1d_ref_create_using_pool(c->seq_hdr_pool,
+ sizeof(Dav1dSequenceHeader));
+ if (!ref) return DAV1D_ERR(ENOMEM);
+ Dav1dSequenceHeader *seq_hdr = ref->data;
+ if ((res = parse_seq_hdr(seq_hdr, &gb, c->strict_std_compliance)) < 0) {
+ dav1d_log(c, "Error parsing sequence header\n");
+ dav1d_ref_dec(&ref);
+ goto error;
+ }
+
+ const int op_idx =
+ c->operating_point < seq_hdr->num_operating_points ? c->operating_point : 0;
+ c->operating_point_idc = seq_hdr->operating_points[op_idx].idc;
+ const unsigned spatial_mask = c->operating_point_idc >> 8;
+ c->max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0;
+
+ // If we have read a sequence header which is different from
+ // the old one, this is a new video sequence and can't use any
+ // previous state. Free that state.
+
+ if (!c->seq_hdr) {
+ c->frame_hdr = NULL;
+ c->frame_flags |= PICTURE_FLAG_NEW_SEQUENCE;
+ // see 7.5, operating_parameter_info is allowed to change in
+ // sequence headers of a single sequence
+ } else if (memcmp(seq_hdr, c->seq_hdr, offsetof(Dav1dSequenceHeader, operating_parameter_info))) {
+ c->frame_hdr = NULL;
+ c->mastering_display = NULL;
+ c->content_light = NULL;
+ dav1d_ref_dec(&c->mastering_display_ref);
+ dav1d_ref_dec(&c->content_light_ref);
+ for (int i = 0; i < 8; i++) {
+ if (c->refs[i].p.p.frame_hdr)
+ dav1d_thread_picture_unref(&c->refs[i].p);
+ dav1d_ref_dec(&c->refs[i].segmap);
+ dav1d_ref_dec(&c->refs[i].refmvs);
+ dav1d_cdf_thread_unref(&c->cdf[i]);
+ }
+ c->frame_flags |= PICTURE_FLAG_NEW_SEQUENCE;
+ // If operating_parameter_info changed, signal it
+ } else if (memcmp(seq_hdr->operating_parameter_info, c->seq_hdr->operating_parameter_info,
+ sizeof(seq_hdr->operating_parameter_info)))
+ {
+ c->frame_flags |= PICTURE_FLAG_NEW_OP_PARAMS_INFO;
+ }
+ dav1d_ref_dec(&c->seq_hdr_ref);
+ c->seq_hdr_ref = ref;
+ c->seq_hdr = seq_hdr;
+ break;
+ }
+ case DAV1D_OBU_REDUNDANT_FRAME_HDR:
+ if (c->frame_hdr) break;
+ // fall-through
+ case DAV1D_OBU_FRAME:
+ case DAV1D_OBU_FRAME_HDR:
+ if (!c->seq_hdr) goto error;
+ if (!c->frame_hdr_ref) {
+ c->frame_hdr_ref = dav1d_ref_create_using_pool(c->frame_hdr_pool,
+ sizeof(Dav1dFrameHeader));
+ if (!c->frame_hdr_ref) return DAV1D_ERR(ENOMEM);
+ }
+#ifndef NDEBUG
+ // ensure that the reference is writable
+ assert(dav1d_ref_is_writable(c->frame_hdr_ref));
+#endif
+ c->frame_hdr = c->frame_hdr_ref->data;
+ memset(c->frame_hdr, 0, sizeof(*c->frame_hdr));
+ c->frame_hdr->temporal_id = temporal_id;
+ c->frame_hdr->spatial_id = spatial_id;
+ if ((res = parse_frame_hdr(c, &gb)) < 0) {
+ c->frame_hdr = NULL;
+ goto error;
+ }
+ for (int n = 0; n < c->n_tile_data; n++)
+ dav1d_data_unref_internal(&c->tile[n].data);
+ c->n_tile_data = 0;
+ c->n_tiles = 0;
+ if (type != DAV1D_OBU_FRAME) {
+ // This is actually a frame header OBU so read the
+ // trailing bit and check for overrun.
+ if (check_trailing_bits(&gb, c->strict_std_compliance) < 0) {
+ c->frame_hdr = NULL;
+ goto error;
+ }
+ }
+
+ if (c->frame_size_limit && (int64_t)c->frame_hdr->width[1] *
+ c->frame_hdr->height > c->frame_size_limit)
+ {
+ dav1d_log(c, "Frame size %dx%d exceeds limit %u\n", c->frame_hdr->width[1],
+ c->frame_hdr->height, c->frame_size_limit);
+ c->frame_hdr = NULL;
+ return DAV1D_ERR(ERANGE);
+ }
+
+ if (type != DAV1D_OBU_FRAME)
+ break;
+ // OBU_FRAMEs shouldn't be signaled with show_existing_frame
+ if (c->frame_hdr->show_existing_frame) {
+ c->frame_hdr = NULL;
+ goto error;
+ }
+
+ // This is the frame header at the start of a frame OBU.
+ // There's no trailing bit at the end to skip, but we do need
+ // to align to the next byte.
+ dav1d_bytealign_get_bits(&gb);
+ // fall-through
+ case DAV1D_OBU_TILE_GRP: {
+ if (!c->frame_hdr) goto error;
+ if (c->n_tile_data_alloc < c->n_tile_data + 1) {
+ if ((c->n_tile_data + 1) > INT_MAX / (int)sizeof(*c->tile)) goto error;
+ struct Dav1dTileGroup *tile = dav1d_realloc(ALLOC_TILE, c->tile,
+ (c->n_tile_data + 1) * sizeof(*c->tile));
+ if (!tile) goto error;
+ c->tile = tile;
+ memset(c->tile + c->n_tile_data, 0, sizeof(*c->tile));
+ c->n_tile_data_alloc = c->n_tile_data + 1;
+ }
+ parse_tile_hdr(c, &gb);
+ // Align to the next byte boundary and check for overrun.
+ dav1d_bytealign_get_bits(&gb);
+ if (gb.error) goto error;
+
+ dav1d_data_ref(&c->tile[c->n_tile_data].data, in);
+ c->tile[c->n_tile_data].data.data = gb.ptr;
+ c->tile[c->n_tile_data].data.sz = (size_t)(gb.ptr_end - gb.ptr);
+ // ensure tile groups are in order and sane, see 6.10.1
+ if (c->tile[c->n_tile_data].start > c->tile[c->n_tile_data].end ||
+ c->tile[c->n_tile_data].start != c->n_tiles)
+ {
+ for (int i = 0; i <= c->n_tile_data; i++)
+ dav1d_data_unref_internal(&c->tile[i].data);
+ c->n_tile_data = 0;
+ c->n_tiles = 0;
+ goto error;
+ }
+ c->n_tiles += 1 + c->tile[c->n_tile_data].end -
+ c->tile[c->n_tile_data].start;
+ c->n_tile_data++;
+ break;
+ }
+ case DAV1D_OBU_METADATA: {
+#define DEBUG_OBU_METADATA 0
+#if DEBUG_OBU_METADATA
+ const uint8_t *const init_ptr = gb.ptr;
+#endif
+        // obu metadata type field
+ const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb);
+ if (gb.error) goto error;
+
+ switch (meta_type) {
+ case OBU_META_HDR_CLL: {
+ Dav1dRef *ref = dav1d_ref_create(ALLOC_OBU_META,
+ sizeof(Dav1dContentLightLevel));
+ if (!ref) return DAV1D_ERR(ENOMEM);
+ Dav1dContentLightLevel *const content_light = ref->data;
+
+ content_light->max_content_light_level = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+ printf("CLLOBU: max-content-light-level: %d [off=%td]\n",
+ content_light->max_content_light_level,
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
+ content_light->max_frame_average_light_level = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+ printf("CLLOBU: max-frame-average-light-level: %d [off=%td]\n",
+ content_light->max_frame_average_light_level,
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
+
+ if (check_trailing_bits(&gb, c->strict_std_compliance) < 0) {
+ dav1d_ref_dec(&ref);
+ goto error;
+ }
+
+ dav1d_ref_dec(&c->content_light_ref);
+ c->content_light = content_light;
+ c->content_light_ref = ref;
+ break;
+ }
+ case OBU_META_HDR_MDCV: {
+ Dav1dRef *ref = dav1d_ref_create(ALLOC_OBU_META,
+ sizeof(Dav1dMasteringDisplay));
+ if (!ref) return DAV1D_ERR(ENOMEM);
+ Dav1dMasteringDisplay *const mastering_display = ref->data;
+
+ for (int i = 0; i < 3; i++) {
+ mastering_display->primaries[i][0] = dav1d_get_bits(&gb, 16);
+ mastering_display->primaries[i][1] = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+ printf("MDCVOBU: primaries[%d]: (%d, %d) [off=%td]\n", i,
+ mastering_display->primaries[i][0],
+ mastering_display->primaries[i][1],
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
+ }
+ mastering_display->white_point[0] = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+ printf("MDCVOBU: white-point-x: %d [off=%td]\n",
+ mastering_display->white_point[0],
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
+ mastering_display->white_point[1] = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+ printf("MDCVOBU: white-point-y: %d [off=%td]\n",
+ mastering_display->white_point[1],
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
+ mastering_display->max_luminance = dav1d_get_bits(&gb, 32);
+#if DEBUG_OBU_METADATA
+ printf("MDCVOBU: max-luminance: %d [off=%td]\n",
+ mastering_display->max_luminance,
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
+ mastering_display->min_luminance = dav1d_get_bits(&gb, 32);
+#if DEBUG_OBU_METADATA
+ printf("MDCVOBU: min-luminance: %d [off=%td]\n",
+ mastering_display->min_luminance,
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
+ if (check_trailing_bits(&gb, c->strict_std_compliance) < 0) {
+ dav1d_ref_dec(&ref);
+ goto error;
+ }
+
+ dav1d_ref_dec(&c->mastering_display_ref);
+ c->mastering_display = mastering_display;
+ c->mastering_display_ref = ref;
+ break;
+ }
+ case OBU_META_ITUT_T35: {
+ ptrdiff_t payload_size = gb.ptr_end - gb.ptr;
+            // Exclude the OBU trailing bits from payload_size
+ while (payload_size > 0 && !gb.ptr[payload_size - 1])
+ payload_size--; // trailing_zero_bit x 8
+ payload_size--; // trailing_one_bit + trailing_zero_bit x 7
+
+ int country_code_extension_byte = 0;
+ const int country_code = dav1d_get_bits(&gb, 8);
+ payload_size--;
+ if (country_code == 0xFF) {
+ country_code_extension_byte = dav1d_get_bits(&gb, 8);
+ payload_size--;
+ }
+
+ if (payload_size <= 0 || gb.ptr[payload_size] != 0x80) {
+ dav1d_log(c, "Malformed ITU-T T.35 metadata message format\n");
+ break;
+ }
+
+ if ((c->n_itut_t35 + 1) > INT_MAX / (int)sizeof(*c->itut_t35)) goto error;
+ struct Dav1dITUTT35 *itut_t35 = dav1d_realloc(ALLOC_OBU_META, c->itut_t35,
+ (c->n_itut_t35 + 1) * sizeof(*c->itut_t35));
+ if (!itut_t35) goto error;
+ c->itut_t35 = itut_t35;
+ memset(c->itut_t35 + c->n_itut_t35, 0, sizeof(*c->itut_t35));
+
+ struct itut_t35_ctx_context *itut_t35_ctx;
+ if (!c->n_itut_t35) {
+ assert(!c->itut_t35_ref);
+ itut_t35_ctx = dav1d_malloc(ALLOC_OBU_META, sizeof(struct itut_t35_ctx_context));
+ if (!itut_t35_ctx) goto error;
+ c->itut_t35_ref = dav1d_ref_init(&itut_t35_ctx->ref, c->itut_t35,
+ dav1d_picture_free_itut_t35, itut_t35_ctx, 0);
+ } else {
+ assert(c->itut_t35_ref && atomic_load(&c->itut_t35_ref->ref_cnt) == 1);
+ itut_t35_ctx = c->itut_t35_ref->user_data;
+ c->itut_t35_ref->const_data = (uint8_t *)c->itut_t35;
+ }
+ itut_t35_ctx->itut_t35 = c->itut_t35;
+ itut_t35_ctx->n_itut_t35 = c->n_itut_t35 + 1;
+
+ Dav1dITUTT35 *const itut_t35_metadata = &c->itut_t35[c->n_itut_t35];
+ itut_t35_metadata->payload = dav1d_malloc(ALLOC_OBU_META, payload_size);
+ if (!itut_t35_metadata->payload) goto error;
+
+ itut_t35_metadata->country_code = country_code;
+ itut_t35_metadata->country_code_extension_byte = country_code_extension_byte;
+ itut_t35_metadata->payload_size = payload_size;
+
+ // We know that we've read a whole number of bytes and that the
+ // payload is within the OBU boundaries, so just use memcpy()
+ assert(gb.bits_left == 0);
+ memcpy(itut_t35_metadata->payload, gb.ptr, payload_size);
+
+ c->n_itut_t35++;
+ break;
+ }
+ case OBU_META_SCALABILITY:
+ case OBU_META_TIMECODE:
+ // ignore metadata OBUs we don't care about
+ break;
+ default:
+ // print a warning but don't fail for unknown types
+ dav1d_log(c, "Unknown Metadata OBU type %d\n", meta_type);
+ break;
+ }
+
+ break;
+ }
+ case DAV1D_OBU_TD:
+ c->frame_flags |= PICTURE_FLAG_NEW_TEMPORAL_UNIT;
+ break;
+ case DAV1D_OBU_PADDING:
+ // ignore OBUs we don't care about
+ break;
+ default:
+ // print a warning but don't fail for unknown types
+ dav1d_log(c, "Unknown OBU type %d of size %td\n", type, gb.ptr_end - gb.ptr);
+ break;
+ }
+
+ if (c->seq_hdr && c->frame_hdr) {
+ if (c->frame_hdr->show_existing_frame) {
+ if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr) goto error;
+ switch (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type) {
+ case DAV1D_FRAME_TYPE_INTER:
+ case DAV1D_FRAME_TYPE_SWITCH:
+ if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_REFERENCE)
+ goto skip;
+ break;
+ case DAV1D_FRAME_TYPE_INTRA:
+ if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_INTRA)
+ goto skip;
+ // fall-through
+ default:
+ break;
+ }
+ if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) goto error;
+ if (c->strict_std_compliance &&
+ !c->refs[c->frame_hdr->existing_frame_idx].p.showable)
+ {
+ goto error;
+ }
+ if (c->n_fc == 1) {
+ dav1d_thread_picture_ref(&c->out,
+ &c->refs[c->frame_hdr->existing_frame_idx].p);
+ dav1d_picture_copy_props(&c->out.p,
+ c->content_light, c->content_light_ref,
+ c->mastering_display, c->mastering_display_ref,
+ c->itut_t35, c->itut_t35_ref, c->n_itut_t35,
+ &in->m);
+ // Must be removed from the context after being attached to the frame
+ dav1d_ref_dec(&c->itut_t35_ref);
+ c->itut_t35 = NULL;
+ c->n_itut_t35 = 0;
+ c->event_flags |= dav1d_picture_get_event_flags(&c->refs[c->frame_hdr->existing_frame_idx].p);
+ } else {
+ pthread_mutex_lock(&c->task_thread.lock);
+ // need to append this to the frame output queue
+ const unsigned next = c->frame_thread.next++;
+ if (c->frame_thread.next == c->n_fc)
+ c->frame_thread.next = 0;
+
+ Dav1dFrameContext *const f = &c->fc[next];
+ while (f->n_tile_data > 0)
+ pthread_cond_wait(&f->task_thread.cond,
+ &f->task_thread.ttd->lock);
+ Dav1dThreadPicture *const out_delayed =
+ &c->frame_thread.out_delayed[next];
+ if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
+ unsigned first = atomic_load(&c->task_thread.first);
+ if (first + 1U < c->n_fc)
+ atomic_fetch_add(&c->task_thread.first, 1U);
+ else
+ atomic_store(&c->task_thread.first, 0);
+ atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
+ &first, UINT_MAX);
+ if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
+ c->task_thread.cur--;
+ }
+ const int error = f->task_thread.retval;
+ if (error) {
+ c->cached_error = error;
+ f->task_thread.retval = 0;
+ dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m);
+ dav1d_thread_picture_unref(out_delayed);
+ } else if (out_delayed->p.data[0]) {
+ const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
+ memory_order_relaxed);
+ if ((out_delayed->visible || c->output_invisible_frames) &&
+ progress != FRAME_ERROR)
+ {
+ dav1d_thread_picture_ref(&c->out, out_delayed);
+ c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
+ }
+ dav1d_thread_picture_unref(out_delayed);
+ }
+ dav1d_thread_picture_ref(out_delayed,
+ &c->refs[c->frame_hdr->existing_frame_idx].p);
+ out_delayed->visible = 1;
+ dav1d_picture_copy_props(&out_delayed->p,
+ c->content_light, c->content_light_ref,
+ c->mastering_display, c->mastering_display_ref,
+ c->itut_t35, c->itut_t35_ref, c->n_itut_t35,
+ &in->m);
+ // Must be removed from the context after being attached to the frame
+ dav1d_ref_dec(&c->itut_t35_ref);
+ c->itut_t35 = NULL;
+ c->n_itut_t35 = 0;
+
+ pthread_mutex_unlock(&c->task_thread.lock);
+ }
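+            // showing an existing key frame refreshes all other reference
+            // slots with that frame (its CDFs and segmentation map are
+            // copied as well)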
+ if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) {
+ const int r = c->frame_hdr->existing_frame_idx;
+ c->refs[r].p.showable = 0;
+ for (int i = 0; i < 8; i++) {
+ if (i == r) continue;
+
+ if (c->refs[i].p.p.frame_hdr)
+ dav1d_thread_picture_unref(&c->refs[i].p);
+ dav1d_thread_picture_ref(&c->refs[i].p, &c->refs[r].p);
+
+ dav1d_cdf_thread_unref(&c->cdf[i]);
+ dav1d_cdf_thread_ref(&c->cdf[i], &c->cdf[r]);
+
+ dav1d_ref_dec(&c->refs[i].segmap);
+ c->refs[i].segmap = c->refs[r].segmap;
+ if (c->refs[r].segmap)
+ dav1d_ref_inc(c->refs[r].segmap);
+ dav1d_ref_dec(&c->refs[i].refmvs);
+ }
+ }
+ c->frame_hdr = NULL;
+ } else if (c->n_tiles == c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows) {
+ switch (c->frame_hdr->frame_type) {
+ case DAV1D_FRAME_TYPE_INTER:
+ case DAV1D_FRAME_TYPE_SWITCH:
+ if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_REFERENCE ||
+ (c->decode_frame_type == DAV1D_DECODEFRAMETYPE_REFERENCE &&
+ !c->frame_hdr->refresh_frame_flags))
+ goto skip;
+ break;
+ case DAV1D_FRAME_TYPE_INTRA:
+ if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_INTRA ||
+ (c->decode_frame_type == DAV1D_DECODEFRAMETYPE_REFERENCE &&
+ !c->frame_hdr->refresh_frame_flags))
+ goto skip;
+ // fall-through
+ default:
+ break;
+ }
+ if (!c->n_tile_data)
+ goto error;
+ if ((res = dav1d_submit_frame(c)) < 0)
+ return res;
+ assert(!c->n_tile_data);
+ c->frame_hdr = NULL;
+ c->n_tiles = 0;
+ }
+ }
+
+ return gb.ptr_end - gb.ptr_start;
+
+skip:
+ // update refs with only the headers in case we skip the frame
+ for (int i = 0; i < 8; i++) {
+ if (c->frame_hdr->refresh_frame_flags & (1 << i)) {
+ dav1d_thread_picture_unref(&c->refs[i].p);
+ c->refs[i].p.p.frame_hdr = c->frame_hdr;
+ c->refs[i].p.p.seq_hdr = c->seq_hdr;
+ c->refs[i].p.p.frame_hdr_ref = c->frame_hdr_ref;
+ c->refs[i].p.p.seq_hdr_ref = c->seq_hdr_ref;
+ dav1d_ref_inc(c->frame_hdr_ref);
+ dav1d_ref_inc(c->seq_hdr_ref);
+ }
+ }
+
+ dav1d_ref_dec(&c->frame_hdr_ref);
+ c->frame_hdr = NULL;
+ c->n_tiles = 0;
+
+ return gb.ptr_end - gb.ptr_start;
+
+error:
+ dav1d_data_props_copy(&c->cached_error_props, &in->m);
+ dav1d_log(c, gb.error ? "Overrun in OBU bit buffer\n" :
+ "Error parsing OBU data\n");
+ return DAV1D_ERR(EINVAL);
+}
diff --git a/third_party/dav1d/src/obu.h b/third_party/dav1d/src/obu.h
new file mode 100644
index 0000000000..22901f020b
--- /dev/null
+++ b/third_party/dav1d/src/obu.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_OBU_H
+#define DAV1D_SRC_OBU_H
+
+#include "dav1d/data.h"
+#include "src/internal.h"
+
+ptrdiff_t dav1d_parse_obus(Dav1dContext *c, Dav1dData *in);
+
+#endif /* DAV1D_SRC_OBU_H */
diff --git a/third_party/dav1d/src/pal.c b/third_party/dav1d/src/pal.c
new file mode 100644
index 0000000000..f50c7aa21f
--- /dev/null
+++ b/third_party/dav1d/src/pal.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/attributes.h"
+
+#include "src/pal.h"
+
+// fill invisible edges and pack to 4-bit (2 pixels per byte)
+static void pal_idx_finish_c(uint8_t *dst, const uint8_t *src,
+ const int bw, const int bh,
+ const int w, const int h)
+{
+ assert(bw >= 4 && bw <= 64 && !(bw & (bw - 1)));
+ assert(bh >= 4 && bh <= 64 && !(bh & (bh - 1)));
+ assert(w >= 4 && w <= bw && !(w & 3));
+ assert(h >= 4 && h <= bh && !(h & 3));
+
+ const int dst_w = w / 2;
+ const int dst_bw = bw / 2;
+
+ for (int y = 0; y < h; y++, src += bw, dst += dst_bw) {
+ for (int x = 0; x < dst_w; x++)
+ dst[x] = src[x * 2 + 0] | (src[x * 2 + 1] << 4);
+ if (dst_w < dst_bw)
+ memset(dst + dst_w, src[w - 1] * 0x11, dst_bw - dst_w);
+ }
+
+ if (h < bh) {
+ const uint8_t *const last_row = &dst[-dst_bw];
+ for (int y = h; y < bh; y++, dst += dst_bw)
+ memcpy(dst, last_row, dst_bw);
+ }
+}
+
+#if HAVE_ASM
+#if ARCH_X86
+#include "src/x86/pal.h"
+#endif
+#endif
+
+COLD void dav1d_pal_dsp_init(Dav1dPalDSPContext *const c) {
+ c->pal_idx_finish = pal_idx_finish_c;
+
+#if HAVE_ASM
+#if ARCH_X86
+ pal_dsp_init_x86(c);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/pal.h b/third_party/dav1d/src/pal.h
new file mode 100644
index 0000000000..6a6d729bef
--- /dev/null
+++ b/third_party/dav1d/src/pal.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_PAL_H
+#define DAV1D_SRC_PAL_H
+
+#include <stdint.h>
+
+#define decl_pal_idx_finish_fn(name) \
+void (name)(uint8_t *dst, const uint8_t *src, int bw, int bh, int w, int h)
+typedef decl_pal_idx_finish_fn(*pal_idx_finish_fn);
+
+typedef struct Dav1dPalDSPContext {
+ pal_idx_finish_fn pal_idx_finish;
+} Dav1dPalDSPContext;
+
+void dav1d_pal_dsp_init(Dav1dPalDSPContext *dsp);
+
+#endif /* DAV1D_SRC_PAL_H */
diff --git a/third_party/dav1d/src/picture.c b/third_party/dav1d/src/picture.c
new file mode 100644
index 0000000000..f22f05f0ca
--- /dev/null
+++ b/third_party/dav1d/src/picture.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/intops.h"
+#include "common/validate.h"
+
+#include "src/internal.h"
+#include "src/log.h"
+#include "src/picture.h"
+#include "src/ref.h"
+#include "src/thread.h"
+#include "src/thread_task.h"
+
+int dav1d_default_picture_alloc(Dav1dPicture *const p, void *const cookie) {
+ assert(sizeof(Dav1dMemPoolBuffer) <= DAV1D_PICTURE_ALIGNMENT);
+ const int hbd = p->p.bpc > 8;
+ const int aligned_w = (p->p.w + 127) & ~127;
+ const int aligned_h = (p->p.h + 127) & ~127;
+ const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
+ const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ ptrdiff_t y_stride = aligned_w << hbd;
+ ptrdiff_t uv_stride = has_chroma ? y_stride >> ss_hor : 0;
+    /* Due to how the mapping of addresses to sets works in most L1 and L2
+     * cache implementations, strides that are multiples of certain
+     * power-of-two numbers may cause multiple rows of the same superblock to
+     * map to the same set, evicting previous rows and reducing the cache hit
+     * rate. Avoid that by slightly padding the stride when necessary. */
+ if (!(y_stride & 1023))
+ y_stride += DAV1D_PICTURE_ALIGNMENT;
+ if (!(uv_stride & 1023) && has_chroma)
+ uv_stride += DAV1D_PICTURE_ALIGNMENT;
+ p->stride[0] = y_stride;
+ p->stride[1] = uv_stride;
+ const size_t y_sz = y_stride * aligned_h;
+ const size_t uv_sz = uv_stride * (aligned_h >> ss_ver);
+ const size_t pic_size = y_sz + 2 * uv_sz;
+
+ Dav1dMemPoolBuffer *const buf = dav1d_mem_pool_pop(cookie, pic_size +
+ DAV1D_PICTURE_ALIGNMENT -
+ sizeof(Dav1dMemPoolBuffer));
+ if (!buf) return DAV1D_ERR(ENOMEM);
+ p->allocator_data = buf;
+
+ uint8_t *const data = buf->data;
+ p->data[0] = data;
+ p->data[1] = has_chroma ? data + y_sz : NULL;
+ p->data[2] = has_chroma ? data + y_sz + uv_sz : NULL;
+
+ return 0;
+}
+
+void dav1d_default_picture_release(Dav1dPicture *const p, void *const cookie) {
+ dav1d_mem_pool_push(cookie, p->allocator_data);
+}
+
+struct pic_ctx_context {
+ Dav1dPicAllocator allocator;
+ Dav1dPicture pic;
+ Dav1dRef ref;
+ void *extra_data[];
+};
+
+static void free_buffer(const uint8_t *const data, void *const user_data) {
+ Dav1dMemPoolBuffer *buf = (Dav1dMemPoolBuffer *)data;
+ struct pic_ctx_context *pic_ctx = buf->data;
+
+ pic_ctx->allocator.release_picture_callback(&pic_ctx->pic,
+ pic_ctx->allocator.cookie);
+ dav1d_mem_pool_push(user_data, buf);
+}
+
+void dav1d_picture_free_itut_t35(const uint8_t *const data, void *const user_data) {
+ struct itut_t35_ctx_context *itut_t35_ctx = user_data;
+
+ for (size_t i = 0; i < itut_t35_ctx->n_itut_t35; i++)
+ dav1d_free(itut_t35_ctx->itut_t35[i].payload);
+ dav1d_free(itut_t35_ctx->itut_t35);
+ dav1d_free(itut_t35_ctx);
+}
+
+static int picture_alloc_with_edges(Dav1dContext *const c,
+ Dav1dPicture *const p,
+ const int w, const int h,
+ Dav1dSequenceHeader *const seq_hdr, Dav1dRef *const seq_hdr_ref,
+ Dav1dFrameHeader *const frame_hdr, Dav1dRef *const frame_hdr_ref,
+ const int bpc,
+ const Dav1dDataProps *const props,
+ Dav1dPicAllocator *const p_allocator,
+ void **const extra_ptr)
+{
+ if (p->data[0]) {
+ dav1d_log(c, "Picture already allocated!\n");
+ return -1;
+ }
+ assert(bpc > 0 && bpc <= 16);
+
+ size_t extra = c->n_fc > 1 ? sizeof(atomic_int) * 2 : 0;
+ Dav1dMemPoolBuffer *buf = dav1d_mem_pool_pop(c->pic_ctx_pool,
+ extra + sizeof(struct pic_ctx_context));
+ if (buf == NULL)
+ return DAV1D_ERR(ENOMEM);
+
+ struct pic_ctx_context *pic_ctx = buf->data;
+
+ p->p.w = w;
+ p->p.h = h;
+ p->seq_hdr = seq_hdr;
+ p->frame_hdr = frame_hdr;
+ p->p.layout = seq_hdr->layout;
+ p->p.bpc = bpc;
+ dav1d_data_props_set_defaults(&p->m);
+ const int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie);
+ if (res < 0) {
+ dav1d_mem_pool_push(c->pic_ctx_pool, buf);
+ return res;
+ }
+
+ pic_ctx->allocator = *p_allocator;
+ pic_ctx->pic = *p;
+ p->ref = dav1d_ref_init(&pic_ctx->ref, buf, free_buffer, c->pic_ctx_pool, 0);
+
+ p->seq_hdr_ref = seq_hdr_ref;
+ if (seq_hdr_ref) dav1d_ref_inc(seq_hdr_ref);
+
+ p->frame_hdr_ref = frame_hdr_ref;
+ if (frame_hdr_ref) dav1d_ref_inc(frame_hdr_ref);
+
+ if (extra && extra_ptr)
+ *extra_ptr = &pic_ctx->extra_data;
+
+ return 0;
+}
+
+void dav1d_picture_copy_props(Dav1dPicture *const p,
+ Dav1dContentLightLevel *const content_light, Dav1dRef *const content_light_ref,
+ Dav1dMasteringDisplay *const mastering_display, Dav1dRef *const mastering_display_ref,
+ Dav1dITUTT35 *const itut_t35, Dav1dRef *itut_t35_ref, size_t n_itut_t35,
+ const Dav1dDataProps *const props)
+{
+ dav1d_data_props_copy(&p->m, props);
+
+ dav1d_ref_dec(&p->content_light_ref);
+ p->content_light_ref = content_light_ref;
+ p->content_light = content_light;
+ if (content_light_ref) dav1d_ref_inc(content_light_ref);
+
+ dav1d_ref_dec(&p->mastering_display_ref);
+ p->mastering_display_ref = mastering_display_ref;
+ p->mastering_display = mastering_display;
+ if (mastering_display_ref) dav1d_ref_inc(mastering_display_ref);
+
+ dav1d_ref_dec(&p->itut_t35_ref);
+ p->itut_t35_ref = itut_t35_ref;
+ p->itut_t35 = itut_t35;
+ p->n_itut_t35 = n_itut_t35;
+ if (itut_t35_ref) dav1d_ref_inc(itut_t35_ref);
+}
+
+int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f,
+ const int bpc)
+{
+ Dav1dThreadPicture *const p = &f->sr_cur;
+
+ const int res =
+ picture_alloc_with_edges(c, &p->p, f->frame_hdr->width[1], f->frame_hdr->height,
+ f->seq_hdr, f->seq_hdr_ref,
+ f->frame_hdr, f->frame_hdr_ref,
+ bpc, &f->tile[0].data.m, &c->allocator,
+ (void **) &p->progress);
+ if (res) return res;
+
+ dav1d_picture_copy_props(&p->p, c->content_light, c->content_light_ref,
+ c->mastering_display, c->mastering_display_ref,
+ c->itut_t35, c->itut_t35_ref, c->n_itut_t35,
+ &f->tile[0].data.m);
+
+ // Must be removed from the context after being attached to the frame
+ dav1d_ref_dec(&c->itut_t35_ref);
+ c->itut_t35 = NULL;
+ c->n_itut_t35 = 0;
+
+ // Don't clear these flags from c->frame_flags if the frame is not visible.
+ // This way they will be added to the next visible frame too.
+ const int flags_mask = (f->frame_hdr->show_frame || c->output_invisible_frames)
+ ? 0 : (PICTURE_FLAG_NEW_SEQUENCE | PICTURE_FLAG_NEW_OP_PARAMS_INFO);
+ p->flags = c->frame_flags;
+ c->frame_flags &= flags_mask;
+
+ p->visible = f->frame_hdr->show_frame;
+ p->showable = f->frame_hdr->showable_frame;
+ if (c->n_fc > 1) {
+ atomic_init(&p->progress[0], 0);
+ atomic_init(&p->progress[1], 0);
+ }
+ return res;
+}
+
+int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, const int w,
+ const Dav1dPicture *const src)
+{
+ Dav1dMemPoolBuffer *const buf = (Dav1dMemPoolBuffer *)src->ref->const_data;
+ struct pic_ctx_context *const pic_ctx = buf->data;
+ const int res = picture_alloc_with_edges(c, dst, w, src->p.h,
+ src->seq_hdr, src->seq_hdr_ref,
+ src->frame_hdr, src->frame_hdr_ref,
+ src->p.bpc, &src->m, &pic_ctx->allocator,
+ NULL);
+ if (res) return res;
+
+ dav1d_picture_copy_props(dst, src->content_light, src->content_light_ref,
+ src->mastering_display, src->mastering_display_ref,
+ src->itut_t35, src->itut_t35_ref, src->n_itut_t35,
+ &src->m);
+
+ return 0;
+}
+
+void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
+ assert(dst != NULL);
+ assert(dst->data[0] == NULL);
+ assert(src != NULL);
+
+ if (src->ref) {
+ assert(src->data[0] != NULL);
+ dav1d_ref_inc(src->ref);
+ }
+ if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref);
+ if (src->seq_hdr_ref) dav1d_ref_inc(src->seq_hdr_ref);
+ if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
+ if (src->content_light_ref) dav1d_ref_inc(src->content_light_ref);
+ if (src->mastering_display_ref) dav1d_ref_inc(src->mastering_display_ref);
+ if (src->itut_t35_ref) dav1d_ref_inc(src->itut_t35_ref);
+ *dst = *src;
+}
+
+void dav1d_picture_move_ref(Dav1dPicture *const dst, Dav1dPicture *const src) {
+ assert(dst != NULL);
+ assert(dst->data[0] == NULL);
+ assert(src != NULL);
+
+ if (src->ref)
+ assert(src->data[0] != NULL);
+
+ *dst = *src;
+ memset(src, 0, sizeof(*src));
+}
+
+void dav1d_thread_picture_ref(Dav1dThreadPicture *const dst,
+ const Dav1dThreadPicture *const src)
+{
+ dav1d_picture_ref(&dst->p, &src->p);
+ dst->visible = src->visible;
+ dst->showable = src->showable;
+ dst->progress = src->progress;
+ dst->flags = src->flags;
+}
+
+void dav1d_thread_picture_move_ref(Dav1dThreadPicture *const dst,
+ Dav1dThreadPicture *const src)
+{
+ dav1d_picture_move_ref(&dst->p, &src->p);
+ dst->visible = src->visible;
+ dst->showable = src->showable;
+ dst->progress = src->progress;
+ dst->flags = src->flags;
+ memset(src, 0, sizeof(*src));
+}
+
+void dav1d_picture_unref_internal(Dav1dPicture *const p) {
+ validate_input(p != NULL);
+
+ if (p->ref) {
+ validate_input(p->data[0] != NULL);
+ dav1d_ref_dec(&p->ref);
+ }
+ dav1d_ref_dec(&p->seq_hdr_ref);
+ dav1d_ref_dec(&p->frame_hdr_ref);
+ dav1d_ref_dec(&p->m.user_data.ref);
+ dav1d_ref_dec(&p->content_light_ref);
+ dav1d_ref_dec(&p->mastering_display_ref);
+ dav1d_ref_dec(&p->itut_t35_ref);
+ memset(p, 0, sizeof(*p));
+ dav1d_data_props_set_defaults(&p->m);
+}
+
+void dav1d_thread_picture_unref(Dav1dThreadPicture *const p) {
+ dav1d_picture_unref_internal(&p->p);
+
+ p->progress = NULL;
+}
+
+enum Dav1dEventFlags dav1d_picture_get_event_flags(const Dav1dThreadPicture *const p) {
+ if (!p->flags)
+ return 0;
+
+ enum Dav1dEventFlags flags = 0;
+ if (p->flags & PICTURE_FLAG_NEW_SEQUENCE)
+ flags |= DAV1D_EVENT_FLAG_NEW_SEQUENCE;
+ if (p->flags & PICTURE_FLAG_NEW_OP_PARAMS_INFO)
+ flags |= DAV1D_EVENT_FLAG_NEW_OP_PARAMS_INFO;
+
+ return flags;
+}
diff --git a/third_party/dav1d/src/picture.h b/third_party/dav1d/src/picture.h
new file mode 100644
index 0000000000..88aee08f4a
--- /dev/null
+++ b/third_party/dav1d/src/picture.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_PICTURE_H
+#define DAV1D_SRC_PICTURE_H
+
+#include <stdatomic.h>
+
+#include "src/thread.h"
+#include "dav1d/picture.h"
+
+#include "src/thread_data.h"
+#include "src/ref.h"
+
+enum PlaneType {
+ PLANE_TYPE_Y,
+ PLANE_TYPE_UV,
+ PLANE_TYPE_BLOCK,
+ PLANE_TYPE_ALL,
+};
+
+enum PictureFlags {
+ PICTURE_FLAG_NEW_SEQUENCE = 1 << 0,
+ PICTURE_FLAG_NEW_OP_PARAMS_INFO = 1 << 1,
+ PICTURE_FLAG_NEW_TEMPORAL_UNIT = 1 << 2,
+};
+
+typedef struct Dav1dThreadPicture {
+ Dav1dPicture p;
+ int visible;
+ // This can be set for inter frames, non-key intra frames, or for invisible
+ // keyframes that have not yet been made visible using the show-existing-frame
+ // mechanism.
+ int showable;
+ enum PictureFlags flags;
+ // [0] block data (including segmentation map and motion vectors)
+ // [1] pixel data
+ atomic_uint *progress;
+} Dav1dThreadPicture;
+
+typedef struct Dav1dPictureBuffer {
+ void *data;
+ struct Dav1dPictureBuffer *next;
+} Dav1dPictureBuffer;
+
+/*
+ * Allocate a picture with custom border size.
+ */
+int dav1d_thread_picture_alloc(Dav1dContext *c, Dav1dFrameContext *f, const int bpc);
+
+/**
+ * Allocate a picture with identical metadata to an existing picture.
+ * The width is a separate argument so this function can be used for
+ * super-res, where the width changes, but everything else is the same.
+ * For the more typical use case of allocating a new image of the same
+ * dimensions, use src->p.w as width.
+ */
+int dav1d_picture_alloc_copy(Dav1dContext *c, Dav1dPicture *dst, const int w,
+ const Dav1dPicture *src);
+
+/**
+ * Create a copy of a picture.
+ */
+void dav1d_picture_ref(Dav1dPicture *dst, const Dav1dPicture *src);
+void dav1d_thread_picture_ref(Dav1dThreadPicture *dst,
+ const Dav1dThreadPicture *src);
+void dav1d_thread_picture_move_ref(Dav1dThreadPicture *dst,
+ Dav1dThreadPicture *src);
+void dav1d_thread_picture_unref(Dav1dThreadPicture *p);
+
+/**
+ * Move a picture reference.
+ */
+void dav1d_picture_move_ref(Dav1dPicture *dst, Dav1dPicture *src);
+
+int dav1d_default_picture_alloc(Dav1dPicture *p, void *cookie);
+void dav1d_default_picture_release(Dav1dPicture *p, void *cookie);
+void dav1d_picture_unref_internal(Dav1dPicture *p);
+
+struct itut_t35_ctx_context {
+ Dav1dITUTT35 *itut_t35;
+ size_t n_itut_t35;
+ Dav1dRef ref;
+};
+
+void dav1d_picture_free_itut_t35(const uint8_t *data, void *user_data);
+void dav1d_picture_copy_props(Dav1dPicture *p,
+ Dav1dContentLightLevel *content_light, Dav1dRef *content_light_ref,
+ Dav1dMasteringDisplay *mastering_display, Dav1dRef *mastering_display_ref,
+ Dav1dITUTT35 *itut_t35, Dav1dRef *itut_t35_ref, size_t n_itut_t35,
+ const Dav1dDataProps *props);
+
+/**
+ * Get event flags from picture flags.
+ */
+enum Dav1dEventFlags dav1d_picture_get_event_flags(const Dav1dThreadPicture *p);
+
+#endif /* DAV1D_SRC_PICTURE_H */
diff --git a/third_party/dav1d/src/ppc/cdef.h b/third_party/dav1d/src/ppc/cdef.h
new file mode 100644
index 0000000000..b794ba53be
--- /dev/null
+++ b/third_party/dav1d/src/ppc/cdef.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2019, Luca Barbato
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+
+#include "common/bitdepth.h"
+#include "common/intops.h"
+
+#include "src/cdef.h"
+#include "src/cpu.h"
+
+#define cdef_vsx_fn(w, h) \
+void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \
+ const ptrdiff_t dst_stride, \
+ const pixel (*left)[2], \
+ const pixel *const top, \
+ const pixel *const bottom, \
+ const int pri_strength, \
+ const int sec_strength, \
+ const int dir, \
+ const int damping, \
+ const enum CdefEdgeFlags edges)
+
+cdef_vsx_fn(4, 4);
+cdef_vsx_fn(4, 8);
+cdef_vsx_fn(8, 8);
+
+static ALWAYS_INLINE void cdef_dsp_init_ppc(Dav1dCdefDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
+
+#if BITDEPTH == 8
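+ // fb[] is indexed by block layout: [0] 4:4:4/luma (8x8), [1] 4:2:2 (4x8),
+ // [2] 4:2:0 (4x4).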
+ c->fb[0] = dav1d_cdef_filter_8x8_vsx;
+ c->fb[1] = dav1d_cdef_filter_4x8_vsx;
+ c->fb[2] = dav1d_cdef_filter_4x4_vsx;
+#endif
+}
diff --git a/third_party/dav1d/src/ppc/cdef_tmpl.c b/third_party/dav1d/src/ppc/cdef_tmpl.c
new file mode 100644
index 0000000000..e2e759810f
--- /dev/null
+++ b/third_party/dav1d/src/ppc/cdef_tmpl.c
@@ -0,0 +1,487 @@
+/*
+ * Copyright © 2019, Luca Barbato
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/ppc/dav1d_types.h"
+#include "src/ppc/cdef.h"
+
+#if BITDEPTH == 8
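+// Vector equivalent of the CDEF constrain() step: |diff| is limited to
+// max(0, threshold - (|diff| >> (damping - log2(threshold)))) and the sign is
+// restored; a zero threshold disables the tap entirely.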
+static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
+ const int damping)
+{
+ const i16x8 zero = vec_splat_s16(0);
+ if (!threshold) return zero;
+ const uint16_t shift = imax(0, damping - ulog2(threshold));
+ const i16x8 abs_diff = vec_abs(diff);
+ const b16x8 mask = vec_cmplt(diff, zero);
+ const i16x8 thr = vec_splats(threshold);
+ const i16x8 sub = vec_sub(thr, vec_sra(abs_diff, vec_splats(shift)));
+ const i16x8 max = vec_max(zero, sub);
+ const i16x8 min = vec_min(abs_diff, max);
+ const i16x8 neg = vec_sub(zero, min);
+ return vec_sel(min, neg, mask);
+}
+
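+// copy4xN() and copy8xN() copy the source block plus two guard rows above and
+// below and two guard columns left and right into the 16-bit temporary buffer
+// (stride 8 resp. 16). Positions that are unavailable according to `edges`
+// are filled with INT16_MAX so that max_mask() can ignore them; the rest come
+// from `top`, `bottom`, `left` and the block itself.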
+static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
+ const uint8_t *src, const ptrdiff_t src_stride,
+ const uint8_t (*left)[2], const uint8_t *const top,
+ const uint8_t *const bottom, const int w, const int h,
+ const enum CdefEdgeFlags edges)
+{
+ const u16x8 fill = vec_splats((uint16_t)INT16_MAX);
+
+ u16x8 l0;
+ u16x8 l1;
+
+ int y_start = -2, y_end = h + 2;
+
+ // Copy top and bottom first
+ if (!(edges & CDEF_HAVE_TOP)) {
+ l0 = fill;
+ l1 = fill;
+ y_start = 0;
+ } else {
+ l0 = u8h_to_u16(vec_vsx_ld(0, top + 0 * src_stride - 2));
+ l1 = u8h_to_u16(vec_vsx_ld(0, top + 1 * src_stride - 2));
+ }
+
+ vec_st(l0, 0, tmp - 2 * 8);
+ vec_st(l1, 0, tmp - 1 * 8);
+
+ if (!(edges & CDEF_HAVE_BOTTOM)) {
+ l0 = fill;
+ l1 = fill;
+ y_end -= 2;
+ } else {
+ l0 = u8h_to_u16(vec_vsx_ld(0, bottom + 0 * src_stride - 2));
+ l1 = u8h_to_u16(vec_vsx_ld(0, bottom + 1 * src_stride - 2));
+ }
+
+ vec_st(l0, 0, tmp + (h + 0) * 8);
+ vec_st(l1, 0, tmp + (h + 1) * 8);
+
+ int y_with_left_edge = 0;
+ if (!(edges & CDEF_HAVE_LEFT)) {
+ u16x8 l = u8h_to_u16(vec_vsx_ld(0, src));
+ vec_vsx_st(l, 0, tmp + 2);
+
+ y_with_left_edge = 1;
+ }
+
+ for (int y = y_with_left_edge; y < h; y++) {
+ u16x8 l = u8h_to_u16(vec_vsx_ld(0, src - 2 + y * src_stride));
+ vec_st(l, 0, tmp + y * 8);
+ }
+
+ if (!(edges & CDEF_HAVE_LEFT)) {
+ for (int y = y_start; y < y_end; y++) {
+ tmp[y * 8] = INT16_MAX;
+ tmp[1 + y * 8] = INT16_MAX;
+ }
+ } else {
+ for (int y = 0; y < h; y++) {
+ tmp[y * 8] = left[y][0];
+ tmp[1 + y * 8] = left[y][1];
+ }
+ }
+ if (!(edges & CDEF_HAVE_RIGHT)) {
+ for (int y = y_start; y < y_end; y++) {
+ tmp[- 2 + (y + 1) * 8] = INT16_MAX;
+ tmp[- 1 + (y + 1) * 8] = INT16_MAX;
+ }
+ }
+}
+
+static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
+ const uint8_t *src, const ptrdiff_t src_stride,
+ const uint8_t (*left)[2], const uint8_t *const top,
+ const uint8_t *const bottom, const int w, const int h,
+ const enum CdefEdgeFlags edges)
+{
+ const u16x8 fill = vec_splats((uint16_t)INT16_MAX);
+
+ u16x8 l0h, l0l;
+ u16x8 l1h, l1l;
+
+ int y_start = -2, y_end = h + 2;
+
+ // Copy top and bottom first
+ if (!(edges & CDEF_HAVE_TOP)) {
+ l0h = fill;
+ l0l = fill;
+ l1h = fill;
+ l1l = fill;
+ y_start = 0;
+ } else {
+ u8x16 l0 = vec_vsx_ld(0, top + 0 * src_stride - 2);
+ u8x16 l1 = vec_vsx_ld(0, top + 1 * src_stride - 2);
+ l0h = u8h_to_u16(l0);
+ l0l = u8l_to_u16(l0);
+ l1h = u8h_to_u16(l1);
+ l1l = u8l_to_u16(l1);
+ }
+
+ vec_st(l0h, 0, tmp - 4 * 8);
+ vec_st(l0l, 0, tmp - 3 * 8);
+ vec_st(l1h, 0, tmp - 2 * 8);
+ vec_st(l1l, 0, tmp - 1 * 8);
+
+ if (!(edges & CDEF_HAVE_BOTTOM)) {
+ l0h = fill;
+ l0l = fill;
+ l1h = fill;
+ l1l = fill;
+ y_end -= 2;
+ } else {
+ u8x16 l0 = vec_vsx_ld(0, bottom + 0 * src_stride - 2);
+ u8x16 l1 = vec_vsx_ld(0, bottom + 1 * src_stride - 2);
+ l0h = u8h_to_u16(l0);
+ l0l = u8l_to_u16(l0);
+ l1h = u8h_to_u16(l1);
+ l1l = u8l_to_u16(l1);
+ }
+
+ vec_st(l0h, 0, tmp + (h + 0) * 16);
+ vec_st(l0l, 0, tmp + (h + 0) * 16 + 8);
+ vec_st(l1h, 0, tmp + (h + 1) * 16);
+ vec_st(l1l, 0, tmp + (h + 1) * 16 + 8);
+
+ int y_with_left_edge = 0;
+ if (!(edges & CDEF_HAVE_LEFT)) {
+ u8x16 l = vec_vsx_ld(0, src);
+ u16x8 lh = u8h_to_u16(l);
+ u16x8 ll = u8l_to_u16(l);
+ vec_vsx_st(lh, 0, tmp + 2);
+ vec_vsx_st(ll, 0, tmp + 8 + 2);
+
+ y_with_left_edge = 1;
+ }
+
+ for (int y = y_with_left_edge; y < h; y++) {
+ u8x16 l = vec_vsx_ld(0, src - 2 + y * src_stride);
+ u16x8 lh = u8h_to_u16(l);
+ u16x8 ll = u8l_to_u16(l);
+ vec_st(lh, 0, tmp + y * 16);
+ vec_st(ll, 0, tmp + 8 + y * 16);
+ }
+
+ if (!(edges & CDEF_HAVE_LEFT)) {
+ for (int y = y_start; y < y_end; y++) {
+ tmp[y * 16] = INT16_MAX;
+ tmp[1 + y * 16] = INT16_MAX;
+ }
+ } else {
+ for (int y = 0; y < h; y++) {
+ tmp[y * 16] = left[y][0];
+ tmp[1 + y * 16] = left[y][1];
+ }
+ }
+ if (!(edges & CDEF_HAVE_RIGHT)) {
+ for (int y = y_start; y < y_end; y++) {
+ tmp[- 6 + (y + 1) * 16] = INT16_MAX;
+ tmp[- 5 + (y + 1) * 16] = INT16_MAX;
+ }
+ }
+}
+
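+// Unavailable positions were filled with INT16_MAX by the copy functions
+// above; substitute the running maximum for those lanes so that padding never
+// widens the clamping range.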
+static inline i16x8 max_mask(i16x8 a, i16x8 b) {
+ const i16x8 I16X8_INT16_MAX = vec_splats((int16_t)INT16_MAX);
+
+ const b16x8 mask = vec_cmpeq(a, I16X8_INT16_MAX);
+
+ const i16x8 val = vec_sel(a, b, mask);
+
+ return vec_max(val, b);
+}
+
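+// Helper macros shared by filter_4xN/filter_8xN below:
+//   LOAD_PIX/LOAD_PIX4  load the centre pixels (the 4xN variant packs 2 rows),
+//   LOAD_DIR/LOAD_DIR4  gather the four taps at +/-o0 and +/-o1 of a direction,
+//   CONSTRAIN           applies vconstrain() to each tap difference,
+//   MIN_MAX             tracks the clamping range (max_mask() skips padding),
+//   PRI_0/PRI_1/SEC_0   scale the constrained taps by their weights,
+//   UPDATE_SUM          accumulates the weighted taps into sum.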
+#define LOAD_PIX(addr) \
+ const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \
+ i16x8 max = px; \
+ i16x8 min = px; \
+ i16x8 sum = vec_splat_s16(0);
+
+#define LOAD_PIX4(addr) \
+ const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \
+ const i16x8 b = (i16x8)vec_vsx_ld(0, addr + tmp_stride); \
+ const i16x8 px = vec_xxpermdi(a, b, 0); \
+ i16x8 max = px; \
+ i16x8 min = px; \
+ i16x8 sum = vec_splat_s16(0);
+
+#define LOAD_DIR(p, addr, o0, o1) \
+ const i16x8 p ## 0 = (i16x8)vec_vsx_ld(0, addr + o0); \
+ const i16x8 p ## 1 = (i16x8)vec_vsx_ld(0, addr - o0); \
+ const i16x8 p ## 2 = (i16x8)vec_vsx_ld(0, addr + o1); \
+ const i16x8 p ## 3 = (i16x8)vec_vsx_ld(0, addr - o1);
+
+#define LOAD_DIR4(p, addr, o0, o1) \
+ LOAD_DIR(p ## a, addr, o0, o1) \
+ LOAD_DIR(p ## b, addr + tmp_stride, o0, o1) \
+ const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \
+ const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \
+ const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \
+ const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0);
+
+#define CONSTRAIN(p, strength) \
+ const i16x8 p ## _d0 = vec_sub(p ## 0, px); \
+ const i16x8 p ## _d1 = vec_sub(p ## 1, px); \
+ const i16x8 p ## _d2 = vec_sub(p ## 2, px); \
+ const i16x8 p ## _d3 = vec_sub(p ## 3, px); \
+\
+ i16x8 p ## _c0 = vconstrain(p ## _d0, strength, damping); \
+ i16x8 p ## _c1 = vconstrain(p ## _d1, strength, damping); \
+ i16x8 p ## _c2 = vconstrain(p ## _d2, strength, damping); \
+ i16x8 p ## _c3 = vconstrain(p ## _d3, strength, damping);
+
+#define MIN_MAX(p) \
+ max = max_mask(p ## 0, max); \
+ min = vec_min(p ## 0, min); \
+ max = max_mask(p ## 1, max); \
+ min = vec_min(p ## 1, min); \
+ max = max_mask(p ## 2, max); \
+ min = vec_min(p ## 2, min); \
+ max = max_mask(p ## 3, max); \
+ min = vec_min(p ## 3, min);
+
+#define PRI_0(p) \
+ p ## _c0 = vec_add(vec_sl(p ## _c0, vec_splat_u16(1)), vec_sl(p ## _c0, vec_splats(tap_even))); \
+ p ## _c1 = vec_add(vec_sl(p ## _c1, vec_splat_u16(1)), vec_sl(p ## _c1, vec_splats(tap_even)));
+
+#define PRI_1(p) \
+ p ## _c2 = vec_sub(vec_sl(p ## _c2, vec_splat_u16(2)), vec_sl(p ## _c2, vec_splats(tap_even))); \
+ p ## _c3 = vec_sub(vec_sl(p ## _c3, vec_splat_u16(2)), vec_sl(p ## _c3, vec_splats(tap_even)));
+
+#define SEC_0(p) \
+ p ## _c0 = vec_sl(p ## _c0, vec_splat_u16(1)); \
+ p ## _c1 = vec_sl(p ## _c1, vec_splat_u16(1)); \
+ p ## _c2 = vec_sl(p ## _c2, vec_splat_u16(1)); \
+ p ## _c3 = vec_sl(p ## _c3, vec_splat_u16(1));
+
+#define UPDATE_SUM(p) \
+ const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \
+ const i16x8 p ## sum1 = vec_add(p ## _c2, p ## _c3); \
+ sum = vec_add(sum, p ## sum0); \
+ sum = vec_add(sum, p ## sum1);
+
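+// Filter one 4-wide block: two rows are processed per iteration (packed into
+// a single vector with vec_xxpermdi). For every pixel the primary taps are
+// constrained and weighted (4, 2) for even adjusted strengths, (3, 3) for odd
+// ones (via tap_even), the two secondary tap pairs are weighted 2 and 1, and
+// the output px + ((sum + 8 - (sum < 0)) >> 4) is clamped to the [min, max]
+// range of the centre pixel and its taps. filter_8xN below is identical but
+// handles one full 8-wide row per iteration.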
+static inline void
+filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel (*left)[2], const pixel *const top,
+ const pixel *const bottom, const int w, const int h,
+ const int pri_strength, const int sec_strength, const int dir,
+ const int damping, const enum CdefEdgeFlags edges,
+ const ptrdiff_t tmp_stride, uint16_t *tmp)
+{
+ const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
+ { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, -1 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, 0 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, 1 * tmp_stride + 2 },
+ { 1 * tmp_stride + 1, 2 * tmp_stride + 2 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride + 1 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride + 0 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride - 1 }
+ };
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
+ const int off1 = cdef_directions[dir][0];
+ const int off1_1 = cdef_directions[dir][1];
+
+ const int off2 = cdef_directions[(dir + 2) & 7][0];
+ const int off3 = cdef_directions[(dir + 6) & 7][0];
+
+ const int off2_1 = cdef_directions[(dir + 2) & 7][1];
+ const int off3_1 = cdef_directions[(dir + 6) & 7][1];
+
+ copy4xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+
+ for (int y = 0; y < h / 2; y++) {
+ LOAD_PIX4(tmp)
+
+ // Primary pass
+ LOAD_DIR4(p, tmp, off1, off1_1)
+
+ CONSTRAIN(p, pri_strength)
+
+ MIN_MAX(p)
+
+ PRI_0(p)
+ PRI_1(p)
+
+ UPDATE_SUM(p)
+
+ // Secondary pass 1
+ LOAD_DIR4(s, tmp, off2, off3)
+
+ CONSTRAIN(s, sec_strength)
+
+ MIN_MAX(s)
+
+ SEC_0(s)
+
+ UPDATE_SUM(s)
+
+ // Secondary pass 2
+ LOAD_DIR4(s2, tmp, off2_1, off3_1)
+
+ CONSTRAIN(s2, sec_strength)
+
+ MIN_MAX(s2)
+
+ UPDATE_SUM(s2)
+
+ // Store
+ i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
+ bias = vec_sub(vec_splat_s16(8), bias);
+ i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
+ i16x8 vdst = vec_max(vec_min(unclamped, max), min);
+
+ dst[0] = vdst[0];
+ dst[1] = vdst[1];
+ dst[2] = vdst[2];
+ dst[3] = vdst[3];
+
+ tmp += tmp_stride;
+ dst += PXSTRIDE(dst_stride);
+ dst[0] = vdst[4];
+ dst[1] = vdst[5];
+ dst[2] = vdst[6];
+ dst[3] = vdst[7];
+
+ tmp += tmp_stride;
+ dst += PXSTRIDE(dst_stride);
+ }
+}
+
+static inline void
+filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel (*left)[2], const pixel *const top,
+ const pixel *const bottom, const int w, const int h,
+ const int pri_strength, const int sec_strength, const int dir,
+ const int damping, const enum CdefEdgeFlags edges,
+ const ptrdiff_t tmp_stride, uint16_t *tmp)
+{
+ const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
+ { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, -1 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, 0 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, 1 * tmp_stride + 2 },
+ { 1 * tmp_stride + 1, 2 * tmp_stride + 2 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride + 1 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride + 0 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride - 1 }
+ };
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+
+ const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
+ const int off1 = cdef_directions[dir][0];
+ const int off1_1 = cdef_directions[dir][1];
+
+ const int off2 = cdef_directions[(dir + 2) & 7][0];
+ const int off3 = cdef_directions[(dir + 6) & 7][0];
+
+ const int off2_1 = cdef_directions[(dir + 2) & 7][1];
+ const int off3_1 = cdef_directions[(dir + 6) & 7][1];
+
+ copy8xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+
+ for (int y = 0; y < h; y++) {
+ LOAD_PIX(tmp)
+
+ // Primary pass
+ LOAD_DIR(p, tmp, off1, off1_1)
+
+ CONSTRAIN(p, pri_strength)
+
+ MIN_MAX(p)
+
+ PRI_0(p)
+ PRI_1(p)
+
+ UPDATE_SUM(p)
+
+ // Secondary pass 1
+ LOAD_DIR(s, tmp, off2, off3)
+
+ CONSTRAIN(s, sec_strength)
+
+ MIN_MAX(s)
+
+ SEC_0(s)
+
+ UPDATE_SUM(s)
+
+ // Secondary pass 2
+ LOAD_DIR(s2, tmp, off2_1, off3_1)
+
+ CONSTRAIN(s2, sec_strength)
+
+ MIN_MAX(s2)
+
+ UPDATE_SUM(s2)
+
+ // Store
+ i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
+ bias = vec_sub(vec_splat_s16(8), bias);
+ i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
+ i16x8 vdst = vec_max(vec_min(unclamped, max), min);
+
+ dst[0] = vdst[0];
+ dst[1] = vdst[1];
+ dst[2] = vdst[2];
+ dst[3] = vdst[3];
+ dst[4] = vdst[4];
+ dst[5] = vdst[5];
+ dst[6] = vdst[6];
+ dst[7] = vdst[7];
+
+ tmp += tmp_stride;
+ dst += PXSTRIDE(dst_stride);
+ }
+
+}
+
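+// Entry points: reserve the padded 16-bit temporary block on the stack (two
+// guard rows above and two guard columns to the left account for the
+// + 2 * tmp_stride + 2 offset) and dispatch to filter_4xN / filter_8xN.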
+#define cdef_fn(w, h, tmp_stride) \
+void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \
+ const ptrdiff_t dst_stride, \
+ const pixel (*left)[2], \
+ const pixel *const top, \
+ const pixel *const bottom, \
+ const int pri_strength, \
+ const int sec_strength, \
+ const int dir, \
+ const int damping, \
+ const enum CdefEdgeFlags edges) \
+{ \
+ ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
+ uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \
+ filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
+ sec_strength, dir, damping, edges, tmp_stride, tmp); \
+}
+
+cdef_fn(4, 4, 8);
+cdef_fn(4, 8, 8);
+cdef_fn(8, 8, 16);
+#endif
diff --git a/third_party/dav1d/src/ppc/cpu.c b/third_party/dav1d/src/ppc/cpu.c
new file mode 100644
index 0000000000..fe77057c57
--- /dev/null
+++ b/third_party/dav1d/src/ppc/cpu.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/attributes.h"
+
+#include "src/ppc/cpu.h"
+
+#if (defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)) && ARCH_PPC64LE
+#include <sys/auxv.h>
+#define HAVE_AUX
+#endif
+
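+// Query the AT_HWCAP auxiliary vector (getauxval() on Linux, elf_aux_info()
+// on FreeBSD); VSX is the only flag reported at the moment.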
+COLD unsigned dav1d_get_cpu_flags_ppc(void) {
+ unsigned flags = 0;
+#if defined(HAVE_GETAUXVAL) && ARCH_PPC64LE
+ unsigned long hw_cap = getauxval(AT_HWCAP);
+#elif defined(HAVE_ELF_AUX_INFO) && ARCH_PPC64LE
+ unsigned long hw_cap = 0;
+ elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+#endif
+#ifdef HAVE_AUX
+ flags |= (hw_cap & PPC_FEATURE_HAS_VSX) ? DAV1D_PPC_CPU_FLAG_VSX : 0;
+#endif
+ return flags;
+}
diff --git a/third_party/dav1d/src/ppc/cpu.h b/third_party/dav1d/src/ppc/cpu.h
new file mode 100644
index 0000000000..cfd2ff4ff5
--- /dev/null
+++ b/third_party/dav1d/src/ppc/cpu.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_PPC_CPU_H
+#define DAV1D_SRC_PPC_CPU_H
+
+enum CpuFlags {
+ DAV1D_PPC_CPU_FLAG_VSX = 1 << 0,
+};
+
+unsigned dav1d_get_cpu_flags_ppc(void);
+
+#endif /* DAV1D_SRC_PPC_CPU_H */
diff --git a/third_party/dav1d/src/ppc/dav1d_types.h b/third_party/dav1d/src/ppc/dav1d_types.h
new file mode 100644
index 0000000000..0b4bd72f0e
--- /dev/null
+++ b/third_party/dav1d/src/ppc/dav1d_types.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Luca Barbato
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_PPC_TYPES_H
+#define DAV1D_SRC_PPC_TYPES_H
+
+#include <altivec.h>
+#undef pixel
+
+#define u8x16 vector unsigned char
+#define i8x16 vector signed char
+#define b8x16 vector bool char
+#define u16x8 vector unsigned short
+#define i16x8 vector signed short
+#define b16x8 vector bool short
+#define u32x4 vector unsigned int
+#define i32x4 vector signed int
+#define b32x4 vector bool int
+#define u64x2 vector unsigned long long
+#define i64x2 vector signed long long
+#define b64x2 vector bool long long
+
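+// Widening helpers: the u8 variants zero-extend one half of a u8x16 into a
+// u16x8 by merging with a zero vector; the 16-to-32 variants widen half of a
+// vector to 32-bit lanes, unsigned via merge-with-zero and signed via
+// sign-extending unpacks.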
+#define u8h_to_u16(v) ((u16x8) vec_mergeh((u8x16) v, vec_splat_u8(0)))
+#define u8l_to_u16(v) ((u16x8) vec_mergel((u8x16) v, vec_splat_u8(0)))
+#define u16h_to_i32(v) ((i32x4) vec_mergeh((u16x8) v, vec_splat_u16(0)))
+#define i16h_to_i32(v) ((i32x4) vec_unpackh((i16x8)v))
+#define u16l_to_i32(v) ((i32x4) vec_mergel((u16x8) v, vec_splat_u16(0)))
+#define i16l_to_i32(v) ((i32x4) vec_unpackl((i16x8)v))
+
+#endif /* DAV1D_SRC_PPC_TYPES_H */
diff --git a/third_party/dav1d/src/ppc/looprestoration.h b/third_party/dav1d/src/ppc/looprestoration.h
new file mode 100644
index 0000000000..3fe16318bd
--- /dev/null
+++ b/third_party/dav1d/src/ppc/looprestoration.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Michail Alvanos
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "common/intops.h"
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+void dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride,
+ const uint8_t (*const left)[4],
+ const uint8_t *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
+
+static ALWAYS_INLINE void loop_restoration_dsp_init_ppc(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
+
+#if BITDEPTH == 8
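+ // The same VSX routine serves both Wiener variants (7-tap and 5-tap).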
+ c->wiener[0] = c->wiener[1] = dav1d_wiener_filter_vsx;
+#endif
+}
diff --git a/third_party/dav1d/src/ppc/looprestoration_tmpl.c b/third_party/dav1d/src/ppc/looprestoration_tmpl.c
new file mode 100644
index 0000000000..c0c64e1800
--- /dev/null
+++ b/third_party/dav1d/src/ppc/looprestoration_tmpl.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Michail Alvanos
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/ppc/dav1d_types.h"
+#include "src/ppc/looprestoration.h"
+
+#if BITDEPTH == 8
+
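+// Stride of the on-stack temporary buffers: wide enough for a full
+// restoration unit plus 3 pixels of padding on each side.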
+#define REST_UNIT_STRIDE (400)
+
+static inline i32x4 iclip_vec(i32x4 v, const i32x4 minv, const i32x4 maxv) {
+ v = vec_max(minv, v);
+ v = vec_min(maxv, v);
+ return v;
+}
+
+#define APPLY_FILTER_H(v, f, ssum1, ssum2) do { \
+ i16x8 ktmp_u16_high = (i16x8) u8h_to_u16(v); \
+ i16x8 ktmp_u16_low = (i16x8) u8l_to_u16(v); \
+ ssum1 = vec_madd(ktmp_u16_high, f, ssum1); \
+ ssum2 = vec_madd(ktmp_u16_low, f, ssum2); \
+} while (0)
+
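+// Horizontal 7-tap pass over the padded copy. The seven filterh taps are
+// multiply-accumulated in 16 bits and the window's centre pixel is added
+// again with weight 128 (the << 7 term; the 8 bpc centre coefficient is
+// stored with that bias removed so the 16-bit products cannot overflow).
+// The 1 << 14 offset keeps the sum non-negative before it is rounded by
+// 3 bits and clipped to the 13-bit intermediate range.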
+static void wiener_filter_h_vsx(int32_t *hor_ptr,
+ uint8_t *tmp_ptr,
+ const int16_t filterh[8],
+ const int w, const int h)
+{
+ const i32x4 zerov = vec_splats(0);
+ const i32x4 seven_vec = vec_splats(7);
+ const i32x4 bitdepth_added_vec = vec_splats(1 << 14);
+ const i32x4 round_bits_vec = vec_splats(3);
+ const i32x4 rounding_off_vec = vec_splats(1<<2);
+ const i32x4 clip_limit_v = vec_splats((1 << 13) - 1);
+
+ i16x8 filterhvall = vec_vsx_ld(0, filterh);
+ i16x8 filterhv0 = vec_splat( filterhvall, 0);
+ i16x8 filterhv1 = vec_splat( filterhvall, 1);
+ i16x8 filterhv2 = vec_splat( filterhvall, 2);
+ i16x8 filterhv3 = vec_splat( filterhvall, 3);
+ i16x8 filterhv4 = vec_splat( filterhvall, 4);
+ i16x8 filterhv5 = vec_splat( filterhvall, 5);
+ i16x8 filterhv6 = vec_splat( filterhvall, 6);
+
+ for (int j = 0; j < h + 6; j++) {
+ for (int i = 0; i < w; i+=16) {
+ i32x4 sum1 = bitdepth_added_vec;
+ i32x4 sum2 = bitdepth_added_vec;
+ i32x4 sum3 = bitdepth_added_vec;
+ i32x4 sum4 = bitdepth_added_vec;
+
+ u8x16 tmp_v0 = vec_ld(0, &tmp_ptr[i]);
+ u8x16 tmp_v7 = vec_ld(0, &tmp_ptr[i+16]);
+
+ u8x16 tmp_v1 = vec_sld( tmp_v7, tmp_v0, 15);
+ u8x16 tmp_v2 = vec_sld( tmp_v7, tmp_v0, 14);
+ u8x16 tmp_v3 = vec_sld( tmp_v7, tmp_v0, 13);
+ u8x16 tmp_v4 = vec_sld( tmp_v7, tmp_v0, 12);
+ u8x16 tmp_v5 = vec_sld( tmp_v7, tmp_v0, 11);
+ u8x16 tmp_v6 = vec_sld( tmp_v7, tmp_v0, 10);
+
+ u16x8 tmp_u16_high = u8h_to_u16(tmp_v3);
+ u16x8 tmp_u16_low = u8l_to_u16(tmp_v3);
+
+ i32x4 tmp_expanded1 = i16h_to_i32(tmp_u16_high);
+ i32x4 tmp_expanded2 = i16l_to_i32(tmp_u16_high);
+ i32x4 tmp_expanded3 = i16h_to_i32(tmp_u16_low);
+ i32x4 tmp_expanded4 = i16l_to_i32(tmp_u16_low);
+
+ i16x8 ssum1 = (i16x8) zerov;
+ i16x8 ssum2 = (i16x8) zerov;
+
+ APPLY_FILTER_H(tmp_v0, filterhv0, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v1, filterhv1, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v2, filterhv2, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v3, filterhv3, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v4, filterhv4, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v5, filterhv5, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v6, filterhv6, ssum1, ssum2);
+
+ sum1 += i16h_to_i32(ssum1) + (tmp_expanded1 << seven_vec);
+ sum2 += i16l_to_i32(ssum1) + (tmp_expanded2 << seven_vec);
+ sum3 += i16h_to_i32(ssum2) + (tmp_expanded3 << seven_vec);
+ sum4 += i16l_to_i32(ssum2) + (tmp_expanded4 << seven_vec);
+
+ sum1 = (sum1 + rounding_off_vec) >> round_bits_vec;
+ sum2 = (sum2 + rounding_off_vec) >> round_bits_vec;
+ sum3 = (sum3 + rounding_off_vec) >> round_bits_vec;
+ sum4 = (sum4 + rounding_off_vec) >> round_bits_vec;
+
+ sum1 = iclip_vec(sum1, zerov, clip_limit_v);
+ sum2 = iclip_vec(sum2, zerov, clip_limit_v);
+ sum3 = iclip_vec(sum3, zerov, clip_limit_v);
+ sum4 = iclip_vec(sum4, zerov, clip_limit_v);
+
+ vec_st(sum1, 0, &hor_ptr[i]);
+ vec_st(sum2, 16, &hor_ptr[i]);
+ vec_st(sum3, 32, &hor_ptr[i]);
+ vec_st(sum4, 48, &hor_ptr[i]);
+ }
+ tmp_ptr += REST_UNIT_STRIDE;
+ hor_ptr += REST_UNIT_STRIDE;
+ }
+}
+
+static inline i16x8 iclip_u8_vec(i16x8 v) {
+ const i16x8 zerov = vec_splats((int16_t)0);
+ const i16x8 maxv = vec_splats((int16_t)255);
+ v = vec_max(zerov, v);
+ v = vec_min(maxv, v);
+ return v;
+}
+
+#define APPLY_FILTER_V(index, f) do { \
+ i32x4 v1 = vec_ld( 0, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
+ i32x4 v2 = vec_ld(16, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
+ i32x4 v3 = vec_ld(32, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
+ i32x4 v4 = vec_ld(48, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
+ sum1 = sum1 + v1 * f; \
+ sum2 = sum2 + v2 * f; \
+ sum3 = sum3 + v3 * f; \
+ sum4 = sum4 + v4 * f; \
+} while (0)
+
+#define LOAD_AND_APPLY_FILTER_V(sumpixelv, hor) do { \
+ i32x4 sum1 = round_vec; \
+ i32x4 sum2 = round_vec; \
+ i32x4 sum3 = round_vec; \
+ i32x4 sum4 = round_vec; \
+ APPLY_FILTER_V(0, filterv0); \
+ APPLY_FILTER_V(1, filterv1); \
+ APPLY_FILTER_V(2, filterv2); \
+ APPLY_FILTER_V(3, filterv3); \
+ APPLY_FILTER_V(4, filterv4); \
+ APPLY_FILTER_V(5, filterv5); \
+ APPLY_FILTER_V(6, filterv6); \
+ sum1 = sum1 >> round_bits_vec; \
+ sum2 = sum2 >> round_bits_vec; \
+ sum3 = sum3 >> round_bits_vec; \
+ sum4 = sum4 >> round_bits_vec; \
+ i16x8 sum_short_packed_1 = (i16x8) vec_pack(sum1, sum2); \
+ i16x8 sum_short_packed_2 = (i16x8) vec_pack(sum3, sum4); \
+ sum_short_packed_1 = iclip_u8_vec(sum_short_packed_1); \
+ sum_short_packed_2 = iclip_u8_vec(sum_short_packed_2); \
+ sum_pixel = (u8x16) vec_pack(sum_short_packed_1, sum_short_packed_2); \
+} while (0)
+
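+// Vertical 7-tap pass with 32-bit accumulation over the intermediate rows.
+// The accumulator starts at (1 << 10) - (1 << 18): 1 << 10 is the rounding
+// constant for the 11-bit shift and -(1 << 18) cancels the DC offset the
+// horizontal pass left in the intermediates. The results are packed and
+// clamped to [0, 255]; widths that are not a multiple of 16 take a scalar
+// tail through a small stack buffer.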
+static inline void wiener_filter_v_vsx(uint8_t *p,
+ const ptrdiff_t stride,
+ const int32_t *hor,
+ const int16_t filterv[8],
+ const int w, const int h)
+{
+ const i32x4 round_bits_vec = vec_splats(11);
+ const i32x4 round_vec = vec_splats((1 << 10) - (1 << 18));
+
+ i32x4 filterv0 = vec_splats((int32_t) filterv[0]);
+ i32x4 filterv1 = vec_splats((int32_t) filterv[1]);
+ i32x4 filterv2 = vec_splats((int32_t) filterv[2]);
+ i32x4 filterv3 = vec_splats((int32_t) filterv[3]);
+ i32x4 filterv4 = vec_splats((int32_t) filterv[4]);
+ i32x4 filterv5 = vec_splats((int32_t) filterv[5]);
+ i32x4 filterv6 = vec_splats((int32_t) filterv[6]);
+
+ for (int j = 0; j < h; j++) {
+ for (int i = 0; i < (w - w % 16); i += 16) {
+ u8x16 sum_pixel;
+ LOAD_AND_APPLY_FILTER_V(sum_pixel, hor);
+ vec_vsx_st(sum_pixel, 0, &p[j * PXSTRIDE(stride) + i]);
+ }
+ // Scalar tail: store the remaining (w % 16) pixels through a small stack buffer
+ if (w & 0xf) {
+ int i = w - w % 16;
+ ALIGN_STK_16(uint8_t, tmp_out, 16,);
+ u8x16 sum_pixel;
+
+ LOAD_AND_APPLY_FILTER_V(sum_pixel, hor);
+ vec_vsx_st(sum_pixel, 0, tmp_out);
+
+ for (int k = 0; i < w; i++, k++) {
+ p[j * PXSTRIDE(stride) + i] = tmp_out[k];
+ }
+ }
+ }
+}
+
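+// Build the (w + 6) x (stripe_h + 6) working copy at REST_UNIT_STRIDE: three
+// extra rows above and below and three extra columns left and right, sourced
+// from the previously loop-filtered rows (lpf), the saved left columns, or
+// replicated from the nearest available edge when `edges` says a border is
+// missing.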
+static inline void padding(uint8_t *dst, const uint8_t *p,
+ const ptrdiff_t stride, const uint8_t (*left)[4],
+ const uint8_t *lpf, int unit_w, const int stripe_h,
+ const enum LrEdgeFlags edges)
+{
+ const int have_left = !!(edges & LR_HAVE_LEFT);
+ const int have_right = !!(edges & LR_HAVE_RIGHT);
+
+ // Copy more pixels if we don't have to pad them
+ unit_w += 3 * have_left + 3 * have_right;
+ uint8_t *dst_l = dst + 3 * !have_left;
+ p -= 3 * have_left;
+ lpf -= 3 * have_left;
+
+ if (edges & LR_HAVE_TOP) {
+ // Copy previous loop filtered rows
+ const uint8_t *const above_1 = lpf;
+ const uint8_t *const above_2 = above_1 + PXSTRIDE(stride);
+ pixel_copy(dst_l, above_1, unit_w);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
+ } else {
+ // Pad with first row
+ pixel_copy(dst_l, p, unit_w);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
+ if (have_left) {
+ pixel_copy(dst_l, &left[0][1], 3);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
+ }
+ }
+
+ uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
+ if (edges & LR_HAVE_BOTTOM) {
+ // Copy next loop filtered rows
+ const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(stride);
+ const uint8_t *const below_2 = below_1 + PXSTRIDE(stride);
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
+ } else {
+ // Pad with last row
+ const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(stride);
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
+ if (have_left) {
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ }
+ }
+
+ // Inner UNIT_WxSTRIPE_H
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
+ dst_tl += REST_UNIT_STRIDE;
+ p += PXSTRIDE(stride);
+ }
+
+ if (!have_right) {
+ uint8_t *pad = dst_l + unit_w;
+ uint8_t *row_last = &dst_l[unit_w - 1];
+ // Pad 3x(STRIPE_H+6) with last column
+ for (int j = 0; j < stripe_h + 6; j++) {
+ pixel_set(pad, *row_last, 3);
+ pad += REST_UNIT_STRIDE;
+ row_last += REST_UNIT_STRIDE;
+ }
+ }
+
+ if (!have_left) {
+ // Pad 3x(STRIPE_H+6) with first column
+ for (int j = 0; j < stripe_h + 6; j++) {
+ pixel_set(dst, *dst_l, 3);
+ dst += REST_UNIT_STRIDE;
+ dst_l += REST_UNIT_STRIDE;
+ }
+ } else {
+ dst += 3 * REST_UNIT_STRIDE;
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst, &left[j][1], 3);
+ dst += REST_UNIT_STRIDE;
+ }
+ }
+}
+
+// FIXME Could split into luma and chroma specific functions,
+// (since first and last tops are always 0 for chroma)
+// FIXME Could implement a version that requires less temporary memory
+// (should be possible to implement with only 6 rows of temp storage)
+void dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride,
+ const uint8_t (*const left)[4],
+ const uint8_t *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ const int16_t (*const filter)[8] = params->filter;
+
+ // Wiener filtering is applied to a maximum stripe height of 64 rows, plus
+ // 3 rows of padding above and below
+ ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
+ padding(tmp, p, stride, left, lpf, w, h, edges);
+ ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);
+
+ wiener_filter_h_vsx(hor, tmp, filter[0], w, h);
+ wiener_filter_v_vsx(p, stride, hor, filter[1], w, h);
+}
+#endif
diff --git a/third_party/dav1d/src/qm.c b/third_party/dav1d/src/qm.c
new file mode 100644
index 0000000000..a523da5e4d
--- /dev/null
+++ b/third_party/dav1d/src/qm.c
@@ -0,0 +1,1693 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/attributes.h"
+
+#include "src/qm.h"
+
+static const uint8_t qm_tbl_32x16[][2][512] = {
+ {
+ {
+ 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122,
+ 31, 32, 32, 32, 32, 33, 34, 34, 34, 37, 41, 43, 45, 49, 54, 57, 60, 65, 72, 74, 75, 80, 83, 85, 88, 91, 94, 97, 101, 104, 108, 111,
+ 32, 32, 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111,
+ 34, 34, 33, 34, 35, 37, 39, 41, 43, 45, 48, 49, 51, 54, 58, 60, 63, 68, 74, 75, 76, 80, 81, 82, 85, 87, 90, 93, 97, 100, 103, 107,
+ 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100, 101, 103, 105, 107,
+ 44, 42, 41, 41, 42, 42, 48, 50, 54, 58, 63, 65, 67, 71, 75, 77, 79, 84, 90, 91, 92, 97, 100, 100, 100, 100, 101, 104, 108, 112, 115, 119,
+ 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, 117, 118, 119,
+ 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, 92, 95, 98, 103, 110, 112, 113, 115, 114, 118, 123, 121, 120, 119, 123, 127, 131, 136,
+ 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136, 138, 137, 136, 136,
+ 79, 75, 72, 71, 71, 69, 73, 76, 78, 84, 90, 93, 96, 103, 110, 114, 118, 125, 133, 135, 136, 142, 142, 137, 140, 145, 144, 142, 141, 146, 151, 156,
+ 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152, 150, 155, 161, 159, 157, 156,
+ 90, 85, 82, 81, 80, 78, 78, 83, 87, 89, 93, 100, 102, 107, 115, 118, 123, 132, 136, 140, 151, 153, 155, 160, 161, 164, 170, 168, 165, 167, 172, 178,
+ 93, 88, 86, 84, 82, 82, 80, 84, 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162, 165, 167, 173, 174, 177, 183, 185, 182, 179,
+ 96, 91, 90, 87, 86, 86, 83, 84, 89, 91, 95, 100, 102, 110, 111, 118, 123, 128, 135, 138, 149, 152, 160, 167, 173, 178, 180, 187, 188, 190, 197, 203,
+ 99, 94, 93, 90, 89, 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204,
+ 102, 97, 97, 93, 93, 92, 92, 90, 90, 96, 97, 103, 104, 111, 112, 120, 121, 130, 131, 142, 143, 154, 155, 168, 169, 181, 183, 198, 200, 206, 208, 217,
+ }, {
+ 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 31, 31, 32, 34, 36, 40, 43, 44, 46, 46, 45, 46, 46, 48, 50, 51, 52, 54, 57, 58, 59, 61, 62, 62, 63, 64, 65, 66, 67, 68, 69, 70,
+ 37, 38, 40, 41, 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70,
+ 42, 42, 42, 44, 45, 47, 48, 49, 50, 50, 49, 49, 50, 50, 52, 52, 53, 55, 58, 58, 58, 60, 60, 60, 60, 61, 62, 63, 64, 65, 66, 67,
+ 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67,
+ 49, 47, 45, 45, 46, 45, 49, 51, 53, 56, 58, 59, 59, 61, 62, 63, 64, 65, 67, 68, 68, 69, 71, 70, 69, 68, 68, 69, 70, 71, 72, 73,
+ 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74, 73, 73,
+ 54, 52, 50, 49, 49, 48, 52, 54, 55, 59, 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 81, 79, 78, 76, 77, 78, 80, 81,
+ 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81,
+ 63, 60, 57, 57, 56, 54, 57, 59, 60, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 89, 90, 92, 91, 88, 89, 90, 89, 87, 86, 87, 88, 90,
+ 66, 63, 60, 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90,
+ 67, 64, 62, 61, 60, 58, 58, 61, 63, 65, 67, 70, 72, 74, 78, 80, 82, 86, 88, 90, 95, 96, 96, 98, 97, 98, 100, 98, 96, 96, 97, 99,
+ 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103, 103, 101, 99,
+ 69, 66, 65, 63, 62, 61, 60, 60, 63, 64, 66, 68, 70, 73, 74, 78, 80, 82, 85, 87, 91, 92, 96, 98, 101, 102, 103, 105, 105, 105, 107, 108,
+ 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109, 108,
+ 72, 68, 68, 65, 65, 63, 63, 61, 62, 65, 65, 68, 69, 72, 73, 77, 77, 81, 81, 86, 87, 91, 91, 96, 97, 101, 102, 107, 107, 109, 110, 113,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114,
+ 31, 32, 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 56, 60, 65, 69, 72, 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104,
+ 32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104,
+ 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 57, 60, 65, 69, 71, 74, 77, 78, 80, 83, 85, 88, 91, 94, 97, 100,
+ 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, 100,
+ 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, 77, 79, 84, 88, 90, 92, 95, 95, 95, 95, 95, 98, 101, 105, 108, 111,
+ 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111,
+ 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104, 106, 109, 112, 116, 114, 113, 112, 115, 119, 123, 126,
+ 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127,
+ 73, 69, 67, 66, 65, 64, 66, 69, 74, 77, 79, 85, 90, 93, 99, 105, 107, 112, 119, 123, 127, 130, 133, 130, 132, 136, 136, 133, 132, 136, 141, 145,
+ 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145,
+ 87, 83, 80, 79, 78, 76, 76, 80, 84, 86, 90, 96, 99, 103, 111, 114, 118, 126, 130, 134, 143, 146, 147, 152, 151, 155, 160, 158, 154, 156, 161, 166,
+ 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, 166,
+ 93, 88, 87, 84, 83, 83, 81, 81, 86, 88, 92, 96, 98, 105, 107, 113, 117, 122, 129, 131, 141, 144, 151, 157, 163, 167, 169, 175, 175, 177, 183, 189,
+ 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181, 188, 188, 190,
+ 99, 94, 94, 90, 90, 88, 89, 86, 87, 93, 93, 99, 99, 106, 107, 115, 116, 124, 125, 135, 136, 145, 146, 158, 159, 170, 171, 185, 186, 192, 193, 201,
+ }, {
+ 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
+ 31, 31, 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 54, 56, 57, 59, 60, 61, 62, 63, 64, 65, 65, 66, 67, 68,
+ 35, 37, 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68,
+ 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47, 48, 48, 48, 50, 50, 51, 53, 55, 56, 57, 58, 58, 59, 60, 60, 61, 62, 63, 64, 65,
+ 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65,
+ 49, 47, 45, 45, 46, 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 68, 67, 66, 66, 67, 68, 69, 70, 71,
+ 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72, 71, 71,
+ 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, 75, 75, 76, 78, 79, 77, 76, 74, 75, 76, 77, 78,
+ 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78,
+ 61, 57, 55, 55, 54, 52, 54, 56, 59, 61, 62, 66, 68, 70, 73, 76, 77, 79, 82, 84, 86, 87, 88, 86, 86, 88, 87, 85, 83, 85, 86, 87,
+ 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87,
+ 67, 63, 61, 60, 59, 57, 57, 60, 63, 64, 66, 69, 71, 73, 77, 79, 81, 85, 87, 88, 92, 93, 94, 96, 95, 96, 97, 95, 93, 93, 94, 96,
+ 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100, 100, 98, 96,
+ 69, 65, 64, 62, 61, 61, 59, 59, 62, 63, 65, 67, 68, 72, 73, 76, 78, 81, 84, 85, 89, 90, 93, 96, 98, 99, 100, 102, 102, 102, 103, 105,
+ 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105,
+ 71, 67, 67, 64, 64, 62, 62, 60, 61, 64, 64, 67, 67, 71, 71, 75, 75, 79, 80, 84, 84, 89, 89, 94, 94, 98, 99, 104, 104, 106, 106, 109,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106,
+ 31, 32, 32, 32, 32, 32, 33, 34, 34, 34, 37, 38, 41, 44, 46, 49, 53, 54, 60, 63, 65, 72, 74, 75, 79, 82, 84, 87, 89, 92, 94, 97,
+ 32, 32, 32, 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97,
+ 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 40, 41, 43, 46, 47, 50, 53, 54, 58, 62, 63, 70, 71, 72, 76, 78, 81, 83, 85, 88, 90, 93,
+ 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93,
+ 39, 38, 38, 37, 39, 40, 40, 45, 47, 51, 54, 55, 58, 61, 62, 65, 68, 69, 73, 76, 78, 84, 85, 86, 90, 89, 90, 92, 95, 98, 101, 104,
+ 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103, 103, 103, 103, 104,
+ 53, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 75, 77, 82, 86, 87, 92, 96, 97, 104, 105, 106, 110, 108, 106, 105, 108, 111, 114, 118,
+ 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120, 121, 120, 119, 118,
+ 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 86, 92, 97, 98, 105, 109, 111, 118, 120, 121, 125, 129, 128, 125, 124, 127, 131, 135,
+ 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137, 141, 139, 137, 135,
+ 81, 77, 75, 74, 72, 71, 70, 75, 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 119, 124, 127, 135, 137, 139, 143, 146, 150, 148, 144, 146, 150, 154,
+ 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, 155, 160, 161, 158, 155,
+ 90, 86, 84, 82, 81, 80, 78, 79, 83, 85, 89, 92, 94, 101, 102, 108, 112, 117, 123, 125, 134, 136, 143, 148, 154, 157, 158, 164, 164, 165, 170, 175,
+ 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176,
+ 96, 91, 91, 87, 87, 85, 86, 83, 84, 89, 89, 95, 95, 102, 102, 110, 110, 118, 119, 128, 129, 137, 138, 149, 149, 159, 160, 173, 174, 179, 180, 187,
+ }, {
+ 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73,
+ 31, 31, 32, 32, 36, 38, 40, 43, 44, 46, 46, 45, 45, 46, 47, 48, 49, 50, 52, 54, 54, 57, 58, 59, 60, 61, 62, 63, 64, 65, 65, 66,
+ 34, 35, 36, 36, 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66,
+ 37, 38, 39, 40, 43, 45, 47, 47, 47, 48, 47, 46, 46, 46, 47, 47, 48, 49, 50, 52, 52, 55, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63,
+ 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63,
+ 48, 47, 46, 45, 46, 46, 46, 50, 51, 53, 54, 55, 56, 56, 57, 57, 58, 59, 60, 61, 62, 64, 64, 65, 66, 65, 64, 65, 66, 67, 68, 69,
+ 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70, 69, 69,
+ 52, 50, 48, 48, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 75, 74, 72, 73, 74, 75, 76,
+ 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76,
+ 57, 54, 53, 52, 51, 50, 50, 53, 54, 57, 60, 61, 64, 66, 68, 71, 73, 74, 76, 78, 79, 82, 82, 83, 84, 85, 84, 82, 81, 82, 83, 84,
+ 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84,
+ 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 90, 90, 91, 93, 93, 94, 93, 90, 90, 92, 93,
+ 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97, 95, 93,
+ 68, 64, 63, 61, 60, 60, 58, 58, 61, 62, 64, 66, 67, 71, 71, 75, 77, 79, 82, 83, 87, 88, 91, 93, 95, 97, 97, 99, 99, 99, 100, 101,
+ 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101,
+ 69, 66, 66, 63, 63, 61, 61, 59, 60, 63, 63, 66, 66, 70, 70, 73, 74, 78, 78, 82, 82, 86, 87, 91, 91, 95, 96, 101, 101, 103, 103, 105,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99,
+ 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 49, 51, 54, 58, 60, 65, 68, 72, 75, 75, 79, 82, 84, 86, 88, 91,
+ 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 81, 83, 85, 88, 91,
+ 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 50, 51, 54, 57, 58, 63, 66, 70, 72, 72, 76, 78, 80, 82, 85, 87,
+ 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87,
+ 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 60, 61, 63, 67, 68, 73, 75, 79, 81, 81, 85, 87, 89, 92, 94, 97,
+ 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97, 97, 97,
+ 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97, 100, 99, 101, 104, 107, 110,
+ 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, 112, 111, 110,
+ 62, 59, 58, 57, 57, 57, 56, 58, 61, 65, 66, 71, 74, 78, 82, 83, 90, 92, 95, 100, 102, 108, 110, 115, 117, 117, 120, 118, 116, 119, 123, 126,
+ 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126,
+ 79, 75, 74, 72, 71, 71, 69, 71, 73, 77, 78, 84, 86, 90, 95, 96, 103, 106, 110, 116, 118, 125, 128, 133, 136, 136, 141, 139, 135, 136, 140, 144,
+ 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, 144,
+ 88, 83, 82, 79, 79, 78, 76, 76, 81, 82, 85, 89, 91, 97, 98, 104, 107, 111, 117, 119, 127, 129, 135, 140, 145, 148, 148, 153, 153, 154, 159, 163,
+ 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, 163, 163,
+ 93, 88, 88, 84, 84, 82, 83, 80, 80, 86, 86, 91, 91, 97, 98, 105, 105, 112, 113, 121, 122, 130, 130, 140, 140, 149, 150, 161, 162, 166, 167, 173,
+ }, {
+ 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70,
+ 31, 31, 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 48, 49, 50, 51, 52, 54, 56, 57, 58, 59, 60, 61, 62, 63, 63, 64,
+ 33, 34, 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 47, 48, 49, 50, 50, 52, 53, 55, 56, 56, 57, 58, 59, 59, 60, 61,
+ 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61,
+ 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 56, 58, 59, 60, 61, 61, 63, 63, 64, 65, 66, 67,
+ 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68, 67, 67,
+ 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 69, 70, 71, 71, 72, 70, 71, 72, 73, 74,
+ 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74,
+ 56, 53, 52, 51, 50, 50, 49, 50, 53, 55, 56, 59, 61, 63, 65, 66, 70, 71, 72, 74, 75, 77, 79, 80, 81, 81, 82, 80, 79, 80, 81, 82,
+ 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82,
+ 63, 60, 59, 57, 56, 56, 54, 55, 57, 60, 60, 64, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, 90, 90, 92, 90, 88, 88, 89, 90,
+ 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94, 92, 90,
+ 67, 63, 62, 60, 60, 59, 57, 57, 60, 61, 63, 65, 66, 70, 70, 73, 75, 77, 80, 81, 85, 86, 89, 91, 93, 94, 94, 96, 96, 95, 97, 98,
+ 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98, 98,
+ 68, 65, 65, 62, 62, 60, 61, 59, 59, 62, 62, 65, 65, 68, 68, 72, 72, 76, 76, 80, 80, 84, 84, 89, 89, 93, 93, 97, 98, 99, 99, 102,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93,
+ 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 37, 38, 41, 43, 45, 48, 49, 53, 54, 60, 61, 65, 68, 72, 74, 75, 78, 81, 83, 85,
+ 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 78, 80, 82, 85,
+ 32, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 40, 41, 43, 44, 46, 49, 50, 53, 54, 58, 59, 63, 66, 70, 71, 72, 75, 77, 79, 81,
+ 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81,
+ 36, 35, 35, 34, 35, 36, 37, 38, 41, 42, 48, 48, 50, 51, 53, 55, 56, 59, 60, 63, 63, 68, 69, 73, 75, 79, 80, 81, 84, 86, 88, 90,
+ 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90, 90, 90,
+ 44, 42, 42, 41, 41, 42, 42, 42, 46, 48, 54, 54, 58, 59, 63, 65, 67, 70, 71, 74, 75, 79, 80, 84, 86, 90, 91, 92, 95, 98, 100, 102,
+ 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103,
+ 53, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71, 73, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, 114, 117,
+ 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117,
+ 66, 63, 62, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 112, 115, 119, 121, 122, 125, 127, 130, 134,
+ 79, 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134,
+ 81, 77, 76, 74, 73, 72, 71, 70, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111, 112, 119, 121, 127, 130, 135, 137, 139, 142, 144, 148, 151,
+ 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152,
+ 90, 85, 85, 81, 81, 80, 80, 77, 78, 83, 83, 87, 88, 93, 93, 100, 100, 107, 107, 115, 115, 123, 123, 132, 132, 140, 140, 151, 151, 155, 155, 160,
+ }, {
+ 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68,
+ 31, 31, 31, 32, 33, 36, 38, 40, 42, 43, 46, 46, 46, 45, 45, 46, 46, 47, 48, 50, 50, 52, 52, 54, 56, 57, 58, 59, 60, 61, 62, 62,
+ 32, 33, 33, 33, 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57, 58, 58, 59, 60, 61, 62,
+ 37, 38, 38, 40, 41, 43, 45, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 47, 47, 48, 49, 50, 51, 52, 53, 55, 55, 56, 57, 58, 58, 59,
+ 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59,
+ 48, 47, 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65,
+ 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66, 65, 65,
+ 49, 47, 47, 45, 45, 46, 45, 45, 48, 49, 53, 54, 56, 56, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 66, 67, 68, 68, 69, 70, 71, 71,
+ 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71,
+ 52, 50, 49, 48, 48, 47, 47, 47, 50, 50, 54, 55, 57, 58, 61, 62, 64, 66, 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 78, 79,
+ 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79,
+ 58, 55, 54, 52, 52, 52, 51, 50, 53, 54, 57, 57, 60, 61, 64, 66, 67, 70, 71, 73, 74, 77, 77, 79, 81, 82, 83, 83, 85, 85, 86, 87,
+ 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91, 91, 89, 87,
+ 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 90, 90, 91, 92, 93, 94, 95,
+ 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95,
+ 67, 64, 64, 61, 61, 60, 60, 58, 58, 61, 61, 64, 64, 67, 67, 70, 71, 74, 74, 78, 78, 82, 82, 86, 86, 90, 90, 95, 95, 96, 96, 98,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87,
+ 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79,
+ 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79,
+ 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76,
+ 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76,
+ 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84,
+ 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84,
+ 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96,
+ 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96,
+ 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109,
+ 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109,
+ 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124,
+ 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124,
+ 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141,
+ 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141,
+ 87, 82, 82, 78, 78, 77, 77, 75, 75, 79, 79, 84, 84, 89, 89, 95, 95, 102, 102, 109, 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149,
+ }, {
+ 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66,
+ 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60,
+ 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60,
+ 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57,
+ 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57,
+ 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63,
+ 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63,
+ 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69,
+ 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69,
+ 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77,
+ 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77,
+ 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84,
+ 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84,
+ 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92,
+ 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92,
+ 66, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 73, 73, 77, 77, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 38, 39, 41, 42, 45, 45, 49, 50, 53, 54, 57, 60, 62, 66, 66, 73, 73,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57, 60, 61, 65, 66, 72, 72,
+ 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 36, 36, 37, 37, 38, 40, 41, 42, 43, 46, 46, 49, 50, 52, 54, 56, 59, 60, 64, 64, 71, 71,
+ 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70,
+ 34, 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 39, 42, 43, 44, 45, 46, 48, 48, 51, 51, 54, 54, 57, 58, 60, 63, 64, 68, 68, 74, 74,
+ 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73, 79, 79,
+ 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 49, 51, 52, 54, 56, 56, 59, 59, 62, 63, 65, 67, 69, 71, 72, 76, 76, 82, 82,
+ 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90,
+ 44, 43, 43, 42, 41, 42, 43, 43, 43, 44, 48, 48, 53, 54, 57, 58, 60, 64, 64, 67, 67, 71, 72, 75, 76, 78, 80, 82, 85, 86, 91, 91,
+ 53, 51, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104,
+ 53, 51, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104,
+ 62, 60, 59, 58, 57, 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102, 103, 108, 108, 115, 115,
+ 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, 67, 68, 71, 73, 76, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, 106, 111, 111, 118, 118,
+ 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119, 127, 127,
+ 79, 75, 75, 73, 72, 71, 71, 70, 69, 69, 73, 73, 77, 78, 81, 84, 86, 90, 91, 96, 96, 103, 103, 108, 110, 114, 118, 120, 125, 125, 133, 133,
+ }, {
+ 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63,
+ 31, 31, 31, 32, 32, 33, 35, 37, 40, 40, 43, 43, 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, 48, 50, 50, 51, 52, 53, 55, 55, 58, 58,
+ 31, 31, 31, 32, 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51, 52, 53, 54, 55, 57, 57,
+ 35, 36, 36, 37, 37, 39, 40, 42, 45, 45, 46, 46, 47, 47, 47, 46, 46, 45, 46, 46, 46, 47, 47, 48, 49, 50, 51, 51, 53, 53, 56, 56,
+ 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55,
+ 42, 42, 42, 42, 42, 44, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 49, 49, 50, 50, 50, 50, 51, 52, 52, 53, 54, 55, 55, 58, 58,
+ 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59, 60, 60,
+ 48, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 62, 62,
+ 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67,
+ 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 53, 54, 55, 56, 57, 59, 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68,
+ 52, 50, 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75,
+ 52, 50, 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75,
+ 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78, 80, 80,
+ 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 57, 58, 60, 61, 64, 64, 67, 67, 70, 71, 72, 73, 75, 76, 77, 79, 79, 82, 82,
+ 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86,
+ 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 64, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39, 41, 41, 44, 45, 47, 50, 50, 54, 55, 57, 61, 61, 65,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49, 53, 54, 56, 60, 60, 64,
+ 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, 50, 50, 53, 54, 56, 59, 59, 63,
+ 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62,
+ 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62,
+ 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63, 66, 66, 70,
+ 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48, 50, 50, 51, 53, 53, 56, 56, 58, 60, 60, 63, 63, 65, 68, 68, 72,
+ 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76,
+ 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83,
+ 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83,
+ 51, 49, 49, 48, 47, 47, 48, 48, 48, 48, 48, 52, 53, 55, 58, 58, 62, 63, 66, 69, 69, 73, 74, 76, 79, 79, 83, 84, 86, 89, 89, 93,
+ 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, 92, 96,
+ 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101,
+ 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109,
+ 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109,
+ }, {
+ 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59,
+ 31, 31, 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 48, 48, 50, 51, 51, 53, 53, 55,
+ 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 49, 50, 51, 52, 52, 54,
+ 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 51, 51, 53,
+ 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52,
+ 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52,
+ 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55, 55, 57,
+ 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56, 58,
+ 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61,
+ 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65,
+ 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65,
+ 51, 50, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 52, 54, 54, 56, 57, 58, 61, 61, 62, 63, 64, 65, 65, 67, 67, 68, 69, 69, 70,
+ 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70, 70, 72,
+ 54, 52, 51, 51, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74,
+ 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78,
+ 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 42, 42, 45, 46, 47, 50, 51, 52, 55, 55,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54,
+ 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54,
+ 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 40, 40, 40, 42, 43, 43, 45, 46, 47, 49, 50, 51, 54, 54,
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53, 55, 55,
+ 35, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 39, 42, 42, 44, 47, 47, 48, 49, 49, 51, 52, 52, 54, 55, 56, 58, 59, 60, 62, 62,
+ 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63,
+ 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 46, 49, 49, 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67,
+ 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75,
+ 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75,
+ 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, 79, 79,
+ 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87,
+ 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87,
+ 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92,
+ }, {
+ 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54,
+ 31, 31, 31, 31, 31, 31, 32, 35, 35, 36, 39, 39, 40, 42, 42, 45, 47, 47, 47, 46, 46, 46, 46, 46, 47, 48, 48, 49, 49, 50, 51, 51,
+ 31, 31, 31, 31, 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50,
+ 31, 32, 32, 32, 32, 33, 33, 36, 36, 37, 41, 41, 42, 43, 43, 45, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50,
+ 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49,
+ 37, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49,
+ 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50,
+ 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 50, 50, 51, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 53, 54, 55, 55,
+ 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55,
+ 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57,
+ 49, 48, 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62,
+ 49, 48, 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62,
+ 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63, 65, 65,
+ 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68,
+ 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68,
+ 54, 53, 52, 52, 50, 50, 50, 49, 49, 49, 48, 48, 50, 52, 52, 54, 55, 55, 57, 59, 59, 61, 62, 63, 65, 65, 66, 68, 68, 69, 71, 71,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, 42, 43, 45, 46, 46, 49,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48,
+ 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 45, 45, 47, 48, 48, 50,
+ 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56,
+ 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58,
+ 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58,
+ 40, 39, 39, 39, 39, 38, 38, 38, 39, 39, 39, 40, 41, 41, 42, 45, 45, 46, 50, 51, 51, 53, 54, 54, 56, 59, 59, 59, 61, 62, 62, 64,
+ 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69,
+ 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69,
+ 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73,
+ 53, 52, 51, 51, 50, 49, 49, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 55, 59, 60, 60, 63, 65, 65, 67, 71, 71, 72, 75, 76, 76, 79,
+ }, {
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51,
+ 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 46, 46, 47, 48, 48, 48, 49,
+ 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47,
+ 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47,
+ 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47,
+ 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47,
+ 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47,
+ 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, 48, 48, 48, 48,
+ 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52,
+ 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54,
+ 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54,
+ 49, 48, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 56, 56, 56, 57, 57, 57, 58,
+ 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60,
+ 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60,
+ 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62,
+ 52, 51, 50, 50, 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 59, 61, 61, 62, 63, 64, 64, 65,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+ 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 48, 48,
+ 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53,
+ 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53,
+ 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53,
+ 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 51, 51, 51, 52, 54, 54, 54, 56, 58, 58,
+ 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63,
+ 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63,
+ }, {
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45,
+ 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45,
+ 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46,
+ 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46,
+ 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46,
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49,
+ 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53,
+ 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53,
+ 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53,
+ 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 50, 50, 50, 51, 53, 53, 53, 54, 54, 54, 54, 55, 56, 56,
+ 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58,
+ 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 36,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, 39, 39, 39, 40, 41, 42, 42, 42, 42,
+ 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46, 46, 46,
+ 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49,
+ 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49,
+ 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49,
+ }, {
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 48, 48, 48, 48,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42, 44, 46, 47, 47, 47, 47,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46,
+ 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47,
+ 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41, 41, 41, 43, 44, 45, 45, 45, 46, 46, 46, 46, 46, 47, 47, 48, 48, 48, 47,
+ 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47,
+ 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47,
+ 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47,
+ 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 49, 50, 50, 50, 49,
+ 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52, 52, 52,
+ 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53,
+ 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53,
+ 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39,
+ }, {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 42,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 35, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 39, 40, 41, 41, 41, 41, 42, 42, 43, 43,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44,
+ 35, 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 39, 40, 40, 40, 40, 40, 42, 43, 44, 45, 45, 45, 45, 45, 45, 46, 46,
+ 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48,
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34,
+ }, {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, 35, 35,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 37, 37, 37, 37, 37, 37, 38,
+ 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39,
+ 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40,
+ 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42,
+ 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 42, 43, 43, 43, 43, 43, 43, 44,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ }, {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ },
+ },
+};
+
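+/* Note on the layout of qm_tbl_32x32_t below: each 32x32 quantization
+ * matrix is symmetric, so only the lower triangle is stored -- row r
+ * contributes r+1 values, for 32*33/2 = 528 entries per plane, which is
+ * why the innermost dimension is 528. The helper below is an
+ * illustrative sketch only (its name and signature are hypothetical and
+ * not part of dav1d's API); it shows how such a triangular table could
+ * be expanded into a full symmetric 32x32 matrix under that layout.
+ */
+static inline void expand_qm_32x32_sketch(uint8_t dst[32][32],
+                                          const uint8_t *tri)
+{
+    /* Row r of the triangle starts at offset r*(r+1)/2 and holds r+1
+     * values; mirror each entry across the diagonal. */
+    for (int r = 0; r < 32; r++)
+        for (int c = 0; c <= r; c++)
+            dst[r][c] = dst[c][r] = tri[r * (r + 1) / 2 + c];
+}
+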
+static const uint8_t qm_tbl_32x32_t[][2][528] = {
+ {
+ {
+ 32,
+ 31, 32,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 33, 33,
+ 32, 32, 32, 33, 34, 35,
+ 34, 34, 33, 34, 35, 37, 39,
+ 35, 34, 34, 35, 36, 37, 41, 43,
+ 36, 35, 34, 35, 36, 38, 42, 45, 48,
+ 39, 38, 37, 38, 39, 40, 45, 47, 50, 54,
+ 44, 42, 41, 41, 42, 42, 47, 50, 54, 58, 63,
+ 46, 44, 42, 43, 44, 44, 49, 52, 55, 59, 65, 67,
+ 48, 46, 44, 45, 45, 46, 51, 53, 57, 61, 67, 69, 71,
+ 54, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 74, 76, 82,
+ 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, 92,
+ 62, 59, 56, 56, 56, 55, 60, 63, 66, 71, 77, 80, 83, 89, 95, 98,
+ 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105,
+ 71, 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90, 97, 103, 107, 111, 117,
+ 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90, 93, 96, 104, 110, 114, 118, 125, 134,
+ 81, 77, 73, 73, 72, 70, 75, 77, 80, 85, 91, 94, 97, 105, 111, 115, 119, 126, 135, 137,
+ 83, 78, 75, 74, 74, 72, 76, 79, 81, 86, 92, 95, 99, 106, 113, 117, 121, 128, 137, 138, 140,
+ 88, 84, 80, 79, 78, 76, 80, 82, 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147, 152,
+ 91, 86, 83, 82, 81, 79, 81, 84, 88, 92, 95, 100, 107, 110, 115, 123, 127, 132, 140, 147, 151, 154, 159,
+ 94, 89, 86, 85, 84, 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136, 139, 146, 156, 158, 161, 166,
+ 97, 92, 90, 88, 86, 85, 84, 89, 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143, 148, 152, 163, 166, 168, 174,
+ 101, 95, 93, 91, 89, 89, 87, 91, 93, 98, 101, 105, 111, 113, 120, 126, 130, 138, 142, 149, 157, 159, 171, 174, 176, 183,
+ 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111, 116, 122, 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191,
+ 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, 113, 120, 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, 193, 200,
+ 111, 105, 104, 101, 100, 99, 97, 96, 102, 103, 109, 111, 117, 120, 125, 131, 135, 143, 146, 156, 158, 168, 173, 180, 189, 195, 200, 202, 210,
+ 115, 109, 108, 104, 104, 102, 101, 100, 103, 106, 111, 113, 119, 121, 129, 131, 140, 142, 151, 155, 162, 168, 176, 183, 188, 199, 204, 210, 212, 220,
+ 119, 113, 112, 107, 107, 106, 105, 103, 105, 110, 112, 117, 120, 125, 130, 135, 140, 145, 152, 157, 165, 169, 179, 183, 193, 197, 210, 214, 220, 222, 231,
+ 123, 116, 116, 111, 111, 109, 110, 107, 107, 114, 114, 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176, 177, 190, 191, 204, 206, 222, 224, 230, 232, 242,
+ }, {
+ 32,
+ 31, 31,
+ 30, 31, 32,
+ 32, 33, 33, 35,
+ 33, 34, 35, 37, 39,
+ 36, 38, 40, 41, 43, 47,
+ 41, 42, 42, 43, 45, 47, 48,
+ 45, 45, 44, 45, 46, 47, 49, 50,
+ 49, 47, 46, 47, 47, 48, 50, 51, 53,
+ 48, 47, 45, 46, 46, 46, 49, 51, 53, 54,
+ 49, 47, 45, 45, 45, 45, 49, 51, 53, 55, 58,
+ 50, 47, 45, 46, 46, 46, 49, 51, 54, 56, 59, 60,
+ 50, 48, 46, 46, 46, 46, 50, 52, 54, 56, 60, 60, 61,
+ 52, 50, 47, 47, 47, 47, 50, 52, 54, 57, 61, 62, 63, 66,
+ 54, 52, 49, 49, 49, 48, 52, 53, 55, 58, 62, 64, 65, 68, 71,
+ 56, 53, 51, 50, 50, 49, 52, 54, 56, 59, 63, 64, 66, 69, 72, 73,
+ 57, 54, 52, 51, 51, 50, 53, 55, 56, 60, 63, 65, 67, 70, 73, 75, 76,
+ 60, 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, 75, 77, 79, 82,
+ 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67, 69, 71, 75, 78, 80, 82, 85, 89,
+ 64, 61, 58, 57, 57, 55, 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90,
+ 65, 61, 58, 58, 57, 55, 58, 60, 61, 64, 68, 70, 71, 75, 79, 81, 83, 86, 90, 91, 91,
+ 67, 63, 61, 60, 59, 57, 60, 61, 63, 66, 69, 70, 73, 77, 79, 81, 85, 88, 90, 92, 94, 96,
+ 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74, 75, 78, 82, 84, 86, 90, 93, 94, 96, 98,
+ 69, 65, 63, 62, 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87, 89, 92, 96, 97, 98, 100,
+ 70, 66, 64, 63, 62, 61, 60, 63, 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98, 99, 100, 102,
+ 71, 67, 66, 64, 63, 62, 61, 63, 64, 67, 68, 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, 101, 102, 104,
+ 72, 68, 67, 65, 64, 64, 61, 63, 65, 67, 68, 71, 73, 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102, 103, 104, 106,
+ 73, 69, 68, 66, 65, 65, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104, 105, 106, 109,
+ 74, 70, 70, 67, 66, 66, 64, 63, 66, 67, 70, 71, 74, 75, 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106, 108, 108, 111,
+ 75, 71, 71, 68, 68, 67, 66, 64, 66, 68, 70, 71, 74, 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108, 110, 111, 113,
+ 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70, 72, 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104, 105, 109, 111, 112, 113, 116,
+ 78, 74, 74, 70, 70, 69, 69, 66, 66, 70, 70, 74, 74, 77, 78, 82, 82, 86, 87, 92, 92, 96, 97, 102, 102, 107, 107, 112, 113, 115, 115, 118,
+ },
+ }, {
+ {
+ 32,
+ 31, 32,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 32, 33,
+ 32, 32, 32, 33, 34, 35,
+ 32, 33, 33, 33, 34, 36, 36,
+ 34, 34, 33, 34, 35, 37, 38, 39,
+ 36, 35, 34, 35, 36, 38, 40, 42, 48,
+ 38, 37, 36, 36, 38, 39, 41, 44, 50, 51,
+ 39, 38, 37, 38, 39, 40, 42, 45, 50, 52, 54,
+ 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58, 63,
+ 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69,
+ 49, 47, 46, 45, 46, 46, 48, 51, 57, 60, 62, 68, 71, 73,
+ 54, 51, 50, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82,
+ 59, 56, 54, 54, 54, 53, 55, 58, 64, 67, 69, 75, 79, 81, 87, 92,
+ 61, 58, 56, 56, 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97,
+ 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92, 98, 101, 105,
+ 71, 68, 65, 65, 64, 63, 65, 68, 73, 76, 78, 84, 89, 92, 97, 103, 106, 111, 117,
+ 76, 72, 70, 69, 68, 66, 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122, 127,
+ 80, 76, 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98, 104, 110, 113, 118, 125, 130, 134,
+ 83, 78, 76, 75, 74, 72, 73, 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, 128, 133, 137, 140,
+ 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100, 103, 109, 116, 119, 124, 131, 136, 140, 144, 147,
+ 89, 85, 82, 81, 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114, 120, 128, 131, 136, 146, 147, 150, 155,
+ 92, 88, 85, 84, 82, 81, 80, 85, 86, 90, 95, 97, 102, 107, 110, 117, 122, 125, 134, 138, 142, 152, 154, 156, 162,
+ 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95, 99, 105, 106, 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163, 169,
+ 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102, 104, 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, 176,
+ 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, 99, 103, 106, 112, 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, 173, 176, 178, 184,
+ 104, 99, 98, 95, 94, 93, 91, 90, 95, 96, 102, 103, 109, 112, 117, 122, 125, 133, 136, 145, 146, 156, 160, 167, 174, 180, 184, 186, 193,
+ 108, 102, 101, 98, 97, 96, 95, 93, 97, 100, 104, 106, 111, 113, 121, 122, 130, 132, 140, 143, 150, 155, 162, 169, 174, 183, 188, 192, 194, 201,
+ 111, 105, 105, 101, 100, 99, 98, 96, 98, 103, 105, 109, 112, 117, 121, 125, 130, 135, 141, 146, 152, 156, 165, 169, 178, 181, 193, 196, 201, 202, 210,
+ 114, 109, 109, 104, 104, 102, 102, 99, 100, 106, 106, 113, 113, 120, 121, 129, 130, 139, 140, 151, 151, 162, 162, 175, 176, 187, 188, 203, 204, 210, 211, 219,
+ }, {
+ 32,
+ 31, 31,
+ 30, 31, 31,
+ 31, 32, 32, 33,
+ 33, 34, 35, 36, 39,
+ 36, 38, 39, 40, 43, 47,
+ 38, 40, 41, 41, 44, 47, 47,
+ 41, 42, 42, 43, 45, 47, 48, 48,
+ 49, 47, 46, 46, 47, 48, 49, 50, 53,
+ 49, 47, 46, 46, 46, 47, 48, 50, 53, 53,
+ 48, 47, 46, 45, 46, 46, 48, 49, 53, 54, 54,
+ 49, 47, 45, 45, 45, 45, 47, 49, 53, 55, 55, 58,
+ 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61,
+ 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62,
+ 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66,
+ 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, 64, 66, 68, 71,
+ 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, 59, 63, 65, 66, 69, 72, 73,
+ 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63, 66, 67, 70, 73, 74, 76,
+ 60, 57, 55, 54, 53, 52, 53, 55, 58, 60, 61, 65, 68, 69, 72, 75, 77, 79, 82,
+ 62, 59, 57, 56, 55, 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86,
+ 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 63, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89,
+ 65, 61, 59, 58, 57, 55, 56, 58, 61, 63, 64, 68, 71, 72, 75, 79, 80, 83, 86, 88, 90, 91,
+ 66, 63, 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80, 81, 84, 87, 90, 91, 93, 94,
+ 67, 64, 62, 61, 59, 58, 58, 60, 63, 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89, 93, 94, 95, 97,
+ 68, 65, 63, 62, 60, 59, 58, 61, 62, 64, 67, 68, 71, 74, 75, 79, 81, 83, 87, 89, 91, 95, 96, 97, 99,
+ 69, 66, 64, 63, 61, 61, 59, 61, 62, 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88, 91, 92, 97, 98, 98, 101,
+ 70, 67, 65, 63, 62, 62, 60, 61, 63, 65, 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99, 100, 100, 103,
+ 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69, 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102, 105,
+ 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73, 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107,
+ 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77, 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109,
+ 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76, 78, 80, 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108, 111,
+ 75, 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 84, 88, 89, 93, 93, 98, 98, 102, 103, 108, 108, 110, 110, 113,
+ },
+ }, {
+ {
+ 32,
+ 31, 32,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 32, 33,
+ 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 34, 34, 35,
+ 34, 34, 33, 33, 35, 36, 37, 39,
+ 34, 34, 34, 34, 36, 36, 37, 41, 42,
+ 36, 35, 34, 34, 36, 37, 38, 42, 45, 48,
+ 39, 38, 38, 37, 39, 40, 40, 45, 47, 50, 54,
+ 41, 39, 39, 38, 40, 40, 41, 46, 48, 51, 55, 56,
+ 44, 42, 41, 41, 42, 42, 42, 47, 50, 54, 58, 59, 63,
+ 48, 46, 45, 44, 45, 45, 45, 50, 53, 56, 61, 62, 66, 70,
+ 49, 47, 46, 45, 46, 46, 46, 51, 53, 57, 62, 63, 68, 71, 73,
+ 54, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 76, 77, 82,
+ 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90,
+ 59, 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82, 87, 91, 93,
+ 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 87, 92, 97, 99, 105,
+ 69, 66, 64, 63, 63, 62, 61, 66, 68, 71, 76, 78, 83, 88, 90, 96, 100, 102, 109, 113,
+ 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90, 92, 97, 102, 104, 111, 115, 117,
+ 80, 76, 73, 72, 71, 70, 69, 74, 76, 79, 84, 86, 90, 96, 98, 104, 109, 111, 118, 123, 125, 134,
+ 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 120, 125, 127, 136, 137,
+ 83, 78, 76, 75, 74, 73, 72, 76, 78, 81, 86, 88, 92, 98, 100, 106, 111, 113, 121, 126, 128, 137, 139, 140,
+ 87, 83, 81, 79, 78, 77, 75, 80, 82, 85, 90, 91, 96, 101, 103, 110, 114, 117, 125, 129, 133, 142, 143, 145, 150,
+ 90, 85, 83, 81, 80, 79, 78, 81, 83, 87, 89, 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151, 156,
+ 93, 88, 86, 84, 83, 82, 80, 82, 85, 89, 90, 96, 98, 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, 156, 157, 163,
+ 95, 90, 89, 86, 85, 85, 83, 83, 88, 89, 93, 97, 99, 105, 106, 113, 116, 122, 127, 130, 139, 140, 148, 153, 159, 162, 164, 169,
+ 98, 93, 92, 89, 88, 87, 86, 85, 89, 90, 96, 97, 102, 105, 109, 114, 117, 124, 126, 134, 136, 144, 148, 154, 160, 166, 169, 170, 176,
+ 101, 96, 95, 91, 91, 90, 89, 87, 90, 93, 97, 99, 104, 105, 112, 113, 121, 122, 130, 133, 139, 144, 150, 155, 160, 168, 172, 176, 177, 184,
+ 104, 99, 98, 94, 94, 92, 92, 90, 92, 96, 98, 102, 104, 109, 112, 116, 121, 125, 130, 135, 141, 144, 152, 155, 163, 166, 177, 179, 184, 185, 191,
+ 107, 101, 101, 97, 97, 95, 95, 93, 93, 99, 99, 105, 105, 112, 112, 120, 120, 129, 129, 139, 140, 149, 149, 161, 161, 172, 172, 185, 186, 191, 192, 199,
+ }, {
+ 32,
+ 31, 31,
+ 30, 31, 31,
+ 30, 31, 31, 32,
+ 33, 34, 35, 35, 39,
+ 35, 36, 37, 37, 41, 43,
+ 36, 38, 39, 40, 43, 45, 47,
+ 41, 42, 42, 42, 45, 46, 47, 48,
+ 44, 44, 44, 44, 46, 46, 47, 49, 50,
+ 49, 47, 47, 46, 47, 47, 48, 50, 51, 53,
+ 48, 47, 46, 45, 46, 46, 46, 49, 51, 53, 54,
+ 48, 47, 46, 45, 46, 46, 46, 49, 51, 53, 54, 55,
+ 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56, 58,
+ 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61,
+ 51, 48, 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62,
+ 52, 50, 48, 47, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63, 64, 66,
+ 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 64, 65, 68, 70,
+ 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60, 62, 65, 66, 68, 70, 71,
+ 57, 54, 53, 52, 51, 50, 50, 53, 54, 56, 60, 61, 63, 66, 67, 70, 73, 73, 76,
+ 59, 56, 54, 53, 53, 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80,
+ 60, 57, 55, 54, 53, 53, 52, 55, 56, 58, 61, 63, 65, 68, 69, 72, 75, 76, 79, 81, 82,
+ 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 63, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89,
+ 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 89, 90,
+ 65, 61, 60, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85, 86, 90, 91, 91,
+ 67, 63, 61, 60, 59, 58, 57, 60, 61, 63, 65, 66, 69, 72, 73, 77, 79, 80, 84, 86, 88, 92, 93, 93, 95,
+ 68, 64, 63, 61, 60, 59, 58, 60, 61, 63, 65, 67, 70, 71, 74, 76, 78, 81, 83, 86, 88, 89, 94, 94, 95, 97,
+ 68, 65, 64, 62, 61, 60, 58, 59, 61, 64, 64, 68, 69, 71, 74, 75, 79, 80, 83, 86, 87, 91, 92, 95, 96, 97, 99,
+ 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67, 69, 72, 72, 76, 78, 80, 83, 84, 88, 89, 92, 94, 97, 98, 99, 101,
+ 70, 67, 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 86, 90, 91, 94, 96, 99, 100, 100, 103,
+ 71, 67, 67, 64, 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74, 74, 78, 79, 83, 84, 87, 89, 91, 94, 95, 99, 100, 102, 102, 104,
+ 72, 68, 68, 65, 65, 64, 63, 61, 62, 65, 66, 68, 69, 71, 73, 75, 77, 79, 82, 84, 87, 88, 92, 93, 96, 97, 101, 102, 104, 104, 106,
+ 73, 69, 69, 66, 66, 64, 64, 62, 62, 66, 66, 69, 69, 72, 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99, 99, 104, 104, 106, 106, 108,
+ },
+ }, {
+ {
+ 32,
+ 31, 32,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 32, 33,
+ 31, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 33, 34, 35,
+ 32, 33, 33, 33, 34, 34, 36, 36,
+ 34, 34, 34, 33, 35, 35, 37, 38, 39,
+ 35, 35, 34, 34, 36, 36, 38, 39, 42, 46,
+ 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48,
+ 39, 38, 38, 37, 39, 39, 40, 42, 45, 49, 50, 54,
+ 41, 40, 39, 38, 40, 40, 41, 43, 46, 50, 52, 55, 57,
+ 44, 42, 42, 41, 42, 42, 42, 44, 47, 52, 54, 58, 60, 63,
+ 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69,
+ 48, 46, 45, 44, 45, 45, 46, 47, 51, 55, 57, 61, 63, 67, 70, 71,
+ 54, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82,
+ 56, 53, 52, 51, 51, 51, 51, 53, 56, 60, 61, 66, 69, 73, 77, 78, 84, 86,
+ 59, 56, 55, 54, 54, 54, 53, 55, 58, 62, 64, 69, 71, 75, 79, 80, 87, 89, 92,
+ 64, 61, 60, 58, 58, 58, 57, 59, 62, 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102,
+ 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 75, 79, 84, 85, 92, 94, 98, 103, 105,
+ 71, 68, 67, 65, 64, 64, 63, 65, 68, 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111, 117,
+ 74, 71, 69, 68, 67, 67, 65, 67, 70, 74, 75, 80, 83, 86, 91, 93, 100, 102, 106, 112, 114, 120, 123,
+ 80, 76, 74, 72, 71, 71, 69, 71, 74, 78, 79, 84, 86, 90, 95, 96, 104, 106, 110, 116, 118, 125, 128, 134,
+ 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139,
+ 83, 78, 77, 75, 74, 74, 72, 73, 76, 80, 81, 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139, 140,
+ 87, 83, 81, 79, 78, 78, 75, 77, 80, 83, 85, 90, 92, 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142, 144, 145, 150,
+ 90, 85, 84, 81, 80, 80, 78, 78, 82, 84, 87, 91, 93, 98, 99, 106, 108, 113, 118, 121, 129, 130, 137, 141, 147, 150, 151, 156,
+ 92, 88, 87, 84, 83, 82, 80, 80, 84, 85, 90, 91, 95, 98, 102, 106, 109, 115, 117, 125, 126, 134, 137, 142, 148, 152, 155, 156, 162,
+ 95, 90, 89, 86, 85, 84, 83, 82, 85, 87, 91, 92, 97, 98, 105, 105, 112, 114, 121, 123, 129, 133, 138, 143, 147, 155, 158, 161, 162, 168,
+ 97, 92, 92, 88, 88, 86, 86, 84, 85, 90, 91, 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133, 140, 143, 150, 152, 162, 164, 168, 168, 174,
+ 100, 95, 95, 90, 90, 89, 89, 86, 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129, 137, 137, 147, 148, 157, 158, 169, 170, 174, 175, 181,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 30, 31, 31, 32,
+ 33, 34, 34, 34, 37,
+ 33, 34, 35, 35, 38, 39,
+ 36, 38, 39, 40, 42, 43, 47,
+ 38, 40, 40, 41, 43, 44, 47, 47,
+ 41, 42, 42, 42, 44, 45, 47, 48, 48,
+ 47, 46, 46, 45, 46, 47, 47, 48, 50, 52,
+ 49, 47, 47, 46, 47, 47, 48, 49, 50, 52, 53,
+ 48, 47, 46, 45, 46, 46, 46, 48, 49, 52, 53, 54,
+ 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55, 55,
+ 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58,
+ 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61,
+ 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58, 60, 61, 61,
+ 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 63, 66,
+ 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58, 59, 62, 64, 64, 67, 68,
+ 54, 52, 51, 49, 49, 49, 48, 49, 52, 55, 55, 58, 60, 62, 64, 65, 68, 69, 71,
+ 56, 54, 53, 51, 51, 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75,
+ 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 56, 60, 61, 63, 66, 67, 70, 71, 73, 76, 76,
+ 60, 57, 56, 54, 53, 53, 52, 53, 55, 58, 58, 61, 63, 65, 68, 68, 72, 73, 75, 78, 79, 82,
+ 61, 58, 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69, 73, 74, 76, 79, 80, 83, 84,
+ 63, 60, 59, 57, 56, 56, 54, 55, 57, 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89,
+ 64, 61, 60, 58, 57, 57, 55, 56, 58, 60, 61, 64, 66, 68, 70, 71, 75, 77, 79, 82, 82, 86, 87, 90, 91,
+ 65, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 88, 90, 91, 91,
+ 67, 63, 62, 60, 59, 59, 57, 58, 60, 62, 63, 66, 67, 69, 72, 73, 77, 78, 80, 83, 84, 88, 89, 92, 93, 93, 95,
+ 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65, 67, 70, 70, 74, 75, 78, 80, 81, 85, 86, 89, 91, 93, 94, 95, 97,
+ 68, 65, 64, 62, 61, 60, 59, 58, 61, 61, 64, 65, 67, 69, 71, 73, 75, 78, 79, 83, 83, 87, 88, 91, 93, 95, 96, 97, 99,
+ 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 72, 76, 76, 80, 81, 84, 86, 88, 90, 92, 95, 96, 98, 98, 100,
+ 70, 66, 66, 63, 63, 62, 61, 60, 60, 63, 64, 66, 67, 69, 71, 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, 100, 100, 102,
+ 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67, 70, 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100, 101, 101, 104,
+ },
+ }, {
+ {
+ 32,
+ 31, 32,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 32, 32,
+ 31, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 33, 33, 34,
+ 32, 32, 32, 32, 33, 34, 35, 35,
+ 33, 33, 33, 33, 34, 35, 36, 36, 38,
+ 34, 34, 34, 33, 34, 35, 36, 37, 39, 39,
+ 36, 35, 35, 34, 35, 36, 37, 38, 42, 42, 48,
+ 36, 35, 35, 34, 35, 36, 38, 38, 42, 43, 48, 49,
+ 39, 38, 38, 37, 38, 39, 40, 40, 44, 45, 50, 51, 54,
+ 41, 39, 39, 38, 39, 40, 40, 41, 45, 46, 51, 52, 55, 56,
+ 44, 42, 42, 41, 41, 42, 42, 42, 46, 47, 54, 54, 58, 59, 63,
+ 46, 44, 44, 42, 43, 44, 44, 44, 48, 49, 55, 55, 59, 61, 65, 67,
+ 48, 46, 46, 44, 45, 45, 45, 46, 50, 51, 57, 57, 61, 63, 67, 69, 71,
+ 52, 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, 64, 65, 70, 72, 74, 78,
+ 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71, 74, 76, 80, 82,
+ 58, 56, 55, 53, 53, 53, 53, 53, 57, 58, 63, 64, 68, 70, 75, 77, 80, 84, 86, 91,
+ 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70, 75, 78, 80, 85, 87, 91, 92,
+ 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105,
+ 66, 63, 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, 99, 106, 107,
+ 71, 68, 67, 65, 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95, 97, 103, 103, 111, 112, 117,
+ 74, 71, 70, 68, 67, 67, 66, 65, 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, 106, 114, 115, 120, 123,
+ 80, 76, 75, 72, 72, 71, 70, 69, 73, 74, 79, 79, 84, 86, 90, 93, 96, 101, 104, 110, 110, 118, 119, 125, 128, 134,
+ 81, 77, 77, 74, 73, 73, 71, 71, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111, 112, 120, 121, 127, 130, 136, 137,
+ 83, 78, 78, 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88, 92, 95, 99, 104, 106, 112, 113, 121, 122, 128, 131, 137, 139, 140,
+ 86, 82, 81, 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91, 95, 98, 101, 106, 109, 115, 116, 124, 125, 131, 135, 140, 142, 144, 147,
+ 89, 84, 84, 80, 80, 79, 78, 77, 79, 81, 85, 86, 91, 92, 97, 98, 104, 106, 112, 114, 119, 123, 128, 132, 135, 142, 145, 148, 149, 153,
+ 91, 86, 86, 82, 82, 81, 80, 79, 80, 84, 85, 88, 91, 94, 97, 100, 104, 107, 112, 115, 120, 123, 129, 132, 138, 140, 148, 150, 153, 154, 159,
+ 93, 88, 88, 84, 84, 83, 83, 80, 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118, 119, 126, 126, 135, 136, 144, 144, 155, 155, 159, 159, 164,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 30, 31, 31, 32,
+ 31, 32, 32, 33, 34,
+ 33, 34, 35, 35, 37, 39,
+ 35, 37, 37, 38, 39, 41, 44,
+ 36, 38, 39, 40, 41, 43, 46, 47,
+ 40, 41, 41, 42, 43, 44, 46, 47, 48,
+ 41, 42, 42, 42, 43, 45, 46, 47, 48, 48,
+ 49, 47, 47, 46, 46, 47, 47, 48, 50, 50, 53,
+ 49, 47, 47, 46, 46, 47, 47, 47, 49, 50, 53, 53,
+ 48, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54,
+ 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53, 53, 54, 55,
+ 49, 47, 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56, 58,
+ 50, 47, 47, 45, 46, 46, 46, 46, 49, 49, 54, 54, 56, 57, 59, 60,
+ 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, 54, 54, 56, 57, 60, 60, 61,
+ 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65,
+ 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65, 66,
+ 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 55, 58, 59, 62, 63, 65, 67, 68, 70,
+ 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 56, 58, 60, 62, 64, 65, 67, 68, 70, 71,
+ 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 56, 57, 60, 61, 63, 65, 67, 69, 70, 73, 73, 76,
+ 57, 55, 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, 67, 70, 71, 73, 74, 77, 77,
+ 60, 57, 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61, 63, 65, 67, 68, 71, 72, 75, 75, 79, 79, 82,
+ 61, 58, 57, 55, 55, 54, 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80, 83, 84,
+ 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 63, 65, 67, 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89,
+ 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 89, 90,
+ 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64, 65, 68, 70, 71, 74, 75, 78, 79, 83, 83, 86, 88, 90, 91, 91,
+ 66, 63, 62, 60, 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76, 79, 80, 84, 84, 87, 89, 91, 92, 93, 94,
+ 67, 64, 63, 61, 60, 59, 58, 57, 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87, 89, 92, 93, 94, 94, 96,
+ 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62, 64, 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94, 96, 96, 98,
+ 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97, 99,
+ },
+ }, {
+ {
+ 32,
+ 31, 32,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 34, 34, 35,
+ 32, 32, 32, 32, 32, 34, 34, 35, 35,
+ 34, 34, 34, 33, 33, 35, 35, 37, 37, 39,
+ 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39,
+ 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48,
+ 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48,
+ 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54,
+ 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54,
+ 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63,
+ 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63,
+ 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71,
+ 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71, 71,
+ 54, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82,
+ 54, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82,
+ 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92,
+ 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92,
+ 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105,
+ 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105,
+ 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117,
+ 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117, 117,
+ 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134,
+ 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134, 134,
+ 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, 128, 128, 137, 137, 140,
+ 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140,
+ 87, 83, 83, 79, 79, 77, 77, 75, 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109, 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 30, 31, 31, 32,
+ 30, 31, 31, 32, 32,
+ 33, 34, 34, 35, 35, 39,
+ 33, 34, 34, 35, 35, 39, 39,
+ 36, 38, 38, 40, 40, 43, 43, 47,
+ 36, 38, 38, 40, 40, 43, 43, 47, 47,
+ 41, 42, 42, 42, 42, 45, 45, 47, 47, 48,
+ 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48,
+ 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53,
+ 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 53,
+ 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54,
+ 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54,
+ 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58,
+ 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58,
+ 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61,
+ 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61, 61,
+ 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66,
+ 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66,
+ 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71,
+ 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71,
+ 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76,
+ 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 76,
+ 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82,
+ 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82,
+ 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89,
+ 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89,
+ 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91,
+ 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91, 91,
+ 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 33, 33, 34,
+ 32, 32, 32, 32, 32, 33, 34, 34, 35,
+ 32, 32, 32, 32, 33, 33, 34, 34, 35, 35,
+ 34, 34, 34, 33, 33, 34, 35, 35, 37, 37, 39,
+ 34, 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 39,
+ 35, 35, 35, 34, 34, 35, 36, 36, 38, 38, 42, 42, 46,
+ 36, 35, 35, 34, 34, 35, 36, 37, 38, 38, 42, 42, 47, 48,
+ 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 50, 51,
+ 39, 38, 38, 38, 37, 38, 39, 39, 40, 41, 45, 45, 49, 50, 52, 54,
+ 41, 40, 40, 39, 38, 39, 40, 40, 41, 41, 46, 46, 50, 52, 54, 55, 57,
+ 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, 56, 58, 60, 63,
+ 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48, 48, 53, 54, 57, 58, 60, 64, 65,
+ 48, 46, 46, 45, 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71,
+ 48, 46, 46, 45, 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71,
+ 53, 51, 51, 49, 49, 49, 49, 49, 49, 49, 54, 54, 58, 59, 62, 64, 67, 71, 72, 75, 75, 81,
+ 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82,
+ 57, 55, 55, 53, 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74, 75, 79, 79, 85, 85, 89,
+ 59, 56, 56, 54, 54, 54, 54, 54, 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80, 86, 87, 90, 92,
+ 62, 59, 59, 57, 56, 56, 56, 56, 55, 56, 60, 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, 95, 98,
+ 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, 67, 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105,
+ 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, 64, 64, 68, 69, 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, 108,
+ 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68, 72, 73, 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113, 117,
+ 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72, 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111, 113, 118, 119,
+ 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, 134,
+ 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, 134, 134,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 30, 31, 31, 31,
+ 30, 31, 31, 31, 32,
+ 32, 32, 33, 33, 33, 35,
+ 33, 34, 34, 35, 35, 37, 39,
+ 34, 35, 35, 36, 36, 38, 40, 41,
+ 36, 38, 38, 39, 40, 41, 43, 44, 47,
+ 37, 38, 39, 40, 40, 42, 43, 44, 47, 47,
+ 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48,
+ 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48, 48,
+ 47, 46, 46, 46, 45, 46, 47, 47, 47, 48, 50, 50, 52,
+ 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50, 50, 52, 53,
+ 49, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53,
+ 48, 47, 47, 46, 45, 46, 46, 46, 46, 47, 49, 49, 52, 53, 54, 54,
+ 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 52, 53, 54, 55, 55,
+ 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, 55, 55, 57, 58,
+ 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56, 57, 59, 59,
+ 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61,
+ 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61,
+ 52, 50, 49, 48, 47, 47, 47, 47, 46, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66,
+ 52, 50, 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66,
+ 54, 51, 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54, 55, 57, 58, 60, 62, 62, 65, 65, 67, 68, 69,
+ 54, 52, 52, 50, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68, 70, 71,
+ 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, 52, 52, 55, 56, 58, 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73,
+ 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 56, 58, 60, 61, 63, 64, 67, 67, 70, 70, 72, 73, 75, 76,
+ 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56, 57, 59, 60, 62, 64, 65, 67, 67, 71, 71, 73, 74, 75, 77, 78,
+ 60, 57, 57, 55, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66, 68, 68, 72, 72, 74, 75, 77, 79, 80, 82,
+ 60, 57, 57, 55, 54, 54, 54, 53, 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75, 76, 77, 79, 80, 82, 82,
+ 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89,
+ 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 31, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35,
+ 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 38,
+ 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39, 39,
+ 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42,
+ 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48,
+ 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, 48,
+ 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, 52,
+ 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44, 45, 47, 50, 50, 53, 54,
+ 41, 40, 40, 39, 38, 38, 40, 40, 40, 41, 41, 45, 46, 48, 52, 52, 54, 55, 57,
+ 44, 42, 42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63,
+ 44, 42, 42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63,
+ 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 49, 50, 52, 56, 56, 59, 60, 62, 66, 66, 69,
+ 48, 47, 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57, 60, 61, 63, 67, 67, 70, 71,
+ 50, 49, 48, 47, 46, 46, 47, 47, 47, 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68, 68, 72, 73, 75,
+ 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82,
+ 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82,
+ 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90,
+ 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57, 58, 60, 64, 64, 68, 69, 71, 75, 75, 79, 80, 83, 87, 87, 91, 92,
+ 61, 59, 58, 57, 56, 56, 56, 56, 55, 55, 55, 59, 60, 62, 65, 65, 69, 70, 73, 77, 77, 81, 82, 85, 89, 89, 93, 94, 97,
+ 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105,
+ 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105, 105,
+ 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67, 69, 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105, 109, 109, 114,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 30, 31, 31, 31, 32,
+ 30, 31, 31, 31, 32, 32,
+ 33, 33, 34, 34, 34, 34, 37,
+ 33, 34, 34, 35, 35, 35, 38, 39,
+ 34, 36, 36, 36, 37, 37, 40, 40, 42,
+ 36, 38, 38, 39, 40, 40, 42, 43, 45, 47,
+ 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47,
+ 40, 41, 41, 41, 42, 42, 44, 44, 45, 47, 47, 48,
+ 41, 42, 42, 42, 42, 42, 44, 45, 46, 47, 47, 48, 48,
+ 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47, 49, 49, 50,
+ 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53,
+ 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53,
+ 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54,
+ 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 54,
+ 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 55, 55,
+ 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58,
+ 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58,
+ 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 57, 59, 59, 61,
+ 50, 49, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 58, 60, 60, 61, 61,
+ 51, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 51, 54, 54, 56, 57, 58, 60, 60, 62, 62, 63,
+ 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66,
+ 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66,
+ 54, 52, 51, 50, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70,
+ 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 71,
+ 55, 53, 53, 52, 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60, 63, 63, 65, 66, 67, 69, 69, 71, 72, 73,
+ 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76,
+ 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76, 76,
+ 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56, 58, 58, 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78, 80,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 31, 32,
+ 31, 31, 32, 32,
+ 31, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35,
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, 36,
+ 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39,
+ 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39,
+ 35, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 39, 41, 41, 43,
+ 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48,
+ 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 48,
+ 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47, 50, 50, 51,
+ 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54,
+ 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54,
+ 42, 41, 41, 41, 40, 40, 40, 41, 41, 41, 42, 42, 44, 47, 47, 49, 53, 53, 55, 56, 56, 60,
+ 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, 54, 54, 56, 58, 58, 61, 63,
+ 44, 43, 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45, 48, 48, 51, 54, 54, 56, 58, 58, 62, 64, 64,
+ 47, 46, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69,
+ 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, 46, 46, 47, 51, 51, 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71,
+ 49, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 51, 51, 54, 57, 57, 60, 62, 62, 66, 68, 68, 71, 72, 73,
+ 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51, 54, 54, 57, 59, 59, 62, 64, 64, 69, 71, 71, 74, 75, 77, 81,
+ 54, 52, 51, 51, 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82,
+ 55, 53, 53, 52, 51, 50, 50, 51, 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73, 76, 77, 78, 83, 83, 85,
+ 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92,
+ 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 30, 31, 31, 31, 31,
+ 30, 31, 31, 31, 31, 32,
+ 31, 31, 32, 32, 32, 32, 33,
+ 33, 34, 34, 34, 35, 35, 35, 38,
+ 33, 34, 34, 34, 35, 35, 36, 38, 39,
+ 34, 35, 35, 36, 36, 36, 37, 40, 40, 41,
+ 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47,
+ 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 47,
+ 38, 39, 40, 40, 41, 41, 41, 43, 44, 45, 47, 47, 47,
+ 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48,
+ 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48,
+ 45, 45, 45, 45, 44, 44, 44, 46, 46, 46, 47, 47, 48, 49, 49, 50,
+ 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53,
+ 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53, 53,
+ 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53,
+ 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54,
+ 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54,
+ 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 54, 55, 55, 57,
+ 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 55, 55, 57, 58,
+ 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 58, 58, 59,
+ 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 59, 61,
+ 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61,
+ 51, 49, 48, 48, 47, 46, 46, 47, 47, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 62, 62,
+ 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66,
+ 52, 50, 50, 49, 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 66,
+ 53, 51, 50, 50, 48, 48, 48, 48, 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62, 63, 64, 64, 67, 67, 68,
+ 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71,
+ 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71, 71,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 31, 32,
+ 31, 31, 32, 32,
+ 31, 31, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35,
+ 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36,
+ 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39,
+ 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39,
+ 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 38, 40, 40, 41,
+ 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42, 42, 43, 46,
+ 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48,
+ 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48,
+ 38, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 39, 39, 39, 41, 44, 44, 45, 48, 50, 50, 51,
+ 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, 50, 50, 52, 54,
+ 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54,
+ 41, 40, 40, 40, 39, 38, 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55, 55, 57,
+ 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63,
+ 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63,
+ 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43, 43, 45, 48, 48, 49, 53, 54, 54, 57, 58, 58, 60, 64, 64, 65,
+ 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69,
+ 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71,
+ 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71, 71,
+ 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48, 50, 53, 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74, 77,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 30, 31, 31, 31, 31, 32,
+ 30, 31, 31, 31, 31, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 34, 34, 34, 35, 37,
+ 33, 34, 34, 34, 35, 35, 35, 36, 38, 39,
+ 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39,
+ 35, 36, 37, 37, 37, 38, 38, 38, 41, 41, 41, 44,
+ 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47,
+ 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47, 47,
+ 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47,
+ 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48,
+ 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, 48, 49, 49, 49,
+ 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52,
+ 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53,
+ 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53,
+ 49, 48, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53,
+ 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53, 53, 54, 54,
+ 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54,
+ 49, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55, 55, 55,
+ 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58,
+ 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58,
+ 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 56, 56, 57, 59, 59, 59,
+ 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61,
+ 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61,
+ 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61, 61,
+ 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63, 64,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 31, 32,
+ 31, 31, 32, 32,
+ 31, 31, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37,
+ 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39,
+ 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39,
+ 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39,
+ 35, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 39, 41, 41, 41, 43,
+ 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48,
+ 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48,
+ 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48,
+ 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39, 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51,
+ 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54,
+ 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54,
+ 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54,
+ 41, 41, 40, 40, 40, 39, 39, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52, 52, 54, 56, 56, 56, 58,
+ 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63,
+ 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31,
+ 30, 31, 31, 31, 31, 31, 32,
+ 30, 31, 31, 31, 31, 31, 32, 32,
+ 30, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 35,
+ 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39,
+ 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39,
+ 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39,
+ 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41, 41, 41, 43,
+ 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47,
+ 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47,
+ 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47,
+ 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, 47, 47, 47, 47,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48,
+ 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 50,
+ 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53,
+ 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53,
+ 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 53,
+ 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53,
+ 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54,
+ 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54,
+ 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54,
+ 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 56,
+ 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58,
+ 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, 58,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 32,
+ 31, 31, 31, 32, 32,
+ 31, 31, 31, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38,
+ 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39,
+ 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39,
+ 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 41, 41, 41, 42,
+ 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46,
+ 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48,
+ 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48,
+ 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48, 48,
+ 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49, 50,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31,
+ 30, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 31, 31, 31, 31, 31, 31, 32,
+ 30, 30, 31, 31, 31, 31, 31, 31, 32, 32,
+ 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 37,
+ 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39,
+ 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39,
+ 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39,
+ 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 40, 40, 40, 42,
+ 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44, 46,
+ 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47,
+ 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47,
+ 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47,
+ 38, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 42, 43, 44, 44, 44, 45, 47, 47, 47, 47, 47,
+ 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50,
+ 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52,
+ 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53,
+ 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53,
+ 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53, 53,
+ 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 32, 32,
+ 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 36,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 38,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37,
+ 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39,
+ 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39,
+ 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39,
+ 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39,
+ 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41,
+ 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 41, 41, 41, 41, 41, 42, 44,
+ 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 41, 42, 43, 43, 43, 43, 44, 45, 46,
+ 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47,
+ 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47,
+ 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47,
+ 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47,
+ 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47,
+ 40, 40, 40, 41, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32,
+ 31, 31, 31, 31, 31, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36,
+ 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39,
+ 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ },
+ },
+};
+
+const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
+static uint8_t qm_tbl_4x4[15][2][16];
+static uint8_t qm_tbl_4x8[15][2][32];
+static uint8_t qm_tbl_4x16[15][2][64];
+static uint8_t qm_tbl_8x4[15][2][32];
+static uint8_t qm_tbl_8x8[15][2][64];
+static uint8_t qm_tbl_8x16[15][2][128];
+static uint8_t qm_tbl_8x32[15][2][256];
+static uint8_t qm_tbl_16x4[15][2][64];
+static uint8_t qm_tbl_16x8[15][2][128];
+static uint8_t qm_tbl_16x16[15][2][256];
+static uint8_t qm_tbl_16x32[15][2][512];
+static uint8_t qm_tbl_32x8[15][2][256];
+static uint8_t qm_tbl_32x32[15][2][1024];
+
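+// Helper functions for dav1d_init_qm_tables() below: every matrix size that
+// is not stored directly is derived from the 32x32 (stored in triangular
+// form) and 32x16 base matrices by expanding the triangle, subsampling every
+// hstep-th column / vstep-th row, and transposing.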
+static void subsample(uint8_t *dst, const uint8_t *const src,
+ const int h, const int hstep, const int vstep)
+{
+ for (int y = 0; y < h; y += vstep)
+ for (int x = 0; x < 32; x += hstep)
+ *dst++ = src[y * 32 + x];
+}
+
+static void transpose(uint8_t *const dst, const uint8_t *const src,
+ const int w, const int h)
+{
+ for (int y = 0, y_off = 0; y < h; y++, y_off += w)
+ for (int x = 0, x_off = 0; x < w; x++, x_off += h)
+ dst[x_off + y] = src[y_off + x];
+}
+
+static void untriangle(uint8_t *dst, const uint8_t *src, const int sz) {
+ for (int y = 0; y < sz; y++) {
+ memcpy(dst, src, y + 1);
+ const uint8_t *src_ptr = &src[y];
+ for (int x = y + 1; x < sz; x++) {
+ src_ptr += x;
+ dst[x] = *src_ptr;
+ }
+ dst += sz;
+ src += y + 1;
+ }
+}
+
+COLD void dav1d_init_qm_tables(void) {
+ // This function is guaranteed to be called only once
+
+ for (int i = 0; i < 15; i++)
+ for (int j = 0; j < 2; j++) {
+            // note that w and h are swapped in the assignments below; this is
+            // intentional because we store the coefficients transposed
+ dav1d_qm_tbl[i][j][RTX_4X8 ] = qm_tbl_8x4[i][j];
+ dav1d_qm_tbl[i][j][RTX_8X4 ] = qm_tbl_4x8[i][j];
+ dav1d_qm_tbl[i][j][RTX_4X16 ] = qm_tbl_16x4[i][j];
+ dav1d_qm_tbl[i][j][RTX_16X4 ] = qm_tbl_4x16[i][j];
+ dav1d_qm_tbl[i][j][RTX_8X16 ] = qm_tbl_16x8[i][j];
+ dav1d_qm_tbl[i][j][RTX_16X8 ] = qm_tbl_8x16[i][j];
+ dav1d_qm_tbl[i][j][RTX_8X32 ] = qm_tbl_32x8[i][j];
+ dav1d_qm_tbl[i][j][RTX_32X8 ] = qm_tbl_8x32[i][j];
+ dav1d_qm_tbl[i][j][RTX_16X32] = qm_tbl_32x16[i][j];
+ dav1d_qm_tbl[i][j][RTX_32X16] = qm_tbl_16x32[i][j];
+
+ dav1d_qm_tbl[i][j][ TX_4X4 ] = qm_tbl_4x4[i][j];
+ dav1d_qm_tbl[i][j][ TX_8X8 ] = qm_tbl_8x8[i][j];
+ dav1d_qm_tbl[i][j][ TX_16X16] = qm_tbl_16x16[i][j];
+ dav1d_qm_tbl[i][j][ TX_32X32] = qm_tbl_32x32[i][j];
+
+ untriangle(qm_tbl_32x32[i][j], qm_tbl_32x32_t[i][j], 32);
+ subsample(qm_tbl_4x4[i][j], &qm_tbl_32x32[i][j][32*3+3], 32, 8, 8);
+ subsample(qm_tbl_8x4[i][j], &qm_tbl_32x16[i][j][32*1+1], 16, 4, 4);
+ subsample(qm_tbl_8x8[i][j], &qm_tbl_32x32[i][j][32*1+1], 32, 4, 4);
+ subsample(qm_tbl_16x4[i][j], &qm_tbl_32x16[i][j][32*1+0], 16, 2, 4);
+ subsample(qm_tbl_16x8[i][j], &qm_tbl_32x16[i][j][32*0+0], 16, 2, 2);
+ subsample(qm_tbl_16x16[i][j], &qm_tbl_32x32[i][j][32*0+0], 32, 2, 2);
+ subsample(qm_tbl_32x8[i][j], &qm_tbl_32x16[i][j][32*0+0], 16, 1, 2);
+ transpose(qm_tbl_4x8[i][j], qm_tbl_8x4[i][j], 8, 4);
+ transpose(qm_tbl_4x16[i][j], qm_tbl_16x4[i][j], 16, 4);
+ transpose(qm_tbl_8x16[i][j], qm_tbl_16x8[i][j], 16, 8);
+ transpose(qm_tbl_8x32[i][j], qm_tbl_32x8[i][j], 32, 8);
+ transpose(qm_tbl_16x32[i][j], qm_tbl_32x16[i][j], 32, 16);
+
+ dav1d_qm_tbl[i][j][ TX_64X64] = dav1d_qm_tbl[i][j][ TX_32X32];
+ dav1d_qm_tbl[i][j][RTX_64X32] = dav1d_qm_tbl[i][j][ TX_32X32];
+ dav1d_qm_tbl[i][j][RTX_64X16] = dav1d_qm_tbl[i][j][RTX_32X16];
+ dav1d_qm_tbl[i][j][RTX_32X64] = dav1d_qm_tbl[i][j][ TX_32X32];
+ dav1d_qm_tbl[i][j][RTX_16X64] = dav1d_qm_tbl[i][j][RTX_16X32];
+ }
+
+ // dav1d_qm_tbl[15][*][*] == NULL
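+    // (a qm index of 15 signals that no quantizer matrix is applied)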
+}
diff --git a/third_party/dav1d/src/qm.h b/third_party/dav1d/src/qm.h
new file mode 100644
index 0000000000..8191c8afa7
--- /dev/null
+++ b/third_party/dav1d/src/qm.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_QM_H
+#define DAV1D_SRC_QM_H
+
+#include "src/levels.h"
+
+EXTERN const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
+
+void dav1d_init_qm_tables(void);
+
+#endif /* DAV1D_SRC_QM_H */
diff --git a/third_party/dav1d/src/recon.h b/third_party/dav1d/src/recon.h
new file mode 100644
index 0000000000..721924916f
--- /dev/null
+++ b/third_party/dav1d/src/recon.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_RECON_H
+#define DAV1D_SRC_RECON_H
+
+#include "src/internal.h"
+#include "src/levels.h"
+
+#define DEBUG_BLOCK_INFO 0 && \
+ f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
+ t->bx >= 8 && t->bx < 12
+#define DEBUG_B_PIXELS 0
+
+#define decl_recon_b_intra_fn(name) \
+void (name)(Dav1dTaskContext *t, enum BlockSize bs, \
+ enum EdgeFlags intra_edge_flags, const Av1Block *b)
+typedef decl_recon_b_intra_fn(*recon_b_intra_fn);
+
+#define decl_recon_b_inter_fn(name) \
+int (name)(Dav1dTaskContext *t, enum BlockSize bs, const Av1Block *b)
+typedef decl_recon_b_inter_fn(*recon_b_inter_fn);
+
+#define decl_filter_sbrow_fn(name) \
+void (name)(Dav1dFrameContext *f, int sby)
+typedef decl_filter_sbrow_fn(*filter_sbrow_fn);
+
+#define decl_backup_ipred_edge_fn(name) \
+void (name)(Dav1dTaskContext *t)
+typedef decl_backup_ipred_edge_fn(*backup_ipred_edge_fn);
+
+#define decl_read_coef_blocks_fn(name) \
+void (name)(Dav1dTaskContext *t, enum BlockSize bs, const Av1Block *b)
+typedef decl_read_coef_blocks_fn(*read_coef_blocks_fn);
+
+#define decl_copy_pal_block_fn(name) \
+void (name)(Dav1dTaskContext *t, int bx4, int by4, int bw4, int bh4)
+typedef decl_copy_pal_block_fn(*copy_pal_block_fn);
+
+#define decl_read_pal_plane_fn(name) \
+void (name)(Dav1dTaskContext *t, Av1Block *b, int pl, int sz_ctx, int bx4, int by4)
+typedef decl_read_pal_plane_fn(*read_pal_plane_fn);
+
+#define decl_read_pal_uv_fn(name) \
+void (name)(Dav1dTaskContext *t, Av1Block *b, int sz_ctx, int bx4, int by4)
+typedef decl_read_pal_uv_fn(*read_pal_uv_fn);
+
+decl_recon_b_intra_fn(dav1d_recon_b_intra_8bpc);
+decl_recon_b_intra_fn(dav1d_recon_b_intra_16bpc);
+
+decl_recon_b_inter_fn(dav1d_recon_b_inter_8bpc);
+decl_recon_b_inter_fn(dav1d_recon_b_inter_16bpc);
+
+decl_filter_sbrow_fn(dav1d_filter_sbrow_8bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_16bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_cols_8bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_cols_16bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_rows_8bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_rows_16bpc);
+void dav1d_filter_sbrow_cdef_8bpc(Dav1dTaskContext *tc, int sby);
+void dav1d_filter_sbrow_cdef_16bpc(Dav1dTaskContext *tc, int sby);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_8bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_16bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_8bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_16bpc);
+
+decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_8bpc);
+decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_16bpc);
+
+decl_read_coef_blocks_fn(dav1d_read_coef_blocks_8bpc);
+decl_read_coef_blocks_fn(dav1d_read_coef_blocks_16bpc);
+
+decl_copy_pal_block_fn(dav1d_copy_pal_block_y_8bpc);
+decl_copy_pal_block_fn(dav1d_copy_pal_block_y_16bpc);
+decl_copy_pal_block_fn(dav1d_copy_pal_block_uv_8bpc);
+decl_copy_pal_block_fn(dav1d_copy_pal_block_uv_16bpc);
+decl_read_pal_plane_fn(dav1d_read_pal_plane_8bpc);
+decl_read_pal_plane_fn(dav1d_read_pal_plane_16bpc);
+decl_read_pal_uv_fn(dav1d_read_pal_uv_8bpc);
+decl_read_pal_uv_fn(dav1d_read_pal_uv_16bpc);
+
+#endif /* DAV1D_SRC_RECON_H */
diff --git a/third_party/dav1d/src/recon_tmpl.c b/third_party/dav1d/src/recon_tmpl.c
new file mode 100644
index 0000000000..9d1a0da6bf
--- /dev/null
+++ b/third_party/dav1d/src/recon_tmpl.c
@@ -0,0 +1,2361 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+#include <stdio.h>
+
+#include "common/attributes.h"
+#include "common/bitdepth.h"
+#include "common/dump.h"
+#include "common/frame.h"
+#include "common/intops.h"
+
+#include "src/cdef_apply.h"
+#include "src/ctx.h"
+#include "src/ipred_prepare.h"
+#include "src/lf_apply.h"
+#include "src/lr_apply.h"
+#include "src/recon.h"
+#include "src/scan.h"
+#include "src/tables.h"
+#include "src/wedge.h"
+
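+/* Exp-Golomb decoding with equiprobable bits: a unary prefix of up to 32
+ * zero bits selects the magnitude class, followed by as many literal bits;
+ * e.g. prefix 0,0,1 and suffix 1,0 give val = 0b110 = 6 and return 5.
+ * Used for coefficient level remainders once a token saturates at 15. */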
+static inline unsigned read_golomb(MsacContext *const msac) {
+ int len = 0;
+ unsigned val = 1;
+
+ while (!dav1d_msac_decode_bool_equi(msac) && len < 32) len++;
+ while (len--) val = (val << 1) + dav1d_msac_decode_bool_equi(msac);
+
+ return val - 1;
+}
+
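+/* Context for the "all coefficients zero" (skip) flag. The per-4x4 context
+ * bytes in a[] (above) and l[] (left) hold min(cul_level, 63) in bits 0-5
+ * and the dc sign class in bits 6-7, so 0x40 marks a block without any
+ * coefficients (see the end of decode_coefs() below). */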
+static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
+ const enum BlockSize bs,
+ const uint8_t *const a,
+ const uint8_t *const l,
+ const int chroma,
+ const enum Dav1dPixelLayout layout)
+{
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+
+ if (chroma) {
+ const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
+ b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
+ unsigned ca, cl;
+
+#define MERGE_CTX(dir, type, no_val) \
+ c##dir = *(const type *) dir != no_val; \
+ break
+
+ switch (t_dim->lw) {
+ /* For some reason the MSVC CRT _wassert() function is not flagged as
+ * __declspec(noreturn), so when using those headers the compiler will
+ * expect execution to continue after an assertion has been triggered
+ * and will therefore complain about the use of uninitialized variables
+ * when compiled in debug mode if we put the default case at the end. */
+ default: assert(0); /* fall-through */
+ case TX_4X4: MERGE_CTX(a, uint8_t, 0x40);
+ case TX_8X8: MERGE_CTX(a, uint16_t, 0x4040);
+ case TX_16X16: MERGE_CTX(a, uint32_t, 0x40404040U);
+ case TX_32X32: MERGE_CTX(a, uint64_t, 0x4040404040404040ULL);
+ }
+ switch (t_dim->lh) {
+ default: assert(0); /* fall-through */
+ case TX_4X4: MERGE_CTX(l, uint8_t, 0x40);
+ case TX_8X8: MERGE_CTX(l, uint16_t, 0x4040);
+ case TX_16X16: MERGE_CTX(l, uint32_t, 0x40404040U);
+ case TX_32X32: MERGE_CTX(l, uint64_t, 0x4040404040404040ULL);
+ }
+#undef MERGE_CTX
+
+ return 7 + not_one_blk * 3 + ca + cl;
+ } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
+ return 0;
+ } else {
+ unsigned la, ll;
+
+#define MERGE_CTX(dir, type, tx) \
+ if (tx == TX_64X64) { \
+ uint64_t tmp = *(const uint64_t *) dir; \
+ tmp |= *(const uint64_t *) &dir[8]; \
+ l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
+ } else \
+ l##dir = *(const type *) dir; \
+ if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
+ if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
+ if (tx >= TX_8X8) l##dir |= l##dir >> 8; \
+ break
+
+ switch (t_dim->lw) {
+ default: assert(0); /* fall-through */
+ case TX_4X4: MERGE_CTX(a, uint8_t, TX_4X4);
+ case TX_8X8: MERGE_CTX(a, uint16_t, TX_8X8);
+ case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
+ case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32);
+ case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64);
+ }
+ switch (t_dim->lh) {
+ default: assert(0); /* fall-through */
+ case TX_4X4: MERGE_CTX(l, uint8_t, TX_4X4);
+ case TX_8X8: MERGE_CTX(l, uint16_t, TX_8X8);
+ case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
+ case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32);
+ case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64);
+ }
+#undef MERGE_CTX
+
+ return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)];
+ }
+}
+
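+/* Context for the dc sign bit: sums the dc sign classes (bits 6-7 of the
+ * context bytes) of the 4x4 units covered by the transform above and to the
+ * left, and maps whether negative or positive signs dominate to one of
+ * three contexts. */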
+static inline unsigned get_dc_sign_ctx(const int /*enum RectTxfmSize*/ tx,
+ const uint8_t *const a,
+ const uint8_t *const l)
+{
+ uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL;
+ int s;
+
+#if ARCH_X86_64 && defined(__GNUC__)
+ /* Coerce compilers into producing better code. For some reason
+ * every x86-64 compiler is awful at handling 64-bit constants. */
+ __asm__("" : "+r"(mask), "+r"(mul));
+#endif
+
+ switch(tx) {
+ default: assert(0); /* fall-through */
+ case TX_4X4: {
+ int t = *(const uint8_t *) a >> 6;
+ t += *(const uint8_t *) l >> 6;
+ s = t - 1 - 1;
+ break;
+ }
+ case TX_8X8: {
+ uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
+ t += *(const uint16_t *) l & (uint32_t) mask;
+ t *= 0x04040404U;
+ s = (int) (t >> 24) - 2 - 2;
+ break;
+ }
+ case TX_16X16: {
+ uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6;
+ t += (*(const uint32_t *) l & (uint32_t) mask) >> 6;
+ t *= (uint32_t) mul;
+ s = (int) (t >> 24) - 4 - 4;
+ break;
+ }
+ case TX_32X32: {
+ uint64_t t = (*(const uint64_t *) a & mask) >> 6;
+ t += (*(const uint64_t *) l & mask) >> 6;
+ t *= mul;
+ s = (int) (t >> 56) - 8 - 8;
+ break;
+ }
+ case TX_64X64: {
+ uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
+ t += (*(const uint64_t *) &a[8] & mask) >> 6;
+ t += (*(const uint64_t *) &l[0] & mask) >> 6;
+ t += (*(const uint64_t *) &l[8] & mask) >> 6;
+ t *= mul;
+ s = (int) (t >> 56) - 16 - 16;
+ break;
+ }
+ case RTX_4X8: {
+ uint32_t t = *(const uint8_t *) a & (uint32_t) mask;
+ t += *(const uint16_t *) l & (uint32_t) mask;
+ t *= 0x04040404U;
+ s = (int) (t >> 24) - 1 - 2;
+ break;
+ }
+ case RTX_8X4: {
+ uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
+ t += *(const uint8_t *) l & (uint32_t) mask;
+ t *= 0x04040404U;
+ s = (int) (t >> 24) - 2 - 1;
+ break;
+ }
+ case RTX_8X16: {
+ uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
+ t += *(const uint32_t *) l & (uint32_t) mask;
+ t = (t >> 6) * (uint32_t) mul;
+ s = (int) (t >> 24) - 2 - 4;
+ break;
+ }
+ case RTX_16X8: {
+ uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
+ t += *(const uint16_t *) l & (uint32_t) mask;
+ t = (t >> 6) * (uint32_t) mul;
+ s = (int) (t >> 24) - 4 - 2;
+ break;
+ }
+ case RTX_16X32: {
+ uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
+ t += *(const uint64_t *) l & mask;
+ t = (t >> 6) * mul;
+ s = (int) (t >> 56) - 4 - 8;
+ break;
+ }
+ case RTX_32X16: {
+ uint64_t t = *(const uint64_t *) a & mask;
+ t += *(const uint32_t *) l & (uint32_t) mask;
+ t = (t >> 6) * mul;
+ s = (int) (t >> 56) - 8 - 4;
+ break;
+ }
+ case RTX_32X64: {
+ uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
+ t += (*(const uint64_t *) &l[0] & mask) >> 6;
+ t += (*(const uint64_t *) &l[8] & mask) >> 6;
+ t *= mul;
+ s = (int) (t >> 56) - 8 - 16;
+ break;
+ }
+ case RTX_64X32: {
+ uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
+ t += (*(const uint64_t *) &a[8] & mask) >> 6;
+ t += (*(const uint64_t *) &l[0] & mask) >> 6;
+ t *= mul;
+ s = (int) (t >> 56) - 16 - 8;
+ break;
+ }
+ case RTX_4X16: {
+ uint32_t t = *(const uint8_t *) a & (uint32_t) mask;
+ t += *(const uint32_t *) l & (uint32_t) mask;
+ t = (t >> 6) * (uint32_t) mul;
+ s = (int) (t >> 24) - 1 - 4;
+ break;
+ }
+ case RTX_16X4: {
+ uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
+ t += *(const uint8_t *) l & (uint32_t) mask;
+ t = (t >> 6) * (uint32_t) mul;
+ s = (int) (t >> 24) - 4 - 1;
+ break;
+ }
+ case RTX_8X32: {
+ uint64_t t = *(const uint16_t *) a & (uint32_t) mask;
+ t += *(const uint64_t *) l & mask;
+ t = (t >> 6) * mul;
+ s = (int) (t >> 56) - 2 - 8;
+ break;
+ }
+ case RTX_32X8: {
+ uint64_t t = *(const uint64_t *) a & mask;
+ t += *(const uint16_t *) l & (uint32_t) mask;
+ t = (t >> 6) * mul;
+ s = (int) (t >> 56) - 8 - 2;
+ break;
+ }
+ case RTX_16X64: {
+ uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
+ t += *(const uint64_t *) &l[0] & mask;
+ t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6);
+ t *= mul;
+ s = (int) (t >> 56) - 4 - 16;
+ break;
+ }
+ case RTX_64X16: {
+ uint64_t t = *(const uint64_t *) &a[0] & mask;
+ t += *(const uint32_t *) l & (uint32_t) mask;
+ t = (t >> 6) + ((*(const uint64_t *) &a[8] & mask) >> 6);
+ t *= mul;
+ s = (int) (t >> 56) - 16 - 4;
+ break;
+ }
+ }
+
+ return (s != 0) + (s > 0);
+}
+
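+/* Context for the low (base) coefficient token, derived from the magnitudes
+ * of up to five previously decoded neighbours in the levels[] buffer; the
+ * three-neighbour sum is returned via *hi_mag for reuse by the high-token
+ * (br_tok) context computation. */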
+static inline unsigned get_lo_ctx(const uint8_t *const levels,
+ const enum TxClass tx_class,
+ unsigned *const hi_mag,
+ const uint8_t (*const ctx_offsets)[5],
+ const unsigned x, const unsigned y,
+ const ptrdiff_t stride)
+{
+ unsigned mag = levels[0 * stride + 1] + levels[1 * stride + 0];
+ unsigned offset;
+ if (tx_class == TX_CLASS_2D) {
+ mag += levels[1 * stride + 1];
+ *hi_mag = mag;
+ mag += levels[0 * stride + 2] + levels[2 * stride + 0];
+ offset = ctx_offsets[umin(y, 4)][umin(x, 4)];
+ } else {
+ mag += levels[0 * stride + 2];
+ *hi_mag = mag;
+ mag += levels[0 * stride + 3] + levels[0 * stride + 4];
+ offset = 26 + (y > 1 ? 10 : y * 5);
+ }
+ return offset + (mag > 512 ? 4 : (mag + 64) >> 7);
+}
+
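+/* Decodes one transform block: skip flag, transform type, eob, tokens and
+ * dequantization into cf[]. Returns the eob (or -1 if the block is all
+ * zero) and writes the new coefficient context byte to *res_ctx. */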
+static int decode_coefs(Dav1dTaskContext *const t,
+ uint8_t *const a, uint8_t *const l,
+ const enum RectTxfmSize tx, const enum BlockSize bs,
+ const Av1Block *const b, const int intra,
+ const int plane, coef *cf,
+ enum TxfmType *const txtp, uint8_t *res_ctx)
+{
+ Dav1dTileState *const ts = t->ts;
+ const int chroma = !!plane;
+ const Dav1dFrameContext *const f = t->f;
+ const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
+ const int dbg = DEBUG_BLOCK_INFO && plane && 0;
+
+ if (dbg)
+ printf("Start: r=%d\n", ts->msac.rng);
+
+ // does this block have any non-zero coefficients
+ const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
+ const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.coef.skip[t_dim->ctx][sctx]);
+ if (dbg)
+ printf("Post-non-zero[%d][%d][%d]: r=%d\n",
+ t_dim->ctx, sctx, all_skip, ts->msac.rng);
+ if (all_skip) {
+ *res_ctx = 0x40;
+ *txtp = lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */
+ return -1;
+ }
+
+ // transform type (chroma: derived, luma: explicitly coded)
+ if (lossless) {
+ assert(t_dim->max == TX_4X4);
+ *txtp = WHT_WHT;
+ } else if (t_dim->max + intra >= TX_64X64) {
+ *txtp = DCT_DCT;
+ } else if (chroma) {
+ // inferred from either the luma txtp (inter) or a LUT (intra)
+ *txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
+ get_uv_inter_txtp(t_dim, *txtp);
+ } else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) {
+ // In libaom, lossless is checked by a literal qidx == 0, but not all
+ // such blocks are actually lossless. The remainder gets an implicit
+ // transform type (for luma)
+ *txtp = DCT_DCT;
+ } else {
+ unsigned idx;
+ if (intra) {
+ const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
+ dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
+ if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
+ idx = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4);
+ *txtp = dav1d_tx_types_per_set[idx + 0];
+ } else {
+ idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+ ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6);
+ *txtp = dav1d_tx_types_per_set[idx + 5];
+ }
+ if (dbg)
+ printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n",
+ tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng);
+ } else {
+ if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) {
+ idx = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.txtp_inter3[t_dim->min]);
+ *txtp = (idx - 1) & IDTX; /* idx ? DCT_DCT : IDTX */
+ } else if (t_dim->min == TX_16X16) {
+ idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+ ts->cdf.m.txtp_inter2, 11);
+ *txtp = dav1d_tx_types_per_set[idx + 12];
+ } else {
+ idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+ ts->cdf.m.txtp_inter1[t_dim->min], 15);
+ *txtp = dav1d_tx_types_per_set[idx + 24];
+ }
+ if (dbg)
+ printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n",
+ tx, t_dim->min, idx, *txtp, ts->msac.rng);
+ }
+ }
+
+ // find end-of-block (eob)
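+    // eob_bin selects a power-of-two bucket; for buckets > 1 the exact
+    // position is refined by one adaptively coded high bit plus
+    // (eob_bin - 2) literal bits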
+ int eob_bin;
+ const int tx2dszctx = imin(t_dim->lw, TX_32X32) + imin(t_dim->lh, TX_32X32);
+ const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
+ const int is_1d = tx_class != TX_CLASS_2D;
+ switch (tx2dszctx) {
+#define case_sz(sz, bin, ns, is_1d) \
+ case sz: { \
+ uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
+ eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
+ break; \
+ }
+ case_sz(0, 16, 4, [is_1d]);
+ case_sz(1, 32, 8, [is_1d]);
+ case_sz(2, 64, 8, [is_1d]);
+ case_sz(3, 128, 8, [is_1d]);
+ case_sz(4, 256, 16, [is_1d]);
+ case_sz(5, 512, 16, );
+ case_sz(6, 1024, 16, );
+#undef case_sz
+ }
+ if (dbg)
+ printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
+ 16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng);
+ int eob;
+ if (eob_bin > 1) {
+ uint16_t *const eob_hi_bit_cdf =
+ ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
+ const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
+ if (dbg)
+ printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
+ t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
+ eob = ((eob_hi_bit | 2) << (eob_bin - 2)) |
+ dav1d_msac_decode_bools(&ts->msac, eob_bin - 2);
+ if (dbg)
+ printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
+ } else {
+ eob = eob_bin;
+ }
+ assert(eob >= 0);
+
+ // base tokens
+ uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
+ uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
+ unsigned rc, dc_tok;
+
+ if (eob) {
+ uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
+ uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok
+ const int sw = imin(t_dim->w, 8), sh = imin(t_dim->h, 8);
+
+ /* eob */
+ unsigned ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
+ int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
+ int tok = eob_tok + 1;
+ int level_tok = tok * 0x41;
+ unsigned mag;
+
+#define DECODE_COEFS_CLASS(tx_class) \
+ unsigned x, y; \
+ if (tx_class == TX_CLASS_2D) \
+ rc = scan[eob], x = rc >> shift, y = rc & mask; \
+ else if (tx_class == TX_CLASS_H) \
+ /* Transposing reduces the stride and padding requirements */ \
+ x = eob & mask, y = eob >> shift, rc = eob; \
+ else /* tx_class == TX_CLASS_V */ \
+ x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
+ if (dbg) \
+ printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
+ t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
+ if (eob_tok == 2) { \
+ ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \
+ tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
+ level_tok = tok + (3 << 6); \
+ if (dbg) \
+ printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
+ imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
+ ts->msac.rng); \
+ } \
+ cf[rc] = tok << 11; \
+ levels[x * stride + y] = (uint8_t) level_tok; \
+ for (int i = eob - 1; i > 0; i--) { /* ac */ \
+ unsigned rc_i; \
+ if (tx_class == TX_CLASS_2D) \
+ rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
+ else if (tx_class == TX_CLASS_H) \
+ x = i & mask, y = i >> shift, rc_i = i; \
+ else /* tx_class == TX_CLASS_V */ \
+ x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
+ assert(x < 32 && y < 32); \
+ uint8_t *const level = levels + x * stride + y; \
+ ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
+ if (tx_class == TX_CLASS_2D) \
+ y |= x; \
+ tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
+ if (dbg) \
+ printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
+ t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
+ if (tok == 3) { \
+ mag &= 63; \
+ ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
+ (mag > 12 ? 6 : (mag + 1) >> 1); \
+ tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
+ if (dbg) \
+ printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
+ imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \
+ ts->msac.rng); \
+ *level = (uint8_t) (tok + (3 << 6)); \
+ cf[rc_i] = (tok << 11) | rc; \
+ rc = rc_i; \
+ } else { \
+ /* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
+ tok *= 0x17ff41; \
+ *level = (uint8_t) tok; \
+ /* tok ? (tok << 11) | rc : 0 */ \
+ tok = (tok >> 9) & (rc + ~0x7ffu); \
+ if (tok) rc = rc_i; \
+ cf[rc_i] = tok; \
+ } \
+ } \
+ /* dc */ \
+ ctx = (tx_class == TX_CLASS_2D) ? 0 : \
+ get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
+ dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
+ if (dbg) \
+ printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
+ t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
+ if (dc_tok == 3) { \
+ if (tx_class == TX_CLASS_2D) \
+ mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
+ levels[1 * stride + 1]; \
+ mag &= 63; \
+ ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
+ dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
+ if (dbg) \
+ printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
+ imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
+ } \
+ break
+
+ const uint16_t *scan;
+ switch (tx_class) {
+ case TX_CLASS_2D: {
+ const unsigned nonsquare_tx = tx >= RTX_4X8;
+ const uint8_t (*const lo_ctx_offsets)[5] =
+ dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
+ scan = dav1d_scans[tx];
+ const ptrdiff_t stride = 4 * sh;
+ const unsigned shift = t_dim->lh < 4 ? t_dim->lh + 2 : 5, shift2 = 0;
+ const unsigned mask = 4 * sh - 1;
+ memset(levels, 0, stride * (4 * sw + 2));
+ DECODE_COEFS_CLASS(TX_CLASS_2D);
+ }
+ case TX_CLASS_H: {
+ const uint8_t (*const lo_ctx_offsets)[5] = NULL;
+ const ptrdiff_t stride = 16;
+ const unsigned shift = t_dim->lh + 2, shift2 = 0;
+ const unsigned mask = 4 * sh - 1;
+ memset(levels, 0, stride * (4 * sh + 2));
+ DECODE_COEFS_CLASS(TX_CLASS_H);
+ }
+ case TX_CLASS_V: {
+ const uint8_t (*const lo_ctx_offsets)[5] = NULL;
+ const ptrdiff_t stride = 16;
+ const unsigned shift = t_dim->lw + 2, shift2 = t_dim->lh + 2;
+ const unsigned mask = 4 * sw - 1;
+ memset(levels, 0, stride * (4 * sw + 2));
+ DECODE_COEFS_CLASS(TX_CLASS_V);
+ }
+#undef DECODE_COEFS_CLASS
+ default: assert(0);
+ }
+ } else { // dc-only
+ int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2);
+ dc_tok = 1 + tok_br;
+ if (dbg)
+ printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
+ t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng);
+ if (tok_br == 2) {
+ dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]);
+ if (dbg)
+ printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
+ imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
+ }
+ rc = 0;
+ }
+
+ // residual and sign
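+    // (cf[] currently holds packed tokens: bits 11 and up contain the coded
+    // token, saturated at 15, and the low bits, masked with 0x3ff, chain to
+    // the scan position of the next non-zero coefficient; the loops below
+    // unpack this while dequantizing)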
+ const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
+ const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
+ const int dq_shift = imax(0, t_dim->ctx - 2);
+ const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
+ unsigned cul_level, dc_sign_level;
+
+ if (!dc_tok) {
+ cul_level = 0;
+ dc_sign_level = 1 << 6;
+ if (qm_tbl) goto ac_qm;
+ goto ac_noqm;
+ }
+
+ const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
+ uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
+ const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
+ if (dbg)
+ printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
+ chroma, dc_sign_ctx, dc_sign, ts->msac.rng);
+
+ int dc_dq = dq_tbl[0];
+ dc_sign_level = (dc_sign - 1) & (2 << 6);
+
+ if (qm_tbl) {
+ dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5;
+
+ if (dc_tok == 15) {
+ dc_tok = read_golomb(&ts->msac) + 15;
+ if (dbg)
+ printf("Post-dc_residual[%d->%d]: r=%d\n",
+ dc_tok - 15, dc_tok, ts->msac.rng);
+
+ dc_tok &= 0xfffff;
+ dc_dq = (dc_dq * dc_tok) & 0xffffff;
+ } else {
+ dc_dq *= dc_tok;
+ assert(dc_dq <= 0xffffff);
+ }
+ cul_level = dc_tok;
+ dc_dq >>= dq_shift;
+ dc_dq = umin(dc_dq, cf_max + dc_sign);
+ cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
+
+ if (rc) ac_qm: {
+ const unsigned ac_dq = dq_tbl[1];
+ do {
+ const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
+ if (dbg)
+ printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
+ const unsigned rc_tok = cf[rc];
+ unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
+ int dq_sat;
+
+ if (rc_tok >= (15 << 11)) {
+ tok = read_golomb(&ts->msac) + 15;
+ if (dbg)
+ printf("Post-residual[%d=%d->%d]: r=%d\n",
+ rc, tok - 15, tok, ts->msac.rng);
+
+ tok &= 0xfffff;
+ dq = (dq * tok) & 0xffffff;
+ } else {
+ tok = rc_tok >> 11;
+ dq *= tok;
+ assert(dq <= 0xffffff);
+ }
+ cul_level += tok;
+ dq >>= dq_shift;
+ dq_sat = umin(dq, cf_max + sign);
+ cf[rc] = (coef) (sign ? -dq_sat : dq_sat);
+
+ rc = rc_tok & 0x3ff;
+ } while (rc);
+ }
+ } else {
+ // non-qmatrix is the common case and allows for additional optimizations
+ if (dc_tok == 15) {
+ dc_tok = read_golomb(&ts->msac) + 15;
+ if (dbg)
+ printf("Post-dc_residual[%d->%d]: r=%d\n",
+ dc_tok - 15, dc_tok, ts->msac.rng);
+
+ dc_tok &= 0xfffff;
+ dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
+ dc_dq = umin(dc_dq, cf_max + dc_sign);
+ } else {
+ dc_dq = ((dc_dq * dc_tok) >> dq_shift);
+ assert(dc_dq <= cf_max);
+ }
+ cul_level = dc_tok;
+ cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
+
+ if (rc) ac_noqm: {
+ const unsigned ac_dq = dq_tbl[1];
+ do {
+ const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
+ if (dbg)
+ printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
+ const unsigned rc_tok = cf[rc];
+ unsigned tok;
+ int dq;
+
+ // residual
+ if (rc_tok >= (15 << 11)) {
+ tok = read_golomb(&ts->msac) + 15;
+ if (dbg)
+ printf("Post-residual[%d=%d->%d]: r=%d\n",
+ rc, tok - 15, tok, ts->msac.rng);
+
+ // coefficient parsing, see 5.11.39
+ tok &= 0xfffff;
+
+ // dequant, see 7.12.3
+ dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
+ dq = umin(dq, cf_max + sign);
+ } else {
+ // cannot exceed cf_max, so we can avoid the clipping
+ tok = rc_tok >> 11;
+ dq = ((ac_dq * tok) >> dq_shift);
+ assert(dq <= cf_max);
+ }
+ cul_level += tok;
+ cf[rc] = (coef) (sign ? -dq : dq);
+
+ rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
+ } while (rc);
+ }
+ }
+
+ // context
+ *res_ctx = umin(cul_level, 63) | dc_sign_level;
+
+ return eob;
+}
+
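+/* Recursively walks the transform split tree of an inter block: tx_split
+ * bits decide whether a node is split further, and each leaf has its
+ * coefficients decoded and (except in the parse-only frame-threading pass)
+ * its inverse transform applied to dst. */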
+static void read_coef_tree(Dav1dTaskContext *const t,
+ const enum BlockSize bs, const Av1Block *const b,
+ const enum RectTxfmSize ytx, const int depth,
+ const uint16_t *const tx_split,
+ const int x_off, const int y_off, pixel *dst)
+{
+ const Dav1dFrameContext *const f = t->f;
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dDSPContext *const dsp = f->dsp;
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
+ const int txw = t_dim->w, txh = t_dim->h;
+
+    /* y_off can be larger than 3 since lossless blocks use TX_4X4 but can't
+     * be split. Avoids an undefined left shift. */
+ if (depth < 2 && tx_split[depth] &&
+ tx_split[depth] & (1 << (y_off * 4 + x_off)))
+ {
+ const enum RectTxfmSize sub = t_dim->sub;
+ const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
+ const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
+
+ read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
+ x_off * 2 + 0, y_off * 2 + 0, dst);
+ t->bx += txsw;
+ if (txw >= txh && t->bx < f->bw)
+ read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
+ y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
+ t->bx -= txsw;
+ t->by += txsh;
+ if (txh >= txw && t->by < f->bh) {
+ if (dst)
+ dst += 4 * txsh * PXSTRIDE(f->cur.stride[0]);
+ read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
+ x_off * 2 + 0, y_off * 2 + 1, dst);
+ t->bx += txsw;
+ if (txw >= txh && t->bx < f->bw)
+ read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
+ y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
+ t->bx -= txsw;
+ }
+ t->by -= txsh;
+ } else {
+ const int bx4 = t->bx & 31, by4 = t->by & 31;
+ enum TxfmType txtp;
+ uint8_t cf_ctx;
+ int eob;
+ coef *cf;
+
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ assert(ts->frame_thread[p].cf);
+ cf = ts->frame_thread[p].cf;
+ ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+ } else {
+ cf = bitfn(t->cf);
+ }
+ if (t->frame_thread.pass != 2) {
+ eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
+ ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+ ytx, txtp, eob, ts->msac.rng);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir lcoef[off], cf_ctx, sz)
+ case_set_upto16_with_default(imin(txh, f->bh - t->by), l., 1, by4);
+ case_set_upto16_with_default(imin(txw, f->bw - t->bx), a->, 0, bx4);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ for (int y = 0; y < txh; y++) { \
+ rep_macro(type, txtp_map, 0, mul * txtp); \
+ txtp_map += 32; \
+ }
+ uint8_t *txtp_map = &t->scratch.txtp_map[by4 * 32 + bx4];
+ case_set_upto16(txw,,,);
+#undef set_ctx
+ if (t->frame_thread.pass == 1)
+ *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
+ } else {
+ const int cbi = *ts->frame_thread[0].cbi++;
+ eob = cbi >> 5;
+ txtp = cbi & 0x1f;
+ }
+ if (!(t->frame_thread.pass & 1)) {
+ assert(dst);
+ if (eob >= 0) {
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
+ dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.stride[0], cf, eob
+ HIGHBD_CALL_SUFFIX);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ hex_dump(dst, f->cur.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
+ }
+ }
+ }
+}
+
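+/* Frame-threading pass 1 entry point: parses all luma and chroma coefficient
+ * blocks of one block, storing eob/txtp in the per-tile cbi array and the
+ * coefficients in the frame-thread cf buffer, without reconstructing. */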
+void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
+ const enum BlockSize bs, const Av1Block *const b)
+{
+ const Dav1dFrameContext *const f = t->f;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int bx4 = t->bx & 31, by4 = t->by & 31;
+ const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = b_dim[0], bh4 = b_dim[1];
+ const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
+ const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+ (bw4 > ss_hor || t->bx & 1) &&
+ (bh4 > ss_ver || t->by & 1);
+
+ if (b->skip) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * 0x40)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+ if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
+ rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+ }
+ return;
+ }
+
+ Dav1dTileState *const ts = t->ts;
+ const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+ const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+ assert(t->frame_thread.pass == 1);
+ assert(!b->skip);
+ const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
+ const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
+
+ for (int init_y = 0; init_y < h4; init_y += 16) {
+ const int sub_h4 = imin(h4, 16 + init_y);
+ for (int init_x = 0; init_x < w4; init_x += 16) {
+ const int sub_w4 = imin(w4, init_x + 16);
+ int y_off = !!init_y, y, x;
+ for (y = init_y, t->by += init_y; y < sub_h4;
+ y += t_dim->h, t->by += t_dim->h, y_off++)
+ {
+ int x_off = !!init_x;
+ for (x = init_x, t->bx += init_x; x < sub_w4;
+ x += t_dim->w, t->bx += t_dim->w, x_off++)
+ {
+ if (!b->intra) {
+ read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
+ x_off, y_off, NULL);
+ } else {
+ uint8_t cf_ctx = 0x40;
+ enum TxfmType txtp;
+ const int eob =
+ decode_coefs(t, &t->a->lcoef[bx4 + x],
+ &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
+ 0, ts->frame_thread[1].cf, &txtp, &cf_ctx);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+ b->tx, txtp, eob, ts->msac.rng);
+ *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
+ ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir lcoef[off], cf_ctx, sz)
+ case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by),
+ l., 1, by4 + y);
+ case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx),
+ a->, 0, bx4 + x);
+#undef default_memset
+#undef set_ctx
+ }
+ }
+ t->bx -= x;
+ }
+ t->by -= y;
+
+ if (!has_chroma) continue;
+
+ const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
+ const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
+ for (int pl = 0; pl < 2; pl++) {
+ for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
+ y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
+ {
+ for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
+ x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
+ {
+ uint8_t cf_ctx = 0x40;
+ enum TxfmType txtp;
+ if (!b->intra)
+ txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
+ bx4 + (x << ss_hor)];
+ const int eob =
+ decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+ &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
+ b, b->intra, 1 + pl, ts->frame_thread[1].cf,
+ &txtp, &cf_ctx);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+ "txtp=%d,eob=%d]: r=%d\n",
+ pl, b->uvtx, txtp, eob, ts->msac.rng);
+ *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
+ ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+ case_set_upto16_with_default( \
+ imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
+ l., 1, cby4 + y);
+ case_set_upto16_with_default( \
+ imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+ a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
+ }
+ t->bx -= x << ss_hor;
+ }
+ t->by -= y << ss_ver;
+ }
+ }
+ }
+}
+
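+/* Motion compensation for a single block and plane, writing either pixels
+ * (dst8) or a 16-bit compound intermediate (dst16). For references at the
+ * same resolution the regular mc/mct DSP paths are used, with emu_edge()
+ * padding whenever the filter footprint extends outside the reference; for
+ * scaled references, positions and steps in 1/1024-pel units come from the
+ * precomputed f->svc[] scaling factors. */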
+static int mc(Dav1dTaskContext *const t,
+ pixel *const dst8, int16_t *const dst16, const ptrdiff_t dst_stride,
+ const int bw4, const int bh4,
+ const int bx, const int by, const int pl,
+ const mv mv, const Dav1dThreadPicture *const refp, const int refidx,
+ const enum Filter2d filter_2d)
+{
+ assert((dst8 != NULL) ^ (dst16 != NULL));
+ const Dav1dFrameContext *const f = t->f;
+ const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+ const int mvx = mv.x, mvy = mv.y;
+ const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
+ ptrdiff_t ref_stride = refp->p.stride[!!pl];
+ const pixel *ref;
+
+ if (refp->p.p.w == f->cur.p.w && refp->p.p.h == f->cur.p.h) {
+ const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
+ const int dy = by * v_mul + (mvy >> (3 + ss_ver));
+ int w, h;
+
+ if (refp->p.data[0] != f->cur.data[0]) { // i.e. not for intrabc
+ w = (f->cur.p.w + ss_hor) >> ss_hor;
+ h = (f->cur.p.h + ss_ver) >> ss_ver;
+ } else {
+ w = f->bw * 4 >> ss_hor;
+ h = f->bh * 4 >> ss_ver;
+ }
+ if (dx < !!mx * 3 || dy < !!my * 3 ||
+ dx + bw4 * h_mul + !!mx * 4 > w ||
+ dy + bh4 * v_mul + !!my * 4 > h)
+ {
+ pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
+ f->dsp->mc.emu_edge(bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7,
+ w, h, dx - !!mx * 3, dy - !!my * 3,
+ emu_edge_buf, 192 * sizeof(pixel),
+ refp->p.data[pl], ref_stride);
+ ref = &emu_edge_buf[192 * !!my * 3 + !!mx * 3];
+ ref_stride = 192 * sizeof(pixel);
+ } else {
+ ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
+ }
+
+ if (dst8 != NULL) {
+ f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,
+ bh4 * v_mul, mx << !ss_hor, my << !ss_ver
+ HIGHBD_CALL_SUFFIX);
+ } else {
+ f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,
+ bh4 * v_mul, mx << !ss_hor, my << !ss_ver
+ HIGHBD_CALL_SUFFIX);
+ }
+ } else {
+ assert(refp != &f->sr_cur);
+
+ const int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver);
+ const int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor);
+#define scale_mv(res, val, scale) do { \
+ const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \
+ res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32; \
+ } while (0)
+ int pos_y, pos_x;
+ scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale);
+ scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale);
+#undef scale_mv
+ const int left = pos_x >> 10;
+ const int top = pos_y >> 10;
+ const int right =
+ ((pos_x + (bw4 * h_mul - 1) * f->svc[refidx][0].step) >> 10) + 1;
+ const int bottom =
+ ((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1;
+
+ if (DEBUG_BLOCK_INFO)
+ printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n",
+ left, top, orig_pos_x, f->svc[refidx][0].scale, refidx,
+ right-left, bottom-top,
+ f->svc[refidx][0].step, f->svc[refidx][1].step);
+
+ const int w = (refp->p.p.w + ss_hor) >> ss_hor;
+ const int h = (refp->p.p.h + ss_ver) >> ss_ver;
+ if (left < 3 || top < 3 || right + 4 > w || bottom + 4 > h) {
+ pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
+ f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7,
+ w, h, left - 3, top - 3,
+ emu_edge_buf, 320 * sizeof(pixel),
+ refp->p.data[pl], ref_stride);
+ ref = &emu_edge_buf[320 * 3 + 3];
+ ref_stride = 320 * sizeof(pixel);
+ if (DEBUG_BLOCK_INFO) printf("Emu\n");
+ } else {
+ ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left;
+ }
+
+ if (dst8 != NULL) {
+ f->dsp->mc.mc_scaled[filter_2d](dst8, dst_stride, ref, ref_stride,
+ bw4 * h_mul, bh4 * v_mul,
+ pos_x & 0x3ff, pos_y & 0x3ff,
+ f->svc[refidx][0].step,
+ f->svc[refidx][1].step
+ HIGHBD_CALL_SUFFIX);
+ } else {
+ f->dsp->mc.mct_scaled[filter_2d](dst16, ref, ref_stride,
+ bw4 * h_mul, bh4 * v_mul,
+ pos_x & 0x3ff, pos_y & 0x3ff,
+ f->svc[refidx][0].step,
+ f->svc[refidx][1].step
+ HIGHBD_CALL_SUFFIX);
+ }
+ }
+
+ return 0;
+}
+
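+/* Overlapped block motion compensation: re-predicts from up to four
+ * neighbouring inter blocks above and to the left into the temporary lap
+ * buffer and blends the result into dst with blend_h/blend_v. */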
+static int obmc(Dav1dTaskContext *const t,
+ pixel *const dst, const ptrdiff_t dst_stride,
+ const uint8_t *const b_dim, const int pl,
+ const int bx4, const int by4, const int w4, const int h4)
+{
+ assert(!(t->bx & 1) && !(t->by & 1));
+ const Dav1dFrameContext *const f = t->f;
+ /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5];
+ pixel *const lap = bitfn(t->scratch.lap);
+ const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+ int res;
+
+ if (t->by > t->ts->tiling.row_start &&
+ (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
+ {
+ for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
+ // only odd blocks are considered for overlap handling, hence +1
+ const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
+ const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
+ const int step4 = iclip(a_b_dim[0], 2, 16);
+
+ if (a_r->ref.ref[0] > 0) {
+ const int ow4 = imin(step4, b_dim[0]);
+ const int oh4 = imin(b_dim[1], 16) >> 1;
+ res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
+ t->bx + x, t->by, pl, a_r->mv.mv[0],
+ &f->refp[a_r->ref.ref[0] - 1], a_r->ref.ref[0] - 1,
+ dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
+ if (res) return res;
+ f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,
+ h_mul * ow4, v_mul * oh4);
+ i++;
+ }
+ x += step4;
+ }
+ }
+
+ if (t->bx > t->ts->tiling.col_start)
+ for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
+ // only odd blocks are considered for overlap handling, hence +1
+ const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
+ const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
+ const int step4 = iclip(l_b_dim[1], 2, 16);
+
+ if (l_r->ref.ref[0] > 0) {
+ const int ow4 = imin(b_dim[0], 16) >> 1;
+ const int oh4 = imin(step4, b_dim[1]);
+ res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
+ t->bx, t->by + y, pl, l_r->mv.mv[0],
+ &f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1,
+ dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
+ if (res) return res;
+ f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
+ dst_stride, lap, h_mul * ow4, v_mul * oh4);
+ i++;
+ }
+ y += step4;
+ }
+ return 0;
+}
+
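+/* Warped motion compensation: for each 8x8 output sub-block the affine
+ * matrix gives the integer source position (dx/dy) and fractional offsets
+ * (mx/my) used by the warp8x8(t) DSP functions, with emu_edge() padding
+ * when the 15x15 source footprint leaves the reference frame. */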
+static int warp_affine(Dav1dTaskContext *const t,
+ pixel *dst8, int16_t *dst16, const ptrdiff_t dstride,
+ const uint8_t *const b_dim, const int pl,
+ const Dav1dThreadPicture *const refp,
+ const Dav1dWarpedMotionParams *const wmp)
+{
+ assert((dst8 != NULL) ^ (dst16 != NULL));
+ const Dav1dFrameContext *const f = t->f;
+ const Dav1dDSPContext *const dsp = f->dsp;
+ const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+ assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
+ const int32_t *const mat = wmp->matrix;
+ const int width = (refp->p.p.w + ss_hor) >> ss_hor;
+ const int height = (refp->p.p.h + ss_ver) >> ss_ver;
+
+ for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
+ const int src_y = t->by * 4 + ((y + 4) << ss_ver);
+ const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0];
+ const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
+ for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
+ // calculate transformation relative to center of 8x8 block in
+ // luma pixel units
+ const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
+ const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor;
+ const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
+
+ const int dx = (int) (mvx >> 16) - 4;
+ const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
+ wmp->u.p.beta * 7) & ~0x3f;
+ const int dy = (int) (mvy >> 16) - 4;
+ const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
+ wmp->u.p.delta * 4) & ~0x3f;
+
+ const pixel *ref_ptr;
+ ptrdiff_t ref_stride = refp->p.stride[!!pl];
+
+ if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
+ pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
+ f->dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3,
+ emu_edge_buf, 32 * sizeof(pixel),
+ refp->p.data[pl], ref_stride);
+ ref_ptr = &emu_edge_buf[32 * 3 + 3];
+ ref_stride = 32 * sizeof(pixel);
+ } else {
+ ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
+ }
+ if (dst16 != NULL)
+ dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
+ wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
+ else
+ dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
+ wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
+ }
+ if (dst8) dst8 += 8 * PXSTRIDE(dstride);
+ else dst16 += 8 * dstride;
+ }
+ return 0;
+}
+
+void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize bs,
+ const enum EdgeFlags intra_edge_flags,
+ const Av1Block *const b)
+{
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dFrameContext *const f = t->f;
+ const Dav1dDSPContext *const dsp = f->dsp;
+ const int bx4 = t->bx & 31, by4 = t->by & 31;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = b_dim[0], bh4 = b_dim[1];
+ const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+ const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+ const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+ (bw4 > ss_hor || t->bx & 1) &&
+ (bh4 > ss_ver || t->by & 1);
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
+ const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
+
+ // coefficient coding
+ pixel *const edge = bitfn(t->scratch.edge) + 128;
+ const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
+
+ const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10;
+
+ for (int init_y = 0; init_y < h4; init_y += 16) {
+ const int sub_h4 = imin(h4, 16 + init_y);
+ const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
+ for (int init_x = 0; init_x < w4; init_x += 16) {
+ if (b->pal_sz[0]) {
+ pixel *dst = ((pixel *) f->cur.data[0]) +
+ 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
+ const uint8_t *pal_idx;
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ assert(ts->frame_thread[p].pal_idx);
+ pal_idx = ts->frame_thread[p].pal_idx;
+ ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
+ } else {
+ pal_idx = t->scratch.pal_idx_y;
+ }
+ const pixel *const pal = t->frame_thread.pass ?
+ f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+ ((t->bx >> 1) + (t->by & 1))][0] :
+ bytefn(t->scratch.pal)[0];
+ f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
+ pal_idx, bw4 * 4, bh4 * 4);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ hex_dump(dst, PXSTRIDE(f->cur.stride[0]),
+ bw4 * 4, bh4 * 4, "y-pal-pred");
+ }
+
+ const int intra_flags = (sm_flag(t->a, bx4) |
+ sm_flag(&t->l, by4) |
+ intra_edge_filter_flag);
+ const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
+ intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
+ const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
+ intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
+ int y, x;
+ const int sub_w4 = imin(w4, init_x + 16);
+ for (y = init_y, t->by += init_y; y < sub_h4;
+ y += t_dim->h, t->by += t_dim->h)
+ {
+ pixel *dst = ((pixel *) f->cur.data[0]) +
+ 4 * (t->by * PXSTRIDE(f->cur.stride[0]) +
+ t->bx + init_x);
+ for (x = init_x, t->bx += init_x; x < sub_w4;
+ x += t_dim->w, t->bx += t_dim->w)
+ {
+ if (b->pal_sz[0]) goto skip_y_pred;
+
+ int angle = b->y_angle;
+ const enum EdgeFlags edge_flags =
+ (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
+ 0 : EDGE_I444_TOP_HAS_RIGHT) |
+ ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
+ 0 : EDGE_I444_LEFT_HAS_BOTTOM);
+ const pixel *top_sb_edge = NULL;
+ if (!(t->by & (f->sb_step - 1))) {
+ top_sb_edge = f->ipred_edge[0];
+ const int sby = t->by >> f->sb_shift;
+ top_sb_edge += f->sb128w * 128 * (sby - 1);
+ }
+ const enum IntraPredMode m =
+ bytefn(dav1d_prepare_intra_edges)(t->bx,
+ t->bx > ts->tiling.col_start,
+ t->by,
+ t->by > ts->tiling.row_start,
+ ts->tiling.col_end,
+ ts->tiling.row_end,
+ edge_flags, dst,
+ f->cur.stride[0], top_sb_edge,
+ b->y_mode, &angle,
+ t_dim->w, t_dim->h,
+ f->seq_hdr->intra_edge_filter,
+ edge HIGHBD_CALL_SUFFIX);
+ dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
+ t_dim->w * 4, t_dim->h * 4,
+ angle | intra_flags,
+ 4 * f->bw - 4 * t->bx,
+ 4 * f->bh - 4 * t->by
+ HIGHBD_CALL_SUFFIX);
+
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+ hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
+ t_dim->h * 4, 2, "l");
+ hex_dump(edge, 0, 1, 1, "tl");
+ hex_dump(edge + 1, t_dim->w * 4,
+ t_dim->w * 4, 2, "t");
+ hex_dump(dst, f->cur.stride[0],
+ t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
+ }
+
+ skip_y_pred: {}
+ if (!b->skip) {
+ coef *cf;
+ int eob;
+ enum TxfmType txtp;
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ const int cbi = *ts->frame_thread[p].cbi++;
+ cf = ts->frame_thread[p].cf;
+ ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+ eob = cbi >> 5;
+ txtp = cbi & 0x1f;
+ } else {
+ uint8_t cf_ctx;
+ cf = bitfn(t->cf);
+ eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
+ &t->l.lcoef[by4 + y], b->tx, bs,
+ b, 1, 0, cf, &txtp, &cf_ctx);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+ b->tx, txtp, eob, ts->msac.rng);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir lcoef[off], cf_ctx, sz)
+ case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by), \
+ l., 1, by4 + y);
+ case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx), \
+ a->, 0, bx4 + x);
+#undef default_memset
+#undef set_ctx
+ }
+ if (eob >= 0) {
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ coef_dump(cf, imin(t_dim->h, 8) * 4,
+ imin(t_dim->w, 8) * 4, 3, "dq");
+ dsp->itx.itxfm_add[b->tx]
+ [txtp](dst,
+ f->cur.stride[0],
+ cf, eob HIGHBD_CALL_SUFFIX);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ hex_dump(dst, f->cur.stride[0],
+ t_dim->w * 4, t_dim->h * 4, "recon");
+ }
+ } else if (!t->frame_thread.pass) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * 0x40)
+ case_set_upto16(t_dim->h, l., 1, by4 + y);
+ case_set_upto16(t_dim->w, a->, 0, bx4 + x);
+#undef set_ctx
+ }
+ dst += 4 * t_dim->w;
+ }
+ t->bx -= x;
+ }
+ t->by -= y;
+
+ if (!has_chroma) continue;
+
+ const ptrdiff_t stride = f->cur.stride[1];
+
+ if (b->uv_mode == CFL_PRED) {
+ assert(!init_x && !init_y);
+
+ int16_t *const ac = t->scratch.ac;
+ pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) +
+ 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]);
+ const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
+ (t->by >> ss_ver) * PXSTRIDE(stride));
+ pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off,
+ ((pixel *) f->cur.data[2]) + uv_off };
+
+ const int furthest_r =
+ ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
+ const int furthest_b =
+ ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
+ dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0],
+ cbw4 - (furthest_r >> ss_hor),
+ cbh4 - (furthest_b >> ss_ver),
+ cbw4 * 4, cbh4 * 4);
+ for (int pl = 0; pl < 2; pl++) {
+ if (!b->cfl_alpha[pl]) continue;
+ int angle = 0;
+ const pixel *top_sb_edge = NULL;
+ if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
+ top_sb_edge = f->ipred_edge[pl + 1];
+ const int sby = t->by >> f->sb_shift;
+ top_sb_edge += f->sb128w * 128 * (sby - 1);
+ }
+ const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
+ const int xstart = ts->tiling.col_start >> ss_hor;
+ const int ystart = ts->tiling.row_start >> ss_ver;
+ const enum IntraPredMode m =
+ bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
+ ypos, ypos > ystart,
+ ts->tiling.col_end >> ss_hor,
+ ts->tiling.row_end >> ss_ver,
+ 0, uv_dst[pl], stride,
+ top_sb_edge, DC_PRED, &angle,
+ uv_t_dim->w, uv_t_dim->h, 0,
+ edge HIGHBD_CALL_SUFFIX);
+ dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
+ uv_t_dim->w * 4,
+ uv_t_dim->h * 4,
+ ac, b->cfl_alpha[pl]
+ HIGHBD_CALL_SUFFIX);
+ }
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+ ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
+ hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
+ hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
+ }
+ } else if (b->pal_sz[1]) {
+ const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
+ (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
+ const pixel (*pal)[8];
+ const uint8_t *pal_idx;
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ assert(ts->frame_thread[p].pal_idx);
+ pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+ ((t->bx >> 1) + (t->by & 1))];
+ pal_idx = ts->frame_thread[p].pal_idx;
+ ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
+ } else {
+ pal = bytefn(t->scratch.pal);
+ pal_idx = t->scratch.pal_idx_uv;
+ }
+
+ f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
+ f->cur.stride[1], pal[1],
+ pal_idx, cbw4 * 4, cbh4 * 4);
+ f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff,
+ f->cur.stride[1], pal[2],
+ pal_idx, cbw4 * 4, cbh4 * 4);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+ hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff,
+ PXSTRIDE(f->cur.stride[1]),
+ cbw4 * 4, cbh4 * 4, "u-pal-pred");
+ hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff,
+ PXSTRIDE(f->cur.stride[1]),
+ cbw4 * 4, cbh4 * 4, "v-pal-pred");
+ }
+ }
+
+ const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
+ sm_uv_flag(&t->l, cby4);
+ const int uv_sb_has_tr =
+ ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
+ intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1));
+ const int uv_sb_has_bl =
+ init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
+ intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1));
+ const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
+ for (int pl = 0; pl < 2; pl++) {
+ for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
+ y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
+ {
+ pixel *dst = ((pixel *) f->cur.data[1 + pl]) +
+ 4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
+ ((t->bx + init_x) >> ss_hor));
+ for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
+ x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
+ {
+ if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
+ b->pal_sz[1])
+ {
+ goto skip_uv_pred;
+ }
+
+ int angle = b->uv_angle;
+ // this probably looks weird because we're using
+ // luma flags in a chroma loop, but that's because
+ // prepare_intra_edges() expects luma flags as input
+ const enum EdgeFlags edge_flags =
+ (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
+ (x + uv_t_dim->w >= sub_cw4)) ?
+ 0 : EDGE_I444_TOP_HAS_RIGHT) |
+ ((x > (init_x >> ss_hor) ||
+ (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
+ 0 : EDGE_I444_LEFT_HAS_BOTTOM);
+ const pixel *top_sb_edge = NULL;
+ if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
+ top_sb_edge = f->ipred_edge[1 + pl];
+ const int sby = t->by >> f->sb_shift;
+ top_sb_edge += f->sb128w * 128 * (sby - 1);
+ }
+ const enum IntraPredMode uv_mode =
+ b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
+ const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
+ const int xstart = ts->tiling.col_start >> ss_hor;
+ const int ystart = ts->tiling.row_start >> ss_ver;
+ const enum IntraPredMode m =
+ bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
+ ypos, ypos > ystart,
+ ts->tiling.col_end >> ss_hor,
+ ts->tiling.row_end >> ss_ver,
+ edge_flags, dst, stride,
+ top_sb_edge, uv_mode,
+ &angle, uv_t_dim->w,
+ uv_t_dim->h,
+ f->seq_hdr->intra_edge_filter,
+ edge HIGHBD_CALL_SUFFIX);
+ angle |= intra_edge_filter_flag;
+ dsp->ipred.intra_pred[m](dst, stride, edge,
+ uv_t_dim->w * 4,
+ uv_t_dim->h * 4,
+ angle | sm_uv_fl,
+ (4 * f->bw + ss_hor -
+ 4 * (t->bx & ~ss_hor)) >> ss_hor,
+ (4 * f->bh + ss_ver -
+ 4 * (t->by & ~ss_ver)) >> ss_ver
+ HIGHBD_CALL_SUFFIX);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+ hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
+ uv_t_dim->h * 4, 2, "l");
+ hex_dump(edge, 0, 1, 1, "tl");
+ hex_dump(edge + 1, uv_t_dim->w * 4,
+ uv_t_dim->w * 4, 2, "t");
+ hex_dump(dst, stride, uv_t_dim->w * 4,
+ uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
+ }
+
+ skip_uv_pred: {}
+ if (!b->skip) {
+ enum TxfmType txtp;
+ int eob;
+ coef *cf;
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ const int cbi = *ts->frame_thread[p].cbi++;
+ cf = ts->frame_thread[p].cf;
+ ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
+ eob = cbi >> 5;
+ txtp = cbi & 0x1f;
+ } else {
+ uint8_t cf_ctx;
+ cf = bitfn(t->cf);
+ eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+ &t->l.ccoef[pl][cby4 + y],
+ b->uvtx, bs, b, 1, 1 + pl, cf,
+ &txtp, &cf_ctx);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+ "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
+ pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+ case_set_upto16_with_default( \
+ imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
+ l., 1, cby4 + y);
+ case_set_upto16_with_default( \
+ imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+ a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
+ }
+ if (eob >= 0) {
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ coef_dump(cf, uv_t_dim->h * 4,
+ uv_t_dim->w * 4, 3, "dq");
+ dsp->itx.itxfm_add[b->uvtx]
+ [txtp](dst, stride,
+ cf, eob HIGHBD_CALL_SUFFIX);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ hex_dump(dst, stride, uv_t_dim->w * 4,
+ uv_t_dim->h * 4, "recon");
+ }
+ } else if (!t->frame_thread.pass) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[pl], off, mul * 0x40)
+ case_set_upto16(uv_t_dim->h, l., 1, cby4 + y);
+ case_set_upto16(uv_t_dim->w, a->, 0, cbx4 + x);
+#undef set_ctx
+ }
+ dst += uv_t_dim->w * 4;
+ }
+ t->bx -= x << ss_hor;
+ }
+ t->by -= y << ss_ver;
+ }
+ }
+ }
+}
+
+int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize bs,
+ const Av1Block *const b)
+{
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dFrameContext *const f = t->f;
+ const Dav1dDSPContext *const dsp = f->dsp;
+ const int bx4 = t->bx & 31, by4 = t->by & 31;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = b_dim[0], bh4 = b_dim[1];
+ const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+ const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+ (bw4 > ss_hor || t->bx & 1) &&
+ (bh4 > ss_ver || t->by & 1);
+ const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
+ DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout;
+ int res;
+
+ // prediction
+ const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
+ pixel *dst = ((pixel *) f->cur.data[0]) +
+ 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
+ const ptrdiff_t uvdstoff =
+ 4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
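+ // intra block copy: in key/intra-only frames, blocks reaching this
+ // function copy from already reconstructed areas of the current frame,
+ // always using the bilinear filter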
+ if (IS_KEY_OR_INTRA(f->frame_hdr)) {
+ // intrabc
+ assert(!f->frame_hdr->super_res.enabled);
+ res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
+ b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
+ if (res) return res;
+ if (has_chroma) for (int pl = 1; pl < 3; pl++) {
+ res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1],
+ bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
+ t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0],
+ &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
+ if (res) return res;
+ }
+ } else if (b->comp_type == COMP_INTER_NONE) {
+ const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
+ const enum Filter2d filter_2d = b->filter2d;
+
+ if (imin(bw4, bh4) > 1 &&
+ ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
+ (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
+ {
+ res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp,
+ b->motion_mode == MM_WARP ? &t->warpmv :
+ &f->frame_hdr->gmv[b->ref[0]]);
+ if (res) return res;
+ } else {
+ res = mc(t, dst, NULL, f->cur.stride[0],
+ bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d);
+ if (res) return res;
+ if (b->motion_mode == MM_OBMC) {
+ res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4);
+ if (res) return res;
+ }
+ }
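+ // inter-intra: generate an intra prediction of the same block and blend
+ // it with the motion-compensated prediction using the inter-intra mask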
+ if (b->interintra_type) {
+ pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
+ enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
+ SMOOTH_PRED : b->interintra_mode;
+ pixel *const tmp = bitfn(t->scratch.interintra);
+ int angle = 0;
+ const pixel *top_sb_edge = NULL;
+ if (!(t->by & (f->sb_step - 1))) {
+ top_sb_edge = f->ipred_edge[0];
+ const int sby = t->by >> f->sb_shift;
+ top_sb_edge += f->sb128w * 128 * (sby - 1);
+ }
+ m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
+ t->by, t->by > ts->tiling.row_start,
+ ts->tiling.col_end, ts->tiling.row_end,
+ 0, dst, f->cur.stride[0], top_sb_edge,
+ m, &angle, bw4, bh4, 0, tl_edge
+ HIGHBD_CALL_SUFFIX);
+ dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
+ tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
+ HIGHBD_CALL_SUFFIX);
+ dsp->mc.blend(dst, f->cur.stride[0], tmp,
+ bw4 * 4, bh4 * 4, II_MASK(0, bs, b));
+ }
+
+ if (!has_chroma) goto skip_inter_chroma_pred;
+
+ // sub8x8 derivation
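+ // with chroma subsampling, a 4-wide (or, for 4:2:0, 4-tall) luma block
+ // can share its chroma block with inter-coded neighbours; the shared
+ // chroma samples are then predicted from those neighbours' MVs below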
+ int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
+ refmvs_block *const *r;
+ if (is_sub8x8) {
+ assert(ss_hor == 1);
+ r = &t->rt.r[(t->by & 31) + 5];
+ if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
+ if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
+ if (bw4 == 1 && bh4 == ss_ver)
+ is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
+ }
+
+ // chroma prediction
+ if (is_sub8x8) {
+ assert(ss_hor == 1);
+ ptrdiff_t h_off = 0, v_off = 0;
+ if (bw4 == 1 && bh4 == ss_ver) {
+ for (int pl = 0; pl < 2; pl++) {
+ res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
+ NULL, f->cur.stride[1],
+ bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
+ r[-1][t->bx - 1].mv.mv[0],
+ &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1],
+ r[-1][t->bx - 1].ref.ref[0] - 1,
+ t->frame_thread.pass != 2 ? t->tl_4x4_filter :
+ f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
+ if (res) return res;
+ }
+ v_off = 2 * PXSTRIDE(f->cur.stride[1]);
+ h_off = 2;
+ }
+ if (bw4 == 1) {
+ const enum Filter2d left_filter_2d =
+ dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
+ for (int pl = 0; pl < 2; pl++) {
+ res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL,
+ f->cur.stride[1], bw4, bh4, t->bx - 1,
+ t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0],
+ &f->refp[r[0][t->bx - 1].ref.ref[0] - 1],
+ r[0][t->bx - 1].ref.ref[0] - 1,
+ t->frame_thread.pass != 2 ? left_filter_2d :
+ f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
+ if (res) return res;
+ }
+ h_off = 2;
+ }
+ if (bh4 == ss_ver) {
+ const enum Filter2d top_filter_2d =
+ dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
+ for (int pl = 0; pl < 2; pl++) {
+ res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL,
+ f->cur.stride[1], bw4, bh4, t->bx, t->by - 1,
+ 1 + pl, r[-1][t->bx].mv.mv[0],
+ &f->refp[r[-1][t->bx].ref.ref[0] - 1],
+ r[-1][t->bx].ref.ref[0] - 1,
+ t->frame_thread.pass != 2 ? top_filter_2d :
+ f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
+ if (res) return res;
+ }
+ v_off = 2 * PXSTRIDE(f->cur.stride[1]);
+ }
+ for (int pl = 0; pl < 2; pl++) {
+ res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1],
+ bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0],
+ refp, b->ref[0], filter_2d);
+ if (res) return res;
+ }
+ } else {
+ if (imin(cbw4, cbh4) > 1 &&
+ ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
+ (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
+ {
+ for (int pl = 0; pl < 2; pl++) {
+ res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL,
+ f->cur.stride[1], b_dim, 1 + pl, refp,
+ b->motion_mode == MM_WARP ? &t->warpmv :
+ &f->frame_hdr->gmv[b->ref[0]]);
+ if (res) return res;
+ }
+ } else {
+ for (int pl = 0; pl < 2; pl++) {
+ res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
+ NULL, f->cur.stride[1],
+ bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
+ t->bx & ~ss_hor, t->by & ~ss_ver,
+ 1 + pl, b->mv[0], refp, b->ref[0], filter_2d);
+ if (res) return res;
+ if (b->motion_mode == MM_OBMC) {
+ res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
+ f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
+ if (res) return res;
+ }
+ }
+ }
+ if (b->interintra_type) {
+ // FIXME for 8x32 with 4:2:2 subsampling, this probably does
+ // the wrong thing since it will select 4x16, not 4x32, as a
+ // transform size...
+ const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b);
+
+ for (int pl = 0; pl < 2; pl++) {
+ pixel *const tmp = bitfn(t->scratch.interintra);
+ pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
+ enum IntraPredMode m =
+ b->interintra_mode == II_SMOOTH_PRED ?
+ SMOOTH_PRED : b->interintra_mode;
+ int angle = 0;
+ pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
+ const pixel *top_sb_edge = NULL;
+ if (!(t->by & (f->sb_step - 1))) {
+ top_sb_edge = f->ipred_edge[pl + 1];
+ const int sby = t->by >> f->sb_shift;
+ top_sb_edge += f->sb128w * 128 * (sby - 1);
+ }
+ m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
+ (t->bx >> ss_hor) >
+ (ts->tiling.col_start >> ss_hor),
+ t->by >> ss_ver,
+ (t->by >> ss_ver) >
+ (ts->tiling.row_start >> ss_ver),
+ ts->tiling.col_end >> ss_hor,
+ ts->tiling.row_end >> ss_ver,
+ 0, uvdst, f->cur.stride[1],
+ top_sb_edge, m,
+ &angle, cbw4, cbh4, 0, tl_edge
+ HIGHBD_CALL_SUFFIX);
+ dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
+ tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
+ HIGHBD_CALL_SUFFIX);
+ dsp->mc.blend(uvdst, f->cur.stride[1], tmp,
+ cbw4 * 4, cbh4 * 4, ii_mask);
+ }
+ }
+ }
+
+ skip_inter_chroma_pred: {}
+ t->tl_4x4_filter = filter_2d;
+ } else {
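+ // compound inter: compute both single-reference predictions into
+ // intermediate buffers, then combine them by plain average,
+ // distance-weighted average, or a wedge/segmentation mask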
+ const enum Filter2d filter_2d = b->filter2d;
+ // Maximum super block size is 128x128
+ int16_t (*tmp)[128 * 128] = t->scratch.compinter;
+ int jnt_weight;
+ uint8_t *const seg_mask = t->scratch.seg_mask;
+ const uint8_t *mask;
+
+ for (int i = 0; i < 2; i++) {
+ const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
+
+ if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
+ res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
+ &f->frame_hdr->gmv[b->ref[i]]);
+ if (res) return res;
+ } else {
+ res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
+ b->mv[i], refp, b->ref[i], filter_2d);
+ if (res) return res;
+ }
+ }
+ switch (b->comp_type) {
+ case COMP_INTER_AVG:
+ dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1],
+ bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX);
+ break;
+ case COMP_INTER_WEIGHTED_AVG:
+ jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
+ dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1],
+ bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX);
+ break;
+ case COMP_INTER_SEG:
+ dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0],
+ tmp[b->mask_sign], tmp[!b->mask_sign],
+ bw4 * 4, bh4 * 4, seg_mask,
+ b->mask_sign HIGHBD_CALL_SUFFIX);
+ mask = seg_mask;
+ break;
+ case COMP_INTER_WEDGE:
+ mask = WEDGE_MASK(0, bs, 0, b->wedge_idx);
+ dsp->mc.mask(dst, f->cur.stride[0],
+ tmp[b->mask_sign], tmp[!b->mask_sign],
+ bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX);
+ if (has_chroma)
+ mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx);
+ break;
+ }
+
+ // chroma
+ if (has_chroma) for (int pl = 0; pl < 2; pl++) {
+ for (int i = 0; i < 2; i++) {
+ const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
+ if (b->inter_mode == GLOBALMV_GLOBALMV &&
+ imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
+ {
+ res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor,
+ b_dim, 1 + pl,
+ refp, &f->frame_hdr->gmv[b->ref[i]]);
+ if (res) return res;
+ } else {
+ res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
+ 1 + pl, b->mv[i], refp, b->ref[i], filter_2d);
+ if (res) return res;
+ }
+ }
+ pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
+ switch (b->comp_type) {
+ case COMP_INTER_AVG:
+ dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
+ bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver
+ HIGHBD_CALL_SUFFIX);
+ break;
+ case COMP_INTER_WEIGHTED_AVG:
+ dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
+ bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight
+ HIGHBD_CALL_SUFFIX);
+ break;
+ case COMP_INTER_WEDGE:
+ case COMP_INTER_SEG:
+ dsp->mc.mask(uvdst, f->cur.stride[1],
+ tmp[b->mask_sign], tmp[!b->mask_sign],
+ bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask
+ HIGHBD_CALL_SUFFIX);
+ break;
+ }
+ }
+ }
+
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+ hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
+ if (has_chroma) {
+ hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1],
+ cbw4 * 4, cbh4 * 4, "u-pred");
+ hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1],
+ cbw4 * 4, cbh4 * 4, "v-pred");
+ }
+ }
+
+ const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+
+ if (b->skip) {
+ // reset coef contexts
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * 0x40)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+ if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
+ rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+ }
+ return 0;
+ }
+
+ const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
+ const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
+ const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
+
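+ // residual coding and reconstruction, in 64x64 luma-sample chunks: luma
+ // follows the recursive transform split tree via read_coef_tree(),
+ // chroma uses the fixed b->uvtx transform size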
+ for (int init_y = 0; init_y < bh4; init_y += 16) {
+ for (int init_x = 0; init_x < bw4; init_x += 16) {
+ // coefficient coding & inverse transforms
+ int y_off = !!init_y, y;
+ dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y;
+ for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
+ y += ytx->h, y_off++)
+ {
+ int x, x_off = !!init_x;
+ for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
+ x += ytx->w, x_off++)
+ {
+ read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
+ x_off, y_off, &dst[x * 4]);
+ t->bx += ytx->w;
+ }
+ dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h;
+ t->bx -= x;
+ t->by += ytx->h;
+ }
+ dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y;
+ t->by -= y;
+
+ // chroma coefs and inverse transform
+ if (has_chroma) for (int pl = 0; pl < 2; pl++) {
+ pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff +
+ (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver);
+ for (y = init_y >> ss_ver, t->by += init_y;
+ y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
+ {
+ int x;
+ for (x = init_x >> ss_hor, t->bx += init_x;
+ x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
+ {
+ coef *cf;
+ int eob;
+ enum TxfmType txtp;
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ const int cbi = *ts->frame_thread[p].cbi++;
+ cf = ts->frame_thread[p].cf;
+ ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
+ eob = cbi >> 5;
+ txtp = cbi & 0x1f;
+ } else {
+ uint8_t cf_ctx;
+ cf = bitfn(t->cf);
+ txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
+ bx4 + (x << ss_hor)];
+ eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+ &t->l.ccoef[pl][cby4 + y],
+ b->uvtx, bs, b, 0, 1 + pl,
+ cf, &txtp, &cf_ctx);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+ "txtp=%d,eob=%d]: r=%d\n",
+ pl, b->uvtx, txtp, eob, ts->msac.rng);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+ case_set_upto16_with_default( \
+ imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver),
+ l., 1, cby4 + y);
+ case_set_upto16_with_default( \
+ imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+ a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
+ }
+ if (eob >= 0) {
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
+ dsp->itx.itxfm_add[b->uvtx]
+ [txtp](&uvdst[4 * x],
+ f->cur.stride[1],
+ cf, eob HIGHBD_CALL_SUFFIX);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ hex_dump(&uvdst[4 * x], f->cur.stride[1],
+ uvtx->w * 4, uvtx->h * 4, "recon");
+ }
+ t->bx += uvtx->w << ss_hor;
+ }
+ uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h;
+ t->bx -= x << ss_hor;
+ t->by += uvtx->h << ss_ver;
+ }
+ t->by -= y << ss_ver;
+ }
+ }
+ }
+ return 0;
+}
+
+void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
+ if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) ||
+ (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1]))
+ {
+ return;
+ }
+ const int y = sby * f->sb_step * 4;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ pixel *const p[3] = {
+ f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
+ f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+ f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
+ };
+ Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
+ bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
+ f->lf.start_of_tile_row[sby]);
+}
+
+void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
+ const int y = sby * f->sb_step * 4;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ pixel *const p[3] = {
+ f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
+ f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+ f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
+ };
+ Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
+ if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK &&
+ (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
+ {
+ bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
+ }
+ if (f->seq_hdr->cdef || f->lf.restore_planes) {
+ // Store loop filtered pixels required by CDEF / LR
+ bytefn(dav1d_copy_lpf)(f, p, sby);
+ }
+}
+
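+// CDEF for one superblock row: the bottom block rows of each row are
+// deferred until the row below them has been deblocked, so this first
+// finishes the previous superblock row before filtering the current one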
+void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
+ const Dav1dFrameContext *const f = tc->f;
+ if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return;
+ const int sbsz = f->sb_step;
+ const int y = sby * sbsz * 4;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ pixel *const p[3] = {
+ f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
+ f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+ f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
+ };
+ Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
+ Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
+ const int start = sby * sbsz;
+ if (sby) {
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ pixel *p_up[3] = {
+ p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
+ p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+ p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+ };
+ bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby);
+ }
+ const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
+ const int end = imin(start + n_blks, f->bh);
+ bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby);
+}
+
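+// super-resolution: horizontally upscale one superblock row from the
+// decoded reduced-width picture into the full-width output picture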
+void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
+ const int sbsz = f->sb_step;
+ const int y = sby * sbsz * 4;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const pixel *const p[3] = {
+ f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
+ f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+ f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
+ };
+ pixel *const sr_p[3] = {
+ f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
+ f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
+ f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
+ };
+ const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
+ for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
+ const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int h_start = 8 * !!sby >> ss_ver;
+ const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
+ pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
+ const ptrdiff_t src_stride = f->cur.stride[!!pl];
+ const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);
+ const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver;
+ const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
+ const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
+ const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
+
+ f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
+ imin(img_h, h_end) + h_start, src_w,
+ f->resize_step[!!pl], f->resize_start[!!pl]
+ HIGHBD_CALL_SUFFIX);
+ }
+}
+
+void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
+ if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return;
+ const int y = sby * f->sb_step * 4;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ pixel *const sr_p[3] = {
+ f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
+ f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
+ f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
+ };
+ bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
+}
+
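+// run all in-loop filters for one superblock row, in order: deblocking
+// (columns, then rows), CDEF, super-resolution upscaling and loop
+// restoration, each only when enabled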
+void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
+ bytefn(dav1d_filter_sbrow_deblock_cols)(f, sby);
+ bytefn(dav1d_filter_sbrow_deblock_rows)(f, sby);
+ if (f->seq_hdr->cdef)
+ bytefn(dav1d_filter_sbrow_cdef)(f->c->tc, sby);
+ if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
+ bytefn(dav1d_filter_sbrow_resize)(f, sby);
+ if (f->lf.restore_planes)
+ bytefn(dav1d_filter_sbrow_lr)(f, sby);
+}
+
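+// copy the bottom pixel row of this superblock row (per plane) into
+// f->ipred_edge so that the next superblock row can use it as its top
+// edge for intra prediction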
+void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
+ const Dav1dFrameContext *const f = t->f;
+ Dav1dTileState *const ts = t->ts;
+ const int sby = t->by >> f->sb_shift;
+ const int sby_off = f->sb128w * 128 * sby;
+ const int x_off = ts->tiling.col_start;
+
+ const pixel *const y =
+ ((const pixel *) f->cur.data[0]) + x_off * 4 +
+ ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]);
+ pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
+ 4 * (ts->tiling.col_end - x_off));
+
+ if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+
+ const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
+ (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]);
+ for (int pl = 1; pl <= 2; pl++)
+ pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
+ &((const pixel *) f->cur.data[pl])[uv_off],
+ 4 * (ts->tiling.col_end - x_off) >> ss_hor);
+ }
+}
+
+void bytefn(dav1d_copy_pal_block_y)(Dav1dTaskContext *const t,
+ const int bx4, const int by4,
+ const int bw4, const int bh4)
+{
+ const Dav1dFrameContext *const f = t->f;
+ pixel *const pal = t->frame_thread.pass ?
+ f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+ ((t->bx >> 1) + (t->by & 1))][0] :
+ bytefn(t->scratch.pal)[0];
+ for (int x = 0; x < bw4; x++)
+ memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel));
+ for (int y = 0; y < bh4; y++)
+ memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel));
+}
+
+void bytefn(dav1d_copy_pal_block_uv)(Dav1dTaskContext *const t,
+ const int bx4, const int by4,
+ const int bw4, const int bh4)
+{
+ const Dav1dFrameContext *const f = t->f;
+ const pixel (*const pal)[8] = t->frame_thread.pass ?
+ f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+ ((t->bx >> 1) + (t->by & 1))] :
+ bytefn(t->scratch.pal);
+ // see aomedia bug 2183 for why we use luma coordinates here
+ for (int pl = 1; pl <= 2; pl++) {
+ for (int x = 0; x < bw4; x++)
+ memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel));
+ for (int y = 0; y < bh4; y++)
+ memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel));
+ }
+}
+
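+// parse one palette plane: merge the left/above palette caches in sorted
+// order, read which cached colours are reused, decode the remaining
+// colours as ascending deltas, then merge both sets back into sorted order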
+void bytefn(dav1d_read_pal_plane)(Dav1dTaskContext *const t, Av1Block *const b,
+ const int pl, const int sz_ctx,
+ const int bx4, const int by4)
+{
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dFrameContext *const f = t->f;
+ const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+ ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
+ pixel cache[16], used_cache[8];
+ int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
+ int n_cache = 0;
+ // don't reuse above palette outside SB64 boundaries
+ int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
+ const pixel *l = bytefn(t->al_pal)[1][by4][pl];
+ const pixel *a = bytefn(t->al_pal)[0][bx4][pl];
+
+ // fill/sort cache
+ while (l_cache && a_cache) {
+ if (*l < *a) {
+ if (!n_cache || cache[n_cache - 1] != *l)
+ cache[n_cache++] = *l;
+ l++;
+ l_cache--;
+ } else {
+ if (*a == *l) {
+ l++;
+ l_cache--;
+ }
+ if (!n_cache || cache[n_cache - 1] != *a)
+ cache[n_cache++] = *a;
+ a++;
+ a_cache--;
+ }
+ }
+ if (l_cache) {
+ do {
+ if (!n_cache || cache[n_cache - 1] != *l)
+ cache[n_cache++] = *l;
+ l++;
+ } while (--l_cache > 0);
+ } else if (a_cache) {
+ do {
+ if (!n_cache || cache[n_cache - 1] != *a)
+ cache[n_cache++] = *a;
+ a++;
+ } while (--a_cache > 0);
+ }
+
+ // find reused cache entries
+ int i = 0;
+ for (int n = 0; n < n_cache && i < pal_sz; n++)
+ if (dav1d_msac_decode_bool_equi(&ts->msac))
+ used_cache[i++] = cache[n];
+ const int n_used_cache = i;
+
+ // parse new entries
+ pixel *const pal = t->frame_thread.pass ?
+ f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+ ((t->bx >> 1) + (t->by & 1))][pl] :
+ bytefn(t->scratch.pal)[pl];
+ if (i < pal_sz) {
+ const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
+ int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc);
+
+ if (i < pal_sz) {
+ int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
+ const int max = (1 << bpc) - 1;
+
+ do {
+ const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
+ prev = pal[i++] = imin(prev + delta + !pl, max);
+ if (prev + !pl >= max) {
+ for (; i < pal_sz; i++)
+ pal[i] = max;
+ break;
+ }
+ bits = imin(bits, 1 + ulog2(max - prev - !pl));
+ } while (i < pal_sz);
+ }
+
+ // merge cache+new entries
+ int n = 0, m = n_used_cache;
+ for (i = 0; i < pal_sz; i++) {
+ if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
+ pal[i] = used_cache[n++];
+ } else {
+ assert(m < pal_sz);
+ pal[i] = pal[m++];
+ }
+ }
+ } else {
+ memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
+ }
+
+ if (DEBUG_BLOCK_INFO) {
+ printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
+ pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
+ for (int n = 0; n < n_cache; n++)
+ printf("%c%02x", n ? ' ' : '[', cache[n]);
+ printf("%s, pal=", n_cache ? "]" : "[]");
+ for (int n = 0; n < pal_sz; n++)
+ printf("%c%02x", n ? ' ' : '[', pal[n]);
+ printf("]\n");
+ }
+}
+
+void bytefn(dav1d_read_pal_uv)(Dav1dTaskContext *const t, Av1Block *const b,
+ const int sz_ctx, const int bx4, const int by4)
+{
+ bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4);
+
+ // V pal coding
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dFrameContext *const f = t->f;
+ pixel *const pal = t->frame_thread.pass ?
+ f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+ ((t->bx >> 1) + (t->by & 1))][2] :
+ bytefn(t->scratch.pal)[2];
+ const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
+ if (dav1d_msac_decode_bool_equi(&ts->msac)) {
+ const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2);
+ int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc);
+ const int max = (1 << bpc) - 1;
+ for (int i = 1; i < b->pal_sz[1]; i++) {
+ int delta = dav1d_msac_decode_bools(&ts->msac, bits);
+ if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
+ prev = pal[i] = (prev + delta) & max;
+ }
+ } else {
+ for (int i = 0; i < b->pal_sz[1]; i++)
+ pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc);
+ }
+ if (DEBUG_BLOCK_INFO) {
+ printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
+ for (int n = 0; n < b->pal_sz[1]; n++)
+ printf("%c%02x", n ? ' ' : '[', pal[n]);
+ printf("]\n");
+ }
+}
diff --git a/third_party/dav1d/src/ref.c b/third_party/dav1d/src/ref.c
new file mode 100644
index 0000000000..5a4d3a2457
--- /dev/null
+++ b/third_party/dav1d/src/ref.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "src/ref.h"
+
+static void default_free_callback(const uint8_t *const data, void *const user_data) {
+ assert(data == user_data);
+ dav1d_free_aligned(user_data);
+}
+
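+// allocate the payload and its Dav1dRef header as one aligned block, with
+// the header placed directly behind the pointer-aligned payload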
+Dav1dRef *dav1d_ref_create(const enum AllocationType type, size_t size) {
+ size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+ uint8_t *const data = dav1d_alloc_aligned(type, size + sizeof(Dav1dRef), 64);
+ if (!data) return NULL;
+
+ Dav1dRef *const res = (Dav1dRef*)(data + size);
+ res->const_data = res->user_data = res->data = data;
+ atomic_init(&res->ref_cnt, 1);
+ res->free_ref = 0;
+ res->free_callback = default_free_callback;
+
+ return res;
+}
+
+static void pool_free_callback(const uint8_t *const data, void *const user_data) {
+ dav1d_mem_pool_push((Dav1dMemPool*)data, user_data);
+}
+
+Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *const pool, size_t size) {
+ size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+ Dav1dMemPoolBuffer *const buf =
+ dav1d_mem_pool_pop(pool, size + sizeof(Dav1dRef));
+ if (!buf) return NULL;
+
+ Dav1dRef *const res = &((Dav1dRef*)buf)[-1];
+ res->data = buf->data;
+ res->const_data = pool;
+ atomic_init(&res->ref_cnt, 1);
+ res->free_ref = 0;
+ res->free_callback = pool_free_callback;
+ res->user_data = buf;
+
+ return res;
+}
+
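+// release one reference: clears the caller's pointer and, once the last
+// reference is gone, invokes the free callback (and frees the Dav1dRef
+// itself if it was allocated separately)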
+void dav1d_ref_dec(Dav1dRef **const pref) {
+ assert(pref != NULL);
+
+ Dav1dRef *const ref = *pref;
+ if (!ref) return;
+
+ *pref = NULL;
+ if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) {
+ const int free_ref = ref->free_ref;
+ ref->free_callback(ref->const_data, ref->user_data);
+ if (free_ref) dav1d_free(ref);
+ }
+}
diff --git a/third_party/dav1d/src/ref.h b/third_party/dav1d/src/ref.h
new file mode 100644
index 0000000000..f1c96eb914
--- /dev/null
+++ b/third_party/dav1d/src/ref.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_REF_H
+#define DAV1D_SRC_REF_H
+
+#include "dav1d/dav1d.h"
+
+#include "src/mem.h"
+#include "src/thread.h"
+
+#include <stdatomic.h>
+#include <stddef.h>
+
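+// reference-counted buffer: ref_cnt is the atomic count, free_callback()
+// is called with (const_data, user_data) once the count drops to zero, and
+// free_ref signals that the Dav1dRef itself must be freed separately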
+struct Dav1dRef {
+ void *data;
+ const void *const_data;
+ atomic_int ref_cnt;
+ int free_ref;
+ void (*free_callback)(const uint8_t *data, void *user_data);
+ void *user_data;
+};
+
+#if !TRACK_HEAP_ALLOCATIONS
+#define dav1d_ref_create(type, size) dav1d_ref_create(size)
+#endif
+
+Dav1dRef *dav1d_ref_create(enum AllocationType type, size_t size);
+Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *pool, size_t size);
+void dav1d_ref_dec(Dav1dRef **ref);
+
+static inline Dav1dRef *dav1d_ref_init(Dav1dRef *const ref, const void *const ptr,
+ void (*const free_callback)(const uint8_t *data, void *user_data),
+ void *const user_data, const int free_ref)
+{
+ ref->data = NULL;
+ ref->const_data = ptr;
+ atomic_init(&ref->ref_cnt, 1);
+ ref->free_ref = free_ref;
+ ref->free_callback = free_callback;
+ ref->user_data = user_data;
+ return ref;
+}
+
+static inline void dav1d_ref_inc(Dav1dRef *const ref) {
+ atomic_fetch_add_explicit(&ref->ref_cnt, 1, memory_order_relaxed);
+}
+
+static inline int dav1d_ref_is_writable(Dav1dRef *const ref) {
+ return atomic_load(&ref->ref_cnt) == 1 && ref->data;
+}
+
+#endif /* DAV1D_SRC_REF_H */
diff --git a/third_party/dav1d/src/refmvs.c b/third_party/dav1d/src/refmvs.c
new file mode 100644
index 0000000000..200afebde7
--- /dev/null
+++ b/third_party/dav1d/src/refmvs.c
@@ -0,0 +1,944 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * Copyright © 2020, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <limits.h>
+#include <stdlib.h>
+
+#include "dav1d/common.h"
+
+#include "common/intops.h"
+
+#include "src/env.h"
+#include "src/mem.h"
+#include "src/refmvs.h"
+
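+// add one spatial neighbour's MV (or MV pair for compound references) to
+// the candidate stack: duplicates only get their weight increased, new
+// entries are appended while fewer than 8 candidates exist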
+static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cnt,
+ const int weight, const refmvs_block *const b,
+ const union refmvs_refpair ref, const mv gmv[2],
+ int *const have_newmv_match,
+ int *const have_refmv_match)
+{
+ if (b->mv.mv[0].n == INVALID_MV) return; // intra block, no intrabc
+
+ if (ref.ref[1] == -1) {
+ for (int n = 0; n < 2; n++) {
+ if (b->ref.ref[n] == ref.ref[0]) {
+ const mv cand_mv = ((b->mf & 1) && gmv[0].n != INVALID_MV) ?
+ gmv[0] : b->mv.mv[n];
+
+ *have_refmv_match = 1;
+ *have_newmv_match |= b->mf >> 1;
+
+ const int last = *cnt;
+ for (int m = 0; m < last; m++)
+ if (mvstack[m].mv.mv[0].n == cand_mv.n) {
+ mvstack[m].weight += weight;
+ return;
+ }
+
+ if (last < 8) {
+ mvstack[last].mv.mv[0] = cand_mv;
+ mvstack[last].weight = weight;
+ *cnt = last + 1;
+ }
+ return;
+ }
+ }
+ } else if (b->ref.pair == ref.pair) {
+ const refmvs_mvpair cand_mv = { .mv = {
+ [0] = ((b->mf & 1) && gmv[0].n != INVALID_MV) ? gmv[0] : b->mv.mv[0],
+ [1] = ((b->mf & 1) && gmv[1].n != INVALID_MV) ? gmv[1] : b->mv.mv[1],
+ }};
+
+ *have_refmv_match = 1;
+ *have_newmv_match |= b->mf >> 1;
+
+ const int last = *cnt;
+ for (int n = 0; n < last; n++)
+ if (mvstack[n].mv.n == cand_mv.n) {
+ mvstack[n].weight += weight;
+ return;
+ }
+
+ if (last < 8) {
+ mvstack[last].mv = cand_mv;
+ mvstack[last].weight = weight;
+ *cnt = last + 1;
+ }
+ }
+}
+
+static int scan_row(refmvs_candidate *const mvstack, int *const cnt,
+ const union refmvs_refpair ref, const mv gmv[2],
+ const refmvs_block *b, const int bw4, const int w4,
+ const int max_rows, const int step,
+ int *const have_newmv_match, int *const have_refmv_match)
+{
+ const refmvs_block *cand_b = b;
+ const enum BlockSize first_cand_bs = cand_b->bs;
+ const uint8_t *const first_cand_b_dim = dav1d_block_dimensions[first_cand_bs];
+ int cand_bw4 = first_cand_b_dim[0];
+ int len = imax(step, imin(bw4, cand_bw4));
+
+ if (bw4 <= cand_bw4) {
+ // FIXME weight can be higher for odd blocks (bx4 & 1), but then the
+ // position of the first block has to be odd already, i.e. not just
+ // for row_offset=-3/-5
+ // FIXME why can this not be cand_bw4?
+ const int weight = bw4 == 1 ? 2 :
+ imax(2, imin(2 * max_rows, first_cand_b_dim[1]));
+ add_spatial_candidate(mvstack, cnt, len * weight, cand_b, ref, gmv,
+ have_newmv_match, have_refmv_match);
+ return weight >> 1;
+ }
+
+ for (int x = 0;;) {
+ // FIXME if we overhang above, we could fill a bitmask so we don't have
+ // to repeat the add_spatial_candidate() for the next row, but just increase
+ // the weight here
+ add_spatial_candidate(mvstack, cnt, len * 2, cand_b, ref, gmv,
+ have_newmv_match, have_refmv_match);
+ x += len;
+ if (x >= w4) return 1;
+ cand_b = &b[x];
+ cand_bw4 = dav1d_block_dimensions[cand_b->bs][0];
+ assert(cand_bw4 < bw4);
+ len = imax(step, cand_bw4);
+ }
+}
+
+static int scan_col(refmvs_candidate *const mvstack, int *const cnt,
+ const union refmvs_refpair ref, const mv gmv[2],
+ /*const*/ refmvs_block *const *b, const int bh4, const int h4,
+ const int bx4, const int max_cols, const int step,
+ int *const have_newmv_match, int *const have_refmv_match)
+{
+ const refmvs_block *cand_b = &b[0][bx4];
+ const enum BlockSize first_cand_bs = cand_b->bs;
+ const uint8_t *const first_cand_b_dim = dav1d_block_dimensions[first_cand_bs];
+ int cand_bh4 = first_cand_b_dim[1];
+ int len = imax(step, imin(bh4, cand_bh4));
+
+ if (bh4 <= cand_bh4) {
+ // FIXME weight can be higher for odd blocks (by4 & 1), but then the
+ // position of the first block has to be odd already, i.e. not just
+ // for col_offset=-3/-5
+ // FIXME why can this not be cand_bh4?
+ const int weight = bh4 == 1 ? 2 :
+ imax(2, imin(2 * max_cols, first_cand_b_dim[0]));
+ add_spatial_candidate(mvstack, cnt, len * weight, cand_b, ref, gmv,
+ have_newmv_match, have_refmv_match);
+ return weight >> 1;
+ }
+
+ for (int y = 0;;) {
+ // FIXME if we overhang above, we could fill a bitmask so we don't have
+ // to repeat the add_spatial_candidate() for the next row, but just increase
+ // the weight here
+ add_spatial_candidate(mvstack, cnt, len * 2, cand_b, ref, gmv,
+ have_newmv_match, have_refmv_match);
+ y += len;
+ if (y >= h4) return 1;
+ cand_b = &b[y][bx4];
+ cand_bh4 = dav1d_block_dimensions[cand_b->bs][1];
+ assert(cand_bh4 < bh4);
+ len = imax(step, cand_bh4);
+ }
+}
+
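+// scale an MV by num/den in fixed point: div_mult[den] approximates
+// 2^14 / den, so (mv * num * div_mult[den]) >> 14 yields the projection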
+static inline union mv mv_projection(const union mv mv, const int num, const int den) {
+ static const uint16_t div_mult[32] = {
+ 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340,
+ 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092,
+ 1024, 963, 910, 862, 819, 780, 744, 712,
+ 682, 655, 630, 606, 585, 564, 546, 528
+ };
+ assert(den > 0 && den < 32);
+ assert(num > -32 && num < 32);
+ const int frac = num * div_mult[den];
+ const int y = mv.y * frac, x = mv.x * frac;
+ // Round and clip according to AV1 spec section 7.9.3
+ return (union mv) { // 0x3fff == (1 << 14) - 1
+ .y = iclip((y + 8192 + (y >> 31)) >> 14, -0x3fff, 0x3fff),
+ .x = iclip((x + 8192 + (x >> 31)) >> 14, -0x3fff, 0x3fff)
+ };
+}
+
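+// add a candidate from the projected temporal MV buffer; for
+// single-reference blocks this also derives the GLOBALMV context from how
+// far the projected MV deviates from the global MV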
+static void add_temporal_candidate(const refmvs_frame *const rf,
+ refmvs_candidate *const mvstack, int *const cnt,
+ const refmvs_temporal_block *const rb,
+ const union refmvs_refpair ref, int *const globalmv_ctx,
+ const union mv gmv[])
+{
+ if (rb->mv.n == INVALID_MV) return;
+
+ union mv mv = mv_projection(rb->mv, rf->pocdiff[ref.ref[0] - 1], rb->ref);
+ fix_mv_precision(rf->frm_hdr, &mv);
+
+ const int last = *cnt;
+ if (ref.ref[1] == -1) {
+ if (globalmv_ctx)
+ *globalmv_ctx = (abs(mv.x - gmv[0].x) | abs(mv.y - gmv[0].y)) >= 16;
+
+ for (int n = 0; n < last; n++)
+ if (mvstack[n].mv.mv[0].n == mv.n) {
+ mvstack[n].weight += 2;
+ return;
+ }
+ if (last < 8) {
+ mvstack[last].mv.mv[0] = mv;
+ mvstack[last].weight = 2;
+ *cnt = last + 1;
+ }
+ } else {
+ refmvs_mvpair mvp = { .mv = {
+ [0] = mv,
+ [1] = mv_projection(rb->mv, rf->pocdiff[ref.ref[1] - 1], rb->ref),
+ }};
+ fix_mv_precision(rf->frm_hdr, &mvp.mv[1]);
+
+ for (int n = 0; n < last; n++)
+ if (mvstack[n].mv.n == mvp.n) {
+ mvstack[n].weight += 2;
+ return;
+ }
+ if (last < 8) {
+ mvstack[last].mv = mvp;
+ mvstack[last].weight = 2;
+ *cnt = last + 1;
+ }
+ }
+}
+
+static void add_compound_extended_candidate(refmvs_candidate *const same,
+ int *const same_count,
+ const refmvs_block *const cand_b,
+ const int sign0, const int sign1,
+ const union refmvs_refpair ref,
+ const uint8_t *const sign_bias)
+{
+ refmvs_candidate *const diff = &same[2];
+ int *const diff_count = &same_count[2];
+
+ for (int n = 0; n < 2; n++) {
+ const int cand_ref = cand_b->ref.ref[n];
+
+ if (cand_ref <= 0) break;
+
+ mv cand_mv = cand_b->mv.mv[n];
+ if (cand_ref == ref.ref[0]) {
+ if (same_count[0] < 2)
+ same[same_count[0]++].mv.mv[0] = cand_mv;
+ if (diff_count[1] < 2) {
+ if (sign1 ^ sign_bias[cand_ref - 1]) {
+ cand_mv.y = -cand_mv.y;
+ cand_mv.x = -cand_mv.x;
+ }
+ diff[diff_count[1]++].mv.mv[1] = cand_mv;
+ }
+ } else if (cand_ref == ref.ref[1]) {
+ if (same_count[1] < 2)
+ same[same_count[1]++].mv.mv[1] = cand_mv;
+ if (diff_count[0] < 2) {
+ if (sign0 ^ sign_bias[cand_ref - 1]) {
+ cand_mv.y = -cand_mv.y;
+ cand_mv.x = -cand_mv.x;
+ }
+ diff[diff_count[0]++].mv.mv[0] = cand_mv;
+ }
+ } else {
+ mv i_cand_mv = (union mv) {
+ .x = -cand_mv.x,
+ .y = -cand_mv.y
+ };
+
+ if (diff_count[0] < 2) {
+ diff[diff_count[0]++].mv.mv[0] =
+ sign0 ^ sign_bias[cand_ref - 1] ?
+ i_cand_mv : cand_mv;
+ }
+
+ if (diff_count[1] < 2) {
+ diff[diff_count[1]++].mv.mv[1] =
+ sign1 ^ sign_bias[cand_ref - 1] ?
+ i_cand_mv : cand_mv;
+ }
+ }
+ }
+}
+
+static void add_single_extended_candidate(refmvs_candidate mvstack[8], int *const cnt,
+ const refmvs_block *const cand_b,
+ const int sign, const uint8_t *const sign_bias)
+{
+ for (int n = 0; n < 2; n++) {
+ const int cand_ref = cand_b->ref.ref[n];
+
+ if (cand_ref <= 0) break;
+ // we need to continue even if cand_ref == ref.ref[0], since
+ // the candidate could have been added as a globalmv variant,
+ // which changes the value
+ // FIXME if scan_{row,col}() returned a mask for the nearest
+ // edge, we could skip the appropriate ones here
+
+ mv cand_mv = cand_b->mv.mv[n];
+ if (sign ^ sign_bias[cand_ref - 1]) {
+ cand_mv.y = -cand_mv.y;
+ cand_mv.x = -cand_mv.x;
+ }
+
+ int m;
+ const int last = *cnt;
+ for (m = 0; m < last; m++)
+ if (cand_mv.n == mvstack[m].mv.mv[0].n)
+ break;
+ if (m == last) {
+ mvstack[m].mv.mv[0] = cand_mv;
+ mvstack[m].weight = 2; // "minimal"
+ *cnt = last + 1;
+ }
+ }
+}
+
+/*
+ * refmvs_frame allocates memory for one sbrow (32 blocks high, whole frame
+ * wide) of 4x4-resolution refmvs_block entries for spatial MV referencing.
+ * mvrefs_tile[] keeps a list of 35 (32 + 3 above) pointers into this memory,
+ * and each sbrow, the bottom entries (y=27/29/31) are exchanged with the top
+ * (-5/-3/-1) pointers by calling dav1d_refmvs_tile_sbrow_init() at the start
+ * of each tile/sbrow.
+ *
+ * For temporal MV referencing, we call dav1d_refmvs_save_tmvs() at the end of
+ * each tile/sbrow (when tile column threading is enabled), or at the start of
+ * each interleaved sbrow (i.e. once for all tile columns together, when tile
+ * column threading is disabled). This will copy the 4x4-resolution spatial MVs
+ * into 8x8-resolution refmvs_temporal_block structures. Then, for subsequent
+ * frames, at the start of each tile/sbrow (when tile column threading is
+ * enabled) or at the start of each interleaved sbrow (when tile column
+ * threading is disabled), we call load_tmvs(), which will project the MVs to
+ * their respective position in the current frame.
+ */
+
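+// build the MV candidate list (up to 8 entries) for one block: scan the
+// adjacent top row and left column, then top-right, the projected temporal
+// MVs, top-left and the outer "secondary" rows/columns; weight and sort
+// the result, fill short lists from extended and global MV candidates,
+// clamp all vectors around the block, and derive the NEWMV/REFMV context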
+void dav1d_refmvs_find(const refmvs_tile *const rt,
+ refmvs_candidate mvstack[8], int *const cnt,
+ int *const ctx,
+ const union refmvs_refpair ref, const enum BlockSize bs,
+ const enum EdgeFlags edge_flags,
+ const int by4, const int bx4)
+{
+ const refmvs_frame *const rf = rt->rf;
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = b_dim[0], w4 = imin(imin(bw4, 16), rt->tile_col.end - bx4);
+ const int bh4 = b_dim[1], h4 = imin(imin(bh4, 16), rt->tile_row.end - by4);
+ mv gmv[2], tgmv[2];
+
+ *cnt = 0;
+ assert(ref.ref[0] >= 0 && ref.ref[0] <= 8 &&
+ ref.ref[1] >= -1 && ref.ref[1] <= 8);
+ if (ref.ref[0] > 0) {
+ tgmv[0] = get_gmv_2d(&rf->frm_hdr->gmv[ref.ref[0] - 1],
+ bx4, by4, bw4, bh4, rf->frm_hdr);
+ gmv[0] = rf->frm_hdr->gmv[ref.ref[0] - 1].type > DAV1D_WM_TYPE_TRANSLATION ?
+ tgmv[0] : (mv) { .n = INVALID_MV };
+ } else {
+ tgmv[0] = (mv) { .n = 0 };
+ gmv[0] = (mv) { .n = INVALID_MV };
+ }
+ if (ref.ref[1] > 0) {
+ tgmv[1] = get_gmv_2d(&rf->frm_hdr->gmv[ref.ref[1] - 1],
+ bx4, by4, bw4, bh4, rf->frm_hdr);
+ gmv[1] = rf->frm_hdr->gmv[ref.ref[1] - 1].type > DAV1D_WM_TYPE_TRANSLATION ?
+ tgmv[1] : (mv) { .n = INVALID_MV };
+ }
+
+ // top
+ int have_newmv = 0, have_col_mvs = 0, have_row_mvs = 0;
+ unsigned max_rows = 0, n_rows = ~0;
+ const refmvs_block *b_top;
+ if (by4 > rt->tile_row.start) {
+ max_rows = imin((by4 - rt->tile_row.start + 1) >> 1, 2 + (bh4 > 1));
+ b_top = &rt->r[(by4 & 31) - 1 + 5][bx4];
+ n_rows = scan_row(mvstack, cnt, ref, gmv, b_top,
+ bw4, w4, max_rows, bw4 >= 16 ? 4 : 1,
+ &have_newmv, &have_row_mvs);
+ }
+
+ // left
+ unsigned max_cols = 0, n_cols = ~0U;
+ refmvs_block *const *b_left;
+ if (bx4 > rt->tile_col.start) {
+ max_cols = imin((bx4 - rt->tile_col.start + 1) >> 1, 2 + (bw4 > 1));
+ b_left = &rt->r[(by4 & 31) + 5];
+ n_cols = scan_col(mvstack, cnt, ref, gmv, b_left,
+ bh4, h4, bx4 - 1, max_cols, bh4 >= 16 ? 4 : 1,
+ &have_newmv, &have_col_mvs);
+ }
+
+ // top/right
+ if (n_rows != ~0U && edge_flags & EDGE_I444_TOP_HAS_RIGHT &&
+ imax(bw4, bh4) <= 16 && bw4 + bx4 < rt->tile_col.end)
+ {
+ add_spatial_candidate(mvstack, cnt, 4, &b_top[bw4], ref, gmv,
+ &have_newmv, &have_row_mvs);
+ }
+
+ const int nearest_match = have_col_mvs + have_row_mvs;
+ const int nearest_cnt = *cnt;
+ for (int n = 0; n < nearest_cnt; n++)
+ mvstack[n].weight += 640;
+
+ // temporal
+ int globalmv_ctx = rf->frm_hdr->use_ref_frame_mvs;
+ if (rf->use_ref_frame_mvs) {
+ const ptrdiff_t stride = rf->rp_stride;
+ const int by8 = by4 >> 1, bx8 = bx4 >> 1;
+ const refmvs_temporal_block *const rbi = &rt->rp_proj[(by8 & 15) * stride + bx8];
+ const refmvs_temporal_block *rb = rbi;
+ const int step_h = bw4 >= 16 ? 2 : 1, step_v = bh4 >= 16 ? 2 : 1;
+ const int w8 = imin((w4 + 1) >> 1, 8), h8 = imin((h4 + 1) >> 1, 8);
+ for (int y = 0; y < h8; y += step_v) {
+ for (int x = 0; x < w8; x+= step_h) {
+ add_temporal_candidate(rf, mvstack, cnt, &rb[x], ref,
+ !(x | y) ? &globalmv_ctx : NULL, tgmv);
+ }
+ rb += stride * step_v;
+ }
+ if (imin(bw4, bh4) >= 2 && imax(bw4, bh4) < 16) {
+ const int bh8 = bh4 >> 1, bw8 = bw4 >> 1;
+ rb = &rbi[bh8 * stride];
+ const int has_bottom = by8 + bh8 < imin(rt->tile_row.end >> 1,
+ (by8 & ~7) + 8);
+ if (has_bottom && bx8 - 1 >= imax(rt->tile_col.start >> 1, bx8 & ~7)) {
+ add_temporal_candidate(rf, mvstack, cnt, &rb[-1], ref,
+ NULL, NULL);
+ }
+ if (bx8 + bw8 < imin(rt->tile_col.end >> 1, (bx8 & ~7) + 8)) {
+ if (has_bottom) {
+ add_temporal_candidate(rf, mvstack, cnt, &rb[bw8], ref,
+ NULL, NULL);
+ }
+ if (by8 + bh8 - 1 < imin(rt->tile_row.end >> 1, (by8 & ~7) + 8)) {
+ add_temporal_candidate(rf, mvstack, cnt, &rb[bw8 - stride],
+ ref, NULL, NULL);
+ }
+ }
+ }
+ }
+ assert(*cnt <= 8);
+
+ // top/left (which, confusingly, is part of "secondary" references)
+ int have_dummy_newmv_match;
+ if ((n_rows | n_cols) != ~0U) {
+ add_spatial_candidate(mvstack, cnt, 4, &b_top[-1], ref, gmv,
+ &have_dummy_newmv_match, &have_row_mvs);
+ }
+
+ // "secondary" (non-direct neighbour) top & left edges
+ // what is different about secondary is that everything is now in 8x8 resolution
+ for (int n = 2; n <= 3; n++) {
+ if ((unsigned) n > n_rows && (unsigned) n <= max_rows) {
+ n_rows += scan_row(mvstack, cnt, ref, gmv,
+ &rt->r[(((by4 & 31) - 2 * n + 1) | 1) + 5][bx4 | 1],
+ bw4, w4, 1 + max_rows - n, bw4 >= 16 ? 4 : 2,
+ &have_dummy_newmv_match, &have_row_mvs);
+ }
+
+ if ((unsigned) n > n_cols && (unsigned) n <= max_cols) {
+ n_cols += scan_col(mvstack, cnt, ref, gmv, &rt->r[((by4 & 31) | 1) + 5],
+ bh4, h4, (bx4 - n * 2 + 1) | 1,
+ 1 + max_cols - n, bh4 >= 16 ? 4 : 2,
+ &have_dummy_newmv_match, &have_col_mvs);
+ }
+ }
+ assert(*cnt <= 8);
+
+ const int ref_match_count = have_col_mvs + have_row_mvs;
+
+ // context build-up
+ int refmv_ctx, newmv_ctx;
+ switch (nearest_match) {
+ case 0:
+ refmv_ctx = imin(2, ref_match_count);
+ newmv_ctx = ref_match_count > 0;
+ break;
+ case 1:
+ refmv_ctx = imin(ref_match_count * 3, 4);
+ newmv_ctx = 3 - have_newmv;
+ break;
+ case 2:
+ refmv_ctx = 5;
+ newmv_ctx = 5 - have_newmv;
+ break;
+ }
+
+ // sorting (nearest, then "secondary")
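+ // bubble sort by descending weight, done separately for the nearest
+ // candidates and for the remaining ones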
+ int len = nearest_cnt;
+ while (len) {
+ int last = 0;
+ for (int n = 1; n < len; n++) {
+ if (mvstack[n - 1].weight < mvstack[n].weight) {
+#define EXCHANGE(a, b) do { refmvs_candidate tmp = a; a = b; b = tmp; } while (0)
+ EXCHANGE(mvstack[n - 1], mvstack[n]);
+ last = n;
+ }
+ }
+ len = last;
+ }
+ len = *cnt;
+ while (len > nearest_cnt) {
+ int last = nearest_cnt;
+ for (int n = nearest_cnt + 1; n < len; n++) {
+ if (mvstack[n - 1].weight < mvstack[n].weight) {
+ EXCHANGE(mvstack[n - 1], mvstack[n]);
+#undef EXCHANGE
+ last = n;
+ }
+ }
+ len = last;
+ }
+
+ if (ref.ref[1] > 0) {
+ if (*cnt < 2) {
+ const int sign0 = rf->sign_bias[ref.ref[0] - 1];
+ const int sign1 = rf->sign_bias[ref.ref[1] - 1];
+ const int sz4 = imin(w4, h4);
+ refmvs_candidate *const same = &mvstack[*cnt];
+ int same_count[4] = { 0 };
+
+ // non-self references in top
+ if (n_rows != ~0U) for (int x = 0; x < sz4;) {
+ const refmvs_block *const cand_b = &b_top[x];
+ add_compound_extended_candidate(same, same_count, cand_b,
+ sign0, sign1, ref, rf->sign_bias);
+ x += dav1d_block_dimensions[cand_b->bs][0];
+ }
+
+ // non-self references in left
+ if (n_cols != ~0U) for (int y = 0; y < sz4;) {
+ const refmvs_block *const cand_b = &b_left[y][bx4 - 1];
+ add_compound_extended_candidate(same, same_count, cand_b,
+ sign0, sign1, ref, rf->sign_bias);
+ y += dav1d_block_dimensions[cand_b->bs][1];
+ }
+
+ refmvs_candidate *const diff = &same[2];
+ const int *const diff_count = &same_count[2];
+
+ // merge together
+ for (int n = 0; n < 2; n++) {
+ int m = same_count[n];
+
+ if (m >= 2) continue;
+
+ const int l = diff_count[n];
+ if (l) {
+ same[m].mv.mv[n] = diff[0].mv.mv[n];
+ if (++m == 2) continue;
+ if (l == 2) {
+ same[1].mv.mv[n] = diff[1].mv.mv[n];
+ continue;
+ }
+ }
+ do {
+ same[m].mv.mv[n] = tgmv[n];
+ } while (++m < 2);
+ }
+
+ // if the first extended was the same as the non-extended one,
+ // then replace it with the second extended one
+ int n = *cnt;
+ if (n == 1 && mvstack[0].mv.n == same[0].mv.n)
+ mvstack[1].mv = mvstack[2].mv;
+ do {
+ mvstack[n].weight = 2;
+ } while (++n < 2);
+ *cnt = 2;
+ }
+
+ // clamping
+ const int left = -(bx4 + bw4 + 4) * 4 * 8;
+ const int right = (rf->iw4 - bx4 + 4) * 4 * 8;
+ const int top = -(by4 + bh4 + 4) * 4 * 8;
+ const int bottom = (rf->ih4 - by4 + 4) * 4 * 8;
+
+ const int n_refmvs = *cnt;
+ int n = 0;
+ do {
+ mvstack[n].mv.mv[0].x = iclip(mvstack[n].mv.mv[0].x, left, right);
+ mvstack[n].mv.mv[0].y = iclip(mvstack[n].mv.mv[0].y, top, bottom);
+ mvstack[n].mv.mv[1].x = iclip(mvstack[n].mv.mv[1].x, left, right);
+ mvstack[n].mv.mv[1].y = iclip(mvstack[n].mv.mv[1].y, top, bottom);
+ } while (++n < n_refmvs);
+
+ switch (refmv_ctx >> 1) {
+ case 0:
+ *ctx = imin(newmv_ctx, 1);
+ break;
+ case 1:
+ *ctx = 1 + imin(newmv_ctx, 3);
+ break;
+ case 2:
+ *ctx = iclip(3 + newmv_ctx, 4, 7);
+ break;
+ }
+
+ return;
+ } else if (*cnt < 2 && ref.ref[0] > 0) {
+ const int sign = rf->sign_bias[ref.ref[0] - 1];
+ const int sz4 = imin(w4, h4);
+
+ // non-self references in top
+ if (n_rows != ~0U) for (int x = 0; x < sz4 && *cnt < 2;) {
+ const refmvs_block *const cand_b = &b_top[x];
+ add_single_extended_candidate(mvstack, cnt, cand_b, sign, rf->sign_bias);
+ x += dav1d_block_dimensions[cand_b->bs][0];
+ }
+
+ // non-self references in left
+ if (n_cols != ~0U) for (int y = 0; y < sz4 && *cnt < 2;) {
+ const refmvs_block *const cand_b = &b_left[y][bx4 - 1];
+ add_single_extended_candidate(mvstack, cnt, cand_b, sign, rf->sign_bias);
+ y += dav1d_block_dimensions[cand_b->bs][1];
+ }
+ }
+ assert(*cnt <= 8);
+
+ // clamping
+ int n_refmvs = *cnt;
+ if (n_refmvs) {
+ const int left = -(bx4 + bw4 + 4) * 4 * 8;
+ const int right = (rf->iw4 - bx4 + 4) * 4 * 8;
+ const int top = -(by4 + bh4 + 4) * 4 * 8;
+ const int bottom = (rf->ih4 - by4 + 4) * 4 * 8;
+
+ int n = 0;
+ do {
+ mvstack[n].mv.mv[0].x = iclip(mvstack[n].mv.mv[0].x, left, right);
+ mvstack[n].mv.mv[0].y = iclip(mvstack[n].mv.mv[0].y, top, bottom);
+ } while (++n < n_refmvs);
+ }
+
+ for (int n = *cnt; n < 2; n++)
+ mvstack[n].mv.mv[0] = tgmv[0];
+
+ *ctx = (refmv_ctx << 4) | (globalmv_ctx << 3) | newmv_ctx;
+}
+
+void dav1d_refmvs_tile_sbrow_init(refmvs_tile *const rt, const refmvs_frame *const rf,
+ const int tile_col_start4, const int tile_col_end4,
+ const int tile_row_start4, const int tile_row_end4,
+ const int sby, int tile_row_idx, const int pass)
+{
+ if (rf->n_tile_threads == 1) tile_row_idx = 0;
+ rt->rp_proj = &rf->rp_proj[16 * rf->rp_stride * tile_row_idx];
+ const int uses_2pass = rf->n_tile_threads > 1 && rf->n_frame_threads > 1;
+ const ptrdiff_t pass_off = (uses_2pass && pass == 2) ?
+ 35 * rf->r_stride * rf->n_tile_rows : 0;
+ refmvs_block *r = &rf->r[35 * rf->r_stride * tile_row_idx + pass_off];
+ const int sbsz = rf->sbsz;
+ const int off = (sbsz * sby) & 16;
+ for (int i = 0; i < sbsz; i++, r += rf->r_stride)
+ rt->r[off + 5 + i] = r;
+ rt->r[off + 0] = r;
+ r += rf->r_stride;
+ rt->r[off + 1] = NULL;
+ rt->r[off + 2] = r;
+ r += rf->r_stride;
+ rt->r[off + 3] = NULL;
+ rt->r[off + 4] = r;
+ if (sby & 1) {
+#define EXCHANGE(a, b) do { void *const tmp = a; a = b; b = tmp; } while (0)
+ EXCHANGE(rt->r[off + 0], rt->r[off + sbsz + 0]);
+ EXCHANGE(rt->r[off + 2], rt->r[off + sbsz + 2]);
+ EXCHANGE(rt->r[off + 4], rt->r[off + sbsz + 4]);
+#undef EXCHANGE
+ }
+
+ rt->rf = rf;
+ rt->tile_row.start = tile_row_start4;
+ rt->tile_row.end = imin(tile_row_end4, rf->ih4);
+ rt->tile_col.start = tile_col_start4;
+ rt->tile_col.end = imin(tile_col_end4, rf->iw4);
+}
+
+static void load_tmvs_c(const refmvs_frame *const rf, int tile_row_idx,
+ const int col_start8, const int col_end8,
+ const int row_start8, int row_end8)
+{
+ if (rf->n_tile_threads == 1) tile_row_idx = 0;
+ assert(row_start8 >= 0);
+ assert((unsigned) (row_end8 - row_start8) <= 16U);
+ row_end8 = imin(row_end8, rf->ih8);
+ const int col_start8i = imax(col_start8 - 8, 0);
+ const int col_end8i = imin(col_end8 + 8, rf->iw8);
+
+ const ptrdiff_t stride = rf->rp_stride;
+ refmvs_temporal_block *rp_proj =
+ &rf->rp_proj[16 * stride * tile_row_idx + (row_start8 & 15) * stride];
+ for (int y = row_start8; y < row_end8; y++) {
+ for (int x = col_start8; x < col_end8; x++)
+ rp_proj[x].mv.n = INVALID_MV;
+ rp_proj += stride;
+ }
+
+ rp_proj = &rf->rp_proj[16 * stride * tile_row_idx];
+ for (int n = 0; n < rf->n_mfmvs; n++) {
+ const int ref2cur = rf->mfmv_ref2cur[n];
+ if (ref2cur == INT_MIN) continue;
+
+ const int ref = rf->mfmv_ref[n];
+ const int ref_sign = ref - 4;
+ const refmvs_temporal_block *r = &rf->rp_ref[ref][row_start8 * stride];
+ for (int y = row_start8; y < row_end8; y++) {
+ const int y_sb_align = y & ~7;
+ const int y_proj_start = imax(y_sb_align, row_start8);
+ const int y_proj_end = imin(y_sb_align + 8, row_end8);
+ for (int x = col_start8i; x < col_end8i; x++) {
+ const refmvs_temporal_block *rb = &r[x];
+ const int b_ref = rb->ref;
+ if (!b_ref) continue;
+ const int ref2ref = rf->mfmv_ref2ref[n][b_ref - 1];
+ if (!ref2ref) continue;
+ const mv b_mv = rb->mv;
+ const mv offset = mv_projection(b_mv, ref2cur, ref2ref);
+ int pos_x = x + apply_sign(abs(offset.x) >> 6,
+ offset.x ^ ref_sign);
+ const int pos_y = y + apply_sign(abs(offset.y) >> 6,
+ offset.y ^ ref_sign);
+ if (pos_y >= y_proj_start && pos_y < y_proj_end) {
+ const ptrdiff_t pos = (pos_y & 15) * stride;
+ for (;;) {
+ const int x_sb_align = x & ~7;
+ if (pos_x >= imax(x_sb_align - 8, col_start8) &&
+ pos_x < imin(x_sb_align + 16, col_end8))
+ {
+ rp_proj[pos + pos_x].mv = rb->mv;
+ rp_proj[pos + pos_x].ref = ref2ref;
+ }
+ if (++x >= col_end8i) break;
+ rb++;
+ if (rb->ref != b_ref || rb->mv.n != b_mv.n) break;
+ pos_x++;
+ }
+ } else {
+ for (;;) {
+ if (++x >= col_end8i) break;
+ rb++;
+ if (rb->ref != b_ref || rb->mv.n != b_mv.n) break;
+ }
+ }
+ x--;
+ }
+ r += stride;
+ }
+ }
+}
+
+static void save_tmvs_c(refmvs_temporal_block *rp, const ptrdiff_t stride,
+ refmvs_block *const *const rr,
+ const uint8_t *const ref_sign,
+ const int col_end8, const int row_end8,
+ const int col_start8, const int row_start8)
+{
+ for (int y = row_start8; y < row_end8; y++) {
+ const refmvs_block *const b = rr[(y & 15) * 2];
+
+ for (int x = col_start8; x < col_end8;) {
+ const refmvs_block *const cand_b = &b[x * 2 + 1];
+ const int bw8 = (dav1d_block_dimensions[cand_b->bs][0] + 1) >> 1;
+
+ if (cand_b->ref.ref[1] > 0 && ref_sign[cand_b->ref.ref[1] - 1] &&
+ (abs(cand_b->mv.mv[1].y) | abs(cand_b->mv.mv[1].x)) < 4096)
+ {
+ for (int n = 0; n < bw8; n++, x++)
+ rp[x] = (refmvs_temporal_block) { .mv = cand_b->mv.mv[1],
+ .ref = cand_b->ref.ref[1] };
+ } else if (cand_b->ref.ref[0] > 0 && ref_sign[cand_b->ref.ref[0] - 1] &&
+ (abs(cand_b->mv.mv[0].y) | abs(cand_b->mv.mv[0].x)) < 4096)
+ {
+ for (int n = 0; n < bw8; n++, x++)
+ rp[x] = (refmvs_temporal_block) { .mv = cand_b->mv.mv[0],
+ .ref = cand_b->ref.ref[0] };
+ } else {
+ for (int n = 0; n < bw8; n++, x++) {
+ rp[x].mv.n = 0;
+ rp[x].ref = 0; // "invalid"
+ }
+ }
+ }
+ rp += stride;
+ }
+}
+
+int dav1d_refmvs_init_frame(refmvs_frame *const rf,
+ const Dav1dSequenceHeader *const seq_hdr,
+ const Dav1dFrameHeader *const frm_hdr,
+ const unsigned ref_poc[7],
+ refmvs_temporal_block *const rp,
+ const unsigned ref_ref_poc[7][7],
+ /*const*/ refmvs_temporal_block *const rp_ref[7],
+ const int n_tile_threads, const int n_frame_threads)
+{
+ rf->sbsz = 16 << seq_hdr->sb128;
+ rf->frm_hdr = frm_hdr;
+ rf->iw8 = (frm_hdr->width[0] + 7) >> 3;
+ rf->ih8 = (frm_hdr->height + 7) >> 3;
+ rf->iw4 = rf->iw8 << 1;
+ rf->ih4 = rf->ih8 << 1;
+
+ const ptrdiff_t r_stride = ((frm_hdr->width[0] + 127) & ~127) >> 2;
+ const int n_tile_rows = n_tile_threads > 1 ? frm_hdr->tiling.rows : 1;
+ if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) {
+ if (rf->r) dav1d_freep_aligned(&rf->r);
+ const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1;
+ rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64);
+ if (!rf->r) return DAV1D_ERR(ENOMEM);
+ rf->r_stride = r_stride;
+ }
+
+ const ptrdiff_t rp_stride = r_stride >> 1;
+ if (rp_stride != rf->rp_stride || n_tile_rows != rf->n_tile_rows) {
+ if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj);
+ rf->rp_proj = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64);
+ if (!rf->rp_proj) return DAV1D_ERR(ENOMEM);
+ rf->rp_stride = rp_stride;
+ }
+ rf->n_tile_rows = n_tile_rows;
+ rf->n_tile_threads = n_tile_threads;
+ rf->n_frame_threads = n_frame_threads;
+ rf->rp = rp;
+ rf->rp_ref = rp_ref;
+ const unsigned poc = frm_hdr->frame_offset;
+ for (int i = 0; i < 7; i++) {
+ const int poc_diff = get_poc_diff(seq_hdr->order_hint_n_bits,
+ ref_poc[i], poc);
+ rf->sign_bias[i] = poc_diff > 0;
+ rf->mfmv_sign[i] = poc_diff < 0;
+ rf->pocdiff[i] = iclip(get_poc_diff(seq_hdr->order_hint_n_bits,
+ poc, ref_poc[i]), -31, 31);
+ }
+
+ // temporal MV setup
+ rf->n_mfmvs = 0;
+ if (frm_hdr->use_ref_frame_mvs && seq_hdr->order_hint_n_bits) {
+ int total = 2;
+ if (rp_ref[0] && ref_ref_poc[0][6] != ref_poc[3] /* alt-of-last != gold */) {
+ rf->mfmv_ref[rf->n_mfmvs++] = 0; // last
+ total = 3;
+ }
+ if (rp_ref[4] && get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[4],
+ frm_hdr->frame_offset) > 0)
+ {
+ rf->mfmv_ref[rf->n_mfmvs++] = 4; // bwd
+ }
+ if (rp_ref[5] && get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[5],
+ frm_hdr->frame_offset) > 0)
+ {
+ rf->mfmv_ref[rf->n_mfmvs++] = 5; // altref2
+ }
+ if (rf->n_mfmvs < total && rp_ref[6] &&
+ get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[6],
+ frm_hdr->frame_offset) > 0)
+ {
+ rf->mfmv_ref[rf->n_mfmvs++] = 6; // altref
+ }
+ if (rf->n_mfmvs < total && rp_ref[1])
+ rf->mfmv_ref[rf->n_mfmvs++] = 1; // last2
+
+ for (int n = 0; n < rf->n_mfmvs; n++) {
+ const unsigned rpoc = ref_poc[rf->mfmv_ref[n]];
+ const int diff1 = get_poc_diff(seq_hdr->order_hint_n_bits,
+ rpoc, frm_hdr->frame_offset);
+ if (abs(diff1) > 31) {
+ rf->mfmv_ref2cur[n] = INT_MIN;
+ } else {
+ rf->mfmv_ref2cur[n] = rf->mfmv_ref[n] < 4 ? -diff1 : diff1;
+ for (int m = 0; m < 7; m++) {
+ const unsigned rrpoc = ref_ref_poc[rf->mfmv_ref[n]][m];
+ const int diff2 = get_poc_diff(seq_hdr->order_hint_n_bits,
+ rpoc, rrpoc);
+ // unsigned comparison also catches the < 0 case
+ rf->mfmv_ref2ref[n][m] = (unsigned) diff2 > 31U ? 0 : diff2;
+ }
+ }
+ }
+ }
+ rf->use_ref_frame_mvs = rf->n_mfmvs > 0;
+
+ return 0;
+}
+
+void dav1d_refmvs_init(refmvs_frame *const rf) {
+ rf->r = NULL;
+ rf->r_stride = 0;
+ rf->rp_proj = NULL;
+ rf->rp_stride = 0;
+}
+
+void dav1d_refmvs_clear(refmvs_frame *const rf) {
+ if (rf->r) dav1d_freep_aligned(&rf->r);
+ if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj);
+}
+
+static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
+ const int bx4, const int bw4, int bh4)
+{
+ do {
+ refmvs_block *const r = *rr++ + bx4;
+ for (int x = 0; x < bw4; x++)
+ r[x] = *rmv;
+ } while (--bh4);
+}
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/refmvs.h"
+#elif ARCH_LOONGARCH64
+#include "src/loongarch/refmvs.h"
+#elif ARCH_X86
+#include "src/x86/refmvs.h"
+#endif
+#endif
+
+COLD void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *const c)
+{
+ c->load_tmvs = load_tmvs_c;
+ c->save_tmvs = save_tmvs_c;
+ c->splat_mv = splat_mv_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ refmvs_dsp_init_arm(c);
+#elif ARCH_LOONGARCH64
+ refmvs_dsp_init_loongarch(c);
+#elif ARCH_X86
+ refmvs_dsp_init_x86(c);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/refmvs.h b/third_party/dav1d/src/refmvs.h
new file mode 100644
index 0000000000..d63874d3cb
--- /dev/null
+++ b/third_party/dav1d/src/refmvs.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * Copyright © 2020, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_REF_MVS_H
+#define DAV1D_SRC_REF_MVS_H
+
+#include <stdint.h>
+
+#include "dav1d/headers.h"
+
+#include "common/intops.h"
+
+#include "src/intra_edge.h"
+#include "src/tables.h"
+
+#define INVALID_MV 0x80008000
+
+PACKED(typedef struct refmvs_temporal_block {
+ mv mv;
+ int8_t ref;
+}) refmvs_temporal_block;
+
+typedef union refmvs_refpair {
+    int8_t ref[2]; // ref[0] == 0 means intra, ref[1] == -1 means not compound
+ uint16_t pair;
+} refmvs_refpair;
+
+typedef union refmvs_mvpair {
+ mv mv[2];
+ uint64_t n;
+} refmvs_mvpair;
+
+PACKED(typedef struct refmvs_block {
+ refmvs_mvpair mv;
+ refmvs_refpair ref;
+    uint8_t bs, mf; // mf: 1 = globalmv+affine, 2 = newmv
+}) ALIGN(refmvs_block, 4);
+
+typedef struct refmvs_frame {
+ const Dav1dFrameHeader *frm_hdr;
+ int iw4, ih4, iw8, ih8;
+ int sbsz;
+ int use_ref_frame_mvs;
+ uint8_t sign_bias[7], mfmv_sign[7];
+ int8_t pocdiff[7];
+ uint8_t mfmv_ref[3];
+ int mfmv_ref2cur[3];
+ int mfmv_ref2ref[3][7];
+ int n_mfmvs;
+
+ refmvs_temporal_block *rp;
+ /*const*/ refmvs_temporal_block *const *rp_ref;
+ refmvs_temporal_block *rp_proj;
+ ptrdiff_t rp_stride;
+
+ refmvs_block *r; // 35 x r_stride memory
+ ptrdiff_t r_stride;
+ int n_tile_rows, n_tile_threads, n_frame_threads;
+} refmvs_frame;
+
+typedef struct refmvs_tile {
+ const refmvs_frame *rf;
+ refmvs_block *r[32 + 5];
+ refmvs_temporal_block *rp_proj;
+ struct {
+ int start, end;
+ } tile_col, tile_row;
+} refmvs_tile;
+
+typedef struct refmvs_candidate {
+ refmvs_mvpair mv;
+ int weight;
+} refmvs_candidate;
+
+// initialize temporal MVs; this can be done in any configuration, e.g. one
+// tile/sbrow at a time, where col_{start,end}8 are the tile boundaries; or
+// it can just be for the whole frame's sbrow, where col_{start,end}8 are the
+// frame boundaries. row_{start,end}8 are the superblock row boundaries.
+#define decl_load_tmvs_fn(name) \
+void (name)(const refmvs_frame *rf, int tile_row_idx, \
+ int col_start8, int col_end8, int row_start8, int row_end8)
+typedef decl_load_tmvs_fn(*load_tmvs_fn);
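+
+// An illustrative sketch of a load_tmvs() call (not upstream code; dsp, rf,
+// sby and tile_row_idx are hypothetical caller state), assuming 128x128
+// superblocks so that one sbrow spans 16 rows of 8x8 units:
+//
+//     const int row_start8 = sby * 16;
+//     dsp->load_tmvs(rf, tile_row_idx, 0, rf->iw8,
+//                    row_start8, imin(row_start8 + 16, rf->ih8));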
+
+#define decl_save_tmvs_fn(name) \
+void (name)(refmvs_temporal_block *rp, const ptrdiff_t stride, \
+ refmvs_block *const *const rr, const uint8_t *const ref_sign, \
+ int col_end8, int row_end8, int col_start8, int row_start8)
+typedef decl_save_tmvs_fn(*save_tmvs_fn);
+
+#define decl_splat_mv_fn(name) \
+void (name)(refmvs_block **rr, const refmvs_block *rmv, int bx4, int bw4, int bh4)
+typedef decl_splat_mv_fn(*splat_mv_fn);
+
+typedef struct Dav1dRefmvsDSPContext {
+ load_tmvs_fn load_tmvs;
+ save_tmvs_fn save_tmvs;
+ splat_mv_fn splat_mv;
+} Dav1dRefmvsDSPContext;
+
+// call once per frame thread
+void dav1d_refmvs_init(refmvs_frame *rf);
+void dav1d_refmvs_clear(refmvs_frame *rf);
+
+// call once per frame
+int dav1d_refmvs_init_frame(refmvs_frame *rf,
+ const Dav1dSequenceHeader *seq_hdr,
+ const Dav1dFrameHeader *frm_hdr,
+ const unsigned ref_poc[7],
+ refmvs_temporal_block *rp,
+ const unsigned ref_ref_poc[7][7],
+ /*const*/ refmvs_temporal_block *const rp_ref[7],
+ int n_tile_threads, int n_frame_threads);
+
+// cache the current tile/sbrow (or frame/sbrow)'s projectable motion vectors
+// into buffers for use in future frames' temporal MV prediction
+static inline void dav1d_refmvs_save_tmvs(const Dav1dRefmvsDSPContext *const dsp,
+ const refmvs_tile *const rt,
+ const int col_start8, int col_end8,
+ const int row_start8, int row_end8)
+{
+ const refmvs_frame *const rf = rt->rf;
+
+ assert(row_start8 >= 0);
+ assert((unsigned) (row_end8 - row_start8) <= 16U);
+ row_end8 = imin(row_end8, rf->ih8);
+ col_end8 = imin(col_end8, rf->iw8);
+
+ const ptrdiff_t stride = rf->rp_stride;
+ const uint8_t *const ref_sign = rf->mfmv_sign;
+ refmvs_temporal_block *rp = &rf->rp[row_start8 * stride];
+
+ dsp->save_tmvs(rp, stride, rt->r + 6, ref_sign,
+ col_end8, row_end8, col_start8, row_start8);
+}
+
+// initialize tile boundaries and refmvs_block pointers for one tile/sbrow
+void dav1d_refmvs_tile_sbrow_init(refmvs_tile *rt, const refmvs_frame *rf,
+ int tile_col_start4, int tile_col_end4,
+ int tile_row_start4, int tile_row_end4,
+ int sby, int tile_row_idx, int pass);
+
+// call for each block
+void dav1d_refmvs_find(const refmvs_tile *rt,
+ refmvs_candidate mvstack[8], int *cnt,
+ int *ctx, const refmvs_refpair ref, enum BlockSize bs,
+ enum EdgeFlags edge_flags, int by4, int bx4);
+
+void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *dsp);
+void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *dsp);
+void dav1d_refmvs_dsp_init_loongarch(Dav1dRefmvsDSPContext *dsp);
+void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *dsp);
+
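+// Typical call order, as a sketch pieced together from the comments above
+// (not a normative sequence): dav1d_refmvs_init() once per frame thread,
+// dav1d_refmvs_init_frame() once per frame, then per tile/sbrow
+// dav1d_refmvs_tile_sbrow_init() and the DSP load_tmvs(), per block
+// dav1d_refmvs_find(), dav1d_refmvs_save_tmvs() once the sbrow is decoded,
+// and dav1d_refmvs_clear() when the frame thread is torn down.
+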
+#endif /* DAV1D_SRC_REF_MVS_H */
diff --git a/third_party/dav1d/src/riscv/64/itx.S b/third_party/dav1d/src/riscv/64/itx.S
new file mode 100644
index 0000000000..f7d907eedf
--- /dev/null
+++ b/third_party/dav1d/src/riscv/64/itx.S
@@ -0,0 +1,662 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2023, Nathan Egge
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/riscv/asm.S"
+
+function inv_txfm_add_4x4_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vle16.v v0, (a2)
+ addi t0, a2, 8
+ vle16.v v1, (t0)
+ addi t0, t0, 8
+ vle16.v v2, (t0)
+ addi t0, t0, 8
+ vle16.v v3, (t0)
+
+ jalr t0, a4
+
+ vmv.v.x v4, zero
+
+ vsseg4e16.v v0, (a2)
+ vle16.v v0, (a2)
+ vse16.v v4, (a2)
+ addi t0, a2, 8
+ vle16.v v1, (t0)
+ vse16.v v4, (t0)
+ addi t0, t0, 8
+ vle16.v v2, (t0)
+ vse16.v v4, (t0)
+ addi t0, t0, 8
+ vle16.v v3, (t0)
+ vse16.v v4, (t0)
+
+ jalr t0, a5
+
+ vssra.vi v0, v0, 4
+ vssra.vi v1, v1, 4
+ vssra.vi v2, v2, 4
+ vssra.vi v3, v3, 4
+
+itx_4x4_end:
+ vsetvli zero, zero, e8, mf4, ta, ma
+ vle8.v v4, (a0)
+ add t0, a0, a1
+ vle8.v v5, (t0)
+ add t0, t0, a1
+ vle8.v v6, (t0)
+ add t0, t0, a1
+ vle8.v v7, (t0)
+
+ vwaddu.wv v0, v0, v4
+ vwaddu.wv v1, v1, v5
+ vwaddu.wv v2, v2, v6
+ vwaddu.wv v3, v3, v7
+
+ vsetvli zero, zero, e16, mf2, ta, ma
+ vmax.vx v0, v0, zero
+ vmax.vx v1, v1, zero
+ vmax.vx v2, v2, zero
+ vmax.vx v3, v3, zero
+
+ vsetvli zero, zero, e8, mf4, ta, ma
+
+ vnclipu.wi v4, v0, 0
+ vnclipu.wi v5, v1, 0
+ vnclipu.wi v6, v2, 0
+ vnclipu.wi v7, v3, 0
+
+ vse8.v v4, (a0)
+ add a0, a0, a1
+ vse8.v v5, (a0)
+ add a0, a0, a1
+ vse8.v v6, (a0)
+ add a0, a0, a1
+ vse8.v v7, (a0)
+
+ ret
+endfunc
+
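+// 4-point identity transform: scales each coefficient by sqrt(2) ~= 5793/4096.
+// vsmul.vx on e16 elements computes (a*b) >> 15 with vxrm rounding, so the
+// constant (5793-4096)*8 gives x*(5793-4096)/4096, and the following vsadd.vv
+// adds x back in for a total of x*5793/4096.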
+function inv_identity_e16_x4_rvv, export=1, ext=v
+ li t1, (5793-4096)*8
+ vsmul.vx v4, v0, t1
+ vsmul.vx v5, v1, t1
+ vsmul.vx v6, v2, t1
+ vsmul.vx v7, v3, t1
+
+ vsadd.vv v0, v0, v4
+ vsadd.vv v1, v1, v5
+ vsadd.vv v2, v2, v6
+ vsadd.vv v3, v3, v7
+
+ jr t0
+endfunc
+
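+// 4-point inverse DCT butterfly. The constants are AV1's Q12 trig values:
+// 2896 ~= 4096/sqrt(2), 1567 ~= 4096*cos(3*pi/8) and 3784 ~= 4096*sin(3*pi/8);
+// each product is rounded back to 16 bits by adding 2048 and shifting right 12.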
+.macro idct_4 o0, o1, o2, o3
+ li t1, 2896
+ li t2, 1567
+ li t3, 3784
+
+ vwmul.vx v8, \o0, t1
+ vwmul.vx v10, \o0, t1
+ vwmacc.vx v8, t1, \o2
+ neg t1, t1
+ vwmacc.vx v10, t1, \o2
+
+ vwmul.vx v12, \o1, t3
+ neg t3, t3
+ vwmul.vx v14, \o1, t2
+ vwmacc.vx v12, t2, \o3
+ vwmacc.vx v14, t3, \o3
+
+ li t1, 2048
+
+ vwadd.wx v8, v8, t1
+ vwadd.wx v10, v10, t1
+ vwadd.wx v12, v12, t1
+ vwadd.wx v14, v14, t1
+
+ vnsra.wi v8, v8, 12
+ vnsra.wi v10, v10, 12
+ vnsra.wi v12, v12, 12
+ vnsra.wi v14, v14, 12
+
+ vsadd.vv \o0, v8, v12
+ vsadd.vv \o1, v10, v14
+ vssub.vv \o2, v10, v14
+ vssub.vv \o3, v8, v12
+.endm
+
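+// 4-point inverse ADST using AV1's Q12 "sinpi" constants
+// (1321, 2482, 3344 and 3803), with the same add-2048 >> 12 rounding.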
+.macro iadst_4 o0, o1, o2, o3
+ li t1, 1321
+ li t2, 3803
+ li t3, 2482
+
+ vwmul.vx v4, v0, t1
+ vwmul.vx v5, v0, t3
+ neg t1, t1
+ vwmacc.vx v4, t2, v2
+ vwmacc.vx v5, t1, v2
+ neg t2, t2
+ vwmacc.vx v4, t3, v3
+ vwmacc.vx v5, t2, v3
+
+ vwsub.vv v6, v0, v2
+ vwadd.wv v6, v6, v3
+
+ li t1, 3344
+ vwmul.vx v7, v1, t1
+
+ vsetvli zero, zero, e32, m1, ta, ma
+
+ vmul.vx v6, v6, t1
+
+ vadd.vv v8, v4, v5
+ vadd.vv v4, v4, v7
+ vadd.vv v5, v5, v7
+ vsub.vv v7, v8, v7
+
+ li t1, 2048
+
+ vadd.vx v4, v4, t1
+ vadd.vx v5, v5, t1
+ vadd.vx v6, v6, t1
+ vadd.vx v7, v7, t1
+
+ vsetvli zero, zero, e16, mf2, ta, ma
+
+ vnsra.wi \o0, v4, 12
+ vnsra.wi \o1, v5, 12
+ vnsra.wi \o2, v6, 12
+ vnsra.wi \o3, v7, 12
+.endm
+
+function inv_dct_e16_x4_rvv, export=1, ext=v
+ idct_4 v0, v1, v2, v3
+ jr t0
+endfunc
+
+function inv_adst_e16_x4_rvv, export=1, ext=v
+ iadst_4 v0, v1, v2, v3
+ jr t0
+endfunc
+
+function inv_flipadst_e16_x4_rvv, export=1, ext=v
+ iadst_4 v3, v2, v1, v0
+ jr t0
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v
+.ifc \txfm1\()_\txfm2, dct_dct
+ beqz a3, 1f
+.endif
+ la a4, inv_\txfm1\()_e16_x4_rvv
+ la a5, inv_\txfm2\()_e16_x4_rvv
+ j inv_txfm_add_4x4_rvv
+.ifc \txfm1\()_\txfm2, dct_dct
+1:
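+    // DC-only fast path (eob == 0): broadcast the DC coefficient, scale it by
+    // 2896/4096 (~1/sqrt(2)) once per 1-D pass via vsmul with 2896*8, clear the
+    // stored coefficient, apply the final rounding shift and fill all four rows.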
+ csrw vxrm, zero
+ vsetivli zero, 4, e16, mf2, ta, ma
+ ld t2, (a2)
+ li t1, 2896*8
+ vmv.v.x v0, t2
+ vsmul.vx v0, v0, t1
+ sd x0, (a2)
+ vsmul.vx v0, v0, t1
+ vssra.vi v0, v0, 4
+ vmv.v.v v1, v0
+ vmv.v.v v2, v0
+ vmv.v.v v3, v0
+ j itx_4x4_end
+.endif
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
+.macro def_fn_8x8_base variant
+function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 8, e16, m1, ta, ma
+ vle16.v v0, (a2)
+ addi t0, a2, 16
+ vle16.v v1, (t0)
+ addi t0, t0, 16
+ vle16.v v2, (t0)
+ addi t0, t0, 16
+ vle16.v v3, (t0)
+ addi t0, t0, 16
+ vle16.v v4, (t0)
+ addi t0, t0, 16
+ vle16.v v5, (t0)
+ addi t0, t0, 16
+ vle16.v v6, (t0)
+ addi t0, t0, 16
+ vle16.v v7, (t0)
+
+.ifc \variant, identity_
+ // The identity vsadd.vv and downshift vssra.vi 1 cancel out
+.else
+ jalr t0, a4
+
+ vssra.vi v0, v0, 1
+ vssra.vi v1, v1, 1
+ vssra.vi v2, v2, 1
+ vssra.vi v3, v3, 1
+ vssra.vi v4, v4, 1
+ vssra.vi v5, v5, 1
+ vssra.vi v6, v6, 1
+ vssra.vi v7, v7, 1
+.endif
+
+ vsseg8e16.v v0, (a2)
+ vle16.v v0, (a2)
+ addi t0, a2, 16
+ vle16.v v1, (t0)
+ addi t0, t0, 16
+ vle16.v v2, (t0)
+ addi t0, t0, 16
+ vle16.v v3, (t0)
+ addi t0, t0, 16
+ vle16.v v4, (t0)
+ addi t0, t0, 16
+ vle16.v v5, (t0)
+ addi t0, t0, 16
+ vle16.v v6, (t0)
+ addi t0, t0, 16
+ vle16.v v7, (t0)
+
+ jalr t0, a5
+
+ vssra.vi v0, v0, 4
+ vssra.vi v1, v1, 4
+ vssra.vi v2, v2, 4
+ vssra.vi v3, v3, 4
+ vssra.vi v4, v4, 4
+ vssra.vi v5, v5, 4
+ vssra.vi v6, v6, 4
+ vssra.vi v7, v7, 4
+
+ li t1, 64
+ vsetvli zero, t1, e16, m8, ta, ma
+ vmv.v.x v8, zero
+ vse16.v v8, (a2)
+
+.ifc \variant, identity_
+itx_8x8_end:
+.endif
+ vsetivli zero, 8, e8, mf2, ta, ma
+ vle8.v v8, (a0)
+ add t0, a0, a1
+ vle8.v v9, (t0)
+ add t0, t0, a1
+ vle8.v v10, (t0)
+ add t0, t0, a1
+ vle8.v v11, (t0)
+ add t0, t0, a1
+ vle8.v v12, (t0)
+ add t0, t0, a1
+ vle8.v v13, (t0)
+ add t0, t0, a1
+ vle8.v v14, (t0)
+ add t0, t0, a1
+ vle8.v v15, (t0)
+
+ vwaddu.wv v0, v0, v8
+ vwaddu.wv v1, v1, v9
+ vwaddu.wv v2, v2, v10
+ vwaddu.wv v3, v3, v11
+ vwaddu.wv v4, v4, v12
+ vwaddu.wv v5, v5, v13
+ vwaddu.wv v6, v6, v14
+ vwaddu.wv v7, v7, v15
+
+ vsetvli zero, zero, e16, m1
+ vmax.vx v0, v0, zero
+ vmax.vx v1, v1, zero
+ vmax.vx v2, v2, zero
+ vmax.vx v3, v3, zero
+ vmax.vx v4, v4, zero
+ vmax.vx v5, v5, zero
+ vmax.vx v6, v6, zero
+ vmax.vx v7, v7, zero
+
+ vsetvli zero, zero, e8, mf2, ta, ma
+
+ vnclipu.wi v8, v0, 0
+ vnclipu.wi v9, v1, 0
+ vnclipu.wi v10, v2, 0
+ vnclipu.wi v11, v3, 0
+ vnclipu.wi v12, v4, 0
+ vnclipu.wi v13, v5, 0
+ vnclipu.wi v14, v6, 0
+ vnclipu.wi v15, v7, 0
+
+ vse8.v v8, (a0)
+ add a0, a0, a1
+ vse8.v v9, (a0)
+ add a0, a0, a1
+ vse8.v v10, (a0)
+ add a0, a0, a1
+ vse8.v v11, (a0)
+ add a0, a0, a1
+ vse8.v v12, (a0)
+ add a0, a0, a1
+ vse8.v v13, (a0)
+ add a0, a0, a1
+ vse8.v v14, (a0)
+ add a0, a0, a1
+ vse8.v v15, (a0)
+
+ ret
+endfunc
+.endm
+
+def_fn_8x8_base
+def_fn_8x8_base identity_
+
+function inv_identity_e16_x8_rvv, export=1, ext=v
+ vsadd.vv v0, v0, v0
+ vsadd.vv v1, v1, v1
+ vsadd.vv v2, v2, v2
+ vsadd.vv v3, v3, v3
+ vsadd.vv v4, v4, v4
+ vsadd.vv v5, v5, v5
+ vsadd.vv v6, v6, v6
+ vsadd.vv v7, v7, v7
+
+ jr t0
+endfunc
+
+function inv_dct_e16_x8_rvv, export=1, ext=v
+ idct_4 v0, v2, v4, v6
+
+ li t1, 799
+ li t2, 4017
+ li t3, 3406
+ li t4, 2276
+
+ vwmul.vx v14, v1, t2
+ neg t2, t2
+ vwmul.vx v8, v1, t1
+ vwmacc.vx v14, t1, v7
+ vwmacc.vx v8, t2, v7
+
+ vwmul.vx v12, v5, t4
+ neg t4, t4
+ vwmul.vx v10, v5, t3
+ vwmacc.vx v12, t3, v3
+ vwmacc.vx v10, t4, v3
+
+ li t1, 2048
+
+ vwadd.wx v8, v8, t1
+ vwadd.wx v10, v10, t1
+ vwadd.wx v12, v12, t1
+ vwadd.wx v14, v14, t1
+
+ vnsra.wi v8, v8, 12
+ vnsra.wi v10, v10, 12
+ vnsra.wi v12, v12, 12
+ vnsra.wi v14, v14, 12
+
+ vssub.vv v7, v14, v12
+ vsadd.vv v14, v14, v12
+ vssub.vv v1, v8, v10
+ vsadd.vv v8, v8, v10
+
+ li t2, 2896
+
+ vwmul.vx v10, v7, t2
+ vwmul.vx v12, v7, t2
+ vwmacc.vx v12, t2, v1
+ neg t2, t2
+ vwmacc.vx v10, t2, v1
+
+ vwadd.wx v10, v10, t1
+ vwadd.wx v12, v12, t1
+
+ vnsra.wi v10, v10, 12
+ vnsra.wi v12, v12, 12
+
+ vssub.vv v7, v0, v14
+ vsadd.vv v0, v0, v14
+ vssub.vv v9, v2, v12
+ vsadd.vv v1, v2, v12
+ vssub.vv v5, v4, v10
+ vsadd.vv v2, v4, v10
+ vssub.vv v4, v6, v8
+ vsadd.vv v3, v6, v8
+ vmv.v.v v6, v9
+
+ jr t0
+endfunc
+
+.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
+ li t1, 4076
+ li t2, 401
+ li t3, 3612
+ li t4, 1931
+ li t5, 2598
+ li t6, 3166
+
+ vwmul.vx v8, v7, t1
+ neg t1, t1
+ vwmul.vx v10, v7, t2
+ vwmacc.vx v8, t2, v0
+ vwmacc.vx v10, t1, v0
+
+ vwmul.vx v12, v5, t3
+ neg t3, t3
+ vwmul.vx v14, v5, t4
+ vwmacc.vx v12, t4, v2
+ vwmacc.vx v14, t3, v2
+
+ vwmul.vx v16, v3, t5
+ neg t5, t5
+ vwmul.vx v18, v3, t6
+ vwmacc.vx v16, t6, v4
+ vwmacc.vx v18, t5, v4
+
+ li t1, 2048
+ li t2, 1189
+ li t3, 3920
+ li t4, 1567
+ li t5, 3784
+ li t6, 2896
+
+ vwmul.vx v20, v1, t2
+ neg t2, t2
+ vwmul.vx v22, v1, t3
+ vwmacc.vx v20, t3, v6
+ vwmacc.vx v22, t2, v6
+
+ vwadd.wx v8, v8, t1
+ vwadd.wx v10, v10, t1
+ vwadd.wx v12, v12, t1
+ vwadd.wx v14, v14, t1
+ vwadd.wx v16, v16, t1
+ vwadd.wx v18, v18, t1
+ vwadd.wx v20, v20, t1
+ vwadd.wx v22, v22, t1
+
+ vnsra.wi v8, v8, 12
+ vnsra.wi v10, v10, 12
+ vnsra.wi v12, v12, 12
+ vnsra.wi v14, v14, 12
+ vnsra.wi v16, v16, 12
+ vnsra.wi v18, v18, 12
+ vnsra.wi v20, v20, 12
+ vnsra.wi v22, v22, 12
+
+ vssub.vv v4, v8, v16
+ vsadd.vv v8, v8, v16
+ vsadd.vv v1, v10, v18
+ vsadd.vv v2, v12, v20
+ vsadd.vv v3, v14, v22
+ vssub.vv v5, v10, v18
+ vssub.vv v6, v12, v20
+ vssub.vv v22, v14, v22
+
+ vsadd.vv \o0, v8, v2
+ vsadd.vv \o7, v1, v3
+ vssub.vv v2, v8, v2
+ vssub.vv v3, v1, v3
+
+ vwmul.vx v8, v4, t5
+ vwmul.vx v10, v4, t4
+ vwmul.vx v12, v22, t5
+ vwmul.vx v14, v22, t4
+ vwmacc.vx v8, t4, v5
+ neg t4, t4
+ vwmacc.vx v14, t5, v6
+ neg t5, t5
+ vwmacc.vx v12, t4, v6
+ vwmacc.vx v10, t5, v5
+
+ vwadd.wx v8, v8, t1
+ vwadd.wx v10, v10, t1
+ vwadd.wx v12, v12, t1
+ vwadd.wx v14, v14, t1
+
+ vnsra.wi v8, v8, 12
+ vnsra.wi v10, v10, 12
+ vnsra.wi v12, v12, 12
+ vnsra.wi v14, v14, 12
+
+ vsadd.vv \o1, v8, v12
+ vsadd.vv \o6, v10, v14
+ vssub.vv v8, v8, v12
+ vssub.vv v9, v10, v14
+
+ vwmul.vx v10, v2, t6
+ vwmul.vx v12, v2, t6
+ vwmul.vx v14, v8, t6
+ vwmul.vx v16, v8, t6
+ vwmacc.vx v10, t6, v3
+ vwmacc.vx v14, t6, v9
+ neg t6, t6
+ vwmacc.vx v12, t6, v3
+ vwmacc.vx v16, t6, v9
+
+ vwadd.wx v10, v10, t1
+ vwadd.wx v12, v12, t1
+ vwadd.wx v14, v14, t1
+ vwadd.wx v16, v16, t1
+
+ vnsra.wi \o3, v10, 12
+ vnsra.wi \o4, v12, 12
+ vnsra.wi \o2, v14, 12
+ vnsra.wi \o5, v16, 12
+
+ vmv.v.x v8, zero
+ vssub.vv \o1, v8, \o1
+ vssub.vv \o3, v8, \o3
+ vssub.vv \o5, v8, \o5
+ vssub.vv \o7, v8, \o7
+.endm
+
+function inv_adst_e16_x8_rvv, export=1, ext=v
+ iadst_8 v0, v1, v2, v3, v4, v5, v6, v7
+ jr t0
+endfunc
+
+function inv_flipadst_e16_x8_rvv, export=1, ext=v
+ iadst_8 v7, v6, v5, v4, v3, v2, v1, v0
+ jr t0
+endfunc
+
+.macro def_fn_8x8 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1, ext=v
+.ifc \txfm1\()_\txfm2, dct_dct
+ beqz a3, 1f
+.endif
+ la a5, inv_\txfm2\()_e16_x8_rvv
+.ifc \txfm1, identity
+ j inv_txfm_identity_add_8x8_rvv
+.else
+ la a4, inv_\txfm1\()_e16_x8_rvv
+ j inv_txfm_add_8x8_rvv
+.endif
+.ifc \txfm1\()_\txfm2, dct_dct
+1:
+ csrw vxrm, zero
+ vsetivli zero, 8, e16, m1, ta, ma
+ ld t2, (a2)
+ li t1, 2896*8
+ vmv.v.x v0, t2
+ vsmul.vx v0, v0, t1
+ sd x0, (a2)
+ vssra.vi v0, v0, 1
+ vsmul.vx v0, v0, t1
+ vssra.vi v0, v0, 4
+ vmv.v.v v1, v0
+ vmv.v.v v2, v0
+ vmv.v.v v3, v0
+ vmv.v.v v4, v0
+ vmv.v.v v5, v0
+ vmv.v.v v6, v0
+ vmv.v.v v7, v0
+ j itx_8x8_end
+.endif
+endfunc
+.endm
+
+def_fn_8x8 dct, dct
+def_fn_8x8 identity, identity
+def_fn_8x8 dct, adst
+def_fn_8x8 dct, flipadst
+def_fn_8x8 dct, identity
+def_fn_8x8 adst, dct
+def_fn_8x8 adst, adst
+def_fn_8x8 adst, flipadst
+def_fn_8x8 flipadst, dct
+def_fn_8x8 flipadst, adst
+def_fn_8x8 flipadst, flipadst
+def_fn_8x8 identity, dct
+def_fn_8x8 adst, identity
+def_fn_8x8 flipadst, identity
+def_fn_8x8 identity, adst
+def_fn_8x8 identity, flipadst
diff --git a/third_party/dav1d/src/riscv/asm.S b/third_party/dav1d/src/riscv/asm.S
new file mode 100644
index 0000000000..2435170acb
--- /dev/null
+++ b/third_party/dav1d/src/riscv/asm.S
@@ -0,0 +1,126 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2023, Nathan Egge
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_RISCV_ASM_S
+#define DAV1D_SRC_RISCV_ASM_S
+
+#include "config.h"
+
+#if !defined(PIC)
+#if defined(__PIC__)
+#define PIC __PIC__
+#elif defined(__pic__)
+#define PIC __pic__
+#endif
+#endif
+
+#ifndef PRIVATE_PREFIX
+#define PRIVATE_PREFIX dav1d_
+#endif
+
+#define PASTE(a,b) a ## b
+#define CONCAT(a,b) PASTE(a,b)
+
+#ifdef PREFIX
+#define EXTERN CONCAT(_,PRIVATE_PREFIX)
+#else
+#define EXTERN PRIVATE_PREFIX
+#endif
+
+.macro function name, export=0, ext=
+ .macro endfunc
+#ifdef __ELF__
+ .size \name, . - \name
+#endif
+ .option pop
+ .purgem endfunc
+ .endm
+ .text
+ .option push
+ .ifnb \ext
+ .option arch, +\ext
+ .endif
+ .if \export
+ .global EXTERN\name
+#ifdef __ELF__
+ .type EXTERN\name, %function
+ .hidden EXTERN\name
+#elif defined(__MACH__)
+ .private_extern EXTERN\name
+#endif
+EXTERN\name:
+ .else
+#ifdef __ELF__
+ .type \name, %function
+#endif
+ .endif
+\name:
+.endm
+
+.macro const name, export=0, align=2
+ .macro endconst
+#ifdef __ELF__
+ .size \name, . - \name
+#endif
+ .purgem endconst
+ .endm
+#if defined(_WIN32)
+ .section .rdata
+#elif !defined(__MACH__)
+ .section .rodata
+#else
+ .const_data
+#endif
+ .align \align
+ .if \export
+ .global EXTERN\name
+#ifdef __ELF__
+ .hidden EXTERN\name
+#elif defined(__MACH__)
+ .private_extern EXTERN\name
+#endif
+EXTERN\name:
+ .endif
+\name:
+.endm
+
+.macro thread_local name, align=3, quads=1
+ .macro end_thread_local
+ .size \name, . - \name
+ .purgem end_thread_local
+ .endm
+ .section .tbss, "waT"
+ .align \align
+ .hidden \name
+\name:
+ .rept \quads
+ .quad 0
+ .endr
+ end_thread_local
+.endm
+
+#endif /* DAV1D_SRC_RISCV_ASM_S */
diff --git a/third_party/dav1d/src/riscv/cpu.c b/third_party/dav1d/src/riscv/cpu.c
new file mode 100644
index 0000000000..16377109de
--- /dev/null
+++ b/third_party/dav1d/src/riscv/cpu.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright © 2022, VideoLAN and dav1d authors
+ * Copyright © 2022, Nathan Egge
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/attributes.h"
+
+#include "src/riscv/cpu.h"
+
+#if defined(HAVE_GETAUXVAL)
+#include <sys/auxv.h>
+
+#define HWCAP_RVV (1 << ('v' - 'a'))
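+// 'v' - 'a' == 21, i.e. bit 21 of AT_HWCAP, the Linux hwcap bit advertising
+// the RISC-V "V" vector extension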
+
+#endif
+
+COLD unsigned dav1d_get_cpu_flags_riscv(void) {
+ unsigned flags = 0;
+#if defined(HAVE_GETAUXVAL)
+ unsigned long hw_cap = getauxval(AT_HWCAP);
+ flags |= (hw_cap & HWCAP_RVV) ? DAV1D_RISCV_CPU_FLAG_V : 0;
+#endif
+
+ return flags;
+}
diff --git a/third_party/dav1d/src/riscv/cpu.h b/third_party/dav1d/src/riscv/cpu.h
new file mode 100644
index 0000000000..8ab7f53152
--- /dev/null
+++ b/third_party/dav1d/src/riscv/cpu.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2022, VideoLAN and dav1d authors
+ * Copyright © 2022, Nathan Egge
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_RISCV_CPU_H
+#define DAV1D_SRC_RISCV_CPU_H
+
+enum CpuFlags {
+ DAV1D_RISCV_CPU_FLAG_V = 1 << 0,
+};
+
+unsigned dav1d_get_cpu_flags_riscv(void);
+
+#endif /* DAV1D_SRC_RISCV_CPU_H */
diff --git a/third_party/dav1d/src/riscv/itx.h b/third_party/dav1d/src/riscv/itx.h
new file mode 100644
index 0000000000..bed215471b
--- /dev/null
+++ b/third_party/dav1d/src/riscv/itx.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2023, Nathan Egge
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
+#define decl_itx_fns(ext) \
+decl_itx17_fns( 4, 4, ext); \
+decl_itx16_fns( 8, 8, ext)
+
+decl_itx_fns(rvv);
+
+static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, int const bpc) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+ assign_itx1_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+ assign_itx2_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
+ assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+ assign_itx12_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+ assign_itx16_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
+
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return;
+
+#if BITDEPTH == 8
+ assign_itx16_fn( , 4, 4, rvv);
+ assign_itx16_fn( , 8, 8, rvv);
+#endif
+}
diff --git a/third_party/dav1d/src/scan.c b/third_party/dav1d/src/scan.c
new file mode 100644
index 0000000000..5261ccd3d1
--- /dev/null
+++ b/third_party/dav1d/src/scan.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/attributes.h"
+#include "src/scan.h"
+
+static const uint16_t ALIGN(scan_4x4[], 32) = {
+ 0, 4, 1, 2,
+ 5, 8, 12, 9,
+ 6, 3, 7, 10,
+ 13, 14, 11, 15,
+};
+
+static const uint16_t ALIGN(scan_4x8[], 32) = {
+ 0, 8, 1, 16,
+ 9, 2, 24, 17,
+ 10, 3, 25, 18,
+ 11, 4, 26, 19,
+ 12, 5, 27, 20,
+ 13, 6, 28, 21,
+ 14, 7, 29, 22,
+ 15, 30, 23, 31,
+};
+
+static const uint16_t ALIGN(scan_4x16[], 32) = {
+ 0, 16, 1, 32,
+ 17, 2, 48, 33,
+ 18, 3, 49, 34,
+ 19, 4, 50, 35,
+ 20, 5, 51, 36,
+ 21, 6, 52, 37,
+ 22, 7, 53, 38,
+ 23, 8, 54, 39,
+ 24, 9, 55, 40,
+ 25, 10, 56, 41,
+ 26, 11, 57, 42,
+ 27, 12, 58, 43,
+ 28, 13, 59, 44,
+ 29, 14, 60, 45,
+ 30, 15, 61, 46,
+ 31, 62, 47, 63,
+};
+
+static const uint16_t ALIGN(scan_8x4[], 32) = {
+ 0, 1, 4, 2, 5, 8, 3, 6,
+ 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22,
+ 25, 28, 23, 26, 29, 27, 30, 31,
+};
+
+static const uint16_t ALIGN(scan_8x8[], 32) = {
+ 0, 8, 1, 2, 9, 16, 24, 17,
+ 10, 3, 4, 11, 18, 25, 32, 40,
+ 33, 26, 19, 12, 5, 6, 13, 20,
+ 27, 34, 41, 48, 56, 49, 42, 35,
+ 28, 21, 14, 7, 15, 22, 29, 36,
+ 43, 50, 57, 58, 51, 44, 37, 30,
+ 23, 31, 38, 45, 52, 59, 60, 53,
+ 46, 39, 47, 54, 61, 62, 55, 63,
+};
+
+static const uint16_t ALIGN(scan_8x16[], 32) = {
+ 0, 16, 1, 32, 17, 2, 48, 33,
+ 18, 3, 64, 49, 34, 19, 4, 80,
+ 65, 50, 35, 20, 5, 96, 81, 66,
+ 51, 36, 21, 6, 112, 97, 82, 67,
+ 52, 37, 22, 7, 113, 98, 83, 68,
+ 53, 38, 23, 8, 114, 99, 84, 69,
+ 54, 39, 24, 9, 115, 100, 85, 70,
+ 55, 40, 25, 10, 116, 101, 86, 71,
+ 56, 41, 26, 11, 117, 102, 87, 72,
+ 57, 42, 27, 12, 118, 103, 88, 73,
+ 58, 43, 28, 13, 119, 104, 89, 74,
+ 59, 44, 29, 14, 120, 105, 90, 75,
+ 60, 45, 30, 15, 121, 106, 91, 76,
+ 61, 46, 31, 122, 107, 92, 77, 62,
+ 47, 123, 108, 93, 78, 63, 124, 109,
+ 94, 79, 125, 110, 95, 126, 111, 127,
+};
+
+static const uint16_t ALIGN(scan_8x32[], 32) = {
+ 0, 32, 1, 64, 33, 2, 96, 65,
+ 34, 3, 128, 97, 66, 35, 4, 160,
+ 129, 98, 67, 36, 5, 192, 161, 130,
+ 99, 68, 37, 6, 224, 193, 162, 131,
+ 100, 69, 38, 7, 225, 194, 163, 132,
+ 101, 70, 39, 8, 226, 195, 164, 133,
+ 102, 71, 40, 9, 227, 196, 165, 134,
+ 103, 72, 41, 10, 228, 197, 166, 135,
+ 104, 73, 42, 11, 229, 198, 167, 136,
+ 105, 74, 43, 12, 230, 199, 168, 137,
+ 106, 75, 44, 13, 231, 200, 169, 138,
+ 107, 76, 45, 14, 232, 201, 170, 139,
+ 108, 77, 46, 15, 233, 202, 171, 140,
+ 109, 78, 47, 16, 234, 203, 172, 141,
+ 110, 79, 48, 17, 235, 204, 173, 142,
+ 111, 80, 49, 18, 236, 205, 174, 143,
+ 112, 81, 50, 19, 237, 206, 175, 144,
+ 113, 82, 51, 20, 238, 207, 176, 145,
+ 114, 83, 52, 21, 239, 208, 177, 146,
+ 115, 84, 53, 22, 240, 209, 178, 147,
+ 116, 85, 54, 23, 241, 210, 179, 148,
+ 117, 86, 55, 24, 242, 211, 180, 149,
+ 118, 87, 56, 25, 243, 212, 181, 150,
+ 119, 88, 57, 26, 244, 213, 182, 151,
+ 120, 89, 58, 27, 245, 214, 183, 152,
+ 121, 90, 59, 28, 246, 215, 184, 153,
+ 122, 91, 60, 29, 247, 216, 185, 154,
+ 123, 92, 61, 30, 248, 217, 186, 155,
+ 124, 93, 62, 31, 249, 218, 187, 156,
+ 125, 94, 63, 250, 219, 188, 157, 126,
+ 95, 251, 220, 189, 158, 127, 252, 221,
+ 190, 159, 253, 222, 191, 254, 223, 255,
+};
+
+static const uint16_t ALIGN(scan_16x4[], 32) = {
+ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
+ 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
+ 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63,
+};
+
+static const uint16_t ALIGN(scan_16x8[], 32) = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5,
+ 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28,
+ 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44,
+ 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60,
+ 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76,
+ 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92,
+ 99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 87, 94, 101, 108, 115,
+ 122, 95, 102, 109, 116, 123, 103, 110, 117, 124, 111, 118, 125, 119, 126, 127,
+};
+
+static const uint16_t ALIGN(scan_16x16[], 32) = {
+ 0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 4, 19, 34, 49, 64, 80,
+ 65, 50, 35, 20, 5, 6, 21, 36, 51, 66, 81, 96, 112, 97, 82, 67,
+ 52, 37, 22, 7, 8, 23, 38, 53, 68, 83, 98, 113, 128, 144, 129, 114,
+ 99, 84, 69, 54, 39, 24, 9, 10, 25, 40, 55, 70, 85, 100, 115, 130,
+ 145, 160, 176, 161, 146, 131, 116, 101, 86, 71, 56, 41, 26, 11, 12, 27,
+ 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 208, 193, 178, 163, 148,
+ 133, 118, 103, 88, 73, 58, 43, 28, 13, 14, 29, 44, 59, 74, 89, 104,
+ 119, 134, 149, 164, 179, 194, 209, 224, 240, 225, 210, 195, 180, 165, 150, 135,
+ 120, 105, 90, 75, 60, 45, 30, 15, 31, 46, 61, 76, 91, 106, 121, 136,
+ 151, 166, 181, 196, 211, 226, 241, 242, 227, 212, 197, 182, 167, 152, 137, 122,
+ 107, 92, 77, 62, 47, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213,
+ 228, 243, 244, 229, 214, 199, 184, 169, 154, 139, 124, 109, 94, 79, 95, 110,
+ 125, 140, 155, 170, 185, 200, 215, 230, 245, 246, 231, 216, 201, 186, 171, 156,
+ 141, 126, 111, 127, 142, 157, 172, 187, 202, 217, 232, 247, 248, 233, 218, 203,
+ 188, 173, 158, 143, 159, 174, 189, 204, 219, 234, 249, 250, 235, 220, 205, 190,
+ 175, 191, 206, 221, 236, 251, 252, 237, 222, 207, 223, 238, 253, 254, 239, 255,
+};
+
+static const uint16_t ALIGN(scan_16x32[], 32) = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160,
+ 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131,
+ 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, 288, 257, 226,
+ 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196, 165, 134, 103, 72,
+ 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 384, 353,
+ 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, 416, 385, 354, 323, 292,
+ 261, 230, 199, 168, 137, 106, 75, 44, 13, 448, 417, 386, 355, 324, 293, 262,
+ 231, 200, 169, 138, 107, 76, 45, 14, 480, 449, 418, 387, 356, 325, 294, 263,
+ 232, 201, 170, 139, 108, 77, 46, 15, 481, 450, 419, 388, 357, 326, 295, 264,
+ 233, 202, 171, 140, 109, 78, 47, 16, 482, 451, 420, 389, 358, 327, 296, 265,
+ 234, 203, 172, 141, 110, 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266,
+ 235, 204, 173, 142, 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267,
+ 236, 205, 174, 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268,
+ 237, 206, 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269,
+ 238, 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270,
+ 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302, 271,
+ 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334, 303, 272,
+ 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366, 335, 304, 273,
+ 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398, 367, 336, 305, 274,
+ 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430, 399, 368, 337, 306, 275,
+ 244, 213, 182, 151, 120, 89, 58, 27, 493, 462, 431, 400, 369, 338, 307, 276,
+ 245, 214, 183, 152, 121, 90, 59, 28, 494, 463, 432, 401, 370, 339, 308, 277,
+ 246, 215, 184, 153, 122, 91, 60, 29, 495, 464, 433, 402, 371, 340, 309, 278,
+ 247, 216, 185, 154, 123, 92, 61, 30, 496, 465, 434, 403, 372, 341, 310, 279,
+ 248, 217, 186, 155, 124, 93, 62, 31, 497, 466, 435, 404, 373, 342, 311, 280,
+ 249, 218, 187, 156, 125, 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250,
+ 219, 188, 157, 126, 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189,
+ 158, 127, 500, 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470,
+ 439, 408, 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316,
+ 285, 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411,
+ 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382,
+ 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511,
+};
+
+static const uint16_t ALIGN(scan_32x8[], 32) = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28,
+ 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60,
+ 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92,
+ 99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116, 123, 130, 137, 144, 103, 110, 117, 124,
+ 131, 138, 145, 152, 111, 118, 125, 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134, 141, 148, 155, 162, 169, 176, 135, 142, 149, 156,
+ 163, 170, 177, 184, 143, 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200, 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188,
+ 195, 202, 209, 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220,
+ 227, 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, 255,
+};
+
+static const uint16_t ALIGN(scan_32x16[], 32) = {
+ 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52,
+ 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, 85, 100, 115, 130,
+ 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 13, 28, 43, 58, 73,
+ 88, 103, 118, 133, 148, 163, 178, 193, 208, 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, 15, 30, 45, 60, 75, 90, 105, 120,
+ 135, 150, 165, 180, 195, 210, 225, 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, 241, 256, 47, 62, 77, 92, 107, 122, 137, 152,
+ 167, 182, 197, 212, 227, 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184,
+ 199, 214, 229, 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230, 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216,
+ 231, 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232, 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233, 248,
+ 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234, 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235, 250, 265, 280,
+ 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236, 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237, 252, 267, 282, 297, 312,
+ 327, 342, 357, 372, 387, 402, 417, 432, 223, 238, 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239, 254, 269, 284, 299, 314, 329, 344,
+ 359, 374, 389, 404, 419, 434, 449, 464, 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465, 480, 271, 286, 301, 316, 331, 346, 361, 376,
+ 391, 406, 421, 436, 451, 466, 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467, 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423,
+ 438, 453, 468, 483, 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335, 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366,
+ 381, 396, 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
+ 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, 510, 511,
+};
+
+static const uint16_t ALIGN(scan_32x32[], 32) = {
+ 0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 4, 35, 66, 97, 128, 160, 129, 98, 67, 36, 5, 6, 37, 68, 99, 130, 161, 192, 224, 193, 162, 131,
+ 100, 69, 38, 7, 8, 39, 70, 101, 132, 163, 194, 225, 256, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 10, 41, 72, 103, 134, 165, 196, 227, 258,
+ 289, 320, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384, 416, 385, 354, 323, 292,
+ 261, 230, 199, 168, 137, 106, 75, 44, 13, 14, 45, 76, 107, 138, 169, 200, 231, 262, 293, 324, 355, 386, 417, 448, 480, 449, 418, 387, 356, 325, 294, 263,
+ 232, 201, 170, 139, 108, 77, 46, 15, 16, 47, 78, 109, 140, 171, 202, 233, 264, 295, 326, 357, 388, 419, 450, 481, 512, 544, 513, 482, 451, 420, 389, 358,
+ 327, 296, 265, 234, 203, 172, 141, 110, 79, 48, 17, 18, 49, 80, 111, 142, 173, 204, 235, 266, 297, 328, 359, 390, 421, 452, 483, 514, 545, 576, 608, 577,
+ 546, 515, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, 112, 81, 50, 19, 20, 51, 82, 113, 144, 175, 206, 237, 268, 299, 330, 361, 392, 423,
+ 454, 485, 516, 547, 578, 609, 640, 672, 641, 610, 579, 548, 517, 486, 455, 424, 393, 362, 331, 300, 269, 238, 207, 176, 145, 114, 83, 52, 21, 22, 53, 84,
+ 115, 146, 177, 208, 239, 270, 301, 332, 363, 394, 425, 456, 487, 518, 549, 580, 611, 642, 673, 704, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, 426, 395,
+ 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 24, 55, 86, 117, 148, 179, 210, 241, 272, 303, 334, 365, 396, 427, 458, 489, 520, 551, 582, 613,
+ 644, 675, 706, 737, 768, 800, 769, 738, 707, 676, 645, 614, 583, 552, 521, 490, 459, 428, 397, 366, 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 26,
+ 57, 88, 119, 150, 181, 212, 243, 274, 305, 336, 367, 398, 429, 460, 491, 522, 553, 584, 615, 646, 677, 708, 739, 770, 801, 832, 864, 833, 802, 771, 740, 709,
+ 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 28, 59, 90, 121, 152, 183, 214, 245, 276, 307,
+ 338, 369, 400, 431, 462, 493, 524, 555, 586, 617, 648, 679, 710, 741, 772, 803, 834, 865, 896, 928, 897, 866, 835, 804, 773, 742, 711, 680, 649, 618, 587, 556,
+ 525, 494, 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, 30, 61, 92, 123, 154, 185, 216, 247, 278, 309, 340, 371, 402, 433, 464,
+ 495, 526, 557, 588, 619, 650, 681, 712, 743, 774, 805, 836, 867, 898, 929, 960, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, 651, 620, 589, 558, 527,
+ 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 63, 94, 125, 156, 187, 218, 249, 280, 311, 342, 373, 404, 435, 466, 497, 528,
+ 559, 590, 621, 652, 683, 714, 745, 776, 807, 838, 869, 900, 931, 962, 993, 994, 963, 932, 901, 870, 839, 808, 777, 746, 715, 684, 653, 622, 591, 560, 529, 498,
+ 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126, 95, 127, 158, 189, 220, 251, 282, 313, 344, 375, 406, 437, 468, 499, 530, 561, 592, 623, 654, 685,
+ 716, 747, 778, 809, 840, 871, 902, 933, 964, 995, 996, 965, 934, 903, 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438, 407, 376, 345,
+ 314, 283, 252, 221, 190, 159, 191, 222, 253, 284, 315, 346, 377, 408, 439, 470, 501, 532, 563, 594, 625, 656, 687, 718, 749, 780, 811, 842, 873, 904, 935, 966,
+ 997, 998, 967, 936, 905, 874, 843, 812, 781, 750, 719, 688, 657, 626, 595, 564, 533, 502, 471, 440, 409, 378, 347, 316, 285, 254, 223, 255, 286, 317, 348, 379,
+ 410, 441, 472, 503, 534, 565, 596, 627, 658, 689, 720, 751, 782, 813, 844, 875, 906, 937, 968, 999, 1000, 969, 938, 907, 876, 845, 814, 783, 752, 721, 690, 659,
+ 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 319, 350, 381, 412, 443, 474, 505, 536, 567, 598, 629, 660, 691, 722, 753, 784, 815, 846, 877, 908,
+ 939, 970, 1001, 1002, 971, 940, 909, 878, 847, 816, 785, 754, 723, 692, 661, 630, 599, 568, 537, 506, 475, 444, 413, 382, 351, 383, 414, 445, 476, 507, 538, 569,
+ 600, 631, 662, 693, 724, 755, 786, 817, 848, 879, 910, 941, 972, 1003, 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477,
+ 446, 415, 447, 478, 509, 540, 571, 602, 633, 664, 695, 726, 757, 788, 819, 850, 881, 912, 943, 974, 1005, 1006, 975, 944, 913, 882, 851, 820, 789, 758, 727, 696,
+ 665, 634, 603, 572, 541, 510, 479, 511, 542, 573, 604, 635, 666, 697, 728, 759, 790, 821, 852, 883, 914, 945, 976, 1007, 1008, 977, 946, 915, 884, 853, 822, 791,
+ 760, 729, 698, 667, 636, 605, 574, 543, 575, 606, 637, 668, 699, 730, 761, 792, 823, 854, 885, 916, 947, 978, 1009, 1010, 979, 948, 917, 886, 855, 824, 793, 762,
+ 731, 700, 669, 638, 607, 639, 670, 701, 732, 763, 794, 825, 856, 887, 918, 949, 980, 1011, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 703, 734,
+ 765, 796, 827, 858, 889, 920, 951, 982, 1013, 1014, 983, 952, 921, 890, 859, 828, 797, 766, 735, 767, 798, 829, 860, 891, 922, 953, 984, 1015, 1016, 985, 954, 923,
+ 892, 861, 830, 799, 831, 862, 893, 924, 955, 986, 1017, 1018, 987, 956, 925, 894, 863, 895, 926, 957, 988, 1019, 1020, 989, 958, 927, 959, 990, 1021, 1022, 991, 1023,
+};
+
+const uint16_t *const dav1d_scans[N_RECT_TX_SIZES] = {
+ [ TX_4X4 ] = scan_4x4,
+ [ TX_8X8 ] = scan_8x8,
+ [ TX_16X16] = scan_16x16,
+ [ TX_32X32] = scan_32x32,
+ [ TX_64X64] = scan_32x32,
+ [RTX_4X8 ] = scan_4x8,
+ [RTX_8X4 ] = scan_8x4,
+ [RTX_8X16 ] = scan_8x16,
+ [RTX_16X8 ] = scan_16x8,
+ [RTX_16X32] = scan_16x32,
+ [RTX_32X16] = scan_32x16,
+ [RTX_32X64] = scan_32x32,
+ [RTX_64X32] = scan_32x32,
+ [RTX_4X16 ] = scan_4x16,
+ [RTX_16X4 ] = scan_16x4,
+ [RTX_8X32 ] = scan_8x32,
+ [RTX_32X8 ] = scan_32x8,
+ [RTX_16X64] = scan_16x32,
+ [RTX_64X16] = scan_32x16,
+};
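
For orientation, each scan table maps the n-th coefficient in scan order to its position in the transform block's coefficient buffer, so consuming one of these tables looks roughly like the sketch below (illustrative names and types, not code from this patch):

static void descan_sketch(int16_t *const dst /* block coefficient buffer */,
                          const int16_t *const in_scan_order,
                          const int ncoefs, const int tx /* TX_* or RTX_* */)
{
    const uint16_t *const scan = dav1d_scans[tx];
    for (int i = 0; i < ncoefs; i++)
        dst[scan[i]] = in_scan_order[i];  /* scan[i]: position of the i-th coef */
}
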
diff --git a/third_party/dav1d/src/scan.h b/third_party/dav1d/src/scan.h
new file mode 100644
index 0000000000..09df988779
--- /dev/null
+++ b/third_party/dav1d/src/scan.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_SCAN_H
+#define DAV1D_SRC_SCAN_H
+
+#include <stdint.h>
+
+#include "src/levels.h"
+
+EXTERN const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];
+
+#endif /* DAV1D_SRC_SCAN_H */
diff --git a/third_party/dav1d/src/tables.c b/third_party/dav1d/src/tables.c
new file mode 100644
index 0000000000..9752f15c40
--- /dev/null
+++ b/third_party/dav1d/src/tables.c
@@ -0,0 +1,1013 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+
+#include "common/attributes.h"
+
+#include "src/levels.h"
+#include "src/tables.h"
+
+const uint8_t dav1d_al_part_ctx[2][N_BL_LEVELS][N_PARTITIONS] = {
+ {
+ // partitions:
+ // none, h, v, split, tts, tbs, tls, trs, h4, v4
+ { 0x00, 0x00, 0x10, -1, 0x00, 0x10, 0x10, 0x10, -1, -1 }, // bl128
+ { 0x10, 0x10, 0x18, -1, 0x10, 0x18, 0x18, 0x18, 0x10, 0x1c }, // bl64
+ { 0x18, 0x18, 0x1c, -1, 0x18, 0x1c, 0x1c, 0x1c, 0x18, 0x1e }, // bl32
+ { 0x1c, 0x1c, 0x1e, -1, 0x1c, 0x1e, 0x1e, 0x1e, 0x1c, 0x1f }, // bl16
+ { 0x1e, 0x1e, 0x1f, 0x1f, -1, -1, -1, -1, -1, -1 }, // bl8
+ }, {
+ { 0x00, 0x10, 0x00, -1, 0x10, 0x10, 0x00, 0x10, -1, -1 }, // bl128
+ { 0x10, 0x18, 0x10, -1, 0x18, 0x18, 0x10, 0x18, 0x1c, 0x10 }, // bl64
+ { 0x18, 0x1c, 0x18, -1, 0x1c, 0x1c, 0x18, 0x1c, 0x1e, 0x18 }, // bl32
+ { 0x1c, 0x1e, 0x1c, -1, 0x1e, 0x1e, 0x1c, 0x1e, 0x1f, 0x1c }, // bl16
+ { 0x1e, 0x1f, 0x1e, 0x1f, -1, -1, -1, -1, -1, -1 }, // bl8
+ }
+};
+
+const uint8_t /* enum BlockSize */
+ dav1d_block_sizes[N_BL_LEVELS][N_PARTITIONS][2] =
+{
+ [BL_128X128] = {
+ [PARTITION_NONE] = { BS_128x128 },
+ [PARTITION_H] = { BS_128x64 },
+ [PARTITION_V] = { BS_64x128 },
+ [PARTITION_T_TOP_SPLIT] = { BS_64x64, BS_128x64 },
+ [PARTITION_T_BOTTOM_SPLIT] = { BS_128x64, BS_64x64 },
+ [PARTITION_T_LEFT_SPLIT] = { BS_64x64, BS_64x128 },
+ [PARTITION_T_RIGHT_SPLIT] = { BS_64x128, BS_64x64 },
+ }, [BL_64X64] = {
+ [PARTITION_NONE] = { BS_64x64 },
+ [PARTITION_H] = { BS_64x32 },
+ [PARTITION_V] = { BS_32x64 },
+ [PARTITION_T_TOP_SPLIT] = { BS_32x32, BS_64x32 },
+ [PARTITION_T_BOTTOM_SPLIT] = { BS_64x32, BS_32x32 },
+ [PARTITION_T_LEFT_SPLIT] = { BS_32x32, BS_32x64 },
+ [PARTITION_T_RIGHT_SPLIT] = { BS_32x64, BS_32x32 },
+ [PARTITION_H4] = { BS_64x16 },
+ [PARTITION_V4] = { BS_16x64 },
+ }, [BL_32X32] = {
+ [PARTITION_NONE] = { BS_32x32 },
+ [PARTITION_H] = { BS_32x16 },
+ [PARTITION_V] = { BS_16x32 },
+ [PARTITION_T_TOP_SPLIT] = { BS_16x16, BS_32x16 },
+ [PARTITION_T_BOTTOM_SPLIT] = { BS_32x16, BS_16x16 },
+ [PARTITION_T_LEFT_SPLIT] = { BS_16x16, BS_16x32 },
+ [PARTITION_T_RIGHT_SPLIT] = { BS_16x32, BS_16x16 },
+ [PARTITION_H4] = { BS_32x8 },
+ [PARTITION_V4] = { BS_8x32 },
+ }, [BL_16X16] = {
+ [PARTITION_NONE] = { BS_16x16 },
+ [PARTITION_H] = { BS_16x8 },
+ [PARTITION_V] = { BS_8x16 },
+ [PARTITION_T_TOP_SPLIT] = { BS_8x8, BS_16x8 },
+ [PARTITION_T_BOTTOM_SPLIT] = { BS_16x8, BS_8x8 },
+ [PARTITION_T_LEFT_SPLIT] = { BS_8x8, BS_8x16 },
+ [PARTITION_T_RIGHT_SPLIT] = { BS_8x16, BS_8x8 },
+ [PARTITION_H4] = { BS_16x4 },
+ [PARTITION_V4] = { BS_4x16 },
+ }, [BL_8X8] = {
+ [PARTITION_NONE] = { BS_8x8 },
+ [PARTITION_H] = { BS_8x4 },
+ [PARTITION_V] = { BS_4x8 },
+ [PARTITION_SPLIT] = { BS_4x4 },
+ }
+};
+
+const uint8_t dav1d_block_dimensions[N_BS_SIZES][4] = {
+ [BS_128x128] = { 32, 32, 5, 5 },
+ [BS_128x64] = { 32, 16, 5, 4 },
+ [BS_64x128] = { 16, 32, 4, 5 },
+ [BS_64x64] = { 16, 16, 4, 4 },
+ [BS_64x32] = { 16, 8, 4, 3 },
+ [BS_64x16] = { 16, 4, 4, 2 },
+ [BS_32x64] = { 8, 16, 3, 4 },
+ [BS_32x32] = { 8, 8, 3, 3 },
+ [BS_32x16] = { 8, 4, 3, 2 },
+ [BS_32x8] = { 8, 2, 3, 1 },
+ [BS_16x64] = { 4, 16, 2, 4 },
+ [BS_16x32] = { 4, 8, 2, 3 },
+ [BS_16x16] = { 4, 4, 2, 2 },
+ [BS_16x8] = { 4, 2, 2, 1 },
+ [BS_16x4] = { 4, 1, 2, 0 },
+ [BS_8x32] = { 2, 8, 1, 3 },
+ [BS_8x16] = { 2, 4, 1, 2 },
+ [BS_8x8] = { 2, 2, 1, 1 },
+ [BS_8x4] = { 2, 1, 1, 0 },
+ [BS_4x16] = { 1, 4, 0, 2 },
+ [BS_4x8] = { 1, 2, 0, 1 },
+ [BS_4x4] = { 1, 1, 0, 0 },
+};
+
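Each entry above is { width, height (in 4px blocks), and the log2 of those }, e.g. BS_64x32 = { 16, 8, 4, 3 }. A minimal sketch of turning an entry back into pixel dimensions (illustrative names only, not code from this patch):

static inline void block_size_px_sketch(const int bs /* BS_* */,
                                        int *const w_px, int *const h_px)
{
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
    *w_px = b_dim[0] * 4;  /* BS_64x32: 16 * 4 = 64 */
    *h_px = b_dim[1] * 4;  /* BS_64x32:  8 * 4 = 32 */
}
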
+const TxfmInfo dav1d_txfm_dimensions[N_RECT_TX_SIZES] = {
+ [ TX_4X4] = { .w = 1, .h = 1, .lw = 0, .lh = 0,
+ .min = 0, .max = 0, .ctx = 0 },
+ [ TX_8X8] = { .w = 2, .h = 2, .lw = 1, .lh = 1,
+ .min = 1, .max = 1, .sub = TX_4X4, .ctx = 1 },
+ [ TX_16X16] = { .w = 4, .h = 4, .lw = 2, .lh = 2,
+ .min = 2, .max = 2, .sub = TX_8X8, .ctx = 2 },
+ [ TX_32X32] = { .w = 8, .h = 8, .lw = 3, .lh = 3,
+ .min = 3, .max = 3, .sub = TX_16X16, .ctx = 3 },
+ [ TX_64X64] = { .w = 16, .h = 16, .lw = 4, .lh = 4,
+ .min = 4, .max = 4, .sub = TX_32X32, .ctx = 4 },
+ [RTX_4X8] = { .w = 1, .h = 2, .lw = 0, .lh = 1,
+ .min = 0, .max = 1, .sub = TX_4X4, .ctx = 1 },
+ [RTX_8X4] = { .w = 2, .h = 1, .lw = 1, .lh = 0,
+ .min = 0, .max = 1, .sub = TX_4X4, .ctx = 1 },
+ [RTX_8X16] = { .w = 2, .h = 4, .lw = 1, .lh = 2,
+ .min = 1, .max = 2, .sub = TX_8X8, .ctx = 2 },
+ [RTX_16X8] = { .w = 4, .h = 2, .lw = 2, .lh = 1,
+ .min = 1, .max = 2, .sub = TX_8X8, .ctx = 2 },
+ [RTX_16X32] = { .w = 4, .h = 8, .lw = 2, .lh = 3,
+ .min = 2, .max = 3, .sub = TX_16X16, .ctx = 3 },
+ [RTX_32X16] = { .w = 8, .h = 4, .lw = 3, .lh = 2,
+ .min = 2, .max = 3, .sub = TX_16X16, .ctx = 3 },
+ [RTX_32X64] = { .w = 8, .h = 16, .lw = 3, .lh = 4,
+ .min = 3, .max = 4, .sub = TX_32X32, .ctx = 4 },
+ [RTX_64X32] = { .w = 16, .h = 8, .lw = 4, .lh = 3,
+ .min = 3, .max = 4, .sub = TX_32X32, .ctx = 4 },
+ [RTX_4X16] = { .w = 1, .h = 4, .lw = 0, .lh = 2,
+ .min = 0, .max = 2, .sub = RTX_4X8, .ctx = 1 },
+ [RTX_16X4] = { .w = 4, .h = 1, .lw = 2, .lh = 0,
+ .min = 0, .max = 2, .sub = RTX_8X4, .ctx = 1 },
+ [RTX_8X32] = { .w = 2, .h = 8, .lw = 1, .lh = 3,
+ .min = 1, .max = 3, .sub = RTX_8X16, .ctx = 2 },
+ [RTX_32X8] = { .w = 8, .h = 2, .lw = 3, .lh = 1,
+ .min = 1, .max = 3, .sub = RTX_16X8, .ctx = 2 },
+ [RTX_16X64] = { .w = 4, .h = 16, .lw = 2, .lh = 4,
+ .min = 2, .max = 4, .sub = RTX_16X32, .ctx = 3 },
+ [RTX_64X16] = { .w = 16, .h = 4, .lw = 4, .lh = 2,
+ .min = 2, .max = 4, .sub = RTX_32X16, .ctx = 3 },
+};
+
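Reading one entry, e.g. RTX_16X32 = { .w = 4, .h = 8, .lw = 2, .lh = 3, .min = 2, .max = 3, .sub = TX_16X16, .ctx = 3 }: w/h are the transform dimensions in 4px units, lw/lh their log2, and min/max the smaller/larger of those; .sub names the next-smaller size, presumably used when the transform is split (the square sizes chain TX_64X64 -> TX_32X32 -> ... -> TX_4X4). A tiny sketch with illustrative names, not code from this patch:

static inline void txfm_size_px_sketch(const int tx /* TX_* or RTX_* */,
                                       int *const w_px, int *const h_px)
{
    const TxfmInfo *const t = &dav1d_txfm_dimensions[tx];
    *w_px = 4 * t->w;  /* RTX_16X32: 4 * 4 = 16 */
    *h_px = 4 * t->h;  /* RTX_16X32: 4 * 8 = 32 */
}
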
+const uint8_t /* enum (Rect)TxfmSize */
+ dav1d_max_txfm_size_for_bs[N_BS_SIZES][4 /* y, 420, 422, 444 */] =
+{
+ [BS_128x128] = { TX_64X64, TX_32X32, TX_32X32, TX_32X32 },
+ [BS_128x64] = { TX_64X64, TX_32X32, TX_32X32, TX_32X32 },
+ [BS_64x128] = { TX_64X64, TX_32X32, 0, TX_32X32 },
+ [BS_64x64] = { TX_64X64, TX_32X32, TX_32X32, TX_32X32 },
+ [BS_64x32] = { RTX_64X32, RTX_32X16, TX_32X32, TX_32X32 },
+ [BS_64x16] = { RTX_64X16, RTX_32X8, RTX_32X16, RTX_32X16 },
+ [BS_32x64] = { RTX_32X64, RTX_16X32, 0, TX_32X32 },
+ [BS_32x32] = { TX_32X32, TX_16X16, RTX_16X32, TX_32X32 },
+ [BS_32x16] = { RTX_32X16, RTX_16X8, TX_16X16, RTX_32X16 },
+ [BS_32x8] = { RTX_32X8, RTX_16X4, RTX_16X8, RTX_32X8 },
+ [BS_16x64] = { RTX_16X64, RTX_8X32, 0, RTX_16X32 },
+ [BS_16x32] = { RTX_16X32, RTX_8X16, 0, RTX_16X32 },
+ [BS_16x16] = { TX_16X16, TX_8X8, RTX_8X16, TX_16X16 },
+ [BS_16x8] = { RTX_16X8, RTX_8X4, TX_8X8, RTX_16X8 },
+ [BS_16x4] = { RTX_16X4, RTX_8X4, RTX_8X4, RTX_16X4 },
+ [BS_8x32] = { RTX_8X32, RTX_4X16, 0, RTX_8X32 },
+ [BS_8x16] = { RTX_8X16, RTX_4X8, 0, RTX_8X16 },
+ [BS_8x8] = { TX_8X8, TX_4X4, RTX_4X8, TX_8X8 },
+ [BS_8x4] = { RTX_8X4, TX_4X4, TX_4X4, RTX_8X4 },
+ [BS_4x16] = { RTX_4X16, RTX_4X8, 0, RTX_4X16 },
+ [BS_4x8] = { RTX_4X8, TX_4X4, 0, RTX_4X8 },
+ [BS_4x4] = { TX_4X4, TX_4X4, TX_4X4, TX_4X4 },
+};
+
+const uint8_t /* enum TxfmType */
+ dav1d_txtp_from_uvmode[N_UV_INTRA_PRED_MODES] =
+{
+ [DC_PRED] = DCT_DCT,
+ [VERT_PRED] = ADST_DCT,
+ [HOR_PRED] = DCT_ADST,
+ [DIAG_DOWN_LEFT_PRED] = DCT_DCT,
+ [DIAG_DOWN_RIGHT_PRED] = ADST_ADST,
+ [VERT_RIGHT_PRED] = ADST_DCT,
+ [HOR_DOWN_PRED] = DCT_ADST,
+ [HOR_UP_PRED] = DCT_ADST,
+ [VERT_LEFT_PRED] = ADST_DCT,
+ [SMOOTH_PRED] = ADST_ADST,
+ [SMOOTH_V_PRED] = ADST_DCT,
+ [SMOOTH_H_PRED] = DCT_ADST,
+ [PAETH_PRED] = ADST_ADST,
+};
+
+const uint8_t /* enum InterPredMode */
+ dav1d_comp_inter_pred_modes[N_COMP_INTER_PRED_MODES][2] =
+{
+ [NEARESTMV_NEARESTMV] = { NEARESTMV, NEARESTMV },
+ [NEARMV_NEARMV] = { NEARMV, NEARMV },
+ [NEWMV_NEWMV] = { NEWMV, NEWMV },
+ [GLOBALMV_GLOBALMV] = { GLOBALMV, GLOBALMV },
+ [NEWMV_NEARESTMV] = { NEWMV, NEARESTMV },
+ [NEWMV_NEARMV] = { NEWMV, NEARMV },
+ [NEARESTMV_NEWMV] = { NEARESTMV, NEWMV },
+ [NEARMV_NEWMV] = { NEARMV, NEWMV },
+};
+
+const uint8_t dav1d_partition_type_count[N_BL_LEVELS] = {
+ [BL_128X128] = N_PARTITIONS - 3,
+ [BL_64X64] = N_PARTITIONS - 1,
+ [BL_32X32] = N_PARTITIONS - 1,
+ [BL_16X16] = N_PARTITIONS - 1,
+ [BL_8X8] = N_SUB8X8_PARTITIONS - 1,
+};
+
+const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40] = {
+ /* Intra2 */
+ IDTX, DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST,
+ /* Intra1 */
+ IDTX, DCT_DCT, V_DCT, H_DCT, ADST_ADST, ADST_DCT, DCT_ADST,
+ /* Inter2 */
+ IDTX, V_DCT, H_DCT, DCT_DCT, ADST_DCT, DCT_ADST, FLIPADST_DCT,
+ DCT_FLIPADST, ADST_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST,
+ /* Inter1 */
+ IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST, H_FLIPADST,
+ DCT_DCT, ADST_DCT, DCT_ADST, FLIPADST_DCT, DCT_FLIPADST,
+ ADST_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST,
+};
+
+const uint8_t dav1d_ymode_size_context[N_BS_SIZES] = {
+ [BS_128x128] = 3,
+ [BS_128x64] = 3,
+ [BS_64x128] = 3,
+ [BS_64x64] = 3,
+ [BS_64x32] = 3,
+ [BS_64x16] = 2,
+ [BS_32x64] = 3,
+ [BS_32x32] = 3,
+ [BS_32x16] = 2,
+ [BS_32x8 ] = 1,
+ [BS_16x64] = 2,
+ [BS_16x32] = 2,
+ [BS_16x16] = 2,
+ [BS_16x8 ] = 1,
+ [BS_16x4 ] = 0,
+ [BS_8x32 ] = 1,
+ [BS_8x16 ] = 1,
+ [BS_8x8 ] = 1,
+ [BS_8x4 ] = 0,
+ [BS_4x16 ] = 0,
+ [BS_4x8 ] = 0,
+ [BS_4x4 ] = 0,
+};
+
+const uint8_t dav1d_lo_ctx_offsets[3][5][5] = {
+ { /* w == h */
+ { 0, 1, 6, 6, 21 },
+ { 1, 6, 6, 21, 21 },
+ { 6, 6, 21, 21, 21 },
+ { 6, 21, 21, 21, 21 },
+ { 21, 21, 21, 21, 21 },
+ }, { /* w > h */
+ { 0, 16, 6, 6, 21 },
+ { 16, 16, 6, 21, 21 },
+ { 16, 16, 21, 21, 21 },
+ { 16, 16, 21, 21, 21 },
+ { 16, 16, 21, 21, 21 },
+ }, { /* w < h */
+ { 0, 11, 11, 11, 11 },
+ { 11, 11, 11, 11, 11 },
+ { 6, 6, 21, 21, 21 },
+ { 6, 21, 21, 21, 21 },
+ { 21, 21, 21, 21, 21 },
+ },
+};
+
+const uint8_t dav1d_skip_ctx[5][5] = {
+ { 1, 2, 2, 2, 3 },
+ { 2, 4, 4, 4, 5 },
+ { 2, 4, 4, 4, 5 },
+ { 2, 4, 4, 4, 5 },
+ { 3, 5, 5, 5, 6 },
+};
+
+const uint8_t /* enum TxClass */ dav1d_tx_type_class[N_TX_TYPES_PLUS_LL] = {
+ [DCT_DCT] = TX_CLASS_2D,
+ [ADST_DCT] = TX_CLASS_2D,
+ [DCT_ADST] = TX_CLASS_2D,
+ [ADST_ADST] = TX_CLASS_2D,
+ [FLIPADST_DCT] = TX_CLASS_2D,
+ [DCT_FLIPADST] = TX_CLASS_2D,
+ [FLIPADST_FLIPADST] = TX_CLASS_2D,
+ [ADST_FLIPADST] = TX_CLASS_2D,
+ [FLIPADST_ADST] = TX_CLASS_2D,
+ [IDTX] = TX_CLASS_2D,
+ [V_DCT] = TX_CLASS_V,
+ [H_DCT] = TX_CLASS_H,
+ [V_ADST] = TX_CLASS_V,
+ [H_ADST] = TX_CLASS_H,
+ [V_FLIPADST] = TX_CLASS_V,
+ [H_FLIPADST] = TX_CLASS_H,
+ [WHT_WHT] = TX_CLASS_2D,
+};
+
+const uint8_t /* enum Filter2d */ dav1d_filter_2d[DAV1D_N_FILTERS][DAV1D_N_FILTERS] = {
+ [DAV1D_FILTER_8TAP_REGULAR] = {
+ [DAV1D_FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_REGULAR,
+ [DAV1D_FILTER_8TAP_SHARP] = FILTER_2D_8TAP_REGULAR_SHARP,
+ [DAV1D_FILTER_8TAP_SMOOTH] = FILTER_2D_8TAP_REGULAR_SMOOTH,
+ }, [DAV1D_FILTER_8TAP_SHARP] = {
+ [DAV1D_FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_SHARP_REGULAR,
+ [DAV1D_FILTER_8TAP_SHARP] = FILTER_2D_8TAP_SHARP,
+ [DAV1D_FILTER_8TAP_SMOOTH] = FILTER_2D_8TAP_SHARP_SMOOTH,
+ }, [DAV1D_FILTER_8TAP_SMOOTH] = {
+ [DAV1D_FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_SMOOTH_REGULAR,
+ [DAV1D_FILTER_8TAP_SHARP] = FILTER_2D_8TAP_SMOOTH_SHARP,
+ [DAV1D_FILTER_8TAP_SMOOTH] = FILTER_2D_8TAP_SMOOTH,
+ }, [DAV1D_FILTER_BILINEAR] = {
+ [DAV1D_FILTER_BILINEAR] = FILTER_2D_BILINEAR,
+ }
+};
+
+const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2] = {
+ [FILTER_2D_8TAP_REGULAR] = { DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_REGULAR },
+ [FILTER_2D_8TAP_REGULAR_SMOOTH] = { DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_REGULAR },
+ [FILTER_2D_8TAP_REGULAR_SHARP] = { DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_REGULAR },
+ [FILTER_2D_8TAP_SHARP_REGULAR] = { DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SHARP },
+ [FILTER_2D_8TAP_SHARP_SMOOTH] = { DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SHARP },
+ [FILTER_2D_8TAP_SHARP] = { DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SHARP },
+ [FILTER_2D_8TAP_SMOOTH_REGULAR] = { DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SMOOTH },
+ [FILTER_2D_8TAP_SMOOTH] = { DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SMOOTH },
+ [FILTER_2D_8TAP_SMOOTH_SHARP] = { DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SMOOTH },
+ [FILTER_2D_BILINEAR] = { DAV1D_FILTER_BILINEAR, DAV1D_FILTER_BILINEAR },
+};
+
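For the 8-tap entries, the two tables above are inverses of each other: looking a pair up in dav1d_filter_2d and feeding the result to dav1d_filter_dir returns the same pair with the order swapped, e.g. dav1d_filter_2d[DAV1D_FILTER_8TAP_REGULAR][DAV1D_FILTER_8TAP_SHARP] is FILTER_2D_8TAP_REGULAR_SHARP, and dav1d_filter_dir[FILTER_2D_8TAP_REGULAR_SHARP] is { DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_REGULAR }. A sketch of that round trip (illustrative name; assumes both arguments are 8-tap modes):

static inline int filter_tables_roundtrip_sketch(const int a, const int b)
{
    const int f2d = dav1d_filter_2d[a][b];
    return dav1d_filter_dir[f2d][0] == b && dav1d_filter_dir[f2d][1] == a;
}
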
+const uint8_t dav1d_filter_mode_to_y_mode[5] = {
+ DC_PRED, VERT_PRED, HOR_PRED, HOR_DOWN_PRED, DC_PRED
+};
+
+const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES] = {
+ [DC_PRED] = 0,
+ [VERT_PRED] = 1,
+ [HOR_PRED] = 2,
+ [DIAG_DOWN_LEFT_PRED] = 3,
+ [DIAG_DOWN_RIGHT_PRED] = 4,
+ [VERT_RIGHT_PRED] = 4,
+ [HOR_DOWN_PRED] = 4,
+ [HOR_UP_PRED] = 4,
+ [VERT_LEFT_PRED] = 3,
+ [SMOOTH_PRED] = 0,
+ [SMOOTH_V_PRED] = 1,
+ [SMOOTH_H_PRED] = 2,
+ [PAETH_PRED] = 0,
+};
+
+const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES] = {
+ [BS_32x32] = 6,
+ [BS_32x16] = 5,
+ [BS_32x8] = 8,
+ [BS_16x32] = 4,
+ [BS_16x16] = 3,
+ [BS_16x8] = 2,
+ [BS_8x32] = 7,
+ [BS_8x16] = 1,
+ [BS_8x8] = 0,
+};
+
+const Dav1dWarpedMotionParams dav1d_default_wm_params = {
+ .type = DAV1D_WM_TYPE_IDENTITY,
+ .matrix = {
+ 0, 0, 1 << 16,
+ 0, 0, 1 << 16,
+ },
+ .u.p.alpha = 0,
+ .u.p.beta = 0,
+ .u.p.gamma = 0,
+ .u.p.delta = 0,
+};
+
+const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = {
+ { 1 * 12 + 0, 2 * 12 + 0 }, // 6
+ { 1 * 12 + 0, 2 * 12 - 1 }, // 7
+ { -1 * 12 + 1, -2 * 12 + 2 }, // 0
+ { 0 * 12 + 1, -1 * 12 + 2 }, // 1
+ { 0 * 12 + 1, 0 * 12 + 2 }, // 2
+ { 0 * 12 + 1, 1 * 12 + 2 }, // 3
+ { 1 * 12 + 1, 2 * 12 + 2 }, // 4
+ { 1 * 12 + 0, 2 * 12 + 1 }, // 5
+ { 1 * 12 + 0, 2 * 12 + 0 }, // 6
+ { 1 * 12 + 0, 2 * 12 - 1 }, // 7
+ { -1 * 12 + 1, -2 * 12 + 2 }, // 0
+ { 0 * 12 + 1, -1 * 12 + 2 }, // 1
+};
+
+const uint16_t ALIGN(dav1d_sgr_params[16][2], 4) = {
+ { 140, 3236 }, { 112, 2158 }, { 93, 1618 }, { 80, 1438 },
+ { 70, 1295 }, { 58, 1177 }, { 47, 1079 }, { 37, 996 },
+ { 30, 925 }, { 25, 863 }, { 0, 2589 }, { 0, 1618 },
+ { 0, 1177 }, { 0, 925 }, { 56, 0 }, { 22, 0 },
+};
+
+const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 64) = {
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17,
+ 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9,
+ 8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
+ 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0
+};
+
+const int8_t ALIGN(dav1d_mc_subpel_filters[6][15][8], 8) = {
+ [DAV1D_FILTER_8TAP_REGULAR] = {
+ { 0, 1, -3, 63, 4, -1, 0, 0 },
+ { 0, 1, -5, 61, 9, -2, 0, 0 },
+ { 0, 1, -6, 58, 14, -4, 1, 0 },
+ { 0, 1, -7, 55, 19, -5, 1, 0 },
+ { 0, 1, -7, 51, 24, -6, 1, 0 },
+ { 0, 1, -8, 47, 29, -6, 1, 0 },
+ { 0, 1, -7, 42, 33, -6, 1, 0 },
+ { 0, 1, -7, 38, 38, -7, 1, 0 },
+ { 0, 1, -6, 33, 42, -7, 1, 0 },
+ { 0, 1, -6, 29, 47, -8, 1, 0 },
+ { 0, 1, -6, 24, 51, -7, 1, 0 },
+ { 0, 1, -5, 19, 55, -7, 1, 0 },
+ { 0, 1, -4, 14, 58, -6, 1, 0 },
+ { 0, 0, -2, 9, 61, -5, 1, 0 },
+ { 0, 0, -1, 4, 63, -3, 1, 0 }
+ }, [DAV1D_FILTER_8TAP_SMOOTH] = {
+ { 0, 1, 14, 31, 17, 1, 0, 0 },
+ { 0, 0, 13, 31, 18, 2, 0, 0 },
+ { 0, 0, 11, 31, 20, 2, 0, 0 },
+ { 0, 0, 10, 30, 21, 3, 0, 0 },
+ { 0, 0, 9, 29, 22, 4, 0, 0 },
+ { 0, 0, 8, 28, 23, 5, 0, 0 },
+ { 0, -1, 8, 27, 24, 6, 0, 0 },
+ { 0, -1, 7, 26, 26, 7, -1, 0 },
+ { 0, 0, 6, 24, 27, 8, -1, 0 },
+ { 0, 0, 5, 23, 28, 8, 0, 0 },
+ { 0, 0, 4, 22, 29, 9, 0, 0 },
+ { 0, 0, 3, 21, 30, 10, 0, 0 },
+ { 0, 0, 2, 20, 31, 11, 0, 0 },
+ { 0, 0, 2, 18, 31, 13, 0, 0 },
+ { 0, 0, 1, 17, 31, 14, 1, 0 }
+ }, [DAV1D_FILTER_8TAP_SHARP] = {
+ { -1, 1, -3, 63, 4, -1, 1, 0 },
+ { -1, 3, -6, 62, 8, -3, 2, -1 },
+ { -1, 4, -9, 60, 13, -5, 3, -1 },
+ { -2, 5, -11, 58, 19, -7, 3, -1 },
+ { -2, 5, -11, 54, 24, -9, 4, -1 },
+ { -2, 5, -12, 50, 30, -10, 4, -1 },
+ { -2, 5, -12, 45, 35, -11, 5, -1 },
+ { -2, 6, -12, 40, 40, -12, 6, -2 },
+ { -1, 5, -11, 35, 45, -12, 5, -2 },
+ { -1, 4, -10, 30, 50, -12, 5, -2 },
+ { -1, 4, -9, 24, 54, -11, 5, -2 },
+ { -1, 3, -7, 19, 58, -11, 5, -2 },
+ { -1, 3, -5, 13, 60, -9, 4, -1 },
+ { -1, 2, -3, 8, 62, -6, 3, -1 },
+ { 0, 1, -1, 4, 63, -3, 1, -1 }
+ /* width <= 4 */
+ }, [3 + DAV1D_FILTER_8TAP_REGULAR] = {
+ { 0, 0, -2, 63, 4, -1, 0, 0 },
+ { 0, 0, -4, 61, 9, -2, 0, 0 },
+ { 0, 0, -5, 58, 14, -3, 0, 0 },
+ { 0, 0, -6, 55, 19, -4, 0, 0 },
+ { 0, 0, -6, 51, 24, -5, 0, 0 },
+ { 0, 0, -7, 47, 29, -5, 0, 0 },
+ { 0, 0, -6, 42, 33, -5, 0, 0 },
+ { 0, 0, -6, 38, 38, -6, 0, 0 },
+ { 0, 0, -5, 33, 42, -6, 0, 0 },
+ { 0, 0, -5, 29, 47, -7, 0, 0 },
+ { 0, 0, -5, 24, 51, -6, 0, 0 },
+ { 0, 0, -4, 19, 55, -6, 0, 0 },
+ { 0, 0, -3, 14, 58, -5, 0, 0 },
+ { 0, 0, -2, 9, 61, -4, 0, 0 },
+ { 0, 0, -1, 4, 63, -2, 0, 0 }
+ }, [3 + DAV1D_FILTER_8TAP_SMOOTH] = {
+ { 0, 0, 15, 31, 17, 1, 0, 0 },
+ { 0, 0, 13, 31, 18, 2, 0, 0 },
+ { 0, 0, 11, 31, 20, 2, 0, 0 },
+ { 0, 0, 10, 30, 21, 3, 0, 0 },
+ { 0, 0, 9, 29, 22, 4, 0, 0 },
+ { 0, 0, 8, 28, 23, 5, 0, 0 },
+ { 0, 0, 7, 27, 24, 6, 0, 0 },
+ { 0, 0, 6, 26, 26, 6, 0, 0 },
+ { 0, 0, 6, 24, 27, 7, 0, 0 },
+ { 0, 0, 5, 23, 28, 8, 0, 0 },
+ { 0, 0, 4, 22, 29, 9, 0, 0 },
+ { 0, 0, 3, 21, 30, 10, 0, 0 },
+ { 0, 0, 2, 20, 31, 11, 0, 0 },
+ { 0, 0, 2, 18, 31, 13, 0, 0 },
+ { 0, 0, 1, 17, 31, 15, 0, 0 }
+ /* Scaled bilinear is used very rarely, so we add one extra table entry
+ * here and reuse the put/prep_8tap_scaled code, which then acts as a
+ * scaled bilinear filter. */
+ }, [5] = {
+ { 0, 0, 0, 60, 4, 0, 0, 0 },
+ { 0, 0, 0, 56, 8, 0, 0, 0 },
+ { 0, 0, 0, 52, 12, 0, 0, 0 },
+ { 0, 0, 0, 48, 16, 0, 0, 0 },
+ { 0, 0, 0, 44, 20, 0, 0, 0 },
+ { 0, 0, 0, 40, 24, 0, 0, 0 },
+ { 0, 0, 0, 36, 28, 0, 0, 0 },
+ { 0, 0, 0, 32, 32, 0, 0, 0 },
+ { 0, 0, 0, 28, 36, 0, 0, 0 },
+ { 0, 0, 0, 24, 40, 0, 0, 0 },
+ { 0, 0, 0, 20, 44, 0, 0, 0 },
+ { 0, 0, 0, 16, 48, 0, 0, 0 },
+ { 0, 0, 0, 12, 52, 0, 0, 0 },
+ { 0, 0, 0, 8, 56, 0, 0, 0 },
+ { 0, 0, 0, 4, 60, 0, 0, 0 }
+ }
+};
+
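Every 8-tap row above sums to 64 (e.g. 1 - 3 + 63 + 4 - 1 = 64), so applying a row is a dot product followed by a divide-by-64 with rounding. A simplified 8bpc sketch with illustrative names (the real MC code handles bitdepth, intermediate precision and output clipping differently):

static inline int mc_8tap_px_sketch(const uint8_t *const src, /* src[-3..4] valid */
                                    const int8_t *const f)    /* one 8-tap row */
{
    int sum = 0;
    for (int k = 0; k < 8; k++)
        sum += f[k] * src[k - 3];  /* the largest taps hit src[0] and src[1] */
    return (sum + 32) >> 6;        /* the taps sum to 64 */
}
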
+const int8_t ALIGN(dav1d_mc_warp_filter[193][8], 8) = {
+ // [-1, 0)
+ { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, -1, 127, 2, 0, 0, 0, 0 },
+ { 1, -3, 127, 4, - 1, 0, 0, 0 }, { 1, -4, 126, 6, -2, 1, 0, 0 },
+ { 1, -5, 126, 8, - 3, 1, 0, 0 }, { 1, -6, 125, 11, -4, 1, 0, 0 },
+ { 1, -7, 124, 13, - 4, 1, 0, 0 }, { 2, -8, 123, 15, -5, 1, 0, 0 },
+ { 2, -9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, -6, 1, 0, 0 },
+ { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, -8, 2, 0, 0 },
+ { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, -9, 2, 0, 0 },
+ { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 },
+ { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 },
+ { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 },
+ { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 },
+ { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 },
+ { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 },
+ { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 },
+ { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 },
+ { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 },
+ { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 },
+ { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 },
+ { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 },
+ { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 },
+ { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 },
+ { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 },
+ { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 },
+ { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 },
+ { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 },
+ { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 },
+ { 2, -8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 },
+ { 2, -7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 },
+ { 1, -6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 },
+ { 1, -4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 },
+ { 1, -3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 },
+ { 0, -1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 },
+ // [0, 1)
+ { 0, 0, 0, 127, 1, 0, 0, 0 }, { 0, 0, -1, 127, 2, 0, 0, 0 },
+ { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -5, 127, 6, -2, 1, 0 },
+ { 0, 2, -6, 126, 8, -3, 1, 0 }, { -1, 2, -7, 126, 11, -4, 2, -1 },
+ { -1, 3, -8, 125, 13, -5, 2, -1 }, { -1, 3, -10, 124, 16, -6, 3, -1 },
+ { -1, 4, -11, 123, 18, -7, 3, -1 }, { -1, 4, -12, 122, 20, -7, 3, -1 },
+ { -1, 4, -13, 121, 23, -8, 3, -1 }, { -2, 5, -14, 120, 25, -9, 4, -1 },
+ { -1, 5, -15, 119, 27, -10, 4, -1 }, { -1, 5, -16, 118, 30, -11, 4, -1 },
+ { -2, 6, -17, 116, 33, -12, 5, -1 }, { -2, 6, -17, 114, 35, -12, 5, -1 },
+ { -2, 6, -18, 113, 38, -13, 5, -1 }, { -2, 7, -19, 111, 41, -14, 6, -2 },
+ { -2, 7, -19, 110, 43, -15, 6, -2 }, { -2, 7, -20, 108, 46, -15, 6, -2 },
+ { -2, 7, -20, 106, 49, -16, 6, -2 }, { -2, 7, -21, 104, 51, -16, 7, -2 },
+ { -2, 7, -21, 102, 54, -17, 7, -2 }, { -2, 8, -21, 100, 56, -18, 7, -2 },
+ { -2, 8, -22, 98, 59, -18, 7, -2 }, { -2, 8, -22, 96, 62, -19, 7, -2 },
+ { -2, 8, -22, 94, 64, -19, 7, -2 }, { -2, 8, -22, 91, 67, -20, 8, -2 },
+ { -2, 8, -22, 89, 69, -20, 8, -2 }, { -2, 8, -22, 87, 72, -21, 8, -2 },
+ { -2, 8, -21, 84, 74, -21, 8, -2 }, { -2, 8, -22, 82, 77, -21, 8, -2 },
+ { -2, 8, -21, 79, 79, -21, 8, -2 }, { -2, 8, -21, 77, 82, -22, 8, -2 },
+ { -2, 8, -21, 74, 84, -21, 8, -2 }, { -2, 8, -21, 72, 87, -22, 8, -2 },
+ { -2, 8, -20, 69, 89, -22, 8, -2 }, { -2, 8, -20, 67, 91, -22, 8, -2 },
+ { -2, 7, -19, 64, 94, -22, 8, -2 }, { -2, 7, -19, 62, 96, -22, 8, -2 },
+ { -2, 7, -18, 59, 98, -22, 8, -2 }, { -2, 7, -18, 56, 100, -21, 8, -2 },
+ { -2, 7, -17, 54, 102, -21, 7, -2 }, { -2, 7, -16, 51, 104, -21, 7, -2 },
+ { -2, 6, -16, 49, 106, -20, 7, -2 }, { -2, 6, -15, 46, 108, -20, 7, -2 },
+ { -2, 6, -15, 43, 110, -19, 7, -2 }, { -2, 6, -14, 41, 111, -19, 7, -2 },
+ { -1, 5, -13, 38, 113, -18, 6, -2 }, { -1, 5, -12, 35, 114, -17, 6, -2 },
+ { -1, 5, -12, 33, 116, -17, 6, -2 }, { -1, 4, -11, 30, 118, -16, 5, -1 },
+ { -1, 4, -10, 27, 119, -15, 5, -1 }, { -1, 4, -9, 25, 120, -14, 5, -2 },
+ { -1, 3, -8, 23, 121, -13, 4, -1 }, { -1, 3, -7, 20, 122, -12, 4, -1 },
+ { -1, 3, -7, 18, 123, -11, 4, -1 }, { -1, 3, -6, 16, 124, -10, 3, -1 },
+ { -1, 2, -5, 13, 125, -8, 3, -1 }, { -1, 2, -4, 11, 126, -7, 2, -1 },
+ { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -2, 6, 127, -5, 1, 0 },
+ { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, 0, 2, 127, -1, 0, 0 },
+ // [1, 2)
+ { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, -1, 127, 2, 0, 0 },
+ { 0, 0, 1, -3, 127, 4, -1, 0 }, { 0, 0, 1, -4, 126, 6, -2, 1 },
+ { 0, 0, 1, -5, 126, 8, -3, 1 }, { 0, 0, 1, -6, 125, 11, -4, 1 },
+ { 0, 0, 1, -7, 124, 13, -4, 1 }, { 0, 0, 2, -8, 123, 15, -5, 1 },
+ { 0, 0, 2, -9, 122, 18, -6, 1 }, { 0, 0, 2, -10, 121, 20, -6, 1 },
+ { 0, 0, 2, -11, 120, 22, -7, 2 }, { 0, 0, 2, -12, 119, 25, -8, 2 },
+ { 0, 0, 3, -13, 117, 27, -8, 2 }, { 0, 0, 3, -13, 116, 29, -9, 2 },
+ { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 },
+ { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 },
+ { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 },
+ { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 },
+ { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 },
+ { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 },
+ { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 },
+ { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 },
+ { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 },
+ { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 },
+ { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 },
+ { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 },
+ { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 },
+ { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 },
+ { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 },
+ { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 },
+ { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 },
+ { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 },
+ { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, -9, 29, 116, -13, 3 },
+ { 0, 0, 2, -8, 27, 117, -13, 3 }, { 0, 0, 2, -8, 25, 119, -12, 2 },
+ { 0, 0, 2, -7, 22, 120, -11, 2 }, { 0, 0, 1, -6, 20, 121, -10, 2 },
+ { 0, 0, 1, -6, 18, 122, -9, 2 }, { 0, 0, 1, -5, 15, 123, -8, 2 },
+ { 0, 0, 1, -4, 13, 124, -7, 1 }, { 0, 0, 1, -4, 11, 125, -6, 1 },
+ { 0, 0, 1, -3, 8, 126, -5, 1 }, { 0, 0, 1, -2, 6, 126, -4, 1 },
+ { 0, 0, 0, -1, 4, 127, -3, 1 }, { 0, 0, 0, 0, 2, 127, -1, 0 },
+ // dummy (replicate row index 191)
+ { 0, 0, 0, 0, 2, 127, -1, 0 },
+};
+
+const int8_t ALIGN(dav1d_resize_filter[64][8], 8) = {
+ { 0, 0, 0, -128, 0, 0, 0, 0 }, { 0, 0, 1, -128, -2, 1, 0, 0 },
+ { 0, -1, 3, -127, -4, 2, -1, 0 }, { 0, -1, 4, -127, -6, 3, -1, 0 },
+ { 0, -2, 6, -126, -8, 3, -1, 0 }, { 0, -2, 7, -125, -11, 4, -1, 0 },
+ { 1, -2, 8, -125, -13, 5, -2, 0 }, { 1, -3, 9, -124, -15, 6, -2, 0 },
+ { 1, -3, 10, -123, -18, 6, -2, 1 }, { 1, -3, 11, -122, -20, 7, -3, 1 },
+ { 1, -4, 12, -121, -22, 8, -3, 1 }, { 1, -4, 13, -120, -25, 9, -3, 1 },
+ { 1, -4, 14, -118, -28, 9, -3, 1 }, { 1, -4, 15, -117, -30, 10, -4, 1 },
+ { 1, -5, 16, -116, -32, 11, -4, 1 }, { 1, -5, 16, -114, -35, 12, -4, 1 },
+ { 1, -5, 17, -112, -38, 12, -4, 1 }, { 1, -5, 18, -111, -40, 13, -5, 1 },
+ { 1, -5, 18, -109, -43, 14, -5, 1 }, { 1, -6, 19, -107, -45, 14, -5, 1 },
+ { 1, -6, 19, -105, -48, 15, -5, 1 }, { 1, -6, 19, -103, -51, 16, -5, 1 },
+ { 1, -6, 20, -101, -53, 16, -6, 1 }, { 1, -6, 20, -99, -56, 17, -6, 1 },
+ { 1, -6, 20, -97, -58, 17, -6, 1 }, { 1, -6, 20, -95, -61, 18, -6, 1 },
+ { 2, -7, 20, -93, -64, 18, -6, 2 }, { 2, -7, 20, -91, -66, 19, -6, 1 },
+ { 2, -7, 20, -88, -69, 19, -6, 1 }, { 2, -7, 20, -86, -71, 19, -6, 1 },
+ { 2, -7, 20, -84, -74, 20, -7, 2 }, { 2, -7, 20, -81, -76, 20, -7, 1 },
+ { 2, -7, 20, -79, -79, 20, -7, 2 }, { 1, -7, 20, -76, -81, 20, -7, 2 },
+ { 2, -7, 20, -74, -84, 20, -7, 2 }, { 1, -6, 19, -71, -86, 20, -7, 2 },
+ { 1, -6, 19, -69, -88, 20, -7, 2 }, { 1, -6, 19, -66, -91, 20, -7, 2 },
+ { 2, -6, 18, -64, -93, 20, -7, 2 }, { 1, -6, 18, -61, -95, 20, -6, 1 },
+ { 1, -6, 17, -58, -97, 20, -6, 1 }, { 1, -6, 17, -56, -99, 20, -6, 1 },
+ { 1, -6, 16, -53, -101, 20, -6, 1 }, { 1, -5, 16, -51, -103, 19, -6, 1 },
+ { 1, -5, 15, -48, -105, 19, -6, 1 }, { 1, -5, 14, -45, -107, 19, -6, 1 },
+ { 1, -5, 14, -43, -109, 18, -5, 1 }, { 1, -5, 13, -40, -111, 18, -5, 1 },
+ { 1, -4, 12, -38, -112, 17, -5, 1 }, { 1, -4, 12, -35, -114, 16, -5, 1 },
+ { 1, -4, 11, -32, -116, 16, -5, 1 }, { 1, -4, 10, -30, -117, 15, -4, 1 },
+ { 1, -3, 9, -28, -118, 14, -4, 1 }, { 1, -3, 9, -25, -120, 13, -4, 1 },
+ { 1, -3, 8, -22, -121, 12, -4, 1 }, { 1, -3, 7, -20, -122, 11, -3, 1 },
+ { 1, -2, 6, -18, -123, 10, -3, 1 }, { 0, -2, 6, -15, -124, 9, -3, 1 },
+ { 0, -2, 5, -13, -125, 8, -2, 1 }, { 0, -1, 4, -11, -125, 7, -2, 0 },
+ { 0, -1, 3, -8, -126, 6, -2, 0 }, { 0, -1, 3, -6, -127, 4, -1, 0 },
+ { 0, -1, 2, -4, -127, 3, -1, 0 }, { 0, 0, 1, -2, -128, 1, 0, 0 },
+};
+
+const uint8_t ALIGN(dav1d_sm_weights[128], 16) = {
+ // Unused, because we always offset by bs, which is at least 2.
+ 0, 0,
+ // bs = 2
+ 255, 128,
+ // bs = 4
+ 255, 149, 85, 64,
+ // bs = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // bs = 16
+ 255, 225, 196, 170, 145, 123, 102, 84,
+ 68, 54, 43, 33, 26, 20, 17, 16,
+ // bs = 32
+ 255, 240, 225, 210, 196, 182, 169, 157,
+ 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25,
+ 21, 17, 14, 12, 10, 9, 8, 8,
+ // bs = 64
+ 255, 248, 240, 233, 225, 218, 210, 203,
+ 196, 189, 182, 176, 169, 163, 156, 150,
+ 144, 138, 133, 127, 121, 116, 111, 106,
+ 101, 96, 91, 86, 82, 77, 73, 69,
+ 65, 61, 57, 54, 50, 47, 44, 41,
+ 38, 35, 32, 29, 27, 25, 22, 20,
+ 18, 16, 15, 13, 12, 10, 9, 8,
+ 7, 6, 6, 5, 5, 4, 4, 4
+};
+
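As the comment above says, the weights for a given block dimension start at that offset: indices [2..3] hold bs = 2, [4..7] bs = 4, ..., [64..127] bs = 64, so the lookup is simply (illustrative name, not code from this patch):

static inline const uint8_t *sm_weights_sketch(const int bs /* 2, 4, ..., 64 */)
{
    return &dav1d_sm_weights[bs];
}
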
+const uint16_t dav1d_dr_intra_derivative[44] = {
+ // Values that are 0 will never be used
+ 0, // Angles:
+ 1023, 0, // 3, 93, 183
+ 547, // 6, 96, 186
+ 372, 0, 0, // 9, 99, 189
+ 273, // 14, 104, 194
+ 215, 0, // 17, 107, 197
+ 178, // 20, 110, 200
+ 151, 0, // 23, 113, 203 (113 & 203 are base angles)
+ 132, // 26, 116, 206
+ 116, 0, // 29, 119, 209
+ 102, 0, // 32, 122, 212
+ 90, // 36, 126, 216
+ 80, 0, // 39, 129, 219
+ 71, // 42, 132, 222
+ 64, 0, // 45, 135, 225 (45 & 135 are base angles)
+ 57, // 48, 138, 228
+ 51, 0, // 51, 141, 231
+ 45, 0, // 54, 144, 234
+ 40, // 58, 148, 238
+ 35, 0, // 61, 151, 241
+ 31, // 64, 154, 244
+ 27, 0, // 67, 157, 247 (67 & 157 are base angles)
+ 23, // 70, 160, 250
+ 19, 0, // 73, 163, 253
+ 15, 0, // 76, 166, 256
+ 11, 0, // 81, 171, 261
+ 7, // 84, 174, 264
+ 3 // 87, 177, 267
+};
+
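Matching the angle comments against the positions shows the table is laid out for an (angle >> 1) lookup: angle 3 -> [1] = 1023, angle 45 -> [22] = 64 (the 1:1 base angle), angle 87 -> [43] = 3; the zero entries are indices no supported angle maps to. A sketch, assuming that indexing (illustrative name):

static inline int dr_slope_sketch(const int angle /* 3..87, per the comments */)
{
    return dav1d_dr_intra_derivative[angle >> 1];
}
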
+#if ARCH_X86
+#define F(idx, f0, f1, f2, f3, f4, f5, f6) \
+ [2*idx+0] = f0, [2*idx+1] = f1, \
+ [2*idx+16] = f2, [2*idx+17] = f3, \
+ [2*idx+32] = f4, [2*idx+33] = f5, \
+ [2*idx+48] = f6
+#else
+#define F(idx, f0, f1, f2, f3, f4, f5, f6) \
+ [1*idx+0] = f0, [1*idx+8] = f1, \
+ [1*idx+16] = f2, [1*idx+24] = f3, \
+ [1*idx+32] = f4, [1*idx+40] = f5, \
+ [1*idx+48] = f6
+#endif
+const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 64) = {
+ {
+ F( 0, -6, 10, 0, 0, 0, 12, 0 ),
+ F( 1, -5, 2, 10, 0, 0, 9, 0 ),
+ F( 2, -3, 1, 1, 10, 0, 7, 0 ),
+ F( 3, -3, 1, 1, 2, 10, 5, 0 ),
+ F( 4, -4, 6, 0, 0, 0, 2, 12 ),
+ F( 5, -3, 2, 6, 0, 0, 2, 9 ),
+ F( 6, -3, 2, 2, 6, 0, 2, 7 ),
+ F( 7, -3, 1, 2, 2, 6, 3, 5 ),
+ }, {
+ F( 0, -10, 16, 0, 0, 0, 10, 0 ),
+ F( 1, -6, 0, 16, 0, 0, 6, 0 ),
+ F( 2, -4, 0, 0, 16, 0, 4, 0 ),
+ F( 3, -2, 0, 0, 0, 16, 2, 0 ),
+ F( 4, -10, 16, 0, 0, 0, 0, 10 ),
+ F( 5, -6, 0, 16, 0, 0, 0, 6 ),
+ F( 6, -4, 0, 0, 16, 0, 0, 4 ),
+ F( 7, -2, 0, 0, 0, 16, 0, 2 ),
+ }, {
+ F( 0, -8, 8, 0, 0, 0, 16, 0 ),
+ F( 1, -8, 0, 8, 0, 0, 16, 0 ),
+ F( 2, -8, 0, 0, 8, 0, 16, 0 ),
+ F( 3, -8, 0, 0, 0, 8, 16, 0 ),
+ F( 4, -4, 4, 0, 0, 0, 0, 16 ),
+ F( 5, -4, 0, 4, 0, 0, 0, 16 ),
+ F( 6, -4, 0, 0, 4, 0, 0, 16 ),
+ F( 7, -4, 0, 0, 0, 4, 0, 16 ),
+ }, {
+ F( 0, -2, 8, 0, 0, 0, 10, 0 ),
+ F( 1, -1, 3, 8, 0, 0, 6, 0 ),
+ F( 2, -1, 2, 3, 8, 0, 4, 0 ),
+ F( 3, 0, 1, 2, 3, 8, 2, 0 ),
+ F( 4, -1, 4, 0, 0, 0, 3, 10 ),
+ F( 5, -1, 3, 4, 0, 0, 4, 6 ),
+ F( 6, -1, 2, 3, 4, 0, 4, 4 ),
+ F( 7, -1, 2, 2, 3, 4, 3, 3 ),
+ }, {
+ F( 0, -12, 14, 0, 0, 0, 14, 0 ),
+ F( 1, -10, 0, 14, 0, 0, 12, 0 ),
+ F( 2, -9, 0, 0, 14, 0, 11, 0 ),
+ F( 3, -8, 0, 0, 0, 14, 10, 0 ),
+ F( 4, -10, 12, 0, 0, 0, 0, 14 ),
+ F( 5, -9, 1, 12, 0, 0, 0, 12 ),
+ F( 6, -8, 0, 0, 12, 0, 1, 11 ),
+ F( 7, -7, 0, 0, 1, 12, 1, 9 ),
+ }
+};
+
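The F() macro above scatters the seven taps of each filter-intra row into two different memory layouts (interleaved pairs on x86, one 8-wide column per tap elsewhere), presumably to match the respective asm. The non-x86 branch is equivalent to this sketch (illustrative name, not code from this patch):

static inline void filter_intra_store_sketch(int8_t *const dst /* one 64-byte set */,
                                             const int idx /* 0..7 */,
                                             const int8_t taps[7])
{
    for (int t = 0; t < 7; t++)
        dst[idx + 8 * t] = taps[t];  /* [1*idx+0], [1*idx+8], ..., [1*idx+48] */
}
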
+const uint8_t ALIGN(dav1d_obmc_masks[64], 16) = {
+ /* Unused */
+ 0, 0,
+ /* 2 */
+ 19, 0,
+ /* 4 */
+ 25, 14, 5, 0,
+ /* 8 */
+ 28, 22, 16, 11, 7, 3, 0, 0,
+ /* 16 */
+ 30, 27, 24, 21, 18, 15, 12, 10, 8, 6, 4, 3, 0, 0, 0, 0,
+ /* 32 */
+ 31, 29, 28, 26, 24, 23, 21, 20, 19, 17, 16, 14, 13, 12, 11, 9,
+ 8, 7, 6, 5, 4, 4, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+// Taken from the spec. Range is [-2048, 2047], mean is 0 and stddev is 512
+const int16_t dav1d_gaussian_sequence[2048] = {
+ 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820,
+ 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800,
+ 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588,
+ -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368,
+ 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4,
+ 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396,
+ 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740,
+ 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292,
+ 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532,
+ 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704,
+ 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96,
+ -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244,
+ 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136,
+ 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676,
+ -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400,
+ -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844,
+ -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96,
+ -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356,
+ 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280,
+ 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808,
+ 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228,
+ -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136,
+ -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264,
+ -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388,
+ 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500,
+ 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384,
+ 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220,
+ -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148,
+ 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572,
+ -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516,
+ 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916,
+ -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492,
+ 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560,
+ -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108,
+ -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516,
+ -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88,
+ -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196,
+ -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864,
+ 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920,
+ 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564,
+ -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876,
+ -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244,
+ 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184,
+ 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364,
+ -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72,
+ 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24,
+ 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4,
+ -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120,
+ 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108,
+ -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296,
+ 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336,
+ -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164,
+ -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264,
+ 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536,
+ -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296,
+ -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696,
+ 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204,
+ 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212,
+ -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40,
+ 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384,
+ 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8,
+ 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704,
+ -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348,
+ -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592,
+ -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420,
+ 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220,
+ -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208,
+ -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544,
+ -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288,
+ -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240,
+ -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132,
+ 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16,
+ -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044,
+ -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732,
+ 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460,
+ -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52,
+ -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104,
+ -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460,
+ 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716,
+ -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960,
+ 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476,
+ 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692,
+ 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352,
+ -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144,
+ -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44,
+ 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356,
+ 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452,
+ -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552,
+ -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264,
+ -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448,
+ -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588,
+ 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464,
+ 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216,
+ 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132,
+ 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412,
+ 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48,
+ 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196,
+ 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48,
+ -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292,
+ 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32,
+ -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012,
+ -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120,
+ -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56,
+ 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416,
+ -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404,
+ -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92,
+ 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904,
+ 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728,
+ 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584,
+ 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48,
+ 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180,
+ 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528,
+ 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364,
+ -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260,
+ -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324,
+ -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64,
+ 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120,
+ -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168,
+ -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888,
+ 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588,
+ -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484,
+ 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580,
+ 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392,
+ 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80,
+ -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688,
+ 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4,
+ -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300,
+ 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444,
+ 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192,
+ 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160,
+ 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188,
+ -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404,
+ -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400,
+ 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92,
+ -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824,
+ 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620,
+ 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720,
+ 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620,
+ -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508,
+ -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736,
+ 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836,
+ 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180,
+ 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140,
+ -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32,
+ -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916,
+ 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368,
+ -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380,
+ -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572,
+ -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864,
+ 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908,
+ -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84,
+ 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396,
+ -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360,
+ 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928,
+ -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288,
+ 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196,
+ 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504,
+ 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272,
+ 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344,
+ -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208,
+ -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156,
+ -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240,
+ -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432,
+ 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244,
+ 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584,
+ 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24,
+ 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300,
+ -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416,
+ 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380,
+ -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384,
+ 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88,
+ 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876,
+ -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320,
+ -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88,
+ -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196,
+ -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120,
+ 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664,
+ -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0,
+ -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264,
+ -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288,
+ -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56,
+ 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148,
+ 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156,
+ -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144,
+ -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148,
+ 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944,
+ 428, -484
+};
diff --git a/third_party/dav1d/src/tables.h b/third_party/dav1d/src/tables.h
new file mode 100644
index 0000000000..f3c00cfb00
--- /dev/null
+++ b/third_party/dav1d/src/tables.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_TABLES_H
+#define DAV1D_SRC_TABLES_H
+
+#include <stdint.h>
+
+#include "common/intops.h"
+
+#include "src/levels.h"
+
+EXTERN const uint8_t dav1d_al_part_ctx[2][N_BL_LEVELS][N_PARTITIONS];
+EXTERN const uint8_t /* enum BlockSize */
+ dav1d_block_sizes[N_BL_LEVELS][N_PARTITIONS][2];
+// width, height (in 4px blocks), log2 versions of these two
+EXTERN const uint8_t dav1d_block_dimensions[N_BS_SIZES][4];
+typedef struct TxfmInfo {
+ // width, height (in 4px blocks), log2 of them, min/max of log2, sub, ctx
+ uint8_t w, h, lw, lh, min, max, sub, ctx;
+} TxfmInfo;
+EXTERN const TxfmInfo dav1d_txfm_dimensions[N_RECT_TX_SIZES];
+EXTERN const uint8_t /* enum (Rect)TxfmSize */
+ dav1d_max_txfm_size_for_bs[N_BS_SIZES][4 /* y, 420, 422, 444 */];
+EXTERN const uint8_t /* enum TxfmType */
+ dav1d_txtp_from_uvmode[N_UV_INTRA_PRED_MODES];
+
+EXTERN const uint8_t /* enum InterPredMode */
+ dav1d_comp_inter_pred_modes[N_COMP_INTER_PRED_MODES][2];
+
+EXTERN const uint8_t dav1d_partition_type_count[N_BL_LEVELS];
+EXTERN const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40];
+
+EXTERN const uint8_t dav1d_filter_mode_to_y_mode[5];
+EXTERN const uint8_t dav1d_ymode_size_context[N_BS_SIZES];
+EXTERN const uint8_t dav1d_lo_ctx_offsets[3][5][5];
+EXTERN const uint8_t dav1d_skip_ctx[5][5];
+EXTERN const uint8_t /* enum TxClass */
+ dav1d_tx_type_class[N_TX_TYPES_PLUS_LL];
+EXTERN const uint8_t /* enum Filter2d */
+ dav1d_filter_2d[DAV1D_N_FILTERS /* h */][DAV1D_N_FILTERS /* v */];
+EXTERN const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2];
+EXTERN const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES];
+EXTERN const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES];
+
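+// The masks below are bitfields indexed by enum BlockSize: the corresponding
+// tool is allowed for a given block size iff bit (1 << bs) is set.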
+static const unsigned cfl_allowed_mask =
+ (1 << BS_32x32) |
+ (1 << BS_32x16) |
+ (1 << BS_32x8) |
+ (1 << BS_16x32) |
+ (1 << BS_16x16) |
+ (1 << BS_16x8) |
+ (1 << BS_16x4) |
+ (1 << BS_8x32) |
+ (1 << BS_8x16) |
+ (1 << BS_8x8) |
+ (1 << BS_8x4) |
+ (1 << BS_4x16) |
+ (1 << BS_4x8) |
+ (1 << BS_4x4);
+
+static const unsigned wedge_allowed_mask =
+ (1 << BS_32x32) |
+ (1 << BS_32x16) |
+ (1 << BS_32x8) |
+ (1 << BS_16x32) |
+ (1 << BS_16x16) |
+ (1 << BS_16x8) |
+ (1 << BS_8x32) |
+ (1 << BS_8x16) |
+ (1 << BS_8x8);
+
+static const unsigned interintra_allowed_mask =
+ (1 << BS_32x32) |
+ (1 << BS_32x16) |
+ (1 << BS_16x32) |
+ (1 << BS_16x16) |
+ (1 << BS_16x8) |
+ (1 << BS_8x16) |
+ (1 << BS_8x8);
+
+EXTERN const Dav1dWarpedMotionParams dav1d_default_wm_params;
+
+EXTERN const int8_t dav1d_cdef_directions[12][2];
+
+EXTERN const uint16_t dav1d_sgr_params[16][2];
+EXTERN const uint8_t dav1d_sgr_x_by_x[256];
+
+EXTERN const int8_t dav1d_mc_subpel_filters[6][15][8];
+EXTERN const int8_t dav1d_mc_warp_filter[193][8];
+EXTERN const int8_t dav1d_resize_filter[64][8];
+
+EXTERN const uint8_t dav1d_sm_weights[128];
+EXTERN const uint16_t dav1d_dr_intra_derivative[44];
+EXTERN const int8_t dav1d_filter_intra_taps[5][64];
+
+EXTERN const uint8_t dav1d_obmc_masks[64];
+
+EXTERN const int16_t dav1d_gaussian_sequence[2048]; // for fgs
+
+#endif /* DAV1D_SRC_TABLES_H */
diff --git a/third_party/dav1d/src/thread.h b/third_party/dav1d/src/thread.h
new file mode 100644
index 0000000000..c44de736c3
--- /dev/null
+++ b/third_party/dav1d/src/thread.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_THREAD_H
+#define DAV1D_SRC_THREAD_H
+
+#if defined(_WIN32)
+
+#include <limits.h>
+#include <windows.h>
+
+#define PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT
+#define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT
+
+typedef struct {
+ HANDLE h;
+ void *(*func)(void*);
+ void *arg;
+} pthread_t;
+
+typedef struct {
+ unsigned stack_size;
+} pthread_attr_t;
+
+typedef SRWLOCK pthread_mutex_t;
+typedef CONDITION_VARIABLE pthread_cond_t;
+typedef INIT_ONCE pthread_once_t;
+
+void dav1d_init_thread(void);
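+// On Windows, thread names are wide strings; the macro below widens the
+// string literal passed by callers (L##name) before calling the function.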
+void dav1d_set_thread_name(const wchar_t *name);
+#define dav1d_set_thread_name(name) dav1d_set_thread_name(L##name)
+
+int dav1d_pthread_create(pthread_t *thread, const pthread_attr_t *attr,
+ void *(*func)(void*), void *arg);
+int dav1d_pthread_join(pthread_t *thread, void **res);
+int dav1d_pthread_once(pthread_once_t *once_control,
+ void (*init_routine)(void));
+
+#define pthread_create dav1d_pthread_create
+#define pthread_join(thread, res) dav1d_pthread_join(&(thread), res)
+#define pthread_once dav1d_pthread_once
+
+static inline int pthread_attr_init(pthread_attr_t *const attr) {
+ attr->stack_size = 0;
+ return 0;
+}
+
+static inline int pthread_attr_destroy(pthread_attr_t *const attr) {
+ return 0;
+}
+
+static inline int pthread_attr_setstacksize(pthread_attr_t *const attr,
+ const size_t stack_size)
+{
+ if (stack_size > UINT_MAX) return 1;
+ attr->stack_size = (unsigned) stack_size;
+ return 0;
+}
+
+static inline int pthread_mutex_init(pthread_mutex_t *const mutex,
+ const void *const attr)
+{
+ InitializeSRWLock(mutex);
+ return 0;
+}
+
+static inline int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+ return 0;
+}
+
+static inline int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+ AcquireSRWLockExclusive(mutex);
+ return 0;
+}
+
+static inline int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+ ReleaseSRWLockExclusive(mutex);
+ return 0;
+}
+
+static inline int pthread_cond_init(pthread_cond_t *const cond,
+ const void *const attr)
+{
+ InitializeConditionVariable(cond);
+ return 0;
+}
+
+static inline int pthread_cond_destroy(pthread_cond_t *const cond) {
+ return 0;
+}
+
+static inline int pthread_cond_wait(pthread_cond_t *const cond,
+ pthread_mutex_t *const mutex)
+{
+ return !SleepConditionVariableSRW(cond, mutex, INFINITE, 0);
+}
+
+static inline int pthread_cond_signal(pthread_cond_t *const cond) {
+ WakeConditionVariable(cond);
+ return 0;
+}
+
+static inline int pthread_cond_broadcast(pthread_cond_t *const cond) {
+ WakeAllConditionVariable(cond);
+ return 0;
+}
+
+#else
+
+#include <pthread.h>
+
+#define dav1d_init_thread() do {} while (0)
+
+/* Thread naming support */
+
+#ifdef __linux__
+
+#include <sys/prctl.h>
+
+static inline void dav1d_set_thread_name(const char *const name) {
+ prctl(PR_SET_NAME, name);
+}
+
+#elif defined(__APPLE__)
+
+static inline void dav1d_set_thread_name(const char *const name) {
+ pthread_setname_np(name);
+}
+
+#elif defined(__DragonFly__) || defined(__FreeBSD__) || defined(__OpenBSD__)
+
+#if defined(__FreeBSD__)
+ /* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */
+#define _SYS_PARAM_H_
+#include <sys/types.h>
+#endif
+#include <pthread_np.h>
+
+static inline void dav1d_set_thread_name(const char *const name) {
+ pthread_set_name_np(pthread_self(), name);
+}
+
+#elif defined(__NetBSD__)
+
+static inline void dav1d_set_thread_name(const char *const name) {
+ pthread_setname_np(pthread_self(), "%s", (void*)name);
+}
+
+#elif defined(__HAIKU__)
+
+#include <os/kernel/OS.h>
+
+static inline void dav1d_set_thread_name(const char *const name) {
+ rename_thread(find_thread(NULL), name);
+}
+
+#else
+
+#define dav1d_set_thread_name(name) do {} while (0)
+
+#endif
+
+#endif
+
+#endif /* DAV1D_SRC_THREAD_H */
diff --git a/third_party/dav1d/src/thread_data.h b/third_party/dav1d/src/thread_data.h
new file mode 100644
index 0000000000..62814e6348
--- /dev/null
+++ b/third_party/dav1d/src/thread_data.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_THREAD_DATA_H
+#define DAV1D_SRC_THREAD_DATA_H
+
+#include "src/thread.h"
+
+struct thread_data {
+ pthread_t thread;
+ pthread_cond_t cond;
+ pthread_mutex_t lock;
+ int inited;
+};
+
+#endif /* DAV1D_SRC_THREAD_DATA_H */
diff --git a/third_party/dav1d/src/thread_task.c b/third_party/dav1d/src/thread_task.c
new file mode 100644
index 0000000000..1ededde43c
--- /dev/null
+++ b/third_party/dav1d/src/thread_task.c
@@ -0,0 +1,936 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/frame.h"
+
+#include "src/thread_task.h"
+#include "src/fg_apply.h"
+
+// This function resets the cur pointer to the first frame theoretically
+// executable after a task completed (i.e. each time we update some progress or
+// insert tasks into the queue).
+// When frame_idx is set, it can come either from a completed task or from
+// tasks inserted into the queue, in which case we have to make sure the cur
+// pointer isn't past this insert.
+// The special case where frame_idx is UINT_MAX handles the reset after
+// completing a task and locklessly signaling progress. In that case we don't
+// enter the critical section that this function requires, so we set an atomic
+// for delayed handling, which happens here. This means the function can be
+// called without any actual update other than what's in the atomic, hence
+// the special case.
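+// Example (hypothetical state): with n_fc == 8, first == 3 and a task
+// completing on frame_idx == 5 while ttd->cur points past it, the code below
+// rewinds ttd->cur to 5 - 3 == 2, so scheduling resumes at frame
+// (3 + 2) % 8 == 5 (or the next queued frame after it).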
+static inline int reset_task_cur(const Dav1dContext *const c,
+ struct TaskThreadData *const ttd,
+ unsigned frame_idx)
+{
+ const unsigned first = atomic_load(&ttd->first);
+ unsigned reset_frame_idx = atomic_exchange(&ttd->reset_task_cur, UINT_MAX);
+ if (reset_frame_idx < first) {
+ if (frame_idx == UINT_MAX) return 0;
+ reset_frame_idx = UINT_MAX;
+ }
+ if (!ttd->cur && c->fc[first].task_thread.task_cur_prev == NULL)
+ return 0;
+ if (reset_frame_idx != UINT_MAX) {
+ if (frame_idx == UINT_MAX) {
+ if (reset_frame_idx > first + ttd->cur)
+ return 0;
+ ttd->cur = reset_frame_idx - first;
+ goto cur_found;
+ }
+ } else if (frame_idx == UINT_MAX)
+ return 0;
+ if (frame_idx < first) frame_idx += c->n_fc;
+ const unsigned min_frame_idx = umin(reset_frame_idx, frame_idx);
+ const unsigned cur_frame_idx = first + ttd->cur;
+ if (ttd->cur < c->n_fc && cur_frame_idx < min_frame_idx)
+ return 0;
+ for (ttd->cur = min_frame_idx - first; ttd->cur < c->n_fc; ttd->cur++)
+ if (c->fc[(first + ttd->cur) % c->n_fc].task_thread.task_head)
+ break;
+cur_found:
+ for (unsigned i = ttd->cur; i < c->n_fc; i++)
+ c->fc[(first + i) % c->n_fc].task_thread.task_cur_prev = NULL;
+ return 1;
+}
+
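+// Lock-free variant used when progress is signaled without holding ttd->lock:
+// atomically lowers ttd->reset_task_cur towards frame_idx (an atomic minimum
+// built from exchange), to be picked up by the next reset_task_cur() call.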
+static inline void reset_task_cur_async(struct TaskThreadData *const ttd,
+ unsigned frame_idx, unsigned n_frames)
+{
+ const unsigned first = atomic_load(&ttd->first);
+ if (frame_idx < first) frame_idx += n_frames;
+ unsigned last_idx = frame_idx;
+ do {
+ frame_idx = last_idx;
+ last_idx = atomic_exchange(&ttd->reset_task_cur, frame_idx);
+ } while (last_idx < frame_idx);
+ if (frame_idx == first && atomic_load(&ttd->first) != first) {
+ unsigned expected = frame_idx;
+ atomic_compare_exchange_strong(&ttd->reset_task_cur, &expected, UINT_MAX);
+ }
+}
+
+static void insert_tasks_between(Dav1dFrameContext *const f,
+ Dav1dTask *const first, Dav1dTask *const last,
+ Dav1dTask *const a, Dav1dTask *const b,
+ const int cond_signal)
+{
+ struct TaskThreadData *const ttd = f->task_thread.ttd;
+ if (atomic_load(f->c->flush)) return;
+ assert(!a || a->next == b);
+ if (!a) f->task_thread.task_head = first;
+ else a->next = first;
+ if (!b) f->task_thread.task_tail = last;
+ last->next = b;
+ reset_task_cur(f->c, ttd, first->frame_idx);
+ if (cond_signal && !atomic_fetch_or(&ttd->cond_signaled, 1))
+ pthread_cond_signal(&ttd->cond);
+}
+
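+// Insert the chain [first..last] into the frame's task list, which is kept
+// sorted: tile entropy tasks ahead of reconstruction/filter work, then by
+// sbrow, then by task type, with ties broken by tile index.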
+static void insert_tasks(Dav1dFrameContext *const f,
+ Dav1dTask *const first, Dav1dTask *const last,
+ const int cond_signal)
+{
+ // insert task back into task queue
+ Dav1dTask *t_ptr, *prev_t = NULL;
+ for (t_ptr = f->task_thread.task_head;
+ t_ptr; prev_t = t_ptr, t_ptr = t_ptr->next)
+ {
+ // entropy coding precedes other steps
+ if (t_ptr->type == DAV1D_TASK_TYPE_TILE_ENTROPY) {
+ if (first->type > DAV1D_TASK_TYPE_TILE_ENTROPY) continue;
+ // both are entropy
+ if (first->sby > t_ptr->sby) continue;
+ if (first->sby < t_ptr->sby) {
+ insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
+ return;
+ }
+ // same sby
+ } else {
+ if (first->type == DAV1D_TASK_TYPE_TILE_ENTROPY) {
+ insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
+ return;
+ }
+ if (first->sby > t_ptr->sby) continue;
+ if (first->sby < t_ptr->sby) {
+ insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
+ return;
+ }
+ // same sby
+ if (first->type > t_ptr->type) continue;
+ if (first->type < t_ptr->type) {
+ insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
+ return;
+ }
+ // same task type
+ }
+
+ // sort by tile-id
+ assert(first->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION ||
+ first->type == DAV1D_TASK_TYPE_TILE_ENTROPY);
+ assert(first->type == t_ptr->type);
+ assert(t_ptr->sby == first->sby);
+ const int p = first->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
+ const int t_tile_idx = (int) (first - f->task_thread.tile_tasks[p]);
+ const int p_tile_idx = (int) (t_ptr - f->task_thread.tile_tasks[p]);
+ assert(t_tile_idx != p_tile_idx);
+ if (t_tile_idx > p_tile_idx) continue;
+ insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
+ return;
+ }
+ // append at the end
+ insert_tasks_between(f, first, last, prev_t, NULL, cond_signal);
+}
+
+static inline void insert_task(Dav1dFrameContext *const f,
+ Dav1dTask *const t, const int cond_signal)
+{
+ insert_tasks(f, t, t, cond_signal);
+}
+
+static inline void add_pending(Dav1dFrameContext *const f, Dav1dTask *const t) {
+ pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
+ t->next = NULL;
+ if (!f->task_thread.pending_tasks.head)
+ f->task_thread.pending_tasks.head = t;
+ else
+ f->task_thread.pending_tasks.tail->next = t;
+ f->task_thread.pending_tasks.tail = t;
+ atomic_store(&f->task_thread.pending_tasks.merge, 1);
+ pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
+}
+
+static inline int merge_pending_frame(Dav1dFrameContext *const f) {
+ int const merge = atomic_load(&f->task_thread.pending_tasks.merge);
+ if (merge) {
+ pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
+ Dav1dTask *t = f->task_thread.pending_tasks.head;
+ f->task_thread.pending_tasks.head = NULL;
+ f->task_thread.pending_tasks.tail = NULL;
+ atomic_store(&f->task_thread.pending_tasks.merge, 0);
+ pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
+ while (t) {
+ Dav1dTask *const tmp = t->next;
+ insert_task(f, t, 0);
+ t = tmp;
+ }
+ }
+ return merge;
+}
+
+static inline int merge_pending(const Dav1dContext *const c) {
+ int res = 0;
+ for (unsigned i = 0; i < c->n_fc; i++)
+ res |= merge_pending_frame(&c->fc[i]);
+ return res;
+}
+
+static int create_filter_sbrow(Dav1dFrameContext *const f,
+ const int pass, Dav1dTask **res_t)
+{
+ const int has_deblock = f->frame_hdr->loopfilter.level_y[0] ||
+ f->frame_hdr->loopfilter.level_y[1];
+ const int has_cdef = f->seq_hdr->cdef;
+ const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
+ const int has_lr = f->lf.restore_planes;
+
+ Dav1dTask *tasks = f->task_thread.tasks;
+ const int uses_2pass = f->c->n_fc > 1;
+ int num_tasks = f->sbh * (1 + uses_2pass);
+ if (num_tasks > f->task_thread.num_tasks) {
+ const size_t size = sizeof(Dav1dTask) * num_tasks;
+ tasks = dav1d_realloc(ALLOC_COMMON_CTX, f->task_thread.tasks, size);
+ if (!tasks) return -1;
+ memset(tasks, 0, size);
+ f->task_thread.tasks = tasks;
+ f->task_thread.num_tasks = num_tasks;
+ }
+ tasks += f->sbh * (pass & 1);
+
+ if (pass & 1) {
+ f->frame_thread.entropy_progress = 0;
+ } else {
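+ // frame_progress and copy_lpf_progress are bitmaps with one bit per
+ // sbrow, rounded up to whole 32-bit words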
+ const int prog_sz = ((f->sbh + 31) & ~31) >> 5;
+ if (prog_sz > f->frame_thread.prog_sz) {
+ atomic_uint *const prog = dav1d_realloc(ALLOC_COMMON_CTX, f->frame_thread.frame_progress,
+ 2 * prog_sz * sizeof(*prog));
+ if (!prog) return -1;
+ f->frame_thread.frame_progress = prog;
+ f->frame_thread.copy_lpf_progress = prog + prog_sz;
+ }
+ f->frame_thread.prog_sz = prog_sz;
+ memset(f->frame_thread.frame_progress, 0, prog_sz * sizeof(atomic_uint));
+ memset(f->frame_thread.copy_lpf_progress, 0, prog_sz * sizeof(atomic_uint));
+ atomic_store(&f->frame_thread.deblock_progress, 0);
+ }
+ f->frame_thread.next_tile_row[pass & 1] = 0;
+
+ Dav1dTask *t = &tasks[0];
+ t->sby = 0;
+ t->recon_progress = 1;
+ t->deblock_progress = 0;
+ t->type = pass == 1 ? DAV1D_TASK_TYPE_ENTROPY_PROGRESS :
+ has_deblock ? DAV1D_TASK_TYPE_DEBLOCK_COLS :
+ has_cdef || has_lr /* i.e. LR backup */ ? DAV1D_TASK_TYPE_DEBLOCK_ROWS :
+ has_resize ? DAV1D_TASK_TYPE_SUPER_RESOLUTION :
+ DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS;
+ t->frame_idx = (int)(f - f->c->fc);
+
+ *res_t = t;
+ return 0;
+}
+
+int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
+ const int cond_signal)
+{
+ Dav1dTask *tasks = f->task_thread.tile_tasks[0];
+ const int uses_2pass = f->c->n_fc > 1;
+ const int num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
+ if (pass < 2) {
+ int alloc_num_tasks = num_tasks * (1 + uses_2pass);
+ if (alloc_num_tasks > f->task_thread.num_tile_tasks) {
+ const size_t size = sizeof(Dav1dTask) * alloc_num_tasks;
+ tasks = dav1d_realloc(ALLOC_COMMON_CTX, f->task_thread.tile_tasks[0], size);
+ if (!tasks) return -1;
+ memset(tasks, 0, size);
+ f->task_thread.tile_tasks[0] = tasks;
+ f->task_thread.num_tile_tasks = alloc_num_tasks;
+ }
+ f->task_thread.tile_tasks[1] = tasks + num_tasks;
+ }
+ tasks += num_tasks * (pass & 1);
+
+ Dav1dTask *pf_t;
+ if (create_filter_sbrow(f, pass, &pf_t))
+ return -1;
+
+ Dav1dTask *prev_t = NULL;
+ for (int tile_idx = 0; tile_idx < num_tasks; tile_idx++) {
+ Dav1dTileState *const ts = &f->ts[tile_idx];
+ Dav1dTask *t = &tasks[tile_idx];
+ t->sby = ts->tiling.row_start >> f->sb_shift;
+ if (pf_t && t->sby) {
+ prev_t->next = pf_t;
+ prev_t = pf_t;
+ pf_t = NULL;
+ }
+ t->recon_progress = 0;
+ t->deblock_progress = 0;
+ t->deps_skip = 0;
+ t->type = pass != 1 ? DAV1D_TASK_TYPE_TILE_RECONSTRUCTION :
+ DAV1D_TASK_TYPE_TILE_ENTROPY;
+ t->frame_idx = (int)(f - f->c->fc);
+ if (prev_t) prev_t->next = t;
+ prev_t = t;
+ }
+ if (pf_t) {
+ prev_t->next = pf_t;
+ prev_t = pf_t;
+ }
+ prev_t->next = NULL;
+
+ atomic_store(&f->task_thread.done[pass & 1], 0);
+
+ // XXX in theory this could be done locklessly; at this point there are no
+ // tasks in the frameQ, so no other runner should be using this lock, but
+ // we must add both passes at once
+ pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
+ assert(f->task_thread.pending_tasks.head == NULL || pass == 2);
+ if (!f->task_thread.pending_tasks.head)
+ f->task_thread.pending_tasks.head = &tasks[0];
+ else
+ f->task_thread.pending_tasks.tail->next = &tasks[0];
+ f->task_thread.pending_tasks.tail = prev_t;
+ atomic_store(&f->task_thread.pending_tasks.merge, 1);
+ atomic_store(&f->task_thread.init_done, 1);
+ pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
+
+ return 0;
+}
+
+void dav1d_task_frame_init(Dav1dFrameContext *const f) {
+ const Dav1dContext *const c = f->c;
+
+ atomic_store(&f->task_thread.init_done, 0);
+ // schedule init task, which will schedule the remaining tasks
+ Dav1dTask *const t = &f->task_thread.init_task;
+ t->type = DAV1D_TASK_TYPE_INIT;
+ t->frame_idx = (int)(f - c->fc);
+ t->sby = 0;
+ t->recon_progress = t->deblock_progress = 0;
+ insert_task(f, t, 1);
+}
+
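+// Hand delayed film-grain synthesis off to the worker pool and block until
+// all workers have finished applying grain to the output picture.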
+void dav1d_task_delayed_fg(Dav1dContext *const c, Dav1dPicture *const out,
+ const Dav1dPicture *const in)
+{
+ struct TaskThreadData *const ttd = &c->task_thread;
+ ttd->delayed_fg.in = in;
+ ttd->delayed_fg.out = out;
+ ttd->delayed_fg.type = DAV1D_TASK_TYPE_FG_PREP;
+ atomic_init(&ttd->delayed_fg.progress[0], 0);
+ atomic_init(&ttd->delayed_fg.progress[1], 0);
+ pthread_mutex_lock(&ttd->lock);
+ ttd->delayed_fg.exec = 1;
+ ttd->delayed_fg.finished = 0;
+ pthread_cond_signal(&ttd->cond);
+ do {
+ pthread_cond_wait(&ttd->delayed_fg.cond, &ttd->lock);
+ } while (!ttd->delayed_fg.finished);
+ pthread_mutex_unlock(&ttd->lock);
+}
+
+static inline int ensure_progress(struct TaskThreadData *const ttd,
+ Dav1dFrameContext *const f,
+ Dav1dTask *const t, const enum TaskType type,
+ atomic_int *const state, int *const target)
+{
+ // deblock_rows (non-LR portion) depends on deblock of the previous sbrow,
+ // so ensure that has completed; if not, re-add to the task queue, else fall through
+ int p1 = atomic_load(state);
+ if (p1 < t->sby) {
+ t->type = type;
+ t->recon_progress = t->deblock_progress = 0;
+ *target = t->sby;
+ add_pending(f, t);
+ pthread_mutex_lock(&ttd->lock);
+ return 1;
+ }
+ return 0;
+}
+
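+// Returns 1 if the tile task at t->sby still has unmet dependencies (its
+// previous sbrow, the corresponding entropy pass in frame-threading mode, or
+// reference-frame progress), 0 if it is ready to run.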
+static inline int check_tile(Dav1dTask *const t, Dav1dFrameContext *const f,
+ const int frame_mt)
+{
+ const int tp = t->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
+ const int tile_idx = (int)(t - f->task_thread.tile_tasks[tp]);
+ Dav1dTileState *const ts = &f->ts[tile_idx];
+ const int p1 = atomic_load(&ts->progress[tp]);
+ if (p1 < t->sby) return 1;
+ int error = p1 == TILE_ERROR;
+ error |= atomic_fetch_or(&f->task_thread.error, error);
+ if (!error && frame_mt && !tp) {
+ const int p2 = atomic_load(&ts->progress[1]);
+ if (p2 <= t->sby) return 1;
+ error = p2 == TILE_ERROR;
+ error |= atomic_fetch_or(&f->task_thread.error, error);
+ }
+ if (!error && frame_mt && !IS_KEY_OR_INTRA(f->frame_hdr)) {
+ // check reference state
+ const Dav1dThreadPicture *p = &f->sr_cur;
+ const int ss_ver = p->p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const unsigned p_b = (t->sby + 1) << (f->sb_shift + 2);
+ const int tile_sby = t->sby - (ts->tiling.row_start >> f->sb_shift);
+ const int (*const lowest_px)[2] = ts->lowest_pixel[tile_sby];
+ for (int n = t->deps_skip; n < 7; n++, t->deps_skip++) {
+ unsigned lowest;
+ if (tp) {
+ // if temporal mv refs are disabled, we only need this
+ // for the primary ref; if segmentation is disabled, we
+ // don't even need that
+ lowest = p_b;
+ } else {
+ // +8 is postfilter-induced delay
+ const int y = lowest_px[n][0] == INT_MIN ? INT_MIN :
+ lowest_px[n][0] + 8;
+ const int uv = lowest_px[n][1] == INT_MIN ? INT_MIN :
+ lowest_px[n][1] * (1 << ss_ver) + 8;
+ const int max = imax(y, uv);
+ if (max == INT_MIN) continue;
+ lowest = iclip(max, 1, f->refp[n].p.p.h);
+ }
+ const unsigned p3 = atomic_load(&f->refp[n].progress[!tp]);
+ if (p3 < lowest) return 1;
+ atomic_fetch_or(&f->task_thread.error, p3 == FRAME_ERROR);
+ }
+ }
+ return 0;
+}
+
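+// Scan the per-sbrow completion bitmap and return the last sbrow up to which
+// reconstruction progress is contiguous (-1 if no sbrow is complete yet).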
+static inline int get_frame_progress(const Dav1dContext *const c,
+ const Dav1dFrameContext *const f)
+{
+ unsigned frame_prog = c->n_fc > 1 ? atomic_load(&f->sr_cur.progress[1]) : 0;
+ if (frame_prog >= FRAME_ERROR)
+ return f->sbh - 1;
+ int idx = frame_prog >> (f->sb_shift + 7);
+ int prog;
+ do {
+ atomic_uint *state = &f->frame_thread.frame_progress[idx];
+ const unsigned val = ~atomic_load(state);
+ prog = val ? ctz(val) : 32;
+ if (prog != 32) break;
+ prog = 0;
+ } while (++idx < f->frame_thread.prog_sz);
+ return ((idx << 5) | prog) - 1;
+}
+
+static inline void abort_frame(Dav1dFrameContext *const f, const int error) {
+ atomic_store(&f->task_thread.error, error == DAV1D_ERR(EINVAL) ? 1 : -1);
+ atomic_store(&f->task_thread.task_counter, 0);
+ atomic_store(&f->task_thread.done[0], 1);
+ atomic_store(&f->task_thread.done[1], 1);
+ atomic_store(&f->sr_cur.progress[0], FRAME_ERROR);
+ atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
+ dav1d_decode_frame_exit(f, error);
+ f->n_tile_data = 0;
+ pthread_cond_signal(&f->task_thread.cond);
+}
+
+static inline void delayed_fg_task(const Dav1dContext *const c,
+ struct TaskThreadData *const ttd)
+{
+ const Dav1dPicture *const in = ttd->delayed_fg.in;
+ Dav1dPicture *const out = ttd->delayed_fg.out;
+#if CONFIG_16BPC
+ int off;
+ if (out->p.bpc != 8)
+ off = (out->p.bpc >> 1) - 4;
+#endif
+ switch (ttd->delayed_fg.type) {
+ case DAV1D_TASK_TYPE_FG_PREP:
+ ttd->delayed_fg.exec = 0;
+ if (atomic_load(&ttd->cond_signaled))
+ pthread_cond_signal(&ttd->cond);
+ pthread_mutex_unlock(&ttd->lock);
+ switch (out->p.bpc) {
+#if CONFIG_8BPC
+ case 8:
+ dav1d_prep_grain_8bpc(&c->dsp[0].fg, out, in,
+ ttd->delayed_fg.scaling_8bpc,
+ ttd->delayed_fg.grain_lut_8bpc);
+ break;
+#endif
+#if CONFIG_16BPC
+ case 10:
+ case 12:
+ dav1d_prep_grain_16bpc(&c->dsp[off].fg, out, in,
+ ttd->delayed_fg.scaling_16bpc,
+ ttd->delayed_fg.grain_lut_16bpc);
+ break;
+#endif
+ default: abort();
+ }
+ ttd->delayed_fg.type = DAV1D_TASK_TYPE_FG_APPLY;
+ pthread_mutex_lock(&ttd->lock);
+ ttd->delayed_fg.exec = 1;
+ // fall-through
+ case DAV1D_TASK_TYPE_FG_APPLY:;
+ int row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
+ pthread_mutex_unlock(&ttd->lock);
+ int progmax = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE;
+ while (row < progmax) {
+ if (row + 1 < progmax)
+ pthread_cond_signal(&ttd->cond);
+ else {
+ pthread_mutex_lock(&ttd->lock);
+ ttd->delayed_fg.exec = 0;
+ pthread_mutex_unlock(&ttd->lock);
+ }
+ switch (out->p.bpc) {
+#if CONFIG_8BPC
+ case 8:
+ dav1d_apply_grain_row_8bpc(&c->dsp[0].fg, out, in,
+ ttd->delayed_fg.scaling_8bpc,
+ ttd->delayed_fg.grain_lut_8bpc, row);
+ break;
+#endif
+#if CONFIG_16BPC
+ case 10:
+ case 12:
+ dav1d_apply_grain_row_16bpc(&c->dsp[off].fg, out, in,
+ ttd->delayed_fg.scaling_16bpc,
+ ttd->delayed_fg.grain_lut_16bpc, row);
+ break;
+#endif
+ default: abort();
+ }
+ row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
+ atomic_fetch_add(&ttd->delayed_fg.progress[1], 1);
+ }
+ pthread_mutex_lock(&ttd->lock);
+ ttd->delayed_fg.exec = 0;
+ int done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1;
+ progmax = atomic_load(&ttd->delayed_fg.progress[0]);
+ // signal for completion only once the last runner reaches this
+ if (done >= progmax) {
+ ttd->delayed_fg.finished = 1;
+ pthread_cond_signal(&ttd->delayed_fg.cond);
+ }
+ break;
+ default: abort();
+ }
+}
+
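+// Worker thread main loop: under ttd->lock, pick the highest-priority runnable
+// task (delayed film grain first, then frame init, then per-sbrow tile and
+// filter work), release the lock to execute it, then re-queue any follow-up
+// task and signal progress.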
+void *dav1d_worker_task(void *data) {
+ Dav1dTaskContext *const tc = data;
+ const Dav1dContext *const c = tc->c;
+ struct TaskThreadData *const ttd = tc->task_thread.ttd;
+
+ dav1d_set_thread_name("dav1d-worker");
+
+ pthread_mutex_lock(&ttd->lock);
+ for (;;) {
+ if (tc->task_thread.die) break;
+ if (atomic_load(c->flush)) goto park;
+
+ merge_pending(c);
+ if (ttd->delayed_fg.exec) { // run delayed film grain first
+ delayed_fg_task(c, ttd);
+ continue;
+ }
+ Dav1dFrameContext *f;
+ Dav1dTask *t, *prev_t = NULL;
+ if (c->n_fc > 1) { // run init tasks second
+ for (unsigned i = 0; i < c->n_fc; i++) {
+ const unsigned first = atomic_load(&ttd->first);
+ f = &c->fc[(first + i) % c->n_fc];
+ if (atomic_load(&f->task_thread.init_done)) continue;
+ t = f->task_thread.task_head;
+ if (!t) continue;
+ if (t->type == DAV1D_TASK_TYPE_INIT) goto found;
+ if (t->type == DAV1D_TASK_TYPE_INIT_CDF) {
+ // XXX This could be a simple else if tasks of both
+ // passes were added at once (in dav1d_task_create_tile_sbrow).
+ // Adding the tasks to the pending Q can result in a
+ // thread merging them before init_done is set.
+ // We would need to set init_done before adding to the
+ // pending Q, so maybe return the tasks, set init_done,
+ // and only then add them to the pending Q.
+ const int p1 = f->in_cdf.progress ?
+ atomic_load(f->in_cdf.progress) : 1;
+ if (p1) {
+ atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
+ goto found;
+ }
+ }
+ }
+ }
+ while (ttd->cur < c->n_fc) { // run decoding tasks last
+ const unsigned first = atomic_load(&ttd->first);
+ f = &c->fc[(first + ttd->cur) % c->n_fc];
+ merge_pending_frame(f);
+ prev_t = f->task_thread.task_cur_prev;
+ t = prev_t ? prev_t->next : f->task_thread.task_head;
+ while (t) {
+ if (t->type == DAV1D_TASK_TYPE_INIT_CDF) goto next;
+ else if (t->type == DAV1D_TASK_TYPE_TILE_ENTROPY ||
+ t->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION)
+ {
+ // if not bottom sbrow of tile, this task will be re-added
+ // after it's finished
+ if (!check_tile(t, f, c->n_fc > 1))
+ goto found;
+ } else if (t->recon_progress) {
+ const int p = t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS;
+ int error = atomic_load(&f->task_thread.error);
+ assert(!atomic_load(&f->task_thread.done[p]) || error);
+ const int tile_row_base = f->frame_hdr->tiling.cols *
+ f->frame_thread.next_tile_row[p];
+ if (p) {
+ atomic_int *const prog = &f->frame_thread.entropy_progress;
+ const int p1 = atomic_load(prog);
+ if (p1 < t->sby) goto next;
+ atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
+ }
+ for (int tc = 0; tc < f->frame_hdr->tiling.cols; tc++) {
+ Dav1dTileState *const ts = &f->ts[tile_row_base + tc];
+ const int p2 = atomic_load(&ts->progress[p]);
+ if (p2 < t->recon_progress) goto next;
+ atomic_fetch_or(&f->task_thread.error, p2 == TILE_ERROR);
+ }
+ if (t->sby + 1 < f->sbh) {
+ // add sby+1 to list to replace this one
+ Dav1dTask *next_t = &t[1];
+ *next_t = *t;
+ next_t->sby++;
+ const int ntr = f->frame_thread.next_tile_row[p] + 1;
+ const int start = f->frame_hdr->tiling.row_start_sb[ntr];
+ if (next_t->sby == start)
+ f->frame_thread.next_tile_row[p] = ntr;
+ next_t->recon_progress = next_t->sby + 1;
+ insert_task(f, next_t, 0);
+ }
+ goto found;
+ } else if (t->type == DAV1D_TASK_TYPE_CDEF) {
+ atomic_uint *prog = f->frame_thread.copy_lpf_progress;
+ const int p1 = atomic_load(&prog[(t->sby - 1) >> 5]);
+ if (p1 & (1U << ((t->sby - 1) & 31)))
+ goto found;
+ } else {
+ assert(t->deblock_progress);
+ const int p1 = atomic_load(&f->frame_thread.deblock_progress);
+ if (p1 >= t->deblock_progress) {
+ atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
+ goto found;
+ }
+ }
+ next:
+ prev_t = t;
+ t = t->next;
+ f->task_thread.task_cur_prev = prev_t;
+ }
+ ttd->cur++;
+ }
+ if (reset_task_cur(c, ttd, UINT_MAX)) continue;
+ if (merge_pending(c)) continue;
+ park:
+ tc->task_thread.flushed = 1;
+ pthread_cond_signal(&tc->task_thread.td.cond);
+ // we want to be woken up next time progress is signaled
+ atomic_store(&ttd->cond_signaled, 0);
+ pthread_cond_wait(&ttd->cond, &ttd->lock);
+ tc->task_thread.flushed = 0;
+ reset_task_cur(c, ttd, UINT_MAX);
+ continue;
+
+ found:
+ // remove t from list
+ if (prev_t) prev_t->next = t->next;
+ else f->task_thread.task_head = t->next;
+ if (!t->next) f->task_thread.task_tail = prev_t;
+ if (t->type > DAV1D_TASK_TYPE_INIT_CDF && !f->task_thread.task_head)
+ ttd->cur++;
+ t->next = NULL;
+ // we don't need to check cond_signaled here: since we found a task
+ // after the last signal, we want to re-signal the next waiting thread,
+ // and again won't need to signal after that
+ atomic_store(&ttd->cond_signaled, 1);
+ pthread_cond_signal(&ttd->cond);
+ pthread_mutex_unlock(&ttd->lock);
+ found_unlocked:;
+ const int flush = atomic_load(c->flush);
+ int error = atomic_fetch_or(&f->task_thread.error, flush) | flush;
+
+ // run it
+ tc->f = f;
+ int sby = t->sby;
+ switch (t->type) {
+ case DAV1D_TASK_TYPE_INIT: {
+ assert(c->n_fc > 1);
+ int res = dav1d_decode_frame_init(f);
+ int p1 = f->in_cdf.progress ? atomic_load(f->in_cdf.progress) : 1;
+ if (res || p1 == TILE_ERROR) {
+ pthread_mutex_lock(&ttd->lock);
+ abort_frame(f, res ? res : DAV1D_ERR(EINVAL));
+ reset_task_cur(c, ttd, t->frame_idx);
+ } else {
+ t->type = DAV1D_TASK_TYPE_INIT_CDF;
+ if (p1) goto found_unlocked;
+ add_pending(f, t);
+ pthread_mutex_lock(&ttd->lock);
+ }
+ continue;
+ }
+ case DAV1D_TASK_TYPE_INIT_CDF: {
+ assert(c->n_fc > 1);
+ int res = DAV1D_ERR(EINVAL);
+ if (!atomic_load(&f->task_thread.error))
+ res = dav1d_decode_frame_init_cdf(f);
+ if (f->frame_hdr->refresh_context && !f->task_thread.update_set) {
+ atomic_store(f->out_cdf.progress, res < 0 ? TILE_ERROR : 1);
+ }
+ if (!res) {
+ assert(c->n_fc > 1);
+ for (int p = 1; p <= 2; p++) {
+ const int res = dav1d_task_create_tile_sbrow(f, p, 0);
+ if (res) {
+ pthread_mutex_lock(&ttd->lock);
+ // memory allocation failed
+ atomic_store(&f->task_thread.done[2 - p], 1);
+ atomic_store(&f->task_thread.error, -1);
+ atomic_fetch_sub(&f->task_thread.task_counter,
+ f->frame_hdr->tiling.cols *
+ f->frame_hdr->tiling.rows + f->sbh);
+ atomic_store(&f->sr_cur.progress[p - 1], FRAME_ERROR);
+ if (p == 2 && atomic_load(&f->task_thread.done[1])) {
+ assert(!atomic_load(&f->task_thread.task_counter));
+ dav1d_decode_frame_exit(f, DAV1D_ERR(ENOMEM));
+ f->n_tile_data = 0;
+ pthread_cond_signal(&f->task_thread.cond);
+ } else {
+ pthread_mutex_unlock(&ttd->lock);
+ }
+ }
+ }
+ pthread_mutex_lock(&ttd->lock);
+ } else {
+ pthread_mutex_lock(&ttd->lock);
+ abort_frame(f, res);
+ reset_task_cur(c, ttd, t->frame_idx);
+ atomic_store(&f->task_thread.init_done, 1);
+ }
+ continue;
+ }
+ case DAV1D_TASK_TYPE_TILE_ENTROPY:
+ case DAV1D_TASK_TYPE_TILE_RECONSTRUCTION: {
+ const int p = t->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
+ const int tile_idx = (int)(t - f->task_thread.tile_tasks[p]);
+ Dav1dTileState *const ts = &f->ts[tile_idx];
+
+ tc->ts = ts;
+ tc->by = sby << f->sb_shift;
+ const int uses_2pass = c->n_fc > 1;
+ tc->frame_thread.pass = !uses_2pass ? 0 :
+ 1 + (t->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION);
+ if (!error) error = dav1d_decode_tile_sbrow(tc);
+ const int progress = error ? TILE_ERROR : 1 + sby;
+
+ // signal progress
+ atomic_fetch_or(&f->task_thread.error, error);
+ if (((sby + 1) << f->sb_shift) < ts->tiling.row_end) {
+ t->sby++;
+ t->deps_skip = 0;
+ if (!check_tile(t, f, uses_2pass)) {
+ atomic_store(&ts->progress[p], progress);
+ reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
+ if (!atomic_fetch_or(&ttd->cond_signaled, 1))
+ pthread_cond_signal(&ttd->cond);
+ goto found_unlocked;
+ }
+ atomic_store(&ts->progress[p], progress);
+ add_pending(f, t);
+ pthread_mutex_lock(&ttd->lock);
+ } else {
+ pthread_mutex_lock(&ttd->lock);
+ atomic_store(&ts->progress[p], progress);
+ reset_task_cur(c, ttd, t->frame_idx);
+ error = atomic_load(&f->task_thread.error);
+ if (f->frame_hdr->refresh_context &&
+ tc->frame_thread.pass <= 1 && f->task_thread.update_set &&
+ f->frame_hdr->tiling.update == tile_idx)
+ {
+ if (!error)
+ dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf,
+ &f->ts[f->frame_hdr->tiling.update].cdf);
+ if (c->n_fc > 1)
+ atomic_store(f->out_cdf.progress, error ? TILE_ERROR : 1);
+ }
+ if (atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1 == 0 &&
+ atomic_load(&f->task_thread.done[0]) &&
+ (!uses_2pass || atomic_load(&f->task_thread.done[1])))
+ {
+ error = atomic_load(&f->task_thread.error);
+ dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
+ error ? DAV1D_ERR(ENOMEM) : 0);
+ f->n_tile_data = 0;
+ pthread_cond_signal(&f->task_thread.cond);
+ }
+ assert(atomic_load(&f->task_thread.task_counter) >= 0);
+ if (!atomic_fetch_or(&ttd->cond_signaled, 1))
+ pthread_cond_signal(&ttd->cond);
+ }
+ continue;
+ }
+ case DAV1D_TASK_TYPE_DEBLOCK_COLS:
+ if (!atomic_load(&f->task_thread.error))
+ f->bd_fn.filter_sbrow_deblock_cols(f, sby);
+ if (ensure_progress(ttd, f, t, DAV1D_TASK_TYPE_DEBLOCK_ROWS,
+ &f->frame_thread.deblock_progress,
+ &t->deblock_progress)) continue;
+ // fall-through
+ case DAV1D_TASK_TYPE_DEBLOCK_ROWS:
+ if (!atomic_load(&f->task_thread.error))
+ f->bd_fn.filter_sbrow_deblock_rows(f, sby);
+ // signal deblock progress
+ if (f->frame_hdr->loopfilter.level_y[0] ||
+ f->frame_hdr->loopfilter.level_y[1])
+ {
+ error = atomic_load(&f->task_thread.error);
+ atomic_store(&f->frame_thread.deblock_progress,
+ error ? TILE_ERROR : sby + 1);
+ reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
+ if (!atomic_fetch_or(&ttd->cond_signaled, 1))
+ pthread_cond_signal(&ttd->cond);
+ } else if (f->seq_hdr->cdef || f->lf.restore_planes) {
+ atomic_fetch_or(&f->frame_thread.copy_lpf_progress[sby >> 5],
+ 1U << (sby & 31));
+ // CDEF needs the top buffer to be saved by lr_copy_lpf of the
+ // previous sbrow
+ if (sby) {
+ int prog = atomic_load(&f->frame_thread.copy_lpf_progress[(sby - 1) >> 5]);
+ if (~prog & (1U << ((sby - 1) & 31))) {
+ t->type = DAV1D_TASK_TYPE_CDEF;
+ t->recon_progress = t->deblock_progress = 0;
+ add_pending(f, t);
+ pthread_mutex_lock(&ttd->lock);
+ continue;
+ }
+ }
+ }
+ // fall-through
+ case DAV1D_TASK_TYPE_CDEF:
+ if (f->seq_hdr->cdef) {
+ if (!atomic_load(&f->task_thread.error))
+ f->bd_fn.filter_sbrow_cdef(tc, sby);
+ reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
+ if (!atomic_fetch_or(&ttd->cond_signaled, 1))
+ pthread_cond_signal(&ttd->cond);
+ }
+ // fall-through
+ case DAV1D_TASK_TYPE_SUPER_RESOLUTION:
+ if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
+ if (!atomic_load(&f->task_thread.error))
+ f->bd_fn.filter_sbrow_resize(f, sby);
+ // fall-through
+ case DAV1D_TASK_TYPE_LOOP_RESTORATION:
+ if (!atomic_load(&f->task_thread.error) && f->lf.restore_planes)
+ f->bd_fn.filter_sbrow_lr(f, sby);
+ // fall-through
+ case DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS:
+ // dummy to cover for no post-filters
+ case DAV1D_TASK_TYPE_ENTROPY_PROGRESS:
+ // dummy to convert tile progress to frame
+ break;
+ default: abort();
+ }
+ // if task completed [typically LR], signal picture progress as per below
+ const int uses_2pass = c->n_fc > 1;
+ const int sbh = f->sbh;
+ const int sbsz = f->sb_step * 4;
+ if (t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS) {
+ error = atomic_load(&f->task_thread.error);
+ const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz;
+ assert(c->n_fc > 1);
+ if (f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */)
+ atomic_store(&f->sr_cur.progress[0], error ? FRAME_ERROR : y);
+ atomic_store(&f->frame_thread.entropy_progress,
+ error ? TILE_ERROR : sby + 1);
+ if (sby + 1 == sbh)
+ atomic_store(&f->task_thread.done[1], 1);
+ pthread_mutex_lock(&ttd->lock);
+ const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1;
+ if (sby + 1 < sbh && num_tasks) {
+ reset_task_cur(c, ttd, t->frame_idx);
+ continue;
+ }
+ if (!num_tasks && atomic_load(&f->task_thread.done[0]) &&
+ atomic_load(&f->task_thread.done[1]))
+ {
+ error = atomic_load(&f->task_thread.error);
+ dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
+ error ? DAV1D_ERR(ENOMEM) : 0);
+ f->n_tile_data = 0;
+ pthread_cond_signal(&f->task_thread.cond);
+ }
+ reset_task_cur(c, ttd, t->frame_idx);
+ continue;
+ }
+ // t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS
+ atomic_fetch_or(&f->frame_thread.frame_progress[sby >> 5],
+ 1U << (sby & 31));
+ pthread_mutex_lock(&f->task_thread.lock);
+ sby = get_frame_progress(c, f);
+ error = atomic_load(&f->task_thread.error);
+ const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz;
+ if (c->n_fc > 1 && f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */)
+ atomic_store(&f->sr_cur.progress[1], error ? FRAME_ERROR : y);
+ pthread_mutex_unlock(&f->task_thread.lock);
+ if (sby + 1 == sbh)
+ atomic_store(&f->task_thread.done[0], 1);
+ pthread_mutex_lock(&ttd->lock);
+ const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1;
+ if (sby + 1 < sbh && num_tasks) {
+ reset_task_cur(c, ttd, t->frame_idx);
+ continue;
+ }
+ if (!num_tasks && atomic_load(&f->task_thread.done[0]) &&
+ (!uses_2pass || atomic_load(&f->task_thread.done[1])))
+ {
+ error = atomic_load(&f->task_thread.error);
+ dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
+ error ? DAV1D_ERR(ENOMEM) : 0);
+ f->n_tile_data = 0;
+ pthread_cond_signal(&f->task_thread.cond);
+ }
+ reset_task_cur(c, ttd, t->frame_idx);
+ }
+ pthread_mutex_unlock(&ttd->lock);
+
+ return NULL;
+}
diff --git a/third_party/dav1d/src/thread_task.h b/third_party/dav1d/src/thread_task.h
new file mode 100644
index 0000000000..257da1a470
--- /dev/null
+++ b/third_party/dav1d/src/thread_task.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_THREAD_TASK_H
+#define DAV1D_SRC_THREAD_TASK_H
+
+#include <limits.h>
+
+#include "src/internal.h"
+
+#define FRAME_ERROR (UINT_MAX - 1)
+#define TILE_ERROR (INT_MAX - 1)
+
+// these functions assume the task scheduling lock is already taken
+int dav1d_task_create_tile_sbrow(Dav1dFrameContext *f, int pass, int cond_signal);
+void dav1d_task_frame_init(Dav1dFrameContext *f);
+
+void dav1d_task_delayed_fg(Dav1dContext *c, Dav1dPicture *out, const Dav1dPicture *in);
+
+void *dav1d_worker_task(void *data);
+
+int dav1d_decode_frame_init(Dav1dFrameContext *f);
+int dav1d_decode_frame_init_cdf(Dav1dFrameContext *f);
+int dav1d_decode_frame_main(Dav1dFrameContext *f);
+void dav1d_decode_frame_exit(Dav1dFrameContext *f, int retval);
+int dav1d_decode_frame(Dav1dFrameContext *f);
+int dav1d_decode_tile_sbrow(Dav1dTaskContext *t);
+
+#endif /* DAV1D_SRC_THREAD_TASK_H */
diff --git a/third_party/dav1d/src/warpmv.c b/third_party/dav1d/src/warpmv.c
new file mode 100644
index 0000000000..439c4304c7
--- /dev/null
+++ b/third_party/dav1d/src/warpmv.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+#include "src/warpmv.h"
+
+static const uint16_t div_lut[257] = {
+ 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
+ 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
+ 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
+ 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
+ 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
+ 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
+ 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
+ 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
+ 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
+ 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
+ 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
+ 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
+ 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
+ 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
+ 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
+ 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
+ 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
+ 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
+ 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
+ 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
+ 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
+ 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
+ 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
+ 8240, 8224, 8208, 8192,
+};
+
+static inline int iclip_wmp(const int v) {
+ const int cv = iclip(v, INT16_MIN, INT16_MAX);
+
+ return apply_sign((abs(cv) + 32) >> 6, cv) * (1 << 6);
+}
+
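+// Fixed-point reciprocal: div_lut[f] is approximately (1 << 22) / (256 + f),
+// so 1 / d is approximated by the returned value shifted right by *shift.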
+static inline int resolve_divisor_32(const unsigned d, int *const shift) {
+ *shift = ulog2(d);
+ const int e = d - (1 << *shift);
+ const int f = *shift > 8 ? (e + (1 << (*shift - 9))) >> (*shift - 8) :
+ e << (8 - *shift);
+ assert(f <= 256);
+ *shift += 14;
+ // Use f as lookup into the precomputed table of multipliers
+ return div_lut[f];
+}
+
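+// Derive the per-pixel shear parameters (alpha/beta/gamma/delta) from the
+// affine matrix; returns nonzero if the resulting warp is invalid
+// (non-positive diagonal or too much shear).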
+int dav1d_get_shear_params(Dav1dWarpedMotionParams *const wm) {
+ const int32_t *const mat = wm->matrix;
+
+ if (mat[2] <= 0) return 1;
+
+ wm->u.p.alpha = iclip_wmp(mat[2] - 0x10000);
+ wm->u.p.beta = iclip_wmp(mat[3]);
+
+ int shift;
+ const int y = apply_sign(resolve_divisor_32(abs(mat[2]), &shift), mat[2]);
+ const int64_t v1 = ((int64_t) mat[4] * 0x10000) * y;
+ const int rnd = (1 << shift) >> 1;
+ wm->u.p.gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1));
+ const int64_t v2 = ((int64_t) mat[3] * mat[4]) * y;
+ wm->u.p.delta = iclip_wmp(mat[5] -
+ apply_sign64((int) ((llabs(v2) + rnd) >> shift), v2) -
+ 0x10000);
+
+ return (4 * abs(wm->u.p.alpha) + 7 * abs(wm->u.p.beta) >= 0x10000) ||
+ (4 * abs(wm->u.p.gamma) + 4 * abs(wm->u.p.delta) >= 0x10000);
+}
+
+static int resolve_divisor_64(const uint64_t d, int *const shift) {
+ *shift = u64log2(d);
+ const int64_t e = d - (1LL << *shift);
+ const int64_t f = *shift > 8 ? (e + (1LL << (*shift - 9))) >> (*shift - 8) :
+ e << (8 - *shift);
+ assert(f <= 256);
+ *shift += 14;
+ // Use f as lookup into the precomputed table of multipliers
+ return div_lut[f];
+}
+
+static int get_mult_shift_ndiag(const int64_t px,
+ const int idet, const int shift)
+{
+ const int64_t v1 = px * idet;
+ const int v2 = apply_sign64((int) ((llabs(v1) +
+ ((1LL << shift) >> 1)) >> shift),
+ v1);
+ return iclip(v2, -0x1fff, 0x1fff);
+}
+
+static int get_mult_shift_diag(const int64_t px,
+ const int idet, const int shift)
+{
+ const int64_t v1 = px * idet;
+ const int v2 = apply_sign64((int) ((llabs(v1) +
+ ((1LL << shift) >> 1)) >> shift),
+ v1);
+ return iclip(v2, 0xe001, 0x11fff);
+}
+
+void dav1d_set_affine_mv2d(const int bw4, const int bh4,
+ const mv mv, Dav1dWarpedMotionParams *const wm,
+ const int bx4, const int by4)
+{
+ int32_t *const mat = wm->matrix;
+ const int rsuy = 2 * bh4 - 1;
+ const int rsux = 2 * bw4 - 1;
+ const int isuy = by4 * 4 + rsuy;
+ const int isux = bx4 * 4 + rsux;
+
+ mat[0] = iclip(mv.x * 0x2000 - (isux * (mat[2] - 0x10000) + isuy * mat[3]),
+ -0x800000, 0x7fffff);
+ mat[1] = iclip(mv.y * 0x2000 - (isux * mat[4] + isuy * (mat[5] - 0x10000)),
+ -0x800000, 0x7fffff);
+}
+
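+// Least-squares fit of an affine model to the motion vector sample pairs in
+// pts[]; returns nonzero if the system is singular (zero determinant), in
+// which case mat[] is left untouched.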
+int dav1d_find_affine_int(const int (*pts)[2][2], const int np,
+ const int bw4, const int bh4,
+ const mv mv, Dav1dWarpedMotionParams *const wm,
+ const int bx4, const int by4)
+{
+ int32_t *const mat = wm->matrix;
+ int a[2][2] = { { 0, 0 }, { 0, 0 } };
+ int bx[2] = { 0, 0 };
+ int by[2] = { 0, 0 };
+ const int rsuy = 2 * bh4 - 1;
+ const int rsux = 2 * bw4 - 1;
+ const int suy = rsuy * 8;
+ const int sux = rsux * 8;
+ const int duy = suy + mv.y;
+ const int dux = sux + mv.x;
+ const int isuy = by4 * 4 + rsuy;
+ const int isux = bx4 * 4 + rsux;
+
+ for (int i = 0; i < np; i++) {
+ const int dx = pts[i][1][0] - dux;
+ const int dy = pts[i][1][1] - duy;
+ const int sx = pts[i][0][0] - sux;
+ const int sy = pts[i][0][1] - suy;
+ if (abs(sx - dx) < 256 && abs(sy - dy) < 256) {
+ a[0][0] += ((sx * sx) >> 2) + sx * 2 + 8;
+ a[0][1] += ((sx * sy) >> 2) + sx + sy + 4;
+ a[1][1] += ((sy * sy) >> 2) + sy * 2 + 8;
+ bx[0] += ((sx * dx) >> 2) + sx + dx + 8;
+ bx[1] += ((sy * dx) >> 2) + sy + dx + 4;
+ by[0] += ((sx * dy) >> 2) + sx + dy + 4;
+ by[1] += ((sy * dy) >> 2) + sy + dy + 8;
+ }
+ }
+
+ // compute determinant of a
+ const int64_t det = (int64_t) a[0][0] * a[1][1] - (int64_t) a[0][1] * a[0][1];
+ if (det == 0) return 1;
+ int shift, idet = apply_sign64(resolve_divisor_64(llabs(det), &shift), det);
+ shift -= 16;
+ if (shift < 0) {
+ idet <<= -shift;
+ shift = 0;
+ }
+
+ // solve the least-squares
+ mat[2] = get_mult_shift_diag((int64_t) a[1][1] * bx[0] -
+ (int64_t) a[0][1] * bx[1], idet, shift);
+ mat[3] = get_mult_shift_ndiag((int64_t) a[0][0] * bx[1] -
+ (int64_t) a[0][1] * bx[0], idet, shift);
+ mat[4] = get_mult_shift_ndiag((int64_t) a[1][1] * by[0] -
+ (int64_t) a[0][1] * by[1], idet, shift);
+ mat[5] = get_mult_shift_diag((int64_t) a[0][0] * by[1] -
+ (int64_t) a[0][1] * by[0], idet, shift);
+
+ mat[0] = iclip(mv.x * 0x2000 - (isux * (mat[2] - 0x10000) + isuy * mat[3]),
+ -0x800000, 0x7fffff);
+ mat[1] = iclip(mv.y * 0x2000 - (isux * mat[4] + isuy * (mat[5] - 0x10000)),
+ -0x800000, 0x7fffff);
+
+ return 0;
+}
diff --git a/third_party/dav1d/src/warpmv.h b/third_party/dav1d/src/warpmv.h
new file mode 100644
index 0000000000..08e841d1ca
--- /dev/null
+++ b/third_party/dav1d/src/warpmv.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_WARPMV_H
+#define DAV1D_SRC_WARPMV_H
+
+#include "src/levels.h"
+
+int dav1d_get_shear_params(Dav1dWarpedMotionParams *wm);
+int dav1d_find_affine_int(const int (*pts)[2][2], int np, int bw4, int bh4,
+ mv mv, Dav1dWarpedMotionParams *wm, int bx, int by);
+void dav1d_set_affine_mv2d(int bw4, int bh4,
+ mv mv, Dav1dWarpedMotionParams *wm, int bx, int by);
+
+#endif /* DAV1D_SRC_WARPMV_H */
diff --git a/third_party/dav1d/src/wedge.c b/third_party/dav1d/src/wedge.c
new file mode 100644
index 0000000000..2bea1393a1
--- /dev/null
+++ b/third_party/dav1d/src/wedge.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/wedge.h"
+
+enum WedgeDirectionType {
+ WEDGE_HORIZONTAL = 0,
+ WEDGE_VERTICAL = 1,
+ WEDGE_OBLIQUE27 = 2,
+ WEDGE_OBLIQUE63 = 3,
+ WEDGE_OBLIQUE117 = 4,
+ WEDGE_OBLIQUE153 = 5,
+ N_WEDGE_DIRECTIONS
+};
+
+typedef struct {
+ uint8_t /* enum WedgeDirectionType */ direction;
+ uint8_t x_offset;
+ uint8_t y_offset;
+} wedge_code_type;
+
+static const wedge_code_type wedge_codebook_16_hgtw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_hltw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_heqw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+Dav1dMasks dav1d_masks;
+
+static void insert_border(uint8_t *const dst, const uint8_t *const src,
+ const int ctr)
+{
+ if (ctr > 4) memset(dst, 0, ctr - 4);
+ memcpy(dst + imax(ctr, 4) - 4, src + imax(4 - ctr, 0), imin(64 - ctr, 8));
+ if (ctr < 64 - 4)
+ memset(dst + ctr + 4, 64, 64 - 4 - ctr);
+}
+
+static void transpose(uint8_t *const dst, const uint8_t *const src) {
+ for (int y = 0, y_off = 0; y < 64; y++, y_off += 64)
+ for (int x = 0, x_off = 0; x < 64; x++, x_off += 64)
+ dst[x_off + y] = src[y_off + x];
+}
+
+static void hflip(uint8_t *const dst, const uint8_t *const src) {
+ for (int y = 0, y_off = 0; y < 64; y++, y_off += 64)
+ for (int x = 0; x < 64; x++)
+ dst[y_off + 64 - 1 - x] = src[y_off + x];
+}
+
+static void copy2d(uint8_t *dst, const uint8_t *src, int sign,
+ const int w, const int h, const int x_off, const int y_off)
+{
+ src += y_off * 64 + x_off;
+ if (sign) {
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++)
+ dst[x] = 64 - src[x];
+ src += 64;
+ dst += w;
+ }
+ } else {
+ for (int y = 0; y < h; y++) {
+ memcpy(dst, src, w);
+ src += 64;
+ dst += w;
+ }
+ }
+}
+
+#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
+
+static COLD uint16_t init_chroma(uint8_t *chroma, const uint8_t *luma,
+ const int sign, const int w, const int h,
+ const int ss_ver)
+{
+ const uint16_t offset = MASK_OFFSET(chroma);
+ for (int y = 0; y < h; y += 1 + ss_ver) {
+ for (int x = 0; x < w; x += 2) {
+ int sum = luma[x] + luma[x + 1] + 1;
+ if (ss_ver) sum += luma[w + x] + luma[w + x + 1] + 1;
+ chroma[x >> 1] = (sum - sign) >> (1 + ss_ver);
+ }
+ luma += w << ss_ver;
+ chroma += w >> 1;
+ }
+ return offset;
+}
+
+static COLD void fill2d_16x2(const int w, const int h, const enum BlockSize bs,
+ const uint8_t (*const master)[64 * 64],
+ const wedge_code_type *const cb,
+ uint8_t *masks_444, uint8_t *masks_422,
+ uint8_t *masks_420, unsigned signs)
+{
+ const int n_stride_444 = (w * h);
+ const int n_stride_422 = n_stride_444 >> 1;
+ const int n_stride_420 = n_stride_444 >> 2;
+ const int sign_stride_422 = 16 * n_stride_422;
+ const int sign_stride_420 = 16 * n_stride_420;
+
+ // assign pointer offsets in lookup table
+ for (int n = 0; n < 16; n++) {
+ const int sign = signs & 1;
+
+ copy2d(masks_444, master[cb[n].direction], sign, w, h,
+ 32 - (w * cb[n].x_offset >> 3), 32 - (h * cb[n].y_offset >> 3));
+
+        // not using !sign is intentional here: 4:4:4 needs no rounding,
+        // since no chroma subsampling is applied.
+ dav1d_masks.offsets[0][bs].wedge[0][n] =
+ dav1d_masks.offsets[0][bs].wedge[1][n] = MASK_OFFSET(masks_444);
+
+ dav1d_masks.offsets[1][bs].wedge[0][n] =
+ init_chroma(&masks_422[ sign * sign_stride_422], masks_444, 0, w, h, 0);
+ dav1d_masks.offsets[1][bs].wedge[1][n] =
+ init_chroma(&masks_422[!sign * sign_stride_422], masks_444, 1, w, h, 0);
+ dav1d_masks.offsets[2][bs].wedge[0][n] =
+ init_chroma(&masks_420[ sign * sign_stride_420], masks_444, 0, w, h, 1);
+ dav1d_masks.offsets[2][bs].wedge[1][n] =
+ init_chroma(&masks_420[!sign * sign_stride_420], masks_444, 1, w, h, 1);
+
+ signs >>= 1;
+ masks_444 += n_stride_444;
+ masks_422 += n_stride_422;
+ masks_420 += n_stride_420;
+ }
+}
+
+static COLD void build_nondc_ii_masks(uint8_t *const mask_v, const int w,
+ const int h, const int step)
+{
+ static const uint8_t ii_weights_1d[32] = {
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1,
+ };
+
+ uint8_t *const mask_h = &mask_v[w * h];
+ uint8_t *const mask_sm = &mask_h[w * h];
+ for (int y = 0, off = 0; y < h; y++, off += w) {
+ memset(&mask_v[off], ii_weights_1d[y * step], w);
+ for (int x = 0; x < w; x++) {
+ mask_sm[off + x] = ii_weights_1d[imin(x, y) * step];
+ mask_h[off + x] = ii_weights_1d[x * step];
+ }
+ }
+}
+
+COLD void dav1d_init_ii_wedge_masks(void) {
+ // This function is guaranteed to be called only once
+
+ enum WedgeMasterLineType {
+ WEDGE_MASTER_LINE_ODD,
+ WEDGE_MASTER_LINE_EVEN,
+ WEDGE_MASTER_LINE_VERT,
+ N_WEDGE_MASTER_LINES,
+ };
+ static const uint8_t wedge_master_border[N_WEDGE_MASTER_LINES][8] = {
+ [WEDGE_MASTER_LINE_ODD] = { 1, 2, 6, 18, 37, 53, 60, 63 },
+ [WEDGE_MASTER_LINE_EVEN] = { 1, 4, 11, 27, 46, 58, 62, 63 },
+ [WEDGE_MASTER_LINE_VERT] = { 0, 2, 7, 21, 43, 57, 62, 64 },
+ };
+ uint8_t master[6][64 * 64];
+
+ // create master templates
+ for (int y = 0, off = 0; y < 64; y++, off += 64)
+ insert_border(&master[WEDGE_VERTICAL][off],
+ wedge_master_border[WEDGE_MASTER_LINE_VERT], 32);
+ for (int y = 0, off = 0, ctr = 48; y < 64; y += 2, off += 128, ctr--)
+ {
+ insert_border(&master[WEDGE_OBLIQUE63][off],
+ wedge_master_border[WEDGE_MASTER_LINE_EVEN], ctr);
+ insert_border(&master[WEDGE_OBLIQUE63][off + 64],
+ wedge_master_border[WEDGE_MASTER_LINE_ODD], ctr - 1);
+ }
+
+ transpose(master[WEDGE_OBLIQUE27], master[WEDGE_OBLIQUE63]);
+ transpose(master[WEDGE_HORIZONTAL], master[WEDGE_VERTICAL]);
+ hflip(master[WEDGE_OBLIQUE117], master[WEDGE_OBLIQUE63]);
+ hflip(master[WEDGE_OBLIQUE153], master[WEDGE_OBLIQUE27]);
+
+#define fill(w, h, sz_422, sz_420, hvsw, signs) \
+ fill2d_16x2(w, h, BS_##w##x##h - BS_32x32, \
+ master, wedge_codebook_16_##hvsw, \
+ dav1d_masks.wedge_444_##w##x##h, \
+ dav1d_masks.wedge_422_##sz_422, \
+ dav1d_masks.wedge_420_##sz_420, signs)
+
+ fill(32, 32, 16x32, 16x16, heqw, 0x7bfb);
+ fill(32, 16, 16x16, 16x8, hltw, 0x7beb);
+ fill(32, 8, 16x8, 16x4, hltw, 0x6beb);
+ fill(16, 32, 8x32, 8x16, hgtw, 0x7beb);
+ fill(16, 16, 8x16, 8x8, heqw, 0x7bfb);
+ fill(16, 8, 8x8, 8x4, hltw, 0x7beb);
+ fill( 8, 32, 4x32, 4x16, hgtw, 0x7aeb);
+ fill( 8, 16, 4x16, 4x8, hgtw, 0x7beb);
+ fill( 8, 8, 4x8, 4x4, heqw, 0x7bfb);
+#undef fill
+
+ memset(dav1d_masks.ii_dc, 32, 32 * 32);
+ for (int c = 0; c < 3; c++) {
+ dav1d_masks.offsets[c][BS_32x32-BS_32x32].ii[II_DC_PRED] =
+ dav1d_masks.offsets[c][BS_32x16-BS_32x32].ii[II_DC_PRED] =
+ dav1d_masks.offsets[c][BS_16x32-BS_32x32].ii[II_DC_PRED] =
+ dav1d_masks.offsets[c][BS_16x16-BS_32x32].ii[II_DC_PRED] =
+ dav1d_masks.offsets[c][BS_16x8 -BS_32x32].ii[II_DC_PRED] =
+ dav1d_masks.offsets[c][BS_8x16 -BS_32x32].ii[II_DC_PRED] =
+ dav1d_masks.offsets[c][BS_8x8 -BS_32x32].ii[II_DC_PRED] =
+ MASK_OFFSET(dav1d_masks.ii_dc);
+ }
+
+#define BUILD_NONDC_II_MASKS(w, h, step) \
+ build_nondc_ii_masks(dav1d_masks.ii_nondc_##w##x##h, w, h, step)
+
+#define ASSIGN_NONDC_II_OFFSET(bs, w444, h444, w422, h422, w420, h420) \
+ dav1d_masks.offsets[0][bs-BS_32x32].ii[p + 1] = \
+ MASK_OFFSET(&dav1d_masks.ii_nondc_##w444##x##h444[p*w444*h444]); \
+ dav1d_masks.offsets[1][bs-BS_32x32].ii[p + 1] = \
+ MASK_OFFSET(&dav1d_masks.ii_nondc_##w422##x##h422[p*w422*h422]); \
+ dav1d_masks.offsets[2][bs-BS_32x32].ii[p + 1] = \
+ MASK_OFFSET(&dav1d_masks.ii_nondc_##w420##x##h420[p*w420*h420])
+
+ BUILD_NONDC_II_MASKS(32, 32, 1);
+ BUILD_NONDC_II_MASKS(16, 32, 1);
+ BUILD_NONDC_II_MASKS(16, 16, 2);
+ BUILD_NONDC_II_MASKS( 8, 32, 1);
+ BUILD_NONDC_II_MASKS( 8, 16, 2);
+ BUILD_NONDC_II_MASKS( 8, 8, 4);
+ BUILD_NONDC_II_MASKS( 4, 16, 2);
+ BUILD_NONDC_II_MASKS( 4, 8, 4);
+ BUILD_NONDC_II_MASKS( 4, 4, 8);
+ for (int p = 0; p < 3; p++) {
+ ASSIGN_NONDC_II_OFFSET(BS_32x32, 32, 32, 16, 32, 16, 16);
+ ASSIGN_NONDC_II_OFFSET(BS_32x16, 32, 32, 16, 16, 16, 16);
+ ASSIGN_NONDC_II_OFFSET(BS_16x32, 16, 32, 8, 32, 8, 16);
+ ASSIGN_NONDC_II_OFFSET(BS_16x16, 16, 16, 8, 16, 8, 8);
+ ASSIGN_NONDC_II_OFFSET(BS_16x8, 16, 16, 8, 8, 8, 8);
+ ASSIGN_NONDC_II_OFFSET(BS_8x16, 8, 16, 4, 16, 4, 8);
+ ASSIGN_NONDC_II_OFFSET(BS_8x8, 8, 8, 4, 8, 4, 4);
+ }
+}
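
init_chroma() above derives each 4:2:2 and 4:2:0 wedge mask by averaging the full-resolution (4:4:4) mask values with a sign-dependent rounding bias. A minimal scalar sketch of that arithmetic for a single chroma sample (standalone; the function names are invented for illustration):

#include <stdint.h>

/* 4:2:2: average a horizontal pair of mask values (range 0..64), with the
 * +1 bias reduced by the sign, as in init_chroma() with ss_ver == 0. */
static uint8_t wedge_chroma_422(const uint8_t l0, const uint8_t l1,
                                const int sign)
{
    return (uint8_t)((l0 + l1 + 1 - sign) >> 1);
}

/* 4:2:0: average a 2x2 block; each luma row contributes its own +1 bias,
 * matching the ss_ver == 1 path in init_chroma(). */
static uint8_t wedge_chroma_420(const uint8_t l00, const uint8_t l01,
                                const uint8_t l10, const uint8_t l11,
                                const int sign)
{
    return (uint8_t)((l00 + l01 + 1 + l10 + l11 + 1 - sign) >> 2);
}
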
diff --git a/third_party/dav1d/src/wedge.h b/third_party/dav1d/src/wedge.h
new file mode 100644
index 0000000000..244e04ad2a
--- /dev/null
+++ b/third_party/dav1d/src/wedge.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_WEDGE_H
+#define DAV1D_SRC_WEDGE_H
+
+#include "src/levels.h"
+
+typedef struct {
+ /* Offsets, in units of 8 bytes, relative to the start of the struct. */
+ struct {
+ uint16_t wedge[2 /* sign */][16 /* wedge_idx */];
+ uint16_t ii[N_INTER_INTRA_PRED_MODES];
+ } offsets[3 /* 444, 422, 420 */][BS_8x8 - BS_32x32 + 1];
+
+ uint8_t ALIGN(wedge_444_32x32[ 16 * 32 * 32], 64);
+ uint8_t ALIGN(wedge_444_32x16[ 16 * 32 * 16], 64);
+ uint8_t ALIGN(wedge_444_32x8 [ 16 * 32 * 8], 64);
+ uint8_t ALIGN(wedge_444_16x32[ 16 * 16 * 32], 64);
+ uint8_t ALIGN(wedge_444_16x16[ 16 * 16 * 16], 64);
+ uint8_t ALIGN(wedge_444_16x8 [ 16 * 16 * 8], 64);
+ uint8_t ALIGN(wedge_444_8x32 [ 16 * 8 * 32], 64);
+ uint8_t ALIGN(wedge_444_8x16 [ 16 * 8 * 16], 64);
+ uint8_t ALIGN(wedge_444_8x8 [ 16 * 8 * 8], 64);
+
+ uint8_t ALIGN(wedge_422_16x32[2 * 16 * 16 * 32], 64);
+ uint8_t ALIGN(wedge_422_16x16[2 * 16 * 16 * 16], 64);
+ uint8_t ALIGN(wedge_422_16x8 [2 * 16 * 16 * 8], 64);
+ uint8_t ALIGN(wedge_422_8x32 [2 * 16 * 8 * 32], 64);
+ uint8_t ALIGN(wedge_422_8x16 [2 * 16 * 8 * 16], 64);
+ uint8_t ALIGN(wedge_422_8x8 [2 * 16 * 8 * 8], 64);
+ uint8_t ALIGN(wedge_422_4x32 [2 * 16 * 4 * 32], 64);
+ uint8_t ALIGN(wedge_422_4x16 [2 * 16 * 4 * 16], 64);
+ uint8_t ALIGN(wedge_422_4x8 [2 * 16 * 4 * 8], 64);
+
+ uint8_t ALIGN(wedge_420_16x16[2 * 16 * 16 * 16], 64);
+ uint8_t ALIGN(wedge_420_16x8 [2 * 16 * 16 * 8], 64);
+ uint8_t ALIGN(wedge_420_16x4 [2 * 16 * 16 * 4], 64);
+ uint8_t ALIGN(wedge_420_8x16 [2 * 16 * 8 * 16], 64);
+ uint8_t ALIGN(wedge_420_8x8 [2 * 16 * 8 * 8], 64);
+ uint8_t ALIGN(wedge_420_8x4 [2 * 16 * 8 * 4], 64);
+ uint8_t ALIGN(wedge_420_4x16 [2 * 16 * 4 * 16], 64);
+ uint8_t ALIGN(wedge_420_4x8 [2 * 16 * 4 * 8], 64);
+ uint8_t ALIGN(wedge_420_4x4 [2 * 16 * 4 * 4], 64);
+
+ uint8_t ALIGN(ii_dc [ 32 * 32], 64);
+ uint8_t ALIGN(ii_nondc_32x32[3 * 32 * 32], 64);
+ uint8_t ALIGN(ii_nondc_16x32[3 * 16 * 32], 64);
+ uint8_t ALIGN(ii_nondc_16x16[3 * 16 * 16], 64);
+ uint8_t ALIGN(ii_nondc_8x32 [3 * 8 * 32], 64);
+ uint8_t ALIGN(ii_nondc_8x16 [3 * 8 * 16], 64);
+ uint8_t ALIGN(ii_nondc_8x8 [3 * 8 * 8], 64);
+ uint8_t ALIGN(ii_nondc_4x16 [3 * 4 * 16], 64);
+ uint8_t ALIGN(ii_nondc_4x8 [3 * 4 * 8], 32);
+ uint8_t ALIGN(ii_nondc_4x4 [3 * 4 * 4], 16);
+} Dav1dMasks;
+
+#define II_MASK(c, bs, b) \
+ ((const uint8_t*)((uintptr_t)&dav1d_masks + \
+ (size_t)((b)->interintra_type == INTER_INTRA_BLEND ? \
+ dav1d_masks.offsets[c][(bs)-BS_32x32].ii[(b)->interintra_mode] : \
+ dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[0][(b)->wedge_idx]) * 8))
+
+#define WEDGE_MASK(c, bs, sign, idx) \
+ ((const uint8_t*)((uintptr_t)&dav1d_masks + \
+ (size_t)dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[sign][idx] * 8))
+
+EXTERN Dav1dMasks dav1d_masks;
+
+void dav1d_init_ii_wedge_masks(void);
+
+#endif /* DAV1D_SRC_WEDGE_H */
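
Dav1dMasks above stores the location of every mask as a uint16_t offset counted in 8-byte units from the start of the struct (see MASK_OFFSET() in wedge.c); this fits because each mask buffer is at least 8-byte aligned and the whole struct stays far below the 512 KiB (65536 × 8 bytes) that such an offset can address. The II_MASK and WEDGE_MASK macros undo that packing; an equivalent helper-style formulation, for illustration only (the function name is invented):

#include <stdint.h>

/* Equivalent of the pointer math in WEDGE_MASK()/II_MASK(): turn a stored
 * 16-bit offset (in 8-byte units from the struct base) back into a pointer. */
static inline const uint8_t *mask_from_offset(const void *const masks_base,
                                              const uint16_t offset)
{
    return (const uint8_t *)((uintptr_t)masks_base + (size_t)offset * 8);
}

/* e.g. WEDGE_MASK(c, bs, sign, idx) corresponds to
 * mask_from_offset(&dav1d_masks,
 *                  dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[sign][idx]). */
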
diff --git a/third_party/dav1d/src/win32/thread.c b/third_party/dav1d/src/win32/thread.c
new file mode 100644
index 0000000000..b89bd6b165
--- /dev/null
+++ b/third_party/dav1d/src/win32/thread.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#if defined(_WIN32)
+
+#include <process.h>
+#include <stdlib.h>
+#include <windows.h>
+
+#include "common/attributes.h"
+
+#include "src/thread.h"
+
+static HRESULT (WINAPI *set_thread_description)(HANDLE, PCWSTR);
+
+COLD void dav1d_init_thread(void) {
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+ HANDLE kernel32 = GetModuleHandleW(L"kernel32.dll");
+ if (kernel32)
+ set_thread_description =
+ (void*)GetProcAddress(kernel32, "SetThreadDescription");
+#endif
+}
+
+#undef dav1d_set_thread_name
+COLD void dav1d_set_thread_name(const wchar_t *const name) {
+ if (set_thread_description) /* Only available since Windows 10 1607 */
+ set_thread_description(GetCurrentThread(), name);
+}
+
+static COLD unsigned __stdcall thread_entrypoint(void *const data) {
+ pthread_t *const t = data;
+ t->arg = t->func(t->arg);
+ return 0;
+}
+
+COLD int dav1d_pthread_create(pthread_t *const thread,
+ const pthread_attr_t *const attr,
+ void *(*const func)(void*), void *const arg)
+{
+ const unsigned stack_size = attr ? attr->stack_size : 0;
+ thread->func = func;
+ thread->arg = arg;
+ thread->h = (HANDLE)_beginthreadex(NULL, stack_size, thread_entrypoint, thread,
+ STACK_SIZE_PARAM_IS_A_RESERVATION, NULL);
+ return !thread->h;
+}
+
+COLD int dav1d_pthread_join(pthread_t *const thread, void **const res) {
+ if (WaitForSingleObject(thread->h, INFINITE))
+ return 1;
+
+ if (res)
+ *res = thread->arg;
+
+ return !CloseHandle(thread->h);
+}
+
+COLD int dav1d_pthread_once(pthread_once_t *const once_control,
+ void (*const init_routine)(void))
+{
+ BOOL pending = FALSE;
+
+ if (InitOnceBeginInitialize(once_control, 0, &pending, NULL) != TRUE)
+ return 1;
+
+ if (pending == TRUE)
+ init_routine();
+
+ return !InitOnceComplete(once_control, 0, NULL);
+}
+
+#endif
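
win32/thread.c above is the Windows shim behind dav1d's pthread-style API: thread creation goes through _beginthreadex(), joining through WaitForSingleObject(), and one-time init through the InitOnce functions, with each wrapper returning 0 on success. A minimal usage sketch built only on the functions shown (worker() and run_worker() are invented names):

#include "src/thread.h"

/* Hypothetical worker: whatever it returns is reported back through the
 * res pointer of dav1d_pthread_join(). */
static void *worker(void *const arg) {
    return arg;
}

static int run_worker(void *const job) {
    pthread_t t;
    void *res;
    if (dav1d_pthread_create(&t, NULL, worker, job))
        return -1; /* non-zero means thread creation failed */
    return dav1d_pthread_join(&t, &res); /* 0 on success */
}
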
diff --git a/third_party/dav1d/src/x86/cdef.h b/third_party/dav1d/src/x86/cdef.h
new file mode 100644
index 0000000000..553d650741
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/cdef.h"
+
+#define decl_cdef_fns(ext) \
+ decl_cdef_fn(BF(dav1d_cdef_filter_4x4, ext)); \
+ decl_cdef_fn(BF(dav1d_cdef_filter_4x8, ext)); \
+ decl_cdef_fn(BF(dav1d_cdef_filter_8x8, ext))
+
+decl_cdef_fns(avx512icl);
+decl_cdef_fns(avx2);
+decl_cdef_fns(sse4);
+decl_cdef_fns(ssse3);
+decl_cdef_fns(sse2);
+
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, avx2));
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, sse4));
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3));
+
+static ALWAYS_INLINE void cdef_dsp_init_x86(Dav1dCdefDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+#if BITDEPTH == 8
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2);
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->dir = BF(dav1d_cdef_dir, ssse3);
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
+
+ c->dir = BF(dav1d_cdef_dir, sse4);
+#if BITDEPTH == 8
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, sse4);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, sse4);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, sse4);
+#endif
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->dir = BF(dav1d_cdef_dir, avx2);
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, avx2);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, avx2);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, avx512icl);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, avx512icl);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, avx512icl);
+#endif
+}
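
cdef_dsp_init_x86() above relies on a simple early-return ladder: each dav1d_get_cpu_flags() test bails out as soon as an instruction-set level is unavailable, so every assignment that is reached overwrites the slower pointer installed before it and the fastest supported version wins. A self-contained analogue of that pattern (the flag names and values here are invented, not dav1d's):

#include <stdio.h>

enum { FLAG_SSSE3 = 1 << 0, FLAG_SSE41 = 1 << 1, FLAG_AVX2 = 1 << 2 };

/* Mirrors the structure of cdef_dsp_init_x86(): later assignments replace
 * earlier ones, and a missing flag stops the ladder. */
static const char *pick_impl(const unsigned flags) {
    const char *impl = "c";
    if (!(flags & FLAG_SSSE3)) return impl;
    impl = "ssse3";
    if (!(flags & FLAG_SSE41)) return impl;
    impl = "sse4";
    if (!(flags & FLAG_AVX2)) return impl;
    return "avx2";
}

int main(void) {
    printf("%s\n", pick_impl(FLAG_SSSE3 | FLAG_SSE41)); /* prints "sse4" */
    return 0;
}
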
diff --git a/third_party/dav1d/src/x86/cdef16_avx2.asm b/third_party/dav1d/src/x86/cdef16_avx2.asm
new file mode 100644
index 0000000000..4c8d3bca43
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef16_avx2.asm
@@ -0,0 +1,877 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA
+
+%macro DIR_TABLE 1 ; stride
+ db 1 * %1 + 0, 2 * %1 + 0
+ db 1 * %1 + 0, 2 * %1 - 2
+ db -1 * %1 + 2, -2 * %1 + 4
+ db 0 * %1 + 2, -1 * %1 + 4
+ db 0 * %1 + 2, 0 * %1 + 4
+ db 0 * %1 + 2, 1 * %1 + 4
+ db 1 * %1 + 2, 2 * %1 + 4
+ db 1 * %1 + 0, 2 * %1 + 2
+ db 1 * %1 + 0, 2 * %1 + 0
+ db 1 * %1 + 0, 2 * %1 - 2
+ db -1 * %1 + 2, -2 * %1 + 4
+ db 0 * %1 + 2, -1 * %1 + 4
+%endmacro
+
+dir_table4: DIR_TABLE 16
+dir_table8: DIR_TABLE 32
+pri_taps: dw 4, 4, 3, 3, 2, 2, 3, 3
+
+dir_shift: times 2 dw 0x4000
+ times 2 dw 0x1000
+
+pw_2048: times 2 dw 2048
+pw_m16384: times 2 dw -16384
+
+cextern cdef_dir_8bpc_avx2.main
+
+SECTION .text
+
+%macro CDEF_FILTER 2 ; w, h
+ DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp
+ movifnidn prid, r5m
+ movifnidn secd, r6m
+ mov dird, r7m
+ vpbroadcastd m8, [base+pw_2048]
+ lea dirq, [base+dir_table%1+dirq*2]
+ test prid, prid
+ jz .sec_only
+%if WIN64
+ vpbroadcastw m6, prim
+ movaps [rsp+16*0], xmm9
+ movaps [rsp+16*1], xmm10
+%else
+ movd xm6, prid
+ vpbroadcastw m6, xm6
+%endif
+ lzcnt pridmpd, prid
+ rorx tmpd, prid, 2
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, tmpd ; pri >>= 2
+ mov tmpd, r8m ; damping
+ and prid, 4
+ sub tmpd, 31
+ vpbroadcastd m9, [base+pri_taps+priq+8*0]
+ vpbroadcastd m10, [base+pri_taps+priq+8*1]
+ test secd, secd
+ jz .pri_only
+%if WIN64
+ movaps r8m, xmm13
+ vpbroadcastw m13, secm
+ movaps r4m, xmm11
+ movaps r6m, xmm12
+%else
+ movd xm0, secd
+ vpbroadcastw m13, xm0
+%endif
+ lzcnt secd, secd
+ xor prid, prid
+ add pridmpd, tmpd
+ cmovs pridmpd, prid
+ add secd, tmpd
+ lea tmpq, [px]
+ mov [pri_shift], pridmpq
+ mov [sec_shift], secq
+%rep %1*%2/16
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
+%endrep
+%if WIN64
+ movaps xmm11, r4m
+ movaps xmm12, r6m
+ movaps xmm13, r8m
+%endif
+ jmp .pri_end
+.pri_only:
+ add pridmpd, tmpd
+ cmovs pridmpd, secd
+ lea tmpq, [px]
+ mov [pri_shift], pridmpq
+%rep %1*%2/16
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
+%endrep
+.pri_end:
+%if WIN64
+ movaps xmm9, [rsp+16*0]
+ movaps xmm10, [rsp+16*1]
+%endif
+.end:
+ RET
+.sec_only:
+ mov tmpd, r8m ; damping
+%if WIN64
+ vpbroadcastw m6, secm
+%else
+ movd xm6, secd
+ vpbroadcastw m6, xm6
+%endif
+ tzcnt secd, secd
+ sub tmpd, secd
+ mov [sec_shift], tmpq
+ lea tmpq, [px]
+%rep %1*%2/16
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
+%endrep
+ jmp .end
+%if %1 == %2
+ALIGN function_align
+.pri:
+ movsx offq, byte [dirq+4] ; off_k0
+%if %1 == 4
+ mova m1, [tmpq+32*0]
+ punpcklqdq m1, [tmpq+32*1] ; 0 2 1 3
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k0p0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k0p1
+%else
+ mova xm1, [tmpq+32*0]
+ vinserti128 m1, [tmpq+32*1], 1
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+5] ; off_k1
+ psubw m2, m1 ; diff_k0p0
+ psubw m3, m1 ; diff_k0p1
+ pabsw m4, m2 ; adiff_k0p0
+ psrlw m5, m4, [pri_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0p1
+ pminsw m0, m4
+ psrlw m4, m5, [pri_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0p0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k1p0
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k1p1
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ psubw m4, m1 ; diff_k1p0
+ psubw m5, m1 ; diff_k1p1
+ psignw m2, m3 ; constrain(diff_k0p1)
+ pabsw m3, m4 ; adiff_k1p0
+ paddw m0, m2 ; constrain(diff_k0)
+ psrlw m2, m3, [pri_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1p1
+ pminsw m7, m3
+ psrlw m3, m2, [pri_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1p0)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ psignw m4, m5 ; constrain(diff_k1p1)
+ paddw m7, m4 ; constrain(diff_k1)
+ pmullw m0, m9 ; pri_tap_k0
+ pmullw m7, m10 ; pri_tap_k1
+ paddw m0, m7 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ add tmpq, 32*2
+ paddw m0, m1
+%if %1 == 4
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r9 ], xm1
+ lea dstq, [dstq+strideq*4]
+%else
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+%endif
+ ret
+ALIGN function_align
+.sec:
+ movsx offq, byte [dirq+8] ; off1_k0
+%if %1 == 4
+ mova m1, [tmpq+32*0]
+ punpcklqdq m1, [tmpq+32*1]
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k0s0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k0s1
+%else
+ mova xm1, [tmpq+32*0]
+ vinserti128 m1, [tmpq+32*1], 1
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+0] ; off2_k0
+ psubw m2, m1 ; diff_k0s0
+ psubw m3, m1 ; diff_k0s1
+ pabsw m4, m2 ; adiff_k0s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0s1
+ pminsw m0, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k0s2
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k0s3
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+9] ; off1_k1
+ psubw m4, m1 ; diff_k0s2
+ psubw m5, m1 ; diff_k0s3
+ psignw m2, m3 ; constrain(diff_k0s1)
+ pabsw m3, m4 ; adiff_k0s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k0s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k0s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+%if %1 == 4
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k1s0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k1s1
+%else
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+1] ; off2_k1
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k0s3)
+ paddw m0, m4 ; constrain(diff_k0)
+ psubw m2, m1 ; diff_k1s0
+ psubw m3, m1 ; diff_k1s1
+ paddw m0, m0 ; sec_tap_k0
+ pabsw m4, m2 ; adiff_k1s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m7, m6, m5
+ pabsw m5, m3 ; adiff_k1s1
+ pminsw m7, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k1s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k1s2
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k1s3
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ paddw m0, m7
+ psubw m4, m1 ; diff_k1s2
+ psubw m5, m1 ; diff_k1s3
+ psignw m2, m3 ; constrain(diff_k1s1)
+ pabsw m3, m4 ; adiff_k1s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k1s3)
+ paddw m0, m4 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ add tmpq, 32*2
+ paddw m0, m1
+%if %1 == 4
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r9 ], xm1
+ lea dstq, [dstq+strideq*4]
+%else
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+%endif
+ ret
+ALIGN function_align
+.pri_sec:
+ movsx offq, byte [dirq+8] ; off2_k0
+%if %1 == 4
+ mova m1, [tmpq+32*0]
+ punpcklqdq m1, [tmpq+32*1]
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k0s0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k0s1
+%else
+ mova xm1, [dstq+strideq*0]
+ vinserti128 m1, [dstq+strideq*1], 1
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+0] ; off3_k0
+ pmaxsw m11, m2, m3
+ pminuw m12, m2, m3
+ psubw m2, m1 ; diff_k0s0
+ psubw m3, m1 ; diff_k0s1
+ pabsw m4, m2 ; adiff_k0s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m0, m13, m5
+ pabsw m5, m3 ; adiff_k0s1
+ pminsw m0, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0s0)
+ psubusw m2, m13, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k0s2
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k0s3
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+9] ; off2_k1
+ psignw m2, m3 ; constrain(diff_k0s1)
+ pmaxsw m11, m4
+ pminuw m12, m4
+ pmaxsw m11, m5
+ pminuw m12, m5
+ psubw m4, m1 ; diff_k0s2
+ psubw m5, m1 ; diff_k0s3
+ paddw m0, m2
+ pabsw m3, m4 ; adiff_k0s2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m13, m2
+ pabsw m2, m5 ; adiff_k0s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k0s2)
+ psubusw m4, m13, m3
+ pminsw m4, m2
+%if %1 == 4
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k1s0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k1s1
+%else
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+1] ; off3_k1
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k0s3)
+ pmaxsw m11, m2
+ pminuw m12, m2
+ pmaxsw m11, m3
+ pminuw m12, m3
+ paddw m0, m4 ; constrain(diff_k0)
+ psubw m2, m1 ; diff_k1s0
+ psubw m3, m1 ; diff_k1s1
+ paddw m0, m0 ; sec_tap_k0
+ pabsw m4, m2 ; adiff_k1s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m7, m13, m5
+ pabsw m5, m3 ; adiff_k1s1
+ pminsw m7, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k1s0)
+ psubusw m2, m13, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k1s2
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k1s3
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+4] ; off1_k0
+ paddw m0, m7
+ psignw m2, m3 ; constrain(diff_k1s1)
+ pmaxsw m11, m4
+ pminuw m12, m4
+ pmaxsw m11, m5
+ pminuw m12, m5
+ psubw m4, m1 ; diff_k1s2
+ psubw m5, m1 ; diff_k1s3
+ pabsw m3, m4 ; adiff_k1s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m13, m2
+ pabsw m2, m5 ; adiff_k1s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1s2)
+ psubusw m4, m13, m3
+ pminsw m4, m2
+ paddw m0, m7
+%if %1 == 4
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k0p0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k0p1
+%else
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+5] ; off1_k1
+ psignw m4, m5 ; constrain(diff_k1s3)
+ pmaxsw m11, m2
+ pminuw m12, m2
+ pmaxsw m11, m3
+ pminuw m12, m3
+ psubw m2, m1 ; diff_k0p0
+ psubw m3, m1 ; diff_k0p1
+ paddw m0, m4
+ pabsw m4, m2 ; adiff_k0p0
+ psrlw m5, m4, [pri_shift+gprsize]
+ psubusw m7, m6, m5
+ pabsw m5, m3 ; adiff_k0p1
+ pminsw m7, m4
+ psrlw m4, m5, [pri_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k0p0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k1p0
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k1p1
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ psignw m2, m3 ; constrain(diff_k0p1)
+ paddw m7, m2 ; constrain(diff_k0)
+ pmaxsw m11, m4
+ pminuw m12, m4
+ pmaxsw m11, m5
+ pminuw m12, m5
+ psubw m4, m1 ; diff_k1p0
+ psubw m5, m1 ; diff_k1p1
+ pabsw m3, m4 ; adiff_k1p0
+ pmullw m7, m9 ; pri_tap_k0
+ paddw m0, m7
+ psrlw m2, m3, [pri_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1p1
+ pminsw m7, m3
+ psrlw m3, m2, [pri_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1p0)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ psignw m4, m5 ; constrain(diff_k1p1)
+ paddw m7, m4 ; constrain(diff_k1)
+ pmullw m7, m10 ; pri_tap_k1
+ paddw m0, m7 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ add tmpq, 32*2
+ pmaxsw m11, m1
+ pminuw m12, m1
+ paddw m0, m1
+ pminsw m0, m11
+ pmaxsw m0, m12
+%if %1 == 4
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r9 ], xm1
+ lea dstq, [dstq+strideq*4]
+%else
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+%endif
+ ret
+%endif
+%endmacro
+
+INIT_YMM avx2
+cglobal cdef_filter_4x4_16bpc, 5, 10, 9, 16*10, dst, stride, left, top, bot, \
+ pri, sec, edge
+%if WIN64
+ %define px rsp+16*6
+ %define offq r8
+ %define pri_shift rsp+16*2
+ %define sec_shift rsp+16*3
+%else
+ %define px rsp+16*4
+ %define offq r4
+ %define pri_shift rsp+16*0
+ %define sec_shift rsp+16*1
+%endif
+ %define base r8-dir_table4
+ mov edged, r9m
+ lea r8, [dir_table4]
+ movu xm0, [dstq+strideq*0]
+ movu xm1, [dstq+strideq*1]
+ lea r9, [strideq*3]
+ movu xm2, [dstq+strideq*2]
+ movu xm3, [dstq+r9 ]
+ vpbroadcastd m7, [base+pw_m16384]
+ mova [px+16*0+0], xm0
+ mova [px+16*1+0], xm1
+ mova [px+16*2+0], xm2
+ mova [px+16*3+0], xm3
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movu xm0, [topq+strideq*0]
+ movu xm1, [topq+strideq*1]
+ mova [px-16*2+0], xm0
+ mova [px-16*1+0], xm1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd xm0, [topq+strideq*0-4]
+ movd xm1, [topq+strideq*1-4]
+ movd [px-16*2-4], xm0
+ movd [px-16*1-4], xm1
+ jmp .top_done
+.no_top:
+ mova [px-16*2+0], m7
+.top_no_left:
+ movd [px-16*2-4], xm7
+ movd [px-16*1-4], xm7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movu xm0, [botq+strideq*0]
+ movu xm1, [botq+strideq*1]
+ mova [px+16*4+0], xm0
+ mova [px+16*5+0], xm1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd xm0, [botq+strideq*0-4]
+ movd xm1, [botq+strideq*1-4]
+ movd [px+16*4-4], xm0
+ movd [px+16*5-4], xm1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+16*4+0], m7
+.bottom_no_left:
+ movd [px+16*4-4], xm7
+ movd [px+16*5-4], xm7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movd xm0, [leftq+4*0]
+ movd xm1, [leftq+4*1]
+ movd xm2, [leftq+4*2]
+ movd xm3, [leftq+4*3]
+ movd [px+16*0-4], xm0
+ movd [px+16*1-4], xm1
+ movd [px+16*2-4], xm2
+ movd [px+16*3-4], xm3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5
+.padding_done:
+ CDEF_FILTER 4, 4
+
+cglobal cdef_filter_4x8_16bpc, 5, 10, 9, 16*14, dst, stride, left, top, bot, \
+ pri, sec, edge
+ mov edged, r9m
+ movu xm0, [dstq+strideq*0]
+ movu xm1, [dstq+strideq*1]
+ lea r9, [strideq*3]
+ movu xm2, [dstq+strideq*2]
+ movu xm3, [dstq+r9 ]
+ lea r6, [dstq+strideq*4]
+ movu xm4, [r6 +strideq*0]
+ movu xm5, [r6 +strideq*1]
+ movu xm6, [r6 +strideq*2]
+ movu xm7, [r6 +r9 ]
+ lea r8, [dir_table4]
+ mova [px+16*0+0], xm0
+ mova [px+16*1+0], xm1
+ mova [px+16*2+0], xm2
+ mova [px+16*3+0], xm3
+ mova [px+16*4+0], xm4
+ mova [px+16*5+0], xm5
+ mova [px+16*6+0], xm6
+ mova [px+16*7+0], xm7
+ vpbroadcastd m7, [base+pw_m16384]
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movu xm0, [topq+strideq*0]
+ movu xm1, [topq+strideq*1]
+ mova [px-16*2+0], xm0
+ mova [px-16*1+0], xm1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd xm0, [topq+strideq*0-4]
+ movd xm1, [topq+strideq*1-4]
+ movd [px-16*2-4], xm0
+ movd [px-16*1-4], xm1
+ jmp .top_done
+.no_top:
+ mova [px-16*2+0], m7
+.top_no_left:
+ movd [px-16*2-4], xm7
+ movd [px-16*1-4], xm7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movu xm0, [botq+strideq*0]
+ movu xm1, [botq+strideq*1]
+ mova [px+16*8+0], xm0
+ mova [px+16*9+0], xm1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd xm0, [botq+strideq*0-4]
+ movd xm1, [botq+strideq*1-4]
+ movd [px+16*8-4], xm0
+ movd [px+16*9-4], xm1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+16*8+0], m7
+.bottom_no_left:
+ movd [px+16*8-4], xm7
+ movd [px+16*9-4], xm7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movd xm0, [leftq+4*0]
+ movd xm1, [leftq+4*1]
+ movd xm2, [leftq+4*2]
+ movd xm3, [leftq+4*3]
+ movd [px+16*0-4], xm0
+ movd [px+16*1-4], xm1
+ movd [px+16*2-4], xm2
+ movd [px+16*3-4], xm3
+ movd xm0, [leftq+4*4]
+ movd xm1, [leftq+4*5]
+ movd xm2, [leftq+4*6]
+ movd xm3, [leftq+4*7]
+ movd [px+16*4-4], xm0
+ movd [px+16*5-4], xm1
+ movd [px+16*6-4], xm2
+ movd [px+16*7-4], xm3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+.padding_done:
+ CDEF_FILTER 4, 8
+
+cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*13, dst, stride, left, top, bot, \
+ pri, sec, edge
+%if WIN64
+ %define px rsp+32*4
+%else
+ %define px rsp+32*3
+%endif
+ %define base r8-dir_table8
+ mov edged, r9m
+ movu m0, [dstq+strideq*0]
+ movu m1, [dstq+strideq*1]
+ lea r6, [dstq+strideq*2]
+ movu m2, [r6 +strideq*0]
+ movu m3, [r6 +strideq*1]
+ lea r6, [r6 +strideq*2]
+ movu m4, [r6 +strideq*0]
+ movu m5, [r6 +strideq*1]
+ lea r6, [r6 +strideq*2]
+ movu m6, [r6 +strideq*0]
+ movu m7, [r6 +strideq*1]
+ lea r8, [dir_table8]
+ mova [px+32*0+0], m0
+ mova [px+32*1+0], m1
+ mova [px+32*2+0], m2
+ mova [px+32*3+0], m3
+ mova [px+32*4+0], m4
+ mova [px+32*5+0], m5
+ mova [px+32*6+0], m6
+ mova [px+32*7+0], m7
+ vpbroadcastd m7, [base+pw_m16384]
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movu m0, [topq+strideq*0]
+ movu m1, [topq+strideq*1]
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd xm0, [topq+strideq*0-4]
+ movd xm1, [topq+strideq*1-4]
+ movd [px-32*2-4], xm0
+ movd [px-32*1-4], xm1
+ jmp .top_done
+.no_top:
+ mova [px-32*2+0], m7
+ mova [px-32*1+0], m7
+.top_no_left:
+ movd [px-32*2-4], xm7
+ movd [px-32*1-4], xm7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movu m0, [botq+strideq*0]
+ movu m1, [botq+strideq*1]
+ mova [px+32*8+0], m0
+ mova [px+32*9+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd xm0, [botq+strideq*0-4]
+ movd xm1, [botq+strideq*1-4]
+ movd [px+32*8-4], xm0
+ movd [px+32*9-4], xm1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+32*8+0], m7
+ mova [px+32*9+0], m7
+.bottom_no_left:
+ movd [px+32*8-4], xm7
+ movd [px+32*9-4], xm7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movd xm0, [leftq+4*0]
+ movd xm1, [leftq+4*1]
+ movd xm2, [leftq+4*2]
+ movd xm3, [leftq+4*3]
+ movd [px+32*0-4], xm0
+ movd [px+32*1-4], xm1
+ movd [px+32*2-4], xm2
+ movd [px+32*3-4], xm3
+ movd xm0, [leftq+4*4]
+ movd xm1, [leftq+4*5]
+ movd xm2, [leftq+4*6]
+ movd xm3, [leftq+4*7]
+ movd [px+32*4-4], xm0
+ movd [px+32*5-4], xm1
+ movd [px+32*6-4], xm2
+ movd [px+32*7-4], xm3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+32*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+32*x+16], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+.padding_done:
+ CDEF_FILTER 8, 8
+
+cglobal cdef_dir_16bpc, 4, 7, 6, src, stride, var, bdmax
+ lea r6, [dir_shift]
+ shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
+ vpbroadcastd m4, [r6+bdmaxq*4]
+ lea r6, [strideq*3]
+ mova xm0, [srcq+strideq*0]
+ mova xm1, [srcq+strideq*1]
+ mova xm2, [srcq+strideq*2]
+ mova xm3, [srcq+r6 ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m0, [srcq+r6 ], 1
+ vinserti128 m1, [srcq+strideq*2], 1
+ vinserti128 m2, [srcq+strideq*1], 1
+ vinserti128 m3, [srcq+strideq*0], 1
+ REPX {pmulhuw x, m4}, m0, m1, m2, m3
+ jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
+
+%endif ; ARCH_X86_64
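
The tap kernels above all follow the per-sample recipe spelled out in their comments: take the difference to the centre pixel, constrain it by the filter strength (pabsw/psrlw/psubusw/pminsw/psignw), accumulate the weighted taps, then finish with psraw by 15, pmulhrsw by 2048 and an add of the centre pixel (plus a min/max clamp in the combined pri+sec path). A scalar C restatement of those two building blocks, matching the SIMD operations rather than adding anything new:

#include <stdlib.h>

/* The constrain step: limit a neighbour-minus-centre difference by the
 * strength, where the permitted range shrinks as |diff| >> shift grows. */
static int constrain(const int diff, const int strength, const int shift) {
    const int adiff = abs(diff);                  /* pabsw           */
    int limit = strength - (adiff >> shift);      /* psrlw + psubusw */
    if (limit < 0) limit = 0;                     /* (unsigned saturation) */
    if (limit > adiff) limit = adiff;             /* pminsw          */
    return diff < 0 ? -limit : limit;             /* psignw (limit is 0 when diff is 0) */
}

/* The rounding step: psraw by 15 contributes -1 for negative sums, and
 * pmulhrsw with 2048 computes (x + 8) >> 4, so the filtered pixel is the
 * centre value plus the rounded, sign-corrected tap sum. */
static int cdef_round(const int px, const int sum) {
    return px + ((sum - (sum < 0) + 8) >> 4);
}
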
diff --git a/third_party/dav1d/src/x86/cdef16_avx512.asm b/third_party/dav1d/src/x86/cdef16_avx512.asm
new file mode 100644
index 0000000000..6d625a02a0
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef16_avx512.asm
@@ -0,0 +1,622 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+cdef_perm: db 2, 18, 16, 18, 24, 19, 0, 19, 25, 20, 1, 20, 26, 21, 2, 21
+ db 3, 26, 3, 26, 28, 27, 4, 27, 29, 28, -1, 28, 30, 29, -1, 29
+ db 0, 34, 17, 34, 16, 35, 8, 35, 17, 36, 9, 36, 18, 37, 10, 37
+ db 1, 42, 11, 42, 20, 43, 12, 43, 21, 44, -1, 44, 22, 45, -1, 45
+end_perm4: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
+ db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
+edge_mask4: dw 0xff99, 0xff88, 0xff11, 0xff00 ; 0100, 0101, 0110, 0111
+ dw 0x99ff, 0x88ff, 0x11ff, 0x00ff ; 1000, 1001, 1010, 1011
+ dw 0x9999, 0x8888, 0x1111, 0x0000 ; 1100, 1101, 1110, 1111
+pri_taps4: dw 64, 32, 48, 48 ; left-shifted by 4
+cdef_dirs4: dw 8, 16, 8, 15, -7,-14, 1, -6
+ dw 1, 2, 1, 10, 9, 18, 8, 17
+ dw 8, 16, 8, 15, -7,-14, 1, -6
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+cdef_dirs8: db 32, 64, 32, 62,-30,-60, 2,-28
+ db 2, 4, 2, 36, 34, 68, 32, 66
+ db 32, 64, 32, 62,-30,-60, 2,-28
+pri_taps8: dw 4, 4, 2, 2, 3, 3, 3, 3
+sec_taps4: dw 32, 16
+pw_m16384: times 2 dw -16384
+pw_2048: times 2 dw 2048
+pd_268435568: dd 268435568 ; (1 << 28) + (7 << 4)
+edge_mask8: dw 0x2121, 0x2020, 0x0101
+
+SECTION .text
+
+%macro CONSTRAIN 7 ; dst, p, px, zero, tresh, shift, tmp
+ psubw %1, %2, %3
+ pabsw %1, %1
+ vpcmpgtw k1, %3, %2
+ vpsrlvw %7, %1, %6
+ psubusw %7, %5, %7
+ pminsw %1, %7
+ vpsubw %1{k1}, %4, %1
+%endmacro
+
+; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25
+; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35
+; L0 L1 00 01 02 03 04 05 b0 b1 b2 b3 b4 b5 b6 b7
+; L2 L3 10 11 12 13 14 15 B0 B1 B2 B3 B4 B5 B6 B7
+
+INIT_ZMM avx512icl
+cglobal cdef_filter_4x4_16bpc, 5, 7, 16, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r6-cdef_dirs4
+ lea r6, [cdef_dirs4]
+ movu xm3, [dstq+strideq*0]
+ vinserti32x4 ym3, [dstq+strideq*1], 1
+ mova xm2, [leftq]
+ lea r2, [dstq+strideq*2]
+ vinserti32x4 m3, [r2+strideq*0], 2
+ mova m5, [base+cdef_perm]
+ vinserti32x4 m3, [r2+strideq*1], 3
+ vpermt2d m2, m5, m3
+ vinserti32x4 m1, m2, [topq+strideq*0-4], 0
+ vinserti32x4 m1, [topq+strideq*1-4], 1
+ mov r3d, edgem
+ movifnidn prid, prim
+ punpcklwd m3, m3 ; px
+ psrlw m5, 8
+ vpbroadcastd m0, [base+pd_268435568]
+ pxor m12, m12
+ cmp r3d, 0x0f
+ jne .mask_edges
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+.main:
+ test prid, prid
+ jz .sec_only
+ lzcnt r4d, prid
+ rorx r3d, prid, 2
+ vpbroadcastw m13, prim
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, r3d ; pri >>= 2
+ mov r3d, dampingm
+ and prid, 4
+ sub r3d, 31
+ vpbroadcastd m15, [base+pri_taps4+priq]
+ xor prid, prid
+ add r4d, r3d
+ cmovns prid, r4d ; pri_shift
+ mov r4d, dirm
+ vpbroadcastw m14, prid
+ mov r5d, secm
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4]
+ call .constrain
+ test r5d, r5d
+ jz .end_no_clip
+ lzcnt r5d, r5d
+ vpbroadcastw m13, secm
+ add r3d, r5d
+ pminuw m6, m3, m8
+ pmaxsw m7, m3, m8
+ pminuw m6, m9
+ pmaxsw m7, m9
+ call .constrain_sec
+ pminuw m6, m8
+ pmaxsw m7, m8
+ pminuw m6, m9
+ pmaxsw m7, m9
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+ pminuw m6, m8
+ pmaxsw m7, m8
+ pminuw m6, m9
+ pmaxsw m7, m9
+ psrldq m8, m6, 2
+ vpshldd m3, m0, 8
+ psrldq m9, m7, 2
+ paddd m0, m3
+ pminuw m6, m8
+ psrldq m0, 1
+ pmaxsw m7, m9
+ pmaxsw m0, m6
+ pminsw m0, m7
+ vpmovdw ym0, m0
+ jmp .end
+.sec_only:
+ tzcnt r5d, secm
+ mov r3d, dampingm
+ vpbroadcastw m13, secm
+ mov r4d, dirm
+ sub r3d, r5d ; sec_shift
+ call .constrain_sec
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+.end_no_clip:
+ mova ym1, [base+end_perm4]
+ vpshldd m3, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ paddd m0, m3 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ vpermb m0, m1, m0
+.end:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm0, ym0, 1
+ movq [r2+strideq*0], xm0
+ movhps [r2+strideq*1], xm0
+ RET
+.mask_edges:
+ vpbroadcastd m6, [base+pw_m16384]
+ test r3b, 0x08
+ jz .mask_edges_no_bottom ; avoid buffer overread
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ jmp .mask_edges_main
+.mask_edges_no_bottom:
+ kmovw k1, [base+edge_mask4+8+r3*2]
+.mask_edges_main:
+ or r3d, 0x04
+ vmovdqa32 m1{k1}, m6 ; edge pixels = -16384
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ vmovdqa32 m2{k1}, m6
+ jmp .main
+.constrain_sec:
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4]
+ vpbroadcastw m14, r3d
+ vpbroadcastd m15, [base+sec_taps4]
+.constrain:
+ paddw m8, m5, m9
+ vpermi2w m8, m1, m2 ; k0p0 k1p0
+ psubw m9, m5, m9
+ vpermi2w m9, m1, m2 ; k0p1 k1p1
+ CONSTRAIN m10, m8, m3, m12, m13, m14, m11
+ vpdpwssd m0, m10, m15
+ CONSTRAIN m10, m9, m3, m12, m13, m14, m11
+ vpdpwssd m0, m10, m15
+ ret
+
+; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65
+; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75
+; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7
+; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7
+
+cglobal cdef_filter_4x8_16bpc, 5, 7, 22, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+ lea r6, [cdef_dirs4]
+ movu xm18, [dstq+strideq*0]
+ vinserti128 ym18, [dstq+strideq*1], 1
+ mova xm1, [leftq+16*0]
+ mova xm2, [leftq+16*1]
+ lea r2, [strideq*3]
+ vinserti32x4 m18, [dstq+strideq*2], 2
+ mova m5, [base+cdef_perm]
+ vinserti32x4 m18, [dstq+r2 ], 3
+ vpermt2d m1, m5, m18
+ vinserti32x4 m0, m1, [topq+strideq*0-4], 0
+ vinserti32x4 m0, [topq+strideq*1-4], 1
+ lea r3, [dstq+strideq*4]
+ movu xm19, [r3+strideq*0]
+ vinserti128 ym19, [r3+strideq*1], 1
+ vinserti32x4 m19, [r3+strideq*2], 2
+ vinserti32x4 m19, [r3+r2 ], 3
+ mov r3d, edgem
+ movifnidn prid, prim
+ vpermt2d m2, m5, m19
+ vpbroadcastd m16, [base+pd_268435568]
+ pxor m12, m12
+ punpcklwd m18, m18 ; px (top)
+ psrlw m5, 8
+ punpcklwd m19, m19 ; px (bottom)
+ mova m17, m16
+ vshufi32x4 m1, m2, q3210
+ cmp r3d, 0x0f
+ jne .mask_edges
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+.main:
+ test prid, prid
+ jz .sec_only
+ lzcnt r4d, prid
+ rorx r3d, prid, 2
+ vpbroadcastw m13, prim
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, r3d ; pri >>= 2
+ mov r3d, dampingm
+ and prid, 4
+ sub r3d, 31
+ vpbroadcastd m15, [base+pri_taps4+priq]
+ xor prid, prid
+ add r4d, r3d
+ cmovns prid, r4d ; pri_shift
+ mov r4d, dirm
+ vpbroadcastw m14, prid
+ mov r5d, secm
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4]
+ call .constrain
+ test r5d, r5d
+ jz .end_no_clip
+ lzcnt r5d, r5d
+ vpbroadcastw m13, secm
+ add r3d, r5d
+ pminuw m3, m18, m6
+ pmaxsw m4, m18, m6
+ pminuw m20, m19, m7
+ pmaxsw m21, m19, m7
+ pminuw m3, m8
+ pmaxsw m4, m8
+ pminuw m20, m9
+ pmaxsw m21, m9
+ call .constrain_sec
+ pminuw m3, m6
+ pmaxsw m4, m6
+ pminuw m20, m7
+ pmaxsw m21, m7
+ pminuw m3, m8
+ pmaxsw m4, m8
+ pminuw m20, m9
+ pmaxsw m21, m9
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+ pminuw m3, m6
+ pmaxsw m4, m6
+ mov r3, 0xcccccccccccccccc
+ pminuw m20, m7
+ pmaxsw m21, m7
+ kmovq k1, r3
+ pminuw m3, m8
+ pmaxsw m4, m8
+ pminuw m20, m9
+ pmaxsw m21, m9
+ vbroadcasti32x4 m0, [base+deint_shuf]
+ vpshldd m6, m20, m3, 16
+ vmovdqu8 m3{k1}, m20
+ vpshldd m18, m16, 8
+ vpshldd m7, m21, m4, 16
+ vmovdqu8 m4{k1}, m21
+ vpshldd m19, m17, 8
+ pminuw m3, m6
+ paddd m16, m18
+ pmaxsw m4, m7
+ paddd m17, m19
+ psrldq m16, 1
+ palignr m16{k1}, m17, m17, 15
+ lea r6, [dstq+strideq*4]
+ pmaxsw m16, m3
+ pminsw m16, m4
+ pshufb m16, m0
+ movq [dstq+strideq*0], xm16
+ movhps [r6 +strideq*0], xm16
+ vextracti128 xm17, ym16, 1
+ movq [dstq+strideq*1], xm17
+ movhps [r6 +strideq*1], xm17
+ vextracti32x4 xm17, m16, 2
+ movq [dstq+strideq*2], xm17
+ movhps [r6 +strideq*2], xm17
+ vextracti32x4 xm16, m16, 3
+ movq [dstq+r2 ], xm16
+ movhps [r6 +r2 ], xm16
+ RET
+.sec_only:
+ mov r4d, dirm
+ tzcnt r5d, secm
+ mov r3d, dampingm
+ vpbroadcastw m13, secm
+ sub r3d, r5d ; sec_shift
+ call .constrain_sec
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+.end_no_clip:
+ mova ym20, [base+end_perm4]
+ vpshldd m18, m16, 8 ; (px << 8) + ((sum > -8) << 4)
+ vpshldd m19, m17, 8
+ paddd m16, m18 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ paddd m17, m19
+ vpermb m16, m20, m16
+ vpermb m17, m20, m17
+ movq [dstq+strideq*0], xm16
+ movhps [dstq+strideq*1], xm16
+ vextracti128 xm16, ym16, 1
+ movq [dstq+strideq*2], xm16
+ movhps [dstq+r2 ], xm16
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm17
+ movhps [dstq+strideq*1], xm17
+ vextracti128 xm17, ym17, 1
+ movq [dstq+strideq*2], xm17
+ movhps [dstq+r2 ], xm17
+ RET
+.mask_edges:
+ vpbroadcastd m6, [base+pw_m16384]
+ test r3b, 0x08
+ jz .mask_edges_no_bottom ; avoid buffer overread
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ jmp .mask_edges_main
+.mask_edges_no_bottom:
+ kmovw k1, [base+edge_mask4+8+r3*2]
+.mask_edges_main:
+ mov r4d, r3d
+ or r3d, 0x0c
+ vmovdqa32 m0{k1}, m6 ; edge pixels = -16384
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ or r4d, 0x04
+ vmovdqa32 m1{k1}, m6
+ kmovw k1, [base+edge_mask4-8+r4*2]
+ vmovdqa32 m2{k1}, m6
+ jmp .main
+.constrain_sec:
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4]
+ vpbroadcastw m14, r3d
+ vpbroadcastd m15, [base+sec_taps4]
+.constrain:
+ paddw m7, m5, m9
+ mova m6, m0
+ vpermt2w m6, m7, m1 ; k0p0 k1p0 (top)
+ psubw m9, m5, m9
+ mova m8, m0
+ vpermi2w m7, m1, m2 ; k0p0 k1p0 (bottom)
+ CONSTRAIN m10, m6, m18, m12, m13, m14, m11
+ vpermt2w m8, m9, m1 ; k0p1 k1p1 (top)
+ vpdpwssd m16, m10, m15
+ CONSTRAIN m10, m7, m19, m12, m13, m14, m11
+ vpermi2w m9, m1, m2 ; k0p1 k1p1 (bottom)
+ vpdpwssd m17, m10, m15
+ CONSTRAIN m10, m8, m18, m12, m13, m14, m11
+ vpdpwssd m16, m10, m15
+ CONSTRAIN m10, m9, m19, m12, m13, m14, m11
+ vpdpwssd m17, m10, m15
+ ret
+
+cglobal cdef_filter_8x8_16bpc, 5, 7, 22, 64*6, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r6-cdef_dirs8
+ lea r6, [cdef_dirs8]
+ movu ym17, [dstq+strideq*0]
+ vinserti32x8 m17, [dstq+strideq*1], 1
+ movq xm4, [leftq+8*0]
+ movq xm5, [leftq+8*1]
+ psrld m2, [base+cdef_perm], 16
+ movq xm6, [leftq+8*2]
+ movq xm7, [leftq+8*3]
+ lea r2, [strideq*3]
+ movu ym16, [topq+strideq*0-4]
+ vinserti32x8 m16, [topq+strideq*1-4], 1
+ lea r3, [dstq+strideq*4]
+ movu ym18, [dstq+strideq*2]
+ vinserti32x8 m18, [dstq+r2 ], 1
+ movu ym19, [r3+strideq*0]
+ vinserti32x8 m19, [r3+strideq*1], 1
+ movu ym20, [r3+strideq*2]
+ vinserti32x8 m20, [r3+r2 ], 1
+ vshufi32x4 m0, m17, m18, q2020 ; px (top)
+ mov r3d, edgem
+ vshufi32x4 m1, m19, m20, q2020 ; px (bottom)
+ movifnidn prid, prim
+ vpermt2d m17, m2, m4
+ vpermt2d m18, m2, m5
+ pxor m12, m12
+ vpermt2d m19, m2, m6
+ vpermt2d m20, m2, m7
+ cmp r3d, 0x0f
+ jne .mask_edges
+ movu ym21, [botq+strideq*0-4]
+ vinserti32x8 m21, [botq+strideq*1-4], 1
+.main:
+ mova [rsp+64*0], m16 ; top
+ mova [rsp+64*1], m17 ; 0 1
+ mova [rsp+64*2], m18 ; 2 3
+ mova [rsp+64*3], m19 ; 4 5
+ mova [rsp+64*4], m20 ; 6 7
+ mova [rsp+64*5], m21 ; bottom
+ test prid, prid
+ jz .sec_only
+ lzcnt r4d, prid
+ rorx r3d, prid, 2
+ vpbroadcastw m13, prim
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, r3d ; pri >>= 2
+ mov r3d, dampingm
+ and prid, 4
+ sub r3d, 31
+ add r4d, r3d ; pri_shift
+ vpbroadcastw m14, r4d
+ mov r4d, dirm
+ vpbroadcastd m2, [base+pri_taps8+priq*2+0]
+ vpbroadcastd m3, [base+pri_taps8+priq*2+4]
+ movsx r5, byte [base+cdef_dirs8+(r4+2)*2+0] ; k0off1
+ pmaxsw m14, m12
+ call .constrain
+ mov r5d, secm
+ pmullw m16, m8, m2
+ pmullw m17, m9, m2
+ test r5d, r5d
+ jnz .pri_sec
+ movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
+ call .constrain
+ pmullw m8, m3
+ pmullw m9, m3
+ jmp .end_no_clip
+.pri_sec:
+ lzcnt r5d, r5d
+ add r3d, r5d ; sec_shift
+ movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
+ pminuw m18, m0, m4
+ pmaxsw m19, m0, m4
+ pminuw m20, m1, m5
+ pmaxsw m21, m1, m5
+ call .min_max_constrain2
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0] ; k0off2
+ pmullw m8, m3
+ pmullw m9, m3
+ vpbroadcastw m13, secm
+ vpbroadcastw m14, r3d
+ paddw m16, m8
+ paddw m17, m9
+ call .min_max_constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] ; k0off3
+ mova m2, m8
+ mova m3, m9
+ call .min_max_constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] ; k1off2
+ paddw m2, m8
+ paddw m3, m9
+ call .min_max_constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] ; k1off3
+ paddw m2, m2
+ paddw m3, m3
+ paddw m16, m8
+ paddw m17, m9
+ call .min_max_constrain
+ vpbroadcastd m10, [base+pw_2048]
+ paddw m16, m2
+ paddw m17, m3
+ paddw m16, m8
+ paddw m17, m9
+ psraw m8, m16, 15
+ psraw m9, m17, 15
+ paddw m16, m8
+ paddw m17, m9
+ pmulhrsw m16, m10
+ pmulhrsw m17, m10
+ pminuw m18, m4
+ pmaxsw m19, m4
+ pminuw m20, m5
+ pmaxsw m21, m5
+ pminuw m18, m6
+ pmaxsw m19, m6
+ pminuw m20, m7
+ pmaxsw m21, m7
+ paddw m16, m0
+ paddw m17, m1
+ pmaxsw m16, m18
+ pmaxsw m17, m20
+ pminsw m16, m19
+ pminsw m17, m21
+ jmp .end
+.sec_only:
+ tzcnt r5d, secm
+ mov r4d, dirm
+ mov r3d, dampingm
+ vpbroadcastw m13, secm
+ sub r3d, r5d
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0]
+ vpbroadcastw m14, r3d
+ call .constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0]
+ mova m16, m8
+ mova m17, m9
+ call .constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1]
+ paddw m16, m8
+ paddw m17, m9
+ call .constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1]
+ paddw m16, m16
+ paddw m17, m17
+ paddw m16, m8
+ paddw m17, m9
+ call .constrain
+.end_no_clip:
+ vpbroadcastd m10, [base+pw_2048]
+ paddw m16, m8
+ paddw m17, m9
+ psraw m8, m16, 15
+ psraw m9, m17, 15
+ paddw m16, m8
+ paddw m17, m9
+ pmulhrsw m16, m10
+ pmulhrsw m17, m10
+ paddw m16, m0
+ paddw m17, m1
+.end:
+ mova [dstq+strideq*0], xm16
+ vextracti128 [dstq+strideq*1], ym16, 1
+ vextracti32x4 [dstq+strideq*2], m16, 2
+ vextracti32x4 [dstq+r2 ], m16, 3
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm17
+ vextracti128 [dstq+strideq*1], ym17, 1
+ vextracti32x4 [dstq+strideq*2], m17, 2
+ vextracti32x4 [dstq+r2 ], m17, 3
+ RET
+.mask_edges:
+ vpbroadcastd m2, [base+pw_m16384]
+ test r3b, 0x08
+ jz .mask_edges_no_bottom ; avoid buffer overread
+ movu ym21, [botq+strideq*0-4]
+ vinserti32x8 m21, [botq+strideq*1-4], 1
+ jmp .mask_edges_top
+.mask_edges_no_bottom:
+ mova m21, m2
+.mask_edges_top:
+ test r3b, 0x04
+ jnz .mask_edges_main
+ mova m16, m2
+.mask_edges_main:
+ and r3d, 0x03
+ cmp r3d, 0x03
+ je .main
+ kmovw k1, [base+edge_mask8+r3*2]
+ vmovdqa32 m16{k1}, m2 ; edge pixels = -16384
+ vmovdqa32 m17{k1}, m2
+ vmovdqa32 m18{k1}, m2
+ vmovdqa32 m19{k1}, m2
+ vmovdqa32 m20{k1}, m2
+ vmovdqa32 m21{k1}, m2
+ jmp .main
+ALIGN function_align
+.min_max_constrain:
+ pminuw m18, m4
+ pmaxsw m19, m4
+ pminuw m20, m5
+ pmaxsw m21, m5
+.min_max_constrain2:
+ pminuw m18, m6
+ pmaxsw m19, m6
+ pminuw m20, m7
+ pmaxsw m21, m7
+.constrain:
+ %define tmp rsp+gprsize+68
+ movu m4, [tmp+r5+64*0]
+ vshufi32x4 m4, [tmp+r5+64*1], q2020 ; k0p0 (top)
+ movu m5, [tmp+r5+64*2]
+ vshufi32x4 m5, [tmp+r5+64*3], q2020 ; k0p0 (bottom)
+ neg r5
+ movu m6, [tmp+r5+64*0]
+ vshufi32x4 m6, [tmp+r5+64*1], q2020 ; k0p1 (top)
+ movu m7, [tmp+r5+64*2]
+ vshufi32x4 m7, [tmp+r5+64*3], q2020 ; k0p1 (bottom)
+ CONSTRAIN m8, m4, m0, m12, m13, m14, m15
+ CONSTRAIN m9, m5, m1, m12, m13, m14, m15
+ CONSTRAIN m10, m6, m0, m12, m13, m14, m15
+ CONSTRAIN m11, m7, m1, m12, m13, m14, m15
+ paddw m8, m10
+ paddw m9, m11
+ ret
+
+%endif ; ARCH_X86_64
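For reference, the constrain step that the CONSTRAIN macro and the .constrain helper above compute corresponds to the following C model; the function and variable names here are illustrative and are not part of the patch, and the threshold/shift parameters stand for the pri/sec strength and the damping-derived shift set up by the calling code.

    #include <stdlib.h>

    /* Model of the constrain step computed by CONSTRAIN / .constrain above.
     * (Function and variable names are illustrative, not from the patch.) */
    static int cdef_constrain(int diff, int threshold, int shift)
    {
        const int adiff = abs(diff);
        int limit = threshold - (adiff >> shift);      /* psubusw: saturates at 0 */
        if (limit < 0)
            limit = 0;
        const int mag = adiff < limit ? adiff : limit; /* pminsw / pminuw */
        return diff < 0 ? -mag : mag;                  /* psignw reapplies the sign */
    }

The psraw/paddw pair followed by pmulhrsw against pw_2048 then turns the accumulated sum into (8 + sum - (sum < 0)) >> 4, i.e. a rounded shift by 4 biased toward zero, before the result is added back to the source pixel and clipped against the running min/max kept in m18-m21.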
diff --git a/third_party/dav1d/src/x86/cdef16_sse.asm b/third_party/dav1d/src/x86/cdef16_sse.asm
new file mode 100644
index 0000000000..1bd67ace64
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef16_sse.asm
@@ -0,0 +1,1033 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; Copyright (c) 2017-2021, The rav1e contributors
+; Copyright (c) 2021, Nathan Egge
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+%macro DUP8 1-*
+ %rep %0
+ times 8 dw %1
+ %rotate 1
+ %endrep
+%endmacro
+
+pri_taps: DUP8 4, 2, 3, 3
+dir_table: db 1 * 32 + 0, 2 * 32 + 0
+ db 1 * 32 + 0, 2 * 32 - 2
+ db -1 * 32 + 2, -2 * 32 + 4
+ db 0 * 32 + 2, -1 * 32 + 4
+ db 0 * 32 + 2, 0 * 32 + 4
+ db 0 * 32 + 2, 1 * 32 + 4
+ db 1 * 32 + 2, 2 * 32 + 4
+ db 1 * 32 + 0, 2 * 32 + 2
+ db 1 * 32 + 0, 2 * 32 + 0
+ db 1 * 32 + 0, 2 * 32 - 2
+ db -1 * 32 + 2, -2 * 32 + 4
+ db 0 * 32 + 2, -1 * 32 + 4
+
+dir_shift: times 4 dw 0x4000
+ times 4 dw 0x1000
+
+pw_128: times 4 dw 128
+pw_2048: times 8 dw 2048
+pw_m16384: times 8 dw -16384
+
+cextern cdef_dir_8bpc_ssse3.main
+cextern cdef_dir_8bpc_sse4.main
+cextern shufw_6543210x
+
+SECTION .text
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 5, 3
+%elif WIN64
+DECLARE_REG_TMP 8, 4
+%else
+DECLARE_REG_TMP 8, 6
+%endif
+
+%macro CDEF_FILTER 2 ; w, h
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, _, tmp, pridmp, pri, sec, dir
+ mova m8, [base+pw_2048]
+%else
+ DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir
+ %define m8 [base+pw_2048]
+ %define m9 [rsp+16*1+gprsize]
+ %define m10 [rsp+16*2+gprsize]
+%endif
+ movifnidn prid, r5m
+ movifnidn secd, r6m
+ test prid, prid
+ jz .sec_only
+ movd m6, r5m
+%if ARCH_X86_32
+ mov [rsp+24], pridmpd
+%endif
+ bsr pridmpd, prid
+ lea tmpd, [priq*4]
+ cmp dword r10m, 0x3ff ; if (bpc == 10)
+ cmove prid, tmpd ; pri <<= 2
+ mov tmpd, r8m ; damping
+ mov dird, r7m
+ and prid, 16
+ pshufb m6, m7 ; splat
+ lea dirq, [base+dir_table+dirq*2]
+ lea priq, [base+pri_taps+priq*2]
+ test secd, secd
+ jz .pri_only
+ mova [rsp], m6
+ movd m6, secd
+ tzcnt secd, secd
+ sub pridmpd, tmpd
+ sub tmpd, secd
+ pshufb m6, m7
+ xor secd, secd
+ neg pridmpd
+ cmovs pridmpd, secd
+%if ARCH_X86_32
+ mov [pri_shift+4], secd
+ mov [sec_shift+4], secd
+%endif
+ mov [pri_shift+0], pridmpq
+ mov [sec_shift+0], tmpq
+ lea tmpq, [px]
+%if WIN64
+ movaps r4m, m9
+ movaps r6m, m10
+%elif ARCH_X86_32
+ mov pridmpd, [rsp+24]
+%endif
+%rep %1*%2/8
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
+%endrep
+%if WIN64
+ movaps m9, r4m
+ movaps m10, r6m
+%endif
+ jmp .end
+.pri_only:
+ sub tmpd, pridmpd
+ cmovs tmpd, secd
+%if ARCH_X86_32
+ mov pridmpd, [rsp+24]
+ mov [pri_shift+4], secd
+%endif
+ mov [pri_shift+0], tmpq
+ lea tmpq, [px]
+%rep %1*%2/8
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
+%endrep
+.end:
+ RET
+.sec_only:
+ mov tmpd, r8m ; damping
+ movd m6, r6m
+ tzcnt secd, secd
+ mov dird, r7m
+ pshufb m6, m7
+ sub tmpd, secd
+ lea dirq, [base+dir_table+dirq*2]
+%if ARCH_X86_32
+ mov [sec_shift+4], prid
+%endif
+ mov [sec_shift+0], tmpq
+ lea tmpq, [px]
+%rep %1*%2/8
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
+%endrep
+ jmp .end
+%if %1 == %2
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, _, tmp, off, pri, _, dir
+ %else
+ DEFINE_ARGS dst, stride, tmp, off, pri, _, dir
+ %endif
+ALIGN function_align
+.pri:
+ movsx offq, byte [dirq+4] ; off_k0
+%if %1 == 4
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ movq m2, [tmpq+offq+32*0] ; k0p0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k0p1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ mova m1, [dstq]
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+5] ; off_k1
+ psubw m2, m1 ; diff_k0p0
+ psubw m3, m1 ; diff_k0p1
+ pabsw m4, m2 ; adiff_k0p0
+ psrlw m5, m4, [pri_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0p1
+ pminsw m0, m4
+ psrlw m4, m5, [pri_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0p0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k1p0
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k1p1
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ psubw m4, m1 ; diff_k1p0
+ psubw m5, m1 ; diff_k1p1
+ psignw m2, m3 ; constrain(diff_k0p1)
+ pabsw m3, m4 ; adiff_k1p0
+ paddw m0, m2 ; constrain(diff_k0)
+ psrlw m2, m3, [pri_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1p1
+ pminsw m7, m3
+ psrlw m3, m2, [pri_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1p0)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ psignw m4, m5 ; constrain(diff_k1p1)
+ paddw m7, m4 ; constrain(diff_k1)
+ pmullw m0, [priq+16*0] ; pri_tap_k0
+ pmullw m7, [priq+16*1] ; pri_tap_k1
+ paddw m0, m7 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ paddw m0, m1
+%if %1 == 4
+ add tmpq, 32*2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+%else
+ add tmpq, 32
+ mova [dstq], m0
+ add dstq, strideq
+%endif
+ ret
+ALIGN function_align
+.sec:
+ movsx offq, byte [dirq+8] ; off1_k0
+%if %1 == 4
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ movq m2, [tmpq+offq+32*0] ; k0s0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k0s1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ mova m1, [dstq]
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+0] ; off2_k0
+ psubw m2, m1 ; diff_k0s0
+ psubw m3, m1 ; diff_k0s1
+ pabsw m4, m2 ; adiff_k0s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0s1
+ pminsw m0, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k0s2
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k0s3
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+9] ; off1_k1
+ psubw m4, m1 ; diff_k0s2
+ psubw m5, m1 ; diff_k0s3
+ psignw m2, m3 ; constrain(diff_k0s1)
+ pabsw m3, m4 ; adiff_k0s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k0s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k0s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+%if %1 == 4
+ movq m2, [tmpq+offq+32*0] ; k1s0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k1s1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+1] ; off2_k1
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k0s3)
+ paddw m0, m4 ; constrain(diff_k0)
+ psubw m2, m1 ; diff_k1s0
+ psubw m3, m1 ; diff_k1s1
+ paddw m0, m0 ; sec_tap_k0
+ pabsw m4, m2 ; adiff_k1s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m7, m6, m5
+ pabsw m5, m3 ; adiff_k1s1
+ pminsw m7, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k1s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k1s2
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k1s3
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ paddw m0, m7
+ psubw m4, m1 ; diff_k1s2
+ psubw m5, m1 ; diff_k1s3
+ psignw m2, m3 ; constrain(diff_k1s1)
+ pabsw m3, m4 ; adiff_k1s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k1s3)
+ paddw m0, m4 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ paddw m0, m1
+%if %1 == 4
+ add tmpq, 32*2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+%else
+ add tmpq, 32
+ mova [dstq], m0
+ add dstq, strideq
+%endif
+ ret
+ALIGN function_align
+.pri_sec:
+ movsx offq, byte [dirq+8] ; off2_k0
+%if %1 == 4
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ movq m2, [tmpq+offq+32*0] ; k0s0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k0s1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ mova m1, [dstq]
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+0] ; off3_k0
+ pabsw m4, m2
+%if ARCH_X86_64
+ pabsw m10, m3
+ pmaxsw m9, m2, m3
+ pminsw m10, m4
+%else
+ pabsw m7, m3
+ pmaxsw m5, m2, m3
+ pminsw m4, m7
+ mova m9, m5
+ mova m10, m4
+%endif
+ psubw m2, m1 ; diff_k0s0
+ psubw m3, m1 ; diff_k0s1
+ pabsw m4, m2 ; adiff_k0s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0s1
+ pminsw m0, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k0s2
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k0s3
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+9] ; off2_k1
+ pabsw m7, m4
+ psignw m2, m3 ; constrain(diff_k0s1)
+ pabsw m3, m5
+%if ARCH_X86_64
+ pmaxsw m9, m4
+ pminsw m10, m7
+ pmaxsw m9, m5
+ pminsw m10, m3
+%else
+ pminsw m7, m10
+ pminsw m7, m3
+ pmaxsw m3, m9, m4
+ pmaxsw m3, m5
+ mova m10, m7
+ mova m9, m3
+%endif
+ psubw m4, m1 ; diff_k0s2
+ psubw m5, m1 ; diff_k0s3
+ paddw m0, m2
+ pabsw m3, m4 ; adiff_k0s2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k0s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k0s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+%if %1 == 4
+ movq m2, [tmpq+offq+32*0] ; k1s0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k1s1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+1] ; off3_k1
+ paddw m0, m7
+ pabsw m7, m2
+ psignw m4, m5 ; constrain(diff_k0s3)
+ pabsw m5, m3
+%if ARCH_X86_64
+ pmaxsw m9, m2
+ pminsw m10, m7
+ pmaxsw m9, m3
+ pminsw m10, m5
+%else
+ pminsw m7, m10
+ pminsw m7, m5
+ pmaxsw m5, m9, m2
+ pmaxsw m5, m3
+ mova m10, m7
+ mova m9, m5
+%endif
+ paddw m0, m4 ; constrain(diff_k0)
+ psubw m2, m1 ; diff_k1s0
+ psubw m3, m1 ; diff_k1s1
+ paddw m0, m0 ; sec_tap_k0
+ pabsw m4, m2 ; adiff_k1s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m7, m6, m5
+ pabsw m5, m3 ; adiff_k1s1
+ pminsw m7, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k1s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k1s2
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k1s3
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+4] ; off1_k0
+ paddw m0, m7
+ pabsw m7, m4
+ psignw m2, m3 ; constrain(diff_k1s1)
+ pabsw m3, m5
+%if ARCH_X86_64
+ pmaxsw m9, m4
+ pminsw m10, m7
+ pmaxsw m9, m5
+ pminsw m10, m3
+%else
+ pminsw m7, m10
+ pminsw m7, m3
+ pmaxsw m3, m9, m4
+ pmaxsw m3, m5
+ mova m10, m7
+ mova m9, m3
+%endif
+ psubw m4, m1 ; diff_k1s2
+ psubw m5, m1 ; diff_k1s3
+ pabsw m3, m4 ; adiff_k1s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ paddw m0, m7
+%if %1 == 4
+ movq m2, [tmpq+offq+32*0] ; k0p0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k0p1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+5] ; off1_k1
+ pabsw m7, m2
+ psignw m4, m5 ; constrain(diff_k1s3)
+ pabsw m5, m3
+%if ARCH_X86_64
+ pmaxsw m9, m2
+ pminsw m10, m7
+ pmaxsw m9, m3
+ pminsw m10, m5
+%else
+ pminsw m7, m10
+ pminsw m7, m5
+ pmaxsw m5, m9, m2
+ pmaxsw m5, m3
+ mova m10, m7
+ mova m9, m5
+%endif
+ psubw m2, m1 ; diff_k0p0
+ psubw m3, m1 ; diff_k0p1
+ paddw m0, m4
+ pabsw m4, m2 ; adiff_k0p0
+ psrlw m5, m4, [pri_shift+gprsize]
+ psubusw m7, [rsp+gprsize], m5
+ pabsw m5, m3 ; adiff_k0p1
+ pminsw m7, m4
+ psrlw m4, m5, [pri_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k0p0)
+ psubusw m2, [rsp+gprsize], m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k1p0
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k1p1
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ psignw m2, m3 ; constrain(diff_k0p1)
+ pabsw m3, m4
+ paddw m7, m2 ; constrain(diff_k0)
+ pabsw m2, m5
+%if ARCH_X86_64
+ pmaxsw m9, m4
+ pminsw m10, m3
+ pmaxsw m9, m5
+ pminsw m10, m2
+%else
+ pminsw m3, m10
+ pminsw m3, m2
+ pmaxsw m2, m9, m4
+ pmaxsw m2, m5
+ mova m10, m3
+ mova m9, m2
+%endif
+ psubw m4, m1 ; diff_k1p0
+ psubw m5, m1 ; diff_k1p1
+ pabsw m3, m4 ; adiff_k1p0
+ pmullw m7, [priq+16*0] ; pri_tap_k0
+ paddw m0, m7
+ psrlw m2, m3, [pri_shift+gprsize]
+ psubusw m7, [rsp+16*0+gprsize], m2
+ pabsw m2, m5 ; adiff_k1p1
+ pminsw m7, m3
+ psrlw m3, m2, [pri_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1p0)
+ psubusw m4, [rsp+16*0+gprsize], m3
+ pminsw m4, m2
+ psignw m4, m5 ; constrain(diff_k1p1)
+ paddw m7, m4 ; constrain(diff_k1)
+ pmullw m7, [priq+16*1] ; pri_tap_k1
+ paddw m0, m7 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ paddw m0, m1
+%if ARCH_X86_64
+ pmaxsw m9, m1
+ pminsw m0, m9
+%else
+ pmaxsw m2, m9, m1
+ pminsw m0, m2
+%endif
+ pminsw m1, m10
+ pmaxsw m0, m1
+%if %1 == 4
+ add tmpq, 32*2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+%else
+ add tmpq, 32
+ mova [dstq], m0
+ add dstq, strideq
+%endif
+ ret
+%endif
+%endmacro
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal cdef_filter_4x4_16bpc, 5, 9, 9, 32*10, dst, stride, left, top, bot, \
+ pri, sec, edge
+ %define px rsp+32*4
+%else
+cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left
+ %define botq topq
+ %define px rsp+32*5
+%endif
+ %define base t0-dir_table
+ %define pri_shift px-16*6
+ %define sec_shift px-16*5
+ mov edged, r9m
+ LEA t0, dir_table
+ movu m0, [dstq+strideq*0]
+ movu m1, [dstq+strideq*1]
+ lea t1, [dstq+strideq*2]
+ movu m2, [t1 +strideq*0]
+ movu m3, [t1 +strideq*1]
+ movddup m7, [base+pw_m16384]
+ mova [px+32*0+0], m0
+ mova [px+32*1+0], m1
+ mova [px+32*2+0], m2
+ mova [px+32*3+0], m3
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movifnidn topq, topmp
+ movu m0, [topq+strideq*0]
+ movu m1, [topq+strideq*1]
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd m0, [topq+strideq*0-4]
+ movd m1, [topq+strideq*1-4]
+ movd [px-32*2-4], m0
+ movd [px-32*1-4], m1
+ jmp .top_done
+.no_top:
+ mova [px-32*2+0], m7
+ mova [px-32*1+0], m7
+.top_no_left:
+ movd [px-32*2-4], m7
+ movd [px-32*1-4], m7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movifnidn botq, r4mp
+ movu m0, [botq+strideq*0]
+ movu m1, [botq+strideq*1]
+ mova [px+32*4+0], m0
+ mova [px+32*5+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd m0, [botq+strideq*0-4]
+ movd m1, [botq+strideq*1-4]
+ movd [px+32*4-4], m0
+ movd [px+32*5-4], m1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+32*4+0], m7
+ mova [px+32*5+0], m7
+.bottom_no_left:
+ movd [px+32*4-4], m7
+ movd [px+32*5-4], m7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movifnidn leftq, r2mp
+ movd m0, [leftq+4*0]
+ movd m1, [leftq+4*1]
+ movd m2, [leftq+4*2]
+ movd m3, [leftq+4*3]
+ movd [px+32*0-4], m0
+ movd [px+32*1-4], m1
+ movd [px+32*2-4], m2
+ movd [px+32*3-4], m3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5
+.padding_done:
+ CDEF_FILTER 4, 4
+
+%if ARCH_X86_64
+cglobal cdef_filter_4x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
+ pri, sec, edge
+%else
+cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
+%endif
+ mov edged, r9m
+ LEA t0, dir_table
+ movu m0, [dstq+strideq*0]
+ movu m1, [dstq+strideq*1]
+ lea t1, [dstq+strideq*2]
+ movu m2, [t1 +strideq*0]
+ movu m3, [t1 +strideq*1]
+ lea t1, [t1 +strideq*2]
+ movu m4, [t1 +strideq*0]
+ movu m5, [t1 +strideq*1]
+ lea t1, [t1 +strideq*2]
+ movu m6, [t1 +strideq*0]
+ movu m7, [t1 +strideq*1]
+ mova [px+32*0+0], m0
+ mova [px+32*1+0], m1
+ mova [px+32*2+0], m2
+ mova [px+32*3+0], m3
+ mova [px+32*4+0], m4
+ mova [px+32*5+0], m5
+ mova [px+32*6+0], m6
+ mova [px+32*7+0], m7
+ movddup m7, [base+pw_m16384]
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movifnidn topq, topmp
+ movu m0, [topq+strideq*0]
+ movu m1, [topq+strideq*1]
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd m0, [topq+strideq*0-4]
+ movd m1, [topq+strideq*1-4]
+ movd [px-32*2-4], m0
+ movd [px-32*1-4], m1
+ jmp .top_done
+.no_top:
+ mova [px-32*2+0], m7
+ mova [px-32*1+0], m7
+.top_no_left:
+ movd [px-32*2-4], m7
+ movd [px-32*1-4], m7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movifnidn botq, r4mp
+ movu m0, [botq+strideq*0]
+ movu m1, [botq+strideq*1]
+ mova [px+32*8+0], m0
+ mova [px+32*9+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd m0, [botq+strideq*0-4]
+ movd m1, [botq+strideq*1-4]
+ movd [px+32*8-4], m0
+ movd [px+32*9-4], m1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+32*8+0], m7
+ mova [px+32*9+0], m7
+.bottom_no_left:
+ movd [px+32*8-4], m7
+ movd [px+32*9-4], m7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movifnidn leftq, r2mp
+ movd m0, [leftq+4*0]
+ movd m1, [leftq+4*1]
+ movd m2, [leftq+4*2]
+ movd m3, [leftq+4*3]
+ movd [px+32*0-4], m0
+ movd [px+32*1-4], m1
+ movd [px+32*2-4], m2
+ movd [px+32*3-4], m3
+ movd m0, [leftq+4*4]
+ movd m1, [leftq+4*5]
+ movd m2, [leftq+4*6]
+ movd m3, [leftq+4*7]
+ movd [px+32*4-4], m0
+ movd [px+32*5-4], m1
+ movd [px+32*6-4], m2
+ movd [px+32*7-4], m3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+.padding_done:
+ CDEF_FILTER 4, 8
+
+%if ARCH_X86_64
+cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
+ pri, sec, edge
+%else
+cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
+%endif
+ mov edged, r9m
+ LEA t0, dir_table
+ mova m0, [dstq+strideq*0+ 0]
+ movd m1, [dstq+strideq*0+16]
+ mova m2, [dstq+strideq*1+ 0]
+ movd m3, [dstq+strideq*1+16]
+ lea t1, [dstq+strideq*2]
+ mova m4, [t1 +strideq*0+ 0]
+ movd m5, [t1 +strideq*0+16]
+ mova m6, [t1 +strideq*1+ 0]
+ movd m7, [t1 +strideq*1+16]
+ lea t1, [t1 +strideq*2]
+ mova [px+32*0+ 0], m0
+ movd [px+32*0+16], m1
+ mova [px+32*1+ 0], m2
+ movd [px+32*1+16], m3
+ mova [px+32*2+ 0], m4
+ movd [px+32*2+16], m5
+ mova [px+32*3+ 0], m6
+ movd [px+32*3+16], m7
+ mova m0, [t1 +strideq*0+ 0]
+ movd m1, [t1 +strideq*0+16]
+ mova m2, [t1 +strideq*1+ 0]
+ movd m3, [t1 +strideq*1+16]
+ lea t1, [t1 +strideq*2]
+ mova m4, [t1 +strideq*0+ 0]
+ movd m5, [t1 +strideq*0+16]
+ mova m6, [t1 +strideq*1+ 0]
+ movd m7, [t1 +strideq*1+16]
+ mova [px+32*4+ 0], m0
+ movd [px+32*4+16], m1
+ mova [px+32*5+ 0], m2
+ movd [px+32*5+16], m3
+ mova [px+32*6+ 0], m4
+ movd [px+32*6+16], m5
+ mova [px+32*7+ 0], m6
+ movd [px+32*7+16], m7
+ movddup m7, [base+pw_m16384]
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movifnidn topq, topmp
+ mova m0, [topq+strideq*0+ 0]
+ mova m1, [topq+strideq*0+16]
+ mova m2, [topq+strideq*1+ 0]
+ mova m3, [topq+strideq*1+16]
+ mova [px-32*2+ 0], m0
+ movd [px-32*2+16], m1
+ mova [px-32*1+ 0], m2
+ movd [px-32*1+16], m3
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd m0, [topq+strideq*0-4]
+ movd m1, [topq+strideq*1-4]
+ movd [px-32*2-4], m0
+ movd [px-32*1-4], m1
+ jmp .top_done
+.no_top:
+ mova [px-32*2+ 0], m7
+ movd [px-32*2+16], m7
+ mova [px-32*1+ 0], m7
+ movd [px-32*1+16], m7
+.top_no_left:
+ movd [px-32*2- 4], m7
+ movd [px-32*1- 4], m7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movifnidn botq, r4mp
+ mova m0, [botq+strideq*0+ 0]
+ movd m1, [botq+strideq*0+16]
+ mova m2, [botq+strideq*1+ 0]
+ movd m3, [botq+strideq*1+16]
+ mova [px+32*8+ 0], m0
+ movd [px+32*8+16], m1
+ mova [px+32*9+ 0], m2
+ movd [px+32*9+16], m3
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd m0, [botq+strideq*0-4]
+ movd m1, [botq+strideq*1-4]
+ movd [px+32*8- 4], m0
+ movd [px+32*9- 4], m1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+32*8+ 0], m7
+ movd [px+32*8+16], m7
+ mova [px+32*9+ 0], m7
+ movd [px+32*9+16], m7
+.bottom_no_left:
+ movd [px+32*8- 4], m7
+ movd [px+32*9- 4], m7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movifnidn leftq, r2mp
+ movd m0, [leftq+4*0]
+ movd m1, [leftq+4*1]
+ movd m2, [leftq+4*2]
+ movd m3, [leftq+4*3]
+ movd [px+32*0- 4], m0
+ movd [px+32*1- 4], m1
+ movd [px+32*2- 4], m2
+ movd [px+32*3- 4], m3
+ movd m0, [leftq+4*4]
+ movd m1, [leftq+4*5]
+ movd m2, [leftq+4*6]
+ movd m3, [leftq+4*7]
+ movd [px+32*4- 4], m0
+ movd [px+32*5- 4], m1
+ movd [px+32*6- 4], m2
+ movd [px+32*7- 4], m3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+.padding_done:
+ CDEF_FILTER 8, 8
+
+%macro CDEF_DIR 0
+%if ARCH_X86_64
+cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax
+ lea r6, [dir_shift]
+ shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
+ movddup m7, [r6+bdmaxq*8]
+ lea r6, [strideq*3]
+ mova m0, [srcq+strideq*0]
+ mova m1, [srcq+strideq*1]
+ mova m2, [srcq+strideq*2]
+ mova m3, [srcq+r6 ]
+ lea srcq, [srcq+strideq*4]
+ mova m4, [srcq+strideq*0]
+ mova m5, [srcq+strideq*1]
+ mova m6, [srcq+strideq*2]
+ REPX {pmulhuw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhuw m7, [srcq+r6 ]
+ pxor m8, m8
+ packuswb m9, m0, m1
+ packuswb m10, m2, m3
+ packuswb m11, m4, m5
+ packuswb m12, m6, m7
+ REPX {psadbw x, m8}, m9, m10, m11, m12
+ packssdw m9, m10
+ packssdw m11, m12
+ packssdw m9, m11
+ jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
+%else
+cglobal cdef_dir_16bpc, 2, 4, 8, 96, src, stride, var, bdmax
+ mov bdmaxd, bdmaxm
+ LEA r2, dir_shift
+ shr bdmaxd, 11
+ movddup m7, [r2+bdmaxq*8]
+ lea r3, [strideq*3]
+ pmulhuw m3, m7, [srcq+strideq*0]
+ pmulhuw m4, m7, [srcq+strideq*1]
+ pmulhuw m5, m7, [srcq+strideq*2]
+ pmulhuw m6, m7, [srcq+r3 ]
+ movddup m1, [r2-dir_shift+pw_128]
+ lea srcq, [srcq+strideq*4]
+ pxor m0, m0
+ packuswb m2, m3, m4
+ psubw m3, m1
+ psubw m4, m1
+ mova [esp+0x00], m3
+ mova [esp+0x10], m4
+ packuswb m3, m5, m6
+ psadbw m2, m0
+ psadbw m3, m0
+ psubw m5, m1
+ psubw m6, m1
+ packssdw m2, m3
+ mova [esp+0x20], m5
+ mova [esp+0x50], m6
+ pmulhuw m4, m7, [srcq+strideq*0]
+ pmulhuw m5, m7, [srcq+strideq*1]
+ pmulhuw m6, m7, [srcq+strideq*2]
+ pmulhuw m7, [srcq+r3 ]
+ packuswb m3, m4, m5
+ packuswb m1, m6, m7
+ psadbw m3, m0
+ psadbw m1, m0
+ packssdw m3, m1
+ movddup m1, [r2-dir_shift+pw_128]
+ LEA r2, shufw_6543210x
+ jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
+%endif
+%endmacro
+
+INIT_XMM ssse3
+CDEF_DIR
+
+INIT_XMM sse4
+CDEF_DIR
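The 16 bpc direction search above does not reimplement the 8 bpc logic; it only rescales the high-bit-depth samples into the 8-bit range with pmulhuw against a dir_shift constant (0x4000 for 10 bpc, 0x1000 for 12 bpc, selected via bdmax >> 11) and then tail-jumps into the 8 bpc .main label. A minimal C model of that normalization, with illustrative names that are not part of the patch:

    #include <stdint.h>

    /* Model of the bit-depth normalization done by cdef_dir_16bpc before it
     * jumps into the 8 bpc .main label (names are illustrative, not from the
     * patch). bitdepth_max is 1023 for 10 bpc and 4095 for 12 bpc. */
    static uint8_t cdef_dir_scale(uint16_t px, int bitdepth_max)
    {
        const uint16_t mul = (bitdepth_max >> 11) ? 0x1000 : 0x4000; /* dir_shift */
        return (uint8_t)(((uint32_t)px * mul) >> 16);                /* pmulhuw */
    }

Since (x * 0x4000) >> 16 equals x >> 2 and (x * 0x1000) >> 16 equals x >> 4, both 10- and 12-bit samples land in the 0..255 range expected by the shared 8 bpc code path.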
diff --git a/third_party/dav1d/src/x86/cdef_avx2.asm b/third_party/dav1d/src/x86/cdef_avx2.asm
new file mode 100644
index 0000000000..1f30f8a3b7
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef_avx2.asm
@@ -0,0 +1,1772 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+%macro JMP_TABLE 2-*
+ %xdefine %1_jmptable %%table
+ %xdefine %%base mangle(private_prefix %+ _%1_avx2)
+ %%table:
+ %rep %0 - 1
+ dd %%base %+ .%2 - %%table
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro CDEF_FILTER_JMP_TABLE 1
+JMP_TABLE cdef_filter_%1_8bpc, \
+ d6k0, d6k1, d7k0, d7k1, \
+ d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
+ d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
+ d0k0, d0k1, d1k0, d1k1
+%endmacro
+
+SECTION_RODATA 32
+
+pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6
+blend_4x4: dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
+ dd 0x80, 0x00, 0x00
+blend_4x8_0: dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+blend_4x8_1: dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ dd 0x00, 0x00
+blend_4x8_2: dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
+ dd 0x0000
+blend_4x8_3: dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
+ dd 0x0000, 0x0000
+blend_8x8_0: dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
+blend_8x8_1: dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
+div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
+shufw_6543210x:db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
+shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pw_128: times 2 dw 128
+pw_2048: times 2 dw 2048
+tap_table: ; masks for 8 bit shifts
+ db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
+ ; weights
+ db 4, 2, 3, 3, 2, 1
+ db -1 * 16 + 1, -2 * 16 + 2
+ db 0 * 16 + 1, -1 * 16 + 2
+ db 0 * 16 + 1, 0 * 16 + 2
+ db 0 * 16 + 1, 1 * 16 + 2
+ db 1 * 16 + 1, 2 * 16 + 2
+ db 1 * 16 + 0, 2 * 16 + 1
+ db 1 * 16 + 0, 2 * 16 + 0
+ db 1 * 16 + 0, 2 * 16 - 1
+ ; the last 6 are repeats of the first 6 so we don't need to & 7
+ db -1 * 16 + 1, -2 * 16 + 2
+ db 0 * 16 + 1, -1 * 16 + 2
+ db 0 * 16 + 1, 0 * 16 + 2
+ db 0 * 16 + 1, 1 * 16 + 2
+ db 1 * 16 + 1, 2 * 16 + 2
+ db 1 * 16 + 0, 2 * 16 + 1
+
+CDEF_FILTER_JMP_TABLE 4x4
+CDEF_FILTER_JMP_TABLE 4x8
+CDEF_FILTER_JMP_TABLE 8x8
+
+SECTION .text
+
+%macro PREP_REGS 2 ; w, h
+ ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+ mov dird, r7m
+ lea tableq, [cdef_filter_%1x%2_8bpc_jmptable]
+ lea dirq, [tableq+dirq*2*4]
+%if %1 == 4
+ %if %2 == 4
+ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
+ table, dir, dirjmp, stride3, k
+ %else
+ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
+ table, dir, dirjmp, dst4, stride3, k
+ lea dst4q, [dstq+strideq*4]
+ %endif
+%else
+ DEFINE_ARGS dst, stride, h, top1, bot, pri, sec, \
+ table, dir, dirjmp, top2, stride3, k
+ mov hq, -8
+ lea top1q, [top1q+strideq*0]
+ lea top2q, [top1q+strideq*1]
+%endif
+%if %1 == 4
+ lea stride3q, [strideq*3]
+%endif
+%endmacro
+
+%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
+ mov kd, 1
+ pxor m15, m15 ; sum
+%if %2 == 8
+ pxor m12, m12
+ %if %1 == 4
+ movd xm4, [dstq +strideq*0]
+ movd xm6, [dstq +strideq*1]
+ movd xm5, [dstq +strideq*2]
+ movd xm7, [dstq +stride3q ]
+ vinserti128 m4, [dst4q+strideq*0], 1
+ vinserti128 m6, [dst4q+strideq*1], 1
+ vinserti128 m5, [dst4q+strideq*2], 1
+ vinserti128 m7, [dst4q+stride3q ], 1
+ punpckldq m4, m6
+ punpckldq m5, m7
+ %else
+ movq xm4, [dstq+strideq*0]
+ movq xm5, [dstq+strideq*1]
+ vinserti128 m4, [dstq+strideq*2], 1
+ vinserti128 m5, [dstq+stride3q ], 1
+ %endif
+ punpcklqdq m4, m5
+%else
+ movd xm4, [dstq+strideq*0]
+ movd xm5, [dstq+strideq*1]
+ vinserti128 m4, [dstq+strideq*2], 1
+ vinserti128 m5, [dstq+stride3q ], 1
+ punpckldq m4, m5
+%endif
+%if %3 == 1
+ mova m7, m4 ; max
+ mova m8, m4 ; min
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
+ ; mul_tap, w, h, clip
+ ; load p0/p1
+ movsxd dirjmpq, [dirq+kq*4+%1*2*4]
+ add dirjmpq, tableq
+ call dirjmpq
+
+%if %8 == 1
+ pmaxub m7, m5
+ pminub m8, m5
+ pmaxub m7, m6
+ pminub m8, m6
+%endif
+
+ ; accumulate sum[m15] over p0/p1
+%if %7 == 4
+ punpcklbw m5, m6
+ punpcklbw m6, m4, m4
+ psubusb m9, m5, m6
+ psubusb m5, m6, m5
+ por m9, m5 ; abs_diff_p01(p01 - px)
+ pcmpeqb m5, m9
+ por m5, %5
+ psignb m6, %5, m5
+ psrlw m5, m9, %2 ; emulate 8-bit shift
+ pand m5, %3
+ psubusb m5, %4, m5
+ pminub m5, m9
+ pmaddubsw m5, m6
+ paddw m15, m5
+%else
+ psubusb m9, m5, m4
+ psubusb m5, m4, m5
+ psubusb m11, m6, m4
+ psubusb m6, m4, m6
+ por m9, m5 ; abs_diff_p0(p0 - px)
+ por m11, m6 ; abs_diff_p1(p1 - px)
+ pcmpeqb m5, m9
+ pcmpeqb m6, m11
+ punpckhbw m10, m9, m11
+ punpcklbw m9, m11
+ por m5, %5
+ por m11, m6, %5
+ punpckhbw m6, m5, m11
+ punpcklbw m5, m11
+ psignb m11, %5, m6
+ psrlw m6, m10, %2 ; emulate 8-bit shift
+ pand m6, %3
+ psubusb m6, %4, m6
+ pminub m6, m10
+ pmaddubsw m6, m11
+ paddw m12, m6
+ psignb m11, %5, m5
+ psrlw m5, m9, %2 ; emulate 8-bit shift
+ pand m5, %3
+ psubusb m5, %4, m5
+ pminub m5, m9
+ pmaddubsw m5, m11
+ paddw m15, m5
+%endif
+%endmacro
+
+%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
+%if %2 == 4
+ %if %5 == 1
+ punpcklbw m4, %3
+ %endif
+ pcmpgtw %3, m15
+ paddw m15, %3
+ pmulhrsw m15, %4
+ %if %5 == 0
+ packsswb m15, m15
+ paddb m4, m15
+ %else
+ paddw m4, m15
+ packuswb m4, m4 ; clip px in [0x0,0xff]
+ pminub m4, m7
+ pmaxub m4, m8
+ %endif
+ vextracti128 xm5, m4, 1
+ movd [dstq+strideq*0], xm4
+ movd [dstq+strideq*2], xm5
+ pextrd [dstq+strideq*1], xm4, 1
+ pextrd [dstq+stride3q ], xm5, 1
+%else
+ pcmpgtw m6, %3, m12
+ pcmpgtw m5, %3, m15
+ paddw m12, m6
+ paddw m15, m5
+ %if %5 == 1
+ punpckhbw m5, m4, %3
+ punpcklbw m4, %3
+ %endif
+ pmulhrsw m12, %4
+ pmulhrsw m15, %4
+ %if %5 == 0
+ packsswb m15, m12
+ paddb m4, m15
+ %else
+ paddw m5, m12
+ paddw m4, m15
+ packuswb m4, m5 ; clip px in [0x0,0xff]
+ pminub m4, m7
+ pmaxub m4, m8
+ %endif
+ vextracti128 xm5, m4, 1
+ %if %1 == 4
+ movd [dstq +strideq*0], xm4
+ movd [dst4q+strideq*0], xm5
+ pextrd [dstq +strideq*1], xm4, 1
+ pextrd [dst4q+strideq*1], xm5, 1
+ pextrd [dstq +strideq*2], xm4, 2
+ pextrd [dst4q+strideq*2], xm5, 2
+ pextrd [dstq +stride3q ], xm4, 3
+ pextrd [dst4q+stride3q ], xm5, 3
+ %else
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*2], xm5
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+stride3q ], xm5
+ %endif
+%endif
+%endmacro
+
+%macro BORDER_PREP_REGS 2 ; w, h
+ ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+ mov dird, r7m
+ lea dirq, [tableq+dirq*2+14]
+%if %1*%2*2/mmsize > 1
+ %if %1 == 4
+ DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, h, off
+ %else
+ DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, h, off
+ %endif
+ mov hd, %1*%2*2/mmsize
+%else
+ DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, off
+%endif
+ lea stkq, [px]
+ pxor m11, m11
+%endmacro
+
+%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
+ mov kd, 1
+%if %1 == 4
+ movq xm4, [stkq+32*0]
+ movhps xm4, [stkq+32*1]
+ movq xm5, [stkq+32*2]
+ movhps xm5, [stkq+32*3]
+ vinserti128 m4, xm5, 1
+%else
+ mova xm4, [stkq+32*0] ; px
+ vinserti128 m4, [stkq+32*1], 1
+%endif
+ pxor m15, m15 ; sum
+%if %3 == 1
+ mova m7, m4 ; max
+ mova m8, m4 ; min
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
+ ; mul_tap, w, clip
+ ; load p0/p1
+ movsx offq, byte [dirq+kq+%1] ; off1
+%if %6 == 4
+ movq xm5, [stkq+offq*2+32*0] ; p0
+ movq xm6, [stkq+offq*2+32*2]
+ movhps xm5, [stkq+offq*2+32*1]
+ movhps xm6, [stkq+offq*2+32*3]
+ vinserti128 m5, xm6, 1
+%else
+ movu xm5, [stkq+offq*2+32*0] ; p0
+ vinserti128 m5, [stkq+offq*2+32*1], 1
+%endif
+ neg offq ; -off1
+%if %6 == 4
+ movq xm6, [stkq+offq*2+32*0] ; p1
+ movq xm9, [stkq+offq*2+32*2]
+ movhps xm6, [stkq+offq*2+32*1]
+ movhps xm9, [stkq+offq*2+32*3]
+ vinserti128 m6, xm9, 1
+%else
+ movu xm6, [stkq+offq*2+32*0] ; p1
+ vinserti128 m6, [stkq+offq*2+32*1], 1
+%endif
+%if %7 == 1
+ ; out of bounds values are set to a value that is both a large unsigned
+ ; value and a negative signed value.
+ ; use signed max and unsigned min to remove them
+ pmaxsw m7, m5 ; max after p0
+ pminuw m8, m5 ; min after p0
+ pmaxsw m7, m6 ; max after p1
+ pminuw m8, m6 ; min after p1
+%endif
+
+ ; accumulate sum[m15] over p0/p1
+ ; calculate difference before converting
+ psubw m5, m4 ; diff_p0(p0 - px)
+ psubw m6, m4 ; diff_p1(p1 - px)
+
+ ; convert to 8-bits with signed saturation
+ ; saturating to large diffs has no impact on the results
+ packsswb m5, m6
+
+ ; group into pairs so we can accumulate using maddubsw
+ pshufb m5, m12
+ pabsb m9, m5
+ psignb m10, %5, m5
+ psrlw m5, m9, %2 ; emulate 8-bit shift
+ pand m5, %3
+ psubusb m5, %4, m5
+
+ ; use unsigned min since abs diff can equal 0x80
+ pminub m5, m9
+ pmaddubsw m5, m10
+ paddw m15, m5
+%endmacro
+
+%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
+ pcmpgtw m9, m11, m15
+ paddw m15, m9
+ pmulhrsw m15, %2
+ paddw m4, m15
+%if %3 == 1
+ pminsw m4, m7
+ pmaxsw m4, m8
+%endif
+ packuswb m4, m4
+ vextracti128 xm5, m4, 1
+%if %1 == 4
+ movd [dstq+strideq*0], xm4
+ pextrd [dstq+strideq*1], xm4, 1
+ movd [dstq+strideq*2], xm5
+ pextrd [dstq+stride3q ], xm5, 1
+%else
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*1], xm5
+%endif
+%endmacro
+
+%macro CDEF_FILTER 2 ; w, h
+INIT_YMM avx2
+cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%assign stack_offset_entry stack_offset
+ mov edged, edgem
+ cmp edged, 0xf
+ jne .border_block
+
+ PUSH r11
+ PUSH r12
+%if %2 == 4
+%assign regs_used 13
+ ALLOC_STACK 0x60, 16
+ pmovzxbw xm0, [leftq+1]
+ vpermq m0, m0, q0110
+ psrldq m1, m0, 4
+ vpalignr m2, m0, m0, 12
+ movu [rsp+0x10], m0
+ movu [rsp+0x28], m1
+ movu [rsp+0x40], m2
+%elif %1 == 4
+%assign regs_used 14
+ PUSH r13
+ ALLOC_STACK 8*2+%1*%2*1, 16
+ pmovzxwd m0, [leftq]
+ mova [rsp+0x10], m0
+%else
+%assign regs_used 15
+ PUSH r13
+ PUSH r14
+ ALLOC_STACK 8*4+%1*%2*2+32, 16
+ lea r11, [strideq*3]
+ movu xm4, [dstq+strideq*2]
+ pmovzxwq m0, [leftq+0]
+ pmovzxwq m1, [leftq+8]
+ vinserti128 m4, [dstq+r11], 1
+ pmovzxbd m2, [leftq+1]
+ pmovzxbd m3, [leftq+9]
+ mov [rsp+16], botq
+ mova [rsp+0x20], m0
+ mova [rsp+0x40], m1
+ mova [rsp+0x60], m2
+ mova [rsp+0x80], m3
+ mova [rsp+0xa0], m4
+ lea botq, [dstq+strideq*4]
+%endif
+
+ DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, zero, pridmp, damping
+ mov dampingd, r8m
+ xor zerod, zerod
+ movifnidn prid, prim
+ sub dampingd, 31
+ movifnidn secdmpd, secdmpm
+ test prid, prid
+ jz .sec_only
+ movd xm0, prid
+ lzcnt pridmpd, prid
+ add pridmpd, dampingd
+ cmovs pridmpd, zerod
+ mov [rsp+0], pridmpq ; pri_shift
+ test secdmpd, secdmpd
+ jz .pri_only
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ mov [rsp+8], secdmpq ; sec_shift
+
+ DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, table, pridmp
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, table, dir
+ vpbroadcastb m0, xm0 ; pri_strength
+ vpbroadcastb m1, xm1 ; sec_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ lea secq, [tableq+12] ; sec_taps
+
+ PREP_REGS %1, %2
+%if %1*%2 > mmsize
+.v_loop:
+%endif
+ LOAD_BLOCK %1, %2, 1
+.k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
+ ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
+ ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
+ dec kq
+ jge .k_loop
+
+ vpbroadcastd m10, [pw_2048]
+ pxor m9, m9
+ ADJUST_PIXEL %1, %2, m9, m10, 1
+%if %1*%2 > mmsize
+ lea dstq, [dstq+strideq*4]
+ lea top1q, [rsp+0xa0]
+ lea top2q, [rsp+0xb0]
+ mov botq, [rsp+16]
+ add hq, 4
+ jl .v_loop
+%endif
+ RET
+
+.pri_only:
+ DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, pridmp
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, dir
+ vpbroadcastb m0, xm0 ; pri_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ PREP_REGS %1, %2
+ vpbroadcastd m3, [pw_2048]
+ pxor m1, m1
+%if %1*%2 > mmsize
+.pri_v_loop:
+%endif
+ LOAD_BLOCK %1, %2
+.pri_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
+ dec kq
+ jge .pri_k_loop
+ ADJUST_PIXEL %1, %2, m1, m3
+%if %1*%2 > mmsize
+ lea dstq, [dstq+strideq*4]
+ lea top1q, [rsp+0xa0]
+ lea top2q, [rsp+0xb0]
+ mov botq, [rsp+16]
+ add hq, 4
+ jl .pri_v_loop
+%endif
+ RET
+
+.sec_only:
+ DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, zero, _, damping
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ mov [rsp+8], secdmpq ; sec_shift
+ DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, table
+ lea tableq, [tap_table]
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, bot, _, sec, table, dir
+ vpbroadcastb m1, xm1 ; sec_strength
+ lea secq, [tableq+12] ; sec_taps
+ PREP_REGS %1, %2
+ vpbroadcastd m2, [pw_2048]
+ pxor m0, m0
+%if %1*%2 > mmsize
+.sec_v_loop:
+%endif
+ LOAD_BLOCK %1, %2
+.sec_k_loop:
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
+ ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
+ dec kq
+ jge .sec_k_loop
+ ADJUST_PIXEL %1, %2, m0, m2
+%if %1*%2 > mmsize
+ lea dstq, [dstq+strideq*4]
+ lea top1q, [rsp+0xa0]
+ lea top2q, [rsp+0xb0]
+ mov botq, [rsp+16]
+ add hq, 4
+ jl .sec_v_loop
+%endif
+ RET
+
+.d0k0:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m6, [dstq+strideq*1-1]
+ vpbroadcastq m10, [dstq+strideq*2-1]
+ movd xm5, [topq+strideq*1+1]
+ movd xm9, [dstq+strideq*0+1]
+ psrldq m11, m6, 2
+ psrldq m12, m10, 2
+ vinserti128 m6, [dstq+stride3q -1], 1
+ vinserti128 m10, [botq -1], 1
+ vpblendd m5, m11, 0x10
+ vpblendd m9, m12, 0x10
+ movu m11, [blend_4x4+16]
+ punpckldq m6, m10
+ punpckldq m5, m9
+ vpblendvb m6, [rsp+gprsize+0x28], m11
+ %else
+ movd xm5, [topq +strideq*1+1]
+ movq xm6, [dstq +strideq*1-1]
+ movq xm10, [dstq +stride3q -1]
+ movq xm11, [dst4q+strideq*1-1]
+ pinsrd xm5, [dstq +strideq*0+1], 1
+ movhps xm6, [dstq +strideq*2-1]
+ movhps xm10, [dst4q+strideq*0-1]
+ movhps xm11, [dst4q+strideq*2-1]
+ psrldq xm9, xm6, 2
+ shufps xm5, xm9, q2010 ; -1 +0 +1 +2
+ shufps xm6, xm10, q2020 ; +1 +2 +3 +4
+ psrldq xm9, xm11, 2
+ psrldq xm10, 2
+ shufps xm10, xm9, q2020 ; +3 +4 +5 +6
+ movd xm9, [dst4q+stride3q -1]
+ pinsrd xm9, [botq -1], 1
+ shufps xm11, xm9, q1020 ; +5 +6 +7 +8
+ pmovzxbw m9, [leftq+3]
+ vinserti128 m6, xm11, 1
+ movu m11, [blend_4x8_0+4]
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, m9, m11
+ %endif
+%else
+ lea r13, [blend_8x8_0+16]
+ movq xm5, [top2q +1]
+ vbroadcasti128 m10, [dstq+strideq*1-1]
+ vbroadcasti128 m11, [dstq+strideq*2-1]
+ movhps xm5, [dstq+strideq*0+1]
+ vinserti128 m6, m10, [dstq+stride3q-1], 1
+ vinserti128 m9, m11, [botq -1], 1
+ psrldq m10, 2
+ psrldq m11, 2
+ punpcklqdq m6, m9
+ movu m9, [r13+hq*2*1+16*1]
+ punpcklqdq m10, m11
+ vpblendd m5, m10, 0xF0
+ vpblendvb m6, [rsp+gprsize+0x60+hq*8+64+8*1], m9
+%endif
+ ret
+.d1k0:
+.d2k0:
+.d3k0:
+%if %1 == 4
+ %if %2 == 4
+ movq xm6, [dstq+strideq*0-1]
+ movq xm9, [dstq+strideq*1-1]
+ vinserti128 m6, [dstq+strideq*2-1], 1
+ vinserti128 m9, [dstq+stride3q -1], 1
+ movu m11, [rsp+gprsize+0x10]
+ pcmpeqd m12, m12
+ psrldq m5, m6, 2
+ psrldq m10, m9, 2
+ psrld m12, 24
+ punpckldq m6, m9
+ punpckldq m5, m10
+ vpblendvb m6, m11, m12
+ %else
+ movq xm6, [dstq +strideq*0-1]
+ movq xm9, [dstq +strideq*2-1]
+ movhps xm6, [dstq +strideq*1-1]
+ movhps xm9, [dstq +stride3q -1]
+ movq xm10, [dst4q+strideq*0-1]
+ movhps xm10, [dst4q+strideq*1-1]
+ psrldq xm5, xm6, 2
+ psrldq xm11, xm9, 2
+ shufps xm5, xm11, q2020
+ movq xm11, [dst4q+strideq*2-1]
+ movhps xm11, [dst4q+stride3q -1]
+ shufps xm6, xm9, q2020
+ shufps xm9, xm10, xm11, q2020
+ vinserti128 m6, xm9, 1
+ pmovzxbw m9, [leftq+1]
+ psrldq xm10, 2
+ psrldq xm11, 2
+ shufps xm10, xm11, q2020
+ vpbroadcastd m11, [blend_4x8_0+4]
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, m9, m11
+ %endif
+%else
+ movu xm5, [dstq+strideq*0-1]
+ movu xm9, [dstq+strideq*1-1]
+ vinserti128 m5, [dstq+strideq*2-1], 1
+ vinserti128 m9, [dstq+stride3q -1], 1
+ movu m10, [blend_8x8_0+16]
+ punpcklqdq m6, m5, m9
+ vpblendvb m6, [rsp+gprsize+0x60+hq*8+64], m10
+ psrldq m5, 2
+ psrldq m9, 2
+ punpcklqdq m5, m9
+%endif
+ ret
+.d4k0:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m10, [dstq+strideq*1-1]
+ vpbroadcastq m11, [dstq+strideq*2-1]
+ movd xm6, [topq+strideq*1-1]
+ movd xm9, [dstq+strideq*0-1]
+ psrldq m5, m10, 2
+ psrldq m12, m11, 2
+ vpblendd m6, m10, 0x10
+ vpblendd m9, m11, 0x10
+ movu m10, [blend_4x4]
+ vinserti128 m5, [dstq+stride3q +1], 1
+ vinserti128 m12, [botq +1], 1
+ punpckldq m6, m9
+ punpckldq m5, m12
+ vpblendvb m6, [rsp+gprsize+0x40], m10
+ %else
+ movd xm6, [topq +strideq*1-1]
+ movq xm9, [dstq +strideq*1-1]
+ movq xm10, [dstq +stride3q -1]
+ movq xm11, [dst4q+strideq*1-1]
+ pinsrd xm6, [dstq +strideq*0-1], 1
+ movhps xm9, [dstq +strideq*2-1]
+ movhps xm10, [dst4q+strideq*0-1]
+ movhps xm11, [dst4q+strideq*2-1]
+ psrldq xm5, xm9, 2
+ shufps xm6, xm9, q2010
+ psrldq xm9, xm10, 2
+ shufps xm5, xm9, q2020
+ shufps xm10, xm11, q2020
+ movd xm9, [dst4q+stride3q +1]
+ vinserti128 m6, xm10, 1
+ pinsrd xm9, [botq +1], 1
+ psrldq xm11, 2
+ pmovzxbw m10, [leftq-1]
+ shufps xm11, xm9, q1020
+ movu m9, [blend_4x8_0]
+ vinserti128 m5, xm11, 1
+ vpblendvb m6, m10, m9
+ %endif
+%else
+ lea r13, [blend_8x8_0+8]
+ movq xm6, [top2q -1]
+ vbroadcasti128 m5, [dstq+strideq*1-1]
+ vbroadcasti128 m9, [dstq+strideq*2-1]
+ movhps xm6, [dstq+strideq*0-1]
+ movu m11, [r13+hq*2*1+16*1]
+ punpcklqdq m10, m5, m9
+ vinserti128 m5, [dstq+stride3q -1], 1
+ vinserti128 m9, [botq -1], 1
+ vpblendd m6, m10, 0xF0
+ vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*1], m11
+ psrldq m5, 2
+ psrldq m9, 2
+ punpcklqdq m5, m9
+%endif
+ ret
+.d5k0:
+.d6k0:
+.d7k0:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [topq+strideq*1 ]
+ vpbroadcastd m5, [dstq+strideq*1 ]
+ vpbroadcastd m9, [dstq+strideq*2 ]
+ vpblendd xm6, [dstq+strideq*0-4], 0x2
+ vpblendd m5, m9, 0x22
+ vpblendd m6, m5, 0x30
+ vinserti128 m5, [dstq+stride3q ], 1
+ vpblendd m5, [botq -20], 0x20
+ %else
+ movd xm6, [topq +strideq*1]
+ movd xm5, [dstq +strideq*1]
+ movd xm9, [dstq +stride3q ]
+ movd xm10, [dst4q+strideq*1]
+ movd xm11, [dst4q+stride3q ]
+ pinsrd xm6, [dstq +strideq*0], 1
+ pinsrd xm5, [dstq +strideq*2], 1
+ pinsrd xm9, [dst4q+strideq*0], 1
+ pinsrd xm10, [dst4q+strideq*2], 1
+ pinsrd xm11, [botq ], 1
+ punpcklqdq xm6, xm5
+ punpcklqdq xm5, xm9
+ punpcklqdq xm9, xm10
+ punpcklqdq xm10, xm11
+ vinserti128 m6, xm9, 1
+ vinserti128 m5, xm10, 1
+ %endif
+%else
+ movq xm6, [top2q ]
+ movq xm5, [dstq+strideq*1]
+ movq xm9, [dstq+stride3q ]
+ movhps xm6, [dstq+strideq*0]
+ movhps xm5, [dstq+strideq*2]
+ movhps xm9, [botq ]
+ vinserti128 m6, xm5, 1
+ vinserti128 m5, xm9, 1
+%endif
+ ret
+.d0k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [dstq+strideq*2-2]
+ movd xm9, [dstq+stride3q -2]
+ movd xm5, [topq+strideq*0+2]
+ movd xm10, [topq+strideq*1+2]
+ pinsrw xm6, [leftq+4], 0
+ pinsrw xm9, [leftq+6], 0
+ vinserti128 m5, [dstq+strideq*0+2], 1
+ vinserti128 m10, [dstq+strideq*1+2], 1
+ vinserti128 m6, [botq+strideq*0-2], 1
+ vinserti128 m9, [botq+strideq*1-2], 1
+ punpckldq m5, m10
+ punpckldq m6, m9
+ %else
+ movq xm6, [dstq +strideq*2-2]
+ movd xm10, [dst4q+strideq*2-2]
+ movd xm5, [topq +strideq*0+2]
+ movq xm9, [dst4q+strideq*0-2]
+ movhps xm6, [dstq +stride3q -2]
+ pinsrw xm10, [dst4q+stride3q ], 3
+ pinsrd xm5, [topq +strideq*1+2], 1
+ movhps xm9, [dst4q+strideq*1-2]
+ pinsrd xm10, [botq +strideq*0-2], 2
+ pinsrd xm5, [dstq +strideq*0+2], 2
+ pinsrd xm10, [botq +strideq*1-2], 3
+ pinsrd xm5, [dstq +strideq*1+2], 3
+ shufps xm11, xm6, xm9, q3131
+ shufps xm6, xm9, q2020
+ movu m9, [blend_4x8_3+8]
+ vinserti128 m6, xm10, 1
+ vinserti128 m5, xm11, 1
+ vpblendvb m6, [rsp+gprsize+0x10+8], m9
+ %endif
+%else
+ lea r13, [blend_8x8_1+16]
+ movq xm6, [dstq+strideq*2-2]
+ movq xm9, [dstq+stride3q -2]
+ movq xm5, [top1q +2]
+ movq xm10, [top2q +2]
+ movu m11, [r13+hq*2*2+16*2]
+ vinserti128 m6, [botq+strideq*0-2], 1
+ vinserti128 m9, [botq+strideq*1-2], 1
+ vinserti128 m5, [dstq+strideq*0+2], 1
+ vinserti128 m10, [dstq+strideq*1+2], 1
+ punpcklqdq m6, m9
+ punpcklqdq m5, m10
+ vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*2], m11
+%endif
+ ret
+.d1k1:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m6, [dstq+strideq*1-2]
+ vpbroadcastq m9, [dstq+strideq*2-2]
+ movd xm5, [topq+strideq*1+2]
+ movd xm10, [dstq+strideq*0+2]
+ psrldq m11, m6, 4
+ psrldq m12, m9, 4
+ vpblendd m5, m11, 0x10
+ movq xm11, [leftq+2]
+ vinserti128 m6, [dstq+stride3q-2], 1
+ punpckldq xm11, xm11
+ vpblendd m10, m12, 0x10
+ pcmpeqd m12, m12
+ pmovzxwd m11, xm11
+ psrld m12, 16
+ punpckldq m6, m9
+ vpbroadcastd m9, [botq-2]
+ vpblendvb m6, m11, m12
+ punpckldq m5, m10
+ vpblendd m6, m9, 0x20
+ %else
+ movd xm5, [topq +strideq*1+2]
+ movq xm6, [dstq +strideq*1-2]
+ movq xm9, [dstq +stride3q -2]
+ movq xm10, [dst4q+strideq*1-2]
+ movd xm11, [dst4q+stride3q -2]
+ pinsrd xm5, [dstq +strideq*0+2], 1
+ movhps xm6, [dstq +strideq*2-2]
+ movhps xm9, [dst4q+strideq*0-2]
+ movhps xm10, [dst4q+strideq*2-2]
+ pinsrd xm11, [botq -2], 1
+ shufps xm5, xm6, q3110
+ shufps xm6, xm9, q2020
+ shufps xm9, xm10, q3131
+ shufps xm10, xm11, q1020
+ movu m11, [blend_4x8_2+4]
+ vinserti128 m6, xm10, 1
+ vinserti128 m5, xm9, 1
+ vpblendvb m6, [rsp+gprsize+0x10+4], m11
+ %endif
+%else
+ lea r13, [blend_8x8_1+16]
+ movq xm5, [top2q +2]
+ vbroadcasti128 m6, [dstq+strideq*1-2]
+ vbroadcasti128 m9, [dstq+strideq*2-2]
+ movhps xm5, [dstq+strideq*0+2]
+ shufps m10, m6, m9, q2121
+ vinserti128 m6, [dstq+stride3q -2], 1
+ vinserti128 m9, [botq -2], 1
+ movu m11, [r13+hq*2*1+16*1]
+ vpblendd m5, m10, 0xF0
+ punpcklqdq m6, m9
+ vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*1], m11
+%endif
+ ret
+.d2k1:
+%if %1 == 4
+ %if %2 == 4
+ movq xm11, [leftq]
+ movq xm6, [dstq+strideq*0-2]
+ movq xm9, [dstq+strideq*1-2]
+ vinserti128 m6, [dstq+strideq*2-2], 1
+ vinserti128 m9, [dstq+stride3q -2], 1
+ punpckldq xm11, xm11
+ psrldq m5, m6, 4
+ psrldq m10, m9, 4
+ pmovzxwd m11, xm11
+ punpckldq m6, m9
+ punpckldq m5, m10
+ pblendw m6, m11, 0x05
+ %else
+ movq xm5, [dstq +strideq*0-2]
+ movq xm9, [dstq +strideq*2-2]
+ movq xm10, [dst4q+strideq*0-2]
+ movq xm11, [dst4q+strideq*2-2]
+ movhps xm5, [dstq +strideq*1-2]
+ movhps xm9, [dstq +stride3q -2]
+ movhps xm10, [dst4q+strideq*1-2]
+ movhps xm11, [dst4q+stride3q -2]
+ shufps xm6, xm5, xm9, q2020
+ shufps xm5, xm9, q3131
+ shufps xm9, xm10, xm11, q2020
+ shufps xm10, xm11, q3131
+ pmovzxwd m11, [leftq]
+ vinserti128 m6, xm9, 1
+ vinserti128 m5, xm10, 1
+ pblendw m6, m11, 0x55
+ %endif
+%else
+ mova m11, [rsp+gprsize+0x20+hq*8+64]
+ movu xm5, [dstq+strideq*0-2]
+ movu xm9, [dstq+strideq*1-2]
+ vinserti128 m5, [dstq+strideq*2-2], 1
+ vinserti128 m9, [dstq+stride3q -2], 1
+ shufps m6, m5, m9, q1010
+ shufps m5, m9, q2121
+ pblendw m6, m11, 0x11
+%endif
+ ret
+.d3k1:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m11, [dstq+strideq*1-2]
+ vpbroadcastq m12, [dstq+strideq*2-2]
+ movd xm6, [topq+strideq*1-2]
+ movd xm9, [dstq+strideq*0-2]
+ pblendw m11, [leftq-16+2], 0x01
+ pblendw m12, [leftq-16+4], 0x01
+ pinsrw xm9, [leftq- 0+0], 0
+ psrldq m5, m11, 4
+ psrldq m10, m12, 4
+ vinserti128 m5, [dstq+stride3q +2], 1
+ vinserti128 m10, [botq +2], 1
+ vpblendd m6, m11, 0x10
+ vpblendd m9, m12, 0x10
+ punpckldq m6, m9
+ punpckldq m5, m10
+ %else
+ movd xm6, [topq +strideq*1-2]
+ movq xm5, [dstq +strideq*1-2]
+ movq xm9, [dstq +stride3q -2]
+ movq xm10, [dst4q+strideq*1-2]
+ movd xm11, [dst4q+stride3q +2]
+ pinsrw xm6, [dstq +strideq*0 ], 3
+ movhps xm5, [dstq +strideq*2-2]
+ movhps xm9, [dst4q+strideq*0-2]
+ movhps xm10, [dst4q+strideq*2-2]
+ pinsrd xm11, [botq +2], 1
+ shufps xm6, xm5, q2010
+ shufps xm5, xm9, q3131
+ shufps xm9, xm10, q2020
+ shufps xm10, xm11, q1031
+ movu m11, [blend_4x8_2]
+ vinserti128 m6, xm9, 1
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, [rsp+gprsize+0x10-4], m11
+ %endif
+%else
+ lea r13, [blend_8x8_1+8]
+ movq xm6, [top2q -2]
+ vbroadcasti128 m5, [dstq+strideq*1-2]
+ vbroadcasti128 m10, [dstq+strideq*2-2]
+ movhps xm6, [dstq+strideq*0-2]
+ punpcklqdq m9, m5, m10
+ vinserti128 m5, [dstq+stride3q -2], 1
+ vinserti128 m10, [botq -2], 1
+ movu m11, [r13+hq*2*1+16*1]
+ vpblendd m6, m9, 0xF0
+ shufps m5, m10, q2121
+ vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*1], m11
+%endif
+ ret
+.d4k1:
+%if %1 == 4
+ %if %2 == 4
+ vinserti128 m6, [dstq+strideq*0-2], 1
+ vinserti128 m9, [dstq+strideq*1-2], 1
+ movd xm5, [dstq+strideq*2+2]
+ movd xm10, [dstq+stride3q +2]
+ pblendw m6, [leftq-16+0], 0x01
+ pblendw m9, [leftq-16+2], 0x01
+ vinserti128 m5, [botq+strideq*0+2], 1
+ vinserti128 m10, [botq+strideq*1+2], 1
+ vpblendd m6, [topq+strideq*0-2], 0x01
+ vpblendd m9, [topq+strideq*1-2], 0x01
+ punpckldq m5, m10
+ punpckldq m6, m9
+ %else
+ movd xm6, [topq +strideq*0-2]
+ movq xm5, [dstq +strideq*2-2]
+ movq xm9, [dst4q+strideq*0-2]
+ movd xm10, [dst4q+strideq*2+2]
+ pinsrd xm6, [topq +strideq*1-2], 1
+ movhps xm5, [dstq +stride3q -2]
+ movhps xm9, [dst4q+strideq*1-2]
+ pinsrd xm10, [dst4q+stride3q +2], 1
+ pinsrd xm6, [dstq +strideq*0-2], 2
+ pinsrd xm10, [botq +strideq*0+2], 2
+ pinsrd xm6, [dstq +strideq*1-2], 3
+ pinsrd xm10, [botq +strideq*1+2], 3
+ shufps xm11, xm5, xm9, q2020
+ shufps xm5, xm9, q3131
+ movu m9, [blend_4x8_3]
+ vinserti128 m6, xm11, 1
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, [rsp+gprsize+0x10-8], m9
+ %endif
+%else
+ lea r13, [blend_8x8_1]
+ movu m11, [r13+hq*2*2+16*2]
+ movq xm6, [top1q -2]
+ movq xm9, [top2q -2]
+ movq xm5, [dstq+strideq*2+2]
+ movq xm10, [dstq+stride3q +2]
+ vinserti128 m6, [dstq+strideq*0-2], 1
+ vinserti128 m9, [dstq+strideq*1-2], 1
+ vinserti128 m5, [botq+strideq*0+2], 1
+ vinserti128 m10, [botq+strideq*1+2], 1
+ punpcklqdq m6, m9
+ vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*2], m11
+ punpcklqdq m5, m10
+%endif
+ ret
+.d5k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [topq+strideq*0-1]
+ movd xm9, [topq+strideq*1-1]
+ movd xm5, [dstq+strideq*2+1]
+ movd xm10, [dstq+stride3q +1]
+ pcmpeqd m12, m12
+ pmovzxbw m11, [leftq-8+1]
+ psrld m12, 24
+ vinserti128 m6, [dstq+strideq*0-1], 1
+ vinserti128 m9, [dstq+strideq*1-1], 1
+ vinserti128 m5, [botq+strideq*0+1], 1
+ vinserti128 m10, [botq+strideq*1+1], 1
+ punpckldq m6, m9
+ pxor m9, m9
+ vpblendd m12, m9, 0x0F
+ punpckldq m5, m10
+ vpblendvb m6, m11, m12
+ %else
+ movd xm6, [topq +strideq*0-1]
+ movq xm5, [dstq +strideq*2-1]
+ movq xm9, [dst4q+strideq*0-1]
+ movd xm10, [dst4q+strideq*2+1]
+ pinsrd xm6, [topq +strideq*1-1], 1
+ movhps xm5, [dstq +stride3q -1]
+ movhps xm9, [dst4q+strideq*1-1]
+ pinsrd xm10, [dst4q+stride3q +1], 1
+ pinsrd xm6, [dstq +strideq*0-1], 2
+ pinsrd xm10, [botq +strideq*0+1], 2
+ pinsrd xm6, [dstq +strideq*1-1], 3
+ pinsrd xm10, [botq +strideq*1+1], 3
+ shufps xm11, xm5, xm9, q2020
+ vinserti128 m6, xm11, 1
+ pmovzxbw m11, [leftq-3]
+ psrldq xm5, 2
+ psrldq xm9, 2
+ shufps xm5, xm9, q2020
+ movu m9, [blend_4x8_1]
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, m11, m9
+ %endif
+%else
+ lea r13, [blend_8x8_0]
+ movu m11, [r13+hq*2*2+16*2]
+ movq xm6, [top1q -1]
+ movq xm9, [top2q -1]
+ movq xm5, [dstq+strideq*2+1]
+ movq xm10, [dstq+stride3q +1]
+ vinserti128 m6, [dstq+strideq*0-1], 1
+ vinserti128 m9, [dstq+strideq*1-1], 1
+ vinserti128 m5, [botq+strideq*0+1], 1
+ vinserti128 m10, [botq+strideq*1+1], 1
+ punpcklqdq m6, m9
+ punpcklqdq m5, m10
+ vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*2], m11
+%endif
+ ret
+.d6k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [topq+strideq*0]
+ movd xm9, [topq+strideq*1]
+ movd xm5, [dstq+strideq*2]
+ movd xm10, [dstq+stride3q ]
+ vinserti128 m6, [dstq+strideq*0], 1
+ vinserti128 m9, [dstq+strideq*1], 1
+ vinserti128 m5, [botq+strideq*0], 1
+ vinserti128 m10, [botq+strideq*1], 1
+ punpckldq m6, m9
+ punpckldq m5, m10
+ %else
+ movd xm5, [dstq +strideq*2]
+ movd xm6, [topq +strideq*0]
+ movd xm9, [dst4q+strideq*2]
+ pinsrd xm5, [dstq +stride3q ], 1
+ pinsrd xm6, [topq +strideq*1], 1
+ pinsrd xm9, [dst4q+stride3q ], 1
+ pinsrd xm5, [dst4q+strideq*0], 2
+ pinsrd xm6, [dstq +strideq*0], 2
+ pinsrd xm9, [botq +strideq*0], 2
+ pinsrd xm5, [dst4q+strideq*1], 3
+ pinsrd xm6, [dstq +strideq*1], 3
+ pinsrd xm9, [botq +strideq*1], 3
+ vinserti128 m6, xm5, 1
+ vinserti128 m5, xm9, 1
+ %endif
+%else
+ movq xm5, [dstq+strideq*2]
+ movq xm9, [botq+strideq*0]
+ movq xm6, [top1q ]
+ movq xm10, [dstq+strideq*0]
+ movhps xm5, [dstq+stride3q ]
+ movhps xm9, [botq+strideq*1]
+ movhps xm6, [top2q ]
+ movhps xm10, [dstq+strideq*1]
+ vinserti128 m5, xm9, 1
+ vinserti128 m6, xm10, 1
+%endif
+ ret
+.d7k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm5, [dstq+strideq*2-1]
+ movd xm9, [dstq+stride3q -1]
+ movd xm6, [topq+strideq*0+1]
+ movd xm10, [topq+strideq*1+1]
+ pinsrb xm5, [leftq+ 5], 0
+ pinsrb xm9, [leftq+ 7], 0
+ vinserti128 m6, [dstq+strideq*0+1], 1
+ vinserti128 m10, [dstq+strideq*1+1], 1
+ vinserti128 m5, [botq+strideq*0-1], 1
+ vinserti128 m9, [botq+strideq*1-1], 1
+ punpckldq m6, m10
+ punpckldq m5, m9
+ %else
+ movd xm6, [topq +strideq*0+1]
+ movq xm9, [dstq +strideq*2-1]
+ movq xm10, [dst4q+strideq*0-1]
+ movd xm11, [dst4q+strideq*2-1]
+ pinsrd xm6, [topq +strideq*1+1], 1
+ movhps xm9, [dstq +stride3q -1]
+ movhps xm10, [dst4q+strideq*1-1]
+ pinsrd xm11, [dst4q+stride3q -1], 1
+ pinsrd xm6, [dstq +strideq*0+1], 2
+ pinsrd xm11, [botq +strideq*0-1], 2
+ pinsrd xm6, [dstq +strideq*1+1], 3
+ pinsrd xm11, [botq +strideq*1-1], 3
+ shufps xm5, xm9, xm10, q2020
+ vinserti128 m5, xm11, 1
+ pmovzxbw m11, [leftq+5]
+ psrldq xm9, 2
+ psrldq xm10, 2
+ shufps xm9, xm10, q2020
+ movu m10, [blend_4x8_1+8]
+ vinserti128 m6, xm9, 1
+ vpblendvb m5, m11, m10
+ %endif
+%else
+ lea r13, [blend_8x8_0+16]
+ movq xm5, [dstq+strideq*2-1]
+ movq xm9, [botq+strideq*0-1]
+ movq xm6, [top1q +1]
+ movq xm10, [dstq+strideq*0+1]
+ movhps xm5, [dstq+stride3q -1]
+ movhps xm9, [botq+strideq*1-1]
+ movhps xm6, [top2q +1]
+ movhps xm10, [dstq+strideq*1+1]
+ movu m11, [r13+hq*2*2+16*2]
+ vinserti128 m5, xm9, 1
+ vinserti128 m6, xm10, 1
+ vpblendvb m5, [rsp+gprsize+0x60+hq*8+64+8*2], m11
+%endif
+ ret
+
+.border_block:
+ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge
+%define rstk rsp
+%assign stack_offset stack_offset_entry
+%assign regs_used 11
+ ALLOC_STACK 2*16+(%2+4)*32, 16
+%define px rsp+2*16+2*32
+
+ pcmpeqw m14, m14
+ psllw m14, 15 ; 0x8000
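+    ; 0x8000 is the out-of-bounds sentinel: it is both a large unsigned and a
+    ; negative signed value, so the clipping logic can discard it
+    ; (see the SSE4 note in cdef_sse.asm)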
+
+ ; prepare pixel buffers - body/right
+%if %1 == 4
+ INIT_XMM avx2
+%endif
+%if %2 == 8
+ lea dst4q, [dstq+strideq*4]
+%endif
+ lea stride3q, [strideq*3]
+ test edgeb, 2 ; have_right
+ jz .no_right
+ pmovzxbw m1, [dstq+strideq*0]
+ pmovzxbw m2, [dstq+strideq*1]
+ pmovzxbw m3, [dstq+strideq*2]
+ pmovzxbw m4, [dstq+stride3q]
+ mova [px+0*32], m1
+ mova [px+1*32], m2
+ mova [px+2*32], m3
+ mova [px+3*32], m4
+%if %2 == 8
+ pmovzxbw m1, [dst4q+strideq*0]
+ pmovzxbw m2, [dst4q+strideq*1]
+ pmovzxbw m3, [dst4q+strideq*2]
+ pmovzxbw m4, [dst4q+stride3q]
+ mova [px+4*32], m1
+ mova [px+5*32], m2
+ mova [px+6*32], m3
+ mova [px+7*32], m4
+%endif
+ jmp .body_done
+.no_right:
+%if %1 == 4
+ movd xm1, [dstq+strideq*0]
+ movd xm2, [dstq+strideq*1]
+ movd xm3, [dstq+strideq*2]
+ movd xm4, [dstq+stride3q]
+ pmovzxbw xm1, xm1
+ pmovzxbw xm2, xm2
+ pmovzxbw xm3, xm3
+ pmovzxbw xm4, xm4
+ movq [px+0*32], xm1
+ movq [px+1*32], xm2
+ movq [px+2*32], xm3
+ movq [px+3*32], xm4
+%else
+ pmovzxbw xm1, [dstq+strideq*0]
+ pmovzxbw xm2, [dstq+strideq*1]
+ pmovzxbw xm3, [dstq+strideq*2]
+ pmovzxbw xm4, [dstq+stride3q]
+ mova [px+0*32], xm1
+ mova [px+1*32], xm2
+ mova [px+2*32], xm3
+ mova [px+3*32], xm4
+%endif
+ movd [px+0*32+%1*2], xm14
+ movd [px+1*32+%1*2], xm14
+ movd [px+2*32+%1*2], xm14
+ movd [px+3*32+%1*2], xm14
+%if %2 == 8
+ %if %1 == 4
+ movd xm1, [dst4q+strideq*0]
+ movd xm2, [dst4q+strideq*1]
+ movd xm3, [dst4q+strideq*2]
+ movd xm4, [dst4q+stride3q]
+ pmovzxbw xm1, xm1
+ pmovzxbw xm2, xm2
+ pmovzxbw xm3, xm3
+ pmovzxbw xm4, xm4
+ movq [px+4*32], xm1
+ movq [px+5*32], xm2
+ movq [px+6*32], xm3
+ movq [px+7*32], xm4
+ %else
+ pmovzxbw xm1, [dst4q+strideq*0]
+ pmovzxbw xm2, [dst4q+strideq*1]
+ pmovzxbw xm3, [dst4q+strideq*2]
+ pmovzxbw xm4, [dst4q+stride3q]
+ mova [px+4*32], xm1
+ mova [px+5*32], xm2
+ mova [px+6*32], xm3
+ mova [px+7*32], xm4
+ %endif
+ movd [px+4*32+%1*2], xm14
+ movd [px+5*32+%1*2], xm14
+ movd [px+6*32+%1*2], xm14
+ movd [px+7*32+%1*2], xm14
+%endif
+.body_done:
+
+ ; top
+ test edgeb, 4 ; have_top
+ jz .no_top
+ test edgeb, 1 ; have_left
+ jz .top_no_left
+ test edgeb, 2 ; have_right
+ jz .top_no_right
+ pmovzxbw m1, [topq+strideq*0-(%1/2)]
+ pmovzxbw m2, [topq+strideq*1-(%1/2)]
+ movu [px-2*32-%1], m1
+ movu [px-1*32-%1], m2
+ jmp .top_done
+.top_no_right:
+ pmovzxbw m1, [topq+strideq*0-%1]
+ pmovzxbw m2, [topq+strideq*1-%1]
+ movu [px-2*32-%1*2], m1
+ movu [px-1*32-%1*2], m2
+ movd [px-2*32+%1*2], xm14
+ movd [px-1*32+%1*2], xm14
+ jmp .top_done
+.top_no_left:
+ test edgeb, 2 ; have_right
+ jz .top_no_left_right
+ pmovzxbw m1, [topq+strideq*0]
+ pmovzxbw m2, [topq+strideq*1]
+ mova [px-2*32+0], m1
+ mova [px-1*32+0], m2
+ movd [px-2*32-4], xm14
+ movd [px-1*32-4], xm14
+ jmp .top_done
+.top_no_left_right:
+%if %1 == 4
+ movd xm1, [topq+strideq*0]
+ pinsrd xm1, [topq+strideq*1], 1
+ pmovzxbw xm1, xm1
+ movq [px-2*32+0], xm1
+ movhps [px-1*32+0], xm1
+%else
+ pmovzxbw xm1, [topq+strideq*0]
+ pmovzxbw xm2, [topq+strideq*1]
+ mova [px-2*32+0], xm1
+ mova [px-1*32+0], xm2
+%endif
+ movd [px-2*32-4], xm14
+ movd [px-1*32-4], xm14
+ movd [px-2*32+%1*2], xm14
+ movd [px-1*32+%1*2], xm14
+ jmp .top_done
+.no_top:
+ movu [px-2*32-%1], m14
+ movu [px-1*32-%1], m14
+.top_done:
+
+ ; left
+ test edgeb, 1 ; have_left
+ jz .no_left
+ pmovzxbw xm1, [leftq+ 0]
+%if %2 == 8
+ pmovzxbw xm2, [leftq+ 8]
+%endif
+ movd [px+0*32-4], xm1
+ pextrd [px+1*32-4], xm1, 1
+ pextrd [px+2*32-4], xm1, 2
+ pextrd [px+3*32-4], xm1, 3
+%if %2 == 8
+ movd [px+4*32-4], xm2
+ pextrd [px+5*32-4], xm2, 1
+ pextrd [px+6*32-4], xm2, 2
+ pextrd [px+7*32-4], xm2, 3
+%endif
+ jmp .left_done
+.no_left:
+ movd [px+0*32-4], xm14
+ movd [px+1*32-4], xm14
+ movd [px+2*32-4], xm14
+ movd [px+3*32-4], xm14
+%if %2 == 8
+ movd [px+4*32-4], xm14
+ movd [px+5*32-4], xm14
+ movd [px+6*32-4], xm14
+ movd [px+7*32-4], xm14
+%endif
+.left_done:
+
+ ; bottom
+ DEFINE_ARGS dst, stride, _, _, bot, pri, sec, stride3, _, edge
+ test edgeb, 8 ; have_bottom
+ jz .no_bottom
+ test edgeb, 1 ; have_left
+ jz .bottom_no_left
+ test edgeb, 2 ; have_right
+ jz .bottom_no_right
+ pmovzxbw m1, [botq+strideq*0-(%1/2)]
+ pmovzxbw m2, [botq+strideq*1-(%1/2)]
+ movu [px+(%2+0)*32-%1], m1
+ movu [px+(%2+1)*32-%1], m2
+ jmp .bottom_done
+.bottom_no_right:
+ pmovzxbw m1, [botq+strideq*0-%1]
+ pmovzxbw m2, [botq+strideq*1-%1]
+ movu [px+(%2+0)*32-%1*2], m1
+ movu [px+(%2+1)*32-%1*2], m2
+%if %1 == 8
+ movd [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu
+%endif
+ movd [px+(%2+0)*32+%1*2], xm14
+ movd [px+(%2+1)*32+%1*2], xm14
+ jmp .bottom_done
+.bottom_no_left:
+ test edgeb, 2 ; have_right
+ jz .bottom_no_left_right
+ pmovzxbw m1, [botq+strideq*0]
+ pmovzxbw m2, [botq+strideq*1]
+ mova [px+(%2+0)*32+0], m1
+ mova [px+(%2+1)*32+0], m2
+ movd [px+(%2+0)*32-4], xm14
+ movd [px+(%2+1)*32-4], xm14
+ jmp .bottom_done
+.bottom_no_left_right:
+%if %1 == 4
+ movd xm1, [botq+strideq*0]
+ pinsrd xm1, [botq+strideq*1], 1
+ pmovzxbw xm1, xm1
+ movq [px+(%2+0)*32+0], xm1
+ movhps [px+(%2+1)*32+0], xm1
+%else
+ pmovzxbw xm1, [botq+strideq*0]
+ pmovzxbw xm2, [botq+strideq*1]
+ mova [px+(%2+0)*32+0], xm1
+ mova [px+(%2+1)*32+0], xm2
+%endif
+ movd [px+(%2+0)*32-4], xm14
+ movd [px+(%2+1)*32-4], xm14
+ movd [px+(%2+0)*32+%1*2], xm14
+ movd [px+(%2+1)*32+%1*2], xm14
+ jmp .bottom_done
+.no_bottom:
+ movu [px+(%2+0)*32-%1], m14
+ movu [px+(%2+1)*32-%1], m14
+.bottom_done:
+
+ ; actual filter
+ INIT_YMM avx2
+ DEFINE_ARGS dst, stride, _, pridmp, damping, pri, secdmp, stride3, zero
+%undef edged
+ ; register to shuffle values into after packing
+ vbroadcasti128 m12, [shufb_lohi]
+
+ mov dampingd, r8m
+ xor zerod, zerod
+ movifnidn prid, prim
+ sub dampingd, 31
+ movifnidn secdmpd, secdmpm
+ test prid, prid
+ jz .border_sec_only
+ movd xm0, prid
+ lzcnt pridmpd, prid
+ add pridmpd, dampingd
+ cmovs pridmpd, zerod
+ mov [rsp+0], pridmpq ; pri_shift
+ test secdmpd, secdmpd
+ jz .border_pri_only
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ mov [rsp+8], secdmpq ; sec_shift
+
+ DEFINE_ARGS dst, stride, _, pridmp, table, pri, secdmp, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, _, dir, table, pri, sec, stride3
+ vpbroadcastb m0, xm0 ; pri_strength
+ vpbroadcastb m1, xm1 ; sec_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ lea secq, [tableq+12] ; sec_taps
+
+ BORDER_PREP_REGS %1, %2
+%if %1*%2*2/mmsize > 1
+.border_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2, 1
+.border_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
+ ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
+ ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
+ dec kq
+ jge .border_k_loop
+
+ vpbroadcastd m10, [pw_2048]
+ BORDER_ADJUST_PIXEL %1, m10, 1
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_v_loop
+%endif
+ RET
+
+.border_pri_only:
+ DEFINE_ARGS dst, stride, _, pridmp, table, pri, _, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ DEFINE_ARGS dst, stride, _, dir, table, pri, _, stride3
+ vpbroadcastb m0, xm0 ; pri_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ BORDER_PREP_REGS %1, %2
+ vpbroadcastd m1, [pw_2048]
+%if %1*%2*2/mmsize > 1
+.border_pri_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2
+.border_pri_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
+ dec kq
+ jge .border_pri_k_loop
+ BORDER_ADJUST_PIXEL %1, m1
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_pri_v_loop
+%endif
+ RET
+
+.border_sec_only:
+ DEFINE_ARGS dst, stride, _, _, damping, _, secdmp, stride3
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ mov [rsp+8], secdmpq ; sec_shift
+ DEFINE_ARGS dst, stride, _, _, table, _, secdmp, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+ DEFINE_ARGS dst, stride, _, dir, table, _, sec, stride3
+ vpbroadcastb m1, xm1 ; sec_strength
+ lea secq, [tableq+12] ; sec_taps
+ BORDER_PREP_REGS %1, %2
+ vpbroadcastd m0, [pw_2048]
+%if %1*%2*2/mmsize > 1
+.border_sec_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2
+.border_sec_k_loop:
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
+ ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
+ dec kq
+ jge .border_sec_k_loop
+ BORDER_ADJUST_PIXEL %1, m0
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_sec_v_loop
+%endif
+ RET
+%endmacro
+
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
+
+INIT_YMM avx2
+cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3
+ lea stride3q, [strideq*3]
+ movq xm0, [srcq+strideq*0]
+ movq xm1, [srcq+strideq*1]
+ movq xm2, [srcq+strideq*2]
+ movq xm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m4, [srcq+stride3q ]
+ vpbroadcastq m5, [srcq+strideq*2]
+ vpblendd m0, m4, 0xf0
+ vpblendd m1, m5, 0xf0
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpbroadcastq m5, [srcq+strideq*0]
+ vpblendd m2, m4, 0xf0
+ vpblendd m3, m5, 0xf0
+ pxor m4, m4
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+cglobal_label .main
+ vpbroadcastd m4, [pw_128]
+ PROLOGUE 3, 4, 15
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+
+ ; shuffle registers to generate partial_sum_diag[0-1] together
+ vperm2i128 m7, m0, m0, 0x01
+ vperm2i128 m6, m1, m1, 0x01
+ vperm2i128 m5, m2, m2, 0x01
+ vperm2i128 m4, m3, m3, 0x01
+
+ ; start with partial_sum_hv[0-1]
+ paddw m8, m0, m1
+ paddw m9, m2, m3
+ phaddw m10, m0, m1
+ phaddw m11, m2, m3
+ paddw m8, m9
+ phaddw m10, m11
+ vextracti128 xm9, m8, 1
+ vextracti128 xm11, m10, 1
+ paddw xm8, xm9 ; partial_sum_hv[1]
+ phaddw xm10, xm11 ; partial_sum_hv[0]
+ vinserti128 m8, xm10, 1
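+    ; each cost is a sum of squared partial sums weighted by div_table
+    ; (entries ~840/len): pmaddwd squares and pair-adds, pmulld applies weights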
+ vpbroadcastd m9, [div_table+44]
+ pmaddwd m8, m8
+ pmulld m8, m9 ; cost6[2a-d] | cost2[a-d]
+
+ ; create aggregates [lower half]:
+ ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
+ ; m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
+ ; m10= m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
+ ; m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
+ ; and [upper half]:
+ ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
+ ; m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
+ ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
+ ; m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
+ ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd
+
+ pslldq m9, m1, 2
+ psrldq m10, m1, 14
+ pslldq m11, m2, 4
+ psrldq m12, m2, 12
+ pslldq m13, m3, 6
+ psrldq m14, m3, 10
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14
+ pslldq m11, m4, 8
+ psrldq m12, m4, 8
+ pslldq m13, m5, 10
+ psrldq m14, m5, 6
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14
+ pslldq m11, m6, 12
+ psrldq m12, m6, 4
+ pslldq m13, m7, 14
+ psrldq m14, m7, 2
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14 ; partial_sum_diag[0/1][8-14,zero]
+ vbroadcasti128 m14, [shufw_6543210x]
+ vbroadcasti128 m13, [div_table+16]
+ vbroadcasti128 m12, [div_table+0]
+ paddw m9, m0 ; partial_sum_diag[0/1][0-7]
+ pshufb m10, m14
+ punpckhwd m11, m9, m10
+ punpcklwd m9, m10
+ pmaddwd m11, m11
+ pmaddwd m9, m9
+ pmulld m11, m13
+ pmulld m9, m12
+ paddd m9, m11 ; cost0[a-d] | cost4[a-d]
+
+ ; merge horizontally and vertically for partial_sum_alt[0-3]
+ paddw m10, m0, m1
+ paddw m11, m2, m3
+ paddw m12, m4, m5
+ paddw m13, m6, m7
+ phaddw m0, m4
+ phaddw m1, m5
+ phaddw m2, m6
+ phaddw m3, m7
+
+ ; create aggregates [lower half]:
+ ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
+ ; m11= m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
+ ; and [upper half]:
+ ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
+ ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
+ ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd
+
+ pslldq m4, m11, 2
+ psrldq m11, 14
+ pslldq m5, m12, 4
+ psrldq m12, 12
+ pslldq m6, m13, 6
+ psrldq m13, 10
+ paddw m4, m10
+ paddw m11, m12
+ vpbroadcastd m12, [div_table+44]
+ paddw m5, m6
+ paddw m11, m13 ; partial_sum_alt[3/2] right
+ vbroadcasti128 m13, [div_table+32]
+ paddw m4, m5 ; partial_sum_alt[3/2] left
+ pshuflw m5, m11, q3012
+ punpckhwd m6, m11, m4
+ punpcklwd m4, m5
+ pmaddwd m6, m6
+ pmaddwd m4, m4
+ pmulld m6, m12
+ pmulld m4, m13
+ paddd m4, m6 ; cost7[a-d] | cost5[a-d]
+
+ ; create aggregates [lower half]:
+ ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
+ ; m1 = m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
+ ; and [upper half]:
+ ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
+ ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
+ ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd
+
+ pslldq m5, m1, 2
+ psrldq m1, 14
+ pslldq m6, m2, 4
+ psrldq m2, 12
+ pslldq m7, m3, 6
+ psrldq m3, 10
+ paddw m5, m0
+ paddw m1, m2
+ paddw m6, m7
+ paddw m1, m3 ; partial_sum_alt[0/1] right
+ paddw m5, m6 ; partial_sum_alt[0/1] left
+ pshuflw m0, m1, q3012
+ punpckhwd m1, m5
+ punpcklwd m5, m0
+ pmaddwd m1, m1
+ pmaddwd m5, m5
+ pmulld m1, m12
+ pmulld m5, m13
+ paddd m5, m1 ; cost1[a-d] | cost3[a-d]
+
+ mova xm0, [pd_47130256+ 16]
+ mova m1, [pd_47130256]
+ phaddd m9, m8
+ phaddd m5, m4
+ phaddd m9, m5
+ vpermd m0, m9 ; cost[0-3]
+ vpermd m1, m9 ; cost[4-7] | cost[0-3]
+
+ ; now find the best cost
+ pmaxsd xm2, xm0, xm1
+ pshufd xm3, xm2, q1032
+ pmaxsd xm2, xm3
+ pshufd xm3, xm2, q2301
+ pmaxsd xm2, xm3 ; best cost
+
+ ; find the idx using minpos
+ ; make everything other than the best cost negative via subtraction
+ ; find the min of unsigned 16-bit ints to sort out the negative values
+ psubd xm4, xm1, xm2
+ psubd xm3, xm0, xm2
+ packssdw xm3, xm4
+ phminposuw xm3, xm3
+
+ ; convert idx to 32-bits
+ psrld xm3, 16
+ movd eax, xm3
+
+ ; get idx^4 complement
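+    ; variance output: *var = (best_cost - cost[best_dir ^ 4]) >> 10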
+ vpermd m3, m1
+ psubd xm2, xm3
+ psrld xm2, 10
+ movd [varq], xm2
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/cdef_avx512.asm b/third_party/dav1d/src/x86/cdef_avx512.asm
new file mode 100644
index 0000000000..b4f9c008ca
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef_avx512.asm
@@ -0,0 +1,860 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+%macro DUP4 1-*
+ %rep %0
+ times 4 db %1
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro DIRS 16 ; cdef_directions[]
+ %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1
+ ; masking away unused bits allows us to use a single vpaddd {1to16}
+ ; instruction instead of having to do vpbroadcastd + paddb
+ db %13 & 0x3f, -%13 & 0x3f
+ %rotate 1
+ %endrep
+%endmacro
+
+SECTION_RODATA 64
+
+lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
+ db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13
+ db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37
+ db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57
+lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
+ db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13
+lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29
+ db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45
+ db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61
+ db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95
+pd_01234567: dd 0, 1, 2, 3, 4, 5, 6, 7
+lut_perm_8x8a: db 32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51, 52, 53, 54, 55
+ db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59
+lut_perm_8x8b: db 12, 13, 0, 1, 2, 3, 4, 5, 14, 15, 16, 17, 18, 19, 20, 21
+ db 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 20, 21, 22, 23, 24, 25
+ db 28, 29, 32, 33, 34, 35, 36, 37, 30, 31, 48, 49, 50, 51, 52, 53
+ db 34, 35, 36, 37, 38, 39, 40, 41, 50, 51, 52, 53, 54, 55, 56, 57
+end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
+end_perm_clip: db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30
+ db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62
+ db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31
+ db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63
+edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001
+ dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011
+ dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101
+ dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111
+ dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001
+ dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011
+ dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101
+ dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111
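+        ; ^ one 64-bit mask per edge-flag combination; bit i set means byte i
+        ;   of the pixel lut is in-range, tested with vpshufbitqmb below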
+px_idx: DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45
+cdef_dirs: DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15
+gf_shr: dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0
+ dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2
+ dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4
+ dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6
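+        ; ^ GF(2) affine matrices: gf2p8affineqb with the n-th quadword acts
+        ;   as a per-byte >> n, used for the damping shift on abs(diff)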
+pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4
+sec_tap: db 32, 32, 16, 16
+pd_268435568: dd 268435568
+
+SECTION .text
+
+%if WIN64
+DECLARE_REG_TMP 4
+%else
+DECLARE_REG_TMP 8
+%endif
+
+; lut:
+; t0 t1 t2 t3 t4 t5 t6 t7
+; T0 T1 T2 T3 T4 T5 T6 T7
+; L0 L1 00 01 02 03 04 05
+; L2 L3 10 11 12 13 14 15
+; L4 L5 20 21 22 23 24 25
+; L6 L7 30 31 32 33 34 35
+; b0 b1 b2 b3 b4 b5 b6 b7
+; B0 B1 B2 B3 B4 B5 B6 B7
+
+INIT_ZMM avx512icl
+cglobal cdef_filter_4x4_8bpc, 5, 8, 13, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r7-edge_mask
+ movq xmm0, [dstq+strideq*0]
+ movhps xmm0, [dstq+strideq*1]
+ lea r7, [edge_mask]
+ movq xmm1, [topq+strideq*0-2]
+ movhps xmm1, [topq+strideq*1-2]
+ mov r6d, edgem
+ vinserti32x4 ym0, ymm0, [leftq], 1
+ lea r2, [strideq*3]
+ vinserti32x4 ym1, ymm1, [dstq+strideq*2], 1
+ mova m5, [base+lut_perm_4x4]
+ vinserti32x4 m0, [dstq+r2], 2
+ test r6b, 0x08 ; avoid buffer overread
+ jz .main
+ vinserti32x4 m1, [botq+strideq*0-4], 2
+ vinserti32x4 m0, [botq+strideq*1-4], 3
+.main:
+ movifnidn prid, prim
+ mov t0d, dirm
+ mova m3, [base+px_idx]
+ mov r3d, dampingm
+ vpermi2b m5, m0, m1 ; lut
+ vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+ pxor m7, m7
+ lea r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8
+ vpermb m6, m3, m5 ; px
+ cmp r6d, 0x0f
+ jne .mask_edges ; mask edges only if required
+ test prid, prid
+ jz .sec_only
+ vpaddd m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+ vpermb m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1
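+    ; per k: sum += pri_taps[k] * constrain(p - px, pri_strength, pri_shift),
+    ; constrain(d, s, sh) = apply_sign(min(|d|, max(0, s - (|d| >> sh))), d)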
+%macro CDEF_FILTER_4x4_PRI 0
+ vpcmpub k1, m6, m1, 6 ; px > pN
+ psubb m2, m1, m6
+ lzcnt r6d, prid
+ vpsubb m2{k1}, m6, m1 ; abs(diff)
+ vpbroadcastb m4, prid
+ and prid, 1
+ vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift
+ movifnidn secd, secm
+ vpbroadcastd m10, [base+pri_tap+priq*4]
+ vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap)
+    psubusb       m4, m9                ; imax(0, pri_strength - (abs(diff) >> shift))
+ pminub m2, m4
+ vpdpbusd m0, m2, m10 ; sum
+%endmacro
+ CDEF_FILTER_4x4_PRI
+ test secd, secd
+ jz .end_no_clip
+ call .sec
+.end_clip:
+ pminub m4, m6, m1
+ pmaxub m1, m6
+ pminub m5, m2, m3
+ pmaxub m2, m3
+ pminub m4, m5
+ pmaxub m2, m1
+ psrldq m1, m4, 2
+ psrldq m3, m2, 2
+ pminub m1, m4
+ vpcmpw k1, m0, m7, 1
+ vpshldd m6, m0, 8
+ pmaxub m2, m3
+ pslldq m3, m1, 1
+ psubw m7, m0
+ paddusw m0, m6 ; clip >0xff
+ vpsubusw m0{k1}, m6, m7 ; clip <0x00
+ pslldq m4, m2, 1
+ pminub m1, m3
+ pmaxub m2, m4
+ pmaxub m0, m1
+ pminub m0, m2
+ jmp .end
+.sec_only:
+ movifnidn secd, secm
+ call .sec
+.end_no_clip:
+ vpshldd m6, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ paddw m0, m6 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+.end:
+ mova xm1, [base+end_perm]
+ vpermb m0, m1, m0 ; output in bits 8-15 of each dword
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ RET
+.mask_edges_sec_only:
+ movifnidn secd, secm
+ call .mask_edges_sec
+ jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+ vpbroadcastq m8, [base+edge_mask+r6*8]
+ test prid, prid
+ jz .mask_edges_sec_only
+ vpaddd m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16}
+ vpshufbitqmb k1, m8, m2 ; index in-range
+ mova m1, m6
+ vpermb m1{k1}, m2, m5
+ CDEF_FILTER_4x4_PRI
+ test secd, secd
+ jz .end_no_clip
+ call .mask_edges_sec
+ jmp .end_clip
+.mask_edges_sec:
+ vpaddd m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16}
+ vpaddd m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16}
+ vpshufbitqmb k1, m8, m4
+ mova m2, m6
+ vpermb m2{k1}, m4, m5
+ vpshufbitqmb k1, m8, m9
+ mova m3, m6
+ vpermb m3{k1}, m9, m5
+ jmp .sec_main
+ALIGN function_align
+.sec:
+ vpaddd m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+ vpaddd m3, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+ vpermb m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1
+ vpermb m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3
+.sec_main:
+ vpbroadcastd m8, [base+sec_tap]
+ vpcmpub k1, m6, m2, 6
+ psubb m4, m2, m6
+ vpbroadcastb m12, secd
+ lzcnt secd, secd
+ vpsubb m4{k1}, m6, m2
+ vpcmpub k2, m6, m3, 6
+ vpbroadcastq m11, [r3+secq*8]
+ gf2p8affineqb m10, m4, m11, 0
+ psubb m5, m3, m6
+ mova m9, m8
+ vpsubb m8{k1}, m7, m8
+ psubusb m10, m12, m10
+ vpsubb m5{k2}, m6, m3
+ pminub m4, m10
+ vpdpbusd m0, m4, m8
+ gf2p8affineqb m11, m5, m11, 0
+ vpsubb m9{k2}, m7, m9
+ psubusb m12, m11
+ pminub m5, m12
+ vpdpbusd m0, m5, m9
+ ret
+
+DECLARE_REG_TMP 2, 7
+
+; lut top lut bottom
+; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25
+; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35
+; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45
+; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55
+; L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65
+; L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75
+; L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7
+; La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7
+
+cglobal cdef_filter_4x8_8bpc, 5, 9, 22, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r8-edge_mask
+ vpbroadcastd ym21, strided
+ mov r6d, edgem
+ lea r8, [edge_mask]
+ movq xm1, [topq+strideq*0-2]
+ pmulld ym21, [base+pd_01234567]
+ kxnorb k1, k1, k1
+ movq xm2, [topq+strideq*1-2]
+ vpgatherdq m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7
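+    ; ^ gathers all 8 dst rows (8 bytes each) in one go, offsets 0..7*stride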
+ mova m14, [base+lut_perm_4x8a]
+ movu m15, [base+lut_perm_4x8b]
+ test r6b, 0x08 ; avoid buffer overread
+ jz .main
+ vinserti32x4 ym1, [botq+strideq*0-2], 1
+ vinserti32x4 ym2, [botq+strideq*1-2], 1
+.main:
+ punpcklqdq ym1, ym2
+ vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____
+ movifnidn prid, prim
+ mov t0d, dirm
+ mova m16, [base+px_idx]
+ mov r3d, dampingm
+ vpermi2b m14, m0, m1 ; lut top
+ vpermi2b m15, m0, m1 ; lut bottom
+ vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+ pxor m20, m20
+ lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
+ vpermb m2, m16, m14 ; pxt
+ vpermb m3, m16, m15 ; pxb
+ mova m1, m0
+ cmp r6b, 0x0f
+ jne .mask_edges ; mask edges only if required
+ test prid, prid
+ jz .sec_only
+ vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+ vpermb m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1
+ vpermb m5, m6, m15 ; pNb
+%macro CDEF_FILTER_4x8_PRI 0
+ vpcmpub k1, m2, m4, 6 ; pxt > pNt
+ vpcmpub k2, m3, m5, 6 ; pxb > pNb
+ psubb m6, m4, m2
+ psubb m7, m5, m3
+ lzcnt r6d, prid
+ vpsubb m6{k1}, m2, m4 ; abs(diff_top)
+ vpsubb m7{k2}, m3, m5 ; abs(diff_bottom)
+ vpbroadcastb m13, prid
+ vpbroadcastq m9, [r3+r6*8]
+ and prid, 1
+ vpbroadcastd m11, [base+pri_tap+priq*4]
+ vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift
+ vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift
+ mova m10, m11
+ movifnidn t1d, secm
+ vpsubb m10{k1}, m20, m11 ; apply_sign(pri_tap_top)
+ vpsubb m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom)
+    psubusb      m12, m13, m8           ; imax(0, pri_strength - (abs(dt) >> shift))
+    psubusb      m13, m13, m9           ; imax(0, pri_strength - (abs(db) >> shift))
+ pminub m6, m12
+ pminub m7, m13
+ vpdpbusd m0, m6, m10 ; sum top
+ vpdpbusd m1, m7, m11 ; sum bottom
+%endmacro
+ CDEF_FILTER_4x8_PRI
+ test t1d, t1d ; sec
+ jz .end_no_clip
+ call .sec
+.end_clip:
+ pminub m10, m4, m2
+ pminub m12, m6, m8
+ pminub m11, m5, m3
+ pminub m13, m7, m9
+ pmaxub m4, m2
+ pmaxub m6, m8
+ pmaxub m5, m3
+ pmaxub m7, m9
+ pminub m10, m12
+ pminub m11, m13
+ pmaxub m4, m6
+ pmaxub m5, m7
+ mov r2d, 0xAAAAAAAA
+ kmovd k1, r2d
+ kxnorb k2, k2, k2 ; hw lw
+ vpshrdd m12, m0, m1, 16 ; m1lw m0hw
+ vpshrdd m6, m10, m11, 16 ; m11lw m10hw
+ vpshrdd m8, m4, m5, 16 ; m5lw m4hw
+ vpblendmw m7{k1}, m10, m11 ; m11hw m10lw
+ vpblendmw m9{k1}, m4, m5 ; m5hw m4lw
+ vpblendmw m4{k1}, m0, m12 ; m1lw m0lw
+ vpblendmw m5{k1}, m12, m1 ; m1hw m0hw
+ vpshrdd m2, m3, 16
+ pminub m6, m7
+ pmaxub m8, m9
+ mova ym14, [base+end_perm]
+ vpcmpw k1, m4, m20, 1
+ vpshldw m2, m5, 8
+ pslldq m7, m6, 1
+ pslldq m9, m8, 1
+ psubw m5, m20, m4
+ paddusw m0, m4, m2 ; clip >0xff
+ pminub m6, m7
+ pmaxub m8, m9
+ psubusw m0{k1}, m2, m5 ; clip <0x00
+ pmaxub m0, m6
+ pminub m0, m8
+ vpermb m0, m14, m0
+ vpscatterdd [dstq+ym21]{k2}, ym0
+ RET
+.sec_only:
+ movifnidn t1d, secm
+ call .sec
+.end_no_clip:
+ mova ym4, [base+end_perm]
+ kxnorb k1, k1, k1
+ vpshldd m2, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ vpshldd m3, m1, 8
+ paddw m0, m2 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ paddw m1, m3
+ pslld m0, 16
+ vpshrdd m0, m1, 16
+ vpermb m0, m4, m0 ; output in bits 8-15 of each word
+ vpscatterdd [dstq+ym21]{k1}, ym0
+ RET
+.mask_edges_sec_only:
+ movifnidn t1d, secm
+ call .mask_edges_sec
+ jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+ mov t1d, r6d
+ or r6d, 8 ; top 4x4 has bottom
+ or t1d, 4 ; bottom 4x4 has top
+ vpbroadcastq m17, [base+edge_mask+r6*8]
+ vpbroadcastq m18, [base+edge_mask+t1*8]
+ test prid, prid
+ jz .mask_edges_sec_only
+ vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16}
+ vpshufbitqmb k1, m17, m6 ; index in-range
+ vpshufbitqmb k2, m18, m6
+ mova m4, m2
+ mova m5, m3
+ vpermb m4{k1}, m6, m14
+ vpermb m5{k2}, m6, m15
+ CDEF_FILTER_4x8_PRI
+ test t1d, t1d
+ jz .end_no_clip
+ call .mask_edges_sec
+ jmp .end_clip
+.mask_edges_sec:
+ vpaddd m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16}
+ vpaddd m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16}
+ vpshufbitqmb k1, m17, m10
+ vpshufbitqmb k2, m18, m10
+ vpshufbitqmb k3, m17, m11
+ vpshufbitqmb k4, m18, m11
+ mova m6, m2
+ mova m7, m3
+ mova m8, m2
+ mova m9, m3
+ vpermb m6{k1}, m10, m14
+ vpermb m7{k2}, m10, m15
+ vpermb m8{k3}, m11, m14
+ vpermb m9{k4}, m11, m15
+ jmp .sec_main
+ALIGN function_align
+.sec:
+ vpaddd m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+ vpaddd m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+ vpermb m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1
+ vpermb m7, m8, m15 ; pNb
+ vpermb m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3
+ vpermb m9, m9, m15 ; pNb
+.sec_main:
+ vpbroadcastb m18, t1d
+ lzcnt t1d, t1d
+ vpcmpub k1, m2, m6, 6
+ vpcmpub k2, m3, m7, 6
+ vpcmpub k3, m2, m8, 6
+ vpcmpub k4, m3, m9, 6
+ vpbroadcastq m17, [r3+t1*8]
+ psubb m10, m6, m2
+ psubb m11, m7, m3
+ psubb m12, m8, m2
+ psubb m13, m9, m3
+ vpsubb m10{k1}, m2, m6 ; abs(dt0)
+ vpsubb m11{k2}, m3, m7 ; abs(db0)
+ vpsubb m12{k3}, m2, m8 ; abs(dt1)
+ vpsubb m13{k4}, m3, m9 ; abs(db1)
+ vpbroadcastd m19, [base+sec_tap]
+ gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift
+ gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift
+ gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift
+ gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift
+    psubusb      m14, m18, m14          ; imax(0, sec_strength - (abs(dt0) >> shift))
+    psubusb      m15, m18, m15          ; imax(0, sec_strength - (abs(db0) >> shift))
+    psubusb      m16, m18, m16          ; imax(0, sec_strength - (abs(dt1) >> shift))
+    psubusb      m17, m18, m17          ; imax(0, sec_strength - (abs(db1) >> shift))
+ pminub m10, m14
+ pminub m11, m15
+ pminub m12, m16
+ pminub m13, m17
+ mova m14, m19
+ mova m15, m19
+ mova m16, m19
+ vpsubb m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0)
+ vpsubb m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0)
+ vpsubb m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1)
+ vpsubb m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1)
+ vpdpbusd m0, m10, m14
+ vpdpbusd m1, m11, m15
+ vpdpbusd m0, m12, m16
+ vpdpbusd m1, m13, m19
+ ret
+
+; lut tl lut tr
+; t0 t1 t2 t3 t4 t5 t6 t7 t4 t5 t6 t7 t8 t9 ta tb
+; T0 T1 T2 T3 T4 T5 T6 T7 T4 T5 T6 T7 T8 T9 Ta Tb
+; L0 L1 00 01 02 03 04 05 02 03 04 05 06 07 08 09
+; L2 L3 10 11 12 13 14 15 12 13 14 15 16 17 18 19
+; L4 L5 20 21 22 23 24 25 22 23 24 25 26 27 28 29
+; L6 L7 30 31 32 33 34 35 32 33 34 35 36 37 38 39
+; L8 L9 40 41 42 43 44 45 42 43 44 45 46 47 48 49
+; La Lb 50 51 52 53 54 55 52 53 54 55 56 57 58 59
+; lut bl lut br
+; L4 L5 20 21 22 23 24 25 22 23 24 25 26 27 28 29
+; L6 L7 30 31 32 33 34 35 32 33 34 35 36 37 38 39
+; L8 L9 40 41 42 43 44 45 42 43 44 45 46 47 48 49
+; La Lb 50 51 52 53 54 55 52 53 54 55 56 57 58 59
+; Lc Ld 60 61 62 63 64 65 62 63 64 65 66 67 68 69
+; Le Lf 70 71 72 73 74 75 72 73 74 75 76 77 78 79
+; b0 b1 b2 b3 b4 b5 b6 b7 b4 b5 b6 b7 b8 b9 ba bb
+; B0 B1 B2 B3 B4 B5 B6 B7 B4 B5 B6 B7 B8 B9 Ba Bb
+
+cglobal cdef_filter_8x8_8bpc, 5, 11, 32, 4*64, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r8-edge_mask
+ movu xm16, [dstq+strideq*0]
+ pinsrd xm16, [leftq+4*0], 3
+ mov r6d, edgem
+ vinserti128 ym16, [dstq+strideq*1], 1
+ lea r10, [dstq+strideq*4]
+ movu xm17, [dstq+strideq*2]
+ vinserti32x4 m16, [topq+strideq*0-2], 2
+ lea r9, [strideq*3]
+ pinsrd xm17, [leftq+4*1], 3
+ vinserti32x4 m16, [topq+strideq*1-2], 3 ; 0 1 t T
+ lea r8, [edge_mask]
+ vinserti128 ym17, [dstq+r9 ], 1
+ vpbroadcastd ym18, [leftq+4*2]
+ vpblendd ym17, ym18, 0x80
+ movu xm18, [r10 +strideq*2]
+ vinserti32x4 m17, [r10 +strideq*0], 2
+ pinsrd xm18, [leftq+4*3], 3
+ vinserti32x4 m17, [r10 +strideq*1], 3 ; 2 3 4 5
+ vinserti128 ym18, [r10 +r9 ], 1
+ test r6b, 0x08 ; avoid buffer overread
+ jz .main
+ vinserti32x4 m18, [botq+strideq*0-2], 2
+ vinserti32x4 m18, [botq+strideq*1-2], 3 ; 6 7 b B
+.main:
+ mova m0, [base+lut_perm_8x8a]
+ movu m1, [base+lut_perm_8x8b]
+ mova m30, [base+px_idx]
+ vpermb m16, m0, m16
+ movifnidn prid, prim
+ vpermb m17, m1, m17
+ mov t0d, dirm
+ vpermb m18, m0, m18
+ mov r3d, dampingm
+ vshufi32x4 m12, m16, m17, q2020 ; lut tl
+ vshufi32x4 m13, m16, m17, q3131 ; lut tr
+ vshufi32x4 m14, m17, m18, q0220 ; lut bl
+ vshufi32x4 m15, m17, m18, q1331 ; lut br
+ vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+ pxor m31, m31
+ lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
+ vpermb m4, m30, m12 ; pxtl
+ mova m1, m0
+ vpermb m5, m30, m13 ; pxtr
+ mova m2, m0
+ vpermb m6, m30, m14 ; pxbl
+ mova m3, m0
+ vpermb m7, m30, m15 ; pxbr
+ cmp r6b, 0x0f
+ jne .mask_edges ; mask edges only if required
+ test prid, prid
+ jz .sec_only
+ vpaddd m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+ vpermb m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1
+ vpermb m9, m11, m13 ; pNtr
+ vpermb m10, m11, m14 ; pNbl
+ vpermb m11, m11, m15 ; pNbr
+%macro CDEF_FILTER_8x8_PRI 0
+ vpcmpub k1, m4, m8, 6 ; pxtl > pNtl
+ vpcmpub k2, m5, m9, 6 ; pxtr > pNtr
+ vpcmpub k3, m6, m10, 6 ; pxbl > pNbl
+ vpcmpub k4, m7, m11, 6 ; pxbr > pNbr
+ psubb m16, m8, m4
+ psubb m17, m9, m5
+ psubb m18, m10, m6
+ psubb m19, m11, m7
+ lzcnt r6d, prid
+ vpsubb m16{k1}, m4, m8 ; abs(diff_tl)
+ vpsubb m17{k2}, m5, m9 ; abs(diff_tr)
+ vpsubb m18{k3}, m6, m10 ; abs(diff_bl)
+ vpsubb m19{k4}, m7, m11 ; abs(diff_br)
+ vpbroadcastq m28, [r3+r6*8]
+ vpbroadcastb m29, prid
+ and prid, 1
+ vpbroadcastd m27, [base+pri_tap+priq*4]
+ vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift
+ vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift
+ vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift
+    vgf2p8affineqb m23, m19, m28, 0     ; abs(dbr) >> shift
+ mova m24, m27
+ mova m25, m27
+ mova m26, m27
+ movifnidn t1d, secm
+ vpsubb m24{k1}, m31, m27 ; apply_sign(pri_tap_tl)
+ vpsubb m25{k2}, m31, m27 ; apply_sign(pri_tap_tr)
+    vpsubb        m26{k3}, m31, m27     ; apply_sign(pri_tap_bl)
+    vpsubb        m27{k4}, m31, m27     ; apply_sign(pri_tap_br)
+    psubusb       m20, m29, m20         ; imax(0, pri_strength - (abs(dtl) >> shift))
+    psubusb       m21, m29, m21         ; imax(0, pri_strength - (abs(dtr) >> shift))
+    psubusb       m22, m29, m22         ; imax(0, pri_strength - (abs(dbl) >> shift))
+    psubusb       m23, m29, m23         ; imax(0, pri_strength - (abs(dbr) >> shift))
+ pminub m16, m20
+ pminub m17, m21
+ pminub m18, m22
+ pminub m19, m23
+ vpdpbusd m0, m16, m24 ; sum tl
+ vpdpbusd m1, m17, m25 ; sum tr
+ vpdpbusd m2, m18, m26 ; sum bl
+ vpdpbusd m3, m19, m27 ; sum br
+%endmacro
+ CDEF_FILTER_8x8_PRI
+ test t1d, t1d ; sec
+ jz .end_no_clip
+ call .sec
+.end_clip:
+ pminub m20, m8, m4
+ pminub m24, m12, m16
+ pminub m21, m9, m5
+ pminub m25, m13, m17
+ pminub m22, m10, m6
+ pminub m26, m14, m18
+ pminub m23, m11, m7
+ pminub m27, m15, m19
+ pmaxub m8, m4
+ pmaxub m12, m16
+ pmaxub m9, m5
+ pmaxub m13, m17
+ pmaxub m10, m6
+ pmaxub m14, m18
+ pmaxub m11, m7
+ pmaxub m15, m19
+ pminub m20, m24
+ pminub m21, m25
+ pminub m22, m26
+ pminub m23, m27
+ pmaxub m8, m12
+ pmaxub m9, m13
+ pmaxub m10, m14
+ pmaxub m11, m15
+ mov r2d, 0xAAAAAAAA
+ kmovd k1, r2d
+ vpshrdd m24, m0, m1, 16
+ vpshrdd m25, m2, m3, 16
+ vpshrdd m12, m20, m21, 16
+ vpshrdd m14, m22, m23, 16
+ vpshrdd m16, m8, m9, 16
+ vpshrdd m18, m10, m11, 16
+ vpblendmw m13{k1}, m20, m21
+ vpblendmw m15{k1}, m22, m23
+ vpblendmw m17{k1}, m8, m9
+ vpblendmw m19{k1}, m10, m11
+ vpblendmw m20{k1}, m0, m24
+ vpblendmw m21{k1}, m24, m1
+ vpblendmw m22{k1}, m2, m25
+ vpblendmw m23{k1}, m25, m3
+ vpshrdd m4, m5, 16
+ vpshrdd m6, m7, 16
+ pminub m12, m13
+ pminub m14, m15
+ pmaxub m16, m17
+ pmaxub m18, m19
+ mova m8, [base+end_perm_clip]
+ vpcmpw k2, m20, m31, 1
+ vpcmpw k3, m22, m31, 1
+ vpshldw m4, m21, 8
+ vpshldw m6, m23, 8
+ kunpckdq k1, k1, k1
+ kxnorb k4, k4, k4
+ vpshrdw m11, m12, m14, 8
+ vpshrdw m15, m16, m18, 8
+ vpblendmb m13{k1}, m12, m14
+ vpblendmb m17{k1}, m16, m18
+ psubw m21, m31, m20
+ psubw m23, m31, m22
+ paddusw m0, m20, m4 ; clip >0xff
+ paddusw m1, m22, m6
+ pminub m11, m13
+ pmaxub m15, m17
+ psubusw m0{k2}, m4, m21 ; clip <0x00
+ psubusw m1{k3}, m6, m23
+ psrlw m0, 8
+ vmovdqu8 m0{k1}, m1
+ pmaxub m0, m11
+ pminub m0, m15
+ vpermb m0, m8, m0
+ vextracti32x4 xm1, m0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*2], xm1
+ movq [r10 +strideq*0], xm2
+ movq [r10 +strideq*2], xm3
+ movhps [dstq+strideq*1], xm0
+ movhps [dstq+r9 ], xm1
+ movhps [r10 +strideq*1], xm2
+ movhps [r10 +r9 ], xm3
+ RET
+.sec_only:
+ movifnidn t1d, secm
+ call .sec
+.end_no_clip:
+ mova xm8, [base+end_perm]
+ kxnorb k1, k1, k1
+ vpshldd m4, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ vpshldd m5, m1, 8
+ vpshldd m6, m2, 8
+ vpshldd m7, m3, 8
+ paddw m0, m4 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ vpermb m0, m8, m0
+ vpermb m1, m8, m1
+ vpermb m2, m8, m2
+ vpermb m3, m8, m3
+ punpckldq m4, m0, m1
+ punpckhdq m0, m1
+ punpckldq m5, m2, m3
+ punpckhdq m2, m3
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*2], xm0
+ movq [r10 +strideq*0], xm5
+ movq [r10 +strideq*2], xm2
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+r9 ], xm0
+ movhps [r10 +strideq*1], xm5
+ movhps [r10 +r9 ], xm2
+ RET
+.mask_edges_sec_only:
+ movifnidn t1d, secm
+ call .mask_edges_sec
+ jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+ mov t0d, r6d
+ mov t1d, r6d
+ or t0d, 0xA ; top-left 4x4 has bottom and right
+ or t1d, 0x9 ; top-right 4x4 has bottom and left
+ vpbroadcastq m26, [base+edge_mask+t0*8]
+ vpbroadcastq m27, [base+edge_mask+t1*8]
+ mov t1d, r6d
+ or r6d, 0x6 ; bottom-left 4x4 has top and right
+ or t1d, 0x5 ; bottom-right 4x4 has top and left
+ vpbroadcastq m28, [base+edge_mask+r6*8]
+ vpbroadcastq m29, [base+edge_mask+t1*8]
+ mov t0d, dirm
+ test prid, prid
+ jz .mask_edges_sec_only
+ vpaddd m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16}
+ vpshufbitqmb k1, m26, m20 ; index in-range
+ vpshufbitqmb k2, m27, m20
+ vpshufbitqmb k3, m28, m20
+ vpshufbitqmb k4, m29, m20
+ mova m8, m4
+ mova m9, m5
+ mova m10, m6
+ mova m11, m7
+ vpermb m8{k1}, m20, m12
+ vpermb m9{k2}, m20, m13
+ vpermb m10{k3}, m20, m14
+ vpermb m11{k4}, m20, m15
+ mova [rsp+0x00], m26
+ mova [rsp+0x40], m27
+ mova [rsp+0x80], m28
+ mova [rsp+0xC0], m29
+ CDEF_FILTER_8x8_PRI
+ test t1d, t1d
+ jz .end_no_clip
+ mova m26, [rsp+0x00]
+ mova m27, [rsp+0x40]
+ mova m28, [rsp+0x80]
+ mova m29, [rsp+0xC0]
+ call .mask_edges_sec
+ jmp .end_clip
+.mask_edges_sec:
+ vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16}
+ vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16}
+ vpshufbitqmb k1, m26, m20
+ vpshufbitqmb k2, m27, m20
+ vpshufbitqmb k3, m28, m20
+ vpshufbitqmb k4, m29, m20
+ mova m16, m4
+ mova m17, m5
+ mova m18, m6
+ mova m19, m7
+ vpermb m16{k1}, m20, m12
+ vpermb m17{k2}, m20, m13
+ vpermb m18{k3}, m20, m14
+ vpermb m19{k4}, m20, m15
+ vpshufbitqmb k1, m26, m21
+ vpshufbitqmb k2, m27, m21
+ vpshufbitqmb k3, m28, m21
+ vpshufbitqmb k4, m29, m21
+ vpermb m12, m21, m12
+ vpermb m13, m21, m13
+ vpermb m14, m21, m14
+ vpermb m15, m21, m15
+ vpblendmb m12{k1}, m4, m12
+ vpblendmb m13{k2}, m5, m13
+ vpblendmb m14{k3}, m6, m14
+ vpblendmb m15{k4}, m7, m15
+ jmp .sec_main
+ALIGN function_align
+.sec:
+ vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+ vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+ vpermb m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1
+ vpermb m17, m20, m13 ; pNtr
+ vpermb m18, m20, m14 ; pNbl
+ vpermb m19, m20, m15 ; pNbr
+ vpermb m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3
+ vpermb m13, m21, m13 ; pNtr
+ vpermb m14, m21, m14 ; pNbl
+ vpermb m15, m21, m15 ; pNbr
+.sec_main:
+%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants
+ vpcmpub k1, m4, %1, 6
+ vpcmpub k2, m5, %2, 6
+ vpcmpub k3, m6, %3, 6
+ vpcmpub k4, m7, %4, 6
+ psubb m20, %1, m4
+ psubb m21, %2, m5
+ psubb m22, %3, m6
+ psubb m23, %4, m7
+%if %5
+ vpbroadcastb m28, t1d
+ lzcnt t1d, t1d
+ vpbroadcastq m29, [r3+t1*8]
+%endif
+ vpsubb m20{k1}, m4, %1
+ vpsubb m21{k2}, m5, %2
+ vpsubb m22{k3}, m6, %3
+ vpsubb m23{k4}, m7, %4
+ gf2p8affineqb m24, m20, m29, 0
+ gf2p8affineqb m25, m21, m29, 0
+ gf2p8affineqb m26, m22, m29, 0
+ gf2p8affineqb m27, m23, m29, 0
+%if %5
+ vpbroadcastd m30, [base+sec_tap]
+%endif
+ psubusb m24, m28, m24
+ psubusb m25, m28, m25
+ psubusb m26, m28, m26
+ psubusb m27, m28, m27
+ pminub m20, m24
+ pminub m21, m25
+ pminub m22, m26
+ pminub m23, m27
+ mova m24, m30
+ mova m25, m30
+ mova m26, m30
+ mova m27, m30
+ vpsubb m24{k1}, m31, m30
+ vpsubb m25{k2}, m31, m30
+ vpsubb m26{k3}, m31, m30
+ vpsubb m27{k4}, m31, m30
+ vpdpbusd m0, m20, m24
+ vpdpbusd m1, m21, m25
+ vpdpbusd m2, m22, m26
+ vpdpbusd m3, m23, m27
+%endmacro
+ CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1
+ CDEF_FILTER_8x8_SEC m12, m13, m14, m15
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/cdef_sse.asm b/third_party/dav1d/src/x86/cdef_sse.asm
new file mode 100644
index 0000000000..1b353121f4
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef_sse.asm
@@ -0,0 +1,1357 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2019, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+%macro DUP8 1-*
+ %rep %0
+ times 8 db %1
+ %rotate 1
+ %endrep
+%endmacro
+
+div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105
+ dd 420, 210, 140, 105, 105, 105, 105, 105
+div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210
+ dw 168, 168, 140, 140, 120, 120, 105, 105
+ dw 420, 420, 210, 210, 140, 140, 105, 105
+ dw 105, 105, 105, 105, 105, 105, 105, 105
+const shufw_6543210x, \
+ db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
+shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pw_8: times 8 dw 8
+pw_128: times 8 dw 128
+pw_256: times 8 dw 256
+pw_2048: times 8 dw 2048
+pw_0x7FFF: times 8 dw 0x7FFF
+pw_0x8000: times 8 dw 0x8000
+tap_table: ; masks for 8-bit shift emulation
+ DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80
+ ; weights
+ DUP8 4, 2, 3, 3, 2, 1
+ ; taps indices
+ db -1 * 16 + 1, -2 * 16 + 2
+ db 0 * 16 + 1, -1 * 16 + 2
+ db 0 * 16 + 1, 0 * 16 + 2
+ db 0 * 16 + 1, 1 * 16 + 2
+ db 1 * 16 + 1, 2 * 16 + 2
+ db 1 * 16 + 0, 2 * 16 + 1
+ db 1 * 16 + 0, 2 * 16 + 0
+ db 1 * 16 + 0, 2 * 16 - 1
+ ; the last 6 are repeats of the first 6 so we don't need to & 7
+ db -1 * 16 + 1, -2 * 16 + 2
+ db 0 * 16 + 1, -1 * 16 + 2
+ db 0 * 16 + 1, 0 * 16 + 2
+ db 0 * 16 + 1, 1 * 16 + 2
+ db 1 * 16 + 1, 2 * 16 + 2
+ db 1 * 16 + 0, 2 * 16 + 1
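+        ; each tap index byte above is dy*16 + dx: the temporary pixel buffer
+        ; has a 32-byte (16-word) row stride, so stk + off*2 addresses the
+        ; (dy, dx) neighbour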
+
+SECTION .text
+
+%macro movif32 2
+ %if ARCH_X86_32
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro PMOVZXBW 2-3 0 ; %3 = half
+ %if cpuflag(sse4) && %3 == 0
+ pmovzxbw %1, %2
+ %else
+ %if %3 == 1
+ movd %1, %2
+ %else
+ movq %1, %2
+ %endif
+ punpcklbw %1, m7
+ %endif
+%endmacro
+
+%macro PSHUFB_0 2
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ punpcklbw %1, %1
+ pshuflw %1, %1, q0000
+ punpcklqdq %1, %1
+ %endif
+%endmacro
+
+%macro MOVDDUP 2
+%if cpuflag(ssse3)
+ movddup %1, %2
+%else
+ movq %1, %2
+ punpcklqdq %1, %1
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
+ ; load p0/p1
+ movsx offq, byte [dirq+kq+%1+14*8] ; off1
+ %if %6 == 4
+ movq m5, [stkq+offq*2+32*0] ; p0
+ movhps m5, [stkq+offq*2+32*1]
+ %else
+ movu m5, [stkq+offq*2+32*0] ; p0
+ %endif
+ neg offq ; -off1
+ %if %6 == 4
+ movq m6, [stkq+offq*2+32*0] ; p1
+ movhps m6, [stkq+offq*2+32*1]
+ %else
+ movu m6, [stkq+offq*2+32*0] ; p1
+ %endif
+ %if %7
+ %if cpuflag(sse4)
+        ; out of bounds values are set to a value that is both a large unsigned
+ ; value and a negative signed value.
+ ; use signed max and unsigned min to remove them
+ pmaxsw m7, m5
+ pminuw m8, m5
+ pmaxsw m7, m6
+ pminuw m8, m6
+ %else
+ pcmpeqw m3, m14, m5
+ pminsw m8, m5 ; min after p0
+ pandn m3, m5
+ pmaxsw m7, m3 ; max after p0
+ pcmpeqw m3, m14, m6
+ pminsw m8, m6 ; min after p1
+ pandn m3, m6
+ pmaxsw m7, m3 ; max after p1
+ %endif
+ %endif
+
+ ; accumulate sum[m13] over p0/p1
+ psubw m5, m4 ; diff_p0(p0 - px)
+ psubw m6, m4 ; diff_p1(p1 - px)
+ packsswb m5, m6 ; convert pixel diff to 8-bit
+ %if cpuflag(ssse3)
+ pshufb m5, m13 ; group diffs p0 and p1 into pairs
+ pabsb m6, m5
+ psignb m3, %5, m5
+ %else
+ movlhps m6, m5
+ punpckhbw m6, m5
+ pxor m5, m5
+ pcmpgtb m5, m6
+ paddb m6, m5
+ pxor m6, m5
+ paddb m3, %5, m5
+ pxor m3, m5
+ %endif
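+    ; emulate a per-byte shift: mask off the low "shift" bits of every byte
+    ; (tap_table mask = 0xFF << shift) so the word-wide psrlw below cannot
+    ; leak bits from the neighbouring byte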
+ pand m9, %3, m6 ; emulate 8-bit shift
+ psrlw m9, %2
+ psubusb m5, %4, m9
+ pminub m5, m6 ; constrain(diff_p)
+ %if cpuflag(ssse3)
+ pmaddubsw m5, m3 ; constrain(diff_p) * taps
+ %else
+ psrlw m9, m5, 8
+ psraw m6, m3, 8
+ psllw m5, 8
+ psllw m3, 8
+ pmullw m9, m6
+ pmulhw m5, m3
+ paddw m5, m9
+ %endif
+ paddw m0, m5
+%endmacro
+
+%macro LOAD_BODY 3 ; dst, src, block_width
+ %if %3 == 4
+ PMOVZXBW m0, [%2+strideq*0]
+ PMOVZXBW m1, [%2+strideq*1]
+ PMOVZXBW m2, [%2+strideq*2]
+ PMOVZXBW m3, [%2+stride3q]
+ mova [%1+32*0], m0
+ mova [%1+32*1], m1
+ mova [%1+32*2], m2
+ mova [%1+32*3], m3
+ %else
+ movu m0, [%2+strideq*0]
+ movu m1, [%2+strideq*1]
+ movu m2, [%2+strideq*2]
+ movu m3, [%2+stride3q]
+ punpcklbw m4, m0, m7
+ punpckhbw m0, m7
+ mova [%1+32*0+ 0], m4
+ mova [%1+32*0+16], m0
+ punpcklbw m4, m1, m7
+ punpckhbw m1, m7
+ mova [%1+32*1+ 0], m4
+ mova [%1+32*1+16], m1
+ punpcklbw m4, m2, m7
+ punpckhbw m2, m7
+ mova [%1+32*2+ 0], m4
+ mova [%1+32*2+16], m2
+ punpcklbw m4, m3, m7
+ punpckhbw m3, m7
+ mova [%1+32*3+ 0], m4
+ mova [%1+32*3+16], m3
+ %endif
+%endmacro
+
+%macro CDEF_FILTER_END 2 ; w, minmax
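+    ; dst = px + ((sum - (sum < 0) + 8) >> 4), clamped to [min, max] if %2;
+    ; pmulhrsw with 2048 computes (x + 8) >> 4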
+ pxor m6, m6
+ pcmpgtw m6, m0
+ paddw m0, m6
+ %if cpuflag(ssse3)
+ pmulhrsw m0, m15
+ %else
+ paddw m0, m15
+ psraw m0, 4
+ %endif
+ paddw m4, m0
+ %if %2
+ pminsw m4, m7
+ pmaxsw m4, m8
+ %endif
+ packuswb m4, m4
+ %if %1 == 4
+ movd [dstq+strideq*0], m4
+ psrlq m4, 32
+ movd [dstq+strideq*1], m4
+ add stkq, 32*2
+ lea dstq, [dstq+strideq*2]
+ %else
+ movq [dstq], m4
+ add stkq, 32
+ add dstq, strideq
+ %endif
+%endmacro
+
+%macro CDEF_FILTER 2 ; w, h
+ %if ARCH_X86_64
+cglobal cdef_filter_%1x%2_8bpc, 5, 9, 16, 3 * 16 + (%2+4)*32, \
+ dst, stride, left, top, bot, pri, dst4, edge, \
+ stride3
+ %define px rsp+3*16+2*32
+ %define base 0
+ %else
+cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
+ dst, stride, left, edge, stride3
+ %define topq r2
+ %define botq r2
+ %define dst4q r2
+ LEA r5, tap_table
+ %define px esp+7*16+2*32
+ %define base r5-tap_table
+ %endif
+ mov edged, r9m
+ %if cpuflag(sse4)
+ %define OUT_OF_BOUNDS_MEM [base+pw_0x8000]
+ %else
+ %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF]
+ %endif
+ mova m6, OUT_OF_BOUNDS_MEM
+ pxor m7, m7
+
+ ; prepare pixel buffers - body/right
+ %if %2 == 8
+ lea dst4q, [dstq+strideq*4]
+ %endif
+ lea stride3q, [strideq*3]
+ test edgeb, 2 ; have_right
+ jz .no_right
+ LOAD_BODY px, dstq, %1
+ %if %2 == 8
+ LOAD_BODY px+4*32, dst4q, %1
+ %endif
+ jmp .body_done
+.no_right:
+ PMOVZXBW m0, [dstq+strideq*0], %1 == 4
+ PMOVZXBW m1, [dstq+strideq*1], %1 == 4
+ PMOVZXBW m2, [dstq+strideq*2], %1 == 4
+ PMOVZXBW m3, [dstq+stride3q ], %1 == 4
+ mova [px+32*0], m0
+ mova [px+32*1], m1
+ mova [px+32*2], m2
+ mova [px+32*3], m3
+ movd [px+32*0+%1*2], m6
+ movd [px+32*1+%1*2], m6
+ movd [px+32*2+%1*2], m6
+ movd [px+32*3+%1*2], m6
+ %if %2 == 8
+ PMOVZXBW m0, [dst4q+strideq*0], %1 == 4
+ PMOVZXBW m1, [dst4q+strideq*1], %1 == 4
+ PMOVZXBW m2, [dst4q+strideq*2], %1 == 4
+ PMOVZXBW m3, [dst4q+stride3q ], %1 == 4
+ mova [px+32*4], m0
+ mova [px+32*5], m1
+ mova [px+32*6], m2
+ mova [px+32*7], m3
+ movd [px+32*4+%1*2], m6
+ movd [px+32*5+%1*2], m6
+ movd [px+32*6+%1*2], m6
+ movd [px+32*7+%1*2], m6
+ %endif
+.body_done:
+
+ ; top
+ movifnidn topq, r3mp
+ test edgeb, 4 ; have_top
+ jz .no_top
+ test edgeb, 1 ; have_left
+ jz .top_no_left
+ test edgeb, 2 ; have_right
+ jz .top_no_right
+ %if %1 == 4
+ PMOVZXBW m0, [topq+strideq*0-2]
+ PMOVZXBW m1, [topq+strideq*1-2]
+ %else
+ movu m0, [topq+strideq*0-4]
+ movu m1, [topq+strideq*1-4]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ movu [px-32*2+8], m2
+ movu [px-32*1+8], m3
+ %endif
+ movu [px-32*2-%1], m0
+ movu [px-32*1-%1], m1
+ jmp .top_done
+.top_no_right:
+ %if %1 == 4
+ PMOVZXBW m0, [topq+strideq*0-%1]
+ PMOVZXBW m1, [topq+strideq*1-%1]
+ movu [px-32*2-8], m0
+ movu [px-32*1-8], m1
+ %else
+ movu m0, [topq+strideq*0-%1]
+ movu m1, [topq+strideq*1-%2]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ mova [px-32*2-16], m0
+ mova [px-32*2+ 0], m2
+ mova [px-32*1-16], m1
+ mova [px-32*1+ 0], m3
+ %endif
+ movd [px-32*2+%1*2], m6
+ movd [px-32*1+%1*2], m6
+ jmp .top_done
+.top_no_left:
+ test edgeb, 2 ; have_right
+ jz .top_no_left_right
+ %if %1 == 4
+ PMOVZXBW m0, [topq+strideq*0]
+ PMOVZXBW m1, [topq+strideq*1]
+ %else
+ movu m0, [topq+strideq*0]
+ movu m1, [topq+strideq*1]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ movd [px-32*2+16], m2
+ movd [px-32*1+16], m3
+ %endif
+ movd [px-32*2- 4], m6
+ movd [px-32*1- 4], m6
+ mova [px-32*2+ 0], m0
+ mova [px-32*1+ 0], m1
+ jmp .top_done
+.top_no_left_right:
+ PMOVZXBW m0, [topq+strideq*0], %1 == 4
+ PMOVZXBW m1, [topq+strideq*1], %1 == 4
+ movd [px-32*2-4], m6
+ movd [px-32*1-4], m6
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ movd [px-32*2+%1*2], m6
+ movd [px-32*1+%1*2], m6
+ jmp .top_done
+.no_top:
+ movu [px-32*2- 4], m6
+ movu [px-32*1- 4], m6
+ %if %1 == 8
+ movq [px-32*2+12], m6
+ movq [px-32*1+12], m6
+ %endif
+.top_done:
+
+ ; left
+ test edgeb, 1 ; have_left
+ jz .no_left
+ movifnidn leftq, leftmp
+ %if %2 == 4
+ movq m0, [leftq]
+ %else
+ movu m0, [leftq]
+ %endif
+ %if %2 == 4
+ punpcklbw m0, m7
+ %else
+ punpckhbw m1, m0, m7
+ punpcklbw m0, m7
+ movhlps m3, m1
+ movd [px+32*4-4], m1
+ movd [px+32*6-4], m3
+ psrlq m1, 32
+ psrlq m3, 32
+ movd [px+32*5-4], m1
+ movd [px+32*7-4], m3
+ %endif
+ movhlps m2, m0
+ movd [px+32*0-4], m0
+ movd [px+32*2-4], m2
+ psrlq m0, 32
+ psrlq m2, 32
+ movd [px+32*1-4], m0
+ movd [px+32*3-4], m2
+ jmp .left_done
+.no_left:
+ movd [px+32*0-4], m6
+ movd [px+32*1-4], m6
+ movd [px+32*2-4], m6
+ movd [px+32*3-4], m6
+ %if %2 == 8
+ movd [px+32*4-4], m6
+ movd [px+32*5-4], m6
+ movd [px+32*6-4], m6
+ movd [px+32*7-4], m6
+ %endif
+.left_done:
+
+ ; bottom
+ movifnidn botq, r4mp
+ test edgeb, 8 ; have_bottom
+ jz .no_bottom
+ test edgeb, 1 ; have_left
+ jz .bottom_no_left
+ test edgeb, 2 ; have_right
+ jz .bottom_no_right
+ %if %1 == 4
+ PMOVZXBW m0, [botq+strideq*0-(%1/2)]
+ PMOVZXBW m1, [botq+strideq*1-(%1/2)]
+ %else
+ movu m0, [botq+strideq*0-4]
+ movu m1, [botq+strideq*1-4]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ movu [px+32*(%2+0)+8], m2
+ movu [px+32*(%2+1)+8], m3
+ %endif
+ movu [px+32*(%2+0)-%1], m0
+ movu [px+32*(%2+1)-%1], m1
+ jmp .bottom_done
+.bottom_no_right:
+ %if %1 == 4
+ PMOVZXBW m0, [botq+strideq*0-4]
+ PMOVZXBW m1, [botq+strideq*1-4]
+ movu [px+32*(%2+0)-8], m0
+ movu [px+32*(%2+1)-8], m1
+ %else
+ movu m0, [botq+strideq*0-8]
+ movu m1, [botq+strideq*1-8]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ mova [px+32*(%2+0)-16], m0
+ mova [px+32*(%2+0)+ 0], m2
+ mova [px+32*(%2+1)-16], m1
+ mova [px+32*(%2+1)+ 0], m3
+ movd [px+32*(%2-1)+16], m6 ; overwritten by first mova
+ %endif
+ movd [px+32*(%2+0)+%1*2], m6
+ movd [px+32*(%2+1)+%1*2], m6
+ jmp .bottom_done
+.bottom_no_left:
+ test edgeb, 2 ; have_right
+ jz .bottom_no_left_right
+ %if %1 == 4
+ PMOVZXBW m0, [botq+strideq*0]
+ PMOVZXBW m1, [botq+strideq*1]
+ %else
+ movu m0, [botq+strideq*0]
+ movu m1, [botq+strideq*1]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ mova [px+32*(%2+0)+16], m2
+ mova [px+32*(%2+1)+16], m3
+ %endif
+ mova [px+32*(%2+0)+ 0], m0
+ mova [px+32*(%2+1)+ 0], m1
+ movd [px+32*(%2+0)- 4], m6
+ movd [px+32*(%2+1)- 4], m6
+ jmp .bottom_done
+.bottom_no_left_right:
+ PMOVZXBW m0, [botq+strideq*0], %1 == 4
+ PMOVZXBW m1, [botq+strideq*1], %1 == 4
+ mova [px+32*(%2+0)+ 0], m0
+ mova [px+32*(%2+1)+ 0], m1
+ movd [px+32*(%2+0)+%1*2], m6
+ movd [px+32*(%2+1)+%1*2], m6
+ movd [px+32*(%2+0)- 4], m6
+ movd [px+32*(%2+1)- 4], m6
+ jmp .bottom_done
+.no_bottom:
+ movu [px+32*(%2+0)- 4], m6
+ movu [px+32*(%2+1)- 4], m6
+ %if %1 == 8
+ movq [px+32*(%2+0)+12], m6
+ movq [px+32*(%2+1)+12], m6
+ %endif
+.bottom_done:
+
+ ; actual filter
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, _, pridmp, damping, pri, sec
+ mova m13, [shufb_lohi]
+ %if cpuflag(ssse3)
+ mova m15, [pw_2048]
+ %else
+ mova m15, [pw_8]
+ %endif
+ mova m14, m6
+ %else
+ DEFINE_ARGS dst, pridmp, sec, damping, pri, tap
+ %xdefine m8 m1
+ %xdefine m9 m2
+ %xdefine m10 m0
+ %xdefine m13 [base+shufb_lohi]
+ %xdefine m14 OUT_OF_BOUNDS_MEM
+ %if cpuflag(ssse3)
+ %xdefine m15 [base+pw_2048]
+ %else
+ %xdefine m15 [base+pw_8]
+ %endif
+ %endif
+ movifnidn prid, r5m
+ movifnidn secd, r6m
+ mov dampingd, r8m
+ movif32 [esp+0x3C], r1d
+ test prid, prid
+ jz .sec_only
+ movd m1, r5m
+ bsr pridmpd, prid
+ test secd, secd
+ jz .pri_only
+ movd m10, r6m
+ tzcnt secd, secd
+ and prid, 1
+ sub pridmpd, dampingd
+ sub secd, dampingd
+ xor dampingd, dampingd
+ add prid, prid
+ neg pridmpd
+ cmovs pridmpd, dampingd
+ neg secd
+ PSHUFB_0 m1, m7
+ PSHUFB_0 m10, m7
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, _, pridmp, tap, pri, sec
+ lea tapq, [tap_table]
+ MOVDDUP m11, [tapq+pridmpq*8] ; pri_shift_mask
+ MOVDDUP m12, [tapq+secq*8] ; sec_shift_mask
+ mov [rsp+0x00], pridmpq ; pri_shift
+ mov [rsp+0x10], secq ; sec_shift
+ DEFINE_ARGS dst, stride, h, dir, tap, pri, stk, k, off
+ %else
+ MOVDDUP m2, [tapq+pridmpq*8]
+ MOVDDUP m3, [tapq+secq*8]
+ mov [esp+0x04], dampingd ; zero upper 32 bits of psrlw
+ mov [esp+0x34], dampingd ; source operand in ACCUMULATE_TAP
+ mov [esp+0x00], pridmpd
+ mov [esp+0x30], secd
+ DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
+ %define offq dstq
+ %define kd strided
+ %define kq strideq
+ mova [esp+0x10], m2
+ mova [esp+0x40], m3
+ mova [esp+0x20], m1
+ mova [esp+0x50], m10
+ %endif
+ mov dird, r7m
+ lea stkq, [px]
+ lea priq, [tapq+8*8+priq*8] ; pri_taps
+ mov hd, %1*%2/8
+ lea dirq, [tapq+dirq*2]
+.v_loop:
+ movif32 [esp+0x38], dstd
+ mov kd, 1
+ %if %1 == 4
+ movq m4, [stkq+32*0]
+ movhps m4, [stkq+32*1]
+ %else
+ mova m4, [stkq+32*0] ; px
+ %endif
+ pxor m0, m0 ; sum
+ mova m7, m4 ; max
+ mova m8, m4 ; min
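+ ; each k iteration adds the primary taps along dir and the secondary taps
+ ; along dir+2/dir-2 into the sum in m0; m7/m8 track the max/min neighbour
+ ; values used for the final clamp in CDEF_FILTER_END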
+.k_loop:
+ MOVDDUP m2, [priq+kq*8]
+ %if ARCH_X86_64
+ ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1
+ ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1
+ %else
+ ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
+ %endif
+ dec kd
+ jge .k_loop
+ movif32 dstq, [esp+0x38]
+ movif32 strideq, [esp+0x3C]
+ CDEF_FILTER_END %1, 1
+ dec hd
+ jg .v_loop
+ RET
+
+.pri_only:
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, zero, pridmp, damping, pri, tap
+ lea tapq, [tap_table]
+ %else
+ DEFINE_ARGS dst, pridmp, zero, damping, pri, tap
+ %endif
+ and prid, 1
+ xor zerod, zerod
+ sub dampingd, pridmpd
+ cmovs dampingd, zerod
+ add prid, prid
+ PSHUFB_0 m1, m7
+ MOVDDUP m7, [tapq+dampingq*8]
+ mov [rsp+0x00], dampingq
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, h, dir, stk, pri, tap, k, off
+ %else
+ mov [rsp+0x04], zerod
+ DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
+ %endif
+ mov dird, r7m
+ lea stkq, [px]
+ lea priq, [tapq+8*8+priq*8]
+ mov hd, %1*%2/8
+ lea dirq, [tapq+dirq*2]
+.pri_v_loop:
+ movif32 [esp+0x38], dstd
+ mov kd, 1
+ %if %1 == 4
+ movq m4, [stkq+32*0]
+ movhps m4, [stkq+32*1]
+ %else
+ mova m4, [stkq+32*0]
+ %endif
+ pxor m0, m0
+.pri_k_loop:
+ MOVDDUP m2, [priq+kq*8]
+ ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0
+ dec kd
+ jge .pri_k_loop
+ movif32 dstq, [esp+0x38]
+ movif32 strideq, [esp+0x3C]
+ CDEF_FILTER_END %1, 0
+ dec hd
+ jg .pri_v_loop
+ RET
+
+.sec_only:
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, zero, dir, damping, tap, sec
+%else
+ DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero
+%endif
+ movd m1, r6m
+ tzcnt secd, secd
+ mov dird, r7m
+ xor zerod, zerod
+ sub dampingd, secd
+ cmovs dampingd, zerod
+ PSHUFB_0 m1, m7
+ %if ARCH_X86_64
+ lea tapq, [tap_table]
+ %else
+ mov [rsp+0x04], zerod
+ %endif
+ mov [rsp+0x00], dampingq
+ MOVDDUP m7, [tapq+dampingq*8]
+ lea dirq, [tapq+dirq*2]
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, h, dir, stk, tap, off, k
+ %else
+ DEFINE_ARGS dst, stride, off, stk, dir, tap, h
+ %endif
+ lea stkq, [px]
+ mov hd, %1*%2/8
+.sec_v_loop:
+ mov kd, 1
+ %if %1 == 4
+ movq m4, [stkq+32*0]
+ movhps m4, [stkq+32*1]
+ %else
+ mova m4, [stkq+32*0]
+ %endif
+ pxor m0, m0
+.sec_k_loop:
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0
+ %if ARCH_X86_32
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ %endif
+ ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0
+ dec kd
+ jge .sec_k_loop
+ movif32 strideq, [esp+0x3C]
+ CDEF_FILTER_END %1, 0
+ dec hd
+ jg .sec_v_loop
+ RET
+%endmacro
+
+%macro MULLD 2
+ %if cpuflag(sse4)
+ pmulld %1, %2
+ %else
+ %if ARCH_X86_32
+ %define m15 m1
+ %endif
+ pmulhuw m15, %1, %2
+ pmullw %1, %2
+ pslld m15, 16
+ paddd %1, m15
+ %endif
+%endmacro
+
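+; cdef_dir: compute the 8 directional costs (squared partial sums weighted
+; by div_table) for an 8x8 block, return the best direction and write a
+; variance estimate. Roughly equivalent C, as a sketch:
+;     best = 0;
+;     for (i = 1; i < 8; i++)
+;         if (cost[i] > cost[best]) best = i;
+;     *var = (cost[best] - cost[best ^ 4]) >> 10;
+;     return best;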
+%macro CDEF_DIR 0
+ %if ARCH_X86_64
+cglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var
+ lea r6, [strideq*3]
+ movq m1, [srcq+strideq*0]
+ movhps m1, [srcq+strideq*1]
+ movq m3, [srcq+strideq*2]
+ movhps m3, [srcq+r6 ]
+ lea srcq, [srcq+strideq*4]
+ movq m5, [srcq+strideq*0]
+ movhps m5, [srcq+strideq*1]
+ movq m7, [srcq+strideq*2]
+ movhps m7, [srcq+r6 ]
+
+ pxor m8, m8
+ psadbw m9, m1, m8
+ psadbw m2, m3, m8
+ psadbw m4, m5, m8
+ psadbw m6, m7, m8
+ packssdw m9, m2
+ packssdw m4, m6
+ packssdw m9, m4
+
+ punpcklbw m0, m1, m8
+ punpckhbw m1, m8
+ punpcklbw m2, m3, m8
+ punpckhbw m3, m8
+ punpcklbw m4, m5, m8
+ punpckhbw m5, m8
+ punpcklbw m6, m7, m8
+ punpckhbw m7, m8
+cglobal_label .main
+ mova m8, [pw_128]
+ psubw m0, m8
+ psubw m1, m8
+ psubw m2, m8
+ psubw m3, m8
+ psubw m4, m8
+ psubw m5, m8
+ psubw m6, m8
+ psubw m7, m8
+ psllw m8, 3
+ psubw m9, m8 ; partial_sum_hv[0]
+
+ paddw m8, m0, m1
+ paddw m10, m2, m3
+ paddw m8, m4
+ paddw m10, m5
+ paddw m8, m6
+ paddw m10, m7
+ paddw m8, m10 ; partial_sum_hv[1]
+
+ pmaddwd m8, m8
+ pmaddwd m9, m9
+ phaddd m9, m8
+ SWAP m8, m9
+ MULLD m8, [div_table%+SUFFIX+48]
+
+ pslldq m9, m1, 2
+ psrldq m10, m1, 14
+ pslldq m11, m2, 4
+ psrldq m12, m2, 12
+ pslldq m13, m3, 6
+ psrldq m14, m3, 10
+ paddw m9, m0
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14 ; partial_sum_diag[0] top/right half
+ paddw m9, m11 ; partial_sum_diag[0] top/left half
+ pslldq m11, m4, 8
+ psrldq m12, m4, 8
+ pslldq m13, m5, 10
+ psrldq m14, m5, 6
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14
+ pslldq m11, m6, 12
+ psrldq m12, m6, 4
+ pslldq m13, m7, 14
+ psrldq m14, m7, 2
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13 ; partial_sum_diag[0][0-7]
+ paddw m10, m14 ; partial_sum_diag[0][8-14,zero]
+ pshufb m10, [shufw_6543210x]
+ punpckhwd m11, m9, m10
+ punpcklwd m9, m10
+ pmaddwd m11, m11
+ pmaddwd m9, m9
+ MULLD m11, [div_table%+SUFFIX+16]
+ MULLD m9, [div_table%+SUFFIX+0]
+ paddd m9, m11 ; cost[0a-d]
+
+ pslldq m10, m0, 14
+ psrldq m11, m0, 2
+ pslldq m12, m1, 12
+ psrldq m13, m1, 4
+ pslldq m14, m2, 10
+ psrldq m15, m2, 6
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14
+ paddw m11, m15
+ pslldq m12, m3, 8
+ psrldq m13, m3, 8
+ pslldq m14, m4, 6
+ psrldq m15, m4, 10
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14
+ paddw m11, m15
+ pslldq m12, m5, 4
+ psrldq m13, m5, 12
+ pslldq m14, m6, 2
+ psrldq m15, m6, 14
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14
+ paddw m11, m15 ; partial_sum_diag[1][8-14,zero]
+ paddw m10, m7 ; partial_sum_diag[1][0-7]
+ pshufb m11, [shufw_6543210x]
+ punpckhwd m12, m10, m11
+ punpcklwd m10, m11
+ pmaddwd m12, m12
+ pmaddwd m10, m10
+ MULLD m12, [div_table%+SUFFIX+16]
+ MULLD m10, [div_table%+SUFFIX+0]
+ paddd m10, m12 ; cost[4a-d]
+ phaddd m9, m10 ; cost[0a/b,4a/b]
+
+ paddw m10, m0, m1
+ paddw m11, m2, m3
+ paddw m12, m4, m5
+ paddw m13, m6, m7
+ phaddw m0, m4
+ phaddw m1, m5
+ phaddw m2, m6
+ phaddw m3, m7
+
+ ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1)
+ pslldq m4, m11, 2
+ psrldq m5, m11, 14
+ pslldq m6, m12, 4
+ psrldq m7, m12, 12
+ pslldq m14, m13, 6
+ psrldq m15, m13, 10
+ paddw m4, m10
+ paddw m5, m7
+ paddw m4, m6
+ paddw m5, m15 ; partial_sum_alt[3] right
+ paddw m4, m14 ; partial_sum_alt[3] left
+ pshuflw m6, m5, q3012
+ punpckhwd m5, m4
+ punpcklwd m4, m6
+ pmaddwd m5, m5
+ pmaddwd m4, m4
+ MULLD m5, [div_table%+SUFFIX+48]
+ MULLD m4, [div_table%+SUFFIX+32]
+ paddd m4, m5 ; cost[7a-d]
+
+ pslldq m5, m10, 6
+ psrldq m6, m10, 10
+ pslldq m7, m11, 4
+ psrldq m10, m11, 12
+ pslldq m11, m12, 2
+ psrldq m12, 14
+ paddw m5, m7
+ paddw m6, m10
+ paddw m5, m11
+ paddw m6, m12
+ paddw m5, m13
+ pshuflw m7, m6, q3012
+ punpckhwd m6, m5
+ punpcklwd m5, m7
+ pmaddwd m6, m6
+ pmaddwd m5, m5
+ MULLD m6, [div_table%+SUFFIX+48]
+ MULLD m5, [div_table%+SUFFIX+32]
+ paddd m5, m6 ; cost[5a-d]
+
+ pslldq m6, m1, 2
+ psrldq m7, m1, 14
+ pslldq m10, m2, 4
+ psrldq m11, m2, 12
+ pslldq m12, m3, 6
+ psrldq m13, m3, 10
+ paddw m6, m0
+ paddw m7, m11
+ paddw m6, m10
+ paddw m7, m13 ; partial_sum_alt[3] right
+ paddw m6, m12 ; partial_sum_alt[3] left
+ pshuflw m10, m7, q3012
+ punpckhwd m7, m6
+ punpcklwd m6, m10
+ pmaddwd m7, m7
+ pmaddwd m6, m6
+ MULLD m7, [div_table%+SUFFIX+48]
+ MULLD m6, [div_table%+SUFFIX+32]
+ paddd m6, m7 ; cost[1a-d]
+
+ pshufd m0, m0, q1032
+ pshufd m1, m1, q1032
+ pshufd m2, m2, q1032
+ pshufd m3, m3, q1032
+
+ pslldq m10, m0, 6
+ psrldq m11, m0, 10
+ pslldq m12, m1, 4
+ psrldq m13, m1, 12
+ pslldq m14, m2, 2
+ psrldq m2, 14
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14
+ paddw m11, m2
+ paddw m10, m3
+ pshuflw m12, m11, q3012
+ punpckhwd m11, m10
+ punpcklwd m10, m12
+ pmaddwd m11, m11
+ pmaddwd m10, m10
+ MULLD m11, [div_table%+SUFFIX+48]
+ MULLD m10, [div_table%+SUFFIX+32]
+ paddd m10, m11 ; cost[3a-d]
+
+ phaddd m9, m8 ; cost[0,4,2,6]
+ phaddd m6, m10
+ phaddd m5, m4
+ phaddd m6, m5 ; cost[1,3,5,7]
+ pshufd m4, m9, q3120
+
+ ; now find the best cost
+ %if cpuflag(sse4)
+ pmaxsd m9, m6
+ pshufd m0, m9, q1032
+ pmaxsd m0, m9
+ pshufd m1, m0, q2301
+ pmaxsd m0, m1 ; best cost
+ %else
+ pcmpgtd m0, m9, m6
+ pand m9, m0
+ pandn m0, m6
+ por m9, m0
+ pshufd m1, m9, q1032
+ pcmpgtd m0, m9, m1
+ pand m9, m0
+ pandn m0, m1
+ por m9, m0
+ pshufd m1, m9, q2301
+ pcmpgtd m0, m9, m1
+ pand m9, m0
+ pandn m0, m1
+ por m0, m9
+ %endif
+
+ ; get direction and variance
+ punpckhdq m1, m4, m6
+ punpckldq m4, m6
+ psubd m2, m0, m1
+ psubd m3, m0, m4
+%if WIN64
+ WIN64_RESTORE_XMM
+ %define tmp rsp+stack_offset+8
+%else
+ %define tmp rsp-40
+%endif
+ mova [tmp+0x00], m2 ; emulate ymm in stack
+ mova [tmp+0x10], m3
+ pcmpeqd m1, m0 ; compute best cost mask
+ pcmpeqd m4, m0
+ packssdw m4, m1
+ pmovmskb eax, m4 ; get byte-idx from mask
+ tzcnt eax, eax
+ mov r1d, [tmp+rax*2] ; get idx^4 complement from emulated ymm
+ shr eax, 1 ; get direction by converting byte-idx to word-idx
+ shr r1d, 10
+ mov [varq], r1d
+ %else
+cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3
+%define base r2-shufw_6543210x
+ LEA r2, shufw_6543210x
+ pxor m0, m0
+ lea stride3q, [strideq*3]
+ movq m5, [srcq+strideq*0]
+ movhps m5, [srcq+strideq*1]
+ movq m7, [srcq+strideq*2]
+ movhps m7, [srcq+stride3q]
+ mova m1, [base+pw_128]
+ psadbw m2, m5, m0
+ psadbw m3, m7, m0
+ packssdw m2, m3
+ punpcklbw m4, m5, m0
+ punpckhbw m5, m0
+ punpcklbw m6, m7, m0
+ punpckhbw m7, m0
+ psubw m4, m1
+ psubw m5, m1
+ psubw m6, m1
+ psubw m7, m1
+
+ mova [esp+0x00], m4
+ mova [esp+0x10], m5
+ mova [esp+0x20], m6
+ mova [esp+0x50], m7
+
+ lea srcq, [srcq+strideq*4]
+ movq m5, [srcq+strideq*0]
+ movhps m5, [srcq+strideq*1]
+ movq m7, [srcq+strideq*2]
+ movhps m7, [srcq+stride3q]
+ psadbw m3, m5, m0
+ psadbw m0, m7
+ packssdw m3, m0
+ pxor m0, m0
+ punpcklbw m4, m5, m0
+ punpckhbw m5, m0
+ punpcklbw m6, m7, m0
+ punpckhbw m7, m0
+cglobal_label .main
+ psubw m4, m1
+ psubw m5, m1
+ psubw m6, m1
+ psubw m7, m1
+ packssdw m2, m3
+ psllw m1, 3
+ psubw m2, m1 ; partial_sum_hv[0]
+ pmaddwd m2, m2
+
+ mova m3, [esp+0x50]
+ mova m0, [esp+0x00]
+ paddw m0, [esp+0x10]
+ paddw m1, m3, [esp+0x20]
+ paddw m0, m4
+ paddw m1, m5
+ paddw m0, m6
+ paddw m1, m7
+ paddw m0, m1 ; partial_sum_hv[1]
+ pmaddwd m0, m0
+
+ phaddd m2, m0
+ MULLD m2, [base+div_table%+SUFFIX+48]
+ mova [esp+0x30], m2
+
+ mova m1, [esp+0x10]
+ pslldq m0, m1, 2
+ psrldq m1, 14
+ paddw m0, [esp+0x00]
+ pslldq m2, m3, 6
+ psrldq m3, 10
+ paddw m0, m2
+ paddw m1, m3
+ mova m3, [esp+0x20]
+ pslldq m2, m3, 4
+ psrldq m3, 12
+ paddw m0, m2 ; partial_sum_diag[0] top/left half
+ paddw m1, m3 ; partial_sum_diag[0] top/right half
+ pslldq m2, m4, 8
+ psrldq m3, m4, 8
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m5, 10
+ psrldq m3, m5, 6
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m6, 12
+ psrldq m3, m6, 4
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m7, 14
+ psrldq m3, m7, 2
+ paddw m0, m2 ; partial_sum_diag[0][0-7]
+ paddw m1, m3 ; partial_sum_diag[0][8-14,zero]
+ mova m3, [esp+0x50]
+ pshufb m1, [base+shufw_6543210x]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmaddwd m2, m2
+ pmaddwd m0, m0
+ MULLD m2, [base+div_table%+SUFFIX+16]
+ MULLD m0, [base+div_table%+SUFFIX+ 0]
+ paddd m0, m2 ; cost[0a-d]
+ mova [esp+0x40], m0
+
+ mova m1, [esp+0x00]
+ pslldq m0, m1, 14
+ psrldq m1, 2
+ paddw m0, m7
+ pslldq m2, m3, 8
+ psrldq m3, 8
+ paddw m0, m2
+ paddw m1, m3
+ mova m3, [esp+0x20]
+ pslldq m2, m3, 10
+ psrldq m3, 6
+ paddw m0, m2
+ paddw m1, m3
+ mova m3, [esp+0x10]
+ pslldq m2, m3, 12
+ psrldq m3, 4
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m4, 6
+ psrldq m3, m4, 10
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m5, 4
+ psrldq m3, m5, 12
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m6, 2
+ psrldq m3, m6, 14
+ paddw m0, m2 ; partial_sum_diag[1][0-7]
+ paddw m1, m3 ; partial_sum_diag[1][8-14,zero]
+ mova m3, [esp+0x50]
+ pshufb m1, [base+shufw_6543210x]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmaddwd m2, m2
+ pmaddwd m0, m0
+ MULLD m2, [base+div_table%+SUFFIX+16]
+ MULLD m0, [base+div_table%+SUFFIX+ 0]
+ paddd m0, m2 ; cost[4a-d]
+ phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b]
+ phaddd m1, [esp+0x30] ; cost[0,4,2,6]
+ mova [esp+0x30], m1
+
+ phaddw m0, [esp+0x00], m4
+ phaddw m1, [esp+0x10], m5
+ paddw m4, m5
+ mova m2, [esp+0x20]
+ paddw m5, m2, m3
+ phaddw m2, m6
+ paddw m6, m7
+ phaddw m3, m7
+ mova m7, [esp+0x00]
+ paddw m7, [esp+0x10]
+ mova [esp+0x00], m0
+ mova [esp+0x10], m1
+ mova [esp+0x20], m2
+
+ pslldq m1, m4, 4
+ pslldq m2, m6, 6
+ pslldq m0, m5, 2
+ paddw m1, m2
+ paddw m0, m7
+ psrldq m2, m5, 14
+ paddw m0, m1 ; partial_sum_alt[3] left
+ psrldq m1, m4, 12
+ paddw m1, m2
+ psrldq m2, m6, 10
+ paddw m1, m2 ; partial_sum_alt[3] right
+ pshuflw m1, m1, q3012
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmaddwd m2, m2
+ pmaddwd m0, m0
+ MULLD m2, [base+div_table%+SUFFIX+48]
+ MULLD m0, [base+div_table%+SUFFIX+32]
+ paddd m0, m2 ; cost[7a-d]
+ mova [esp+0x40], m0
+
+ pslldq m0, m7, 6
+ psrldq m7, 10
+ pslldq m1, m5, 4
+ psrldq m5, 12
+ pslldq m2, m4, 2
+ psrldq m4, 14
+ paddw m0, m6
+ paddw m7, m5
+ paddw m0, m1
+ paddw m7, m4
+ paddw m0, m2
+ pshuflw m2, m7, q3012
+ punpckhwd m7, m0
+ punpcklwd m0, m2
+ pmaddwd m7, m7
+ pmaddwd m0, m0
+ MULLD m7, [base+div_table%+SUFFIX+48]
+ MULLD m0, [base+div_table%+SUFFIX+32]
+ paddd m0, m7 ; cost[5a-d]
+ mova [esp+0x50], m0
+
+ mova m7, [esp+0x10]
+ mova m2, [esp+0x20]
+ pslldq m0, m7, 2
+ psrldq m7, 14
+ pslldq m4, m2, 4
+ psrldq m2, 12
+ pslldq m5, m3, 6
+ psrldq m6, m3, 10
+ paddw m0, [esp+0x00]
+ paddw m7, m2
+ paddw m4, m5
+ paddw m7, m6 ; partial_sum_alt[3] right
+ paddw m0, m4 ; partial_sum_alt[3] left
+ pshuflw m2, m7, q3012
+ punpckhwd m7, m0
+ punpcklwd m0, m2
+ pmaddwd m7, m7
+ pmaddwd m0, m0
+ MULLD m7, [base+div_table%+SUFFIX+48]
+ MULLD m0, [base+div_table%+SUFFIX+32]
+ paddd m0, m7 ; cost[1a-d]
+ SWAP m0, m4
+
+ pshufd m0, [esp+0x00], q1032
+ pshufd m1, [esp+0x10], q1032
+ pshufd m2, [esp+0x20], q1032
+ pshufd m3, m3, q1032
+ mova [esp+0x00], m4
+
+ pslldq m4, m0, 6
+ psrldq m0, 10
+ pslldq m5, m1, 4
+ psrldq m1, 12
+ pslldq m6, m2, 2
+ psrldq m2, 14
+ paddw m4, m3
+ paddw m0, m1
+ paddw m5, m6
+ paddw m0, m2
+ paddw m4, m5
+ pshuflw m2, m0, q3012
+ punpckhwd m0, m4
+ punpcklwd m4, m2
+ pmaddwd m0, m0
+ pmaddwd m4, m4
+ MULLD m0, [base+div_table%+SUFFIX+48]
+ MULLD m4, [base+div_table%+SUFFIX+32]
+ paddd m4, m0 ; cost[3a-d]
+
+ mova m1, [esp+0x00]
+ mova m2, [esp+0x50]
+ mova m0, [esp+0x30] ; cost[0,4,2,6]
+ phaddd m1, m4
+ phaddd m2, [esp+0x40] ; cost[1,3,5,7]
+ phaddd m1, m2
+ pshufd m2, m0, q3120
+
+ ; now find the best cost
+ %if cpuflag(sse4)
+ pmaxsd m0, m1
+ pshufd m3, m0, q1032
+ pmaxsd m3, m0
+ pshufd m0, m3, q2301
+ pmaxsd m0, m3
+ %else
+ pcmpgtd m3, m0, m1
+ pand m0, m3
+ pandn m3, m1
+ por m0, m3
+ pshufd m4, m0, q1032
+ pcmpgtd m3, m0, m4
+ pand m0, m3
+ pandn m3, m4
+ por m0, m3
+ pshufd m4, m0, q2301
+ pcmpgtd m3, m0, m4
+ pand m0, m3
+ pandn m3, m4
+ por m0, m3
+ %endif
+
+ ; get direction and variance
+ mov vard, varm
+ punpckhdq m3, m2, m1
+ punpckldq m2, m1
+ psubd m1, m0, m3
+ psubd m4, m0, m2
+ mova [esp+0x00], m1 ; emulate ymm in stack
+ mova [esp+0x10], m4
+ pcmpeqd m3, m0 ; compute best cost mask
+ pcmpeqd m2, m0
+ packssdw m2, m3
+ pmovmskb eax, m2 ; get byte-idx from mask
+ tzcnt eax, eax
+ mov r1d, [esp+eax*2] ; get idx^4 complement from emulated ymm
+ shr eax, 1 ; get direction by converting byte-idx to word-idx
+ shr r1d, 10
+ mov [vard], r1d
+ %endif
+
+ RET
+%endmacro
+
+INIT_XMM sse4
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
+CDEF_DIR
+
+INIT_XMM ssse3
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
+CDEF_DIR
+
+INIT_XMM sse2
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
diff --git a/third_party/dav1d/src/x86/cpu.c b/third_party/dav1d/src/x86/cpu.c
new file mode 100644
index 0000000000..f570fd7f39
--- /dev/null
+++ b/third_party/dav1d/src/x86/cpu.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include "common/attributes.h"
+
+#include "src/x86/cpu.h"
+
+typedef struct {
+ uint32_t eax, ebx, edx, ecx;
+} CpuidRegisters;
+
+void dav1d_cpu_cpuid(CpuidRegisters *regs, unsigned leaf, unsigned subleaf);
+uint64_t dav1d_cpu_xgetbv(unsigned xcr);
+
+#define X(reg, mask) (((reg) & (mask)) == (mask))
+
+COLD unsigned dav1d_get_cpu_flags_x86(void) {
+ union {
+ CpuidRegisters r;
+ struct {
+ uint32_t max_leaf;
+ char vendor[12];
+ };
+ } cpu;
+ dav1d_cpu_cpuid(&cpu.r, 0, 0);
+ unsigned flags = 0;
+
+ if (cpu.max_leaf >= 1) {
+ CpuidRegisters r;
+ dav1d_cpu_cpuid(&r, 1, 0);
+ const unsigned family = ((r.eax >> 8) & 0x0f) + ((r.eax >> 20) & 0xff);
+
+ if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ {
+ flags |= DAV1D_X86_CPU_FLAG_SSE2;
+ if (X(r.ecx, 0x00000201)) /* SSE3/SSSE3 */ {
+ flags |= DAV1D_X86_CPU_FLAG_SSSE3;
+ if (X(r.ecx, 0x00080000)) /* SSE4.1 */
+ flags |= DAV1D_X86_CPU_FLAG_SSE41;
+ }
+ }
+#if ARCH_X86_64
+ /* We only support >128-bit SIMD on x86-64. */
+ if (X(r.ecx, 0x18000000)) /* OSXSAVE/AVX */ {
+ const uint64_t xcr0 = dav1d_cpu_xgetbv(0);
+ if (X(xcr0, 0x00000006)) /* XMM/YMM */ {
+ if (cpu.max_leaf >= 7) {
+ dav1d_cpu_cpuid(&r, 7, 0);
+ if (X(r.ebx, 0x00000128)) /* BMI1/BMI2/AVX2 */ {
+ flags |= DAV1D_X86_CPU_FLAG_AVX2;
+ if (X(xcr0, 0x000000e0)) /* ZMM/OPMASK */ {
+ if (X(r.ebx, 0xd0230000) && X(r.ecx, 0x00005f42))
+ flags |= DAV1D_X86_CPU_FLAG_AVX512ICL;
+ }
+ }
+ }
+ }
+ }
+#endif
+ if (!memcmp(cpu.vendor, "AuthenticAMD", sizeof(cpu.vendor))) {
+ if ((flags & DAV1D_X86_CPU_FLAG_AVX2) && family <= 0x19) {
+ /* Excavator, Zen, Zen+, Zen 2, Zen 3, Zen 3+, Zen 4 */
+ flags |= DAV1D_X86_CPU_FLAG_SLOW_GATHER;
+ }
+ }
+ }
+
+ return flags;
+}
diff --git a/third_party/dav1d/src/x86/cpu.h b/third_party/dav1d/src/x86/cpu.h
new file mode 100644
index 0000000000..8529c77c9b
--- /dev/null
+++ b/third_party/dav1d/src/x86/cpu.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_X86_CPU_H
+#define DAV1D_SRC_X86_CPU_H
+
+enum CpuFlags {
+ DAV1D_X86_CPU_FLAG_SSE2 = 1 << 0,
+ DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 1,
+ DAV1D_X86_CPU_FLAG_SSE41 = 1 << 2,
+ DAV1D_X86_CPU_FLAG_AVX2 = 1 << 3,
+ DAV1D_X86_CPU_FLAG_AVX512ICL = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/
+ * VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */
+ DAV1D_X86_CPU_FLAG_SLOW_GATHER = 1 << 5, /* Flag CPUs where gather instructions are slow enough
+ * to cause performance regressions. */
+};
+
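+/* Returns a bitmask of the CpuFlags above. Callers test it in increasing
+ * ISA order, as the dsp init helpers in this directory do, e.g.:
+ *
+ *     const unsigned flags = dav1d_get_cpu_flags();
+ *     if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+ *     // assign SSSE3 entry points, then upgrade to AVX2/AVX-512 if set
+ */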
+unsigned dav1d_get_cpu_flags_x86(void);
+
+#endif /* DAV1D_SRC_X86_CPU_H */
diff --git a/third_party/dav1d/src/x86/cpuid.asm b/third_party/dav1d/src/x86/cpuid.asm
new file mode 100644
index 0000000000..e1d9228660
--- /dev/null
+++ b/third_party/dav1d/src/x86/cpuid.asm
@@ -0,0 +1,55 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION .text
+
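+; Thin wrappers around the CPUID and XGETBV instructions, called from
+; src/x86/cpu.c as dav1d_cpu_cpuid() and dav1d_cpu_xgetbv(). cpu_cpuid
+; stores eax/ebx/edx/ecx into the CpuidRegisters struct (note the edx/ecx
+; order) and preserves rbx, which is callee-saved in the 64-bit ABIs;
+; cpu_xgetbv returns the XCR value, with edx folded into rax on x86-64.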
+cglobal cpu_cpuid, 0, 5, 0, regs, leaf, subleaf
+ mov r4, regsmp
+ mov eax, leafm
+ mov ecx, subleafm
+%if ARCH_X86_64
+ mov r5, rbx
+%endif
+ cpuid
+ mov [r4+4*0], eax
+ mov [r4+4*1], ebx
+ mov [r4+4*2], edx
+ mov [r4+4*3], ecx
+%if ARCH_X86_64
+ mov rbx, r5
+%endif
+ RET
+
+cglobal cpu_xgetbv, 0, 0, 0, xcr
+ movifnidn ecx, xcrm
+ xgetbv
+%if ARCH_X86_64
+ shl rdx, 32
+ or rax, rdx
+%endif
+ RET
diff --git a/third_party/dav1d/src/x86/filmgrain.h b/third_party/dav1d/src/x86/filmgrain.h
new file mode 100644
index 0000000000..8f6ac8f828
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright © 2018-2022, VideoLAN and dav1d authors
+ * Copyright © 2018-2022, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/filmgrain.h"
+
+#define decl_fg_fns(ext) \
+decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ext)); \
+decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ext))
+
+decl_fg_fns(ssse3);
+decl_fg_fns(avx2);
+decl_fg_fns(avx512icl);
+
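+/* Runtime dispatch: entry points are assigned in increasing ISA order
+ * (SSSE3, then AVX2, then AVX-512), so the highest level supported by the
+ * CPU flags wins. The gather-based AVX2 fgy/fguv kernels (and, for 16 bpc,
+ * their AVX-512 counterparts) are skipped on CPUs flagged with
+ * DAV1D_X86_CPU_FLAG_SLOW_GATHER. */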
+static ALWAYS_INLINE void film_grain_dsp_init_x86(Dav1dFilmGrainDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->generate_grain_y = BF(dav1d_generate_grain_y, ssse3);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, ssse3);
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, ssse3);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, ssse3);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, ssse3);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, ssse3);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, ssse3);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, ssse3);
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->generate_grain_y = BF(dav1d_generate_grain_y, avx2);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
+ }
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ if (BITDEPTH == 8 || !(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl);
+ }
+#endif
+}
diff --git a/third_party/dav1d/src/x86/filmgrain16_avx2.asm b/third_party/dav1d/src/x86/filmgrain16_avx2.asm
new file mode 100644
index 0000000000..a1d4c41f27
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain16_avx2.asm
@@ -0,0 +1,2248 @@
+; Copyright © 2021-2022, VideoLAN and dav1d authors
+; Copyright © 2021-2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 16
+pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0
+gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+gen_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
+pw_27_17_17_27: dw 27, 17, 17, 27
+pw_23_22: dw 23, 22, 0, 32
+pw_seed_xor: times 2 dw 0xb524
+ times 2 dw 0x49d8
+gen_ar0_shift: times 4 db 128
+ times 4 db 64
+ times 4 db 32
+ times 4 db 16
+pd_16: dd 16
+pd_m65536: dd -65536
+pb_1: times 4 db 1
+grain_max: times 2 dw 511
+ times 2 dw 2047
+grain_min: times 2 dw -512
+ times 2 dw -2048
+fg_max: times 2 dw 1023
+ times 2 dw 4095
+ times 2 dw 960
+ times 2 dw 3840
+ times 2 dw 940
+ times 2 dw 3760
+fg_min: times 2 dw 0
+ times 2 dw 64
+ times 2 dw 256
+uv_offset_mul: dd 256
+ dd 1024
+hmul_bits: dw 32768, 16384, 8192, 4096
+round: dw 2048, 1024, 512
+mul_bits: dw 256, 128, 64, 32, 16, 8
+round_vals: dw 32, 64, 128, 256, 512, 1024
+pb_8_9_0_1: db 8, 9, 0, 1
+
+%macro JMP_TABLE 1-*
+ %xdefine %1_table %%table
+ %xdefine %%base %1_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ %%table:
+ %rep %0 - 1
+ dd %%prefix %+ .ar%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
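+; FGData.ar_coeff_lag (0-3) indexes these tables; the grain generators load
+; the entry and jump to the matching .ar0-.ar3 auto-regression routine.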
+JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3
+
+SECTION .text
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+INIT_YMM avx2
+cglobal generate_grain_y_16bpc, 3, 9, 14, buf, fg_data, bdmax
+%define base r4-generate_grain_y_16bpc_avx2_table
+ lea r4, [generate_grain_y_16bpc_avx2_table]
+ vpbroadcastw xm0, [fg_dataq+FGData.seed]
+ mov r6d, [fg_dataq+FGData.grain_scale_shift]
+ movq xm1, [base+next_upperbit_mask]
+ mov r3, -73*82*2
+ movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
+ lea r7d, [bdmaxq+1]
+ movq xm4, [base+mul_bits]
+ shr r7d, 11 ; 0 for 10bpc, 2 for 12bpc
+ movq xm5, [base+hmul_bits]
+ sub r6, r7
+ mova xm6, [base+pb_mask]
+ sub bufq, r3
+ vpbroadcastw xm7, [base+round+r6*2-2]
+ lea r6, [gaussian_sequence]
+ movsxd r5, [r4+r5*4]
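+ ; fill the 73x82 grain buffer: each pass runs the 16-bit seed update for 8
+ ; outputs and maps them through gaussian_sequence (8 words of grain per pass)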
+.loop:
+ pand xm2, xm0, xm1
+ psrlw xm3, xm2, 10
+ por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw xm2, xm4 ; bits 0x0f00 are set
+ pmulhuw xm0, xm5
+ pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm2, xm0 ; aggregate each bit into next seed's high bit
+ por xm3, xm2 ; 4 next output seeds
+ pshuflw xm0, xm3, q3333
+ psrlw xm3, 5
+ pand xm2, xm0, xm1
+ movq r7, xm3
+ psrlw xm3, xm2, 10
+ por xm2, xm3
+ pmullw xm2, xm4
+ pmulhuw xm0, xm5
+ movzx r8d, r7w
+ pshufb xm3, xm6, xm2
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm0, xm2
+ movd xm2, [r6+r8*2]
+ rorx r8, r7, 32
+ por xm3, xm0
+ shr r7d, 16
+ pinsrw xm2, [r6+r7*2], 1
+ pshuflw xm0, xm3, q3333
+ movzx r7d, r8w
+ psrlw xm3, 5
+ pinsrw xm2, [r6+r7*2], 2
+ shr r8d, 16
+ movq r7, xm3
+ pinsrw xm2, [r6+r8*2], 3
+ movzx r8d, r7w
+ pinsrw xm2, [r6+r8*2], 4
+ rorx r8, r7, 32
+ shr r7d, 16
+ pinsrw xm2, [r6+r7*2], 5
+ movzx r7d, r8w
+ pinsrw xm2, [r6+r7*2], 6
+ shr r8d, 16
+ pinsrw xm2, [r6+r8*2], 7
+ paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0
+ pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support
+ mova [bufq+r3], xm2
+ add r3, 8*2
+ jl .loop
+
+ ; auto-regression code
+ add r5, r4
+ jmp r5
+
+.ar1:
+ DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
+ movd xm4, [fg_dataq+FGData.ar_coeffs_y]
+ DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
+ pinsrb xm4, [base+pb_1], 3
+ pmovsxbw xm4, xm4
+ pshufd xm5, xm4, q1111
+ pshufd xm4, xm4, q0000
+ vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd
+ sub bufq, 2*(82*73-(82*3+79))
+ mov hd, 70
+ sar maxd, 1
+ mov mind, maxd
+ xor mind, -1
+.y_loop_ar1:
+ mov xq, -76
+ movsx val3d, word [bufq+xq*2-2]
+.x_loop_ar1:
+ movu xm0, [bufq+xq*2-82*2-2] ; top/left
+ psrldq xm2, xm0, 2 ; top
+ psrldq xm1, xm0, 4 ; top/right
+ punpcklwd xm0, xm2
+ punpcklwd xm1, xm3
+ pmaddwd xm0, xm4
+ pmaddwd xm1, xm5
+ paddd xm0, xm1
+.x_loop_ar1_inner:
+ movd val0d, xm0
+ psrldq xm0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sarx val3d, val3d, shiftd
+ movsx val0d, word [bufq+xq*2]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovg val3d, maxd
+ cmp val3d, mind
+ cmovl val3d, mind
+ mov word [bufq+xq*2], val3w
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xb, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+.x_loop_ar1_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar1
+.ar0:
+ RET
+
+.ar2:
+ DEFINE_ARGS buf, fg_data, bdmax, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movq xm0, [fg_dataq+FGData.ar_coeffs_y+5] ; cf5-11
+ vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4
+ vpbroadcastw xm10, [base+round_vals-12+shiftq*2]
+ pxor m1, m1
+ punpcklwd xm10, xm1
+ pcmpgtb m1, m0
+ punpcklbw m0, m1 ; cf5-11,0-4
+ vpermq m1, m0, q3333 ; cf4
+ vbroadcasti128 m11, [base+gen_shufA]
+ pshufd m6, m0, q0000 ; cf[5,6], cf[0-1]
+ vbroadcasti128 m12, [base+gen_shufB]
+ pshufd m7, m0, q1111 ; cf[7,8], cf[2-3]
+ punpckhwd xm1, xm0
+ pshufhw xm9, xm0, q2121
+ pshufd xm8, xm1, q0000 ; cf[4,9]
+ sar bdmaxd, 1
+ punpckhqdq xm9, xm9 ; cf[10,11]
+ movd xm4, bdmaxd ; max_grain
+ pcmpeqd xm5, xm5
+ sub bufq, 2*(82*73-(82*3+79))
+ pxor xm5, xm4 ; min_grain
+ DEFINE_ARGS buf, fg_data, h, x
+ mov hd, 70
+.y_loop_ar2:
+ mov xq, -76
+.x_loop_ar2:
+ vbroadcasti128 m2, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
+ vinserti128 m1, m2, [bufq+xq*2-82*2-4], 0 ; y=-1,x=[-2,+5]
+ pshufb m0, m1, m11 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+ pmaddwd m0, m6
+ punpckhwd xm2, xm1 ; y=-2/-1 interleaved, x=[+2,+5]
+ pshufb m1, m12 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ pmaddwd m1, m7
+ pmaddwd xm2, xm8
+ paddd m0, m1
+ vextracti128 xm1, m0, 1
+ paddd xm0, xm10
+ paddd xm2, xm0
+ movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5]
+ paddd xm2, xm1
+ pmovsxwd xm1, [bufq+xq*2] ; in dwords, y=0,x=[0,3]
+.x_loop_ar2_inner:
+ pmaddwd xm3, xm9, xm0
+ psrldq xm0, 2
+ paddd xm3, xm2
+ psrldq xm2, 4 ; shift top to next pixel
+ psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
+ ; skip packssdw because we only care about one value
+ paddd xm3, xm1
+ pminsd xm3, xm4
+ psrldq xm1, 4
+ pmaxsd xm3, xm5
+ pextrw [bufq+xq*2], xm3, 0
+ punpcklwd xm3, xm3
+ pblendw xm0, xm3, 0010b
+ inc xq
+ jz .x_loop_ar2_end
+ test xb, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+.x_loop_ar2_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+.ar3:
+ DEFINE_ARGS buf, fg_data, bdmax, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ sar bdmaxd, 1
+ movq xm7, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-6
+ movd xm0, [fg_dataq+FGData.ar_coeffs_y+14] ; cf14-16
+ pinsrb xm7, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13
+ pinsrb xm0, [base+pb_1], 3 ; cf14-16,pb_1
+ movd xm1, [fg_dataq+FGData.ar_coeffs_y+21] ; cf21-23
+ vinserti128 m7, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13
+ vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20
+ vpbroadcastw xm11, [base+round_vals+shiftq*2-12]
+ movd xm12, bdmaxd ; max_grain
+ punpcklbw m7, m7 ; sign-extension
+ punpcklbw m0, m0 ; sign-extension
+ punpcklbw xm1, xm1
+ REPX {psraw x, 8}, m7, m0, xm1
+ pshufd m4, m7, q0000 ; cf[0,1] | cf[7,8]
+ pshufd m5, m7, q1111 ; cf[2,3] | cf[9,10]
+ pshufd m6, m7, q2222 ; cf[4,5] | cf[11,12]
+ pshufd xm7, xm7, q3333 ; cf[6,13]
+ pshufd m8, m0, q0000 ; cf[14,15] | cf[17,18]
+ pshufd m9, m0, q1111 ; cf[16],pw_1 | cf[19,20]
+ paddw xm0, xm11, xm11
+ pcmpeqd xm13, xm13
+ pblendw xm10, xm1, xm0, 00001000b
+ pxor xm13, xm12 ; min_grain
+ DEFINE_ARGS buf, fg_data, h, x
+ sub bufq, 2*(82*73-(82*3+79))
+ mov hd, 70
+.y_loop_ar3:
+ mov xq, -76
+.x_loop_ar3:
+ movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
+ vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4]
+ movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8]
+ vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12]
+ palignr m3, m1, m0, 2 ; y=-3/-2,x=[-2,+5]
+ palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6]
+ punpckhwd m2, m0, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m0, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+ pmaddwd m0, m4
+ pmaddwd m2, m6
+ pmaddwd m3, m5
+ paddd m0, m2
+ movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
+ vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8]
+ paddd m0, m3
+ psrldq m3, m2, 2
+ punpcklwd m3, m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ pmaddwd m3, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ paddd m0, m3
+ psrldq m3, m2, 4
+ psrldq m2, 6
+ vpblendd m2, m11, 0x0f ; rounding constant
+ punpcklwd m3, m2 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
+ pmaddwd m3, m9 ; x=[+2/+3,+3/+4,+4/+5,+5/+6]
+ vextracti128 xm2, m1, 1
+ punpcklwd xm1, xm2
+ pmaddwd xm1, xm7 ; y=-3/-2 interleaved,x=[+3,+4,+5,+6]
+ paddd m0, m3
+ vextracti128 xm2, m0, 1
+ paddd xm0, xm1
+ movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
+ paddd xm0, xm2
+.x_loop_ar3_inner:
+ pmaddwd xm2, xm1, xm10
+ pshuflw xm3, xm2, q1032
+ paddd xm2, xm0 ; add top
+ paddd xm2, xm3 ; left+cur
+ psrldq xm0, 4
+ psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
+ ; skip packssdw because we only care about one value
+ pminsd xm2, xm12
+ pmaxsd xm2, xm13
+ pextrw [bufq+xq*2], xm2, 0
+ pslldq xm2, 4
+ psrldq xm1, 2
+ pblendw xm1, xm2, 0100b
+ inc xq
+ jz .x_loop_ar3_end
+ test xb, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+.x_loop_ar3_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar3
+ RET
+
+%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
+INIT_XMM avx2
+cglobal generate_grain_uv_%1_16bpc, 4, 11, 8, buf, bufy, fg_data, uv, bdmax
+%define base r8-generate_grain_uv_%1_16bpc_avx2_table
+ lea r8, [generate_grain_uv_%1_16bpc_avx2_table]
+ movifnidn bdmaxd, bdmaxm
+ vpbroadcastw xm0, [fg_dataq+FGData.seed]
+ mov r5d, [fg_dataq+FGData.grain_scale_shift]
+ movq xm1, [base+next_upperbit_mask]
+ lea r6d, [bdmaxq+1]
+ movq xm4, [base+mul_bits]
+ shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc
+ movq xm5, [base+hmul_bits]
+ sub r5, r6
+ mova xm6, [base+pb_mask]
+ vpbroadcastd xm2, [base+pw_seed_xor+uvq*4]
+ vpbroadcastw xm7, [base+round+r5*2-2]
+ pxor xm0, xm2
+ lea r6, [gaussian_sequence]
+%if %2
+ mov r7d, 73-35*%3
+ add bufq, 44*2
+.loop_y:
+ mov r5, -44*2
+%else
+ mov r5, -82*73*2
+ sub bufq, r5
+%endif
+.loop_x:
+ pand xm2, xm0, xm1
+ psrlw xm3, xm2, 10
+ por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw xm2, xm4 ; bits 0x0f00 are set
+ pmulhuw xm0, xm5
+ pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm2, xm0 ; aggregate each bit into next seed's high bit
+ por xm2, xm3 ; 4 next output seeds
+ pshuflw xm0, xm2, q3333
+ psrlw xm2, 5
+ movq r10, xm2
+ movzx r9d, r10w
+ movd xm2, [r6+r9*2]
+ rorx r9, r10, 32
+ shr r10d, 16
+ pinsrw xm2, [r6+r10*2], 1
+ movzx r10d, r9w
+ pinsrw xm2, [r6+r10*2], 2
+ shr r9d, 16
+ pinsrw xm2, [r6+r9*2], 3
+ paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0
+ pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support
+ movq [bufq+r5], xm2
+ add r5, 8
+ jl .loop_x
+%if %2
+ add bufq, 82*2
+ dec r7d
+ jg .loop_y
+%endif
+
+ ; auto-regression code
+ movsxd r6, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r6, [r8+r6*4]
+ add r6, r8
+ jmp r6
+
+INIT_YMM avx2
+.ar0:
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ vpbroadcastb m0, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ sar bdmaxd, 1
+ vpbroadcastd m4, [base+gen_ar0_shift-24+shiftq*4]
+ movd xm6, bdmaxd
+ pcmpeqw m7, m7
+ pmaddubsw m4, m0 ; ar_coeff << (14 - shift)
+ vpbroadcastw m6, xm6 ; max_grain
+ pxor m7, m6 ; min_grain
+ DEFINE_ARGS buf, bufy, h, x
+%if %2
+ vpbroadcastw m5, [base+hmul_bits+2+%3*2]
+ sub bufq, 2*(82*(73-35*%3)+82-(82*3+41))
+%else
+ sub bufq, 2*(82*70-3)
+%endif
+ add bufyq, 2*(3+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar0:
+%if %2
+ ; first 32 pixels
+ movu xm0, [bufyq+16*0]
+ vinserti128 m0, [bufyq+16*2], 1
+ movu xm1, [bufyq+16*1]
+ vinserti128 m1, [bufyq+16*3], 1
+%if %3
+ movu xm2, [bufyq+82*2+16*0]
+ vinserti128 m2, [bufyq+82*2+16*2], 1
+ movu xm3, [bufyq+82*2+16*1]
+ vinserti128 m3, [bufyq+82*2+16*3], 1
+ paddw m0, m2
+ paddw m1, m3
+%endif
+ phaddw m0, m1
+ movu xm1, [bufyq+16*4]
+ vinserti128 m1, [bufyq+16*6], 1
+ movu xm2, [bufyq+16*5]
+ vinserti128 m2, [bufyq+16*7], 1
+%if %3
+ movu xm3, [bufyq+82*2+16*4]
+ vinserti128 m3, [bufyq+82*2+16*6], 1
+ paddw m1, m3
+ movu xm3, [bufyq+82*2+16*5]
+ vinserti128 m3, [bufyq+82*2+16*7], 1
+ paddw m2, m3
+%endif
+ phaddw m1, m2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+%else
+ xor xd, xd
+.x_loop_ar0:
+ movu m0, [bufyq+xq*2]
+ movu m1, [bufyq+xq*2+32]
+%endif
+ paddw m0, m0
+ paddw m1, m1
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+%if %2
+ paddw m0, [bufq+ 0]
+ paddw m1, [bufq+32]
+%else
+ paddw m0, [bufq+xq*2+ 0]
+ paddw m1, [bufq+xq*2+32]
+%endif
+ pminsw m0, m6
+ pminsw m1, m6
+ pmaxsw m0, m7
+ pmaxsw m1, m7
+%if %2
+ movu [bufq+ 0], m0
+ movu [bufq+32], m1
+
+ ; last 6 pixels
+ movu xm0, [bufyq+32*4]
+ movu xm1, [bufyq+32*4+16]
+%if %3
+ paddw xm0, [bufyq+32*4+82*2]
+ paddw xm1, [bufyq+32*4+82*2+16]
+%endif
+ phaddw xm0, xm1
+ movu xm1, [bufq+32*2]
+ pmulhrsw xm0, xm5
+ paddw xm0, xm0
+ pmulhrsw xm0, xm4
+ paddw xm0, xm1
+ pminsw xm0, xm6
+ pmaxsw xm0, xm7
+ vpblendd xm0, xm1, 0x08
+ movu [bufq+32*2], xm0
+%else
+ movu [bufq+xq*2+ 0], m0
+ movu [bufq+xq*2+32], m1
+ add xd, 32
+ cmp xd, 64
+ jl .x_loop_ar0
+
+ ; last 12 pixels
+ movu m0, [bufyq+64*2]
+ movu m1, [bufq+64*2]
+ paddw m0, m0
+ pmulhrsw m0, m4
+ paddw m0, m1
+ pminsw m0, m6
+ pmaxsw m0, m7
+ vpblendd m0, m1, 0xc0
+ movu [bufq+64*2], m0
+%endif
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar0
+ RET
+
+INIT_XMM avx2
+.ar1:
+ DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+ movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
+ DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift
+ pmovsxbw xm4, xm4
+ pshufd xm5, xm4, q1111
+ pshufd xm4, xm4, q0000
+ pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd
+ vpbroadcastw xm6, [base+hmul_bits+2+%3*2]
+ vpbroadcastd xm3, xm3
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+ sar maxd, 1
+ mov mind, maxd
+ xor mind, -1
+.y_loop_ar1:
+ mov xq, -(76>>%2)
+ movsx val3d, word [bufq+xq*2-2]
+.x_loop_ar1:
+ movu xm0, [bufq+xq*2-82*2-2] ; top/left
+%if %2
+ movu xm2, [bufyq+xq*4]
+%else
+ movq xm2, [bufyq+xq*2]
+%endif
+%if %2
+%if %3
+ phaddw xm2, [bufyq+xq*4+82*2]
+ punpckhqdq xm1, xm2, xm2
+ paddw xm2, xm1
+%else
+ phaddw xm2, xm2
+%endif
+ pmulhrsw xm2, xm6
+%endif
+ psrldq xm1, xm0, 4 ; top/right
+ punpcklwd xm1, xm2
+ psrldq xm2, xm0, 2 ; top
+ punpcklwd xm0, xm2
+ pmaddwd xm1, xm5
+ pmaddwd xm0, xm4
+ paddd xm1, xm3
+ paddd xm0, xm1
+.x_loop_ar1_inner:
+ movd val0d, xm0
+ psrldq xm0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sarx val3d, val3d, shiftd
+ movsx val0d, word [bufq+xq*2]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovg val3d, maxd
+ cmp val3d, mind
+ cmovl val3d, mind
+ mov word [bufq+xq*2], val3w
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xb, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+.x_loop_ar1_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar1
+ RET
+
+INIT_YMM avx2
+.ar2:
+%if WIN64
+ ; xmm6 and xmm7 already saved
+ %assign xmm_regs_used 13 + %2
+ %assign stack_size_padded 136
+ SUB rsp, stack_size_padded
+ movaps [rsp+16*2], xmm8
+ movaps [rsp+16*3], xmm9
+ movaps [rsp+16*4], xmm10
+ movaps [rsp+16*5], xmm11
+ movaps [rsp+16*6], xmm12
+%if %2
+ movaps [rsp+16*7], xmm13
+%endif
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ vbroadcasti128 m10, [base+gen_shufA]
+ sar bdmaxd, 1
+ vbroadcasti128 m11, [base+gen_shufB]
+ movd xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 5]
+ pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4
+ pinsrb xm7, [base+pb_1], 5
+ pinsrw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3
+ movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
+ pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 9], 13
+ pmovsxbw m7, xm7
+ movd xm8, bdmaxd ; max_grain
+ pshufd m4, m7, q0000
+ vpbroadcastw xm12, [base+round_vals-12+shiftq*2]
+ pshufd m5, m7, q1111
+ pcmpeqd xm9, xm9
+ pshufd m6, m7, q2222
+ pxor xm9, xm8 ; min_grain
+ pshufd xm7, xm7, q3333
+ DEFINE_ARGS buf, bufy, fg_data, h, x
+%if %2
+ vpbroadcastw xm13, [base+hmul_bits+2+%3*2]
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar2:
+ mov xq, -(76>>%2)
+.x_loop_ar2:
+ vbroadcasti128 m3, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
+ vinserti128 m2, m3, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5]
+ pshufb m0, m2, m10 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+ pmaddwd m0, m4
+ pshufb m1, m2, m11 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ pmaddwd m1, m5
+ punpckhwd m2, m3 ; y=-2/-1 interleaved, x=[+2,+5]
+%if %2
+ movu xm3, [bufyq+xq*4]
+%if %3
+ paddw xm3, [bufyq+xq*4+82*2]
+%endif
+ phaddw xm3, xm3
+ pmulhrsw xm3, xm13
+%else
+ movq xm3, [bufyq+xq*2]
+%endif
+ punpcklwd xm3, xm12 ; luma, round interleaved
+ vpblendd m2, m3, 0x0f
+ pmaddwd m2, m6
+ paddd m1, m0
+ movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5]
+ paddd m2, m1
+ vextracti128 xm1, m2, 1
+ paddd xm2, xm1
+ pshufd xm1, xm0, q3321
+ pmovsxwd xm1, xm1 ; y=0,x=[0,3] in dword
+.x_loop_ar2_inner:
+ pmaddwd xm3, xm7, xm0
+ paddd xm3, xm2
+ psrldq xm2, 4 ; shift top to next pixel
+ psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
+ ; we do not need to packssdw since we only care about one value
+ paddd xm3, xm1
+ psrldq xm1, 4
+ pminsd xm3, xm8
+ pmaxsd xm3, xm9
+ pextrw [bufq+xq*2], xm3, 0
+ psrldq xm0, 2
+ pslldq xm3, 2
+ pblendw xm0, xm3, 00000010b
+ inc xq
+ jz .x_loop_ar2_end
+ test xb, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+.x_loop_ar2_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+.ar3:
+%if WIN64
+ ; xmm6 and xmm7 already saved
+ %assign stack_offset 32
+ %assign xmm_regs_used 14 + %2
+ %assign stack_size_padded 152
+ SUB rsp, stack_size_padded
+ movaps [rsp+16*2], xmm8
+ movaps [rsp+16*3], xmm9
+ movaps [rsp+16*4], xmm10
+ movaps [rsp+16*5], xmm11
+ movaps [rsp+16*6], xmm12
+ movaps [rsp+16*7], xmm13
+%if %2
+ movaps [rsp+16*8], xmm14
+%endif
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ vpbroadcastw xm11, [base+round_vals-12+shiftq*2]
+ sar bdmaxd, 1
+ movq xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
+ pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma
+ movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7]
+ pmovsxbw m7, xm7
+%if %2
+ vpbroadcastw xm14, [base+hmul_bits+2+%3*2]
+%endif
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14]
+ pinsrb xm0, [base+pb_1], 3
+ pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1
+ pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2
+ pmovsxbw m0, xm0
+ movd xm12, bdmaxd ; max_grain
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pcmpeqd xm13, xm13
+ punpckhqdq xm10, xm0, xm0
+ pxor xm13, xm12 ; min_grain
+ pinsrw xm10, [base+round_vals-10+shiftq*2], 3
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar3:
+ mov xq, -(76>>%2)
+.x_loop_ar3:
+ movu xm2, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
+ vinserti128 m2, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4]
+ movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8]
+ vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12]
+ palignr m3, m1, m2, 2 ; y=-3/-2,x=[-2,+5]
+ palignr m1, m2, 12 ; y=-3/-2,x=[+3,+6]
+ punpcklwd m0, m2, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ punpckhwd m2, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+ pmaddwd m0, m4
+ pmaddwd m2, m6
+ pmaddwd m3, m5
+ paddd m0, m2
+ paddd m0, m3
+ movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
+ vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8]
+%if %2
+ movu xm3, [bufyq+xq*4]
+%if %3
+ paddw xm3, [bufyq+xq*4+82*2]
+%endif
+ phaddw xm3, xm3
+ pmulhrsw xm3, xm14
+%else
+ movq xm3, [bufyq+xq*2]
+%endif
+ punpcklwd m1, m3
+ pmaddwd m1, m7
+ paddd m0, m1
+ psrldq m1, m2, 4
+ psrldq m3, m2, 6
+ vpblendd m3, m11, 0x0f ; rounding constant
+ punpcklwd m1, m3 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
+ pmaddwd m1, m9 ; x=[+2/+3,+3/+4,+4/+5,+5/+6]
+ psrldq m3, m2, 2
+ punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ pmaddwd m2, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ paddd m0, m1
+ movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
+ paddd m0, m2
+ vextracti128 xm2, m0, 1
+ paddd xm0, xm2
+.x_loop_ar3_inner:
+ pmaddwd xm2, xm1, xm10
+ pshuflw xm3, xm2, q1032
+ paddd xm2, xm0 ; add top
+ paddd xm2, xm3 ; left+cur
+ psrldq xm0, 4
+ psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
+ psrldq xm1, 2
+ ; no need to packssdw since we only care about one value
+ pminsd xm2, xm12
+ pmaxsd xm2, xm13
+ pextrw [bufq+xq*2], xm2, 0
+ pslldq xm2, 4
+ pblendw xm1, xm2, 00000100b
+ inc xq
+ jz .x_loop_ar3_end
+ test xb, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+.x_loop_ar3_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar3
+ RET
+%endmacro
+
+cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, unused, sby, see
+%define base r11-grain_min
+ lea r11, [grain_min]
+ mov r6d, r9m ; bdmax
+ mov r9d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r7d, [fg_dataq+FGData.scaling_shift]
+ mov sbyd, sbym
+ vpbroadcastd m8, r9m
+ shr r6d, 11 ; is_12bpc
+ vpbroadcastd m9, [base+grain_min+r6*4]
+ shlx r10d, r9d, r6d
+ vpbroadcastd m10, [base+grain_max+r6*4]
+ lea r9d, [r6+r9*4]
+ vpbroadcastw m11, [base+mul_bits+r7*2-12]
+ vpbroadcastd m12, [base+fg_min+r10*4]
+ vpbroadcastd m13, [base+fg_max+r9*4]
+ test sbyd, sbyd
+ setnz r7b
+ vpbroadcastd m14, [base+pd_16]
+ test r7b, [fg_dataq+FGData.overlap_flag]
+ jnz .vertical_overlap
+
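+ ; derive this superblock row's grain seed:
+ ; seed = FGData.seed ^ ((((37*sby + 178) & 0xff) << 8) | ((173*sby + 105) & 0xff))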
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak
+
+ lea src_bakq, [srcq+wq*2]
+ neg wq
+ sub dstq, srcq
+
+.loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y:
+ ; scaling[src]
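+ ; m9 (grain_min, all sign bits set) doubles as the gather mask; vpgatherdd
+ ; zeroes its mask operand, hence the m6/m9 save/restore around each gather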
+ mova m0, [srcq+ 0]
+ mova m1, [srcq+32]
+ pand m4, m8, m0
+ psrld m3, m0, 16
+ mova m6, m9
+ vpgatherdd m2, [scalingq+m4-0], m9
+ pand m3, m8
+ mova m9, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pand m5, m8, m1
+ mova m6, m9
+ vpgatherdd m3, [scalingq+m5-0], m9
+ pblendw m4, m2, 0x55
+ psrld m2, m1, 16
+ mova m9, m6
+ pand m2, m8
+ vpgatherdd m5, [scalingq+m2-2], m6
+ pblendw m5, m3, 0x55
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m4, [grain_lutq+offxyq*2]
+ pmulhrsw m5, [grain_lutq+offxyq*2+32]
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+32], m1
+
+ add srcq, strideq
+ add grain_lutq, 82*2
+ dec hd
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+ cmp byte [fg_dataq+FGData.overlap_flag], 0
+ je .loop_x
+ movq xm7, [pw_27_17_17_27]
+ cmp dword r8m, 0 ; sby
+ jne .loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, left_offxy
+
+ lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y_h_overlap:
+ ; scaling[src]
+ mova m0, [srcq+ 0]
+ mova m1, [srcq+32]
+ pand m4, m8, m0
+ psrld m3, m0, 16
+ mova m6, m9
+ vpgatherdd m2, [scalingq+m4-0], m9
+ pand m3, m8
+ mova m9, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pand m5, m8, m1
+ mova m6, m9
+ vpgatherdd m3, [scalingq+m5-0], m9
+ pblendw m4, m2, 0x55
+ psrld m2, m1, 16
+ mova m9, m6
+ pand m2, m8
+ vpgatherdd m5, [scalingq+m2-2], m6
+ pblendw m5, m3, 0x55
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq*2]
+ movd xm6, [grain_lutq+left_offxyq*2]
+ punpcklwd xm6, xm3
+ pmaddwd xm6, xm7
+ paddd xm6, xm14
+ psrad xm6, 5
+ packssdw xm6, xm6
+ pmaxsw xm6, xm9
+ pminsw xm6, xm10
+ vpblendd m3, m6, 0x01
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m4, m3
+ pmulhrsw m5, [grain_lutq+offxyq*2+32]
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+32], m1
+
+ add srcq, strideq
+ add grain_lutq, 82*2
+ dec hd
+ jg .loop_y_h_overlap
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+ cmp dword r8m, 0 ; sby
+ jne .loop_x_hv_overlap
+ jmp .loop_x_h_overlap
+
+.vertical_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
+ sby, see, src_bak
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ lea src_bakq, [srcq+wq*2]
+ neg wq
+ sub dstq, srcq
+
+.loop_x_v_overlap:
+ vpbroadcastd m15, [pw_27_17_17_27]
+
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, unused, top_offxy
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, unused, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+.loop_y_v_overlap:
+ ; scaling[src]
+ mova m0, [srcq+ 0]
+ mova m1, [srcq+32]
+ pand m4, m8, m0
+ psrld m3, m0, 16
+ mova m6, m9
+ vpgatherdd m2, [scalingq+m4-0], m9
+ pand m3, m8
+ mova m9, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pand m5, m8, m1
+ mova m6, m9
+ vpgatherdd m3, [scalingq+m5-0], m9
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m9, m6
+ pand m4, m8
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
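+    ; vertical overlap: blend with the bottom rows of the block above,
+    ; round2(top*27+cur*17, 5) on the first row and, after the weight swap
+    ; below, round2(top*17+cur*27, 5) on the second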
+ movu m6, [grain_lutq+offxyq*2]
+ movu m5, [grain_lutq+top_offxyq*2]
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ pmaddwd m4, m15
+ pmaddwd m5, m15
+ movu m7, [grain_lutq+offxyq*2+32]
+ movu m6, [grain_lutq+top_offxyq*2+32]
+ paddd m4, m14
+ paddd m5, m14
+ psrad m4, 5
+ psrad m5, 5
+ packssdw m4, m5
+ punpcklwd m5, m6, m7
+ punpckhwd m6, m7
+ pmaddwd m5, m15
+ pmaddwd m6, m15
+ paddd m5, m14
+ paddd m6, m14
+ psrad m5, 5
+ psrad m6, 5
+ packssdw m5, m6
+ pmaxsw m4, m9
+ pmaxsw m5, m9
+ pminsw m4, m10
+ pminsw m5, m10
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m11
+ pmaddubsw m3, m11
+ paddw m2, m2
+ paddw m3, m3
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+32], m1
+
+ add srcq, strideq
+ add grain_lutq, 82*2
+ dec hb
+ jz .end_y_v_overlap
+ vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+ add hd, 0x80000000
+ jnc .loop_y_v_overlap
+ jmp .loop_y
+.end_y_v_overlap:
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+    ; back to .loop_x_v_overlap, and instead always fall through to
+ ; h+v overlap
+
+.loop_x_hv_overlap:
+ vpbroadcastd m15, [pw_27_17_17_27]
+
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyd, [top_offxyq+32]
+ lea left_offxyd, [offyq+32]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+.loop_y_hv_overlap:
+ ; scaling[src]
+ mova m0, [srcq+ 0]
+ mova m1, [srcq+32]
+ pand m4, m8, m0
+ psrld m3, m0, 16
+ mova m6, m9
+ vpgatherdd m2, [scalingq+m4-0], m9
+ pand m3, m8
+ mova m9, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pand m5, m8, m1
+ mova m6, m9
+ vpgatherdd m3, [scalingq+m5-0], m9
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m9, m6
+ pand m4, m8
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m7, [grain_lutq+offxyq*2]
+ movd xm6, [grain_lutq+left_offxyq*2]
+ movu m5, [grain_lutq+top_offxyq*2]
+ movd xm4, [grain_lutq+topleft_offxyq*2]
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklwd xm6, xm7
+ punpcklwd xm4, xm5
+ punpcklqdq xm6, xm4
+ movddup xm4, [pw_27_17_17_27]
+ pmaddwd xm6, xm4
+ paddd xm6, xm14
+ psrad xm6, 5
+ packssdw xm6, xm6
+ pmaxsw xm6, xm9
+ pminsw xm6, xm10
+ pshuflw xm4, xm6, q1032
+ vpblendd m6, m7, 0xfe
+ vpblendd m4, m5, 0xfe
+ ; followed by v interpolation (top | cur -> cur)
+ punpckhwd m5, m7
+ pmaddwd m5, m15
+ punpcklwd m4, m6
+ pmaddwd m4, m15
+ movu m7, [grain_lutq+offxyq*2+32]
+ movu m6, [grain_lutq+top_offxyq*2+32]
+ paddd m5, m14
+ paddd m4, m14
+ psrad m5, 5
+ psrad m4, 5
+ packssdw m4, m5
+ punpcklwd m5, m6, m7
+ punpckhwd m6, m7
+ pmaddwd m5, m15
+ pmaddwd m6, m15
+ paddd m5, m14
+ paddd m6, m14
+ psrad m5, 5
+ psrad m6, 5
+ packssdw m5, m6
+ pmaxsw m4, m9
+ pmaxsw m5, m9
+ pminsw m4, m10
+ pminsw m5, m10
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m11
+ pmaddubsw m3, m11
+ paddw m2, m2
+ paddw m3, m3
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+32], m1
+
+ add srcq, strideq
+ add grain_lutq, 82*2
+ dec hb
+ jz .end_y_hv_overlap
+ vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+ add hd, 0x80000000
+ jnc .loop_y_hv_overlap
+ movq xm7, [pw_27_17_17_27]
+ jmp .loop_y_h_overlap
+.end_y_hv_overlap:
+ add wq, 32
+ lea srcq, [src_bakq+wq*2]
+ jl .loop_x_hv_overlap
+.end:
+ RET
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, luma, lstride, uv_pl, is_id
+%define base r12-grain_min
+ lea r12, [grain_min]
+ mov r9d, r13m ; bdmax
+ mov r7d, [fg_dataq+FGData.scaling_shift]
+ mov r11d, is_idm
+ mov sbyd, sbym
+ vpbroadcastw m11, [base+mul_bits+r7*2-12]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+ shr r9d, 11 ; is_12bpc
+ vpbroadcastd m8, [base+grain_min+r9*4]
+ shlx r10d, r6d, r9d
+ vpbroadcastd m9, [base+grain_max+r9*4]
+ vpbroadcastw m10, r13m
+ shlx r6d, r6d, r11d
+ vpbroadcastd m12, [base+fg_min+r10*4]
+ lea r6d, [r9+r6*2]
+ vpbroadcastd m13, [base+fg_max+r6*4]
+ test sbyd, sbyd
+ setnz r7b
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused, sby, see, overlap
+
+%if %1
+ mov r6d, r11m
+ vpbroadcastd m0, [base+pb_8_9_0_1]
+ vpbroadcastd m1, [base+uv_offset_mul+r9*4]
+ vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4]
+ vpbroadcastd m15, [fg_dataq+FGData.uv_offset+r6*4]
+ pshufb m14, m0 ; { uv_luma_mult, uv_mult }
+ pmaddwd m15, m1
+%else
+%if %2
+ vpbroadcastq m15, [base+pw_23_22]
+%else
+ vpbroadcastq m15, [base+pw_27_17_17_27]
+%endif
+ vpbroadcastd m14, [base+pd_16]
+%endif
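+    ; vertical overlap is only taken when the overlap flag is set and this is
+    ; not the topmost superblock row (sby != 0, recorded in r7b above)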
+ test r7b, [fg_dataq+FGData.overlap_flag]
+ jnz %%vertical_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused2, unused3, see, unused4, unused5, unused6, luma, lstride
+
+ mov lumaq, r9mp
+ mov lstrideq, r10mp
+ lea r10, [srcq+wq*2]
+ lea r11, [dstq+wq*2]
+ lea r12, [lumaq+wq*(2<<%2)]
+ mov r9mp, r10
+ mov r11mp, r11
+ mov r12mp, r12
+ neg wq
+
+%%loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, unused1, unused2, unused3, luma, lstride
+
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, unused1, unused2, unused3, luma, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y:
+ ; luma_src
+%if %2
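+    ; 2x horizontal downsample of the luma rows: phaddw sums adjacent pairs and
+    ; pavgw against zero then gives (a + b + 1) >> 1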
+ mova xm2, [lumaq+lstrideq*0+ 0]
+ vinserti128 m2, [lumaq+lstrideq*0+32], 1
+ mova xm4, [lumaq+lstrideq*0+16]
+ vinserti128 m4, [lumaq+lstrideq*0+48], 1
+ mova xm3, [lumaq+lstrideq*(1<<%3)+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1
+ mova xm5, [lumaq+lstrideq*(1<<%3)+16]
+ vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1
+ phaddw m2, m4
+ phaddw m3, m5
+ pxor m4, m4
+ pavgw m2, m4
+ pavgw m3, m4
+%elif %1
+ mova m2, [lumaq+ 0]
+ mova m3, [lumaq+32]
+%endif
+%if %1
+ mova m0, [srcq]
+%if %2
+ mova m1, [srcq+strideq]
+%else
+ mova m1, [srcq+32]
+%endif
+ punpckhwd m4, m2, m0
+ punpcklwd m2, m0
+ punpckhwd m5, m3, m1
+ punpcklwd m3, m1 ; { luma, chroma }
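+    ; chroma_scaling_from_luma is disabled here, so the scaling index is
+    ; clip(((luma*uv_luma_mult + chroma*uv_mult) >> 6) + (uv_offset << (bitdepth-8)))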
+ REPX {pmaddwd x, m14}, m4, m2, m5, m3
+ REPX {paddd x, m15}, m4, m2, m5, m3
+ REPX {psrad x, 6 }, m4, m2, m5, m3
+ packusdw m2, m4
+ packusdw m3, m5
+ pminuw m2, m10
+ pminuw m3, m10 ; clip_pixel()
+%elif %2
+ pand m2, m10
+ pand m3, m10
+%else
+ pand m2, m10, [lumaq+ 0]
+ pand m3, m10, [lumaq+32]
+%endif
+
+ ; scaling[luma_src]
+ vpbroadcastd m7, [pd_m65536]
+ pandn m4, m7, m2
+ mova m6, m7
+ vpgatherdd m5, [scalingq+m4-0], m7
+ psrld m2, 16
+ mova m7, m6
+ vpgatherdd m4, [scalingq+m2-2], m6
+ pblendw m4, m5, 0x55
+ pandn m5, m7, m3
+ mova m6, m7
+ vpgatherdd m2, [scalingq+m5-0], m7
+ psrld m3, 16
+ vpgatherdd m5, [scalingq+m3-2], m6
+ pblendw m5, m2, 0x55
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m4, [grain_lutq+offxyq*2]
+%if %2
+ pmulhrsw m5, [grain_lutq+offxyq*2+82*2]
+%else
+ pmulhrsw m5, [grain_lutq+offxyq*2+32]
+%endif
+
+ ; dst = clip_pixel(src, noise)
+%if %1
+ paddw m0, m4
+ paddw m1, m5
+%else
+ paddw m0, m4, [srcq]
+%if %2
+ paddw m1, m5, [srcq+strideq]
+%else
+ paddw m1, m5, [srcq+32]
+%endif
+%endif
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq], m0
+%if %2
+ mova [dstq+strideq], m1
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ mova [dstq+32], m1
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*(2<<%2)
+%if %2
+ sub hb, 2
+%else
+ dec hb
+%endif
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ cmp byte [fg_dataq+FGData.overlap_flag], 0
+ je %%loop_x
+ cmp dword r8m, 0 ; sby
+ jne %%loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, luma, lstride
+
+ lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, unused1, unused2, luma, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y_h_overlap:
+ ; luma_src
+%if %2
+ mova xm2, [lumaq+lstrideq*0+ 0]
+ vinserti128 m2, [lumaq+lstrideq*0+32], 1
+ mova xm4, [lumaq+lstrideq*0+16]
+ vinserti128 m4, [lumaq+lstrideq*0+48], 1
+ mova xm3, [lumaq+lstrideq*(1<<%3)+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1
+ mova xm5, [lumaq+lstrideq*(1<<%3)+16]
+ vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1
+ phaddw m2, m4
+ phaddw m3, m5
+ pxor m4, m4
+ pavgw m2, m4
+ pavgw m3, m4
+%elif %1
+ mova m2, [lumaq]
+ mova m3, [lumaq+32]
+%endif
+%if %1
+ mova m0, [srcq]
+%if %2
+ mova m1, [srcq+strideq]
+%else
+ mova m1, [srcq+32]
+%endif
+ punpckhwd m4, m2, m0
+ punpcklwd m2, m0
+ punpckhwd m5, m3, m1
+ punpcklwd m3, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m4, m2, m5, m3
+ REPX {paddd x, m15}, m4, m2, m5, m3
+ REPX {psrad x, 6 }, m4, m2, m5, m3
+ packusdw m2, m4
+ packusdw m3, m5
+ pminuw m2, m10 ; clip_pixel()
+ pminuw m3, m10
+%elif %2
+ pand m2, m10
+ pand m3, m10
+%else
+ pand m2, m10, [lumaq+ 0]
+ pand m3, m10, [lumaq+32]
+%endif
+
+ ; scaling[luma_src]
+ vpbroadcastd m7, [pd_m65536]
+ pandn m4, m7, m2
+ mova m6, m7
+ vpgatherdd m5, [scalingq+m4-0], m7
+ psrld m2, 16
+ mova m7, m6
+ vpgatherdd m4, [scalingq+m2-2], m6
+ pblendw m4, m5, 0x55
+ pandn m5, m7, m3
+ mova m6, m7
+ vpgatherdd m2, [scalingq+m5-0], m7
+ psrld m3, 16
+ vpgatherdd m5, [scalingq+m3-2], m6
+ pblendw m5, m2, 0x55
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m2, [grain_lutq+offxyq*2]
+%if %2
+ movu m3, [grain_lutq+offxyq*2+82*2]
+%else
+ movu m3, [grain_lutq+offxyq*2+32]
+%endif
+ movd xm6, [grain_lutq+left_offxyq*2]
+%if %2
+ pinsrw xm6, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1}
+ punpckldq xm7, xm2, xm3 ; {cur0, cur1}
+ punpcklwd xm6, xm7 ; {left0, cur0, left1, cur1}
+%else
+ punpcklwd xm6, xm2
+%endif
+%if %1
+%if %2
+ vpbroadcastq xm7, [pw_23_22]
+%else
+ movq xm7, [pw_27_17_17_27]
+%endif
+ pmaddwd xm6, xm7
+ vpbroadcastd xm7, [pd_16]
+ paddd xm6, xm7
+%else
+ pmaddwd xm6, xm15
+ paddd xm6, xm14
+%endif
+ psrad xm6, 5
+ packssdw xm6, xm6
+ pmaxsw xm6, xm8
+ pminsw xm6, xm9
+ vpblendd m2, m6, 0x01
+%if %2
+ pshuflw xm6, xm6, q1032
+ vpblendd m3, m6, 0x01
+%endif
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+
+ ; dst = clip_pixel(src, noise)
+%if %1
+ paddw m0, m2
+ paddw m1, m3
+%else
+ paddw m0, m2, [srcq]
+%if %2
+ paddw m1, m3, [srcq+strideq]
+%else
+ paddw m1, m3, [srcq+32]
+%endif
+%endif
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq], m0
+%if %2
+ mova [dstq+strideq], m1
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ mova [dstq+32], m1
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, r10mp
+%endif
+ add grain_lutq, 82*(2<<%2)
+%if %2
+ sub hb, 2
+%else
+ dec hb
+%endif
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ cmp dword r8m, 0 ; sby
+ jne %%loop_x_hv_overlap
+ jmp %%loop_x_h_overlap
+
+%%vertical_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
+ sby, see, unused1, unused2, unused3, lstride
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, unused1, top_offxy, unused2, luma, lstride
+
+ mov lumaq, r9mp
+ mov lstrideq, r10mp
+ lea r10, [srcq+wq*2]
+ lea r11, [dstq+wq*2]
+ lea r12, [lumaq+wq*(2<<%2)]
+ mov r9mp, r10
+ mov r11mp, r11
+ mov r12mp, r12
+ neg wq
+
+%%loop_x_v_overlap:
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, unused1, top_offxy, unused2, luma, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%if %2 == 0
+ lea r10, [pw_27_17_17_27]
+%endif
+%%loop_y_v_overlap:
+ ; luma_src
+%if %2
+ mova xm2, [lumaq+lstrideq*0+ 0]
+ vinserti128 m2, [lumaq+lstrideq*0+32], 1
+ mova xm4, [lumaq+lstrideq*0+16]
+ vinserti128 m4, [lumaq+lstrideq*0+48], 1
+ mova xm3, [lumaq+lstrideq*(1<<%3)+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1
+ mova xm5, [lumaq+lstrideq*(1<<%3)+16]
+ vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1
+ phaddw m2, m4
+ phaddw m3, m5
+ pxor m4, m4
+ pavgw m2, m4
+ pavgw m3, m4
+%elif %1
+ mova m2, [lumaq]
+ mova m3, [lumaq+32]
+%endif
+%if %1
+ mova m0, [srcq]
+%if %2
+ mova m1, [srcq+strideq]
+%else
+ mova m1, [srcq+32]
+%endif
+ punpckhwd m4, m2, m0
+ punpcklwd m2, m0
+ punpckhwd m5, m3, m1
+ punpcklwd m3, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m4, m2, m5, m3
+ REPX {paddd x, m15}, m4, m2, m5, m3
+ REPX {psrad x, 6 }, m4, m2, m5, m3
+ packusdw m2, m4
+ packusdw m3, m5
+ pminuw m2, m10 ; clip_pixel()
+ pminuw m3, m10
+%elif %2
+ pand m2, m10
+ pand m3, m10
+%else
+ pand m2, m10, [lumaq+ 0]
+ pand m3, m10, [lumaq+32]
+%endif
+
+ ; scaling[luma_src]
+ vpbroadcastd m7, [pd_m65536]
+ pandn m4, m7, m2
+ mova m6, m7
+ vpgatherdd m5, [scalingq+m4-0], m7
+ psrld m2, 16
+ mova m7, m6
+ vpgatherdd m4, [scalingq+m2-2], m6
+ pblendw m4, m5, 0x55
+ pandn m5, m7, m3
+ mova m6, m7
+ vpgatherdd m2, [scalingq+m5-0], m7
+ psrld m3, 16
+ vpgatherdd m5, [scalingq+m3-2], m6
+ pblendw m5, m2, 0x55
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m6, [grain_lutq+offxyq*2]
+ movu m3, [grain_lutq+top_offxyq*2]
+ punpcklwd m2, m3, m6
+ punpckhwd m3, m6 ; { top, cur }
+%if %3
+ vpbroadcastd m0, [pw_23_22]
+%elif %2
+ vpbroadcastd m0, [pw_27_17_17_27]
+%else
+ vpbroadcastd m0, [r10]
+%endif
+ REPX {pmaddwd x, m0}, m2, m3
+%if %1
+ vpbroadcastd m1, [pd_16]
+ REPX {paddd x, m1}, m2, m3
+%else
+ REPX {paddd x, m14}, m2, m3
+%endif
+ REPX {psrad x, 5}, m2, m3
+ packssdw m2, m3
+%if %2
+ movu m3, [grain_lutq+offxyq*2+82*2]
+%else
+ movu m3, [grain_lutq+offxyq*2+32]
+%endif
+%if %3
+ pmaxsw m2, m8
+ pminsw m2, m9
+%else
+%if %2
+ movu m7, [grain_lutq+top_offxyq*2+82*2]
+ punpckhwd m6, m3, m7 ; { cur, top }
+ punpcklwd m3, m7
+%else
+ movu m7, [grain_lutq+top_offxyq*2+32]
+ punpckhwd m6, m7, m3
+ punpcklwd m3, m7, m3 ; { top, cur }
+%endif
+ pmaddwd m6, m0
+ pmaddwd m3, m0
+%if %1
+ paddd m6, m1
+ paddd m3, m1
+%else
+ paddd m6, m14
+ paddd m3, m14
+%endif
+ psrad m6, 5
+ psrad m3, 5
+ packssdw m3, m6
+ pmaxsw m2, m8
+ pmaxsw m3, m8
+ pminsw m2, m9
+ pminsw m3, m9
+%endif
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2, [srcq]
+%if %2
+ paddw m1, m3, [srcq+strideq]
+%else
+ paddw m1, m3, [srcq+32]
+%endif
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq], m0
+%if %2
+ mova [dstq+strideq], m1
+ sub hb, 2
+%else
+ mova [dstq+32], m1
+ dec hb
+%endif
+ jle %%end_y_v_overlap
+%if %2
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*(2<<%2)
+%if %2
+ jmp %%loop_y
+%else
+ add hd, 0x80000000
+ jc %%loop_y
+ add r10, 4
+ jmp %%loop_y_v_overlap
+%endif
+%%end_y_v_overlap:
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+    ; back to %%loop_x_v_overlap, and instead always fall through to
+ ; h+v overlap
+%%loop_x_hv_overlap:
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
+
+%if %2 == 0
+ lea r14, [pw_27_17_17_27]
+%endif
+ lea topleft_offxyq, [top_offxyq+(32>>%2)]
+ lea left_offxyq, [offyq+(32>>%2)]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%%loop_y_hv_overlap:
+ ; luma_src
+%if %2
+ mova xm2, [lumaq+lstrideq*0+ 0]
+ vinserti128 m2, [lumaq+lstrideq*0+32], 1
+ mova xm4, [lumaq+lstrideq*0+16]
+ vinserti128 m4, [lumaq+lstrideq*0+48], 1
+ mova xm3, [lumaq+lstrideq*(1<<%3)+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1
+ mova xm5, [lumaq+lstrideq*(1<<%3)+16]
+ vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1
+ phaddw m2, m4
+ phaddw m3, m5
+ pxor m4, m4
+ pavgw m2, m4
+ pavgw m3, m4
+%elif %1
+ mova m2, [lumaq]
+ mova m3, [lumaq+32]
+%endif
+%if %1
+ mova m0, [srcq]
+%if %2
+ mova m1, [srcq+strideq]
+%else
+ mova m1, [srcq+32]
+%endif
+ punpckhwd m4, m2, m0
+ punpcklwd m2, m0
+ punpckhwd m5, m3, m1
+ punpcklwd m3, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m4, m2, m5, m3
+ REPX {paddd x, m15}, m4, m2, m5, m3
+ REPX {psrad x, 6 }, m4, m2, m5, m3
+ packusdw m2, m4
+ packusdw m3, m5
+ pminuw m2, m10 ; clip_pixel()
+ pminuw m3, m10
+%elif %2
+ pand m2, m10
+ pand m3, m10
+%else
+ pand m2, m10, [lumaq+ 0]
+ pand m3, m10, [lumaq+32]
+%endif
+
+ ; scaling[luma_src]
+ vpbroadcastd m7, [pd_m65536]
+ pandn m4, m7, m2
+ mova m6, m7
+ vpgatherdd m5, [scalingq+m4-0], m7
+ psrld m2, 16
+ mova m7, m6
+ vpgatherdd m4, [scalingq+m2-2], m6
+ pblendw m4, m5, 0x55
+ pandn m5, m7, m3
+ mova m6, m7
+ vpgatherdd m2, [scalingq+m5-0], m7
+ psrld m3, 16
+ vpgatherdd m5, [scalingq+m3-2], m6
+ pblendw m5, m2, 0x55
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m0, [grain_lutq+offxyq*2]
+ movd xm2, [grain_lutq+left_offxyq*2]
+ movu m6, [grain_lutq+top_offxyq*2]
+%if %2
+ pinsrw xm2, [grain_lutq+left_offxyq*2+82*2], 2
+ movu m3, [grain_lutq+offxyq*2+82*2]
+ punpckldq xm1, xm0, xm3 ; { cur0, cur1 }
+%if %3
+ vinserti128 m2, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left }
+ vinserti128 m1, [grain_lutq+top_offxyq*2], 1 ; { cur0, cur1, top0 }
+%else
+ vinserti128 m2, [grain_lutq+topleft_offxyq*2+82*2], 1
+ vpbroadcastd m7, [grain_lutq+topleft_offxyq*2]
+ vpblendd m2, m7, 0x20
+ movd xm7, [grain_lutq+top_offxyq*2+82*2]
+ punpckldq xm7, xm6
+ vinserti128 m1, xm7, 1
+ movu m7, [grain_lutq+top_offxyq*2+82*2]
+%endif
+ punpcklwd m2, m1 ; { cur, left }
+%if %1
+ vpbroadcastq m1, [pw_23_22]
+ pmaddwd m2, m1
+ vpbroadcastd m1, [pd_16]
+ paddd m2, m1
+ psrad m2, 5
+ packssdw m2, m2
+ vpermq m2, m2, q3120
+%else
+ pmaddwd m2, m15
+ paddd m2, m14
+ psrad m2, 5
+ vextracti128 xm1, m2, 1
+ packssdw xm2, xm1
+%endif
+%else
+ pinsrd xm2, [grain_lutq+topleft_offxyq*2], 1
+ movu m3, [grain_lutq+offxyq*2+32]
+ movu m7, [grain_lutq+top_offxyq*2+32]
+ punpckldq xm1, xm0, xm6
+ punpcklwd xm2, xm1 ; { cur, left }
+%if %1
+ movddup xm1, [pw_27_17_17_27]
+ pmaddwd xm2, xm1
+ vpbroadcastd m1, [pd_16]
+ paddd xm2, xm1
+%else
+ pmaddwd xm2, xm15
+ paddd xm2, xm14
+%endif
+ psrad xm2, 5
+ packssdw xm2, xm2
+%endif
+ pmaxsw xm2, xm8
+ pminsw xm2, xm9
+ vpblendd m0, m2, 0x01
+%if %2
+ pshufd xm2, xm2, q0321
+ vpblendd m3, m2, 0x01
+%if %3 == 0
+ pshufd xm2, xm2, q0321
+ vpblendd m7, m2, 0x01
+%endif
+%endif
+ pshuflw xm2, xm2, q1032
+ vpblendd m2, m6, 0xfe
+ punpckhwd m6, m0 ; { top, cur }
+ punpcklwd m2, m0
+%if %3
+ vpbroadcastd m0, [pw_23_22]
+%elif %2
+ vpbroadcastd m0, [pw_27_17_17_27]
+%else
+ vpbroadcastd m0, [r14]
+%endif
+ pmaddwd m6, m0
+ pmaddwd m2, m0
+%if %1
+ paddd m6, m1
+ paddd m2, m1
+%else
+ paddd m6, m14
+ paddd m2, m14
+%endif
+ psrad m6, 5
+ psrad m2, 5
+ packssdw m2, m6
+
+%if %3
+ pmaxsw m2, m8
+ pminsw m2, m9
+%else
+%if %2
+ punpckhwd m6, m3, m7
+ punpcklwd m3, m7 ; { cur, top }
+%else
+ punpckhwd m6, m7, m3
+ punpcklwd m3, m7, m3 ; { top, cur }
+%endif
+ REPX {pmaddwd x, m0}, m6, m3
+%if %1
+ REPX {paddd x, m1}, m6, m3
+%else
+ REPX {paddd x, m14}, m6, m3
+%endif
+ REPX {psrad x, 5}, m6, m3
+ packssdw m3, m6
+ pmaxsw m2, m8
+ pmaxsw m3, m8
+ pminsw m2, m9
+ pminsw m3, m9
+%endif
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2, [srcq]
+%if %2
+ paddw m1, m3, [srcq+strideq]
+%else
+ paddw m1, m3, [srcq+32]
+%endif
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq], m0
+%if %2
+ mova [dstq+strideq], m1
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ mova [dstq+32], m1
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, r10mp
+%endif
+ add grain_lutq, 82*(2<<%2)
+%if %2
+ sub hb, 2
+ jg %%loop_y_h_overlap
+%else
+ dec hb
+ jle %%end_y_hv_overlap
+ add hd, 0x80000000
+ jc %%loop_y_h_overlap
+ add r14, 4
+ jmp %%loop_y_hv_overlap
+%endif
+%%end_y_hv_overlap:
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ jmp %%loop_x_hv_overlap
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+.end:
+ RET
+%endmacro
+
+GEN_GRAIN_UV_FN 420, 1, 1
+FGUV_FN 420, 1, 1
+GEN_GRAIN_UV_FN 422, 1, 0
+FGUV_FN 422, 1, 0
+GEN_GRAIN_UV_FN 444, 0, 0
+FGUV_FN 444, 0, 0
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/filmgrain16_avx512.asm b/third_party/dav1d/src/x86/filmgrain16_avx512.asm
new file mode 100644
index 0000000000..5cbebcef50
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain16_avx512.asm
@@ -0,0 +1,930 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 16
+scale_mask: db -1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1
+scale_shift: dw 7, 7, 6, 6, 5, 5, 4, 4
+pw_27_17_17_27: dw 108, 68, 68, 108, 27, 17, 17, 27
+pw_23_22: dw 92, 88, 0, 128, 23, 22, 0, 32
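+; the 108/68 and 92/88 entries above are the 27/17 and 23/22 overlap weights
+; scaled by 4; the 10bpc paths select them via the is_12bpc*8 offset below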
+fg_min: times 2 dw 0
+ times 2 dw 64
+ times 2 dw 256
+fg_max: times 2 dw 1023
+ times 2 dw 4095
+ times 2 dw 960
+ times 2 dw 3840
+ times 2 dw 940
+ times 2 dw 3760
+scale_rnd: dd 64
+ dd 16
+uv_offset_mul: dd 256
+ dd 1024
+pb_8_9_0_1: db 8, 9, 0, 1
+
+cextern pb_0to63
+
+SECTION .text
+
+INIT_ZMM avx512icl
+cglobal fgy_32x32xn_16bpc, 6, 15, 21, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, offx, sby, see, offy, src_bak
+%define base r11-fg_min
+ lea r11, [fg_min]
+ mov r6d, r9m ; bdmax
+ mov r9d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r7d, [fg_dataq+FGData.scaling_shift]
+ mov sbyd, sbym
+ vpbroadcastd m6, r9m
+ shr r6d, 11 ; is_12bpc
+ vbroadcasti32x4 m7, [base+scale_mask]
+ shlx r10d, r9d, r6d
+ vpbroadcastd m10, [base+scale_shift+r7*4-32]
+ lea r9d, [r6+r9*4]
+ vpbroadcastd m8, [base+fg_min+r10*4]
+ kxnorw k1, k1, k1 ; 0xffff
+ vpbroadcastd m9, [base+fg_max+r9*4]
+ mov r12, 0xeeeeeeeeeeeeeeee
+ vpbroadcastd m19, [base+scale_rnd+r6*4]
+ kshiftrb k2, k1, 4 ; 0xf
+ vpbroadcastq xm20, [base+pw_27_17_17_27+r6*8]
+ kmovq k3, r12
+ vpbroadcastd m11, [base+scale_shift+r6*8+4]
+ test sbyd, sbyd
+ setnz r7b
+ vpbroadcastd m12, [base+pw_27_17_17_27+r6*8+0]
+ vpbroadcastd m13, [base+pw_27_17_17_27+r6*8+4]
+ test r7b, [fg_dataq+FGData.overlap_flag]
+ jnz .v_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+ lea src_bakq, [srcq+wq*2]
+ neg wq
+ sub dstq, srcq
+
+.loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
+ sby, see, offxy, src_bak
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y:
+ movu m4, [grain_lutq+offxyq*2+82*0]
+ movu m5, [grain_lutq+offxyq*2+82*2]
+ call .add_noise
+ sub hb, 2
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+ cmp byte [fg_dataq+FGData.overlap_flag], 0
+ je .loop_x
+ test sbyd, sbyd
+ jnz .hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
+ sby, see, offy, src_bak, left_offxy
+
+ lea left_offxyd, [offyq+73] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
+ sby, see, offxy, src_bak, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y_h_overlap:
+ movu m4, [grain_lutq+offxyq*2+82*0]
+ movu m5, [grain_lutq+offxyq*2+82*2]
+ movd xm17, [grain_lutq+left_offxyq*2-82*1]
+ pinsrd xm17, [grain_lutq+left_offxyq*2+82*1], 1
+ punpckldq xm16, xm4, xm5
+ punpcklwd xm17, xm16
+ mova xm16, xm19
+ vpdpwssd xm16, xm20, xm17
+ psrad xm16, 1
+ packssdw xm16, xm16
+ vpsravw xm16, xm11
+ vmovdqu8 m4{k2}, m16
+ vpalignr m5{k2}, m16, m16, 4
+ call .add_noise
+ sub hb, 2
+ jg .loop_y_h_overlap
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+ test sbyd, sbyd
+ jnz .hv_overlap
+ jmp .loop_x_h_overlap
+
+.v_overlap:
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+ lea src_bakq, [srcq+wq*2]
+ neg wq
+ sub dstq, srcq
+
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
+ sby, see, offy, src_bak, _, top_offxy
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
+ sby, see, offxy, src_bak, _, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+ movu m16, [grain_lutq+offxyq*2+82*0]
+ movu m0, [grain_lutq+top_offxyq*2+82*0]
+ movu m17, [grain_lutq+offxyq*2+82*2]
+ movu m1, [grain_lutq+top_offxyq*2+82*2]
+ punpckhwd m4, m0, m16
+ punpcklwd m0, m16
+ punpckhwd m5, m1, m17
+ punpcklwd m1, m17
+ call .add_noise_v
+ sub hb, 2
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump back
+    ; to .v_overlap, and instead always fall through to .hv_overlap
+.hv_overlap:
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
+ sby, see, offy, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyd, [top_offxyq+73]
+ lea left_offxyd, [offyq+73]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
+ sby, see, offxy, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+ movu m5, [grain_lutq+offxyq*2+82*0]
+ movu m0, [grain_lutq+top_offxyq*2+82*0]
+ movd xm17, [grain_lutq+left_offxyq*2-82*1]
+ pinsrd xm17, [grain_lutq+topleft_offxyq*2-82*1], 1
+ movu m2, [grain_lutq+offxyq*2+82*2]
+ movu m1, [grain_lutq+top_offxyq*2+82*2]
+ movd xm18, [grain_lutq+left_offxyq*2+82*1]
+ pinsrd xm18, [grain_lutq+topleft_offxyq*2+82*1], 1
+ punpckldq xm16, xm5, xm0
+ punpcklwd xm17, xm16
+ mova xm16, xm19
+ vpdpwssd xm16, xm20, xm17
+ punpckldq xm17, xm2, xm1
+ punpcklwd xm18, xm17
+ mova xm17, xm19
+ vpdpwssd xm17, xm20, xm18
+ punpckhwd m4, m0, m5
+ punpcklwd m0, m5
+ punpckhwd m5, m1, m2
+ punpcklwd m1, m2
+ psrad xm16, 1
+ psrad xm17, 1
+ packssdw xm16, xm17
+ vpsravw xm16, xm11
+ vpshuflw m0{k2}, m16, q1302
+ punpckhqdq xm16, xm16
+ vpshuflw m1{k2}, m16, q1302
+ call .add_noise_v
+ sub hb, 2
+ jg .loop_y_h_overlap
+ add wq, 32
+ lea srcq, [src_bakq+wq*2]
+ jl .hv_overlap
+.end:
+ RET
+ALIGN function_align
+.add_noise_v:
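+    ; shared vertical-overlap tail: vpdpwssd accumulates rnd + top*27 + cur*17
+    ; (17/27 for the second row), and the psrad/vpsravw pair finishes the
+    ; round2 by 5 at either bit depth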
+ mova m2, m19
+ vpdpwssd m2, m12, m4
+ mova m3, m19
+ vpdpwssd m3, m13, m5
+ mova m4, m19
+ vpdpwssd m4, m12, m0
+ mova m5, m19
+ vpdpwssd m5, m13, m1
+ REPX {psrad x, 1}, m2, m3, m4, m5
+ packssdw m4, m2
+ packssdw m5, m3
+ vpsravw m4, m11
+ vpsravw m5, m11
+.add_noise:
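+    ; shared tail: gather scaling[src] for two source rows (the px <= bdmax
+    ; compares mask the high-word gathers so out-of-range values cannot index
+    ; past the scaling LUT), scale it so pmulhrsw applies scaling_shift, then
+    ; add the noise to the source and clamp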
+ mova m0, [srcq+strideq*0]
+ mova m1, [srcq+strideq*1]
+ kmovw k4, k1
+ pand m16, m6, m0
+ psrld m3, m0, 16
+ vpgatherdd m2{k4}, [scalingq+m16]
+ vpcmpud k4, m3, m6, 2 ; px <= bdmax
+ vpgatherdd m16{k4}, [scalingq+m3]
+ kmovw k4, k1
+ pand m17, m6, m1
+ vpgatherdd m3{k4}, [scalingq+m17]
+ vpshufb m2{k3}, m16, m7
+ psrld m16, m1, 16
+ vpcmpud k4, m16, m6, 2
+ vpgatherdd m17{k4}, [scalingq+m16]
+ vpshufb m3{k3}, m17, m7
+ vpsllvw m2, m10
+ vpsllvw m3, m10
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+ add grain_lutq, 82*4
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m8
+ pmaxsw m1, m8
+ pminsw m0, m9
+ pminsw m1, m9
+ mova [dstq+srcq], m0
+ add srcq, strideq
+ mova [dstq+srcq], m1
+ add srcq, strideq
+ ret
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, luma, lstride, uv_pl, is_id
+%define base r12-fg_min
+ lea r12, [fg_min]
+ mov r9d, r13m ; bdmax
+ mov r7d, [fg_dataq+FGData.scaling_shift]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r11d, is_idm
+ kxnorw k1, k1, k1 ; 0xffff
+ vpbroadcastd m5, r13m
+ mov r13, 0xeeeeeeeeeeeeeeee
+ vbroadcasti32x4 m6, [base+scale_mask]
+ shr r9d, 11 ; is_12bpc
+ vpbroadcastd m7, [base+scale_shift+r7*4-32]
+ shlx r10d, r6d, r9d
+ mov sbyd, sbym
+ shlx r6d, r6d, r11d
+ vpbroadcastd m8, [base+fg_min+r10*4]
+ lea r6d, [r9+r6*2]
+ vpbroadcastd m9, [base+fg_max+r6*4]
+ kmovq k2, r13
+ vpbroadcastd m20, [base+scale_rnd+r9*4]
+ packssdw m4, m5, m5
+ vpbroadcastd m21, [base+scale_shift+r9*8+4]
+%if %2
+ mova m12, [pb_0to63] ; pw_even
+ mov r13d, 0x0101
+ vpbroadcastq m10, [base+pw_23_22+r9*8]
+ kmovw k3, r13d
+%if %3
+ pshufd m11, m10, q0000
+%else
+ vpbroadcastd ym16, [base+pw_27_17_17_27+r9*8+0]
+ vpbroadcastd m11, [base+pw_27_17_17_27+r9*8+4]
+ vmovdqu16 m11{k1}, m16
+%endif
+ psrlw m13, m12, 8 ; pw_odd
+%else
+ vpbroadcastq m10, [base+pw_27_17_17_27+r9*8]
+ kshiftrb k3, k1, 7 ; 0x01
+ kshiftrb k4, k1, 4 ; 0x0f
+ pshufd m11, m10, q0000
+%endif
+ mov lstrideq, r10mp
+ test sbyd, sbyd
+ setnz r7b
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ _, sby, see, lstride
+
+%if %1
+ mov r6d, r11m
+ vpbroadcastd m0, [base+uv_offset_mul+r9*4]
+ vpbroadcastd m1, [base+pb_8_9_0_1]
+ vpbroadcastd m14, [fg_dataq+FGData.uv_offset+r6*4]
+ vbroadcasti32x4 m15, [fg_dataq+FGData.uv_mult+r6*4]
+ pmaddwd m14, m0
+ pshufb m15, m1 ; { uv_luma_mult, uv_mult }
+%endif
+ test r7b, [fg_dataq+FGData.overlap_flag]
+ jnz %%v_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, lstride, luma
+
+ mov lumaq, r9mp
+ lea r12, [srcq+wq*2]
+ lea r13, [dstq+wq*2]
+ lea r14, [lumaq+wq*(2<<%2)]
+ mov r9mp, r12
+ mov r10mp, r13
+ mov r11mp, r14
+ neg wq
+
+%%loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, lstride, luma
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y:
+%if %2
+ movu ym18, [grain_lutq+offxyq*2+82*0]
+ vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+%else
+ movu m18, [grain_lutq+offxyq*2+82*0]
+ movu m19, [grain_lutq+offxyq*2+82*2]
+%endif
+ call %%add_noise
+ sub hb, 2<<%2
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r10mp
+ mov lumaq, r11mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ cmp byte [fg_dataq+FGData.overlap_flag], 0
+ je %%loop_x
+ cmp dword r8m, 0 ; sby
+ jne %%hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, lstride, luma, left_offxy
+
+ lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, lstride, luma, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y_h_overlap:
+%if %2
+ movu ym18, [grain_lutq+offxyq*2+82*0]
+ vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+ movd xm16, [grain_lutq+left_offxyq*2+82*0]
+ vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2
+ movd xm17, [grain_lutq+left_offxyq*2+82*4]
+ vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2
+ punpckldq m16, m17
+ punpckldq m17, m18, m19
+ punpcklwd m16, m17
+ mova m17, m20
+ vpdpwssd m17, m16, m10
+ psrad m17, 1
+ packssdw m17, m17
+ vpsravw m17, m21
+%else
+ movu m18, [grain_lutq+offxyq*2+82*0]
+ movu m19, [grain_lutq+offxyq*2+82*2]
+ movd xm16, [grain_lutq+left_offxyq*2+82*0]
+ pinsrd xm16, [grain_lutq+left_offxyq*2+82*2], 1
+ punpckldq xm17, xm18, xm19
+ punpcklwd xm16, xm17
+ mova xm17, xm20
+ vpdpwssd xm17, xm16, xm10
+ psrad xm17, 1
+ packssdw xm17, xm17
+ vpsravw xm17, xm21
+%endif
+ vmovdqa32 m18{k3}, m17
+ vpshufd m19{k3}, m17, q0321
+ call %%add_noise
+ sub hb, 2<<%2
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r10mp
+ mov lumaq, r11mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ cmp dword r8m, 0 ; sby
+ jne %%hv_overlap
+ jmp %%loop_x_h_overlap
+
+%%v_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ _, sby, see, lstride
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, lstride, luma, _, top_offxy
+
+ mov lumaq, r9mp
+ lea r12, [srcq+wq*2]
+ lea r13, [dstq+wq*2]
+ lea r14, [lumaq+wq*(2<<%2)]
+ mov r9mp, r12
+ mov r10mp, r13
+ mov r11mp, r14
+ neg wq
+
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, lstride, luma, _, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+%if %3
+ movu ym16, [grain_lutq+offxyq*2+82*0]
+ movu ym1, [grain_lutq+top_offxyq*2+82*0]
+ vbroadcasti32x8 m18, [grain_lutq+offxyq*2+82*2]
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+ punpcklwd ym17, ym1, ym16
+ punpckhwd ym1, ym16
+%elif %2
+ movu ym18, [grain_lutq+offxyq*2+82*0]
+ vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
+ movu ym17, [grain_lutq+top_offxyq*2+82*0]
+ vinserti32x8 m17, [grain_lutq+top_offxyq*2+82*2], 1
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+ punpcklwd m16, m17, m18
+ punpckhwd m17, m18
+%else
+ movu m18, [grain_lutq+offxyq*2+82*0]
+ movu m19, [grain_lutq+top_offxyq*2+82*0]
+ movu m2, [grain_lutq+offxyq*2+82*2]
+ movu m16, [grain_lutq+top_offxyq*2+82*2]
+ punpckhwd m1, m19, m18
+ punpcklwd m19, m18
+ punpckhwd m18, m2, m16
+ punpcklwd m2, m16
+%endif
+ call %%add_noise_v
+ sub hb, 2<<%2
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r10mp
+ mov lumaq, r11mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump back
+    ; to %%v_overlap, and instead always fall through to %%hv_overlap
+%%hv_overlap:
+    ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyq, [top_offxyq+(32>>%2)]
+ lea left_offxyq, [offyq+(32>>%2)]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %2
+ movd xm16, [grain_lutq+left_offxyq*2+82*0]
+ vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2
+ movd xm17, [grain_lutq+left_offxyq*2+82*4]
+ vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2
+ movu ym18, [grain_lutq+offxyq*2+82*0]
+ vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+ punpckldq m16, m17
+ punpckldq m17, m18, m19
+ punpcklwd m16, m17
+ movu ym1, [grain_lutq+top_offxyq*2+82*0]
+ movd xm17, [grain_lutq+topleft_offxyq*2+82*0]
+ mova m0, m20
+ vpdpwssd m0, m16, m10
+%if %3
+ punpcklwd xm17, xm1
+ mova xm16, xm20
+ vpdpwssd xm16, xm17, xm10
+ psrad xm16, 1
+%else
+ vinserti32x8 m1, [grain_lutq+top_offxyq*2+82*2], 1
+ vinserti32x4 m17, [grain_lutq+topleft_offxyq*2+82*2], 2
+ punpcklwd m17, m1
+ mova m16, m20
+ vpdpwssd m16, m17, m10
+ psrad m16, 1
+%endif
+ psrad m0, 1
+ packssdw m0, m16
+ vpsravw m0, m21
+ vmovdqa32 m18{k3}, m0
+ vpshufd m19{k3}, m0, q0321
+%if %3
+ vpunpckhdq ym1{k3}, ym0, ym0
+ punpcklwd ym17, ym1, ym18
+ punpckhwd ym1, ym18
+%else
+ vpunpckhdq m1{k3}, m0, m0
+ punpcklwd m16, m1, m18
+ punpckhwd m17, m1, m18
+%endif
+%else
+ movu m18, [grain_lutq+offxyq*2+82*0]
+ movu m19, [grain_lutq+top_offxyq*2+82*0]
+ movd xm17, [grain_lutq+left_offxyq*2+82*0]
+ pinsrd xm17, [grain_lutq+topleft_offxyq*2+82*0], 1
+ punpckldq xm16, xm18, xm19
+ punpcklwd xm17, xm16
+ movu m2, [grain_lutq+offxyq*2+82*2]
+ movu m0, [grain_lutq+top_offxyq*2+82*2]
+ movd xm16, [grain_lutq+left_offxyq*2+82*2]
+ pinsrd xm16, [grain_lutq+topleft_offxyq*2+82*2], 1
+ punpckldq xm1, xm2, xm0
+ punpcklwd xm1, xm16, xm1
+ mova xm16, xm20
+ vpdpwssd xm16, xm17, xm10
+ mova xm17, xm20
+ vpdpwssd xm17, xm1, xm10
+ punpckhwd m1, m19, m18
+ punpcklwd m19, m18
+ punpckhwd m18, m2, m0
+ punpcklwd m2, m0
+ psrad xm16, 1
+ psrad xm17, 1
+ packssdw xm16, xm17
+ vpsravw xm16, xm21
+ vpshuflw m19{k4}, m16, q1302
+ punpckhqdq xm16, xm16
+ vpshuflw m2{k4}, m16, q3120
+%endif
+ call %%add_noise_v
+ sub hb, 2<<%2
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r10mp
+ mov lumaq, r11mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ jmp %%hv_overlap
+
+ALIGN function_align
+%%add_noise_v:
+%if %3
+ mova ym16, ym20
+ vpdpwssd ym16, ym17, ym11
+ mova ym17, ym20
+ vpdpwssd ym17, ym1, ym11
+ psrad ym16, 1
+ psrad ym17, 1
+ packssdw ym16, ym17
+ vpsravw m18{k1}, m16, m21
+%elif %2
+ mova m18, m20
+ vpdpwssd m18, m16, m11
+ mova m16, m20
+ vpdpwssd m16, m17, m11
+ psrad m18, 1
+ psrad m16, 1
+ packssdw m18, m16
+ vpsravw m18, m21
+%else
+ mova m16, m20
+ vpdpwssd m16, m1, m11
+ mova m17, m20
+ vpdpwssd m17, m18, m11
+ mova m18, m20
+ vpdpwssd m18, m19, m11
+ mova m19, m20
+ vpdpwssd m19, m2, m11
+ REPX {psrad x, 1}, m16, m17, m18, m19
+ packssdw m18, m16
+ packssdw m19, m17
+ vpsravw m18, m21
+ vpsravw m19, m21
+%endif
+%%add_noise:
+%if %2
+ mova m2, [lumaq+lstrideq*(0<<%3)]
+ mova m0, [lumaq+lstrideq*(1<<%3)]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ mova m3, [lumaq+lstrideq*(0<<%3)]
+ mova m1, [lumaq+lstrideq*(1<<%3)]
+ mova m16, m12
+ vpermi2w m16, m2, m0
+ vpermt2w m2, m13, m0
+ mova m17, m12
+ vpermi2w m17, m3, m1
+ vpermt2w m3, m13, m1
+ pavgw m2, m16
+ pavgw m3, m17
+%elif %1
+ mova m2, [lumaq+lstrideq*0]
+ mova m3, [lumaq+lstrideq*1]
+%endif
+%if %2
+ mova ym16, [srcq+strideq*0]
+ vinserti32x8 m16, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+%else
+ mova m16, [srcq+strideq*0]
+%endif
+%if %1
+ punpckhwd m17, m2, m16
+ mova m0, m14
+ vpdpwssd m0, m17, m15
+ punpcklwd m17, m2, m16
+ mova m2, m14
+ vpdpwssd m2, m17, m15
+%endif
+%if %2
+ mova ym17, [srcq+strideq*0]
+ vinserti32x8 m17, [srcq+strideq*1], 1
+%else
+ mova m17, [srcq+strideq*1]
+%endif
+%if %1
+ psrad m0, 6
+ psrad m2, 6
+ packusdw m2, m0
+ punpckhwd m0, m3, m17
+ mova m1, m14
+ vpdpwssd m1, m15, m0
+ punpcklwd m0, m3, m17
+ mova m3, m14
+ vpdpwssd m3, m15, m0
+ psrad m1, 6
+ psrad m3, 6
+ packusdw m3, m1
+ pminuw m2, m4
+ pminuw m3, m4
+
+.add_noise_main:
+ ; scaling[luma_src]
+ kmovw k5, k1
+ pand m1, m5, m2
+ vpgatherdd m0{k5}, [scalingq+m1]
+ kmovw k5, k1
+ psrld m2, 16
+ vpgatherdd m1{k5}, [scalingq+m2]
+ vpshufb m0{k2}, m1, m6
+ kmovw k5, k1
+ psrld m1, m3, 16
+ vpgatherdd m2{k5}, [scalingq+m1]
+ kmovw k5, k1
+ pand m3, m5
+ vpgatherdd m1{k5}, [scalingq+m3]
+ vpshufb m1{k2}, m2, m6
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ vpsllvw m0, m7
+ vpsllvw m1, m7
+ pmulhrsw m18, m0
+ pmulhrsw m19, m1
+ add grain_lutq, 82*(4<<%2)
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ lea srcq, [srcq+strideq*2]
+ paddw m16, m18
+ paddw m17, m19
+ pmaxsw m16, m8
+ pmaxsw m17, m8
+ pminsw m16, m9
+ pminsw m17, m9
+%if %2
+ mova [dstq+strideq*0], ym16
+ vextracti32x8 [dstq+strideq*1], m16, 1
+ lea dstq, [dstq+strideq*2]
+ mova [dstq+strideq*0], ym17
+ vextracti32x8 [dstq+strideq*1], m17, 1
+%else
+ mova [dstq+strideq*0], m16
+ mova [dstq+strideq*1], m17
+%endif
+ lea dstq, [dstq+strideq*2]
+ ret
+%else
+%if %2
+ pand m2, m4
+ pand m3, m4
+%else
+ pand m2, m4, [lumaq+lstrideq*0]
+ pand m3, m4, [lumaq+lstrideq*1]
+%endif
+ jmp .add_noise_main
+%endif
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+.end:
+ RET
+%endmacro
+
+FGUV_FN 420, 1, 1
+FGUV_FN 422, 1, 0
+FGUV_FN 444, 0, 0
+
+%endif
diff --git a/third_party/dav1d/src/x86/filmgrain16_sse.asm b/third_party/dav1d/src/x86/filmgrain16_sse.asm
new file mode 100644
index 0000000000..6b0daaac0b
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain16_sse.asm
@@ -0,0 +1,3421 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+SECTION_RODATA 16
+pd_16: times 4 dd 16
+pw_1: times 8 dw 1
+pw_16384: times 8 dw 16384
+pw_8192: times 8 dw 8192
+pw_23_22: dw 23, 22
+ times 3 dw 0, 32
+pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
+pw_27_17_17_27: dw 27, 17, 17, 27
+ times 2 dw 0, 32
+rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
+pw_seed_xor: times 2 dw 0xb524
+ times 2 dw 0x49d8
+pb_1: times 4 db 1
+hmul_bits: dw 32768, 16384, 8192, 4096
+round: dw 2048, 1024, 512
+mul_bits: dw 256, 128, 64, 32, 16
+round_vals: dw 32, 64, 128, 256, 512, 1024
+max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16
+min: dw 0, 16*4, 16*16
+; these two should be next to each other
+pw_4: times 2 dw 4
+pw_16: times 2 dw 16
+
+%macro JMP_TABLE 1-*
+ %xdefine %1_table %%table
+ %xdefine %%base %1_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ %%table:
+ %rep %0 - 1
+ dd %%prefix %+ .ar%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3
+
+SECTION .text
+
+%if ARCH_X86_32
+%undef base
+%define PIC_ptr(a) base+a
+%else
+%define PIC_ptr(a) a
+%endif
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg
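+; scalar emulation of a word gather: the packed word indices in %2 are extracted
+; two at a time through a GPR and the addressed words from %3 are pinsrw'd into %1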
+%assign %%idx 0
+%define %%tmp %2
+%if %0 == 8
+%define %%tmp %8
+%endif
+%rep (%6/2)
+%if %%idx == 0
+ movd %5 %+ d, %2
+ pshuflw %%tmp, %2, q3232
+%else
+ movd %5 %+ d, %%tmp
+%if %6 == 8
+%if %%idx == 2
+ punpckhqdq %%tmp, %%tmp
+%elif %%idx == 4
+ psrlq %%tmp, 32
+%endif
+%endif
+%endif
+ movzx %4 %+ d, %5 %+ w
+ shr %5 %+ d, 16
+
+%if %%idx == 0
+ movd %1, [%3+%4*%7]
+%else
+ pinsrw %1, [%3+%4*%7], %%idx + 0
+%endif
+ pinsrw %1, [%3+%5*%7], %%idx + 1
+%assign %%idx %%idx+2
+%endrep
+%endmacro
+
+%macro SPLATD 2 ; dst, src
+%ifnidn %1, %2
+ movd %1, %2
+%endif
+ pshufd %1, %1, q0000
+%endmacro
+
+%macro SPLATW 2 ; dst, src
+%ifnidn %1, %2
+ movd %1, %2
+%endif
+ pshuflw %1, %1, q0000
+ punpcklqdq %1, %1
+%endmacro
+
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax
+ lea r4, [pb_mask]
+%define base r4-pb_mask
+%else
+cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax
+ LEA r4, $$
+%define base r4-$$
+%endif
+ movq m1, [base+rnd_next_upperbit_mask]
+ movq m4, [base+mul_bits]
+ movq m7, [base+hmul_bits]
+ mov r3d, [fg_dataq+FGData.grain_scale_shift]
+ lea r5d, [bdmaxq+1]
+ shr r5d, 11 ; 0 for 10bpc, 2 for 12bpc
+ sub r3, r5
+ SPLATW m6, [base+round+r3*2-2]
+ mova m5, [base+pb_mask]
+ SPLATW m0, [fg_dataq+FGData.seed]
+ mov r3, -73*82*2
+ sub bufq, r3
+%if ARCH_X86_64
+ lea r6, [gaussian_sequence]
+%endif
+.loop:
+ pand m2, m0, m1
+ psrlw m3, m2, 10
+ por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw m2, m4 ; bits 0x0f00 are set
+ pshufb m3, m5, m2 ; set 15th bit for next 4 seeds
+ psllq m2, m3, 30
+ por m2, m3
+ psllq m3, m2, 15
+ por m2, m3 ; aggregate each bit into next seed's high bit
+ pmulhuw m3, m0, m7
+ por m2, m3 ; 4 next output seeds
+ pshuflw m0, m2, q3333
+ psrlw m2, 5
+%if ARCH_X86_64
+ vpgatherdw m3, m2, r6, r5, r7, 4, 2
+%else
+ vpgatherdw m3, m2, base+gaussian_sequence, r5, r2, 4, 2
+%endif
+ paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0
+ ; shifts by 0, which pmulhrsw does not support
+ pmulhrsw m3, m6
+ movq [bufq+r3], m3
+ add r3, 4*2
+ jl .loop
+
+ ; auto-regression code
+ movsxd r3, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4]
+ lea r3, [r3+base+generate_grain_y_16bpc_ssse3_table]
+ jmp r3
+
+.ar1:
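+    ; lag-1 auto-regression: each sample gets
+    ; round2(cf0*topleft + cf1*top + cf2*topright + cf3*left, shift) added to
+    ; the Gaussian noise already in the buffer, clamped to the signed grain
+    ; range; the dependency on the freshly written left neighbour keeps the
+    ; inner loop serial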
+%if WIN64
+ DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0
+ lea bufq, [r0-2*(82*73-(82*3+79))]
+ PUSH r8
+%else
+%if ARCH_X86_64
+ DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
+%else ; x86-32
+ DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0
+ PUSH r6
+%define shiftd r1d
+%endif
+ sub bufq, 2*(82*73-(82*3+79))
+%endif
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
+ movd m4, [fg_dataq+FGData.ar_coeffs_y]
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+%if WIN64
+ DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0
+%elif ARCH_X86_64
+ DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
+%else ; x86-32
+%undef shiftd
+ DEFINE_ARGS buf, shift, min, val3, x, cf3, val0
+%define hd dword r0m
+%define maxd dword minm
+%endif
+%if cpuflag(sse4)
+ pmovsxbw m4, m4
+%else
+ pxor m3, m3
+ pcmpgtb m3, m4
+ punpcklbw m4, m3
+%endif
+ pinsrw m4, [base+pw_1], 3
+ pshufd m5, m4, q1111
+ pshufd m4, m4, q0000
+ SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd
+ mov hd, 70
+ sar maxd, 1
+ mov mind, maxd
+ xor mind, -1
+.y_loop_ar1:
+ mov xq, -76
+ movsx val3d, word [bufq+xq*2-2]
+.x_loop_ar1:
+ movu m0, [bufq+xq*2-82*2-2] ; top/left
+ psrldq m2, m0, 2 ; top
+ psrldq m1, m0, 4 ; top/right
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ paddd m0, m1
+.x_loop_ar1_inner:
+ movd val0d, m0
+ psrldq m0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sar val3d, shiftb
+ movsx val0d, word [bufq+xq*2]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovg val3d, maxd
+ cmp val3d, mind
+ cmovl val3d, mind
+ mov word [bufq+xq*2], val3w
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar1
+%if WIN64
+ POP r8
+%elif ARCH_X86_32
+ POP r6
+%undef maxd
+%undef hd
+%endif
+.ar0:
+ RET
+
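+    ; AR(2): 12 coefficients covering x=[-2,+2] of the two rows above plus
+    ; x=[-2,-1] of the current row; the rounded, shifted sum is added to the
+    ; stored grain and clamped as in AR(1)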
+.ar2:
+%if ARCH_X86_32
+%assign stack_offset_old stack_offset
+ ALLOC_STACK -16*8
+%endif
+ DEFINE_ARGS buf, fg_data, bdmax, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m0, [base+round_vals-12+shiftq*2]
+ pshuflw m0, m0, q0000
+ movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11
+ pxor m2, m2
+ punpcklwd m0, m2
+ pcmpgtb m2, m6
+ punpckhbw m3, m6, m2
+ punpcklbw m6, m2
+ pshufd m2, m6, q3333
+ pshufd m1, m6, q2222
+ pshufd m7, m6, q1111
+ pshufd m6, m6, q0000
+ pshufd m4, m3, q1111
+ pshufd m3, m3, q0000
+%if ARCH_X86_64
+ SWAP 0, 12
+ SWAP 1, 8
+ SWAP 2, 9
+ SWAP 3, 10
+ SWAP 4, 11
+%else
+%define m12 [rsp+0*16]
+%define m8 [rsp+1*16]
+%define m9 [rsp+2*16]
+%define m10 [rsp+3*16]
+%define m11 [rsp+4*16]
+ mova m12, m0
+ mova m8, m1
+ mova m9, m2
+ mova m10, m3
+ mova m11, m4
+ mov bdmaxd, bdmaxm
+%endif
+ sar bdmaxd, 1
+ SPLATW m0, bdmaxd ; max_grain
+ pcmpeqw m1, m1
+%if !cpuflag(sse4)
+ pcmpeqw m2, m2
+ psrldq m2, 14
+ pslldq m2, 2
+ pxor m2, m1
+%endif
+ pxor m1, m0 ; min_grain
+%if ARCH_X86_64
+ SWAP 0, 13
+ SWAP 1, 14
+ SWAP 2, 15
+%else
+%define m13 [rsp+5*16]
+%define m14 [rsp+6*16]
+ mova m13, m0
+ mova m14, m1
+%if !cpuflag(sse4)
+%define m15 [rsp+7*16]
+ mova m15, m2
+%endif
+%endif
+ sub bufq, 2*(82*73-(82*3+79))
+ DEFINE_ARGS buf, fg_data, h, x
+ mov hd, 70
+.y_loop_ar2:
+ mov xq, -76
+
+.x_loop_ar2:
+ movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
+ movu m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
+ psrldq m2, m0, 2
+ psrldq m3, m0, 4
+ psrldq m4, m0, 6
+ psrldq m5, m0, 8
+ punpcklwd m0, m2
+ punpcklwd m3, m4
+ punpcklwd m5, m1
+ psrldq m2, m1, 2
+ psrldq m4, m1, 4
+ punpcklwd m2, m4
+ psrldq m4, m1, 6
+ psrldq m1, 8
+ punpcklwd m4, m1
+ pmaddwd m0, m6
+ pmaddwd m3, m7
+ pmaddwd m5, m8
+ pmaddwd m2, m9
+ pmaddwd m4, m10
+ paddd m0, m3
+ paddd m5, m2
+ paddd m0, m4
+ paddd m0, m5 ; accumulated top 2 rows
+ paddd m0, m12
+
+ movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5]
+ pshufd m4, m1, q3321
+ pxor m2, m2
+ pcmpgtw m2, m4
+ punpcklwd m4, m2 ; in dwords, y=0,x=[0,3]
+.x_loop_ar2_inner:
+ pmaddwd m2, m1, m11
+ paddd m2, m0
+ psrldq m0, 4 ; shift top to next pixel
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ paddd m2, m4
+ packssdw m2, m2
+ pminsw m2, m13
+ pmaxsw m2, m14
+ psrldq m4, 4
+ pslldq m2, 2
+ psrldq m1, 2
+%if cpuflag(sse4)
+ pblendw m1, m2, 00000010b
+%else
+ pand m1, m15
+ pandn m3, m15, m2
+ por m1, m3
+%endif
+ ; overwrite previous pixel, this should be ok
+ movd [bufq+xq*2-2], m1
+ inc xq
+ jz .x_loop_ar2_end
+ test xq, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar2
+%if ARCH_X86_32
+%undef m8
+%undef m9
+%undef m10
+%undef m11
+%undef m12
+%undef m13
+%undef m14
+%undef m15
+%endif
+ RET
+
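+    ; AR(3): 24 coefficients covering x=[-3,+3] of the three rows above plus
+    ; x=[-3,-1] of the current row, applied the same way as AR(1)/AR(2)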
+.ar3:
+ DEFINE_ARGS buf, fg_data, bdmax, shift
+%if WIN64
+ mov r6, rsp
+ and rsp, ~15
+ sub rsp, 64
+ %define tmp rsp
+%elif ARCH_X86_64
+ %define tmp rsp+stack_offset-72
+%else
+%assign stack_offset stack_offset_old
+ ALLOC_STACK -16*12
+ %define tmp rsp
+ mov bdmaxd, bdmaxm
+%endif
+ sar bdmaxd, 1
+ SPLATW m7, bdmaxd ; max_grain
+ pcmpeqw m6, m6
+%if !cpuflag(sse4)
+ pcmpeqw m4, m4
+ psrldq m4, 14
+ pslldq m4, 4
+ pxor m4, m6
+%endif
+ pxor m6, m7 ; min_grain
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+
+%if ARCH_X86_64
+ SWAP 6, 14
+ SWAP 7, 15
+%else
+%define m14 [rsp+10*16]
+%define m15 [esp+11*16]
+ mova m14, m6
+ mova m15, m7
+%endif
+
+    ; build cf0-1 through cf18-19 in m5-m12 and tmp[0]/tmp[16]
+ pxor m1, m1
+ movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
+ pcmpgtb m1, m0
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+
+%if cpuflag(sse4)
+ pshufd m4, m2, q3333
+%else
+ pshufd m5, m2, q3333
+ mova [tmp+48], m5
+%endif
+ pshufd m3, m2, q2222
+ pshufd m1, m2, q0000
+ pshufd m2, m2, q1111
+ pshufd m7, m0, q2222
+ pshufd m6, m0, q1111
+ pshufd m5, m0, q0000
+ pshufd m0, m0, q3333
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+ SWAP 4, 12
+%else
+%define m8 [rsp+4*16]
+%define m9 [esp+5*16]
+%define m10 [rsp+6*16]
+%define m11 [esp+7*16]
+%define m12 [rsp+8*16]
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+ mova m12, m4
+%endif
+
+    ; build cf20,round in tmp[32]
+ ; build cf21-23,round*2 in m13
+ pxor m1, m1
+ movq m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
+ pcmpgtb m1, m0
+ punpcklbw m0, m1
+ pshufd m1, m0, q0000
+ pshufd m2, m0, q1111
+ mova [tmp+ 0], m1
+ mova [tmp+16], m2
+ psrldq m3, m0, 10
+ pinsrw m3, [base+round_vals+shiftq*2-10], 3
+
+%if ARCH_X86_64
+ SWAP 3, 13
+%else
+%define m13 [esp+9*16]
+ mova m13, m3
+%endif
+
+ pinsrw m0, [base+round_vals+shiftq*2-12], 5
+ pshufd m3, m0, q2222
+ mova [tmp+32], m3
+
+ DEFINE_ARGS buf, fg_data, h, x
+ sub bufq, 2*(82*73-(82*3+79))
+ mov hd, 70
+.y_loop_ar3:
+ mov xq, -76
+
+.x_loop_ar3:
+ movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
+ movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6]
+ palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5]
+ palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6]
+ punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+
+ pmaddwd m0, m5
+ pmaddwd m2, m6
+ pmaddwd m3, m7
+ paddd m0, m2
+ paddd m0, m3
+ ; m0 = top line first 6 multiplied by cf, m1 = top line last entry
+
+ movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4]
+ movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6]
+ punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
+    palignr              m4, m3, m2, 2          ; y=-2,x=[-2,+5]
+    palignr              m3, m3, m2, 4          ; y=-2,x=[-1,+6]
+ punpckhwd m2, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
+ punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+ shufps m3, m4, m2, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+
+ pmaddwd m1, m8
+ pmaddwd m4, m9
+ pmaddwd m3, m10
+ pmaddwd m2, m11
+ paddd m1, m4
+ paddd m3, m2
+ paddd m0, m1
+ paddd m0, m3
+ ; m0 = top 2 lines multiplied by cf
+
+ movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
+ movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6]
+ palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5]
+ palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6]
+ punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+ punpcklwd m2, [base+pw_1]
+
+%if cpuflag(sse4)
+ pmaddwd m1, m12
+%else
+ pmaddwd m1, [tmp+48]
+%endif
+ pmaddwd m3, [tmp+ 0]
+ pmaddwd m4, [tmp+16]
+ pmaddwd m2, [tmp+32]
+ paddd m1, m3
+ paddd m4, m2
+ paddd m0, m1
+ paddd m0, m4
+ ; m0 = top 3 lines multiplied by cf plus rounding for downshift
+
+ movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
+.x_loop_ar3_inner:
+ pmaddwd m2, m1, m13
+ pshufd m3, m2, q1111
+ paddd m2, m3 ; left+cur
+ paddd m2, m0 ; add top
+ psrldq m0, 4
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ packssdw m2, m2
+ pminsw m2, m15
+ pmaxsw m2, m14
+ pslldq m2, 4
+ psrldq m1, 2
+%if cpuflag(sse4)
+ pblendw m1, m2, 00000100b
+%else
+ pand m1, m12
+ pandn m3, m12, m2
+ por m1, m3
+%endif
+ ; overwrite a couple of pixels, should be ok
+ movq [bufq+xq*2-4], m1
+ inc xq
+ jz .x_loop_ar3_end
+ test xq, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar3
+%if WIN64
+ mov rsp, r6
+%elif ARCH_X86_32
+%undef m8
+%undef m9
+%undef m10
+%undef m11
+%undef m12
+%undef m13
+%undef m14
+%undef m15
+%endif
+ RET
+
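+; Chroma grain template generation, instantiated below for 4:2:0, 4:2:2 and
+; 4:4:4 (ss_x/ss_y). The filled area is 44x38 for 4:2:0, 44x73 for 4:2:2 and
+; 82x73 for 4:4:4 (the row stride stays 82 words); the seed is the frame seed
+; xored with a per-plane constant from pw_seed_xor, and the AR filters take
+; an extra term from the co-located (averaged when subsampled) luma grain in
+; bufy.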
+%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg
+%define base r8-pb_mask
+ lea r8, [pb_mask]
+ movifnidn bdmaxd, bdmaxm
+ lea r6d, [bdmaxq+1]
+%else
+cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h
+%define base r2-$$
+ LEA r2, $$
+ mov fg_dataq, r2m
+ mov r6d, r4m
+ inc r6d
+%endif
+ movq m1, [base+rnd_next_upperbit_mask]
+ movq m4, [base+mul_bits]
+ movq m7, [base+hmul_bits]
+ mov r5d, [fg_dataq+FGData.grain_scale_shift]
+ shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc
+ sub r5, r6
+ SPLATW m6, [base+round+r5*2-2]
+ mova m5, [base+pb_mask]
+ SPLATW m0, [fg_dataq+FGData.seed]
+%if ARCH_X86_64
+ SPLATW m2, [base+pw_seed_xor+uvq*4]
+%else
+ mov r5d, r3m
+ SPLATW m2, [base+pw_seed_xor+r5*4]
+%endif
+ pxor m0, m2
+%if ARCH_X86_64
+ lea r6, [gaussian_sequence]
+%endif
+%if %2
+ mov hd, 73-35*%3
+ add bufq, 44*2
+.loop_y:
+ mov xq, -44
+%else
+ mov xq, -82*73
+ add bufq, 82*73*2
+%endif
+.loop_x:
+ pand m2, m0, m1
+ psrlw m3, m2, 10
+ por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw m2, m4 ; bits 0x0f00 are set
+ pshufb m3, m5, m2 ; set 15th bit for next 4 seeds
+ psllq m2, m3, 30
+ por m2, m3
+ psllq m3, m2, 15
+ por m2, m3 ; aggregate each bit into next seed's high bit
+ pmulhuw m3, m0, m7
+ por m2, m3 ; 4 next output seeds
+ pshuflw m0, m2, q3333
+ psrlw m2, 5
+%if ARCH_X86_64
+ vpgatherdw m3, m2, r6, r9, r10, 4, 2
+%else
+ vpgatherdw m3, m2, base+gaussian_sequence, r5, r6, 4, 2
+%endif
+ paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0
+ ; shifts by 0, which pmulhrsw does not support
+ pmulhrsw m3, m6
+ movq [bufq+xq*2], m3
+ add xq, 4
+ jl .loop_x
+%if %2
+ add bufq, 82*2
+ dec hd
+ jg .loop_y
+%endif
+
+ ; auto-regression code
+ movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4]
+ lea r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table]
+ jmp r5
+
+.ar0:
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+%else
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
+%assign stack_offset_old stack_offset
+ ALLOC_STACK -16*2
+ mov bufyq, r1m
+ mov uvd, r3m
+%endif
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ SPLATW m3, [base+hmul_bits+shiftq*2-10]
+%if ARCH_X86_64
+ sar bdmaxd, 1
+    SPLATW               m1, bdmaxd             ; max_grain
+%else
+ SPLATW m1, r4m
+ psraw m1, 1
+%endif
+ pcmpeqw m7, m7
+ pxor m7, m1 ; min_grain
+%if ARCH_X86_64
+ SWAP 1, 14
+ DEFINE_ARGS buf, bufy, h, x
+%else
+%define m14 [rsp+0*16]
+ mova m14, m1
+ DEFINE_ARGS buf, bufy, pic_reg, h, x
+%endif
+ pxor m5, m5
+ pcmpgtb m5, m4
+ punpcklbw m4, m5
+%if %2
+ SPLATW m6, [base+hmul_bits+2+%3*2]
+%endif
+ SPLATW m4, m4
+ pxor m5, m5
+%if %2
+%if !cpuflag(sse4)
+ pcmpeqw m2, m2
+ pslldq m2, 12
+%if ARCH_X86_64
+ SWAP 2, 12
+%else
+%define m12 [rsp+1*16]
+ mova m12, m2
+%endif
+%endif
+%endif
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+82-(82*3+41))
+%else
+ sub bufq, 2*(82*70-3)
+%endif
+ add bufyq, 2*(3+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar0:
+ ; first 32 pixels
+ xor xd, xd
+.x_loop_ar0:
+ movu m0, [bufyq+xq*(2<<%2)]
+%if %2
+%if %3
+ movu m2, [bufyq+xq*4+82*2]
+ paddw m0, m2
+%endif
+ movu m1, [bufyq+xq*4 +16]
+%if %3
+ movu m2, [bufyq+xq*4+82*2+16]
+ paddw m1, m2
+%endif
+ phaddw m0, m1
+ pmulhrsw m0, m6
+%endif
+ punpckhwd m1, m0, m5
+ punpcklwd m0, m5
+ REPX {pmaddwd x, m4}, m0, m1
+ REPX {psrad x, 5}, m0, m1
+ packssdw m0, m1
+ pmulhrsw m0, m3
+ movu m1, [bufq+xq*2]
+ paddw m0, m1
+ pminsw m0, m14
+ pmaxsw m0, m7
+ cmp xd, 72-40*%2
+ je .end
+ movu [bufq+xq*2], m0
+ add xd, 8
+ jmp .x_loop_ar0
+
+ ; last 6/4 pixels
+.end:
+%if %2
+%if cpuflag(sse4)
+ pblendw m0, m1, 11000000b
+%else
+ pand m1, m12
+ pandn m2, m12, m0
+ por m0, m1, m2
+%endif
+ movu [bufq+xq*2], m0
+%else
+ movq [bufq+xq*2], m0
+%endif
+
+ add bufq, 82*2
+ add bufyq, 82*(2<<%3)
+ dec hd
+ jg .y_loop_ar0
+%if ARCH_X86_32
+%undef m12
+%undef m14
+%endif
+ RET
+
+.ar1:
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x
+%else
+%assign stack_offset stack_offset_old
+%xdefine rstk rsp
+%assign stack_size_padded 0
+ DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3
+ mov bufyq, r1m
+ mov uvd, r3m
+%endif
+ imul uvd, 28
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+ movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+%if WIN64
+ DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0
+%if %2
+ lea bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))]
+%else
+ lea bufq, [r0-2*(82*69+3)]
+%endif
+%else
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0
+%else
+ DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3
+%define hd dword r1m
+%define mind dword r3m
+%define maxd dword r4m
+%endif
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+%endif
+%if ARCH_X86_64
+ mov shiftd, [r2+FGData.ar_coeff_shift]
+%else
+ mov shiftd, [r3+FGData.ar_coeff_shift]
+%endif
+ pxor m5, m5
+ pcmpgtb m5, m4
+ punpcklbw m4, m5 ; cf0-4 in words
+ pshuflw m4, m4, q2100
+ psrldq m4, 2 ; cf0-3,4 in words
+ pshufd m5, m4, q1111
+ pshufd m4, m4, q0000
+ movd m3, [base+round_vals+shiftq*2-12] ; rnd
+ pxor m6, m6
+ punpcklwd m3, m6
+%if %2
+ SPLATW m6, [base+hmul_bits+2+%3*2]
+%endif
+ SPLATD m3, m3
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+ sar maxd, 1
+%if ARCH_X86_64
+ mov mind, maxd
+ xor mind, -1
+%else
+ DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3
+ mov r2, maxd
+ xor r2, -1
+ mov mind, r2
+%endif
+.y_loop_ar1:
+ mov xq, -(76>>%2)
+ movsx val3d, word [bufq+xq*2-2]
+.x_loop_ar1:
+ movu m0, [bufq+xq*2-82*2-2] ; top/left
+%if %2
+ movu m7, [bufyq+xq*4]
+%if %3
+ movu m1, [bufyq+xq*4+82*2]
+ phaddw m7, m1
+%else
+ phaddw m7, m7
+%endif
+%else
+ movq m7, [bufyq+xq*2]
+%endif
+ psrldq m2, m0, 2 ; top
+ psrldq m1, m0, 4 ; top/right
+ punpcklwd m0, m2
+%if %2
+%if %3
+ pshufd m2, m7, q3232
+ paddw m7, m2
+%endif
+ pmulhrsw m7, m6
+%endif
+ punpcklwd m1, m7
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ paddd m0, m1
+ paddd m0, m3
+.x_loop_ar1_inner:
+ movd val0d, m0
+ psrldq m0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sar val3d, shiftb
+ movsx val0d, word [bufq+xq*2]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovg val3d, maxd
+ cmp val3d, mind
+ cmovl val3d, mind
+ mov word [bufq+xq*2], val3w
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar1
+%if ARCH_X86_32
+%undef maxd
+%undef mind
+%undef hd
+%endif
+ RET
+
+.ar2:
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+%else
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
+ ALLOC_STACK -16*8
+ mov bufyq, r1m
+ mov uvd, r3m
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+%if ARCH_X86_64
+ sar bdmaxd, 1
+ SPLATW m5, bdmaxd ; max_grain
+%else
+ SPLATW m5, r4m
+ psraw m5, 1
+%endif
+ pcmpeqw m6, m6
+%if !cpuflag(sse4)
+ pcmpeqw m7, m7
+ psrldq m7, 14
+ pslldq m7, 2
+ pxor m7, m6
+%endif
+ pxor m6, m5 ; min_grain
+%if %2 && cpuflag(sse4)
+ SPLATW m7, [base+hmul_bits+2+%3*2]
+%endif
+
+%if ARCH_X86_64
+ SWAP 5, 13
+ SWAP 6, 14
+ SWAP 7, 15
+%else
+%define m13 [rsp+5*16]
+%define m14 [rsp+6*16]
+%define m15 [rsp+7*16]
+ mova m13, m5
+ mova m14, m6
+ mova m15, m7
+%endif
+
+ ; coef values
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pinsrw m2, [base+round_vals-12+shiftq*2], 5
+
+ pshufd m6, m0, q0000
+ pshufd m7, m0, q1111
+ pshufd m1, m0, q3333
+ pshufd m0, m0, q2222
+ pshufd m3, m2, q1111
+ pshufd m4, m2, q2222
+ pshufd m2, m2, q0000
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+ SWAP 4, 12
+%else
+%define m8 [rsp+0*16]
+%define m9 [rsp+1*16]
+%define m10 [rsp+2*16]
+%define m11 [rsp+3*16]
+%define m12 [rsp+4*16]
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+ mova m12, m4
+%endif
+
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, h, x
+%else
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
+%endif
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar2:
+ mov xq, -(76>>%2)
+
+.x_loop_ar2:
+ movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
+ movu m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
+ psrldq m4, m0, 2 ; y=-2,x=[-1,+5]
+    psrldq               m1, m0, 4              ; y=-2,x=[+0,+5]
+ psrldq m3, m0, 6 ; y=-2,x=[+1,+5]
+ psrldq m2, m0, 8 ; y=-2,x=[+2,+5]
+ punpcklwd m0, m4 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+ punpcklwd m1, m3 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ punpcklwd m2, m5 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1]
+ pmaddwd m0, m6
+ pmaddwd m1, m7
+ pmaddwd m2, m8
+ paddd m0, m1
+ paddd m0, m2
+ psrldq m3, m5, 2 ; y=-1,x=[-1,+5]
+    psrldq               m1, m5, 4              ; y=-1,x=[+0,+5]
+ psrldq m4, m5, 6 ; y=-1,x=[+1,+5]
+ psrldq m2, m5, 8 ; y=-1,x=[+2,+5]
+ punpcklwd m3, m1
+ punpcklwd m4, m2
+ pmaddwd m3, m9
+ pmaddwd m4, m10
+ paddd m3, m4
+ paddd m0, m3
+
+ ; luma component & rounding
+%if %2
+ movu m1, [bufyq+xq*4]
+%if %3
+ movu m2, [bufyq+xq*4+82*2]
+ phaddw m1, m2
+ pshufd m2, m1, q3232
+ paddw m1, m2
+%else
+ phaddw m1, m1
+%endif
+%if cpuflag(sse4)
+ pmulhrsw m1, m15
+%elif %3
+ pmulhrsw m1, [base+pw_8192]
+%else
+ pmulhrsw m1, [base+pw_16384]
+%endif
+%else
+ movq m1, [bufyq+xq*2]
+%endif
+ punpcklwd m1, [base+pw_1]
+ pmaddwd m1, m12
+ paddd m0, m1
+
+ movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5]
+ pshufd m2, m1, q3321
+ pxor m3, m3
+ pcmpgtw m3, m2
+ punpcklwd m2, m3 ; y=0,x=[0,3] in dword
+.x_loop_ar2_inner:
+ pmaddwd m3, m1, m11
+ paddd m3, m0
+ psrldq m0, 4 ; shift top to next pixel
+ psrad m3, [fg_dataq+FGData.ar_coeff_shift]
+ ; we do not need to packssdw since we only care about one value
+ paddd m3, m2
+ packssdw m3, m3
+ pminsw m3, m13
+ pmaxsw m3, m14
+ psrldq m1, 2
+ pslldq m3, 2
+ psrldq m2, 4
+%if cpuflag(sse4)
+ pblendw m1, m3, 00000010b
+%else
+ pand m1, m15
+ pandn m4, m15, m3
+ por m1, m4
+%endif
+ ; overwrite previous pixel, should be ok
+ movd [bufq+xq*2-2], m1
+ inc xq
+ jz .x_loop_ar2_end
+ test xq, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar2
+%if ARCH_X86_32
+%undef m13
+%undef m14
+%undef m15
+%endif
+ RET
+
+.ar3:
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+%if WIN64
+ mov r6, rsp
+ and rsp, ~15
+ sub rsp, 96
+ %define tmp rsp
+%else
+ %define tmp rsp+stack_offset-120
+%endif
+%else
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
+%assign stack_offset stack_offset_old
+ ALLOC_STACK -16*14
+ mov bufyq, r1m
+ mov uvd, r3m
+ %define tmp rsp
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ SPLATW m4, [base+round_vals-12+shiftq*2]
+ pxor m5, m5
+ pcmpgtw m5, m4
+ punpcklwd m4, m5
+%if ARCH_X86_64
+ sar bdmaxd, 1
+ SPLATW m6, bdmaxd ; max_grain
+%else
+ SPLATW m6, r4m
+ psraw m6, 1
+%endif
+ pcmpeqw m7, m7
+%if !cpuflag(sse4)
+ pcmpeqw m3, m3
+ psrldq m3, 14
+ pslldq m3, 4
+ pxor m3, m7
+%endif
+ pxor m7, m6 ; min_grain
+%if %2 && cpuflag(sse4)
+ SPLATW m3, [base+hmul_bits+2+%3*2]
+%endif
+
+%if ARCH_X86_64
+ SWAP 3, 11
+ SWAP 4, 12
+ SWAP 6, 14
+ SWAP 7, 15
+%else
+%define m11 [rsp+ 9*16]
+%define m12 [rsp+10*16]
+%define m14 [rsp+12*16]
+%define m15 [rsp+13*16]
+ mova m11, m3
+ mova m12, m4
+ mova m14, m6
+ mova m15, m7
+%endif
+
+    ; cf from y=-3,x=-3 until y=-1,x=-2
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pshufd m1, m0, q0000
+ pshufd m3, m0, q1111
+ pshufd m4, m0, q2222
+ pshufd m0, m0, q3333
+ pshufd m5, m2, q0000
+ pshufd m6, m2, q1111
+ mova [tmp+16*0], m1
+ mova [tmp+16*1], m3
+ mova [tmp+16*2], m4
+ mova [tmp+16*3], m0
+ mova [tmp+16*4], m5
+ mova [tmp+16*5], m6
+ pshufd m6, m2, q2222
+ pshufd m7, m2, q3333
+
+ ; cf from y=-1,x=-1 to y=0,x=-1 + luma component
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpckhbw m2, m0, m1 ; luma
+ punpcklbw m0, m1
+ pshufd m3, m0, q3232
+ psrldq m5, m0, 10
+ ; y=0,x=[-3 to -1] + "1.0" for current pixel
+ pinsrw m5, [base+round_vals-10+shiftq*2], 3
+ ; y=-1,x=[-1 to +2]
+ pshufd m1, m0, q0000
+ pshufd m0, m0, q1111
+ ; y=-1,x=+3 + luma
+ punpcklwd m3, m2
+ pshufd m3, m3, q0000
+
+%if ARCH_X86_64
+ SWAP 1, 8
+ SWAP 0, 9
+ SWAP 3, 10
+ SWAP 5, 13
+ DEFINE_ARGS buf, bufy, fg_data, h, x
+%else
+%define m8 [rsp+ 6*16]
+%define m9 [rsp+ 7*16]
+%define m10 [rsp+ 8*16]
+%define m13 [rsp+11*16]
+ mova m8, m1
+ mova m9, m0
+ mova m10, m3
+ mova m13, m5
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
+%endif
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar3:
+ mov xq, -(76>>%2)
+
+.x_loop_ar3:
+ ; first line
+ movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
+ movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6]
+ palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5]
+ palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6]
+ punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+
+ pmaddwd m0, [tmp+0*16]
+ pmaddwd m2, [tmp+1*16]
+ pmaddwd m3, [tmp+2*16]
+ paddd m0, m2
+ paddd m0, m3 ; first 6 x of top y
+
+ ; second line [m0/1 are busy]
+ movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4]
+ movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6]
+ punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
+ palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5]
+    palignr              m3, m3, m2, 4          ; y=-2,x=[-1,+6]
+ punpckhwd m5, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
+ punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+    shufps               m3, m4, m5, q1032      ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ pmaddwd m1, [tmp+3*16]
+ pmaddwd m4, [tmp+4*16]
+ pmaddwd m3, [tmp+5*16]
+ pmaddwd m5, m6
+ paddd m1, m4
+ paddd m3, m5
+ paddd m0, m1
+ paddd m0, m3 ; top 2 lines
+
+ ; third line [m0 is busy] & luma + round
+ movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
+ movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6]
+%if %2
+ movu m5, [bufyq+xq*4]
+%if %3
+ movu m4, [bufyq+xq*4+82*2]
+ phaddw m5, m4
+%else
+ phaddw m5, m5
+%endif
+%else
+ movq m5, [bufyq+xq*2]
+%endif
+ palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5]
+ palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6]
+%if %3
+ pshufd m4, m5, q3232
+ paddw m5, m4
+%endif
+%if %2
+%if cpuflag(sse4)
+ pmulhrsw m5, m11
+%elif %3
+ pmulhrsw m5, [base+pw_8192]
+%else
+ pmulhrsw m5, [base+pw_16384]
+%endif
+%endif
+ punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+ punpcklwd m2, m5
+ pmaddwd m1, m7
+ pmaddwd m3, m8
+ pmaddwd m4, m9
+ pmaddwd m2, m10
+ paddd m1, m3
+ paddd m4, m2
+ paddd m0, m12 ; += round
+ paddd m1, m4
+ paddd m0, m1
+
+ movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
+.x_loop_ar3_inner:
+ pmaddwd m2, m1, m13
+ pshufd m3, m2, q1111
+ paddd m2, m3 ; left+cur
+ paddd m2, m0 ; add top
+ psrldq m0, 4
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ packssdw m2, m2
+ pminsw m2, m14
+ pmaxsw m2, m15
+ pslldq m2, 4
+ psrldq m1, 2
+%if cpuflag(sse4)
+ pblendw m1, m2, 00000100b
+%else
+ pand m1, m11
+ pandn m3, m11, m2
+ por m1, m3
+%endif
+ ; overwrite previous pixels, should be ok
+ movq [bufq+xq*2-4], m1
+ inc xq
+ jz .x_loop_ar3_end
+ test xq, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar3
+%if WIN64
+ mov rsp, r6
+%elif ARCH_X86_32
+%undef m8
+%undef m9
+%undef m10
+%undef m11
+%undef m12
+%undef m13
+%undef m14
+%undef m15
+%endif
+ RET
+%endmacro
+
+generate_grain_uv_fn 420, 1, 1
+generate_grain_uv_fn 422, 1, 0
+generate_grain_uv_fn 444, 0, 0
+
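+; SCRATCH frees up an xmm register: on x86-32 (only 8 xmm registers) it
+; spills the value to a stack slot and redefines the high register name as
+; that memory operand, while on x86-64 it simply renames the register via
+; SWAP.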
+%macro SCRATCH 3
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2
+%endif
+%endmacro
+
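+; fgy_32x32xn applies the luma grain: a fresh seed per 32-pixel-wide block
+; selects offx/offy into the 82x73 grain_lut, and every pixel becomes
+; src + round2(scaling[src] * grain, scaling_shift), clamped to the
+; clip_to_restricted_range limits; where blocks overlap, two grain
+; columns/rows are first blended with the 27/17 weights.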
+INIT_XMM ssse3
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \
+ dst, src, scaling, unused1, fg_data, picptr, unused2
+ ; copy stack arguments to new position post-alignment, so that we
+ ; don't have to keep the old stack location in a separate register
+ mov r0, r0m
+ mov r1, r2m
+ mov r2, r4m
+ mov r3, r6m
+ mov r4, r7m
+ mov r5, r8m
+
+%define r0m [rsp+8*mmsize+ 3*gprsize]
+%define r2m [rsp+8*mmsize+ 5*gprsize]
+%define r4m [rsp+8*mmsize+ 7*gprsize]
+%define r6m [rsp+8*mmsize+ 9*gprsize]
+%define r7m [rsp+8*mmsize+10*gprsize]
+%define r8m [rsp+8*mmsize+11*gprsize]
+
+ mov r0m, r0
+ mov r2m, r1
+ mov r4m, r2
+ mov r6m, r3
+ mov r7m, r4
+ mov r8m, r5
+%else
+cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \
+ dst, src, scaling, unused1, fg_data, picptr, unused2
+%endif
+ mov srcq, srcm
+ mov scalingq, r5m
+ mov fg_dataq, r3m
+%if STACK_ALIGNMENT < mmsize
+ mov r6, r9m
+
+%define r9m [rsp+8*mmsize+ 4*gprsize]
+%define r3m [rsp+8*mmsize+ 6*gprsize]
+%define r5m [rsp+8*mmsize+ 8*gprsize]
+
+ mov r9m, r6
+%endif
+ LEA r5, $$
+%define base r5-$$
+ mov r5m, picptrq
+%else
+cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
+ lea r8, [pb_mask]
+%define base r8-pb_mask
+%endif
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ SPLATW m3, [base+mul_bits+r6*2-14]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+%if ARCH_X86_32
+ DECLARE_REG_TMP 0, 3
+%else
+ DECLARE_REG_TMP 9, 10
+%endif
+ mov t0d, r9m ; bdmax
+ sar t0d, 11 ; is_12bpc
+ inc t0d
+ mov t1d, r6d
+ imul t1d, t0d
+ dec t0d
+ SPLATW m5, [base+min+t1*2]
+ lea t0d, [t0d*3]
+ lea t0d, [r6d*2+t0d]
+ SPLATW m4, [base+max+t0*2]
+ SPLATW m2, r9m
+
+ pcmpeqw m1, m1
+ psraw m7, m2, 1 ; max_grain
+ pxor m1, m7 ; min_grain
+ SPLATD m6, [base+pd_16]
+
+ SCRATCH 1, 9, 0
+ SCRATCH 2, 10, 1
+ SCRATCH 3, 11, 2
+ SCRATCH 4, 12, 3
+ SCRATCH 5, 13, 4
+ SCRATCH 6, 14, 5
+ SCRATCH 7, 15, 6
+
+ mova m6, [base+pw_27_17_17_27] ; for horizontal filter
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2
+ DECLARE_REG_TMP 0
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
+ sby, see
+ DECLARE_REG_TMP 7
+%endif
+
+ mov sbyd, r8m
+ movzx t0d, byte [fg_dataq+FGData.overlap_flag]
+ test t0d, t0d
+ jz .no_vertical_overlap
+ test sbyd, sbyd
+ jnz .vertical_overlap
+.no_vertical_overlap:
+ mov dword r8m, t0d
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
+ imul seed, (173 << 24) | 37
+%else
+ imul seed, sbyd, (173 << 24) | 37
+%endif
+ add seed, (105 << 24) | 178
+ rol seed, 8
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused1, unused2, see, src_bak
+%endif
+
+ lea src_bakq, [srcq+wq*2]
+ mov r9mp, src_bakq
+ neg wq
+ sub dstmp, srcq
+%if ARCH_X86_32
+ mov r4m, wq
+%endif
+
+.loop_x:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak
+%endif
+
+.loop_x_odd:
+ movzx hd, word r7m
+ mov grain_lutq, grain_lutmp
+.loop_y:
+ ; src
+ pand m0, m10, [srcq+ 0]
+ pand m1, m10, [srcq+16] ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m4
+ vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m4
+%else
+ vpgatherdw m2, m0, scalingq-1, r11, r13, 8, 1, m4
+ vpgatherdw m3, m1, scalingq-1, r11, r13, 8, 1, m4
+%endif
+ REPX {psrlw x, 8}, m2, m3
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m4, [grain_lutq+offxyq*2]
+ movu m5, [grain_lutq+offxyq*2+16]
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m2, m3
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+16], m1
+
+ add srcq, r2mp ; src += stride
+ add grain_lutq, 82*2
+ dec hd
+ jg .loop_y
+
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end
+%if ARCH_X86_32
+ mov srcq, r9mp
+ add srcq, r4mp
+ add srcq, r4mp
+%else
+ mov src_bakq, r9mp
+ lea srcq, [src_bakq+wq*2]
+%endif
+ btc dword r8m, 2
+ jc .next_blk
+ add offxyd, 16
+ test dword r8m, 2
+ jz .loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r12d, 16 ; top_offxy += 16
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.next_blk:
+ test dword r8m, 1
+ jz .loop_x
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jnz .loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+%if ARCH_X86_32
+ add offxyd, 16
+ mov [rsp+8*mmsize+0*gprsize], offxyd
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+ mov seed, r3m
+%endif
+
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, left_offxy
+
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, left_offxy
+%endif
+
+ mov hd, dword r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_h_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m5, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+0*gprsize]
+ movd m4, [grain_lutq+r5*2]
+%else
+ movd m4, [grain_lutq+left_offxyq*2]
+%endif
+ punpcklwd m4, m5
+ pmaddwd m4, m6
+ paddd m4, m14
+ psrad m4, 5
+ packssdw m4, m4
+ pminsw m4, m15
+ pmaxsw m4, m9
+ shufps m4, m5, q3210
+
+ ; src
+ pand m0, m10, [srcq+ 0]
+ pand m1, m10, [srcq+16] ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m5
+ vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m5
+%else
+ vpgatherdw m2, m0, scalingq-1, r13, r14, 8, 1, m5
+ vpgatherdw m3, m1, scalingq-1, r13, r14, 8, 1, m5
+%endif
+ REPX {psrlw x, 8}, m2, m3
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ movu m5, [grain_lutq+offxyq*2+16]
+ REPX {pmullw x, m11}, m2, m3
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+16], m1
+
+ add srcq, r2mp
+ add grain_lutq, 82*2
+ dec hd
+ jg .loop_y_h_overlap
+
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end
+%if ARCH_X86_32
+ mov srcq, r9mp
+ add srcq, r4mp
+ add srcq, r4mp
+%else
+ mov src_bakq, r9mp
+ lea srcq, [src_bakq+wq*2]
+%endif
+ or dword r8m, 4
+ add offxyd, 16
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jz .loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r12d, 16 ; top_offxy += 16
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.end:
+ RET
+
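+    ; with overlap enabled and sby != 0, the seed is widened to
+    ; (cur_seed << 16) | top_seed so that both the current and the above
+    ; block's grain offsets can be derived; the top 2 rows of each block are
+    ; then blended with the overlapping grain using pw_27_17_17_27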
+.vertical_overlap:
+ or t0d, 2
+ mov r8m, t0d
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
+ sby, see
+%endif
+
+ movzx sbyd, sbyb
+%if ARCH_X86_32
+ imul r4, [fg_dataq+FGData.seed], 0x00010001
+ DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused
+%else
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+ imul t0d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add t0d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and t0d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, t0d
+%if ARCH_X86_32
+ xor sbyd, seed
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%else
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused1, unused2, see, src_bak
+%endif
+
+ lea src_bakq, [srcq+wq*2]
+ mov r9mp, src_bakq
+ neg wq
+ sub dstmp, srcq
+%if ARCH_X86_32
+ mov r4m, wq
+%endif
+
+.loop_x_v_overlap:
+%if ARCH_X86_32
+ mov r5, r5m
+ SPLATD m7, [base+pw_27_17_17_27]
+ mov seed, r3m
+%else
+ SPLATD m7, [pw_27_17_17_27]
+%endif
+
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp t0b ; parity of top_seed
+ shr seed, 16
+ shl t0d, 16
+ test seeb, seeh
+ setp t0b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor t0d, r6d
+ mov seed, t0d
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, unused, top_offxy
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, unused, top_offxy
+%endif
+
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+.loop_x_odd_v_overlap:
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m7, [PIC_ptr(pw_27_17_17_27)]
+ mov hd, dword r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_v_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+1*gprsize]
+ movu m2, [grain_lutq+r5*2]
+%else
+ movu m2, [grain_lutq+top_offxyq*2]
+%endif
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ REPX {pmaddwd x, m7}, m4, m2
+ REPX {paddd x, m14}, m4, m2
+ REPX {psrad x, 5}, m4, m2
+ packssdw m2, m4
+ pminsw m2, m15
+ pmaxsw m2, m9
+ movu m4, [grain_lutq+offxyq*2+16]
+%if ARCH_X86_32
+ movu m3, [grain_lutq+r5*2+16]
+%else
+ movu m3, [grain_lutq+top_offxyq*2+16]
+%endif
+ punpckhwd m5, m3, m4
+ punpcklwd m3, m4
+ REPX {pmaddwd x, m7}, m5, m3
+ REPX {paddd x, m14}, m5, m3
+ REPX {psrad x, 5}, m5, m3
+ packssdw m3, m5
+ pminsw m3, m15
+ pmaxsw m3, m9
+
+ ; src
+ pand m0, m10, [srcq+ 0] ; m0-1: src as word
+ pand m1, m10, [srcq+16] ; m0-1: src as word
+
+ ; scaling[src]
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5
+%else
+ vpgatherdw m4, m0, scalingq-1, r11, r13, 8, 1, m5
+%endif
+ psrlw m4, 8
+ pmullw m4, m11
+ pmulhrsw m4, m2
+%if ARCH_X86_32
+ vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m2
+%else
+ vpgatherdw m5, m1, scalingq-1, r11, r13, 8, 1, m2
+%endif
+ psrlw m5, 8
+ pmullw m5, m11
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+16], m1
+
+ add srcq, r2mp
+ add grain_lutq, 82*2
+ dec hw
+ jz .end_y_v_overlap
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4]
+ xor hd, 0x10000
+ test hd, 0x10000
+ jnz .loop_y_v_overlap
+ jmp .loop_y
+
+.end_y_v_overlap:
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end_hv
+%if ARCH_X86_32
+ mov srcq, r9mp
+ add srcq, r4mp
+ add srcq, r4mp
+%else
+ mov src_bakq, r9mp
+ lea srcq, [src_bakq+wq*2]
+%endif
+ btc dword r8m, 2
+ jc .next_blk_v
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ add offxyd, 16
+ jmp .loop_x_odd_v_overlap
+
+.next_blk_v:
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall-through to
+ ; h+v overlap
+
+.loop_x_hv_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r0, [rsp+8*mmsize+1*gprsize]
+ add r3, 16
+ add r0, 16
+ mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
+ mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy
+
+ mov seed, r3m
+ xor r0, r0
+%else
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp t0b ; parity of top_seed
+ shr seed, 16
+ shl t0d, 16
+ test seeb, seeh
+ setp t0b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor t0d, r6d
+ mov seed, t0d
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyq, [top_offxyq+16]
+ lea left_offxyq, [offyq+16]
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
+%endif
+
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m7, [PIC_ptr(pw_27_17_17_27)]
+
+ movzx hd, word r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_hv_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m2, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
+ mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
+ movu m4, [grain_lutq+r0*2]
+ movd m5, [grain_lutq+r5*2]
+ mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
+ movd m3, [grain_lutq+r5*2]
+%else
+ movu m4, [grain_lutq+top_offxyq*2]
+ movd m5, [grain_lutq+left_offxyq*2]
+ movd m3, [grain_lutq+topleft_offxyq*2]
+%endif
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklwd m5, m2
+ punpcklwd m3, m4
+ REPX {pmaddwd x, m6}, m5, m3
+ REPX {paddd x, m14}, m5, m3
+ REPX {psrad x, 5}, m5, m3
+ packssdw m5, m3
+ pminsw m5, m15
+ pmaxsw m5, m9
+ shufps m3, m5, m2, q3210
+ shufps m5, m4, q3232
+ ; followed by v interpolation (top | cur -> cur)
+ movu m0, [grain_lutq+offxyq*2+16]
+%if ARCH_X86_32
+ movu m1, [grain_lutq+r0*2+16]
+%else
+ movu m1, [grain_lutq+top_offxyq*2+16]
+%endif
+ punpcklwd m2, m5, m3
+ punpckhwd m5, m3
+ punpcklwd m3, m1, m0
+ punpckhwd m1, m0
+ REPX {pmaddwd x, m7}, m2, m5, m3, m1
+ REPX {paddd x, m14}, m2, m5, m3, m1
+ REPX {psrad x, 5}, m2, m5, m3, m1
+ packssdw m2, m5
+ packssdw m3, m1
+ REPX {pminsw x, m15}, m2, m3
+ REPX {pmaxsw x, m9}, m2, m3
+
+ ; src
+ pand m0, m10, [srcq+ 0]
+ pand m1, m10, [srcq+16] ; m0-1: src as word
+
+ ; scaling[src]
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5
+%else
+ vpgatherdw m4, m0, scalingq-1, r14, r10, 8, 1, m5
+%endif
+ psrlw m4, 8
+ pmullw m4, m11
+ pmulhrsw m2, m4
+%if ARCH_X86_32
+ vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m4
+%else
+ vpgatherdw m5, m1, scalingq-1, r14, r10, 8, 1, m4
+%endif
+ psrlw m5, 8
+ pmullw m5, m11
+ pmulhrsw m3, m5
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+16], m1
+
+ add srcq, r2mp
+ add grain_lutq, 82*2
+ dec hw
+ jz .end_y_hv_overlap
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4]
+ xor hd, 0x10000
+ test hd, 0x10000
+ jnz .loop_y_hv_overlap
+ jmp .loop_y_h_overlap
+
+.end_y_hv_overlap:
+ or dword r8m, 4
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end_hv
+%if ARCH_X86_32
+ mov r5, r5m
+ add offxyd, 16
+ add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16
+ mov srcq, r9mp
+ add srcq, r4mp
+ add srcq, r4mp
+%else
+ add offxyd, 16
+ add top_offxyd, 16
+ mov src_bakq, r9mp
+ lea srcq, [src_bakq+wq*2]
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.end_hv:
+ RET
+%if ARCH_X86_32
+ DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+%endif
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+INIT_XMM ssse3
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \
+ tmp, src, scaling, h, fg_data, picptr, unused
+ mov r0, r0m
+ mov r1, r1m
+ mov r2, r2m
+ mov r4, r3m
+ mov r3, r4m
+ mov r5, r5m
+%define r0m [rsp+8*mmsize+ 3*gprsize]
+%define r1m [rsp+8*mmsize+ 4*gprsize]
+%define r2m [rsp+8*mmsize+ 5*gprsize]
+%define r3m [rsp+8*mmsize+ 6*gprsize]
+%define r4m [rsp+8*mmsize+ 7*gprsize]
+%define r5m [rsp+8*mmsize+ 8*gprsize]
+ mov r0m, r0
+ mov r2m, r2
+ mov r4m, r3
+ mov r5m, r5
+
+ mov r0, r6m
+ mov r2, r7m
+ mov r3, r8m
+ mov r5, r9m
+%define r6m [rsp+8*mmsize+ 9*gprsize]
+%define r7m [rsp+8*mmsize+10*gprsize]
+%define r8m [rsp+8*mmsize+11*gprsize]
+%define r9m [rsp+8*mmsize+12*gprsize]
+ mov r6m, r0
+ mov r7m, r2
+ mov r8m, r3
+ mov r9m, r5
+
+ mov r2, r10m
+ mov r3, r11m
+ mov r5, r12m
+ mov r0, r13m
+%define r10m [rsp+8*mmsize+13*gprsize]
+%define r11m [rsp+8*mmsize+14*gprsize]
+%define r12m [rsp+8*mmsize+15*gprsize]
+ mov r10m, r2
+ mov r11m, r3
+ mov r12m, r5
+
+ SPLATW m2, r13m
+%else
+cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
+ tmp, src, scaling, h, fg_data, picptr, unused
+ mov srcq, srcm
+ mov fg_dataq, r3m
+%endif
+ LEA r5, $$
+%define base r5-$$
+
+ DECLARE_REG_TMP 0, 2, 3
+%else
+cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, luma, lstride, uv_pl, is_id
+%define base r8-pb_mask
+ lea r8, [pb_mask]
+
+ DECLARE_REG_TMP 9, 10, 11
+%endif
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ SPLATW m3, [base+mul_bits+r6*2-14]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+%if STACK_ALIGNMENT >= mmsize
+ mov t0d, r13m ; bdmax
+%endif
+ sar t0d, 11 ; is_12bpc
+ inc t0d
+ mov t1d, r6d
+ imul t1d, t0d
+ dec t0d
+ SPLATW m5, [base+min+t1*2]
+ lea t1d, [t0d*3]
+ mov t2d, r12m
+ inc t2d
+ imul r6d, t2d
+ add t1d, r6d
+ SPLATW m4, [base+max+t1*2]
+%if STACK_ALIGNMENT >= mmsize
+ SPLATW m2, r13m
+%endif
+
+ SCRATCH 2, 10, 2
+ SCRATCH 3, 11, 3
+ SCRATCH 4, 12, 4
+ SCRATCH 5, 13, 5
+
+%define mzero m7
+
+%if %3
+ SPLATD m2, [base+pw_23_22]
+%endif
+
+%if ARCH_X86_32
+ mov scalingq, r5m
+ mov r5m, r5
+%else
+ mov r13mp, strideq
+%endif
+
+ pcmpeqw m0, m0
+ psraw m1, m10, 1
+ pxor m0, m1
+
+ SCRATCH 0, 8, 0
+ SCRATCH 1, 9, 1
+
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+
+ DECLARE_REG_TMP 0
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+
+ DECLARE_REG_TMP 9
+%endif
+
+%if %1
+ mov r6d, r11m
+ SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4]
+ SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
+ punpcklwd m6, m1, m0
+ SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4]
+ SPLATD m7, [base+pw_4+t0*4]
+ pmullw m5, m7
+%else
+ SPLATD m6, [base+pd_16]
+%if %2
+ mova m5, [base+pw_23_22]
+%else
+ mova m5, [base+pw_27_17_17_27]
+%endif
+%endif
+
+ SCRATCH 6, 14, 6
+ SCRATCH 5, 15, 7
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 0
+%else
+ DECLARE_REG_TMP 7
+%endif
+
+ mov sbyd, r8m
+ mov t0d, [fg_dataq+FGData.overlap_flag]
+ test t0d, t0d
+ jz %%no_vertical_overlap
+ test sbyd, sbyd
+ jnz %%vertical_overlap
+
+%%no_vertical_overlap:
+ mov r8m, t0d
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
+ imul seed, (173 << 24) | 37
+%else
+ imul seed, sbyd, (173 << 24) | 37
+%endif
+ add seed, (105 << 24) | 178
+ rol seed, 8
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
+
+ mov dstq, r0mp
+ mov lumaq, r9mp
+ mov wq, r4m
+ lea r3, [srcq+wq*2]
+ mov r1mp, r3
+ lea r3, [dstq+wq*2]
+ mov r11mp, r3
+ lea r3, [lumaq+wq*(2<<%2)]
+ mov r12mp, r3
+%if %3
+ shl r10mp, 1
+%endif
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused2, unused3, see, unused4, unused5, unused6, luma, lstride
+
+ mov lstrideq, r10mp
+%if %3
+ add lstrideq, lstrideq
+%endif
+ mov lumaq, r9mp
+ lea r10, [srcq+wq*2]
+ lea r11, [dstq+wq*2]
+ lea r12, [lumaq+wq*(2<<%2)]
+ mov r10mp, r10
+ mov r11mp, r11
+ mov r12mp, r12
+%endif
+ neg wq
+%if ARCH_X86_32
+ mov r4mp, wq
+%endif
+
+%%loop_x:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, unused1, unused2, unused3, luma, lstride
+
+ mov offxd, seed
+ mov offyd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, unused1, unused2, unused3, luma, lstride
+%endif
+
+%if %2 == 0
+%%loop_x_odd:
+%endif
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y:
+ ; src
+ mova m0, [srcq]
+ mova m1, [srcq+16] ; m0-1: src as word
+
+ ; luma_src
+ pxor mzero, mzero
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
+
+ mov lumaq, r9m
+%endif
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+(16<<%2)]
+%if %2
+ phaddw m4, [lumaq+16]
+ phaddw m6, [lumaq+48]
+%endif
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9m, lumaq
+%endif
+%if %2
+ pavgw m4, mzero
+ pavgw m6, mzero
+%endif
+
+%if %1
+ punpckhwd m3, m4, m0
+ punpcklwd m4, m0
+ punpckhwd m5, m6, m1
+ punpcklwd m6, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m3, m4, m5, m6
+ REPX {psrad x, 6}, m3, m4, m5, m6
+ packssdw m4, m3
+ packssdw m6, m5
+ REPX {paddw x, m15}, m4, m6
+ REPX {pmaxsw x, mzero}, m4, m6
+ REPX {pminsw x, m10}, m4, m6 ; clip_pixel()
+%else
+ REPX {pand x, m10}, m4, m6
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1
+ vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1
+%else
+ vpgatherdw m3, m4, scalingq-1, r10, r12, 8, 1
+ vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1
+%endif
+ REPX {psrlw x, 8}, m3, m5
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m4, [grain_lutq+offxyq*2]
+ movu m6, [grain_lutq+offxyq*2+16]
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m3, m5
+ pmulhrsw m4, m3
+ pmulhrsw m6, m5
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m6
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ add dstq, r2mp
+ mov dstmp, dstq
+%else
+ add srcq, r13mp
+ add dstq, r13mp
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*2
+ dec hd
+ jg %%loop_y
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma
+
+ mov wq, r4mp
+%endif
+ add wq, 16
+ jge %%end
+%if ARCH_X86_32
+ mov srcq, r1mp
+%else
+ mov srcq, r10mp
+%endif
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+%if ARCH_X86_32
+ mov r0m, dstq
+ mov r9m, lumaq
+ mov r4m, wq
+%endif
+%if %2 == 0
+ btc dword r8m, 2
+ jc %%next_blk
+ add offxyd, 16
+ test dword r8m, 2
+ jz %%loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16
+%endif
+ jmp %%loop_x_odd_v_overlap
+%%next_blk:
+%endif
+ test dword r8m, 1
+ je %%loop_x
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jnz %%loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+%if ARCH_X86_32
+ add offxyd, 16
+ mov [rsp+8*mmsize+0*gprsize], offxyd
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
+
+ mov seed, r3m
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, luma, lstride
+
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+ mov offxd, seed
+ mov offyd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, unused1, unused2, luma, lstride
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_h_overlap:
+ mova m0, [srcq]
+ mova m1, [srcq+16]
+
+ ; luma_src
+ pxor mzero, mzero
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
+ mov lumaq, r9m
+%endif
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+(16<<%2)]
+%if %2
+ phaddw m4, [lumaq+16]
+ phaddw m6, [lumaq+48]
+%endif
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9m, lumaq
+%endif
+%if %2
+ pavgw m4, mzero
+ pavgw m6, mzero
+%endif
+
+%if %1
+ punpckhwd m3, m4, m0
+ punpcklwd m4, m0
+ punpckhwd m5, m6, m1
+ punpcklwd m6, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m3, m4, m5, m6
+ REPX {psrad x, 6}, m3, m4, m5, m6
+ packssdw m4, m3
+ packssdw m6, m5
+ REPX {paddw x, m15}, m4, m6
+ REPX {pmaxsw x, mzero}, m4, m6
+ REPX {pminsw x, m10}, m4, m6 ; clip_pixel()
+%else
+ REPX {pand x, m10}, m4, m6
+%endif
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m7, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+0*gprsize]
+ movd m5, [grain_lutq+r5*2]
+%else
+ movd m5, [grain_lutq+left_offxyq*2+ 0]
+%endif
+ punpcklwd m5, m7 ; {left0, cur0}
+%if %1
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+%if %2
+ pmaddwd m5, [PIC_ptr(pw_23_22)]
+%else
+ pmaddwd m5, [PIC_ptr(pw_27_17_17_27)]
+%endif
+ paddd m5, [PIC_ptr(pd_16)]
+%else
+ pmaddwd m5, m15
+ paddd m5, m14
+%endif
+ psrad m5, 5
+ packssdw m5, m5
+ pmaxsw m5, m8
+ pminsw m5, m9
+ shufps m5, m7, q3210
+ movu m3, [grain_lutq+offxyq*2+16]
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq-1, r0, r5, 8, 1
+ vpgatherdw m4, m6, scalingq-1, r0, r5, 8, 1
+%else
+ vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1
+ vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1
+%endif
+ REPX {psrlw x, 8}, m7, m4
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m7, m4
+ pmulhrsw m5, m7
+ pmulhrsw m3, m4
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m5
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ add dstq, r2mp
+ mov dstmp, dstq
+%else
+ add srcq, r13mp
+ add dstq, r13mp
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*2
+ dec hd
+ jg %%loop_y_h_overlap
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
+ mov wq, r4mp
+%endif
+ add wq, 16
+ jge %%end
+%if ARCH_X86_32
+ mov srcq, r1mp
+%else
+ mov srcq, r10mp
+%endif
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+%if ARCH_X86_32
+ mov r0mp, dstq
+ mov r9mp, lumaq
+ mov r4m, wq
+%endif
+
+%if %2
+ ; r8m = sbym
+ test dword r8m, 2
+ jne %%loop_x_hv_overlap
+ jmp %%loop_x_h_overlap
+%else
+ or dword r8m, 4
+ add offxyd, 16
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jz %%loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16 ; top_offxy += 16
+%endif
+ jmp %%loop_x_odd_v_overlap
+%endif
+
+%%end:
+ RET
+
+%%vertical_overlap:
+ or t0d, 2
+ mov r8m, t0d
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
+ sby, see, unused1, unused2, unused3, lstride
+%endif
+
+ movzx sbyd, sbyb
+%if ARCH_X86_32
+ imul r4, [fg_dataq+FGData.seed], 0x00010001
+
+ DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
+%else
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+ imul t0d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add t0d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and t0d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, t0d
+%if ARCH_X86_32
+ xor sbyd, seed
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
+
+ mov r3m, seed
+ mov dstq, r0mp
+ mov lumaq, r9mp
+ mov wq, r4m
+ lea r3, [srcq+wq*2]
+ mov r1mp, r3
+ lea r3, [dstq+wq*2]
+ mov r11mp, r3
+ lea r3, [lumaq+wq*(2<<%2)]
+ mov r12mp, r3
+%if %3
+ shl r10mp, 1
+%endif
+%else
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused1, unused2, see, unused3, unused4, unused5, luma, lstride
+
+ mov lstrideq, r10mp
+%if %3
+ add lstrideq, lstrideq
+%endif
+ mov lumaq, r9mp
+ lea r10, [srcq+wq*2]
+ lea r11, [dstq+wq*2]
+ lea r12, [lumaq+wq*(2<<%2)]
+ mov r10mp, r10
+ mov r11mp, r11
+ mov r12mp, r12
+%endif
+ neg wq
+%if ARCH_X86_32
+ mov r4m, wq
+%endif
+
+%%loop_x_v_overlap:
+%if ARCH_X86_32
+ mov seed, r3m
+ xor t0d, t0d
+%else
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp t0b ; parity of top_seed
+ shr seed, 16
+ shl t0d, 16
+ test seeb, seeh
+ setp t0b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor t0d, r6d
+ mov seed, t0d
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, unused1, top_offxy, unused2, luma, lstride
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, unused1, top_offxy, unused2, luma, lstride
+%endif
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+%if %2 == 0
+%%loop_x_odd_v_overlap:
+%endif
+%if %3 == 0
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m2, [PIC_ptr(pw_27_17_17_27)]
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_v_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r0, [rsp+mmsize*8+gprsize*1] ; top_offxy
+ movu m5, [grain_lutq+r0*2]
+%else
+ movu m5, [grain_lutq+top_offxyq*2]
+%endif
+ punpckhwd m7, m5, m3
+ punpcklwd m5, m3 ; {top/cur interleaved}
+ REPX {pmaddwd x, m2}, m7, m5
+%if %1
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
+%else
+ REPX {paddd x, m14}, m7, m5
+%endif
+ REPX {psrad x, 5}, m7, m5
+ packssdw m3, m5, m7
+ pmaxsw m3, m8
+ pminsw m3, m9
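+ ; i.e. grain = clip(round2(top*w0 + cur*w1, 5)); the (w0,w1) weight pairs
+ ; are 27/17 and 17/27 for the two overlapped rows (23/22 when only a
+ ; single 4:2:0 chroma row is overlapped)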
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m4, [grain_lutq+offxyq*2+16]
+%if ARCH_X86_32
+ movu m5, [grain_lutq+r0*2+16]
+%else
+ movu m5, [grain_lutq+top_offxyq*2+16]
+%endif
+ punpckhwd m7, m5, m4
+ punpcklwd m5, m4 ; {top/cur interleaved}
+ REPX {pmaddwd x, m2}, m7, m5
+%if %1
+ REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
+%else
+ REPX {paddd x, m14}, m7, m5
+%endif
+ REPX {psrad x, 5}, m7, m5
+ packssdw m4, m5, m7
+ pmaxsw m4, m8
+ pminsw m4, m9
+
+ ; src
+ mova m0, [srcq]
+ mova m1, [srcq+16]
+
+ ; luma_src
+ pxor mzero, mzero
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
+
+ mov lumaq, r9mp
+%endif
+ mova m5, [lumaq+ 0]
+ mova m6, [lumaq+(16<<%2)]
+%if %2
+ phaddw m5, [lumaq+16]
+ phaddw m6, [lumaq+48]
+%endif
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+%if %2
+ pavgw m5, mzero
+ pavgw m6, mzero
+%endif
+
+%if %1
+ punpckhwd m7, m5, m0
+ punpcklwd m5, m0
+ REPX {pmaddwd x, m14}, m7, m5
+ REPX {psrad x, 6}, m7, m5
+ packssdw m5, m7
+ punpckhwd m7, m6, m1
+ punpcklwd m6, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m7, m6
+ REPX {psrad x, 6}, m7, m6
+ packssdw m6, m7
+ pxor mzero, mzero
+ REPX {paddw x, m15}, m5, m6
+ REPX {pmaxsw x, mzero}, m5, m6
+ REPX {pminsw x, m10}, m5, m6 ; clip_pixel()
+%else
+ REPX {pand x, m10}, m5, m6
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m5, scalingq-1, r0, r5, 8, 1
+ vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1
+%else
+ vpgatherdw m7, m5, scalingq-1, r10, r12, 8, 1
+ vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1
+%endif
+ REPX {psrlw x, 8}, m7, m5
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m7, m5
+ pmulhrsw m3, m7
+ pmulhrsw m4, m5
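+ ; (round2(x, s) = (x + (1 << (s - 1))) >> s)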
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m3
+ paddw m1, m4
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
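+ ; i.e. dst = clamp(src + noise, min, max), the bounds being derived from
+ ; the bitdepth and clip_to_restricted_range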
+ movifnidn dstq, dstmp
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+
+ dec hw
+ jle %%end_y_v_overlap
+%if ARCH_X86_32
+ add srcq, r2mp
+ add dstq, r2mp
+ mov dstmp, dstq
+%else
+ add srcq, r13mp
+ add dstq, r13mp
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*2
+%if %3
+ jmp %%loop_y
+%else
+ btc hd, 16
+ jc %%loop_y
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4]
+ jmp %%loop_y_v_overlap
+%endif
+
+%%end_y_v_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+%else
+ mov srcq, r10mp
+%endif
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+%if ARCH_X86_32
+ mov r0mp, dstq
+ mov r9mp, lumaq
+ mov r4m, wq
+%endif
+
+%if %2
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall through to
+ ; h+v overlap
+%else
+ btc dword r8m, 2
+ jc %%loop_x_hv_overlap
+ add offxyd, 16
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16
+%endif
+ jmp %%loop_x_odd_v_overlap
+%endif
+
+%%loop_x_hv_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut
+
+ mov t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy
+ add offxyd, 16
+ add t0d, 16
+ mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd
+ mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
+
+ mov seed, r3m
+ xor t0d, t0d
+%else
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp t0b ; parity of top_seed
+ shr seed, 16
+ shl t0d, 16
+ test seeb, seeh
+ setp t0b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor t0d, r6d
+ mov seed, t0d
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
+
+ lea topleft_offxyq, [top_offxyq+16]
+ lea left_offxyq, [offyq+16]
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
+%endif
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+%if %3 == 0
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m2, [PIC_ptr(pw_27_17_17_27)]
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_hv_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
+ mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
+ movd m5, [grain_lutq+r5*2]
+%else
+ movd m5, [grain_lutq+left_offxyq*2]
+%endif
+ movu m7, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+2*gprsize]
+ movu m4, [grain_lutq+r0*2]
+%if %2
+ pinsrw m5, [grain_lutq+r5*2], 2
+%else
+ movd m3, [grain_lutq+r5*2]
+%endif
+%else
+ movu m4, [grain_lutq+top_offxyq*2]
+%if %2
+ pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left }
+%else
+ movd m3, [grain_lutq+topleft_offxyq*2]
+%endif
+%endif
+%if %2 == 0
+ punpckldq m5, m3
+%endif
+ punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 }
+ punpcklwd m5, m3 ; { left/cur0,_/cur1,topleft/top0,_/top1 }
+%if %1
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+%if %2
+ movddup m0, [PIC_ptr(pw_23_22)]
+%else
+ movddup m0, [PIC_ptr(pw_27_17_17_27)]
+%endif
+%else
+ pshufd m0, m15, q1010
+%endif
+ pmaddwd m5, m0
+%if %1
+ paddd m5, [PIC_ptr(pd_16)]
+%else
+ paddd m5, m14
+%endif
+ psrad m5, 5
+ packssdw m5, m5
+ pmaxsw m5, m8
+ pminsw m5, m9
+ shufps m5, m3, q3210 ; cur0/1,top0/1,cur2/3,top2/3
+ shufps m3, m5, m7, q3220 ; cur0-7 post-h_filter
+ shufps m5, m4, q3231 ; top0-7 post-h_filter
+
+ punpckhwd m7, m5, m3
+ punpcklwd m5, m3 ; {top/cur interleaved}
+ REPX {pmaddwd x, m2}, m7, m5
+%if %1
+ REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7
+%else
+ REPX {paddd x, m14}, m5, m7
+%endif
+ REPX {psrad x, 5}, m5, m7
+ packssdw m3, m5, m7
+ pmaxsw m3, m8
+ pminsw m3, m9
+
+ ; right half
+ movu m4, [grain_lutq+offxyq*2+16]
+%if ARCH_X86_32
+ movu m0, [grain_lutq+r0*2+16]
+%else
+ movu m0, [grain_lutq+top_offxyq*2+16]
+%endif
+ punpckhwd m1, m0, m4
+ punpcklwd m0, m4 ; {top/cur interleaved}
+ REPX {pmaddwd x, m2}, m1, m0
+%if %1
+ REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0
+%else
+ REPX {paddd x, m14}, m1, m0
+%endif
+ REPX {psrad x, 5}, m1, m0
+ packssdw m4, m0, m1
+ pmaxsw m4, m8
+ pminsw m4, m9
+
+ ; src
+ mova m0, [srcq]
+ mova m1, [srcq+16]
+
+ ; luma_src
+ pxor mzero, mzero
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
+
+ mov lumaq, r9mp
+%endif
+ mova m6, [lumaq+ 0]
+ mova m5, [lumaq+(16<<%2)]
+%if %2
+ phaddw m6, [lumaq+16]
+ phaddw m5, [lumaq+48]
+%endif
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+%if %2
+ pavgw m6, mzero
+ pavgw m5, mzero
+%endif
+
+%if %1
+ punpckhwd m7, m6, m0
+ punpcklwd m6, m0
+ REPX {pmaddwd x, m14}, m7, m6
+ REPX {psrad x, 6}, m7, m6
+ packssdw m6, m7
+ punpckhwd m7, m5, m1
+ punpcklwd m5, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m7, m5
+ REPX {psrad x, 6}, m7, m5
+ packssdw m5, m7
+ pxor mzero, mzero
+ REPX {paddw x, m15}, m6, m5
+ REPX {pmaxsw x, mzero}, m6, m5
+ REPX {pminsw x, m10}, m6, m5 ; clip_pixel()
+%else
+ REPX {pand x, m10}, m6, m5
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m6, scalingq-1, r0, r5, 8, 1
+ vpgatherdw m6, m5, scalingq-1, r0, r5, 8, 1
+%else
+%if %3 == 0
+ ; register shortage :)
+ push r12
+%endif
+ vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1
+ vpgatherdw m6, m5, scalingq-1, r2, r12, 8, 1
+%if %3 == 0
+ pop r12
+%endif
+%endif
+ REPX {psrlw x, 8}, m7, m6
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m7, m6
+ pmulhrsw m3, m7
+ pmulhrsw m4, m6
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m3
+ paddw m1, m4
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ add dstq, r2mp
+ mov dstmp, dstq
+%else
+ add srcq, r13mp
+ add dstq, r13mp
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*2
+ dec hw
+%if %3
+ jg %%loop_y_h_overlap
+%else
+ jle %%end_y_hv_overlap
+ btc hd, 16
+ jc %%loop_y_h_overlap
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4]
+ jmp %%loop_y_hv_overlap
+%%end_y_hv_overlap:
+%endif
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+%else
+ mov srcq, r10mp
+%endif
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+%if ARCH_X86_32
+ mov dstmp, dstq
+ mov r9mp, lumaq
+ mov r4m, wq
+%endif
+%if %2
+ jmp %%loop_x_hv_overlap
+%else
+ or dword r8m, 4
+ add offxyd, 16
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16 ; top_offxy += 16
+%endif
+ jmp %%loop_x_odd_v_overlap
+%endif
+
+%%end_hv:
+ RET
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+%endif
+%endmacro
+
+FGUV_FN 420, 1, 1
+FGUV_FN 422, 1, 0
+FGUV_FN 444, 0, 0
diff --git a/third_party/dav1d/src/x86/filmgrain_avx2.asm b/third_party/dav1d/src/x86/filmgrain_avx2.asm
new file mode 100644
index 0000000000..55445cf593
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain_avx2.asm
@@ -0,0 +1,2107 @@
+; Copyright © 2019-2022, VideoLAN and dav1d authors
+; Copyright © 2019-2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0
+gen_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+gen_shufB: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
+gen_shufC: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+gen_shufD: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
+; note: the order of (some of) the following constants matters
+pb_27_17: times 2 db 27, 17
+byte_blend: db 0, 0, 0, -1
+pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32
+pb_17_27: times 2 db 17, 27
+pb_1: times 4 db 1
+pb_23_22: db 23, 22, 0, 32, 0, 32, 0, 32
+next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
+pw_seed_xor: times 2 dw 0xb524
+ times 2 dw 0x49d8
+fg_min: times 4 db 0
+ times 4 db 16
+fg_max: times 4 db 255
+ times 4 db 240
+ times 4 db 235
+pd_m65536: dd -65536
+pw_8: times 2 dw 8
+pw_1024: times 2 dw 1024
+hmul_bits: dw 32768, 16384, 8192, 4096
+round: dw 2048, 1024, 512
+mul_bits: dw 256, 128, 64, 32, 16
+round_vals: dw 32, 64, 128, 256, 512
+pw_1: dw 1
+
+%macro JMP_TABLE 2-*
+ %1_8bpc_%2_table:
+ %xdefine %%base %1_8bpc_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+ %rep %0 - 2
+ dd %%prefix %+ .ar%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
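+; emits a table of dword offsets from the table base to the function's
+; .ar0-.ar3 entry points, indexed by FGData.ar_coeff_lag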
+
+JMP_TABLE generate_grain_y, avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data
+%define base r4-generate_grain_y_8bpc_avx2_table
+ lea r4, [generate_grain_y_8bpc_avx2_table]
+ vpbroadcastw xm0, [fg_dataq+FGData.seed]
+ mov r6d, [fg_dataq+FGData.grain_scale_shift]
+ movq xm1, [base+next_upperbit_mask]
+ movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
+ movq xm4, [base+mul_bits]
+ movq xm5, [base+hmul_bits]
+ mov r7, -73*82
+ mova xm6, [base+pb_mask]
+ sub bufq, r7
+ vpbroadcastw xm7, [base+round+r6*2]
+ lea r6, [gaussian_sequence]
+ movsxd r5, [r4+r5*4]
+.loop:
+ pand xm2, xm0, xm1
+ psrlw xm3, xm2, 10
+ por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw xm2, xm4 ; bits 0x0f00 are set
+ pmulhuw xm0, xm5
+ pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm2, xm0 ; aggregate each bit into next seed's high bit
+ por xm3, xm2 ; 4 next output seeds
+ pshuflw xm0, xm3, q3333
+ psrlw xm3, 5
+ pand xm2, xm0, xm1
+ movq r2, xm3
+ psrlw xm3, xm2, 10
+ por xm2, xm3
+ pmullw xm2, xm4
+ pmulhuw xm0, xm5
+ movzx r3d, r2w
+ pshufb xm3, xm6, xm2
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm0, xm2
+ movd xm2, [r6+r3*2]
+ rorx r3, r2, 32
+ por xm3, xm0
+ shr r2d, 16
+ pinsrw xm2, [r6+r2*2], 1
+ pshuflw xm0, xm3, q3333
+ movzx r2d, r3w
+ psrlw xm3, 5
+ pinsrw xm2, [r6+r2*2], 2
+ shr r3d, 16
+ movq r2, xm3
+ pinsrw xm2, [r6+r3*2], 3
+ movzx r3d, r2w
+ pinsrw xm2, [r6+r3*2], 4
+ rorx r3, r2, 32
+ shr r2d, 16
+ pinsrw xm2, [r6+r2*2], 5
+ movzx r2d, r3w
+ pinsrw xm2, [r6+r2*2], 6
+ shr r3d, 16
+ pinsrw xm2, [r6+r3*2], 7
+ pmulhrsw xm2, xm7
+ packsswb xm2, xm2
+ movq [bufq+r7], xm2
+ add r7, 8
+ jl .loop
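+ ; each grain byte above is, roughly, round2(gaussian_sequence[seed >> 5],
+ ; 4 + grain_scale_shift), with the seed advanced by the LFSR between samples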
+
+ ; auto-regression code
+ add r5, r4
+ jmp r5
+
+.ar1:
+ DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
+ movd xm5, [fg_dataq+FGData.ar_coeffs_y]
+ mova xm2, [base+gen_shufC]
+ DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
+ pinsrb xm5, [base+pb_1], 3
+ vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd
+ pmovsxbw xm5, xm5
+ pshufd xm4, xm5, q0000
+ pshufd xm5, xm5, q1111
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+ mov mind, -128
+ mov maxd, 127
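+ ; the loop below computes, per pixel (clamped to [-128,127]):
+ ;   g[x][y] += round2(c0*g[x-1][y-1] + c1*g[x][y-1] + c2*g[x+1][y-1] +
+ ;                     c3*g[x-1][y], ar_coeff_shift)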
+.y_loop_ar1:
+ mov xq, -76
+ movsx val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+ pmovsxbw xm1, [bufq+xq-82-3]
+ pshufb xm0, xm1, xm2
+ punpckhwd xm1, xm3
+ pmaddwd xm0, xm4
+ pmaddwd xm1, xm5
+ paddd xm0, xm1
+.x_loop_ar1_inner:
+ movd val0d, xm0
+ psrldq xm0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ movsx val0d, byte [bufq+xq]
+ sarx val3d, val3d, shiftd
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovns val3d, maxd
+ cmp val3d, mind
+ cmovs val3d, mind
+ mov [bufq+xq], val3b
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xb, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+.x_loop_ar1_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar1
+.ar0:
+ RET
+
+.ar2:
+%if WIN64
+ ; xmm6 and xmm7 already saved
+ %assign xmm_regs_used 16
+ %assign stack_size_padded 168
+ SUB rsp, stack_size_padded
+ movaps [rsp+16*2], xmm8
+ movaps [rsp+16*3], xmm9
+ movaps [rsp+16*4], xmm10
+ movaps [rsp+16*5], xmm11
+ movaps [rsp+16*6], xmm12
+ movaps [rsp+16*7], xmm13
+ movaps [rsp+16*8], xmm14
+ movaps [rsp+16*9], xmm15
+%endif
+ DEFINE_ARGS buf, fg_data, h, x
+ mov r6d, [fg_dataq+FGData.ar_coeff_shift]
+ pmovsxbw xm7, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7
+ movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11
+ vpbroadcastd xm10, [base+round_vals-14+r6*2]
+ movd xm11, [base+byte_blend+1]
+ pmovsxbw xm9, xm9
+ pshufd xm4, xm7, q0000
+ mova xm12, [base+gen_shufA]
+ pshufd xm5, xm7, q3333
+ mova xm13, [base+gen_shufB]
+ pshufd xm6, xm7, q1111
+ mova xm14, [base+gen_shufC]
+ pshufd xm7, xm7, q2222
+ mova xm15, [base+gen_shufD]
+ pshufd xm8, xm9, q0000
+ psrld xm10, 16
+ pshufd xm9, xm9, q1111
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+.y_loop_ar2:
+ mov xq, -76
+.x_loop_ar2:
+ pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
+ pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
+ pshufb xm2, xm0, xm12
+ pmaddwd xm2, xm4
+ pshufb xm3, xm1, xm13
+ pmaddwd xm3, xm5
+ paddd xm2, xm3
+ pshufb xm3, xm0, xm14
+ pmaddwd xm3, xm6
+ punpckhqdq xm0, xm0
+ punpcklwd xm0, xm1
+ pmaddwd xm0, xm7
+ pshufb xm1, xm15
+ pmaddwd xm1, xm8
+ paddd xm2, xm10
+ paddd xm2, xm3
+ paddd xm0, xm1
+ paddd xm2, xm0
+ movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
+.x_loop_ar2_inner:
+ pmovsxbw xm1, xm0
+ pmaddwd xm3, xm9, xm1
+ psrldq xm1, 4 ; y=0,x=0
+ paddd xm3, xm2
+ psrldq xm2, 4 ; shift top to next pixel
+ psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw since we only care about one value
+ paddw xm3, xm1
+ packsswb xm3, xm3
+ pextrb [bufq+xq], xm3, 0
+ pslldq xm3, 2
+ vpblendvb xm0, xm3, xm11
+ psrldq xm0, 1
+ inc xq
+ jz .x_loop_ar2_end
+ test xb, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+.x_loop_ar2_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+INIT_YMM avx2
+.ar3:
+%if WIN64
+ ; xmm6 and xmm7 already saved
+ %assign stack_offset 16
+ ALLOC_STACK 16*14
+ %assign stack_size stack_size - 16*4
+ %assign xmm_regs_used 12
+ movaps [rsp+16*12], xmm8
+ movaps [rsp+16*13], xmm9
+ movaps [rsp+16*14], xmm10
+ movaps [rsp+16*15], xmm11
+%else
+ ALLOC_STACK 16*12
+%endif
+ mov r6d, [fg_dataq+FGData.ar_coeff_shift]
+ movd xm11, [base+byte_blend]
+ pmovsxbw m1, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
+ pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
+ pshufd m0, m1, q0000
+ mova [rsp+16* 0], m0
+ pshufd m0, m1, q1111
+ mova [rsp+16* 2], m0
+ pshufd m0, m1, q2222
+ mova [rsp+16* 4], m0
+ pshufd m1, m1, q3333
+ mova [rsp+16* 6], m1
+ pshufd xm0, xm2, q0000
+ mova [rsp+16* 8], xm0
+ pshufd xm0, xm2, q1111
+ mova [rsp+16* 9], xm0
+ psrldq xm7, xm2, 10
+ mova m8, [base+gen_shufA]
+ pinsrw xm2, [base+pw_1], 5
+ mova m9, [base+gen_shufC]
+ pshufd xm2, xm2, q2222
+ movu m10, [base+gen_shufE]
+ vpbroadcastw xm6, [base+round_vals-12+r6*2]
+ pinsrw xm7, [base+round_vals+r6*2-10], 3
+ mova [rsp+16*10], xm2
+ DEFINE_ARGS buf, fg_data, h, x
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+.y_loop_ar3:
+ mov xq, -76
+.x_loop_ar3:
+ movu xm5, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
+ vinserti128 m5, [bufq+xq-82*2-3], 1 ; y=-2,x=[-3,+12]
+ movu xm4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
+ punpcklbw m3, m5, m5
+ punpckhwd m5, m4
+ psraw m3, 8
+ punpcklbw m5, m5
+ psraw m5, 8
+ punpcklbw xm4, xm4
+ psraw xm4, 8
+ pshufb m0, m3, m8
+ pmaddwd m0, [rsp+16*0]
+ pshufb m1, m3, m9
+ pmaddwd m1, [rsp+16*2]
+ shufps m2, m3, m5, q1032
+ paddd m0, m1
+ pshufb m1, m2, m8
+ vperm2i128 m3, m4, 0x21
+ pmaddwd m1, [rsp+16*4]
+ shufps xm2, xm3, q1021
+ vpblendd m2, m3, 0xf0
+ pshufb m2, m10
+ paddd m0, m1
+ pmaddwd m2, [rsp+16*6]
+ pshufb xm1, xm4, xm9
+ pmaddwd xm1, [rsp+16*8]
+ shufps xm4, xm5, q1132
+ paddd m0, m2
+ pshufb xm2, xm4, xm8
+ pshufd xm4, xm4, q2121
+ pmaddwd xm2, [rsp+16*9]
+ punpcklwd xm4, xm6
+ pmaddwd xm4, [rsp+16*10]
+ vextracti128 xm3, m0, 1
+ paddd xm0, xm1
+ movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
+ paddd xm2, xm4
+ paddd xm0, xm2
+ paddd xm0, xm3
+.x_loop_ar3_inner:
+ pmovsxbw xm2, xm1
+ pmaddwd xm2, xm7
+ pshufd xm3, xm2, q1111
+ paddd xm2, xm0 ; add top
+ paddd xm2, xm3 ; left+cur
+ psrldq xm0, 4
+ psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw since we only care about one value
+ packsswb xm2, xm2
+ pextrb [bufq+xq], xm2, 0
+ pslldq xm2, 3
+ vpblendvb xm1, xm2, xm11
+ psrldq xm1, 1
+ inc xq
+ jz .x_loop_ar3_end
+ test xb, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+.x_loop_ar3_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar3
+ RET
+
+%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
+INIT_XMM avx2
+cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv
+%define base r4-generate_grain_uv_%1_8bpc_avx2_table
+ lea r4, [generate_grain_uv_%1_8bpc_avx2_table]
+ vpbroadcastw xm0, [fg_dataq+FGData.seed]
+ mov r6d, [fg_dataq+FGData.grain_scale_shift]
+ movq xm1, [base+next_upperbit_mask]
+ movq xm4, [base+mul_bits]
+ movq xm5, [base+hmul_bits]
+ mova xm6, [base+pb_mask]
+ vpbroadcastw xm7, [base+round+r6*2]
+ vpbroadcastd xm2, [base+pw_seed_xor+uvq*4]
+ pxor xm0, xm2
+ lea r6, [gaussian_sequence]
+%if %2
+ mov r7d, 73-35*%3
+ add bufq, 44
+.loop_y:
+ mov r5, -44
+%else
+ mov r5, -73*82
+ sub bufq, r5
+%endif
+.loop:
+ pand xm2, xm0, xm1
+ psrlw xm3, xm2, 10
+ por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw xm2, xm4 ; bits 0x0f00 are set
+ pmulhuw xm0, xm5
+ pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm2, xm0 ; aggregate each bit into next seed's high bit
+ por xm2, xm3 ; 4 next output seeds
+ pshuflw xm0, xm2, q3333
+ psrlw xm2, 5
+ movq r8, xm2
+ movzx r9d, r8w
+ movd xm2, [r6+r9*2]
+ rorx r9, r8, 32
+ shr r8d, 16
+ pinsrw xm2, [r6+r8*2], 1
+ movzx r8d, r9w
+ pinsrw xm2, [r6+r8*2], 2
+ shr r9d, 16
+ pinsrw xm2, [r6+r9*2], 3
+ pmulhrsw xm2, xm7
+ packsswb xm2, xm2
+ movd [bufq+r5], xm2
+ add r5, 4
+ jl .loop
+%if %2
+ add bufq, 82
+ dec r7d
+ jg .loop_y
+%endif
+
+ ; auto-regression code
+ movsxd r6, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r6, [base+generate_grain_uv_%1_8bpc_avx2_table+r6*4]
+ add r6, r4
+ jmp r6
+
+INIT_YMM avx2
+.ar0:
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ movd xm3, [base+hmul_bits+shiftq*2]
+ DEFINE_ARGS buf, bufy, h
+ pmovsxbw xm2, xm2
+%if %2
+ vpbroadcastd m7, [base+pb_1]
+ vpbroadcastw m6, [base+hmul_bits+2+%3*2]
+%endif
+ vpbroadcastw m2, xm2
+ vpbroadcastw m3, xm3
+ pxor m12, m12
+%if %2
+ sub bufq, 82*(73-35*%3)+82-(82*3+41)
+%else
+ sub bufq, 82*70-3
+%endif
+ add bufyq, 3+82*3
+ mov hd, 70-35*%3
+.y_loop_ar0:
+%if %2
+ ; first 32 pixels
+ movu xm4, [bufyq]
+ vinserti128 m4, [bufyq+32], 1
+%if %3
+ movu xm0, [bufyq+82]
+ vinserti128 m0, [bufyq+82+32], 1
+%endif
+ movu xm5, [bufyq+16]
+ vinserti128 m5, [bufyq+48], 1
+%if %3
+ movu xm1, [bufyq+82+16]
+ vinserti128 m1, [bufyq+82+48], 1
+%endif
+ pmaddubsw m4, m7, m4
+%if %3
+ pmaddubsw m0, m7, m0
+%endif
+ pmaddubsw m5, m7, m5
+%if %3
+ pmaddubsw m1, m7, m1
+ paddw m4, m0
+ paddw m5, m1
+%endif
+ pmulhrsw m4, m6
+ pmulhrsw m5, m6
+%else
+ xor r3d, r3d
+ ; first 32x2 pixels
+.x_loop_ar0:
+ movu m4, [bufyq+r3]
+ pcmpgtb m0, m12, m4
+ punpckhbw m5, m4, m0
+ punpcklbw m4, m0
+%endif
+ pmullw m4, m2
+ pmullw m5, m2
+ pmulhrsw m4, m3
+ pmulhrsw m5, m3
+%if %2
+ movu m1, [bufq]
+%else
+ movu m1, [bufq+r3]
+%endif
+ pcmpgtb m8, m12, m1
+ punpcklbw m0, m1, m8
+ punpckhbw m1, m8
+ paddw m0, m4
+ paddw m1, m5
+ packsswb m0, m1
+%if %2
+ movu [bufq], m0
+%else
+ movu [bufq+r3], m0
+ add r3d, 32
+ cmp r3d, 64
+ jl .x_loop_ar0
+%endif
+
+ ; last 6/12 pixels
+ movu xm4, [bufyq+32*2]
+%if %2
+%if %3
+ movu xm5, [bufyq+32*2+82]
+%endif
+ pmaddubsw xm4, xm7, xm4
+%if %3
+ pmaddubsw xm5, xm7, xm5
+ paddw xm4, xm5
+%endif
+ movq xm0, [bufq+32]
+ pmulhrsw xm4, xm6
+ pmullw xm4, xm2
+ pmulhrsw xm4, xm3
+ pcmpgtb xm5, xm12, xm0
+ punpcklbw xm5, xm0, xm5
+ paddw xm4, xm5
+ packsswb xm4, xm4
+ pblendw xm0, xm4, xm0, 1000b
+ movq [bufq+32], xm0
+%else
+ movu xm0, [bufq+64]
+ pcmpgtb xm1, xm12, xm4
+ punpckhbw xm5, xm4, xm1
+ punpcklbw xm4, xm1
+ pmullw xm5, xm2
+ pmullw xm4, xm2
+ vpblendd xm1, xm3, xm12, 0x0c
+ pmulhrsw xm5, xm1
+ pmulhrsw xm4, xm3
+ pcmpgtb xm1, xm12, xm0
+ punpckhbw xm8, xm0, xm1
+ punpcklbw xm0, xm1
+ paddw xm5, xm8
+ paddw xm0, xm4
+ packsswb xm0, xm5
+ movu [bufq+64], xm0
+%endif
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar0
+ RET
+
+INIT_XMM avx2
+.ar1:
+ DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+ movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
+ DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
+ pmovsxbw xm4, xm4
+ pshufd xm5, xm4, q1111
+ pshufd xm4, xm4, q0000
+ pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd
+%if %2
+ vpbroadcastd xm7, [base+pb_1]
+ vpbroadcastw xm6, [base+hmul_bits+2+%3*2]
+%endif
+ vpbroadcastd xm3, xm3
+%if %2
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*70-(82-3)
+%endif
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+ mov mind, -128
+ mov maxd, 127
+.y_loop_ar1:
+ mov xq, -(76>>%2)
+ movsx val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+ pmovsxbw xm0, [bufq+xq-82-1] ; top/left
+%if %2
+ movq xm8, [bufyq+xq*2]
+%if %3
+ movq xm9, [bufyq+xq*2+82]
+%endif
+%endif
+ psrldq xm2, xm0, 2 ; top
+ psrldq xm1, xm0, 4 ; top/right
+%if %2
+ pmaddubsw xm8, xm7, xm8
+%if %3
+ pmaddubsw xm9, xm7, xm9
+ paddw xm8, xm9
+%endif
+ pmulhrsw xm8, xm6
+%else
+ pmovsxbw xm8, [bufyq+xq]
+%endif
+ punpcklwd xm0, xm2
+ punpcklwd xm1, xm8
+ pmaddwd xm0, xm4
+ pmaddwd xm1, xm5
+ paddd xm0, xm1
+ paddd xm0, xm3
+.x_loop_ar1_inner:
+ movd val0d, xm0
+ psrldq xm0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sarx val3d, val3d, shiftd
+ movsx val0d, byte [bufq+xq]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovns val3d, maxd
+ cmp val3d, mind
+ cmovs val3d, mind
+ mov byte [bufq+xq], val3b
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar1
+ RET
+
+.ar2:
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ vpbroadcastw xm13, [base+round_vals-12+shiftq*2]
+ pmovsxbw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7
+ pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12
+ pinsrw xm0, [base+pw_1], 5
+%if %2
+ vpbroadcastw xm12, [base+hmul_bits+2+%3*2]
+ vpbroadcastd xm11, [base+pb_1]
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+ pshufd xm4, xm7, q0000
+ pshufd xm5, xm7, q3333
+ pshufd xm6, xm7, q1111
+ pshufd xm7, xm7, q2222
+ pshufd xm8, xm0, q0000
+ pshufd xm9, xm0, q1111
+ pshufd xm10, xm0, q2222
+%if %2
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*70-(82-3)
+%endif
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+.y_loop_ar2:
+ mov xq, -(76>>%2)
+
+.x_loop_ar2:
+ pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
+ pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
+ pshufb xm2, xm0, [base+gen_shufA]
+ pmaddwd xm2, xm4
+ pshufb xm3, xm1, [base+gen_shufB]
+ pmaddwd xm3, xm5
+ paddd xm2, xm3
+ pshufb xm3, xm0, [base+gen_shufC]
+ pmaddwd xm3, xm6
+ punpckhqdq xm0, xm0 ; y=-2,x=[+2,+5]
+ punpcklwd xm0, xm1
+ pmaddwd xm0, xm7
+ pshufb xm1, [gen_shufD]
+ pmaddwd xm1, xm8
+ paddd xm2, xm3
+ paddd xm0, xm1
+ paddd xm2, xm0
+
+%if %2
+ movq xm0, [bufyq+xq*2]
+%if %3
+ movq xm3, [bufyq+xq*2+82]
+%endif
+ pmaddubsw xm0, xm11, xm0
+%if %3
+ pmaddubsw xm3, xm11, xm3
+ paddw xm0, xm3
+%endif
+ pmulhrsw xm0, xm12
+%else
+ pmovsxbw xm0, [bufyq+xq]
+%endif
+ punpcklwd xm0, xm13
+ pmaddwd xm0, xm10
+ paddd xm2, xm0
+
+ movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
+.x_loop_ar2_inner:
+ pmovsxbw xm0, xm0
+ pmaddwd xm3, xm0, xm9
+ psrldq xm0, 2
+ paddd xm3, xm2
+ psrldq xm2, 4 ; shift top to next pixel
+ psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
+ pslldq xm3, 2
+ paddw xm3, xm0
+ pblendw xm0, xm3, 00000010b
+ packsswb xm0, xm0
+ pextrb [bufq+xq], xm0, 1
+ inc xq
+ jz .x_loop_ar2_end
+ test xb, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+INIT_YMM avx2
+.ar3:
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ pmovsxbw m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15
+ pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23
+ vpbroadcastb xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma]
+ movd xm13, [base+round_vals-10+shiftq*2]
+ vpbroadcastd xm14, [base+round_vals-14+shiftq*2]
+ pshufd m6, m0, q0000
+ pshufd m7, m0, q1111
+ pshufd m8, m0, q2222
+ pshufd m9, m0, q3333
+ pshufd xm10, xm1, q0000
+ pshufd xm11, xm1, q1111
+ pshufhw xm12, xm1, q0000
+ psraw xm2, 8
+ palignr xm13, xm1, 10
+ punpckhwd xm12, xm2 ; interleave luma cf
+ psrld xm14, 16
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+%if %2
+ vpbroadcastw xm15, [base+hmul_bits+2+%3*2]
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*70-(82-3)
+%endif
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+.y_loop_ar3:
+ mov xq, -(76>>%2)
+.x_loop_ar3:
+ vbroadcasti128 m3, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
+ palignr xm1, xm3, [bufq+xq-82*3-9], 6 ; y=-3,x=[-3,+12]
+ vbroadcasti128 m4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
+ vpblendd m3, m1, 0x0f
+ pxor m0, m0
+ pcmpgtb m2, m0, m3
+ pcmpgtb m0, m4
+ punpcklbw m1, m3, m2
+ punpckhbw m3, m2
+ punpcklbw m2, m4, m0
+ punpckhbw xm4, xm0
+ pshufb m0, m1, [base+gen_shufA]
+ pmaddwd m0, m6
+ pshufb m5, m1, [base+gen_shufC]
+ pmaddwd m5, m7
+ shufps m1, m3, q1032
+ paddd m0, m5
+ pshufb m5, m1, [base+gen_shufA]
+ pmaddwd m5, m8
+ shufps xm1, xm3, q2121
+ vpblendd m1, m2, 0xf0
+ pshufb m1, [base+gen_shufE]
+ pmaddwd m1, m9
+ paddd m0, m5
+ pshufb xm3, xm2, [base+gen_shufC]
+ paddd m0, m1
+ pmaddwd xm3, xm10
+ palignr xm1, xm4, xm2, 2
+ punpckhwd xm1, xm2, xm1
+ pmaddwd xm1, xm11
+ palignr xm4, xm2, 12
+ paddd xm3, xm1
+%if %2
+ vpbroadcastd xm5, [base+pb_1]
+ movq xm1, [bufyq+xq*2]
+ pmaddubsw xm1, xm5, xm1
+%if %3
+ movq xm2, [bufyq+xq*2+82]
+ pmaddubsw xm5, xm2
+ paddw xm1, xm5
+%endif
+ pmulhrsw xm1, xm15
+%else
+ pmovsxbw xm1, [bufyq+xq]
+%endif
+ punpcklwd xm4, xm1
+ pmaddwd xm4, xm12
+ movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
+ vextracti128 xm2, m0, 1
+ paddd xm0, xm14
+ paddd xm3, xm4
+ paddd xm0, xm3
+ paddd xm0, xm2
+.x_loop_ar3_inner:
+ pmovsxbw xm1, xm1
+ pmaddwd xm2, xm13, xm1
+ pshuflw xm3, xm2, q1032
+ paddd xm2, xm0 ; add top
+ paddd xm2, xm3 ; left+cur
+ psrldq xm0, 4
+ psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
+ psrldq xm1, 2
+ ; don't packssdw, we only care about one value
+ punpckldq xm2, xm2
+ pblendw xm1, xm2, 0100b
+ packsswb xm1, xm1
+ pextrb [bufq+xq], xm1, 2
+ inc xq
+ jz .x_loop_ar3_end
+ test xb, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+.x_loop_ar3_end:
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar3
+ RET
+%endmacro
+
+INIT_YMM avx2
+cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, see, overlap
+%define base r9-pd_m65536
+ lea r9, [pd_m65536]
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov sbyd, sbym
+ mov overlapd, [fg_dataq+FGData.overlap_flag]
+ vpbroadcastd m8, [base+pd_m65536]
+ vpbroadcastw m9, [base+mul_bits+r6*2-14]
+ vpbroadcastd m10, [base+fg_min+r7*4]
+ vpbroadcastd m11, [base+fg_max+r7*8]
+ vpbroadcastd m12, [base+pw_1024]
+ movq xm13, [base+pb_27_17_17_27]
+ test sbyd, sbyd
+ setnz r7b
+ pxor m7, m7
+ test r7b, overlapb
+ jnz .vertical_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
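+ ; i.e. seed = FGData.seed ^ ((((sby*37 + 178) & 0xff) << 8) |
+ ;                            ((sby*173 + 105) & 0xff))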
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, overlap
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstq, srcq
+
+.loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
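+ ; 747 = (3+6)*82 + 3+6, the luma instance of the
+ ; (3+(6>>ss_y))*82 + 3+(6>>ss_x) offset used in the chroma variants below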
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, overlap
+
+ mov hd, hm
+ mov grain_lutq, grain_lutmp
+.loop_y:
+ ; src
+ mova m2, [srcq]
+ punpcklbw m0, m2, m7
+ punpckhbw m1, m2, m7
+
+ ; scaling[src]
+ pandn m4, m8, m0
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, m0, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m1
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
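+ ; vpgatherdd zeroes its mask register, hence the m6<->m8 save/restore; the
+ ; -0/-2 address offsets plus pblendw 0xaa interleave the even and odd
+ ; pixels' scaling bytes back into word lanes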
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m5, [grain_lutq+offxyq]
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+ mova [dstq+srcq], m0
+
+ add srcq, strideq
+ add grain_lutq, 82
+ dec hd
+ jg .loop_y
+
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+ test overlapd, overlapd
+ jz .loop_x
+
+ ; r8m = sbym
+ cmp dword r8m, 0
+ jne .loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy
+
+ lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y_h_overlap:
+ ; src
+ mova m2, [srcq]
+ punpcklbw m0, m2, m7
+ punpckhbw m1, m2, m7
+
+ ; scaling[src]
+ pandn m4, m8, m0
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, m0, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m1
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m5, [grain_lutq+offxyq]
+ movd xm4, [grain_lutq+left_offxyq]
+ punpcklbw xm4, xm5
+ pmaddubsw xm4, xm13, xm4
+ pmulhrsw xm4, xm12
+ packsswb xm4, xm4
+ vpblendd m4, m5, 0xfe
+ punpckhbw m5, m7
+ punpcklbw m4, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+ mova [dstq+srcq], m0
+
+ add srcq, strideq
+ add grain_lutq, 82
+ dec hd
+ jg .loop_y_h_overlap
+
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+
+ ; r8m = sbym
+ cmp dword r8m, 0
+ jne .loop_x_hv_overlap
+ jmp .loop_x_h_overlap
+
+.vertical_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused, sby, see, overlap
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, overlap
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstq, srcq
+
+.loop_x_v_overlap:
+ vpbroadcastd m14, [pb_27_17]
+
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, overlap, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+.loop_y_v_overlap:
+ ; src
+ mova m2, [srcq]
+ punpcklbw m0, m2, m7
+ punpckhbw m1, m2, m7
+
+ ; scaling[src]
+ pandn m4, m8, m0
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, m0, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m1
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m6, [grain_lutq+offxyq]
+ movu m4, [grain_lutq+top_offxyq]
+ punpcklbw m5, m4, m6
+ punpckhbw m4, m6
+ pmaddubsw m5, m14, m5
+ pmaddubsw m4, m14, m4
+ pmulhrsw m5, m12
+ pmulhrsw m4, m12
+ packsswb m5, m4
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+ mova [dstq+srcq], m0
+
+ add srcq, strideq
+ add grain_lutq, 82
+ dec hb
+ jz .end_y_v_overlap
+ vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
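+ ; (the first 'add' below only sets the sign bit of hd; the second carries,
+ ; so jnc is taken exactly once)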
+ add hd, 0x80000000
+ jnc .loop_y_v_overlap
+ jmp .loop_y
+.end_y_v_overlap:
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall through to
+ ; h+v overlap
+.loop_x_hv_overlap:
+ vpbroadcastd m14, [pb_27_17]
+
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyd, [top_offxyq+32]
+ lea left_offxyd, [offyq+32]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+.loop_y_hv_overlap:
+ ; src
+ mova m2, [srcq]
+ punpcklbw m0, m2, m7
+ punpckhbw m1, m2, m7
+
+ ; scaling[src]
+ pandn m4, m8, m0
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, m0, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m1
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m6, [grain_lutq+offxyq]
+ movd xm7, [grain_lutq+left_offxyq]
+ movu m4, [grain_lutq+top_offxyq]
+ movd xm5, [grain_lutq+topleft_offxyq]
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw xm7, xm6
+ punpcklbw xm5, xm4
+ pmaddubsw xm7, xm13, xm7
+ pmaddubsw xm5, xm13, xm5
+ pmulhrsw xm7, xm12
+ pmulhrsw xm5, xm12
+ packsswb xm7, xm7
+ packsswb xm5, xm5
+ vpblendd m7, m6, 0xfe
+ vpblendd m5, m4, 0xfe
+ ; followed by v interpolation (top | cur -> cur)
+ punpckhbw m4, m6
+ punpcklbw m5, m7
+ pmaddubsw m4, m14, m4
+ pmaddubsw m5, m14, m5
+ pmulhrsw m4, m12
+ pmulhrsw m5, m12
+ pxor m7, m7
+ packsswb m5, m4
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+ mova [dstq+srcq], m0
+
+ add srcq, strideq
+ add grain_lutq, 82
+ dec hb
+ jz .end_y_hv_overlap
+ vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+ add hd, 0x80000000
+ jnc .loop_y_hv_overlap
+ jmp .loop_y_h_overlap
+.end_y_hv_overlap:
+ add wq, 32
+ lea srcq, [src_bakq+wq]
+ jl .loop_x_hv_overlap
+.end:
+ RET
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, luma, overlap, uv_pl, is_id
+%define base r11-pd_m65536
+ lea r11, [pd_m65536]
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r9d, is_idm
+ mov sbyd, sbym
+ mov overlapd, [fg_dataq+FGData.overlap_flag]
+ vpbroadcastd m8, [base+pd_m65536]
+ vpbroadcastw m9, [base+mul_bits+r6*2-14]
+ vpbroadcastd m10, [base+fg_min+r7*4]
+ shlx r7d, r7d, r9d
+ vpbroadcastd m11, [base+fg_max+r7*4]
+ vpbroadcastd m12, [base+pw_1024]
+ pxor m7, m7
+ test sbyd, sbyd
+ setnz r7b
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, sby, see, overlap, uv_pl
+%if %1
+ mov r6d, uv_plm
+ vpbroadcastd m0, [base+pw_8]
+ vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4]
+ vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4]
+ pshufb m14, m0 ; uv_luma_mult, uv_mult
+%elif %2
+ vpbroadcastq m15, [base+pb_23_22]
+%else
+ vpbroadcastq xm15, [base+pb_27_17_17_27]
+%endif
+%if %3
+ vpbroadcastw m13, [base+pb_23_22]
+%elif %2
+ pshufd m13, [base+pb_27_17], q0000 ; 8x27_17, 8x17_27
+%endif
+ test r7b, overlapb
+ jnz %%vertical_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ unused2, unused3, see, overlap, unused4, unused5, lstride
+
+ mov lumaq, r9mp
+ lea r12, [srcq+wq]
+ lea r13, [dstq+wq]
+ lea r14, [lumaq+wq*(1+%2)]
+ mov r11mp, r12
+ mov r12mp, r13
+ mov lstrideq, r10mp
+ neg wq
+
+%%loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, overlap, unused1, unused2, lstride
+
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, overlap, unused1, unused2, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y:
+ ; src
+%if %2
+ mova xm3, [lumaq+lstrideq*0+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1+%3) +0], 1
+ vpbroadcastd m2, [pb_1]
+ mova xm0, [lumaq+lstrideq*0+16]
+ vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1
+ mova xm1, [srcq]
+ vinserti128 m1, [srcq+strideq], 1
+ pmaddubsw m3, m2
+ pmaddubsw m0, m2
+ pavgw m3, m7
+ pavgw m0, m7
+%else
+ mova m2, [lumaq]
+ mova m1, [srcq]
+%endif
+%if %1
+%if %2
+ packuswb m2, m3, m0 ; luma
+%endif
+ punpckhbw m3, m2, m1
+ punpcklbw m2, m1 ; { luma, chroma }
+ pmaddubsw m3, m14
+ pmaddubsw m2, m14
+ psraw m3, 6
+ psraw m2, 6
+ paddw m3, m15
+ paddw m2, m15
+ packuswb m2, m3 ; pack+unpack = clip
+%endif
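+ ; i.e. the scaling-LUT index for the non-csfl path is
+ ; clip(((luma*uv_luma_mult + chroma*uv_mult) >> 6) + uv_offset)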
+%if %1 || %2 == 0
+ punpcklbw m3, m2, m7
+ punpckhbw m0, m2, m7
+%endif
+
+ ; scaling[luma_src]
+ pandn m4, m8, m3
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m0
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ psrld m0, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m0-2], m6
+ pblendw m2, m4, 0xaa
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %2
+ movu xm5, [grain_lutq+offxyq+ 0]
+ vinserti128 m5, [grain_lutq+offxyq+82], 1
+%else
+ movu m5, [grain_lutq+offxyq]
+%endif
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; unpack chroma_source
+ punpcklbw m0, m1, m7
+ punpckhbw m1, m7
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+%if %2
+ mova [dstq], xm0
+ vextracti128 [dstq+strideq], m0, 1
+%else
+ mova [dstq], m0
+%endif
+
+%if %2
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82<<%2
+ sub hb, 1+%2
+ jg %%loop_y
+
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r14+wq*(1+%2)]
+ add srcq, wq
+ add dstq, wq
+ test overlapd, overlapd
+ jz %%loop_x
+
+ ; r8m = sbym
+ cmp dword r8m, 0
+ jne %%loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, lstride
+
+ lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, unused1, unused2, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y_h_overlap:
+ ; src
+%if %2
+ mova xm3, [lumaq+lstrideq*0+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1
+ vpbroadcastd m2, [pb_1]
+ mova xm0, [lumaq+lstrideq*0+16]
+ vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1
+ mova xm1, [srcq]
+ vinserti128 m1, [srcq+strideq], 1
+ pmaddubsw m3, m2
+ pmaddubsw m0, m2
+ pavgw m3, m7
+ pavgw m0, m7
+%else
+ mova m2, [lumaq]
+ mova m1, [srcq]
+%endif
+%if %1
+%if %2
+ packuswb m2, m3, m0 ; luma
+%endif
+ punpckhbw m3, m2, m1
+ punpcklbw m2, m1 ; { luma, chroma }
+ pmaddubsw m3, m14
+ pmaddubsw m2, m14
+ psraw m3, 6
+ psraw m2, 6
+ paddw m3, m15
+ paddw m2, m15
+ packuswb m2, m3 ; pack+unpack = clip
+%endif
+%if %1 || %2 == 0
+ punpcklbw m3, m2, m7
+ punpckhbw m0, m2, m7
+%endif
+
+ ; scaling[luma_src]
+ pandn m4, m8, m3
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m0
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ psrld m0, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m0-2], m6
+ pblendw m2, m4, 0xaa
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %2
+ movu xm5, [grain_lutq+offxyq+ 0]
+ vinserti128 m5, [grain_lutq+offxyq+82], 1
+ movd xm4, [grain_lutq+left_offxyq+ 0]
+ vinserti128 m4, [grain_lutq+left_offxyq+82], 1
+ punpcklbw m4, m5
+%if %1
+ vpbroadcastq m0, [pb_23_22]
+ pmaddubsw m4, m0, m4
+%else
+ pmaddubsw m4, m15, m4
+%endif
+ pmulhrsw m4, m12
+ packsswb m4, m4
+ vpblendd m4, m5, 0xee
+%else
+ movu m5, [grain_lutq+offxyq]
+ movd xm4, [grain_lutq+left_offxyq]
+ punpcklbw xm4, xm5
+%if %1
+ movq xm0, [pb_27_17_17_27]
+ pmaddubsw xm4, xm0, xm4
+%else
+ pmaddubsw xm4, xm15, xm4
+%endif
+ pmulhrsw xm4, xm12
+ packsswb xm4, xm4
+ vpblendd m4, m5, 0xfe
+%endif
+ punpckhbw m5, m7
+ punpcklbw m4, m7
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; unpack chroma_source
+ punpcklbw m0, m1, m7
+ punpckhbw m1, m7
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+%if %2
+ mova [dstq], xm0
+ vextracti128 [dstq+strideq], m0, 1
+%else
+ mova [dstq], m0
+%endif
+
+%if %2
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*(1+%2)
+ sub hb, 1+%2
+ jg %%loop_y_h_overlap
+
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r14+wq*(1+%2)]
+ add srcq, wq
+ add dstq, wq
+
+ ; r8m = sbym
+ cmp dword r8m, 0
+ jne %%loop_x_hv_overlap
+ jmp %%loop_x_h_overlap
+
+%%vertical_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
+ sby, see, overlap, unused1, unused2, lstride
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ unused1, unused2, see, overlap, unused3, unused4, lstride
+
+ mov lumaq, r9mp
+ lea r12, [srcq+wq]
+ lea r13, [dstq+wq]
+ lea r14, [lumaq+wq*(1+%2)]
+ mov r11mp, r12
+ mov r12mp, r13
+ mov lstrideq, r10mp
+ neg wq
+
+%%loop_x_v_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, overlap, top_offxy, unused, lstride
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, overlap, top_offxy, unused, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%if %2 == 0
+ vpbroadcastd m13, [pb_27_17]
+%endif
+%%loop_y_v_overlap:
+ ; src
+%if %2
+ mova xm3, [lumaq+lstrideq*0+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1
+ vpbroadcastd m2, [pb_1]
+ mova xm0, [lumaq+lstrideq*0+16]
+ vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1
+ mova xm1, [srcq]
+ vinserti128 m1, [srcq+strideq], 1
+ pmaddubsw m3, m2
+ pmaddubsw m0, m2
+ pavgw m3, m7
+ pavgw m0, m7
+%else
+ mova m2, [lumaq]
+ mova m1, [srcq]
+%endif
+%if %1
+%if %2
+ packuswb m2, m3, m0 ; luma
+%endif
+ punpckhbw m3, m2, m1
+ punpcklbw m2, m1 ; { luma, chroma }
+ pmaddubsw m3, m14
+ pmaddubsw m2, m14
+ psraw m3, 6
+ psraw m2, 6
+ paddw m3, m15
+ paddw m2, m15
+ packuswb m2, m3 ; pack+unpack = clip
+%endif
+%if %1 || %2 == 0
+ punpcklbw m3, m2, m7
+ punpckhbw m0, m2, m7
+%endif
+
+ ; scaling[luma_src]
+ pandn m4, m8, m3
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m0
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ psrld m0, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m0-2], m6
+ pblendw m2, m4, 0xaa
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %3 == 0
+%if %2
+ movu xm0, [grain_lutq+offxyq]
+ vinserti128 m0, [grain_lutq+offxyq+82], 1
+ movu xm4, [grain_lutq+top_offxyq]
+ vinserti128 m4, [grain_lutq+top_offxyq+82], 1
+%else
+ movu m0, [grain_lutq+offxyq]
+ movu m4, [grain_lutq+top_offxyq]
+%endif
+ punpcklbw m5, m4, m0
+ punpckhbw m4, m0
+ pmaddubsw m5, m13, m5
+ pmaddubsw m4, m13, m4
+ pmulhrsw m5, m12
+ pmulhrsw m4, m12
+ packsswb m5, m4
+%else
+ movq xm4, [grain_lutq+offxyq]
+ vinserti128 m4, [grain_lutq+offxyq+8], 1
+ movq xm5, [grain_lutq+top_offxyq]
+ vinserti128 m5, [grain_lutq+top_offxyq+8], 1
+ punpcklbw m5, m4
+ pmaddubsw m5, m13, m5
+ pmulhrsw m5, m12
+ vextracti128 xm4, m5, 1
+ packsswb xm5, xm4
+ ; only interpolate first line, insert second line unmodified
+ vinserti128 m5, [grain_lutq+offxyq+82], 1
+%endif
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; unpack chroma_source
+ punpcklbw m0, m1, m7
+ punpckhbw m1, m7
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+%if %2
+ mova [dstq], xm0
+ vextracti128 [dstq+strideq], m0, 1
+%else
+ mova [dstq], m0
+%endif
+
+ sub hb, 1+%2
+ jle %%end_y_v_overlap
+%if %2
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82<<%2
+%if %2 == 0
+ vpbroadcastd m13, [pb_17_27]
+ add hd, 0x80000000
+ jnc %%loop_y_v_overlap
+%endif
+ jmp %%loop_y
+
+%%end_y_v_overlap:
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r14+wq*(1+%2)]
+ add srcq, wq
+ add dstq, wq
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall through to
+ ; h+v overlap
+
+%%loop_x_hv_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+ lea topleft_offxyd, [top_offxyq+(32>>%2)]
+ lea left_offxyd, [offyq+(32>>%2)]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%if %2 == 0
+ vpbroadcastd m13, [pb_27_17]
+%endif
+%%loop_y_hv_overlap:
+ ; src
+%if %2
+ mova xm3, [lumaq+lstrideq*0+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1
+ vpbroadcastd m2, [pb_1]
+ mova xm0, [lumaq+lstrideq*0+16]
+ vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1
+ mova xm1, [srcq]
+ vinserti128 m1, [srcq+strideq], 1
+ pmaddubsw m3, m2
+ pmaddubsw m0, m2
+ pavgw m3, m7
+ pavgw m0, m7
+%else
+ mova m2, [lumaq]
+ mova m1, [srcq]
+%endif
+%if %1
+%if %2
+ packuswb m2, m3, m0 ; luma
+%endif
+ punpckhbw m3, m2, m1
+ punpcklbw m2, m1 ; { luma, chroma }
+ pmaddubsw m3, m14
+ pmaddubsw m2, m14
+ psraw m3, 6
+ psraw m2, 6
+ paddw m3, m15
+ paddw m2, m15
+ packuswb m2, m3 ; pack+unpack = clip
+%endif
+%if %1 || %2 == 0
+ punpcklbw m3, m2, m7
+ punpckhbw m0, m2, m7
+%endif
+
+ ; scaling[luma_src]
+ pandn m4, m8, m3
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m0
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ psrld m0, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m0-2], m6
+ pblendw m2, m4, 0xaa
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %2
+ movu xm4, [grain_lutq+offxyq]
+ vinserti128 m4, [grain_lutq+offxyq+82], 1
+ movd xm0, [grain_lutq+left_offxyq]
+ vinserti128 m0, [grain_lutq+left_offxyq+82], 1
+ movd xm6, [grain_lutq+topleft_offxyq]
+%if %3
+ movq xm5, [grain_lutq+top_offxyq]
+ vinserti128 m5, [grain_lutq+top_offxyq+8], 1
+%else
+ vinserti128 m6, [grain_lutq+topleft_offxyq+82], 1
+ movu xm5, [grain_lutq+top_offxyq]
+ vinserti128 m5, [grain_lutq+top_offxyq+82], 1
+%endif
+
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m0, m4
+%if %3
+ punpcklbw xm6, xm5
+%else
+ punpcklbw m6, m5
+%endif
+ punpcklqdq m0, m6
+%if %1
+ vpbroadcastq m6, [pb_23_22]
+ pmaddubsw m0, m6, m0
+%else
+ pmaddubsw m0, m15, m0
+%endif
+ pmulhrsw m0, m12
+ packsswb m0, m0
+ vpblendd m4, m0, 0x11
+%if %3
+ pshuflw xm0, xm0, q1032
+ vpblendd m5, m0, 0x01
+%else
+ pshuflw m0, m0, q1032
+ vpblendd m5, m0, 0x11
+%endif
+%else
+ movu m4, [grain_lutq+offxyq]
+ movd xm0, [grain_lutq+left_offxyq]
+ movu m5, [grain_lutq+top_offxyq]
+ movd xm6, [grain_lutq+topleft_offxyq]
+ punpcklbw xm0, xm4
+ punpcklbw xm6, xm5
+ punpcklqdq xm0, xm6
+%if %1
+ vpbroadcastq xm6, [pb_27_17_17_27]
+ pmaddubsw xm0, xm6, xm0
+%else
+ pmaddubsw xm0, xm15, xm0
+%endif
+ pmulhrsw xm0, xm12
+ packsswb xm0, xm0
+ vpblendd m4, m0, 0x01
+ pshuflw xm0, xm0, q1032
+ vpblendd m5, m0, 0x01
+%endif
+
+ ; followed by v interpolation (top | cur -> cur)
+%if %3
+ vpermq m0, m4, q3120
+ punpcklbw m5, m0
+ pmaddubsw m5, m13, m5
+ pmulhrsw m5, m12
+ vextracti128 xm0, m5, 1
+ packsswb xm5, xm0
+ vpblendd m5, m4, 0xf0
+%else
+ punpckhbw m0, m5, m4
+ punpcklbw m5, m4
+ pmaddubsw m4, m13, m0
+ pmaddubsw m5, m13, m5
+ pmulhrsw m4, m12
+ pmulhrsw m5, m12
+ packsswb m5, m4
+%endif
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; unpack chroma source
+ punpcklbw m0, m1, m7
+ punpckhbw m1, m7
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+%if %2
+ mova [dstq], xm0
+ vextracti128 [dstq+strideq], m0, 1
+%else
+ mova [dstq], m0
+%endif
+
+%if %2
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82<<%2
+ sub hb, 1+%2
+%if %2
+ jg %%loop_y_h_overlap
+%else
+ je %%end_y_hv_overlap
+ vpbroadcastd m13, [pb_17_27]
+ add hd, 0x80000000
+ jnc %%loop_y_hv_overlap
+ jmp %%loop_y_h_overlap
+%endif
+
+%%end_y_hv_overlap:
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r14+wq*(1+%2)]
+ add srcq, wq
+ add dstq, wq
+ jmp %%loop_x_hv_overlap
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+.end:
+ RET
+%endmacro
+
+GEN_GRAIN_UV_FN 420, 1, 1
+FGUV_FN 420, 1, 1
+GEN_GRAIN_UV_FN 422, 1, 0
+FGUV_FN 422, 1, 0
+GEN_GRAIN_UV_FN 444, 0, 0
+FGUV_FN 444, 0, 0
+
+%endif ; ARCH_X86_64
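
Note: the "%if %1" branch in the chroma loops above, and in the AVX-512
version that follows, builds the index used for the scaling-LUT gather
when chroma_scaling_from_luma is not set: if the chroma plane is
horizontally subsampled, the two neighbouring luma samples are first
averaged (pmaddubsw with pb_1 plus pavgw), then luma and chroma are
mixed with uv_luma_mult/uv_mult, biased by uv_offset and clipped to
8 bits ("pack+unpack = clip"). A minimal scalar sketch of that mapping,
with illustrative names rather than dav1d API:

    #include <stdint.h>

    /* Index into the chroma scaling LUT for the not-csfl path:
     * mix the (horizontally averaged) luma and the chroma sample,
     * bias by uv_offset and clip to 8 bits.  Illustrative sketch. */
    static inline uint8_t fguv_scaling_index(int luma, int chroma,
                                             int uv_luma_mult, int uv_mult,
                                             int uv_offset)
    {
        const int v = ((luma * uv_luma_mult + chroma * uv_mult) >> 6)
                      + uv_offset;
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

In the csfl path (%1 == 0) the averaged luma value is used directly as
the LUT index instead.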
diff --git a/third_party/dav1d/src/x86/filmgrain_avx512.asm b/third_party/dav1d/src/x86/filmgrain_avx512.asm
new file mode 100644
index 0000000000..317ec118b3
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain_avx512.asm
@@ -0,0 +1,813 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+pb_even: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+ db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
+ db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
+pb_odd: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
+ db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
+interleave_hl: db 8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7
+pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32
+pb_23_22_0_32: db 23, 22, 0, 32, 0, 32, 0, 32
+pb_27_17: times 2 db 27, 17
+pb_23_22: times 2 db 23, 22
+pw_8: times 2 dw 8
+pw_1024: times 2 dw 1024
+pb_17_27: times 2 db 17, 27
+fg_max: times 4 db 255
+ times 4 db 240
+ times 4 db 235
+fg_min: times 4 db 0
+ times 4 db 16
+noise_rnd: times 2 dw 128
+ times 2 dw 64
+ times 2 dw 32
+ times 2 dw 16
+
+SECTION .text
+
+INIT_ZMM avx512icl
+cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, see, overlap
+%define base r11-fg_min
+ lea r11, [fg_min]
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov sbyd, sbym
+ mov overlapd, [fg_dataq+FGData.overlap_flag]
+ mov r12, 0x0000000f0000000f ; h_overlap mask
+ mova m0, [scalingq+64*0]
+ mova m1, [scalingq+64*1]
+ mova m2, [scalingq+64*2]
+ mova m3, [scalingq+64*3]
+ kmovq k1, r12
+ vbroadcasti32x4 m4, [base+interleave_hl]
+ vpbroadcastd ym16, [base+pb_27_17]
+ vpbroadcastd m12, [base+pb_17_27]
+ vpbroadcastd m6, [base+noise_rnd+r6*4-32]
+ test sbyd, sbyd
+ setnz r6b
+ vpbroadcastd m7, [base+fg_min+r7*4]
+ vpbroadcastd m8, [base+fg_max+r7*8]
+ pxor m5, m5
+ vpbroadcastd m9, [base+pw_1024]
+ vpbroadcastq m10, [base+pb_27_17_17_27]
+ vmovdqa64 m12{k1}, m16
+ test r6b, overlapb
+ jnz .v_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+ h, sby, see, overlap
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstq, srcq
+.loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offxd, [offyq+offxq*2+829] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+ h, sby, see, overlap
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y:
+ movu ym21, [grain_lutq+offxyq-82]
+ vinserti32x8 m21, [grain_lutq+offxyq+ 0], 1
+ call .add_noise
+ sub hb, 2
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+ test overlapd, overlapd
+ jz .loop_x
+ test sbyd, sbyd
+ jnz .hv_overlap
+
+.loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+ h, sby, see, left_offxy
+
+ rorx offyd, seed, 8
+ mov left_offxyd, offxd ; previous column's offy*stride
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offxd, [offyq+offxq*2+829] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+ h, sby, see, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y_h_overlap:
+ movu ym20, [grain_lutq+offxyq-82]
+ vinserti32x8 m20, [grain_lutq+offxyq+ 0], 1
+ movd xm19, [grain_lutq+left_offxyq-50]
+ vinserti32x4 m19, [grain_lutq+left_offxyq+32], 2
+ punpcklbw m19, m20
+ pmaddubsw m19, m10, m19
+ pmulhrsw m19, m9
+ punpckhbw m21, m20, m5
+ packsswb m20{k1}, m19, m19
+ punpcklbw m20, m5, m20
+ call .add_noise_h
+ sub hb, 2
+ jg .loop_y_h_overlap
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+ test sbyd, sbyd
+ jnz .hv_overlap
+ jmp .loop_x_h_overlap
+
+.v_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \
+ h, sby, see, overlap
+
+ movzx r6d, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, r6d, 173 * 0x00010001
+ imul r6d, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add r6d, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and r6d, 0xff00ff00
+ xor seed, r7d
+ xor seed, r6d ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+ h, sby, see, overlap
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstq, srcq
+
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offxd, [offyq+offxq*2+0x10001*829+32*82]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+ h, sby, see, overlap, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+ movu ym19, [grain_lutq+offxyq-82]
+ vinserti32x8 m19, [grain_lutq+offxyq+ 0], 1
+ movu ym21, [grain_lutq+top_offxyq-82]
+ vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1
+ punpckhbw m20, m21, m19
+ punpcklbw m21, m19
+ call .add_noise_v
+ sub hb, 2
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump back
+ ; to .v_overlap, and instead always fall through to h+v overlap
+.hv_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+ h, sby, see, left_offxy, top_offxy, topleft_offxy
+
+ mov topleft_offxyd, top_offxyd
+ rorx offyd, seed, 8
+ mov left_offxyd, offxd
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offxd, [offyq+offxq*2+0x10001*829+32*82]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+ h, sby, see, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+ movu ym19, [grain_lutq+offxyq-82]
+ vinserti32x8 m19, [grain_lutq+offxyq+ 0], 1
+ movd xm16, [grain_lutq+left_offxyq-50]
+ vinserti32x4 m16, [grain_lutq+left_offxyq+32], 2
+ movu ym21, [grain_lutq+top_offxyq-82]
+ vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1
+ movd xm17, [grain_lutq+topleft_offxyq-50]
+ vinserti32x4 m17, [grain_lutq+topleft_offxyq+32], 2
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m16, m19
+ pmaddubsw m16, m10, m16
+ punpcklbw m17, m21
+ pmaddubsw m17, m10, m17
+ punpckhbw m20, m21, m19
+ pmulhrsw m16, m9
+ pmulhrsw m17, m9
+ packsswb m19{k1}, m16, m16
+ packsswb m21{k1}, m17, m17
+ ; followed by v interpolation (top | cur -> cur)
+ punpcklbw m21, m19
+ call .add_noise_v
+ sub hb, 2
+ jg .loop_y_h_overlap
+ add wq, 32
+ lea srcq, [src_bakq+wq]
+ jl .hv_overlap
+.end:
+ RET
+ALIGN function_align
+.add_noise_v:
+ pmaddubsw m20, m12, m20
+ pmaddubsw m21, m12, m21
+ pmulhrsw m20, m9
+ pmulhrsw m21, m9
+ packsswb m21, m20
+.add_noise:
+ punpcklbw m20, m5, m21
+ punpckhbw m21, m5
+.add_noise_h:
+ mova ym18, [srcq+strideq*0]
+ vinserti32x8 m18, [srcq+strideq*1], 1
+ mova m19, m0
+ punpcklbw m16, m18, m5
+ vpermt2b m19, m18, m1 ; scaling[ 0..127]
+ vpmovb2m k2, m18
+ punpckhbw m17, m18, m5
+ vpermi2b m18, m2, m3 ; scaling[128..255]
+ vmovdqu8 m19{k2}, m18 ; scaling[src]
+ pshufb m19, m4
+ pmaddubsw m18, m19, m20
+ pmaddubsw m19, m21
+ add grain_lutq, 82*2
+ pmulhrsw m18, m6 ; noise
+ pmulhrsw m19, m6
+ paddw m16, m18
+ paddw m17, m19
+ packuswb m16, m17
+ pmaxub m16, m7
+ pminub m16, m8
+ mova [dstq+srcq], ym16
+ add srcq, strideq
+ vextracti32x8 [dstq+srcq], m16, 1
+ add srcq, strideq
+ ret
+
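
Note: the .add_noise tail above is the per-pixel core shared by all the
fgy paths: scaling[src] is looked up in the in-register LUT (m0-m3 via
vpermt2b/vpermi2b), multiplied with the grain, rounded by scaling_shift
(the pmulhrsw with the noise_rnd constant), added to the source and
clamped to [fg_min, fg_max]. The AVX2 version spells the same formula
out in its comments; an equivalent scalar sketch, assuming 8-bit pixels
and illustrative names:

    #include <stdint.h>

    /* round2(x, shift) as used throughout the film grain code */
    static inline int round2(int x, int shift)
    {
        return (x + (1 << (shift - 1))) >> shift;
    }

    /* Per-pixel model of .add_noise:
     *   noise = round2(scaling[src] * grain, scaling_shift)
     *   dst   = clip(src + noise, fg_min, fg_max)             */
    static void add_grain_row(uint8_t *dst, const uint8_t *src, int w,
                              const uint8_t scaling[256],
                              const int8_t *grain, int scaling_shift,
                              int fg_min, int fg_max)
    {
        for (int x = 0; x < w; x++) {
            const int noise = round2(scaling[src[x]] * grain[x],
                                     scaling_shift);
            int px = src[x] + noise;
            if (px < fg_min) px = fg_min;
            if (px > fg_max) px = fg_max;
            dst[x] = (uint8_t)px;
        }
    }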
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \
+ scaling, grain_lut, h, sby, luma, \
+ overlap, uv_pl, is_id, _, stride3
+ lea r11, [fg_min]
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r9d, is_idm
+ mov sbyd, sbym
+ mov overlapd, [fg_dataq+FGData.overlap_flag]
+%if %2
+ mov r12, 0x000f000f000f000f ; h_overlap mask
+ vpbroadcastq m10, [base+pb_23_22_0_32]
+ lea stride3q, [strideq*3]
+%else
+ mov r12, 0x0000000f0000000f
+ vpbroadcastq m10, [base+pb_27_17_17_27]
+%endif
+ mova m0, [scalingq+64*0]
+ mova m1, [scalingq+64*1]
+ mova m2, [scalingq+64*2]
+ mova m3, [scalingq+64*3]
+ kmovq k1, r12
+ vbroadcasti32x4 m4, [base+interleave_hl]
+ vpbroadcastd m6, [base+noise_rnd+r6*4-32]
+ vpbroadcastd m7, [base+fg_min+r7*4]
+ shlx r7d, r7d, r9d
+ vpbroadcastd m8, [base+fg_max+r7*4]
+ test sbyd, sbyd
+ setnz r7b
+ vpbroadcastd m9, [base+pw_1024]
+ mova m11, [base+pb_even]
+ mova m12, [base+pb_odd]
+ pxor m5, m5
+ mov r5, r10mp ; lstride
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+ DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
+ h, sby, see, overlap, uv_pl, _, _, stride3
+%if %1
+ mov r6d, uv_plm
+ vpbroadcastd m16, [base+pw_8]
+ vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4]
+ vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4]
+ pshufb m14, m16 ; uv_luma_mult, uv_mult
+%endif
+ test r7b, overlapb
+ jnz %%v_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ offx, offy, see, overlap, _, _, _, stride3
+
+ mov lumaq, r9mp
+ lea r11, [srcq+wq]
+ lea r12, [dstq+wq]
+ lea r13, [lumaq+wq*(1+%2)]
+ mov r11mp, r11
+ mov r12mp, r12
+ neg wq
+
+%%loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ h, offxy, see, overlap, _, _, _, stride3
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y:
+%if %2
+ movu xm21, [grain_lutq+offxyq+82*0]
+ vinserti128 ym21, [grain_lutq+offxyq+82*1], 1
+ vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2
+ vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3
+%else
+ movu ym21, [grain_lutq+offxyq+82*0]
+ vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1
+%endif
+ call %%add_noise
+ sub hb, 2<<%2
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r13+wq*(1<<%2)]
+ add srcq, wq
+ add dstq, wq
+ test overlapd, overlapd
+ jz %%loop_x
+ cmp dword r8m, 0 ; sby
+ jne %%hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ offx, offy, see, left_offxy, _, _, _, stride3
+
+ lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ h, offxy, see, left_offxy, _, _, _, stride3
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y_h_overlap:
+%if %2
+ movu xm20, [grain_lutq+offxyq +82*0]
+ movd xm19, [grain_lutq+left_offxyq+82*0]
+ vinserti32x4 ym20, [grain_lutq+offxyq +82*1], 1
+ vinserti32x4 ym19, [grain_lutq+left_offxyq+82*1], 1
+ vinserti32x4 m20, [grain_lutq+offxyq +82*2], 2
+ vinserti32x4 m19, [grain_lutq+left_offxyq+82*2], 2
+ vinserti32x4 m20, [grain_lutq+offxyq +82*3], 3
+ vinserti32x4 m19, [grain_lutq+left_offxyq+82*3], 3
+%else
+ movu ym20, [grain_lutq+offxyq + 0]
+ movd xm19, [grain_lutq+left_offxyq+ 0]
+ vinserti32x8 m20, [grain_lutq+offxyq +82], 1
+ vinserti32x4 m19, [grain_lutq+left_offxyq+82], 2
+%endif
+ punpcklbw m19, m20
+ pmaddubsw m19, m10, m19
+ punpckhbw m21, m20, m5
+ pmulhrsw m19, m9
+ vpacksswb m20{k1}, m19, m19
+ punpcklbw m20, m5, m20
+ call %%add_noise_h
+ sub hb, 2<<%2
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r13+wq*(1<<%2)]
+ add srcq, wq
+ add dstq, wq
+ cmp dword r8m, 0 ; sby
+ jne %%hv_overlap
+ jmp %%loop_x_h_overlap
+
+%%v_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
+ _, sby, see, overlap, _, _, _, stride3
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+%if %3
+ vpbroadcastd m13, [base+pb_23_22]
+ kxnorw k3, k3, k3 ; v_overlap mask
+%elif %2
+ vbroadcasti32x8 m13, [base+pb_27_17]
+ kxnord k3, k3, k3
+ pshufd m13, m13, q0000 ; 8x27_17, 8x17_27
+%else
+ vpbroadcastd ym16, [base+pb_27_17]
+ vpbroadcastd m13, [base+pb_17_27]
+ vmovdqa64 m13{k1}, m16
+%endif
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ offx, offy, see, overlap, top_offxy, _, _, stride3
+
+ mov lumaq, r9mp
+ lea r11, [srcq+wq]
+ lea r12, [dstq+wq]
+ lea r13, [lumaq+wq*(1<<%2)]
+ mov r11mp, r11
+ mov r12mp, r12
+ neg wq
+
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0x000f000f
+ and offxd, 0x000f000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ h, offxy, see, overlap, top_offxy, _, _, stride3
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+%if %3
+ movu xm18, [grain_lutq+offxyq+82*0]
+ movu xm20, [grain_lutq+top_offxyq+82*0]
+ ; only interpolate first line, insert remaining line unmodified
+ vbroadcasti128 ym21, [grain_lutq+offxyq+82*1]
+ vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2
+ vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3
+ punpcklbw xm19, xm20, xm18
+ punpckhbw xm20, xm18
+%elif %2
+ movu xm18, [grain_lutq+offxyq+82*0]
+ vinserti128 ym18, [grain_lutq+offxyq+82*1], 1
+ movu xm20, [grain_lutq+top_offxyq+82*0]
+ vinserti32x4 ym20, [grain_lutq+top_offxyq+82*1], 1
+ vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2]
+ vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3
+ punpcklbw ym19, ym20, ym18
+ punpckhbw ym20, ym18
+%else
+ movu ym21, [grain_lutq+offxyq+82*0]
+ vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1
+ movu ym20, [grain_lutq+top_offxyq+82*0]
+ vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1
+%endif
+ call %%add_noise_v
+ sub hb, 2<<%2
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r13+wq*(1<<%2)]
+ add srcq, wq
+ add dstq, wq
+
+%%hv_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3
+
+ lea topleft_offxyd, [top_offxyq+(32>>%2)]
+ lea left_offxyd, [offyq+(32>>%2)]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0x000f000f
+ and offxd, 0x000f000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+%if %2
+ movu xm21, [grain_lutq+offxyq+82*0]
+ movd xm16, [grain_lutq+left_offxyq+82*0]
+ vinserti128 ym21, [grain_lutq+offxyq+82*1], 1
+ vinserti128 ym16, [grain_lutq+left_offxyq+82*1], 1
+ vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2
+ vinserti32x4 m16, [grain_lutq+left_offxyq+82*2], 2
+ vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3
+ vinserti32x4 m16, [grain_lutq+left_offxyq+82*3], 3
+ movd xm18, [grain_lutq+topleft_offxyq+82*0]
+ movu xm20, [grain_lutq+top_offxyq]
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m16, m21
+%if %3
+ punpcklbw xm18, xm20
+%else
+ vinserti128 ym18, [grain_lutq+topleft_offxyq+82*1], 1
+ vinserti128 ym20, [grain_lutq+top_offxyq+82*1], 1
+ punpcklbw ym18, ym20
+%endif
+ punpcklqdq m16, m18
+ pmaddubsw m16, m10, m16
+ pmulhrsw m16, m9
+ packsswb m16, m16
+ vmovdqu8 m21{k1}, m16
+%if %3
+ vpalignr xm20{k1}, xm16, xm16, 4
+ punpcklbw xm19, xm20, xm21
+ punpckhbw xm20, xm21
+%else
+ vpalignr ym20{k1}, ym16, ym16, 4
+ punpcklbw ym19, ym20, ym21
+ punpckhbw ym20, ym21
+%endif
+%else
+ movu ym21, [grain_lutq+offxyq+82*0]
+ vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1
+ movd xm16, [grain_lutq+left_offxyq+82*0]
+ vinserti32x4 m16, [grain_lutq+left_offxyq+82*1], 2
+ movu ym20, [grain_lutq+top_offxyq+82*0]
+ vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1
+ movd xm18, [grain_lutq+topleft_offxyq+82*0]
+ vinserti32x4 m18, [grain_lutq+topleft_offxyq+82*1], 2
+ punpcklbw m16, m21
+ punpcklbw m18, m20
+ punpcklqdq m16, m18
+ pmaddubsw m16, m10, m16
+ pmulhrsw m16, m9
+ packsswb m16, m16
+ vpalignr m20{k1}, m16, m16, 4
+ vmovdqu8 m21{k1}, m16
+%endif
+ call %%add_noise_v
+ sub hb, 2<<%2
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r13+wq*(1<<%2)]
+ add srcq, wq
+ add dstq, wq
+ jmp %%hv_overlap
+ALIGN function_align
+%%add_noise_v:
+%if %3
+ pmaddubsw xm19, xm13, xm19
+ pmaddubsw xm20, xm13, xm20
+ pmulhrsw xm19, xm9
+ pmulhrsw xm20, xm9
+ vpacksswb m21{k3}, m19, m20
+%elif %2
+ pmaddubsw ym19, ym13, ym19
+ pmaddubsw ym20, ym13, ym20
+ pmulhrsw ym19, ym9
+ pmulhrsw ym20, ym9
+ vpacksswb m21{k3}, m19, m20
+%else
+ punpcklbw m19, m20, m21
+ punpckhbw m20, m21
+ pmaddubsw m19, m13, m19
+ pmaddubsw m20, m13, m20
+ pmulhrsw m19, m9
+ pmulhrsw m20, m9
+ packsswb m21, m19, m20
+%endif
+%%add_noise:
+ punpcklbw m20, m5, m21
+ punpckhbw m21, m5
+%%add_noise_h:
+ mova ym18, [lumaq+lstrideq*(0<<%3)]
+ vinserti32x8 m18, [lumaq+lstrideq*(1<<%3)], 1
+%if %2
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ mova ym16, [lumaq+lstrideq*(0<<%3)]
+ vinserti32x8 m16, [lumaq+lstrideq*(1<<%3)], 1
+ mova xm17, [srcq+strideq*0]
+ mova m19, m11
+ vpermi2b m19, m18, m16
+ vinserti128 ym17, [srcq+strideq*1], 1
+ vpermt2b m18, m12, m16
+ vinserti32x4 m17, [srcq+strideq*2], 2
+ pavgb m18, m19
+ vinserti32x4 m17, [srcq+stride3q ], 3
+%else
+ mova ym17, [srcq+strideq*0]
+ vinserti32x8 m17, [srcq+strideq*1], 1
+%endif
+%if %1
+ punpckhbw m19, m18, m17
+ punpcklbw m18, m17 ; { luma, chroma }
+ pmaddubsw m19, m14
+ pmaddubsw m18, m14
+ psraw m19, 6
+ psraw m18, 6
+ paddw m19, m15
+ paddw m18, m15
+ packuswb m18, m19
+.add_noise_main:
+ mova m19, m0
+ vpermt2b m19, m18, m1 ; scaling[ 0..127]
+ vpmovb2m k2, m18
+ vpermi2b m18, m2, m3 ; scaling[128..255]
+ vmovdqu8 m19{k2}, m18 ; scaling[src]
+ pshufb m19, m4
+ pmaddubsw m18, m19, m20
+ pmaddubsw m19, m21
+ add grain_lutq, 82*2<<%2
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ lea srcq, [srcq+strideq*(2<<%2)]
+ pmulhrsw m18, m6 ; noise
+ pmulhrsw m19, m6
+ punpcklbw m16, m17, m5 ; chroma
+ punpckhbw m17, m5
+ paddw m16, m18
+ paddw m17, m19
+ packuswb m16, m17
+ pmaxub m16, m7
+ pminub m16, m8
+%if %2
+ mova [dstq+strideq*0], xm16
+ vextracti128 [dstq+strideq*1], ym16, 1
+ vextracti32x4 [dstq+strideq*2], m16, 2
+ vextracti32x4 [dstq+stride3q ], m16, 3
+%else
+ mova [dstq+strideq*0], ym16
+ vextracti32x8 [dstq+strideq*1], m16, 1
+%endif
+ lea dstq, [dstq+strideq*(2<<%2)]
+ ret
+%else
+ jmp .add_noise_main
+%endif
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+.end:
+ RET
+%endmacro
+
+FGUV_FN 420, 1, 1
+FGUV_FN 422, 1, 0
+FGUV_FN 444, 0, 0
+
+%endif ; ARCH_X86_64
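
Note: in both the AVX2 and AVX-512 versions, the pb_27_17/pb_17_27/
pb_23_22 constants together with pmaddubsw, pmulhrsw by pw_1024 and
packsswb implement the block-edge overlap blend: a grain sample from the
neighbouring (top or left) block and the matching sample of the current
block are combined as round2(w_old*old + w_cur*cur, 5) and saturated
back to the signed 8-bit grain range. A scalar sketch, with the weight
pairs taken from the tables above (luma and 4:4:4 chroma blend their two
overlapped lines with the (27,17) and (17,27) pairs; 4:2:0 chroma uses
(23,22) for its single overlapped line):

    #include <stdint.h>

    /* Overlap blend of one grain sample:
     * round2(w_old*old + w_cur*cur, 5), saturated to the int8 grain
     * range.  Illustrative sketch. */
    static inline int8_t blend_grain(int old_grain, int cur_grain,
                                     int w_old, int w_cur)
    {
        const int g = (w_old * old_grain + w_cur * cur_grain + 16) >> 5;
        return g < -128 ? -128 : g > 127 ? 127 : (int8_t)g;
    }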
diff --git a/third_party/dav1d/src/x86/filmgrain_common.asm b/third_party/dav1d/src/x86/filmgrain_common.asm
new file mode 100644
index 0000000000..74f7044e66
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain_common.asm
@@ -0,0 +1,46 @@
+; Copyright © 2019-2022, VideoLAN and dav1d authors
+; Copyright © 2019-2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+struc FGData
+ .seed: resd 1
+ .num_y_points: resd 1
+ .y_points: resb 14 * 2
+ .chroma_scaling_from_luma: resd 1
+ .num_uv_points: resd 2
+ .uv_points: resb 2 * 10 * 2
+ .scaling_shift: resd 1
+ .ar_coeff_lag: resd 1
+ .ar_coeffs_y: resb 24
+ .ar_coeffs_uv: resb 2 * 28 ; includes padding
+ .ar_coeff_shift: resq 1
+ .grain_scale_shift: resd 1
+ .uv_mult: resd 2
+ .uv_luma_mult: resd 2
+ .uv_offset: resd 2
+ .overlap_flag: resd 1
+ .clip_to_restricted_range: resd 1
+endstruc
+
+cextern gaussian_sequence
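
Note: for reading the [fg_dataq+FGData.*] accesses in these files, the
FGData layout above corresponds to a C struct of roughly the following
shape; each field mirrors the resd/resb/resq reservation of the same
name (assuming no implicit padding, which holds on the usual ABIs).
This is an orientation sketch, not the public dav1d struct definition.

    #include <stdint.h>

    typedef struct FGData {
        uint32_t seed;
        int32_t  num_y_points;
        uint8_t  y_points[14 * 2];
        int32_t  chroma_scaling_from_luma;
        int32_t  num_uv_points[2];
        uint8_t  uv_points[2 * 10 * 2];
        int32_t  scaling_shift;
        int32_t  ar_coeff_lag;
        int8_t   ar_coeffs_y[24];
        int8_t   ar_coeffs_uv[2 * 28]; /* includes padding */
        uint64_t ar_coeff_shift;
        int32_t  grain_scale_shift;
        int32_t  uv_mult[2];
        int32_t  uv_luma_mult[2];
        int32_t  uv_offset[2];
        int32_t  overlap_flag;
        int32_t  clip_to_restricted_range;
    } FGData;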
diff --git a/third_party/dav1d/src/x86/filmgrain_sse.asm b/third_party/dav1d/src/x86/filmgrain_sse.asm
new file mode 100644
index 0000000000..0172f98760
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain_sse.asm
@@ -0,0 +1,3233 @@
+; Copyright © 2019-2021, VideoLAN and dav1d authors
+; Copyright © 2019, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+SECTION_RODATA
+
+pw_1024: times 8 dw 1024
+pb_27_17_17_27: db 27, 17, 17, 27
+ times 6 db 0, 32
+pb_23_22_h: db 23, 22
+ times 7 db 0, 32
+pb_27_17: times 8 db 27, 17
+pb_17_27: times 8 db 17, 27
+pb_23_22: times 8 db 23, 22
+pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
+rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
+byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
+pw_seed_xor: times 2 dw 0xb524
+ times 2 dw 0x49d8
+pb_1: times 4 db 1
+hmul_bits: dw 32768, 16384, 8192, 4096
+round: dw 2048, 1024, 512
+mul_bits: dw 256, 128, 64, 32, 16
+round_vals: dw 32, 64, 128, 256, 512
+max: dw 255, 240, 235
+min: dw 0, 16
+pw_1: dw 1
+
+%macro JMP_TABLE 2-*
+ %xdefine %1_8bpc_%2_table %%table
+ %xdefine %%base %1_8bpc_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .ar%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3
+
+SECTION .text
+
+%if ARCH_X86_32
+%define PIC_ptr(a) base+a
+%else
+%define PIC_ptr(a) a
+%endif
+
+%macro SCRATCH 3
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
+ LEA r4, $$
+%define base r4-$$
+ movq m1, [base+rnd_next_upperbit_mask]
+ movq m4, [base+mul_bits]
+ movq m7, [base+hmul_bits]
+ mov r2d, [fg_dataq+FGData.grain_scale_shift]
+ movd m2, [base+round+r2*2]
+ movd m0, [fg_dataq+FGData.seed]
+ mova m5, [base+pb_mask]
+ pshuflw m2, m2, q0000
+ pshuflw m0, m0, q0000
+ mov r2, -73*82
+ sub bufq, r2
+ lea r3, [base+gaussian_sequence]
+.loop:
+ pand m6, m0, m1
+ psrlw m3, m6, 10
+ por m6, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw m6, m4 ; bits 0x0f00 are set
+ pshufb m3, m5, m6 ; set 15th bit for next 4 seeds
+ psllq m6, m3, 30
+ por m3, m6
+ psllq m6, m3, 15
+ por m3, m6 ; aggregate each bit into next seed's high bit
+ pmulhuw m6, m0, m7
+ por m3, m6 ; 4 next output seeds
+ pshuflw m0, m3, q3333
+ psrlw m3, 5
+%if ARCH_X86_64
+ movq r6, m3
+ mov r8, r6
+ movzx r5d, r6w
+ shr r6d, 16
+ shr r8, 32
+ movzx r7, r8w
+ shr r8, 16
+
+ movd m6, [r3+r5*2]
+ pinsrw m6, [r3+r6*2], 1
+ pinsrw m6, [r3+r7*2], 2
+ pinsrw m6, [r3+r8*2], 3
+%else
+ movd r6, m3
+ pshuflw m3, m3, q3232
+ movzx r5, r6w
+ shr r6, 16
+
+ movd m6, [r3+r5*2]
+ pinsrw m6, [r3+r6*2], 1
+
+ movd r6, m3
+ movzx r5, r6w
+ shr r6, 16
+
+ pinsrw m6, [r3+r5*2], 2
+ pinsrw m6, [r3+r6*2], 3
+%endif
+ pmulhrsw m6, m2
+ packsswb m6, m6
+ movd [bufq+r2], m6
+ add r2, 4
+ jl .loop
+
+ ; auto-regression code
+ movsxd r2, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4]
+ lea r2, [r2+base+generate_grain_y_8bpc_ssse3_table]
+ jmp r2
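
Note: the .loop above advances four 16-bit seeds per iteration
(rnd_next_upperbit_mask selects the LFSR taps, and the pb_mask/psllq
sequence aggregates each feedback bit into the next seed's high bit),
then gathers the grain values from gaussian_sequence and scales them via
the round[] multipliers, which fold the grain_scale_shift rounding into
a single pmulhrsw. A reference-style scalar model of one step follows;
the extern declaration is an assumption (the table is cextern'd as
gaussian_sequence above), and for 8 bpc the shift argument is
effectively 4 + grain_scale_shift.

    #include <stdint.h>

    extern const int16_t gaussian_sequence[2048]; /* assumed C-side type */

    static inline int round2(int x, int shift)
    {
        return (x + (1 << (shift - 1))) >> shift;
    }

    /* One PRNG step: 16-bit LFSR with feedback = XOR of bits 0, 1, 3
     * and 12, then an 11-bit index into gaussian_sequence.  The SIMD
     * code packs the result with signed saturation into the int8
     * grain buffer.  Reference-style sketch. */
    static int next_grain_sample(unsigned *seed, int shift)
    {
        unsigned s = *seed & 0xffff;
        const unsigned bit = (s ^ (s >> 1) ^ (s >> 3) ^ (s >> 12)) & 1;
        s = (s >> 1) | (bit << 15);
        *seed = s;
        return round2(gaussian_sequence[s >> 5], shift);
    }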
+
+.ar1:
+%if ARCH_X86_32
+ DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
+%elif WIN64
+ DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
+ mov bufq, r0
+%else
+ DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
+%endif
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
+ movd m4, [fg_dataq+FGData.ar_coeffs_y]
+ mov ecx, [fg_dataq+FGData.ar_coeff_shift]
+%if ARCH_X86_32
+ mov r1m, cf3d
+ DEFINE_ARGS buf, shift, val3, min, max, x, val0
+%define hd r0mp
+%define cf3d r1mp
+%elif WIN64
+ DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
+%else
+ DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
+%endif
+ pxor m6, m6
+ pcmpgtb m7, m6, m4
+ punpcklbw m4, m7
+ pinsrw m4, [base+pw_1], 3
+ pshufd m5, m4, q1111
+ pshufd m4, m4, q0000
+ movd m3, [base+round_vals+shiftq*2-12] ; rnd
+ pshuflw m3, m3, q0000
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+ mov mind, -128
+ mov maxd, 127
+.y_loop_ar1:
+ mov xq, -76
+ movsx val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+ movq m0, [bufq+xq-82-1] ; top/left
+ pcmpgtb m7, m6, m0
+ punpcklbw m0, m7
+ psrldq m2, m0, 2 ; top
+ psrldq m1, m0, 4 ; top/right
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ paddd m0, m1
+.x_loop_ar1_inner:
+ movd val0d, m0
+ psrldq m0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sar val3d, shiftb
+ movsx val0d, byte [bufq+xq]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovns val3d, maxd
+ cmp val3d, mind
+ cmovs val3d, mind
+ mov byte [bufq+xq], val3b
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar1
+.ar0:
+ RET
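
Note: the .ar1 path above applies a lag-1 auto-regressive filter in
place over the 82-byte-strided grain buffer: each of the 76 samples in a
row is refined from its top-left, top, top-right and left neighbours
using ar_coeffs_y[0..3], rounded by ar_coeff_shift and clamped to the
signed 8-bit grain range. A scalar sketch for one row, with illustrative
names; the .ar2/.ar3 paths below extend the same idea to the larger
lag-2 and lag-3 neighbourhoods.

    #include <stdint.h>

    /* Lag-1 AR filter for one grain row.  buf points at the first
     * filtered sample of the row (stride 82), cf[] = ar_coeffs_y[0..3],
     * shift = ar_coeff_shift.  Illustrative sketch. */
    static void grain_ar1_row(int8_t *buf, const int8_t cf[4], int shift)
    {
        const int rnd = 1 << (shift - 1);        /* round_vals[] above */
        for (int x = 0; x < 76; x++) {
            const int sum = cf[0] * buf[x - 82 - 1]  /* top-left  */
                          + cf[1] * buf[x - 82]      /* top       */
                          + cf[2] * buf[x - 82 + 1]  /* top-right */
                          + cf[3] * buf[x - 1]       /* left      */
                          + rnd;
            const int g = buf[x] + (sum >> shift);
            buf[x] = g < -128 ? -128 : g > 127 ? 127 : (int8_t)g;
        }
    }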
+
+.ar2:
+%if ARCH_X86_32
+%assign stack_offset_old stack_offset
+ ALLOC_STACK -16*8
+%endif
+ DEFINE_ARGS buf, fg_data, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m6, [base+round_vals-12+shiftq*2]
+ movd m7, [base+byte_blend+1]
+ SCRATCH 7, 15, 7
+ movq m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7
+ movd m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11
+ pxor m7, m7
+ pshuflw m6, m6, q0000
+ punpcklwd m6, m7
+ pcmpgtb m4, m7, m0
+ pcmpgtb m5, m7, m1
+ punpcklbw m0, m4
+ punpcklbw m1, m5
+ DEFINE_ARGS buf, fg_data, h, x
+ pshufd m4, m1, q0000
+ pshufd m5, m1, q1111
+ pshufd m3, m0, q3333
+ pshufd m2, m0, q2222
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ SCRATCH 0, 8, 0
+ SCRATCH 1, 9, 1
+ SCRATCH 2, 10, 2
+ SCRATCH 3, 11, 3
+ SCRATCH 4, 12, 4
+ SCRATCH 5, 13, 5
+ SCRATCH 6, 14, 6
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+.y_loop_ar2:
+ mov xq, -76
+
+.x_loop_ar2:
+ movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
+ movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
+ pcmpgtb m2, m7, m0
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2
+ psrldq m5, m0, 2 ; y=-2,x=[-1,+5]
+ psrldq m3, m1, 2 ; y=-1,x=[-1,+5]
+ psrldq m4, m1, 4 ; y=-1,x=[+0,+5]
+ punpcklwd m2, m0, m5
+ punpcklwd m3, m4
+ pmaddwd m2, m8
+ pmaddwd m3, m11
+ paddd m2, m3
+
+ psrldq m4, m0, 4 ; y=-2,x=[+0,+5]
+ psrldq m5, m0, 6 ; y=-2,x=[+1,+5]
+ psrldq m6, m0, 8 ; y=-2,x=[+2,+5]
+ punpcklwd m4, m5
+ punpcklwd m6, m1
+ psrldq m5, m1, 6 ; y=-1,x=[+1,+5]
+ psrldq m1, m1, 8 ; y=-1,x=[+2,+5]
+ punpcklwd m5, m1
+ pmaddwd m4, m9
+ pmaddwd m6, m10
+ pmaddwd m5, m12
+ paddd m4, m6
+ paddd m2, m5
+ paddd m2, m4
+ paddd m2, m14
+
+ movq m0, [bufq+xq-2] ; y=0,x=[-2,+5]
+.x_loop_ar2_inner:
+ pcmpgtb m4, m7, m0
+ punpcklbw m1, m0, m4
+ pmaddwd m3, m1, m13
+ paddd m3, m2
+ psrldq m1, 4 ; y=0,x=0
+ psrldq m2, 4 ; shift top to next pixel
+ psrad m3, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw since we only care about one value
+ paddw m3, m1
+ packsswb m3, m3
+ pslldq m3, 2
+ pand m3, m15
+ pandn m1, m15, m0
+ por m0, m1, m3
+ psrldq m0, 1
+ ; overwrite 2 pixels, but that's ok
+ movd [bufq+xq-1], m0
+ inc xq
+ jz .x_loop_ar2_end
+ test xq, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+.ar3:
+ DEFINE_ARGS buf, fg_data, shift
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+ ALLOC_STACK -16*14
+%elif WIN64
+ SUB rsp, 16*6
+%assign stack_size_padded (stack_size_padded+16*6)
+%assign stack_size (stack_size+16*6)
+%else
+ ALLOC_STACK -16*6
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m6, [base+round_vals-12+shiftq*2]
+ movd m7, [base+byte_blend]
+ movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
+ movq m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
+ pxor m3, m3
+ pcmpgtb m4, m3, m0
+ pcmpgtb m3, m2
+ pshuflw m6, m6, q0000
+ SCRATCH 6, 14, 12
+ SCRATCH 7, 15, 13
+ punpckhbw m1, m0, m4
+ punpcklbw m0, m4
+ punpcklbw m2, m3
+ pshufd m3, m0, q1111
+ pshufd m4, m0, q2222
+ pshufd m5, m0, q3333
+ pshufd m0, m0, q0000
+ mova [rsp+ 0*16], m0
+ mova [rsp+ 1*16], m3
+ mova [rsp+ 2*16], m4
+ mova [rsp+ 3*16], m5
+ pshufd m6, m1, q1111
+ pshufd m7, m1, q2222
+ pshufd m5, m1, q3333
+ pshufd m1, m1, q0000
+ pshufd m3, m2, q1111
+ psrldq m0, m2, 10
+ pinsrw m2, [base+pw_1], 5
+ pshufd m4, m2, q2222
+ pshufd m2, m2, q0000
+ pinsrw m0, [base+round_vals+shiftq*2-10], 3
+ mova [rsp+ 4*16], m1
+ mova [rsp+ 5*16], m6
+ SCRATCH 7, 8, 6
+ SCRATCH 5, 9, 7
+ SCRATCH 2, 10, 8
+ SCRATCH 3, 11, 9
+ SCRATCH 4, 12, 10
+ SCRATCH 0, 13, 11
+ DEFINE_ARGS buf, fg_data, h, x
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+.y_loop_ar3:
+ mov xq, -76
+
+.x_loop_ar3:
+ movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
+ pxor m3, m3
+ pcmpgtb m3, m0
+ punpckhbw m2, m0, m3
+ punpcklbw m0, m3
+
+ psrldq m5, m0, 2
+ psrldq m6, m0, 4
+ psrldq m7, m0, 6
+ punpcklwd m4, m0, m5
+ punpcklwd m6, m7
+ pmaddwd m4, [rsp+ 0*16]
+ pmaddwd m6, [rsp+ 1*16]
+ paddd m4, m6
+
+ movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
+ pxor m5, m5
+ pcmpgtb m5, m1
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ palignr m6, m2, m0, 10
+ palignr m7, m2, m0, 12
+ psrldq m0, 8
+ punpcklwd m0, m6
+ punpcklwd m7, m1
+ pmaddwd m0, [rsp+ 2*16]
+ pmaddwd m7, [rsp+ 3*16]
+ paddd m0, m7
+ paddd m0, m4
+
+ psrldq m4, m1, 2
+ psrldq m5, m1, 4
+ psrldq m6, m1, 6
+ psrldq m7, m1, 8
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+ pmaddwd m4, [rsp+ 4*16]
+ pmaddwd m6, [rsp+ 5*16]
+ paddd m4, m6
+ paddd m0, m4
+
+ movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
+ pxor m7, m7
+ pcmpgtb m7, m2
+ punpckhbw m5, m2, m7
+ punpcklbw m2, m7
+ palignr m7, m3, m1, 10
+ palignr m3, m1, 12
+ psrldq m1, m2, 2
+ punpcklwd m7, m3
+ punpcklwd m3, m2, m1
+ pmaddwd m7, m8
+ pmaddwd m3, m9
+ paddd m7, m3
+ paddd m0, m7
+
+ psrldq m6, m2, 4
+ psrldq m1, m2, 6
+ psrldq m3, m2, 8
+ palignr m4, m5, m2, 10
+ palignr m5, m5, m2, 12
+
+ punpcklwd m6, m1
+ punpcklwd m3, m4
+ punpcklwd m5, m14
+ pmaddwd m6, m10
+ pmaddwd m3, m11
+ pmaddwd m5, m12
+ paddd m0, m6
+ paddd m3, m5
+ paddd m0, m3
+
+ movq m1, [bufq+xq-3] ; y=0,x=[-3,+4]
+.x_loop_ar3_inner:
+ pxor m5, m5
+ pcmpgtb m5, m1
+ punpcklbw m2, m1, m5
+ pmaddwd m2, m13
+ pshufd m3, m2, q1111
+ paddd m2, m3 ; left+cur
+ paddd m2, m0 ; add top
+ psrldq m0, 4
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw since we only care about one value
+ packsswb m2, m2
+ pslldq m2, 3
+ pand m2, m15
+ pandn m3, m15, m1
+ por m1, m2, m3
+ movd [bufq+xq-3], m1
+ psrldq m1, 1
+ inc xq
+ jz .x_loop_ar3_end
+ test xq, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar3
+ RET
+
+%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
+INIT_XMM ssse3
+cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
+ movifnidn r2, r2mp
+ movifnidn r3, r3mp
+ LEA r4, $$
+%define base r4-$$
+ movq m1, [base+rnd_next_upperbit_mask]
+ movq m4, [base+mul_bits]
+ movq m7, [base+hmul_bits]
+ mov r5d, [fg_dataq+FGData.grain_scale_shift]
+ movd m6, [base+round+r5*2]
+ mova m5, [base+pb_mask]
+ movd m0, [fg_dataq+FGData.seed]
+ movd m2, [base+pw_seed_xor+uvq*4]
+ pxor m0, m2
+ pshuflw m6, m6, q0000
+ pshuflw m0, m0, q0000
+ lea r6, [base+gaussian_sequence]
+%if %2
+%if ARCH_X86_64
+ mov r7d, 73-35*%3
+%else
+ mov r3mp, 73-35*%3
+%endif
+ add bufq, 44
+.loop_y:
+ mov r5, -44
+.loop_x:
+%else
+ mov r5, -82*73
+ sub bufq, r5
+.loop:
+%endif
+ pand m2, m0, m1
+ psrlw m3, m2, 10
+ por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw m2, m4 ; bits 0x0f00 are set
+ pshufb m3, m5, m2 ; set 15th bit for next 4 seeds
+ psllq m2, m3, 30
+ por m3, m2
+ psllq m2, m3, 15
+ por m3, m2 ; aggregate each bit into next seed's high bit
+ pmulhuw m2, m0, m7
+ por m2, m3 ; 4 next output seeds
+ pshuflw m0, m2, q3333
+ psrlw m2, 5
+%if ARCH_X86_64
+ movd r9d, m2
+ pshuflw m2, m2, q3232
+ movzx r8, r9w
+ shr r9, 16
+
+ movd m3, [r6+r8*2]
+ pinsrw m3, [r6+r9*2], 1
+
+ movd r9d, m2
+ movzx r8, r9w
+ shr r9, 16
+
+ pinsrw m3, [r6+r8*2], 2
+ pinsrw m3, [r6+r9*2], 3
+%else
+ movd r2, m2
+ pshuflw m2, m2, q3232
+ movzx r1, r2w
+ shr r2, 16
+
+ movd m3, [r6+r1*2]
+ pinsrw m3, [r6+r2*2], 1
+
+ movd r2, m2
+ movzx r1, r2w
+ shr r2, 16
+
+ pinsrw m3, [r6+r1*2], 2
+ pinsrw m3, [r6+r2*2], 3
+%endif
+ pmulhrsw m3, m6
+ packsswb m3, m3
+ movd [bufq+r5], m3
+ add r5, 4
+%if %2
+ jl .loop_x
+ add bufq, 82
+%if ARCH_X86_64
+ dec r7d
+%else
+ dec r3mp
+%endif
+ jg .loop_y
+%else
+ jl .loop
+%endif
+
+%if ARCH_X86_32
+ mov r2, r2mp
+%endif
+
+ ; auto-regression code
+ movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4]
+ lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table]
+ jmp r5
+
+.ar0:
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ movifnidn bufyq, bufymp
+%if ARCH_X86_32
+%assign stack_offset_old stack_offset
+ ALLOC_STACK -2*16
+%endif
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ movd m4, [base+hmul_bits+shiftq*2]
+ DEFINE_ARGS buf, bufy, h, x
+ pxor m0, m0
+ pcmpgtb m0, m5
+ punpcklbw m5, m0
+ movd m7, [base+pb_1]
+%if %2
+ movd m6, [base+hmul_bits+2+%3*2]
+%endif
+ pshuflw m5, m5, q0000
+ pshuflw m4, m4, q0000
+ pshufd m7, m7, q0000
+%if %2
+ pshuflw m6, m6, q0000
+%endif
+ punpcklqdq m5, m5
+ punpcklqdq m4, m4
+%if %2
+ punpcklqdq m6, m6
+%endif
+ pcmpeqw m1, m1
+ pslldq m1, 12>>%2
+ SCRATCH 1, 8, 0
+ SCRATCH 4, 9, 1
+%if %2
+ sub bufq, 82*(73-35*%3)+82-(82*3+41)
+%else
+ sub bufq, 82*70-3
+%endif
+ add bufyq, 3+82*3
+ mov hd, 70-35*%3
+.y_loop_ar0:
+ xor xd, xd
+.x_loop_ar0:
+ ; first 32 pixels
+%if %2
+ movu m1, [bufyq+xq*2]
+%if %3
+ movu m2, [bufyq+xq*2+82]
+%endif
+ movu m3, [bufyq+xq*2+16]
+%if %3
+ movu m4, [bufyq+xq*2+82+16]
+%endif
+ pmaddubsw m0, m7, m1
+%if %3
+ pmaddubsw m1, m7, m2
+%endif
+ pmaddubsw m2, m7, m3
+%if %3
+ pmaddubsw m3, m7, m4
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ pmulhrsw m0, m6
+ pmulhrsw m2, m6
+%else
+ movu m0, [bufyq+xq]
+ pxor m6, m6
+ pcmpgtb m6, m0
+ punpckhbw m2, m0, m6
+ punpcklbw m0, m6
+%endif
+ pmullw m0, m5
+ pmullw m2, m5
+ pmulhrsw m0, m9
+ pmulhrsw m2, m9
+ movu m1, [bufq+xq]
+ pxor m4, m4
+ pcmpgtb m4, m1
+ punpckhbw m3, m1, m4
+%if %2
+ punpcklbw m1, m4
+ paddw m2, m3
+ paddw m0, m1
+%else
+ punpcklbw m6, m1, m4
+ paddw m2, m3
+ paddw m0, m6
+%endif
+ packsswb m0, m2
+%if %2
+ movu [bufq+xq], m0
+ add xd, 16
+ cmp xd, 32
+ jl .x_loop_ar0
+
+ ; last 6/12 pixels
+ movu m1, [bufyq+xq*(1+%2)]
+%if %3
+ movu m2, [bufyq+xq*2+82]
+%endif
+ pmaddubsw m0, m7, m1
+%if %3
+ pmaddubsw m1, m7, m2
+ paddw m0, m1
+%endif
+ pmulhrsw m0, m6
+ pmullw m0, m5
+ pmulhrsw m0, m9
+ movq m1, [bufq+xq]
+ pxor m4, m4
+ pcmpgtb m4, m1
+ punpcklbw m2, m1, m4
+ paddw m0, m2
+ packsswb m0, m0
+ pandn m2, m8, m0
+ pand m1, m8
+ por m2, m1
+ movq [bufq+xq], m2
+%else
+ add xd, 16
+ cmp xd, 80
+ je .y_loop_final_ar0
+ movu [bufq+xq-16], m0
+ jmp .x_loop_ar0
+.y_loop_final_ar0:
+ pandn m2, m8, m0
+ pand m1, m8
+ por m2, m1
+ movu [bufq+xq-16], m2
+%endif
+
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar0
+ RET
+
+.ar1:
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+%assign stack_size_padded 0
+%xdefine rstk rsp
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
+ imul uvd, 28
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+ movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1]
+ pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2
+%if ARCH_X86_32
+ mov r3mp, cf3d
+ DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
+%elif WIN64
+ DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
+ mov bufq, r0
+%else
+ DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m3, [base+round_vals+shiftq*2-12] ; rnd
+%if %2
+ movd m7, [base+pb_1]
+ movd m6, [base+hmul_bits+2+%3*2]
+%endif
+ psrldq m4, 1
+%if ARCH_X86_32
+ DEFINE_ARGS buf, shift, val0, val3, min, max, x
+%elif WIN64
+ DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
+%else
+ DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
+%endif
+ pxor m5, m5
+ punpcklwd m3, m5
+%if %2
+ punpcklwd m6, m6
+%endif
+ pcmpgtb m5, m4
+ punpcklbw m4, m5
+ pshufd m5, m4, q1111
+ pshufd m4, m4, q0000
+ pshufd m3, m3, q0000
+%if %2
+ pshufd m7, m7, q0000
+ pshufd m6, m6, q0000
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*69+3
+%endif
+%if ARCH_X86_32
+ add r1mp, 79+82*3
+ mov r0mp, 70-35*%3
+%else
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+%endif
+ mov mind, -128
+ mov maxd, 127
+.y_loop_ar1:
+ mov xq, -(76>>%2)
+ movsx val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+%if %2
+%if ARCH_X86_32
+ mov r2, r1mp
+ movq m0, [r2+xq*2]
+%if %3
+ movq m1, [r2+xq*2+82]
+%endif
+%else
+ movq m0, [bufyq+xq*2]
+%if %3
+ movq m1, [bufyq+xq*2+82]
+%endif
+%endif
+ pmaddubsw m2, m7, m0
+%if %3
+ pmaddubsw m0, m7, m1
+ paddw m2, m0
+%endif
+ pmulhrsw m2, m6
+%else
+%if ARCH_X86_32
+ mov r2, r1mp
+ movd m2, [r2+xq]
+%else
+ movd m2, [bufyq+xq]
+%endif
+ pxor m0, m0
+ pcmpgtb m0, m2
+ punpcklbw m2, m0
+%endif
+
+ movq m0, [bufq+xq-82-1] ; top/left
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpcklbw m0, m1
+ psrldq m1, m0, 4 ; top/right
+ punpcklwd m1, m2
+ psrldq m2, m0, 2 ; top
+ punpcklwd m0, m2
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ paddd m0, m1
+ paddd m0, m3
+.x_loop_ar1_inner:
+ movd val0d, m0
+ psrldq m0, 4
+%if ARCH_X86_32
+ imul val3d, r3mp
+%else
+ imul val3d, cf3d
+%endif
+ add val3d, val0d
+ sar val3d, shiftb
+ movsx val0d, byte [bufq+xq]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovns val3d, maxd
+ cmp val3d, mind
+ cmovs val3d, mind
+ mov byte [bufq+xq], val3b
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82
+%if ARCH_X86_32
+ add r1mp, 82<<%3
+ dec r0mp
+%else
+ add bufyq, 82<<%3
+ dec hd
+%endif
+ jg .y_loop_ar1
+ RET
+
+.ar2:
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+%assign stack_size_padded 0
+%xdefine rstk rsp
+ ALLOC_STACK -8*16
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ movifnidn bufyq, bufymp
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ movd m7, [base+round_vals-12+shiftq*2]
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12
+ pxor m2, m2
+ pcmpgtb m2, m0
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2
+ pinsrw m1, [base+pw_1], 5
+ punpcklwd m7, m7
+ pshufd m7, m7, q0000
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+ pshufd m4, m1, q0000
+ pshufd m5, m1, q1111
+ pshufd m6, m1, q2222
+ pshufd m3, m0, q3333
+ pshufd m2, m0, q2222
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ SCRATCH 0, 8, 0
+ SCRATCH 1, 9, 1
+ SCRATCH 2, 10, 2
+ SCRATCH 3, 11, 3
+ SCRATCH 4, 12, 4
+ SCRATCH 5, 13, 5
+ SCRATCH 6, 14, 6
+ SCRATCH 7, 15, 7
+%if %2
+ movd m7, [base+hmul_bits+2+%3*2]
+ movd m6, [base+pb_1]
+ punpcklwd m7, m7
+ pshufd m6, m6, q0000
+ pshufd m7, m7, q0000
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*69+3
+%endif
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+.y_loop_ar2:
+ mov xq, -(76>>%2)
+
+.x_loop_ar2:
+ pxor m2, m2
+ movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
+ movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
+ pcmpgtb m2, m0
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2
+ psrldq m5, m0, 2 ; y=-2,x=[-1,+5]
+ psrldq m3, m1, 2 ; y=-1,x=[-1,+5]
+ psrldq m4, m1, 4 ; y=-1,x=[+0,+5]
+ punpcklwd m2, m0, m5
+ punpcklwd m3, m4
+ pmaddwd m2, m8
+ pmaddwd m3, m11
+ paddd m2, m3
+
+ psrldq m4, m0, 4 ; y=-2,x=[+0,+5]
+ psrldq m5, m0, 6 ; y=-2,x=[+1,+5]
+ psrldq m0, 8 ; y=-2,x=[+2,+5]
+ punpcklwd m4, m5
+ punpcklwd m0, m1
+ psrldq m3, m1, 6 ; y=-1,x=[+1,+5]
+ psrldq m1, m1, 8 ; y=-1,x=[+2,+5]
+ punpcklwd m3, m1
+ pmaddwd m4, m9
+ pmaddwd m0, m10
+ pmaddwd m3, m12
+ paddd m4, m0
+ paddd m2, m3
+ paddd m2, m4
+
+%if %2
+ movq m1, [bufyq+xq*2]
+%if %3
+ movq m3, [bufyq+xq*2+82]
+%endif
+ pmaddubsw m0, m6, m1
+%if %3
+ pmaddubsw m1, m6, m3
+ paddw m0, m1
+%endif
+ pmulhrsw m0, m7
+%else
+ movd m0, [bufyq+xq]
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpcklbw m0, m1
+%endif
+ punpcklwd m0, m15
+ pmaddwd m0, m14
+ paddd m2, m0
+
+ movq m0, [bufq+xq-2] ; y=0,x=[-2,+5]
+ pxor m4, m4
+ movd m5, [base+byte_blend+1]
+ punpcklbw m5, m5
+.x_loop_ar2_inner:
+ pcmpgtb m1, m4, m0
+ punpcklbw m0, m1
+ pmaddwd m3, m0, m13
+ paddd m3, m2
+ psrldq m2, 4 ; shift top to next pixel
+ psrad m3, [fg_dataq+FGData.ar_coeff_shift]
+ pslldq m3, 4
+ pand m3, m5
+ paddw m0, m3
+ packsswb m0, m0
+ movd [bufq+xq-2], m0
+ psrldq m0, 1
+ inc xq
+ jz .x_loop_ar2_end
+ test xq, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+.ar3:
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+%assign stack_size_padded 0
+%xdefine rstk rsp
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ movifnidn bufyq, bufymp
+%if ARCH_X86_32
+ ALLOC_STACK -15*16
+%else
+ SUB rsp, 16*7
+%assign stack_size_padded (stack_size_padded+16*7)
+%assign stack_size (stack_size+16*7)
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15
+ pxor m3, m3
+ pcmpgtb m3, m0
+ punpckhbw m1, m0, m3
+ punpcklbw m0, m3
+ pshufd m2, m0, q1111
+ pshufd m3, m0, q2222
+ pshufd m4, m0, q3333
+ pshufd m0, m0, q0000
+ pshufd m5, m1, q1111
+ pshufd m6, m1, q2222
+ pshufd m7, m1, q3333
+ pshufd m1, m1, q0000
+ mova [rsp+ 0*16], m0
+ mova [rsp+ 1*16], m2
+ mova [rsp+ 2*16], m3
+ mova [rsp+ 3*16], m4
+ mova [rsp+ 4*16], m1
+ mova [rsp+ 5*16], m5
+ mova [rsp+ 6*16], m6
+ SCRATCH 7, 8, 7
+
+ movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma]
+ pxor m4, m4
+ pcmpgtb m4, m2
+ punpckhbw m5, m2, m4
+ punpcklbw m2, m4
+ pshufd m4, m2, q3232
+ punpcklwd m3, m4, m5
+ pshuflw m5, m4, q3321
+ pshufd m4, m3, q0000
+ pshufd m3, m2, q1111
+ pshufd m2, m2, q0000
+ pinsrw m5, [base+round_vals+shiftq*2-10], 3
+ SCRATCH 2, 9, 8
+ SCRATCH 3, 10, 9
+ SCRATCH 4, 11, 10
+ SCRATCH 5, 12, 11
+
+ movd m2, [base+round_vals-12+shiftq*2]
+%if %2
+ movd m1, [base+pb_1]
+ movd m3, [base+hmul_bits+2+%3*2]
+%endif
+ pxor m0, m0
+ punpcklwd m2, m0
+%if %2
+ punpcklwd m3, m3
+%endif
+ pshufd m2, m2, q0000
+%if %2
+ pshufd m1, m1, q0000
+ pshufd m3, m3, q0000
+ SCRATCH 1, 13, 12
+%endif
+ SCRATCH 2, 14, 13
+%if %2
+ SCRATCH 3, 15, 14
+%endif
+
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+%if %2
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*69+3
+%endif
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+.y_loop_ar3:
+ mov xq, -(76>>%2)
+
+.x_loop_ar3:
+ movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
+ pxor m4, m4
+ pcmpgtb m4, m0
+ punpckhbw m3, m0, m4
+ punpcklbw m0, m4
+
+ psrldq m5, m0, 2
+ psrldq m6, m0, 4
+ psrldq m7, m0, 6
+ punpcklwd m4, m0, m5
+ punpcklwd m6, m7
+ pmaddwd m4, [rsp+ 0*16]
+ pmaddwd m6, [rsp+ 1*16]
+ paddd m4, m6
+
+ palignr m2, m3, m0, 10
+ palignr m3, m0, 12
+ psrldq m0, 8
+
+ movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
+ pxor m6, m6
+ pcmpgtb m6, m1
+ punpckhbw m5, m1, m6
+ punpcklbw m1, m6
+
+ punpcklwd m0, m2
+ punpcklwd m3, m1
+ pmaddwd m0, [rsp+ 2*16]
+ pmaddwd m3, [rsp+ 3*16]
+ paddd m0, m3
+ paddd m0, m4
+
+ movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
+ pxor m7, m7
+ pcmpgtb m7, m2
+ punpckhbw m6, m2, m7
+ punpcklbw m2, m7
+
+ palignr m3, m5, m1, 10
+ palignr m5, m1, 12
+ psrldq m4, m2, 2
+
+ punpcklwd m3, m5
+ punpcklwd m5, m2, m4
+ pmaddwd m3, [rsp+ 6*16]
+ pmaddwd m5, m8
+ paddd m3, m5
+ paddd m0, m3
+
+ psrldq m3, m1, 2
+ psrldq m4, m1, 4
+ psrldq m5, m1, 6
+ psrldq m1, 8
+
+ punpcklwd m3, m4
+ punpcklwd m5, m1
+ pmaddwd m3, [rsp+ 4*16]
+ pmaddwd m5, [rsp+ 5*16]
+ paddd m3, m5
+ paddd m0, m3
+
+%if %2
+ movq m1, [bufyq+xq*2]
+%if %3
+ movq m3, [bufyq+xq*2+82]
+%endif
+ pmaddubsw m7, m13, m1
+%if %3
+ pmaddubsw m5, m13, m3
+ paddw m7, m5
+%endif
+ pmulhrsw m7, m15
+%else
+ movd m7, [bufyq+xq]
+ pxor m1, m1
+ pcmpgtb m1, m7
+ punpcklbw m7, m1
+%endif
+
+ psrldq m1, m2, 4
+ psrldq m3, m2, 6
+ palignr m4, m6, m2, 10
+ palignr m6, m2, 12
+ psrldq m2, 8
+
+ punpcklwd m1, m3
+ punpcklwd m2, m4
+ punpcklwd m6, m7
+ pmaddwd m1, m9
+ pmaddwd m2, m10
+ pmaddwd m6, m11
+ paddd m1, m2
+ paddd m0, m6
+ paddd m0, m1
+ paddd m0, m14
+
+ movq m1, [bufq+xq-3] ; y=0,x=[-3,+4]
+ pxor m4, m4
+ movd m5, [base+byte_blend]
+.x_loop_ar3_inner:
+ pcmpgtb m2, m4, m1
+ punpcklbw m3, m1, m2
+ pmaddwd m2, m3, m12
+ pshufd m3, m2, q1111
+ paddd m2, m3 ; left+cur
+ paddd m2, m0 ; add top
+ psrldq m0, 4
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw, we only care about one value
+ packsswb m2, m2
+ pandn m3, m5, m1
+ pslld m2, 24
+ pand m2, m5
+ por m1, m2, m3
+ movd [bufq+xq-3], m1
+ psrldq m1, 1
+ inc xq
+ jz .x_loop_ar3_end
+ test xq, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar3
+ RET
+%endmacro
+
+generate_grain_uv_fn 420, 1, 1
+generate_grain_uv_fn 422, 1, 0
+generate_grain_uv_fn 444, 0, 0
+
+%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
+%assign %%idx 0
+%define %%tmp %2
+%if %0 == 6
+%define %%tmp %6
+%endif
+%rep 4
+%if %%idx == 0
+ movd %5 %+ d, %2
+ pshuflw %%tmp, %2, q3232
+%else
+ movd %5 %+ d, %%tmp
+%if %%idx == 2
+ punpckhqdq %%tmp, %%tmp
+%elif %%idx == 4
+ psrlq %%tmp, 32
+%endif
+%endif
+ movzx %4 %+ d, %5 %+ w
+ shr %5 %+ d, 16
+
+%if %%idx == 0
+ movd %1, [%3+%4]
+%else
+ pinsrw %1, [%3+%4], %%idx + 0
+%endif
+ pinsrw %1, [%3+%5], %%idx + 1
+%assign %%idx %%idx+2
+%endrep
+%endmacro
+
+INIT_XMM ssse3
+; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \
+ dst, src, scaling, unused1, fg_data, picptr, unused2
+ ; copy stack arguments to new position post-alignment, so that we
+ ; don't have to keep the old stack location in a separate register
+ mov r0, r0m
+ mov r1, r2m
+ mov r2, r4m
+ mov r3, r6m
+ mov r4, r7m
+ mov r5, r8m
+
+ mov [rsp+5*mmsize+ 4*gprsize], r0
+ mov [rsp+5*mmsize+ 6*gprsize], r1
+ mov [rsp+5*mmsize+ 8*gprsize], r2
+ mov [rsp+5*mmsize+10*gprsize], r3
+ mov [rsp+5*mmsize+11*gprsize], r4
+ mov [rsp+5*mmsize+12*gprsize], r5
+%else
+cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \
+ dst, src, scaling, unused1, fg_data, picptr, unused2
+%endif
+ mov srcq, srcm
+ mov fg_dataq, r3m
+ mov scalingq, r5m
+%if STACK_ALIGNMENT < mmsize
+%define r0m [rsp+5*mmsize+ 4*gprsize]
+%define r1m [rsp+5*mmsize+ 5*gprsize]
+%define r2m [rsp+5*mmsize+ 6*gprsize]
+%define r3m [rsp+5*mmsize+ 7*gprsize]
+%define r4m [rsp+5*mmsize+ 8*gprsize]
+%define r5m [rsp+5*mmsize+ 9*gprsize]
+%define r6m [rsp+5*mmsize+10*gprsize]
+%define r7m [rsp+5*mmsize+11*gprsize]
+%define r8m [rsp+5*mmsize+12*gprsize]
+%endif
+ LEA r5, pb_mask
+%define base r5-pb_mask
+ mov r5m, picptrq
+%else
+cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
+ lea r7, [pb_mask]
+%define base r7-pb_mask
+%endif
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ movd m3, [base+mul_bits+r6*2-14]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+ movd m4, [base+max+r6*4]
+ movd m5, [base+min+r6*2]
+ punpcklwd m3, m3
+ punpcklwd m4, m4
+ punpcklwd m5, m5
+ pshufd m3, m3, q0000
+ pshufd m4, m4, q0000
+ pshufd m5, m5, q0000
+ SCRATCH 3, 11, 0
+ SCRATCH 4, 12, 1
+ SCRATCH 5, 13, 2
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+%endif
+
+ mov sbyd, r8m
+ mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
+ test overlapd, overlapd
+ jz .no_vertical_overlap
+ mova m6, [base+pw_1024]
+ mova m7, [base+pb_27_17_17_27]
+ SCRATCH 6, 14, 3
+ SCRATCH 7, 15, 4
+ test sbyd, sbyd
+ jnz .vertical_overlap
+ ; fall-through
+
+.no_vertical_overlap:
+ mov r8m, overlapd
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
+ imul seed, (173 << 24) | 37
+%else
+ imul seed, sbyd, (173 << 24) | 37
+%endif
+ add seed, (105 << 24) | 178
+ rol seed, 8
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ unused1, unused2, see, unused3
+%endif
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstmp, srcq
+%if ARCH_X86_32
+ mov r1m, src_bakq
+ mov r4m, wq
+ DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
+%endif
+
+.loop_x:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
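+ ; the above is the film grain LFSR update: the or/test parity trick yields
+ ; bit = seed[0]^seed[1]^seed[3]^seed[12], and the lea/cmovp pair selects
+ ; seed = (seed >> 1) | (bit << 15)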
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, unused
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
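+ ; 747 = 9*82 + 9: a fixed 9-row, 9-column offset into the 82-byte-wide
+ ; grain_lut (the chroma variant below uses the generic
+ ; (3+(6>>ss_ver))*82 + 3+(6>>ss_hor) form of the same constant)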
+
+%if ARCH_X86_32
+ ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
+ ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, unused
+%endif
+
+.loop_x_odd:
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
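+ ; the body below processes one row of 16 pixels per iteration:
+ ;   noise = round2(scaling[src[x]] * grain[x], scaling_shift)
+ ;   dst[x] = clip_pixel(src[x] + noise)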
+.loop_y:
+ ; src
+ mova m0, [srcq]
+ pxor m2, m2
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, m3
+ vpgatherdw m5, m1, scalingq-1, r0, r5, m3
+%else
+ vpgatherdw m4, m0, scalingq-1, r12, r13, m3
+ vpgatherdw m5, m1, scalingq-1, r12, r13, m3
+%endif
+ REPX {psrlw x, 8}, m4, m5
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+ pcmpgtb m7, m2, m3
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m2, m4
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+ add srcq, r2mp
+ add grain_lutq, 82
+ dec hd
+ jg .loop_y
+
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end
+%if ARCH_X86_32
+ mov srcq, r1mp
+ add srcq, r4mp
+%else
+ lea srcq, [src_bakq+wq]
+%endif
+ btc dword r8m, 2
+ jc .next_blk
+
+ add offxyd, 16
+ test dword r8m, 2 ; r8m & 2 = have_top_overlap
+ jz .loop_x_odd
+
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+1*gprsize], 16
+%else
+ add r11d, 16 ; top_offxyd
+%endif
+ jnz .loop_x_odd_v_overlap
+
+.next_blk:
+ test dword r8m, 1
+ jz .loop_x
+
+ test dword r8m, 2
+ jnz .loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+%if ARCH_X86_32
+ ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
+ ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
+ DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3
+
+ add offxyd, 16 ; left_offxyd
+ mov [rsp+5*mmsize+0*gprsize], offxyd
+
+ DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
+
+ mov seed, r3m
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy
+
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+%endif
+
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_h_overlap:
+ ; src
+ mova m0, [srcq]
+ pxor m2, m2
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, m3
+ vpgatherdw m5, m1, scalingq-1, r0, r5, m3
+%else
+ vpgatherdw m4, m0, scalingq-1, r12, r13, m3
+ vpgatherdw m5, m1, scalingq-1, r12, r13, m3
+%endif
+ REPX {psrlw x, 8}, m4, m5
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+0*gprsize]
+ movd m7, [grain_lutq+r5]
+%else
+ movd m7, [grain_lutq+left_offxyq]
+%endif
+ punpcklbw m7, m3
+ pmaddubsw m6, m15, m7
+ pmulhrsw m6, m14
+ packsswb m6, m6
+ shufps m6, m3, q3210
+ pcmpgtb m2, m6
+ punpcklbw m7, m6, m2
+ punpckhbw m6, m2
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m7, m4
+ pmullw m6, m5
+ pmulhrsw m7, m11
+ pmulhrsw m6, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m7
+ paddw m1, m6
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+ add srcq, r2mp
+ add grain_lutq, 82
+ dec hd
+ jg .loop_y_h_overlap
+
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end
+%if ARCH_X86_32
+ mov srcq, r1m
+ add srcq, r4m
+%else
+ lea srcq, [src_bakq+wq]
+%endif
+ xor dword r8m, 4
+ add offxyd, 16
+
+ ; since this half-block had left-overlap, the next does not
+ test dword r8m, 2 ; have_top_overlap
+ jz .loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+1*gprsize], 16
+%else
+ add r11d, 16 ; top_offxyd
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.end:
+ RET
+
+.vertical_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
+%endif
+
+ or overlapd, 2 ; top_overlap: overlap & 2
+ mov r8m, overlapd
+ movzx sbyd, sbyb
+%if ARCH_X86_32
+ imul r4, [fg_dataq+FGData.seed], 0x00010001
+ DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
+%else
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+ imul tmpd, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add tmpd, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and tmpd, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, tmpd
+%if ARCH_X86_32
+ xor sbyd, seed ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%else
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ tmp, unused2, see, unused3
+%endif
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstmp, srcq
+%if ARCH_X86_32
+ mov r1m, src_bakq
+ mov r4m, wq
+ DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
+%endif
+
+.loop_x_v_overlap:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+ ; we assume from the block above that bits 8-15 of tmpd are zeroed,
+ ; because of the 'and tmpd, 0x00ff00ff' above
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp tmpb ; parity of top_seed
+ shr seed, 16
+ shl tmpd, 16
+ test seeb, seeh
+ setp tmpb ; parity of cur_seed
+ or r6d, 0x00010001
+ xor tmpd, r6d
+ mov seed, tmpd
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
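+ ; the sequence above advances two 16-bit LFSRs at once, one per half of
+ ; (cur_seed << 16) | top_seed; the two feedback bits are deposited into the
+ ; opposite halves so that the single 32-bit ror completes both
+ ; (seed >> 1) | (bit << 15) updates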
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, unused, top_offxy
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*2+0x10001*747+32*82]
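+ ; the extra 32*82 only affects the low (top_offxy) half and points it 32
+ ; grain rows below the top block's origin, i.e. at the rows that vertically
+ ; overlap the current block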
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, unused, top_offxy
+%endif
+
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+5*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+.loop_x_odd_v_overlap:
+%if ARCH_X86_32
+ mov r5, r5m
+ lea r5, [base+pb_27_17]
+ mov [rsp+5*mmsize+12], r5
+%else
+ mova m8, [pb_27_17]
+%endif
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_v_overlap:
+ ; src
+ mova m0, [srcq]
+ pxor m2, m2
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, m3
+ vpgatherdw m5, m1, scalingq-1, r0, r5, m3
+%else
+ vpgatherdw m4, m0, scalingq-1, r12, r13, m3
+ vpgatherdw m5, m1, scalingq-1, r12, r13, m3
+%endif
+ REPX {psrlw x, 8}, m4, m5
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+1*gprsize]
+ movu m7, [grain_lutq+r5]
+%else
+ movu m7, [grain_lutq+top_offxyq]
+%endif
+ punpckhbw m6, m7, m3
+ punpcklbw m7, m3
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+12]
+ pmaddubsw m3, [r5], m6
+ pmaddubsw m6, [r5], m7
+%else
+ pmaddubsw m3, m8, m6
+ pmaddubsw m6, m8, m7
+%endif
+ pmulhrsw m3, m14
+ pmulhrsw m6, m14
+ packsswb m6, m3
+ pcmpgtb m7, m2, m6
+ punpcklbw m2, m6, m7
+ punpckhbw m6, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m2, m4
+ pmullw m6, m5
+ pmulhrsw m2, m11
+ pmulhrsw m6, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m6
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+12], mmsize
+%else
+ mova m8, [pb_17_27]
+%endif
+ add srcq, r2mp
+ add grain_lutq, 82
+ dec hw
+ jz .end_y_v_overlap
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+ btc hd, 16
+ jnc .loop_y_v_overlap
+ jmp .loop_y
+
+.end_y_v_overlap:
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+ add srcq, r4mp
+%else
+ lea srcq, [src_bakq+wq]
+%endif
+ btc dword r8m, 2
+ jc .loop_x_hv_overlap
+ add offxyd, 16
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.loop_x_hv_overlap:
+%if ARCH_X86_32
+ mov r5, r5m
+ lea r5, [base+pb_27_17]
+ mov [rsp+5*mmsize+12], r5
+
+ DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak
+
+ mov r5, [rsp+5*mmsize+1*gprsize]
+ mov r4, offxyd
+ add r5, 16
+ add r4, 16
+ mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy
+ mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy
+
+ DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak
+
+ xor tmpd, tmpd
+ mov seed, r3m
+%else
+ mova m8, [pb_27_17]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ tmp, unused2, see, unused3
+
+ ; we assume from the block above that bits 8-15 of tmpd are zeroed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp tmpb ; parity of top_seed
+ shr seed, 16
+ shl tmpd, 16
+ test seeb, seeh
+ setp tmpb ; parity of cur_seed
+ or r6d, 0x00010001
+ xor tmpd, r6d
+ mov seed, tmpd
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyq, [top_offxyq+16]
+ lea left_offxyq, [offyq+16]
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+
+ movzx r5, offxyw ; top_offxy
+ mov [rsp+5*mmsize+1*gprsize], r5
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy
+
+ movzx top_offxyd, offxyw
+%endif
+ shr offxyd, 16
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_hv_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy
+ mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy
+ movu m6, [grain_lutq+r5]
+ mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy
+ movd m4, [grain_lutq+r0]
+ movd m7, [grain_lutq+r5]
+%else
+ movu m6, [grain_lutq+top_offxyq]
+ movd m4, [grain_lutq+left_offxyq]
+ movd m7, [grain_lutq+topleft_offxyq]
+%endif
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m4, m3
+ punpcklbw m7, m6
+ pmaddubsw m2, m15, m4
+ pmaddubsw m4, m15, m7
+ pmulhrsw m2, m14
+ pmulhrsw m4, m14
+ packsswb m2, m2
+ packsswb m4, m4
+ shufps m2, m3, q3210
+ shufps m4, m6, q3210
+ ; followed by v interpolation (top | cur -> cur)
+ punpcklbw m3, m4, m2
+ punpckhbw m4, m2
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+12]
+ pmaddubsw m7, [r5], m4
+ pmaddubsw m4, [r5], m3
+%else
+ pmaddubsw m7, m8, m4
+ pmaddubsw m4, m8, m3
+%endif
+ pmulhrsw m7, m14
+ pmulhrsw m4, m14
+ packsswb m4, m7
+ pxor m2, m2
+ pcmpgtb m7, m2, m4
+ punpcklbw m3, m4, m7
+ punpckhbw m4, m7
+
+ ; src
+ mova m0, [srcq]
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m5, m0, scalingq-1, r0, r5, m7
+ vpgatherdw m6, m1, scalingq-1, r0, r5, m7
+%else
+ vpgatherdw m5, m0, scalingq-1, r13, r14, m7
+ vpgatherdw m6, m1, scalingq-1, r13, r14, m7
+%endif
+ REPX {psrlw x, 8}, m5, m6
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m3, m5
+ pmullw m4, m6
+ pmulhrsw m3, m11
+ pmulhrsw m4, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m3
+ paddw m1, m4
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+12], mmsize
+%else
+ mova m8, [pb_17_27]
+%endif
+ add srcq, r2mp
+ add grain_lutq, 82
+ dec hw
+ jz .end_y_hv_overlap
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+ btc hd, 16
+ jnc .loop_y_hv_overlap
+ jmp .loop_y_h_overlap
+
+.end_y_hv_overlap:
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end_hv
+%if ARCH_X86_32
+ mov srcq, r1m
+ add srcq, r4m
+%else
+ lea srcq, [src_bakq+wq]
+%endif
+ xor dword r8m, 4
+ add offxyd, 16
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.end_hv:
+ RET
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+INIT_XMM ssse3
+%if ARCH_X86_32
+; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
+; sby, luma, lstride, uv_pl, is_id)
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
+cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \
+ tmp, src, scaling, h, fg_data, picptr, unused
+ mov r0, r0m
+ mov r1, r2m
+ mov r2, r4m
+ mov r3, r6m
+ mov r4, r7m
+ mov [rsp+7*mmsize+3*gprsize], r0
+ mov [rsp+7*mmsize+5*gprsize], r1
+ mov [rsp+7*mmsize+7*gprsize], r2
+ mov [rsp+7*mmsize+9*gprsize], r3
+ mov [rsp+7*mmsize+10*gprsize], r4
+
+ mov r0, r8m
+ mov r1, r9m
+ mov r2, r10m
+ mov r4, r11m
+ mov r3, r12m
+ mov [rsp+7*mmsize+11*gprsize], r0
+ mov [rsp+7*mmsize+12*gprsize], r1
+ mov [rsp+7*mmsize+13*gprsize], r2
+ mov [rsp+7*mmsize+14*gprsize], r4
+%else
+cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \
+ tmp, src, scaling, h, fg_data, picptr, unused
+%endif
+ mov srcq, srcm
+ mov fg_dataq, r3m
+ mov scalingq, r5m
+%if STACK_ALIGNMENT < mmsize
+%define r0m [rsp+7*mmsize+ 3*gprsize]
+%define r1m [rsp+7*mmsize+ 4*gprsize]
+%define r2m [rsp+7*mmsize+ 5*gprsize]
+%define r3m [rsp+7*mmsize+ 6*gprsize]
+%define r4m [rsp+7*mmsize+ 7*gprsize]
+%define r5m [rsp+7*mmsize+ 8*gprsize]
+%define r6m [rsp+7*mmsize+ 9*gprsize]
+%define r7m [rsp+7*mmsize+10*gprsize]
+%define r8m [rsp+7*mmsize+11*gprsize]
+%define r9m [rsp+7*mmsize+12*gprsize]
+%define r10m [rsp+7*mmsize+13*gprsize]
+%define r11m [rsp+7*mmsize+14*gprsize]
+%define r12m [rsp+7*mmsize+15*gprsize]
+%endif
+ LEA r5, pb_mask
+%define base r5-pb_mask
+ mov r5m, r5
+%else
+cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
+ lea r8, [pb_mask]
+%define base r8-pb_mask
+%endif
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ movd m3, [base+mul_bits+r6*2-14]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+ lea tmpd, [r6d*2]
+%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize
+ test r3, r3
+%else
+ cmp dword r12m, 0 ; is_idm
+%endif
+ movd m5, [base+min+r6*2]
+ cmovne r6d, tmpd
+ movd m4, [base+max+r6*2]
+ punpcklwd m3, m3
+ punpcklwd m5, m5
+ punpcklwd m4, m4
+ pshufd m3, m3, q0000
+ pshufd m5, m5, q0000
+ pshufd m4, m4, q0000
+ SCRATCH 3, 11, 0
+ SCRATCH 4, 12, 1
+ SCRATCH 5, 13, 2
+
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+%endif
+
+%if %1
+ mov r6d, dword r11m
+ movd m0, [fg_dataq+FGData.uv_mult+r6*4]
+ movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
+ punpcklbw m6, m1, m0
+ movd m7, [fg_dataq+FGData.uv_offset+r6*4]
+ punpcklwd m6, m6
+ punpcklwd m7, m7
+ pshufd m6, m6, q0000
+ pshufd m7, m7, q0000
+ SCRATCH 6, 14, 3
+ SCRATCH 7, 15, 4
+%endif
+
+ mov sbyd, r8m
+ mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
+ test overlapd, overlapd
+ jz %%no_vertical_overlap
+%if ARCH_X86_32
+%if %2
+ mova m1, [base+pb_23_22_h]
+%else
+ mova m1, [base+pb_27_17_17_27]
+%endif
+ mova m0, [base+pw_1024]
+%else
+%if %2
+ mova m1, [pb_23_22_h]
+%else
+ mova m1, [pb_27_17_17_27]
+%endif
+ mova m0, [pw_1024]
+%endif
+ SCRATCH 0, 8, 5
+ SCRATCH 1, 9, 6
+ test sbyd, sbyd
+ jnz %%vertical_overlap
+ ; fall-through
+
+%%no_vertical_overlap:
+ mov r8m, overlapd
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
+ imul seed, (173 << 24) | 37
+%else
+ imul seed, sbyd, (173 << 24) | 37
+%endif
+ add seed, (105 << 24) | 178
+ rol seed, 8
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
+%define luma_bakq lumaq
+
+ mov wq, r4m
+%if %3
+ shl r10mp, 1
+%endif
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
+
+ mov lstrideq, r10mp
+%endif
+
+ mov lumaq, r9mp
+ lea src_bakq, [srcq+wq]
+ lea luma_bakq, [lumaq+wq*(1+%2)]
+ neg wq
+ sub r0mp, srcq
+%if ARCH_X86_32
+ mov r1m, src_bakq
+ mov r11m, luma_bakq
+ mov r4m, wq
+
+ DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
+%else
+ mov r11mp, src_bakq
+ mov r12mp, strideq
+%endif
+
+%%loop_x:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, overlap, unused1, unused2, lstride
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
+%endif
+
+%%loop_x_odd:
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y:
+ ; src
+%if ARCH_X86_32
+ mov lumaq, r9mp
+%endif
+%if %2
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+16]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+ mov r5, r5m
+ movd m7, [base+pb_1]
+%else
+ movd m7, [pb_1]
+%endif
+ pshufd m7, m7, q0000
+ pxor m2, m2
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pavgw m4, m2
+ pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+ pxor m2, m2
+%endif
+
+%if %1
+%if %2
+ packuswb m4, m6 ; luma
+%endif
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq-1, r0, r5
+ vpgatherdw m5, m6, scalingq-1, r0, r5
+%else
+ vpgatherdw m7, m4, scalingq-1, r12, r2
+ vpgatherdw m5, m6, scalingq-1, r12, r2
+%endif
+ REPX {psrlw x, 8}, m7, m5
+
+ ; unpack chroma_source
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq+ 0]
+ pcmpgtb m6, m2, m3
+ punpcklbw m2, m3, m6
+ punpckhbw m3, m6
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmullw m2, m7
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ ; we already incremented lumaq above
+%else
+ add srcq, r12mp
+%if %3
+ lea lumaq, [lumaq+lstrideq*2]
+%else
+ add lumaq, lstrideq
+%endif
+%endif
+ add grain_lutq, 82
+ dec hw
+ jg %%loop_y
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end
+%if ARCH_X86_32
+ mov srcq, r1mp
+ mov lumaq, r11mp
+%else
+ mov srcq, r11mp
+%endif
+ lea lumaq, [luma_bakq+wq*(1+%2)]
+ add srcq, wq
+%if ARCH_X86_32
+ mov r4m, wq
+ mov r9m, lumaq
+%endif
+%if %2 == 0
+ ; adjust top_offxy
+%if ARCH_X86_32
+ add dword [rsp+7*mmsize+1*gprsize], 16
+%else
+ add r11d, 16
+%endif
+ add offxyd, 16
+ btc dword r8m, 2
+ jc %%loop_x_even
+ test dword r8m, 2
+ jz %%loop_x_odd
+ jmp %%loop_x_odd_v_overlap
+%%loop_x_even:
+%endif
+ test dword r8m, 1
+ jz %%loop_x
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jne %%loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+%if ARCH_X86_32
+%if %2
+ lea r6, [offxyd+16]
+ mov [rsp+7*mmsize+0*gprsize], r6
+%else
+ mov [rsp+7*mmsize+0*gprsize], offxyd
+%endif
+
+ DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
+
+ mov seed, r3m
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, lstride
+
+%if %2
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+%else
+ mov left_offxyd, offyd
+%endif
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, lstride
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_h_overlap:
+ ; src
+%if ARCH_X86_32
+ mov lumaq, r9mp
+%endif
+%if %2
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+16]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+ mov r5, r5m
+ movd m7, [base+pb_1]
+%else
+ movd m7, [pb_1]
+%endif
+ pshufd m7, m7, q0000
+ pxor m2, m2
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pavgw m4, m2
+ pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+ pxor m2, m2
+%endif
+
+%if %1
+%if %2
+ packuswb m4, m6 ; luma
+%endif
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq-1, r0, r5
+ vpgatherdw m5, m6, scalingq-1, r0, r5
+%else
+ vpgatherdw m7, m4, scalingq-1, r12, r2
+ vpgatherdw m5, m6, scalingq-1, r12, r2
+%endif
+ REPX {psrlw x, 8}, m7, m5
+
+ ; unpack chroma_source
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m4, [grain_lutq+offxyq+ 0]
+%if ARCH_X86_32
+ mov r0, [rsp+7*mmsize+0*gprsize]
+ movd m2, [grain_lutq+r0+ 0]
+%else
+ movd m2, [grain_lutq+left_offxyq+ 0]
+%endif
+ punpcklbw m2, m4
+ pmaddubsw m3, m9, m2
+ pmulhrsw m3, m8
+ packsswb m3, m3
+ shufps m3, m4, q3210
+ pxor m4, m4
+ pcmpgtb m4, m3
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmullw m2, m7
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ ; lumaq has already been incremented above
+%else
+ add srcq, r12mp
+%if %3
+ lea lumaq, [lumaq+lstrideq*2]
+%else
+ add lumaq, lstrideq
+%endif
+%endif
+ add grain_lutq, 82
+ dec hw
+ jg %%loop_y_h_overlap
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end
+%if ARCH_X86_32
+ mov srcq, r1mp
+ mov lumaq, r11mp
+%else
+ mov srcq, r11mp
+%endif
+ lea lumaq, [luma_bakq+wq*(1+%2)]
+ add srcq, wq
+%if ARCH_X86_32
+ mov r4m, wq
+ mov r9m, lumaq
+%endif
+%if %2 == 0
+ xor dword r8m, 4
+ ; adjust top_offxyd
+%if ARCH_X86_32
+ add dword [rsp+7*mmsize+1*gprsize], 16
+%else
+ add r11d, 16
+%endif
+ add offxyd, 16
+%endif
+
+ ; r8m = sbym
+ test dword r8m, 2
+%if %2
+ jne %%loop_x_hv_overlap
+ jmp %%loop_x_h_overlap
+%else
+ jne %%loop_x_odd_v_overlap
+ jmp %%loop_x_odd
+%endif
+
+%%end:
+ RET
+
+%%vertical_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
+%endif
+
+ or overlapd, 2 ; top_overlap: overlap & 2
+ mov r8m, overlapd
+ movzx sbyd, sbyb
+%if ARCH_X86_32
+ imul r4, [fg_dataq+FGData.seed], 0x00010001
+ DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
+%else
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+ imul tmpd, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add tmpd, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and tmpd, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, tmpd
+%if ARCH_X86_32
+ xor sbyd, seed ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%if %3
+ shl r10mp, 1
+%endif
+%else
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak
+
+ mov lstrideq, r10mp
+%endif
+
+ mov lumaq, r9mp
+ lea src_bakq, [srcq+wq]
+ lea luma_bakq, [lumaq+wq*(1+%2)]
+ neg wq
+ sub r0mp, srcq
+%if ARCH_X86_32
+ mov r1m, src_bakq
+ mov r11m, luma_bakq
+ mov r4m, wq
+
+ DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
+%else
+ mov r11mp, src_bakq
+ mov r12mp, strideq
+%endif
+
+%%loop_x_v_overlap:
+%if ARCH_X86_32
+ mov seed, r3m
+ xor tmpd, tmpd
+%endif
+ ; we assume from the block above that bits 8-15 of tmpd are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp tmpb ; parity of top_seed
+ shr seed, 16
+ shl tmpd, 16
+ test seeb, seeh
+ setp tmpb ; parity of cur_seed
+ or r6d, 0x00010001
+ xor tmpd, r6d
+ mov seed, tmpd
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, overlap, top_offxy, unused, lstride
+
+ mov offxd, seed
+ mov offyd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak
+%endif
+
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%if ARCH_X86_32
+ mov [rsp+7*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+%%loop_x_odd_v_overlap:
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+%if %3
+ mova m1, [PIC_ptr(pb_23_22)]
+%else
+ mova m1, [PIC_ptr(pb_27_17)]
+%endif
+%%loop_y_v_overlap:
+%if ARCH_X86_32
+ mov lumaq, r9mp
+%endif
+%if %2
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+16]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+ mov r5, r5m
+ movd m7, [base+pb_1]
+%else
+ movd m7, [pb_1]
+%endif
+ pshufd m7, m7, q0000
+ pxor m2, m2
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pavgw m4, m2
+ pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+ pxor m2, m2
+%endif
+
+%if %1
+%if %2
+ packuswb m4, m6 ; luma
+%endif
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq-1, r0, r5
+ vpgatherdw m5, m6, scalingq-1, r0, r5
+%else
+ vpgatherdw m7, m4, scalingq-1, r12, r2
+ vpgatherdw m5, m6, scalingq-1, r12, r2
+%endif
+ REPX {psrlw x, 8}, m7, m5
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ mov r0, [rsp+7*mmsize+1*gprsize]
+ movu m4, [grain_lutq+r0]
+%else
+ movu m4, [grain_lutq+top_offxyq]
+%endif
+ punpckhbw m6, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, m1, m6
+ pmaddubsw m3, m1, m4
+ pmulhrsw m2, m8
+ pmulhrsw m3, m8
+ packsswb m3, m2
+ pxor m6, m6
+ pcmpgtb m6, m3
+ punpcklbw m2, m3, m6
+ punpckhbw m3, m6
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmullw m2, m7
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+ ; unpack chroma_source
+ pxor m4, m4
+ punpckhbw m6, m0, m4
+ punpcklbw m0, m4 ; m0-1: src as word
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m6, m3
+ pmaxsw m0, m13
+ pmaxsw m6, m13
+ pminsw m0, m12
+ pminsw m6, m12
+ packuswb m0, m6
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+ dec hw
+ je %%end_y_v_overlap
+%if ARCH_X86_32
+ add srcq, r2mp
+ ; lumaq has already been incremented above
+%else
+ add srcq, r12mp
+%if %3
+ lea lumaq, [lumaq+lstrideq*2]
+%else
+ add lumaq, lstrideq
+%endif
+%endif
+ add grain_lutq, 82
+%if %3 == 0
+ btc hd, 16
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ mova m1, [PIC_ptr(pb_17_27)]
+ jnc %%loop_y_v_overlap
+%endif
+ jmp %%loop_y
+
+%%end_y_v_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+ mov lumaq, r11mp
+%else
+ mov srcq, r11mp
+%endif
+ lea lumaq, [luma_bakq+wq*(1+%2)]
+ add srcq, wq
+%if ARCH_X86_32
+ mov r4m, wq
+ mov r9m, lumaq
+%endif
+
+%if %2
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall-through to
+ ; h+v overlap
+%else
+%if ARCH_X86_32
+ add dword [rsp+7*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ add offxyd, 16
+ btc dword r8m, 2
+ jnc %%loop_x_odd_v_overlap
+%endif
+
+%%loop_x_hv_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
+
+ mov r6, [rsp+7*mmsize+1*gprsize]
+%if %2
+ lea r0, [r3d+16]
+ add r6, 16
+ mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy
+%else
+ mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy
+%endif
+ mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy
+
+ DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
+
+ mov seed, r3m
+ xor tmpd, tmpd
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+%if %2
+ lea topleft_offxyq, [top_offxyq+16]
+ lea left_offxyq, [offxyq+16]
+%else
+ mov topleft_offxyq, top_offxyq
+ mov left_offxyq, offxyq
+%endif
+
+ ; we assume from the block above that bits 8-15 of tmpd are zeroed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp tmpb ; parity of top_seed
+ shr seed, 16
+ shl tmpd, 16
+ test seeb, seeh
+ setp tmpb ; parity of cur_seed
+ or r6d, 0x00010001
+ xor tmpd, r6d
+ mov seed, tmpd
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+ mov offxd, seed
+ mov offyd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
+%endif
+
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%if ARCH_X86_32
+ mov [rsp+7*mmsize+1*gprsize], top_offxyd
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+%if %3
+ mova m3, [PIC_ptr(pb_23_22)]
+%else
+ mova m3, [PIC_ptr(pb_27_17)]
+%endif
+%%loop_y_hv_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+%if ARCH_X86_32
+ mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy
+ mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy
+ movd m1, [grain_lutq+r0]
+ mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy
+%else
+ movd m1, [grain_lutq+topleft_offxyq]
+%endif
+ movu m2, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ movu m6, [grain_lutq+r5]
+ movd m4, [grain_lutq+r0]
+%else
+ movu m6, [grain_lutq+top_offxyq]
+ movd m4, [grain_lutq+left_offxyq]
+%endif
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m1, m6
+ punpcklbw m4, m2
+ pmaddubsw m0, m9, m1
+ pmaddubsw m1, m9, m4
+ REPX {pmulhrsw x, m8}, m0, m1
+ packsswb m0, m1
+ shufps m4, m0, m2, q3232
+ shufps m0, m6, q3210
+ ; followed by v interpolation (top | cur -> cur)
+ punpcklbw m2, m0, m4
+ punpckhbw m0, m4
+ pmaddubsw m4, m3, m0
+ pmaddubsw m1, m3, m2
+ pmulhrsw m4, m8
+ pmulhrsw m1, m8
+ packsswb m1, m4
+
+ ; src
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov lumaq, r9mp
+%endif
+%if %2
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+16]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+ mov r5, r5m
+ movd m7, [base+pb_1]
+%else
+ movd m7, [pb_1]
+%endif
+ pshufd m7, m7, q0000
+ pxor m2, m2
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pavgw m4, m2
+ pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+ pxor m2, m2
+%endif
+
+%if %1
+%if %2
+ packuswb m4, m6 ; luma
+%endif
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq-1, r0, r5
+ vpgatherdw m5, m6, scalingq-1, r0, r5
+%else
+%if %3
+ vpgatherdw m7, m4, scalingq-1, r2, r12
+ vpgatherdw m5, m6, scalingq-1, r2, r12
+%else
+ vpgatherdw m7, m4, scalingq-1, r2, r13
+ vpgatherdw m5, m6, scalingq-1, r2, r13
+%endif
+%endif
+ REPX {psrlw x, 8}, m7, m5
+
+ ; unpack grain
+ pxor m4, m4
+ pcmpgtb m4, m1
+ punpcklbw m2, m1, m4
+ punpckhbw m1, m4
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m2, m7
+ pmullw m1, m5
+ pmulhrsw m2, m11
+ pmulhrsw m1, m11
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ ; unpack chroma source
+ pxor m4, m4
+ punpckhbw m5, m0, m4
+ punpcklbw m0, m4 ; m0-1: src as word
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m5, m1
+ pmaxsw m0, m13
+ pmaxsw m5, m13
+ pminsw m0, m12
+ pminsw m5, m12
+ packuswb m0, m5
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ ; lumaq has been adjusted above already
+%else
+ add srcq, r12mp
+%if %3
+ lea lumaq, [lumaq+lstrideq*(1+%2)]
+%else
+ add lumaq, r10mp
+%endif
+%endif
+ add grain_lutq, 82
+ dec hw
+%if %3
+ jg %%loop_y_h_overlap
+%else
+ jle %%end_y_hv_overlap
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ mova m3, [PIC_ptr(pb_17_27)]
+ btc hd, 16
+ jnc %%loop_y_hv_overlap
+%if ARCH_X86_64
+ mov lstrideq, r10mp
+%endif
+ jmp %%loop_y_h_overlap
+%%end_y_hv_overlap:
+%if ARCH_X86_64
+ mov lstrideq, r10mp
+%endif
+%endif
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+ mov lumaq, r11mp
+%else
+ mov srcq, r11mp
+%endif
+ lea lumaq, [luma_bakq+wq*(1+%2)]
+ add srcq, wq
+%if ARCH_X86_32
+ mov r4m, wq
+ mov r9m, lumaq
+%endif
+%if %2
+ jmp %%loop_x_hv_overlap
+%else
+%if ARCH_X86_32
+ add dword [rsp+7*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ add offxyd, 16
+ xor dword r8m, 4
+ jmp %%loop_x_odd_v_overlap
+%endif
+
+%%end_hv:
+ RET
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+%endmacro
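+
+; each FGUV_FN instance emits %%FGUV_32x32xN_LOOP twice: once with not-csfl=1,
+; which applies uv_mult/uv_luma_mult/uv_offset before the scaling lookup, and
+; once at .csfl (not-csfl=0) for chroma_scaling_from_luma, where the
+; (sub)sampled luma indexes the scaling LUT directly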
+
+FGUV_FN 420, 1, 1
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+%endif
+
+FGUV_FN 422, 1, 0
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+%endif
+
+FGUV_FN 444, 0, 0
diff --git a/third_party/dav1d/src/x86/ipred.h b/third_party/dav1d/src/x86/ipred.h
new file mode 100644
index 0000000000..f5f187e53d
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+#define decl_fn(type, name) \
+ decl_##type##_fn(BF(dav1d_##name, ssse3)); \
+ decl_##type##_fn(BF(dav1d_##name, avx2)); \
+ decl_##type##_fn(BF(dav1d_##name, avx512icl))
+#define init_fn(type0, type1, name, suffix) \
+ c->type0[type1] = BF(dav1d_##name, suffix)
+
+#define init_angular_ipred_fn(type, name, suffix) \
+ init_fn(intra_pred, type, name, suffix)
+#define init_cfl_pred_fn(type, name, suffix) \
+ init_fn(cfl_pred, type, name, suffix)
+#define init_cfl_ac_fn(type, name, suffix) \
+ init_fn(cfl_ac, type, name, suffix)
+
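+/* e.g. init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3) expands to
+ * c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, ssse3), assigning the
+ * bitdepth-suffixed assembly symbol to the matching dsp table slot. */
+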
+decl_fn(angular_ipred, ipred_dc);
+decl_fn(angular_ipred, ipred_dc_128);
+decl_fn(angular_ipred, ipred_dc_top);
+decl_fn(angular_ipred, ipred_dc_left);
+decl_fn(angular_ipred, ipred_h);
+decl_fn(angular_ipred, ipred_v);
+decl_fn(angular_ipred, ipred_paeth);
+decl_fn(angular_ipred, ipred_smooth);
+decl_fn(angular_ipred, ipred_smooth_h);
+decl_fn(angular_ipred, ipred_smooth_v);
+decl_fn(angular_ipred, ipred_z1);
+decl_fn(angular_ipred, ipred_z2);
+decl_fn(angular_ipred, ipred_z3);
+decl_fn(angular_ipred, ipred_filter);
+
+decl_fn(cfl_pred, ipred_cfl);
+decl_fn(cfl_pred, ipred_cfl_128);
+decl_fn(cfl_pred, ipred_cfl_top);
+decl_fn(cfl_pred, ipred_cfl_left);
+
+decl_fn(cfl_ac, ipred_cfl_ac_420);
+decl_fn(cfl_ac, ipred_cfl_ac_422);
+decl_fn(cfl_ac, ipred_cfl_ac_444);
+
+decl_fn(pal_pred, pal_pred);
+
+static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, ssse3);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, ssse3);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, ssse3);
+ init_angular_ipred_fn(HOR_PRED, ipred_h, ssse3);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, ssse3);
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, ssse3);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, ssse3);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, ssse3);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3);
+ init_angular_ipred_fn(Z1_PRED, ipred_z1, ssse3);
+ init_angular_ipred_fn(Z2_PRED, ipred_z2, ssse3);
+ init_angular_ipred_fn(Z3_PRED, ipred_z3, ssse3);
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, ssse3);
+
+ init_cfl_pred_fn(DC_PRED, ipred_cfl, ssse3);
+ init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, ssse3);
+ init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, ssse3);
+ init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, ssse3);
+
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, ssse3);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, ssse3);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, ssse3);
+
+ c->pal_pred = BF(dav1d_pal_pred, ssse3);
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ init_angular_ipred_fn(DC_PRED, ipred_dc, avx2);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx2);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx2);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx2);
+ init_angular_ipred_fn(HOR_PRED, ipred_h, avx2);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, avx2);
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx2);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx2);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx2);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx2);
+ init_angular_ipred_fn(Z1_PRED, ipred_z1, avx2);
+ init_angular_ipred_fn(Z2_PRED, ipred_z2, avx2);
+ init_angular_ipred_fn(Z3_PRED, ipred_z3, avx2);
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx2);
+
+ init_cfl_pred_fn(DC_PRED, ipred_cfl, avx2);
+ init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, avx2);
+ init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, avx2);
+ init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, avx2);
+
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, avx2);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, avx2);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2);
+
+ c->pal_pred = BF(dav1d_pal_pred, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+#if BITDEPTH == 8
+ init_angular_ipred_fn(DC_PRED, ipred_dc, avx512icl);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx512icl);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx512icl);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx512icl);
+ init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl);
+ init_angular_ipred_fn(Z2_PRED, ipred_z2, avx512icl);
+#endif
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl);
+ init_angular_ipred_fn(Z1_PRED, ipred_z1, avx512icl);
+ init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl);
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl);
+
+ c->pal_pred = BF(dav1d_pal_pred, avx512icl);
+#endif
+}
diff --git a/third_party/dav1d/src/x86/ipred16_avx2.asm b/third_party/dav1d/src/x86/ipred16_avx2.asm
new file mode 100644
index 0000000000..f4931e977b
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred16_avx2.asm
@@ -0,0 +1,5005 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 64
+
+%macro SMOOTH_WEIGHTS 1-*
+const smooth_weights_1d_16bpc ; sm_weights[] << 7
+ %rep %0
+ dw %1*128
+ %rotate 1
+ %endrep
+const smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[]
+ %rep %0
+ dw %1, 256-%1
+ %rotate 1
+ %endrep
+%endmacro
+
+SMOOTH_WEIGHTS 0, 0, 255, 128, 255, 149, 85, 64, \
+ 255, 197, 146, 105, 73, 50, 37, 32, \
+ 255, 225, 196, 170, 145, 123, 102, 84, \
+ 68, 54, 43, 33, 26, 20, 17, 16, \
+ 255, 240, 225, 210, 196, 182, 169, 157, \
+ 145, 133, 122, 111, 101, 92, 83, 74, \
+ 66, 59, 52, 45, 39, 34, 29, 25, \
+ 21, 17, 14, 12, 10, 9, 8, 8, \
+ 255, 248, 240, 233, 225, 218, 210, 203, \
+ 196, 189, 182, 176, 169, 163, 156, 150, \
+ 144, 138, 133, 127, 121, 116, 111, 106, \
+ 101, 96, 91, 86, 82, 77, 73, 69, \
+ 65, 61, 57, 54, 50, 47, 44, 41, \
+ 38, 35, 32, 29, 27, 25, 22, 20, \
+ 18, 16, 15, 13, 12, 10, 9, 8, \
+ 7, 6, 6, 5, 5, 4, 4, 4
+
+%if ARCH_X86_64
+
+ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11
+ db 4, 5, 4, 5, 4, 5, 6, 7, 0, 1, 0, 1, 12, 13, 14, 15
+filter_shuf1: db 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 12, 13, -1, -1
+filter_shuf2: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1
+filter_shuf3: db 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 8, 9, -1, -1
+pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
+ dw 8*64, 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64
+z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
+z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
+z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
+ db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
+pw_m1024: times 2 dw -1024
+pw_1to16: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+pw_16to1: dw 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+z2_ymul: dw 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4
+z2_ymul8: dw 1, 2, 5, 6, 3, 4, 7, 8, 5, 6, 16, 16, 7, 8
+pb_90: times 4 db 90
+z2_y_shuf_h4: dd 3, 7, 2, 6, 1, 5, 0, 4
+z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+z2_x_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+z2_y_shuf: db 6, 7, 14, 15, 4, 5, 12, 13, 4, 5, 12, 13, 2, 3, 10, 11
+z2_y_shuf_us: db 6, 7, 14, 15, 2, 3, 10, 11, 4, 5, 12, 13, 0, 1, 8, 9
+z_filter_k: dw 4, 4, 5, 5, 4, 4
+ dw 8, 8, 6, 6, 4, 4
+ dw 0, 0, 0, 0, 2, 2
+
+%define pw_2 (z_filter_k+32)
+%define pw_4 (z_filter_k+ 0)
+%define pw_16 (z2_ymul8 +20)
+
+pw_1: times 2 dw 1
+pw_3: times 2 dw 3
+pw_62: times 2 dw 62
+pw_512: times 2 dw 512
+pw_2048: times 2 dw 2048
+pd_8: dd 8
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
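+; each entry stores the signed offset of a branch label relative to the table
+; symbol; the symbol sits 2*4 bytes before the first entry because the
+; dispatch index is tzcnt(size) and the smallest size (4) yields index 2.
+; Dispatchers below movsxd-load an entry and add the table address back
+; before jumping.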
+
+%define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4)
+%define ipred_cfl_splat_16bpc_avx2_table (ipred_cfl_16bpc_avx2_table + 8*4)
+
+JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_h_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z2_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3_16bpc, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_filter_16bpc, avx2, w4, w8, w16, w32
+JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
+ s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left_16bpc, avx2, h4, h8, h16, h32
+JMP_TABLE ipred_cfl_ac_444_16bpc, avx2, w4, w8, w16, w32
+JMP_TABLE pal_pred_16bpc, avx2, w4, w8, w16, w32, w64
+
+cextern dr_intra_derivative
+cextern filter_intra_taps
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
+ movifnidn hd, hm
+ add tlq, 2
+ movd xm4, wd
+ pxor xm3, xm3
+ pavgw xm4, xm3
+ tzcnt wd, wd
+ movd xm5, wd
+ movu m0, [tlq]
+ lea r5, [ipred_dc_left_16bpc_avx2_table]
+ movsxd r6, [r5+wq*4]
+ add r6, r5
+ add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ mov hd, hm
+ sub tlq, hq
+ movd xm4, hd
+ sub tlq, hq
+ pxor xm3, xm3
+ pavgw xm4, xm3
+ tzcnt r6d, hd
+ movd xm5, r6d
+ movu m0, [tlq]
+ lea r5, [ipred_dc_left_16bpc_avx2_table]
+ movsxd r6, [r5+r6*4]
+ add r6, r5
+ add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ paddw m0, [tlq+96]
+ paddw m0, [tlq+64]
+.h32:
+ paddw m0, [tlq+32]
+.h16:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+.h8:
+ psrldq xm1, xm0, 8
+ paddw xm0, xm1
+.h4:
+ punpcklwd xm0, xm3
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ paddd xm0, xm4
+ psrld xm0, xm5
+ lea stride3q, [strideq*3]
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ jmp wq
+
+cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd xm4, r5d
+ tzcnt r5d, r5d
+ movd xm5, r5d
+ lea r5, [ipred_dc_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+5*4]
+ pxor m3, m3
+ psrlw xm4, 1
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movq xm0, [tlq-8]
+ jmp wq
+.w4:
+ movq xm1, [tlq+2]
+ paddw m0, m4
+ paddw m0, m1
+ psrlq m1, m0, 32
+ paddw m0, m1
+ psrld m1, m0, 16
+ paddw m0, m1
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xm0, 3
+ jmp .w4_end
+.w4_mul:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ lea r2d, [hq*2]
+ mov r6d, 0xAAAB6667
+ shrx r6d, r6d, r2d
+ punpckhwd xm1, xm0, xm3
+ punpcklwd xm0, xm3
+ paddd xm0, xm1
+ movd xm1, r6d
+ psrld xm0, 2
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
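+ ; 0xAAAB6667 packs ceil(2^17/3) (high half) and ceil(2^17/5) (low half);
+ ; shrx by 2*h (mod 32) selects 0xAAAB for h=8 (w+h=12) or 0x6667 for h=16
+ ; (w+h=20), completing the fixed-point division of the sum by w+h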
+.w4_end:
+ vpbroadcastw xm0, xm0
+.s4:
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm0
+ movq [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+ALIGN function_align
+.h8:
+ mova xm0, [tlq-16]
+ jmp wq
+.w8:
+ vextracti128 xm1, m0, 1
+ paddw xm0, [tlq+2]
+ paddw xm0, xm4
+ paddw xm0, xm1
+ psrld xm1, xm0, 16
+ paddw xm0, xm1
+ pblendw xm0, xm3, 0xAA
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w8_end:
+ vpbroadcastw xm0, xm0
+.s8:
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm0
+ mova [dstq+strideq*2], xm0
+ mova [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-32]
+ jmp wq
+.w16:
+ paddw m0, [tlq+2]
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4
+ paddw xm0, xm1
+ punpckhwd xm1, xm0, xm3
+ punpcklwd xm0, xm3
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ test hb, 8|32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w16_end:
+ vpbroadcastw m0, xm0
+.s16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-64]
+ paddw m0, [tlq-32]
+ jmp wq
+.w32:
+ paddw m0, [tlq+ 2]
+ paddw m0, [tlq+34]
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4
+ paddw xm0, xm1
+ punpcklwd xm1, xm0, xm3
+ punpckhwd xm0, xm3
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x6667AAAB
+ shrx r6d, r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w32_end:
+ vpbroadcastw m0, xm0
+ mova m1, m0
+.s32:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m0
+ mova [dstq+strideq*2+32*1], m1
+ mova [dstq+stride3q +32*0], m0
+ mova [dstq+stride3q +32*1], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+ALIGN function_align
+.h64:
+ mova m0, [tlq-128]
+ mova m1, [tlq- 96]
+ paddw m0, [tlq- 64]
+ paddw m1, [tlq- 32]
+ paddw m0, m1
+ jmp wq
+.w64:
+ movu m1, [tlq+ 2]
+ paddw m0, [tlq+34]
+ paddw m1, [tlq+66]
+ paddw m0, [tlq+98]
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ punpcklwd xm1, xm0, xm3
+ punpckhwd xm0, xm3
+ paddd xm1, xm4
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 64
+ je .w64_end
+ mov r6d, 0x6667AAAB
+ shrx r6d, r6d, hd
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w64_end:
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+.s64:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*0+32*2], m2
+ mova [dstq+strideq*0+32*3], m3
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m2
+ mova [dstq+strideq*1+32*3], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s64
+ RET
+
+cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
+ mov r6d, r8m
+ shr r6d, 11
+ lea r5, [ipred_dc_splat_16bpc_avx2_table]
+ tzcnt wd, wd
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4]
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
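+; r8m is the pixel_max argument (0x3ff for 10-bit, 0xfff for 12-bit), so the
+; shr by 11 yields 0 or 1 and picks one of two adjacent entries at pw_512;
+; presumably the second entry is 2048, i.e. half the range for either depth.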
+
+cglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ movu m0, [tlq+ 2]
+ movu m1, [tlq+34]
+ movu m2, [tlq+66]
+ movu m3, [tlq+98]
+ lea r5, [ipred_dc_splat_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+%macro IPRED_H 2 ; w, store_type
+ vpbroadcastw m0, [tlq-2]
+ vpbroadcastw m1, [tlq-4]
+ vpbroadcastw m2, [tlq-6]
+ vpbroadcastw m3, [tlq-8]
+ sub tlq, 8
+ mov%2 [dstq+strideq*0], m0
+ mov%2 [dstq+strideq*1], m1
+ mov%2 [dstq+strideq*2], m2
+ mov%2 [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w%1
+ RET
+ALIGN function_align
+%endmacro
+
+cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ lea r5, [ipred_h_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+INIT_XMM avx2
+.w4:
+ IPRED_H 4, q
+.w8:
+ IPRED_H 8, a
+INIT_YMM avx2
+.w16:
+ IPRED_H 16, a
+.w32:
+ vpbroadcastw m0, [tlq-2]
+ vpbroadcastw m1, [tlq-4]
+ vpbroadcastw m2, [tlq-6]
+ vpbroadcastw m3, [tlq-8]
+ sub tlq, 8
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m0
+ mova [dstq+strideq*1+32*0], m1
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m2
+ mova [dstq+strideq*2+32*1], m2
+ mova [dstq+stride3q +32*0], m3
+ mova [dstq+stride3q +32*1], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w32
+ RET
+.w64:
+ vpbroadcastw m0, [tlq-2]
+ vpbroadcastw m1, [tlq-4]
+ sub tlq, 4
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m0
+ mova [dstq+strideq*0+32*2], m0
+ mova [dstq+strideq*0+32*3], m0
+ mova [dstq+strideq*1+32*0], m1
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m1
+ mova [dstq+strideq*1+32*3], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w64
+ RET
+
+%macro PAETH 3 ; top, signed_ldiff, ldiff
+ paddw m0, m%2, m1
+ psubw m7, m3, m0 ; tldiff
+ psubw m0, m%1 ; tdiff
+ pabsw m7, m7
+ pabsw m0, m0
+ pminsw m7, m0
+ pcmpeqw m0, m7
+ pcmpgtw m7, m%3, m7
+ vpblendvb m0, m3, m%1, m0
+ vpblendvb m0, m1, m0, m7
+%endmacro
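+; One PAETH invocation selects, per lane, whichever of left/top/topleft is
+; closest to base = left + top - topleft, preferring left and then top on
+; ties. Scalar sketch of a single lane (illustrative C only):
+;
+;   static int paeth(int left, int top, int topleft) {
+;       const int base   = left + top - topleft;
+;       const int ldiff  = abs(base - left);    /* == abs(top  - topleft) */
+;       const int tdiff  = abs(base - top);     /* == abs(left - topleft) */
+;       const int tldiff = abs(base - topleft);
+;       if (ldiff <= tdiff && ldiff <= tldiff) return left;
+;       return tdiff <= tldiff ? top : topleft;
+;   }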
+
+cglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h
+%define base r5-ipred_paeth_16bpc_avx2_table
+ movifnidn hd, hm
+ lea r5, [ipred_paeth_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ vpbroadcastw m3, [tlq] ; topleft
+ add wq, r5
+ jmp wq
+.w4:
+ vpbroadcastq m2, [tlq+2] ; top
+ movsldup m6, [base+ipred_hv_shuf]
+ lea r3, [strideq*3]
+ psubw m4, m2, m3
+ pabsw m5, m4
+.w4_loop:
+ sub tlq, 8
+ vpbroadcastq m1, [tlq]
+ pshufb m1, m6 ; left
+ PAETH 2, 4, 5
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ vbroadcasti128 m2, [tlq+2]
+ movsldup m6, [base+ipred_hv_shuf]
+ psubw m4, m2, m3
+ pabsw m5, m4
+.w8_loop:
+ sub tlq, 4
+ vpbroadcastd m1, [tlq]
+ pshufb m1, m6
+ PAETH 2, 4, 5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ movu m2, [tlq+2]
+ psubw m4, m2, m3
+ pabsw m5, m4
+.w16_loop:
+ sub tlq, 2
+ vpbroadcastw m1, [tlq]
+ PAETH 2, 4, 5
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ movu m2, [tlq+2]
+ movu m6, [tlq+34]
+%if WIN64
+ movaps r4m, xmm8
+ movaps r6m, xmm9
+%endif
+ psubw m4, m2, m3
+ psubw m8, m6, m3
+ pabsw m5, m4
+ pabsw m9, m8
+.w32_loop:
+ sub tlq, 2
+ vpbroadcastw m1, [tlq]
+ PAETH 2, 4, 5
+ mova [dstq+32*0], m0
+ PAETH 6, 8, 9
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+%if WIN64
+ movaps xmm8, r4m
+ movaps xmm9, r6m
+%endif
+ RET
+ALIGN function_align
+.w64:
+ WIN64_SPILL_XMM 16
+ movu m2, [tlq+ 2]
+ movu m6, [tlq+34]
+ movu m10, [tlq+66]
+ movu m13, [tlq+98]
+ psubw m4, m2, m3
+ psubw m8, m6, m3
+ psubw m11, m10, m3
+ psubw m14, m13, m3
+ pabsw m5, m4
+ pabsw m9, m8
+ pabsw m12, m11
+ pabsw m15, m14
+.w64_loop:
+ sub tlq, 2
+ vpbroadcastw m1, [tlq]
+ PAETH 2, 4, 5
+ mova [dstq+32*0], m0
+ PAETH 6, 8, 9
+ mova [dstq+32*1], m0
+ PAETH 10, 11, 12
+ mova [dstq+32*2], m0
+ PAETH 13, 14, 15
+ mova [dstq+32*3], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights
+%define base r6-ipred_smooth_v_16bpc_avx2_table
+ lea r6, [ipred_smooth_v_16bpc_avx2_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ lea weightsq, [base+smooth_weights_1d_16bpc+hq*4]
+ neg hq
+ vpbroadcastw m5, [tlq+hq*2] ; bottom
+ add wq, r6
+ jmp wq
+.w4:
+ vpbroadcastq m4, [tlq+2] ; top
+ movsldup m3, [base+ipred_hv_shuf]
+ lea r6, [strideq*3]
+ psubw m4, m5 ; top - bottom
+.w4_loop:
+ vpbroadcastq m0, [weightsq+hq*2]
+ pshufb m0, m3
+ pmulhrsw m0, m4
+ paddw m0, m5
+ vextracti128 xm1, m0, 1
+ movhps [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movq [dstq+r6 ], xm0
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w4_loop
+.ret:
+ RET
+.w8:
+ vbroadcasti128 m4, [tlq+2]
+ movsldup m3, [base+ipred_hv_shuf]
+ lea r6, [strideq*3]
+ psubw m4, m5
+.w8_loop:
+ vpbroadcastd m0, [weightsq+hq*2+0]
+ vpbroadcastd m1, [weightsq+hq*2+4]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ paddw m0, m5
+ paddw m1, m5
+ vextracti128 [dstq+strideq*0], m0, 1
+ mova [dstq+strideq*1], xm0
+ vextracti128 [dstq+strideq*2], m1, 1
+ mova [dstq+r6 ], xm1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w8_loop
+ RET
+.w16:
+ movu m4, [tlq+2]
+ lea r6, [strideq*3]
+ psubw m4, m5
+.w16_loop:
+ vpbroadcastw m0, [weightsq+hq*2+0]
+ vpbroadcastw m1, [weightsq+hq*2+2]
+ vpbroadcastw m2, [weightsq+hq*2+4]
+ vpbroadcastw m3, [weightsq+hq*2+6]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r6 ], m3
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w16_loop
+ RET
+.w32:
+ WIN64_SPILL_XMM 7
+ movu m4, [tlq+ 2]
+ movu m6, [tlq+34]
+ psubw m4, m5
+ psubw m6, m5
+.w32_loop:
+ vpbroadcastw m1, [weightsq+hq*2+0]
+ vpbroadcastw m3, [weightsq+hq*2+2]
+ pmulhrsw m0, m4, m1
+ pmulhrsw m1, m6
+ pmulhrsw m2, m4, m3
+ pmulhrsw m3, m6
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w32_loop
+ RET
+.w64:
+ WIN64_SPILL_XMM 8
+ movu m3, [tlq+ 2]
+ movu m4, [tlq+34]
+ movu m6, [tlq+66]
+ movu m7, [tlq+98]
+ REPX {psubw x, m5}, m3, m4, m6, m7
+.w64_loop:
+ vpbroadcastw m2, [weightsq+hq*2]
+ pmulhrsw m0, m3, m2
+ pmulhrsw m1, m4, m2
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+32*0], m0
+ pmulhrsw m0, m6, m2
+ mova [dstq+32*1], m1
+ pmulhrsw m1, m7, m2
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ add dstq, strideq
+ inc hq
+ jl .w64_loop
+ RET
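+; smooth_v fades each top pixel towards the bottom-left neighbour ("bottom",
+; m5) with a per-row weight; the 1-D weight table is assumed here to hold the
+; AV1 0..256 weights pre-scaled by << 7 so a single pmulhrsw performs the
+; multiply and rounding. Rough scalar equivalent (illustrative C):
+;
+;   for (int y = 0; y < h; y++, dst += stride)
+;       for (int x = 0; x < w; x++)
+;           dst[x] = bottom + (((top[x] - bottom) * sm_weights[y] + 128) >> 8);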
+
+cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+%define base r6-ipred_smooth_h_16bpc_avx2_table
+ lea r6, [ipred_smooth_h_16bpc_avx2_table]
+ mov wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m5, [tlq+wq*2] ; right
+ tzcnt wd, wd
+ add hd, hd
+ movsxd wq, [r6+wq*4]
+ sub tlq, hq
+ lea stride3q, [strideq*3]
+ add wq, r6
+ jmp wq
+.w4:
+ vpbroadcastq m4, [base+smooth_weights_1d_16bpc+4*2]
+ movsldup m3, [base+ipred_hv_shuf]
+.w4_loop:
+ vpbroadcastq m0, [tlq+hq-8] ; left
+ pshufb m0, m3
+ psubw m0, m5 ; left - right
+ pmulhrsw m0, m4
+ paddw m0, m5
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4*2
+ jg .w4_loop
+ RET
+.w8:
+ vbroadcasti128 m4, [base+smooth_weights_1d_16bpc+8*2]
+ movsldup m3, [base+ipred_hv_shuf]
+.w8_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ vpbroadcastd m1, [tlq+hq-8]
+ pshufb m0, m3
+ pshufb m1, m3
+ psubw m0, m5
+ psubw m1, m5
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hq, 4*2
+ jg .w8_loop
+ RET
+.w16:
+ movu m4, [base+smooth_weights_1d_16bpc+16*2]
+.w16_loop:
+ vpbroadcastq m3, [tlq+hq-8]
+ punpcklwd m3, m3
+ psubw m3, m5
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hq, 4*2
+ jg .w16_loop
+ RET
+.w32:
+ WIN64_SPILL_XMM 7
+ movu m4, [base+smooth_weights_1d_16bpc+32*2]
+ movu m6, [base+smooth_weights_1d_16bpc+32*3]
+.w32_loop:
+ vpbroadcastw m1, [tlq+hq-2]
+ vpbroadcastw m3, [tlq+hq-4]
+ psubw m1, m5
+ psubw m3, m5
+ pmulhrsw m0, m4, m1
+ pmulhrsw m1, m6
+ pmulhrsw m2, m4, m3
+ pmulhrsw m3, m6
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ lea dstq, [dstq+strideq*2]
+ sub hq, 2*2
+ jg .w32_loop
+ RET
+.w64:
+ WIN64_SPILL_XMM 8
+ movu m3, [base+smooth_weights_1d_16bpc+32*4]
+ movu m4, [base+smooth_weights_1d_16bpc+32*5]
+ movu m6, [base+smooth_weights_1d_16bpc+32*6]
+ movu m7, [base+smooth_weights_1d_16bpc+32*7]
+.w64_loop:
+ vpbroadcastw m2, [tlq+hq-2]
+ psubw m2, m5
+ pmulhrsw m0, m3, m2
+ pmulhrsw m1, m4, m2
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+32*0], m0
+ pmulhrsw m0, m6, m2
+ mova [dstq+32*1], m1
+ pmulhrsw m1, m7, m2
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ add dstq, strideq
+ sub hq, 1*2
+ jg .w64_loop
+ RET
+
+%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
+ pmaddwd m0, m%1, m%3
+ pmaddwd m1, m%2, m%4
+ paddd m0, m%5
+ paddd m1, m%6
+ psrld m0, 8
+ psrld m1, 8
+ packssdw m0, m1
+ pavgw m0, m5
+%endmacro
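+; The two pmaddwd/paddd pairs above accumulate the vertical blend (top,bottom
+; interleaved with a weight pair) plus the horizontal blend passed in as the
+; add operands; psrld 8 followed by pavgw against zero (m5) rounds the total
+; as (v + h + 256) >> 9, i.e. the usual smooth blend
+;   (w_v*top + (256-w_v)*bottom + w_h*left + (256-w_h)*right + 256) >> 9
+; assuming the 2-D weight tables store (w, 256-w) pairs.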
+
+cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
+%define base r6-ipred_smooth_16bpc_avx2_table
+ lea r6, [ipred_smooth_16bpc_avx2_table]
+ mov wd, wm
+ vpbroadcastw m4, [tlq+wq*2] ; right
+ tzcnt wd, wd
+ mov hd, hm
+ sub tlq, hq
+ sub tlq, hq
+ movsxd wq, [r6+wq*4]
+ pxor m5, m5
+ add wq, r6
+ lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*4]
+ jmp wq
+.w4:
+ WIN64_SPILL_XMM 11
+ vpbroadcastw m0, [tlq] ; bottom
+ vpbroadcastq m6, [tlq+hq*2+2]
+ movsldup m7, [base+ipred_hv_shuf]
+ movshdup m9, [base+ipred_hv_shuf]
+ vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+4*4]
+ punpcklwd m6, m0 ; top, bottom
+ punpcklqdq m8, m9, m9
+ punpckhqdq m9, m9
+ lea r3, [strideq*3]
+.w4_loop:
+ vpbroadcastq m3, [tlq+hq*2-8]
+ vbroadcasti128 m1, [v_weightsq]
+ pshufb m3, m7
+ punpcklwd m2, m3, m4 ; left, right
+ punpckhwd m3, m4
+ pmaddwd m2, m10
+ pmaddwd m3, m10
+ pshufb m0, m1, m8
+ pshufb m1, m9
+ SMOOTH_2D_END 0, 1, 6, 6, 2, 3
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ add v_weightsq, 16
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8:
+%assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 12
+ vpbroadcastw m0, [tlq] ; bottom
+ vbroadcasti128 m7, [tlq+hq*2+2]
+ movsldup m8, [base+ipred_hv_shuf]
+ movshdup m9, [base+ipred_hv_shuf]
+ vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+8*4+16*0]
+ vbroadcasti128 m11, [base+smooth_weights_2d_16bpc+8*4+16*1]
+ punpcklwd m6, m7, m0 ; top, bottom
+ punpckhwd m7, m0
+.w8_loop:
+ vpbroadcastd m3, [tlq+hq*2-4]
+ vpbroadcastq m1, [v_weightsq]
+ pshufb m3, m8
+ punpcklwd m2, m3, m4 ; left, right
+ punpckhwd m3, m4
+ pmaddwd m2, m10
+ pmaddwd m3, m11
+ pshufb m1, m9
+ SMOOTH_2D_END 1, 1, 6, 7, 2, 3
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ add v_weightsq, 8
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+%assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 11
+ vpbroadcastw m0, [tlq] ; bottom
+ movu m7, [tlq+hq*2+2]
+ mova xm8, [base+smooth_weights_2d_16bpc+16*4+16*0]
+ mova xm9, [base+smooth_weights_2d_16bpc+16*4+16*1]
+ vinserti128 m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1
+ vinserti128 m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1
+ punpcklwd m6, m7, m0 ; top, bottom
+ punpckhwd m7, m0
+.w16_loop:
+ vpbroadcastd m3, [tlq+hq*2-4]
+ vpbroadcastd m1, [v_weightsq+0]
+ punpcklwd m3, m4 ; left, right
+ pshufd m2, m3, q1111
+ pmaddwd m10, m8, m2
+ pmaddwd m2, m9
+ pshufd m3, m3, q0000
+ SMOOTH_2D_END 1, 1, 6, 7, 10, 2
+ vpbroadcastd m1, [v_weightsq+4]
+ pmaddwd m2, m8, m3
+ pmaddwd m3, m9
+ mova [dstq+strideq*0], m0
+ SMOOTH_2D_END 1, 1, 6, 7, 2, 3
+ mova [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ add v_weightsq, 8
+ sub hq, 2
+ jg .w16_loop
+ RET
+.w32:
+%assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 15
+ vpbroadcastw m0, [tlq] ; bottom
+ movu m7, [tlq+hq*2+ 2]
+ movu m9, [tlq+hq*2+34]
+ mova xm10, [base+smooth_weights_2d_16bpc+32*4+16*0]
+ mova xm11, [base+smooth_weights_2d_16bpc+32*4+16*1]
+ vinserti128 m10, [base+smooth_weights_2d_16bpc+32*4+16*2], 1
+ vinserti128 m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1
+ mova xm12, [base+smooth_weights_2d_16bpc+32*4+16*4]
+ mova xm13, [base+smooth_weights_2d_16bpc+32*4+16*5]
+ vinserti128 m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1
+ vinserti128 m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1
+ punpcklwd m6, m7, m0
+ punpckhwd m7, m0
+ punpcklwd m8, m9, m0
+ punpckhwd m9, m0
+.w32_loop:
+ vpbroadcastw m3, [tlq+hq*2-2]
+ vpbroadcastd m14, [v_weightsq]
+ punpcklwd m3, m4
+ pmaddwd m1, m10, m3
+ pmaddwd m2, m11, m3
+ pmaddwd m0, m6, m14
+ paddd m0, m1
+ pmaddwd m1, m7, m14
+ paddd m1, m2
+ pmaddwd m2, m12, m3
+ pmaddwd m3, m13
+ psrld m0, 8
+ psrld m1, 8
+ packssdw m0, m1
+ pavgw m0, m5
+ mova [dstq+32*0], m0
+ SMOOTH_2D_END 14, 14, 8, 9, 2, 3
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ add v_weightsq, 4
+ dec hd
+ jg .w32_loop
+ RET
+.w64:
+%assign stack_offset stack_offset - stack_size_padded
+ PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base
+ mov dst_baseq, dstq
+ mov tl_baseq, tlq
+ mov v_weights_baseq, v_weightsq
+ xor xq, xq
+.w64_loop_x:
+ mov yq, hq
+ lea tlq, [tl_baseq+hq*2]
+ vpbroadcastw m0, [tl_baseq] ; bottom
+ movu m7, [tlq+xq*2+ 2]
+ movu m9, [tlq+xq*2+34]
+ mova xm10, [base+smooth_weights_2d_16bpc+64*4+16*0]
+ mova xm11, [base+smooth_weights_2d_16bpc+64*4+16*1]
+ vinserti128 m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1
+ vinserti128 m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1
+ mova xm12, [base+smooth_weights_2d_16bpc+64*4+16*4]
+ mova xm13, [base+smooth_weights_2d_16bpc+64*4+16*5]
+ vinserti128 m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1
+ vinserti128 m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1
+ punpcklwd m6, m7, m0
+ punpckhwd m7, m0
+ punpcklwd m8, m9, m0
+ punpckhwd m9, m0
+ lea tlq, [tl_baseq-2]
+.w64_loop_y:
+ vpbroadcastw m3, [tlq+yq*2]
+ vpbroadcastd m1, [v_weightsq]
+ punpcklwd m3, m4
+ pmaddwd m14, m10, m3
+ pmaddwd m15, m11, m3
+ pmaddwd m2, m12, m3
+ pmaddwd m3, m13
+ pmaddwd m0, m6, m1
+ paddd m0, m14
+ pmaddwd m14, m7, m1
+ paddd m14, m15
+ psrld m0, 8
+ psrld m14, 8
+ packssdw m0, m14
+ pavgw m0, m5
+ mova [dstq+32*0], m0
+ SMOOTH_2D_END 8, 9, 1, 1, 2, 3
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ add v_weightsq, 4
+ dec yq
+ jg .w64_loop_y
+ lea dstq, [dst_baseq+32*2]
+ add r6, 16*8
+ mov v_weightsq, v_weights_baseq
+ add xq, 32
+ test xb, 64
+ jz .w64_loop_x
+ RET
+
+cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
+ %assign org_stack_offset stack_offset
+ lea r6, [ipred_z1_16bpc_avx2_table]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ lea r7, [dr_intra_derivative]
+ movsxd wq, [r6+wq*4]
+ add tlq, 2
+ add wq, r6
+ mov dxd, angled
+ and dxd, 0x7e
+ add angled, 165 ; ~90
+ movzx dxd, word [r7+dxq]
+ xor angled, 0x4ff ; d = 90 - angle
+ vpbroadcastd m5, [pw_62]
+ jmp wq
+.w4:
+ ALLOC_STACK -64, 7
+ cmp angleb, 40
+ jae .w4_no_upsample
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+ vpbroadcastw xm3, [tlq+14]
+ movu xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8
+ palignr xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8
+ paddw xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7
+ add dxd, dxd
+ palignr xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8
+ paddw xm2, xm1 ; -1 * a + 9 * b + 9 * c + -1 * d
+ psubw xm0, xm2, xm0 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4
+ psraw xm0, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1
+ pxor xm4, xm4
+ paddw xm2, xm0
+ vpbroadcastw xm0, r8m ; pixel_max
+ mova [rsp+32], xm3
+ movd xm3, dxd
+ pmaxsw xm2, xm4
+ mov r3d, dxd
+ pavgw xm2, xm4
+ vpbroadcastw m3, xm3
+ pminsw xm2, xm0
+ punpcklwd xm0, xm1, xm2
+ punpckhwd xm1, xm2
+ lea r5, [strideq*3]
+ pslldq m2, m3, 8
+ mova [rsp+ 0], xm0
+ mova [rsp+16], xm1
+ paddw m6, m3, m3
+ paddw m3, m2
+ vpblendd m4, m6, 0xf0
+ paddw m6, m6
+ paddw m3, m4 ; xpos0 xpos1 xpos2 xpos3
+ vbroadcasti128 m4, [z_upsample]
+.w4_upsample_loop:
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu xm1, [rsp+r3*2]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base1
+ movu xm2, [rsp+r2*2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base2
+ vinserti128 m1, [rsp+r3*2], 1 ; 0 2
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base3
+ vinserti128 m2, [rsp+r2*2], 1 ; 1 3
+ pshufb m1, m4
+ pshufb m2, m4
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ pand m2, m5, m3 ; frac
+ psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6
+ psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6)
+ pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15)
+ paddw m3, m6 ; xpos += dx
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r5 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_upsample_loop
+ RET
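+; The upsampling above inserts one new sample between every pair of top
+; pixels with the -1/9/9/-1 filter and doubles dx so the unchanged 1/64-pel
+; stepping indexes the denser edge. Per new sample (illustrative C; a..d are
+; four consecutive edge pixels, the new value lands between b and c):
+;
+;   int v = (9 * (b + c) - (a + d) + 8) >> 4;
+;   v = v < 0 ? 0 : v > pixel_max ? pixel_max : v;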
+ALIGN function_align
+.filter_strength: ; w4/w8/w16
+%define base r3-z_filter_t0
+ movd xm0, maxbased
+ lea r3, [z_filter_t0]
+ movd xm1, angled
+ shr angled, 8 ; is_sm << 1
+ vpbroadcastb m0, xm0
+ vpbroadcastb m1, xm1
+ pcmpeqb m0, [base+z_filter_wh]
+ mova xm2, [r3+angleq*8]
+ pand m0, m1
+ pcmpgtb m0, m2
+ pmovmskb r5d, m0
+ ret
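+; filter_strength compares the block size and adjusted angle against the
+; z_filter_wh / z_filter_t0 thresholds; a zero mask means no edge filtering,
+; otherwise the callers popcnt it to pick one of the z_filter_k kernels.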
+.w4_no_upsample:
+ mov maxbased, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ lea maxbased, [hq+3]
+ call .filter_strength
+ mov maxbased, 7
+ test r5d, r5d
+ jz .w4_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastw xm3, [tlq+14]
+ mova xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7
+ vpbroadcastd xm1, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0]
+ palignr xm2, xm3, xm0, 4 ; 2 3 4 5 6 7 8 8
+ pmullw xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8
+ paddw xm2, xm0
+ pmullw xm2, xm4
+ movd [rsp+16], xm3
+ cmp r5d, 3
+ jne .w4_3tap
+ paddw xm1, xm2
+ palignr xm2, xm3, xm0, 6 ; 3 4 5 6 7 8 8 8
+ pblendw xm0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6
+ movzx r3d, word [tlq+14]
+ movzx r2d, word [tlq+12]
+ inc maxbased
+ paddw xm2, xm0
+ sub r2d, r3d
+ paddw xm2, xm2
+ lea r2d, [r2+r3*8+4]
+ shr r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3
+ mov [rsp+16], r2w
+.w4_3tap:
+ pxor xm0, xm0
+ paddw xm1, xm2
+ mov tlq, rsp
+ psrlw xm1, 3
+ cmp hd, 8
+ sbb maxbased, -1
+ pavgw xm0, xm1
+ mova [tlq], xm0
+.w4_main:
+ movd xm3, dxd
+ vpbroadcastq m1, [z_base_inc]
+ vpbroadcastw m6, [tlq+maxbaseq*2] ; top[max_base_x]
+ shl maxbased, 6
+ vpbroadcastw m3, xm3
+ movd xm0, maxbased
+ mov r3d, dxd ; xpos
+ vpbroadcastw m0, xm0
+ paddw m4, m3, m3
+ psubw m1, m0 ; -max_base_x
+ vpblendd m3, m4, 0xcc
+ paddw m0, m4, m3
+ vpblendd m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3
+ paddw m4, m4
+ paddw m3, m1
+.w4_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu xm1, [tlq+r3*2]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ movu xm2, [tlq+r5*2]
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base2
+ vinserti128 m1, [tlq+r3*2], 1 ; 0 2
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base3
+ vinserti128 m2, [tlq+r5*2], 1 ; 1 3
+ punpcklqdq m0, m1, m2
+ psrldq m1, 2
+ pslldq m2, 6
+ vpblendd m1, m2, 0xcc
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15 ; xpos < max_base_x
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ sub hd, 4
+ jz .w4_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w4_loop
+ lea r6, [strideq*3]
+.w4_end_loop:
+ movq [dstq+strideq*0], xm6
+ movq [dstq+strideq*1], xm6
+ movq [dstq+strideq*2], xm6
+ movq [dstq+r6 ], xm6
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_end_loop
+.w4_end:
+ RET
+.w8:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -64, 7
+ lea r3d, [angleq+216]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ movu m2, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g _
+ movu m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g _ _
+ movu m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ cmp hd, 4
+ jne .w8_upsample_h8 ; awkward single-pixel edge case
+ vpblendd m0, m2, 0x20 ; 3 4 5 6 7 8 9 a b c c _ _ _ _ _
+.w8_upsample_h8:
+ paddw m2, m1
+ paddw m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ add dxd, dxd
+ psubw m0, m2, m0
+ psraw m0, 3
+ pxor m4, m4
+ paddw m2, m0
+ vpbroadcastw m0, r8m
+ movd xm3, dxd
+ pmaxsw m2, m4
+ mov r3d, dxd
+ pavgw m2, m4
+ vpbroadcastw m3, xm3
+ pminsw m2, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ vbroadcasti128 m4, [z_upsample]
+ mova [rsp+ 0], xm0
+ mova [rsp+16], xm1
+ paddw m6, m3, m3
+ vextracti128 [rsp+32], m0, 1
+ vextracti128 [rsp+48], m1, 1
+ vpblendd m3, m6, 0xf0 ; xpos0 xpos1
+.w8_upsample_loop:
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu xm1, [rsp+r3*2]
+ movu xm2, [rsp+r3*2+16]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base1
+ vinserti128 m1, [rsp+r2*2], 1
+ vinserti128 m2, [rsp+r2*2+16], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m3, m6
+ paddw m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_upsample_loop
+ RET
+.w8_no_intra_edge_filter:
+ and maxbased, 7
+ or maxbased, 8 ; imin(h+7, 15)
+ jmp .w8_main
+.w8_no_upsample:
+ lea maxbased, [hq+7]
+ test angled, 0x400
+ jnz .w8_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .w8_main
+ popcnt r5d, r5d
+ vpbroadcastd m1, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0]
+ mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ movu m2, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pmullw m1, m2
+ cmp hd, 8
+ jl .w8_filter_h4
+ punpckhwd m2, m2
+ vpblendd m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ je .w8_filter_end ; 8x4 and 8x8 are always 3-tap
+ movzx r3d, word [tlq+30]
+ mov maxbased, 16
+ mov [rsp+32], r3d
+ cmp r5d, 3
+ jne .w8_filter_end
+ punpcklwd xm6, xm0, xm0
+ vpblendd m2, [tlq+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g
+ vpblendd m6, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ movzx r5d, word [tlq+28]
+ mov [rsp+34], r3w
+ paddw m2, m6
+ sub r5d, r3d
+ inc maxbased
+ paddw m2, m2
+ lea r3d, [r5+r3*8+4]
+ paddw m1, m2
+ shr r3d, 3
+ mov [rsp+32], r3w
+ jmp .w8_filter_end
+.w8_filter_h4:
+ pshuflw m3, m2, q3321
+ vinserti128 m3, [tlq+2], 0 ; 2 3 4 5 6 7 8 9 a b c c _ _ _ _
+.w8_filter_end:
+ paddw m0, m3
+ pmullw m0, m4
+ mov tlq, rsp
+ pxor m2, m2
+ paddw m0, m1
+ psrlw m0, 3
+ pavgw m0, m2
+ mova [tlq], m0
+.w8_main:
+ movd xm3, dxd
+ vbroadcasti128 m1, [z_base_inc]
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m3, xm3
+ movd xm0, maxbased
+ mov r3d, dxd
+ vpbroadcastw m0, xm0
+ paddw m4, m3, m3
+ psubw m1, m0
+ vpblendd m3, m4, 0xf0 ; xpos0 xpos1
+ paddw m3, m1
+.w8_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6
+ movu xm0, [tlq+r3*2]
+ movu xm1, [tlq+r3*2+2]
+ lea r3d, [r5+dxq]
+ shr r5d, 6
+ vinserti128 m0, [tlq+r5*2], 1
+ vinserti128 m1, [tlq+r5*2+2], 1
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w8_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w8_loop
+.w8_end_loop:
+ mova [dstq+strideq*0], xm6
+ mova [dstq+strideq*1], xm6
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_end_loop
+.w8_end:
+ RET
+.w16_no_intra_edge_filter:
+ and maxbased, 15
+ or maxbased, 16 ; imin(h+15, 31)
+ jmp .w16_main
+.w16:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -96, 7
+ lea maxbased, [hq+15]
+ test angled, 0x400
+ jnz .w16_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .w16_main
+ popcnt r5d, r5d
+ mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ cmp r5d, 3
+ jne .w16_filter_3tap
+ vpbroadcastd m2, [base+pw_3]
+ punpcklwd xm0, xm0
+ vpblendd m0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ paddw m0, m2
+ pavgw m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ paddw m0, m1
+ psrlw m0, 2
+ movu m3, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m3, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ cmp hd, 8
+ jl .w16_filter_5tap_h4
+ punpckhwd m3, m3
+ je .w16_filter_5tap_h8
+ vpblendd m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
+ vpblendd m3, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
+ movzx r3d, word [tlq+62]
+ movzx r2d, word [tlq+60]
+ pavgw m2, m4
+ sub r2d, r3d
+ paddw m1, m3
+ lea r2d, [r2+r3*8+4]
+ paddw m1, m2
+ shr r2d, 3
+ psrlw m1, 2
+ mov [rsp+66], r3w
+ mov [rsp+64], r2w
+ mov tlq, rsp
+ mov r3d, 33
+ cmp hd, 16
+ cmovg maxbased, r3d
+ jmp .w16_filter_end2
+.w16_filter_5tap_h8:
+ vpblendd xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9
+ vpblendd xm3, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9
+ pavgw xm2, xm4
+ paddw xm1, xm3
+ paddw xm1, xm2
+ psrlw xm1, 2
+ jmp .w16_filter_end2
+.w16_filter_5tap_h4:
+ pshuflw xm4, xm3, q3332 ; 4 5 5 5
+ pshuflw xm3, xm3, q3321 ; 3 4 5 5
+ pavgw xm2, xm4
+ paddw xm1, xm3
+ paddw xm1, xm2
+ psrlw xm1, 2
+ jmp .w16_filter_end2
+.w16_filter_3tap:
+ vpbroadcastd m3, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0]
+ pmullw m0, m3, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ movu m2, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pmullw m1, m4
+ pmullw m3, m2
+ paddw m0, m1
+ cmp hd, 8
+ je .w16_filter_3tap_h8
+ jl .w16_filter_3tap_h4
+ punpckhwd m2, m2
+ vpblendd m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ jmp .w16_filter_end
+.w16_filter_3tap_h4:
+ pshuflw xm2, xm2, q3321 ; 2 3 4 4 _ _ _ _
+ jmp .w16_filter_end
+.w16_filter_3tap_h8:
+ psrldq xm2, 2
+ pshufhw xm2, xm2, q2210 ; 2 3 4 5 6 7 8 8
+.w16_filter_end:
+ paddw m2, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ pmullw m2, m4
+ psrlw m0, 3
+ pxor m1, m1
+ paddw m2, m3
+ psrlw m2, 3
+ pavgw m0, m1
+ pavgw m1, m2
+.w16_filter_end2:
+ mov tlq, rsp
+ mova [tlq+ 0], m0
+ mova [tlq+32], m1
+.w16_main:
+ movd xm4, dxd
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m4, xm4
+ movd xm0, maxbased
+ mov r3d, dxd
+ vpbroadcastw m0, xm0
+ paddw m3, m4, [z_base_inc]
+ psubw m3, m0
+.w16_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6
+ movu m0, [tlq+r3*2]
+ movu m1, [tlq+r3*2+2]
+ lea r3d, [r5+dxq]
+ shr r5d, 6
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15
+ paddw m3, m4
+ paddw m1, m0
+ movu m0, [tlq+r5*2]
+ vpblendvb m2, m6, m1, m2
+ movu m1, [tlq+r5*2+2]
+ mova [dstq+strideq*0], m2
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jz .w16_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w16_loop
+.w16_end_loop:
+ mova [dstq+strideq*0], m6
+ mova [dstq+strideq*1], m6
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_end_loop
+.w16_end:
+ RET
+.w32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -160, 8
+ lea maxbased, [hq+31]
+ mov r3d, 63
+ cmp hd, 32
+ cmova maxbased, r3d
+ test angled, 0x400
+ jnz .w32_main
+ vpbroadcastd m2, [pw_3]
+ mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ punpcklwd xm1, xm0, xm0
+ vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ paddw m1, m2
+ paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ mov r3, rsp
+ paddw m0, m1
+ lea r5d, [maxbaseq-31]
+ psrlw m0, 2
+ mova [r3], m0
+.w32_filter_loop:
+ mova m0, [tlq+30]
+ paddw m1, m2, [tlq+28]
+ add tlq, 32
+ paddw m0, [tlq+0]
+ pavgw m1, [tlq+4]
+ paddw m0, [tlq+2]
+ add r3, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mova [r3], m0
+ sub r5d, 16
+ jg .w32_filter_loop
+ movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ punpckhwd m1, m0, m0
+ paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ jl .w32_filter_h8
+ vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
+ vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
+ movzx r5d, word [tlq+62]
+ movzx r2d, word [tlq+60]
+ pavgw m2, m3
+ sub r2d, r5d
+ paddw m0, m1
+ lea r2d, [r2+r5*8+4]
+ paddw m0, m2
+ shr r2d, 3
+ psrlw m0, 2
+ mova [r3+32], m0
+ mov [r3+66], r5w
+ mov [r3+64], r2w
+ mov tlq, rsp
+ mov r3d, 65
+ cmp hd, 64
+ cmove maxbased, r3d
+ jmp .w32_main
+.w32_filter_h8:
+ vpblendd xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9
+ vpblendd xm1, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9
+ pavgw xm2, xm3
+ paddw xm0, xm1
+ mov tlq, rsp
+ paddw xm0, xm2
+ psrlw xm0, 2
+ mova [r3+32], xm0
+.w32_main:
+ movd xm4, dxd
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m4, xm4
+ movd xm0, maxbased
+ mov r5d, dxd
+ vpbroadcastd m7, [pw_m1024] ; -16 * 64
+ vpbroadcastw m0, xm0
+ paddw m3, m4, [z_base_inc]
+ psubw m3, m0
+.w32_loop:
+ mov r3d, r5d
+ shr r3d, 6
+ movu m0, [tlq+r3*2]
+ movu m1, [tlq+r3*2+2]
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ psraw m1, m3, 15
+ vpblendvb m0, m6, m0, m1
+ mova [dstq+32*0], m0
+ movu m0, [tlq+r3*2+32]
+ movu m1, [tlq+r3*2+34]
+ add r5d, dxd
+ psubw m1, m0
+ pmulhrsw m1, m2
+ pcmpgtw m2, m7, m3
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [dstq+32*1], m0
+ dec hd
+ jz .w32_end
+ add dstq, strideq
+ cmp r5d, maxbased
+ jb .w32_loop
+.w32_end_loop:
+ mova [dstq+32*0], m6
+ mova [dstq+32*1], m6
+ add dstq, strideq
+ dec hd
+ jg .w32_end_loop
+.w32_end:
+ RET
+.w64:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -256, 10
+ lea maxbased, [hq+63]
+ test angled, 0x400
+ jnz .w64_main
+ vpbroadcastd m2, [pw_3]
+ mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ punpcklwd xm1, xm0, xm0
+ vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ paddw m1, m2
+ paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ mov r3, rsp
+ paddw m0, m1
+ lea r5d, [hq+32]
+ psrlw m0, 2
+ mova [r3], m0
+.w64_filter_loop:
+ mova m0, [tlq+30]
+ paddw m1, m2, [tlq+28]
+ add tlq, 32
+ paddw m0, [tlq+0]
+ pavgw m1, [tlq+4]
+ paddw m0, [tlq+2]
+ add r3, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mova [r3], m0
+ sub r5d, 16
+ jg .w64_filter_loop
+ movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ punpckhwd m1, m0, m0
+ paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
+ vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
+ pavgw m2, m3
+ paddw m0, m1
+ paddw m0, m2
+ mov tlq, rsp
+ psrlw m0, 2
+ mova [r3+32], m0
+.w64_main:
+ movd xm4, dxd
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m4, xm4
+ movd xm0, maxbased
+ mov r5d, dxd
+ vpbroadcastd m7, [pw_m1024] ; -16 * 64
+ vpbroadcastw m0, xm0
+ paddw m3, m4, [z_base_inc]
+ paddw m8, m7, m7 ; -32 * 64
+ psubw m3, m0
+ paddw m9, m8, m7 ; -48 * 64
+.w64_loop:
+ mov r3d, r5d
+ shr r3d, 6
+ movu m0, [tlq+r3*2]
+ movu m1, [tlq+r3*2+2]
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ psraw m1, m3, 15
+ vpblendvb m0, m6, m0, m1
+ mova [dstq+32*0], m0
+ movu m0, [tlq+r3*2+32]
+ movu m1, [tlq+r3*2+34]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ pcmpgtw m1, m7, m3
+ vpblendvb m0, m6, m0, m1
+ mova [dstq+32*1], m0
+ movu m0, [tlq+r3*2+64]
+ movu m1, [tlq+r3*2+66]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ pcmpgtw m1, m8, m3
+ vpblendvb m0, m6, m0, m1
+ mova [dstq+32*2], m0
+ movu m0, [tlq+r3*2+96]
+ movu m1, [tlq+r3*2+98]
+ add r5d, dxd
+ psubw m1, m0
+ pmulhrsw m1, m2
+ pcmpgtw m2, m9, m3
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [dstq+32*3], m0
+ dec hd
+ jz .w64_end
+ add dstq, strideq
+ cmp r5d, maxbased
+ jb .w64_loop
+.w64_end_loop:
+ mova [dstq+32*0], m6
+ mova [dstq+32*1], m6
+ mova [dstq+32*2], m6
+ mova [dstq+32*3], m6
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ RET
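+; z1 in short: the x position advances by dx (1/64-pel) per row, base = pos>>6
+; indexes the filtered/upsampled top edge, frac = pos & 0x3e blends adjacent
+; pixels through pmulhrsw, and lanes past max_base_x take the broadcast last
+; edge pixel from m6. Rough per-row scalar model (illustrative C):
+;
+;   const int base = xpos >> 6, frac = xpos & 0x3e;  /* xpos ~ (y+1)*dx */
+;   for (int x = 0; x < w; x++)
+;       dst[x] = base + x < max_base_x
+;           ? top[base+x] + (((top[base+x+1] - top[base+x]) * frac + 32) >> 6)
+;           : top[max_base_x];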
+
+cglobal ipred_z2_16bpc, 3, 12, 12, 352, dst, stride, tl, w, h, angle, dx, dy
+%define base r9-z_filter_t0
+ lea r9, [ipred_z2_16bpc_avx2_table]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ lea dxq, [dr_intra_derivative-90]
+ movsxd wq, [r9+wq*4]
+ mova m1, [tlq- 0]
+ movzx dyd, angleb
+ xor angled, 0x400
+ mova m2, [tlq- 32]
+ mov r8, dxq
+ sub dxq, dyq
+ mova m3, [tlq- 64]
+ add wq, r9
+ add r9, z_filter_t0-ipred_z2_16bpc_avx2_table
+ mova m4, [tlq- 96]
+ and dyd, ~1
+ mova m5, [tlq-128]
+ and dxq, ~1
+ movzx dyd, word [r8+dyq] ; angle - 90
+ movzx dxd, word [dxq+270] ; 180 - angle
+ vpbroadcastd m11, [base+pw_62]
+ mova [rsp+128], m1
+ mova [rsp+ 96], m2
+ mova [rsp+ 64], m3
+ neg dxd
+ mova [rsp+ 32], m4
+ neg dyq
+ mova [rsp+ 0], m5
+ jmp wq
+.w4:
+ vbroadcasti128 m10, [base+z2_x_shuf]
+ vpbroadcastq m6, [base+z_base_inc+2]
+ lea r8d, [dxq+(65<<6)] ; xpos
+ mov r10d, (63-4)<<6
+ test angled, 0x400
+ jnz .w4_main ; !enable_intra_edge_filter
+ lea r3d, [hq+2]
+ add angled, 1022
+ shl r3d, 6
+ test r3d, angled
+ jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
+ movq xm0, [tlq+2] ; 1 2 3 4
+ movq xm1, [tlq+0] ; 0 1 2 3
+ pshuflw xm2, xm0, q3321 ; 2 3 4 4
+ pshuflw xm3, xm1, q2100 ; 0 0 1 2
+ vpbroadcastw xm4, r8m ; pixel_max
+ vbroadcasti128 m10, [base+z_upsample]
+ paddw xm1, xm0
+ paddw xm2, xm3
+ lea r8d, [r8+dxq+(1<<6)]
+ psubw xm2, xm1, xm2
+ add dxd, dxd
+ psraw xm2, 3
+ pxor xm3, xm3
+ sub r10d, 3<<6
+ paddw xm1, xm2
+ paddw m6, m6
+ pmaxsw xm1, xm3
+ sub angled, 1075 ; angle - 53
+ pavgw xm1, xm3
+ lea r3d, [hq+3]
+ pminsw xm1, xm4
+ xor angled, 0x7f ; 180 - angle
+ punpcklwd xm1, xm0
+ movu [rsp+130], xm1
+ call .filter_strength
+ jmp .w4_filter_left
+ALIGN function_align
+.filter_strength:
+ movd xm8, r3d
+ mov r3d, angled
+ movd xm7, angled
+ vpbroadcastb m8, xm8
+ shr r3d, 8 ; is_sm << 1
+ vpbroadcastb m7, xm7
+ pcmpeqb m8, [base+z_filter_wh]
+ mova xm9, [r9+r3*8]
+ pand m0, m8, m7
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+ ret
+ALIGN function_align
+.upsample_left: ; h4/h8
+ mova xm0, [tlq-16] ; 8 7 6 5 4 3 2 1
+ movu xm1, [tlq-14] ; 7 6 5 4 3 2 1 0
+ vpbroadcastw xm4, r8m ; pixel_max
+ cmp hd, 8
+ je .upsample_left_h8
+ pshufhw xm2, xm0, q2100 ; _ _ _ _ 4 4 3 2
+ pshufhw xm3, xm1, q3321 ; _ _ _ _ 2 1 0 0
+ jmp .upsample_left_end
+.upsample_left_h8:
+ pblendw xm2, xm0, [tlq-18], 0xfe ; 8 8 7 6 5 4 3 2
+ pblendw xm3, xm1, [tlq-12], 0x7f ; 6 5 4 3 2 1 0 0
+.upsample_left_end:
+ paddw xm1, xm0
+ paddw xm2, xm3
+ psubw xm2, xm1, xm2
+ add dyq, dyq
+ psraw xm2, 3
+ pxor xm3, xm3
+ paddw xm1, xm2
+ pmaxsw xm1, xm3
+ pavgw xm1, xm3
+ pminsw xm1, xm4
+ punpcklwd xm2, xm0, xm1
+ punpckhwd xm0, xm1
+ mova [rsp+ 96+gprsize], xm2
+ mova [rsp+112+gprsize], xm0
+ ret
+.w4_no_upsample_above:
+ lea r3d, [hq+3]
+ sub angled, 1112 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w4_no_filter_above
+ popcnt r3d, r3d
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0]
+ psrldq xm0, xm1, 2 ; 1 2 3 4
+ pshuflw xm2, xm1, q2100 ; 0 0 1 2
+ pmullw xm4, xm0
+ pshuflw xm3, xm0, q3321 ; 2 3 4 4
+ paddw xm1, xm3
+ pshuflw xm3, xm0, q3332 ; 3 4 4 4
+ pmullw xm1, xm5
+ vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*2]
+ paddw xm2, xm3
+ vpbroadcastd xm3, r6m ; max_width
+ pmullw xm2, xm5
+ packssdw xm3, xm3
+ paddw xm1, xm4
+ paddw xm1, xm2
+ psubw xm3, [base+pw_1to16]
+ pxor xm4, xm4
+ psrlw xm1, 3
+ pminsw xm3, xm11 ; clip to byte range since there's no variable word blend
+ pavgw xm1, xm4
+ vpblendvb xm1, xm0, xm3
+ movq [rsp+130], xm1
+.w4_no_filter_above:
+ lea r3d, [hq+2]
+ add angled, 973 ; angle + 883
+ shl r3d, 6
+ test r3d, angled
+ jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
+ vpbroadcastd xm0, [base+pb_90]
+ psubb xm0, xm7 ; 180 - angle
+ pand xm0, xm8 ; reuse from previous filter_strength call
+ pcmpgtb xm0, xm9
+ pmovmskb r3d, xm0
+.w4_filter_left:
+ test r3d, r3d
+ jz .w4_main
+ popcnt r3d, r3d
+ mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ vpbroadcastd m5, r7m ; max_height
+ cmp r3d, 3
+ je .w4_filter_left_s3
+ vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0]
+ pmullw m2, m0
+ cmp hd, 8
+ jl .w4_filter_left_h4
+ movu m4, [tlq-34]
+ punpcklwd m1, m0, m0
+ vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e
+ je .w4_filter_left_end
+ vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ jmp .w4_filter_left_end
+.w4_upsample_left:
+ call .upsample_left
+ mov r11, -16
+ vbroadcasti128 m9, [base+z_upsample]
+ jmp .w4_main_upsample_left
+.w4_filter_left_s3: ; can only be h16
+ movu m2, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpbroadcastd m4, [base+pw_3]
+ paddw m1, m0, m2
+ punpckhwd m2, m2
+ vpblendd m2, [tlq-28], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ punpcklwd xm3, xm0, xm0
+ paddw m2, m4
+ vpblendd m4, m3, [tlq-34], 0xfe ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e
+ vpblendd m3, [tlq-36], 0xfe ; 0 0 0 1 2 3 4 5 6 8 8 9 a b c d
+ paddw m1, m4
+ pavgw m2, m3
+ paddw m1, m2
+ psrlw m1, 2
+ jmp .w4_filter_left_end2
+.w4_filter_left_h4:
+ pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e
+.w4_filter_left_end:
+ paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pmullw m1, m3
+ paddw m1, m2
+ pxor m2, m2
+ psrlw m1, 3
+ pavgw m1, m2
+.w4_filter_left_end2:
+ packssdw m5, m5
+ psubw m5, [base+pw_16to1]
+ pminsw m5, m11
+ vpblendvb m1, m0, m5
+ mova [rsp+96], m1
+.w4_main:
+ vbroadcasti128 m9, [base+z2_x_shuf]
+ mov r11, -8
+.w4_main_upsample_left:
+ movd xm5, dyd
+ mova m4, [base+z2_y_shuf_h4]
+ mov r2d, r8d
+ movd xm0, dxd
+ vpbroadcastw m5, xm5
+ rorx r5, dyq, 5
+ lea r8d, [dyq*3]
+ pmullw m5, [base+z2_ymul]
+ rorx r9, dyq, 4
+ sar dyd, 6
+ vpbroadcastw m0, xm0
+ sar r8d, 6
+ pand m5, m11 ; frac_y
+ neg dyd
+ psllw m5, 9
+ add r5d, dyd
+ add r8d, dyd
+ add r9d, dyd
+ paddw m7, m0, m0
+ lea dyq, [rsp+dyq*2+126]
+ vpblendd m0, m7, 0xcc
+ add dyq, r11
+ neg r5d
+ paddw m1, m0, m7
+ neg r8d
+ vpblendd m0, m1, 0xf0 ; xpos0 xpos1 xpos2 xpos3
+ neg r9d
+ paddw m7, m7
+ paddw m6, m0
+.w4_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movu xm1, [rsp+r2*2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ movu xm3, [rsp+r3*2]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x2
+ vinserti128 m1, [rsp+r2*2], 1
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x3
+ vinserti128 m3, [rsp+r3*2], 1
+ pshufb m1, m10 ; a0 a1 a2 a3 A0 A1 A2 A3
+ pshufb m3, m10 ; b0 b1 b2 b3 B0 B1 B2 B3
+ pand m2, m11, m6
+ punpcklqdq m0, m1, m3
+ punpckhqdq m1, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ cmp r3d, 64
+ jge .w4_toponly
+ movu xm2, [dyq]
+ vinserti128 m2, [dyq+r8*2], 1
+ movu xm3, [dyq+r5*2]
+ vinserti128 m3, [dyq+r9*2], 1
+ pshufb m2, m9
+ pshufb m3, m9
+ punpckhwd m1, m2, m3 ; a3 b3 a2 b2 a1 b1 a0 b0
+ punpcklwd m2, m3
+ psubw m2, m1
+ pmulhrsw m2, m5
+ psraw m3, m6, 15 ; base_x < topleft
+ paddw m1, m2
+ vpermd m1, m4, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3
+ vpblendvb m0, m1, m3
+.w4_toponly:
+ paddw m6, m7 ; xpos += dx
+ lea r3, [strideq*3]
+ add dyq, r11
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ sub hd, 4
+ jz .w4_end
+ lea dstq, [dstq+strideq*4]
+ cmp r2d, r10d
+ jge .w4_loop
+.w4_leftonly_loop:
+ movu xm1, [dyq]
+ vinserti128 m1, [dyq+r8*2], 1
+ movu xm2, [dyq+r5*2]
+ vinserti128 m2, [dyq+r9*2], 1
+ add dyq, r11
+ pshufb m1, m9
+ pshufb m2, m9
+ punpckhwd m0, m1, m2
+ punpcklwd m1, m2
+ psubw m1, m0
+ pmulhrsw m1, m5
+ paddw m0, m1
+ vpermd m0, m4, m0
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_leftonly_loop
+.w4_end:
+ RET
+.w8:
+ mov r10d, hd
+ test angled, 0x400
+ jnz .w8_main
+ lea r3d, [angleq+126]
+ xor r8d, r8d
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+ movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8
+ mova xm1, [tlq+0] ; 0 1 2 3 4 5 6 7
+ pblendw xm2, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8
+ pblendw xm3, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6
+ vpbroadcastw xm4, r8m ; pixel_max
+ paddw xm1, xm0
+ paddw xm2, xm3
+ not r8d
+ psubw xm2, xm1, xm2
+ add dxd, dxd
+ psraw xm2, 3
+ sub angled, 53 ; angle - 53
+ pxor xm3, xm3
+ paddw xm2, xm1
+ lea r3d, [hq+7]
+ pmaxsw xm2, xm3
+ xor angled, 0x7f ; 180 - angle
+ pavgw xm2, xm3
+ pminsw xm2, xm4
+ punpcklwd xm1, xm2, xm0
+ punpckhwd xm2, xm0
+ movu [rsp+130], xm1
+ movu [rsp+146], xm2
+ call .filter_strength
+ jmp .w8_filter_left
+.w8_no_upsample_above:
+ lea r3d, [hq+7]
+ sub angled, 90 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w8_no_filter_above
+ popcnt r3d, r3d
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd xm6, [base+z_filter_k-4+r3*4+12*2]
+ movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 x
+ pblendw xm2, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 x
+ pmullw xm4, xm0
+ pblendw xm3, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 x
+ paddw xm1, xm3
+ vpblendd xm3, [tlq+6], 0x07 ; 3 4 5 6 7 8 8 8 x
+ paddw xm2, xm3
+ vpbroadcastd xm3, r6m ; max_width
+ pmullw xm1, xm5
+ pmullw xm2, xm6
+ packssdw xm3, xm3
+ paddw xm1, xm4
+ paddw xm1, xm2
+ psubw xm3, [base+pw_1to16]
+ pxor xm4, xm4
+ psrlw xm1, 3
+ pminsw xm3, xm11
+ pavgw xm1, xm4
+ vpblendvb xm1, xm0, xm3
+ movu [rsp+130], xm1
+.w8_no_filter_above:
+ lea r3d, [angleq-51]
+ mov r3b, hb
+ cmp r3d, 8
+ jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
+ vpbroadcastd m0, [base+pb_90]
+ psubb m0, m7
+ pand m0, m8
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+.w8_filter_left:
+ test r3d, r3d
+ jz .w8_main
+ popcnt r3d, r3d
+ cmp r3d, 3
+ jne .w8_filter_left_s12
+ vpbroadcastd m6, [base+pw_3]
+ vpbroadcastd m7, [base+pw_16]
+ cmp hd, 16 ; flags needed for later
+ jmp .filter_left_s3b
+.w8_upsample_left:
+ call .upsample_left
+ vbroadcasti128 m7, [base+z2_y_shuf_us]
+ lea r11, [rsp+118]
+ mov r8, -8
+ jmp .w8_main_upsample_left
+.w16_filter_left_s12:
+ xor r8d, r8d
+.w8_filter_left_s12:
+ mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ vpbroadcastd m5, r7m ; max_height
+ vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0]
+ pmullw m2, m0
+ cmp hd, 8
+ jl .w8_filter_left_h4
+ movu m4, [tlq-34]
+ punpcklwd m1, m0, m0
+ vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e
+ je .w8_filter_left_end
+ vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ jmp .w8_filter_left_end
+.w8_filter_left_h4:
+ pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e
+.w8_filter_left_end:
+ paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pmullw m1, m3
+ paddw m1, m2
+ pxor m2, m2
+ psrlw m1, 3
+ pavgw m1, m2
+ packssdw m5, m5
+ psubw m5, [base+pw_16to1]
+ pminsw m5, m11
+ vpblendvb m1, m0, m5
+ mova [rsp+96], m1
+ test r8d, r8d
+ jz .w8_main
+; upsample_main
+ vbroadcasti128 m10, [base+z_upsample]
+ vbroadcasti128 m7, [base+z2_y_shuf]
+ lea r5, [rsp+120]
+ movd xm1, dyd
+ vbroadcasti128 m4, [base+z_base_inc+2]
+ movd xm2, dxd
+ vpbroadcastw m1, xm1
+ vpbroadcastw m2, xm2
+ mov r7, dstq
+ paddw m4, m4
+ pmullw m0, m1, [base+z2_ymul8]
+ paddw m5, m2, m2
+ psllw xm1, 3
+ vpblendd m2, m5, 0xf0
+ lea r2d, [dxq+(66<<6)] ; xpos
+ paddw m4, m2
+ pshufd m6, m0, q2020
+ psraw xm0, 6
+ pxor xm1, xm1
+ psubw xm8, xm1, xm0
+ pand m6, m11
+ punpckhwd xm9, xm8, xm1
+ psllw m6, 9
+ punpcklwd xm8, xm1
+.w8_upsample_above_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6
+ movu xm1, [rsp+r2*2]
+ movu xm2, [rsp+r2*2+16]
+ lea r2d, [r3+dxq]
+ shr r3d, 6
+ vinserti128 m1, [rsp+r3*2], 1
+ vinserti128 m2, [rsp+r3*2+16], 1
+ pshufb m1, m10
+ pshufb m2, m10
+ punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0
+ punpckhqdq m1, m2
+ pand m2, m11, m4
+ psubw m1, m0
+ psllw m2, 9
+ pmulhrsw m1, m2
+ paddw m0, m1
+ cmp r3d, 64
+ jge .w8_upsample_above_toponly
+ mova m1, m5
+ vpgatherdq m3, [r5+xm9*2], m5
+ mova m5, m1
+ vpgatherdq m2, [r5+xm8*2], m1
+ pshufb m3, m7
+ pshufb m2, m7
+ punpckldq m1, m2, m3
+ punpckhdq m2, m3
+ psubw m2, m1
+ pmulhrsw m2, m6
+ paddw m1, m2
+ vpermq m1, m1, q3120
+ psraw m2, m4, 15
+ vpblendvb m0, m1, m2
+.w8_upsample_above_toponly:
+ paddw m4, m5
+ sub r5, 4
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w8_ret
+ lea dstq, [dstq+strideq*2]
+ jmp .w8_upsample_above_loop
+.w8_main:
+ vbroadcasti128 m7, [base+z2_y_shuf]
+ lea r11, [rsp+120]
+ mov r8, -4
+.w8_main_upsample_left:
+ movd xm1, dyd
+ vbroadcasti128 m4, [base+z_base_inc+2]
+ movd xm2, dxd
+ vpbroadcastw m1, xm1
+ vpbroadcastw m2, xm2
+ mov r7, dstq
+ pmullw m0, m1, [base+z2_ymul8]
+ paddw m5, m2, m2
+ psllw xm1, 3
+ vpblendd m2, m5, 0xf0 ; xpos0 xpos1
+ lea r9d, [dxq+(65<<6)] ; xpos
+ paddw m4, m2
+ movd [rsp+284], xm1
+.w8_loop0:
+ mov r2d, r9d
+ mova [rsp+288], m0
+ mov r5, r11
+ mova [rsp+320], m4
+ pshufd m6, m0, q2020
+ psraw xm0, 6
+ pxor xm1, xm1
+ psubw xm8, xm1, xm0 ; base_y
+ pand m6, m11 ; frac_y
+ punpckhwd xm9, xm8, xm1 ; base_y 2 3 6 7
+ psllw m6, 9
+ punpcklwd xm8, xm1 ; base_y 0 1 4 5
+.w8_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movu xm0, [rsp+r2*2]
+ movu xm1, [rsp+r2*2+2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ vinserti128 m0, [rsp+r3*2], 1
+ vinserti128 m1, [rsp+r3*2+2], 1
+ pand m2, m11, m4
+ psubw m1, m0
+ psllw m2, 9
+ pmulhrsw m1, m2
+ paddw m0, m1
+ cmp r3d, 64
+ jge .w8_toponly
+ mova m1, m5
+ vpgatherdq m3, [r5+xm9*2], m5
+ mova m5, m1
+ vpgatherdq m2, [r5+xm8*2], m1
+ pshufb m3, m7 ; c0 d0 c1 d1 g0 h0 g1 h1
+ pshufb m2, m7 ; a0 b0 a1 b1 e0 f0 e1 f1
+ punpckldq m1, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m2, m3
+ psubw m2, m1
+ pmulhrsw m2, m6
+ paddw m1, m2
+ vpermq m1, m1, q3120
+ psraw m2, m4, 15 ; base_x < topleft
+ vpblendvb m0, m1, m2
+.w8_toponly:
+ paddw m4, m5 ; xpos += dx
+ add r5, r8
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w8_end
+ lea dstq, [dstq+strideq*2]
+ cmp r2d, (63-8)<<6
+ jge .w8_loop
+.w8_leftonly_loop:
+ mova m0, m5
+ vpgatherdq m4, [r5+xm9*2], m5
+ mova m5, m0
+ vpgatherdq m3, [r5+xm8*2], m0
+ add r5, r8
+ pshufb m2, m4, m7
+ pshufb m1, m3, m7
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ psubw m1, m0
+ pmulhrsw m1, m6
+ paddw m0, m1
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_leftonly_loop
+.w8_end:
+ sub r10d, 1<<8
+ jl .w8_ret
+ vpbroadcastd m0, [rsp+284]
+ add r7, 16
+ paddw m0, [rsp+288] ; base_y += 8*dy
+ add r9d, 8<<6
+ vpbroadcastd m4, [pw_512]
+ movzx hd, r10b
+ paddw m4, [rsp+320] ; base_x += 8*64
+ mov dstq, r7
+ jmp .w8_loop0
+.w8_ret:
+ RET
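+; In the main loop above each row pair is first predicted from the top edge
+; (base_x advancing by dx); lanes whose x position has passed the top-left
+; corner (the "base_x < topleft" sign mask on m4) are then overwritten with
+; samples gathered from the left edge, stepped by dy and blended with the
+; per-column frac_y weights held in m6.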
+.w16:
+ movd xm0, [tlq+32]
+ lea r10d, [hq+(1<<8)]
+ movd [rsp+160], xm0
+ test angled, 0x400
+ jnz .w8_main
+ lea r3d, [hq+15]
+ sub angled, 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w16_no_filter_above
+ popcnt r3d, r3d
+ vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd m6, [base+z_filter_k-4+r3*4+12*2]
+ movu m0, [tlq+2] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ punpcklwd xm2, xm1, xm1
+ vpblendd m2, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ punpckhwd m3, m0, m0
+ pmullw m4, m0
+ vpblendd m3, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ paddw m1, m3
+ vpblendd m3, [tlq+6], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g
+ paddw m2, m3
+ vpbroadcastd m3, r6m ; max_width
+ pmullw m1, m5
+ pmullw m2, m6
+ packssdw m3, m3
+ paddw m1, m4
+ paddw m1, m2
+ psubw m3, [base+pw_1to16]
+ pxor m4, m4
+ psrlw m1, 3
+ pminsw m3, m11
+ pavgw m1, m4
+ vpblendvb m1, m0, m3
+ movu [rsp+130], m1
+.w16_no_filter_above:
+ vpbroadcastd m0, [base+pb_90]
+ psubb m0, m7
+ pand m0, m8
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+ test r3d, r3d
+ jz .w8_main
+ popcnt r3d, r3d
+ cmp r3d, 3
+ jne .w16_filter_left_s12
+ vpbroadcastd m6, [base+pw_3]
+ vpbroadcastd m7, [base+pw_16]
+ cmp hd, 4
+ jne .filter_left_s3
+ movq xm0, [tlq-8] ; 0 1 2 3
+ movq xm1, [tlq-6] ; 1 2 3 4
+ vpbroadcastd xm5, r7m ; max_height
+ movq xm4, [base+pw_16to1+24] ; 4to1
+ pshuflw xm2, xm0, q2100 ; 0 0 1 2
+ pshuflw xm3, xm1, q3321 ; 2 3 4 4
+ paddw xm1, xm0
+ paddw xm1, xm2
+ pshuflw xm2, xm0, q1000 ; 0 0 0 1
+ paddw xm3, xm6
+ packssdw xm5, xm5
+ pavgw xm2, xm3
+ psubw xm5, xm4
+ paddw xm1, xm2
+ pminsw xm5, xm11
+ psrlw xm1, 2
+ vpblendvb xm1, xm0, xm5
+ movq [rsp+120], xm1
+ jmp .w8_main
+.w32:
+ mova m2, [tlq+32]
+ movd xm0, [tlq+64]
+ lea r10d, [hq+(3<<8)]
+ mova [rsp+160], m2
+ movd [rsp+192], xm0
+ test angled, 0x400
+ jnz .w8_main
+ vpbroadcastd m6, [base+pw_3]
+ vpbroadcastd m0, r6m ; max_width
+ vpbroadcastd m7, [base+pw_16]
+ mov r3d, 32
+ packssdw m0, m0
+ psubw m0, [base+pw_1to16]
+ pminsw m8, m0, m11
+ psubw m9, m8, m7
+.w32_filter_above:
+ movu m0, [tlq+2]
+ punpcklwd xm4, xm1, xm1
+ paddw m2, m6, [tlq+6]
+ paddw m1, m0
+ vpblendd m4, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m1, [tlq+4]
+ movu m3, [tlq+r3+2]
+ paddw m5, m6, [tlq+r3-2]
+ pavgw m2, m4
+ punpckhwd m4, m3, m3
+ paddw m1, m2
+ vpblendd m2, m4, [tlq+r3+6], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
+ vpblendd m4, [tlq+r3+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
+ pavgw m2, m5
+ paddw m5, m3, [tlq+r3]
+ paddw m4, m5
+ psrlw m1, 2
+ paddw m2, m4
+ vpblendvb m1, m0, m8
+ psrlw m2, 2
+ vpblendvb m2, m3, m9
+ movu [rsp+130], m1
+ movu [rsp+r3+130], m2
+.filter_left_s3:
+ cmp hd, 16
+ jl .filter_left_s3_h8 ; h8
+.filter_left_s3b:
+ mova m0, [tlq-32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ movu m2, [tlq-30] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ vpbroadcastd m5, r7m ; max_height
+ paddw m1, m0, m2
+ punpckhwd m2, m2
+ mov r3d, hd
+ vpblendd m2, [tlq-28], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ packssdw m5, m5
+ not r3
+ psubw m5, [base+pw_16to1]
+ paddw m2, m6
+ pminsw m8, m11, m5
+ je .filter_left_s3_end ; h16
+ paddw m1, [tlq-34] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pavgw m2, [tlq-36] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m2
+ psrlw m1, 2
+ vpblendvb m3, m1, m0, m8
+ mova m0, [tlq-64] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m1, m0, [tlq-62] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ paddw m2, m6, [tlq-60] ; 4 5 6 7 8 9 a b c d e f g h i j
+ psubw m8, m7
+ mova [rsp+96], m3
+ jnp .filter_left_s3_end ; h32
+ mova m5, [tlq-96]
+ paddw m1, [tlq-66]
+ pavgw m2, [tlq-68]
+ paddw m1, m2
+ paddw m4, m5, [tlq-94]
+ paddw m2, m6, [tlq-92]
+ psrlw m1, 2
+ paddw m4, [tlq- 98]
+ pavgw m2, [tlq-100]
+ vpblendvb m3, m1, m0, m8
+ mova m0, [tlq-128]
+ psubw m8, m7
+ paddw m4, m2
+ paddw m1, m0, [tlq-126]
+ paddw m2, m6, [tlq-124]
+ psrlw m4, 2
+ mova [rsp+64], m3
+ vpblendvb m4, m5, m8
+ psubw m8, m7
+ mova [rsp+32], m4
+.filter_left_s3_end:
+ punpcklwd xm3, xm0, xm0
+ vpblendd m4, m3, [tlq+r3*2], 0xfe ; 2 2 3 4 5 6 7 8 9 a b c d e f g
+ vpblendd m3, [tlq+r3*2-2], 0xfe ; 2 2 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m4
+ pavgw m2, m3
+ paddw m1, m2
+ psrlw m1, 2
+ vpblendvb m1, m0, m8
+ mova [rsp+r3*2+130], m1
+ jmp .w8_main
+.filter_left_s3_h8:
+ mova xm0, [tlq-16] ; 0 1 2 3 4 5 6 7
+ movu xm3, [tlq-14] ; 1 2 3 4 5 6 7 8
+ pblendw xm2, xm0, [tlq-18], 0xfe ; 0 0 1 2 3 4 5 6
+ vpbroadcastd xm5, r7m ; max_height
+ paddw xm1, xm0, xm3
+ pblendw xm3, [tlq-12], 0x7f ; 2 3 4 5 6 7 8 8
+ paddw xm1, xm2
+ vpblendd xm2, [tlq-20], 0x0e ; 0 0 0 1 2 3 4 5
+ paddw xm3, xm6
+ packssdw xm5, xm5
+ pavgw xm2, xm3
+ psubw xm5, [base+pw_16to1+16] ; 8to1
+ paddw xm1, xm2
+ pminsw xm5, xm11
+ psrlw xm1, 2
+ vpblendvb xm1, xm0, xm5
+ mova [rsp+112], xm1
+ jmp .w8_main
+.w64:
+ mova m2, [tlq+ 32]
+ mova m3, [tlq+ 64]
+ mova m4, [tlq+ 96]
+ movd xm0, [tlq+128]
+ lea r10d, [hq+(7<<8)]
+ mova [rsp+160], m2
+ mova [rsp+192], m3
+ mova [rsp+224], m4
+ movd [rsp+256], xm0
+ test angled, 0x400
+ jnz .w8_main
+ vpbroadcastd m6, [base+pw_3]
+ movu m0, [tlq+34] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m2, m6, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m5, m0, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pavgw m2, [tlq+38] ; 4 5 6 7 8 9 a b c d e f g h h h
+ paddw m5, [tlq+36] ; 3 4 5 6 7 8 9 a b c d e f g h h
+ movu m4, [tlq+66]
+ paddw m3, m6, [tlq+62]
+ paddw m7, m4, [tlq+64]
+ pavgw m3, [tlq+70]
+ paddw m7, [tlq+68]
+ paddw m2, m5
+ vpbroadcastd m5, r6m ; max_width
+ mov r3d, 96
+ packssdw m5, m5
+ paddw m3, m7
+ psubw m5, [base+pw_1to16]
+ psrlw m2, 2
+ vpbroadcastd m7, [base+pw_16]
+ psrlw m3, 2
+ pminsw m8, m11, m5
+ psubw m9, m8, m7
+ vpblendvb m2, m0, m9
+ psubw m9, m7
+ vpblendvb m3, m4, m9
+ psubw m9, m7
+ movu [rsp+162], m2
+ movu [rsp+194], m3
+ jmp .w32_filter_above
+
+cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
+ %assign org_stack_offset stack_offset
+ lea r6, [ipred_z3_16bpc_avx2_table]
+ tzcnt hd, hm
+ movifnidn angled, anglem
+ lea r7, [dr_intra_derivative+45*2-1]
+ sub tlq, 2
+ movsxd hq, [r6+hq*4]
+ sub angled, 180
+ add hq, r6
+ mov dyd, angled
+ neg dyd
+ xor angled, 0x400
+ or dyq, ~0x7e
+ movzx dyd, word [r7+dyq]
+ vpbroadcastd m5, [pw_62]
+ mov org_wd, wd
+ jmp hq
+.h4:
+ ALLOC_STACK -64, 7
+ lea r7, [strideq*3]
+ cmp angleb, 40
+ jae .h4_no_upsample
+ lea r4d, [angleq-1024]
+ sar r4d, 7
+ add r4d, wd
+ jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
+ mova xm2, [tlq-14] ; 0 1 2 3 4 5 6 7
+ pblendw xm1, xm2, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
+ vpblendd xm0, xm1, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
+ pshufd xm3, xm1, q0000
+ paddw xm1, xm2
+ paddw xm0, [tlq-12] ; 1 2 3 4 5 6 7 8
+ vpbroadcastw xm4, r8m ; pixel_max
+ add dyd, dyd
+ psubw xm0, xm1, xm0
+ mova [rsp+ 0], xm3
+ movd xm3, dyd
+ psraw xm0, 3
+ neg dyd
+ paddw xm1, xm0
+ pxor xm0, xm0
+ lea r2d, [dyq+(16<<6)+63] ; ypos
+ pmaxsw xm1, xm0
+ pavgw xm1, xm0
+ vpbroadcastw m3, xm3
+ pminsw xm1, xm4
+ punpckhwd xm0, xm1, xm2
+ punpcklwd xm1, xm2
+ paddw m2, m3, m3
+ mova [rsp+32], xm0
+ punpcklwd m3, m2
+ mova [rsp+16], xm1
+ paddw m4, m2, m2
+ paddw m2, m3
+ vpblendd m3, m2, 0xf0 ; ypos0 ypos1 ypos2 ypos3
+.h4_upsample_loop:
+ lea r4d, [r2+dyq]
+ shr r2d, 6
+ movu xm1, [rsp+r2*2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6
+ movu xm2, [rsp+r4*2]
+ lea r4d, [r2+dyq]
+ shr r2d, 6
+ vinserti128 m1, [rsp+r2*2], 1
+ lea r2d, [r4+dyq]
+ shr r4d, 6
+ vinserti128 m2, [rsp+r4*2], 1
+ psrld m0, m1, 16
+ pblendw m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0
+ pslld m2, 16
+ pblendw m1, m2, 0xaa
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m3, m4
+ paddw m1, m0
+ vextracti128 xm2, m1, 1
+ punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0
+ punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2
+ movhps [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movhps [dstq+strideq*2], xm1
+ movq [dstq+r7 ], xm1
+ add dstq, 8
+ sub wd, 4
+ jg .h4_upsample_loop
+ RET
+ALIGN function_align
+.filter_strength: ; h4/h8/h16
+%define base r4-z_filter_t0
+ lea r4, [z_filter_t0]
+ movd xm0, maxbased
+ movd xm1, angled
+ shr angled, 8 ; is_sm << 1
+ vpbroadcastb m0, xm0
+ vpbroadcastb m1, xm1
+ pcmpeqb m0, [base+z_filter_wh]
+ pand m0, m1
+ mova xm1, [r4+angleq*8]
+ pcmpgtb m0, m1
+ pmovmskb r5d, m0
+ ret
+.h4_no_upsample:
+ mov maxbased, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h4_main
+ lea maxbased, [wq+3]
+ call .filter_strength
+ mov maxbased, 7
+ test r5d, r5d
+ jz .h4_main ; filter_strength == 0
+ popcnt r5d, r5d
+ mova xm0, [tlq-14] ; 0 1 2 3 4 5 6 7
+ movu xm3, [tlq-12] ; 1 2 3 4 5 6 7 8
+ vpbroadcastd xm2, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0]
+ pmullw xm2, xm0
+ pblendw xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
+ paddw xm1, xm0, xm3
+ movd [rsp+12], xm0
+ pmullw xm1, xm4
+ cmp r5d, 3
+ jne .h4_filter_3tap
+ pblendw xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8
+ vpblendd xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
+ movzx r4d, word [tlq-14]
+ movzx r2d, word [tlq-12]
+ inc maxbased
+ paddw xm1, xm2
+ paddw xm0, xm3
+ sub r2d, r4d
+ paddw xm2, xm0, xm0
+ lea r2d, [r2+r4*8+4]
+ shr r2d, 3
+ mov [rsp+14], r2w
+.h4_filter_3tap:
+ pxor xm0, xm0
+ paddw xm1, xm2
+ lea tlq, [rsp+30]
+ psrlw xm1, 3
+ cmp wd, 8
+ sbb maxbased, -1
+ pavgw xm0, xm1
+ mova [rsp+16], xm0
+.h4_main:
+ movd xm3, dyd
+ neg maxbaseq
+ vbroadcasti128 m1, [z_base_inc]
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m3, xm3
+ lea r4d, [maxbaseq+3*64]
+ neg dyq
+ movd xm2, r4d
+ sub tlq, 8
+ lea r4, [dyq+63] ; ypos
+ punpcklwd m1, m1
+ paddw m0, m3, m3
+ vpbroadcastw m2, xm2
+ punpcklwd m3, m0
+ paddw m4, m0, m0
+ paddw m0, m3
+ psubw m2, m1
+ vpblendd m3, m0, 0xf0 ; ypos0 ypos1 ypos2 ypos3
+ or maxbased, 63
+ paddw m3, m2
+.h4_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base0
+ movu xm1, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base1
+ movu xm2, [tlq+r5*2]
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base2
+ vinserti128 m1, [tlq+r4*2], 1
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base3
+ vinserti128 m2, [tlq+r5*2], 1
+ punpckhwd m0, m1, m2
+ punpcklwd m1, m2
+ pand m2, m5, m3
+ palignr m0, m1, 4 ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15 ; ypos < max_base_y
+ paddw m3, m4
+ paddw m1, m0
+ vpblendvb m1, m6, m1, m2
+ vextracti128 xm2, m1, 1
+ punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0
+ punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2
+ movhps [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movhps [dstq+strideq*2], xm1
+ movq [dstq+r7 ], xm1
+ sub wd, 4
+ jz .h4_end
+ add dstq, 8
+ cmp r4d, maxbased
+ jg .h4_loop
+.h4_end_loop:
+ movq [dstq+strideq*0], xm6
+ movq [dstq+strideq*1], xm6
+ movq [dstq+strideq*2], xm6
+ movq [dstq+r7 ], xm6
+ add dstq, 8
+ sub wd, 4
+ jg .h4_end_loop
+.h4_end:
+ RET
+.h8:
+ lea r4d, [angleq+216]
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -64, 8
+ mov r4b, wb
+ lea r7, [strideq*3]
+ cmp r4d, 8
+ ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
+ mova m2, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6 7 8 9 a b c d e
+ movu m0, [tlq-34] ; _ _ 0 1 2 3 4 5 6 7 8 9 a b c d
+ cmp wd, 8
+ je .h8_upsample_w8
+ pshufhw xm3, xm2, q1000
+ vpblendd m0, m3, 0x0f ; _ _ _ _ 4 4 4 5 6 7 8 9 a b c d
+.h8_upsample_w8:
+ paddw m0, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpbroadcastw m4, r8m ; pixel_max
+ add dyd, dyd
+ psubw m0, m1, m0
+ movd xm6, dyd
+ psraw m0, 3
+ neg dyd
+ paddw m1, m0
+ pxor m0, m0
+ pmaxsw m1, m0
+ lea r4d, [dyq+(16<<6)+63] ; ypos
+ pavgw m1, m0
+ vpbroadcastw m6, xm6
+ pminsw m1, m4
+ punpckhwd m0, m1, m2
+ punpcklwd m1, m2
+ vextracti128 [rsp+48], m0, 1
+ vextracti128 [rsp+32], m1, 1
+ paddw m7, m6, m6
+ mova [rsp+16], xm0
+ mova [rsp+ 0], xm1
+ punpcklwd m6, m7 ; ypos0 ypos1
+.h8_upsample_loop:
+ lea r2d, [r4+dyq]
+ shr r4d, 6 ; base0
+ movu m1, [rsp+r4*2]
+ lea r4d, [r2+dyq]
+ shr r2d, 6 ; base1
+ movu m2, [rsp+r2*2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6 ; base2
+ movu m3, [rsp+r4*2]
+ lea r4d, [r2+dyq]
+ shr r2d, 6 ; base3
+ movu m4, [rsp+r2*2]
+ psrld m0, m1, 16
+ pblendw m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0
+ pslld m2, 16
+ pblendw m1, m2, 0xaa
+ psrld m2, m3, 16
+ pblendw m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4 c3 d3 c2 d2 c1 d1 c0 d0
+ pslld m4, 16
+ pblendw m3, m4, 0xaa
+ pand m4, m5, m6
+ paddw m6, m7
+ psllw m4, 9
+ psubw m1, m0
+ pmulhrsw m1, m4
+ pand m4, m5, m6
+ psllw m4, 9
+ psubw m3, m2
+ pmulhrsw m3, m4
+ paddw m6, m7
+ lea r2, [dstq+strideq*4]
+ paddw m1, m0
+ paddw m3, m2
+ punpckhdq m0, m1, m3 ; a5 b5 c5 d5 a4 b4 c4 d4 a1 b1 c1 d1 a0 b0 c0 d0
+ punpckldq m1, m3 ; a7 b7 c7 d7 a6 b6 c6 d6 a3 b3 c3 d3 a2 b2 c2 d2
+ vextracti128 xm2, m0, 1
+ vextracti128 xm3, m1, 1
+ movhps [r2 +strideq*0], xm0
+ movq [r2 +strideq*1], xm0
+ movhps [r2 +strideq*2], xm1
+ movq [r2 +r7 ], xm1
+ movhps [dstq+strideq*0], xm2
+ movq [dstq+strideq*1], xm2
+ movhps [dstq+strideq*2], xm3
+ movq [dstq+r7 ], xm3
+ add dstq, 8
+ sub wd, 4
+ jg .h8_upsample_loop
+ RET
+.h8_no_intra_edge_filter:
+ and maxbased, 7
+ or maxbased, 8 ; imin(w+7, 15)
+ jmp .h8_main
+.h8_no_upsample:
+ lea maxbased, [wq+7]
+ test angled, 0x400
+ jnz .h8_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .h8_main
+ popcnt r5d, r5d
+ mova m0, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ movu m3, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpbroadcastd m2, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0]
+ pmullw m2, m0
+ cmp wd, 8
+ jl .h8_filter_w4
+ punpcklwd xm0, xm0
+ vpblendd m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ movd [rsp+28], xm0
+ paddw m1, m3
+ mov r4d, 16
+ pmullw m1, m4
+ cmovg maxbased, r4d
+ cmp r5d, 3
+ jne .h8_filter_3tap
+ punpckhwd m3, m3
+ vpblendd m0, [tlq-34], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
+ vpblendd m3, [tlq-26], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ movzx r4d, word [tlq-30]
+ movzx r2d, word [tlq-28]
+ inc maxbased
+ paddw m1, m2
+ paddw m0, m3
+ sub r2d, r4d
+ paddw m2, m0, m0
+ lea r2d, [r2+r4*8+4]
+ shr r2d, 3
+ mov [rsp+30], r2w
+ jmp .h8_filter_3tap
+.h8_filter_w4:
+ pshufhw xm1, xm0, q2100
+ vinserti128 m1, [tlq-16], 1 ; _ _ _ _ 4 4 5 6 7 8 9 a b c d e
+ paddw m1, m3
+ pmullw m1, m4
+.h8_filter_3tap:
+ pxor m0, m0
+ paddw m1, m2
+ lea tlq, [rsp+62]
+ psrlw m1, 3
+ pavgw m0, m1
+ mova [rsp+32], m0
+.h8_main:
+ movd xm4, dyd
+ neg maxbaseq
+ vbroadcasti128 m1, [z_base_inc]
+ vpbroadcastw m7, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m4, xm4
+ lea r4d, [maxbaseq+7*64]
+ neg dyq
+ movd xm2, r4d
+ sub tlq, 16
+ lea r4, [dyq+63]
+ paddw m6, m4, m4
+ vpbroadcastw m2, xm2
+ vpblendd m4, m6, 0xf0 ; ypos0 ypos1
+ psubw m2, m1
+ or maxbased, 63
+ paddw m4, m2
+.h8_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base0
+ movu xm0, [tlq+r4*2+2]
+ movu xm1, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base1
+ vinserti128 m0, [tlq+r5*2+2], 1
+ vinserti128 m1, [tlq+r5*2], 1
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base2
+ pand m3, m5, m4
+ psllw m3, 9
+ psubw m1, m0
+ pmulhrsw m1, m3
+ psraw m3, m4, 15
+ paddw m4, m6
+ paddw m0, m1
+ movu xm1, [tlq+r4*2+2]
+ movu xm2, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base3
+ vpblendvb m0, m7, m0, m3
+ vinserti128 m1, [tlq+r5*2+2], 1
+ vinserti128 m2, [tlq+r5*2], 1
+ pand m3, m5, m4
+ psllw m3, 9
+ psubw m2, m1
+ pmulhrsw m2, m3
+ psraw m3, m4, 15
+ paddw m4, m6
+ lea r5, [dstq+strideq*4]
+ paddw m1, m2
+ vpblendvb m1, m7, m1, m3
+ punpckhwd m2, m0, m1 ; a3 c3 a2 c2 a1 c1 a0 c0 b3 d3 b2 d2 b1 d1 b0 d0
+ vextracti128 xm3, m2, 1
+ punpcklwd m0, m1 ; a7 c7 a6 c6 a5 c5 a4 c4 b7 d7 b6 d6 b5 d5 b4 d4
+ punpckhwd xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0
+ punpcklwd xm2, xm3 ; a3 b3 c3 d3 a2 b2 c2 d2
+ vextracti128 xm3, m0, 1
+ movhps [dstq+strideq*0], xm1
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movq [dstq+r7 ], xm2
+ punpckhwd xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4
+ punpcklwd xm0, xm3 ; a7 b7 c7 d7 a6 b6 c6 d6
+ movhps [r5 +strideq*0], xm1
+ movq [r5 +strideq*1], xm1
+ movhps [r5 +strideq*2], xm0
+ movq [r5 +r7 ], xm0
+ sub wd, 4
+ jz .h8_end
+ add dstq, 8
+ cmp r4d, maxbased
+ jg .h8_loop
+ lea r6, [strideq*5]
+ lea r2, [strideq+r7*2] ; stride*7
+ test wd, 4
+ jz .h8_end_loop
+ movq [dstq+strideq*0], xm7
+ movq [dstq+strideq*1], xm7
+ movq [dstq+strideq*2], xm7
+ movq [dstq+r7 ], xm7
+ movq [dstq+strideq*4], xm7
+ movq [dstq+r6 ], xm7
+ movq [dstq+r7*2 ], xm7
+ movq [dstq+r2 ], xm7
+ add dstq, 8
+ sub wd, 4
+ jz .h8_end
+.h8_end_loop:
+ mova [dstq+strideq*0], xm7
+ mova [dstq+strideq*1], xm7
+ mova [dstq+strideq*2], xm7
+ mova [dstq+r7 ], xm7
+ mova [dstq+strideq*4], xm7
+ mova [dstq+r6 ], xm7
+ mova [dstq+r7*2 ], xm7
+ mova [dstq+r2 ], xm7
+ add dstq, 16
+ sub wd, 8
+ jg .h8_end_loop
+.h8_end:
+ RET
+.h16_no_intra_edge_filter:
+ and maxbased, 15
+ or maxbased, 16 ; imin(w+15, 31)
+ jmp .h16_main
+ALIGN function_align
+.h16:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -96, 10
+ lea maxbased, [wq+15]
+ lea r7, [strideq*3]
+ test angled, 0x400
+ jnz .h16_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .h16_main ; filter_strength == 0
+ popcnt r5d, r5d
+ movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ paddw m1, m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpbroadcastd m6, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0]
+ pmullw m2, m6, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ pmullw m1, m7
+ paddw m1, m2
+ cmp wd, 8
+ jg .h16_filter_w16
+ mova xm3, [tlq-46] ; 0 1 2 3 4 5 6 7
+ pmullw xm6, xm3
+ jl .h16_filter_w4
+ pblendw xm3, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6
+ cmp r5d, 3
+ jne .h16_filter_w8_3tap
+ vpblendd xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5
+.h16_filter_w8_5tap:
+ punpckhwd m0, m0
+ vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ paddw xm4, [tlq-42] ; 2 3 4 5 6 7 8 9
+ paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw xm4, xm4
+ paddw m0, m0
+ paddw xm6, xm4
+ paddw m1, m0
+.h16_filter_w8_3tap:
+ paddw xm3, [tlq-44] ; 1 2 3 4 5 6 7 8
+ pmullw xm3, xm7
+ pxor m0, m0
+ paddw xm3, xm6
+ psrlw xm3, 3
+ pavgw xm3, xm0
+ mova [rsp+48], xm3
+ jmp .h16_filter_end
+.h16_filter_w4:
+ pshufhw xm3, xm3, q2100 ; _ _ _ _ 4 4 5 6
+ cmp r5d, 3
+ jne .h16_filter_w8_3tap
+ pshufhw xm4, xm3, q2100 ; _ _ _ _ 4 4 4 5
+ jmp .h16_filter_w8_5tap
+.h16_filter_w16:
+ mova m3, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ pmullw m6, m3
+ punpcklwd xm3, xm3
+ vpblendd m4, m3, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m4, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ mov r4d, 32
+ cmp wd, 16
+ cmovg maxbased, r4d
+ movd [rsp+28], xm3
+ pmullw m4, m7
+ cmp r5d, 3
+ jne .h16_filter_w16_3tap
+ punpckhwd m0, m0
+ vpblendd m3, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
+ vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ paddw m3, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ movzx r4d, word [tlq-62]
+ movzx r2d, word [tlq-60]
+ or maxbased, 1
+ paddw m3, m3
+ sub r2d, r4d
+ paddw m0, m0
+ lea r2d, [r2+r4*8+4]
+ paddw m4, m3
+ shr r2d, 3
+ paddw m1, m0
+ mov [rsp+30], r2w
+.h16_filter_w16_3tap:
+ pxor m0, m0
+ paddw m4, m6
+ psrlw m4, 3
+ pavgw m4, m0
+ mova [rsp+32], m4
+.h16_filter_end:
+ psrlw m1, 3
+ lea tlq, [rsp+94]
+ pavgw m1, m0
+ mova [rsp+64], m1
+.h16_main:
+ movd xm8, dyd
+ neg maxbaseq
+ vpbroadcastw m9, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m8, xm8
+ lea r4d, [maxbaseq+dyq+15*64]
+ neg dyq
+ movd xm7, r4d
+ sub tlq, 32
+ lea r4, [dyq+63]
+ vpbroadcastw m7, xm7
+ or maxbased, 63
+ psubw m7, [z_base_inc]
+.h16_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base0
+ movu m0, [tlq+r4*2+2]
+ movu m2, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base1
+ movu m1, [tlq+r5*2+2]
+ movu m3, [tlq+r5*2]
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base2
+ pand m6, m5, m7
+ psllw m6, 9
+ psubw m2, m0
+ pmulhrsw m2, m6
+ psraw m6, m7, 15
+ paddw m7, m8
+ paddw m0, m2
+ movu m2, [tlq+r4*2+2]
+ movu m4, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base3
+ vpblendvb m0, m9, m0, m6
+ pand m6, m5, m7
+ psllw m6, 9
+ psubw m3, m1
+ pmulhrsw m3, m6
+ psraw m6, m7, 15
+ paddw m7, m8
+ paddw m1, m3
+ vpblendvb m1, m9, m1, m6
+ pand m6, m5, m7
+ psllw m6, 9
+ psubw m4, m2
+ pmulhrsw m4, m6
+ psraw m6, m7, 15
+ paddw m7, m8
+ paddw m2, m4
+ movu m3, [tlq+r5*2+2]
+ movu m4, [tlq+r5*2]
+ vpblendvb m2, m9, m2, m6
+ pand m6, m5, m7
+ psllw m6, 9
+ psubw m4, m3
+ pmulhrsw m4, m6
+ psraw m6, m7, 15
+ paddw m7, m8
+ lea r5, [dstq+strideq*4]
+ paddw m3, m4
+ vpblendvb m3, m9, m3, m6
+ punpckhwd m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8 a3 b3 a2 b2 a1 b1 a0 b0
+ punpcklwd m0, m1 ; af bf ae be ad bd ac bc a7 b7 a6 b6 a5 b5 a4 b4
+ punpckhwd m1, m2, m3 ; cb db ca da c9 d9 c8 d8 c3 d3 c2 d2 c1 d1 c0 d0
+ punpcklwd m2, m3 ; cf df ce de cd dd cc dc c7 d7 c6 d6 c5 d5 c4 d4
+ punpckhdq m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8 a1 b1 c1 d1 a0 b0 c0 d0
+ vextracti128 xm6, m3, 1
+ punpckldq m4, m1 ; ab bb cb db aa ba ca da a3 b3 c3 d3 a2 b2 c2 d2
+ punpckhdq m1, m0, m2 ; ad bd cd dd ac bc cc dc a5 b5 c5 d5 a4 b4 c4 d4
+ punpckldq m0, m2 ; af bf cf df ae be ce de a7 b7 c7 d7 a6 b6 c6 d6
+ vextracti128 xm2, m4, 1
+ movhps [dstq+strideq*0], xm6
+ movq [dstq+strideq*1], xm6
+ vextracti128 xm6, m1, 1
+ movhps [dstq+strideq*2], xm2
+ movq [dstq+r7 ], xm2
+ vextracti128 xm2, m0, 1
+ movhps [r5 +strideq*0], xm6
+ movq [r5 +strideq*1], xm6
+ movhps [r5 +strideq*2], xm2
+ movq [r5 +r7 ], xm2
+ lea r5, [dstq+strideq*8]
+ movhps [r5 +strideq*0], xm3
+ movq [r5 +strideq*1], xm3
+ movhps [r5 +strideq*2], xm4
+ movq [r5 +r7 ], xm4
+ lea r5, [r5+strideq*4]
+ movhps [r5 +strideq*0], xm1
+ movq [r5 +strideq*1], xm1
+ movhps [r5 +strideq*2], xm0
+ movq [r5 +r7 ], xm0
+ sub wd, 4
+ jz .h16_end
+ add dstq, 8
+ cmp r4d, maxbased
+ jg .h16_loop
+ mov hd, 4
+.h16_end_loop0:
+ mov r6d, wd
+ mov r2, dstq
+ test wb, 4
+ jz .h16_end_loop
+ movq [dstq+strideq*0], xm9
+ movq [dstq+strideq*1], xm9
+ movq [dstq+strideq*2], xm9
+ movq [dstq+r7 ], xm9
+ and r6d, 120
+ jz .h16_end_w4
+ add dstq, 8
+.h16_end_loop:
+ mova [dstq+strideq*0], xm9
+ mova [dstq+strideq*1], xm9
+ mova [dstq+strideq*2], xm9
+ mova [dstq+r7 ], xm9
+ add dstq, 16
+ sub r6d, 8
+ jg .h16_end_loop
+.h16_end_w4:
+ lea dstq, [r2+strideq*4]
+ dec hd
+ jg .h16_end_loop0
+.h16_end:
+ RET
+.h32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -160, 9
+ lea maxbased, [wq+31]
+ and maxbased, 31
+ or maxbased, 32 ; imin(w+31, 63)
+ test angled, 0x400
+ jnz .h32_main
+ vpbroadcastd m2, [pw_3]
+ movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ punpckhwd m1, m0, m0
+ vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m1, m2
+ paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ lea r4, [rsp+128]
+ paddw m0, m1
+ lea r5d, [maxbaseq-31]
+ psrlw m0, 2
+ mova [r4], m0
+.h32_filter_loop:
+ mova m0, [tlq-62]
+ paddw m1, m2, [tlq-66]
+ paddw m0, [tlq-64]
+ pavgw m1, [tlq-58]
+ paddw m0, [tlq-60]
+ sub tlq, 32
+ sub r4, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mova [r4], m0
+ sub r5d, 16
+ jg .h32_filter_loop
+ jl .h32_filter_h8
+ mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ punpcklwd xm1, xm0, xm0
+ paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
+ vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ movzx r5d, word [tlq-62]
+ movzx r2d, word [tlq-60]
+ pavgw m2, m3
+ sub r2d, r5d
+ paddw m0, m1
+ lea r2d, [r2+r5*8+4]
+ paddw m0, m2
+ shr r2d, 3
+ psrlw m0, 2
+ mova [r4-32], m0
+ mov [r4-36], r5w
+ mov [r4-34], r2w
+ lea tlq, [rsp+158]
+ mov r4d, 65
+ cmp wd, 64
+ cmove maxbased, r4d
+ jmp .h32_main
+.h32_filter_h8:
+ mova xm0, [tlq-46] ; 0 1 2 3 4 5 6 7
+ pblendw xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6
+ paddw xm2, [tlq-42] ; 2 3 4 5 6 7 8 9
+ paddw xm0, [tlq-44] ; 1 2 3 4 5 6 7 8
+ vpblendd xm3, xm1, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5
+ lea tlq, [rsp+158]
+ pavgw xm2, xm3
+ paddw xm0, xm1
+ paddw xm0, xm2
+ psrlw xm0, 2
+ mova [r4-16], xm0
+.h32_main:
+ movd xm6, dyd
+ neg maxbaseq
+ vpbroadcastw m7, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m6, xm6
+ lea r4d, [maxbaseq+dyq+15*64]
+ neg dyq
+ movd xm4, r4d
+ vpbroadcastd m8, [pw_m1024]
+ lea r4, [dyq+63]
+ vpbroadcastw m4, xm4
+ or maxbased, 63
+ psubw m4, [z_base_inc]
+.h32_loop:
+ mov r5, r4
+ sar r5, 6
+ movu m1, [tlq+r5*2-64]
+ movu m0, [tlq+r5*2-62]
+ pand m3, m5, m4
+ psllw m3, 9
+ psubw m1, m0
+ pmulhrsw m1, m3
+ pcmpgtw m2, m8, m4
+ paddw m0, m1
+ vpblendvb m0, m7, m0, m2
+ movu m2, [tlq+r5*2-32]
+ movu m1, [tlq+r5*2-30]
+ add r4, dyq
+ sub rsp, 64
+ psubw m2, m1
+ pmulhrsw m2, m3
+ psraw m3, m4, 15
+ paddw m4, m6
+ mova [rsp+32*0], m0
+ paddw m1, m2
+ vpblendvb m1, m7, m1, m3
+ mova [rsp+32*1], m1
+ dec wd
+ jz .h32_transpose
+ cmp r4d, maxbased
+ jg .h32_loop
+.h32_end_loop:
+ sub rsp, 64
+ mova [rsp+32*0], m7
+ mova [rsp+32*1], m7
+ dec wd
+ jg .h32_end_loop
+.h32_transpose:
+ lea r3, [strideq*3]
+ lea r4, [strideq*5]
+ mov r8, dstq
+ lea r5, [strideq+r3*2]
+.h32_transpose_loop0:
+ lea r6, [rsp+32]
+ lea r2, [r8+org_wq*2-16]
+.h32_transpose_loop:
+ mova m0, [r6+64*7]
+ mova m1, [r6+64*6]
+ mova m2, [r6+64*5]
+ mova m3, [r6+64*4]
+ mova m4, [r6+64*3]
+ mova m5, [r6+64*2]
+ mova m6, [r6+64*1]
+ mova m7, [r6+64*0]
+ punpckhwd m8, m0, m1 ; a3 b3 a2 b2 a1 b1 a0 b0
+ punpcklwd m0, m1 ; a7 b7 a6 b6 a5 b5 a4 b4
+ punpckhwd m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0
+ punpcklwd m2, m3 ; c7 d7 c6 d6 c5 d5 c4 d4
+ punpckhwd m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0
+ punpcklwd m4, m5 ; e7 f7 e6 f6 e5 f5 e4 f4
+ punpckhwd m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0
+ punpcklwd m6, m7 ; g7 h7 g6 h6 g5 h5 g4 h4
+ lea dstq, [r2+strideq*8]
+ sub r6, 32
+ punpckhdq m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0
+ punpckldq m8, m1 ; a3 b3 c3 d3 a2 b2 c2 d2
+ punpckhdq m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0
+ punpckldq m3, m5 ; e3 f3 g3 h3 e2 f2 g2 h2
+ punpckhqdq m5, m7, m1 ; 8 0
+ vextracti128 [r2 +strideq*0], m5, 1
+ punpcklqdq m7, m1 ; 9 1
+ mova [dstq+strideq*0], xm5
+ punpckhqdq m1, m8, m3 ; 10 2
+ vextracti128 [r2 +strideq*1], m7, 1
+ punpcklqdq m8, m3 ; 11 3
+ mova [dstq+strideq*1], xm7
+ punpckhdq m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4
+ vextracti128 [r2 +strideq*2], m1, 1
+ punpckldq m0, m2 ; a7 b7 c7 d7 a6 b6 c6 d6
+ mova [dstq+strideq*2], xm1
+ punpckhdq m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4
+ vextracti128 [r2 +r3 ], m8, 1
+ punpckldq m4, m6 ; e7 f7 g7 h7 e6 f6 g6 h6
+ mova [dstq+r3 ], xm8
+ punpckhqdq m6, m3, m2 ; 12 4
+ vextracti128 [r2 +strideq*4], m6, 1
+ punpcklqdq m3, m2 ; 13 5
+ mova [dstq+strideq*4], xm6
+ punpckhqdq m2, m0, m4 ; 14 6
+ vextracti128 [r2 +r4 ], m3, 1
+ punpcklqdq m0, m4 ; 15 7
+ mova [dstq+r4 ], xm3
+ vextracti128 [r2 +r3*2 ], m2, 1
+ mova [dstq+r3*2 ], xm2
+ vextracti128 [r2 +r5 ], m0, 1
+ mova [dstq+r5 ], xm0
+ lea r2, [dstq+strideq*8]
+ cmp r6, rsp
+ jae .h32_transpose_loop
+ add rsp, 64*8
+ sub org_wd, 8
+ jg .h32_transpose_loop0
+.h32_end:
+ RET
+.h64:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -256, 10
+ lea maxbased, [wq+63]
+ test angled, 0x400
+ jnz .h64_main
+ vpbroadcastd m2, [pw_3]
+ movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ punpckhwd m1, m0, m0
+ vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m1, m2
+ paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ lea r4, [rsp+224]
+ paddw m0, m1
+ lea r5d, [wq+32]
+ psrlw m0, 2
+ mova [r4], m0
+.h64_filter_loop:
+ mova m0, [tlq-62]
+ paddw m1, m2, [tlq-66]
+ paddw m0, [tlq-64]
+ pavgw m1, [tlq-58]
+ paddw m0, [tlq-60]
+ sub tlq, 32
+ sub r4, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mova [r4], m0
+ sub r5d, 16
+ jg .h64_filter_loop
+ mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ punpcklwd xm1, xm0, xm0
+ paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
+ vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ lea tlq, [rsp+254]
+ pavgw m2, m3
+ paddw m0, m1
+ paddw m0, m2
+ psrlw m0, 2
+ mova [r4-32], m0
+.h64_main:
+ neg maxbaseq
+ movd xm4, dyd
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m4, xm4
+ lea r4d, [maxbaseq+dyq+15*64]
+ neg dyq
+ vpbroadcastd m7, [pw_m1024]
+ movd xm3, r4d
+ lea r4, [dyq+63]
+ paddw m8, m7, m7
+ vpbroadcastw m3, xm3
+ or maxbased, 63
+ paddw m9, m8, m7
+ psubw m3, [z_base_inc]
+.h64_loop:
+ mov r5, r4
+ sar r5, 6
+ movu m1, [tlq+r5*2-128]
+ movu m0, [tlq+r5*2-126]
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ sub rsp, 128
+ paddw m0, m1
+ pcmpgtw m1, m9, m3
+ vpblendvb m0, m6, m0, m1
+ mova [rsp+32*0], m0
+ movu m1, [tlq+r5*2-96]
+ movu m0, [tlq+r5*2-94]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ pcmpgtw m1, m8, m3
+ vpblendvb m0, m6, m0, m1
+ mova [rsp+32*1], m0
+ movu m1, [tlq+r5*2-64]
+ movu m0, [tlq+r5*2-62]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ pcmpgtw m1, m7, m3
+ vpblendvb m0, m6, m0, m1
+ mova [rsp+32*2], m0
+ movu m1, [tlq+r5*2-32]
+ movu m0, [tlq+r5*2-30]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ add r4, dyq
+ psraw m2, m3, 15
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [rsp+32*3], m0
+ dec wd
+ jz .h64_transpose
+ cmp r4d, maxbased
+ jg .h64_loop
+.h64_end_loop:
+ sub rsp, 128
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m6
+ mova [rsp+32*2], m6
+ mova [rsp+32*3], m6
+ dec wd
+ jg .h64_end_loop
+.h64_transpose:
+ lea r2, [strideq*3]
+ lea r3, [strideq*5]
+ mov r5, dstq
+ lea r4, [strideq+r2*2]
+.h64_transpose_loop0:
+ lea r6, [rsp+112]
+ lea dstq, [r5+org_wq*2-32]
+.h64_transpose_loop:
+ mova xm0, [r6+128*15]
+ vinserti128 m0, [r6+128* 7], 1
+ mova xm1, [r6+128*14]
+ vinserti128 m1, [r6+128* 6], 1
+ mova xm2, [r6+128*13]
+ vinserti128 m2, [r6+128* 5], 1
+ mova xm3, [r6+128*12]
+ vinserti128 m3, [r6+128* 4], 1
+ mova xm4, [r6+128*11]
+ vinserti128 m4, [r6+128* 3], 1
+ mova xm5, [r6+128*10]
+ vinserti128 m5, [r6+128* 2], 1
+ mova xm6, [r6+128* 9]
+ vinserti128 m6, [r6+128* 1], 1
+ mova xm7, [r6+128* 8]
+ vinserti128 m7, [r6+128* 0], 1
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m6, m7
+ punpcklwd m6, m7
+ sub r6, 16
+ punpckhdq m7, m8, m1
+ punpckldq m8, m1
+ punpckhdq m1, m3, m5
+ punpckldq m3, m5
+ punpckhqdq m5, m7, m1
+ punpcklqdq m7, m1
+ punpckhqdq m1, m8, m3
+ punpcklqdq m8, m3
+ punpckhdq m3, m0, m2
+ mova [dstq+strideq*0], m5
+ punpckldq m0, m2
+ mova [dstq+strideq*1], m7
+ punpckhdq m2, m4, m6
+ mova [dstq+strideq*2], m1
+ punpckldq m4, m6
+ mova [dstq+r2 ], m8
+ punpckhqdq m6, m3, m2
+ mova [dstq+strideq*4], m6
+ punpcklqdq m3, m2
+ mova [dstq+r3 ], m3
+ punpckhqdq m2, m0, m4
+ mova [dstq+r2*2 ], m2
+ punpcklqdq m0, m4
+ mova [dstq+r4 ], m0
+ lea dstq, [dstq+strideq*8]
+ cmp r6, rsp
+ jae .h64_transpose_loop
+ add rsp, 128*16
+ sub org_wd, 16
+ jg .h64_transpose_loop0
+.h64_end:
+ RET
+
+%macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax
+%ifnum %4
+ pshufb xm%2, xm%4
+%else
+ pshufb xm%2, %4
+%endif
+ vinserti128 m%2, xm%2, 1
+ pshufd m%1, m%2, q0000
+ pmaddwd m%1, m2
+ pshufd m%3, m%2, q1111
+ pmaddwd m%3, m3
+ paddd m%1, m1
+ paddd m%1, m%3
+ pshufd m%3, m%2, q2222
+ pmaddwd m%3, m4
+ paddd m%1, m%3
+ pshufd m%3, m%2, q3333
+ pmaddwd m%3, m5
+ paddd m%1, m%3
+ psrad m%1, 4
+ packusdw m%1, m%1
+ pminsw m%1, m%5
+%endmacro
+
+%macro FILTER_2BLK 7 ; dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax
+ pshufb m%2, m%6
+ vpermq m%4, m%2, q3232
+ vinserti128 m%2, xm%2, 1
+ pshufd m%1, m%2, q0000
+ pshufd m%3, m%4, q0000
+ pmaddwd m%1, m2
+ pmaddwd m%3, m2
+ paddd m%1, m1
+ paddd m%3, m1
+ pshufd m%5, m%2, q1111
+ pmaddwd m%5, m3
+ paddd m%1, m%5
+ pshufd m%5, m%4, q1111
+ pmaddwd m%5, m3
+ paddd m%3, m%5
+ pshufd m%5, m%2, q2222
+ pmaddwd m%5, m4
+ paddd m%1, m%5
+ pshufd m%5, m%4, q2222
+ pmaddwd m%5, m4
+ paddd m%3, m%5
+ pshufd m%5, m%2, q3333
+ pmaddwd m%5, m5
+ paddd m%1, m%5
+ pshufd m%5, m%4, q3333
+ pmaddwd m%5, m5
+ paddd m%3, m%5
+ psrad m%1, 4
+ psrad m%3, 4
+ packusdw m%1, m%3
+ pminsw m%1, m%7
+%endmacro
+
+; The ipred_filter SIMD processes 4x2 blocks in the following order which
+; increases parallelism compared to doing things row by row. One redundant
+; block is calculated for w8 and w16, two for w32.
+;     w4     w8       w16             w32
+;     1     1 2     1 2 3 5     1 2 3 5 b c d f
+;     2     2 3     2 4 5 7     2 4 5 7 c e f h
+;     3     3 4     4 6 7 9     4 6 7 9 e g h j
+; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
+;     5     8       8           i
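+;
+; In scalar terms, each output pixel of a 4x2 block is computed as (a rough
+; sketch of what FILTER_1BLK/FILTER_2BLK above do, ignoring the SIMD shuffling):
+;   px = clip((p0*t0 + p1*t1 + ... + p6*t6 + 8) >> 4, 0, bitdepth_max)
+; where p0..p6 are the seven top/left neighbours of the block and t0..t6 are
+; the signed taps loaded from filter_intra_taps for the selected filter mode.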
+
+cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter
+%assign org_stack_offset stack_offset
+%define base r6-ipred_filter_16bpc_avx2_table
+ lea r6, [filter_intra_taps]
+ tzcnt wd, wm
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ shl filterd, 6
+ add filterq, r6
+ lea r6, [ipred_filter_16bpc_avx2_table]
+ vbroadcasti128 m0, [tlq-6]
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m1, [base+pd_8]
+ pmovsxbw m2, [filterq+16*0]
+ pmovsxbw m3, [filterq+16*1]
+ pmovsxbw m4, [filterq+16*2]
+ pmovsxbw m5, [filterq+16*3]
+ add wq, r6
+ mov hd, hm
+ jmp wq
+.w4:
+ WIN64_SPILL_XMM 10
+ mova xm8, [base+filter_shuf2]
+ vpbroadcastw m9, r8m ; bitdepth_max
+ lea r7, [6+hq*2]
+ sub tlq, r7
+ jmp .w4_loop_start
+.w4_loop:
+ pinsrq xm0, [tlq+hq*2], 0
+ lea dstq, [dstq+strideq*2]
+.w4_loop_start:
+ FILTER_1BLK 6, 0, 7, 8, 9
+ vextracti128 xm0, m6, 1
+ movq [dstq+strideq*0], xm6
+ movq [dstq+strideq*1], xm0
+ sub hd, 2
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ vbroadcasti128 m14, [base+filter_shuf3]
+ vpbroadcastw m15, r8m ; bitdepth_max
+ FILTER_1BLK 10, 0, 7, [base+filter_shuf2], 15
+ vpermq m6, m10, q1302 ; ____ ____ | ____ 4321
+ pslldq m8, m0, 4
+ psrldq m7, m6, 2
+ psrldq m0, m6, 10
+ punpcklwd m7, m0
+ vpblendd m8, m6, 0x33 ; _0__ 4321 | ____ 4321
+ vpblendd m8, m7, 0x40 ; _056 4321 | ____ 4321
+ vpblendd m8, [tlq-6], 0x30 ; _056 4321 | ____ 4321
+ lea r7, [16+hq*2]
+ sub tlq, r7
+ jmp .w8_loop_start
+.w8_loop:
+ vpermq m8, m9, q1302 ; ____ 4321 | ____ 4321
+ vpermq m6, m9, q2031
+ psrldq m0, m6, 2
+ psrldq m6, 10
+ punpcklwd m6, m0
+ vpblendd m8, m7, 0x80 ; _0__ 4321 | ____ 4321
+ vpblendd m8, m6, 0x40 ; _056 4321 | ____ 4321
+ mova m10, m9
+.w8_loop_start:
+ vpblendd m8, [tlq+hq*2], 0x0C ; _056 4321 | _056 4321
+ call .main
+ vpblendd m10, m9, 0xCC
+ mova [dstq+strideq*0], xm10
+ vextracti128 [dstq+strideq*1], m10, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ %assign stack_offset stack_offset - stack_size_padded
+ ALLOC_STACK 32, 16
+ vpbroadcastw m15, r8m ; bitdepth_max
+ sub hd, 2
+ TAIL_CALL .w16_main, 0
+.w16_main:
+ mova xm10, [base+filter_shuf2]
+ FILTER_1BLK 13, 0, 6, 10, 15
+ vpermq m12, m13, q3120
+ mova xm14, [base+filter_shuf3]
+ vinserti128 m14, [base+filter_shuf1], 1
+ vpbroadcastq m0, [tlq+10]
+ vpblendd m0, [tlq-16], 0x4C ; ___0 4321 | _056 ____
+ psrldq m6, m12, 8
+ vpblendd m0, m6, 0x03 ; ___0 4321 | _056 4321
+ punpcklwd m6, m12
+ vpblendd m0, m6, 0x80 ; 56_0 4321 | _056 4321
+ FILTER_2BLK 12, 0, 6, 7, 8, 14, 15
+ vpblendd m13, m12, 0xCC
+ vpermq m12, m12, q2031 ; 6___ 5___
+ psrldq xm6, xm12, 2
+ psrldq xm8, xm12, 12
+ vpblendd xm6, xm8, 0x01
+ pblendw xm6, [tlq+10], 0xF8 ; 4321 056_
+ FILTER_1BLK 11, 6, 8, 10, 15
+ vpermq m11, m11, q3120
+ pshufd m9, m11, q1032
+ movu m8, [tlq+6] ; __43 210_ | ____ ____
+ pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____
+ pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____
+ vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321
+ mova [dstq+strideq*0], xm13
+ vextracti128 [dstq+strideq*1], m13, 1
+ lea r7, [20+hq*2]
+ sub tlq, r7
+ vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321
+ jmp .w16_loop_start
+.w16_loop:
+ vpermq m13, m13, q3322
+ vpermq m11, m9, q2020
+ vpermq m9, m9, q1302
+ vpermq m6, m12, q0123
+ psrldq m7, 4
+ vpblendd m13, m10, 0xCC
+ vpblendd m9, m7, 0x40
+ mova m0, [rsp+8]
+ mova [dstq+strideq*0], xm13
+ vextracti128 [dstq+strideq*1], m13, 1
+.w16_loop_start:
+ mova m13, m12
+ vpblendd m0, [tlq+hq*2], 0x0C
+ psrldq m7, m12, 8
+ punpcklwd m7, m12
+ vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321
+ vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321
+ FILTER_2BLK 10, 0, 6, 7, 8, 14, 15
+ vpermq m12, m10, q2031
+ mova [rsp+8], m0
+ psrldq m8, m11, 8
+ psrldq xm6, xm12, 2
+ psrldq xm7, xm12, 10
+ psrldq xm0, xm13, 2
+ punpcklwd m8, m11
+ punpcklwd xm7, xm6
+ vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321
+ vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321
+ vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321
+ call .main
+ vpermq m8, m11, q3120
+ vpblendd m6, m8, m9, 0xCC
+ mova [dstq+strideq*0+16], xm6
+ vextracti128 [dstq+strideq*1+16], m6, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ vpermq m8, m9, q3120
+ vextracti128 xm0, m8, 1 ; 4321 ____
+ pshufd xm11, xm11, q1032
+ vpblendd xm0, xm11, 0x02 ; 4321 0___
+ psrldq xm6, xm8, 2
+ psrldq xm7, xm8, 12
+ pblendw xm0, xm6, 0x4 ; 4321 05__
+ pblendw xm0, xm7, 0x2 ; 4321 056_
+ FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15
+ vpermq m12, m13, q1302
+ vpblendd m12, m10, 0xCC
+ vpblendd m9, m6, 0xCC
+ mova [dstq+strideq*0+ 0], xm12
+ mova [dstq+strideq*0+16], xm9
+ vextracti128 [dstq+strideq*1+ 0], m12, 1
+ vextracti128 [dstq+strideq*1+16], m9, 1
+ ret
+ALIGN function_align
+.w32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK 64, 16
+ vpbroadcastw m15, r8m ; bitdepth_max
+ sub hd, 2
+ lea r3, [dstq+32]
+ lea r5d, [hd*2+20]
+ call .w16_main
+ mov dstq, r3
+ lea tlq, [tlq+r5+32]
+ sub r5d, 20
+ shr r5d, 1
+ sub r5d, 2
+ lea r4, [dstq+strideq*2-2]
+DEFINE_ARGS dst, stride, tl, stride3, left, h
+ lea stride3q, [strideq*3]
+ movu m8, [tlq-6] ; 4321 0___
+ mova xm10, [base+filter_shuf2]
+ pinsrw xm0, xm8, [dstq+strideq*0-2], 2
+ pinsrw xm0, xm0, [dstq+strideq*1-2], 1 ; 4321 056_
+ pinsrw xm9, [leftq+strideq*0], 5
+ pinsrw xm9, [leftq+strideq*1], 4
+ FILTER_1BLK 13, 0, 6, 10, 15
+ vpermq m12, m13, q3120
+ mova xm14, [base+filter_shuf3]
+ vinserti128 m14, [base+filter_shuf1], 1
+ psrldq m6, m12, 8
+ punpcklwd m7, m6, m12
+ vpblendd m0, m6, 0x03 ; ___0 ____ | _0__ 4321
+ vpblendd m0, m7, 0x80 ; 56_0 ____ | _0__ 4321
+ vpblendd m0, m8, 0x30 ; 56_0 4321 | _0__ 4321
+ vpblendd m0, m9, 0x04 ; 56_0 4321 | _056 4321
+ FILTER_2BLK 12, 0, 6, 7, 8, 14, 15
+ vpblendd m13, m12, 0xCC
+ pinsrw xm9, [leftq+strideq*2], 3
+ pinsrw xm9, [leftq+stride3q ], 2
+ lea leftq, [leftq+strideq*4]
+ pinsrw xm9, [leftq+strideq*0], 1
+ pinsrw xm9, [leftq+strideq*1], 0
+ movq [rsp+32], xm9
+ mov r7d, 1
+ pslldq m8, m9, 4
+ vpblendd m0, m8, 0x0C ; ___0 ____ | _056 ____
+ vpermq m12, m12, q2031 ; 6___ 5___
+ psrldq xm6, xm12, 2
+ psrldq xm7, xm12, 12
+ vpblendd xm6, xm7, 0x01 ; ____ _56_
+ pblendw xm6, [tlq+10], 0xF8 ; 4321 056_
+ FILTER_1BLK 11, 6, 7, 10, 15
+ vpermq m11, m11, q3120
+ pshufd m9, m11, q1032
+ vbroadcasti128 m8, [tlq+22] ; __43 210_ | ____ ____
+ pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____
+ pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____
+ vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321
+ mova [dstq+strideq*0], xm13
+ vextracti128 [dstq+strideq*1], m13, 1
+ vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321
+ jmp .w32_loop_start
+.w32_loop_last:
+ mova m0, [rsp+0]
+ jmp .w32_loop
+.w32_loop_left:
+ mova m0, [rsp+0]
+ vpblendd m0, [rsp+32+r7*4-12], 0x0C
+ dec r7d
+ jg .w32_loop
+ cmp hd, 2
+ je .w32_loop
+ pinsrw xm6, [rsp+32], 6
+ pinsrw xm6, [leftq+strideq*2], 5
+ pinsrw xm6, [leftq+stride3q ], 4
+ lea leftq, [leftq+strideq*4]
+ pinsrw xm6, [leftq+strideq*0], 3
+ pinsrw xm6, [leftq+strideq*1], 2
+ pinsrw xm6, [leftq+strideq*2], 1
+ pinsrw xm6, [leftq+stride3q ], 0
+ lea leftq, [leftq+strideq*4]
+ movu [rsp+36], xm6
+ pinsrw xm6, [leftq+strideq*0], 1
+ pinsrw xm6, [leftq+strideq*1], 0
+ movd [rsp+32], xm6
+ mov r7d, 4
+.w32_loop:
+ vpermq m13, m13, q3322
+ vpermq m11, m9, q2020
+ vpermq m9, m9, q1302
+ vpermq m6, m12, q0123
+ psrldq m7, 4
+ vpblendd m13, m10, 0xCC
+ vpblendd m9, m7, 0x40 ; ___0 4321 | ____ 4321
+ mova [dstq+strideq*0], xm13
+ vextracti128 [dstq+strideq*1], m13, 1
+.w32_loop_start:
+ mova m13, m12
+ psrldq m7, m12, 8
+ punpcklwd m7, m12
+ vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321
+ vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321
+ FILTER_2BLK 10, 0, 6, 7, 8, 14, 15
+ vpermq m12, m10, q2031
+ mova [rsp+0], m0
+ psrldq m8, m11, 8
+ psrldq xm6, xm12, 2
+ psrldq xm7, xm12, 10
+ psrldq xm0, xm13, 2
+ punpcklwd m8, m11
+ punpcklwd xm7, xm6
+ vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321
+ vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321
+ vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321
+ call .main
+ vpermq m8, m11, q3120
+ vpblendd m6, m8, m9, 0xCC
+ mova [dstq+strideq*0+16], xm6
+ vextracti128 [dstq+strideq*1+16], m6, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop_left
+ jz .w32_loop_last
+ vpermq m8, m9, q3120
+ vextracti128 xm0, m8, 1 ; 4321 ____
+ pshufd xm11, xm11, q1032
+ vpblendd xm0, xm11, 0x02 ; 4321 0___
+ psrldq xm6, xm8, 2
+ psrldq xm7, xm8, 12
+ pblendw xm0, xm6, 0x4 ; 4321 05__
+ pblendw xm0, xm7, 0x2 ; 4321 056_
+ FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15
+ vpermq m12, m13, q1302
+ vpblendd m12, m10, 0xCC
+ vpblendd m9, m6, 0xCC
+ mova [dstq+strideq*0+ 0], xm12
+ mova [dstq+strideq*0+16], xm9
+ vextracti128 [dstq+strideq*1+ 0], m12, 1
+ vextracti128 [dstq+strideq*1+16], m9, 1
+ RET
+.main:
+ FILTER_2BLK 9, 8, 6, 7, 0, 14, 15
+ ret
+
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
+
+%macro IPRED_CFL 1 ; ac in, unpacked pixels out
+ psignw m3, m%1, m1
+ pabsw m%1, m%1
+ pmulhrsw m%1, m2
+ psignw m%1, m3
+ paddw m%1, m0
+%endmacro
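+; In scalar terms the macro above computes, per pixel (a sketch; m0 = dc,
+; m1 = alpha and m2 = |alpha| << 9 are set up by the .s* blocks below):
+;   px = dc + sign(ac * alpha) * ((|ac| * |alpha| + 32) >> 6)
+; the callers then clamp the result to [0, bitdepth_max] via m6/m7.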
+
+cglobal ipred_cfl_top_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ add tlq, 2
+ movd xm4, wd
+ pxor m6, m6
+ vpbroadcastw m7, r7m
+ pavgw xm4, xm6
+ tzcnt wd, wd
+ movd xm5, wd
+ movu m0, [tlq]
+ lea t0, [ipred_cfl_left_16bpc_avx2_table]
+ movsxd r6, [t0+wq*4]
+ add r6, t0
+ add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+
+cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
+ mov hd, hm ; zero upper half
+ sub tlq, hq
+ movd xm4, hd
+ sub tlq, hq
+ pxor m6, m6
+ vpbroadcastw m7, r7m
+ pavgw xm4, xm6
+ tzcnt r6d, hd
+ movd xm5, r6d
+ movu m0, [tlq]
+ lea t0, [ipred_cfl_left_16bpc_avx2_table]
+ movsxd r6, [t0+r6*4]
+ add r6, t0
+ add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table
+ tzcnt wd, wd
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h32:
+ paddw m0, [tlq+32]
+.h16:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+.h8:
+ psrldq xm1, xm0, 8
+ paddw xm0, xm1
+.h4:
+ punpcklwd xm0, xm6
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ paddd xm0, xm4
+ psrld xm0, xm5
+ vpbroadcastw m0, xm0
+ jmp wq
+
+cglobal ipred_cfl_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea t0d, [wq+hq]
+ movd xm4, t0d
+ tzcnt t0d, t0d
+ movd xm5, t0d
+ lea t0, [ipred_cfl_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd r6, [t0+r6*4]
+ movsxd wq, [t0+wq*4+4*4]
+ psrlw xm4, 1
+ pxor m6, m6
+ vpbroadcastw m7, r7m
+ add r6, t0
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h4:
+ movq xm0, [tlq-8]
+ jmp wq
+.w4:
+ movq xm1, [tlq+2]
+ paddw m0, m4
+ paddw m0, m1
+ psrlq m1, m0, 32
+ paddw m0, m1
+ psrld m1, m0, 16
+ paddw m0, m1
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xm0, 3
+ jmp .w4_end
+.w4_mul:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ lea r2d, [hq*2]
+ mov r6d, 0xAAAB6667
+ shrx r6d, r6d, r2d
+ punpckhwd xm1, xm0, xm6
+ punpcklwd xm0, xm6
+ paddd xm0, xm1
+ movd xm1, r6d
+ psrld xm0, 2
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w4_end:
+ vpbroadcastw m0, xm0
+.s4:
+ vpbroadcastw m1, alpham
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s4_loop:
+ mova m4, [acq]
+ IPRED_CFL 4
+ pmaxsw m4, m6
+ pminsw m4, m7
+ vextracti128 xm5, m4, 1
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*2], xm5
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+r6 ], xm5
+ lea dstq, [dstq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .s4_loop
+ RET
+ALIGN function_align
+.h8:
+ mova xm0, [tlq-16]
+ jmp wq
+.w8:
+ vextracti128 xm1, m0, 1
+ paddw xm0, [tlq+2]
+ paddw xm0, xm4
+ paddw xm0, xm1
+ psrld xm1, xm0, 16
+ paddw xm0, xm1
+ pblendw xm0, xm6, 0xAA
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w8_end:
+ vpbroadcastw m0, xm0
+.s8:
+ vpbroadcastw m1, alpham
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s8_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ pmaxsw m4, m6
+ pmaxsw m5, m6
+ pminsw m4, m7
+ pminsw m5, m7
+ mova [dstq+strideq*0], xm4
+ mova [dstq+strideq*2], xm5
+ vextracti128 [dstq+strideq*1], m4, 1
+ vextracti128 [dstq+r6 ], m5, 1
+ lea dstq, [dstq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .s8_loop
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-32]
+ jmp wq
+.w16:
+ paddw m0, [tlq+2]
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4
+ paddw xm0, xm1
+ punpckhwd xm1, xm0, xm6
+ punpcklwd xm0, xm6
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ test hb, 8|32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w16_end:
+ vpbroadcastw m0, xm0
+.s16:
+ vpbroadcastw m1, alpham
+ pabsw m2, m1
+ psllw m2, 9
+.s16_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ pmaxsw m4, m6
+ pmaxsw m5, m6
+ pminsw m4, m7
+ pminsw m5, m7
+ mova [dstq+strideq*0], m4
+ mova [dstq+strideq*1], m5
+ lea dstq, [dstq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .s16_loop
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-64]
+ paddw m0, [tlq-32]
+ jmp wq
+.w32:
+ paddw m0, [tlq+ 2]
+ paddw m0, [tlq+34]
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4
+ paddw xm0, xm1
+ punpcklwd xm1, xm0, xm6
+ punpckhwd xm0, xm6
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x6667AAAB
+ shrx r6d, r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w32_end:
+ vpbroadcastw m0, xm0
+.s32:
+ vpbroadcastw m1, alpham
+ pabsw m2, m1
+ psllw m2, 9
+.s32_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ pmaxsw m4, m6
+ pmaxsw m5, m6
+ pminsw m4, m7
+ pminsw m5, m7
+ mova [dstq+32*0], m4
+ mova [dstq+32*1], m5
+ add dstq, strideq
+ add acq, 64
+ dec hd
+ jg .s32_loop
+ RET
+
+cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
+ mov r6d, r7m
+ shr r6d, 11
+ lea t0, [ipred_cfl_splat_16bpc_avx2_table]
+ tzcnt wd, wd
+ movifnidn hd, hm
+ movsxd wq, [t0+wq*4]
+ vpbroadcastd m0, [t0-ipred_cfl_splat_16bpc_avx2_table+pw_512+r6*4]
+ pxor m6, m6
+ vpbroadcastw m7, r7m
+ add wq, t0
+ movifnidn acq, acmp
+ jmp wq
+
+cglobal ipred_cfl_ac_420_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ movifnidn hpadd, hpadm
+ vpbroadcastd m5, [pw_2]
+ mov hd, hm
+ shl hpadd, 2
+ pxor m4, m4
+ sub hd, hpadd
+ cmp dword wm, 8
+ jg .w16
+ je .w8
+.w4:
+ lea r3, [strideq*3]
+ mov r5, acq
+.w4_loop:
+ mova xm0, [ypxq+strideq*2]
+ mova xm1, [ypxq+r3 ]
+ vinserti128 m0, [ypxq+strideq*0], 1
+ vinserti128 m1, [ypxq+strideq*1], 1
+ lea ypxq, [ypxq+strideq*4]
+ pmaddwd m0, m5
+ pmaddwd m1, m5
+ paddd m0, m1
+ vextracti128 xm1, m0, 1
+ paddd m4, m0
+ packssdw xm1, xm0
+ mova [acq], xm1
+ add acq, 16
+ sub hd, 2
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .dc
+ vpermq m1, m1, q1111
+ pslld xm0, 2
+.w4_hpad_loop:
+ mova [acq], m1
+ paddd m4, m0
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp .dc
+.w8:
+ mov r5, acq
+ test wpadd, wpadd
+ jnz .w8_wpad1
+.w8_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m1, m5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m0, m1
+ vextracti128 xm1, m0, 1
+ paddd m4, m0
+ packssdw xm1, xm0, xm1
+ mova [acq], xm1
+ add acq, 16
+ dec hd
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz .dc
+ vinserti128 m1, xm1, 1
+ pslld m0, 2
+ jmp .hpad
+.w8_wpad1:
+ pmaddwd xm0, xm5, [ypxq+strideq*0]
+ pmaddwd xm3, xm5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd xm0, xm3
+ pshufd xm3, xm0, q3333
+ packssdw xm1, xm0, xm3
+ paddd xm0, xm3
+ paddd xm4, xm0
+ mova [acq], xm1
+ add acq, 16
+ dec hd
+ jg .w8_wpad1
+ jmp .w8_hpad
+.w16_wpad:
+ mova m0, [ypxq+strideq*0+ 0]
+ mova m1, [ypxq+strideq*1+ 0]
+ cmp wpadd, 2
+ jl .w16_wpad1
+ je .w16_wpad2
+ vpbroadcastd m2, [ypxq+strideq*0+12]
+ vpbroadcastd m3, [ypxq+strideq*1+12]
+ vpblendd m0, m2, 0xf0
+ vpblendd m1, m3, 0xf0
+ jmp .w16_wpad_end
+.w16_wpad2:
+ vpbroadcastd m2, [ypxq+strideq*0+28]
+ vpbroadcastd m3, [ypxq+strideq*1+28]
+ jmp .w16_wpad_end
+.w16_wpad1:
+ vpbroadcastd m2, [ypxq+strideq*0+44]
+ vpbroadcastd m3, [ypxq+strideq*1+44]
+ vinserti128 m2, [ypxq+strideq*0+32], 0
+ vinserti128 m3, [ypxq+strideq*1+32], 0
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ REPX {pmaddwd x, m5}, m0, m1, m2, m3
+ paddd m0, m1
+ paddd m2, m3
+ packssdw m1, m0, m2
+ paddd m0, m2
+ vpermq m1, m1, q3120
+ paddd m4, m0
+ mova [acq], m1
+ add acq, 32
+ dec hd
+ jg .w16_wpad
+ jmp .w16_hpad
+.w16:
+ mov r5, acq
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0+ 0]
+ pmaddwd m2, m5, [ypxq+strideq*0+32]
+ pmaddwd m1, m5, [ypxq+strideq*1+ 0]
+ pmaddwd m3, m5, [ypxq+strideq*1+32]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m0, m1
+ paddd m2, m3
+ packssdw m1, m0, m2
+ paddd m0, m2
+ vpermq m1, m1, q3120
+ paddd m4, m0
+ mova [acq], m1
+ add acq, 32
+ dec hd
+ jg .w16_loop
+.w16_hpad:
+ add hpadd, hpadd
+ jz .dc
+ paddd m0, m0
+.hpad:
+ mova [acq+32*0], m1
+ paddd m4, m0
+ mova [acq+32*1], m1
+ add acq, 32*2
+ sub hpadd, 4
+ jg .hpad
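+; Make the AC buffer zero-mean: compute the rounded average of the summed
+; values accumulated in m4 and subtract it from every entry (this block is
+; shared by the 422/444 variants below).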
+.dc:
+ vextracti128 xm1, m4, 1
+ sub r5, acq ; -w*h*2
+ tzcnt r1d, r5d
+ paddd xm4, xm1
+ sub r1d, 2
+ punpckhqdq xm1, xm4, xm4
+ movd xm0, r1d
+ paddd xm1, xm4
+ pshuflw xm4, xm1, q1032
+ paddd xm1, xm4
+ psrld xm1, xm0
+ pxor xm0, xm0
+ pavgw xm1, xm0
+ vpbroadcastw m1, xm1
+.dc_loop:
+ mova m0, [acq+r5]
+ psubw m0, m1
+ mova [acq+r5], m0
+ add r5, 32
+ jl .dc_loop
+ RET
+
+cglobal ipred_cfl_ac_422_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ movifnidn hpadd, hpadm
+ vpbroadcastd m5, [pw_4]
+ mov hd, hm
+ shl hpadd, 2
+ pxor m4, m4
+ sub hd, hpadd
+ cmp dword wm, 8
+ jg .w16
+ je .w8
+.w4:
+ lea r3, [strideq*3]
+ mov r5, acq
+.w4_loop:
+ mova xm0, [ypxq+strideq*0]
+ mova xm1, [ypxq+strideq*1]
+ vinserti128 m0, [ypxq+strideq*2], 1
+ vinserti128 m1, [ypxq+r3 ], 1
+ lea ypxq, [ypxq+strideq*4]
+ pmaddwd m0, m5
+ pmaddwd m1, m5
+ paddd m4, m0
+ packssdw m0, m1
+ paddd m4, m1
+ mova [acq], m0
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ vextracti128 xm1, m1, 1
+ vpermq m0, m0, q3333
+ pslld xm1, 2
+.w4_hpad_loop:
+ mova [acq], m0
+ paddd m4, m1
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+.w8:
+ mov r5, acq
+ test wpadd, wpadd
+ jnz .w8_wpad1
+.w8_loop:
+ pmaddwd m1, m5, [ypxq+strideq*0]
+ pmaddwd m0, m5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m4, m1
+ packssdw m1, m0
+ paddd m4, m0
+ vpermq m2, m1, q3120
+ mova [acq], m2
+ add acq, 32
+ sub hd, 2
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ vpermq m1, m1, q3131
+ pslld m0, 2
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
+.w8_wpad1:
+ vpbroadcastd m1, [ypxq+strideq*0+12]
+ vpbroadcastd m0, [ypxq+strideq*1+12]
+ vinserti128 m1, [ypxq+strideq*0+ 0], 0
+ vinserti128 m0, [ypxq+strideq*1+ 0], 0
+ lea ypxq, [ypxq+strideq*2]
+ pmaddwd m1, m5
+ pmaddwd m0, m5
+ paddd m4, m1
+ packssdw m1, m0
+ paddd m4, m0
+ vpermq m2, m1, q3120
+ mova [acq], m2
+ add acq, 32
+ sub hd, 2
+ jg .w8_wpad1
+ jmp .w8_hpad
+.w16:
+ mov r5, acq
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ pmaddwd m2, m5, [ypxq+strideq*0+ 0]
+ pmaddwd m1, m5, [ypxq+strideq*0+32]
+ pmaddwd m0, m5, [ypxq+strideq*1+ 0]
+ pmaddwd m3, m5, [ypxq+strideq*1+32]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m4, m2
+ packssdw m2, m1
+ paddd m4, m1
+ packssdw m1, m0, m3
+ paddd m0, m3
+ vpermq m2, m2, q3120
+ paddd m4, m0
+ vpermq m1, m1, q3120
+ mova [acq+32*0], m2
+ mova [acq+32*1], m1
+ add acq, 32*2
+ sub hd, 2
+ jg .w16_loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad
+.w16_wpad:
+ mova m2, [ypxq+strideq*0+ 0]
+ mova m0, [ypxq+strideq*1+ 0]
+ cmp wpadd, 2
+ jl .w16_wpad1
+ je .w16_wpad2
+ vpbroadcastd m1, [ypxq+strideq*0+12]
+ vpbroadcastd m3, [ypxq+strideq*1+12]
+ vpblendd m2, m1, 0xf0
+ vpblendd m0, m3, 0xf0
+ jmp .w16_wpad_end
+.w16_wpad2:
+ vpbroadcastd m1, [ypxq+strideq*0+28]
+ vpbroadcastd m3, [ypxq+strideq*1+28]
+ jmp .w16_wpad_end
+.w16_wpad1:
+ vpbroadcastd m1, [ypxq+strideq*0+44]
+ vpbroadcastd m3, [ypxq+strideq*1+44]
+ vinserti128 m1, [ypxq+strideq*0+32], 0
+ vinserti128 m3, [ypxq+strideq*1+32], 0
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ REPX {pmaddwd x, m5}, m2, m0, m1, m3
+ paddd m4, m2
+ packssdw m2, m1
+ paddd m4, m1
+ packssdw m1, m0, m3
+ paddd m0, m3
+ vpermq m2, m2, q3120
+ paddd m4, m0
+ vpermq m1, m1, q3120
+ mova [acq+32*0], m2
+ mova [acq+32*1], m1
+ add acq, 32*2
+ sub hd, 2
+ jg .w16_wpad
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad
+
+cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ lea r6, [ipred_cfl_ac_444_16bpc_avx2_table]
+ tzcnt wd, wm
+ movifnidn hpadd, hpadm
+ vpbroadcastd m5, [pw_1]
+ movsxd wq, [r6+wq*4]
+ shl hpadd, 2
+ add wq, r6
+ mov hd, hm
+ pxor m4, m4
+ sub hd, hpadd
+ jmp wq
+.w4:
+ lea r3, [strideq*3]
+ mov r5, acq
+.w4_loop:
+ movq xm0, [ypxq+strideq*0]
+ movhps xm0, [ypxq+strideq*1]
+ vpbroadcastq m1, [ypxq+strideq*2]
+ vpbroadcastq m2, [ypxq+r3 ]
+ lea ypxq, [ypxq+strideq*4]
+ vpblendd m0, m1, 0x30
+ vpblendd m0, m2, 0xc0
+ psllw m0, 3
+ pmaddwd m1, m0, m5
+ mova [acq], m0
+ add acq, 32
+ paddd m4, m1
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ vpermq m0, m0, q3333
+ paddd m1, m1
+ mova [acq+32*0], m0
+ vpermq m1, m1, q3333
+ mova [acq+32*1], m0
+ add acq, 32*2
+ paddd m4, m1
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+.w8:
+ lea r3, [strideq*3]
+ mov r5, acq
+.w8_loop:
+ mova xm2, [ypxq+strideq*0]
+ vinserti128 m2, [ypxq+strideq*1], 1
+ mova xm1, [ypxq+strideq*2]
+ vinserti128 m1, [ypxq+r3 ], 1
+ lea ypxq, [ypxq+strideq*4]
+ psllw m2, 3
+ psllw m1, 3
+ mova [acq+32*0], m2
+ pmaddwd m2, m5
+ mova [acq+32*1], m1
+ pmaddwd m0, m1, m5
+ add acq, 32*2
+ paddd m4, m2
+ paddd m4, m0
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ vperm2i128 m1, m1, 0x11
+ pslld m0, 2
+ pxor m2, m2
+ vpblendd m0, m2, 0x0f
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
+.w16_wpad2:
+ vpbroadcastw m3, [ypxq+strideq*0+14]
+ vpbroadcastw m0, [ypxq+strideq*1+14]
+ vpblendd m2, m3, 0xf0
+ vpblendd m1, m0, 0xf0
+ jmp .w16_wpad_end
+.w16:
+ mov r5, acq
+.w16_loop:
+ mova m2, [ypxq+strideq*0]
+ mova m1, [ypxq+strideq*1]
+ test wpadd, wpadd
+ jnz .w16_wpad2
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ psllw m2, 3
+ psllw m1, 3
+ mova [acq+32*0], m2
+ pmaddwd m2, m5
+ mova [acq+32*1], m1
+ pmaddwd m0, m1, m5
+ add acq, 32*2
+ paddd m4, m2
+ paddd m4, m0
+ sub hd, 2
+ jg .w16_loop
+ add hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ paddd m0, m0
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
+.w32:
+ mov r5, acq
+ test wpadd, wpadd
+ jnz .w32_wpad
+.w32_loop:
+ mova m0, [ypxq+ 0]
+ mova m1, [ypxq+32]
+ add ypxq, strideq
+ psllw m0, 3
+ psllw m1, 3
+ pmaddwd m2, m0, m5
+ mova [acq+32*0], m0
+ pmaddwd m3, m1, m5
+ mova [acq+32*1], m1
+ add acq, 32*2
+ paddd m2, m3
+ paddd m4, m2
+ dec hd
+ jg .w32_loop
+.w32_hpad:
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ paddd m2, m2
+.w32_hpad_loop:
+ mova [acq+32*0], m0
+ mova [acq+32*1], m1
+ paddd m4, m2
+ mova [acq+32*2], m0
+ mova [acq+32*3], m1
+ add acq, 32*4
+ sub hpadd, 2
+ jg .w32_hpad_loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+.w32_wpad:
+ mova m0, [ypxq+ 0]
+ cmp wpadd, 4
+ jl .w32_wpad2
+ je .w32_wpad4
+ vpbroadcastw m1, [ypxq+14]
+ vpblendd m0, m1, 0xf0
+ jmp .w32_wpad_end
+.w32_wpad4:
+ vpbroadcastw m1, [ypxq+30]
+ jmp .w32_wpad_end
+.w32_wpad2:
+ vpbroadcastw m1, [ypxq+46]
+ vinserti128 m1, [ypxq+32], 0
+.w32_wpad_end:
+ add ypxq, strideq
+ psllw m0, 3
+ psllw m1, 3
+ pmaddwd m2, m0, m5
+ mova [acq+32*0], m0
+ pmaddwd m3, m1, m5
+ mova [acq+32*1], m1
+ add acq, 32*2
+ paddd m2, m3
+ paddd m4, m2
+ dec hd
+ jg .w32_wpad
+ jmp .w32_hpad
+
+cglobal pal_pred_16bpc, 4, 6, 6, dst, stride, pal, idx, w, h
+ vbroadcasti128 m4, [palq]
+ lea r2, [pal_pred_16bpc_avx2_table]
+ tzcnt wd, wm
+ vbroadcasti128 m5, [pal_pred_shuf]
+ movifnidn hd, hm
+ movsxd wq, [r2+wq*4]
+ pshufb m4, m5
+ punpckhqdq m5, m4, m4
+ add wq, r2
+DEFINE_ARGS dst, stride, stride3, idx, w, h
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq xm0, [idxq]
+ add idxq, 8
+ psrlw xm1, xm0, 4
+ punpcklbw xm0, xm1
+ pshufb xm1, xm4, xm0
+ pshufb xm2, xm5, xm0
+ punpcklbw xm0, xm1, xm2
+ punpckhbw xm1, xm2
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+strideq*1], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ pmovzxbw m2, [idxq]
+ add idxq, 16
+ psllw m1, m2, 4
+ por m2, m1
+ pshufb m1, m4, m2
+ pshufb m2, m5, m2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ vextracti128 [dstq+strideq*2], m0, 1
+ vextracti128 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ pshufd m3, [idxq], q3120
+ add idxq, 32
+ vpermq m3, m3, q3120
+ psrlw m1, m3, 4
+ punpcklbw m2, m3, m1
+ punpckhbw m3, m1
+ pshufb m1, m4, m2
+ pshufb m2, m5, m2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufb m1, m4, m3
+ pshufb m3, m5, m3
+ punpcklbw m0, m1, m3
+ punpckhbw m1, m3
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ pshufd m3, [idxq], q3120
+ add idxq, 32
+ vpermq m3, m3, q3120
+ psrlw m1, m3, 4
+ punpcklbw m2, m3, m1
+ punpckhbw m3, m1
+ pshufb m1, m4, m2
+ pshufb m2, m5, m2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+ 0], m0
+ mova [dstq+32], m1
+ pshufb m1, m4, m3
+ pshufb m3, m5, m3
+ punpcklbw m0, m1, m3
+ punpckhbw m1, m3
+ mova [dstq+strideq+ 0], m0
+ mova [dstq+strideq+32], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32
+ RET
+.w64:
+ pshufd m3, [idxq], q3120
+ add idxq, 32
+ vpermq m3, m3, q3120
+ psrlw m1, m3, 4
+ punpcklbw m2, m3, m1
+ punpckhbw m3, m1
+ pshufb m1, m4, m2
+ pshufb m2, m5, m2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ pshufb m1, m4, m3
+ pshufb m3, m5, m3
+ punpcklbw m0, m1, m3
+ punpckhbw m1, m3
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ add dstq, strideq
+ dec hd
+ jg .w64
+ RET
+
+%endif
diff --git a/third_party/dav1d/src/x86/ipred16_avx512.asm b/third_party/dav1d/src/x86/ipred16_avx512.asm
new file mode 100644
index 0000000000..8124a3b145
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred16_avx512.asm
@@ -0,0 +1,2049 @@
+; Copyright © 2022-2024, VideoLAN and dav1d authors
+; Copyright © 2022-2024, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+ipred_shuf: db 14, 15, 14, 15, 0, 1, 2, 3, 6, 7, 6, 7, 0, 1, 2, 3
+ db 10, 11, 10, 11, 8, 9, 10, 11, 2, 3, 2, 3, 8, 9, 10, 11
+ db 12, 13, 12, 13, 4, 5, 6, 7, 4, 5, 4, 5, 4, 5, 6, 7
+ db 8, 9, 8, 9, 12, 13, 14, 15, 0, 1, 0, 1, 12, 13, 14, 15
+smooth_perm: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
+ db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
+ db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
+ db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
+pal_pred_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51
+ db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55
+ db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
+ db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
+pw_31to0: dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
+ dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+pw_1to32: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+z_upsample: dw 0, -1, 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6
+ dw 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14
+z_xpos_mul: dw 1, 1, 1, 1, 2, 2, 1, 1, 3, 3, 2, 2, 4, 4, 2, 2
+ dw 5, 5, 3, 3, 6, 6, 3, 3, 7, 7, 4, 4, 8, 8, 4, 4
+z_ypos_mul: dw 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 1, 3, 3, 1, 1
+ dw 4, 4, 2, 2, 5, 5, 2, 2, 6, 6, 3, 3, 7, 7, 3, 3
+z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
+z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
+z_xpos_off1a: dw 30720, 30784, 30848, 30912, 30976, 31040, 31104, 31168
+z_xpos_off1b: dw 30720, 30848, 30976, 31104, 31232, 31360, 31488, 31616
+filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5
+ times 4 db 10, 11, 12, 13, 2, 3, -1, -1
+filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7
+ times 4 db 26, 27, 28, 29, 14, 15, -1, -1
+filter_permC: dd 8 ; dq 8, 10, 1, 11, 0, 9
+pw_1: times 2 dw 1
+ dd 10
+filter_rnd: dd 32
+ dd 1
+ dd 8
+ dd 11
+filter_shift: times 2 dw 6
+ dd 0
+ times 2 dw 4
+ dd 9
+pd_65536: dd 65536
+pal_unpack: db 0, 8, 4, 12, 32, 40, 36, 44
+ db 16, 24, 20, 28, 48, 56, 52, 60
+z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
+ db 39, 39, 47, 47, 47, 79, 79, 79
+z_filter_k: dw 8, 8, 6, 6, 4, 4
+ dw 4, 4, 5, 5, 4, 4
+ dw 0, 0, 0, 0, 2, 2
+pw_15: times 2 dw 15
+pw_16: times 2 dw 16
+pw_17: times 2 dw 17
+pw_24: times 2 dw 24
+pw_32: times 2 dw 32
+pw_63: times 2 dw 63
+pw_64: times 2 dw 64
+pw_512: times 2 dw 512
+pw_31806: times 2 dw 31806
+pw_32640: times 2 dw 32640
+pw_32672: times 2 dw 32672
+pw_32704: times 2 dw 32704
+pw_32735: times 2 dw 32735
+pw_32736: times 2 dw 32736
+
+%define pw_2 (z_xpos_mul+4* 2)
+%define pw_3 (z_xpos_mul+4* 4)
+%define pw_7 (z_xpos_mul+4*12)
+%define pw_0to31 (pw_1to32-2)
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+JMP_TABLE ipred_paeth_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64
+
+cextern smooth_weights_1d_16bpc
+cextern smooth_weights_2d_16bpc
+cextern dr_intra_derivative
+cextern filter_intra_taps
+
+SECTION .text
+
+%macro PAETH 3 ; top, signed_ldiff, ldiff
+ paddw m0, m%2, m2
+ psubw m1, m0, m3 ; tldiff
+ psubw m0, m%1 ; tdiff
+ pabsw m1, m1
+ pabsw m0, m0
+ pcmpgtw k1, m0, m1
+ pminsw m0, m1
+ pcmpgtw k2, m%3, m0
+ vpblendmw m0{k1}, m%1, m3
+ vpblendmw m0{k2}, m2, m0
+%endmacro
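+; i.e. the Paeth predictor: with base = left + top - topleft, each output
+; pixel is whichever of left, top or topleft is closest to base (ties prefer
+; left, then top).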
+
+INIT_ZMM avx512icl
+cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h
+%define base r6-ipred_paeth_16bpc_avx512icl_table
+ lea r6, [ipred_paeth_16bpc_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastw m3, [tlq] ; topleft
+ add wq, r6
+ jmp wq
+.w4:
+ vpbroadcastq m4, [tlq+2] ; top
+ movsldup m7, [base+ipred_shuf]
+ lea r6, [strideq*3]
+ psubw m5, m4, m3
+ pabsw m6, m5
+.w4_loop:
+ sub tlq, 16
+ vbroadcasti32x4 m2, [tlq]
+ pshufb m2, m7 ; left
+ PAETH 4, 5, 6
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm8, ym0, 1
+ vextracti32x4 xm9, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm8
+ movq [dstq+r6 ], xm9
+ sub hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm8
+ movhps [dstq+r6 ], xm9
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.w4_end:
+ RET
+.w8:
+ vbroadcasti32x4 m4, [tlq+2]
+ movsldup m7, [base+ipred_shuf]
+ lea r6, [strideq*3]
+ psubw m5, m4, m3
+ pabsw m6, m5
+.w8_loop:
+ sub tlq, 8
+ vpbroadcastq m2, [tlq]
+ pshufb m2, m7
+ PAETH 4, 5, 6
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+r6 ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16:
+ vbroadcasti32x8 m4, [tlq+2]
+ movsldup m7, [base+ipred_shuf]
+ psubw m5, m4, m3
+ pabsw m6, m5
+.w16_loop:
+ sub tlq, 4
+ vpbroadcastd m2, [tlq]
+ pshufb m2, m7
+ PAETH 4, 5, 6
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32:
+ movu m4, [tlq+2]
+ psubw m5, m4, m3
+ pabsw m6, m5
+.w32_loop:
+ sub tlq, 2
+ vpbroadcastw m2, [tlq]
+ PAETH 4, 5, 6
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+.w64:
+ movu m4, [tlq+ 2]
+ movu m7, [tlq+66]
+ psubw m5, m4, m3
+ psubw m8, m7, m3
+ pabsw m6, m5
+ pabsw m9, m8
+.w64_loop:
+ sub tlq, 2
+ vpbroadcastw m2, [tlq]
+ PAETH 4, 5, 6
+ mova [dstq+64*0], m0
+ PAETH 7, 8, 9
+ mova [dstq+64*1], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
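+; Vertical smooth prediction; conceptually
+;   pred(x, y) = bottom + ((w[y]*(top[x] - bottom) + 128) >> 8)
+; with the 1d weights stored prescaled so a single pmulhrsw applies them
+; with rounding.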
+cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
+%define base r6-$$
+ lea r6, [$$]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq*4]
+ lea weightsq, [base+smooth_weights_1d_16bpc+hq*4]
+ neg hq
+ vpbroadcastw m6, [tlq+hq*2] ; bottom
+ lea wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq]
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vpbroadcastq m5, [tlq+2] ; top
+ movsldup m4, [ipred_shuf]
+ psubw m5, m6 ; top - bottom
+.w4_loop:
+ vbroadcasti32x4 m3, [weightsq+hq*2]
+ pshufb m3, m4
+ pmulhrsw m3, m5
+ paddw m3, m6
+ vextracti32x4 xm0, m3, 3
+ vextracti32x4 xm1, ym3, 1
+ vextracti32x4 xm2, m3, 2
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ add hq, 8
+ jg .end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ jl .w4_loop
+.end:
+ RET
+.w8:
+ vbroadcasti32x4 m5, [tlq+2] ; top
+ movsldup m4, [ipred_shuf]
+ psubw m5, m6 ; top - bottom
+.w8_loop:
+ vpbroadcastq m0, [weightsq+hq*2]
+ pshufb m0, m4
+ pmulhrsw m0, m5
+ paddw m0, m6
+ vextracti32x4 [dstq+strideq*0], m0, 3
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ mova [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w8_loop
+ RET
+.w16:
+ vbroadcasti32x8 m5, [tlq+2] ; top
+ movsldup m4, [ipred_shuf]
+ psubw m5, m6 ; top - bottom
+.w16_loop:
+ vpbroadcastd m0, [weightsq+hq*2+0]
+ vpbroadcastd m1, [weightsq+hq*2+4]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ paddw m0, m6
+ paddw m1, m6
+ vextracti32x8 [dstq+strideq*0], m0, 1
+ mova [dstq+strideq*1], ym0
+ vextracti32x8 [dstq+strideq*2], m1, 1
+ mova [dstq+stride3q ], ym1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w16_loop
+ RET
+.w32:
+ movu m5, [tlq+2]
+ psubw m5, m6
+.w32_loop:
+ vpbroadcastw m0, [weightsq+hq*2+0]
+ vpbroadcastw m1, [weightsq+hq*2+2]
+ vpbroadcastw m2, [weightsq+hq*2+4]
+ vpbroadcastw m3, [weightsq+hq*2+6]
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w32_loop
+ RET
+.w64:
+ movu m4, [tlq+ 2]
+ movu m5, [tlq+66]
+ psubw m4, m6
+ psubw m5, m6
+.w64_loop:
+ vpbroadcastw m1, [weightsq+hq*2+0]
+ vpbroadcastw m3, [weightsq+hq*2+2]
+ pmulhrsw m0, m4, m1
+ pmulhrsw m1, m5
+ pmulhrsw m2, m4, m3
+ pmulhrsw m3, m5
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*0+64*1], m1
+ mova [dstq+strideq*1+64*0], m2
+ mova [dstq+strideq*1+64*1], m3
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w64_loop
+ RET
+
+cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3
+ lea r6, [$$]
+ mov wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m6, [tlq+wq*2] ; right
+ tzcnt wd, wd
+ add hd, hd
+ movsxd wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq*4]
+ sub tlq, hq
+ lea stride3q, [strideq*3]
+ lea wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq]
+ jmp wq
+.w4:
+ movsldup m4, [base+ipred_shuf]
+ vpbroadcastq m5, [base+smooth_weights_1d_16bpc+4*2]
+.w4_loop:
+ vbroadcasti32x4 m0, [tlq+hq-16] ; left
+ pshufb m0, m4
+ psubw m0, m6 ; left - right
+ pmulhrsw m0, m5
+ paddw m0, m6
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm2, ym0, 1
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ sub hd, 8*2
+ jl .end
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.end:
+ RET
+.w8:
+ movsldup m4, [base+ipred_shuf]
+ vbroadcasti32x4 m5, [base+smooth_weights_1d_16bpc+8*2]
+.w8_loop:
+ vpbroadcastq m0, [tlq+hq-8] ; left
+ pshufb m0, m4
+ psubw m0, m6 ; left - right
+ pmulhrsw m0, m5
+ paddw m0, m6
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4*2
+ jg .w8_loop
+ RET
+.w16:
+ movsldup m4, [base+ipred_shuf]
+ vbroadcasti32x8 m5, [base+smooth_weights_1d_16bpc+16*2]
+.w16_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ vpbroadcastd m1, [tlq+hq-8]
+ pshufb m0, m4
+ pshufb m1, m4
+ psubw m0, m6
+ psubw m1, m6
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ paddw m0, m6
+ paddw m1, m6
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hq, 4*2
+ jg .w16_loop
+ RET
+.w32:
+ movu m5, [base+smooth_weights_1d_16bpc+32*2]
+.w32_loop:
+ vpbroadcastq m3, [tlq+hq-8]
+ punpcklwd m3, m3
+ psubw m3, m6
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hq, 4*2
+ jg .w32_loop
+ RET
+.w64:
+ movu m4, [base+smooth_weights_1d_16bpc+64*2]
+ movu m5, [base+smooth_weights_1d_16bpc+64*3]
+.w64_loop:
+ vpbroadcastw m1, [tlq+hq-2]
+ vpbroadcastw m3, [tlq+hq-4]
+ psubw m1, m6
+ psubw m3, m6
+ pmulhrsw m0, m4, m1
+ pmulhrsw m1, m5
+ pmulhrsw m2, m4, m3
+ pmulhrsw m3, m5
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*0+64*1], m1
+ mova [dstq+strideq*1+64*0], m2
+ mova [dstq+strideq*1+64*1], m3
+ lea dstq, [dstq+strideq*2]
+ sub hq, 2*2
+ jg .w64_loop
+ RET
+
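+; 2-D smooth prediction; conceptually
+;   pred(x, y) = (wv[y]*top[x] + (256-wv[y])*bottom +
+;                 wh[x]*left[y] + (256-wh[x])*right + 256) >> 9
+; using pmaddwd/vpdpwssd on interleaved (top, bottom) and (left, right)
+; pairs, with vpermb+pavgw providing the final shift and rounding.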
+cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
+ lea r6, [$$]
+ mov wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m13, [tlq+wq*2] ; right
+ tzcnt wd, wd
+ add hd, hd
+ movsxd wq, [base+ipred_smooth_16bpc_avx512icl_table+wq*4]
+ mov r5d, 0x55555555
+ sub tlq, hq
+ mova m14, [base+smooth_perm]
+ kmovd k1, r5d
+ vpbroadcastw m0, [tlq] ; bottom
+ mov r5, 0x3333333333333333
+ pxor m15, m15
+ lea wq, [base+ipred_smooth_16bpc_avx512icl_table+wq]
+ kmovq k2, r5
+ lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*2]
+ jmp wq
+.w4:
+ vpbroadcastq m5, [tlq+hq+2]
+ movshdup m3, [base+ipred_shuf]
+ movsldup m4, [base+ipred_shuf]
+ vbroadcasti32x4 m6, [base+smooth_weights_2d_16bpc+4*4]
+ lea stride3q, [strideq*3]
+ punpcklwd m5, m0 ; top, bottom
+.w4_loop:
+ vbroadcasti32x4 m0, [v_weightsq]
+ vpbroadcastq m2, [tlq+hq-8]
+ mova m1, m13
+ pshufb m0, m3
+ pmaddwd m0, m5
+ pshufb m1{k2}, m2, m4 ; left, right
+ vpdpwssd m0, m1, m6
+ vpermb m0, m14, m0
+ pavgw ym0, ym15
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ add v_weightsq, 4*4
+ sub hd, 4*2
+ jg .w4_loop
+ RET
+.w8:
+ vbroadcasti32x4 ym5, [tlq+hq+2]
+ movshdup m6, [base+ipred_shuf]
+ movsldup m7, [base+ipred_shuf]
+ pmovzxwd m5, ym5
+ vbroadcasti32x8 m8, [base+smooth_weights_2d_16bpc+8*4]
+ lea stride3q, [strideq*3]
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+.w8_loop:
+ vpbroadcastq m0, [v_weightsq+0]
+ vpbroadcastq m1, [v_weightsq+8]
+ vpbroadcastd m3, [tlq+hq-4]
+ vpbroadcastd m4, [tlq+hq-8]
+ pshufb m0, m6
+ pmaddwd m0, m5
+ pshufb m1, m6
+ pmaddwd m1, m5
+ mova m2, m13
+ pshufb m2{k2}, m3, m7 ; left, right
+ mova m3, m13
+ pshufb m3{k2}, m4, m7
+ vpdpwssd m0, m2, m8
+ vpdpwssd m1, m3, m8
+ add v_weightsq, 4*4
+ vpermt2b m0, m14, m1
+ pavgw m0, m15
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4*2
+ jg .w8_loop
+ RET
+.w16:
+ pmovzxwd m5, [tlq+hq+2]
+ mova m6, [base+smooth_weights_2d_16bpc+16*4]
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+.w16_loop:
+ vpbroadcastd m0, [v_weightsq+0]
+ vpbroadcastd m1, [v_weightsq+4]
+ pmaddwd m0, m5
+ pmaddwd m1, m5
+ mova m2, m13
+ vpbroadcastw m2{k1}, [tlq+hq-2] ; left, right
+ mova m3, m13
+ vpbroadcastw m3{k1}, [tlq+hq-4]
+ vpdpwssd m0, m2, m6
+ vpdpwssd m1, m3, m6
+ add v_weightsq, 2*4
+ vpermt2b m0, m14, m1
+ pavgw m0, m15
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hq, 2*2
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxwd m5, [tlq+hq+ 2]
+ pmovzxwd m6, [tlq+hq+34]
+ mova m7, [base+smooth_weights_2d_16bpc+32*4]
+ mova m8, [base+smooth_weights_2d_16bpc+32*6]
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+ vpblendmw m6{k1}, m0, m6
+.w32_loop:
+ vpbroadcastd m2, [v_weightsq+0]
+ vpbroadcastd m3, [v_weightsq+4]
+ pmaddwd m0, m5, m2
+ pmaddwd m2, m6
+ pmaddwd m1, m5, m3
+ pmaddwd m3, m6
+ mova m4, m13
+ vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right
+ vpdpwssd m0, m4, m7
+ vpdpwssd m2, m4, m8
+ mova m4, m13
+ vpbroadcastw m4{k1}, [tlq+hq-4]
+ vpdpwssd m1, m4, m7
+ vpdpwssd m3, m4, m8
+ add v_weightsq, 2*4
+ vpermt2b m0, m14, m2
+ vpermt2b m1, m14, m3
+ pavgw m0, m15
+ pavgw m1, m15
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hq, 2*2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxwd m5, [tlq+hq+ 2]
+ pmovzxwd m6, [tlq+hq+34]
+ pmovzxwd m7, [tlq+hq+66]
+ pmovzxwd m8, [tlq+hq+98]
+ mova m9, [base+smooth_weights_2d_16bpc+64*4]
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+ mova m10, [base+smooth_weights_2d_16bpc+64*5]
+ vpblendmw m6{k1}, m0, m6
+ mova m11, [base+smooth_weights_2d_16bpc+64*6]
+ vpblendmw m7{k1}, m0, m7
+ mova m12, [base+smooth_weights_2d_16bpc+64*7]
+ vpblendmw m8{k1}, m0, m8
+.w64_loop:
+ vpbroadcastd m3, [v_weightsq]
+ mova m4, m13
+ vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right
+ pmaddwd m0, m5, m3
+ pmaddwd m2, m6, m3
+ pmaddwd m1, m7, m3
+ pmaddwd m3, m8
+ vpdpwssd m0, m4, m9
+ vpdpwssd m2, m4, m10
+ vpdpwssd m1, m4, m11
+ vpdpwssd m3, m4, m12
+ add v_weightsq, 1*4
+ vpermt2b m0, m14, m2
+ vpermt2b m1, m14, m3
+ pavgw m0, m15
+ pavgw m1, m15
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ sub hd, 1*2
+ jg .w64_loop
+ RET
+
+%if WIN64
+ DECLARE_REG_TMP 4
+%else
+ DECLARE_REG_TMP 8
+%endif
+
+cglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
+%define base r7-z_filter_t0
+ lea r7, [z_filter_t0]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ lea t0, [dr_intra_derivative]
+ movsxd wq, [base+ipred_z1_16bpc_avx512icl_table+wq*4]
+ add tlq, 2
+ mov dxd, angled
+ and dxd, 0x7e
+ add angled, 165 ; ~90
+ movzx dxd, word [t0+dxq]
+ lea wq, [base+ipred_z1_16bpc_avx512icl_table+wq]
+ movifnidn hd, hm
+ xor angled, 0x4ff ; d = 90 - angle
+ vpbroadcastd m15, [base+pw_31806]
+ jmp wq
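+; All z1 width paths interpolate between adjacent top pixels:
+;   pred = ((64 - frac)*top[base] + frac*top[base+1] + 32) >> 6
+; computed as top[base] + pmulhrsw(top[base+1] - top[base], frac << 9).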
+.w4:
+ vpbroadcastw m5, [tlq+14]
+ vinserti32x4 m5, [tlq], 0
+ cmp angleb, 40
+ jae .w4_no_upsample
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+ call .upsample_top
+ vpbroadcastq m0, [base+z_xpos_off1b]
+ jmp .w4_main2
+.w4_no_upsample:
+ test angled, 0x400
+ jnz .w4_main ; !enable_intra_edge_filter
+ lea r3d, [hq+3]
+ vpbroadcastb xm0, r3d
+ vpbroadcastb xm1, angled
+ shr angled, 8 ; is_sm << 1
+ vpcmpeqb k1, xm0, [base+z_filter_wh]
+ vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8]
+ kmovw r5d, k1
+ test r5d, r5d
+ jz .w4_main
+ call .w16_filter
+ mov r2d, 9
+ cmp hd, 4
+ cmovne r3d, r2d
+ vpbroadcastw m6, r3d
+ pminuw m6, [base+pw_0to31]
+ vpermw m5, m6, m5
+.w4_main:
+ vpbroadcastq m0, [base+z_xpos_off1a]
+.w4_main2:
+ movsldup m3, [base+z_xpos_mul]
+ vpbroadcastw m4, dxd
+ lea r2, [strideq*3]
+ pmullw m3, m4
+ vshufi32x4 m6, m5, m5, q3321
+ psllw m4, 3 ; dx*8
+ paddsw m3, m0 ; xpos
+ palignr m6, m5, 2 ; top+1
+.w4_loop:
+ psrlw m1, m3, 6 ; base_x
+ pand m2, m15, m3 ; frac
+ vpermw m0, m1, m5 ; top[base_x]
+ vpermw m1, m1, m6 ; top[base_x+1]
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r2 ], xm1
+ sub hd, 8
+ jl .w4_end
+ vextracti32x4 xm1, m0, 2
+ paddsw m3, m4 ; xpos += dx
+ lea dstq, [dstq+strideq*4]
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+r2 ], xm0
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.w4_end:
+ RET
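+; Intra edge upsampling; with a..d denoting 4 consecutive edge pixels this
+; in effect computes out = clip((9*(b+c) - (a+d) + 8) >> 4, 0, pixel_max),
+; done as (b+c) + (((b+c) - (a+d)) >> 3), then halved with rounding (pavgw).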
+.upsample_top:
+ vinserti32x4 m5, [tlq-16], 3
+ mova m3, [base+z_upsample]
+ vpbroadcastd m4, [base+pd_65536]
+ add dxd, dxd
+ vpermw m0, m3, m5
+ paddw m3, m4
+ vpermw m1, m3, m5
+ paddw m3, m4
+ vpermw m2, m3, m5
+ paddw m3, m4
+ vpermw m3, m3, m5
+ vpbroadcastw m5, r9m ; pixel_max
+ paddw m1, m2 ; b+c
+ paddw m0, m3 ; a+d
+ psubw m0, m1, m0
+ psraw m0, 3
+ pxor m2, m2
+ paddw m0, m1
+ pmaxsw m0, m2
+ pavgw m0, m2
+ pminsw m5, m0
+ ret
+.w8:
+ lea r3d, [angleq+216]
+ movu ym5, [tlq]
+ mov r3b, hb
+ movu m10, [base+pw_0to31]
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ lea r3d, [hq+7]
+ vpbroadcastw m6, r3d
+ add r3d, r3d
+ pminuw m6, m10
+ vpermw m5, m6, m5
+ call .upsample_top
+ vbroadcasti32x4 m0, [base+z_xpos_off1b]
+ jmp .w8_main2
+.w8_no_upsample:
+ lea r3d, [hq+7]
+ vpbroadcastb ym0, r3d
+ and r3d, 7
+ or r3d, 8 ; imin(h+7, 15)
+ vpbroadcastw m6, r3d
+ pminuw m6, m10
+ vpermw m5, m6, m5
+ test angled, 0x400
+ jnz .w8_main
+ vpbroadcastb ym1, angled
+ shr angled, 8
+ vpcmpeqb k1, ym0, [base+z_filter_wh]
+ mova xm0, [base+z_filter_t0+angleq*8]
+ vpcmpgtb k1{k1}, ym1, ym0
+ kmovd r5d, k1
+ test r5d, r5d
+ jz .w8_main
+ call .w16_filter
+ cmp hd, r3d
+ jl .w8_filter_end
+ pminud m6, m10, [base+pw_17] {1to16}
+ add r3d, 2
+.w8_filter_end:
+ vpermw m5, m6, m5
+.w8_main:
+ vbroadcasti32x4 m0, [base+z_xpos_off1a]
+.w8_main2:
+ movshdup m3, [base+z_xpos_mul]
+ vpbroadcastw m4, dxd
+ shl r3d, 6
+ lea r2, [strideq*3]
+ pmullw m3, m4
+ vshufi32x4 m6, m5, m5, q3321
+ sub r3d, dxd
+ psllw m4, 2 ; dx*4
+ shl dxd, 2
+ paddsw m3, m0 ; xpos
+ palignr m6, m5, 2 ; top+1
+.w8_loop:
+ psrlw m1, m3, 6 ; base_x
+ pand m2, m15, m3 ; frac
+ vpermw m0, m1, m5 ; top[base_x]
+ vpermw m1, m1, m6 ; top[base_x+1]
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+r2 ], m0, 3
+ sub hd, 4
+ jz .w8_end
+ paddsw m3, m4 ; xpos += dx
+ lea dstq, [dstq+strideq*4]
+ sub r3d, dxd
+ jg .w8_loop
+ vextracti32x4 xm5, m5, 3
+.w8_end_loop:
+ mova [dstq+strideq*0], xm5
+ mova [dstq+strideq*1], xm5
+ mova [dstq+strideq*2], xm5
+ mova [dstq+r2 ], xm5
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_end_loop
+.w8_end:
+ RET
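+; 5-tap intra edge filter: with (k0, k1, k2) = (8,4,0), (6,5,0) or (4,4,2)
+; selected by filter strength,
+;   out[i] = (k0*in[i] + k1*(in[i-1]+in[i+1]) + k2*(in[i-2]+in[i+2]) + 8) >> 4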
+.w16_filter:
+ vpbroadcastw m1, [tlq-2]
+ popcnt r5d, r5d
+ valignq m3, m6, m5, 2
+ vpbroadcastd m7, [base+z_filter_k+(r5-1)*4+12*0]
+ valignq m1, m5, m1, 6
+ vpbroadcastd m8, [base+z_filter_k+(r5-1)*4+12*1]
+ palignr m2, m3, m5, 2
+ vpbroadcastd m9, [base+z_filter_k+(r5-1)*4+12*2]
+ palignr m0, m5, m1, 14
+ pmullw m7, m5
+ palignr m3, m5, 4
+ paddw m0, m2
+ palignr m5, m1, 12
+ pmullw m0, m8
+ paddw m5, m3
+ pmullw m5, m9
+ pxor m1, m1
+ paddw m0, m7
+ paddw m5, m0
+ psrlw m5, 3
+ pavgw m5, m1
+ ret
+.w16:
+ lea r3d, [hq+15]
+ vpbroadcastb ym0, r3d
+ and r3d, 15
+ or r3d, 16 ; imin(h+15, 31)
+ vpbroadcastw m11, r3d
+ pminuw m10, m11, [base+pw_0to31]
+ vpbroadcastw m6, [tlq+r3*2]
+ vpermw m5, m10, [tlq]
+ test angled, 0x400
+ jnz .w16_main
+ vpbroadcastb ym1, angled
+ shr angled, 8
+ vpcmpeqb k1, ym0, [base+z_filter_wh]
+ mova xm0, [base+z_filter_t0+angleq*8]
+ vpcmpgtb k1{k1}, ym1, ym0
+ kmovd r5d, k1
+ test r5d, r5d
+ jz .w16_main
+ call .w16_filter
+ cmp hd, 16
+ jg .w16_filter_h32
+ vpermw m6, m11, m5
+ vpermw m5, m10, m5
+ jmp .w16_main
+.w16_filter_h32:
+ movzx r3d, word [tlq+62]
+ movzx r2d, word [tlq+60]
+ lea r2d, [r2+r3*8+4]
+ sub r2d, r3d
+ mov r3d, 1
+ shr r2d, 3
+ kmovb k1, r3d
+ movd xm0, r2d
+ or r3d, 32
+ vmovdqu16 m6{k1}, m0
+.w16_main:
+ rorx r2d, dxd, 23
+ mov r7, rsp
+ and rsp, ~63
+ vpbroadcastw m3, r2d
+ sub rsp, 64*2
+ mov r2d, dxd
+ paddw m4, m3, m3
+ mova [rsp+64*0], m5
+ vinserti32x8 m3, ym4, 1
+ mova [rsp+64*1], m6
+ shl r3d, 6
+.w16_loop:
+ lea r5d, [r2+dxq]
+ shr r2d, 6
+ movu ym0, [rsp+r2*2]
+ movu ym1, [rsp+r2*2+2]
+ lea r2d, [r5+dxq]
+ shr r5d, 6
+ vinserti32x8 m0, [rsp+r5*2], 1
+ vinserti32x8 m1, [rsp+r5*2+2], 1
+ pand m2, m15, m3 ; frac << 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w16_end
+ paddw m3, m4
+ lea dstq, [dstq+strideq*2]
+ cmp r2d, r3d
+ jl .w16_loop
+ punpckhqdq ym6, ym6
+.w16_end_loop:
+ mova [dstq+strideq*0], ym6
+ mova [dstq+strideq*1], ym6
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_end_loop
+.w16_end:
+ mov rsp, r7
+ RET
+.w32:
+ lea r3d, [hq+31]
+ movu m7, [tlq+64*0]
+ and r3d, 31
+ vpbroadcastw m11, r3d
+ or r3d, 32 ; imin(h+31, 63)
+ pminuw m10, m11, [base+pw_0to31]
+ vpbroadcastw m9, [tlq+r3*2]
+ vpermw m8, m10, [tlq+64*1]
+ test angled, 0x400
+ jnz .w32_main
+ vpbroadcastd m5, [base+pw_3]
+ mov r5d, ~1
+ movu m3, [tlq-2]
+ kmovd k1, r5d
+ valignq m2, m8, m7, 6
+ paddw m7, m3
+ vmovdqu16 m3{k1}, [tlq-4]
+ valignq m4, m9, m8, 2
+ paddw m3, m5
+ paddw m7, [tlq+2]
+ palignr m1, m8, m2, 14
+ pavgw m3, [tlq+4]
+ palignr m2, m8, m2, 12
+ paddw m7, m3
+ palignr m3, m4, m8, 2
+ psrlw m7, 2
+ palignr m4, m8, 4
+ paddw m8, m1
+ paddw m2, m5
+ paddw m8, m3
+ pavgw m2, m4
+ paddw m8, m2
+ psrlw m8, 2
+ cmp hd, 64
+ je .w32_filter_h64
+ vpermw m9, m11, m8
+ vpermw m8, m10, m8
+ jmp .w32_main
+.w32_filter_h64:
+ movzx r3d, word [tlq+126]
+ movzx r2d, word [tlq+124]
+ lea r2d, [r2+r3*8+4]
+ sub r2d, r3d
+ mov r3d, 65
+ shr r2d, 3
+ movd xm0, r2d
+ vpblendmw m9{k1}, m0, m9
+.w32_main:
+ rorx r2d, dxd, 23
+ mov r7, rsp
+ and rsp, ~63
+ vpbroadcastw m5, r2d
+ sub rsp, 64*4
+ mov r2d, dxd
+ mova [rsp+64*0], m7
+ shl r3d, 6
+ mova [rsp+64*1], m8
+ mova m6, m5
+ mova [rsp+64*2], m9
+ punpckhqdq m9, m9
+ mova [rsp+64*3], ym9
+.w32_loop:
+ lea r5d, [r2+dxq]
+ shr r2d, 6
+ movu m0, [rsp+r2*2]
+ movu m2, [rsp+r2*2+2]
+ lea r2d, [r5+dxq]
+ shr r5d, 6
+ movu m1, [rsp+r5*2]
+ movu m3, [rsp+r5*2+2]
+ pand m4, m15, m5
+ paddw m5, m6
+ psubw m2, m0
+ pmulhrsw m2, m4
+ pand m4, m15, m5
+ psubw m3, m1
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jz .w32_end
+ paddw m5, m6
+ lea dstq, [dstq+strideq*2]
+ cmp r2d, r3d
+ jl .w32_loop
+.w32_end_loop:
+ mova [dstq+strideq*0], m9
+ mova [dstq+strideq*1], m9
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_end_loop
+.w32_end:
+ mov rsp, r7
+ RET
+.w64_filter96:
+ vpbroadcastd m4, [base+pw_3]
+ mov r5d, ~1
+ movu m0, [tlq-2]
+ kmovd k1, r5d
+ paddw m7, m0
+ vmovdqu16 m0{k1}, [tlq-4]
+ paddw m0, m4
+ paddw m7, [tlq+2]
+ pavgw m0, [tlq+4]
+ valignq m1, m9, m8, 6
+ paddw m8, [tlq+62]
+ paddw m2, m4, [tlq+60]
+ valignq m3, m10, m9, 2
+ paddw m8, [tlq+66]
+ pavgw m2, [tlq+68]
+ paddw m7, m0
+ palignr m0, m9, m1, 14
+ paddw m8, m2
+ palignr m1, m9, m1, 12
+ psrlw m7, 2
+ palignr m2, m3, m9, 2
+ psrlw m8, 2
+ palignr m3, m9, 4
+ paddw m0, m9
+ paddw m1, m4
+ paddw m0, m2
+ pavgw m1, m3
+ paddw m0, m1
+ ret
+.w64:
+ movu m7, [tlq+64*0]
+ lea r3d, [hq-1]
+ movu m8, [tlq+64*1]
+ vpbroadcastw m11, [tlq+r3*2+128]
+ movu m9, [tlq+64*2]
+ cmp hd, 64
+ je .w64_h64
+ vpbroadcastw m13, r3d
+ or r3d, 64
+ pminuw m12, m13, [base+pw_0to31]
+ mova m10, m11
+ vpermw m9, m12, m9
+ test angled, 0x400
+ jnz .w64_main
+ call .w64_filter96
+ psrlw m0, 2
+ vpermw m9, m12, m0
+ vpermw m10, m13, m0
+ mova m11, m10
+ jmp .w64_main
+.w64_h64:
+ movu m10, [tlq+64*3]
+ or r3d, 64
+ test angled, 0x400
+ jnz .w64_main
+ call .w64_filter96
+ valignq m1, m10, m9, 6
+ valignq m3, m11, m10, 2
+ vpbroadcastd m11, [base+pw_63]
+ psrlw m9, m0, 2
+ palignr m0, m10, m1, 14
+ palignr m1, m10, m1, 12
+ palignr m2, m3, m10, 2
+ palignr m3, m10, 4
+ paddw m10, m0
+ paddw m1, m4
+ paddw m10, m2
+ pavgw m1, m3
+ paddw m10, m1
+ psrlw m10, 2
+ vpermw m11, m11, m10
+.w64_main:
+ rorx r2d, dxd, 23
+ mov r7, rsp
+ and rsp, ~63
+ vpbroadcastw m5, r2d
+ sub rsp, 64*6
+ mova [rsp+64*0], m7
+ mov r2d, dxd
+ mova [rsp+64*1], m8
+ lea r5, [rsp+r3*2]
+ mova [rsp+64*2], m9
+ shl r3d, 6
+ mova [rsp+64*3], m10
+ sub r2, r3
+ mova [rsp+64*4], m11
+ mova m6, m5
+ mova [rsp+64*5], m11
+.w64_loop:
+ mov r3, r2
+ sar r3, 6
+ movu m0, [r5+r3*2+64*0]
+ movu m2, [r5+r3*2+64*0+2]
+ movu m1, [r5+r3*2+64*1]
+ movu m3, [r5+r3*2+64*1+2]
+ pand m4, m15, m5
+ psubw m2, m0
+ pmulhrsw m2, m4
+ psubw m3, m1
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ dec hd
+ jz .w64_end
+ paddw m5, m6
+ add dstq, strideq
+ add r2, dxq
+ jl .w64_loop
+.w64_end_loop:
+ mova [dstq+64*0], m11
+ mova [dstq+64*1], m11
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ mov rsp, r7
+ RET
+
+cglobal ipred_z3_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
+ lea r7, [z_filter_t0]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ lea t0, [dr_intra_derivative+45*2-1]
+ movsxd wq, [base+ipred_z3_16bpc_avx512icl_table+wq*4]
+ sub angled, 180
+ mov dyd, angled
+ neg dyd
+ xor angled, 0x400
+ or dyq, ~0x7e
+ mova m0, [base+pw_31to0]
+ movzx dyd, word [t0+dyq]
+ lea wq, [base+ipred_z3_16bpc_avx512icl_table+wq]
+ movifnidn hd, hm
+ vpbroadcastd m14, [base+pw_31806]
+ vpbroadcastd m15, [base+pw_1]
+ jmp wq
+.w4:
+ lea r3d, [hq+3]
+ xor r3d, 31 ; 32 - (h + imin(w, h))
+ vpbroadcastw m7, r3d
+ pmaxuw m7, m0
+ vpermw m6, m7, [tlq-64*1]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ cmp angleb, 40
+ jae .w4_filter
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_filter ; h > 8 || (h == 8 && is_sm)
+ call .upsample
+ movsldup m1, [base+z_ypos_mul]
+ paddw m1, m1
+ jmp .w4_main2
+.w4_filter:
+ lea r3d, [hq+3]
+ call .filter32
+.w4_main:
+ movsldup m1, [base+z_ypos_mul]
+.w4_main2:
+ vpbroadcastq m0, [base+pw_1to32]
+ vpbroadcastw m4, dyd
+ lea r2d, [hq+4]
+ shr r2d, 3
+ pmullw m4, m0 ; ypos
+ vpbroadcastw m0, r2d
+ imul r2, strideq ; stride * imax(height / 8, 1)
+ pmullw m1, m0
+ lea r3, [r2*3]
+ paddd m1, [base+pw_32736] {1to16}
+ psrlw m2, m4, 6
+ psllw m4, 9
+ paddsw m2, m1 ; base+0
+ vpandd m4, m14 ; frac << 9
+ vpermw m3, m2, m6 ; left[base+0]
+.w4_loop:
+ paddsw m2, m15 ; base+1
+ vpermw m1, m2, m6 ; left[base+1]
+ psubw m0, m1, m3
+ pmulhrsw m0, m4
+ paddw m0, m3
+ movq [dstq+r2*0], xm0
+ movhps [dstq+r2*1], xm0
+ vextracti32x4 xm3, ym0, 1
+ movq [dstq+r2*2], xm3
+ movhps [dstq+r3 ], xm3
+ sub hd, 8
+ jl .w4_end
+ lea r5, [dstq+r2*4]
+ vextracti32x8 ym0, m0, 1
+ mova m3, m1
+ movq [r5+r2*0], xm0
+ movhps [r5+r2*1], xm0
+ vextracti32x4 xm1, ym0, 1
+ movq [r5+r2*2], xm1
+ movhps [r5+r3 ], xm1
+ add dstq, strideq
+ test hd, hd
+ jnz .w4_loop
+.w4_end:
+ RET
+.upsample:
+ vinserti32x4 m6, [tlq-14], 3
+ mova m3, [base+z_upsample]
+ vpbroadcastd m4, [base+pd_65536]
+ add dyd, dyd
+ vpermw m0, m3, m6
+ paddw m3, m4
+ vpermw m1, m3, m6
+ paddw m3, m4
+ vpermw m2, m3, m6
+ paddw m3, m4
+ vpermw m3, m3, m6
+ vpbroadcastw m6, r9m ; pixel_max
+ paddw m1, m2 ; b+c
+ paddw m0, m3 ; a+d
+ psubw m0, m1, m0
+ psraw m0, 3
+ pxor m2, m2
+ paddw m0, m1
+ pmaxsw m0, m2
+ pavgw m0, m2
+ pminsw m6, m0
+ ret
+.w8:
+ mova m6, [tlq-64*1]
+ cmp hd, 32
+ je .w8_h32
+ mov r3d, 8
+ cmp hd, 4
+ cmove r3d, hd
+ lea r3d, [r3+hq-1]
+ xor r3d, 31 ; 32 - (h + imin(w, h))
+ vpbroadcastw m1, r3d
+ vpermw m7, m1, m6
+ pmaxuw m1, m0
+ vpermw m6, m1, m6
+ test angled, 0x400
+ jnz .w8_main
+ lea r3d, [angleq+216]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_filter ; is_sm || d >= 40 || h > 8
+ call .upsample
+ movshdup m1, [base+z_ypos_mul]
+ paddw m1, m1
+ call .w8_main_setup
+.w8_upsample_loop:
+ vpermw m3, m2, m6 ; left[base+0]
+ paddw m2, m15 ; base+1
+ vpermw m1, m2, m6 ; left[base+1]
+ psubw m0, m1, m3
+ pmulhrsw m0, m4
+ paddw m2, m15 ; base+2
+ paddw m0, m3
+ mova m3, m1
+ mova [dstq+r2*0], xm0
+ vextracti32x4 [dstq+r2*1], ym0, 1
+ vextracti32x4 [dstq+r2*2], m0, 2
+ vextracti32x4 [dstq+r3 ], m0, 3
+ add dstq, strideq
+ sub hd, 4
+ jg .w8_upsample_loop
+ RET
+.w8_main_setup:
+ vbroadcasti32x4 m0, [base+pw_1to32]
+ vpbroadcastw m4, dyd
+ rorx r2d, hd, 2
+ pmullw m4, m0 ; ypos
+ vpbroadcastw m0, r2d
+ imul r2, strideq ; stride * height / 4
+ lea r3, [r2*3]
+ pmullw m1, m0 ; 0 1 2 3
+ paddd m1, [base+pw_32704] {1to16}
+ psrlw m2, m4, 6
+ psllw m4, 9
+ paddsw m2, m1 ; base+0
+ vpandd m4, m14 ; frac << 9
+ ret
+.w8_h32:
+ pmaxud m7, m0, [base+pw_24] {1to16}
+ vpermw m6, m0, m6
+ vpermw m7, m7, [tlq-64*2]
+ test angled, 0x400
+ jnz .w8_main
+ call .filter64
+ vpbroadcastd m0, [base+pw_7]
+ pminuw m0, [base+pw_0to31]
+ vpermw m7, m0, m7
+ jmp .w8_main
+.w8_filter:
+ lea r3d, [hq+7]
+ call .filter32
+.w8_main:
+ movshdup m1, [base+z_ypos_mul]
+ call .w8_main_setup
+ mova m3, m6
+ vpermt2w m3, m2, m7 ; left[base+0]
+.w8_loop:
+ paddsw m2, m15 ; base+1
+ mova m1, m6
+ vpermt2w m1, m2, m7 ; left[base+1]
+ psubw m0, m1, m3
+ pmulhrsw m0, m4
+ paddw m0, m3
+ mova m3, m1
+ mova [dstq+r2*0], xm0
+ vextracti32x4 [dstq+r2*1], ym0, 1
+ vextracti32x4 [dstq+r2*2], m0, 2
+ vextracti32x4 [dstq+r3 ], m0, 3
+ add dstq, strideq
+ sub hd, 4
+ jg .w8_loop
+ RET
+.filter32:
+ vpbroadcastb ym10, r3d
+ vpbroadcastb ym1, angled
+ shr angled, 8
+ vpcmpeqb k1, ym10, [base+z_filter_wh]
+ mova xm2, [base+z_filter_t0+angleq*8]
+ vpcmpgtb k1{k1}, ym1, ym2
+ kmovd r5d, k1
+ test r5d, r5d
+ jz .filter32_end
+ vpbroadcastw m2, [tlq]
+ popcnt r5d, r5d
+ vpbroadcastd m5, [base+z_filter_k+(r5-1)*4+12*0]
+ valignq m2, m6, m2, 6
+ vpbroadcastd m8, [base+z_filter_k+(r5-1)*4+12*1]
+ valignq m4, m7, m6, 2
+ vpbroadcastd m9, [base+z_filter_k+(r5-1)*4+12*2]
+ palignr m1, m6, m2, 14
+ pmullw m5, m6
+ palignr m3, m4, m6, 2
+ paddw m1, m3
+ palignr m2, m6, m2, 12
+ pmullw m1, m8
+ palignr m4, m6, 4
+ paddw m2, m4
+ pmullw m2, m9
+ pmovzxbw m10, ym10
+ pxor m6, m6
+ paddw m5, m1
+ pminuw m1, m10, [base+pw_0to31]
+ paddw m5, m2
+ psrlw m5, 3
+ pavgw m6, m5
+ vpermw m7, m10, m6
+ vpermw m6, m1, m6
+.filter32_end:
+ ret
+.w16:
+ mova m6, [tlq-64*1]
+ cmp hd, 32
+ jl .w16_h16
+ pmaxud m8, m0, [base+pw_16] {1to16}
+ mova m7, [tlq-64*2]
+ vpermw m6, m0, m6
+ jg .w16_h64
+ vpermw m7, m8, m7
+ test angled, 0x400
+ jnz .w16_main
+ call .filter64
+ vpbroadcastd m0, [base+pw_15]
+ vinserti32x8 m0, [base+pw_0to31], 0
+ vpermw m7, m0, m7
+ jmp .w16_main
+.w16_h16:
+ lea r3d, [hq*2-1]
+ xor r3d, 31 ; 32 - (h + imin(w, h))
+ vpbroadcastw m1, r3d
+ vpermw m7, m1, m6
+ pmaxuw m1, m0
+ vpermw m6, m1, m6
+ test angled, 0x400
+ jnz .w16_main
+ lea r3d, [hq+15]
+ call .filter32
+.w16_main:
+ vbroadcasti32x8 m0, [base+pw_1to32]
+ vpbroadcastw m4, dyd
+ rorx r2d, hd, 1
+ pmullw m4, m0 ; ypos
+ vpbroadcastw ym1, r2d
+ imul r2, strideq ; stride * height / 2
+ paddd m1, [base+pw_32704] {1to16}
+ lea r3, [r2+strideq]
+ psrlw m2, m4, 6
+ psllw m4, 9
+ paddsw m2, m1 ; base+0
+ vpandd m4, m14 ; frac << 9
+ mova m3, m6
+ vpermt2w m3, m2, m7 ; left[base+0]
+.w16_loop:
+ paddsw m1, m2, m15 ; base+1
+ paddsw m2, m1, m15 ; base+2
+ vpermi2w m1, m6, m7 ; left[base+1]
+ psubw m0, m1, m3
+ pmulhrsw m0, m4
+ paddw m0, m3
+ mova m3, m6
+ vpermt2w m3, m2, m7 ; left[base+2]
+ vextracti32x8 [dstq+strideq*0], m0, 1
+ mova [dstq+r2 ], ym0
+ psubw m0, m3, m1
+ pmulhrsw m0, m4
+ paddw m0, m1
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+r3 ], ym0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w16_h64:
+ vpermw m7, m0, m7
+ vpermw m8, m8, [tlq-64*3]
+ test angled, 0x400
+ jnz .w16_h64_main
+ valignq m11, m8, m7, 6
+ call .filter64
+ vshufi32x4 m2, m8, m8, q3321
+ vpbroadcastd m0, [base+pw_15]
+ palignr ym3, ym8, ym11, 12
+ vinserti32x8 m0, [base+pw_0to31], 0
+ palignr ym4, ym8, ym11, 14
+ palignr ym1, ym2, ym8, 4
+ paddw ym3, ym5
+ palignr ym2, ym8, 2
+ paddw ym8, ym4
+ pavgw ym3, ym1
+ paddw ym8, ym2
+ paddw ym8, ym3
+ psrlw ym8, 2
+ vpermw m8, m0, m8
+.w16_h64_main:
+ vbroadcasti32x8 m0, [base+pw_1to32]
+ vpbroadcastw m4, dyd
+ pmullw m4, m0 ; ypos
+ vpbroadcastd ym1, [base+pw_32]
+ paddd m1, [base+pw_32672] {1to16}
+ mov r2, strideq
+ shl r2, 5 ; stride*32
+ vpbroadcastd m9, [base+pw_32735]
+ lea r3, [r2+strideq]
+ psrlw m2, m4, 6
+ psllw m4, 9
+ paddsw m2, m1 ; base+0
+ vpandd m4, m14 ; frac << 9
+ mova m3, m7
+ vpermt2w m3, m2, m6
+ vpcmpgtw k1, m2, m9
+ vpermw m3{k1}, m2, m8 ; left[base+0]
+.w16_h64_loop:
+ paddsw m2, m15 ; base+1
+ mova m1, m7
+ vpermt2w m1, m2, m6
+ vpcmpgtw k1, m2, m9
+ vpermw m1{k1}, m2, m8 ; left[base+1]
+ psubw m0, m1, m3
+ pmulhrsw m0, m4
+ paddsw m2, m15 ; base+2
+ paddw m0, m3
+ mova m3, m7
+ vpermt2w m3, m2, m6
+ vpcmpgtw k1, m2, m9
+ vpermw m3{k1}, m2, m8 ; left[base+2]
+ vextracti32x8 [dstq+strideq*0], m0, 1
+ mova [dstq+r2 ], ym0
+ psubw m0, m3, m1
+ pmulhrsw m0, m4
+ paddw m0, m1
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+r3 ], ym0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 4
+ jg .w16_h64_loop
+ RET
+.filter64:
+ vpbroadcastw m2, [tlq]
+ vpbroadcastd m5, [base+pw_3]
+ valignq m2, m6, m2, 6
+ valignq m4, m7, m6, 2
+ valignq m10, m7, m6, 6
+ palignr m1, m6, m2, 12
+ palignr m2, m6, m2, 14
+ palignr m3, m4, m6, 4
+ paddw m1, m5
+ palignr m4, m6, 2
+ paddw m6, m2
+ valignq m2, m8, m7, 2
+ pavgw m1, m3
+ palignr m3, m7, m10, 12
+ paddw m6, m4
+ palignr m4, m7, m10, 14
+ paddw m6, m1
+ palignr m1, m2, m7, 4
+ psrlw m6, 2
+ palignr m2, m7, 2
+ paddw m3, m5
+ paddw m7, m4
+ pavgw m3, m1
+ paddw m7, m2
+ paddw m7, m3
+ psrlw m7, 2
+ ret
+.w32:
+ mova m6, [tlq-64*1]
+ cmp hd, 32
+ jl .w32_h16
+ mova m8, [tlq-64*2]
+ vpermw m6, m0, m6
+ vpermw m7, m0, m8
+ jg .w32_h64
+ test angled, 0x400
+ jnz .w32_main
+ vpbroadcastw xm8, xm8
+ jmp .w32_filter
+.w32_h16:
+ lea r3d, [hq*2-1]
+ xor r3d, 31 ; 32 - (h + imin(w, h))
+ vpbroadcastw m1, r3d
+ vpermw m7, m1, m6
+ pmaxuw m1, m0
+ vpermw m6, m1, m6
+ test angled, 0x400
+ jnz .w32_main
+ vextracti32x4 xm8, m7, 3
+.w32_filter:
+ call .filter64
+.w32_main:
+ vpbroadcastw m4, dyd
+ vpbroadcastd m1, [base+pw_32704]
+ pmullw m4, [base+pw_1to32] ; ypos
+ psrlw m2, m4, 6
+ psllw m4, 9
+ paddsw m2, m1 ; base+0
+ vpandd m4, m14 ; frac << 9
+ mova m3, m6
+ vpermt2w m3, m2, m7 ; left[base+0]
+.w32_loop:
+ paddsw m1, m2, m15 ; base+1
+ paddsw m2, m1, m15 ; base+2
+ vpermi2w m1, m6, m7 ; left[base+1]
+ psubw m0, m1, m3
+ pmulhrsw m0, m4
+ paddw m0, m3
+ mova m3, m6
+ vpermt2w m3, m2, m7 ; left[base+2]
+ mova [dstq+strideq*0], m0
+ psubw m0, m3, m1
+ pmulhrsw m0, m4
+ paddw m0, m1
+ mova [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w32_h64:
+ mova m9, [tlq-64*3]
+ vpermw m8, m0, m9
+ test angled, 0x400
+ jnz .w32_h64_main
+ vpbroadcastw xm9, xm9
+ call .filter96
+.w32_h64_main:
+ vpbroadcastw m4, dyd
+ vpbroadcastd m1, [base+pw_32672]
+ pmullw m4, [base+pw_1to32] ; ypos
+ vpbroadcastd m9, [base+pw_32735]
+ psrlw m2, m4, 6
+ psllw m4, 9
+ paddsw m2, m1 ; base+0
+ vpandd m4, m14 ; frac << 9
+ mova m3, m7
+ vpermt2w m3, m2, m6
+ vpcmpgtw k1, m2, m9
+ vpermw m3{k1}, m2, m8 ; left[base+0]
+.w32_h64_loop:
+ paddsw m2, m15 ; base+1
+ mova m1, m7
+ vpermt2w m1, m2, m6
+ vpcmpgtw k1, m2, m9
+ vpermw m1{k1}, m2, m8 ; left[base+1]
+ psubw m0, m1, m3
+ pmulhrsw m0, m4
+ paddsw m2, m15 ; base+2
+ paddw m0, m3
+ mova m3, m7
+ vpermt2w m3, m2, m6
+ vpcmpgtw k1, m2, m9
+ vpermw m3{k1}, m2, m8 ; left[base+2]
+ mova [dstq+strideq*0], m0
+ psubw m0, m3, m1
+ pmulhrsw m0, m4
+ paddw m0, m1
+ mova [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_h64_loop
+ RET
+.filter96:
+ valignq m11, m8, m7, 6
+ call .filter64
+ valignq m2, m9, m8, 2
+ palignr m3, m8, m11, 12
+ palignr m4, m8, m11, 14
+ palignr m1, m2, m8, 4
+ paddw m3, m5
+ palignr m2, m8, 2
+ paddw m8, m4
+ pavgw m3, m1
+ paddw m8, m2
+ paddw m8, m3
+ psrlw m8, 2
+ ret
+.w64:
+ mova m7, [tlq-64*1]
+ vpermw m6, m0, m7
+ cmp hd, 32
+ jl .w64_h16
+ mova m8, [tlq-64*2]
+ vpermw m7, m0, m8
+ jg .w64_h64
+ test angled, 0x400
+ jnz .w64_main
+ vpbroadcastw m8, xm8
+ mova m9, m8
+ call .filter96
+ vshufi32x4 m9, m8, m8, q3333
+ jmp .w64_h64_main
+.w64_h16:
+ vpbroadcastw m7, xm7
+ test angled, 0x400
+ jnz .w64_main
+ mova m8, m7
+ call .filter64
+.w64_main:
+ vpbroadcastw m11, dyd
+ vpbroadcastd m1, [base+pw_32704]
+ pmullw m10, m11, [base+pw_1to32] ; ypos
+ psllw m11, 5
+ psrlw m8, m10, 6
+ paddw m11, m10
+ psllw m10, 9
+ psrlw m9, m11, 6
+ psllw m11, 9
+ psubw m9, m8
+ paddsw m8, m1 ; base+0
+ vpandd m10, m14 ; frac << 9
+ vpandd m11, m14 ; frac << 9
+ mova m4, m6
+ vpermt2w m4, m8, m7 ; left[base+0] ( 0..31)
+ paddsw m5, m8, m9
+ vpermi2w m5, m6, m7 ; left[base+0] (32..63)
+.w64_loop:
+ paddsw m8, m15 ; base+1 ( 0..31)
+ mova m2, m6
+ vpermt2w m2, m8, m7 ; left[base+1] ( 0..31)
+ paddsw m3, m8, m9 ; base+1 (32..63)
+ vpermi2w m3, m6, m7 ; left[base+1] (32..63)
+ psubw m0, m2, m4
+ psubw m1, m3, m5
+ pmulhrsw m0, m10
+ pmulhrsw m1, m11
+ paddw m0, m4
+ paddw m1, m5
+ mova m4, m2
+ mova [dstq+64*0], m0
+ mova m5, m3
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w64_h64:
+ vpermw m8, m0, [tlq-64*3]
+ mova m13, [tlq-64*4]
+ vpermw m9, m0, m13
+ test angled, 0x400
+ jnz .w64_h64_main
+ valignq m12, m9, m8, 6
+ call .filter96
+ vpbroadcastw xm2, xm13
+ valignq m2, m9, 2
+ palignr m3, m9, m12, 12
+ palignr m4, m9, m12, 14
+ palignr m1, m2, m9, 4
+ paddw m3, m5
+ palignr m2, m9, 2
+ paddw m9, m4
+ pavgw m3, m1
+ paddw m9, m2
+ paddw m9, m3
+ psrlw m9, 2
+.w64_h64_main:
+ vpbroadcastw m11, dyd
+ vpbroadcastd m1, [base+pw_32640]
+ pmullw m10, m11, [base+pw_1to32] ; ypos
+ psllw m11, 5
+ psrlw m12, m10, 6
+ paddw m11, m10
+ psllw m10, 9
+ psrlw m13, m11, 6
+ psllw m11, 9
+ psubw m13, m12
+ paddsw m12, m1 ; base+0
+ vpandd m10, m14 ; frac << 9
+ vpandd m11, m14 ; frac << 9
+ vpbroadcastd m14, [base+pw_64]
+ mova m4, m6
+ vpermt2w m4, m12, m7
+ vptestmw k1, m12, m14
+ mova m0, m8
+ vpermt2w m0, m12, m9
+ paddsw m1, m12, m13
+ mova m5, m6
+ vpermt2w m5, m1, m7
+ vptestmw k2, m1, m14
+ vpermi2w m1, m8, m9
+ vmovdqu16 m4{k1}, m0 ; left[base+0] ( 0..31)
+ vmovdqu16 m5{k2}, m1 ; left[base+0] (32..63)
+.w64_h64_loop:
+ paddsw m12, m15 ; base+1
+ mova m2, m6
+ vpermt2w m2, m12, m7
+ vptestmw k1, m12, m14
+ mova m0, m8
+ vpermt2w m0, m12, m9
+ paddsw m1, m12, m13
+ mova m3, m6
+ vpermt2w m3, m1, m7
+ vptestmw k2, m1, m14
+ vpermi2w m1, m8, m9
+ vmovdqu16 m2{k1}, m0 ; left[base+1] ( 0..31)
+ vmovdqu16 m3{k2}, m1 ; left[base+1] (32..63)
+ psubw m0, m2, m4
+ psubw m1, m3, m5
+ pmulhrsw m0, m10
+ pmulhrsw m1, m11
+ paddw m0, m4
+ paddw m1, m5
+ mova m4, m2
+ mova [dstq+64*0], m0
+ mova m5, m3
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w64_h64_loop
+ RET
+
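+; Palette prediction: idx holds packed 4-bit palette indices (2 pixels per
+; byte); vpmultishiftqb with the pal_unpack bit offsets expands them to one
+; index per word, and vpermw then maps each index through the 8-entry
+; palette replicated in m6.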
+cglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3
+ lea r6, [pal_pred_16bpc_avx512icl_table]
+ tzcnt wd, wm
+ mova m3, [pal_pred_perm]
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastq m4, [pal_unpack+0]
+ vpbroadcastq m5, [pal_unpack+8]
+ add wq, r6
+ vbroadcasti32x4 m6, [palq]
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ pmovzxbd ym0, [idxq]
+ add idxq, 8
+ vpmultishiftqb ym0, ym4, ym0
+ vpermw ym0, ym0, ym6
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ pmovzxbd m0, [idxq]
+ add idxq, 16
+ vpmultishiftqb m0, m4, m0
+ vpermw m0, m0, m6
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ movu ym1, [idxq]
+ add idxq, 32
+ vpermb m1, m3, m1
+ vpmultishiftqb m1, m4, m1
+ vpermw m0, m1, m6
+ psrlw m1, 8
+ vpermw m1, m1, m6
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ vpermb m2, m3, [idxq]
+ add idxq, 64
+ vpmultishiftqb m1, m4, m2
+ vpmultishiftqb m2, m5, m2
+ vpermw m0, m1, m6
+ psrlw m1, 8
+ vpermw m1, m1, m6
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ vpermw m0, m2, m6
+ psrlw m2, 8
+ vpermw m1, m2, m6
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w32
+ RET
+.w64:
+ vpermb m2, m3, [idxq]
+ add idxq, 64
+ vpmultishiftqb m1, m4, m2
+ vpmultishiftqb m2, m5, m2
+ vpermw m0, m1, m6
+ psrlw m1, 8
+ vpermw m1, m1, m6
+ mova [dstq+ 0], m0
+ mova [dstq+64], m1
+ vpermw m0, m2, m6
+ psrlw m2, 8
+ vpermw m1, m2, m6
+ mova [dstq+strideq+ 0], m0
+ mova [dstq+strideq+64], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w64
+ RET
+
+; The ipred_filter SIMD processes 4x2 blocks in the following order, which
+; increases parallelism compared to doing things row by row.
+; w4 w8 w16 w32
+; 1 1 2 1 2 5 6 1 2 5 6 9 a d e
+; 2 2 3 2 3 6 7 2 3 6 7 a b e f
+; 3 3 4 3 4 7 8 3 4 7 8 b c f g
+; 4 4 5 4 5 8 9 4 5 8 9 c d g h
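+; (Each entry is the pass in which that 4x2 block is produced; a block only
+; depends on its top and left neighbours, so blocks on the same
+; anti-diagonal can be computed together.)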
+
+cglobal ipred_filter_16bpc, 4, 7, 14, dst, stride, tl, w, h, filter, top
+%define base r6-$$
+ lea r6, [$$]
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ shl filterd, 6
+ movifnidn hd, hm
+ movu xm0, [tlq-6]
+ pmovsxbw m7, [base+filter_intra_taps+filterq+32*0]
+ pmovsxbw m8, [base+filter_intra_taps+filterq+32*1]
+ mov r5d, r8m ; bitdepth_max
+ movsldup m9, [base+filter_permA]
+ movshdup m10, [base+filter_permA]
+ shr r5d, 11 ; is_12bpc
+ jnz .12bpc
+ psllw m7, 2 ; upshift multipliers so that packusdw
+ psllw m8, 2 ; will perform clipping for free
+.12bpc:
+ vpbroadcastd m5, [base+filter_rnd+r5*8]
+ vpbroadcastd m6, [base+filter_shift+r5*8]
+ sub wd, 8
+ jl .w4
+.w8:
+ call .main4
+ movsldup m11, [filter_permB]
+ lea r5d, [hq*2+2]
+ movshdup m12, [filter_permB]
+ lea topq, [tlq+2]
+ mova m13, [filter_permC]
+ sub hd, 4
+ vinserti32x4 ym0, [topq], 1 ; a0 b0 t0 t1
+ sub tlq, r5
+%if WIN64
+ push r7
+ push r8
+%endif
+ mov r7, dstq
+ mov r8d, hd
+.w8_loop:
+ movlps xm4, xm0, [tlq+hq*2]
+ call .main8
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jge .w8_loop
+ test wd, wd
+ jz .end
+ mov r2d, 0x0d
+ kmovb k1, r2d
+ lea r2, [strideq*3]
+.w16:
+ movd xmm0, [r7+strideq*1+12]
+ vpblendd xmm0, [topq+8], 0x0e ; t1 t2
+ pinsrw xm4, xmm0, [r7+strideq*0+14], 2
+ call .main8
+ add r7, 16
+ vinserti32x4 ym0, [topq+16], 1 ; a2 b2 t2 t3
+ mov hd, r8d
+ mov dstq, r7
+ add topq, 16
+.w16_loop:
+ movd xmm1, [dstq+strideq*2-4]
+ punpcklwd xm4, xmm1, xmm0
+ movd xmm0, [dstq+r2-4]
+ shufps xm4{k1}, xmm0, xm0, q3210
+ call .main8
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jge .w16_loop
+ sub wd, 8
+ jg .w16
+.end:
+ vpermb m2, m11, m0
+ mova ym1, ym5
+ vpdpwssd m1, m2, m7
+ vpermb m2, m12, m0
+ vpdpwssd m1, m2, m8
+%if WIN64
+ pop r8
+ pop r7
+%endif
+ vextracti32x8 ym2, m1, 1
+ paddd ym1, ym2
+ packusdw ym1, ym1
+ vpsrlvw ym1, ym6
+ vpermt2q m0, m13, m1
+ vextracti32x4 [dstq+strideq*0], m0, 2
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ RET
+.w4_loop:
+ movlps xm0, [tlq-10]
+ lea dstq, [dstq+strideq*2]
+ sub tlq, 4
+.w4:
+ call .main4
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ sub hd, 2
+ jg .w4_loop
+ RET
+ALIGN function_align
+.main4:
+ vpermb m2, m9, m0
+ mova ym1, ym5
+ vpdpwssd m1, m2, m7
+ vpermb m0, m10, m0
+ vpdpwssd m1, m0, m8
+ vextracti32x8 ym0, m1, 1
+ paddd ym0, ym1
+ vextracti32x4 xm1, ym0, 1
+ packusdw xm0, xm1 ; clip
+ vpsrlvw xm0, xm6
+ ret
+ALIGN function_align
+.main8:
+ vpermb m3, m11, m0
+ mova ym2, ym5
+ vpdpwssd m2, m3, m7
+ vpermb m3, m9, m4
+ mova ym1, ym5
+ vpdpwssd m1, m3, m7
+ vpermb m3, m12, m0
+ vpdpwssd m2, m3, m8
+ vpermb m3, m10, m4
+ vpdpwssd m1, m3, m8
+ vextracti32x8 ym4, m2, 1
+ vextracti32x8 ym3, m1, 1
+ paddd ym2, ym4
+ paddd ym1, ym3
+ packusdw ym1, ym2 ; clip
+ vpsrlvw ym1, ym6
+ vpermt2q m0, m13, m1 ; c0 d0 b0 b1 a0 a1
+ vextracti32x4 [dstq+strideq*0], m0, 2
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ ret
+
+%endif
diff --git a/third_party/dav1d/src/x86/ipred16_sse.asm b/third_party/dav1d/src/x86/ipred16_sse.asm
new file mode 100644
index 0000000000..5a311b1442
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred16_sse.asm
@@ -0,0 +1,4103 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+filter_shuf: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1
+pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
+z_base_inc_z2: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64
+z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+z2_upsample_l: db -1, -1, -2, -1, -3, -1, -4, -1, 8, 9, 8, 9, 10, 11, 12, 13
+ db 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
+z2_top_shufA: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+z2_top_shufB: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+z2_left_shufA: db 14, 15, 12, 13, 10, 11, 8, 9, 12, 13, 10, 11, 8, 9, 6, 7
+z2_left_shufB: db 14, 15, 10, 11, 6, 7, 2, 3, 12, 13, 8, 9, 4, 5, 0, 1
+z_filt_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1
+z_filt_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15
+ db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3
+z_filt_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0
+z_filt_wh4: db 7, 7, 19, 7,
+z_filt_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39
+ALIGN 8
+pb_2_3: times 4 db 2, 3
+z2_dy_offset: dw 96*64, 96*64, 95*64, 95*64
+z_filt_k: times 4 dw 8
+ times 4 dw 6
+ times 4 dw 4
+ times 4 dw 5
+pw_m3584: times 4 dw -3584
+pw_m3072: times 4 dw -3072
+pw_m2560: times 4 dw -2560
+pw_m2048: times 4 dw -2048
+pw_m1536: times 4 dw -1536
+pw_m1024: times 4 dw -1024
+pw_m512: times 4 dw -512
+pw_1: times 4 dw 1
+pw_2: times 4 dw 2
+pw_3: times 4 dw 3
+pw_62: times 4 dw 62
+pw_256: times 4 dw 256
+pw_512: times 4 dw 512
+pw_2048: times 4 dw 2048
+
+%define pw_4 (z_filt_k+8*2)
+%define pw_8 (z_filt_k+8*0)
+%define pw_m1to4 z2_upsample_l
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+%define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4)
+%define ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4)
+%define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4)
+
+JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64
+JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \
+ s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4
+JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1_16bpc, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z2_16bpc, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3_16bpc, ssse3, h4, h8, h16, h32, h64
+JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
+ s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32
+JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32
+JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64
+
+cextern smooth_weights_1d_16bpc
+cextern smooth_weights_2d_16bpc
+cextern dr_intra_derivative
+cextern filter_intra_taps
+
+SECTION .text
+
+INIT_XMM ssse3
+cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
+ LEA r5, ipred_dc_left_16bpc_ssse3_table
+ movd m4, wm
+ tzcnt wd, wm
+ add tlq, 2
+ movifnidn hd, hm
+ pxor m3, m3
+ pavgw m4, m3
+ movd m5, wd
+ movu m0, [tlq]
+ movsxd r6, [r5+wq*4]
+ add r6, r5
+ add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_left_16bpc_ssse3_table
+ mov hd, hm
+ movd m4, hm
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ pxor m3, m3
+ sub tlq, hq
+ pavgw m4, m3
+ movd m5, r6d
+ movu m0, [tlq]
+ movsxd r6, [r5+r6*4]
+ add r6, r5
+ add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ movu m2, [tlq+112]
+ movu m1, [tlq+ 96]
+ paddw m0, m2
+ movu m2, [tlq+ 80]
+ paddw m1, m2
+ movu m2, [tlq+ 64]
+ paddw m0, m2
+ paddw m0, m1
+.h32:
+ movu m1, [tlq+ 48]
+ movu m2, [tlq+ 32]
+ paddw m1, m2
+ paddw m0, m1
+.h16:
+ movu m1, [tlq+ 16]
+ paddw m0, m1
+.h8:
+ movhlps m1, m0
+ paddw m0, m1
+.h4:
+ punpcklwd m0, m3
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ psrld m0, m5
+ lea stride3q, [strideq*3]
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ jmp wq
+
+cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd m4, r5d
+ tzcnt r5d, r5d
+ movd m5, r5d
+ LEA r5, ipred_dc_16bpc_ssse3_table
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+5*4]
+ pxor m3, m3
+ psrlw m4, 1
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movq m0, [tlq-8]
+ jmp wq
+.w4:
+ movq m1, [tlq+2]
+ paddw m1, m0
+ punpckhwd m0, m3
+ punpcklwd m1, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ cmp hd, 4
+ jg .w4_mul
+ psrlw m0, 3
+ jmp .w4_end
+.w4_mul:
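+    ; 0xAAAB ~ (2/3)<<16 and 0x6667 ~ (2/5)<<16: pmulhuw by one of these
+    ; plus a final psrlw 1 divides the edge sum by 3 or 5 (after the
+    ; power-of-two shift) when w+h is not a power of two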
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ cmp hd, 16
+ cmove r2d, r3d
+ psrld m0, 2
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w4_end:
+ pshuflw m0, m0, q0000
+.s4:
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+.h8:
+ mova m0, [tlq-16]
+ jmp wq
+.w8:
+ movu m1, [tlq+2]
+ paddw m0, m1
+ punpcklwd m1, m0, m3
+ punpckhwd m0, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ psrld m0, m5
+ cmp hd, 8
+ je .w8_end
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ cmp hd, 32
+ cmove r2d, r3d
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w8_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s8:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+.h16:
+ mova m0, [tlq-32]
+ paddw m0, [tlq-16]
+ jmp wq
+.w16:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+18]
+ paddw m1, m2
+ paddw m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ psrld m0, m5
+ cmp hd, 16
+ je .w16_end
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ test hd, 8|32
+ cmovz r2d, r3d
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w16_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s16c:
+ mova m1, m0
+.s16:
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m1
+ mova [dstq+strideq*1+16*0], m0
+ mova [dstq+strideq*1+16*1], m1
+ mova [dstq+strideq*2+16*0], m0
+ mova [dstq+strideq*2+16*1], m1
+ mova [dstq+stride3q +16*0], m0
+ mova [dstq+stride3q +16*1], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+.h32:
+ mova m0, [tlq-64]
+ paddw m0, [tlq-48]
+ paddw m0, [tlq-32]
+ paddw m0, [tlq-16]
+ jmp wq
+.w32:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+18]
+ paddw m1, m2
+ movu m2, [tlq+34]
+ paddw m0, m2
+ movu m2, [tlq+50]
+ paddw m1, m2
+ paddw m0, m1
+ punpcklwd m1, m0, m3
+ punpckhwd m0, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ psrld m0, m5
+ cmp hd, 32
+ je .w32_end
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ cmp hd, 8
+ cmove r2d, r3d
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w32_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s32c:
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+.s32:
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m1
+ mova [dstq+strideq*0+16*2], m2
+ mova [dstq+strideq*0+16*3], m3
+ mova [dstq+strideq*1+16*0], m0
+ mova [dstq+strideq*1+16*1], m1
+ mova [dstq+strideq*1+16*2], m2
+ mova [dstq+strideq*1+16*3], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s32
+ RET
+.h64:
+ mova m0, [tlq-128]
+ mova m1, [tlq-112]
+ paddw m0, [tlq- 96]
+ paddw m1, [tlq- 80]
+ paddw m0, [tlq- 64]
+ paddw m1, [tlq- 48]
+ paddw m0, [tlq- 32]
+ paddw m1, [tlq- 16]
+ paddw m0, m1
+ jmp wq
+.w64:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+ 18]
+ paddw m1, m2
+ movu m2, [tlq+ 34]
+ paddw m0, m2
+ movu m2, [tlq+ 50]
+ paddw m1, m2
+ movu m2, [tlq+ 66]
+ paddw m0, m2
+ movu m2, [tlq+ 82]
+ paddw m1, m2
+ movu m2, [tlq+ 98]
+ paddw m0, m2
+ movu m2, [tlq+114]
+ paddw m1, m2
+ paddw m0, m1
+ punpcklwd m1, m0, m3
+ punpckhwd m0, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ psrld m0, m5
+ cmp hd, 64
+ je .w64_end
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ cmp hd, 16
+ cmove r2d, r3d
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w64_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s64:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m0
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m0
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m0
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m0
+ add dstq, strideq
+ dec hd
+ jg .s64
+ RET
+
+cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
+ mov r6d, r8m
+ LEA r5, ipred_dc_128_16bpc_ssse3_table
+ tzcnt wd, wm
+ shr r6d, 11
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ movddup m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_splat_16bpc_ssse3_table
+ movifnidn hd, hm
+ movu m0, [tlq+ 2]
+ movu m1, [tlq+ 18]
+ movu m2, [tlq+ 34]
+ movu m3, [tlq+ 50]
+ cmp wd, 64
+ je .w64
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w64:
+ WIN64_SPILL_XMM 8
+ movu m4, [tlq+ 66]
+ movu m5, [tlq+ 82]
+ movu m6, [tlq+ 98]
+ movu m7, [tlq+114]
+.w64_loop:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ mova [dstq+16*4], m4
+ mova [dstq+16*5], m5
+ mova [dstq+16*6], m6
+ mova [dstq+16*7], m7
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
+%define base r5-ipred_h_16bpc_ssse3_table
+ tzcnt wd, wm
+ LEA r5, ipred_h_16bpc_ssse3_table
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ movddup m2, [base+pw_256]
+ movddup m3, [base+pb_2_3]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ sub tlq, 8
+ movq m3, [tlq]
+ pshuflw m0, m3, q3333
+ pshuflw m1, m3, q2222
+ pshuflw m2, m3, q1111
+ pshuflw m3, m3, q0000
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m1
+ movq [dstq+strideq*2], m2
+ movq [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ sub tlq, 8
+ movq m3, [tlq]
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ sub tlq, 4
+ movd m1, [tlq]
+ pshufb m0, m1, m3
+ pshufb m1, m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m0
+ mova [dstq+strideq*1+16*0], m1
+ mova [dstq+strideq*1+16*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16
+ RET
+.w32:
+ sub tlq, 4
+ movd m1, [tlq]
+ pshufb m0, m1, m3
+ pshufb m1, m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m0
+ mova [dstq+strideq*0+16*2], m0
+ mova [dstq+strideq*0+16*3], m0
+ mova [dstq+strideq*1+16*0], m1
+ mova [dstq+strideq*1+16*1], m1
+ mova [dstq+strideq*1+16*2], m1
+ mova [dstq+strideq*1+16*3], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32
+ RET
+.w64:
+ sub tlq, 2
+ movd m0, [tlq]
+ pshufb m0, m2
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m0
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m0
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m0
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m0
+ add dstq, strideq
+ dec hd
+ jg .w64
+ RET
+
+cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left
+%define base r5-ipred_paeth_16bpc_ssse3_table
+ movifnidn hd, hm
+ pshuflw m4, [tlq], q0000
+ mov leftq, tlq
+ add hd, hd
+ punpcklqdq m4, m4 ; topleft
+ sub leftq, hq
+ and wd, ~7
+ jnz .w8
+ movddup m5, [tlq+2] ; top
+ psubw m6, m5, m4
+ pabsw m7, m6
+.w4_loop:
+ movd m1, [leftq+hq-4]
+ punpcklwd m1, m1
+ punpckldq m1, m1 ; left
+%macro PAETH 0
+ paddw m0, m6, m1
+ psubw m2, m4, m0 ; tldiff
+ psubw m0, m5 ; tdiff
+ pabsw m2, m2
+ pabsw m0, m0
+ pminsw m2, m0
+ pcmpeqw m0, m2
+ pand m3, m5, m0
+ pandn m0, m4
+ por m0, m3
+ pcmpgtw m3, m7, m2
+ pand m0, m3
+ pandn m3, m1
+ por m0, m3
+%endmacro
+ PAETH
+ movhps [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2*2
+ jg .w4_loop
+ RET
+.w8:
+%if ARCH_X86_32
+ PUSH r6
+ %define r7d hm
+ %assign regs_used 7
+%elif WIN64
+ movaps r4m, m8
+ PUSH r7
+ %assign regs_used 8
+%endif
+%if ARCH_X86_64
+ movddup m8, [pw_256]
+%endif
+ lea tlq, [tlq+wq*2+2]
+ neg wq
+ mov r7d, hd
+.w8_loop0:
+ movu m5, [tlq+wq*2]
+ mov r6, dstq
+ add dstq, 16
+ psubw m6, m5, m4
+ pabsw m7, m6
+.w8_loop:
+ movd m1, [leftq+hq-2]
+%if ARCH_X86_64
+ pshufb m1, m8
+%else
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+%endif
+ PAETH
+ mova [r6], m0
+ add r6, strideq
+ sub hd, 1*2
+ jg .w8_loop
+ mov hd, r7d
+ add wq, 8
+ jl .w8_loop0
+%if WIN64
+ movaps m8, r4m
+%endif
+ RET
+
+%if ARCH_X86_64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 4
+%endif
+
+cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights
+ LEA weightsq, smooth_weights_1d_16bpc
+ mov hd, hm
+ lea weightsq, [weightsq+hq*4]
+ neg hq
+ movd m5, [tlq+hq*2] ; bottom
+ pshuflw m5, m5, q0000
+ punpcklqdq m5, m5
+ cmp wd, 4
+ jne .w8
+ movddup m4, [tlq+2] ; top
+ lea r3, [strideq*3]
+ psubw m4, m5 ; top - bottom
+.w4_loop:
+ movq m1, [weightsq+hq*2]
+ punpcklwd m1, m1
+ pshufd m0, m1, q1100
+ punpckhdq m1, m1
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ paddw m0, m5
+ paddw m1, m5
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+r3 ], m1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w4_loop
+ RET
+.w8:
+%if ARCH_X86_32
+ PUSH r6
+ %assign regs_used 7
+ mov hm, hq
+ %define hq hm
+%elif WIN64
+ PUSH r7
+ %assign regs_used 8
+%endif
+.w8_loop0:
+ mov t0, hq
+ movu m4, [tlq+2]
+ add tlq, 16
+ mov r6, dstq
+ add dstq, 16
+ psubw m4, m5
+.w8_loop:
+ movq m3, [weightsq+t0*2]
+ punpcklwd m3, m3
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [r6+strideq*0], m0
+ mova [r6+strideq*1], m1
+ lea r6, [r6+strideq*2]
+ mova [r6+strideq*0], m2
+ mova [r6+strideq*1], m3
+ lea r6, [r6+strideq*2]
+ add t0, 4
+ jl .w8_loop
+ sub wd, 8
+ jg .w8_loop0
+ RET
+
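+; Horizontal smooth prediction: every column is a weighted blend of the left
+; column and the top-right neighbour, with per-column weights from
+; smooth_weights_1d.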
+cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights
+ LEA weightsq, smooth_weights_1d_16bpc
+ mov wd, wm
+ movifnidn hd, hm
+ movd m5, [tlq+wq*2] ; right
+ sub tlq, 8
+ add hd, hd
+ pshuflw m5, m5, q0000
+ sub tlq, hq
+ punpcklqdq m5, m5
+ cmp wd, 4
+ jne .w8
+ movddup m4, [weightsq+4*2]
+ lea r3, [strideq*3]
+.w4_loop:
+ movq m1, [tlq+hq] ; left
+ punpcklwd m1, m1
+ psubw m1, m5 ; left - right
+ pshufd m0, m1, q3322
+ punpckldq m1, m1
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ paddw m0, m5
+ paddw m1, m5
+ movhps [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movhps [dstq+strideq*2], m1
+ movq [dstq+r3 ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4*2
+ jg .w4_loop
+ RET
+.w8:
+ lea weightsq, [weightsq+wq*4]
+ neg wq
+%if ARCH_X86_32
+ PUSH r6
+ %assign regs_used 7
+ %define hd hm
+%elif WIN64
+ PUSH r7
+ %assign regs_used 8
+%endif
+.w8_loop0:
+ mov t0d, hd
+ mova m4, [weightsq+wq*2]
+ mov r6, dstq
+ add dstq, 16
+.w8_loop:
+ movq m3, [tlq+t0*(1+ARCH_X86_32)]
+ punpcklwd m3, m3
+ psubw m3, m5
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [r6+strideq*0], m0
+ mova [r6+strideq*1], m1
+ lea r6, [r6+strideq*2]
+ mova [r6+strideq*0], m2
+ mova [r6+strideq*1], m3
+ lea r6, [r6+strideq*2]
+ sub t0d, 4*(1+ARCH_X86_64)
+ jg .w8_loop
+ add wq, 8
+ jl .w8_loop0
+ RET
+
+%if ARCH_X86_64
+DECLARE_REG_TMP 10
+%else
+DECLARE_REG_TMP 3
+%endif
+
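+; 2-D smooth prediction: the average of the vertical and horizontal smooth
+; blends, using the smooth_weights_2d table with the bottom-left and
+; top-right neighbours as the far references.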
+cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \
+ h_weights, v_weights, top
+ LEA h_weightsq, smooth_weights_2d_16bpc
+ mov wd, wm
+ mov hd, hm
+ movd m7, [tlq+wq*2] ; right
+ lea v_weightsq, [h_weightsq+hq*8]
+ neg hq
+ movd m6, [tlq+hq*2] ; bottom
+ pshuflw m7, m7, q0000
+ pshuflw m6, m6, q0000
+ cmp wd, 4
+ jne .w8
+ movq m4, [tlq+2] ; top
+ mova m5, [h_weightsq+4*4]
+ punpcklwd m4, m6 ; top, bottom
+ pxor m6, m6
+.w4_loop:
+ movq m1, [v_weightsq+hq*4]
+ sub tlq, 4
+ movd m3, [tlq] ; left
+ pshufd m0, m1, q0000
+ pshufd m1, m1, q1111
+ pmaddwd m0, m4
+ punpcklwd m3, m7 ; left, right
+ pmaddwd m1, m4
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ paddd m0, m2
+ paddd m1, m3
+ psrld m0, 8
+ psrld m1, 8
+ packssdw m0, m1
+ pavgw m0, m6
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+%if ARCH_X86_32
+ lea h_weightsq, [h_weightsq+wq*4]
+ mov t0, tlq
+ mov r1m, tlq
+ mov r2m, hq
+ %define m8 [h_weightsq+16*0]
+ %define m9 [h_weightsq+16*1]
+%else
+%if WIN64
+ movaps r4m, m8
+ movaps r6m, m9
+ PUSH r7
+ PUSH r8
+%endif
+ PUSH r9
+ PUSH r10
+ %assign regs_used 11
+ lea h_weightsq, [h_weightsq+wq*8]
+ lea topq, [tlq+wq*2]
+ neg wq
+ mov r8, tlq
+ mov r9, hq
+%endif
+ punpcklqdq m6, m6
+.w8_loop0:
+%if ARCH_X86_32
+ movu m5, [t0+2]
+ add t0, 16
+ mov r0m, t0
+%else
+ movu m5, [topq+wq*2+2]
+ mova m8, [h_weightsq+wq*4+16*0]
+ mova m9, [h_weightsq+wq*4+16*1]
+%endif
+ mov t0, dstq
+ add dstq, 16
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+.w8_loop:
+ movd m1, [v_weightsq+hq*4]
+ sub tlq, 2
+ movd m3, [tlq] ; left
+ pshufd m1, m1, q0000
+ pmaddwd m0, m4, m1
+ pshuflw m3, m3, q0000
+ pmaddwd m1, m5
+ punpcklwd m3, m7 ; left, right
+ pmaddwd m2, m8, m3
+ pmaddwd m3, m9
+ paddd m0, m2
+ paddd m1, m3
+ psrld m0, 8
+ psrld m1, 8
+ packssdw m0, m1
+ pxor m1, m1
+ pavgw m0, m1
+ mova [t0], m0
+ add t0, strideq
+ inc hq
+ jl .w8_loop
+%if ARCH_X86_32
+ mov t0, r0m
+ mov tlq, r1m
+ add h_weightsq, 16*2
+ mov hq, r2m
+ sub dword wm, 8
+ jg .w8_loop0
+%else
+ mov tlq, r8
+ mov hq, r9
+ add wq, 8
+ jl .w8_loop0
+%endif
+%if WIN64
+ movaps m8, r4m
+ movaps m9, r6m
+%endif
+ RET
+
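+; Directional prediction, zone 1 (angle < 90): only the top edge is used.
+; Each output pixel interpolates between two adjacent top samples at a
+; fractional position that advances by dx per row; for small blocks at sharp
+; angles the edge is 2x upsampled, otherwise it may be smoothed (.filter_edge).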
+%if ARCH_X86_64
+cglobal ipred_z1_16bpc, 3, 8, 8, 16*18, dst, stride, tl, w, h, angle, dx
+ %define base r7-$$
+ %define bdmaxm r8m
+ lea r7, [$$]
+%else
+cglobal ipred_z1_16bpc, 3, 7, 8, -16*18, dst, stride, tl, w, h, angle, dx
+ %define base r1-$$
+ %define stridemp [rsp+4*0]
+ %define bdmaxm [rsp+4*1]
+ mov r3, r8m
+ mov stridemp, r1
+ mov bdmaxm, r3
+ LEA r1, $$
+%endif
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ add tlq, 2
+ movsxd wq, [base+ipred_z1_16bpc_ssse3_table+wq*4]
+ mov dxd, angled
+ movddup m0, [base+pw_256]
+ and dxd, 0x7e
+ movddup m7, [base+pw_62]
+ add angled, 165 ; ~90
+ lea wq, [base+wq+ipred_z1_16bpc_ssse3_table]
+ movzx dxd, word [base+dr_intra_derivative+dxq]
+ xor angled, 0x4ff ; d = 90 - angle
+ jmp wq
+.w4:
+ lea r3d, [angleq+88]
+ test r3d, 0x480
+ jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40
+ sar r3d, 9
+ add r3d, hd
+ cmp r3d, 8
+ jg .w4_no_upsample ; h > 8 || (w == h && is_sm)
+ movd m3, [tlq+14]
+ movu m2, [tlq+ 0] ; 1 2 3 4 5 6 7 8
+ movd m1, bdmaxm
+ pshufb m3, m0
+ palignr m4, m3, m2, 4 ; 3 4 5 6 7 8 8 8
+ paddw m4, [tlq- 2] ; 0 1 2 3 4 5 6 7
+ add dxd, dxd
+ mova [rsp+32], m3
+ palignr m3, m2, 2 ; 2 3 4 5 6 7 8 8
+ pshufb m1, m0
+ paddw m3, m2 ; -1 * a + 9 * b + 9 * c + -1 * d
+ psubw m5, m3, m4 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4
+ movd m4, dxd
+ psraw m5, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1
+ paddw m3, m5
+ pxor m5, m5
+ pmaxsw m3, m5
+ mov r3d, dxd
+ pavgw m3, m5
+ pshufb m4, m0
+ pminsw m3, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ mova m3, [base+z_upsample]
+ movifnidn strideq, stridemp
+ mova [rsp+ 0], m1
+ paddw m5, m4, m4
+ mova [rsp+16], m2
+ punpcklqdq m4, m5 ; xpos0 xpos1
+.w4_upsample_loop:
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu m1, [rsp+r3*2]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base1
+ movu m2, [rsp+r2*2]
+ pshufb m1, m3
+ pshufb m2, m3
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ pand m2, m7, m4 ; frac
+ psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6
+ psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6)
+ pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15)
+ paddw m4, m5 ; xpos += dx
+ paddw m0, m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_upsample_loop
+ RET
+.w4_no_upsample:
+ mov r3d, 7 ; max_base
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ lea r3d, [hq+3]
+ movd m1, r3d
+ movd m3, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m2, m2
+ pshufb m1, m2
+ pshufb m3, m2
+ pcmpeqb m1, [base+z_filt_wh4]
+ pand m1, m3
+ pcmpgtb m1, [base+z_filt_t_w48+angleq*8]
+ pmovmskb r5d, m1
+ mov r3d, 7
+ test r5d, r5d
+ jz .w4_main ; filter_strength == 0
+ pshuflw m1, [tlq-2], q0000
+ movu m2, [tlq+16*0]
+ imul r5d, 0x55555555
+ movd m3, [tlq+r3*2]
+ shr r5d, 30 ; filter_strength
+ movd [rsp+12], m1
+ pshuflw m3, m3, q0000
+ mova [rsp+16*1], m2
+ lea r2d, [r3+2]
+ movq [rsp+r3*2+18], m3
+ cmp hd, 8
+ cmovae r3d, r2d
+ lea tlq, [rsp+16*1]
+ call .filter_edge
+.w4_main:
+ lea tlq, [tlq+r3*2]
+ movd m4, dxd
+ movddup m1, [base+z_base_inc] ; base_inc << 6
+ movd m6, [tlq] ; top[max_base_x]
+ shl r3d, 6
+ movd m3, r3d
+ pshufb m4, m0
+ mov r5d, dxd ; xpos
+ pshufb m6, m0
+ sub r5, r3
+ pshufb m3, m0
+ paddw m5, m4, m4
+ psubw m3, m1 ; max_base_x
+ punpcklqdq m4, m5 ; xpos0 xpos1
+ movifnidn strideq, stridemp
+.w4_loop:
+ lea r3, [r5+dxq]
+ sar r5, 6 ; base0
+ movq m0, [tlq+r5*2+0]
+ movq m1, [tlq+r5*2+2]
+ lea r5, [r3+dxq]
+ sar r3, 6 ; base1
+ movhps m0, [tlq+r3*2+0]
+ movhps m1, [tlq+r3*2+2]
+ pand m2, m7, m4
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ pcmpgtw m2, m3, m4 ; xpos < max_base_x
+ paddw m4, m5 ; xpos += dx
+ paddw m0, m1
+ pand m0, m2
+ pandn m2, m6
+ por m0, m2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ sub hd, 2
+ jz .w4_end
+ lea dstq, [dstq+strideq*2]
+ test r5d, r5d
+ jl .w4_loop
+.w4_end_loop:
+ movq [dstq+strideq*0], m6
+ movq [dstq+strideq*1], m6
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_end_loop
+.w4_end:
+ RET
+.w8:
+ lea r3d, [angleq+88]
+ and r3d, ~0x7f
+ or r3d, hd
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ movu m1, [tlq+ 0] ; 1 2 3 4 5 6 7 8
+ movu m5, [tlq+ 2] ; 2 3 4 5 6 7 8 9
+ movu m3, [tlq+ 4] ; 3 4 5 6 7 8 9 a
+ paddw m5, m1
+ paddw m3, [tlq- 2] ; 0 1 2 3 4 5 6 7
+ psubw m2, m5, m3
+ movu m6, [tlq+18] ; a b c d e f g _
+ psraw m2, 3
+ movu m3, [tlq+20] ; b c d e f g _ _
+ paddw m5, m2
+ movu m2, [tlq+16] ; 9 a b c d e f g
+ paddw m6, m2
+ add dxd, dxd
+ cmp hd, 4
+ jne .w8_upsample_h8 ; awkward single-pixel edge case
+ pshuflw m3, m3, q1110 ; b c c _ _ _ _ _
+.w8_upsample_h8:
+ paddw m3, [tlq+14] ; 8 9 a b c d e f
+ psubw m4, m6, m3
+ movd m3, bdmaxm
+ psraw m4, 3
+ mov r3d, dxd
+ paddw m6, m4
+ pxor m4, m4
+ pmaxsw m5, m4
+ pmaxsw m6, m4
+ pshufb m3, m0
+ pavgw m5, m4
+ pavgw m6, m4
+ movd m4, dxd
+ pminsw m5, m3
+ pminsw m6, m3
+ mova m3, [base+z_upsample]
+ pshufb m4, m0
+ movifnidn strideq, stridemp
+ punpcklwd m0, m1, m5
+ mova [rsp+ 0], m0
+ punpckhwd m1, m5
+ mova [rsp+16], m1
+ punpcklwd m0, m2, m6
+ mova [rsp+32], m0
+ punpckhwd m2, m6
+ mova [rsp+48], m2
+ mova m5, m4
+.w8_upsample_loop:
+ mov r2d, r3d
+ shr r2d, 6
+ movu m1, [rsp+r2*2+ 0]
+ movu m2, [rsp+r2*2+16]
+ add r3d, dxd
+ pshufb m1, m3
+ pshufb m2, m3
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ pand m2, m7, m4
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m4, m5
+ paddw m0, m1
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w8_upsample_loop
+ RET
+.w8_no_upsample:
+ lea r3d, [hq+7]
+ movd m1, r3d
+ and r3d, 7
+ or r3d, 8 ; imin(h+7, 15)
+ test angled, 0x400
+ jnz .w8_main
+ movd m3, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m2, m2
+ pshufb m1, m2
+ pshufb m3, m2
+ movu m2, [base+z_filt_wh8]
+ psrldq m4, [base+z_filt_t_w48+angleq*8], 4
+ pcmpeqb m2, m1
+ pand m2, m3
+ pcmpgtb m2, m4
+ pmovmskb r5d, m2
+ test r5d, r5d
+ jz .w8_main ; filter_strength == 0
+ pshuflw m1, [tlq-2], q0000
+ movu m2, [tlq+16*0]
+ imul r5d, 0x55555555
+ movu m3, [tlq+16*1]
+ movd m4, [tlq+r3*2]
+ shr r5d, 30 ; filter_strength
+ movd [rsp+12], m1
+ mova [rsp+16*1], m2
+ pshuflw m4, m4, q0000
+ mova [rsp+16*2], m3
+ lea r2d, [r3+2]
+ movq [rsp+r3*2+18], m4
+ cmp hd, 16
+ cmovae r3d, r2d
+ lea tlq, [rsp+16*1]
+ call .filter_edge
+.w8_main:
+ lea tlq, [tlq+r3*2]
+ movd m5, dxd
+ mova m4, [base+z_base_inc]
+ shl r3d, 6
+ movd m6, [tlq] ; top[max_base_x]
+ movd m1, r3d
+ pshufb m5, m0
+ mov r5d, dxd ; xpos
+ pshufb m1, m0
+ sub r5, r3
+ psubw m4, m1 ; max_base_x
+ pshufb m6, m0
+ paddw m4, m5
+ movifnidn strideq, stridemp
+.w8_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m0, [tlq+r3*2+0]
+ movu m1, [tlq+r3*2+2]
+ pand m2, m7, m4
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m4, 15 ; xpos < max_base_x
+ paddw m4, m5 ; xpos += dx
+ paddw m0, m1
+ pand m0, m2
+ pandn m2, m6
+ por m0, m2
+ mova [dstq], m0
+ dec hd
+ jz .w8_end
+ add dstq, strideq
+ add r5, dxq
+ jl .w8_loop
+.w8_end_loop:
+ mova [dstq], m6
+ add dstq, strideq
+ dec hd
+ jg .w8_end_loop
+.w8_end:
+ RET
+.w16:
+%if ARCH_X86_32
+ %define strideq r3
+%endif
+ lea r3d, [hq+15]
+ movd m1, r3d
+ and r3d, 15
+ or r3d, 16 ; imin(h+15, 31)
+ test angled, 0x400
+ jnz .w16_main
+ movd m3, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m2, m2
+ pshufb m1, m2
+ pshufb m3, m2
+ movq m4, [base+z_filt_t_w16+angleq*4]
+ pcmpeqb m1, [base+z_filt_wh16]
+ pand m1, m3
+ pcmpgtb m1, m4
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .w16_main ; filter_strength == 0
+ pshuflw m1, [tlq-2], q0000
+ movu m2, [tlq+16*0]
+ imul r5d, 0x24924924
+ movu m3, [tlq+16*1]
+ movu m4, [tlq+16*2]
+ shr r5d, 30
+ movu m5, [tlq+16*3]
+ movd m6, [tlq+r3*2]
+ adc r5d, -1 ; filter_strength
+ movd [rsp+12], m1
+ mova [rsp+16*1], m2
+ mova [rsp+16*2], m3
+ pshuflw m6, m6, q0000
+ mova [rsp+16*3], m4
+ mova [rsp+16*4], m5
+ lea r2d, [r3+2]
+ movq [rsp+r3*2+18], m6
+ cmp hd, 32
+ cmovae r3d, r2d
+ lea tlq, [rsp+16*1]
+ call .filter_edge
+.w16_main:
+ lea tlq, [tlq+r3*2]
+ movd m5, dxd
+ mova m4, [base+z_base_inc]
+ shl r3d, 6
+ movd m6, [tlq] ; top[max_base_x]
+ movd m1, r3d
+ pshufb m5, m0
+ mov r5d, dxd ; xpos
+ pshufb m1, m0
+ sub r5, r3
+ psubw m4, m1 ; max_base_x
+ pshufb m6, m0
+ paddw m4, m5
+.w16_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m0, [tlq+r3*2+ 0]
+ movu m2, [tlq+r3*2+ 2]
+ pand m3, m7, m4
+ psllw m3, 9
+ psubw m2, m0
+ pmulhrsw m2, m3
+ movu m1, [tlq+r3*2+16]
+ paddw m0, m2
+ movu m2, [tlq+r3*2+18]
+ psubw m2, m1
+ pmulhrsw m2, m3
+ movddup m3, [base+pw_m512]
+ paddw m1, m2
+ psraw m2, m4, 15
+ pcmpgtw m3, m4
+ paddw m4, m5
+ pand m0, m2
+ pandn m2, m6
+ pand m1, m3
+ pandn m3, m6
+ por m0, m2
+ mova [dstq+16*0], m0
+ por m1, m3
+ mova [dstq+16*1], m1
+ dec hd
+ jz .w16_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w16_loop
+.w16_end_loop:
+ mova [dstq+16*0], m6
+ mova [dstq+16*1], m6
+ add dstq, strideq
+ dec hd
+ jg .w16_end_loop
+.w16_end:
+ RET
+.w32:
+ lea r3d, [hq+31]
+ and r3d, 31
+ or r3d, 32 ; imin(h+31, 63)
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w32_main
+ call .filter_copy
+ lea r5d, [r3+2]
+ cmp hd, 64
+ cmove r3d, r5d
+ call .filter_edge_s3
+.w32_main:
+ lea tlq, [tlq+r3*2]
+ movd m5, dxd
+ mova m4, [base+z_base_inc]
+ shl r3d, 6
+ movd m6, [tlq] ; top[max_base_x]
+ movd m1, r3d
+ pshufb m5, m0
+ mov r5d, dxd ; xpos
+ pshufb m1, m0
+ sub r5, r3
+ psubw m4, m1 ; max_base_x
+ pshufb m6, m0
+ paddw m4, m5
+.w32_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m0, [tlq+r3*2+ 0]
+ movu m2, [tlq+r3*2+ 2]
+ pand m3, m7, m4
+ psllw m3, 9
+ psubw m2, m0
+ pmulhrsw m2, m3
+ movu m1, [tlq+r3*2+16]
+ paddw m0, m2
+ movu m2, [tlq+r3*2+18]
+ psubw m2, m1
+ pmulhrsw m2, m3
+ paddw m1, m2
+ psraw m2, m4, 15
+ pand m0, m2
+ pandn m2, m6
+ por m0, m2
+ movddup m2, [base+pw_m512]
+ pcmpgtw m2, m4
+ pand m1, m2
+ pandn m2, m6
+ mova [dstq+16*0], m0
+ por m1, m2
+ mova [dstq+16*1], m1
+ movu m0, [tlq+r3*2+32]
+ movu m2, [tlq+r3*2+34]
+ psubw m2, m0
+ pmulhrsw m2, m3
+ movu m1, [tlq+r3*2+48]
+ paddw m0, m2
+ movu m2, [tlq+r3*2+50]
+ psubw m2, m1
+ pmulhrsw m2, m3
+ paddw m1, m2
+ movddup m2, [base+pw_m1024]
+ movddup m3, [base+pw_m1536]
+ pcmpgtw m2, m4
+ pcmpgtw m3, m4
+ paddw m4, m5
+ pand m0, m2
+ pandn m2, m6
+ pand m1, m3
+ pandn m3, m6
+ por m0, m2
+ mova [dstq+16*2], m0
+ por m1, m3
+ mova [dstq+16*3], m1
+ dec hd
+ jz .w32_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w32_loop
+.w32_end_loop:
+ REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3
+ add dstq, strideq
+ dec hd
+ jg .w32_end_loop
+.w32_end:
+ RET
+.w64:
+ lea r3d, [hq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w64_main
+ call .filter_copy
+ call .filter_edge_s3
+.w64_main:
+ lea tlq, [tlq+r3*2]
+ movd m5, dxd
+ mova m4, [base+z_base_inc]
+ shl r3d, 6
+ movd m6, [tlq] ; top[max_base_x]
+ movd m1, r3d
+ pshufb m5, m0
+ mov r5d, dxd ; xpos
+ pshufb m1, m0
+ sub r5, r3
+ psubw m4, m1 ; max_base_x
+ pshufb m6, m0
+ paddw m4, m5
+.w64_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m0, [tlq+r3*2+ 0]
+ movu m2, [tlq+r3*2+ 2]
+ pand m3, m7, m4
+ psllw m3, 9
+ psubw m2, m0
+ pmulhrsw m2, m3
+ movu m1, [tlq+r3*2+16]
+ paddw m0, m2
+ movu m2, [tlq+r3*2+18]
+ psubw m2, m1
+ pmulhrsw m2, m3
+ paddw m1, m2
+ psraw m2, m4, 15
+ pand m0, m2
+ pandn m2, m6
+ por m0, m2
+ movddup m2, [base+pw_m512]
+ pcmpgtw m2, m4
+ pand m1, m2
+ pandn m2, m6
+ mova [dstq+16*0], m0
+ por m1, m2
+ mova [dstq+16*1], m1
+ movu m0, [tlq+r3*2+32]
+ movu m2, [tlq+r3*2+34]
+ psubw m2, m0
+ pmulhrsw m2, m3
+ movu m1, [tlq+r3*2+48]
+ paddw m0, m2
+ movu m2, [tlq+r3*2+50]
+ psubw m2, m1
+ pmulhrsw m2, m3
+ paddw m1, m2
+ movddup m2, [base+pw_m1024]
+ pcmpgtw m2, m4
+ pand m0, m2
+ pandn m2, m6
+ por m0, m2
+ movddup m2, [base+pw_m1536]
+ pcmpgtw m2, m4
+ pand m1, m2
+ pandn m2, m6
+ mova [dstq+16*2], m0
+ por m1, m2
+ mova [dstq+16*3], m1
+ movu m0, [tlq+r3*2+64]
+ movu m2, [tlq+r3*2+66]
+ psubw m2, m0
+ pmulhrsw m2, m3
+ movu m1, [tlq+r3*2+80]
+ paddw m0, m2
+ movu m2, [tlq+r3*2+82]
+ psubw m2, m1
+ pmulhrsw m2, m3
+ paddw m1, m2
+ movddup m2, [base+pw_m2048]
+ pcmpgtw m2, m4
+ pand m0, m2
+ pandn m2, m6
+ por m0, m2
+ movddup m2, [base+pw_m2560]
+ pcmpgtw m2, m4
+ pand m1, m2
+ pandn m2, m6
+ mova [dstq+16*4], m0
+ por m1, m2
+ mova [dstq+16*5], m1
+ movu m0, [tlq+r3*2+96]
+ movu m2, [tlq+r3*2+98]
+ psubw m2, m0
+ pmulhrsw m2, m3
+ movu m1, [tlq+r3*2+112]
+ paddw m0, m2
+ movu m2, [tlq+r3*2+114]
+ psubw m2, m1
+ pmulhrsw m2, m3
+ paddw m1, m2
+ movddup m2, [base+pw_m3072]
+ movddup m3, [base+pw_m3584]
+ pcmpgtw m2, m4
+ pcmpgtw m3, m4
+ paddw m4, m5
+ pand m0, m2
+ pandn m2, m6
+ pand m1, m3
+ pandn m3, m6
+ por m0, m2
+ mova [dstq+16*6], m0
+ por m1, m3
+ mova [dstq+16*7], m1
+ dec hd
+ jz .w64_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w64_loop
+.w64_end_loop:
+ REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ RET
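+; Edge helpers: .filter_copy copies the edge into a stack buffer padded at
+; both ends, .filter_edge applies the strength 1-2 smoothing kernels from
+; z_filt_k and .filter_edge_s3 the strength-3 kernel.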
+ALIGN function_align
+.filter_copy:
+ pshuflw m2, [tlq-2], q0000
+ pshuflw m3, [tlq+r3*2], q0000
+ xor r5d, r5d
+ movd [rsp+gprsize+12], m2
+.filter_copy_loop:
+ movu m1, [tlq+r5*2+16*0]
+ movu m2, [tlq+r5*2+16*1]
+ add r5d, 16
+ mova [rsp+r5*2+gprsize-16*1], m1
+ mova [rsp+r5*2+gprsize-16*0], m2
+ cmp r5d, r3d
+ jle .filter_copy_loop
+ lea tlq, [rsp+gprsize+16*1]
+ movq [tlq+r3*2+2], m3
+ ret
+.filter_edge:
+ cmp r5d, 3
+ je .filter_edge_s3
+ movddup m4, [base+z_filt_k+r5*8-8]
+ movddup m5, [base+z_filt_k+r5*8+8]
+ xor r5d, r5d
+ movddup m6, [base+pw_8]
+ movu m2, [tlq-2]
+ jmp .filter_edge_start
+.filter_edge_loop:
+ movu m2, [tlq+r5*2-2]
+ mova [tlq+r5*2-16], m1
+.filter_edge_start:
+ pmullw m1, m4, [tlq+r5*2]
+ movu m3, [tlq+r5*2+2]
+ paddw m2, m3
+ pmullw m2, m5
+ add r5d, 8
+ paddw m1, m6
+ paddw m1, m2
+ psrlw m1, 4
+ cmp r5d, r3d
+ jl .filter_edge_loop
+ mova [tlq+r5*2-16], m1
+ ret
+.filter_edge_s3:
+ movddup m5, [base+pw_3]
+ xor r5d, r5d
+ movu m2, [tlq-2]
+ movu m3, [tlq-4]
+ jmp .filter_edge_s3_start
+.filter_edge_s3_loop:
+ movu m2, [tlq+r5*2-2]
+ movu m3, [tlq+r5*2-4]
+ mova [tlq+r5*2-16], m1
+.filter_edge_s3_start:
+ paddw m2, [tlq+r5*2+0]
+ paddw m3, m5
+ movu m1, [tlq+r5*2+2]
+ movu m4, [tlq+r5*2+4]
+ add r5d, 8
+ paddw m1, m2
+ pavgw m3, m4
+ paddw m1, m3
+ psrlw m1, 2
+ cmp r5d, r3d
+ jl .filter_edge_s3_loop
+ mova [tlq+r5*2-16], m1
+ ret
+
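+; Directional prediction, zone 2 (90 < angle < 180): both edges are used.
+; Pixels whose projection falls left of the top-left corner are predicted
+; from the left edge (stepping by dy per row), the remainder from the top
+; edge (stepping by dx); both edges may be filtered or upsampled first.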
+%if ARCH_X86_64
+cglobal ipred_z2_16bpc, 4, 12, 11, 16*24, dst, stride, tl, w, h, angle, dx, _, dy
+ %define base r7-$$
+ %define maxwm r6m
+ %define maxhm r7m
+ %define bdmaxm r8m
+ lea r7, [$$]
+ mov hd, hm
+ movddup m8, [base+pw_62]
+ lea r9d, [wq-4]
+ shl r9d, 6
+ mova m9, [base+z2_top_shufA]
+ or r9d, hd
+ mova m10, [base+z2_left_shufA]
+%else
+cglobal ipred_z2_16bpc, 4, 7, 8, -16*27, dst, _, tl, w, h, angle, dx
+ %define base r1-$$
+ %define r9b byte [rsp+16*26+4*0]
+ %define r9d dword [rsp+16*26+4*0]
+ %define r10d dword [rsp+16*26+4*1]
+ %define r11d dword [rsp+16*26+4*2]
+ %define maxwm [rsp+16*2+4*0]
+ %define maxhm [rsp+16*2+4*1]
+ %define bdmaxm [rsp+16*2+4*2]
+ %define stridemp [rsp+16*26+4*3]
+ %define strideq r3
+ %define dyd r4
+ %define dyq r4
+ mov stridemp, r1
+ mov r1d, r6m
+ mov r4d, r7m
+ mov r5d, r8m
+ mov maxwm, r1d
+ mov maxhm, r4d
+ mov bdmaxm, r5d
+ LEA r1, $$
+ lea hd, [wq-4]
+ mova m0, [base+z2_top_shufA]
+ shl hd, 6
+ mova m1, [base+z2_left_shufA]
+ or hd, hm
+ mova [rsp+16*24], m0
+ mov r9d, hd
+ mova [rsp+16*25], m1
+%endif
+ tzcnt wd, wd
+ movifnidn angled, anglem
+ mova m0, [tlq-16*8]
+ mova m1, [tlq-16*7]
+ mova m2, [tlq-16*6]
+ mova m3, [tlq-16*5]
+ movsxd wq, [base+ipred_z2_16bpc_ssse3_table+wq*4]
+%if ARCH_X86_64
+ movzx dxd, angleb
+%else
+ movzx dxd, byte anglem
+%endif
+ mova m4, [tlq-16*4]
+ mova m5, [tlq-16*3]
+ mova m6, [tlq-16*2]
+ mova m7, [tlq-16*1]
+ mova [rsp+16* 5], m0
+ xor angled, 0x400
+ mova [rsp+16* 6], m1
+ mov dyd, dxd
+ mova [rsp+16* 7], m2
+ neg dxq
+ mova [rsp+16* 8], m3
+ and dyd, ~1
+ mova [rsp+16* 9], m4
+ and dxq, ~1
+ mova [rsp+16*10], m5
+ lea wq, [base+ipred_z2_16bpc_ssse3_table+wq]
+ mova [rsp+16*11], m6
+ pxor m3, m3
+ mova [rsp+16*12], m7
+ movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90
+ movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle
+ movddup m0, [base+pw_256] ; 4<<6
+ movd m4, [tlq]
+ movu m5, [tlq+16*0+2]
+ movu m6, [tlq+16*1+2]
+ movsldup m1, [base+z2_dy_offset]
+ pshufb m4, m0
+ movq m7, [base+z_base_inc+2]
+ mov r11d, (112-4)<<6
+ mova [rsp+16*13], m4
+ neg dxd
+ mova [rsp+16*14], m5
+ or dyd, 4<<16
+ mova [rsp+16*15], m6
+%if ARCH_X86_64
+ lea r10d, [dxq+(112<<6)] ; xpos
+%else
+ mov [rsp+8*3], dyd
+ lea r4d, [dxq+(112<<6)]
+ mov r10d, r4d
+ movzx hd, r9b
+%endif
+ movq [rsp+8*0], m1
+ movq [rsp+8*1], m0
+ movq [rsp+8*2], m7
+ jmp wq
+.w4:
+ test angled, 0x400
+ jnz .w4_main
+ lea r3d, [hq+2]
+ add angled, 1022
+ pshuflw m1, m5, q3333
+ shl r3d, 6
+ movq [rsp+16*14+8], m1
+ test r3d, angled
+ jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
+ call .upsample_above
+ sub angled, 1075 ; angle - 53
+ lea r3d, [hq+3]
+ xor angled, 0x7f ; 180 - angle
+ movd m2, r3d
+ movd m7, angled
+ shr angled, 8 ; is_sm << 1
+ pshufb m2, m3
+ pshufb m7, m3
+ pcmpeqb m2, [base+z_filt_wh4]
+ pand m7, m2
+ pcmpgtb m7, [base+z_filt_t_w48+angleq*8]
+ jmp .w8_filter_left
+.upsample_above: ; w4/w8
+ paddw m2, m5, [tlq]
+ movu m1, [rsp+gprsize+16*14+2]
+ movu m4, [rsp+gprsize+16*14-4]
+%if ARCH_X86_64
+ movd m6, r9m ; bdmax, offset due to call
+%else
+ movd m6, [rsp+gprsize+16*2+4*2]
+%endif
+ paddw m4, m1
+ psubw m1, m2, m4
+ pshufb m6, m0
+ psraw m1, 3
+ paddw m2, m1
+ add dxd, dxd
+ pmaxsw m2, m3
+ paddw m7, m7
+ pavgw m2, m3
+ pminsw m2, m6
+%if ARCH_X86_64
+ mova m9, [base+z2_top_shufB]
+ lea r10d, [dxq+(113<<6)]
+ mov r11d, (112-7)<<6
+%else
+ mova m1, [base+z2_top_shufB]
+ lea r3d, [dxq+(113<<6)]
+ mov dword [rsp+gprsize+16*26+4*2], (112-7)<<6
+ mov [rsp+gprsize+16*26+4*1], r3d
+ mova [rsp+gprsize+16*24], m1
+%endif
+ punpcklwd m1, m2, m5
+ punpckhwd m2, m5
+ movq [rsp+gprsize+8*2], m7
+ mova [rsp+gprsize+16*14], m1
+ mova [rsp+gprsize+16*15], m2
+ ret
+.w4_no_upsample_above:
+ lea r3d, [hq+3]
+ mov [rsp+16*4], angled
+ sub angled, 1112 ; angle - 90
+ movd m2, r3d
+ mov r3d, 90
+ movd m1, angled
+ sub r3d, angled ; 180 - angle
+ shr angled, 8 ; is_sm << 1
+ mova m4, [base+z_filt_wh4]
+ movd m7, r3d
+ mova m5, [base+z_filt_t_w48+angleq*8]
+ mov r3d, 4
+ call .w8_filter_top
+ mov angled, [rsp+16*4]
+ lea r3d, [hq+2]
+ sub angled, 139
+ shl r3d, 6
+ test r3d, angled
+ jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
+.upsample_left: ; w4/w8
+ mova m2, [tlq-16]
+ lea r3d, [hq-4]
+ movu m3, [tlq-14]
+ movu m4, [rsp+16*12+4]
+ pshufb m1, m2, [base+z2_upsample_l+r3*4]
+ movd m6, bdmaxm
+ pxor m5, m5
+ paddw m3, m2
+ paddw m4, m1
+ psubw m1, m3, m4
+ movshdup m4, [base+z2_dy_offset]
+ psraw m1, 3
+ pshufb m6, m0
+ paddw m3, m1
+ pmaxsw m3, m5
+ pavgw m3, m5
+ pminsw m3, m6
+%if ARCH_X86_64
+ mova m10, [base+z2_left_shufB]
+ add dyd, dyd
+%else
+ mova m1, [base+z2_left_shufB]
+ shl dword [rsp+8*3], 1
+ mova [rsp+16*25], m1
+%endif
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ movq [rsp+8*0], m4
+ mova [rsp+16*12], m1
+ mova [rsp+16*11], m2
+.w4_main:
+ movd m6, dxd
+%if ARCH_X86_64
+ movd m3, dyd
+%else
+ movd m3, [rsp+8*3]
+%endif
+ pshufb m6, m0
+ movddup m0, [rsp+8*2]
+ paddw m7, m6, m6
+ movq m5, [base+pw_m1to4]
+ pshuflw m4, m3, q0000
+ punpcklqdq m6, m7
+ pmullw m4, m5
+ pshuflw m3, m3, q1111
+ paddw m6, m0
+ mov r2d, r10d
+ pshuflw m0, m4, q3333
+ psubw m4, [rsp+8*0]
+ movq [rsp+8*3], m3
+ movq [rsp+8*5], m0 ; dy*4
+ mov r5, dstq
+.w4_loop0:
+ mova [rsp+16*4], m6
+ movq [rsp+8*4], m4
+%if ARCH_X86_64
+ pand m0, m8, m4
+%else
+ movq m0, [base+pw_62]
+ pand m0, m4
+%endif
+ psraw m4, 6
+ psllw m0, 9 ; frac_y << 9
+ movq [rsp+8*7], m0
+ pabsw m4, m4
+ movq [rsp+8*6], m4
+ movzx hd, r9b
+.w4_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movu m2, [rsp+r2*2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ movu m1, [rsp+r3*2]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x2
+ movu m3, [rsp+r2*2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x3
+ movu m4, [rsp+r3*2]
+%if ARCH_X86_64
+ REPX {pshufb x, m9}, m2, m1, m3, m4
+%else
+ mova m0, [rsp+16*24]
+ REPX {pshufb x, m0}, m2, m1, m3, m4
+%endif
+ punpcklqdq m0, m2, m1
+ punpckhqdq m2, m1
+ punpcklqdq m1, m3, m4
+ punpckhqdq m3, m4
+%if ARCH_X86_64
+ pand m5, m8, m6
+%else
+ movddup m5, [base+pw_62]
+ pand m5, m6
+%endif
+ psllw m5, 9
+ psubw m2, m0
+ pmulhrsw m2, m5
+ paddw m5, m6, m7
+ psubw m3, m1
+ paddw m0, m2
+%if ARCH_X86_64
+ pand m2, m8, m5
+%else
+ movddup m2, [base+pw_62]
+ pand m2, m5
+%endif
+ psllw m2, 9
+ pmulhrsw m3, m2
+ paddw m1, m3
+ cmp r3d, 111 ; topleft
+ jge .w4_toponly
+ mova [rsp+16*22], m0
+ mova [rsp+16*23], m1
+ movzx r3d, byte [rsp+8*6+0] ; base_y0
+ movu m3, [rsp+r3*2]
+ movzx r3d, byte [rsp+8*6+2] ; base_y1
+ movu m2, [rsp+r3*2]
+ movzx r3d, byte [rsp+8*6+4] ; base_y2
+ movu m4, [rsp+r3*2]
+ movzx r3d, byte [rsp+8*6+6] ; base_y3
+ movu m0, [rsp+r3*2]
+%if ARCH_X86_64
+ REPX {pshufb x, m10}, m3, m2, m4, m0
+%else
+ mova m1, [rsp+16*25]
+ REPX {pshufb x, m1}, m3, m2, m4, m0
+%endif
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2 ; 01
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0 ; 23
+ punpckldq m0, m1, m2 ; y0 y1
+ punpckhdq m1, m2 ; y2 y3
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ movddup m4, [rsp+8*7]
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ psraw m6, 15 ; base_x < topleft
+ psraw m4, m5, 15
+ paddw m0, m2
+ paddw m1, m3
+ pand m0, m6
+ pandn m6, [rsp+16*22]
+ pand m1, m4
+ pandn m4, [rsp+16*23]
+ por m0, m6
+ por m1, m4
+.w4_toponly:
+ movifnidn strideq, stridemp
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ sub hd, 4
+ jz .w4_end
+ movq m4, [rsp+8*6]
+ paddsw m6, m5, m7 ; xpos += dx
+ movq m5, [rsp+8*3]
+ psubw m4, m5
+ lea dstq, [dstq+strideq*2]
+ movq [rsp+8*6], m4
+ cmp r2d, r11d
+ jge .w4_loop
+.w4_leftonly_loop:
+ movzx r2d, byte [rsp+8*6+0] ; base_y0
+ movu m3, [rsp+r2*2]
+ movzx r2d, byte [rsp+8*6+2] ; base_y1
+ movu m2, [rsp+r2*2]
+ movzx r2d, byte [rsp+8*6+4] ; base_y2
+ movu m6, [rsp+r2*2]
+ movzx r2d, byte [rsp+8*6+6] ; base_y3
+ movu m0, [rsp+r2*2]
+ psubw m4, m5
+%if ARCH_X86_64
+ REPX {pshufb x, m10}, m3, m2, m6, m0
+%else
+ mova m1, [rsp+16*25]
+ REPX {pshufb x, m1}, m3, m2, m6, m0
+%endif
+ movq [rsp+8*6], m4
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m6, m0
+ punpckhwd m6, m0
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m6
+ punpckhdq m3, m6
+ movddup m6, [rsp+8*7]
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m6
+ pmulhrsw m3, m6
+ paddw m0, m2
+ paddw m1, m3
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 4
+ jg .w4_leftonly_loop
+.w4_end:
+ sub r9d, 1<<8
+ jl .w4_ret
+ movq m4, [rsp+8*5]
+ add r5, 8
+ mov dstq, r5
+ paddw m4, [rsp+8*4] ; base_y += 4*dy
+ movzx r2d, word [rsp+8*1]
+ movddup m6, [rsp+8*1]
+ paddw m6, [rsp+16*4] ; base_x += (4 << upsample_above)
+ add r2d, r10d
+ mov r10d, r2d
+ jmp .w4_loop0
+.w4_ret:
+ RET
+.w8:
+ test angled, 0x400
+ jnz .w4_main
+ lea r3d, [angleq+126]
+ pshufhw m1, m5, q3333
+%if ARCH_X86_64
+ mov r3b, hb
+%else
+ xor r3b, r3b
+ or r3d, hd
+%endif
+ movhps [rsp+16*15], m1
+ cmp r3d, 8
+ ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+ call .upsample_above
+ sub angled, 53
+ lea r3d, [hq+7]
+ xor angled, 0x7f ; 180 - angle
+ movu m1, [base+z_filt_wh8]
+ movd m2, r3d
+ movd m7, angled
+ shr angled, 8 ; is_sm << 1
+ psrldq m4, [base+z_filt_t_w48+angleq*8], 4
+ pshufb m2, m3
+ pshufb m7, m3
+ pcmpeqb m2, m1
+ movq m1, [base+pw_512]
+ pand m7, m2
+ pcmpgtb m7, m4
+ movq [rsp+8*1], m1 ; 8<<6
+ jmp .w8_filter_left
+.w8_no_upsample_above:
+ lea r3d, [hq+7]
+ mov [rsp+16*4], angled
+ sub angled, 90
+ movd m2, r3d
+ mov r3d, 90
+ movd m1, angled
+ sub r3d, angled ; 180 - angle
+ shr angled, 8 ; is_sm << 1
+ movu m4, [base+z_filt_wh8]
+ movd m7, r3d
+ psrldq m5, [base+z_filt_t_w48+angleq*8], 4
+ mov r3d, 8
+ call .w8_filter_top
+ mov r3d, [rsp+16*4]
+ sub r3d, 141
+%if ARCH_X86_64
+ mov r3b, hb
+%else
+ xor r3b, r3b
+ or r3d, hd
+%endif
+ cmp r3d, 8
+ jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm
+.w8_filter_left:
+ pmovmskb r5d, m7
+ test r5d, r5d
+ jz .w4_main
+ imul r5d, 0x55555555
+ neg hq
+ mov r3, tlq
+ movd m1, [tlq+hq*2]
+ shr r5d, 30 ; filter_strength
+ lea tlq, [rsp+16*13-2]
+ pshuflw m1, m1, q0000
+ movq [tlq+hq*2-6], m1
+ call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge
+ jmp .filter_left_end
+.w8_filter_top:
+ REPX {pshufb x, m3}, m2, m1, m7
+ pcmpeqb m2, m4
+ pand m1, m2
+ pand m7, m2
+ pcmpgtb m1, m5
+ pcmpgtb m7, m5
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .w8_filter_top_end ; filter_strength == 0
+ imul r5d, 0x55555555
+ mov [dstq], tlq
+ lea tlq, [rsp+16*14+gprsize]
+ shr r5d, 30 ; filter_strength
+ call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge
+%if ARCH_X86_64
+ mov r3d, r7m ; maxw, offset due to call
+%else
+ mov r3d, [rsp+16*2+4*1]
+%endif
+ mov tlq, [dstq]
+ cmp r3d, 8
+ jge .w8_filter_top_end
+ movu m1, [tlq+r3*2+16*0+2]
+ movu m2, [tlq+r3*2+16*1+2]
+ movu [rsp+r3*2+16*14+gprsize], m1
+ movu [rsp+r3*2+16*15+gprsize], m2
+.w8_filter_top_end:
+ ret
+.w16:
+ test angled, 0x400
+ jnz .w4_main
+ lea r3d, [hq+15]
+ sub angled, 90
+ movd m2, r3d
+ mov r3d, 90
+ movd m1, angled
+ sub r3d, angled ; 180 - angle
+ shr angled, 8 ; is_sm << 1
+ movd m7, r3d
+ REPX {pshufb x, m3}, m2, m1, m7
+ movq m4, [base+z_filt_t_w16+angleq*4]
+ pcmpeqb m2, [base+z_filt_wh16]
+ pand m1, m2
+ pand m7, m2
+ pcmpgtb m1, m4
+ pcmpgtb m7, m4
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .w16_filter_left ; filter_strength == 0
+ imul r5d, 0x24924924
+ pshufhw m6, m6, q3333
+ mov [dstq], tlq
+ lea tlq, [rsp+16*14]
+ shr r5d, 30
+ movhps [tlq+16*2], m6
+ adc r5d, -1 ; filter_strength
+ mov r3d, 16
+ call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge
+ mov r3d, maxwm
+ mov tlq, [dstq]
+ cmp r3d, 16
+ jge .w16_filter_left
+ movu m1, [tlq+r3*2+16*0+2]
+ movu m2, [tlq+r3*2+16*1+2]
+ movu [rsp+r3*2+16*14], m1
+ movu [rsp+r3*2+16*15], m2
+.w16_filter_left:
+ pmovmskb r5d, m7
+ test r5d, r5d
+ jz .w4_main
+ imul r5d, 0x24924924
+ neg hq
+ mov r3, tlq
+ movd m1, [tlq+hq*2]
+ shr r5d, 30
+ lea tlq, [rsp+16*13-2]
+ pshuflw m1, m1, q0000
+ adc r5d, -1 ; filter_strength
+ movq [tlq+hq*2-6], m1
+ call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge
+ jmp .filter_left_end
+.w32:
+ movu m1, [tlq+16*2+2]
+ movu m2, [tlq+16*3+2]
+ mova [rsp+16*16], m1
+ mova [rsp+16*17], m2
+ test angled, 0x400
+ jnz .w4_main
+ mov [dstq], tlq
+ lea tlq, [rsp+16*14]
+ pshufhw m2, m2, q3333
+ mov r3d, 32
+ movhps [tlq+16*4], m2
+ call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3
+ mov r3d, maxwm
+ mov tlq, [dstq]
+ cmp r3d, 32
+ jge .filter_left
+ movu m1, [tlq+r3*2+16*0+2]
+ movu m2, [tlq+r3*2+16*1+2]
+ movu [rsp+r3*2+16*14], m1
+ movu [rsp+r3*2+16*15], m2
+ cmp r3d, 16
+ jge .filter_left
+ movu m1, [tlq+r3*2+16*2+2]
+ movu m2, [tlq+r3*2+16*3+2]
+ movu [rsp+r3*2+16*16], m1
+ movu [rsp+r3*2+16*17], m2
+.filter_left:
+ neg hq
+ mov r3, tlq
+ pshuflw m1, [tlq+hq*2], q0000
+ lea tlq, [rsp+16*13-2]
+ movq [tlq+hq*2-6], m1
+ call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge_s3
+.filter_left_end:
+ mov r2d, maxhm
+ cmp r2d, hd
+ jge .w4_main
+ neg r2
+ movu m1, [r3+r2*2-16*1]
+ movu m2, [r3+r2*2-16*2]
+ movu [rsp+r2*2+16*12], m1
+ movu [rsp+r2*2+16*11], m2
+ cmp r2d, -48
+ jle .w4_main
+ movu m1, [r3+r2*2-16*3]
+ movu m2, [r3+r2*2-16*4]
+ movu [rsp+r2*2+16*10], m1
+ movu [rsp+r2*2+16* 9], m2
+ cmp r2d, -32
+ jle .w4_main
+ movu m1, [r3+r2*2-16*5]
+ movu m2, [r3+r2*2-16*6]
+ movu [rsp+r2*2+16* 8], m1
+ movu [rsp+r2*2+16* 7], m2
+ cmp r2d, -16
+ jle .w4_main
+ movu m1, [r3+r2*2-16*7]
+ movu m2, [r3+r2*2-16*8]
+ movu [rsp+r2*2+16* 6], m1
+ movu [rsp+r2*2+16* 5], m2
+ jmp .w4_main
+.w64:
+ movu m1, [tlq+16*2+2]
+ movu m2, [tlq+16*3+2]
+ movu m3, [tlq+16*4+2]
+ movu m4, [tlq+16*5+2]
+ movu m5, [tlq+16*6+2]
+ movu m6, [tlq+16*7+2]
+ mov [dstq], tlq
+ lea tlq, [rsp+16*14]
+ mova [tlq+16*2], m1
+ mova [tlq+16*3], m2
+ mova [tlq+16*4], m3
+ mova [tlq+16*5], m4
+ mova [tlq+16*6], m5
+ mova [tlq+16*7], m6
+ test angled, 0x400
+ jnz .w4_main
+ pshufhw m6, m6, q3333
+ mov r3d, 64
+ movhps [tlq+16*8], m6
+ call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3
+ mov r3d, maxwm
+ mov tlq, [dstq]
+ cmp r3d, 64
+ jge .filter_left
+ movu m1, [tlq+r3*2+16*0+2]
+ movu m2, [tlq+r3*2+16*1+2]
+ movu [rsp+r3*2+16*14], m1
+ movu [rsp+r3*2+16*15], m2
+ cmp r3d, 48
+ jge .filter_left
+ movu m1, [tlq+r3*2+16*2+2]
+ movu m2, [tlq+r3*2+16*3+2]
+ movu [rsp+r3*2+16*16], m1
+ movu [rsp+r3*2+16*17], m2
+ cmp r3d, 32
+ jge .filter_left
+ movu m1, [tlq+r3*2+16*4+2]
+ movu m2, [tlq+r3*2+16*5+2]
+ movu [rsp+r3*2+16*18], m1
+ movu [rsp+r3*2+16*19], m2
+ cmp r3d, 16
+ jge .filter_left
+ movu m1, [tlq+r3*2+16*6+2]
+ movu m2, [tlq+r3*2+16*7+2]
+ movu [rsp+r3*2+16*20], m1
+ movu [rsp+r3*2+16*21], m2
+ jmp .filter_left
+
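+; Directional prediction, zone 3 (angle > 180): only the left edge is used.
+; The block is predicted column by column into a scratch buffer and
+; transposed into the destination at the end (.end_transpose).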
+%if ARCH_X86_64
+cglobal ipred_z3_16bpc, 4, 9, 8, 16*18, dst, stride, tl, w, h, angle, dy, _, org_w
+ %define base r7-$$
+ lea r7, [$$]
+ mov org_wd, wd
+%else
+cglobal ipred_z3_16bpc, 4, 7, 8, -16*18, dst, stride, tl, w, h, angle, dy
+ %define base r1-$$
+ %define org_wd r5
+ %define org_wq r5
+ movd m6, r8m ; pixel_max
+ mov [dstq+4*0], strideq
+ LEA r1, $$
+ mov [dstq+4*1], wd
+%endif
+ tzcnt hd, hm
+ movifnidn angled, anglem
+ sub tlq, 2
+ movsxd hq, [base+ipred_z3_16bpc_ssse3_table+hq*4]
+ sub angled, 180
+ movddup m0, [base+pw_256]
+ mov dyd, angled
+ neg dyd
+ xor angled, 0x400
+ movddup m7, [base+pw_62]
+ or dyq, ~0x7e
+ lea hq, [base+ipred_z3_16bpc_ssse3_table+hq]
+ movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq]
+ jmp hq
+.h4:
+ lea r4d, [angleq+88]
+ test r4d, 0x480
+ jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40
+ sar r4d, 9
+ add r4d, wd
+ cmp r4d, 8
+ jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm)
+ mova m2, [tlq-14] ; 7 6 5 4 3 2 1 0
+ movu m3, [tlq-12] ; 8 7 6 5 4 3 2 1
+%if ARCH_X86_64
+ movd m6, r8m
+%endif
+ pshufb m4, m2, m0
+ mov tlq, rsp
+ palignr m1, m2, m4, 14 ; 8 8 7 6 5 4 3 2
+ add dyd, dyd
+ palignr m5, m2, m4, 12 ; 8 8 8 7 6 5 4 3
+ paddw m1, m2
+ paddw m3, m5
+ psubw m5, m1, m3
+ mova m3, [base+z_upsample]
+ mova [tlq+ 0], m4
+ movd m4, dyd
+ psraw m5, 3
+ neg dyd
+ paddw m1, m5
+ pxor m5, m5
+ lea r5d, [dyq+(16<<6)+63] ; ypos
+ pmaxsw m1, m5
+ pshufb m6, m0
+ shl wd, 3
+ pavgw m1, m5
+ pshufb m4, m0
+ pminsw m1, m6
+ sub rsp, wq
+ punpckhwd m0, m1, m2
+ paddw m5, m4, m4
+ punpcklwd m1, m2
+ mova [tlq+32], m0
+ movsd m4, m5
+ mova [tlq+16], m1
+.h4_upsample_loop:
+ lea r4d, [r5+dyq]
+ sar r5d, 6
+ movu m2, [tlq+r5*2]
+ lea r5d, [r4+dyq]
+ sar r4d, 6
+ movu m1, [tlq+r4*2]
+ pshufb m2, m3
+ pshufb m1, m3
+ punpckhqdq m0, m1, m2
+ punpcklqdq m1, m2
+ pand m2, m7, m4
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m4, m5
+ paddw m0, m1
+ mova [rsp+wq-16], m0
+ sub wd, 16
+ jg .h4_upsample_loop
+ or r3d, 4*2
+ jmp .end_transpose
+.h4_no_upsample:
+ mov r4d, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h4_main
+ lea r4d, [wq+3]
+ movd m1, r4d
+ movd m3, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m2, m2
+ pshufb m1, m2
+ pshufb m3, m2
+ pcmpeqb m1, [base+z_filt_wh4]
+ pand m1, m3
+ pcmpgtb m1, [base+z_filt_t_w48+angleq*8]
+ pmovmskb r5d, m1
+ mov r4d, 7
+ test r5d, r5d
+ jz .h4_main ; filter_strength == 0
+ pshuflw m1, [tlq+2], q0000
+ imul r5d, 0x55555555
+ mova m2, [tlq-14]
+ neg r4
+ movd m3, [tlq+r4*2]
+ shr r5d, 30
+ movd [rsp+16*17], m1
+ pshuflw m3, m3, q0000
+ mova [rsp+16*16], m2
+ lea r2, [r4-2]
+ movq [rsp+16*17+r4*2-10], m3
+ cmp wd, 8
+ cmovae r4, r2
+ lea tlq, [rsp+16*17-2]
+ call .filter_edge
+.h4_main:
+ movd m4, dyd
+ sub tlq, r4
+ movddup m1, [base+z_base_inc_z2+8] ; base_inc << 6
+ sub tlq, r4
+ shl r4d, 6
+ movd m6, [tlq]
+ movd m3, r4d
+ pshufb m4, m0
+ neg dyq
+ pshufb m6, m0
+ lea r5, [dyq+r4+63] ; ypos
+ pshufb m3, m0
+ shl wd, 3
+ paddw m5, m4, m4
+ sub rsp, wq
+ psubw m3, m1 ; max_base_y
+ movsd m4, m5 ; ypos1 ypos0
+.h4_loop:
+ lea r4, [r5+dyq]
+ sar r5, 6
+ movddup m0, [tlq+r5*2-6]
+ movddup m1, [tlq+r5*2-8]
+ lea r5, [r4+dyq]
+ sar r4, 6
+ movlps m0, [tlq+r4*2-6]
+ movlps m1, [tlq+r4*2-8]
+ pand m2, m7, m4
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ pcmpgtw m2, m3, m4
+ paddw m4, m5
+ paddw m0, m1
+ pand m0, m2
+ pandn m2, m6
+ por m0, m2
+ mova [rsp+wq-16], m0
+ sub wd, 16
+ jz .h4_transpose
+ test r5d, r5d
+ jg .h4_loop
+.h4_end_loop:
+ mova [rsp+wq-16], m6
+ sub wd, 16
+ jg .h4_end_loop
+.h4_transpose:
+ or r3d, 4*2
+ jmp .end_transpose
+.h8:
+ lea r4d, [angleq+88]
+ and r4d, ~0x7f
+ or r4d, wd
+ cmp r4d, 8
+ ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
+ mova m2, [tlq-30] ; g f e d c b a 9
+ movu m1, [tlq-32] ; _ g f e d c b a
+ movu m3, [tlq-16] ; 9 8 7 6 5 4 3 2
+ paddw m3, [tlq-14] ; 8 7 6 5 4 3 2 1
+ pshufd m4, m2, q2100 ; _ _ g f e d c b
+ paddw m1, m2
+ movu m5, [tlq-28] ; f e d c b a 9 8
+ add dyd, dyd
+ cmp wd, 8
+ je .h8_upsample_w8
+ pshufhw m4, m2, q1000 ; _ _ _ _ c c c b
+.h8_upsample_w8:
+ paddw m4, m5
+ psubw m5, m1, m4
+ movu m4, [tlq-18] ; a 9 8 7 6 5 4 3
+ psraw m5, 3
+ paddw m1, m5
+ movu m5, [tlq-12] ; 7 6 5 4 3 2 1 0
+%if ARCH_X86_64
+ movd m6, r8m ; pixel_max
+%endif
+ paddw m4, m5
+ shl wd, 4
+ psubw m5, m3, m4
+ movd m4, dyd
+ psraw m5, 3
+ neg dyd
+ paddw m3, m5
+ pshufb m6, m0
+ mova m5, [tlq-14]
+ pshufb m4, m0
+ pxor m0, m0
+ pmaxsw m1, m0
+ pmaxsw m3, m0
+ mov tlq, rsp
+ pavgw m1, m0
+ pavgw m3, m0
+ sub rsp, wq
+ pminsw m1, m6
+ pminsw m6, m3
+ mova m3, [base+z_upsample]
+ lea r5d, [dyq+(16<<6)+63] ; ypos
+ punpcklwd m0, m1, m2
+ mova [tlq+16*0], m0
+ punpckhwd m1, m2
+ mova [tlq+16*1], m1
+ punpcklwd m0, m6, m5
+ mova [tlq+16*2], m0
+ punpckhwd m6, m5
+ mova [tlq+16*3], m6
+ mova m5, m4
+.h8_upsample_loop:
+ mov r4d, r5d
+ sar r4d, 6
+ movu m1, [tlq+r4*2+16*0]
+ movu m2, [tlq+r4*2+16*1]
+ add r5d, dyd
+ pshufb m2, m3
+ pshufb m1, m3
+ punpckhqdq m0, m1, m2
+ punpcklqdq m1, m2
+ pand m2, m7, m4
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m4, m5
+ paddw m0, m1
+ mova [rsp+wq-16], m0
+ sub wd, 16
+ jg .h8_upsample_loop
+ or r3d, 8*2
+ jmp .end_transpose
+.h8_no_upsample:
+ lea r4d, [wq+7]
+ movd m1, r4d
+ and r4d, 7
+ or r4d, 8 ; imin(w+7, 15)
+ test angled, 0x400
+ jnz .h8_main
+ movd m3, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m2, m2
+ pshufb m1, m2
+ pshufb m3, m2
+ movu m2, [base+z_filt_wh8]
+ psrldq m4, [base+z_filt_t_w48+angleq*8], 4
+ pcmpeqb m2, m1
+ pand m2, m3
+ pcmpgtb m2, m4
+ pmovmskb r5d, m2
+ test r5d, r5d
+ jz .h8_main ; filter_strength == 0
+ pshuflw m1, [tlq+2], q0000
+ imul r5d, 0x55555555
+ mova m2, [tlq-16*1+2]
+ neg r4
+ mova m3, [tlq-16*2+2]
+ shr r5d, 30
+ movd m4, [tlq+r4*2]
+ movd [rsp+16*17], m1
+ mova [rsp+16*16], m2
+ pshuflw m4, m4, q0000
+ mova [rsp+16*15], m3
+ lea r2, [r4-2]
+ movq [rsp+16*17+r4*2-10], m4
+ cmp wd, 16
+ cmovae r4, r2
+ lea tlq, [rsp+16*17-2]
+ call .filter_edge
+.h8_main:
+ sub tlq, r4
+ movd m4, dyd
+ sub tlq, r4
+ shl r4d, 6
+ movd m6, [tlq]
+ movd m3, r4d
+ pshufb m4, m0
+ neg dyq
+ pshufb m6, m0
+ lea r5, [dyq+r4+63]
+ pshufb m3, m0
+ shl wd, 4
+ mova m5, m4
+ sub rsp, wq
+ psubw m3, [base+z_base_inc_z2]
+.h8_loop:
+ mov r4, r5
+ sar r4, 6
+ movu m0, [tlq+r4*2-14]
+ movu m1, [tlq+r4*2-16]
+ pand m2, m7, m4
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ pcmpgtw m2, m3, m4
+ paddw m4, m5
+ paddw m0, m1
+ pand m0, m2
+ pandn m2, m6
+ por m0, m2
+ mova [rsp+wq-16], m0
+ sub wd, 8*2
+ jz .h8_transpose
+ add r5, dyq
+ jg .h8_loop
+.h8_end_loop:
+ mova [rsp+wq-16], m6
+ sub wd, 8*2
+ jg .h8_end_loop
+.h8_transpose:
+ or r3d, 8*2
+ jmp .end_transpose
+.h16:
+ lea r4d, [wq+15]
+ movd m1, r4d
+ and r4d, 15
+ or r4d, 16 ; imin(w+15, 31)
+ test angled, 0x400
+ jnz .h16_main
+ movd m3, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m2, m2
+ pshufb m1, m2
+ pshufb m3, m2
+ movq m4, [base+z_filt_t_w16+angleq*4]
+ pcmpeqb m1, [base+z_filt_wh16]
+ pand m1, m3
+ pcmpgtb m1, m4
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .h16_main ; filter_strength == 0
+ pshuflw m1, [tlq+2], q0000
+ mova m2, [tlq-16*1+2]
+ imul r5d, 0x24924924
+ mova m3, [tlq-16*2+2]
+ neg r4
+ mova m4, [tlq-16*3+2]
+ shr r5d, 30
+ mova m5, [tlq-16*4+2]
+ movd m6, [tlq+r4*2]
+ adc r5d, -1 ; filter_strength
+ movd [rsp+16*17], m1
+ mova [rsp+16*16], m2
+ mova [rsp+16*15], m3
+ pshuflw m6, m6, q0000
+ mova [rsp+16*14], m4
+ mova [rsp+16*13], m5
+ lea r2, [r4-2]
+ movq [rsp+16*17+r4*2-10], m6
+ cmp wd, 32
+ cmovae r4, r2
+ lea tlq, [rsp+16*17-2]
+ call .filter_edge
+.h16_main:
+ sub tlq, r4
+ movd m5, dyd
+ sub tlq, r4
+ shl r4d, 6
+ movd m6, [tlq]
+ movd m3, r4d
+ pshufb m5, m0
+ neg dyq
+ pshufb m6, m0
+ lea r5, [dyq+r4+63]
+ pshufb m3, m0
+ shl wd, 5
+ paddw m4, m5, [base+z_base_inc_z2]
+ sub rsp, wq
+ psubw m4, m3
+.h16_loop:
+ mov r4, r5
+ sar r4, 6
+ movu m0, [tlq+r4*2-14]
+ movu m2, [tlq+r4*2-16]
+ pand m3, m7, m4
+ psllw m3, 9
+ psubw m2, m0
+ pmulhrsw m2, m3
+ movu m1, [tlq+r4*2-30]
+ paddw m0, m2
+ movu m2, [tlq+r4*2-32]
+ psubw m2, m1
+ pmulhrsw m2, m3
+ movddup m3, [base+pw_m512]
+ paddw m1, m2
+ psraw m2, m4, 15
+ pcmpgtw m3, m4
+ paddw m4, m5
+ pand m0, m2
+ pandn m2, m6
+ pand m1, m3
+ pandn m3, m6
+ por m0, m2
+ mova [rsp+wq-16*1], m0
+ por m1, m3
+ mova [rsp+wq-16*2], m1
+ sub wd, 16*2
+ jz .h16_transpose
+ add r5, dyq
+ jg .h16_loop
+.h16_end_loop:
+ mova [rsp+wq-16*1], m6
+ mova [rsp+wq-16*2], m6
+ sub wd, 16*2
+ jg .h16_end_loop
+.h16_transpose:
+ or r3d, 16*2
+ jmp .end_transpose
+.h32:
+ lea r4d, [wq+31]
+ and r4d, 31
+ or r4d, 32 ; imin(w+31, 63)
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h32_main
+ call .filter_copy
+ lea r5, [r4-2]
+ cmp wd, 64
+ cmove r4, r5
+ call .filter_edge_s3
+.h32_main:
+ sub tlq, r4
+ movd m5, dyd
+ sub tlq, r4
+ shl r4d, 6
+ movd m6, [tlq]
+ movd m3, r4d
+ pshufb m5, m0
+ neg dyq
+ pshufb m6, m0
+ lea r5, [dyq+r4+63]
+ pshufb m3, m0
+ paddw m4, m5, [base+z_base_inc_z2]
+ psubw m4, m3
+.h32_loop:
+ mov r4, r5
+ sar r4, 6
+ movu m0, [tlq+r4*2-14]
+ movu m3, [tlq+r4*2-16]
+ pand m2, m7, m4
+ psllw m2, 9
+ psubw m3, m0
+ pmulhrsw m3, m2
+ movu m1, [tlq+r4*2-30]
+ paddw m0, m3
+ movu m3, [tlq+r4*2-32]
+ psubw m3, m1
+ pmulhrsw m3, m2
+ sub rsp, 16*4
+ paddw m1, m3
+ psraw m3, m4, 15
+ pand m0, m3
+ pandn m3, m6
+ por m0, m3
+ movddup m3, [base+pw_m512]
+ pcmpgtw m3, m4
+ pand m1, m3
+ pandn m3, m6
+ mova [rsp+16*3], m0
+ por m1, m3
+ mova [rsp+16*2], m1
+ movu m0, [tlq+r4*2-46]
+ movu m3, [tlq+r4*2-48]
+ psubw m3, m0
+ pmulhrsw m3, m2
+ movu m1, [tlq+r4*2-62]
+ paddw m0, m3
+ movu m3, [tlq+r4*2-64]
+ psubw m3, m1
+ pmulhrsw m3, m2
+ movddup m2, [base+pw_m1024]
+ paddw m1, m3
+ movddup m3, [base+pw_m1536]
+ pcmpgtw m2, m4
+ pcmpgtw m3, m4
+ paddw m4, m5
+ pand m0, m2
+ pandn m2, m6
+ pand m1, m3
+ pandn m3, m6
+ por m0, m2
+ mova [rsp+16*1], m0
+ por m1, m3
+ mova [rsp+16*0], m1
+ dec wd
+ jz .h32_transpose
+ add r5, dyq
+ jg .h32_loop
+.h32_end_loop:
+ sub rsp, 16*4
+ REPX {mova [rsp+16*x], m6}, 3, 2, 1, 0
+ dec wd
+ jg .h32_end_loop
+.h32_transpose:
+ or r3d, 32*2
+ jmp .end_transpose
+.h64:
+ lea r4d, [wq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h64_main
+ call .filter_copy
+ call .filter_edge_s3
+.h64_main:
+ sub tlq, r4
+ movd m5, dyd
+ sub tlq, r4
+ shl r4d, 6
+ movd m6, [tlq]
+ movd m3, r4d
+ pshufb m5, m0
+ neg dyq
+ pshufb m6, m0
+ lea r5, [dyq+r4+63]
+ pshufb m3, m0
+ paddw m4, m5, [base+z_base_inc_z2]
+ psubw m4, m3
+.h64_loop:
+ mov r4, r5
+ sar r4, 6
+ movu m0, [tlq+r4*2- 14]
+ movu m3, [tlq+r4*2- 16]
+ pand m2, m7, m4
+ psllw m2, 9
+ psubw m3, m0
+ pmulhrsw m3, m2
+ movu m1, [tlq+r4*2- 30]
+ paddw m0, m3
+ movu m3, [tlq+r4*2- 32]
+ psubw m3, m1
+ pmulhrsw m3, m2
+ sub rsp, 16*8
+ paddw m1, m3
+ psraw m3, m4, 15
+ pand m0, m3
+ pandn m3, m6
+ por m0, m3
+ movddup m3, [base+pw_m512]
+ pcmpgtw m3, m4
+ pand m1, m3
+ pandn m3, m6
+ mova [rsp+16*7], m0
+ por m1, m3
+ mova [rsp+16*6], m1
+ movu m0, [tlq+r4*2- 46]
+ movu m3, [tlq+r4*2- 48]
+ psubw m3, m0
+ pmulhrsw m3, m2
+ movu m1, [tlq+r4*2- 62]
+ paddw m0, m3
+ movu m3, [tlq+r4*2- 64]
+ psubw m3, m1
+ pmulhrsw m3, m2
+ paddw m1, m3
+ movddup m3, [base+pw_m1024]
+ pcmpgtw m3, m4
+ pand m0, m3
+ pandn m3, m6
+ por m0, m3
+ movddup m3, [base+pw_m1536]
+ pcmpgtw m3, m4
+ pand m1, m3
+ pandn m3, m6
+ mova [rsp+16*5], m0
+ por m1, m3
+ mova [rsp+16*4], m1
+ movu m0, [tlq+r4*2- 78]
+ movu m3, [tlq+r4*2- 80]
+ psubw m3, m0
+ pmulhrsw m3, m2
+ movu m1, [tlq+r4*2- 94]
+ paddw m0, m3
+ movu m3, [tlq+r4*2- 96]
+ psubw m3, m1
+ pmulhrsw m3, m2
+ paddw m1, m3
+ movddup m3, [base+pw_m2048]
+ pcmpgtw m3, m4
+ pand m0, m3
+ pandn m3, m6
+ por m0, m3
+ movddup m3, [base+pw_m2560]
+ pcmpgtw m3, m4
+ pand m1, m3
+ pandn m3, m6
+ mova [rsp+16*3], m0
+ por m1, m3
+ mova [rsp+16*2], m1
+ movu m0, [tlq+r4*2-110]
+ movu m3, [tlq+r4*2-112]
+ psubw m3, m0
+ pmulhrsw m3, m2
+ movu m1, [tlq+r4*2-126]
+ paddw m0, m3
+ movu m3, [tlq+r4*2-128]
+ psubw m3, m1
+ pmulhrsw m3, m2
+ movddup m2, [base+pw_m3072]
+ paddw m1, m3
+ movddup m3, [base+pw_m3584]
+ pcmpgtw m2, m4
+ pcmpgtw m3, m4
+ paddw m4, m5
+ pand m0, m2
+ pandn m2, m6
+ pand m1, m3
+ pandn m3, m6
+ por m0, m2
+ mova [rsp+16*1], m0
+ por m1, m3
+ mova [rsp+16*0], m1
+ dec wd
+ jz .h64_transpose
+ add r5, dyq
+ jg .h64_loop
+.h64_end_loop:
+ sub rsp, 16*8
+ REPX {mova [rsp+16*x], m6}, 7, 6, 5, 4, 3, 2, 1, 0
+ dec wd
+ jg .h64_end_loop
+.h64_transpose:
+ add r3d, 64*2
+.end_transpose:
+%if ARCH_X86_64
+ lea r7, [strideq*3]
+%else
+ mov strideq, [dstq+4*0]
+ mov org_wd, [dstq+4*1]
+%endif
+ lea r4d, [r3*3]
+.end_transpose_loop:
+ lea r2, [rsp+r3-8]
+ lea r6, [dstq+org_wq*2-8]
+.end_transpose_loop_y:
+ movq m0, [r2+r4 ]
+ movq m1, [r2+r3*2]
+ movq m2, [r2+r3*1]
+ movq m3, [r2+r3*0]
+ sub r2, 8
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ movhps [r6+strideq*0], m1
+ movq [r6+strideq*1], m1
+%if ARCH_X86_64
+ movhps [r6+strideq*2], m0
+ movq [r6+r7 ], m0
+ lea r6, [r6+strideq*4]
+%else
+ lea r6, [r6+strideq*2]
+ movhps [r6+strideq*0], m0
+ movq [r6+strideq*1], m0
+ lea r6, [r6+strideq*2]
+%endif
+ cmp r2, rsp
+ jae .end_transpose_loop_y
+ lea rsp, [rsp+r3*4]
+ sub org_wd, 4
+ jg .end_transpose_loop
+ RET
+.filter_copy:
+ neg r4
+ pshuflw m2, [tlq+2], q0000
+ xor r5d, r5d
+ pshuflw m3, [tlq+r4*2], q0000
+ movq [rsp+gprsize+16*17], m2
+.filter_copy_loop:
+ mova m1, [tlq+r5*2-16*1+2]
+ mova m2, [tlq+r5*2-16*2+2]
+ sub r5, 16
+ mova [rsp+r5*2+gprsize+16*18], m1
+ mova [rsp+r5*2+gprsize+16*17], m2
+ cmp r5d, r4d
+ jg .filter_copy_loop
+ lea tlq, [rsp+gprsize+16*17-2]
+ movq [tlq+r4*2-8], m3
+ ret
+.filter_edge:
+ cmp r5d, 3
+ je .filter_edge_s3
+ movddup m4, [base+z_filt_k+r5*8-8]
+ movddup m5, [base+z_filt_k+r5*8+8]
+ xor r5d, r5d
+ movddup m6, [base+pw_8]
+ movu m2, [tlq-12]
+ jmp .filter_edge_start
+.filter_edge_loop:
+ movu m2, [tlq+r5*2-12]
+ mova [tlq+r5*2+2], m1
+.filter_edge_start:
+ pmullw m1, m4, [tlq+r5*2-14]
+ movu m3, [tlq+r5*2-16]
+ sub r5, 8
+ paddw m2, m3
+ pmullw m2, m5
+ paddw m1, m6
+ paddw m1, m2
+ psrlw m1, 4
+ cmp r5d, r4d
+ jg .filter_edge_loop
+ mova [tlq+r5*2+2], m1
+ neg r4d
+ ret
+.filter_edge_s3:
+ movddup m5, [base+pw_3]
+ xor r5d, r5d
+ movu m2, [tlq-12]
+ movu m3, [tlq-10]
+ jmp .filter_edge_s3_start
+.filter_edge_s3_loop:
+ movu m2, [tlq+r5*2-12]
+ movu m3, [tlq+r5*2-10]
+ mova [tlq+r5*2+2], m1
+.filter_edge_s3_start:
+ paddw m2, [tlq+r5*2-14]
+ paddw m3, m5
+ movu m1, [tlq+r5*2-16]
+ movu m4, [tlq+r5*2-18]
+ sub r5, 8
+ paddw m1, m2
+ pavgw m3, m4
+ paddw m1, m3
+ psrlw m1, 2
+ cmp r5d, r4d
+ jg .filter_edge_s3_loop
+ mova [tlq+r5*2+2], m1
+ neg r4d
+ ret
+
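+; Filter intra prediction: the block is processed in 4x2 sub-blocks, each
+; predicted from 7 neighbouring pixels (top-left, four above, two to the
+; left) using one of the signed tap sets in filter_intra_taps, then rounded
+; and clamped to the pixel range.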
+%if ARCH_X86_64
+cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter
+%else
+cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter
+%define m8 [esp+16*0]
+%define m9 [esp+16*1]
+%define m10 [esp+16*2]
+%define m11 [esp+16*3]
+%define m12 [esp+16*4]
+%define m13 [esp+16*5]
+%define m14 [esp+16*6]
+%define m15 [esp+16*7]
+%endif
+%define base r6-$$
+ movifnidn hd, hm
+ movd m6, r8m ; bitdepth_max
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ LEA r6, $$
+ shl filterd, 6
+ movu m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3
+ mova m1, [base+filter_intra_taps+filterq+16*0]
+ mova m2, [base+filter_intra_taps+filterq+16*1]
+ mova m3, [base+filter_intra_taps+filterq+16*2]
+ mova m4, [base+filter_intra_taps+filterq+16*3]
+ pxor m5, m5
+%if ARCH_X86_64
+ punpcklbw m8, m5, m1 ; place 8-bit coefficients in the upper
+ punpckhbw m9, m5, m1 ; half of each 16-bit word to avoid
+ punpcklbw m10, m5, m2 ; having to perform sign-extension.
+ punpckhbw m11, m5, m2
+ punpcklbw m12, m5, m3
+ punpckhbw m13, m5, m3
+ punpcklbw m14, m5, m4
+ punpckhbw m15, m5, m4
+%else
+ punpcklbw m7, m5, m1
+ mova m8, m7
+ punpckhbw m7, m5, m1
+ mova m9, m7
+ punpcklbw m7, m5, m2
+ mova m10, m7
+ punpckhbw m7, m5, m2
+ mova m11, m7
+ punpcklbw m7, m5, m3
+ mova m12, m7
+ punpckhbw m7, m5, m3
+ mova m13, m7
+ punpcklbw m7, m5, m4
+ mova m14, m7
+ punpckhbw m7, m5, m4
+ mova m15, m7
+%endif
+ mova m7, [base+filter_shuf]
+ add hd, hd
+ mov r5, dstq
+ pshuflw m6, m6, q0000
+ mov r6, tlq
+ punpcklqdq m6, m6
+ sub tlq, hq
+.left_loop:
+ pshufb m0, m7 ; tl t0 t1 t2 t3 l0 l1 __
+ pshufd m1, m0, q0000
+ pmaddwd m2, m8, m1
+ pmaddwd m1, m9
+ pshufd m4, m0, q1111
+ pmaddwd m3, m10, m4
+ pmaddwd m4, m11
+ paddd m2, m3
+ paddd m1, m4
+ pshufd m4, m0, q2222
+ pmaddwd m3, m12, m4
+ pmaddwd m4, m13
+ paddd m2, m3
+ paddd m1, m4
+ pshufd m3, m0, q3333
+ pmaddwd m0, m14, m3
+ pmaddwd m3, m15
+ paddd m0, m2
+ paddd m1, m3
+ psrad m0, 11 ; x >> 3
+ psrad m1, 11
+ packssdw m0, m1
+ pmaxsw m0, m5
+ pavgw m0, m5 ; (x + 8) >> 4
+ pminsw m0, m6
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movlps m0, [tlq+hq-10]
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2*2
+ jg .left_loop
+ sub wd, 4
+ jz .end
+ sub tld, r6d ; -h*2
+ sub r6, r5 ; tl-dst
+.right_loop0:
+ add r5, 8
+ mov hd, tld
+ movu m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __
+ mov dstq, r5
+.right_loop:
+ pshufd m2, m0, q0000
+ pmaddwd m1, m8, m2
+ pmaddwd m2, m9
+ pshufd m4, m0, q1111
+ pmaddwd m3, m10, m4
+ pmaddwd m4, m11
+ pinsrw m0, [dstq+strideq*0-2], 5
+ paddd m1, m3
+ paddd m2, m4
+ pshufd m0, m0, q2222
+ movddup m4, [dstq+strideq*1-8]
+ pmaddwd m3, m12, m0
+ pmaddwd m0, m13
+ paddd m1, m3
+ paddd m0, m2
+ pshuflw m2, m4, q3333
+ punpcklwd m2, m5
+ pmaddwd m3, m14, m2
+ pmaddwd m2, m15
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 11
+ psrad m0, 11
+ packssdw m0, m1
+ pmaxsw m0, m5
+ pavgw m0, m5
+ pminsw m0, m6
+ movhps [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ palignr m0, m4, 14
+ lea dstq, [dstq+strideq*2]
+ add hd, 2*2
+ jl .right_loop
+ sub wd, 4
+ jg .right_loop0
+.end:
+ RET
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
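+; Chroma-from-luma DC variants: ipred_cfl_top averages the top row and
+; ipred_cfl_left the left column; both then fall through to the shared
+; splat code that adds the alpha-scaled AC values.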
+cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac
+ LEA t0, ipred_cfl_left_16bpc_ssse3_table
+ movd m4, wd
+ tzcnt wd, wd
+ movifnidn hd, hm
+ add tlq, 2
+ movsxd r6, [t0+wq*4]
+ movd m5, wd
+ jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start)
+
+cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ LEA t0, ipred_cfl_left_16bpc_ssse3_table
+ tzcnt wd, wm
+ lea r6d, [hq*2]
+ movd m4, hd
+ sub tlq, r6
+ tzcnt r6d, hd
+ movd m5, r6d
+ movsxd r6, [t0+r6*4]
+.start:
+ movd m7, r7m
+ movu m0, [tlq]
+ add r6, t0
+ add t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table
+ movsxd wq, [t0+wq*4]
+ pxor m6, m6
+ pshuflw m7, m7, q0000
+ pcmpeqw m3, m3
+ add wq, t0
+ movifnidn acq, acmp
+ pavgw m4, m6
+ punpcklqdq m7, m7
+ jmp r6
+.h32:
+ movu m1, [tlq+48]
+ movu m2, [tlq+32]
+ paddw m0, m1
+ paddw m0, m2
+.h16:
+ movu m1, [tlq+16]
+ paddw m0, m1
+.h8:
+ pshufd m1, m0, q1032
+ paddw m0, m1
+.h4:
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshuflw m0, m4, q1032
+ paddd m0, m4
+ psrld m0, m5
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ jmp wq
+
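+; dst = clip_pixel(dc + ((alpha * ac + 32) >> 6)); the multiply is done on
+; absolute values via pmulhrsw and the sign is restored with psignw.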
+%macro IPRED_CFL 2 ; dst, src
+ pabsw m%1, m%2
+ pmulhrsw m%1, m2
+ psignw m%2, m1
+ psignw m%1, m%2
+ paddw m%1, m0
+ pmaxsw m%1, m6
+ pminsw m%1, m7
+%endmacro
+
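+; Chroma-from-luma prediction: the DC is the average of the top row and left
+; column (using a reciprocal multiply when w+h is not a power of two), after
+; which each pixel adds its scaled AC contribution via IPRED_CFL.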
+cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ tzcnt r6d, hd
+ lea t0d, [wq+hq]
+ movd m4, t0d
+ tzcnt t0d, t0d
+ movd m5, t0d
+ LEA t0, ipred_cfl_16bpc_ssse3_table
+ tzcnt wd, wd
+ movd m7, r7m
+ movsxd r6, [t0+r6*4]
+ movsxd wq, [t0+wq*4+4*4]
+ psrlw m4, 1
+ pxor m6, m6
+ pshuflw m7, m7, q0000
+ add r6, t0
+ add wq, t0
+ movifnidn acq, acmp
+ pcmpeqw m3, m3
+ punpcklqdq m7, m7
+ jmp r6
+.h4:
+ movq m0, [tlq-8]
+ jmp wq
+.w4:
+ movq m1, [tlq+2]
+ paddw m0, m1
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshufd m0, m4, q1032
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ cmp hd, 4
+ jg .w4_mul
+ psrld m0, 3
+ jmp .w4_end
+.w4_mul:
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 16
+ cmove r6d, r2d
+ movd m1, r6d
+ psrld m0, 2
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w4_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s4:
+ movd m1, alpham
+ lea r6, [strideq*3]
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s4_loop:
+ mova m4, [acq+16*0]
+ mova m5, [acq+16*1]
+ add acq, 16*2
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ movq [dstq+strideq*0], m3
+ movhps [dstq+strideq*1], m3
+ movq [dstq+strideq*2], m4
+ movhps [dstq+r6 ], m4
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4_loop
+ RET
+.h8:
+ mova m0, [tlq-16]
+ jmp wq
+.w8:
+ movu m1, [tlq+2]
+ paddw m0, m1
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshufd m0, m4, q1032
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ psrld m0, m5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 32
+ cmove r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w8_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s8:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s8_loop:
+ mova m4, [acq+16*0]
+ mova m5, [acq+16*1]
+ add acq, 16*2
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ mova [dstq+strideq*0], m3
+ mova [dstq+strideq*1], m4
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s8_loop
+ RET
+.h16:
+ mova m0, [tlq-32]
+ paddw m0, [tlq-16]
+ jmp wq
+.w16:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+18]
+ paddw m1, m2
+ paddw m0, m1
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshufd m0, m4, q1032
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ psrld m0, m5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ test hd, 8|32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w16_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s16:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s16_loop:
+ mova m4, [acq+16*0]
+ mova m5, [acq+16*1]
+ add acq, 16*2
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ mova [dstq+16*0], m3
+ mova [dstq+16*1], m4
+ add dstq, strideq
+ dec hd
+ jg .s16_loop
+ RET
+.h32:
+ mova m0, [tlq-64]
+ paddw m0, [tlq-48]
+ paddw m0, [tlq-32]
+ paddw m0, [tlq-16]
+ jmp wq
+.w32:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+18]
+ paddw m1, m2
+ movu m2, [tlq+34]
+ paddw m1, m2
+ movu m2, [tlq+50]
+ paddw m1, m2
+ paddw m0, m1
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshufd m0, m4, q1032
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ psrld m0, m5
+ cmp hd, 32
+ je .w32_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 8
+ cmove r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w32_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s32:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s32_loop:
+ mova m4, [acq+16*0]
+ mova m5, [acq+16*1]
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ mova [dstq+16*0], m3
+ mova [dstq+16*1], m4
+ mova m4, [acq+16*2]
+ mova m5, [acq+16*3]
+ add acq, 16*4
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ mova [dstq+16*2], m3
+ mova [dstq+16*3], m4
+ add dstq, strideq
+ dec hd
+ jg .s32_loop
+ RET
+
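+; Chroma-from-luma with a fixed DC of half the pixel range, selected from
+; the pw_512 table according to bitdepth_max.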
+cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac
+ tzcnt wd, wm
+ LEA t0, ipred_cfl_splat_16bpc_ssse3_table
+ mov r6d, r7m
+ movifnidn hd, hm
+ shr r6d, 11
+ movd m7, r7m
+ movsxd wq, [t0+wq*4]
+ movddup m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8]
+ pshuflw m7, m7, q0000
+ pxor m6, m6
+ add wq, t0
+ movifnidn acq, acmp
+ punpcklqdq m7, m7
+ jmp wq
+
+cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ movifnidn hpadd, hpadm
+%if ARCH_X86_32 && PIC
+ pcmpeqw m5, m5
+ pabsw m5, m5
+ paddw m5, m5
+%else
+ movddup m5, [pw_2]
+%endif
+ mov hd, hm
+ shl hpadd, 2
+ pxor m4, m4
+ sub hd, hpadd
+ cmp dword wm, 8
+ mov r5, acq
+ jg .w16
+ je .w8
+ lea r3, [strideq*3]
+.w4_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m1, m5, [ypxq+strideq*1]
+ pmaddwd m2, m5, [ypxq+strideq*2]
+ pmaddwd m3, m5, [ypxq+r3 ]
+ lea ypxq, [ypxq+strideq*4]
+ paddd m0, m1
+ paddd m2, m3
+ paddd m4, m0
+ packssdw m0, m2
+ paddd m4, m2
+ mova [acq], m0
+ add acq, 16
+ sub hd, 2
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .dc
+ punpckhqdq m0, m0
+ pslld m2, 2
+.w4_hpad:
+ mova [acq+16*0], m0
+ paddd m4, m2
+ mova [acq+16*1], m0
+ add acq, 16*2
+ sub hpadd, 4
+ jg .w4_hpad
+ jmp .dc
+.w8:
+%if ARCH_X86_32
+ cmp dword wpadm, 0
+%else
+ test wpadd, wpadd
+%endif
+ jnz .w8_wpad1
+.w8_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0+16*0]
+ pmaddwd m2, m5, [ypxq+strideq*1+16*0]
+ pmaddwd m1, m5, [ypxq+strideq*0+16*1]
+ pmaddwd m3, m5, [ypxq+strideq*1+16*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m0, m2
+ paddd m1, m3
+ paddd m2, m0, m1
+ packssdw m0, m1
+ paddd m4, m2
+ mova [acq], m0
+ add acq, 16
+ dec hd
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz .dc
+ pslld m2, 2
+ mova m1, m0
+ jmp .hpad
+.w8_wpad1:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m1, m5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m0, m1
+ pshufd m1, m0, q3333
+ paddd m2, m0, m1
+ packssdw m0, m1
+ paddd m4, m2
+ mova [acq], m0
+ add acq, 16
+ dec hd
+ jg .w8_wpad1
+ jmp .w8_hpad
+.w16_wpad3:
+ pshufd m3, m0, q3333
+ mova m1, m3
+ mova m2, m3
+ jmp .w16_wpad_end
+.w16_wpad2:
+ pshufd m1, m3, q3333
+ mova m2, m1
+ jmp .w16_wpad_end
+.w16_wpad1:
+ pshufd m2, m1, q3333
+ jmp .w16_wpad_end
+.w16:
+ movifnidn wpadd, wpadm
+ WIN64_SPILL_XMM 7
+.w16_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0+16*0]
+ pmaddwd m6, m5, [ypxq+strideq*1+16*0]
+ paddd m0, m6
+ cmp wpadd, 2
+ jg .w16_wpad3
+ pmaddwd m3, m5, [ypxq+strideq*0+16*1]
+ pmaddwd m6, m5, [ypxq+strideq*1+16*1]
+ paddd m3, m6
+ je .w16_wpad2
+ pmaddwd m1, m5, [ypxq+strideq*0+16*2]
+ pmaddwd m6, m5, [ypxq+strideq*1+16*2]
+ paddd m1, m6
+ jp .w16_wpad1
+ pmaddwd m2, m5, [ypxq+strideq*0+16*3]
+ pmaddwd m6, m5, [ypxq+strideq*1+16*3]
+ paddd m2, m6
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ paddd m6, m0, m3
+ packssdw m0, m3
+ paddd m6, m1
+ mova [acq+16*0], m0
+ packssdw m1, m2
+ paddd m2, m6
+ mova [acq+16*1], m1
+ add acq, 16*2
+ paddd m4, m2
+ dec hd
+ jg .w16_loop
+ WIN64_RESTORE_XMM
+ add hpadd, hpadd
+ jz .dc
+ paddd m2, m2
+.hpad:
+ mova [acq+16*0], m0
+ mova [acq+16*1], m1
+ paddd m4, m2
+ mova [acq+16*2], m0
+ mova [acq+16*3], m1
+ add acq, 16*4
+ sub hpadd, 4
+ jg .hpad
+.dc:
+ sub r5, acq ; -w*h*2
+ pshufd m2, m4, q1032
+ tzcnt r1d, r5d
+ paddd m2, m4
+ sub r1d, 2
+ pshufd m4, m2, q2301
+ movd m0, r1d
+ paddd m2, m4
+ psrld m2, m0
+ pxor m0, m0
+ pavgw m2, m0
+ packssdw m2, m2
+.dc_loop:
+ mova m0, [acq+r5+16*0]
+ mova m1, [acq+r5+16*1]
+ psubw m0, m2
+ psubw m1, m2
+ mova [acq+r5+16*0], m0
+ mova [acq+r5+16*1], m1
+ add r5, 16*2
+ jl .dc_loop
+ RET
+
+cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ movifnidn hpadd, hpadm
+%if ARCH_X86_32 && PIC
+ pcmpeqw m5, m5
+ pabsw m5, m5
+ psllw m5, 2
+%else
+ movddup m5, [pw_4]
+%endif
+ mov hd, hm
+ shl hpadd, 2
+ pxor m4, m4
+ sub hd, hpadd
+ cmp dword wm, 8
+ mov r5, acq
+ jg .w16
+ je .w8
+ lea r3, [strideq*3]
+.w4_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m3, m5, [ypxq+strideq*1]
+ pmaddwd m1, m5, [ypxq+strideq*2]
+ pmaddwd m2, m5, [ypxq+r3 ]
+ lea ypxq, [ypxq+strideq*4]
+ paddd m4, m0
+ packssdw m0, m3
+ paddd m3, m1
+ packssdw m1, m2
+ paddd m4, m2
+ paddd m4, m3
+ mova [acq+16*0], m0
+ mova [acq+16*1], m1
+ add acq, 16*2
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ punpckhqdq m1, m1
+ pslld m2, 3
+ mova [acq+16*0], m1
+ mova [acq+16*1], m1
+ paddd m4, m2
+ mova [acq+16*2], m1
+ mova [acq+16*3], m1
+ add acq, 16*4
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+.w8:
+%if ARCH_X86_32
+ cmp dword wpadm, 0
+%else
+ test wpadd, wpadd
+%endif
+ jnz .w8_wpad1
+.w8_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0+16*0]
+ pmaddwd m2, m5, [ypxq+strideq*0+16*1]
+ pmaddwd m1, m5, [ypxq+strideq*1+16*0]
+ pmaddwd m3, m5, [ypxq+strideq*1+16*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m4, m0
+ packssdw m0, m2
+ paddd m4, m2
+ mova [acq+16*0], m0
+ paddd m2, m1, m3
+ packssdw m1, m3
+ paddd m4, m2
+ mova [acq+16*1], m1
+ add acq, 16*2
+ sub hd, 2
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ pslld m2, 2
+ mova m0, m1
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
+.w8_wpad1:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m1, m5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ pshufd m2, m0, q3333
+ pshufd m3, m1, q3333
+ paddd m4, m0
+ packssdw m0, m2
+ paddd m4, m2
+ paddd m2, m1, m3
+ packssdw m1, m3
+ paddd m4, m2
+ mova [acq+16*0], m0
+ mova [acq+16*1], m1
+ add acq, 16*2
+ sub hd, 2
+ jg .w8_wpad1
+ jmp .w8_hpad
+.w16_wpad3:
+ pshufd m3, m0, q3333
+ mova m1, m3
+ mova m2, m3
+ jmp .w16_wpad_end
+.w16_wpad2:
+ pshufd m1, m3, q3333
+ mova m2, m1
+ jmp .w16_wpad_end
+.w16_wpad1:
+ pshufd m2, m1, q3333
+ jmp .w16_wpad_end
+.w16:
+ movifnidn wpadd, wpadm
+ WIN64_SPILL_XMM 7
+.w16_loop:
+ pmaddwd m0, m5, [ypxq+16*0]
+ cmp wpadd, 2
+ jg .w16_wpad3
+ pmaddwd m3, m5, [ypxq+16*1]
+ je .w16_wpad2
+ pmaddwd m1, m5, [ypxq+16*2]
+ jp .w16_wpad1
+ pmaddwd m2, m5, [ypxq+16*3]
+.w16_wpad_end:
+ add ypxq, strideq
+ paddd m6, m0, m3
+ packssdw m0, m3
+ mova [acq+16*0], m0
+ paddd m6, m1
+ packssdw m1, m2
+ paddd m2, m6
+ mova [acq+16*1], m1
+ add acq, 16*2
+ paddd m4, m2
+ dec hd
+ jg .w16_loop
+ WIN64_RESTORE_XMM
+ add hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ paddd m2, m2
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
+
+cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+%define base r6-ipred_cfl_ac_444_16bpc_ssse3_table
+ LEA r6, ipred_cfl_ac_444_16bpc_ssse3_table
+ tzcnt wd, wm
+ movifnidn hpadd, hpadm
+ pxor m4, m4
+ movsxd wq, [r6+wq*4]
+ movddup m5, [base+pw_1]
+ add wq, r6
+ mov hd, hm
+ shl hpadd, 2
+ sub hd, hpadd
+ jmp wq
+.w4:
+ lea r3, [strideq*3]
+ mov r5, acq
+.w4_loop:
+ movq m0, [ypxq+strideq*0]
+ movhps m0, [ypxq+strideq*1]
+ movq m1, [ypxq+strideq*2]
+ movhps m1, [ypxq+r3 ]
+ lea ypxq, [ypxq+strideq*4]
+ psllw m0, 3
+ psllw m1, 3
+ mova [acq+16*0], m0
+ pmaddwd m0, m5
+ mova [acq+16*1], m1
+ pmaddwd m2, m5, m1
+ add acq, 16*2
+ paddd m4, m0
+ paddd m4, m2
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ punpckhqdq m1, m1
+ mova [acq+16*0], m1
+ pslld m2, 2
+ mova [acq+16*1], m1
+ punpckhqdq m2, m2
+ mova [acq+16*2], m1
+ paddd m4, m2
+ mova [acq+16*3], m1
+ add acq, 16*4
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+.w8:
+ mov r5, acq
+.w8_loop:
+ mova m0, [ypxq+strideq*0]
+ mova m1, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ psllw m0, 3
+ psllw m1, 3
+ mova [acq+16*0], m0
+ pmaddwd m0, m5
+ mova [acq+16*1], m1
+ pmaddwd m2, m5, m1
+ add acq, 16*2
+ paddd m4, m0
+ paddd m4, m2
+ sub hd, 2
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ pslld m2, 2
+ mova m0, m1
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
+.w16_wpad2:
+ pshufhw m3, m2, q3333
+ pshufhw m1, m0, q3333
+ punpckhqdq m3, m3
+ punpckhqdq m1, m1
+ jmp .w16_wpad_end
+.w16:
+ movifnidn wpadd, wpadm
+ mov r5, acq
+.w16_loop:
+ mova m2, [ypxq+strideq*0+16*0]
+ mova m0, [ypxq+strideq*1+16*0]
+ psllw m2, 3
+ psllw m0, 3
+ test wpadd, wpadd
+ jnz .w16_wpad2
+ mova m3, [ypxq+strideq*0+16*1]
+ mova m1, [ypxq+strideq*1+16*1]
+ psllw m3, 3
+ psllw m1, 3
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ mova [acq+16*0], m2
+ pmaddwd m2, m5
+ mova [acq+16*1], m3
+ pmaddwd m3, m5
+ paddd m4, m2
+ pmaddwd m2, m5, m0
+ mova [acq+16*2], m0
+ paddd m4, m3
+ pmaddwd m3, m5, m1
+ mova [acq+16*3], m1
+ add acq, 16*4
+ paddd m2, m3
+ paddd m4, m2
+ sub hd, 2
+ jg .w16_loop
+ add hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ paddd m2, m2
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
+.w32_wpad6:
+ pshufhw m1, m0, q3333
+ punpckhqdq m1, m1
+ mova m2, m1
+ mova m3, m1
+ jmp .w32_wpad_end
+.w32_wpad4:
+ pshufhw m2, m1, q3333
+ punpckhqdq m2, m2
+ mova m3, m2
+ jmp .w32_wpad_end
+.w32_wpad2:
+ pshufhw m3, m2, q3333
+ punpckhqdq m3, m3
+ jmp .w32_wpad_end
+.w32:
+ movifnidn wpadd, wpadm
+ mov r5, acq
+ WIN64_SPILL_XMM 8
+.w32_loop:
+ mova m0, [ypxq+16*0]
+ psllw m0, 3
+ cmp wpadd, 4
+ jg .w32_wpad6
+ mova m1, [ypxq+16*1]
+ psllw m1, 3
+ je .w32_wpad4
+ mova m2, [ypxq+16*2]
+ psllw m2, 3
+ jnp .w32_wpad2
+ mova m3, [ypxq+16*3]
+ psllw m3, 3
+.w32_wpad_end:
+ add ypxq, strideq
+ pmaddwd m6, m5, m0
+ mova [acq+16*0], m0
+ pmaddwd m7, m5, m1
+ mova [acq+16*1], m1
+ paddd m6, m7
+ pmaddwd m7, m5, m2
+ mova [acq+16*2], m2
+ paddd m6, m7
+ pmaddwd m7, m5, m3
+ mova [acq+16*3], m3
+ add acq, 16*4
+ paddd m6, m7
+ paddd m4, m6
+ dec hd
+ jg .w32_loop
+%if WIN64
+ mova m5, m6
+ WIN64_RESTORE_XMM
+ SWAP 5, 6
+%endif
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+.w32_hpad_loop:
+ mova [acq+16*0], m0
+ mova [acq+16*1], m1
+ paddd m4, m6
+ mova [acq+16*2], m2
+ mova [acq+16*3], m3
+ add acq, 16*4
+ dec hpadd
+ jg .w32_hpad_loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+
+cglobal pal_pred_16bpc, 4, 5, 6, dst, stride, pal, idx, w, h
+%define base r2-pal_pred_16bpc_ssse3_table
+%if ARCH_X86_32
+ %define hd r2d
+%endif
+ mova m4, [palq]
+ LEA r2, pal_pred_16bpc_ssse3_table
+ tzcnt wd, wm
+ pshufb m4, [base+pal_pred_shuf]
+ movsxd wq, [r2+wq*4]
+ pshufd m5, m4, q1032
+ add wq, r2
+ movifnidn hd, hm
+ jmp wq
+.w4:
+ movq m0, [idxq]
+ add idxq, 8
+ psrlw m1, m0, 4
+ punpcklbw m0, m1
+ pshufb m1, m4, m0
+ pshufb m2, m5, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ movu m3, [idxq]
+ add idxq, 16
+ psrlw m1, m3, 4
+ punpcklbw m0, m3, m1
+ punpckhbw m3, m1
+ pshufb m1, m4, m0
+ pshufb m2, m5, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ pshufb m1, m4, m3
+ pshufb m2, m5, m3
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ movu m3, [idxq]
+ add idxq, 16
+ psrlw m1, m3, 4
+ punpcklbw m0, m3, m1
+ punpckhbw m3, m1
+ pshufb m1, m4, m0
+ pshufb m2, m5, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+ pshufb m1, m4, m3
+ pshufb m2, m5, m3
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq+ 0], m0
+ mova [dstq+strideq+16], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16
+ RET
+.w32:
+ movu m3, [idxq]
+ add idxq, 16
+ psrlw m1, m3, 4
+ punpcklbw m0, m3, m1
+ punpckhbw m3, m1
+ pshufb m1, m4, m0
+ pshufb m2, m5, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ pshufb m1, m4, m3
+ pshufb m2, m5, m3
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ add dstq, strideq
+ dec hd
+ jg .w32
+ RET
+.w64:
+ movu m3, [idxq+16*0]
+ psrlw m1, m3, 4
+ punpcklbw m0, m3, m1
+ punpckhbw m3, m1
+ pshufb m1, m4, m0
+ pshufb m2, m5, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ pshufb m1, m4, m3
+ pshufb m2, m5, m3
+ movu m3, [idxq+16*1]
+ add idxq, 32
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ psrlw m1, m3, 4
+ punpcklbw m0, m3, m1
+ punpckhbw m3, m1
+ pshufb m1, m4, m0
+ pshufb m2, m5, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ pshufb m1, m4, m3
+ pshufb m2, m5, m3
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m1
+ add dstq, strideq
+ dec hd
+ jg .w64
+ RET
diff --git a/third_party/dav1d/src/x86/ipred_avx2.asm b/third_party/dav1d/src/x86/ipred_avx2.asm
new file mode 100644
index 0000000000..58e40935ac
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred_avx2.asm
@@ -0,0 +1,5393 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+%macro SMOOTH_WEIGHT_TABLE 1-*
+ %rep %0
+ db %1-128, 127-%1
+ %rotate 1
+ %endrep
+%endmacro
+
+; sm_weights[], but modified to precalculate x and 256-x with offsets to
+; enable efficient use of pmaddubsw (which requires signed values)
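+; (e.g. x = 197 is stored as the signed byte pair (69, -70), i.e. (197-128, 127-197);
+; see the SMOOTH macro below for how x and 256-x are recovered from this encoding)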
+smooth_weights: SMOOTH_WEIGHT_TABLE \
+ 0, 0, 255, 128, 255, 149, 85, 64, \
+ 255, 197, 146, 105, 73, 50, 37, 32, \
+ 255, 225, 196, 170, 145, 123, 102, 84, \
+ 68, 54, 43, 33, 26, 20, 17, 16, \
+ 255, 240, 225, 210, 196, 182, 169, 157, \
+ 145, 133, 122, 111, 101, 92, 83, 74, \
+ 66, 59, 52, 45, 39, 34, 29, 25, \
+ 21, 17, 14, 12, 10, 9, 8, 8, \
+ 255, 248, 240, 233, 225, 218, 210, 203, \
+ 196, 189, 182, 176, 169, 163, 156, 150, \
+ 144, 138, 133, 127, 121, 116, 111, 106, \
+ 101, 96, 91, 86, 82, 77, 73, 69, \
+ 65, 61, 57, 54, 50, 47, 44, 41, \
+ 38, 35, 32, 29, 27, 25, 22, 20, \
+ 18, 16, 15, 13, 12, 10, 9, 8, \
+ 7, 6, 6, 5, 5, 4, 4, 4
+
+pb_1to32: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+pb_32to1: db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17
+pb_16to1: db 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
+ db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
+z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16
+ db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16
+ db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0
+z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
+ db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
+ db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line
+pb_128: times 4 db 128 ; those are just placed here for alignment.
+pb_36_m4: times 2 db 36, -4
+z3_shuf: db 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0
+z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
+z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
+z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8
+z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8
+z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+z2_shuf_h2: db 3, 2, 7, 6, 11, 10, 15, 14, 2, 1, 6, 5, 10, 9, 14, 13
+z2_shuf_h4: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11
+z3_shuf_w4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8
+z_transpose4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
+ dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64
+z2_base_inc: dw 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64, 8*64
+ dw 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64, 16*64
+z2_ymul: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7
+ db 32, 32, 32, 32, 12, 12, 12, 12, 1, 0, 1, 0, 5, -1, -1, -1 ; 0, 4, 1, 5
+; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5
+filter_shuf1: db 10, 4, 10, 4, 37, 6, 5, 6,103, 9, 7, 9, 72, -1, 8, -1
+ db 16, 4, 0, 4, 53, 6, 5, 6,119, 11, 7, 11, 95, -1, 15, -1
+filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1
+filter_shuf3: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11; 15, -1, 15, -1
+pb_127_m127: times 2 db 127, -127
+ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
+ db 2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15
+ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1
+ db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4; 0, 0, 0, 0
+pw_64: times 2 dw 64
+
+cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1
+ times 9 db 7, -1
+cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ ; w=8, w_pad=1 as well as second half of previous one
+cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5
+ times 5 db 6, 7
+ ; w=16,w_pad=2
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ times 8 db 14, 15
+ ; w=16,w_pad=3
+ db 0, 1, 2, 3, 4, 5
+ times 13 db 6, 7
+pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+%define pb_0to15 cfl_ac_w16_pad_shuffle
+%define pb_1 (ipred_h_shuf+12)
+%define pb_2 (ipred_h_shuf+20)
+%define pb_3 (ipred_h_shuf+ 4)
+%define pb_4 (ipred_h_shuf+24)
+%define pb_5 (ipred_h_shuf+ 8)
+%define pb_7 (ipred_h_shuf+ 0)
+%define pb_8 (z_upsample2 +12)
+%define pb_12 (z2_y_shuf_h4+20)
+%define pb_14 (z2_y_shuf_h4+ 4)
+%define pb_15 (z_filter_s +32)
+%define pb_27 (z2_y_shuf_h4+ 8)
+%define pb_31 (z2_y_shuf_h4+12)
+%define pb_32 (z2_y_shuf_h4+16)
+%define pb_90 (z2_y_shuf_h4+ 0)
+%define pw_1 (z2_y_shuf_h4+24)
+%define pw_8 (z_filter_k +32)
+
+pw_62: times 2 dw 62
+pw_128: times 2 dw 128
+pw_255: times 2 dw 255
+pw_512: times 2 dw 512
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)
+%define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4)
+
+JMP_TABLE ipred_smooth, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_filter, avx2, w4, w8, w16, w32
+JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z2, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
+ s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32
+JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3
+JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3
+JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32
+JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64
+
+cextern dr_intra_derivative
+cextern filter_intra_taps
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
+ lea r5, [ipred_dc_left_avx2_table]
+ tzcnt wd, wm
+ inc tlq
+ movu m0, [tlq]
+ movifnidn hd, hm
+ mov r6d, 0x8000
+ shrx r6d, r6d, wd
+ movd xm3, r6d
+ movsxd r6, [r5+wq*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, r5
+ add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ mov hd, hm ; zero upper half
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movu m0, [tlq]
+ mov r5d, 0x8000
+ shrx r5d, r5d, r6d
+ movd xm3, r5d
+ lea r5, [ipred_dc_left_avx2_table]
+ movsxd r6, [r5+r6*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, r5
+ add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ movu m1, [tlq+32] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+.h32:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+.h16:
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+.h8:
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+.h4:
+ pmaddwd xm0, xm2
+ pmulhrsw xm0, xm3
+ lea stride3q, [strideq*3]
+ vpbroadcastb m0, xm0
+ mova m1, m0
+ jmp wq
+
+cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd xm4, r5d
+ tzcnt r5d, r5d
+ movd xm5, r5d
+ lea r5, [ipred_dc_avx2_table]
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+5*4]
+ pcmpeqd m3, m3
+ psrlw xm4, 1
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movd xm0, [tlq-4]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w4:
+ movd xm1, [tlq+1]
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xm0, 3
+ jmp .w4_end
+.w4_mul:
+ punpckhqdq xm1, xm0, xm0
+ lea r2d, [hq*2]
+ mov r6d, 0x55563334
+ paddw xm0, xm1
+ shrx r6d, r6d, r2d
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ movd xm1, r6d
+ psrlw xm0, 2
+ pmulhuw xm0, xm1
+.w4_end:
+ vpbroadcastb xm0, xm0
+.s4:
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm0
+ movd [dstq+strideq*2], xm0
+ movd [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+ALIGN function_align
+.h8:
+ movq xm0, [tlq-8]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w8:
+ movq xm1, [tlq+1]
+ vextracti128 xm2, m0, 1
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm2
+ punpckhqdq xm2, xm0, xm0
+ paddw xm0, xm2
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmove r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w8_end:
+ vpbroadcastb xm0, xm0
+.s8:
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm0
+ movq [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+ALIGN function_align
+.h16:
+ mova xm0, [tlq-16]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w16:
+ movu xm1, [tlq+1]
+ vextracti128 xm2, m0, 1
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm2
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hb, 8|32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w16_end:
+ vpbroadcastb xm0, xm0
+.s16:
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm0
+ mova [dstq+strideq*2], xm0
+ mova [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-32]
+ pmaddubsw m0, m3
+ jmp wq
+.w32:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ psubw xm0, xm4
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x33345556
+ shrx r6d, r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w32_end:
+ vpbroadcastb m0, xm0
+.s32:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+ALIGN function_align
+.h64:
+ mova m0, [tlq-64]
+ mova m1, [tlq-32]
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ paddw m0, m1
+ jmp wq
+.w64:
+ movu m1, [tlq+ 1]
+ movu m2, [tlq+33]
+ pmaddubsw m1, m3
+ pmaddubsw m2, m3
+ paddw m0, m1
+ paddw m0, m2
+ vextracti128 xm1, m0, 1
+ psubw xm0, xm4
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 64
+ je .w64_end
+ mov r6d, 0x33345556
+ shrx r6d, r6d, hd
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w64_end:
+ vpbroadcastb m0, xm0
+ mova m1, m0
+.s64:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m0
+ mova [dstq+strideq*2+32*1], m1
+ mova [dstq+stride3q +32*0], m0
+ mova [dstq+stride3q +32*1], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s64
+ RET
+
+cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_splat_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m0, [r5-ipred_dc_splat_avx2_table+pb_128]
+ mova m1, m0
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_splat_avx2_table]
+ tzcnt wd, wm
+ movu m0, [tlq+ 1]
+ movu m1, [tlq+33]
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+%macro IPRED_H 2 ; w, store_type
+ vpbroadcastb m0, [tlq-1]
+ vpbroadcastb m1, [tlq-2]
+ vpbroadcastb m2, [tlq-3]
+ sub tlq, 4
+ vpbroadcastb m3, [tlq+0]
+ mov%2 [dstq+strideq*0], m0
+ mov%2 [dstq+strideq*1], m1
+ mov%2 [dstq+strideq*2], m2
+ mov%2 [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w%1
+ RET
+ALIGN function_align
+%endmacro
+
+INIT_XMM avx2
+cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_h_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ IPRED_H 4, d
+.w8:
+ IPRED_H 8, q
+.w16:
+ IPRED_H 16, a
+INIT_YMM avx2
+.w32:
+ IPRED_H 32, a
+.w64:
+ vpbroadcastb m0, [tlq-1]
+ vpbroadcastb m1, [tlq-2]
+ vpbroadcastb m2, [tlq-3]
+ sub tlq, 4
+ vpbroadcastb m3, [tlq+0]
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m0
+ mova [dstq+strideq*1+32*0], m1
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m2
+ mova [dstq+strideq*2+32*1], m2
+ mova [dstq+stride3q +32*0], m3
+ mova [dstq+stride3q +32*1], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w64
+ RET
+
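+; Scalar form of the selection performed below (cf. the ldiff/tdiff/tldiff
+; comments in the macro): with base = top + left - topleft,
+;   ldiff  = |top - topleft|          = |base - left|
+;   tdiff  = |left - topleft|         = |base - top|
+;   tldiff = |top + left - 2*topleft| = |base - topleft|
+; and the result is left if ldiff <= tdiff && ldiff <= tldiff,
+; otherwise top if tdiff <= tldiff, otherwise topleft.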
+%macro PAETH 2 ; top, ldiff
+ pavgb m1, m%1, m3 ; Calculating tldiff normally requires
+ pxor m0, m%1, m3 ; 10-bit intermediates, but we can do it
+ pand m0, m4 ; in 8-bit with some tricks which avoid
+ psubusb m2, m5, m1 ; having to unpack everything to 16-bit.
+ psubb m1, m0
+ psubusb m1, m5
+ por m1, m2
+ paddusb m1, m1
+ por m1, m0 ; min(tldiff, 255)
+ psubusb m2, m5, m3
+ psubusb m0, m3, m5
+ por m2, m0 ; tdiff
+ pminub m2, m%2
+ pcmpeqb m0, m%2, m2 ; ldiff <= tdiff
+ vpblendvb m0, m%1, m3, m0
+ pminub m1, m2
+ pcmpeqb m1, m2 ; ldiff <= tldiff || tdiff <= tldiff
+ vpblendvb m0, m5, m0, m1
+%endmacro
+
+cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h
+%define base r5-ipred_paeth_avx2_table
+ lea r5, [ipred_paeth_avx2_table]
+ tzcnt wd, wm
+ vpbroadcastb m5, [tlq] ; topleft
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m4, [base+pb_1]
+ add wq, r5
+ jmp wq
+.w4:
+ vpbroadcastd m6, [tlq+1] ; top
+ mova m8, [base+ipred_h_shuf]
+ lea r3, [strideq*3]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0 ; ldiff
+.w4_loop:
+ sub tlq, 8
+ vpbroadcastq m3, [tlq]
+ pshufb m3, m8 ; left
+ PAETH 6, 7
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r3 ], xm1, 2
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+r3 ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 8
+ jg .w4_loop
+.ret:
+ RET
+ALIGN function_align
+.w8:
+ vpbroadcastq m6, [tlq+1]
+ mova m8, [base+ipred_h_shuf]
+ lea r3, [strideq*3]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w8_loop:
+ sub tlq, 4
+ vpbroadcastd m3, [tlq]
+ pshufb m3, m8
+ PAETH 6, 7
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ vbroadcasti128 m6, [tlq+1]
+ mova xm8, xm4 ; lower half = 1, upper half = 0
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w16_loop:
+ sub tlq, 2
+ vpbroadcastd m3, [tlq]
+ pshufb m3, m8
+ PAETH 6, 7
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ movu m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w32_loop:
+ dec tlq
+ vpbroadcastb m3, [tlq]
+ PAETH 6, 7
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ movu m6, [tlq+ 1]
+ movu m7, [tlq+33]
+%if WIN64
+ movaps r4m, xmm9
+%endif
+ psubusb m8, m5, m6
+ psubusb m0, m6, m5
+ psubusb m9, m5, m7
+ psubusb m1, m7, m5
+ por m8, m0
+ por m9, m1
+.w64_loop:
+ dec tlq
+ vpbroadcastb m3, [tlq]
+ PAETH 6, 8
+ mova [dstq+32*0], m0
+ PAETH 7, 9
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+%if WIN64
+ movaps xmm9, r4m
+%endif
+ RET
+
+%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
+ ; w * a = (w - 128) * a + 128 * a
+ ; (256 - w) * b = (127 - w) * b + 129 * b
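+ ; i.e. pmaddubsw of the unsigned pixel pair (a, b) with the signed
+ ; smooth_weights pair (w-128, 127-w) yields (w-128)*a + (127-w)*b, and the
+ ; add terms (set up by the callers as 128*a + 129*b plus rounding)
+ ; restore w*a + (256-w)*b ahead of the >> 8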
+ pmaddubsw m0, m%3, m%1
+ pmaddubsw m1, m%4, m%2
+ paddw m0, m%5
+ paddw m1, m%6
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+%endmacro
+
+cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights
+%define base r6-ipred_smooth_v_avx2_table
+ lea r6, [ipred_smooth_v_avx2_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m0, [base+pb_127_m127]
+ vpbroadcastd m1, [base+pw_128]
+ lea weightsq, [base+smooth_weights+hq*4]
+ neg hq
+ vpbroadcastb m5, [tlq+hq] ; bottom
+ add wq, r6
+ jmp wq
+.w4:
+ vpbroadcastd m2, [tlq+1]
+ punpcklbw m2, m5 ; top, bottom
+ mova m5, [base+ipred_v_shuf]
+ lea r3, [strideq*3]
+ punpckldq m4, m5, m5
+ punpckhdq m5, m5
+ pmaddubsw m3, m2, m0
+ paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok
+ paddw m3, m1 ; 128 * top + 129 * bottom + 128
+.w4_loop:
+ vbroadcasti128 m1, [weightsq+hq*2]
+ pshufb m0, m1, m4
+ pshufb m1, m5
+ SMOOTH 0, 1, 2, 2, 3, 3
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 1
+ pextrd [dstq+r3 ], xm1, 1
+ cmp hd, -4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm1, 2
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+r3 ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ add hq, 8
+ jl .w4_loop
+.ret:
+ RET
+ALIGN function_align
+.w8:
+ vpbroadcastq m2, [tlq+1]
+ punpcklbw m2, m5
+ mova m5, [base+ipred_v_shuf]
+ lea r3, [strideq*3]
+ pshufd m4, m5, q0000
+ pshufd m5, m5, q1111
+ pmaddubsw m3, m2, m0
+ paddw m1, m2
+ paddw m3, m1
+.w8_loop:
+ vpbroadcastq m1, [weightsq+hq*2]
+ pshufb m0, m1, m4
+ pshufb m1, m5
+ SMOOTH 0, 1, 2, 2, 3, 3
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ WIN64_SPILL_XMM 7
+ vbroadcasti128 m3, [tlq+1]
+ mova m6, [base+ipred_v_shuf]
+ punpcklbw m2, m3, m5
+ punpckhbw m3, m5
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+.w16_loop:
+ vpbroadcastd m1, [weightsq+hq*2]
+ pshufb m1, m6
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 6
+ movu m3, [tlq+1]
+ punpcklbw m2, m3, m5
+ punpckhbw m3, m5
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+.w32_loop:
+ vpbroadcastw m1, [weightsq+hq*2]
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq], m0
+ add dstq, strideq
+ inc hq
+ jl .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ WIN64_SPILL_XMM 11
+ movu m4, [tlq+ 1]
+ movu m8, [tlq+33]
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+ punpcklbw m7, m8, m5
+ punpckhbw m8, m5
+ pmaddubsw m5, m3, m0
+ pmaddubsw m6, m4, m0
+ pmaddubsw m9, m7, m0
+ pmaddubsw m10, m8, m0
+ paddw m2, m1, m3
+ paddw m5, m2
+ paddw m2, m1, m4
+ paddw m6, m2
+ paddw m0, m1, m7
+ paddw m9, m0
+ paddw m1, m8
+ paddw m10, m1
+.w64_loop:
+ vpbroadcastw m2, [weightsq+hq*2]
+ SMOOTH 2, 2, 3, 4, 5, 6
+ mova [dstq+32*0], m0
+ SMOOTH 2, 2, 7, 8, 9, 10
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ inc hq
+ jl .w64_loop
+ RET
+
+%macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used
+ %assign stack_offset 0
+ %assign stack_size_padded 0
+ %assign regs_used %2
+ %xdefine rstk rsp
+ SETUP_STACK_POINTER %1
+ %if regs_used != %2 && WIN64
+ PUSH r%2
+ %endif
+ ALLOC_STACK %1, %3
+%endmacro
+
+cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h
+%define base r6-ipred_smooth_h_avx2_table
+ lea r6, [ipred_smooth_h_avx2_table]
+ mov wd, wm
+ vpbroadcastb m3, [tlq+wq] ; right
+ tzcnt wd, wd
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m4, [base+pb_127_m127]
+ vpbroadcastd m5, [base+pw_128]
+ add wq, r6
+ jmp wq
+.w4:
+ WIN64_SPILL_XMM 8
+ vpbroadcastq m6, [base+smooth_weights+4*2]
+ mova m7, [base+ipred_h_shuf]
+ sub tlq, 8
+ sub tlq, hq
+ lea r3, [strideq*3]
+.w4_loop:
+ vpbroadcastq m2, [tlq+hq]
+ pshufb m2, m7
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m6
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r3 ], xm1, 2
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+r3 ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 8
+ jg .w4_loop
+.ret:
+ RET
+ALIGN function_align
+.w8:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 8
+ vbroadcasti128 m6, [base+smooth_weights+8*2]
+ mova m7, [base+ipred_h_shuf]
+ sub tlq, 4
+ lea r3, [strideq*3]
+ sub tlq, hq
+.w8_loop:
+ vpbroadcastd m2, [tlq+hq]
+ pshufb m2, m7
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4
+ paddw m0, m1
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m6
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ SETUP_STACK_FRAME 32*4, 7, 8
+ lea r3, [rsp+64*2-4]
+ call .prep ; only worthwhile for w16 and above
+ sub tlq, 2
+ vpbroadcastd xm6, [base+pb_1]
+ mova xm7, [base+ipred_v_shuf+16]
+ vinserti128 m7, [base+ipred_v_shuf+ 0], 1
+ vbroadcasti128 m4, [base+smooth_weights+16*2]
+ vbroadcasti128 m5, [base+smooth_weights+16*3]
+.w16_loop:
+ vpbroadcastd m1, [tlq+hq]
+ vpbroadcastd m2, [r3+hq*2]
+ pshufb m1, m6
+ punpcklbw m1, m3
+ pshufb m2, m7
+ SMOOTH 4, 5, 1, 1, 2, 2
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ SETUP_STACK_FRAME 32*4, 7, 6
+ lea r3, [rsp+64*2-2]
+ call .prep
+ dec tlq
+ mova xm4, [base+smooth_weights+16*4]
+ vinserti128 m4, [base+smooth_weights+16*6], 1
+ mova xm5, [base+smooth_weights+16*5]
+ vinserti128 m5, [base+smooth_weights+16*7], 1
+.w32_loop:
+ vpbroadcastb m1, [tlq+hq]
+ punpcklbw m1, m3
+ vpbroadcastw m2, [r3+hq*2]
+ SMOOTH 4, 5, 1, 1, 2, 2
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ SETUP_STACK_FRAME 32*4, 7, 9
+ lea r3, [rsp+64*2-2]
+ call .prep
+ add r6, smooth_weights+16*15-ipred_smooth_h_avx2_table
+ dec tlq
+ mova xm5, [r6-16*7]
+ vinserti128 m5, [r6-16*5], 1
+ mova xm6, [r6-16*6]
+ vinserti128 m6, [r6-16*4], 1
+ mova xm7, [r6-16*3]
+ vinserti128 m7, [r6-16*1], 1
+ mova xm8, [r6-16*2]
+ vinserti128 m8, [r6-16*0], 1
+.w64_loop:
+ vpbroadcastb m2, [tlq+hq]
+ punpcklbw m2, m3
+ vpbroadcastw m4, [r3+hq*2]
+ SMOOTH 5, 6, 2, 2, 4, 4
+ mova [dstq+32*0], m0
+ SMOOTH 7, 8, 2, 2, 4, 4
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+ALIGN function_align
+.prep:
+ vpermq m2, [tlq-32*1], q3120
+ punpckhbw m1, m2, m3
+ punpcklbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m1, m5 ; 1 * left + 256 * right + 128
+ paddw m0, m1 ; 128 * left + 129 * right + 128
+ pmaddubsw m1, m2, m4
+ paddw m2, m5
+ paddw m1, m2
+ vpermq m2, [tlq-32*2], q3120
+ mova [rsp+gprsize+32*3], m0
+ mova [rsp+gprsize+32*2], m1
+ punpckhbw m1, m2, m3
+ punpcklbw m2, m3
+ pmaddubsw m0, m1, m4
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m2, m5
+ paddw m1, m2
+ mova [rsp+gprsize+32*1], m0
+ mova [rsp+gprsize+32*0], m1
+ sub r3, hq
+ sub tlq, hq
+ sub r3, hq
+ ret
+
+%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
+ pmaddubsw m0, m%3, m%1
+ pmaddubsw m1, m%4, m%2
+%ifnum %5
+ paddw m0, m%5
+%else
+ paddw m0, %5
+%endif
+%ifnum %6
+ paddw m1, m%6
+%else
+ paddw m1, %6
+%endif
+ pavgw m0, m2
+ pavgw m1, m3
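+ ; the callers leave the horizontal smooth sums in m2/m3 and fold an extra
+ ; +255 into the vertical add terms, so this pavgw gives (v + h + 256) >> 1
+ ; and the >> 8 below completes the (v + h + 256) >> 9 of the 2-D blend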
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+%endmacro
+
+cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights
+%define base r6-ipred_smooth_avx2_table
+ lea r6, [ipred_smooth_avx2_table]
+ mov wd, wm
+ vpbroadcastb m4, [tlq+wq] ; right
+ tzcnt wd, wd
+ mov hd, hm
+ mov r5, tlq
+ sub r5, hq
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m5, [base+pb_127_m127]
+ vpbroadcastb m0, [r5] ; bottom
+ vpbroadcastd m3, [base+pw_255]
+ add wq, r6
+ lea v_weightsq, [base+smooth_weights+hq*2]
+ jmp wq
+.w4:
+ WIN64_SPILL_XMM 12
+ mova m10, [base+ipred_h_shuf]
+ vpbroadcastq m11, [base+smooth_weights+4*2]
+ mova m7, [base+ipred_v_shuf]
+ vpbroadcastd m8, [tlq+1]
+ sub tlq, 8
+ lea r3, [strideq*3]
+ sub tlq, hq
+ punpcklbw m8, m0 ; top, bottom
+ pshufd m6, m7, q2200
+ pshufd m7, m7, q3311
+ pmaddubsw m9, m8, m5
+ paddw m3, m8 ; 1 * top + 256 * bottom + 255
+ paddw m9, m3 ; 128 * top + 129 * bottom + 255
+.w4_loop:
+ vpbroadcastq m1, [tlq+hq]
+ pshufb m1, m10
+ punpcklbw m0, m1, m4 ; left, right
+ punpckhbw m1, m4
+ pmaddubsw m2, m0, m5 ; 127 * left - 127 * right
+ pmaddubsw m3, m1, m5
+ paddw m2, m0 ; 128 * left + 129 * right
+ paddw m3, m1
+ pmaddubsw m0, m11
+ pmaddubsw m1, m11
+ paddw m2, m0
+ paddw m3, m1
+ vbroadcasti128 m1, [v_weightsq]
+ add v_weightsq, 16
+ pshufb m0, m1, m6
+ pshufb m1, m7
+ SMOOTH_2D_END 0, 1, 8, 8, 9, 9
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r3 ], xm1, 2
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+r3 ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 8
+ jg .w4_loop
+.ret:
+ RET
+ALIGN function_align
+.w8:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 12
+ mova m10, [base+ipred_h_shuf]
+ vbroadcasti128 m11, [base+smooth_weights+8*2]
+ mova m7, [base+ipred_v_shuf]
+ vpbroadcastq m8, [tlq+1]
+ sub tlq, 4
+ lea r3, [strideq*3]
+ sub tlq, hq
+ punpcklbw m8, m0
+ pshufd m6, m7, q0000
+ pshufd m7, m7, q1111
+ pmaddubsw m9, m8, m5
+ paddw m3, m8
+ paddw m9, m3
+.w8_loop:
+ vpbroadcastd m1, [tlq+hq]
+ pshufb m1, m10
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ pmaddubsw m2, m0, m5
+ pmaddubsw m3, m1, m5
+ paddw m2, m0
+ paddw m3, m1
+ pmaddubsw m0, m11
+ pmaddubsw m1, m11
+ paddw m2, m0
+ paddw m3, m1
+ vpbroadcastq m1, [v_weightsq]
+ add v_weightsq, 8
+ pshufb m0, m1, m6
+ pshufb m1, m7
+ SMOOTH_2D_END 0, 1, 8, 8, 9, 9
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ SETUP_STACK_FRAME 32*4, 7, 14
+ vbroadcasti128 m11, [tlq+1]
+ lea r3, [rsp+64*2-4]
+ punpcklbw m10, m11, m0 ; top, bottom
+ punpckhbw m11, m0
+ call .prep_v
+ sub tlq, 2
+ pmaddubsw m12, m10, m5
+ pmaddubsw m13, m11, m5
+ vpbroadcastd xm5, [base+pb_1]
+ mova m9, [base+ipred_v_shuf]
+ vbroadcasti128 m6, [base+smooth_weights+16*2]
+ vbroadcasti128 m7, [base+smooth_weights+16*3]
+ vperm2i128 m8, m9, m9, 0x01
+ paddw m0, m10, m3
+ paddw m3, m11
+ paddw m12, m0
+ paddw m13, m3
+.w16_loop:
+ vpbroadcastd m3, [tlq+hq]
+ vpbroadcastd m0, [r3+hq*2]
+ vpbroadcastd m1, [v_weightsq]
+ add v_weightsq, 4
+ pshufb m3, m5
+ punpcklbw m3, m4 ; left, right
+ pmaddubsw m2, m3, m6
+ pmaddubsw m3, m7
+ pshufb m0, m8
+ pshufb m1, m9
+ paddw m2, m0
+ paddw m3, m0
+ SMOOTH_2D_END 1, 1, 10, 11, 12, 13
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ SETUP_STACK_FRAME 32*4, 7, 11
+ movu m8, [tlq+1]
+ lea r3, [rsp+64*2-2]
+ punpcklbw m7, m8, m0
+ punpckhbw m8, m0
+ call .prep_v
+ dec tlq
+ pmaddubsw m9, m7, m5
+ pmaddubsw m10, m8, m5
+ mova xm5, [base+smooth_weights+16*4]
+ vinserti128 m5, [base+smooth_weights+16*6], 1
+ mova xm6, [base+smooth_weights+16*5]
+ vinserti128 m6, [base+smooth_weights+16*7], 1
+ paddw m0, m7, m3
+ paddw m3, m8
+ paddw m9, m0
+ paddw m10, m3
+.w32_loop:
+ vpbroadcastb m3, [tlq+hq]
+ punpcklbw m3, m4
+ vpbroadcastw m0, [r3+hq*2]
+ vpbroadcastw m1, [v_weightsq]
+ add v_weightsq, 2
+ pmaddubsw m2, m3, m5
+ pmaddubsw m3, m6
+ paddw m2, m0
+ paddw m3, m0
+ SMOOTH_2D_END 1, 1, 7, 8, 9, 10
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ SETUP_STACK_FRAME 32*8, 7, 16
+ movu m13, [tlq+1 ]
+ movu m15, [tlq+33]
+ add r6, smooth_weights+16*15-ipred_smooth_avx2_table
+ lea r3, [rsp+64*2-2]
+ punpcklbw m12, m13, m0
+ punpckhbw m13, m0
+ punpcklbw m14, m15, m0
+ punpckhbw m15, m0
+ call .prep_v
+ dec tlq
+ pmaddubsw m0, m12, m5
+ pmaddubsw m1, m13, m5
+ pmaddubsw m2, m14, m5
+ pmaddubsw m5, m15, m5
+ mova xm8, [r6-16*7]
+ vinserti128 m8, [r6-16*5], 1
+ mova xm9, [r6-16*6]
+ vinserti128 m9, [r6-16*4], 1
+ mova xm10, [r6-16*3]
+ vinserti128 m10, [r6-16*1], 1
+ mova xm11, [r6-16*2]
+ vinserti128 m11, [r6-16*0], 1
+ lea r6, [rsp+32*4]
+ paddw m0, m3
+ paddw m1, m3
+ paddw m2, m3
+ paddw m3, m5
+ paddw m0, m12
+ paddw m1, m13
+ paddw m2, m14
+ paddw m3, m15
+ mova [r6+32*0], m0
+ mova [r6+32*1], m1
+ mova [r6+32*2], m2
+ mova [r6+32*3], m3
+.w64_loop:
+ vpbroadcastb m5, [tlq+hq]
+ punpcklbw m5, m4
+ vpbroadcastw m6, [r3+hq*2]
+ vpbroadcastw m7, [v_weightsq]
+ add v_weightsq, 2
+ pmaddubsw m2, m5, m8
+ pmaddubsw m3, m5, m9
+ paddw m2, m6
+ paddw m3, m6
+ SMOOTH_2D_END 7, 7, 12, 13, [r6+32*0], [r6+32*1]
+ mova [dstq+32*0], m0
+ pmaddubsw m2, m5, m10
+ pmaddubsw m3, m5, m11
+ paddw m2, m6
+ paddw m3, m6
+ SMOOTH_2D_END 7, 7, 14, 15, [r6+32*2], [r6+32*3]
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+ALIGN function_align
+.prep_v:
+ vpermq m2, [tlq-32*1], q3120
+ punpckhbw m1, m2, m4
+ punpcklbw m2, m4
+ pmaddubsw m0, m1, m5 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m2, m5
+ paddw m1, m2
+ vpermq m2, [tlq-32*2], q3120
+ mova [rsp+gprsize+32*3], m0
+ mova [rsp+gprsize+32*2], m1
+ punpckhbw m1, m2, m4
+ punpcklbw m2, m4
+ pmaddubsw m0, m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m5
+ paddw m1, m2
+ mova [rsp+gprsize+32*1], m0
+ mova [rsp+gprsize+32*0], m1
+ sub r3, hq
+ sub tlq, hq
+ sub r3, hq
+ ret
+
+cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
+ %assign org_stack_offset stack_offset
+ lea r6, [ipred_z1_avx2_table]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ lea r7, [dr_intra_derivative]
+ inc tlq
+ movsxd wq, [r6+wq*4]
+ add wq, r6
+ mov dxd, angled
+ and dxd, 0x7e
+ add angled, 165 ; ~90
+ movzx dxd, word [r7+dxq]
+ xor angled, 0x4ff ; d = 90 - angle
+ vpbroadcastd m3, [pw_512]
+ vpbroadcastd m4, [pw_62]
+ vpbroadcastd m5, [pw_64]
+ jmp wq
+.w4:
+ cmp angleb, 40
+ jae .w4_no_upsample
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+ ALLOC_STACK -32, 8
+ mova xm1, [tlq-1]
+ pshufb xm0, xm1, [z_upsample1]
+ pshufb xm1, [z_upsample2]
+ vpbroadcastd xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
+ add dxd, dxd ; pw_512 (which is already in m3)
+ pmaddubsw xm0, xm2 ; for rounding instead of pw_2048
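+ ; (the 4-tap (-1, 9, 9, -1) upsample kernel thus becomes (-4, 36, 36, -4),
+ ; and pmulhrsw with 512 computes the equivalent (4*x + 32) >> 6 in place
+ ; of (x + 8) >> 4)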
+ pextrd [rsp+16], xm1, 3 ; top[max_base_x]
+ pmaddubsw xm1, xm2
+ movd xm7, dxd
+ mov r3d, dxd ; xpos
+ vpbroadcastw m7, xm7
+ paddw xm1, xm0
+ movq xm0, [tlq]
+ pmulhrsw xm1, xm3
+ pslldq m6, m7, 8
+ paddw xm2, xm7, xm7
+ lea r2, [strideq*3]
+ paddw m6, m7
+ packuswb xm1, xm1
+ paddw m6, m2 ; xpos2 xpos3 xpos0 xpos1
+ punpcklbw xm0, xm1
+ psllw m7, 2
+ mova [rsp], xm0
+.w4_upsample_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ vpbroadcastq m1, [rsp+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vpbroadcastq m2, [rsp+r5]
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base2
+ movq xm0, [rsp+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base3
+ movhps xm0, [rsp+r5]
+ vpblendd m1, m2, 0xc0
+ pand m2, m4, m6 ; frac
+ vpblendd m0, m1, 0xf0
+ psubw m1, m5, m2 ; 64-frac
+ psllw m2, 8
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ paddw m6, m7 ; xpos += dx
+ pmulhrsw m0, m3
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*2], xm0
+ pextrd [dstq+r2 ], xm0, 1
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_upsample_loop
+ RET
+ALIGN function_align
+.filter_strength: ; w4/w8/w16
+ ; The C version uses a lot of branches, but we can do all the comparisons
+ ; in parallel and use popcnt to get the final filter strength value.
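+ ; (the callers treat an all-zero mask as strength 0, and otherwise use
+ ; popcnt of the pmovmskb result, scaled by 4, to pick the z_filter_k
+ ; coefficients)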
+%define base r3-z_filter_t0
+ lea r3, [z_filter_t0]
+ movd xm0, maxbased
+ movd xm2, angled
+ shr angled, 8 ; is_sm << 1
+ vpbroadcastb m0, xm0
+ vpbroadcastb m2, xm2
+ pcmpeqb m1, m0, [base+z_filter_wh]
+ pand m1, m2
+ mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases
+ pcmpgtb m1, m2
+ pmovmskb r5d, m1
+ ret
+.w4_no_upsample:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -16, 11
+ mov maxbased, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ lea maxbased, [hq+3]
+ call .filter_strength
+ mov maxbased, 7
+ test r5d, r5d
+ jz .w4_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastd m7, [base+pb_8]
+ vbroadcasti128 m2, [tlq-1]
+ pminub m1, m7, [base+z_filter_s]
+ vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0]
+ pminub m7, [base+z_filter_s+8]
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2]
+ pshufb m0, m2, m1
+ shufps m1, m7, q2121
+ pmaddubsw m0, m8
+ pshufb m1, m2, m1
+ pmaddubsw m1, m9
+ pshufb m2, m7
+ pmaddubsw m2, m10
+ paddw m0, m1
+ paddw m0, m2
+ pmulhrsw m0, m3
+ mov r3d, 9
+ mov tlq, rsp
+ cmp hd, 4
+ cmovne maxbased, r3d
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ mova [tlq], xm0
+.w4_main:
+ movd xm6, dxd
+ vpbroadcastq m0, [z_base_inc] ; base_inc << 6
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6
+ vpbroadcastw m6, xm6
+ mov r3d, dxd ; xpos
+ movd xm9, maxbased
+ vpbroadcastw m9, xm9
+ vbroadcasti128 m8, [z1_shuf_w4]
+ psrlw m7, 8 ; top[max_base_x]
+ paddw m10, m6, m6
+ psubw m9, m0 ; max_base_x
+ vpblendd m6, m10, 0xcc
+ mova xm0, xm10
+ paddw m6, m0 ; xpos2 xpos3 xpos0 xpos1
+ paddw m10, m10
+.w4_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ vpbroadcastq m1, [tlq+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vpbroadcastq m2, [tlq+r5]
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base2
+ movq xm0, [tlq+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base3
+ movhps xm0, [tlq+r5]
+ vpblendd m1, m2, 0xc0
+ pand m2, m4, m6 ; frac
+ vpblendd m0, m1, 0xf0
+ psubw m1, m5, m2 ; 64-frac
+ psllw m2, 8
+ pshufb m0, m8
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ pcmpgtw m1, m9, m6 ; base < max_base_x
+ pmulhrsw m0, m3
+ paddw m6, m10 ; xpos += dx
+ lea r5, [dstq+strideq*2]
+ vpblendvb m0, m7, m0, m1
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movd [r5 +strideq*0], xm0
+ pextrd [r5 +strideq*1], xm0, 1
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ sub hd, 4
+ jz .w4_end
+ lea dstq, [dstq+strideq*4]
+ cmp r3d, maxbased
+ jb .w4_loop
+ packuswb xm7, xm7
+ lea r6, [strideq*3]
+.w4_end_loop:
+ movd [dstq+strideq*0], xm7
+ movd [dstq+strideq*1], xm7
+ movd [dstq+strideq*2], xm7
+ movd [dstq+r6 ], xm7
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_end_loop
+.w4_end:
+ RET
+ALIGN function_align
+.w8:
+ lea r3d, [angleq+216]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -32, 8
+ movu xm2, [z_filter_s+6]
+ mova xm0, [tlq-1]
+ movd xm6, hd
+ vinserti128 m0, [tlq+7], 1
+ vpbroadcastb xm6, xm6
+ vbroadcasti128 m1, [z_upsample1]
+ pminub xm6, xm2
+ vpbroadcastd m7, [pb_36_m4]
+ vinserti128 m2, xm6, 1
+ add dxd, dxd
+ pshufb m1, m0, m1
+ pshufb m2, m0, m2
+ movd xm6, dxd
+ pmaddubsw m1, m7
+ pmaddubsw m2, m7
+ vpbroadcastw m6, xm6
+ mov r3d, dxd
+ psrldq m0, 1
+ lea r2, [strideq*3]
+ paddw m7, m6, m6
+ paddw m1, m2
+ vpblendd m6, m7, 0xf0
+ pmulhrsw m1, m3
+ pslldq m2, m7, 8
+ paddw m7, m7
+ paddw m6, m2
+ packuswb m1, m1
+ punpcklbw m0, m1
+ mova [rsp], m0
+.w8_upsample_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu xm0, [rsp+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vinserti128 m0, [rsp+r5], 1
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base2
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ punpcklqdq m1, m2, m2 ; frac0 frac1
+ pmaddubsw m0, m1
+ movu xm1, [rsp+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base3
+ vinserti128 m1, [rsp+r5], 1
+ punpckhqdq m2, m2 ; frac2 frac3
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ paddw m6, m7
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*2], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+r2 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_upsample_loop
+ RET
+.w8_no_intra_edge_filter:
+ and maxbased, 7
+ or maxbased, 8 ; imin(h+7, 15)
+ jmp .w8_main
+.w8_no_upsample:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -32, 10
+ lea maxbased, [hq+7]
+ test angled, 0x400
+ jnz .w8_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .w8_main ; filter_strength == 0
+ popcnt r5d, r5d
+ movu xm2, [tlq]
+ pminub xm1, xm0, [base+z_filter_s+14]
+ vinserti128 m2, [tlq-1], 1
+ vinserti128 m1, [base+z_filter_s+ 0], 1
+ vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0]
+ pminub xm0, [base+z_filter_s+22]
+ vinserti128 m0, [base+z_filter_s+ 8], 1
+ pshufb m6, m2, m1
+ pmaddubsw m6, m7
+ vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1]
+ movzx r3d, byte [tlq+15]
+ shufps m1, m0, q2121
+ pshufb m1, m2, m1
+ pmaddubsw m1, m7
+ paddw m1, m6
+ sub r5d, 3
+ jnz .w8_3tap
+ ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
+ ; which also results in an awkward edge case where out[w*2] is
+ ; slightly different from out[max_base_x] when h > w.
+ vpbroadcastd m7, [z_filter_k+4*8]
+ movzx r2d, byte [tlq+14]
+ pshufb m2, m0
+ pmaddubsw m2, m7
+ sub r2d, r3d
+ lea r2d, [r2+r3*8+4]
+ shr r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
+ mov [rsp+16], r2b
+ paddw m1, m2
+.w8_3tap:
+ pmulhrsw m1, m3
+ sar r5d, 1
+ mov tlq, rsp
+ add r5d, 17 ; w*2 + (filter_strength == 3)
+ cmp hd, 16
+ cmovns maxbased, r5d
+ mov [tlq+r5], r3b
+ vextracti128 xm0, m1, 1
+ packuswb xm0, xm1
+ mova [tlq], xm0
+.w8_main:
+ movd xm2, dxd
+ vbroadcasti128 m0, [z_base_inc]
+ vpbroadcastw m2, xm2
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6
+ movd xm9, maxbased
+ vbroadcasti128 m8, [z_filter_s+2]
+ vpbroadcastw m9, xm9
+ psrlw m7, 8
+ psubw m9, m0
+ mov r3d, dxd
+ paddw m6, m2, m2
+ vpblendd m2, m6, 0xf0
+.w8_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6
+ pand m0, m4, m2
+ psubw m1, m5, m0
+ psllw m0, 8
+ por m1, m0
+ movu xm0, [tlq+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vinserti128 m0, [tlq+r5], 1
+ pshufb m0, m8
+ pmaddubsw m0, m1
+ pcmpgtw m1, m9, m2
+ paddw m2, m6
+ pmulhrsw m0, m3
+ vpblendvb m0, m7, m0, m1
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ sub hd, 2
+ jz .w8_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w8_loop
+ packuswb xm7, xm7
+.w8_end_loop:
+ movq [dstq+strideq*0], xm7
+ movq [dstq+strideq*1], xm7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_end_loop
+.w8_end:
+ RET
+.w16_no_intra_edge_filter:
+ and maxbased, 15
+ or maxbased, 16 ; imin(h+15, 31)
+ jmp .w16_main
+ALIGN function_align
+.w16:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -64, 12
+ lea maxbased, [hq+15]
+ test angled, 0x400
+ jnz .w16_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .w16_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastd m1, [base+pb_12]
+ vbroadcasti128 m6, [base+z_filter_s+8]
+ vinserti128 m2, m6, [base+z_filter_s], 0
+ vinserti128 m6, [base+z_filter_s+16], 1
+ mova xm10, [tlq-1]
+ vinserti128 m10, [tlq+3], 1
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0]
+ vbroadcasti128 m7, [base+z_filter_s+14]
+ vinserti128 m8, m7, [base+z_filter_s+6], 0
+ vinserti128 m7, [base+z_filter_s+22], 1
+ psubw m0, m1
+ movu xm11, [tlq+12]
+ vinserti128 m11, [tlq+16], 1
+ pminub m8, m0
+ pminub m7, m0
+ pshufb m0, m10, m2
+ shufps m2, m6, q2121
+ pmaddubsw m0, m9
+ pshufb m1, m11, m8
+ shufps m8, m7, q2121
+ pmaddubsw m1, m9
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
+ movzx r3d, byte [tlq+31]
+ pshufb m2, m10, m2
+ pmaddubsw m2, m9
+ pshufb m8, m11, m8
+ pmaddubsw m8, m9
+ paddw m0, m2
+ paddw m1, m8
+ sub r5d, 3
+ jnz .w16_3tap
+ vpbroadcastd m9, [z_filter_k+4*8]
+ movzx r2d, byte [tlq+30]
+ pshufb m10, m6
+ pmaddubsw m10, m9
+ pshufb m11, m7
+ pmaddubsw m11, m9
+ sub r2d, r3d
+ lea r2d, [r2+r3*8+4]
+ shr r2d, 3
+ mov [rsp+32], r2b
+ paddw m0, m10
+ paddw m1, m11
+.w16_3tap:
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ sar r5d, 1
+ mov tlq, rsp
+ add r5d, 33
+ cmp hd, 32
+ cmovns maxbased, r5d
+ mov [tlq+r5], r3b
+ packuswb m0, m1
+ vpermq m0, m0, q3120
+ mova [tlq], m0
+.w16_main:
+ movd xm6, dxd
+ vbroadcasti128 m0, [z_base_inc]
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6
+ vpbroadcastw m6, xm6
+ movd xm9, maxbased
+ vbroadcasti128 m8, [z_filter_s+2]
+ vpbroadcastw m9, xm9
+ mov r3d, dxd
+ psubw m9, m0
+ paddw m11, m6, m6
+ psubw m10, m9, m3 ; 64*8
+ vpblendd m6, m11, 0xf0
+.w16_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ movu xm0, [tlq+r3+0]
+ movu xm1, [tlq+r3+8]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vinserti128 m0, [tlq+r5+0], 1
+ vinserti128 m1, [tlq+r5+8], 1
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ pcmpgtw m1, m9, m6
+ pcmpgtw m2, m10, m6
+ packsswb m1, m2
+ paddw m6, m11
+ vpblendvb m0, m7, m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w16_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w16_loop
+.w16_end_loop:
+ mova [dstq+strideq*0], xm7
+ mova [dstq+strideq*1], xm7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_end_loop
+.w16_end:
+ RET
+ALIGN function_align
+.w32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -96, 15
+ lea r3d, [hq+31]
+ mov maxbased, 63
+ cmp hd, 32
+ cmovs maxbased, r3d
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w32_main
+ vbroadcasti128 m0, [pb_0to15]
+ sub r3d, 29 ; h+2
+ movu xm13, [tlq+29] ; 32-39
+ movd xm1, r3d
+ movu xm14, [tlq+37] ; 40-47
+ sub r3d, 8 ; h-6
+ vinserti128 m14, [tlq+51], 1 ; 56-63
+ vpbroadcastb xm1, xm1
+ mova xm11, [tlq- 1] ; 0- 7
+ vinserti128 m11, [tlq+13], 1 ; 16-23
+ movd xm2, r3d
+ movu xm12, [tlq+ 5] ; 8-15
+ vinserti128 m12, [tlq+19], 1 ; 24-31
+ pminub xm1, xm0 ; clip 32x8
+ mova m7, [z_filter_s+0]
+ pshufb xm13, xm1
+ vpbroadcastd m1, [pb_12]
+ vpbroadcastb xm2, xm2
+ vinserti128 m13, [tlq+43], 1 ; 48-55
+ vinserti128 m8, m7, [z_filter_s+4], 1
+ vpblendd m2, m1, 0xf0
+ vinserti128 m7, [z_filter_s+12], 0
+ pminub m2, m0 ; clip 32x16 and 32x(32|64)
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ pshufb m14, m2
+ pshufb m0, m11, m8
+ shufps m8, m7, q1021
+ pmaddubsw m0, m9
+ pshufb m2, m12, m8
+ pmaddubsw m2, m9
+ pshufb m1, m13, m8
+ pmaddubsw m1, m9
+ pshufb m6, m14, m8
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1]
+ pshufb m10, m11, m8
+ shufps m8, m7, q2121
+ pmaddubsw m10, m9
+ paddw m0, m10
+ pshufb m10, m12, m8
+ pmaddubsw m10, m9
+ paddw m2, m10
+ pshufb m10, m13, m8
+ pmaddubsw m10, m9
+ paddw m1, m10
+ pshufb m10, m14, m8
+ pmaddubsw m10, m9
+ paddw m6, m10
+ vpbroadcastd m9, [z_filter_k+4*2+12*2]
+ pshufb m11, m8
+ pmaddubsw m11, m9
+ pshufb m12, m7
+ pmaddubsw m12, m9
+ movzx r3d, byte [tlq+63]
+ movzx r2d, byte [tlq+62]
+ paddw m0, m11
+ paddw m2, m12
+ pshufb m13, m7
+ pmaddubsw m13, m9
+ pshufb m14, m7
+ pmaddubsw m14, m9
+ paddw m1, m13
+ paddw m6, m14
+ sub r2d, r3d
+ lea r2d, [r2+r3*8+4] ; edge case for 32x64
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ shr r2d, 3
+ mov [rsp+64], r2b
+ mov tlq, rsp
+ mov [tlq+65], r3b
+ mov r3d, 65
+ cmp hd, 64
+ cmove maxbased, r3d
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq+ 0], m0
+ mova [tlq+32], m1
+.w32_main:
+ movd xm6, dxd
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6
+ vpbroadcastw m6, xm6
+ movd xm9, maxbased
+ vbroadcasti128 m8, [z_filter_s+2]
+ vpbroadcastw m9, xm9
+ mov r5d, dxd
+ psubw m9, [z_base_inc]
+ mova m11, m6
+ psubw m10, m9, m3 ; 64*8
+.w32_loop:
+ mov r3d, r5d
+ shr r3d, 6
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ movu m0, [tlq+r3+0]
+ movu m1, [tlq+r3+8]
+ add r5d, dxd
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ pcmpgtw m1, m9, m6
+ pcmpgtw m2, m10, m6
+ packsswb m1, m2
+ paddw m6, m11
+ vpblendvb m0, m7, m0, m1
+ mova [dstq], m0
+ dec hd
+ jz .w32_end
+ add dstq, strideq
+ cmp r5d, maxbased
+ jb .w32_loop
+ test hb, 1
+ jz .w32_end_loop
+ mova [dstq], m7
+ add dstq, strideq
+ dec hd
+ jz .w32_end
+.w32_end_loop:
+ mova [dstq+strideq*0], m7
+ mova [dstq+strideq*1], m7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_end_loop
+.w32_end:
+ RET
+ALIGN function_align
+.w64:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -128, 16
+ lea maxbased, [hq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w64_main
+ mova xm11, [tlq- 1] ; 0- 7
+ vinserti128 m11, [tlq+13], 1 ; 16-23
+ movu xm12, [tlq+ 5] ; 8-15
+ vinserti128 m12, [tlq+19], 1 ; 24-31
+ mova m7, [z_filter_s+0]
+ vinserti128 m8, m7, [z_filter_s+4], 1
+ vinserti128 m7, [z_filter_s+12], 0
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ movu xm13, [tlq+29] ; 32-39
+ vinserti128 m13, [tlq+43], 1 ; 48-55
+ movu xm14, [tlq+37] ; 40-47
+ vinserti128 m14, [tlq+51], 1 ; 56-63
+ pshufb m0, m11, m8
+ shufps m8, m7, q1021
+ pmaddubsw m0, m9
+ pshufb m2, m12, m8
+ pmaddubsw m2, m9
+ pshufb m1, m13, m8
+ pmaddubsw m1, m9
+ pshufb m6, m14, m8
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1]
+ pshufb m10, m11, m8
+ shufps m15, m8, m7, q2121
+ pmaddubsw m10, m9
+ paddw m0, m10
+ pshufb m10, m12, m15
+ pmaddubsw m10, m9
+ paddw m2, m10
+ pshufb m10, m13, m15
+ pmaddubsw m10, m9
+ paddw m1, m10
+ pshufb m10, m14, m15
+ pmaddubsw m10, m9
+ paddw m6, m10
+ vpbroadcastd m10, [z_filter_k+4*2+12*2]
+ pshufb m11, m15
+ pmaddubsw m11, m10
+ pshufb m12, m7
+ pmaddubsw m12, m10
+ pshufb m13, m7
+ pmaddubsw m13, m10
+ pshufb m14, m7
+ pmaddubsw m14, m10
+ paddw m0, m11
+ paddw m2, m12
+ paddw m1, m13
+ paddw m6, m14
+ movu xm11, [tlq+ 61] ; 64- 71
+ vinserti128 m11, [tlq+ 75], 1 ; 80- 87
+ movu xm12, [tlq+ 69] ; 72- 79
+ vinserti128 m12, [tlq+ 83], 1 ; 88- 95
+ movu xm13, [tlq+ 93] ; 96-103
+ vinserti128 m13, [tlq+107], 1 ; 112-119
+ movu xm14, [tlq+101] ; 104-111
+ vinserti128 m14, [tlq+115], 1 ; 120-127
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ lea r3d, [hq-20]
+ mov tlq, rsp
+ packuswb m0, m2
+ packuswb m1, m6
+ vpbroadcastd xm2, [pb_14]
+ vbroadcasti128 m6, [pb_0to15]
+ mova [tlq+32*0], m0
+ mova [tlq+32*1], m1
+ movd xm0, r3d
+ vpbroadcastd m1, [pb_12]
+ vpbroadcastb m0, xm0
+ paddb m0, m2
+ pminub m0, m6 ; clip 64x16 and 64x32
+ pshufb m12, m0
+ pminub m1, m6 ; clip 64x64
+ pshufb m14, m1
+ pshufb m0, m11, m7
+ pmaddubsw m0, m10
+ pshufb m2, m12, m7
+ pmaddubsw m2, m10
+ pshufb m1, m13, m7
+ pmaddubsw m1, m10
+ pshufb m6, m14, m7
+ pmaddubsw m6, m10
+ pshufb m7, m11, m15
+ pmaddubsw m7, m9
+ pshufb m10, m12, m15
+ pmaddubsw m10, m9
+ paddw m0, m7
+ pshufb m7, m13, m15
+ pmaddubsw m7, m9
+ paddw m2, m10
+ pshufb m10, m14, m15
+ pmaddubsw m10, m9
+ paddw m1, m7
+ paddw m6, m10
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ pshufb m11, m8
+ pmaddubsw m11, m9
+ pshufb m12, m8
+ pmaddubsw m12, m9
+ pshufb m13, m8
+ pmaddubsw m13, m9
+ pshufb m14, m8
+ pmaddubsw m14, m9
+ paddw m0, m11
+ paddw m2, m12
+ paddw m1, m13
+ paddw m6, m14
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq+32*2], m0
+ mova [tlq+32*3], m1
+.w64_main:
+ movd xm12, dxd
+ vpbroadcastb m7, [tlq+maxbaseq]
+ lea r3d, [dxq-64]
+ shl maxbased, 6
+ vpbroadcastw m12, xm12
+ sub r3d, maxbased
+ vbroadcasti128 m8, [z_filter_s+2]
+ movd xm6, r3d
+ mov r5d, dxd
+ mova m10, [pb_1to32]
+ vpbroadcastd m11, [pb_32]
+ vpbroadcastw m6, xm6
+.w64_loop:
+ mov r3d, r5d
+ shr r3d, 6
+ movu m0, [tlq+r3+ 0]
+ movu m1, [tlq+r3+ 8]
+ pand m2, m4, m6
+ psubw m9, m5, m2
+ psllw m2, 8
+ por m9, m2
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m9
+ pmaddubsw m1, m9
+ psraw m2, m6, 6
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packsswb m2, m2
+ paddb m2, m10
+ packuswb m0, m1
+ vpblendvb m0, m7, m0, m2
+ mova [dstq+ 0], m0
+ movu m0, [tlq+r3+32]
+ movu m1, [tlq+r3+40]
+ add r5d, dxd
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m9
+ pmaddubsw m1, m9
+ paddb m2, m11
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ paddw m6, m12
+ packuswb m0, m1
+ vpblendvb m0, m7, m0, m2
+ mova [dstq+32], m0
+ dec hd
+ jz .w64_end
+ add dstq, strideq
+ cmp r5d, maxbased
+ jb .w64_loop
+.w64_end_loop:
+ mova [dstq+ 0], m7
+ mova [dstq+32], m7
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ RET
+
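+; Z2 directional prediction (90 < angle < 180) can reference both the top
+; row and the left column, so both edges are copied to the stack up front.
+; Horizontal positions are tracked in 1/64th-pixel units; pixels whose x
+; offset has stepped past the top-left corner are predicted from the left
+; edge (gathered with vpgatherd) instead of the top edge.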
+cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy
+%define base r9-z_filter_t0
+ lea r9, [ipred_z2_avx2_table]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ lea dxq, [dr_intra_derivative-90]
+ movsxd wq, [r9+wq*4]
+ movzx dyd, angleb
+ xor angled, 0x400
+ mov r8, dxq
+ sub dxq, dyq
+ add wq, r9
+ add r9, z_filter_t0-ipred_z2_avx2_table
+ mova m2, [tlq-64]
+ mova m0, [tlq-32]
+ mova m1, [tlq]
+ and dyd, ~1
+ and dxq, ~1
+ movzx dyd, word [r8+dyq] ; angle - 90
+ movzx dxd, word [dxq+270] ; 180 - angle
+ vpbroadcastd m13, [base+pw_512]
+ vpbroadcastd m14, [base+pw_62]
+ vpbroadcastd m15, [base+pw_64]
+ mova [rsp+ 0], m2
+ mova [rsp+32], m0
+ mova [rsp+64], m1
+ neg dxd
+ neg dyd
+ jmp wq
+.w4:
+ vpbroadcastq m6, [base+z2_base_inc] ; base_inc << 6
+ vbroadcasti128 m10, [base+z1_shuf_w4]
+ vbroadcasti128 m11, [base+z2_shuf_h4]
+ lea r2d, [dxq+(65<<6)] ; xpos
+ movd xm5, dyd
+ mov r8d, (63-4)<<6
+ mov dyq, -4
+ pshuflw xm5, xm5, q0000
+ pmullw xm5, [base+z2_ymul]
+ test angled, 0x400
+ jnz .w4_main ; !enable_intra_edge_filter
+ lea r3d, [hq+2]
+ add angled, 1022
+ shl r3d, 6
+ test r3d, angled
+ jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
+ vpbroadcastd xm3, [base+pb_4]
+ call .upsample_above
+ sub angled, 1075 ; angle - 53
+ lea r3d, [hq+3]
+ xor angled, 0x7f ; 180 - angle
+ call .filter_strength
+ jmp .w4_filter_left
+ALIGN function_align
+.filter_strength:
+ movd xm8, r3d
+ mov r3d, angled
+ movd xm7, angled
+ vpbroadcastb m8, xm8
+ shr r3d, 8 ; is_sm << 1
+ vpbroadcastb m7, xm7
+ pcmpeqb m8, [base+z_filter_wh]
+ mova xm9, [r9+r3*8]
+ pand m0, m8, m7
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+ ret
+ALIGN function_align
+.upsample_above: ; w4/w8
+ pshufb xm2, xm1, [base+z_upsample1-2]
+ pminub xm3, [base+z_filter_s+4]
+ vpbroadcastd xm4, [base+pb_36_m4]
+ vbroadcasti128 m10, [base+pb_0to15]
+ pshufb xm3, xm1, xm3
+ pmaddubsw xm2, xm4
+ pmaddubsw xm3, xm4
+ lea r2d, [r2+dxq+(1<<6)]
+ add dxd, dxd
+ paddw xm2, xm3
+ pmulhrsw xm2, xm13
+ sub r8d, 3<<6
+ paddw m6, m6
+ packuswb xm2, xm2
+ punpcklbw xm1, xm2
+ mova [rsp+gprsize+64], xm1
+ ret
+ALIGN function_align
+.upsample_left: ; h4/h8
+ mov r3d, hd
+ and r3d, 4
+ movd xm2, [rsp+gprsize+64]
+ movddup xm0, [rsp+gprsize+56]
+ movd xm1, r3d
+ palignr xm2, xm0, 1
+ vpbroadcastb xm1, xm1
+ pshufb xm2, [base+z_filter_s+18]
+ vpbroadcastd xm3, [base+pb_36_m4]
+ pmaxub xm1, [base+z_upsample1-2]
+ pshufb xm1, xm0, xm1
+ pmaddubsw xm2, xm3
+ pmaddubsw xm1, xm3
+ paddw xm5, xm5
+ add dyq, dyq
+ paddw xm1, xm2
+ pmulhrsw xm1, xm13
+ vbroadcasti128 m11, [base+z2_upsample]
+ paddw xm5, xm15
+ packuswb xm1, xm1
+ punpcklbw xm0, xm1
+ mova [rsp+gprsize+48], xm0
+ ret
+.w4_no_upsample_above:
+ lea r3d, [hq+3]
+ sub angled, 1112 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w4_no_filter_above
+ popcnt r3d, r3d
+ vpbroadcastd xm2, [base+pb_4]
+ pminub xm2, [base+z_filter_s]
+ vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1]
+ pshufb xm3, xm1, xm2 ; 00 01 12 23
+ pshufd xm2, xm2, q0321
+ pmaddubsw xm0, xm3, xm0
+ pshufb xm2, xm1, xm2 ; 12 23 34 44
+ pmaddubsw xm2, xm4
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2]
+ punpckhqdq xm3, xm3 ; 34 44 44 44
+ pmaddubsw xm3, xm4
+ vpbroadcastd xm4, r6m ; max_width
+ packssdw xm4, xm4
+ paddw xm0, xm2
+ paddw xm0, xm3
+ pmulhrsw xm0, xm13
+ packsswb xm4, xm4
+ psrlq xm1, 8
+ psubb xm4, [base+pb_1to32]
+ packuswb xm0, xm0
+ vpblendvb xm0, xm1, xm4
+ movd [rsp+65], xm0
+.w4_no_filter_above:
+ lea r3d, [hq+2]
+ add angled, 973 ; angle + 883
+ shl r3d, 6
+ test r3d, angled
+ jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
+ vpbroadcastd xm0, [base+pb_90]
+ psubb xm0, xm7 ; 180 - angle
+ pand xm0, xm8 ; reuse from previous filter_strength call
+ pcmpgtb xm0, xm9
+ pmovmskb r3d, xm0
+.w4_filter_left:
+ test r3d, r3d
+ jz .w4_main
+ popcnt r3d, r3d
+ mov r5d, 10
+ cmp hd, 16
+ movu xm2, [rsp+49]
+ vinserti128 m2, [rsp+43], 1
+ cmovs r5d, hd
+ xor r5d, 15 ; h == 16 ? 5 : 15 - h
+ movd xm0, r5d
+ vbroadcasti128 m1, [base+z_filter_s+12]
+ vbroadcasti128 m4, [base+z_filter_s+16]
+ vinserti128 m3, m1, [z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab
+ vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd
+ vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef
+ vpbroadcastb m0, xm0
+ pmaxub m0, m3
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0]
+ pshufb m0, m2, m0
+ pmaddubsw m0, m3
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*1]
+ pshufb m1, m2, m1
+ pmaddubsw m1, m3
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2]
+ pshufb m2, m4
+ pmaddubsw m2, m3
+ vpbroadcastd xm4, r7m ; max_height
+ packssdw xm4, xm4
+ paddw m1, m0
+ paddw m1, m2
+ pmulhrsw m1, m13
+ packsswb xm4, xm4
+ vextracti128 xm0, m1, 1
+ psubb xm4, [base+pb_16to1]
+ packuswb xm0, xm1
+ vpblendvb xm0, [rsp+48], xm4
+ mova [rsp+48], xm0
+ jmp .w4_main
+.w4_upsample_left:
+ call .upsample_left
+.w4_main:
+ movd xm0, dxd
+ mova m12, [base+z2_y_shuf_h4]
+ lea r5, [rsp+56] ; left-7
+ vpbroadcastw m0, xm0
+ lea r9, [strideq*3]
+ psraw xm1, xm5, 6
+ pand xm5, xm14 ; frac_y
+ pxor xm2, xm2
+ paddw m7, m0, m0
+ psubw xm4, xm2, xm1 ; base_y
+ vpblendd m0, m7, 0xcc
+ mova xm1, xm7
+ punpcklwd xm4, xm2
+ paddw m0, m1 ; xpos2 xpos3 xpos0 xpos1
+ psubw xm1, xm15, xm5 ; 64-frac_y
+ psllw xm5, 8
+ paddw m7, m7
+ paddw m6, m0
+ por xm5, xm1 ; 64-frac_y, frac_y
+ vpbroadcastq m5, xm5
+.w4_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ vpbroadcastq m1, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ vpbroadcastq m2, [rsp+r3]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x2
+ movq xm0, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x3
+ movhps xm0, [rsp+r3]
+ vpblendd m1, m2, 0xc0
+ pand m2, m14, m6 ; frac_x
+ vpblendd m0, m1, 0xf0
+ psubw m1, m15, m2 ; 64-frac_x
+ psllw m2, 8
+ pshufb m0, m10
+ por m1, m2 ; 64-frac_x, frac_x
+ pmaddubsw m0, m1
+ cmp r3d, 64
+ jge .w4_toponly
+ mova m1, m7 ; arbitrary negative value
+ vpgatherdq m3, [r5+xm4], m1
+ pshufb m1, m3, m11
+ vpermd m1, m12, m1
+ pmaddubsw m1, m5
+ psraw m2, m6, 15 ; base_x < topleft
+ vpblendvb m0, m1, m2
+.w4_toponly:
+ pmulhrsw m0, m13
+ paddw m6, m7 ; xpos += dx
+ add r5, dyq
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*2], xm0
+ pextrd [dstq+r9 ], xm0, 1
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ sub hd, 4
+ jz .w4_end
+ lea dstq, [dstq+strideq*4]
+ cmp r2d, r8d
+ jge .w4_loop
+.w4_leftonly_loop:
+ mova m1, m7
+ vpgatherdq m2, [r5+xm4], m1
+ add r5, dyq
+ pshufb m0, m2, m11
+ vpermd m0, m12, m0
+ pmaddubsw m0, m5
+ pmulhrsw m0, m13
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*2], xm0
+ pextrd [dstq+r9 ], xm0, 1
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_leftonly_loop
+.w4_end:
+ RET
+.w8:
+ vbroadcasti128 m6, [base+z2_base_inc] ; base_inc << 6
+ movd xm5, dyd
+ vbroadcasti128 m10, [base+z_filter_s+2]
+ vbroadcasti128 m11, [base+z2_shuf_h4]
+ lea r2d, [dxq+(65<<6)] ; xpos
+ vpbroadcastw xm5, xm5
+ mov r8d, (63-8)<<6
+ mov dyq, -4
+ pmullw xm5, [base+z2_ymul]
+ test angled, 0x400
+ jnz .w8_main
+ lea r3d, [angleq+126]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+ vpbroadcastd xm3, [base+pb_8]
+ movhps [rsp+80], xm1
+ call .upsample_above
+ sub angled, 53 ; angle - 53
+ lea r3d, [hq+7]
+ xor angled, 0x7f ; 180 - angle
+ call .filter_strength
+ jmp .w8_filter_left
+.w8_no_upsample_above:
+ lea r3d, [hq+7]
+ sub angled, 90 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w8_no_filter_above
+ popcnt r3d, r3d
+ vpbroadcastd xm3, [base+pb_8]
+ pminub xm3, [base+z_filter_s+8]
+ vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1]
+ pshufb xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67
+ pmaddubsw xm0, xm2, xm0
+ pshufb xm3, xm1, xm3 ; 34 45 56 67 78 88 88 88
+ shufps xm2, xm3, q2121 ; 12 23 34 45 56 67 78 88
+ pmaddubsw xm2, xm4
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2]
+ pmaddubsw xm3, xm4
+ vpbroadcastd xm4, r6m ; max_width
+ packssdw xm4, xm4
+ paddw xm0, xm2
+ paddw xm0, xm3
+ pmulhrsw xm0, xm13
+ packsswb xm4, xm4
+ psrldq xm1, 1
+ psubb xm4, [base+pb_1to32]
+ packuswb xm0, xm0
+ vpblendvb xm0, xm1, xm4
+ movq [rsp+65], xm0
+.w8_no_filter_above:
+ lea r3d, [angleq-51]
+ mov r3b, hb
+ cmp r3d, 8
+ jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
+ vpbroadcastd m0, [base+pb_90]
+ psubb m0, m7
+ pand m0, m8
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+.w8_filter_left:
+ test r3d, r3d
+ jz .w8_main
+ popcnt r3d, r3d
+ vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2]
+ cmp hd, 32
+ jne .w8_filter_left_h16
+ movu xm2, [rsp+27]
+ vinserti128 m2, [rsp+35], 1
+ vpbroadcastd xm0, [base+pb_5]
+ vbroadcasti128 m3, [base+z_filter_s+ 8]
+ vbroadcasti128 m1, [base+z_filter_s+12]
+ vbroadcasti128 m4, [base+z_filter_s+16]
+ pmaxub m3, m0
+ pshufb m3, m2, m3
+ pmaddubsw m3, m7
+ pshufb m1, m2, m1
+ pmaddubsw m1, m8
+ pshufb m2, m4
+ pmaddubsw m2, m9
+ paddw m3, m1
+ paddw m3, m2
+ pmulhrsw m3, m13
+ jmp .w8_filter_left_top16
+.w8_filter_left_h16:
+ mov r5d, 10
+ cmp hd, 16
+ cmovs r5d, hd
+ xor r5d, 15 ; h == 16 ? 5 : 15 - h
+ movd xm0, r5d
+ vpbroadcastb m0, xm0
+.w8_filter_left_top16:
+ vbroadcasti128 m1, [base+z_filter_s+12]
+ vinserti128 m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab
+ vbroadcasti128 m4, [base+z_filter_s+16]
+ vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd
+ vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef
+ pmaxub m0, m2
+ movu xm2, [rsp+49]
+ vinserti128 m2, [rsp+43], 1
+ pshufb m0, m2, m0
+ pmaddubsw m0, m7
+ vpbroadcastd m7, r7m ; max_height
+ pshufb m1, m2, m1
+ pmaddubsw m1, m8
+ pshufb m2, m4
+ pmaddubsw m2, m9
+ packssdw m7, m7
+ paddw m1, m0
+ packsswb m7, m7
+ paddw m1, m2
+ pmulhrsw m1, m13
+ psubb m7, [base+pb_32to1]
+ packuswb m3, m1
+ vpermq m3, m3, q1320
+ vpblendvb m3, [rsp+32], m7
+ mova [rsp+32], m3
+ jmp .w8_main
+.w8_upsample_left:
+ call .upsample_left
+.w8_main:
+ movd xm3, dxd
+ lea r5, [rsp+56] ; left-7
+ pshufd xm1, xm5, q3120
+ pand xm5, xm14
+ vpbroadcastw m3, xm3
+ pxor xm0, xm0
+ psubw xm2, xm15, xm5
+ psraw xm1, 6
+ lea r9, [strideq*3]
+ paddw m7, m3, m3
+ psubw xm9, xm0, xm1 ; base_y
+ psllw xm5, 8
+ punpcklwd xm8, xm9, xm0 ; base_y 0, 1, 4, 5
+ vpblendd m3, m7, 0xf0 ; xpos0 xpos1
+ por xm5, xm2 ; 64-frac_y, frac_y
+ punpckhwd xm9, xm0 ; base_y 2, 3, 6, 7
+ paddw m6, m3
+ vinserti128 m12, m5, xm5, 1
+.w8_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movu xm0, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ vinserti128 m0, [rsp+r3], 1
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x2
+ movu xm1, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x3
+ vinserti128 m1, [rsp+r3], 1
+ pand m2, m14, m6
+ paddsw m4, m6, m7
+ psubw m5, m15, m2
+ psllw m2, 8
+ pshufb m0, m10
+ por m2, m5
+ pmaddubsw m0, m2
+ pand m2, m14, m4
+ psubw m5, m15, m2
+ psllw m2, 8
+ pshufb m1, m10
+ por m2, m5
+ pmaddubsw m1, m2
+ cmp r3d, 64
+ jge .w8_toponly
+ mova m5, m7
+ vpgatherdq m3, [r5+xm9], m7
+ mova m7, m5
+ vpgatherdq m2, [r5+xm8], m5
+ pshufb m3, m11
+ pshufb m2, m11
+ punpckldq m5, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m2, m3 ; a2 b2 c2 d2 a3 b3 c3 d3 e2 f2 g2 h2 e3 f3 g3 h3
+ vpermq m5, m5, q3120 ; y0 y1
+ vpermq m2, m2, q3120 ; y2 y3
+ pmaddubsw m5, m12
+ pmaddubsw m2, m12
+ psraw m6, 15 ; base_x < topleft
+ vpblendvb m0, m5, m6
+ psraw m3, m4, 15
+ vpblendvb m1, m2, m3
+.w8_toponly:
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ paddw m6, m4, m7 ; xpos += dx
+ add r5, dyq
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*2], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+r9 ], xm1
+ sub hd, 4
+ jz .w8_end
+ lea dstq, [dstq+strideq*4]
+ cmp r2d, r8d
+ jge .w8_loop
+.w8_leftonly_loop:
+ mova m0, m7
+ vpgatherdq m5, [r5+xm9], m7
+ mova m7, m0
+ vpgatherdq m3, [r5+xm8], m0
+ add r5, dyq
+ pshufb m2, m5, m11
+ pshufb m1, m3, m11
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+ pmaddubsw m0, m12
+ pmaddubsw m1, m12
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*2], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+r9 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_leftonly_loop
+.w8_end:
+ RET
+.w16:
+ mov r8d, hd
+ test angled, 0x400
+ jnz .w16_main
+ lea r3d, [hq+15]
+ sub angled, 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w16_no_filter_above
+ popcnt r3d, r3d
+ vbroadcasti128 m6, [tlq+1]
+ mova xm2, [base+z_filter_s]
+ vinserti128 m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67 67 78 89 9a ab bc cd de
+ movu xm3, [base+z_filter_s+8]
+ vinserti128 m3, [base+z_filter_s+22], 1 ; 34 45 56 67 78 89 9a ab ab bc cd de ef ff ff ff
+ vpblendd m1, m6, 0xf0
+ vpbroadcastd m0, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*2]
+ pshufb m2, m1, m2
+ pshufb m1, m3
+ pmaddubsw m0, m2, m0
+ shufps m2, m1, q2121 ; 12 23 34 45 56 67 78 89 89 9a ab bc cd de ef ff
+ pmaddubsw m2, m4
+ pmaddubsw m1, m5
+ vpbroadcastd xm4, r6m ; max_width
+ packssdw xm4, xm4
+ paddw m0, m2
+ paddw m0, m1
+ pmulhrsw m0, m13
+ packsswb xm4, xm4
+ vextracti128 xm2, m0, 1
+ psubb xm4, [base+pb_1to32]
+ packuswb xm0, xm2
+ vpblendvb xm0, xm6, xm4
+ movu [rsp+65], xm0
+.w16_no_filter_above:
+ vpbroadcastd m0, [base+pb_90]
+ psubb m0, m7
+ pand m0, m8
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+ test r3d, r3d
+ jz .w16_main
+ popcnt r3d, r3d
+ vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2]
+.w16_filter_left:
+ vpbroadcastd m6, r7m ; max_height
+ packssdw m6, m6
+ packsswb m6, m6
+ cmp hd, 32
+ jl .w16_filter_left_h16
+ vpbroadcastd xm0, [base+pb_5]
+ vbroadcasti128 m10, [base+z_filter_s+ 8]
+ vbroadcasti128 m11, [base+z_filter_s+12]
+ vbroadcasti128 m12, [base+z_filter_s+16]
+ je .w16_filter_left_h32
+ movu m3, [tlq-69]
+ movu m5, [tlq-61]
+ pmaxub m1, m10, m0
+ pshufb m1, m3, m1
+ pmaddubsw m1, m7
+ pshufb m2, m3, m11
+ pmaddubsw m2, m8
+ pshufb m3, m12
+ pmaddubsw m3, m9
+ paddw m1, m2
+ pshufb m2, m5, m10
+ pmaddubsw m2, m7
+ pshufb m4, m5, m11
+ pmaddubsw m4, m8
+ pshufb m5, m12
+ pmaddubsw m5, m9
+ paddw m1, m3
+ vpbroadcastd m3, [base+pb_32]
+ paddb m3, [base+pb_32to1]
+ paddw m2, m4
+ paddw m2, m5
+ pmulhrsw m1, m13
+ pmulhrsw m2, m13
+ psubb m3, m6, m3
+ packuswb m1, m2
+ vpblendvb m1, [tlq-64], m3
+ mova [rsp], m1
+ jmp .w16_filter_left_top32
+.w16_filter_left_h32:
+ pmaxub m10, m0
+.w16_filter_left_top32:
+ movu xm2, [tlq-37]
+ vinserti128 m2, [tlq-29], 1
+ pshufb m3, m2, m10
+ pshufb m1, m2, m11
+ pshufb m2, m12
+ pmaddubsw m3, m7
+ pmaddubsw m1, m8
+ pmaddubsw m2, m9
+ paddw m3, m1
+ paddw m3, m2
+ pmulhrsw m3, m13
+ jmp .w16_filter_left_top16
+.w16_filter_left_h16:
+ mov r5d, 10
+ cmp hd, 16
+ cmovs r5d, hd
+ xor r5d, 15 ; h == 16 ? 5 : 15 - h
+ movd xm0, r5d
+ vpbroadcastb m0, xm0
+.w16_filter_left_top16:
+ movu xm2, [tlq-15]
+ vinserti128 m2, [tlq-21], 1
+ vbroadcasti128 m1, [base+z_filter_s+12]
+ vbroadcasti128 m4, [base+z_filter_s+16]
+ vinserti128 m5, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 34 45 56 67 78 89 9a ab
+ vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd
+ vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef
+ pmaxub m0, m5
+ pshufb m0, m2, m0
+ pmaddubsw m0, m7
+ pshufb m1, m2, m1
+ pmaddubsw m1, m8
+ pshufb m2, m4
+ pmaddubsw m2, m9
+ psubb m6, [base+pb_32to1]
+ paddw m1, m0
+ paddw m1, m2
+ pmulhrsw m1, m13
+ packuswb m3, m1
+ vpermq m3, m3, q1320
+ vpblendvb m3, [tlq-32], m6
+ mova [rsp+32], m3
+.w16_main:
+ movd xm1, dyd
+ vbroadcasti128 m10, [base+z_filter_s+2]
+ movd xm7, dxd
+ vbroadcasti128 m11, [base+z2_shuf_h2]
+ vpbroadcastw m1, xm1
+ vpbroadcastw m7, xm7
+ mov r7, dstq
+ pmullw m0, m1, [base+z2_ymul]
+ psllw xm1, 4
+ paddw m6, m7, [base+z2_base_inc]
+ lea r9d, [dxq+(65<<6)] ; xpos
+ movd [rsp+156], xm1
+.w16_loop0:
+ mov r2d, r9d
+ mova [rsp+160], m0
+ lea r5, [rsp+60] ; left-3
+ mova [rsp+192], m6
+ pxor m1, m1
+ psraw m2, m0, 6
+ pand m0, m14
+ psubw m9, m1, m2 ; base_y
+ psubw m12, m15, m0
+ punpcklwd m8, m9, m1 ; base_y 0, 1, 2, 3, 8, 9, 10, 11
+ psllw m0, 8
+ punpckhwd m9, m1 ; base_y 4, 5, 6, 7, 12, 13, 14, 15
+ por m12, m0 ; 64-frac_y, frac_y
+.w16_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movu xm0, [rsp+r2]
+ vinserti128 m0, [rsp+r2+8], 1
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ movu xm1, [rsp+r3]
+ vinserti128 m1, [rsp+r3+8], 1
+ pand m2, m14, m6
+ paddsw m5, m6, m7
+ psubw m3, m15, m2
+ psllw m2, 8
+ pshufb m0, m10
+ por m2, m3
+ pmaddubsw m0, m2
+ pand m2, m14, m5
+ psubw m3, m15, m2
+ psllw m2, 8
+ pshufb m1, m10
+ por m2, m3
+ pmaddubsw m1, m2
+ cmp r3d, 64
+ jge .w16_toponly
+ punpckhwd m2, m5, m5 ; mask out unnecessary loads
+ vpgatherdd m4, [r5+m9], m2
+ punpcklwd m2, m5, m5
+ vpgatherdd m3, [r5+m8], m2
+ pshufb m4, m11 ; e0 f0 g0 h0 e1 f1 g1 h1 m0 n0 o0 p0 m1 n1 o1 p1
+ pshufb m3, m11 ; a0 b0 c0 d0 a1 b1 c1 d1 i0 j0 k0 l0 i1 j1 k1 l1
+ punpcklqdq m2, m3, m4 ; y0
+ punpckhqdq m3, m4 ; y1
+ pmaddubsw m2, m12
+ pmaddubsw m3, m12
+ psraw m6, 15 ; base_x < topleft
+ vpblendvb m0, m2, m6
+ psraw m6, m5, 15
+ vpblendvb m1, m3, m6
+.w16_toponly:
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ paddw m6, m5, m7 ; xpos += dx
+ sub r5, 2
+ packuswb m0, m1
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w16_end
+ lea dstq, [dstq+strideq*2]
+ cmp r2d, (63-16)<<6
+ jge .w16_loop
+.w16_leftonly_loop:
+ mova m0, m7
+ vpgatherdd m4, [r5+m9], m7
+ mova m7, m0
+ vpgatherdd m3, [r5+m8], m0
+ sub r5, 2
+ pshufb m2, m4, m11
+ pshufb m1, m3, m11
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ pmaddubsw m0, m12
+ pmaddubsw m1, m12
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ packuswb m0, m1
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_leftonly_loop
+.w16_end:
+ sub r8d, 1<<8
+ jl .w16_ret
+ vpbroadcastd m0, [rsp+156]
+ paddw m0, [rsp+160] ; base_y += 16*dy
+ paddw m6, m13, [rsp+192]
+ add r7, 16
+ add r9d, 16<<6
+ movzx hd, r8b
+ mov dstq, r7
+ paddw m6, m13 ; base_x += 16*64
+ jmp .w16_loop0
+.w16_ret:
+ RET
+.w32:
+ mova m2, [tlq+32]
+ lea r8d, [hq+(1<<8)]
+ mova [rsp+96], m2
+ test angled, 0x400
+ jnz .w16_main
+ vpbroadcastd m7, [base+z_filter_k+4*2+12*0]
+ vpbroadcastd m8, [base+z_filter_k+4*2+12*1]
+ vpbroadcastd m9, [base+z_filter_k+4*2+12*2]
+ mova xm5, [base+z_filter_s]
+ vinserti128 m5, [base+z_filter_s+10], 1 ; 00 01 12 23 34 45 56 67 45 56 67 78 89 9a ab bc
+ vinserti128 m1, [tlq+11], 1
+ movu xm6, [base+z_filter_s+12]
+ vinserti128 m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd ab bc cd de ef ff ff ff
+ movu xm3, [tlq+ 6]
+ vinserti128 m3, [tlq+17], 1
+ vpbroadcastd m10, r6m ; max_width
+ packssdw m10, m10
+ packsswb m10, m10
+.w32_filter_above:
+ pshufb m0, m1, m5
+ shufps m4, m5, m6, q1021 ; 12 23 34 45 56 67 78 89 67 78 89 9a ab bc cd de
+ pmaddubsw m0, m7
+ pshufb m2, m1, m4
+ shufps m5, m6, q2132 ; 34 45 56 67 78 89 9a ab 89 9a ab bc cd de ef ff
+ pmaddubsw m2, m8
+ pshufb m1, m5
+ pmaddubsw m1, m9
+ paddw m0, m2
+ paddw m0, m1
+ pshufb m1, m3, m4
+ pmaddubsw m1, m7
+ pshufb m2, m3, m5
+ pmaddubsw m2, m8
+ pshufb m3, m6
+ pmaddubsw m3, m9
+ paddw m1, m2
+ paddw m1, m3
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ psubb m10, [base+pb_1to32]
+ packuswb m0, m1
+ vpblendvb m0, [tlq+1], m10
+ movu [rsp+65], m0
+ jmp .w16_filter_left
+.w64:
+ mova m2, [tlq+32]
+ mov r3d, [tlq+64]
+ lea r8d, [hq+(3<<8)]
+ mova [rsp+ 96], m2
+ mov [rsp+128], r3d
+ test angled, 0x400
+ jnz .w16_main
+ vpbroadcastd m7, [base+z_filter_k+4*2+12*0]
+ vpbroadcastd m8, [base+z_filter_k+4*2+12*1]
+ vpbroadcastd m9, [base+z_filter_k+4*2+12*2]
+ movu xm6, [base+z_filter_s+ 4]
+ vinserti128 m6, [base+z_filter_s+10], 1 ; 12 23 34 45 56 67 78 89 45 56 67 78 89 9a ab bc
+ movu xm3, [tlq+30]
+ vinserti128 m3, [tlq+43], 1
+ movu xm5, [base+z_filter_s+16]
+ vinserti128 m5, [base+z_filter_s+22], 1 ; 78 89 9a ab bc cd de ef ab bc cd de ef ff ff ff
+ pshufb m0, m3, m6
+ shufps m4, m6, m5, q1021 ; 34 45 56 67 78 89 9a ab 67 78 89 9a ab bc cd de
+ pmaddubsw m0, m7
+ pshufb m2, m3, m4
+ shufps m6, m5, q2132 ; 56 67 78 89 9a ab bc cd 89 9a ab bc cd de ef ff
+ pmaddubsw m2, m8
+ pshufb m3, m6
+ pmaddubsw m3, m9
+ paddw m0, m2
+ paddw m0, m3
+ movu xm2, [tlq+36]
+ vinserti128 m2, [tlq+49], 1
+ vpbroadcastd m10, r6m ; max_width
+ pshufb m4, m2, m4
+ pmaddubsw m4, m7
+ pshufb m3, m2, m6
+ pmaddubsw m3, m8
+ pshufb m2, m5
+ pmaddubsw m2, m9
+ packssdw m10, m10
+ paddw m3, m4
+ paddw m2, m3
+ vpbroadcastd m3, [base+pb_32]
+ pmulhrsw m0, m13
+ pmulhrsw m2, m13
+ packsswb m10, m10
+ mova xm5, [base+z_filter_s]
+ vinserti128 m5, [base+z_filter_s+6], 1
+ psubb m3, m10, m3
+ psubb m3, [base+pb_1to32]
+ vinserti128 m1, [tlq+13], 1
+ packuswb m0, m2
+ vpblendvb m0, [tlq+33], m3
+ movu xm3, [tlq+ 6]
+ vinserti128 m3, [tlq+19], 1
+ movu [rsp+97], m0
+ jmp .w32_filter_above
+
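+; Z3 directional prediction (angle > 180) references only the left edge.
+; Results are produced column-wise and transposed before being written to
+; dst, either directly via shuffles (h4) or through a scratch buffer and the
+; per-width transpose tails (.h*_transpose) for the larger sizes.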
+cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
+ %assign org_stack_offset stack_offset
+ lea r6, [ipred_z3_avx2_table]
+ tzcnt hd, hm
+ movifnidn angled, anglem
+ lea r7, [dr_intra_derivative+45*2-1]
+ dec tlq
+ movsxd hq, [r6+hq*4]
+ sub angled, 180
+ add hq, r6
+ mov dyd, angled
+ neg dyd
+ xor angled, 0x400
+ or dyq, ~0x7e
+ movzx dyd, word [r7+dyq]
+ vpbroadcastd m3, [pw_512]
+ vpbroadcastd m4, [pw_62]
+ vpbroadcastd m5, [pw_64]
+ mov org_wd, wd
+ jmp hq
+.h4:
+ lea r7, [strideq*3]
+ cmp angleb, 40
+ jae .h4_no_upsample
+ lea r4d, [angleq-1024]
+ sar r4d, 7
+ add r4d, wd
+ jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
+ ALLOC_STACK -32, 9
+ movu xm8, [tlq-7]
+ pshufb xm0, xm8, [z_upsample1-4]
+ vpbroadcastb xm2, xm8
+ pshufb xm1, xm8, [z_filter_s+2]
+ mova [rsp+16], xm2 ; top[max_base_y]
+ vpbroadcastd xm2, [pb_36_m4]
+ add dyd, dyd
+ pmaddubsw xm0, xm2
+ pmaddubsw xm1, xm2
+ movd xm7, dyd
+ mov r2d, dyd
+ vpbroadcastw m7, xm7
+ paddw xm1, xm0
+ pmulhrsw xm1, xm3
+ pslldq m6, m7, 8
+ paddw xm2, xm7, xm7
+ paddw m6, m7
+ packuswb xm1, xm1
+ paddw m6, m2
+ punpcklbw xm1, xm8
+ mova xm8, [z_transpose4]
+ psllw m7, 2
+ pshufb xm1, [pb_15to0]
+ mova [rsp], xm1
+.h4_upsample_loop:
+ lea r4d, [r2+dyq]
+ shr r2d, 6
+ vpbroadcastq m1, [rsp+r2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6
+ vpbroadcastq m2, [rsp+r4]
+ lea r4d, [r2+dyq]
+ shr r2d, 6
+ movq xm0, [rsp+r2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6
+ movhps xm0, [rsp+r4]
+ vpblendd m1, m2, 0xc0
+ pand m2, m4, m6
+ vpblendd m0, m1, 0xf0
+ psubw m1, m5, m2
+ psllw m2, 8
+ por m1, m2
+ pmaddubsw m0, m1
+ paddw m6, m7
+ pmulhrsw m0, m3
+ vextracti128 xm1, m0, 1
+ packuswb xm1, xm0
+ pshufb xm1, xm8
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+r7 ], xm1, 3
+ add dstq, 4
+ sub wd, 4
+ jg .h4_upsample_loop
+ RET
+ALIGN function_align
+.filter_strength: ; h4/h8/h16
+%define base r4-z_filter_t0
+ lea r4, [z_filter_t0]
+ movd xm0, maxbased
+ movd xm2, angled
+ shr angled, 8 ; is_sm << 1
+ vpbroadcastb m0, xm0
+ vpbroadcastb m2, xm2
+ pcmpeqb m1, m0, [base+z_filter_wh]
+ pand m1, m2
+ mova xm2, [r4+angleq*8]
+ pcmpgtb m1, m2
+ pmovmskb r5d, m1
+ ret
+.h4_no_upsample:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -16, 12
+ mov maxbased, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h4_main
+ lea maxbased, [wq+3]
+ call .filter_strength
+ mov maxbased, 7
+ test r5d, r5d
+ jz .h4_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastd m7, [base+pb_7]
+ vbroadcasti128 m2, [tlq-14]
+ pmaxub m1, m7, [base+z_filter_s-4]
+ vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0]
+ pmaxub m7, [base+z_filter_s+4]
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2]
+ pshufb m0, m2, m1
+ shufps m1, m7, q2121
+ pmaddubsw m0, m8
+ pshufb m1, m2, m1
+ pmaddubsw m1, m9
+ pshufb m2, m7
+ pmaddubsw m2, m10
+ paddw m0, m1
+ paddw m0, m2
+ pmulhrsw m0, m3
+ mov r4d, 9
+ lea tlq, [rsp+15]
+ cmp wd, 4
+ cmovne maxbased, r4d
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ mova [rsp], xm0
+.h4_main:
+ movd xm6, dyd
+ vpbroadcastq m0, [z_base_inc] ; base_inc << 6
+ mov r4, tlq
+ sub tlq, 4
+ neg dyq
+ vpbroadcastw m6, xm6
+ sub r4, maxbaseq
+ shl maxbased, 6
+ vpbroadcastb m7, [r4]
+ lea r4, [dyq+63] ; ypos
+ movd xm9, maxbased
+ not maxbased
+ vbroadcasti128 m8, [z3_shuf_w4]
+ add maxbased, 64
+ vpbroadcastw m9, xm9
+ psrlw m7, 8 ; top[max_base_y]
+ paddw m10, m6, m6
+ psubw m9, m0 ; max_base_y
+ vpblendd m6, m10, 0xcc
+ mova xm0, xm10
+ paddw m6, m0 ; ypos2 ypos3 ypos0 ypos1
+ paddw m10, m10
+ mova xm11, [z_transpose4]
+.h4_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base0
+ vpbroadcastq m1, [tlq+r4]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base1
+ vpbroadcastq m2, [tlq+r5]
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base2
+ movq xm0, [tlq+r4]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base3
+ movhps xm0, [tlq+r5]
+ vpblendd m1, m2, 0xc0
+ pand m2, m4, m6 ; frac
+ vpblendd m0, m1, 0xf0
+ psubw m1, m5, m2 ; 64-frac
+ psllw m2, 8
+ pshufb m0, m8
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ pcmpgtw m1, m9, m6 ; base < max_base_y
+ pmulhrsw m0, m3
+ paddw m6, m10 ; ypos += dy
+ vpblendvb m0, m7, m0, m1
+ vextracti128 xm1, m0, 1
+ packuswb xm1, xm0
+ pshufb xm1, xm11 ; transpose
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+r7 ], xm1, 3
+ sub wd, 4
+ jz .h4_end
+ add dstq, 4
+ cmp r4d, maxbased
+ jg .h4_loop
+ packuswb xm7, xm7
+.h4_end_loop:
+ movd [dstq+strideq*0], xm7
+ movd [dstq+strideq*1], xm7
+ movd [dstq+strideq*2], xm7
+ movd [dstq+r7 ], xm7
+ add dstq, 4
+ sub wd, 4
+ jg .h4_end_loop
+.h4_end:
+ RET
+ALIGN function_align
+.h8:
+ lea r4d, [angleq+216]
+ mov r4b, wb
+ cmp r4d, 8
+ ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -32, 8
+ and r4d, 4
+ mova xm0, [tlq-15]
+ vinserti128 m0, [tlq- 9], 1
+ movd xm1, r4d
+ movu xm2, [z_filter_s+2]
+ vinserti128 m2, [z_filter_s+6], 1
+ vpbroadcastb xm1, xm1 ; w & 4
+ vpbroadcastd m7, [pb_36_m4]
+ pmaxub xm1, [z_upsample1-4] ; clip 4x8
+ vinserti128 m1, [z_upsample1], 1
+ add dyd, dyd
+ pshufb m1, m0, m1
+ pshufb m2, m0, m2
+ vinserti128 m0, [tlq-7], 1
+ movd xm6, dyd
+ pmaddubsw m1, m7
+ pmaddubsw m2, m7
+ vpbroadcastw m6, xm6
+ mov r2d, dyd
+ lea r5, [strideq*3]
+ paddw m7, m6, m6
+ paddw m1, m2
+ vpblendd m6, m7, 0xf0
+ pmulhrsw m1, m3
+ pslldq m2, m7, 8
+ paddw m7, m7
+ paddw m6, m2
+ vbroadcasti128 m2, [pb_15to0]
+ packuswb m1, m1
+ punpcklbw m1, m0
+ pshufb m1, m2
+ vextracti128 [rsp+ 0], m1, 1
+ mova [rsp+16], xm1
+.h8_upsample_loop:
+ lea r4d, [r2+dyq]
+ shr r2d, 6 ; base0
+ movu xm0, [rsp+r2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6 ; base1
+ vinserti128 m0, [rsp+r4], 1
+ lea r4d, [r2+dyq]
+ shr r2d, 6 ; base2
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ punpcklqdq m1, m2, m2 ; frac0 frac1
+ pmaddubsw m0, m1
+ movu xm1, [rsp+r2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6 ; base3
+ vinserti128 m1, [rsp+r4], 1
+ punpckhqdq m2, m2 ; frac2 frac3
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ paddw m6, m7
+ pmulhrsw m1, m3
+ lea r4, [dstq+strideq*4]
+ psllw m1, 8
+ por m0, m1
+ vextracti128 xm1, m0, 1
+ punpcklbw xm2, xm0, xm1
+ punpckhbw xm0, xm1
+ movd [dstq+strideq*0], xm2
+ pextrd [dstq+strideq*1], xm2, 1
+ pextrd [dstq+strideq*2], xm2, 2
+ pextrd [dstq+r5 ], xm2, 3
+ movd [r4 +strideq*0], xm0
+ pextrd [r4 +strideq*1], xm0, 1
+ pextrd [r4 +strideq*2], xm0, 2
+ pextrd [r4 +r5 ], xm0, 3
+ add dstq, 4
+ sub wd, 4
+ jg .h8_upsample_loop
+ RET
+.h8_no_intra_edge_filter:
+ and maxbased, 7
+ or maxbased, 8 ; imin(w+7, 15)
+ jmp .h8_main
+.h8_no_upsample:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -32, 10
+ lea maxbased, [wq+7]
+ test angled, 0x400
+ jnz .h8_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .h8_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastd xm6, [base+pb_15]
+ pcmpeqb xm1, xm1
+ psubusb xm6, xm0
+ psubb xm6, xm1 ; w == 4 ? 5 : 1
+ movu xm2, [tlq-16]
+ pmaxub xm1, xm6, [base+z_filter_s]
+ vinserti128 m2, [tlq-14], 1
+ vinserti128 m1, [base+z_filter_s+12], 1
+ vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0]
+ pmaxub xm6, [base+z_filter_s+ 8]
+ vinserti128 m6, [base+z_filter_s+20], 1
+ pshufb m0, m2, m1
+ pmaddubsw m0, m7
+ vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1]
+ movzx r4d, byte [tlq-15]
+ shufps m1, m6, q2121
+ pshufb m1, m2, m1
+ pmaddubsw m1, m7
+ paddw m0, m1
+ sub r5d, 3
+ jnz .h8_3tap
+ vpbroadcastd m7, [z_filter_k+4*8]
+ movzx r2d, byte [tlq-14]
+ pshufb m2, m6
+ pmaddubsw m2, m7
+ sub r2d, r4d
+ lea r2d, [r2+r4*8+4]
+ shr r2d, 3
+ mov [rsp+15], r2b
+ paddw m0, m2
+.h8_3tap:
+ pmulhrsw m0, m3
+ sar r5d, 1
+ lea tlq, [rsp+31]
+ add r5d, 17
+ cmp wd, 16
+ cmovns maxbased, r5d
+ neg r5
+ mov [tlq+r5], r4b
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ mova [tlq-15], xm0
+.h8_main:
+ movd xm2, dyd
+ vbroadcasti128 m0, [z_base_inc]
+ mov r4, tlq
+ sub tlq, 8
+ neg dyq
+ vpbroadcastw m2, xm2
+ sub r4, maxbaseq
+ shl maxbased, 6
+ vpbroadcastb m7, [r4]
+ lea r4, [dyq+63]
+ movd xm9, maxbased
+ not maxbased
+ vbroadcasti128 m8, [z3_shuf]
+ add maxbased, 64
+ vpbroadcastw m9, xm9
+ psrlw m7, 8
+ psubw m9, m0
+ paddw m6, m2, m2
+ vpblendd m2, m6, 0x0f
+.h8_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6
+ pand m0, m4, m2
+ psubw m1, m5, m0
+ psllw m0, 8
+ por m1, m0
+ vbroadcasti128 m0, [tlq+r4]
+ lea r4, [r5+dyq]
+ sar r5, 6
+ vinserti128 m0, [tlq+r5], 0
+ sub rsp, 8*2
+ pshufb m0, m8
+ pmaddubsw m0, m1
+ pcmpgtw m1, m9, m2
+ paddw m2, m6
+ pmulhrsw m0, m3
+ vpblendvb m0, m7, m0, m1
+ vextracti128 xm1, m0, 1
+ psllw xm0, 8
+ por xm0, xm1 ; interleave rows (partial transpose)
+ mova [rsp], xm0
+ sub wd, 2
+ jz .h8_transpose
+ cmp r4d, maxbased
+ jg .h8_loop
+ packuswb xm0, xm7, xm7
+.h8_end_loop:
+ sub rsp, 8*2
+ mova [rsp], xm0
+ sub wd, 2
+ jg .h8_end_loop
+.h8_transpose:
+ mova xm2, [rsp+16*1]
+ sub org_wd, 8
+ lea r2, [strideq*3]
+ lea r6, [dstq+org_wq]
+ cmovns dstq, r6
+ punpcklwd xm1, xm2, xm0
+ punpckhwd xm2, xm0
+ lea r6, [dstq+strideq*4]
+ jge .h8_w8
+ add rsp, 16*2
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+r2 ], xm1, 3
+ movd [r6 +strideq*0], xm2
+ pextrd [r6 +strideq*1], xm2, 1
+ pextrd [r6 +strideq*2], xm2, 2
+ pextrd [r6 +r2 ], xm2, 3
+ jmp .h8_end
+.h8_w8_loop:
+ mova xm0, [rsp+16*0]
+ mova xm2, [rsp+16*1]
+ punpcklwd xm1, xm2, xm0
+ punpckhwd xm2, xm0
+.h8_w8: ; w8/w16/w32
+ mova xm0, [rsp+16*2]
+ mova xm4, [rsp+16*3]
+ add rsp, 16*4
+ punpcklwd xm3, xm4, xm0
+ punpckhwd xm4, xm0
+ punpckldq xm0, xm3, xm1
+ punpckhdq xm3, xm1
+ punpckldq xm1, xm4, xm2
+ punpckhdq xm4, xm2
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm3
+ movhps [dstq+r2 ], xm3
+ movq [r6 +strideq*0], xm1
+ movhps [r6 +strideq*1], xm1
+ movq [r6 +strideq*2], xm4
+ movhps [r6 +r2 ], xm4
+ sub dstq, 8
+ sub r6, 8
+ sub org_wd, 8
+ jge .h8_w8_loop
+.h8_end:
+ RET
+.h16_no_intra_edge_filter:
+ and maxbased, 15
+ or maxbased, 16 ; imin(w+15, 31)
+ jmp .h16_main
+ALIGN function_align
+.h16:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -64, 12
+ lea maxbased, [wq+15]
+ test angled, 0x400
+ jnz .h16_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .h16_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastd m11, [base+pb_27]
+ vpbroadcastd m1, [base+pb_1]
+ vbroadcasti128 m6, [base+z_filter_s+12]
+ vinserti128 m2, m6, [base+z_filter_s+4], 0
+ vinserti128 m6, [base+z_filter_s+20], 1
+ movu xm10, [tlq-18]
+ vinserti128 m10, [tlq-14], 1
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0]
+ vbroadcasti128 m7, [base+z_filter_s+8]
+ vinserti128 m8, m7, [base+z_filter_s+0], 0
+ vinserti128 m7, [base+z_filter_s+16], 1
+ psubusb m11, m0
+ por m1, m11
+ movu xm11, [tlq-32]
+ vinserti128 m11, [tlq-28], 1
+ pmaxub m8, m1
+ pmaxub m7, m1
+ pshufb m0, m10, m2
+ shufps m2, m6, q2121
+ pmaddubsw m0, m9
+ pshufb m1, m11, m8
+ shufps m8, m7, q2121
+ pmaddubsw m1, m9
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
+ movzx r4d, byte [tlq-31]
+ pshufb m2, m10, m2
+ pmaddubsw m2, m9
+ pshufb m8, m11, m8
+ pmaddubsw m8, m9
+ paddw m0, m2
+ paddw m1, m8
+ sub r5d, 3
+ jnz .h16_3tap
+ vpbroadcastd m9, [z_filter_k+4*8]
+ movzx r2d, byte [tlq-30]
+ pshufb m10, m6
+ pmaddubsw m10, m9
+ pshufb m11, m7
+ pmaddubsw m11, m9
+ sub r2d, r4d
+ lea r2d, [r2+r4*8+4]
+ shr r2d, 3
+ mov [rsp+31], r2b
+ paddw m0, m10
+ paddw m1, m11
+.h16_3tap:
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ sar r5d, 1
+ lea tlq, [rsp+63]
+ add r5d, 33
+ cmp wd, 32
+ cmovns maxbased, r5d
+ neg r5
+ mov [tlq+r5], r4b
+ packuswb m0, m1
+ vpermq m0, m0, q2031
+ mova [tlq-31], m0
+.h16_main:
+ movd xm6, dyd
+ vbroadcasti128 m0, [z_base_inc]
+ mov r4, tlq
+ sub tlq, 8
+ neg dyq
+ vpbroadcastw m6, xm6
+ sub r4, maxbaseq
+ shl maxbased, 6
+ vpbroadcastb m7, [r4]
+ lea r4, [dyq+63]
+ movd xm9, maxbased
+ not maxbased
+ vbroadcasti128 m8, [z3_shuf]
+ add maxbased, 64
+ vpbroadcastw m9, xm9
+ psubw m9, m0
+ paddw m11, m6, m6
+ psubw m10, m9, m3 ; 64*8
+ vpblendd m6, m11, 0xf0
+.h16_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ movu xm0, [tlq+r4-0]
+ movu xm1, [tlq+r4-8]
+ lea r4, [r5+dyq]
+ sar r5, 6
+ vinserti128 m0, [tlq+r5-0], 1
+ vinserti128 m1, [tlq+r5-8], 1
+ sub rsp, 32
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ pcmpgtw m1, m9, m6
+ pcmpgtw m2, m10, m6
+ packsswb m1, m2
+ paddw m6, m11
+ vpblendvb m0, m7, m0, m1
+ vpermq m0, m0, q3120
+ mova [rsp], m0
+ sub wd, 2
+ jz .h16_transpose
+ cmp r4d, maxbased
+ jg .h16_loop
+ mova m0, m7
+.h16_end_loop:
+ sub rsp, 32
+ mova [rsp], m7
+ sub wd, 2
+ jg .h16_end_loop
+.h16_transpose:
+ mova m2, [rsp+32*1]
+ sub org_wd, 8
+ lea r2, [strideq*3]
+ lea r6, [dstq+org_wq]
+ cmovns dstq, r6
+ punpcklbw m1, m2, m0
+ punpckhbw m2, m0
+ lea r3, [strideq*5]
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ lea r4, [strideq+r2*2] ; stride*7
+ jge .h16_w8
+ add rsp, 32*2
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ vextracti128 xm0, m0, 1
+ movd [dstq+strideq*4], xm1
+ pextrd [dstq+r3 ], xm1, 1
+ pextrd [dstq+r2*2 ], xm1, 2
+ pextrd [dstq+r4 ], xm1, 3
+ lea dstq, [dstq+strideq*8]
+ vextracti128 xm1, m1, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ movd [dstq+strideq*4], xm1
+ pextrd [dstq+r3 ], xm1, 1
+ pextrd [dstq+r2*2 ], xm1, 2
+ pextrd [dstq+r4 ], xm1, 3
+ jmp .h16_end
+.h16_w8_loop:
+ mova m0, [rsp+32*0]
+ mova m2, [rsp+32*1]
+ punpcklbw m1, m2, m0
+ punpckhbw m2, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+.h16_w8:
+ mova m2, [rsp+32*2]
+ mova m4, [rsp+32*3]
+ lea r6, [dstq+strideq*8]
+ add rsp, 32*4
+ punpcklbw m3, m4, m2
+ punpckhbw m4, m2
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ punpckldq m4, m2, m0
+ punpckhdq m2, m0
+ punpckldq m0, m3, m1
+ punpckhdq m3, m1
+ movq [dstq+strideq*0], xm4
+ movhps [dstq+strideq*1], xm4
+ vextracti128 xm4, m4, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+r2 ], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+strideq*4], xm0
+ movhps [dstq+r3 ], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+r2*2 ], xm3
+ movhps [dstq+r4 ], xm3
+ vextracti128 xm3, m3, 1
+ movq [r6+strideq*0], xm4
+ movhps [r6+strideq*1], xm4
+ movq [r6+strideq*2], xm2
+ movhps [r6+r2 ], xm2
+ movq [r6+strideq*4], xm0
+ movhps [r6+r3 ], xm0
+ movq [r6+r2*2 ], xm3
+ movhps [r6+r4 ], xm3
+ sub dstq, 8
+ sub org_wd, 8
+ jge .h16_w8_loop
+.h16_end:
+ RET
+ALIGN function_align
+.h32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -96, 15
+ lea maxbased, [wq+31]
+ and maxbased, 31
+ or maxbased, 32 ; imin(w+31, 63)
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h32_main
+ vbroadcasti128 m0, [pb_0to15]
+ mov r4d, 21
+ mov r5d, 3
+ movu xm11, [tlq-66] ; 56-63
+ vinserti128 m11, [tlq-52], 1 ; 40-47
+ sub r4d, wd ; 21-w
+ cmovns r5d, r4d
+ movu xm12, [tlq-58] ; 48-55
+ vinserti128 m12, [tlq-44], 1 ; 32-39
+ sub r4d, 8 ; 13-w
+ movd xm1, r5d
+ movu xm13, [tlq-34] ; 24-31
+ vinserti128 m13, [tlq-20], 1 ; 8-15
+ movd xm2, r4d
+ vpbroadcastb m1, xm1
+ movu xm14, [tlq-28] ; 16-23
+ vinserti128 m14, [tlq-14], 1 ; 0- 7
+ vpbroadcastb m2, xm2
+ pmaxsb m1, m0 ; clip 16x32 and (32|64)x32
+ movu m7, [z_filter_s+4]
+ pshufb m11, m1
+ vinserti128 m8, m7, [z_filter_s+8], 1
+ vinserti128 m7, [z_filter_s+16], 0
+ pmaxsb m2, m0 ; clip 8x32
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ pshufb m12, m2
+ pshufb m0, m11, m8
+ pmaddubsw m0, m9
+ pshufb m2, m12, m8
+ pmaddubsw m2, m9
+ pshufb m1, m13, m8
+ pmaddubsw m1, m9
+ shufps m8, m7, q1021
+ pshufb m6, m14, m8
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1]
+ pshufb m10, m11, m8
+ pmaddubsw m10, m9
+ paddw m0, m10
+ pshufb m10, m12, m8
+ pmaddubsw m10, m9
+ paddw m2, m10
+ pshufb m10, m13, m8
+ pmaddubsw m10, m9
+ shufps m8, m7, q2121
+ paddw m1, m10
+ pshufb m10, m14, m8
+ pmaddubsw m10, m9
+ paddw m6, m10
+ vpbroadcastd m9, [z_filter_k+4*2+12*2]
+ pshufb m11, m8
+ pmaddubsw m11, m9
+ pshufb m12, m8
+ pmaddubsw m12, m9
+ movzx r4d, byte [tlq-63]
+ movzx r2d, byte [tlq-62]
+ paddw m0, m11
+ paddw m2, m12
+ pshufb m13, m8
+ pmaddubsw m13, m9
+ pshufb m14, m7
+ pmaddubsw m14, m9
+ paddw m1, m13
+ paddw m6, m14
+ sub r2d, r4d
+ lea r2d, [r2+r4*8+4] ; edge case for 64x32
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ shr r2d, 3
+ mov [rsp+31], r2b
+ lea tlq, [rsp+95]
+ mov [tlq-65], r4b
+ mov r4d, 65
+ cmp wd, 64
+ cmove maxbased, r4d
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq-63], m0
+ mova [tlq-31], m1
+.h32_main:
+ movd xm6, dyd
+ mov r4, tlq
+ sub tlq, 8
+ neg dyq
+ vpbroadcastw m6, xm6
+ sub r4, maxbaseq
+ shl maxbased, 6
+ vpbroadcastb m7, [r4]
+ lea r4, [dyq+63]
+ movd xm9, maxbased
+ not maxbased
+ vbroadcasti128 m8, [z3_shuf]
+ add maxbased, 64
+ vpbroadcastw m9, xm9
+ psubw m9, [z_base_inc]
+ mova m11, m6
+ psubw m10, m9, m3 ; 64*8
+.h32_loop:
+ mov r5, r4
+ sar r5, 6
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ movu xm0, [tlq+r5- 0]
+ vinserti128 m0, [tlq+r5-16], 1
+ movu xm1, [tlq+r5- 8]
+ vinserti128 m1, [tlq+r5-24], 1
+ sub rsp, 32
+ add r4, dyq
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ pcmpgtw m1, m9, m6
+ pcmpgtw m2, m10, m6
+ packsswb m1, m2
+ paddw m6, m11
+ vpblendvb m0, m7, m0, m1
+ mova [rsp], m0
+ dec wd
+ jz .h32_transpose
+ cmp r4d, maxbased
+ jg .h32_loop
+.h32_end_loop:
+ sub rsp, 32
+ mova [rsp], m7
+ dec wd
+ jg .h32_end_loop
+.h32_transpose:
+ lea dstq, [dstq+org_wq-8]
+ lea r2, [strideq*3]
+ lea r3, [strideq*5]
+ lea r4, [strideq+r2*2] ; stride*7
+.h32_w8_loop:
+ mova m7, [rsp+32*0]
+ mova m6, [rsp+32*1]
+ mova m5, [rsp+32*2]
+ mova m4, [rsp+32*3]
+ mova m3, [rsp+32*4]
+ mova m2, [rsp+32*5]
+ mova m1, [rsp+32*6]
+ mova m0, [rsp+32*7]
+ lea r6, [dstq+strideq*8]
+ add rsp, 32*8
+ punpcklbw m8, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+ punpcklbw m5, m6, m7
+ punpckhbw m6, m7
+ punpcklwd m7, m8, m1
+ punpckhwd m8, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ punpckldq m6, m7, m2
+ punpckhdq m7, m2
+ punpckldq m2, m8, m3
+ punpckhdq m8, m3
+ punpckldq m3, m1, m5
+ punpckhdq m1, m5
+ punpckldq m5, m0, m4
+ punpckhdq m0, m4
+ movq [dstq+strideq*0], xm6
+ movhps [dstq+strideq*1], xm6
+ vextracti128 xm6, m6, 1
+ movq [dstq+strideq*2], xm7
+ movhps [dstq+r2 ], xm7
+ vextracti128 xm7, m7, 1
+ movq [dstq+strideq*4], xm2
+ movhps [dstq+r3 ], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+r2*2 ], xm8
+ movhps [dstq+r4 ], xm8
+ vextracti128 xm8, m8, 1
+ movq [r6+strideq*0], xm3
+ movhps [r6+strideq*1], xm3
+ vextracti128 xm3, m3, 1
+ movq [r6+strideq*2], xm1
+ movhps [r6+r2 ], xm1
+ vextracti128 xm1, m1, 1
+ movq [r6+strideq*4], xm5
+ movhps [r6+r3 ], xm5
+ vextracti128 xm5, m5, 1
+ movq [r6+r2*2 ], xm0
+ movhps [r6+r4 ], xm0
+ lea r6, [r6+strideq*8]
+ vextracti128 xm0, m0, 1
+ movq [r6+strideq*0], xm6
+ movhps [r6+strideq*1], xm6
+ movq [r6+strideq*2], xm7
+ movhps [r6+r2 ], xm7
+ movq [r6+strideq*4], xm2
+ movhps [r6+r3 ], xm2
+ movq [r6+r2*2 ], xm8
+ movhps [r6+r4 ], xm8
+ lea r6, [r6+strideq*8]
+ movq [r6+strideq*0], xm3
+ movhps [r6+strideq*1], xm3
+ movq [r6+strideq*2], xm1
+ movhps [r6+r2 ], xm1
+ movq [r6+strideq*4], xm5
+ movhps [r6+r3 ], xm5
+ movq [r6+r2*2 ], xm0
+ movhps [r6+r4 ], xm0
+ sub dstq, 8
+ sub org_wd, 8
+ jg .h32_w8_loop
+ RET
+ALIGN function_align
+.h64:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -128, 16
+ lea maxbased, [wq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h64_main
+ mov r4d, 21
+ vpbroadcastb xm11, [tlq-127]
+ vpblendd xm11, [tlq-130], 0x0e ; 120-127
+ sub r4d, wd ; 21-w
+ mov r5d, 3
+ vinserti128 m11, [tlq-116], 1 ; 104-111
+ movu m7, [z_filter_s+4]
+ cmp wd, 32
+ cmove r4d, r5d
+ vinserti128 m8, m7, [z_filter_s+8], 1
+ vbroadcasti128 m6, [pb_0to15]
+ movd xm1, r4d
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ movu xm12, [tlq-122] ; 112-119
+ vinserti128 m12, [tlq-108], 1 ; 96-103
+ vpbroadcastb m1, xm1
+ movu xm13, [tlq- 98] ; 88- 95
+ vinserti128 m13, [tlq- 84], 1 ; 72- 79
+ movu xm14, [tlq- 90] ; 80- 87
+ vinserti128 m14, [tlq- 76], 1 ; 64- 71
+ vinserti128 m7, [z_filter_s+16], 0
+ pshufb m0, m11, m8
+ pmaddubsw m0, m9
+ pshufb m2, m12, m8
+ pmaddubsw m2, m9
+ pmaxsb m1, m6 ; clip (16|32)x64
+ pshufb m13, m1
+ pshufb m1, m13, m8
+ pmaddubsw m1, m9
+ pshufb m6, m14, m8
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1]
+ shufps m15, m8, m7, q1021
+ pshufb m10, m11, m15
+ pmaddubsw m10, m9
+ paddw m0, m10
+ pshufb m10, m12, m15
+ pmaddubsw m10, m9
+ paddw m2, m10
+ pshufb m10, m13, m15
+ pmaddubsw m10, m9
+ paddw m1, m10
+ pshufb m10, m14, m15
+ pmaddubsw m10, m9
+ paddw m6, m10
+ vpbroadcastd m9, [z_filter_k+4*2+12*2]
+ shufps m10, m8, m7, q2132
+ pshufb m11, m10
+ pmaddubsw m11, m9
+ pshufb m12, m10
+ pmaddubsw m12, m9
+ pshufb m13, m10
+ pmaddubsw m13, m9
+ pshufb m14, m10
+ pmaddubsw m14, m9
+ paddw m0, m11
+ paddw m2, m12
+ paddw m1, m13
+ paddw m6, m14
+ movu xm11, [tlq-66] ; 56-63
+ vinserti128 m11, [tlq-52], 1 ; 40-47
+ movu xm12, [tlq-58] ; 48-55
+ vinserti128 m12, [tlq-44], 1 ; 32-39
+ movu xm13, [tlq-34] ; 24-31
+ vinserti128 m13, [tlq-20], 1 ; 8-15
+ movu xm14, [tlq-28] ; 16-23
+ vinserti128 m14, [tlq-14], 1 ; 0- 7
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ lea tlq, [rsp+127]
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq-127], m0
+ mova [tlq- 95], m1
+ pshufb m0, m11, m10
+ pmaddubsw m0, m9
+ pshufb m2, m12, m10
+ pmaddubsw m2, m9
+ pshufb m1, m13, m10
+ pmaddubsw m1, m9
+ pshufb m6, m14, m7
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1]
+ pshufb m7, m11, m15
+ pmaddubsw m7, m9
+ paddw m0, m7
+ pshufb m7, m12, m15
+ pmaddubsw m7, m9
+ paddw m2, m7
+ pshufb m7, m13, m15
+ pmaddubsw m7, m9
+ paddw m1, m7
+ pshufb m7, m14, m10
+ pmaddubsw m7, m9
+ paddw m6, m7
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ pshufb m11, m8
+ pmaddubsw m11, m9
+ pshufb m12, m8
+ pmaddubsw m12, m9
+ pshufb m13, m8
+ pmaddubsw m13, m9
+ pshufb m14, m15
+ pmaddubsw m14, m9
+ paddw m0, m11
+ paddw m2, m12
+ paddw m1, m13
+ paddw m6, m14
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq-63], m0
+ mova [tlq-31], m1
+.h64_main:
+ movd xm12, dyd
+ neg maxbaseq
+ vbroadcasti128 m8, [z3_shuf]
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6
+ vpbroadcastw m12, xm12
+ lea r5d, [dyq+maxbaseq-64]
+ neg dyq
+ or maxbased, 63
+ lea r4, [dyq+63]
+ movd xm6, r5d
+ mova xm10, [pb_1to32+16]
+ vinserti128 m10, [pb_1to32], 1
+ vpbroadcastd m11, [pb_32]
+ vpbroadcastw m6, xm6
+.h64_loop:
+ mov r5, r4
+ sar r5, 6
+ movu m0, [tlq+r5-24]
+ movu m1, [tlq+r5-32]
+ pand m2, m4, m6
+ psubw m9, m5, m2
+ psllw m2, 8
+ por m9, m2
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m9
+ pmaddubsw m1, m9
+ psraw m2, m6, 6
+ sub rsp, 64
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packsswb m2, m2
+ paddb m2, m10
+ packuswb m0, m1
+ vpblendvb m0, m7, m0, m2
+ mova [rsp+32], m0
+ movu m0, [tlq+r5-56]
+ movu m1, [tlq+r5-64]
+ add r4, dyq
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m9
+ pmaddubsw m1, m9
+ paddb m2, m11
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ paddw m6, m12
+ packuswb m0, m1
+ vpblendvb m0, m7, m0, m2
+ mova [rsp], m0
+ dec wd
+ jz .h64_transpose
+ cmp r4d, maxbased
+ jg .h64_loop
+.h64_end_loop:
+ sub rsp, 64
+ mova [rsp+32], m7
+ mova [rsp+ 0], m7
+ dec wd
+ jg .h64_end_loop
+.h64_transpose:
+ lea r2, [strideq*3]
+ lea r3, [strideq*5]
+ imul r5, strideq, -8
+ lea dstq, [dstq+org_wq-16]
+ lea r4, [strideq+r2*2] ; stride*7
+.h64_transpose_loop0:
+ lea r6, [rsp+16*3]
+.h64_transpose_loop:
+ mova xm0, [r6+64*15]
+ vinserti128 m0, [r6+64* 7], 1
+ mova xm1, [r6+64*14]
+ vinserti128 m1, [r6+64* 6], 1
+ mova xm2, [r6+64*13]
+ vinserti128 m2, [r6+64* 5], 1
+ mova xm3, [r6+64*12]
+ vinserti128 m3, [r6+64* 4], 1
+ mova xm4, [r6+64*11]
+ vinserti128 m4, [r6+64* 3], 1
+ mova xm5, [r6+64*10]
+ vinserti128 m5, [r6+64* 2], 1
+ mova xm6, [r6+64* 9]
+ vinserti128 m6, [r6+64* 1], 1
+ mova xm7, [r6+64* 8]
+ vinserti128 m7, [r6+64* 0], 1
+ sub r6, 16
+ punpcklbw m8, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+ punpcklbw m5, m6, m7
+ punpckhbw m6, m7
+ punpcklwd m7, m8, m1
+ punpckhwd m8, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ punpckldq m6, m7, m2
+ punpckhdq m7, m2
+ punpckldq m2, m8, m3
+ punpckhdq m8, m3
+ punpckldq m3, m1, m5
+ punpckhdq m1, m5
+ punpckldq m5, m0, m4
+ punpckhdq m0, m4
+ vpermq m6, m6, q3120
+ vpermq m7, m7, q3120
+ vpermq m2, m2, q3120
+ vpermq m8, m8, q3120
+ vpermq m3, m3, q3120
+ vpermq m1, m1, q3120
+ vpermq m5, m5, q3120
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm6
+ vextracti128 [dstq+strideq*1], m6, 1
+ mova [dstq+strideq*2], xm7
+ vextracti128 [dstq+r2 ], m7, 1
+ mova [dstq+strideq*4], xm2
+ vextracti128 [dstq+r3 ], m2, 1
+ mova [dstq+r2*2 ], xm8
+ vextracti128 [dstq+r4 ], m8, 1
+ sub dstq, r5
+ mova [dstq+strideq*0], xm3
+ vextracti128 [dstq+strideq*1], m3, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+r2 ], m1, 1
+ mova [dstq+strideq*4], xm5
+ vextracti128 [dstq+r3 ], m5, 1
+ mova [dstq+r2*2 ], xm0
+ vextracti128 [dstq+r4 ], m0, 1
+ sub dstq, r5
+ cmp r6, rsp
+ jae .h64_transpose_loop
+ add rsp, 64*16
+ lea dstq, [dstq+r5*8-16]
+ sub org_wd, 16
+ jg .h64_transpose_loop0
+.h64_end:
+ RET
+
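+; Filter intra prediction: each 4x2 output block is a weighted sum of seven
+; neighbouring pixels (p0-p6: top-left, four above, two to the left) using
+; the tap set selected by the filter mode (filter_intra_taps). FILTER_XMM
+; computes one 4x2 block, FILTER_YMM two blocks per call.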
+%macro FILTER_XMM 4 ; dst, src, tmp, shuf
+%ifnum %4
+ pshufb xm%2, xm%4
+%else
+ pshufb xm%2, %4
+%endif
+ pshufd xm%1, xm%2, q0000 ; p0 p1
+ pmaddubsw xm%1, xm2
+ pshufd xm%3, xm%2, q1111 ; p2 p3
+ pmaddubsw xm%3, xm3
+ paddw xm%1, xm1
+ paddw xm%1, xm%3
+ pshufd xm%3, xm%2, q2222 ; p4 p5
+ pmaddubsw xm%3, xm4
+ paddw xm%1, xm%3
+ pshufd xm%3, xm%2, q3333 ; p6 __
+ pmaddubsw xm%3, xm5
+ paddw xm%1, xm%3
+ psraw xm%1, 4
+ packuswb xm%1, xm%1
+%endmacro
+
+%macro FILTER_YMM 4 ; dst, src, tmp, shuf
+ pshufb m%2, m%4
+ pshufd m%1, m%2, q0000
+ pmaddubsw m%1, m2
+ pshufd m%3, m%2, q1111
+ pmaddubsw m%3, m3
+ paddw m%1, m1
+ paddw m%1, m%3
+ pshufd m%3, m%2, q2222
+ pmaddubsw m%3, m4
+ paddw m%1, m%3
+ pshufd m%3, m%2, q3333
+ pmaddubsw m%3, m5
+ paddw m%1, m%3
+ psraw m%1, 4
+ vperm2i128 m%3, m%1, m%1, 0x01
+ packuswb m%1, m%3
+%endmacro
+
+; The ipred_filter SIMD processes 4x2 blocks in the following order which
+; increases parallelism compared to doing things row by row. One redundant
+; block is calculated for w8 and w16, two for w32.
+; w4 w8 w16 w32
+; 1 1 2 1 2 3 5 1 2 3 5 b c d f
+; 2 2 3 2 4 5 7 2 4 5 7 c e f h
+; 3 3 4 4 6 7 9 4 6 7 9 e g h j
+; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
+; 5 8 8 i
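+; Each block needs the pixels directly above it and to its left, so blocks
+; on the same anti-diagonal are independent and can occupy separate SIMD
+; lanes, which is what the ordering above exploits.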
+
+cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter
+%define base r6-ipred_filter_avx2_table
+ lea r6, [filter_intra_taps]
+ tzcnt wd, wm
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ shl filterd, 6
+ add filterq, r6
+ lea r6, [ipred_filter_avx2_table]
+ movq xm0, [tlq-3] ; _ 6 5 0 1 2 3 4
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m1, [base+pw_8]
+ vbroadcasti128 m2, [filterq+16*0]
+ vbroadcasti128 m3, [filterq+16*1]
+ vbroadcasti128 m4, [filterq+16*2]
+ vbroadcasti128 m5, [filterq+16*3]
+ add wq, r6
+ mov hd, hm
+ jmp wq
+.w4:
+ WIN64_SPILL_XMM 9
+ mova xm8, [base+filter_shuf2]
+ sub tlq, 3
+ sub tlq, hq
+ jmp .w4_loop_start
+.w4_loop:
+ pinsrd xm0, xm6, [tlq+hq], 0
+ lea dstq, [dstq+strideq*2]
+.w4_loop_start:
+ FILTER_XMM 6, 0, 7, 8
+ movd [dstq+strideq*0], xm6
+ pextrd [dstq+strideq*1], xm6, 1
+ sub hd, 2
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 10
+ mova m8, [base+filter_shuf1]
+ FILTER_XMM 7, 0, 6, [base+filter_shuf2]
+ vpbroadcastd m0, [tlq+4]
+ vpbroadcastd m6, [tlq+5]
+ sub tlq, 4
+ sub tlq, hq
+ vpbroadcastq m7, xm7
+ vpblendd m7, m6, 0x20
+.w8_loop:
+ vpbroadcastd xm6, [tlq+hq]
+ palignr m6, m0, 12
+ vpblendd m0, m6, m7, 0xeb ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
+ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ mova xm6, xm7
+ call .main
+ vpblendd xm6, xm7, 0x0c
+ pshufd xm6, xm6, q3120
+ movq [dstq+strideq*0], xm6
+ movhps [dstq+strideq*1], xm6
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+%if WIN64
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign xmm_regs_used 15
+ %assign stack_size_padded 0x98
+ SUB rsp, stack_size_padded
+%endif
+ sub hd, 2
+ TAIL_CALL .w16_main, 0
+.w16_main:
+%if WIN64
+ movaps [rsp+0xa8], xmm6
+ movaps [rsp+0xb8], xmm7
+ movaps [rsp+0x28], xmm8
+ movaps [rsp+0x38], xmm9
+ movaps [rsp+0x48], xmm10
+ movaps [rsp+0x58], xmm11
+ movaps [rsp+0x68], xmm12
+ movaps [rsp+0x78], xmm13
+ movaps [rsp+0x88], xmm14
+%endif
+ FILTER_XMM 12, 0, 7, [base+filter_shuf2]
+ vpbroadcastd m0, [tlq+5]
+ vpblendd m0, [tlq-12], 0x14
+ mova m8, [base+filter_shuf1]
+ vpbroadcastq m7, xm12
+ vpblendd m0, m7, 0xc2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
+ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ call .main ; c0 d0 a1 b1 a1 b1 c0 d0
+ movlps xm9, xm7, [tlq+5] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ vinserti128 m14, m8, [base+filter_shuf3], 0
+ vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1
+ FILTER_XMM 6, 9, 10, 14
+ vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2
+ vpbroadcastd m9, [tlq+13]
+ vpbroadcastd m10, [tlq+12]
+ psrld m11, m8, 4
+ vpblendd m6, m9, 0x20 ; top
+ sub tlq, 6
+ sub tlq, hq
+.w16_loop:
+ vpbroadcastd xm9, [tlq+hq]
+ palignr m9, m0, 12
+ vpblendd m0, m9, m7, 0xe2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
+ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ mova xm13, xm7
+ call .main ; e0 f0 c1 d1 c1 d1 e0 f0
+ vpblendd m9, m12, m10, 0xf0
+ vpblendd m12, m6, 0xc0
+ pshufd m9, m9, q3333
+ vpblendd m9, m6, 0xee
+ vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2
+ vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2
+ vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3
+ vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1
+ mova [dstq+strideq*0], xm9
+ vextracti128 [dstq+strideq*1], m9, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
+ pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ FILTER_XMM 0, 7, 9, [base+filter_shuf1+16]
+ vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3
+ shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
+ shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm6
+ ret
+ALIGN function_align
+.w32:
+ sub rsp, stack_size_padded
+ sub hd, 2
+ lea r3, [dstq+16]
+ lea r5d, [hq-2]
+ call .w16_main
+ add tlq, r5
+ mov dstq, r3
+ lea r3, [strideq-4]
+ lea r4, [r3+strideq*2]
+ movq xm0, [tlq+21]
+ pinsrd xm0, [dstq-4], 2
+ pinsrd xm0, [dstq+r3*1], 3
+ FILTER_XMM 12, 0, 7, 14 ; a0 b0 a0 b0
+ movq xm7, [dstq+r3*2]
+ pinsrd xm7, [dstq+r4], 2
+ palignr xm7, xm0, 12 ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6
+ vpbroadcastd m0, [tlq+28]
+ vpbroadcastd m9, [tlq+29]
+ vbroadcasti128 m8, [base+filter_shuf1+16]
+ vpblendd m0, m9, 0x20
+ vpblendd m0, m7, 0x0f
+ vpbroadcastq m7, xm12
+ vpblendd m0, m7, 0xc2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ call .main ; c0 d0 a1 b1 a1 b1 c0 d0
+ add r3, 2
+ lea r4, [r4+strideq*2]
+ movlps xm9, xm7, [tlq+29] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1
+ FILTER_XMM 6, 9, 10, 14
+ vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2
+ vpbroadcastd m9, [tlq+37]
+ vpbroadcastd m10, [tlq+36]
+ vpblendd m6, m9, 0x20 ; top
+.w32_loop:
+ movq xm9, [dstq+r3*4]
+ pinsrd xm9, [dstq+r4], 2
+.w32_loop_last:
+ palignr m9, m0, 12
+ vpblendd m0, m9, m7, 0xe2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ mova xm13, xm7 ; c0 d0
+ call .main ; e0 f0 c1 d1 c1 d1 e0 f0
+ vpblendd m9, m12, m10, 0xf0
+ vpblendd m12, m6, 0xc0
+ pshufd m9, m9, q3333
+ vpblendd m9, m6, 0xee
+ vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2
+ vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2
+ vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3
+ vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1
+ mova [dstq+strideq*0], xm9
+ vextracti128 [dstq+strideq*1], m9, 1
+ lea dstq, [dstq+strideq*2]
+ sub r5d, 2
+ jg .w32_loop
+ jz .w32_loop_last
+ vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
+ pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ FILTER_XMM 0, 7, 9, [base+filter_shuf1+16]
+ vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3
+ shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
+ shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm6
+ RET
+ALIGN function_align
+.main:
+ FILTER_YMM 7, 0, 9, 8
+ ret
+
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
+
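+; IPRED_CFL adds the scaled AC contribution to the DC prediction:
+; out = dc + sign(alpha*ac) * ((|alpha|*|ac| + 32) >> 6), with dc in m0,
+; alpha in m1 and |alpha|<<9 in m2 so that a single pmulhrsw performs the
+; multiply and the rounding shift by 6.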
+%macro IPRED_CFL 1 ; ac in, unpacked pixels out
+ psignw m3, m%1, m1
+ pabsw m%1, m%1
+ pmulhrsw m%1, m2
+ psignw m%1, m3
+ paddw m%1, m0
+%endmacro
+
+cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ lea t0, [ipred_cfl_left_avx2_table]
+ tzcnt wd, wm
+ inc tlq
+ movu m0, [tlq]
+ movifnidn hd, hm
+ mov r6d, 0x8000
+ shrx r6d, r6d, wd
+ movd xm3, r6d
+ movsxd r6, [t0+wq*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, t0
+ add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+
+cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ mov hd, hm ; zero upper half
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movu m0, [tlq]
+ mov t0d, 0x8000
+ shrx t0d, t0d, r6d
+ movd xm3, t0d
+ lea t0, [ipred_cfl_left_avx2_table]
+ movsxd r6, [t0+r6*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, t0
+ add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h32:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+.h16:
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+.h8:
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+.h4:
+ pmaddwd xm0, xm2
+ pmulhrsw xm0, xm3
+ vpbroadcastw m0, xm0
+ jmp wq
+
+cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea t0d, [wq+hq]
+ movd xm4, t0d
+ tzcnt t0d, t0d
+ movd xm5, t0d
+ lea t0, [ipred_cfl_avx2_table]
+ tzcnt wd, wd
+ movsxd r6, [t0+r6*4]
+ movsxd wq, [t0+wq*4+4*4]
+ pcmpeqd m3, m3
+ psrlw xm4, 1
+ add r6, t0
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h4:
+ movd xm0, [tlq-4]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w4:
+ movd xm1, [tlq+1]
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xm0, 3
+ jmp .w4_end
+.w4_mul:
+ punpckhqdq xm1, xm0, xm0
+ lea r2d, [hq*2]
+ mov r6d, 0x55563334
+ paddw xm0, xm1
+ shrx r6d, r6d, r2d
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ movd xm1, r6d
+ psrlw xm0, 2
+ pmulhuw xm0, xm1
+.w4_end:
+ vpbroadcastw m0, xm0
+.s4:
+ vpbroadcastw m1, alpham
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s4_loop:
+ mova m4, [acq]
+ IPRED_CFL 4
+ packuswb m4, m4
+ vextracti128 xm5, m4, 1
+ movd [dstq+strideq*0], xm4
+ pextrd [dstq+strideq*1], xm4, 1
+ movd [dstq+strideq*2], xm5
+ pextrd [dstq+r6 ], xm5, 1
+ lea dstq, [dstq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .s4_loop
+ RET
+ALIGN function_align
+.h8:
+ movq xm0, [tlq-8]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w8:
+ movq xm1, [tlq+1]
+ vextracti128 xm2, m0, 1
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm2
+ punpckhqdq xm2, xm0, xm0
+ paddw xm0, xm2
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmove r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w8_end:
+ vpbroadcastw m0, xm0
+.s8:
+ vpbroadcastw m1, alpham
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s8_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ vextracti128 xm5, m4, 1
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*1], xm5
+ movhps [dstq+strideq*2], xm4
+ movhps [dstq+r6 ], xm5
+ lea dstq, [dstq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .s8_loop
+ RET
+ALIGN function_align
+.h16:
+ mova xm0, [tlq-16]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w16:
+ movu xm1, [tlq+1]
+ vextracti128 xm2, m0, 1
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm2
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hb, 8|32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w16_end:
+ vpbroadcastw m0, xm0
+.s16:
+ vpbroadcastw m1, alpham
+ pabsw m2, m1
+ psllw m2, 9
+.s16_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ vpermq m4, m4, q3120
+ mova [dstq+strideq*0], xm4
+ vextracti128 [dstq+strideq*1], m4, 1
+ lea dstq, [dstq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .s16_loop
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-32]
+ pmaddubsw m0, m3
+ jmp wq
+.w32:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ psubw xm0, xm4
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x33345556
+ shrx r6d, r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w32_end:
+ vpbroadcastw m0, xm0
+.s32:
+ vpbroadcastw m1, alpham
+ pabsw m2, m1
+ psllw m2, 9
+.s32_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ vpermq m4, m4, q3120
+ mova [dstq], m4
+ add dstq, strideq
+ add acq, 64
+ dec hd
+ jg .s32_loop
+ RET
+
+cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ lea t0, [ipred_cfl_splat_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [t0+wq*4]
+ vpbroadcastd m0, [t0-ipred_cfl_splat_avx2_table+pw_128]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp wq
+
+cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
+ movifnidn hpadd, hpadm
+ movifnidn wd, wm
+ mov hd, hm
+ mov szd, wd
+ mov ac_bakq, acq
+ imul szd, hd
+ shl hpadd, 2
+ sub hd, hpadd
+ vpbroadcastd m2, [pb_2]
+ pxor m4, m4
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop:
+ movq xm0, [yq]
+ movq xm1, [yq+strideq]
+ movhps xm0, [yq+strideq*2]
+ movhps xm1, [yq+stride3q]
+ pmaddubsw xm0, xm2
+ pmaddubsw xm1, xm2
+ paddw xm0, xm1
+ mova [acq], xm0
+ paddw xm4, xm0
+ lea yq, [yq+strideq*4]
+ add acq, 16
+ sub hd, 2
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ vpermq m0, m0, q1111
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp .calc_avg
+
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ mova xm0, [yq]
+ mova xm1, [yq+strideq]
+ vinserti128 m0, [yq+strideq*2], 1
+ vinserti128 m1, [yq+stride3q], 1
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 2
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w8_hpad
+.w8_wpad:
+ vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle]
+.w8_wpad_loop:
+ movq xm0, [yq]
+ movq xm1, [yq+strideq]
+ vinserti128 m0, [yq+strideq*2], 1
+ vinserti128 m1, [yq+stride3q], 1
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ pshufb m0, m3
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 2
+ jg .w8_wpad_loop
+ test hpadd, hpadd
+ jz .calc_avg
+.w8_hpad:
+ vpermq m0, m0, q3232
+.w8_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 2
+ jg .w8_hpad_loop
+ jmp .calc_avg
+
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w16_hpad_loop
+.w16_wpad:
+ DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
+ lea iptrq, [ipred_cfl_ac_420_avx2_table]
+ shl wpadd, 2
+ mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \
+ ipred_cfl_ac_420_avx2_table+wpadq*8-32]
+ movsxd wpadq, [iptrq+wpadq+4]
+ add iptrq, wpadq
+ jmp iptrq
+.w16_pad3:
+ vpbroadcastq m0, [yq]
+ vpbroadcastq m1, [yq+strideq]
+ jmp .w16_wpad_end
+.w16_pad2:
+ vbroadcasti128 m0, [yq]
+ vbroadcasti128 m1, [yq+strideq]
+ jmp .w16_wpad_end
+.w16_pad1:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ ; fall-through
+.w16_wpad_end:
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ pshufb m0, m3
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jz .w16_wpad_done
+ jmp iptrq
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg
+.w16_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 32
+ dec hpadd
+ jg .w16_hpad_loop
+ ; fall-through
+
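+; compute the rounded average of the whole ac block (including padding) and
+; subtract it from every entry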
+.calc_avg:
+ vpbroadcastd m2, [pw_1]
+ pmaddwd m0, m4, m2
+ vextracti128 xm1, m0, 1
+ tzcnt r1d, szd
+ paddd xm0, xm1
+ movd xm2, r1d
+ movd xm3, szd
+ punpckhqdq xm1, xm0, xm0
+ paddd xm0, xm1
+ psrad xm3, 1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm3
+ paddd xm0, xm1
+ psrad xm0, xm2
+ vpbroadcastw m0, xm0
+.sub_loop:
+ mova m1, [ac_bakq]
+ psubw m1, m0
+ mova [ac_bakq], m1
+ add ac_bakq, 32
+ sub szd, 16
+ jg .sub_loop
+ RET
+
+cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
+ movifnidn hpadd, hpadm
+ movifnidn wd, wm
+ mov hd, hm
+ mov szd, wd
+ mov ac_bakq, acq
+ imul szd, hd
+ shl hpadd, 2
+ sub hd, hpadd
+ vpbroadcastd m2, [pb_4]
+ pxor m4, m4
+ pxor m5, m5
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop:
+ movq xm1, [yq]
+ movhps xm1, [yq+strideq]
+ movq xm0, [yq+strideq*2]
+ movhps xm0, [yq+stride3q]
+ pmaddubsw xm0, xm2
+ pmaddubsw xm1, xm2
+ mova [acq], xm1
+ mova [acq+16], xm0
+ paddw xm4, xm0
+ paddw xm5, xm1
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ vpermq m0, m0, q1111
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp .calc_avg
+
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ mova xm1, [yq]
+ vinserti128 m1, [yq+strideq], 1
+ mova xm0, [yq+strideq*2]
+ vinserti128 m0, [yq+stride3q], 1
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w8_hpad
+.w8_wpad:
+ vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle]
+.w8_wpad_loop:
+ movq xm1, [yq]
+ vinserti128 m1, [yq+strideq], 1
+ movq xm0, [yq+strideq*2]
+ vinserti128 m0, [yq+stride3q], 1
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pshufb m0, m3
+ pshufb m1, m3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_wpad_loop
+ test hpadd, hpadd
+ jz .calc_avg
+.w8_hpad:
+ vpermq m0, m0, q3232
+.w8_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 2
+ jg .w8_hpad_loop
+ jmp .calc_avg
+
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m1, [yq]
+ mova m0, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w16_hpad_loop
+.w16_wpad:
+ DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
+ lea iptrq, [ipred_cfl_ac_422_avx2_table]
+ shl wpadd, 2
+ mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \
+ ipred_cfl_ac_422_avx2_table+wpadq*8-32]
+ movsxd wpadq, [iptrq+wpadq+4]
+ add iptrq, wpadq
+ jmp iptrq
+.w16_pad3:
+ vpbroadcastq m1, [yq]
+ vpbroadcastq m0, [yq+strideq]
+ jmp .w16_wpad_end
+.w16_pad2:
+ vbroadcasti128 m1, [yq]
+ vbroadcasti128 m0, [yq+strideq]
+ jmp .w16_wpad_end
+.w16_pad1:
+ mova m1, [yq]
+ mova m0, [yq+strideq]
+ ; fall-through
+.w16_wpad_end:
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pshufb m0, m3
+ pshufb m1, m3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jz .w16_wpad_done
+ jmp iptrq
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg
+.w16_hpad_loop:
+ mova [acq], m0
+ mova [acq+32], m0
+ paddw m4, m0
+ paddw m5, m0
+ add acq, 64
+ sub hpadd, 2
+ jg .w16_hpad_loop
+ ; fall-through
+
+.calc_avg:
+ vpbroadcastd m2, [pw_1]
+ pmaddwd m5, m5, m2
+ pmaddwd m0, m4, m2
+ paddd m0, m5
+ vextracti128 xm1, m0, 1
+ tzcnt r1d, szd
+ paddd xm0, xm1
+ movd xm2, r1d
+ movd xm3, szd
+ punpckhqdq xm1, xm0, xm0
+ paddd xm0, xm1
+ psrad xm3, 1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm3
+ paddd xm0, xm1
+ psrad xm0, xm2
+ vpbroadcastw m0, xm0
+.sub_loop:
+ mova m1, [ac_bakq]
+ psubw m1, m0
+ mova [ac_bakq], m1
+ add ac_bakq, 32
+ sub szd, 16
+ jg .sub_loop
+ RET
+
+cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
+ movifnidn hpadd, hpadm
+ movifnidn wd, wm
+ mov hd, hm
+ mov szd, wd
+ imul szd, hd
+ shl hpadd, 2
+ sub hd, hpadd
+ pxor m4, m4
+ vpbroadcastd m5, [pw_1]
+ tzcnt r8d, wd
+ lea r5, [ipred_cfl_ac_444_avx2_table]
+ movsxd r8, [r5+r8*4+12]
+ add r5, r8
+
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
+ mov ac_bakq, acq
+ jmp r5
+
+.w4:
+ lea stride3q, [strideq*3]
+ pxor xm2, xm2
+.w4_loop:
+ movd xm1, [yq]
+ movd xm0, [yq+strideq*2]
+ pinsrd xm1, [yq+strideq], 1
+ pinsrd xm0, [yq+stride3q], 1
+ punpcklbw xm1, xm2
+ punpcklbw xm0, xm2
+ psllw xm1, 3
+ psllw xm0, 3
+ mova [acq], xm1
+ mova [acq+16], xm0
+ paddw xm1, xm0
+ paddw xm4, xm1
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg_mul
+ pshufd xm0, xm0, q3232
+ paddw xm1, xm0, xm0
+.w4_hpad_loop:
+ mova [acq], xm0
+ mova [acq+16], xm0
+ paddw xm4, xm1
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp .calc_avg_mul
+
+.w8:
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+.w8_loop:
+ movq xm1, [yq]
+ movq xm0, [yq+strideq*2]
+ vinserti128 m1, [yq+strideq], 1
+ vinserti128 m0, [yq+stride3q], 1
+ punpcklbw m1, m2
+ punpcklbw m0, m2
+ psllw m1, 3
+ psllw m0, 3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m1, m0
+ paddw m4, m1
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg_mul
+ vpermq m0, m0, q3232
+ paddw m1, m0, m0
+.w8_hpad_loop:
+ mova [acq], m0
+ mova [acq+32], m0
+ paddw m4, m1
+ add acq, 64
+ sub hpadd, 4
+ jg .w8_hpad_loop
+ jmp .calc_avg_mul
+
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ pmovzxbw m1, [yq]
+ pmovzxbw m0, [yq+strideq]
+ psllw m1, 3
+ psllw m0, 3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m1, m0
+ pmaddwd m1, m5
+ paddd m4, m1
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w16_hpad
+.w16_wpad:
+ mova m3, [cfl_ac_444_w16_pad1_shuffle]
+.w16_wpad_loop:
+ vpbroadcastq m1, [yq]
+ vpbroadcastq m0, [yq+strideq]
+ pshufb m1, m3
+ pshufb m0, m3
+ psllw m1, 3
+ psllw m0, 3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m1, m0
+ pmaddwd m1, m5
+ paddd m4, m1
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_wpad_loop
+ test hpadd, hpadd
+ jz .calc_avg
+.w16_hpad:
+ paddw m1, m0, m0
+ pmaddwd m1, m5
+.w16_hpad_loop:
+ mova [acq], m0
+ mova [acq+32], m0
+ paddd m4, m1
+ add acq, 64
+ sub hpadd, 2
+ jg .w16_hpad_loop
+ jmp .calc_avg
+
+.w32:
+ test wpadd, wpadd
+ jnz .w32_wpad
+.w32_loop:
+ pmovzxbw m1, [yq]
+ pmovzxbw m0, [yq+16]
+ psllw m1, 3
+ psllw m0, 3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m2, m1, m0
+ pmaddwd m2, m5
+ paddd m4, m2
+ add yq, strideq
+ add acq, 64
+ dec hd
+ jg .w32_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w32_hpad_loop
+.w32_wpad:
+ DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
+ lea iptrq, [ipred_cfl_ac_444_avx2_table]
+ add wpadd, wpadd
+ mova m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table]
+ movsxd wpadq, [iptrq+wpadq+4]
+ add iptrq, wpadq
+ jmp iptrq
+.w32_pad3:
+ vpbroadcastq m1, [yq]
+ pshufb m1, m3
+ vpermq m0, m1, q3232
+ jmp .w32_wpad_end
+.w32_pad2:
+ pmovzxbw m1, [yq]
+ pshufhw m0, m1, q3333
+ vpermq m0, m0, q3333
+ jmp .w32_wpad_end
+.w32_pad1:
+ pmovzxbw m1, [yq]
+ vpbroadcastq m0, [yq+16]
+ pshufb m0, m3
+ ; fall-through
+.w32_wpad_end:
+ psllw m1, 3
+ psllw m0, 3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m2, m1, m0
+ pmaddwd m2, m5
+ paddd m4, m2
+ add yq, strideq
+ add acq, 64
+ dec hd
+ jz .w32_wpad_done
+ jmp iptrq
+.w32_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg
+.w32_hpad_loop:
+ mova [acq], m1
+ mova [acq+32], m0
+ paddd m4, m2
+ add acq, 64
+ dec hpadd
+ jg .w32_hpad_loop
+ jmp .calc_avg
+
+.calc_avg_mul:
+ pmaddwd m4, m5
+.calc_avg:
+ vextracti128 xm1, m4, 1
+ tzcnt r1d, szd
+ paddd xm0, xm4, xm1
+ movd xm2, r1d
+ movd xm3, szd
+ punpckhqdq xm1, xm0, xm0
+ paddd xm0, xm1
+ psrad xm3, 1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm3
+ paddd xm0, xm1
+ psrad xm0, xm2
+ vpbroadcastw m0, xm0
+.sub_loop:
+ mova m1, [ac_bakq]
+ psubw m1, m0
+ mova [ac_bakq], m1
+ add ac_bakq, 32
+ sub szd, 16
+ jg .sub_loop
+ RET
+
+cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
+ vpbroadcastq m4, [palq]
+ lea r2, [pal_pred_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r2+wq*4]
+ add wq, r2
+ lea r2, [strideq*3]
+ jmp wq
+.w4:
+ movq xm0, [idxq]
+ add idxq, 8
+ psrlw xm1, xm0, 4
+ punpcklbw xm0, xm1
+ pshufb xm0, xm4, xm0
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ movu xm2, [idxq]
+ add idxq, 16
+ pshufb xm1, xm4, xm2
+ psrlw xm2, 4
+ pshufb xm2, xm4, xm2
+ punpcklbw xm0, xm1, xm2
+ punpckhbw xm1, xm2
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r2 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ movu m2, [idxq]
+ add idxq, 32
+ pshufb m1, m4, m2
+ psrlw m2, 4
+ pshufb m2, m4, m2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ vextracti128 [dstq+strideq*2], m0, 1
+ vextracti128 [dstq+r2 ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ vpermq m2, [idxq], q3120
+ add idxq, 32
+ pshufb m1, m4, m2
+ psrlw m2, 4
+ pshufb m2, m4, m2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32
+ RET
+.w64:
+ vpermq m2, [idxq], q3120
+ add idxq, 32
+ pshufb m1, m4, m2
+ psrlw m2, 4
+ pshufb m2, m4, m2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w64
+ RET
+
+%endif
diff --git a/third_party/dav1d/src/x86/ipred_avx512.asm b/third_party/dav1d/src/x86/ipred_avx512.asm
new file mode 100644
index 0000000000..de953deba3
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred_avx512.asm
@@ -0,0 +1,3143 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
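+; each smooth weight w is stored as the signed byte pair (w-128, 127-w), which
+; pmaddubsw applies to interleaved (top,bottom) or (left,right) pixel pairs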
+%macro SMOOTH_WEIGHT_TABLE 1-*
+ %rep %0
+ db %1-128, 127-%1
+ %rotate 1
+ %endrep
+%endmacro
+
+smooth_weights: SMOOTH_WEIGHT_TABLE \
+ 0, 0, 255, 128, 255, 149, 85, 64, \
+ 255, 197, 146, 105, 73, 50, 37, 32, \
+ 255, 225, 196, 170, 145, 123, 102, 84, \
+ 68, 54, 43, 33, 26, 20, 17, 16, \
+ 255, 240, 225, 210, 196, 182, 169, 157, \
+ 145, 133, 122, 111, 101, 92, 83, 74, \
+ 66, 59, 52, 45, 39, 34, 29, 25, \
+ 21, 17, 14, 12, 10, 9, 8, 8, \
+ 255, 248, 240, 233, 225, 218, 210, 203, \
+ 196, 189, 182, 176, 169, 163, 156, 150, \
+ 144, 138, 133, 127, 121, 116, 111, 106, \
+ 101, 96, 91, 86, 82, 77, 73, 69, \
+ 65, 61, 57, 54, 50, 47, 44, 41, \
+ 38, 35, 32, 29, 27, 25, 22, 20, \
+ 18, 16, 15, 13, 12, 10, 9, 8, \
+ 7, 6, 6, 5, 5, 4, 4, 4
+
+; dav1d_filter_intra_taps[], reordered for VNNI: p1 p2 p3 p4, p6 p5 p0 __
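+; (vpdpbusd sums four u8*s8 products per dword, hence the groups of four taps)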
+filter_taps: db 10, 0, 0, 0, 2, 10, 0, 0, 1, 1, 10, 0, 1, 1, 2, 10
+ db 6, 0, 0, 0, 2, 6, 0, 0, 2, 2, 6, 0, 1, 2, 2, 6
+ db 0, 12, -6, 0, 0, 9, -5, 0, 0, 7, -3, 0, 0, 5, -3, 0
+ db 12, 2, -4, 0, 9, 2, -3, 0, 7, 2, -3, 0, 5, 3, -3, 0
+ db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16
+ db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16
+ db 0, 10,-10, 0, 0, 6, -6, 0, 0, 4, -4, 0, 0, 2, -2, 0
+ db 10, 0,-10, 0, 6, 0, -6, 0, 4, 0, -4, 0, 2, 0, -2, 0
+ db 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8
+ db 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4
+ db 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0
+ db 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0
+ db 8, 0, 0, 0, 3, 8, 0, 0, 2, 3, 8, 0, 1, 2, 3, 8
+ db 4, 0, 0, 0, 3, 4, 0, 0, 2, 3, 4, 0, 2, 2, 3, 4
+ db 0, 10, -2, 0, 0, 6, -1, 0, 0, 4, -1, 0, 0, 2, 0, 0
+ db 10, 3, -1, 0, 6, 4, -1, 0, 4, 4, -1, 0, 3, 3, -1, 0
+ db 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14
+ db 12, 0, 0, 0, 1, 12, 0, 0, 0, 0, 12, 0, 0, 0, 1, 12
+ db 0, 14,-12, 0, 0, 12,-10, 0, 0, 11, -9, 0, 0, 10, -8, 0
+ db 14, 0,-10, 0, 12, 0, -9, 0, 11, 1, -8, 0, 9, 1, -7, 0
+filter_perm: db 0, 1, 2, 3, 24, 25, 26, 27, 4, 5, 6, 7, 28, 29, 30, 31
+ db 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7,131
+ db 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23,147
+ db 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39,163
+filter_end: dd 2, 3, 16, 17, -1, -1, 20, 21, 0, 6, 24, 30, 1, 7, 25, 31
+smooth_shuf: db 7, 7, 7, 7, 0, 1, 0, 1, 3, 3, 3, 3, 8, 9, 8, 9
+ db 5, 5, 5, 5, 4, 5, 4, 5, 1, 1, 1, 1, 12, 13, 12, 13
+ db 6, 6, 6, 6, 2, 3, 2, 3, 2, 2, 2, 2, 10, 11, 10, 11
+ db 4, 4, 4, 4, 6, 7, 6, 7, 0, 0, 0, 0, 14, 15, 14, 15
+smooth_endA: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
+ db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
+smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79
+ db 17, 19, 21, 23, 25, 27, 29, 31, 81, 83, 85, 87, 89, 91, 93, 95
+ db 33, 35, 37, 39, 41, 43, 45, 47, 97, 99,101,103,105,107,109,111
+ db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127
+ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4
+ db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
+pal_unpack: db 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+pal_perm: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pb_63to0: db 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48
+ db 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32
+ db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
+ db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+z_frac_table: db 64, 0, 62, 2, 60, 4, 58, 6, 56, 8, 54, 10, 52, 12, 50, 14
+ db 48, 16, 46, 18, 44, 20, 42, 22, 40, 24, 38, 26, 36, 28, 34, 30
+ db 32, 32, 30, 34, 28, 36, 26, 38, 24, 40, 22, 42, 20, 44, 18, 46
+ db 16, 48, 14, 50, 12, 52, 10, 54, 8, 56, 6, 58, 4, 60, 2, 62
+z_filter_s1: db -1, -1, -1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6
+ db 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22
+ db 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38
+ db 46, 47, 47, 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54
+z_filter_s5: db 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15, 17, 16
+ db 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31, 33, 32
+ db 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47, 49, 48
+ db 58, 57, 59, 58, 60, 59, 61, 60, 62, 61, 63, 62, 64, 63, 65, 64
+z_filter_s3: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+z_filter_s2: db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+z_filter_s4: db 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8
+z_xpos_bc: db 17, 17, 17, 17, 33, 33, 33, 33, 9, 9, 9, 9, 9, 9, 9, 9
+z_filter4_s1: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
+ db 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
+z_xpos_off1a: db 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72
+z_xpos_off1b: db 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 80
+z_xpos_off2a: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+ db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24
+ db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40
+ db 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56
+z_xpos_off2b: db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
+ db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32
+ db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48
+ db 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64
+z_xpos_mul: dw 4, 4, 4, 4, 8, 8, 4, 4, 12, 12, 8, 8, 16, 16, 8, 8
+ dw 20, 20, 12, 12, 24, 24, 12, 12, 28, 28, 16, 16, 32, 32, 16, 16
+z_ypos_off1: db 64, 65, 64, 65, 64, 65, 64, 65, 65, 66, 65, 66, 66, 67, 66, 67
+ db 66, 67, 66, 67, 68, 69, 68, 69, 67, 68, 67, 68, 70, 71, 70, 71
+ db 68, 69, 68, 69, 72, 73, 72, 73, 69, 70, 69, 70, 74, 75, 74, 75
+ db 70, 71, 70, 71, 76, 77, 76, 77, 71, 72, 71, 72, 78, 79, 78, 79
+z_ypos_off2: db 64, 65, 64, 65, 0, 0, 0, 0, 64, 65, 64, 65, 0, 0, 0, 0
+ db 65, 66, 65, 66, 1, 1, 1, 1, 65, 66, 65, 66, 1, 1, 1, 1
+ db 66, 67, 66, 67, 2, 2, 2, 2, 66, 67, 66, 67, 2, 2, 2, 2
+ db 67, 68, 67, 68, 3, 3, 3, 3, 67, 68, 67, 68, 3, 3, 3, 3
+z_ypos_off3: db 1, 2, 1, 2, 1, 1, 1, 1, 3, 4, 3, 4, 1, 1, 1, 1
+ db 5, 6, 5, 6, 3, 3, 3, 3, 7, 8, 7, 8, 3, 3, 3, 3
+ db 9, 10, 9, 10, 5, 5, 5, 5, 11, 12, 11, 12, 5, 5, 5, 5
+ db 13, 14, 13, 14, 7, 7, 7, 7, 15, 16, 15, 16, 7, 7, 7, 7
+z_ypos_mul1a: dw 1, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24
+ dw 33, 34, 35, 36, 37, 38, 39, 40, 49, 50, 51, 52, 53, 54, 55, 56
+z_ypos_mul1b: dw 9, 10, 11, 12, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32
+ dw 41, 42, 43, 44, 45, 46, 47, 48, 57, 58, 59, 60, 61, 62, 63, 64
+z_ypos_mul2a: dw 1*512, 2*512, 3*512, 4*512, 5*512, 6*512, 7*512, 8*512
+ dw 17*512, 18*512, 19*512, 20*512, 21*512, 22*512, 23*512, 24*512
+ dw 33*512, 34*512, 35*512, 36*512, 37*512, 38*512, 39*512, 40*512
+ dw 49*512, 50*512, 51*512, 52*512, 53*512, 54*512, 55*512, 56*512
+z_ypos_mul2b: dw 9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512
+ dw 25*512, 26*512, 27*512, 28*512, 29*512, 30*512, 31*512, 32*512
+ dw 41*512, 42*512, 43*512, 44*512, 45*512, 46*512, 47*512, 48*512
+ dw 57*512, 58*512, 59*512, 60*512, 61*512, 62*512, 63*512, 64*512
+z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
+z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
+z3_upsample: db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
+ db 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8
+z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
+ db 39, 39, 47, 47, 47, 79, 79, 79
+z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16
+ db 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16, 0
+ db 0, 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16
+
+pb_8_56_0_0: db 8, 56, 0, 0
+pb_m4_36: times 2 db -4, 36
+pb_127_m127: times 2 db 127, -127
+pb_8: times 4 db 8
+pb_15: times 4 db 15
+pb_16: times 4 db 16
+pb_31: times 4 db 31
+pb_63: times 4 db 63
+pb_90: times 4 db 90
+pb_128: times 4 db 128
+pw_128: times 2 dw 128
+pw_255: times 2 dw 255
+pw_512: times 2 dw 512
+
+%define pb_1 (ipred_h_shuf+24)
+%define pb_2 (ipred_h_shuf+20)
+%define pb_3 (ipred_h_shuf+16)
+%define pb_4 (smooth_shuf +48)
+%define pb_7 (ipred_h_shuf+ 0)
+%define pb_9 (z_xpos_bc + 8)
+%define pb_17 (z_xpos_bc + 0)
+%define pb_33 (z_xpos_bc + 4)
+%define pd_8 (filter_taps+128)
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+%define ipred_dc_splat_8bpc_avx512icl_table (ipred_dc_8bpc_avx512icl_table + 10*4)
+
+JMP_TABLE ipred_h_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z2_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64
+
+cextern dr_intra_derivative
+cextern pb_0to63
+
+SECTION .text
+
+INIT_ZMM avx512icl
+cglobal ipred_dc_top_8bpc, 3, 7, 5, dst, stride, tl, w, h
+ lea r5, [ipred_dc_left_8bpc_avx512icl_table]
+ movd xm0, wm
+ tzcnt wd, wm
+ inc tlq
+ movifnidn hd, hm
+ movu ym1, [tlq]
+ movd xmm3, wd
+ movsxd r6, [r5+wq*4]
+ vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1]
+ psrld xm0, 1
+ vpdpbusd ym0, ym1, ym2
+ add r6, r5
+ add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_left_8bpc_avx512icl_table]
+ mov hd, hm
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movd xm0, hm
+ movu ym1, [tlq]
+ movd xmm3, r6d
+ movsxd r6, [r5+r6*4]
+ vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1]
+ psrld xm0, 1
+ vpdpbusd ym0, ym1, ym2
+ add r6, r5
+ add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ movu ym1, [tlq+32] ; unaligned when jumping here from dc_top
+ vpdpbusd ym0, ym1, ym2
+.h32:
+ vextracti32x4 xm1, ym0, 1
+ paddd xm0, xm1
+.h16:
+ punpckhqdq xm1, xm0, xm0
+ paddd xm0, xm1
+.h8:
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+.h4:
+ vpsrlvd xm0, xmm3
+ lea stride3q, [strideq*3]
+ vpbroadcastb m0, xm0
+ jmp wq
+
+cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd xm0, r5d
+ tzcnt r5d, r5d
+ movd xmm4, r5d
+ lea r5, [ipred_dc_8bpc_avx512icl_table]
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+5*4]
+ vpbroadcastd ym3, [r5-ipred_dc_8bpc_avx512icl_table+pb_1]
+ psrld xm0, 1
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movd xmm1, [tlq-4]
+ vpdpbusd xm0, xmm1, xm3
+ jmp wq
+.w4:
+ movd xmm1, [tlq+1]
+ vpdpbusd xm0, xmm1, xm3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xmm0, xm0, 3
+ jmp .w4_end
+.w4_mul:
+ punpckhqdq xmm1, xm0, xm0
+ lea r2d, [hq*2]
+ mov r6d, 0x55563334
+ paddd xmm1, xm0
+ shrx r6d, r6d, r2d
+ psrlq xmm0, xmm1, 32
+ paddd xmm0, xmm1
+ movd xmm1, r6d
+ psrld xmm0, 2
+ pmulhuw xmm0, xmm1
+.w4_end:
+ vpbroadcastb xm0, xmm0
+.s4:
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm0
+ movd [dstq+strideq*2], xm0
+ movd [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+.h8:
+ movq xmm1, [tlq-8]
+ vpdpbusd xm0, xmm1, xm3
+ jmp wq
+.w8:
+ movq xmm1, [tlq+1]
+ vextracti32x4 xm2, ym0, 1
+ vpdpbusd xm0, xmm1, xm3
+ paddd xmm2, xm2, xm0
+ punpckhqdq xmm0, xmm2, xmm2
+ paddd xmm0, xmm2
+ psrlq xmm1, xmm0, 32
+ paddd xmm0, xmm1
+ vpsrlvd xmm0, xmm4
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmove r6d, r2d
+ movd xmm1, r6d
+ pmulhuw xmm0, xmm1
+.w8_end:
+ vpbroadcastb xm0, xmm0
+.s8:
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm0
+ movq [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+.h16:
+ mova xmm1, [tlq-16]
+ vpdpbusd xm0, xmm1, xm3
+ jmp wq
+.w16:
+ movu xmm1, [tlq+1]
+ vextracti32x4 xm2, ym0, 1
+ vpdpbusd xm0, xmm1, xm3
+ paddd xmm2, xm2, xm0
+ punpckhqdq xmm0, xmm2, xmm2
+ paddd xmm0, xmm2
+ psrlq xmm1, xmm0, 32
+ paddd xmm0, xmm1
+ vpsrlvd xmm0, xmm4
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hb, 8|32
+ cmovz r6d, r2d
+ movd xmm1, r6d
+ pmulhuw xmm0, xmm1
+.w16_end:
+ vpbroadcastb xm0, xmm0
+.s16:
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm0
+ mova [dstq+strideq*2], xm0
+ mova [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+.h32:
+ mova ym1, [tlq-32]
+ vpdpbusd ym0, ym1, ym3
+ jmp wq
+.w32:
+ movu ym1, [tlq+1]
+ vpdpbusd ym0, ym1, ym3
+ vextracti32x4 xm1, ym0, 1
+ paddd xmm1, xm1, xm0
+ punpckhqdq xmm0, xmm1, xmm1
+ paddd xmm0, xmm1
+ psrlq xmm1, xmm0, 32
+ paddd xmm0, xmm1
+ vpsrlvd xmm0, xmm4
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x33345556
+ shrx r6d, r6d, r2d
+ movd xmm1, r6d
+ pmulhuw xmm0, xmm1
+.w32_end:
+ vpbroadcastb ym0, xmm0
+.s32:
+ mova [dstq+strideq*0], ym0
+ mova [dstq+strideq*1], ym0
+ mova [dstq+strideq*2], ym0
+ mova [dstq+stride3q ], ym0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+.h64:
+ mova ym1, [tlq-64]
+ mova ym2, [tlq-32]
+ vpdpbusd ym0, ym1, ym3
+ vpdpbusd ym0, ym2, ym3
+ jmp wq
+.w64:
+ movu ym1, [tlq+ 1]
+ movu ym2, [tlq+33]
+ vpdpbusd ym0, ym1, ym3
+ vpdpbusd ym0, ym2, ym3
+ vextracti32x4 xm1, ym0, 1
+ paddd xmm1, xm1, xm0
+ punpckhqdq xmm0, xmm1, xmm1
+ paddd xmm0, xmm1
+ psrlq xmm1, xmm0, 32
+ paddd xmm0, xmm1
+ vpsrlvd xmm0, xmm4
+ cmp hd, 64
+ je .w64_end
+ mov r6d, 0x33345556
+ shrx r6d, r6d, hd
+ movd xmm1, r6d
+ pmulhuw xmm0, xmm1
+.w64_end:
+ vpbroadcastb m0, xmm0
+.s64:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s64
+ RET
+
+cglobal ipred_dc_128_8bpc, 2, 7, 5, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_splat_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m0, [r5-ipred_dc_splat_8bpc_avx512icl_table+pb_128]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+cglobal ipred_v_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_splat_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ movu m0, [tlq+1]
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, stride3
+%define base r6-ipred_h_8bpc_avx512icl_table
+ lea r6, [ipred_h_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ lea stride3q, [strideq*3]
+ sub tlq, hq
+ add wq, r6
+ jmp wq
+.w4:
+ mova xmm1, [base+ipred_h_shuf+16]
+.w4_loop:
+ movd xmm0, [tlq+hq-4]
+ pshufb xmm0, xmm1
+ movd [dstq+strideq*0], xmm0
+ pextrd [dstq+strideq*1], xmm0, 1
+ pextrd [dstq+strideq*2], xmm0, 2
+ pextrd [dstq+stride3q ], xmm0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8:
+ movsldup xmm2, [base+ipred_h_shuf+16]
+ movshdup xmm3, [base+ipred_h_shuf+16]
+.w8_loop:
+ movd xmm1, [tlq+hq-4]
+ pshufb xmm0, xmm1, xmm2
+ pshufb xmm1, xmm3
+ movq [dstq+strideq*0], xmm0
+ movq [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xmm0
+ movhps [dstq+stride3q ], xmm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16:
+ movsldup m1, [base+smooth_shuf]
+.w16_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ pshufb m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ vpbroadcastd ym3, [base+pb_1]
+ vpord m2, m3, [base+pb_2] {1to16}
+.w32_loop:
+ vpbroadcastd m1, [tlq+hq-4]
+ pshufb m0, m1, m2
+ pshufb m1, m3
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w32_loop
+ RET
+.w64:
+ vpbroadcastd m4, [base+pb_3]
+ vpbroadcastd m5, [base+pb_2]
+ vpbroadcastd m6, [base+pb_1]
+ pxor m7, m7
+.w64_loop:
+ vpbroadcastd m3, [tlq+hq-4]
+ pshufb m0, m3, m4
+ pshufb m1, m3, m5
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w64_loop
+ RET
+
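+; Paeth prediction: each pixel selects whichever of left, top and topleft is
+; closest to left+top-topleft; m7 holds |top-topleft| (computed once outside
+; the row loop), the macro derives the remaining differences and selects the
+; winner with mask blends.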
+%macro PAETH 0
+ psubusb m1, m5, m4
+ psubusb m0, m4, m5
+ por m1, m0 ; tdiff
+ pavgb m2, m6, m4
+ vpcmpub k1, m1, m7, 1 ; tdiff < ldiff
+ vpblendmb m0{k1}, m4, m6
+ vpternlogd m4, m6, m8, 0x28 ; (m4 ^ m6) & m8
+ psubusb m3, m5, m2
+ psubb m2, m4
+ psubusb m2, m5
+ por m2, m3
+ pminub m1, m7
+ paddusb m2, m2
+ por m2, m4 ; min(tldiff, 255)
+ vpcmpub k1, m2, m1, 1 ; tldiff < ldiff && tldiff < tdiff
+ vmovdqu8 m0{k1}, m5
+%endmacro
+
+cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w, h, top, stride3
+ lea r6, [ipred_paeth_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ vpbroadcastb m5, [tlq] ; topleft
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m8, [r6-ipred_paeth_8bpc_avx512icl_table+pb_1]
+ lea topq, [tlq+1]
+ sub tlq, hq
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+INIT_YMM avx512icl
+.w4:
+ vpbroadcastd m6, [topq]
+ mova m9, [ipred_h_shuf]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0 ; ldiff
+.w4_loop:
+ vpbroadcastq m4, [tlq+hq-8]
+ pshufb m4, m9 ; left
+ PAETH
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm0, 3
+ sub hd, 8
+ jl .w4_ret
+ vextracti32x4 xm0, m0, 1
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm0, 3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.w4_ret:
+ RET
+INIT_ZMM avx512icl
+.w8:
+ vpbroadcastq m6, [topq]
+ movsldup m9, [smooth_shuf]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w8_loop:
+ vpbroadcastq m4, [tlq+hq-8]
+ pshufb m4, m9
+ PAETH
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm2, ym0, 1
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ sub hd, 8
+ jl .w8_ret
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ jg .w8_loop
+.w8_ret:
+ RET
+.w16:
+ vbroadcasti32x4 m6, [topq]
+ movsldup m9, [smooth_shuf]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w16_loop:
+ vpbroadcastd m4, [tlq+hq-4]
+ pshufb m4, m9
+ PAETH
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ vbroadcasti32x8 m6, [topq]
+ mova ym9, ym8
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w32_loop:
+ vpbroadcastd m4, [tlq+hq-2]
+ pshufb m4, m9
+ PAETH
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ movu m6, [topq]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w64_loop:
+ vpbroadcastb m4, [tlq+hq-1]
+ PAETH
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
+%define base r6-ipred_smooth_v_8bpc_avx512icl_table
+ lea r6, [ipred_smooth_v_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m0, [base+pb_127_m127]
+ vpbroadcastd m1, [base+pw_128]
+ lea weightsq, [base+smooth_weights+hq*4]
+ neg hq
+ vpbroadcastb m4, [tlq+hq] ; bottom
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vpbroadcastd m2, [tlq+1]
+ movshdup m5, [smooth_shuf]
+ mova ym6, [smooth_endA]
+ punpcklbw m2, m4 ; top, bottom
+ pmaddubsw m3, m2, m0
+ paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok
+ paddw m3, m1 ; 128 * top + 129 * bottom + 128
+.w4_loop:
+ vbroadcasti32x4 m0, [weightsq+hq*2]
+ pshufb m0, m5
+ pmaddubsw m0, m2, m0
+ paddw m0, m3
+ vpermb m0, m6, m0
+ vextracti32x4 xm1, ym0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm1, 2
+ add hq, 8
+ jg .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+stride3q ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ jl .w4_loop
+.ret:
+ RET
+.w8:
+ vpbroadcastq m2, [tlq+1]
+ movshdup m5, [smooth_shuf]
+ mova ym6, [smooth_endA]
+ punpcklbw m2, m4
+ pmaddubsw m3, m2, m0
+ paddw m1, m2
+ paddw m3, m1
+.w8_loop:
+ vpbroadcastq m0, [weightsq+hq*2]
+ pshufb m0, m5
+ pmaddubsw m0, m2, m0
+ paddw m0, m3
+ vpermb m0, m6, m0
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w8_loop
+ RET
+.w16:
+ vbroadcasti32x4 m3, [tlq+1]
+ movshdup m6, [smooth_shuf]
+ mova m7, [smooth_endB]
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+.w16_loop:
+ vpbroadcastq m1, [weightsq+hq*2]
+ pshufb m1, m6
+ pmaddubsw m0, m2, m1
+ pmaddubsw m1, m3, m1
+ paddw m0, m4
+ paddw m1, m5
+ vpermt2b m0, m7, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w16_loop
+ RET
+.w32:
+ vbroadcasti32x8 m3, [tlq+1]
+ movshdup m6, [smooth_shuf]
+ mova m7, [smooth_endB]
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+.w32_loop:
+ vpbroadcastd m1, [weightsq+hq*2]
+ pshufb m1, m6
+ pmaddubsw m0, m2, m1
+ pmaddubsw m1, m3, m1
+ paddw m0, m4
+ paddw m1, m5
+ vpermt2b m0, m7, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w32_loop
+ RET
+.w64:
+ movu m3, [tlq+1]
+ mova m6, [smooth_endB]
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+.w64_loop:
+ vpbroadcastw m1, [weightsq+hq*2]
+ pmaddubsw m0, m2, m1
+ pmaddubsw m1, m3, m1
+ paddw m0, m4
+ paddw m1, m5
+ vpermt2b m0, m6, m1
+ mova [dstq], m0
+ add dstq, strideq
+ inc hq
+ jl .w64_loop
+ RET
+
+cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3
+%define base r5-ipred_smooth_h_8bpc_avx512icl_table
+ lea r5, [ipred_smooth_h_8bpc_avx512icl_table]
+ mov r6d, wd
+ tzcnt wd, wd
+ vpbroadcastb m4, [tlq+r6] ; right
+ mov hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m5, [base+pb_127_m127]
+ vpbroadcastd m6, [base+pw_128]
+ sub tlq, hq
+ add wq, r5
+ vpmovb2m k1, m6
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movsldup m3, [smooth_shuf]
+ vpbroadcastq m7, [smooth_weights+4*2]
+ mova ym8, [smooth_endA]
+.w4_loop:
+ vpbroadcastq m0, [tlq+hq-8]
+ mova m2, m4
+ vpshufb m2{k1}, m0, m3 ; left, right
+ pmaddubsw m0, m2, m5
+ pmaddubsw m1, m2, m7
+ paddw m2, m6
+ paddw m0, m2
+ paddw m0, m1
+ vpermb m0, m8, m0
+ vextracti32x4 xm1, ym0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm1, 2
+ sub hd, 8
+ jl .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+stride3q ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.ret:
+ RET
+.w8:
+ movsldup m3, [smooth_shuf]
+ vbroadcasti32x4 m7, [smooth_weights+8*2]
+ mova ym8, [smooth_endA]
+.w8_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ mova m2, m4
+ vpshufb m2{k1}, m0, m3
+ pmaddubsw m0, m2, m5
+ pmaddubsw m1, m2, m7
+ paddw m2, m6
+ paddw m0, m2
+ paddw m0, m1
+ vpermb m0, m8, m0
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16:
+ movsldup m7, [smooth_shuf]
+ vbroadcasti32x4 m8, [smooth_weights+16*2]
+ vbroadcasti32x4 m9, [smooth_weights+16*3]
+ mova m10, [smooth_endB]
+.w16_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ mova m3, m4
+ vpshufb m3{k1}, m0, m7
+ pmaddubsw m2, m3, m5
+ pmaddubsw m0, m3, m8
+ pmaddubsw m1, m3, m9
+ paddw m3, m6
+ paddw m2, m3
+ paddw m0, m2
+ paddw m1, m2
+ vpermt2b m0, m10, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ mova m10, [smooth_endA]
+ vpbroadcastd ym7, [pb_1]
+ vbroadcasti32x8 m8, [smooth_weights+32*2]
+ vbroadcasti32x8 m9, [smooth_weights+32*3]
+ vshufi32x4 m10, m10, q3120
+.w32_loop:
+ vpbroadcastd m0, [tlq+hq-2]
+ mova m3, m4
+ vpshufb m3{k1}, m0, m7
+ pmaddubsw m2, m3, m5
+ pmaddubsw m0, m3, m8
+ pmaddubsw m1, m3, m9
+ paddw m3, m6
+ paddw m2, m3
+ paddw m0, m2
+ paddw m1, m2
+ vpermt2b m0, m10, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ mova m7, [smooth_weights+64*2]
+ mova m8, [smooth_weights+64*3]
+ mova m9, [smooth_endA]
+.w64_loop:
+ mova m3, m4
+ vpbroadcastb m3{k1}, [tlq+hq-1]
+ pmaddubsw m2, m3, m5
+ pmaddubsw m0, m3, m7
+ pmaddubsw m1, m3, m8
+ paddw m3, m6
+ paddw m2, m3
+ paddw m0, m2
+ paddw m1, m2
+ vpermt2b m0, m9, m1
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
+%define base r5-ipred_smooth_8bpc_avx512icl_table
+ lea r5, [ipred_smooth_8bpc_avx512icl_table]
+ mov r6d, wd
+ tzcnt wd, wd
+ mov hd, hm
+ vpbroadcastb m6, [tlq+r6] ; right
+ sub tlq, hq
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m7, [base+pb_127_m127]
+ vpbroadcastb m0, [tlq] ; bottom
+ vpbroadcastd m1, [base+pw_255]
+ add wq, r5
+ lea v_weightsq, [base+smooth_weights+hq*2]
+ vpmovb2m k1, m1
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vpbroadcastd m8, [tlq+hq+1]
+ movsldup m4, [smooth_shuf]
+ movshdup m5, [smooth_shuf]
+ vpbroadcastq m9, [smooth_weights+4*2]
+ mova ym11, [smooth_endA]
+
+ punpcklbw m8, m0 ; top, bottom
+ pmaddubsw m10, m8, m7
+ paddw m1, m8 ; 1 * top + 256 * bottom + 255
+ paddw m10, m1 ; 128 * top + 129 * bottom + 255
+.w4_loop:
+ vpbroadcastq m1, [tlq+hq-8]
+ vbroadcasti32x4 m0, [v_weightsq]
+ add v_weightsq, 16
+ mova m2, m6
+ vpshufb m2{k1}, m1, m4 ; left, right
+ pmaddubsw m1, m2, m7 ; 127 * left - 127 * right
+ pshufb m0, m5
+ pmaddubsw m0, m8, m0
+ paddw m1, m2 ; 128 * left + 129 * right
+ pmaddubsw m2, m9
+ paddw m0, m10
+ paddw m1, m2
+ pavgw m0, m1
+ vpermb m0, m11, m0
+ vextracti32x4 xm1, ym0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm1, 2
+ sub hd, 8
+ jl .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+stride3q ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.ret:
+ RET
+.w8:
+ vpbroadcastq m8, [tlq+hq+1]
+ movsldup m4, [smooth_shuf]
+ movshdup m5, [smooth_shuf]
+ vbroadcasti32x4 m9, [smooth_weights+8*2]
+ mova ym11, [smooth_endA]
+ punpcklbw m8, m0
+ pmaddubsw m10, m8, m7
+ paddw m1, m8
+ paddw m10, m1
+.w8_loop:
+ vpbroadcastd m1, [tlq+hq-4]
+ vpbroadcastq m0, [v_weightsq]
+ add v_weightsq, 8
+ mova m2, m6
+ vpshufb m2{k1}, m1, m4
+ pmaddubsw m1, m2, m7
+ pshufb m0, m5
+ pmaddubsw m0, m8, m0
+ paddw m1, m2
+ pmaddubsw m2, m9
+ paddw m0, m10
+ paddw m1, m2
+ pavgw m0, m1
+ vpermb m0, m11, m0
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16:
+ vbroadcasti32x4 m9, [tlq+hq+1]
+ movsldup m5, [smooth_shuf]
+ movshdup m10, [smooth_shuf]
+ vbroadcasti32x4 m11, [smooth_weights+16*2]
+ vbroadcasti32x4 m12, [smooth_weights+16*3]
+ mova m15, [smooth_endB]
+ punpcklbw m8, m9, m0
+ punpckhbw m9, m0
+ pmaddubsw m13, m8, m7
+ pmaddubsw m14, m9, m7
+ paddw m0, m1, m8
+ paddw m1, m9
+ paddw m13, m0
+ paddw m14, m1
+.w16_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ vpbroadcastq m1, [v_weightsq]
+ add v_weightsq, 8
+ mova m4, m6
+ vpshufb m4{k1}, m0, m5
+ pmaddubsw m2, m4, m7
+ pshufb m1, m10
+ pmaddubsw m0, m8, m1
+ pmaddubsw m1, m9, m1
+ paddw m2, m4
+ pmaddubsw m3, m4, m11
+ pmaddubsw m4, m12
+ paddw m0, m13
+ paddw m1, m14
+ paddw m3, m2
+ paddw m4, m2
+ pavgw m0, m3
+ pavgw m1, m4
+ vpermt2b m0, m15, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ vbroadcasti32x8 m9, [tlq+hq+1]
+ movshdup m10, [smooth_shuf]
+ mova m12, [smooth_weights+32*2]
+ vpbroadcastd ym5, [pb_1]
+ mova m15, [smooth_endB]
+ punpcklbw m8, m9, m0
+ punpckhbw m9, m0
+ pmaddubsw m13, m8, m7
+ pmaddubsw m14, m9, m7
+ vshufi32x4 m11, m12, m12, q2020
+ vshufi32x4 m12, m12, q3131
+ paddw m0, m1, m8
+ paddw m1, m9
+ paddw m13, m0
+ paddw m14, m1
+.w32_loop:
+ vpbroadcastd m0, [tlq+hq-2]
+ vpbroadcastd m1, [v_weightsq]
+ add v_weightsq, 4
+ mova m4, m6
+ vpshufb m4{k1}, m0, m5
+ pmaddubsw m2, m4, m7
+ pshufb m1, m10
+ pmaddubsw m0, m8, m1
+ pmaddubsw m1, m9, m1
+ paddw m2, m4
+ pmaddubsw m3, m4, m11
+ pmaddubsw m4, m12
+ paddw m0, m13
+ paddw m1, m14
+ paddw m3, m2
+ paddw m4, m2
+ pavgw m0, m3
+ pavgw m1, m4
+ vpermt2b m0, m15, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ movu m9, [tlq+hq+1]
+ mova m11, [smooth_weights+64*2]
+ mova m2, [smooth_weights+64*3]
+ mova m14, [smooth_endB]
+ punpcklbw m8, m9, m0
+ punpckhbw m9, m0
+ pmaddubsw m12, m8, m7
+ pmaddubsw m13, m9, m7
+ vshufi32x4 m10, m11, m2, q2020
+ vshufi32x4 m11, m2, q3131
+ paddw m0, m1, m8
+ paddw m1, m9
+ paddw m12, m0
+ paddw m13, m1
+.w64_loop:
+ mova m4, m6
+ vpbroadcastb m4{k1}, [tlq+hq-1]
+ vpbroadcastw m1, [v_weightsq]
+ add v_weightsq, 2
+ pmaddubsw m2, m4, m7
+ pmaddubsw m0, m8, m1
+ pmaddubsw m1, m9, m1
+ paddw m2, m4
+ pmaddubsw m3, m4, m10
+ pmaddubsw m4, m11
+ paddw m0, m12
+ paddw m1, m13
+ paddw m3, m2
+ paddw m4, m2
+ pavgw m0, m3
+ pavgw m1, m4
+ vpermt2b m0, m14, m1
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3
+ movifnidn wd, wm
+ movifnidn hd, hm
+ lea stride3q, [strideq*3]
+ cmp wd, 8
+ jg .w32
+ movq xmm3, [palq]
+ je .w8
+.w4:
+ movq xmm0, [idxq]
+ add idxq, 8
+ psrlw xmm1, xmm0, 4
+ punpcklbw xmm0, xmm1
+ pshufb xmm0, xmm3, xmm0
+ movd [dstq+strideq*0], xmm0
+ pextrd [dstq+strideq*1], xmm0, 1
+ pextrd [dstq+strideq*2], xmm0, 2
+ pextrd [dstq+stride3q ], xmm0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ movu xmm2, [idxq]
+ add idxq, 16
+ pshufb xmm1, xmm3, xmm2
+ psrlw xmm2, 4
+ pshufb xmm2, xmm3, xmm2
+ punpcklbw xmm0, xmm1, xmm2
+ punpckhbw xmm1, xmm2
+ movq [dstq+strideq*0], xmm0
+ movhps [dstq+strideq*1], xmm0
+ movq [dstq+strideq*2], xmm1
+ movhps [dstq+stride3q ], xmm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ pmovzxdq m0, [idxq]
+ add idxq, 32
+ vpmultishiftqb m0, m3, m0
+ pshufb m0, m5, m0
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ vpbroadcastq m3, [pal_unpack+0]
+ vpbroadcastq m5, [palq]
+ cmp wd, 32
+ jl .w16
+ pmovzxbd m2, [pal_perm]
+ vpbroadcastq m4, [pal_unpack+8]
+ jg .w64
+.w32_loop:
+ vpermd m1, m2, [idxq]
+ add idxq, 64
+ vpmultishiftqb m0, m3, m1
+ vpmultishiftqb m1, m4, m1
+ pshufb m0, m5, m0
+ pshufb m1, m5, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w32_loop
+ RET
+.w64:
+ vpermd m1, m2, [idxq]
+ add idxq, 64
+ vpmultishiftqb m0, m3, m1
+ vpmultishiftqb m1, m4, m1
+ pshufb m0, m5, m0
+ pshufb m1, m5, m1
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w64
+ RET
+
+%if WIN64
+ DECLARE_REG_TMP 4
+%else
+ DECLARE_REG_TMP 8
+%endif
+
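+; Z1 directional prediction (angle < 90): only the top edge is used. xpos
+; steps by dx in 1/64-pel units per row; top[base] and top[base+1] are
+; blended with (64-frac, frac) weights via pmaddubsw, after optional edge
+; filtering or upsampling of the top pixels.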
+cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
+%define base r7-z_filter_t0
+ lea r7, [z_filter_t0]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ lea t0, [dr_intra_derivative]
+ movsxd wq, [base+ipred_z1_8bpc_avx512icl_table+wq*4]
+ inc tlq
+ mov dxd, angled
+ and dxd, 0x7e
+ add angled, 165 ; ~90
+ movzx dxd, word [t0+dxq]
+ lea wq, [base+ipred_z1_8bpc_avx512icl_table+wq]
+ movifnidn hd, hm
+ xor angled, 0x4ff ; d = 90 - angle
+ mova m14, [base+z_frac_table]
+ vpbroadcastd m15, [base+pw_512]
+ jmp wq
+.w4:
+ mova m9, [pb_0to63]
+ pminud m8, m9, [base+pb_7] {1to16}
+ vpbroadcastq m7, [tlq]
+ pshufb m7, m8
+ cmp angleb, 40
+ jae .w4_no_upsample
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+ pshufb xmm0, xm7, [base+z_filter_s4]
+ mova xmm1, [tlq-1]
+ pshufb xmm1, [base+z_xpos_off2a]
+ vpbroadcastd xmm2, [base+pb_m4_36]
+ vpbroadcastq m4, [pb_0to63]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm1, xmm2
+ add dxd, dxd
+ kxnorw k1, k1, k1
+ paddw xmm0, xmm1
+ pmulhrsw xm0, xmm0, xm15
+ packuswb xm0, xm0
+ punpcklbw ym7{k1}, ym0
+ jmp .w4_main2
+.w4_no_upsample:
+ test angled, 0x400
+ jnz .w4_main ; !enable_intra_edge_filter
+ lea r3d, [hq+3]
+ vpbroadcastb xm0, r3d
+ vpbroadcastb xm1, angled
+ shr angled, 8 ; is_sm << 1
+ vpcmpeqb k1, xm0, [base+z_filter_wh]
+ vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8]
+ kmovw r5d, k1
+ test r5d, r5d
+ jz .w4_main
+ vbroadcasti32x4 ym0, [tlq-1]
+ pshufb ym0, [base+z_filter4_s1]
+ popcnt r5d, r5d ; filter_strength
+ pshufb ym1, ym7, [z_filter_s4]
+ pshufb ym7, [base+z_filter_s3]
+ vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0]
+ vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1]
+ pmaddubsw ym0, ym11
+ pmaddubsw ym1, ym11
+ pmaddubsw ym7, ym12
+ paddw ym0, ym1
+ paddw ym7, ym0
+ pmulhrsw ym7, ym15
+ cmp hd, 4
+ je .w4_filter_end
+ vpbroadcastd m8, [base+pb_9]
+ pminub m8, m9
+.w4_filter_end:
+ paddb m8, m8
+ vpermb m7, m8, m7
+.w4_main:
+ vpbroadcastq m4, [base+z_xpos_off1a]
+.w4_main2:
+ movsldup m2, [base+z_xpos_mul]
+ vpbroadcastw m5, dxd
+ vbroadcasti32x4 m3, [base+z_xpos_bc]
+ lea r2, [strideq*3]
+ pmullw m2, m5 ; xpos
+ psllw m5, 5 ; dx*8
+.w4_loop:
+ psrlw m1, m2, 3
+ pshufb m0, m2, m3
+ vpermw m1, m1, m14 ; 64-frac, frac
+ paddsb m0, m4 ; base, base+1
+ vpermb m0, m0, m7 ; top[base], top[base+1]
+ paddsw m2, m5 ; xpos += dx
+ pmaddubsw m0, m1 ; v
+ pmulhrsw m0, m15
+ packuswb m0, m0
+ vextracti32x4 xm1, ym0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+r2 ], xm1, 1
+ sub hd, 8
+ jl .w4_end
+ vextracti32x4 xm1, m0, 2 ; top[max_base_x]
+ lea dstq, [dstq+strideq*4]
+ vextracti32x4 xm0, m0, 3
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ movd [dstq+strideq*2], xm0
+ pextrd [dstq+r2 ], xm0, 1
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.w4_end:
+ RET
+.w8_filter:
+ mova ym0, [base+z_filter_s1]
+ popcnt r5d, r5d
+ vbroadcasti32x4 ym1, [base+z_filter_s2]
+ vbroadcasti32x4 ym3, [base+z_filter_s3]
+ vbroadcasti32x4 ym4, [base+z_filter_s4]
+ vpermi2b ym0, ym7, ym2 ; al bl
+ mova ym5, [base+z_filter_s5]
+ pshufb ym1, ym7, ym1 ; ah bh
+ vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0]
+ pshufb ym3, ym7, ym3 ; cl ch
+ vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1]
+ pshufb ym4, ym7, ym4 ; el dl
+ vpbroadcastd ym13, [base+z_filter_k+(r5-1)*4+12*2]
+ vpermb ym5, ym5, ym7 ; eh dh
+ pmaddubsw ym0, ym11
+ pmaddubsw ym1, ym11
+ pmaddubsw ym2, ym3, ym12
+ pmaddubsw ym3, ym13
+ pmaddubsw ym4, ym11
+ pmaddubsw ym5, ym11
+ paddw ym0, ym2
+ paddw ym1, ym3
+ paddw ym0, ym4
+ paddw ym1, ym5
+ pmulhrsw ym0, ym15
+ pmulhrsw ym1, ym15
+ packuswb ym0, ym1
+ ret
+.w8:
+ lea r3d, [angleq+216]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ lea r3d, [hq-1]
+ mova xm1, [base+z_filter_s4]
+ vpbroadcastb xm2, r3d
+ mova xm7, [tlq-1]
+ vinserti32x4 ym7, [tlq+7], 1
+ vbroadcasti32x4 ym0, [base+z_xpos_off1a]
+ vpbroadcastd ym3, [base+pb_m4_36]
+ pminub xm2, xm1
+ pshufb ym0, ym7, ym0
+ vinserti32x4 ym1, xm2, 1
+ psrldq ym7, 1
+ pshufb ym1, ym7, ym1
+ pmaddubsw ym0, ym3
+ pmaddubsw ym1, ym3
+ vbroadcasti32x4 m8, [pb_0to63]
+ add dxd, dxd
+ paddw ym0, ym1
+ pmulhrsw ym0, ym15
+ packuswb ym0, ym0
+ punpcklbw ym7, ym0
+ jmp .w8_main2
+.w8_no_upsample:
+ lea r3d, [hq+7]
+ mova m9, [pb_0to63]
+ vpbroadcastb ym0, r3d
+ and r3d, 7
+ vbroadcasti32x4 m7, [tlq]
+ or r3d, 8 ; imin(h+7, 15)
+ vpbroadcastb m8, r3d
+ pminub m8, m9
+ pshufb m7, m8
+ test angled, 0x400
+ jnz .w8_main
+ vpbroadcastb ym1, angled
+ shr angled, 8
+ vpcmpeqb k1, ym0, [base+z_filter_wh]
+ mova xm0, [base+z_filter_t0+angleq*8]
+ vpcmpgtb k1{k1}, ym1, ym0
+ kmovd r5d, k1
+ test r5d, r5d
+ jz .w8_main
+ vpbroadcastd ym2, [tlq-4]
+ call .w8_filter
+ cmp hd, 8
+ jle .w8_filter_end
+ vpbroadcastd m8, [base+pb_17]
+ add r3d, 2
+ pminub m8, m9
+.w8_filter_end:
+ vpermb m7, m8, m0
+.w8_main:
+ vbroadcasti32x4 m8, [base+z_xpos_off1a]
+.w8_main2:
+ movsldup m4, [base+z_xpos_mul]
+ vpbroadcastw m9, dxd
+ shl r3d, 6
+ vpbroadcastd m5, [base+z_xpos_bc+8*0]
+ pmullw m4, m9 ; xpos
+ vpbroadcastd m6, [base+z_xpos_bc+8*1]
+ sub r3d, dxd
+ shl dxd, 3
+ psllw m9, 5 ; dx*8
+ lea r2, [strideq*3]
+.w8_loop:
+ psrlw m3, m4, 3
+ pshufb m0, m4, m5
+ pshufb m1, m4, m6
+ vpermw m3, m3, m14
+ paddsb m0, m8
+ paddsb m1, m8
+ vpermb m0, m0, m7
+ vpermb m1, m1, m7
+ paddsw m4, m9
+ punpcklqdq m2, m3, m3
+ pmaddubsw m0, m2
+ punpckhqdq m3, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ packuswb m0, m1
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r2 ], xm1
+ sub hd, 8
+ jl .w8_end
+ vextracti32x8 ym0, m0, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r2 ], xm1
+ jz .w8_end
+ lea dstq, [dstq+strideq*4]
+ sub r3d, dxd
+ jg .w8_loop
+ vextracti32x4 xm7, m7, 3
+.w8_end_loop:
+ movq [dstq+strideq*0], xm7
+ movq [dstq+strideq*1], xm7
+ movq [dstq+strideq*2], xm7
+ movq [dstq+r2 ], xm7
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_end_loop
+.w8_end:
+ RET
+.w16_filter:
+ mova m0, [base+z_filter_s1]
+ popcnt r5d, r5d
+ vbroadcasti32x4 m1, [base+z_filter_s2]
+ vbroadcasti32x4 m3, [base+z_filter_s3]
+ vbroadcasti32x4 m4, [base+z_filter_s4]
+ vpermi2b m0, m7, m2 ; al bl
+ mova m5, [base+z_filter_s5]
+ pshufb m1, m7, m1 ; ah bh
+ vpbroadcastd m11, [base+z_filter_k+(r5-1)*4+12*0]
+ pshufb m3, m7, m3 ; cl ch
+ vpbroadcastd m12, [base+z_filter_k+(r5-1)*4+12*1]
+ pshufb m4, m7, m4 ; el dl
+ vpbroadcastd m13, [base+z_filter_k+(r5-1)*4+12*2]
+ vpermb m5, m5, m7 ; eh dh
+ pmaddubsw m0, m11
+ pmaddubsw m1, m11
+ pmaddubsw m2, m3, m12
+ pmaddubsw m3, m13
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m0, m2
+ paddw m1, m3
+ paddw m0, m4
+ paddw m1, m5
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ packuswb m0, m1
+ ret
+.w16:
+ lea r3d, [hq+15]
+ mova m9, [pb_0to63]
+ vpbroadcastb ym0, r3d
+ and r3d, 15
+ movu ym7, [tlq]
+ or r3d, 16 ; imin(h+15, 31)
+ vpbroadcastb m8, r3d
+ pminub m8, m9
+ vpermb m7, m8, m7
+ test angled, 0x400
+ jnz .w16_main
+ vpbroadcastb ym1, angled
+ shr angled, 8
+ vpcmpeqb k1, ym0, [base+z_filter_wh]
+ mova xm0, [base+z_filter_t0+angleq*8]
+ vpcmpgtb k1{k1}, ym1, ym0
+ kmovd r5d, k1
+ test r5d, r5d
+ jz .w16_main
+ vpbroadcastd m2, [tlq-4]
+ call .w16_filter
+ cmp hd, 16
+ jle .w16_filter_end
+ vpbroadcastd m8, [base+pb_33]
+ add r3d, 2
+ pminub m8, m9
+.w16_filter_end:
+ vpermb m7, m8, m0
+.w16_main:
+ movshdup m3, [base+z_xpos_mul]
+ vpbroadcastw m8, dxd
+ shl r3d, 6
+ vpbroadcastd m4, [base+z_xpos_bc]
+ pmullw m3, m8 ; xpos
+ vbroadcasti32x4 m5, [base+z_xpos_off1a]
+ sub r3d, dxd
+ shl dxd, 2
+ vbroadcasti32x4 m6, [base+z_xpos_off1b]
+ psllw m8, 4 ; dx*4
+ lea r2, [strideq*3]
+.w16_loop:
+ pshufb m1, m3, m4
+ psrlw m2, m3, 3
+ paddsb m0, m1, m5
+ vpermw m2, m2, m14
+ paddsb m1, m6
+ vpermb m0, m0, m7
+ vpermb m1, m1, m7
+ paddsw m3, m8
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ packuswb m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+r2 ], m0, 3
+ sub hd, 4
+ jz .w16_end
+ lea dstq, [dstq+strideq*4]
+ sub r3d, dxd
+ jg .w16_loop
+ vextracti32x4 xm7, m7, 3
+.w16_end_loop:
+ mova [dstq+strideq*0], xm7
+ mova [dstq+strideq*1], xm7
+ mova [dstq+strideq*2], xm7
+ mova [dstq+r2 ], xm7
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_end_loop
+.w16_end:
+ RET
+.w32_filter:
+ mova m0, [base+z_filter_s1]
+ vbroadcasti32x4 m1, [base+z_filter_s2]
+ vbroadcasti32x4 m3, [base+z_filter_s3]
+ vbroadcasti32x4 m4, [base+z_filter_s4]
+ vpermi2b m0, m7, m2 ; al bl
+ mova m5, [base+z_filter_s5]
+ pshufb m1, m7, m1 ; ah bh
+ vpbroadcastd m11, [base+z_filter_k+4*2+12*0]
+ pshufb m3, m7, m3 ; cl ch
+ vpbroadcastd m12, [base+z_filter_k+4*2+12*1]
+ pshufb m4, m7, m4 ; el dl
+ vpbroadcastd m13, [base+z_filter_k+4*2+12*2]
+ vpermi2b m5, m7, m8 ; eh dh
+ pmaddubsw m0, m11
+ pmaddubsw m1, m11
+ pmaddubsw m2, m3, m12
+ pmaddubsw m3, m13
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m0, m2
+ paddw m1, m3
+ paddw m0, m4
+ paddw m1, m5
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ packuswb m7, m0, m1
+ ret
+.w32:
+ lea r3d, [hq+31]
+ vpbroadcastb m9, r3d
+ and r3d, 31
+ pminub m10, m9, [pb_0to63]
+ or r3d, 32 ; imin(h+31, 63)
+ vpermb m7, m10, [tlq]
+ vpbroadcastb m8, [tlq+r3]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w32_main
+ vpbroadcastd m2, [tlq-4]
+ call .w32_filter
+ cmp hd, 64
+ je .w32_h64_filter_end
+ vpermb m8, m9, m7
+ vpermb m7, m10, m7
+ jmp .w32_main
+.w32_h64_filter_end: ; edge case for 32x64
+ movd xmm0, [tlq+r3-1]
+ movd xmm1, [base+pb_8_56_0_0]
+ add r3d, 2
+ pmaddubsw xmm0, xmm1
+ vptestmw k1, xmm1, xmm1 ; 0x01
+ pmulhrsw xm0, xmm0, xm15
+ vmovdqu8 m8{k1}, m0
+.w32_main:
+ rorx r2d, dxd, 30
+ vpbroadcastd m4, [base+z_xpos_bc]
+ vpbroadcastw m3, r2d
+ vbroadcasti32x8 m5, [base+z_xpos_off2a]
+ shl r3d, 6
+ vbroadcasti32x8 m6, [base+z_xpos_off2b]
+ sub r3d, dxd
+ paddw m9, m3, m3
+ add dxd, dxd
+ vinserti32x8 m3, ym9, 1
+.w32_loop:
+ pshufb m1, m3, m4
+ psrlw m2, m3, 3
+ paddsb m0, m1, m5
+ vpermw m2, m2, m14
+ paddsb m1, m6
+ vpermi2b m0, m7, m8
+ vpermi2b m1, m7, m8
+ paddsw m3, m9
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ packuswb m0, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w32_end
+ lea dstq, [dstq+strideq*2]
+ sub r3d, dxd
+ jg .w32_loop
+ punpckhqdq ym8, ym8
+.w32_end_loop:
+ mova [dstq+strideq*0], ym8
+ mova [dstq+strideq*1], ym8
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_end_loop
+.w32_end:
+ RET
+.w64_filter:
+ vbroadcasti32x4 m3, [base+z_filter_s2]
+ mova m1, [base+z_filter_s1]
+ pshufb m0, m3 ; al bl
+ vpermi2b m1, m7, m2
+ vbroadcasti32x4 m4, [base+z_filter_s4]
+ pshufb m6, m8, m4 ; el dl
+ pshufb m9, m7, m4
+ pminub m10, m13, [base+z_filter_s5]
+ pshufb m2, m8, m3 ; ah bh
+ pshufb m3, m7, m3
+ vbroadcasti32x4 m5, [base+z_filter_s3]
+ vpermb m10, m10, m8 ; eh dh
+ pshufb m11, m4
+ vpbroadcastd m4, [base+z_filter_k+4*2+12*0]
+ pshufb m8, m5 ; cl ch
+ pshufb m7, m5
+ vpbroadcastd m5, [base+z_filter_k+4*2+12*1]
+ REPX {pmaddubsw x, m4}, m0, m1, m6, m9, m2, m3, m10, m11
+ pmaddubsw m4, m8, m5
+ pmaddubsw m5, m7, m5
+ paddw m0, m6
+ vpbroadcastd m6, [base+z_filter_k+4*2+12*2]
+ paddw m1, m9
+ pmaddubsw m7, m6
+ pmaddubsw m8, m6
+ paddw m2, m10
+ paddw m3, m11
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m8
+ paddw m3, m7
+ REPX {pmulhrsw x, m15}, m0, m2, m1, m3
+ packuswb m0, m2
+ packuswb m7, m1, m3
+ vpermb m8, m12, m0
+ ret
+.w64:
+ lea r3d, [hq-1]
+ movu m7, [tlq+64*0]
+ vpbroadcastb m13, r3d
+ pminub m12, m13, [pb_0to63]
+ or r3d, 64
+ vpermb m8, m12, [tlq+64*1]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w64_main
+ movu m0, [tlq+56]
+ vpbroadcastd m2, [tlq-4]
+ movu m11, [tlq+8]
+ call .w64_filter
+.w64_main:
+ rorx r2d, dxd, 30
+ vpbroadcastd m4, [base+z_xpos_bc]
+ vpbroadcastw m3, r2d
+ mova m5, [base+z_xpos_off2a]
+ shl r3d, 6
+ mova m6, [base+z_xpos_off2b]
+ sub r3d, dxd
+ mova m9, m3
+.w64_loop:
+ pshufb m1, m3, m4
+ psrlw m2, m3, 3
+ paddsb m0, m1, m5
+ vpermw m2, m2, m14
+ paddsb m1, m6
+ vpermi2b m0, m7, m8
+ vpermi2b m1, m7, m8
+ paddsw m3, m9
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ packuswb m0, m1
+ mova [dstq], m0
+ dec hd
+ jz .w64_end
+ add dstq, strideq
+ sub r3d, dxd
+ jg .w64_loop
+ vpermb m8, m13, m8
+.w64_end_loop:
+ mova [dstq], m8
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ RET
+
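+; Z2 directional prediction (90 < angle < 180): both edges are used. Samples
+; are taken from the top edge stepping by dx; where base_x becomes negative
+; (the vpmovw2m masks) the sample comes from the left edge stepping by dy.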
+cglobal ipred_z2_8bpc, 3, 9, 18, dst, stride, tl, w, h, angle, dx, _, dy
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ lea dxq, [dr_intra_derivative-90]
+ movzx dyd, angleb
+ xor angled, 0x400
+ mov r7, dxq
+ sub dxq, dyq
+ movifnidn hd, hm
+ and dyd, ~1
+ and dxq, ~1
+ movzx dyd, word [r7+dyq] ; angle - 90
+ lea r7, [z_filter_t0]
+ movzx dxd, word [dxq+270] ; 180 - angle
+ movsxd wq, [base+ipred_z2_8bpc_avx512icl_table+wq*4]
+ mova m8, [base+pb_63to0]
+ neg dyd
+ vpermb m8, m8, [tlq-64] ; left
+ lea wq, [base+ipred_z2_8bpc_avx512icl_table+wq]
+ mova m14, [base+z_frac_table]
+ inc tlq
+ vpbroadcastd m15, [base+pw_512]
+ neg dxd
+ jmp wq
+.w4:
+ movd xm7, [tlq]
+ vpbroadcastq m10, [base+z_xpos_off2a]
+ test angled, 0x400
+ jnz .w4_main ; !enable_intra_edge_filter
+ lea r3d, [hq+2]
+ add angled, 1022
+ shl r3d, 6
+ test r3d, angled
+ jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
+ vpbroadcastd xm2, [base+pb_4]
+ sub angled, 1075 ; angle - 53
+ call .upsample_above
+ lea r3d, [hq+3]
+ vpbroadcastq m10, [pb_0to63+1]
+ punpcklbw xm7, xm0, xm7
+ call .filter_strength
+ jmp .w4_filter_left
+.w4_upsample_left:
+ call .upsample_left
+ movsldup m16, [base+z_ypos_off3]
+ vpbroadcastd m9, [base+pb_16]
+ punpcklbw xm8, xm0, xm8
+ jmp .w4_main2
+.w4_no_upsample_above:
+ lea r3d, [hq+3]
+ sub angled, 1112 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w4_no_filter_above
+ vpbroadcastd xm5, [base+pb_3]
+ call .filter_top_w16
+.w4_no_filter_above:
+ lea r3d, [hq+2]
+ add angled, 973 ; angle + 883
+ shl r3d, 6
+ test r3d, angled
+ jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
+ vpbroadcastd ym0, [base+pb_90]
+ psubb ym0, ym17
+ vpcmpgtb k2{k2}, ym0, ym16
+ kmovd r3d, k2
+.w4_filter_left:
+ test r3d, r3d
+ jz .w4_main
+ popcnt r3d, r3d
+ call .filter_left_h16
+.w4_main:
+ movsldup m16, [base+z_ypos_off1]
+ vpbroadcastd m9, [base+pb_8]
+.w4_main2:
+ vpbroadcastq m3, [base+z_ypos_mul1a]
+ vpbroadcastw m0, dyd
+ movsldup m1, [base+z_xpos_mul]
+ vpbroadcastw m5, dxd
+ vinserti32x4 m7, [tlq-16], 3
+ vinserti32x4 m8, [tlq-16], 3
+ pmullw m3, m0
+ vbroadcasti32x4 m2, [base+z_xpos_bc]
+ pmullw m1, m5 ; xpos0..3
+ psllw m5, 5 ; dx*8
+ psraw m4, m3, 6
+ psrlw m3, 1
+ packsswb m4, m4
+ vpermw m3, m3, m14 ; 64-frac, frac
+ punpcklbw m4, m4
+ lea r2, [strideq*3]
+ paddb m4, m16 ; base, base+1
+.w4_loop:
+ pshufb m16, m1, m2
+ psrlw m0, m1, 3
+ paddb m16, m10
+ vpermw m0, m0, m14
+ vpmovw2m k1, m16 ; base_x < 0
+ vpermb m16, m16, m7
+ pmaddubsw m16, m0
+ vpermb m0, m4, m8
+ pmaddubsw m16{k1}, m0, m3
+ pmulhrsw m16, m15
+ vpmovwb ym16, m16
+ movd [dstq+strideq*0], xm16
+ pextrd [dstq+strideq*1], xm16, 1
+ pextrd [dstq+strideq*2], xm16, 2
+ pextrd [dstq+r2 ], xm16, 3
+ sub hd, 8
+ jl .w4_end
+ paddsw m1, m5
+ vextracti128 xm16, ym16, 1
+ lea dstq, [dstq+strideq*4]
+ paddb m4, m9
+ movd [dstq+strideq*0], xm16
+ pextrd [dstq+strideq*1], xm16, 1
+ pextrd [dstq+strideq*2], xm16, 2
+ pextrd [dstq+r2 ], xm16, 3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.w4_end:
+ RET
+.upsample_above: ; w4/w8
+ mova xm0, [tlq-1]
+ xor angled, 0x7f ; 180 - angle
+ add dxd, dxd
+ jmp .upsample
+.upsample_left: ; h4/h8
+ palignr xm0, xm8, [tlq-16], 15
+ vpbroadcastb xm2, hd
+ add dyd, dyd
+.upsample:
+ pshufb xm1, xm0, [base+z_filter4_s1]
+ pminub xm2, [base+z_filter_s4]
+ vpbroadcastd xm3, [base+pb_m4_36]
+ pshufb xm0, xm2
+ pmaddubsw xm1, xm3
+ pmaddubsw xm0, xm3
+ paddw xm0, xm1
+ pmulhrsw xm0, xm15
+ packuswb xm0, xm0
+ ret
+.filter_strength:
+ vpbroadcastb ym16, r3d
+ mov r3d, angled
+ vpbroadcastd m2, [tlq-4]
+ vpbroadcastb ym17, angled
+ shr r3d, 8
+ vpcmpeqb k2, ym16, [base+z_filter_wh]
+ mova xm16, [base+z_filter_t0+r3*8]
+ vpcmpgtb k1{k2}, ym17, ym16
+ mova m9, [pb_0to63]
+ kmovd r3d, k1
+ ret
+.w8:
+ movq xm7, [tlq]
+ vbroadcasti32x4 m10, [base+z_xpos_off2a]
+ test angled, 0x400
+ jnz .w8_main
+ lea r3d, [angleq+126]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+ vpbroadcastd xm2, [base+pb_8]
+ sub angled, 53 ; angle - 53
+ call .upsample_above
+ lea r3d, [hq+7]
+ vbroadcasti32x4 m10, [pb_0to63+1]
+ punpcklbw xm7, xm0, xm7
+ call .filter_strength
+ jmp .w8_filter_left
+.w8_upsample_left:
+ call .upsample_left
+ movshdup m16, [base+z_ypos_off3]
+ vpbroadcastd m9, [base+pb_8]
+ punpcklbw xm8, xm0, xm8
+ jmp .w8_main2
+.w8_no_upsample_above:
+ lea r3d, [hq+7]
+ sub angled, 90 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w8_no_filter_above
+ vpbroadcastd xm5, [base+pb_7]
+ call .filter_top_w16
+.w8_no_filter_above:
+ lea r3d, [angleq-51]
+ mov r3b, hb
+ cmp r3d, 8
+ jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
+ vpbroadcastd ym0, [base+pb_90]
+ psubb ym0, ym17
+ vpcmpgtb k2{k2}, ym0, ym16
+ kmovd r3d, k2
+.w8_filter_left:
+ test r3d, r3d
+ jz .w8_main
+ cmp hd, 32
+ je .w8_filter_left_h32
+ popcnt r3d, r3d
+ call .filter_left_h16
+ jmp .w8_main
+.w8_filter_left_h32:
+ call .filter_left_h64
+.w8_main:
+ movshdup m16, [base+z_ypos_off2]
+ vpbroadcastd m9, [base+pb_4]
+.w8_main2:
+ vbroadcasti32x4 m3, [base+z_ypos_mul1a]
+ vpbroadcastw m0, dyd
+ movshdup m1, [base+z_xpos_mul]
+ vpbroadcastw m5, dxd
+ vinserti32x4 m7, [tlq-16], 3
+ vinserti32x4 m8, [tlq-16], 3
+ pmullw m3, m0
+ vpbroadcastd m2, [base+pb_1]
+ pmullw m1, m5 ; xpos0..3
+ psllw m5, 4 ; dx*4
+ psraw m4, m3, 6
+ psrlw m3, 1
+ packsswb m4, m4
+ vpermw m3, m3, m14 ; 64-frac, frac
+ lea r3d, [dxq+(8<<6)]
+ paddsb m4, m16
+ shl dxd, 2
+ paddsb m0, m4, m2
+ lea r2, [strideq*3]
+ punpcklbw m4, m0 ; base, base+1
+.w8_loop:
+ pshufb m16, m1, m2
+ psrlw m0, m1, 3
+ paddb m16, m10
+ vpermw m0, m0, m14
+ vpmovw2m k1, m16 ; base_x < 0
+ vpermb m16, m16, m7
+ pmaddubsw m16, m0
+ vpermb m0, m4, m8
+ pmaddubsw m16{k1}, m0, m3
+ pmulhrsw m16, m15
+ vpmovwb ym16, m16
+ vextracti128 xm17, ym16, 1
+ movq [dstq+strideq*0], xm16
+ movhps [dstq+strideq*1], xm16
+ movq [dstq+strideq*2], xm17
+ movhps [dstq+r2 ], xm17
+ sub hd, 4
+ jz .w8_end
+ paddw m1, m5
+ lea dstq, [dstq+strideq*4]
+ paddb m4, m9
+ add r3d, dxd
+ jge .w8_loop
+.w8_leftonly_loop:
+ vpermb m16, m4, m8
+ pmaddubsw m16, m3
+ paddb m4, m9
+ pmulhrsw m16, m15
+ vpmovwb ym16, m16
+ vextracti128 xm17, ym16, 1
+ movq [dstq+strideq*0], xm16
+ movhps [dstq+strideq*1], xm16
+ movq [dstq+strideq*2], xm17
+ movhps [dstq+r2 ], xm17
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_leftonly_loop
+.w8_end:
+ RET
+.filter_top_w16:
+ mova xm0, [base+z_filter_s1]
+ popcnt r3d, r3d
+ pminub xm4, xm5, [base+z_filter_s4]
+ vpermi2b xm0, xm7, xm2
+ pminub xm5, [base+z_filter_s5]
+ pshufb xm1, xm7, [base+z_filter_s2]
+ vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0]
+ pshufb xm3, xm7, [base+z_filter_s3]
+ vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1]
+ pshufb xm4, xm7, xm4
+ vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2]
+ pshufb xm5, xm7, xm5
+ pmaddubsw xm0, xm11
+ pmaddubsw xm1, xm11
+ pmaddubsw xm6, xm3, xm12
+ vpbroadcastd xm12, r7m ; max_width
+ pmaddubsw xm3, xm13
+ pmaddubsw xm4, xm11
+ pmaddubsw xm5, xm11
+ packssdw xm12, xm12
+ paddw xm0, xm6
+ paddw xm1, xm3
+ paddw xm0, xm4
+ paddw xm1, xm5
+ packsswb xm12, xm12
+ pmulhrsw xm0, xm15
+ pmulhrsw xm1, xm15
+ vpcmpgtb k1, xm12, xm9 ; x < max_width
+ packuswb xm7{k1}, xm0, xm1
+ ret
+.filter_left_h16:
+ lea r5d, [hq-1]
+ mova xm0, [base+z_filter_s1]
+ vpbroadcastb xm5, r5d
+ vpermi2b xm0, xm8, xm2
+ pminub xm4, xm5, [base+z_filter_s4]
+ pshufb xm1, xm8, [base+z_filter_s2]
+ pminub xm5, [base+z_filter_s5]
+ pshufb xm3, xm8, [base+z_filter_s3]
+ vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0]
+ pshufb xm4, xm8, xm4
+ vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1]
+ pshufb xm5, xm8, xm5
+ vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2]
+ pmaddubsw xm0, xm11
+ pmaddubsw xm1, xm11
+ pmaddubsw xm6, xm3, xm12
+ vpbroadcastd xm12, r8m ; max_height
+ pmaddubsw xm3, xm13
+ pmaddubsw xm4, xm11
+ pmaddubsw xm5, xm11
+ packssdw xm12, xm12
+ paddw xm0, xm6
+ paddw xm1, xm3
+ paddw xm0, xm4
+ paddw xm1, xm5
+ packsswb xm12, xm12
+ pmulhrsw xm0, xm15
+ pmulhrsw xm1, xm15
+ vpcmpgtb k1, xm12, xm9 ; y < max_height
+ packuswb xm8{k1}, xm0, xm1
+ ret
+.w16:
+ movu xm7, [tlq] ; top
+ test angled, 0x400
+ jnz .w16_main
+ lea r3d, [hq+15]
+ sub angled, 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w16_no_filter_above
+ vpbroadcastd xm5, [base+pb_15]
+ call .filter_top_w16
+.w16_no_filter_above:
+ cmp hd, 16
+ jg .w16_filter_left_h64
+ vpbroadcastd ym0, [base+pb_90]
+ psubb ym0, ym17
+ vpcmpgtb k2{k2}, ym0, ym16
+ kmovd r3d, k2
+ test r3d, r3d
+ jz .w16_main
+ popcnt r3d, r3d
+ call .filter_left_h16
+ jmp .w16_main
+.w16_filter_left_h64:
+ call .filter_left_h64
+.w16_main:
+ vbroadcasti32x4 m6, [base+z_ypos_mul1a] ; 1.. 8
+ vbroadcasti32x4 m5, [base+z_ypos_mul1b] ; 9..15
+ vpbroadcastw m0, dyd
+ vinserti32x4 m7, [tlq-16], 3
+ vpbroadcastd m2, [base+pb_1]
+ vpbroadcastw m12, dxd
+ movshdup m1, [base+z_xpos_mul]
+ pmullw m6, m0
+ vbroadcasti32x4 m3, [base+z_xpos_off2a]
+ pmullw m5, m0
+ vbroadcasti32x4 m4, [base+z_xpos_off2b]
+ pmullw m1, m12 ; xpos0 xpos1 xpos2 xpos3
+ vpbroadcastd m9, [base+pb_4]
+ psllw m12, 4 ; dx*4
+ movshdup m16, [base+z_ypos_off2]
+ psrlw m10, m6, 1
+ psrlw m11, m5, 1
+ vpermw m10, m10, m14 ; 64-frac, frac
+ psraw m6, 6
+ vpermw m11, m11, m14
+ psraw m5, 6
+ mov r5d, -(16<<6) ; 15 to avoid top, +1 to avoid topleft
+ packsswb m6, m5
+ mov r3d, 1<<6
+ paddsb m6, m16
+ sub r5d, dxd ; left-only threshold
+ paddsb m0, m6, m2
+ shl dxd, 2
+ punpcklbw m5, m6, m0 ; base, base+1
+ lea r2, [strideq*3]
+ punpckhbw m6, m0
+.w16_loop:
+ pshufb m17, m1, m2
+ psrlw m0, m1, 3
+ paddb m16, m3, m17
+ vpermw m0, m0, m14
+ paddb m17, m4
+ vpmovw2m k1, m16
+ vpermb m16, m16, m7
+ vpmovw2m k2, m17
+ vpermb m17, m17, m7
+ pmaddubsw m16, m0
+ pmaddubsw m17, m0
+ add r3d, dxd
+ jge .w16_toponly
+ mova m0, m8
+ vpermt2b m0, m5, m7
+ pmaddubsw m16{k1}, m0, m10
+ mova m0, m8
+ vpermt2b m0, m6, m7
+ pmaddubsw m17{k2}, m0, m11
+.w16_toponly:
+ pmulhrsw m16, m15
+ pmulhrsw m17, m15
+ packuswb m16, m17
+ mova [dstq+strideq*0], xm16
+ vextracti128 [dstq+strideq*1], ym16, 1
+ vextracti32x4 [dstq+strideq*2], m16, 2
+ vextracti32x4 [dstq+r2 ], m16, 3
+ sub hd, 4
+ jz .w16_end
+ paddw m1, m12
+ lea dstq, [dstq+strideq*4]
+ paddb m5, m9
+ paddb m6, m9
+ cmp r3d, r5d
+ jge .w16_loop
+.w16_leftonly_loop:
+ vpermb m16, m5, m8
+ vpermb m17, m6, m8
+ pmaddubsw m16, m10
+ pmaddubsw m17, m11
+ paddb m5, m9
+ paddb m6, m9
+ pmulhrsw m16, m15
+ pmulhrsw m17, m15
+ packuswb m16, m17
+ mova [dstq+strideq*0], xm16
+ vextracti128 [dstq+strideq*1], ym16, 1
+ vextracti32x4 [dstq+strideq*2], m16, 2
+ vextracti32x4 [dstq+r2 ], m16, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_leftonly_loop
+.w16_end:
+ RET
+.w32:
+ movu ym7, [tlq]
+ test angled, 0x400
+ jnz .w32_main
+ vpbroadcastd m2, [tlq-4]
+ mova ym0, [base+z_filter_s1]
+ vbroadcasti32x4 ym1, [base+z_filter_s2]
+ vbroadcasti32x4 ym3, [base+z_filter_s3]
+ vbroadcasti32x4 ym4, [base+z_filter_s4]
+ vpermi2b ym0, ym7, ym2 ; al bl
+ vpbroadcastd ym5, [base+pb_31]
+ pminub ym5, [base+z_filter_s5]
+ pshufb ym1, ym7, ym1 ; ah bh
+ vpbroadcastd ym11, [base+z_filter_k+4*2+12*0]
+ pshufb ym3, ym7, ym3 ; cl ch
+ vpbroadcastd ym12, [base+z_filter_k+4*2+12*1]
+ pshufb ym4, ym7, ym4 ; el dl
+ vpbroadcastd ym13, [base+z_filter_k+4*2+12*2]
+ vpermb ym5, ym5, ym7 ; eh dh
+ pmaddubsw ym0, ym11
+ pmaddubsw ym1, ym11
+ pmaddubsw ym6, ym3, ym12
+ vpbroadcastd ym12, r6m
+ pmaddubsw ym3, ym13
+ pmaddubsw ym4, ym11
+ pmaddubsw ym5, ym11
+ mova m9, [pb_0to63]
+ packssdw ym12, ym12
+ paddw ym0, ym6
+ paddw ym1, ym3
+ paddw ym0, ym4
+ paddw ym1, ym5
+ packsswb ym12, ym12
+ pmulhrsw ym0, ym15
+ pmulhrsw ym1, ym15
+ vpcmpgtb k1, ym12, ym9 ; x < max_width
+ packuswb ym7{k1}, ym0, ym1
+ cmp hd, 16
+ jg .w32_filter_h64
+ mov r3d, 3
+ call .filter_left_h16
+ jmp .w32_main
+.w32_filter_h64:
+ call .filter_left_h64
+.w32_main:
+ vbroadcasti32x8 m6, [base+z_ypos_mul1a] ; 1.. 8
+ vbroadcasti32x8 m5, [base+z_ypos_mul1b] ; 9..15
+ vpbroadcastw m0, dyd
+ vinserti32x4 m7, [tlq-16], 3
+ rorx r2q, dxq, 62 ; dx << 2
+ vpbroadcastd m2, [base+pb_1]
+ vpbroadcastw m1, r2d
+ pmullw m6, m0
+ vbroadcasti32x8 m3, [base+z_xpos_off2a]
+ pmullw m5, m0
+ vbroadcasti32x8 m4, [base+z_xpos_off2b]
+ mova ym0, ym1
+ paddw m12, m1, m1
+ vpbroadcastd m9, [base+pb_2]
+ paddw m1, m0 ; xpos1 xpos0
+ mova ym0, ym2
+ psrlw m10, m6, 1
+ psrlw m11, m5, 1
+ vpermw m10, m10, m14 ; 64-frac, frac
+ psraw m6, 6
+ vpermw m11, m11, m14
+ psraw m5, 6
+ mov r5d, -(32<<6) ; 31 to avoid top, +1 to avoid topleft
+ packsswb m6, m5
+ mov r3d, 1<<6
+ paddsb m6, m0
+ sub r5d, dxd ; left-only threshold
+ paddsb m0, m6, m2
+ add dxd, dxd
+ punpcklbw m5, m6, m0 ; base, base+1
+ punpckhbw m6, m0
+.w32_loop:
+ pshufb m17, m1, m2
+ psrlw m0, m1, 3
+ paddb m16, m3, m17
+ vpermw m0, m0, m14
+ paddb m17, m4
+ vpmovw2m k1, m16
+ vpermb m16, m16, m7
+ vpmovw2m k2, m17
+ vpermb m17, m17, m7
+ pmaddubsw m16, m0
+ pmaddubsw m17, m0
+ add r3d, dxd
+ jge .w32_toponly
+ mova m0, m8
+ vpermt2b m0, m5, m7
+ pmaddubsw m16{k1}, m0, m10
+ mova m0, m8
+ vpermt2b m0, m6, m7
+ pmaddubsw m17{k2}, m0, m11
+.w32_toponly:
+ pmulhrsw m16, m15
+ pmulhrsw m17, m15
+ packuswb m16, m17
+ vextracti32x8 [dstq+strideq*0], m16, 1
+ mova [dstq+strideq*1], ym16
+ sub hd, 2
+ jz .w32_end
+ paddw m1, m12
+ lea dstq, [dstq+strideq*2]
+ paddb m5, m9
+ paddb m6, m9
+ cmp r3d, r5d
+ jge .w32_loop
+.w32_leftonly_loop:
+ vpermb m16, m5, m8
+ vpermb m17, m6, m8
+ pmaddubsw m16, m10
+ pmaddubsw m17, m11
+ paddb m5, m9
+ paddb m6, m9
+ pmulhrsw m16, m15
+ pmulhrsw m17, m15
+ packuswb m16, m17
+ vextracti32x8 [dstq+strideq*0], m16, 1
+ mova [dstq+strideq*1], ym16
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_leftonly_loop
+.w32_end:
+ RET
+.filter_left_h64:
+ mova m0, [base+z_filter_s1]
+ lea r3d, [hq-1]
+ vbroadcasti32x4 m4, [base+z_filter_s4]
+ vpbroadcastb m5, r3d
+ vbroadcasti32x4 m1, [base+z_filter_s2]
+ vbroadcasti32x4 m3, [base+z_filter_s3]
+ vpermi2b m0, m8, m2 ; al bl
+ pminub m5, [base+z_filter_s5]
+ pshufb m1, m8, m1 ; ah bh
+ vpbroadcastd m11, [base+z_filter_k+4*2+12*0]
+ pshufb m3, m8, m3 ; cl ch
+ vpbroadcastd m12, [base+z_filter_k+4*2+12*1]
+ pshufb m4, m8, m4 ; el dl
+ vpbroadcastd m13, [base+z_filter_k+4*2+12*2]
+ vpermb m5, m5, m8 ; eh dh
+ pmaddubsw m0, m11
+ pmaddubsw m1, m11
+ pmaddubsw m6, m3, m12
+ vpbroadcastd m12, r8m ; max_height
+ pmaddubsw m3, m13
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ packssdw m12, m12
+ paddw m0, m6
+ paddw m1, m3
+ paddw m0, m4
+ paddw m1, m5
+ packsswb m12, m12
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ vpcmpgtb k1, m12, m9 ; y < max_height
+ packuswb m8{k1}, m0, m1
+ ret
+.w64:
+ movu m7, [tlq]
+ test angled, 0x400
+ jnz .w64_main
+ vpbroadcastd m2, [tlq-4]
+ mova m0, [base+z_filter_s1]
+ vbroadcasti32x4 m1, [base+z_filter_s2]
+ vbroadcasti32x4 m3, [base+z_filter_s3]
+ vbroadcasti32x4 m4, [base+z_filter_s4]
+ vpermi2b m0, m7, m2 ; al bl
+ vpbroadcastd m5, [base+pb_63]
+ pminub m5, [base+z_filter_s5]
+ pshufb m1, m7, m1 ; ah bh
+ vpbroadcastd m11, [base+z_filter_k+4*2+12*0]
+ pshufb m3, m7, m3 ; cl ch
+ vpbroadcastd m12, [base+z_filter_k+4*2+12*1]
+ pshufb m4, m7, m4 ; el dl
+ vpbroadcastd m13, [base+z_filter_k+4*2+12*2]
+ vpermb m5, m5, m7 ; eh dh
+ pmaddubsw m0, m11
+ pmaddubsw m1, m11
+ pmaddubsw m6, m3, m12
+ vpbroadcastd m12, r6m
+ pmaddubsw m3, m13
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ mova m9, [pb_0to63]
+ packssdw m12, m12
+ paddw m0, m6
+ paddw m1, m3
+ paddw m0, m4
+ paddw m1, m5
+ packsswb m12, m12
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ vpcmpgtb k1, m12, m9 ; x < max_width
+ packuswb m7{k1}, m0, m1
+ call .filter_left_h64 ; always filter the full 64 pixels for simplicity
+.w64_main:
+ vpbroadcastw m5, dyd
+ vpbroadcastd m9, [tlq-4]
+ rorx r2q, dxq, 62 ; dx << 2
+ pmullw m6, m5, [base+z_ypos_mul1a] ; can overflow, but it doesn't matter as such
+ pmullw m5, [base+z_ypos_mul1b] ; pixels aren't selected from the left edge
+ vpbroadcastw m1, r2d ; xpos
+ mova m3, [base+z_xpos_off2a]
+ mova m4, [base+z_xpos_off2b]
+ mova m12, m1
+ vpbroadcastd m2, [base+pb_1]
+ psrlw m10, m6, 1
+ psrlw m11, m5, 1
+ vpermw m10, m10, m14 ; 64-frac, frac
+ psraw m6, 6
+ vpermw m11, m11, m14
+ psraw m5, 6
+ mov r5d, -(64<<6) ; 63 to avoid top, +1 to avoid topleft
+ packsswb m6, m5
+ mov r3d, 1<<6
+ paddsb m0, m6, m2
+ sub r5d, dxd ; left-only threshold
+ punpcklbw m5, m6, m0 ; base, base+1
+ punpckhbw m6, m0
+.w64_loop:
+ pshufb m17, m1, m2
+ psrlw m0, m1, 3
+ paddb m16, m3, m17
+ vpermw m0, m0, m14
+ paddb m17, m4
+ vpmovw2m k1, m16 ; base_x < 0
+ vpermi2b m16, m7, m9
+ vpmovw2m k2, m17
+ vpermi2b m17, m7, m9
+ pmaddubsw m16, m0
+ pmaddubsw m17, m0
+ add r3d, dxd
+ jge .w64_toponly
+ mova m0, m8
+ vpermt2b m0, m5, m9
+ pmaddubsw m16{k1}, m0, m10
+ mova m0, m8
+ vpermt2b m0, m6, m9
+ pmaddubsw m17{k2}, m0, m11
+.w64_toponly:
+ pmulhrsw m16, m15
+ pmulhrsw m17, m15
+ packuswb m16, m17
+ mova [dstq], m16
+ dec hd
+ jz .w64_end
+ paddw m1, m12
+ add dstq, strideq
+ paddb m5, m2
+ paddb m6, m2
+ cmp r3d, r5d
+ jge .w64_loop
+.w64_leftonly_loop:
+ vpermb m16, m5, m8
+ vpermb m17, m6, m8
+ pmaddubsw m16, m10
+ pmaddubsw m17, m11
+ paddb m5, m2
+ paddb m6, m2
+ pmulhrsw m16, m15
+ pmulhrsw m17, m15
+ packuswb m16, m17
+ mova [dstq], m16
+ add dstq, strideq
+ dec hd
+ jg .w64_leftonly_loop
+.w64_end:
+ RET
+
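+; Z3 directional prediction (angle > 180): only the left edge is used. The
+; left pixels are loaded in reverse order (pb_63to0) so the same base/frac
+; blending as in Z1 can be applied, stepping by dy.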
+cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
+ lea r7, [z_filter_t0]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ lea t0, [dr_intra_derivative+45*2-1]
+ movsxd wq, [base+ipred_z3_8bpc_avx512icl_table+wq*4]
+ sub angled, 180
+ mov dyd, angled
+ neg dyd
+ xor angled, 0x400
+ or dyq, ~0x7e
+ mova m0, [base+pb_63to0]
+ movzx dyd, word [t0+dyq]
+ lea wq, [base+ipred_z3_8bpc_avx512icl_table+wq]
+ movifnidn hd, hm
+ mova m14, [base+z_frac_table]
+ shl dyd, 6
+ vpbroadcastd m15, [base+pw_512]
+ jmp wq
+.w4:
+ cmp angleb, 40
+ jae .w4_no_upsample
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+ lea r3d, [hq+4]
+ call .upsample
+ movshdup m1, [base+z_ypos_off1]
+ vpbroadcastd m6, [base+pb_16]
+ jmp .w4_main2
+.w4_no_upsample:
+ lea r3d, [hq+3]
+ vpbroadcastb m9, r3d
+ vpxord m1, m9, [base+pb_63] {1to16} ; 63 - (h + 4)
+ pmaxub m1, m0
+ vpermb m7, m1, [tlq-64*1]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ vpbroadcastb xm1, angled
+ shr angled, 8
+ vpcmpeqb k1, xm9, [base+z_filter_wh]
+ vpbroadcastd m2, [tlq-3]
+ vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8]
+ kmovw r5d, k1
+ test r5d, r5d
+ jz .w4_main
+ pminub m9, [pb_0to63]
+ call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w8_filter
+ vpermb m7, m9, m0
+.w4_main:
+ movsldup m1, [base+z_ypos_off1]
+ vpbroadcastd m6, [base+pb_8]
+.w4_main2:
+ vpbroadcastw m0, dyd
+ vpbroadcastq m2, [base+z_ypos_mul2a] ; 1..4
+ pmulhuw m2, m0 ; ypos >> 1
+ lea r2, [strideq*3]
+ vpermw m3, m2, m14 ; 64-frac, frac
+ psrlw m2, 5
+ packsswb m2, m2
+ punpcklbw m2, m2
+ paddsb m2, m1 ; base, base+1
+.w4_loop:
+ vpermb m0, m2, m7
+ pmaddubsw m0, m3
+ paddsb m2, m6
+ pmulhrsw m0, m15
+ vpmovwb ym0, m0
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ sub hd, 8
+ jl .w4_end
+ vextracti32x4 xm0, ym0, 1
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.w4_end:
+ RET
+.upsample:
+ xor r3d, 31 ; 31 - (h + imin(w, h))
+ vbroadcasti32x4 ym0, [base+z_xpos_off2a]
+ vpbroadcastb ym7, r3d
+ pmaxub ym7, [base+z3_upsample]
+ vbroadcasti32x4 ym1, [base+z_filter_s4]
+ vpermb ym7, ym7, [tlq-31]
+ vpbroadcastd ym2, [base+pb_m4_36]
+ pshufb ym0, ym7, ym0
+ psrldq ym7, 1
+ pshufb ym1, ym7, ym1
+ pmaddubsw ym0, ym2
+ pmaddubsw ym1, ym2
+ add dyd, dyd
+ paddw ym0, ym1
+ pmulhrsw ym0, ym15
+ packuswb ym0, ym0
+ punpcklbw ym7, ym0
+ ret
+.w8:
+ lea r3d, [angleq+216]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ lea r3d, [hq*2]
+ call .upsample
+ pshufd m1, [base+z_ypos_off1], q0000
+ vpbroadcastd m6, [base+pb_8]
+ jmp .w8_main2
+.w8_no_upsample:
+ mov r3d, 8
+ cmp hd, 4
+ cmove r3d, hd
+ lea r3d, [r3+hq-1]
+ xor r3d, 63 ; 63 - (h + imin(w, h))
+ vpbroadcastb m1, wd
+ pmaxub m1, m0
+ vpermb m7, m1, [tlq-64*1]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w8_main
+ lea r3d, [hq+7]
+ call .filter_strength
+ test r5d, r5d
+ jz .w8_main
+ call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
+ vpermb m7, m10, m0
+.w8_main:
+ movsldup m1, [base+z_ypos_off2]
+ vpbroadcastd m6, [base+pb_4]
+.w8_main2:
+ vpbroadcastw m0, dyd
+ vbroadcasti32x4 m2, [base+z_ypos_mul2a] ; 1..8
+ pmulhuw m2, m0 ; ypos >> 1
+ lea r2, [strideq*3]
+ vpermw m3, m2, m14 ; 64-frac, frac
+ psrlw m2, 5
+ packsswb m2, m2
+ punpcklbw m2, m2
+ paddsb m2, m1 ; base, base+1
+.w8_loop:
+ vpermb m0, m2, m7
+ pmaddubsw m0, m3
+ paddsb m2, m6
+ pmulhrsw m0, m15
+ vpmovwb ym0, m0
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r2 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+.filter_strength:
+ vpbroadcastd m2, [tlq-3]
+.filter_strength2:
+ vpbroadcastb m9, r3d
+ vpbroadcastb ym1, angled
+ shr angled, 8
+ vpcmpeqb k1, ym9, [base+z_filter_wh]
+ mova xm0, [base+z_filter_t0+angleq*8]
+ vpcmpgtb k1{k1}, ym1, ym0
+ pminub m10, m9, [pb_0to63]
+ kmovd r5d, k1
+ ret
+.w16_load:
+ cmp r3d, hd
+ cmovae r3d, hd
+ add r3d, hd
+ mova m7, [tlq-64*1]
+ neg r3d ; -(h + imin(w, h))
+ and r3d, 63
+ vpbroadcastb m1, r3d
+ pmaxub m2, m0, m1
+ cmp hd, 64
+ je .w16_load_h64
+ vpermb m8, m1, m7
+ vpermb m7, m2, m7
+ ret
+.w16_load_h64:
+ vpermb m7, m0, m7
+ vpermb m8, m2, [tlq-64*2]
+ ret
+.w16:
+ mov r3d, 16
+ call .w16_load
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w16_main
+ vpbroadcastd m2, [tlq-3]
+ cmp hd, 64
+ je .w16_filter64
+ lea r3d, [hq+15]
+ call .filter_strength2
+ test r5d, r5d
+ jz .w16_main
+ call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
+ pminub m10, m9, [pb_0to63]
+ vpermb m8, m9, m0
+ vpermb m7, m10, m0
+ jmp .w16_main
+.w16_filter64:
+ vpbroadcastd m13, [base+pb_15]
+ valignq m0, m8, m7, 7
+ pminub m12, m13, [pb_0to63]
+ valignq m11, m8, m7, 1
+ call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
+.w16_main:
+ vbroadcasti32x4 m3, [base+z_ypos_mul2a] ; 1.. 8
+ vbroadcasti32x4 m2, [base+z_ypos_mul2b] ; 9..15
+ vpbroadcastw m0, dyd
+ vpbroadcastd m6, [base+pb_4]
+ pmulhuw m3, m0 ; ypos >> 1
+ pmulhuw m2, m0
+ movshdup m0, [base+z_ypos_off2]
+ lea r2, [strideq*3]
+ vpbroadcastd m1, [base+pb_1]
+ vpermw m4, m3, m14 ; 64-frac, frac
+ psrlw m3, 5
+ vpermw m5, m2, m14
+ psrlw m2, 5
+ packsswb m3, m2
+ paddsb m3, m0
+ paddsb m1, m3
+ punpcklbw m2, m3, m1 ; base, base+1
+ punpckhbw m3, m1
+.w16_loop:
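+; Z3_PERM2 gathers the left pixels at base/base+1 from m7/m8 (vpermt2b),
+; blends them with the (64-frac, frac) weights in m4/m5 and advances the
+; base indices by m6.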
+%macro Z3_PERM2 0
+ mova m0, m7
+ vpermt2b m0, m2, m8
+ mova m1, m7
+ vpermt2b m1, m3, m8
+ pmaddubsw m0, m4
+ pmaddubsw m1, m5
+ paddsb m2, m6
+ paddsb m3, m6
+ pmulhrsw m0, m15
+ pmulhrsw m1, m15
+ packuswb m0, m1
+%endmacro
+ Z3_PERM2
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+r2 ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ mov r3d, 32
+ call .w16_load
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w32_main
+ vpbroadcastd m2, [tlq-3]
+ cmp hd, 64
+ je .w32_filter64
+ lea r3d, [hq+31]
+ vpbroadcastb m9, r3d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w32_filter
+ vpermb m8, m9, m7
+ jmp .w32_main
+.w32_filter64:
+ vpbroadcastd m13, [base+pb_31]
+ valignq m0, m8, m7, 7
+ pminub m12, m13, [pb_0to63]
+ valignq m11, m8, m7, 1
+ call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
+.w32_main:
+ vbroadcasti32x8 m3, [base+z_ypos_mul2a] ; 1.. 8
+ vbroadcasti32x8 m2, [base+z_ypos_mul2b] ; 9..15
+ vpbroadcastw m0, dyd
+ vpbroadcastd m1, [base+pb_1]
+ pmulhuw m3, m0 ; ypos >> 1
+ pmulhuw m2, m0
+ vpbroadcastd m6, [base+pb_2]
+ mova ym0, ym1
+ vpermw m4, m3, m14 ; 64-frac, frac
+ psrlw m3, 5
+ vpermw m5, m2, m14
+ psrlw m2, 5
+ packsswb m3, m2
+ paddsb m3, m0
+ paddsb m1, m3
+ punpcklbw m2, m3, m1 ; base, base+1
+ punpckhbw m3, m1
+.w32_loop:
+ Z3_PERM2
+ vextracti32x8 [dstq+strideq*0], m0, 1
+ mova [dstq+strideq*1], ym0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ mova m7, [tlq-64*1]
+ cmp hd, 64
+ je .w64_h64
+ lea r3d, [hq*2-1]
+ xor r3d, 63 ; -(h + imin(w, h)) & 63
+ vpbroadcastb m1, r3d
+ pmaxub m0, m1
+ vpermb m8, m1, m7
+ jmp .w64_filter
+.w64_h64:
+ vpermb m8, m0, [tlq-64*2]
+.w64_filter:
+ vpermb m7, m0, m7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w64_main
+ lea r3d, [hq-1]
+ vpbroadcastd m2, [tlq-3]
+ vpbroadcastb m13, r3d
+ valignq m0, m8, m7, 7
+ pminub m12, m13, [pb_0to63]
+ valignq m11, m8, m7, 1
+ call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
+.w64_main:
+ vpbroadcastw m2, dyd
+ pmulhuw m3, m2, [base+z_ypos_mul2a]
+ pmulhuw m2, [base+z_ypos_mul2b]
+ vpbroadcastd m6, [base+pb_1]
+ vpermw m4, m3, m14 ; 64-frac, frac
+ psrlw m3, 5
+ vpermw m5, m2, m14
+ psrlw m2, 5
+ packsswb m3, m2
+ paddsb m1, m3, m6
+ punpcklbw m2, m3, m1 ; base, base+1
+ punpckhbw m3, m1
+.w64_loop:
+ Z3_PERM2
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+; The ipred_filter code processes 4x2 blocks in the following order,
+; which increases parallelism compared to doing things row by row.
+; Some redundant blocks are calculated for w > 4.
+; w4 w8 w16 w32
+; 1 1 2 1 2 3 4 1 2 3 4 9 a b c
+; 2 2 3 2 3 4 5 2 3 4 5 a b c d
+; 3 3 4 3 4 5 6 3 4 5 6 b c d e
+; 4 4 5 4 5 6 7 4 5 6 7 c d e f
+; 5 5 6 5 6 7 8 5 6 7 8 d e f g
+; 6 6 7 6 7 8 9 6 7 8 9 e f g h
+; 7 7 8 7 8 9 a 7 8 9 a f g h i
+; ___ 8 ___ 8 9 ___ 8 9 a b ___ 8 9 a b g h i j ___
+; 9 9 a b h i j
+; a b i j
+; b j
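+; Blocks sharing a pass number lie on an anti-diagonal and are computed in
+; parallel within one register; each 4x2 block only depends on its top and
+; left neighbours, which were produced in the previous pass.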
+
+cglobal ipred_filter_8bpc, 4, 7, 14, dst, stride, tl, w, h, flt
+%define base r6-filter_taps
+ lea r6, [filter_taps]
+%ifidn fltd, fltm
+ movzx fltd, fltb
+%else
+ movzx fltd, byte fltm
+%endif
+ vpbroadcastd xmm2, [tlq+1] ; t0 t0 t0 t0
+ movifnidn hd, hm
+ shl fltd, 6
+ vpbroadcastd m6, [base+pd_8]
+ vpbroadcastd xmm3, [tlq-2] ; l1 l0 tl __
+ vbroadcasti32x4 m7, [r6+fltq+16*0] ; p1 p2 p3 p4
+ vbroadcasti32x4 m8, [r6+fltq+16*1]
+ vbroadcasti32x4 m9, [r6+fltq+16*2] ; p6 p5 p0 __
+ vbroadcasti32x4 m10, [r6+fltq+16*3]
+ mova xmm0, xm6
+ vpdpbusd xmm0, xmm2, xm7
+ mova xmm1, xm6
+ vpdpbusd xmm1, xmm2, xm8
+ vpdpbusd xmm0, xmm3, xm9
+ vpdpbusd xmm1, xmm3, xm10
+ packssdw xmm0, xmm1
+ cmp wd, 8
+ jb .w4
+ vpbroadcastd ym2, [tlq+5]
+ mova m11, [base+filter_perm]
+ mov r5, 0xffffffffffff000f
+ psrldq xmm2, 1 ; __ t0
+ kmovq k1, r5 ; 0x000f
+ psraw xm5, xmm0, 4
+ packuswb xmm2, xm5 ; __ t0 a0 b0
+ pshufd ym2{k1}, ymm2, q3333 ; b0 b0 b0 b0 t1 t1 t1 t1
+ je .w8
+ kxnorb k3, k3, k3 ; 0x00ff
+ vpbroadcastd xm3, [tlq-4]
+ kandnq k2, k3, k1 ; 0xffffffffffff0000
+ vpermb ym3{k2}, ym11, ymm2 ; l3 l2 l1 __ b3 a3 t3 __
+ mova ym0, ym6
+ vpdpbusd ym0, ym2, ym7
+ mova ym1, ym6
+ vpdpbusd ym1, ym2, ym8
+ pshufb ym5{k2}, ym2, ym11 ; a0 b0 __ t0
+ vpbroadcastd m2, [tlq+9]
+ vpdpbusd ym0, ym3, ym9
+ vpdpbusd ym1, ym3, ym10
+ vpbroadcastd xm3, [tlq-6] ; l5 l4 l3 __
+ kunpckbw k4, k1, k3 ; 0x0fff
+ packssdw ym0, ym1
+ psraw ym0, 4 ; a0 d0 a1 b1
+ packuswb ym5, ym0 ; a0 b0 c0 d0 __ t1 a1 b1
+ pshufd m2{k3}, m5, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 t2 t2 t2 t2
+ vpermb m3{k2}, m11, m5 ; l5 l4 l3 __ d3 c3 b3 __ b7 a7 t7 __
+ mova m4, m6
+ vpdpbusd m4, m2, m7
+ mova m1, m6
+ vpdpbusd m1, m2, m8
+ psrldq m0, m2, 1 ; __ d0 __ b0 __ t0
+ vpbroadcastd m2, [tlq+13]
+ vpdpbusd m4, m3, m9
+ vpdpbusd m1, m3, m10
+ mova m12, [base+filter_end]
+ lea r5d, [hq-6]
+ mov r6, dstq
+ cmovp hd, r5d ; w == 16 ? h : h - 6
+ packssdw m4, m1
+ psraw m4, 4 ; e0 f0 c1 d1 a2 b2
+ packuswb m0, m4 ; __ d0 e0 f0 __ b1 c1 d1 __ t2 a2 b2
+ pshufd m2{k4}, m0, q3333 ; f0 f0 f0 f0 d1 d1 d1 d1 b2 b2 b2 b2 t3 t3 t3 t3
+.w16_loop:
+ vpbroadcastd xm3, [tlq-8]
+ vpermb m3{k2}, m11, m0 ; l7 l6 l5 __ f3 e3 d3 __ d7 c7 b7 __ bb ab tb __
+ mova m1, m6
+ vpdpbusd m1, m2, m7
+ mova m0, m6
+ vpdpbusd m0, m2, m8
+ sub tlq, 2
+ vpdpbusd m1, m3, m9
+ vpdpbusd m0, m3, m10
+ packssdw m1, m0
+ mova m0, m4
+ psraw m4, m1, 4 ; g0 h0 e1 f1 c2 d2 a3 b3
+ packuswb m0, m4 ; e0 f0 g0 h0 c1 d1 e1 f1 a2 b2 c2 d2 __ __ a3 b3
+ pshufd m2, m0, q3333 ; h0 h0 h0 h0 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3
+ vpermt2d m5, m12, m0 ; c0 d0 e0 f0 __ __ c1 d1 a0 a1 a2 a3 b0 b1 b2 b3
+ vextracti32x4 [dstq+strideq*0], m5, 2
+ vextracti32x4 [dstq+strideq*1], m5, 3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ cmp wd, 16
+ je .ret
+ mova xm13, [filter_perm+16]
+ mova xmm3, [r6+strideq*0]
+ punpckhdq xmm3, [r6+strideq*1]
+ vpbroadcastd m2{k1}, [tlq+r5+17] ; t4 t4 t4 t4 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3
+ pinsrb xm3, xmm3, [tlq+r5+16], 7
+ pshufb xm3, xm13
+ vpermb m3{k2}, m11, m0 ; bf af tf __ h3 g3 f3 __ f7 e7 d7 __ db cb bb __
+ mova m0, m6
+ vpdpbusd m0, m2, m7
+ mova m1, m6
+ vpdpbusd m1, m2, m8
+ kunpckbw k5, k3, k1 ; 0xff0f
+ lea r3, [strideq*3]
+ vpdpbusd m0, m3, m9
+ vpdpbusd m1, m3, m10
+ packssdw m0, m1
+ psraw m0, 4 ; a4 b4 g1 h1 e2 f2 c3 d3
+ packuswb m4, m0 ; g0 h0 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3
+ vpblendmb m1{k3}, m4, m2 ; __ t4 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3
+ vpbroadcastd ym2, [tlq+r5+21]
+ pshufd m2{k5}, m4, q3333 ; b4 b4 b4 b4 t5 t5 t5 t5 f2 f2 f2 f2 d3 d3 d3 d3
+ vpermt2d m5, m12, m4 ; e0 f0 g0 h0 __ __ e1 f1 c0 c1 c2 c3 d0 d1 d2 d3
+ vextracti32x4 [dstq+strideq*0], m5, 2
+ vextracti32x4 [dstq+strideq*1], m5, 3
+ punpckhqdq xmm3, [r6+r3]
+ pinsrb xmm3, [r6+strideq*2+15], 11
+ pshufb xm3, xmm3, xm13
+ vpermb m3{k2}, m11, m1 ; df cf bf __ bj aj tj __ h7 g7 f7 __ fb eb db __
+ mova m4, m6
+ vpdpbusd m4, m2, m7
+ mova m1, m6
+ vpdpbusd m1, m2, m8
+ kxnord k3, k3, k4 ; 0xfffff0ff
+ lea r4, [strideq*5]
+ vpdpbusd m4, m3, m9
+ vpdpbusd m1, m3, m10
+ packssdw m4, m1
+ psraw m4, 4 ; c4 d4 a5 b5 g2 h2 e3 f3
+ packuswb m0, m4 ; a4 b4 c4 d4 g1 h1 a5 b5 e2 f2 g2 h2 __ __ e3 f3
+ vpblendmw m1{k3}, m2, m0 ; a4 b4 c4 d4 __ t5 a5 b5 e2 f2 g2 h2 __ __ e3 f3
+ vpbroadcastd m2, [tlq+r5+25]
+ pshufd m2{k3}, m0, q3333 ; d4 d4 d4 d4 b5 b5 b5 b5 t6 t6 t6 t6 f3 f3 f3 f3
+ vpermt2d m5, m12, m0 ; g0 h0 a4 b4 __ __ g1 h1 e0 e1 e2 e3 f0 f1 f2 f3
+ vextracti32x4 [dstq+strideq*2], m5, 2
+ vextracti32x4 [dstq+r3 ], m5, 3
+ punpckhqdq xmm3, [r6+r4]
+ pinsrb xmm3, [r6+strideq*4+15], 11
+ pshufb xm3, xmm3, xm13
+ vpermb m3{k2}, m11, m1 ; ff ef df __ dj cj bj __ bn an tn __ hb hb fb __
+ mova m0, m6
+ vpdpbusd m0, m2, m7
+ mova m1, m6
+ vpdpbusd m1, m2, m8
+ kunpckwd k1, k1, k2 ; 0x000f0000
+ vpdpbusd m0, m3, m9
+ vpdpbusd m1, m3, m10
+ packssdw m0, m1
+ psraw m0, 4 ; e4 f4 c5 d5 a6 b6 g3 h3
+ packuswb m4, m0 ; c4 d4 e4 f4 a5 b5 c5 d5 g2 h2 a6 b6 __ __ g3 h3
+ vpblendmw m1{k1}, m4, m2 ; c4 d4 e4 f4 a5 b5 c5 d5 __ t6 a6 b6 __ __ g3 h3
+ vpbroadcastd m2, [tlq+r5+29]
+ pshufd m2{k4}, m4, q3333 ; f4 f4 f4 f4 d5 d5 d5 d5 b6 b6 b6 b6 t7 t7 t7 t7
+ vpermt2d m5, m12, m4 ; a4 b4 c4 d4 __ __ a5 b5 g0 g1 g2 g3 h0 h1 h2 h3
+ vextracti32x4 [dstq+strideq*4], m5, 2
+ vextracti32x4 [dstq+r4 ], m5, 3
+ lea r0, [strideq+r3*2]
+.w32_loop:
+ punpckhqdq xmm3, [r6+r0]
+ pinsrb xmm3, [r6+r3*2+15], 11
+ pshufb xm3, xmm3, xm13
+ vpermb m3{k2}, m11, m1 ; hf gf ff __ fj ej dj __ dn cn bn __ br ar tr __
+.w32_loop_tail:
+ mova m4, m6
+ vpdpbusd m4, m2, m7
+ mova m1, m6
+ vpdpbusd m1, m2, m8
+ vpdpbusd m4, m3, m9
+ vpdpbusd m1, m3, m10
+ packssdw m4, m1
+ mova m1, m0
+ psraw m0, m4, 4 ; g4 h4 e5 f5 c6 d6 a7 b7
+ packuswb m1, m0 ; e4 f4 g4 h4 c5 d5 e5 f5 a6 b6 c6 d6 __ __ a7 b7
+ pshufd m2, m1, q3333 ; h4 h4 h4 h4 f5 f5 f5 f5 d6 d6 d6 d6 b7 b7 b7 b7
+ vpermt2d m5, m12, m1 ; c4 d4 e4 f4 __ __ c5 d5 a4 a5 a6 a7 b4 b5 b6 b7
+ vextracti32x4 [r6+strideq*0+16], m5, 2
+ vextracti32x4 [r6+strideq*1+16], m5, 3
+ lea r6, [r6+strideq*2]
+ sub r5d, 2
+ jg .w32_loop
+ vpermb m3, m11, m1
+ cmp r5d, -6
+ jg .w32_loop_tail
+.ret:
+ RET
+.w8:
+ vpermb ym3, ym11, ymm2
+.w8_loop:
+ vpbroadcastd ym3{k1}, [tlq-4] ; l3 l2 l1 __ b3 a3 t3 __
+ mova ym0, ym6
+ vpdpbusd ym0, ym2, ym7
+ mova ym1, ym6
+ vpdpbusd ym1, ym2, ym8
+ sub tlq, 2
+ vpdpbusd ym0, ym3, ym9
+ vpdpbusd ym1, ym3, ym10
+ mova ym3, ym5
+ packssdw ym0, ym1
+ psraw ym5, ym0, 4 ; c0 d0 a1 b1
+ packuswb ym3, ym5 ; a0 b0 c0 d0 __ __ a1 b1
+ pshufd ym2, ym3, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1
+ vpermb ym3, ym11, ym3 ; a0 a1 b0 b1
+ movq [dstq+strideq*0], xm3
+ movhps [dstq+strideq*1], xm3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w4_loop:
+ vpbroadcastd xmm3, [tlq-4] ; l3 l2 l1 __
+ mova xmm0, xm6
+ vpdpbusd xmm0, xmm2, xm7
+ mova xmm1, xm6
+ vpdpbusd xmm1, xmm2, xm8
+ sub tlq, 2
+ vpdpbusd xmm0, xmm3, xm9
+ vpdpbusd xmm1, xmm3, xm10
+ packssdw xmm0, xmm1
+.w4:
+ psraw xmm0, 4 ; a0 b0
+ packuswb xmm0, xmm0
+ movd [dstq+strideq*0], xmm0
+ pshufd xmm2, xmm0, q1111 ; b0 b0 b0 b0
+ movd [dstq+strideq*1], xmm2
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/ipred_sse.asm b/third_party/dav1d/src/x86/ipred_sse.asm
new file mode 100644
index 0000000000..976f33a24b
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred_sse.asm
@@ -0,0 +1,5408 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+%macro SMOOTH_WEIGHT_TABLE 1-*
+ %rep %0
+ db %1-128, 127-%1
+ %rotate 1
+ %endrep
+%endmacro
+
+; sm_weights[], but modified to precalculate x and 256-x with offsets to
+; enable efficient use of pmaddubsw (which requires signed values)
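+; e.g. weight 149 is stored as the signed byte pair (149-128, 127-149) = (21, -22)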
+smooth_weights: SMOOTH_WEIGHT_TABLE \
+ 0, 0, 255, 128, 255, 149, 85, 64, \
+ 255, 197, 146, 105, 73, 50, 37, 32, \
+ 255, 225, 196, 170, 145, 123, 102, 84, \
+ 68, 54, 43, 33, 26, 20, 17, 16, \
+ 255, 240, 225, 210, 196, 182, 169, 157, \
+ 145, 133, 122, 111, 101, 92, 83, 74, \
+ 66, 59, 52, 45, 39, 34, 29, 25, \
+ 21, 17, 14, 12, 10, 9, 8, 8, \
+ 255, 248, 240, 233, 225, 218, 210, 203, \
+ 196, 189, 182, 176, 169, 163, 156, 150, \
+ 144, 138, 133, 127, 121, 116, 111, 106, \
+ 101, 96, 91, 86, 82, 77, 73, 69, \
+ 65, 61, 57, 54, 50, 47, 44, 41, \
+ 38, 35, 32, 29, 27, 25, 22, 20, \
+ 18, 16, 15, 13, 12, 10, 9, 8, \
+ 7, 6, 6, 5, 5, 4, 4, 4
+
+ipred_v_shuf: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7
+ipred_h_shuf: db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
+ipred_paeth_shuf: db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
+z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8
+z_transpose4: db 8, 12, 0, 4, 9, 13, 1, 5, 10, 14, 2, 6, 11, 15, 3, 7
+z3_shuf: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+z3_shuf_h4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8
+filter_shuf1: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1
+filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1
+z_filter_wh4: db 7, 7, 19, 7,
+z_filter_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39
+pd_32768: dd 32768
+z3_filter_k_tail: db 64, 0, 64, 0, 64, 0, 56, 8
+z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
+z3_base_inc: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64
+z_filter_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1
+z_filter_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15
+ db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3
+z_filter_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0
+z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
+ db 7, 8, 8, 9, 9, 10, 10, 11
+z_filter_k_tail: db 0, 64, 0, 64, 8, 56, 0, 64
+z2_h_shuf: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11
+z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8
+z2_dy_offset: dw 88*64, 88*64, 87*64, 87*64
+pw_m1to4: dw -1, -2, -3, -4
+z_filter_k: times 4 db 0, 16
+ times 4 db 0, 20
+ times 4 db 8, 16
+ times 4 db 32, 16
+ times 4 db 24, 20
+ times 4 db 16, 16
+ times 4 db 0, 0
+ times 4 db 0, 0
+pw_8: times 8 db 8, 0
+pb_3: times 16 db 3
+pb_16: times 16 db 16
+pw_62: times 8 dw 62
+pw_64: times 8 dw 64
+pw_256: times 8 dw 256
+pw_512: times 8 dw 512
+pw_m256: times 8 dw -256
+pb_2: times 8 db 2
+pb_4: times 8 db 4
+pb_8: times 8 db 8
+pb_128: times 8 db 128
+pb_m16: times 8 db -16
+pw_128: times 4 dw 128
+pw_255: times 4 dw 255
+pb_36_m4: times 4 db 36, -4
+pb_127_m127: times 4 db 127, -127
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
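+; the table symbol is biased by -2*4 so that it can be indexed directly with
+; tzcnt(width)*4 (the smallest width is 4, i.e. tzcnt(w) >= 2)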
+
+%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
+%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4)
+
+JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64
+JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z2, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3, ssse3, h4, h8, h16, h32, h64
+JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
+ s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32
+JMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32
+
+cextern dr_intra_derivative
+cextern filter_intra_taps
+
+SECTION .text
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
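+; Horizontal prediction: every pixel in row y is set to the left neighbour
+; of that row.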
+%macro IPRED_SET 3 ; width, dst row offset, pshuflw_imm8
+ pshuflw m1, m0, %3 ; broadcast one left pixel to 8 bytes
+ punpcklqdq m1, m1
+ mova [dstq + %2], m1
+%if %1 > 16
+ mova [dstq + 16 + %2], m1
+%endif
+%if %1 > 32
+ mova [dstq + 32 + %2], m1
+ mova [dstq + 48 + %2], m1
+%endif
+%endmacro
+
+%macro IPRED_H 1 ; width
+ sub tlq, 4
+ movd m0, [tlq] ; get 4 bytes of topleft data
+ punpcklbw m0, m0 ; duplicate each byte
+%if %1 == 4
+ pshuflw m1, m0, q2233
+ movd [dstq+strideq*0], m1
+ psrlq m1, 32
+ movd [dstq+strideq*1], m1
+ pshuflw m0, m0, q0011
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+stride3q ], m0
+
+%elif %1 == 8
+ punpcklwd m0, m0
+ punpckhdq m1, m0, m0
+ punpckldq m0, m0
+ movq [dstq+strideq*1], m1
+ movhps [dstq+strideq*0], m1
+ movq [dstq+stride3q ], m0
+ movhps [dstq+strideq*2], m0
+%else
+ IPRED_SET %1, 0, q3333
+ IPRED_SET %1, strideq, q2222
+ IPRED_SET %1, strideq*2, q1111
+ IPRED_SET %1, stride3q, q0000
+%endif
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w%1
+ RET
+%endmacro
+
+INIT_XMM ssse3
+cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_h_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ IPRED_H 4
+.w8:
+ IPRED_H 8
+.w16:
+ IPRED_H 16
+.w32:
+ IPRED_H 32
+.w64:
+ IPRED_H 64
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
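+; Vertical prediction: the top row is loaded once and copied to every output
+; row via the shared dc splat store loops.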
+cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_splat_ssse3_table
+ tzcnt wd, wm
+ movu m0, [tlq+ 1]
+ movu m1, [tlq+17]
+ movu m2, [tlq+33]
+ movu m3, [tlq+49]
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd m4, r5d
+ tzcnt r5d, r5d
+ movd m5, r5d
+ LEA r5, ipred_dc_ssse3_table
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+20]
+ pcmpeqd m3, m3
+ psrlw m4, 1 ; dc = (width + height) >> 1;
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movd m0, [tlq-4]
+ pmaddubsw m0, m3
+ jmp wq
+.w4:
+ movd m1, [tlq+1]
+ pmaddubsw m1, m3
+ psubw m0, m4
+ paddw m0, m1
+ pmaddwd m0, m3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw m0, 3 ; dc >>= ctz(width + height);
+ jmp .w4_end
+.w4_mul:
+ punpckhqdq m1, m0, m0
+ paddw m0, m1
+ psrlq m1, m0, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mov r6d, 0x5556
+ mov r2d, 0x3334
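+ ; 0x5556 ~= (1<<16)/3 and 0x3334 ~= (1<<16)/5; pmulhuw by one of these
+ ; completes the division by (w+h) when w+h is 3 or 5 times a power of two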
+ test hd, 8
+ cmovz r6d, r2d
+ movd m5, r6d
+ pmulhuw m0, m5
+.w4_end:
+ pxor m1, m1
+ pshufb m0, m1
+.s4:
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+ALIGN function_align
+.h8:
+ movq m0, [tlq-8]
+ pmaddubsw m0, m3
+ jmp wq
+.w8:
+ movq m1, [tlq+1]
+ pmaddubsw m1, m3
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ paddw m0, m1
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w8_end:
+ pxor m1, m1
+ pshufb m0, m1
+.s8:
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-16]
+ pmaddubsw m0, m3
+ jmp wq
+.w16:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 8|32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w16_end:
+ pxor m1, m1
+ pshufb m0, m1
+.s16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-32]
+ pmaddubsw m0, m3
+ mova m2, [tlq-16]
+ pmaddubsw m2, m3
+ paddw m0, m2
+ jmp wq
+.w32:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ movu m2, [tlq+17]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 64|16
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w32_end:
+ pxor m1, m1
+ pshufb m0, m1
+ mova m1, m0
+.s32:
+ mova [dstq], m0
+ mova [dstq+16], m1
+ mova [dstq+strideq], m0
+ mova [dstq+strideq+16], m1
+ mova [dstq+strideq*2], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q], m0
+ mova [dstq+stride3q+16], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+ALIGN function_align
+.h64:
+ mova m0, [tlq-64]
+ mova m1, [tlq-48]
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ paddw m0, m1
+ mova m1, [tlq-32]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ mova m1, [tlq-16]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ jmp wq
+.w64:
+ movu m1, [tlq+ 1]
+ movu m2, [tlq+17]
+ pmaddubsw m1, m3
+ pmaddubsw m2, m3
+ paddw m1, m2
+ movu m2, [tlq+33]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ movu m2, [tlq+49]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 64
+ je .w64_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w64_end:
+ pxor m1, m1
+ pshufb m0, m1
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+.s64:
+ mova [dstq], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ mova [dstq+strideq], m0
+ mova [dstq+strideq+16], m1
+ mova [dstq+strideq+32], m2
+ mova [dstq+strideq+48], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s64
+ RET
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_left_ssse3_table
+ mov hd, hm ; zero upper half
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movu m0, [tlq]
+ movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
+ movd m2, r6d
+ psrld m3, m2
+ movsxd r6, [r5+r6*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, r5
+ add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ movu m1, [tlq+48] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+ movu m1, [tlq+32] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+.h32:
+ movu m1, [tlq+16] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+.h16:
+ pshufd m1, m0, q3232 ; psrlq m1, m0, 16
+ paddw m0, m1
+.h8:
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+.h4:
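+ ; m3 = 32768 >> log2(n), n being the number of pixels summed (h when
+ ; coming from dc_left, w when coming from dc_top), so the pmulhrsw below
+ ; is a rounded average: dc = (sum + n/2) >> log2(n)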
+ pmaddwd m0, m2
+ pmulhrsw m0, m3
+ lea stride3q, [strideq*3]
+ pxor m1, m1
+ pshufb m0, m1
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_splat_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ movddup m0, [r5-ipred_dc_splat_ssse3_table+pb_128]
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
+ LEA r5, ipred_dc_left_ssse3_table
+ tzcnt wd, wm
+ inc tlq
+ movu m0, [tlq]
+ movifnidn hd, hm
+ movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
+ movd m2, wd
+ psrld m3, m2
+ movsxd r6, [r5+wq*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, r5
+ add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
+ ; w * a = (w - 128) * a + 128 * a
+ ; (256 - w) * b = (127 - w) * b + 129 * b
+ ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b]
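+ ; e.g. w = 64, a = 200, b = 100: w*a + (256-w)*b = 12800 + 19200 = 32000,
+ ; while (w-128)*a + (127-w)*b = -6500 and 128*a + 129*b = 38500,
+ ; which also add up to 32000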
+ pmaddubsw m6, m%3, m%1
+ pmaddubsw m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b
+ paddw m6, m%5
+ paddw m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128]
+ psrlw m6, 8
+ psrlw m0, 8
+ packuswb m6, m0
+%endmacro
+
+cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights
+%define base r6-ipred_smooth_v_ssse3_table
+ LEA r6, ipred_smooth_v_ssse3_table
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ movddup m0, [base+pb_127_m127]
+ movddup m1, [base+pw_128]
+ lea weightsq, [base+smooth_weights+hq*4]
+ neg hq
+ movd m5, [tlq+hq]
+ pxor m2, m2
+ pshufb m5, m2
+ add wq, r6
+ jmp wq
+.w4:
+ movd m2, [tlq+1]
+ punpckldq m2, m2
+ punpcklbw m2, m5 ; top, bottom
+ lea r3, [strideq*3]
+ mova m4, [base+ipred_v_shuf]
+ mova m5, m4
+ punpckldq m4, m4
+ punpckhdq m5, m5
+ pmaddubsw m3, m2, m0 ; m3: 127 * top - 127 * bottom
+ paddw m1, m2 ; m1: 1 * top + 256 * bottom + 128, overflow is ok
+ paddw m3, m1 ; m3: 128 * top + 129 * bottom + 128
+.w4_loop:
+ movu m1, [weightsq+hq*2]
+ pshufb m0, m1, m4 ; m2, m3, m4 and m5 must stay unchanged in the loop
+ pshufb m1, m5
+ SMOOTH 0, 1, 2, 2, 3, 3
+ movd [dstq+strideq*0], m6
+ pshuflw m1, m6, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m6, m6
+ movd [dstq+strideq*2], m6
+ psrlq m6, 32
+ movd [dstq+r3 ], m6
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ movq m2, [tlq+1]
+ punpcklbw m2, m5
+ mova m5, [base+ipred_v_shuf]
+ lea r3, [strideq*3]
+ pshufd m4, m5, q0000
+ pshufd m5, m5, q1111
+ pmaddubsw m3, m2, m0
+ paddw m1, m2
+ paddw m3, m1 ; m3 is the output for the loop
+.w8_loop:
+ movq m1, [weightsq+hq*2]
+ pshufb m0, m1, m4
+ pshufb m1, m5
+ SMOOTH 0, 1, 2, 2, 3, 3
+ movq [dstq+strideq*0], m6
+ movhps [dstq+strideq*1], m6
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ movu m3, [tlq+1]
+ punpcklbw m2, m3, m5
+ punpckhbw m3, m5
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1 ; m4 and m5 are the outputs for the loop
+.w16_loop:
+ movd m1, [weightsq+hq*2]
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq], m6
+ add dstq, strideq
+ add hq, 1
+ jl .w16_loop
+ RET
+ALIGN function_align
+.w32:
+%if WIN64
+ movaps [rsp+24], xmm7
+ %define xmm_regs_used 8
+%endif
+ mova m7, m5
+.w32_loop_init:
+ mov r3d, 2
+.w32_loop:
+ movddup m0, [base+pb_127_m127]
+ movddup m1, [base+pw_128]
+ movu m3, [tlq+1]
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+ movd m1, [weightsq+hq*2]
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq], m6
+ add tlq, 16
+ add dstq, 16
+ dec r3d
+ jg .w32_loop
+ lea dstq, [dstq-32+strideq]
+ sub tlq, 32
+ add hq, 1
+ jl .w32_loop_init
+ RET
+ALIGN function_align
+.w64:
+%if WIN64
+ movaps [rsp+24], xmm7
+ %define xmm_regs_used 8
+%endif
+ mova m7, m5
+.w64_loop_init:
+ mov r3d, 4
+.w64_loop:
+ movddup m0, [base+pb_127_m127]
+ movddup m1, [base+pw_128]
+ movu m3, [tlq+1]
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+ movd m1, [weightsq+hq*2]
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq], m6
+ add tlq, 16
+ add dstq, 16
+ dec r3d
+ jg .w64_loop
+ lea dstq, [dstq-64+strideq]
+ sub tlq, 64
+ add hq, 1
+ jl .w64_loop_init
+ RET
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h
+%define base r6-ipred_smooth_h_ssse3_table
+ LEA r6, ipred_smooth_h_ssse3_table
+ mov wd, wm
+ movd m3, [tlq+wq]
+ pxor m1, m1
+ pshufb m3, m1 ; right
+ tzcnt wd, wd
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ movddup m4, [base+pb_127_m127]
+ movddup m5, [base+pw_128]
+ add wq, r6
+ jmp wq
+.w4:
+ movddup m6, [base+smooth_weights+4*2]
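+ ; smooth_weights stores each weight as the byte pair (w - 128, 127 - w),
+ ; so one pmaddubsw against interleaved (left, right) bytes evaluates the
+ ; first bracketed term of the SMOOTH identity above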
+ mova m7, [base+ipred_h_shuf]
+ sub tlq, 4
+ sub tlq, hq
+ lea r3, [strideq*3]
+.w4_loop:
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m7
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m6
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+r3 ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ mova m6, [base+smooth_weights+8*2]
+ mova m7, [base+ipred_h_shuf]
+ sub tlq, 4
+ sub tlq, hq
+ punpckldq m7, m7
+.w8_loop:
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m7
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m6
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ mova m6, [base+smooth_weights+16*2]
+ mova m7, [base+smooth_weights+16*3]
+ sub tlq, 1
+ sub tlq, hq
+.w16_loop:
+ pxor m1, m1
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m1
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m7
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq], m0
+ lea dstq, [dstq+strideq]
+ sub hd, 1
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ sub tlq, 1
+ sub tlq, hq
+ pxor m6, m6
+.w32_loop_init:
+ mov r5, 2
+ lea r3, [base+smooth_weights+16*4]
+.w32_loop:
+ mova m7, [r3]
+ add r3, 16
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m6
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m7
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ mova m7, [r3]
+ add r3, 16
+ pmaddubsw m2, m7
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, 16
+ dec r5
+ jg .w32_loop
+ lea dstq, [dstq-32+strideq]
+ sub hd, 1
+ jg .w32_loop_init
+ RET
+ALIGN function_align
+.w64:
+ sub tlq, 1
+ sub tlq, hq
+ pxor m6, m6
+.w64_loop_init:
+ mov r5, 4
+ lea r3, [base+smooth_weights+16*8]
+.w64_loop:
+ mova m7, [r3]
+ add r3, 16
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m6
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m7
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ mova m7, [r3]
+ add r3, 16
+ pmaddubsw m2, m7
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, 16
+ dec r5
+ jg .w64_loop
+ lea dstq, [dstq-64+strideq]
+ sub hd, 1
+ jg .w64_loop_init
+ RET
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+%macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3
+ pmaddubsw m6, m%3, m%1
+ mova m0, m6
+ pmaddubsw m6, m%4, m%2
+ mova m1, m6
+%ifnum %5
+ paddw m0, m%5
+%else
+ paddw m0, %5
+%endif
+%ifnum %6
+ paddw m1, m%6
+%else
+ paddw m1, %6
+%endif
+%ifnum %7
+%else
+ mova m3, %7
+%endif
+ pavgw m0, m2
+ pavgw m1, m3
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+%endmacro
+
+%macro SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5]
+ mova m1, [rsp+16*%1] ; top
+ punpckhbw m6, m1, m0 ; top, bottom
+ punpcklbw m1, m0 ; top, bottom
+ pmaddubsw m2, m1, m5
+ mova [rsp+16*%2], m1
+ paddw m1, m3 ; 1 * top + 256 * bottom + 255
+ paddw m2, m1 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*%3], m2
+ pmaddubsw m2, m6, m5
+ mova [rsp+16*%4], m6
+ paddw m6, m3 ; 1 * top + 256 * bottom + 255
+ paddw m2, m6 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*%5], m2
+ movd m1, [tlq+hq] ; left
+ pshufb m1, [base+pb_3] ; topleft[-(1 + y)]
+ punpcklbw m1, m4 ; left, right
+ pmaddubsw m2, m1, m5 ; 127 * left - 127 * right
+ paddw m2, m1 ; 128 * left + 129 * right
+ mova m3, m2
+ pmaddubsw m0, m1, %6 ; weights_hor = &dav1d_sm_weights[width];
+ pmaddubsw m1, %7
+ paddw m2, m3, m0
+ paddw m3, m1
+ movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
+ mova m7, [rsp+16*%9]
+ pshufb m1, m7
+ mova [rsp+16*%8], m3
+ mova m4, [rsp+16*%2]
+ mova m5, [rsp+16*%3]
+ mova m3, [rsp+16*%4]
+ mova m7, [rsp+16*%5]
+ SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8]
+ mova [dstq], m0
+ movddup m3, [base+pw_255] ; recovery
+ mova m0, [rsp+16*%10] ; recovery
+ mova m4, [rsp+16*%11] ; recovery
+ mova m5, [rsp+16*%12] ; recovery
+%endmacro
+
+cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights
+%define base r6-ipred_smooth_ssse3_table
+ mov wd, wm
+ mov hd, hm
+ LEA r6, ipred_smooth_ssse3_table
+ movd m4, [tlq+wq] ; right
+ pxor m2, m2
+ pshufb m4, m2
+ tzcnt wd, wd
+ mov r5, tlq
+ sub r5, hq
+ movsxd wq, [r6+wq*4]
+ movddup m5, [base+pb_127_m127]
+ movd m0, [r5]
+ pshufb m0, m2 ; bottom
+ movddup m3, [base+pw_255]
+ add wq, r6
+ lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height]
+ jmp wq
+.w4:
+ mova m7, [base+ipred_v_shuf]
+ movd m1, [tlq+1] ; left
+ pshufd m1, m1, q0000
+ sub tlq, 4
+ lea r3, [strideq*3]
+ sub tlq, hq
+ punpcklbw m1, m0 ; top, bottom
+ pshufd m6, m7, q1100
+ pshufd m7, m7, q3322
+ pmaddubsw m2, m1, m5
+ paddw m3, m1 ; 1 * top + 256 * bottom + 255
+ paddw m2, m3 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ movq m1, [base+smooth_weights+4*2] ; weights_hor = &dav1d_sm_weights[width];
+ punpcklqdq m1, m1
+ mova [rsp+16*2], m1
+ mova [rsp+16*3], m4
+ mova [rsp+16*4], m6
+ mova [rsp+16*5], m5
+.w4_loop:
+ movd m1, [tlq+hq] ; left
+ pshufb m1, [base+ipred_h_shuf]
+ punpcklbw m0, m1, m4 ; left, right
+ punpckhbw m1, m4
+ pmaddubsw m2, m0, m5 ; 127 * left - 127 * right
+ pmaddubsw m3, m1, m5
+ paddw m2, m0 ; 128 * left + 129 * right
+ paddw m3, m1
+ mova m4, [rsp+16*2]
+ pmaddubsw m0, m4
+ pmaddubsw m1, m4
+ paddw m2, m0
+ paddw m3, m1
+ movq m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
+ add v_weightsq, 8
+ pshufb m0, m1, m6
+ pshufb m1, m7
+ mova m4, [rsp+16*0]
+ mova m5, [rsp+16*1]
+ SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
+ mova m4, [rsp+16*3]
+ mova m6, [rsp+16*4]
+ mova m5, [rsp+16*5]
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+r3 ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ mova m7, [base+ipred_v_shuf]
+ movq m1, [tlq+1] ; left
+ punpcklqdq m1, m1
+ sub tlq, 4
+ sub tlq, hq
+ punpcklbw m1, m0
+ pshufd m6, m7, q0000
+ pshufd m7, m7, q1111
+ pmaddubsw m2, m1, m5
+ paddw m3, m1
+ paddw m2, m3
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ mova m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width];
+ mova [rsp+16*2], m1
+ mova [rsp+16*3], m4
+ mova [rsp+16*4], m6
+ mova [rsp+16*5], m5
+.w8_loop:
+ movd m1, [tlq+hq] ; left
+ pshufb m1, [base+ipred_h_shuf]
+ pshufd m1, m1, q1100
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ pmaddubsw m2, m0, m5
+ pmaddubsw m3, m1, m5
+ paddw m2, m0
+ paddw m3, m1
+ mova m4, [rsp+16*2]
+ pmaddubsw m0, m4
+ pmaddubsw m1, m4
+ paddw m2, m0
+ paddw m3, m1
+ movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
+ add v_weightsq, 4
+ pshufb m0, m1, m6
+ pshufb m1, m7
+ mova m4, [rsp+16*0]
+ mova m5, [rsp+16*1]
+ SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
+ mova m4, [rsp+16*3]
+ mova m6, [rsp+16*4]
+ mova m5, [rsp+16*5]
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ mova m7, [base+ipred_v_shuf]
+ movu m1, [tlq+1] ; left
+ sub tlq, 4
+ sub tlq, hq
+ punpckhbw m6, m1, m0 ; top, bottom
+ punpcklbw m1, m0 ; top, bottom
+ pshufd m7, m7, q0000
+ mova [rsp+16*2], m7
+ pmaddubsw m2, m6, m5
+ mova [rsp+16*5], m6
+ paddw m6, m3 ; 1 * top + 256 * bottom + 255
+ paddw m2, m6 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*6], m2
+ pmaddubsw m2, m1, m5
+ paddw m3, m1 ; 1 * top + 256 * bottom + 255
+ mova [rsp+16*0], m1
+ paddw m2, m3 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*1], m2
+ mova [rsp+16*3], m4
+ mova [rsp+16*4], m5
+.w16_loop:
+ movd m1, [tlq+hq] ; left
+ pshufb m1, [base+pb_3] ; topleft[-(1 + y)]
+ punpcklbw m1, m4 ; left, right
+ pmaddubsw m2, m1, m5 ; 127 * left - 127 * right
+ paddw m2, m1 ; 128 * left + 129 * right
+ mova m0, m1
+ mova m3, m2
+ pmaddubsw m0, [base+smooth_weights+16*2] ; weights_hor = &dav1d_sm_weights[width];
+ pmaddubsw m1, [base+smooth_weights+16*3]
+ paddw m2, m0
+ paddw m3, m1
+ movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
+ add v_weightsq, 2
+ mova m7, [rsp+16*2]
+ pshufb m1, m7
+ mova [rsp+16*7], m3
+ mova m4, [rsp+16*0]
+ mova m5, [rsp+16*1]
+ mova m3, [rsp+16*5]
+ mova m7, [rsp+16*6]
+ SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7]
+ mova m4, [rsp+16*3]
+ mova m5, [rsp+16*4]
+ mova [dstq], m0
+ lea dstq, [dstq+strideq]
+ sub hd, 1
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ movu m1, [tlq+1] ; top topleft[1 + x]
+ movu m2, [tlq+17] ; top
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ sub tlq, 4
+ sub tlq, hq
+ mova m7, [base+ipred_v_shuf]
+ pshufd m7, m7, q0000
+ mova [rsp+16*2], m7
+ mova [rsp+16*3], m0
+ mova [rsp+16*4], m4
+ mova [rsp+16*5], m5
+.w32_loop:
+ SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5
+ add dstq, 16
+ SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5
+ lea dstq, [dstq-16+strideq]
+ add v_weightsq, 2
+ sub hd, 1
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ movu m1, [tlq+1] ; top topleft[1 + x]
+ movu m2, [tlq+17] ; top
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ movu m1, [tlq+33] ; top
+ movu m2, [tlq+49] ; top
+ mova [rsp+16*11], m1
+ mova [rsp+16*12], m2
+ sub tlq, 4
+ sub tlq, hq
+ mova m7, [base+ipred_v_shuf]
+ pshufd m7, m7, q0000
+ mova [rsp+16*2], m7
+ mova [rsp+16*3], m0
+ mova [rsp+16*4], m4
+ mova [rsp+16*5], m5
+.w64_loop:
+ SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*8], [base+smooth_weights+16*9], 10, 2, 3, 4, 5
+ add dstq, 16
+ SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5
+ add dstq, 16
+ SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5
+ add dstq, 16
+ SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5
+ lea dstq, [dstq-48+strideq]
+ add v_weightsq, 2
+ sub hd, 1
+ jg .w64_loop
+ RET
+
+%if ARCH_X86_64
+cglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx
+ %define base r7-$$
+ lea r7, [$$]
+ mova m8, [base+pw_62]
+ mova m9, [base+pw_64]
+ mova m10, [base+pw_512]
+%else
+cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w, h, angle, dx
+ %define base r1-$$
+ %define m8 [base+pw_62]
+ %define m9 [base+pw_64]
+ %define m10 [base+pw_512]
+ %define strideq r3
+ %define stridemp dword [rsp+16*12]
+ mov stridemp, r1
+ LEA r1, $$
+%endif
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ inc tlq
+ movsxd wq, [base+ipred_z1_ssse3_table+wq*4]
+ mov dxd, angled
+ and dxd, 0x7e
+ add angled, 165 ; ~90
+ lea wq, [base+wq+ipred_z1_ssse3_table]
+ movzx dxd, word [base+dr_intra_derivative+dxq]
+ xor angled, 0x4ff ; d = 90 - angle
+ jmp wq
+.w4:
+ lea r3d, [angleq+88]
+ test r3d, 0x480
+ jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40
+ sar r3d, 9
+ add r3d, hd
+ cmp r3d, 8
+ jg .w4_no_upsample ; h > 8 || (w == h && is_sm)
+ mova m1, [tlq-1]
+ pshufb m0, m1, [base+z_upsample1]
+ pshufb m1, [base+z_upsample2]
+ movddup m2, [base+pb_36_m4]
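+ ; pb_36_m4 holds (36, -4) byte pairs: the shuffles gather the neighbours of
+ ; each new half-sample so pmaddubsw/paddw evaluate the edge upsampling
+ ; filter (-4, 36, 36, -4), equivalent to (-1, 9, 9, -1)/16 once pmulhrsw
+ ; with pw_512 applies the rounded >> 6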
+ add dxd, dxd
+ pmaddubsw m0, m2
+ pshufd m7, m1, q3333
+ movd [rsp+16], m7 ; top[max_base_x]
+ pmaddubsw m1, m2
+ movd m6, dxd
+ mov r5d, dxd ; xpos
+ pshufb m6, [base+pw_256]
+ paddw m1, m0
+ movq m0, [tlq]
+ pmulhrsw m1, m10
+ paddw m7, m6, m6
+ punpcklqdq m6, m7 ; xpos0 xpos1
+ packuswb m1, m1
+ punpcklbw m0, m1
+ movifnidn strideq, stridemp
+ mova [rsp], m0
+.w4_upsample_loop:
+ lea r2d, [r5+dxq]
+ shr r5d, 6 ; base0
+ movq m0, [rsp+r5]
+ lea r5d, [r2+dxq]
+ shr r2d, 6 ; base1
+ movhps m0, [rsp+r2]
+ pand m2, m8, m6 ; frac
+ psubw m1, m9, m2 ; 64-frac
+ psllw m2, 8
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ paddw m6, m7 ; xpos += dx
+ pmulhrsw m0, m10
+ packuswb m0, m0
+ movd [dstq+strideq*0], m0
+ pshuflw m0, m0, q1032
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_upsample_loop
+ RET
+.w4_no_upsample:
+ mov r3d, 7 ; max_base
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ lea r3d, [hq+3]
+ movd m0, r3d
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ pcmpeqb m1, m0, [base+z_filter_wh4]
+ pand m1, m2
+ pcmpgtb m1, [base+z_filter_t_w48+angleq*8]
+ pmovmskb r5d, m1
+ mov r3d, 7
+ test r5d, r5d
+ jz .w4_main ; filter_strength == 0
+ mova m3, [tlq-1]
+ imul r5d, 0x55555555
+ movu m7, [base+z_filter_s+8]
+ shr r5d, 30 ; filter_strength
+ movddup m0, [base+pb_8]
+ pminub m7, m0
+ pshufb m0, m3, [base+z_filter_s]
+ movddup m4, [base+z_filter_k-8+r5*8+24*0]
+ pshufb m3, m7
+ movddup m5, [base+z_filter_k-8+r5*8+24*1]
+ shufps m2, m0, m3, q2121
+ movddup m6, [base+z_filter_k-8+r5*8+24*2]
+ pmaddubsw m0, m4
+ pmaddubsw m1, m2, m4
+ pmaddubsw m2, m5
+ paddd m5, m6
+ pmaddubsw m4, m3, m5
+ pmaddubsw m3, m6
+ paddw m0, m2
+ paddw m1, m4
+ paddw m0, m3
+ pshufd m1, m1, q3333
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ mov r5d, 9
+ mov tlq, rsp
+ cmp hd, 4
+ cmovne r3d, r5d
+ packuswb m0, m1
+ mova [tlq], m0
+.w4_main:
+ add tlq, r3
+ movd m5, dxd
+ movddup m0, [base+z_base_inc] ; base_inc << 6
+ movd m7, [tlq] ; top[max_base_x]
+ shl r3d, 6
+ movd m4, r3d
+ pshufb m5, [base+pw_256]
+ mov r5d, dxd ; xpos
+ pshufb m7, [base+pw_m256]
+ sub r5, r3
+ pshufb m4, [base+pw_256]
+ mova m3, [base+z1_shuf_w4]
+ paddw m6, m5, m5
+ psubw m4, m0 ; max_base_x
+ punpcklqdq m5, m6 ; xpos0 xpos1
+.w4_loop:
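+ ; positions are kept in 6-bit fixed point: base = xpos >> 6 indexes the
+ ; edge, frac = xpos & 0x3e forms the 2-tap weights (64-frac, frac) for
+ ; pmaddubsw, and pmulhrsw with pw_512 performs the rounded >> 6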
+ lea r3, [r5+dxq]
+ sar r5, 6 ; base0
+ movq m0, [tlq+r5]
+ lea r5, [r3+dxq]
+ sar r3, 6 ; base1
+ movhps m0, [tlq+r3]
+ pand m2, m8, m5 ; frac
+ psubw m1, m9, m2 ; 64-frac
+ psllw m2, 8
+ pshufb m0, m3
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ movifnidn strideq, stridemp
+ pcmpgtw m1, m4, m5 ; base < max_base_x
+ pmulhrsw m0, m10
+ paddw m5, m6 ; xpos += dx
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ packuswb m0, m0
+ movd [dstq+strideq*0], m0
+ pshuflw m0, m0, q1032
+ movd [dstq+strideq*1], m0
+ sub hd, 2
+ jz .w4_end
+ lea dstq, [dstq+strideq*2]
+ test r5d, r5d
+ jl .w4_loop
+ packuswb m7, m7
+.w4_end_loop:
+ movd [dstq+strideq*0], m7
+ movd [dstq+strideq*1], m7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_end_loop
+.w4_end:
+ RET
+.w8:
+ lea r3d, [angleq+88]
+ and r3d, ~0x7f
+ or r3d, hd
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ mova m5, [base+z_upsample1]
+ movu m3, [base+z_filter_s+6]
+ movd m4, hd
+ mova m0, [tlq-1]
+ movu m1, [tlq+7]
+ pxor m7, m7
+ pshufb m4, m7
+ movddup m7, [base+pb_36_m4]
+ pminub m4, m3
+ add dxd, dxd
+ pshufb m2, m0, m5
+ pmaddubsw m2, m7
+ pshufb m0, m3
+ pmaddubsw m0, m7
+ movd m6, dxd
+ pshufb m3, m1, m5
+ pmaddubsw m3, m7
+ pshufb m1, m4
+ pmaddubsw m1, m7
+ pshufb m6, [base+pw_256]
+ mov r5d, dxd
+ paddw m2, m0
+ paddw m7, m6, m6
+ paddw m3, m1
+ punpcklqdq m6, m7 ; xpos0 xpos1
+ movu m1, [tlq]
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ packuswb m2, m3
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ movifnidn strideq, stridemp
+ mova [rsp+16*0], m0
+ mova [rsp+16*1], m1
+.w8_upsample_loop:
+ lea r2d, [r5+dxq]
+ shr r5d, 6 ; base0
+ movu m0, [rsp+r5]
+ lea r5d, [r2+dxq]
+ shr r2d, 6 ; base1
+ movu m1, [rsp+r2]
+ pand m2, m8, m6
+ psubw m3, m9, m2
+ psllw m2, 8
+ por m3, m2
+ punpcklqdq m2, m3, m3 ; frac0
+ pmaddubsw m0, m2
+ punpckhqdq m3, m3 ; frac1
+ pmaddubsw m1, m3
+ paddw m6, m7
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_upsample_loop
+ RET
+.w8_no_upsample:
+ lea r3d, [hq+7]
+ movd m0, r3d
+ and r3d, 7
+ or r3d, 8 ; imin(h+7, 15)
+ test angled, 0x400
+ jnz .w8_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ movu m1, [base+z_filter_wh8]
+ psrldq m3, [base+z_filter_t_w48+angleq*8], 4
+ pcmpeqb m1, m0
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .w8_main ; filter_strength == 0
+ movd m3, [tlq-1]
+ movu m0, [tlq+16*0]
+ imul r5d, 0x55555555
+ movu m1, [tlq+16*1]
+ shr r5d, 30 ; filter_strength
+ movd m2, [tlq+r3]
+ lea tlq, [rsp+16*4]
+ sub r5, 3
+ mova [tlq-16*1], m0
+ pxor m7, m7
+ mova [tlq+16*0], m1
+ pshufb m3, m7
+ pshufb m2, m7
+ mova [tlq-16*2], m3
+ movq [tlq+r3-15], m2
+ call .filter_edge
+ sar r5d, 1
+ add r5d, 17
+ cmp hd, 8
+ cmova r3d, r5d
+.w8_main:
+ add tlq, r3
+ movd m5, dxd
+ movd m7, [tlq]
+ shl r3d, 6
+ movu m3, [base+z_filter_s+2]
+ movd m4, r3d
+ pshufb m5, [base+pw_256]
+ mov r5d, dxd
+ pshufb m7, [base+pw_m256]
+ sub r5, r3
+ pshufb m4, [base+pw_256]
+ psubw m4, [base+z_base_inc]
+ mova m6, m5
+.w8_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m0, [tlq+r3]
+ pand m1, m8, m5
+ psubw m2, m9, m1
+ psllw m1, 8
+ pshufb m0, m3
+ por m1, m2
+ pmaddubsw m0, m1
+ pcmpgtw m1, m4, m5
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ packuswb m0, m0
+ movq [dstq], m0
+ dec hd
+ jz .w8_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w8_loop
+ packuswb m7, m7
+.w8_end_loop:
+ movq [dstq], m7
+ add dstq, strideq
+ dec hd
+ jg .w8_end_loop
+.w8_end:
+ RET
+.w16:
+ lea r3d, [hq+15]
+ movd m0, r3d
+ and r3d, 15
+ or r3d, 16 ; imin(h+15, 31)
+ test angled, 0x400
+ jnz .w16_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ movq m3, [base+z_filter_t_w16+angleq*4]
+ pcmpeqb m0, [base+z_filter_wh16]
+ pand m0, m2
+ pcmpgtb m0, m3
+ pmovmskb r5d, m0
+ test r5d, r5d
+ jz .w16_main ; filter_strength == 0
+ movd m4, [tlq-1]
+ movu m0, [tlq+16*0]
+ imul r5d, 0x24924924
+ movu m1, [tlq+16*1]
+ shr r5d, 30
+ movd m2, [tlq+30]
+ adc r5, -4 ; filter_strength-3
+ movd m3, [tlq+r3]
+ lea tlq, [rsp+16*4]
+ mova [tlq-16*1], m0
+ pxor m7, m7
+ mova [tlq+16*0], m1
+ pshufb m4, m7
+ movd [rsp], m2
+ pshufb m3, m7
+ mova [tlq-16*2], m4
+ movd [tlq+r3-16], m3
+ call .filter_edge
+ cmp hd, 16
+ jle .w16_main
+ pshuflw m0, [rsp], q0000
+ sar r5, 1
+ movd m1, [base+z_filter_k_tail+4+r5*4]
+ lea r3d, [r5+33]
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4
+%endif
+ packuswb m0, m0
+ movd [tlq+32], m0
+.w16_main:
+ add tlq, r3
+ movd m5, dxd
+ movd m7, [tlq]
+ movd m4, r3d
+ shl r3d, 6
+ pshufb m5, [base+pw_256]
+ pxor m6, m6
+ pshufb m7, m6
+ mov r5d, dxd
+ pshufb m4, m6
+ sub r5, r3
+ psubb m4, [base+pb_0to15]
+ mova m6, m5
+.w16_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m1, [tlq+r3+0]
+ pand m0, m8, m5
+ movu m2, [tlq+r3+1]
+ psubw m3, m9, m0
+ psllw m0, 8
+ por m3, m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ psrlw m3, m5, 6
+ packsswb m3, m3
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ paddw m5, m6
+ pcmpgtb m2, m4, m3
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ mova [dstq], m0
+ dec hd
+ jz .w16_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w16_loop
+.w16_end_loop:
+ mova [dstq], m7
+ add dstq, strideq
+ dec hd
+ jg .w16_end_loop
+.w16_end:
+ RET
+.w32:
+ lea r3d, [hq+31]
+ and r3d, 31
+ or r3d, 32 ; imin(h+31, 63)
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w32_main
+ movd m6, [tlq-1]
+ movu m0, [tlq+16*0]
+ movu m1, [tlq+16*1]
+ movu m2, [tlq+16*2]
+ movu m3, [tlq+16*3]
+ movd m4, [tlq+62]
+ movd m5, [tlq+r3]
+ lea tlq, [rsp+16*6]
+ mova [tlq-16*3], m0
+ pxor m7, m7
+ mova [tlq-16*2], m1
+ pshufb m6, m7
+ mova [tlq-16*1], m2
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq+16*0], m3
+ movd [rsp], m4
+ pshufb m5, m7
+ mova [tlq-16*4], m6
+ movd [tlq+r3-48], m5
+ call .filter_edge
+ sub tlq, 16*2
+ call .filter_edge
+ cmp hd, 32
+ jle .w32_main
+ pshuflw m0, [rsp], q0000
+ movd m1, [base+z_filter_k_tail+4]
+ add r3d, 2
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4
+%endif
+ packuswb m0, m0
+ movd [tlq+64], m0
+.w32_main:
+ add tlq, r3
+ movd m0, r3d
+ movd m7, [tlq]
+ shl r3d, 6
+ movd m5, dxd
+ pxor m6, m6
+ mov r5d, dxd
+ pshufb m0, m6
+ pshufb m5, [base+pw_256]
+ sub r5, r3
+ pshufb m7, m6
+ psubb m0, [base+pb_0to15]
+ movddup m1, [base+pb_m16]
+ mova [rsp+16*0], m0
+ paddb m0, m1
+ mova [rsp+16*1], m0
+ mova m6, m5
+.w32_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m1, [tlq+r3+16*0+0]
+ pand m0, m8, m5
+ movu m2, [tlq+r3+16*0+1]
+ psubw m3, m9, m0
+ psllw m0, 8
+ por m3, m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ psrlw m4, m5, 6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packsswb m4, m4
+ pcmpgtb m2, [rsp+16*0], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*1+0]
+ movu m2, [tlq+r3+16*1+1]
+ mova [dstq+16*0], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*1], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ mova [dstq+16*1], m0
+ dec hd
+ jz .w32_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w32_loop
+.w32_end_loop:
+ mova [dstq+16*0], m7
+ mova [dstq+16*1], m7
+ add dstq, strideq
+ dec hd
+ jg .w32_end_loop
+.w32_end:
+ RET
+.w64:
+ lea r3d, [hq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w64_main
+ movd m4, [tlq-1]
+ movu m0, [tlq+16*0]
+ movu m1, [tlq+16*1]
+ movu m2, [tlq+16*2]
+ movu m3, [tlq+16*3]
+ mova [rsp+16*3], m0
+ pxor m7, m7
+ mova [rsp+16*4], m1
+ pshufb m4, m7
+ mova [rsp+16*5], m2
+ mova [rsp+16*6], m3
+ mova [rsp+16*2], m4
+ movu m0, [tlq+16*4]
+ movu m1, [tlq+16*5]
+ movu m2, [tlq+16*6]
+ movu m3, [tlq+16*7]
+ movd m4, [tlq+r3]
+ lea tlq, [rsp+16*10]
+ mova [tlq-16*3], m0
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq-16*2], m1
+ pshufb m4, m7
+ mova [tlq-16*1], m2
+ mova [tlq+16*0], m3
+ movd [tlq+r3-16*7], m4
+ cmp hd, 64
+ jl .w64_filter96 ; skip one call if the last 32 bytes aren't used
+ call .filter_edge
+.w64_filter96:
+ sub tlq, 16*2
+ call .filter_edge
+ sub tlq, 16*2
+ call .filter_edge
+ sub tlq, 16*2
+ call .filter_edge
+.w64_main:
+ add tlq, r3
+ movd m0, r3d
+ movd m7, [tlq]
+ shl r3d, 6
+ movd m5, dxd
+ pxor m6, m6
+ mov r5d, dxd
+ pshufb m0, m6
+ sub r5, r3
+ pshufb m5, [base+pw_256]
+ pshufb m7, m6
+ psubb m0, [base+pb_0to15]
+ movddup m1, [base+pb_m16]
+ mova [rsp+16*0], m0
+ paddb m0, m1
+ mova [rsp+16*1], m0
+ paddb m0, m1
+ mova [rsp+16*2], m0
+ paddb m0, m1
+ mova [rsp+16*3], m0
+ mova m6, m5
+.w64_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m1, [tlq+r3+16*0+0]
+ pand m0, m8, m5
+ movu m2, [tlq+r3+16*0+1]
+ psubw m3, m9, m0
+ psllw m0, 8
+ por m3, m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ psrlw m4, m5, 6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packsswb m4, m4
+ pcmpgtb m2, [rsp+16*0], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*1+0]
+ movu m2, [tlq+r3+16*1+1]
+ mova [dstq+16*0], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*1], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*2+0]
+ movu m2, [tlq+r3+16*2+1]
+ mova [dstq+16*1], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*2], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*3+0]
+ movu m2, [tlq+r3+16*3+1]
+ mova [dstq+16*2], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*3], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ mova [dstq+16*3], m0
+ dec hd
+ jz .w64_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w64_loop
+.w64_end_loop:
+ mova [dstq+16*0], m7
+ mova [dstq+16*1], m7
+ mova [dstq+16*2], m7
+ mova [dstq+16*3], m7
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ RET
+ALIGN function_align
+.filter_edge: ; 32 pixels/iteration
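+ ; smooths 32 edge pixels per call with the z_filter_k taps selected by
+ ; r5 (filter_strength-3); the outermost taps are only accumulated for
+ ; the strongest filter strength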
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
+ movu m2, [tlq-18]
+ movu m1, [tlq-17]
+ movu m3, [tlq- 2]
+ movu m4, [tlq- 1]
+ punpcklbw m0, m2, m1
+ pmaddubsw m0, m7
+ punpckhbw m2, m1
+ pmaddubsw m2, m7
+ punpcklbw m1, m3, m4
+ pmaddubsw m1, m7
+ punpckhbw m3, m4
+ pmaddubsw m3, m7
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
+ mova m5, [tlq-16]
+ movu m6, [tlq-15]
+ punpcklbw m4, m5, m6
+ pmaddubsw m4, m7
+ punpckhbw m5, m6
+ pmaddubsw m5, m7
+ paddw m0, m4
+ paddw m2, m5
+ mova m5, [tlq+ 0]
+ movu m6, [tlq+ 1]
+ punpcklbw m4, m5, m6
+ pmaddubsw m4, m7
+ punpckhbw m5, m6
+ pmaddubsw m5, m7
+ paddw m1, m4
+ paddw m3, m5
+ test r5d, r5d
+ jnz .filter_end ; 3-tap
+ movddup m7, [base+z_filter_k+8*8]
+ movu m5, [tlq-14]
+ movu m6, [tlq+ 2]
+ punpcklbw m4, m5, m5
+ pmaddubsw m4, m7
+ punpckhbw m5, m5
+ pmaddubsw m5, m7
+ paddw m0, m4
+ paddw m2, m5
+ punpcklbw m5, m6, m6
+ pmaddubsw m5, m7
+ punpckhbw m6, m6
+ pmaddubsw m6, m7
+ paddw m1, m5
+ paddw m3, m6
+.filter_end:
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m10}, m0, m2, m1, m3
+%else
+ mova m4, m10
+ REPX {pmulhrsw x, m4 }, m0, m2, m1, m3
+%endif
+ packuswb m0, m2
+ packuswb m1, m3
+ mova [tlq+16*0], m0
+ mova [tlq+16*1], m1
+ ret
+
+%if ARCH_X86_64
+cglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy
+ %define base r7-$$
+ %define maxwm r6m
+ %define maxhm r7m
+ lea r7, [$$]
+ mov hd, hm
+ mova m8, [base+pw_62]
+ mova m9, [base+pw_64]
+ lea r9d, [wq-4]
+ mova m10, [base+pw_512]
+ shl r9d, 6
+ mova m11, [base+z1_shuf_w4]
+ or r9d, hd
+ mova m12, [base+z2_h_shuf]
+%else
+cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx
+ %define base r1-$$
+ %define m8 [base+pw_62]
+ %define m9 [base+pw_64]
+ %define m10 [base+pw_512]
+ %define m11 [rsp+16*16]
+ %define m12 [rsp+16*17]
+ %define r9b byte [rsp+16*18+4*0]
+ %define r9d dword [rsp+16*18+4*0]
+ %define r10d dword [rsp+16*18+4*1]
+ %define r11d dword [rsp+16*18+4*2]
+ %define maxwm [rsp+16*18+4*3]
+ %define maxhm [rsp+16*19+4*0]
+ %define stridemp [rsp+16*19+4*1]
+ %define strideq r3
+ %define dyd r4
+ %define dyq r4
+ mov stridemp, r1
+ mov r1d, r6m
+ mov r4d, r7m
+ mov maxwm, r1d
+ mov maxhm, r4d
+ LEA r1, $$
+ lea hd, [wq-4]
+ mova m0, [base+z1_shuf_w4]
+ shl hd, 6
+ mova m1, [base+z2_h_shuf]
+ or hd, hm
+ mova m11, m0
+ mov r9d, hd
+ mova m12, m1
+%endif
+ tzcnt wd, wd
+ movifnidn angled, anglem
+ movsxd wq, [base+ipred_z2_ssse3_table+wq*4]
+%if ARCH_X86_64
+ movzx dxd, angleb
+%else
+ movzx dxd, byte anglem
+%endif
+ xor angled, 0x400
+ mova m0, [tlq-16*4]
+ mov dyd, dxd
+ mova m1, [tlq-16*3]
+ neg dxq
+ mova m2, [tlq-16*2]
+ and dyd, ~1
+ mova m3, [tlq-16*1]
+ and dxq, ~1
+ movd m4, [tlq]
+ movu m5, [tlq+16*0+1]
+ movu m6, [tlq+16*1+1]
+ movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90
+ movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle
+ mova [rsp+16*2], m0
+ pxor m7, m7
+ mova [rsp+16*3], m1
+ pshufb m4, m7
+ mova [rsp+16*4], m2
+ lea wq, [base+ipred_z2_ssse3_table+wq]
+ mova [rsp+16*5], m3
+ neg dxd
+ mova [rsp+16*6], m4
+ or dyd, 4<<16
+ mova [rsp+16*7], m4
+ mova [rsp+16*8], m5
+ mova [rsp+16*9], m6
+ movq m0, [base+z_base_inc+2]
+ movsldup m1, [base+z2_dy_offset]
+ movq m2, [base+pw_256] ; 4<<6
+ movq [rsp+16*14+8*0], m0
+ movq [rsp+16*15+8*0], m1
+ movq [rsp+16*15+8*1], m2
+%if ARCH_X86_64
+ lea r10d, [dxq+(128<<6)] ; xpos
+%else
+ mov [rsp+16*7+4*1], dyd
+ lea r4d, [dxq+(128<<6)]
+ mov r10d, r4d
+ movzx hd, r9b
+%endif
+ mov r11d, (128-4)<<6
+ jmp wq
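+ ; z2 interpolates from both edges: x positions step by dx along the top
+ ; edge and y positions by dy along the left edge, with the top/left
+ ; neighbourhood staged on the stack above so either edge can be fetched
+ ; with plain byte offsets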
+.w4:
+ test angled, 0x400
+ jnz .w4_main
+ movd m5, [tlq+4]
+ lea r3d, [hq+2]
+ add angled, 1022
+ pshufb m5, m7
+ shl r3d, 6
+ movd [rsp+16*8+4], m5
+ test r3d, angled
+ jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
+ call .upsample_above
+ sub angled, 1075 ; angle - 53
+ lea r3d, [hq+3]
+ xor angled, 0x7f ; 180 - angle
+ movd m0, r3d
+ movd m6, angled
+ shr angled, 8 ; is_sm << 1
+ pshufb m0, m7
+ pshufb m6, m7
+ pcmpeqb m0, [base+z_filter_wh4]
+ pand m6, m0
+ pcmpgtb m6, [base+z_filter_t_w48+angleq*8]
+ jmp .w8_filter_left
+.upsample_above: ; w4/w8
+ movq m3, [rsp+gprsize+16*8-2]
+ movq m1, [rsp+gprsize+16*8-1]
+ movq m0, [rsp+gprsize+16*8+0]
+ movq m4, [rsp+gprsize+16*8+1]
+ movddup m5, [base+pb_36_m4]
+ punpcklbw m1, m3
+ punpcklbw m2, m0, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+%if ARCH_X86_64
+ mova m11, [base+pb_0to15]
+ lea r10d, [r10+dxq+(1<<6)]
+ mov r11d, (128-7)<<6
+%else
+ mova m3, [base+pb_0to15]
+ mov r3d, [rsp+gprsize+16*18+4*1]
+ mov dword [rsp+gprsize+16*18+4*2], (128-7)<<6
+ lea r3d, [r3+dxq+(1<<6)]
+ mov [rsp+gprsize+16*18+4*1], r3d
+ mova [rsp+gprsize+16*16], m3
+%endif
+ add dxd, dxd
+ paddw m1, m2
+ pmulhrsw m1, m10
+ movq m2, [rsp+gprsize+16*14]
+ paddw m2, m2
+ movq [rsp+gprsize+16*14], m2
+ packuswb m1, m1
+ punpcklbw m1, m0
+ mova [rsp+gprsize+16*8], m1
+ ret
+.w4_no_upsample_above:
+ lea r3d, [hq+3]
+ mov [rsp], angled
+ sub angled, 1112 ; angle - 90
+ movd m0, r3d
+ mov r3d, 90
+ movd m1, angled
+ sub r3d, angled ; 180 - angle
+ shr angled, 8 ; is_sm << 1
+ movu m3, [base+z_filter_wh4]
+ mova m4, [base+z_filter_t_w48+angleq*8]
+ call .w8_filter_top
+ mov angled, [rsp]
+ lea r3d, [hq+2]
+ sub angled, 139
+ shl r3d, 6
+ test r3d, angled
+ jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
+.upsample_left: ; w4/w8
+ neg hq
+ movd m0, [tlq+hq]
+ pshufb m0, m7
+ movd [rsp+16*6+hq-4], m0
+ movq m3, [rsp+16*5+7]
+ movq m0, [rsp+16*5+8]
+ movq m2, [rsp+16*5+9]
+ movq m4, [rsp+16*5+10]
+ movddup m5, [base+pb_36_m4]
+ punpcklbw m1, m0, m3
+ punpcklbw m2, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ movshdup m3, [base+z2_dy_offset]
+%if ARCH_X86_64
+ mova m12, [base+z2_upsample]
+ add dyd, dyd
+%else
+ mova m4, [base+z2_upsample]
+ shl dword [rsp+16*7+4*1], 1
+ mova m12, m4
+%endif
+ paddw m1, m2
+ pmulhrsw m1, m10
+ movq [rsp+16*15], m3
+ packuswb m1, m1
+ punpcklbw m0, m1
+ mova [rsp+16*5], m0
+.w4_main:
+ movd m6, dxd
+%if ARCH_X86_64
+ movd m3, dyd
+%else
+ movd m3, [rsp+16*7+4*1]
+%endif
+ movddup m0, [rsp+16*14+8*0]
+ pshufb m6, [base+pw_256]
+ paddw m7, m6, m6
+ movq m5, [base+pw_m1to4]
+ pshuflw m4, m3, q0000
+ punpcklqdq m6, m7
+ pmullw m4, m5
+ pshuflw m3, m3, q1111
+ paddw m6, m0
+ mov r2d, r10d
+ pshuflw m0, m4, q3333
+ psubw m4, [rsp+16*15]
+ movq [rsp+16*6+8*1], m3
+ movq [rsp+8*1], m0 ; dy*4
+ mov r5, dstq
+.w4_loop0:
+ mova [rsp+16*12], m6
+ movq [rsp+8*0], m4
+ pand m0, m4, m8
+ psraw m4, 6
+ psubw m1, m9, m0
+ psllw m0, 8
+ por m0, m1 ; 64-frac_y, frac_y
+ movq [rsp+8*3], m0
+ pabsw m4, m4
+ movq [rsp+8*2], m4
+ movzx hd, r9b
+.w4_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movq m0, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ movhps m0, [rsp+r3]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x2
+ movq m1, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x3
+ movhps m1, [rsp+r3]
+ pand m2, m8, m6
+ paddsw m5, m6, m7
+ psubw m3, m9, m2
+ psllw m2, 8
+ pshufb m0, m11
+ por m2, m3
+ pmaddubsw m0, m2
+ pand m2, m8, m5
+ psubw m3, m9, m2
+ psllw m2, 8
+ pshufb m1, m11
+ por m2, m3
+ pmaddubsw m1, m2
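+ ; once base_x runs past the top-left sample the prediction must come from
+ ; the left edge instead; the base_y loads below fetch those rows and the
+ ; sign of the x position (psraw 15) selects between the two sources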
+ cmp r3d, 127 ; topleft
+ jge .w4_toponly
+ movzx r3d, byte [rsp+8*2+0] ; base_y0
+ movq m3, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+2] ; base_y1
+ movhps m3, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+4] ; base_y2
+ movq m4, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+6] ; base_y3
+ movhps m4, [rsp+r3]
+ pshufb m3, m12
+ pshufb m4, m12
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ movddup m4, [rsp+8*3]
+ pmaddubsw m2, m4
+ pmaddubsw m3, m4
+ psraw m6, 15 ; base_x < topleft
+ pand m2, m6
+ pandn m6, m0
+ por m0, m2, m6
+ psraw m6, m5, 15
+ pand m3, m6
+ pandn m6, m1
+ por m1, m3, m6
+.w4_toponly:
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ movifnidn strideq, stridemp
+ packuswb m0, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ punpckhqdq m0, m0
+ movd [dstq+strideq*0], m0
+ psrlq m0, 32
+ movd [dstq+strideq*1], m0
+ sub hd, 4
+ jz .w4_end
+ movq m4, [rsp+8*2]
+ movq m3, [rsp+16*6+8*1]
+ paddw m6, m5, m7 ; xpos += dx
+ psubw m4, m3
+ movq [rsp+8*2], m4
+ lea dstq, [dstq+strideq*2]
+ cmp r2d, r11d
+ jge .w4_loop
+ movddup m5, [rsp+8*3]
+.w4_leftonly_loop:
+ movzx r2d, byte [rsp+8*2+0] ; base_y0
+ movq m1, [rsp+r2]
+ movzx r2d, byte [rsp+8*2+2] ; base_y1
+ movhps m1, [rsp+r2]
+ movzx r2d, byte [rsp+8*2+4] ; base_y2
+ movq m2, [rsp+r2]
+ movzx r2d, byte [rsp+8*2+6] ; base_y3
+ movhps m2, [rsp+r2]
+ psubw m4, m3
+ pshufb m1, m12
+ pshufb m2, m12
+ movq [rsp+8*2], m4
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ punpckhqdq m0, m0
+ movd [dstq+strideq*0], m0
+ psrlq m0, 32
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 4
+ jg .w4_leftonly_loop
+.w4_end:
+ sub r9d, 1<<8
+ jl .w4_ret
+ movq m4, [rsp+8*1]
+ add r5, 4
+ mov dstq, r5
+ paddw m4, [rsp+8*0] ; base_y += 4*dy
+ movzx r2d, word [rsp+16*15+8*1]
+ movddup m6, [rsp+16*15+8*1]
+ paddw m6, [rsp+16*12] ; base_x += (4 << upsample_above)
+ add r2d, r10d
+ mov r10d, r2d
+ jmp .w4_loop0
+.w4_ret:
+ RET
+.w8:
+ test angled, 0x400
+ jnz .w4_main
+ movd m5, [tlq+8]
+ lea r3d, [angleq+126]
+ pshufb m5, m7
+%if ARCH_X86_64
+ mov r3b, hb
+%else
+ xor r3b, r3b
+ or r3d, hd
+%endif
+ movd [rsp+16*8+8], m5
+ cmp r3d, 8
+ ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+ call .upsample_above
+ sub angled, 53
+ lea r3d, [hq+7]
+ xor angled, 0x7f ; 180 - angle
+ movu m1, [base+z_filter_wh8]
+ movd m0, r3d
+ movd m6, angled
+ shr angled, 8 ; is_sm << 1
+ psrldq m2, [base+z_filter_t_w48+angleq*8], 4
+ pshufb m0, m7
+ pshufb m6, m7
+ pcmpeqb m0, m1
+ pand m6, m0
+ pcmpgtb m6, m2
+%if ARCH_X86_64
+ movq [rsp+16*15+8*1], m10 ; 8<<6
+%else
+ movq m0, m10
+ movq [rsp+16*15+8*1], m0
+%endif
+ jmp .w8_filter_left
+.w8_no_upsample_above:
+ lea r3d, [hq+7]
+ mov [rsp], angled
+ sub angled, 90
+ movd m0, r3d
+ mov r3d, 90
+ movd m1, angled
+ sub r3d, angled ; 180 - angle
+ shr angled, 8 ; is_sm << 1
+ movu m3, [base+z_filter_wh8]
+ psrldq m4, [base+z_filter_t_w48+angleq*8], 4
+ call .w8_filter_top
+ mov r3d, [rsp]
+ sub r3d, 141
+%if ARCH_X86_64
+ mov r3b, hb
+%else
+ xor r3b, r3b
+ or r3d, hd
+%endif
+ cmp r3d, 8
+ jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm
+.w8_filter_left:
+ pmovmskb r5d, m6
+ test r5d, r5d
+ jz .w4_main
+ imul r5d, 0x55555555
+ mov r3, tlq
+ shr r5d, 30
+ sub r5, 3 ; filter_strength-3
+ jmp .filter_left
+.w8_filter_top:
+ movd m6, r3d
+ REPX {pshufb x, m7}, m0, m1, m6
+ pcmpeqb m0, m3
+ pand m1, m0
+ pand m6, m0
+ pcmpgtb m1, m4
+ pcmpgtb m6, m4
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .w8_filter_top_end ; filter_strength == 0
+ imul r5d, 0x55555555
+ movq m0, [rsp+gprsize+16*8-2]
+ shr r5d, 30
+ movq m1, [rsp+gprsize+16*8-1]
+ sub r5, 3 ; filter_strength-3
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
+ punpcklbw m0, m1
+ pmaddubsw m0, m7
+ movq m1, [rsp+gprsize+16*8+0]
+ movq m2, [rsp+gprsize+16*8+1]
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
+ punpcklbw m1, m2
+ pmaddubsw m1, m7
+ movq m2, [rsp+gprsize+16*8+2]
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*2]
+ punpcklbw m2, m2
+ pmaddubsw m2, m7
+ paddw m0, m1
+ paddw m0, m2
+%if ARCH_X86_64
+ mov r3d, r7m ; maxw, offset due to call
+%else
+ mov r3d, [rsp+gprsize+16*18+4*3]
+%endif
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ movq [rsp+gprsize+16*8], m0
+ cmp r3d, 8
+ jge .w8_filter_top_end
+ movq m0, [tlq+r3+1]
+ movq [rsp+gprsize+r3+16*8], m0
+.w8_filter_top_end:
+ ret
+.w16:
+ test angled, 0x400
+ jnz .w4_main
+ lea r3d, [hq+15]
+ sub angled, 90
+ movd m0, r3d
+ mov r3d, 90
+ movd m1, angled
+ sub r3d, angled ; 180 - angle
+ shr angled, 8 ; is_sm << 1
+ movd m6, r3d
+ REPX {pshufb x, m7}, m0, m1, m6
+ movq m3, [base+z_filter_t_w16+angleq*4]
+ pcmpeqb m0, [base+z_filter_wh16]
+ pand m1, m0
+ pand m6, m0
+ pcmpgtb m1, m3
+ pcmpgtb m6, m3
+ pmovmskb r5d, m1
+ mov r3, tlq
+ test r5d, r5d
+ jz .w16_filter_left ; filter_strength == 0
+ imul r5d, 0x24924924
+ pshufb m5, [base+z_filter_t_w16] ; tlq[16]
+ shr r5d, 30
+ adc r5, -4 ; filter_strength-3
+ movd [rsp+16*9], m5
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
+ movu m1, [rsp+16*8-2]
+ movu m2, [rsp+16*8-1]
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m7
+ punpckhbw m1, m2
+ pmaddubsw m1, m7
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
+ mova m3, [rsp+16*8+0]
+ movu m4, [rsp+16*8+1]
+ punpcklbw m2, m3, m4
+ pmaddubsw m2, m7
+ punpckhbw m3, m4
+ pmaddubsw m3, m7
+ paddw m0, m2
+ paddw m1, m3
+ test r5d, r5d
+ jnz .w16_filter_end ; 3-tap
+ movddup m7, [base+z_filter_k+8*8]
+ movu m3, [rsp+16*8+2]
+ punpcklbw m2, m3, m3
+ pmaddubsw m2, m7
+ punpckhbw m3, m3
+ pmaddubsw m3, m7
+ paddw m0, m2
+ paddw m1, m3
+.w16_filter_end:
+ mov r2d, maxwm
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ mova [rsp+16*8], m0
+ cmp r2d, 16
+ jge .w16_filter_left
+ movu m0, [r3+r2+1]
+ movu [rsp+r2+16*8], m0
+.w16_filter_left:
+ pmovmskb r5d, m6
+ test r5d, r5d
+ jz .w4_main
+ imul r5d, 0x24924924
+ shr r5d, 30
+ adc r5, -4 ; filter_strength-3
+ jmp .filter_left
+.w32:
+ test angled, 0x400
+ jnz .w4_main
+ pshufb m6, [base+z_filter_t_w16] ; tlq[32]
+ mov r3, tlq
+ lea tlq, [rsp+16*9]
+ movd [tlq+16*1], m6
+ xor r5d, r5d ; filter_strength = 3
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ mova m0, [tlq+16*0]
+ mova m1, [tlq+16*1]
+ mov r2d, maxwm
+ mova [rsp+16*8], m0
+ mova [rsp+16*9], m1
+ cmp r2d, 32
+ jge .filter_left
+ movu m0, [r3+r2+16*0+1]
+ movu m1, [r3+r2+16*1+1]
+ movu [rsp+r2+16*8], m0
+ movu [rsp+r2+16*9], m1
+ jmp .filter_left
+.w64:
+ movu m0, [tlq+16*2+1]
+ movu m1, [tlq+16*3+1]
+ mova [rsp+16*10], m0
+ mova [rsp+16*11], m1
+ test angled, 0x400
+ jnz .w4_main
+ pshufb m1, [base+z_filter_t_w16] ; tlq[64]
+ mov r3, tlq
+ lea tlq, [rsp+16*11]
+ movd [tlq+16*1], m1
+ xor r5d, r5d ; filter_strength = 3
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ mova m0, [tlq+16*0]
+ mova m1, [tlq+16*1]
+ mova m2, [tlq+16*2]
+ mova m3, [tlq+16*3]
+ mov r2d, maxwm
+ mova [rsp+16* 8], m0
+ mova [rsp+16* 9], m1
+ mova [rsp+16*10], m2
+ mova [rsp+16*11], m3
+ cmp r2d, 64
+ jge .filter_left
+ movu m0, [r3+r2+16*0+1]
+ movu m1, [r3+r2+16*1+1]
+ movu [rsp+r2+16* 8], m0
+ movu [rsp+r2+16* 9], m1
+ cmp r2d, 32
+ jge .filter_left
+ movu m0, [r3+r2+16*2+1]
+ movu m1, [r3+r2+16*3+1]
+ movu [rsp+r2+16*10], m0
+ movu [rsp+r2+16*11], m1
+.filter_left:
+ neg hq
+ movd m0, [r3+hq]
+ pxor m1, m1
+ pshufb m0, m1
+ movd [rsp+16*6+hq-4], m0
+ lea tlq, [rsp+16*5]
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ cmp hd, -32
+ jge .filter_left_end
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ mova m0, [tlq+16*0]
+ mova m1, [tlq+16*1]
+ mova [rsp+16*2], m0
+ mova [rsp+16*3], m1
+.filter_left_end:
+ mov r2d, maxhm
+ mova m0, [rsp+16*5]
+ mova m1, [rsp+16*6]
+ mova m2, [rsp+16*7]
+ neg r2
+ mova [rsp+16*4], m0
+ mova [rsp+16*5], m1
+ mova [rsp+16*6], m2
+ cmp r2d, hd
+ jle .w4_main
+ movu m0, [r3+r2-16*2]
+ movu m1, [r3+r2-16*1]
+ movu [rsp+r2+16*4], m0
+ movu [rsp+r2+16*5], m1
+ cmp r2d, -32
+ jle .w4_main
+ movu m0, [r3+r2-16*4]
+ movu m1, [r3+r2-16*3]
+ movu [rsp+r2+16*2], m0
+ movu [rsp+r2+16*3], m1
+ jmp .w4_main
+
+%if ARCH_X86_64
+cglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w
+ %define base r7-$$
+ lea r7, [$$]
+ mova m8, [base+pw_62]
+ mova m9, [base+pw_64]
+ mova m10, [base+pw_512]
+ mov org_wd, wd
+%else
+cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy
+ %define base r1-$$
+ %define m8 [base+pw_62]
+ %define m9 [base+pw_64]
+ %define m10 [base+pw_512]
+ %define org_wd r5
+ %define org_wq r5
+ mov [dstq+strideq*0], strideq
+ mov [dstq+strideq*1], wd
+ LEA r1, $$
+%endif
+ tzcnt hd, hm
+ movifnidn angled, anglem
+ dec tlq
+ movsxd hq, [base+ipred_z3_ssse3_table+hq*4]
+ sub angled, 180
+ mov dyd, angled
+ neg dyd
+ xor angled, 0x400
+ or dyq, ~0x7e
+ lea hq, [base+ipred_z3_ssse3_table+hq]
+ movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq]
+ jmp hq
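+ ; z3 (left-edge-only direction) reuses the z1 scheme along the left edge:
+ ; each .hN block renders the prediction into a stack buffer, which is then
+ ; transposed into dst by the corresponding transpose code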
+.h4:
+ lea r4d, [angleq+88]
+ test r4d, 0x480
+ jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40
+ sar r4d, 9
+ add r4d, wd
+ cmp r4d, 8
+ jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm)
+ movu m3, [tlq-7]
+ movu m1, [base+z_upsample1-4]
+ movu m4, [base+z_filter_s+2]
+ pshufb m0, m3, m1
+ pxor m1, m1
+ pshufb m2, m3, m1
+ pshufb m1, m3, m4
+ mova [rsp+16], m2 ; top[max_base_y]
+ movddup m2, [base+pb_36_m4]
+ add dyd, dyd
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ movd m5, dyd
+ mov r5d, dyd
+ pshufb m5, [base+pw_256]
+ paddw m0, m1
+ pmulhrsw m0, m10
+ shl wd, 2
+ mov tlq, rsp
+ sub rsp, wq
+ packuswb m0, m0
+ punpcklbw m0, m3
+ paddw m6, m5, m5
+ punpcklqdq m5, m6
+ pshufb m0, [base+pb_15to0]
+ mova [tlq], m0
+.h4_upsample_loop:
+ lea r4d, [r5+dyq]
+ shr r5d, 6
+ movq m0, [tlq+r5]
+ lea r5d, [r4+dyq]
+ shr r4d, 6
+ movhps m0, [tlq+r4]
+ pand m2, m8, m5
+ psubw m1, m9, m2
+ psllw m2, 8
+ por m1, m2
+ pmaddubsw m0, m1
+ paddw m5, m6
+ pmulhrsw m0, m10
+ packuswb m0, m0
+ movq [rsp+wq-8], m0
+ sub wd, 8
+ jg .h4_upsample_loop
+ jmp .h4_transpose
+.h4_no_upsample:
+ mov r4d, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h4_main
+ lea r4d, [wq+3]
+ movd m0, r4d
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ pcmpeqb m1, m0, [base+z_filter_wh4]
+ pand m1, m2
+ pcmpgtb m1, [base+z_filter_t_w48+angleq*8]
+ pmovmskb r5d, m1
+ mov r4d, 7
+ test r5d, r5d
+ jz .h4_main ; filter_strength == 0
+ movu m2, [tlq-7]
+ imul r5d, 0x55555555
+ movu m3, [base+z_filter_s-2]
+ shr r5d, 30 ; filter_strength
+ mova m4, [base+z_upsample2]
+ movddup m5, [base+z_filter_k-8+r5*8+24*0]
+ movddup m6, [base+z_filter_k-8+r5*8+24*1]
+ movddup m7, [base+z_filter_k-8+r5*8+24*2]
+ pshufb m0, m2, m3
+ shufps m3, m4, q2121
+ pmaddubsw m1, m0, m5
+ pmaddubsw m0, m6
+ pshufb m5, m2, m3
+ pmaddubsw m3, m5, m6
+ pmaddubsw m5, m7
+ pshufb m2, m4
+ pmaddubsw m2, m7
+ paddw m0, m1
+ paddw m1, m3
+ paddw m0, m5
+ paddw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ lea r2d, [r4+2]
+ cmp wd, 4
+ cmovne r4d, r2d
+ pshufd m0, m0, q0000
+ lea tlq, [rsp+15]
+ packuswb m0, m1
+ mova [rsp], m0
+.h4_main:
+ movd m5, dyd
+ movddup m0, [base+z_base_inc] ; base_inc << 6
+ sub tlq, r4
+ shl r4d, 6
+ movd m7, [tlq]
+ movd m4, r4d
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, [base+pw_m256]
+ mova m3, [base+z3_shuf_h4]
+ lea r5, [dyq+r4+63] ; ypos
+ pshufb m4, [base+pw_256]
+ psubw m4, m0 ; max_base_y
+ shl wd, 2
+ paddw m6, m5, m5
+ sub rsp, wq
+ punpcklqdq m5, m6
+.h4_loop:
+ lea r4, [r5+dyq]
+ sar r5, 6
+ movq m0, [tlq+r5-4]
+ lea r5, [r4+dyq]
+ sar r4, 6
+ movhps m0, [tlq+r4-4]
+ pand m2, m8, m5
+ psubw m1, m9, m2
+ psllw m2, 8
+ pshufb m0, m3
+ por m1, m2
+ pmaddubsw m0, m1
+ pcmpgtw m1, m4, m5
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ packuswb m0, m0
+ movq [rsp+wq-8], m0
+ sub wd, 8
+ jz .h4_transpose
+ test r5d, r5d
+ jg .h4_loop
+ packuswb m7, m7
+.h4_end_loop:
+ movq [rsp+wq-8], m7
+ sub wd, 8
+ jg .h4_end_loop
+.h4_transpose:
+ mova m1, [base+z_transpose4]
+%if ARCH_X86_32
+ mov strideq, [dstq]
+ mov org_wd, [dstq+strideq]
+%endif
+ lea r2, [strideq*3]
+ lea dstq, [dstq+org_wq-4]
+.h4_transpose_loop:
+ mova m0, [rsp]
+ add rsp, 16
+ pshufb m0, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m2, m0, q1032
+ movd [dstq+strideq*1], m2
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+r2 ], m0
+ sub dstq, 4
+ sub org_wd, 4
+ jg .h4_transpose_loop
+ RET
+.h8:
+ lea r4d, [angleq+88]
+ and r4d, ~0x7f
+ or r4d, wd
+ cmp r4d, 8
+ ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
+ mova m4, [tlq-15]
+ and r4d, 4
+ movu m3, [tlq- 9]
+ movd m1, r4d
+ movu m2, [base+z_filter_s+2]
+ pxor m0, m0
+ movu m5, [base+z_filter_s+6]
+ movddup m7, [base+pb_36_m4]
+ pshufb m1, m0 ; w & 4
+ movu m0, [base+z_upsample1-4]
+ pmaxub m1, m0 ; clip 4x8
+ add dyd, dyd
+ pshufb m0, m4, m1
+ pmaddubsw m0, m7
+ pshufb m1, m4, m2
+ pmaddubsw m1, m7
+ pshufb m2, m3, [base+z_upsample1]
+ pmaddubsw m2, m7
+ pshufb m3, m5
+ pmaddubsw m3, m7
+ movd m5, dyd
+ neg dyq
+ paddw m1, m0
+ paddw m2, m3
+ pmulhrsw m1, m10
+ pmulhrsw m2, m10
+ shl wd, 3
+ lea tlq, [rsp+16]
+ pshufb m5, [base+pw_256]
+ sub rsp, wq
+ packuswb m1, m2
+ lea r5, [dyq+63]
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ mova [tlq-16*1], m0
+ mova [tlq-16*0], m1
+ paddw m6, m5, m5
+ punpcklqdq m5, m6
+.h8_upsample_loop:
+ lea r4, [r5+dyq]
+ sar r5, 6
+ movu m0, [tlq+r5]
+ lea r5, [r4+dyq]
+ sar r4, 6
+ movu m1, [tlq+r4]
+ pand m3, m8, m5
+ psubw m2, m9, m3
+ psllw m2, 8
+ por m3, m2
+ pshufd m2, m3, q1010
+ pmaddubsw m0, m2
+ punpckhqdq m3, m3
+ pmaddubsw m1, m3
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m1, m0
+ mova [rsp+wq-16], m1
+ sub wd, 16
+ jg .h8_upsample_loop
+ jmp .h8_transpose
+.h8_no_upsample:
+ lea r4d, [wq+7]
+ movd m0, r4d
+ and r4d, 7
+ or r4d, 8 ; imin(w+7, 15)
+ test angled, 0x400
+ jnz .h8_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ movu m1, [base+z_filter_wh8]
+ psrldq m3, [base+z_filter_t_w48+angleq*8], 4
+ pcmpeqb m1, m0
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .h8_main ; filter_strength == 0
+ mova m0, [tlq-15]
+ imul r5d, 0x55555555
+ movd m1, [tlq+1]
+ neg r4
+ movd m2, [tlq+r4]
+ shr r5d, 30
+ pxor m7, m7
+ lea tlq, [rsp+16*2]
+ sub r5, 3 ; filter_strength-3
+ mova [tlq+16*0], m0
+ pshufb m1, m7
+ mova [tlq+16*1], m1
+ pshufb m2, m7
+ movq [tlq+r4+8], m2
+ neg r4d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sar r5d, 1
+ add tlq, 31
+ add r5d, 17
+ cmp wd, 8
+ cmova r4d, r5d
+.h8_main:
+ movd m5, dyd
+ sub tlq, r4
+ shl r4d, 6
+ movd m7, [tlq]
+ movd m4, r4d
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, [base+pw_m256]
+ mova m3, [base+z3_shuf]
+ lea r5, [dyq+r4+63]
+ pshufb m4, [base+pw_256]
+ psubw m4, [base+z3_base_inc]
+ shl wd, 3
+ mova m6, m5
+ sub rsp, wq
+.h8_loop:
+ mov r4, r5
+ sar r4, 6
+ movu m0, [tlq+r4-8]
+ pand m2, m8, m5
+ psubw m1, m9, m2
+ psllw m2, 8
+ pshufb m0, m3
+ por m1, m2
+ pmaddubsw m0, m1
+ pcmpgtw m1, m4, m5
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ packuswb m0, m0
+ movq [rsp+wq-8], m0
+ sub wd, 8
+ jz .h8_transpose
+ add r5, dyq
+ jg .h8_loop
+ packuswb m7, m7
+.h8_end_loop:
+ movq [rsp+wq-8], m7
+ sub wd, 8
+ jg .h8_end_loop
+.h8_transpose:
+%if ARCH_X86_32
+ mov strideq, [dstq]
+ mov org_wd, [dstq+strideq]
+%endif
+ or r3d, 8
+ cmp org_wd, 4
+%if ARCH_X86_64
+ jne .end_transpose_main
+%else
+ jne .end_transpose_loop
+%endif
+ mova m1, [rsp+16*1]
+ mova m0, [rsp+16*0]
+ lea r2, [strideq*3]
+ add rsp, 16*2
+ punpcklbw m2, m1, m0
+ punpckhbw m1, m0
+ punpckhbw m0, m1, m2
+ punpcklbw m1, m2
+.write_4x8_end:
+ call .write_4x8
+ RET
+.write_4x8:
+ movd [dstq+r2 ], m0
+ pshuflw m4, m0, q1032
+ movd [dstq+strideq*2], m4
+ punpckhqdq m0, m0
+ movd [dstq+strideq*1], m0
+ psrlq m0, 32
+ movd [dstq+strideq*0], m0
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+r2 ], m1
+ pshuflw m4, m1, q1032
+ movd [dstq+strideq*2], m4
+ punpckhqdq m1, m1
+ movd [dstq+strideq*1], m1
+ psrlq m1, 32
+ movd [dstq+strideq*0], m1
+ ret
+.h16:
+ lea r4d, [wq+15]
+ movd m0, r4d
+ and r4d, 15
+ or r4d, 16 ; imin(w+15, 31)
+ test angled, 0x400
+ jnz .h16_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ movq m3, [base+z_filter_t_w16+angleq*4]
+ pcmpeqb m1, m0, [base+z_filter_wh16]
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .h16_main ; filter_strength == 0
+ mova m0, [tlq-16*2+1]
+ imul r5d, 0x24924924
+ mova m1, [tlq-16*1+1]
+ neg r4
+ movd m2, [tlq-16*0+1]
+ shr r5d, 30
+ movd m3, [tlq+r4]
+ adc r5, -4 ; filter_strength-3
+ pxor m7, m7
+ lea tlq, [rsp+16*2]
+ mova [tlq-16*1], m0
+ pshufb m2, m7
+ mova [tlq+16*0], m1
+ pshufb m3, m7
+ mova [tlq+16*1], m2
+ movq [tlq+r4+8], m3
+ neg r4d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ add tlq, 31
+ cmp wd, 16
+ jle .h16_main
+ pshuflw m0, [tlq-47], q0000
+ sar r5, 1
+ movq m1, [base+z3_filter_k_tail+r5*4]
+ lea r4d, [r5+33]
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4
+%endif
+ packuswb m0, m0
+ movd [tlq-35], m0
+.h16_main:
+ movd m5, dyd
+ sub tlq, r4
+ movd m4, r4d
+ shl r4d, 6
+ movd m7, [tlq]
+ pxor m6, m6
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, m6
+ mova m3, [base+z3_shuf]
+ lea r5, [dyq+r4+63]
+ pshufb m4, m6
+ psubb m4, [base+pb_15to0]
+ shl wd, 4
+ mova m6, m5
+ sub rsp, wq
+.h16_loop:
+ mov r4, r5
+ pand m2, m8, m5
+ sar r4, 6
+ psubw m1, m9, m2
+ psllw m2, 8
+ movu m0, [tlq+r4-8*2]
+ por m2, m1
+ movu m1, [tlq+r4-8*1]
+ pshufb m0, m3
+ pmaddubsw m0, m2
+ pshufb m1, m3
+ pmaddubsw m1, m2
+ psrlw m2, m5, 6
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packsswb m2, m2
+ packuswb m0, m1
+ pcmpgtb m1, m4, m2
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ mova [rsp+wq-16], m0
+ sub wd, 16
+ jz .h16_transpose
+ add r5, dyq
+ jg .h16_loop
+.h16_end_loop:
+ mova [rsp+wq-16], m7
+ sub wd, 16
+ jg .h16_end_loop
+.h16_transpose:
+%if ARCH_X86_32
+ mov strideq, [dstq]
+ mov org_wd, [dstq+strideq]
+%endif
+ or r3d, 16
+ cmp org_wd, 4
+%if ARCH_X86_64
+ jne .end_transpose_main
+%else
+ jne .end_transpose_loop
+%endif
+.h16_transpose_w4:
+ mova m2, [rsp+16*3]
+ mova m4, [rsp+16*2]
+ mova m3, [rsp+16*1]
+ mova m0, [rsp+16*0]
+ lea r2, [strideq*3]
+ add rsp, 16*4
+ punpckhbw m1, m2, m4
+ punpcklbw m2, m4
+ punpckhbw m4, m3, m0
+ punpcklbw m3, m0
+ punpckhwd m0, m1, m4
+ punpcklwd m1, m4
+ call .write_4x8
+ lea dstq, [dstq+strideq*4]
+ punpckhwd m0, m2, m3
+ punpcklwd m1, m2, m3
+ jmp .write_4x8_end
+.h32:
+ lea r4d, [wq+31]
+ and r4d, 31
+ or r4d, 32 ; imin(w+31, 63)
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h32_main
+ mova m0, [tlq-16*4+1]
+ mova m1, [tlq-16*3+1]
+ mova m2, [tlq-16*2+1]
+ mova m3, [tlq-16*1+1]
+ movd m4, [tlq-16*0+1]
+ neg r4
+ movd m5, [tlq+r4]
+ pxor m7, m7
+ lea tlq, [rsp+16*4]
+ mova [tlq-16*3], m0
+ mova [tlq-16*2], m1
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq-16*1], m2
+ pshufb m4, m7
+ mova [tlq+16*0], m3
+ pshufb m5, m7
+ mova [tlq+16*1], m4
+ movq [tlq+r4+8], m5
+ neg r4d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ add tlq, 63
+ cmp wd, 32
+ jle .h32_main
+ pshuflw m0, [tlq-79], q0000
+ movq m1, [base+z3_filter_k_tail]
+ add r4d, 2
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4
+%endif
+ packuswb m0, m0
+ movd [tlq-67], m0
+.h32_main:
+ movd m5, dyd
+ sub tlq, r4
+ movd m4, r4d
+ shl r4d, 6
+ movd m7, [tlq]
+ pxor m6, m6
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, m6
+ mova m3, [base+z3_shuf]
+ lea r5, [dyq+r4+63]
+ pshufb m4, m6
+ psubb m4, [base+pb_15to0]
+ mova m6, m5
+.h32_loop:
+ mov r4, r5
+ pand m2, m8, m5
+ sar r4, 6
+ psubw m1, m9, m2
+ psllw m2, 8
+ movu m0, [tlq+r4-8*4]
+ por m2, m1
+ movu m1, [tlq+r4-8*3]
+ pshufb m0, m3
+ pmaddubsw m0, m2
+ pshufb m1, m3
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ sub rsp, 32
+ packuswb m0, m1
+ mova [rsp+16*0], m0
+ movu m0, [tlq+r4-8*2]
+ movu m1, [tlq+r4-8*1]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ psrlw m2, m5, 6
+ paddw m5, m6
+ packsswb m2, m2
+ packuswb m0, m1
+ pcmpgtb m1, m4, m2
+ paddsb m2, [base+pb_16]
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ pcmpgtb m1, m4, m2
+ mova [rsp+16*1], m0
+ pand m0, m1, [rsp+16*0]
+ pandn m1, m7
+ por m0, m1
+ mova [rsp+16*0], m0
+ dec wd
+ jz .h32_transpose
+ add r5, dyq
+ jg .h32_loop
+.h32_end_loop:
+ sub rsp, 32
+ mova [rsp+16*1], m7
+ mova [rsp+16*0], m7
+ dec wd
+ jg .h32_end_loop
+.h32_transpose:
+ or r3d, 32
+ jmp .end_transpose_main
+.h64:
+ lea r4d, [wq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h64_main
+ mova m0, [tlq-16*8+1]
+ mova m1, [tlq-16*7+1]
+ mova m2, [tlq-16*6+1]
+ mova m3, [tlq-16*5+1]
+ mova [rsp+16*1], m0
+ mova [rsp+16*2], m1
+ mova [rsp+16*3], m2
+ mova [rsp+16*4], m3
+ mova m0, [tlq-16*4+1]
+ mova m1, [tlq-16*3+1]
+ mova m2, [tlq-16*2+1]
+ mova m3, [tlq-16*1+1]
+ movd m4, [tlq-16*0+1]
+ neg r4
+ movd m5, [tlq+r4]
+ pxor m7, m7
+ lea tlq, [rsp+16*8]
+ mova [tlq-16*3], m0
+ mova [tlq-16*2], m1
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq-16*1], m2
+ pshufb m4, m7
+ mova [tlq+16*0], m3
+ pshufb m5, m7
+ mova [tlq+16*1], m4
+ movq [tlq+r4+8], m5
+ neg r4d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ cmp wd, 64
+ jl .h64_filter96 ; skip one call if the last 32 bytes aren't used
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+.h64_filter96:
+ add tlq, 127
+.h64_main:
+ movd m5, dyd
+ sub tlq, r4
+ movd m4, r4d
+ shl r4d, 6
+ movd m7, [tlq]
+ pxor m6, m6
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, m6
+ mova m3, [base+z3_shuf]
+ lea r5, [dyq+r4+63]
+ pshufb m4, m6
+ psubb m4, [base+pb_15to0]
+ mova m6, m5
+.h64_loop:
+ mov r4, r5
+ pand m2, m8, m5
+ sar r4, 6
+ psubw m1, m9, m2
+ psllw m2, 8
+ movu m0, [tlq+r4-8*8]
+ por m2, m1
+ movu m1, [tlq+r4-8*7]
+ pshufb m0, m3
+ pmaddubsw m0, m2
+ pshufb m1, m3
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ sub rsp, 64
+ packuswb m0, m1
+ mova [rsp+16*0], m0
+ movu m0, [tlq+r4-8*6]
+ movu m1, [tlq+r4-8*5]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ mova [rsp+16*1], m0
+ movu m0, [tlq+r4-8*4]
+ movu m1, [tlq+r4-8*3]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ mova [rsp+16*2], m0
+ movu m0, [tlq+r4-8*2]
+ movu m1, [tlq+r4-8*1]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ psrlw m2, m5, 6
+ paddw m5, m6
+ packsswb m2, m2
+ packuswb m0, m1
+ pcmpgtb m1, m4, m2
+ paddsb m2, [base+pb_16]
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ pcmpgtb m1, m4, m2
+ paddsb m2, [base+pb_16]
+ mova [rsp+16*3], m0
+ pand m0, m1, [rsp+16*2]
+ pandn m1, m7
+ por m0, m1
+ pcmpgtb m1, m4, m2
+ paddsb m2, [base+pb_16]
+ mova [rsp+16*2], m0
+ pand m0, m1, [rsp+16*1]
+ pandn m1, m7
+ por m0, m1
+ pcmpgtb m1, m4, m2
+ mova [rsp+16*1], m0
+ pand m0, m1, [rsp+16*0]
+ pandn m1, m7
+ por m0, m1
+ mova [rsp+16*0], m0
+ dec wd
+ jz .h64_transpose
+ add r5, dyq
+ jg .h64_loop
+.h64_end_loop:
+ sub rsp, 64
+ mova [rsp+16*3], m7
+ mova [rsp+16*2], m7
+ mova [rsp+16*1], m7
+ mova [rsp+16*0], m7
+ dec wd
+ jg .h64_end_loop
+.h64_transpose:
+ or r3d, 64
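+; transpose the intermediate buffer on the stack into dst, 8x8 bytes at a time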
+.end_transpose_main:
+%if ARCH_X86_64
+ lea r5, [r3*3]
+ lea r7, [strideq*3]
+%else
+ mov strideq, [dstq]
+ mov org_wd, [dstq+strideq]
+%endif
+.end_transpose_loop:
+ lea r4, [rsp+r3-8]
+ lea r6, [dstq+org_wq-8]
+.end_transpose_loop_y:
+ movq m0, [r4+r3*1]
+ movq m4, [r4+r3*0]
+%if ARCH_X86_64
+ movq m1, [r4+r5 ]
+ movq m5, [r4+r3*2]
+ lea r2, [r4+r3*4]
+%else
+ lea r2, [r4+r3*2]
+ movq m1, [r2+r3*1]
+ movq m5, [r2+r3*0]
+ lea r2, [r2+r3*2]
+%endif
+ movq m2, [r2+r3*1]
+ movq m6, [r2+r3*0]
+%if ARCH_X86_64
+ movq m3, [r2+r5 ]
+ movq m7, [r2+r3*2]
+%else
+ lea r2, [r2+r3*2]
+ movq m3, [r2+r3*1]
+ movq m7, [r2+r3*0]
+%endif
+ sub r4, 8
+ punpcklbw m0, m4
+ punpcklbw m1, m5
+ punpcklbw m2, m6
+ punpcklbw m3, m7
+ punpckhwd m4, m1, m0
+ punpcklwd m1, m0
+ punpckhwd m0, m3, m2
+ punpcklwd m3, m2
+ punpckhdq m2, m3, m1
+ punpckldq m3, m1
+ punpckldq m1, m0, m4
+ punpckhdq m0, m4
+ movhps [r6+strideq*0], m0
+ movq [r6+strideq*1], m0
+%if ARCH_X86_64
+ movhps [r6+strideq*2], m1
+ movq [r6+r7 ], m1
+ lea r6, [r6+strideq*4]
+%else
+ lea r6, [r6+strideq*2]
+ movhps [r6+strideq*0], m1
+ movq [r6+strideq*1], m1
+ lea r6, [r6+strideq*2]
+%endif
+ movhps [r6+strideq*0], m2
+ movq [r6+strideq*1], m2
+%if ARCH_X86_64
+ movhps [r6+strideq*2], m3
+ movq [r6+r7 ], m3
+ lea r6, [r6+strideq*4]
+%else
+ lea r6, [r6+strideq*2]
+ movhps [r6+strideq*0], m3
+ movq [r6+strideq*1], m3
+ lea r6, [r6+strideq*2]
+%endif
+ cmp r4, rsp
+ jae .end_transpose_loop_y
+ lea rsp, [rsp+r3*8]
+ sub org_wd, 8
+ jg .end_transpose_loop
+ RET
+
+;-------------------------------------------------------------------------------
+;void dav1d_pal_pred_ssse3(pixel *dst, ptrdiff_t stride, const pixel *pal,
+; const uint8_t *idx, int w, int h);
+;-------------------------------------------------------------------------------
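+; Each idx byte packs two palette indices (low nibble first); the pshufb
+; lookups below expand them and select the matching entry from the 8-byte
+; palette held in m4, i.e. roughly dst[x] = pal[idx[x]] per pixel.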
+cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
+ movq m4, [palq]
+ LEA r2, pal_pred_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r2+wq*4]
+ add wq, r2
+ lea r2, [strideq*3]
+ jmp wq
+.w4:
+ movq m1, [idxq]
+ add idxq, 8
+ psrlw m0, m1, 4
+ punpcklbw m1, m0
+ pshufb m0, m4, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+r2 ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ movu m0, [idxq]
+ add idxq, 16
+ pshufb m1, m4, m0
+ psrlw m0, 4
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+r2 ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ movu m0, [idxq]
+ add idxq, 16
+ pshufb m1, m4, m0
+ psrlw m0, 4
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16
+ RET
+.w32:
+ movu m0, [idxq]
+ add idxq, 16
+ pshufb m1, m4, m0
+ psrlw m0, 4
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w32
+ RET
+.w64:
+ movu m0, [idxq+16*0]
+ movu m2, [idxq+16*1]
+ add idxq, 32
+ pshufb m1, m4, m0
+ psrlw m0, 4
+ pshufb m3, m4, m0
+ punpcklbw m0, m1, m3
+ punpckhbw m1, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ pshufb m1, m4, m2
+ psrlw m2, 4
+ pshufb m3, m4, m2
+ punpcklbw m0, m1, m3
+ punpckhbw m1, m3
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ add dstq, strideq
+ sub hd, 1
+ jg .w64
+ RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
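+; The prediction is dc + apply_sign((abs(alpha) * abs(ac) + 32) >> 6, alpha * ac),
+; clipped to the pixel range on packing; the IPRED_CFL macro below computes one
+; 8-word vector of it with m0 = dc, m1 = alpha and m2 = abs(alpha) << 9 preloaded.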
+%macro IPRED_CFL 1 ; ac in, unpacked pixels out
+ psignw m3, m%1, m1
+ pabsw m%1, m%1
+ pmulhrsw m%1, m2
+ psignw m%1, m3
+ paddw m%1, m0
+%endmacro
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ movifnidn wd, wm
+ movifnidn hd, hm
+ tzcnt r6d, hd
+ lea t0d, [wq+hq]
+ movd m4, t0d
+ tzcnt t0d, t0d
+ movd m5, t0d
+ LEA t0, ipred_cfl_ssse3_table
+ tzcnt wd, wd
+ movsxd r6, [t0+r6*4]
+ movsxd wq, [t0+wq*4+16]
+ pcmpeqd m3, m3
+ psrlw m4, 1
+ add r6, t0
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h4:
+ movd m0, [tlq-4]
+ pmaddubsw m0, m3
+ jmp wq
+.w4:
+ movd m1, [tlq+1]
+ pmaddubsw m1, m3
+ psubw m0, m4
+ paddw m0, m1
+ pmaddwd m0, m3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw m0, 3 ; dc >>= ctz(width + height);
+ jmp .w4_end
+.w4_mul:
+ punpckhqdq m1, m0, m0
+ paddw m0, m1
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 8
+ cmovz r6d, r2d
+ movd m5, r6d
+ pmulhuw m0, m5
+.w4_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s4:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s4_loop:
+ mova m4, [acq]
+ mova m5, [acq+16]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ movd [dstq+strideq*0], m4
+ pshuflw m4, m4, q1032
+ movd [dstq+strideq*1], m4
+ punpckhqdq m4, m4
+ movd [dstq+strideq*2], m4
+ psrlq m4, 32
+ movd [dstq+r6 ], m4
+ lea dstq, [dstq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .s4_loop
+ RET
+ALIGN function_align
+.h8:
+ movq m0, [tlq-8]
+ pmaddubsw m0, m3
+ jmp wq
+.w8:
+ movq m1, [tlq+1]
+ pmaddubsw m1, m3
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ paddw m0, m1
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w8_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s8:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s8_loop:
+ mova m4, [acq]
+ mova m5, [acq+16]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ movq [dstq ], m4
+ movhps [dstq+strideq ], m4
+ mova m4, [acq+32]
+ mova m5, [acq+48]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ movq [dstq+strideq*2], m4
+ movhps [dstq+r6 ], m4
+ lea dstq, [dstq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .s8_loop
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-16]
+ pmaddubsw m0, m3
+ jmp wq
+.w16:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 8|32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w16_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s16:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s16_loop:
+ mova m4, [acq]
+ mova m5, [acq+16]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ mova [dstq], m4
+ mova m4, [acq+32]
+ mova m5, [acq+48]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ mova [dstq+strideq], m4
+ lea dstq, [dstq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .s16_loop
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-32]
+ pmaddubsw m0, m3
+ mova m2, [tlq-16]
+ pmaddubsw m2, m3
+ paddw m0, m2
+ jmp wq
+.w32:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ movu m2, [tlq+17]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 64|16
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w32_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s32:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s32_loop:
+ mova m4, [acq]
+ mova m5, [acq+16]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ mova [dstq], m4
+ mova m4, [acq+32]
+ mova m5, [acq+48]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ mova [dstq+16], m4
+ add dstq, strideq
+ add acq, 64
+ dec hd
+ jg .s32_loop
+ RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
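+; Same as ipred_cfl, but the DC term is averaged from the left edge only
+; (the h pixels stored just before tlq); the per-width splat/apply code is
+; shared via ipred_cfl_splat_ssse3_table.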
+cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ mov hd, hm ; zero upper half
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movu m0, [tlq]
+ mov t0d, 0x8000
+ movd m3, t0d
+ movd m2, r6d
+ psrld m3, m2
+ LEA t0, ipred_cfl_left_ssse3_table
+ movsxd r6, [t0+r6*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, t0
+ add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h32:
+ movu m1, [tlq+16] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+.h16:
+ pshufd m1, m0, q3232 ; psrlq m1, m0, 16
+ paddw m0, m1
+.h8:
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+.h4:
+ pmaddwd m0, m2
+ pmulhrsw m0, m3
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
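+; Same as ipred_cfl_left, but the DC is averaged from the top row (w pixels at
+; tlq+1); the reduction code is reused by indexing the cfl_left table with the
+; width instead of the height.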
+cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ LEA t0, ipred_cfl_left_ssse3_table
+ tzcnt wd, wm
+ inc tlq
+ movu m0, [tlq]
+ movifnidn hd, hm
+ mov r6d, 0x8000
+ movd m3, r6d
+ movd m2, wd
+ psrld m3, m2
+ movsxd r6, [t0+wq*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, t0
+ add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
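+; No edge pixels are used: the DC term is the fixed mid-grey value 128 (pw_128)
+; and control jumps straight into the shared splat code for the given width.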
+cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ tzcnt wd, wm
+ movifnidn hd, hm
+ LEA r6, ipred_cfl_splat_ssse3_table
+ movsxd wq, [r6+wq*4]
+ movddup m0, [r6-ipred_cfl_splat_ssse3_table+pw_128]
+ add wq, r6
+ movifnidn acq, acmp
+ jmp wq
+
+%macro RELOAD_ACQ_32 1
+ mov acq, ac_bakq ; restore acq
+%endmacro
+
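+; cfl_ac: for each chroma position, store the co-located (subsampled) luma,
+; scaled so that every layout ends up as effectively 8x the luma average
+; (2x the 2x2 sum for 420, 4x the 1x2 sum for 422, 8x the pixel for 444),
+; then subtract the block mean so the AC buffer fed to ipred_cfl is zero-mean.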
+%if ARCH_X86_64
+cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
+DECLARE_REG_TMP 7
+ movddup m2, [pb_2]
+%else
+cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
+DECLARE_REG_TMP 4
+%define ac_bakq acmp
+ mov t0d, 0x02020202
+ movd m2, t0d
+ pshufd m2, m2, q0000
+%endif
+ movifnidn wd, wm
+ mov t0d, hm
+ mov hd, t0d
+ imul t0d, wd
+ movd m5, t0d
+ movifnidn hpadd, hpadm
+%if ARCH_X86_64
+ mov ac_bakq, acq
+%endif
+ shl hpadd, 2
+ sub hd, hpadd
+ pxor m4, m4
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
+%endif
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop:
+ movq m0, [yq]
+ movq m1, [yq+strideq]
+ movhps m0, [yq+strideq*2]
+ movhps m1, [yq+stride3q]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 16
+ sub hd, 2
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg_4_8
+ punpckhqdq m0, m0
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 16
+ sub hpadd, 2
+ jg .w4_hpad_loop
+ jmp .calc_avg_4_8
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ mova m0, [yq+strideq*2]
+ mova m1, [yq+stride3q]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq+16], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 2
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg_4_8
+ jmp .w8_hpad
+.w8_wpad: ; wpadd=1
+ movddup m0, [yq]
+ movddup m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ pshufhw m0, m0, q3333
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 16
+ sub hd, 1
+ jg .w8_wpad
+ test hpadd, hpadd
+ jz .calc_avg_4_8
+.w8_hpad:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 16
+ sub hpadd, 1
+ jg .w8_hpad
+ jmp .calc_avg_4_8
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ mova m6, [yq+16]
+ mova m1, [yq+strideq+16]
+ pmaddubsw m6, m2
+ pmaddubsw m1, m2
+ paddw m6, m1
+ mova [acq+16], m6
+ paddw m4, m6
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg16
+ jmp .w16_hpad_loop
+.w16_wpad:
+ cmp wpadd, 2
+ jl .w16_pad1
+ je .w16_pad2
+.w16_pad3:
+ movddup m0, [yq]
+ movddup m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ pshufhw m0, m0, q3333
+ mova [acq], m0
+ paddw m4, m0
+ mova m6, m0
+ punpckhqdq m6, m0, m0
+ mova [acq+16], m6
+ paddw m4, m6
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_pad3
+ jmp .w16_wpad_done
+.w16_pad2:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ pshufhw m6, m0, q3333
+ punpckhqdq m6, m6
+ mova [acq+16], m6
+ paddw m4, m6
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_pad2
+ jmp .w16_wpad_done
+.w16_pad1:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ movddup m6, [yq+16]
+ movddup m1, [yq+strideq+16]
+ pmaddubsw m6, m2
+ pmaddubsw m1, m2
+ paddw m6, m1
+ pshufhw m6, m6, q3333
+ mova [acq+16], m6
+ paddw m4, m6
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_pad1
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg16
+.w16_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ mova [acq+16], m6
+ paddw m4, m6
+ add acq, 32
+ dec hpadd
+ jg .w16_hpad_loop
+ jmp .calc_avg16
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
+%endif
+.calc_avg_4_8:
+ psrlw m2, 9
+ pmaddwd m4, m2
+ jmp .calc_avg
+.calc_avg16:
+ psrld m0, m4, 16
+ pslld m4, 16
+ psrld m4, 16
+ paddd m4, m0
+.calc_avg:
+ movd szd, m5
+ psrad m5, 1
+ tzcnt r1d, szd
+ paddd m4, m5
+ movd m1, r1d
+ pshufd m0, m4, q2301
+ paddd m0, m4
+ pshufd m4, m0, q1032
+ paddd m0, m4
+ psrad m0, m1 ; sum >>= log2sz;
+ packssdw m0, m0
+ RELOAD_ACQ_32 acq
+.sub_loop:
+ mova m1, [acq]
+ psubw m1, m0 ; ac[x] -= sum;
+ mova [acq], m1
+ add acq, 16
+ sub szd, 8
+ jg .sub_loop
+ RET
+
+%if ARCH_X86_64
+cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
+ movddup m2, [pb_4]
+%else
+cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
+ mov t0d, 0x04040404
+ movd m2, t0d
+ pshufd m2, m2, q0000
+%endif
+ movifnidn wd, wm
+ mov t0d, hm
+ mov hd, t0d
+ imul t0d, wd
+ movd m6, t0d
+ movifnidn hpadd, hpadm
+%if ARCH_X86_64
+ mov ac_bakq, acq
+%endif
+ shl hpadd, 2
+ sub hd, hpadd
+ pxor m4, m4
+ pxor m5, m5
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
+%endif
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop:
+ movq m1, [yq]
+ movhps m1, [yq+strideq]
+ movq m0, [yq+strideq*2]
+ movhps m0, [yq+stride3q]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg_4
+ punpckhqdq m0, m0
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 16
+ sub hpadd, 2
+ jg .w4_hpad_loop
+ jmp .calc_avg_4
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ mova m1, [yq]
+ mova m0, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m4, m0
+ paddw m5, m1
+ mova m1, [yq+strideq*2]
+ mova m0, [yq+stride3q]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ mova [acq+48], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+ jmp .w8_hpad
+.w8_wpad:
+ movddup m1, [yq]
+ pmaddubsw m1, m2
+ pshufhw m1, m1, q3333
+ mova [acq], m1
+ paddw m5, m1
+ movddup m0, [yq+strideq]
+ pmaddubsw m0, m2
+ pshufhw m0, m0, q3333
+ mova [acq+16], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ sub hd, 2
+ jg .w8_wpad
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+.w8_hpad:
+ mova [acq], m0
+ paddw m4, m0
+ mova [acq+16], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 2
+ jg .w8_hpad
+ jmp .calc_avg_8_16
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m1, [yq]
+ mova m0, [yq+16]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m5, m0
+ paddw m5, m1
+ mova m1, [yq+strideq]
+ mova m0, [yq+strideq+16]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ mova [acq+48], m0
+ paddw m4, m0
+ paddw m4, m1
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+ jmp .w16_hpad_loop
+.w16_wpad:
+ cmp wpadd, 2
+ jl .w16_pad1
+ je .w16_pad2
+.w16_pad3:
+ movddup m1, [yq]
+ pmaddubsw m1, m2
+ pshufhw m1, m1, q3333
+ mova [acq], m1
+ paddw m5, m1
+ punpckhqdq m1, m1
+ mova [acq+16], m1
+ paddw m5, m1
+ movddup m1, [yq+strideq]
+ pmaddubsw m1, m2
+ pshufhw m1, m1, q3333
+ mova [acq+32], m1
+ paddw m4, m1
+ punpckhqdq m0, m1, m1
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad3
+ jmp .w16_wpad_done
+.w16_pad2:
+ mova m1, [yq]
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ pshufhw m1, m1, q3333
+ punpckhqdq m1, m1
+ mova [acq+16], m1
+ paddw m5, m1
+ mova m1, [yq+strideq]
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ mova m0, m1
+ pshufhw m0, m0, q3333
+ punpckhqdq m0, m0
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad2
+ jmp .w16_wpad_done
+.w16_pad1:
+ mova m1, [yq]
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ movddup m0, [yq+16]
+ pmaddubsw m0, m2
+ pshufhw m0, m0, q3333
+ mova [acq+16], m0
+ paddw m5, m0
+ mova m1, [yq+strideq]
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ movddup m0, [yq+strideq+16]
+ pmaddubsw m0, m2
+ pshufhw m0, m0, q3333
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad1
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+.w16_hpad_loop:
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m4, m1
+ paddw m5, m0
+ mova [acq+32], m1
+ mova [acq+48], m0
+ paddw m4, m1
+ paddw m5, m0
+ add acq, 64
+ sub hpadd, 2
+ jg .w16_hpad_loop
+ jmp .calc_avg_8_16
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
+%endif
+.calc_avg_4:
+ psrlw m2, 10
+ pmaddwd m5, m2
+ pmaddwd m0, m4, m2
+ jmp .calc_avg
+.calc_avg_8_16:
+ mova m0, m5
+ psrld m5, 16
+ pslld m0, 16
+ psrld m0, 16
+ paddd m5, m0
+ mova m0, m4
+ psrld m0, 16
+ pslld m4, 16
+ psrld m4, 16
+ paddd m0, m4
+.calc_avg:
+ paddd m5, m0
+ movd szd, m6
+ psrad m6, 1
+ tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height);
+ paddd m5, m6
+ movd m1, r1d
+ pshufd m0, m5, q2301
+ paddd m0, m5
+ pshufd m5, m0, q1032
+ paddd m0, m5
+ psrad m0, m1 ; sum >>= log2sz;
+ packssdw m0, m0
+ RELOAD_ACQ_32 acq ; ac = ac_orig
+.sub_loop:
+ mova m1, [acq]
+ psubw m1, m0
+ mova [acq], m1
+ add acq, 16
+ sub szd, 8
+ jg .sub_loop
+ RET
+
+%if ARCH_X86_64
+cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak
+ movddup m2, [pb_4]
+%else
+cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h
+%define ac_bakq [rsp+16*4]
+ mov t0d, 0x04040404
+ movd m2, t0d
+ pshufd m2, m2, q0000
+%endif
+ movifnidn wd, wm
+ movifnidn hpadd, hpadm
+ movd m0, hpadd
+ mov t0d, hm
+ mov hd, t0d
+ imul t0d, wd
+ movd m6, t0d
+ movd hpadd, m0
+ mov ac_bakq, acq
+ shl hpadd, 2
+ sub hd, hpadd
+ pxor m5, m5
+ pxor m4, m4
+ cmp wd, 16
+ jg .w32
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
+%endif
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop:
+ movd m1, [yq]
+ movd m3, [yq+strideq]
+ punpckldq m1, m3
+ punpcklbw m1, m1
+ movd m0, [yq+strideq*2]
+ movd m3, [yq+stride3q]
+ punpckldq m0, m3
+ punpcklbw m0, m0
+ pmaddubsw m1, m2
+ pmaddubsw m0, m2
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m5, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg_4
+ punpckhqdq m0, m0
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m5, m0
+ add acq, 16
+ sub hpadd, 2
+ jg .w4_hpad_loop
+.calc_avg_4:
+ psrlw m2, 10
+ pmaddwd m5, m2
+ jmp .calc_avg
+
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ movq m1, [yq]
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ movq m0, [yq+strideq]
+ punpcklbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0
+ movq m1, [yq+strideq*2]
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ movq m0, [yq+stride3q]
+ punpcklbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+ jmp .w8_hpad
+.w8_wpad:
+ movd m1, [yq]
+ punpcklbw m1, m1
+ punpcklqdq m1, m1
+ pmaddubsw m1, m2
+ pshufhw m1, m1, q3333
+ mova [acq], m1
+ paddw m5, m1
+ movd m0, [yq+strideq]
+ punpcklbw m0, m0
+ punpcklqdq m0, m0
+ pmaddubsw m0, m2
+ pshufhw m0, m0, q3333
+ mova [acq+16], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ sub hd, 2
+ jg .w8_wpad
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+.w8_hpad:
+ mova [acq], m0
+ paddw m5, m0
+ mova [acq+16], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 2
+ jg .w8_hpad
+ jmp .calc_avg_8_16
+
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0
+ mova m0, [yq+strideq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+ jmp .w16_hpad_loop
+.w16_wpad:
+ cmp wpadd, 2
+ jl .w16_pad1
+ je .w16_pad2
+.w16_pad3:
+ movd m1, [yq]
+ punpcklbw m1, m1
+ punpcklqdq m1, m1
+ pshufhw m1, m1, q3333
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ punpckhqdq m1, m1
+ mova [acq+16], m1
+ paddw m5, m1
+ movd m1, [yq+strideq]
+ punpcklbw m1, m1
+ punpcklqdq m1, m1
+ pshufhw m1, m1, q3333
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ punpckhqdq m0, m1, m1
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad3
+ jmp .w16_wpad_done
+.w16_pad2:
+ movq m1, [yq]
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ pshufhw m1, m1, q3333
+ punpckhqdq m1, m1
+ mova [acq+16], m1
+ paddw m5, m1
+ movq m1, [yq+strideq]
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ mova m0, m1
+ pshufhw m0, m0, q3333
+ punpckhqdq m0, m0
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad2
+ jmp .w16_wpad_done
+.w16_pad1:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ punpckhbw m0, m0
+ punpcklqdq m0, m0
+ pshufhw m0, m0, q3333
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0
+ mova m0, [yq+strideq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ punpckhbw m0, m0
+ punpcklqdq m0, m0
+ pshufhw m0, m0, q3333
+ pmaddubsw m0, m2
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad1
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+.w16_hpad_loop:
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m4, m1
+ paddw m5, m0
+ mova [acq+32], m1
+ mova [acq+48], m0
+ paddw m4, m1
+ paddw m5, m0
+ add acq, 64
+ sub hpadd, 2
+ jg .w16_hpad_loop
+.calc_avg_8_16:
+ mova m0, m5
+ psrld m5, 16
+ pslld m0, 16
+ psrld m0, 16
+ paddd m5, m0
+ mova m0, m4
+ psrld m0, 16
+ pslld m4, 16
+ psrld m4, 16
+ paddd m0, m4
+ paddd m5, m0
+ jmp .calc_avg
+
+.w32:
+ pxor m0, m0
+ mova [rsp ], m0
+ mova [rsp+16], m0
+ mova [rsp+32], m0
+ mova [rsp+48], m0
+ test wpadd, wpadd
+ jnz .w32_wpad
+.w32_loop:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m4, [yq+16]
+ mova m3, m4
+ punpcklbw m3, m3
+ pmaddubsw m3, m2
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ punpckhbw m4, m4
+ pmaddubsw m4, m2
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_loop
+ test hpadd, hpadd
+ jz .calc_avg_32
+ jmp .w32_hpad_loop
+.w32_wpad:
+ cmp wpadd, 2
+ jl .w32_pad1
+ je .w32_pad2
+ cmp wpadd, 4
+ jl .w32_pad3
+ je .w32_pad4
+ cmp wpadd, 6
+ jl .w32_pad5
+ je .w32_pad6
+.w32_pad7:
+ movd m1, [yq]
+ punpcklbw m1, m1
+ punpcklqdq m1, m1
+ pshufhw m1, m1, q3333
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ mova m0, m1
+ punpckhqdq m0, m0
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, m0
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad7
+ jmp .w32_wpad_done
+.w32_pad6:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ pshufhw m0, m1, q3333
+ punpckhqdq m0, m0
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, m0
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad6
+ jmp .w32_wpad_done
+.w32_pad5:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova m5, [rsp]
+ paddw m5, m1
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ punpcklqdq m0, m0
+ pshufhw m0, m0, q3333
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, m0
+ punpckhqdq m3, m3
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad5
+ jmp .w32_wpad_done
+.w32_pad4:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, m0
+ pshufhw m3, m3, q3333
+ punpckhqdq m3, m3
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad4
+ jmp .w32_wpad_done
+.w32_pad3:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ movd m3, [yq+16]
+ punpcklbw m3, m3
+ punpcklqdq m3, m3
+ pshufhw m3, m3, q3333
+ pmaddubsw m3, m2
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ punpckhqdq m4, m4
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad3
+ jmp .w32_wpad_done
+.w32_pad2:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, [yq+16]
+ punpcklbw m3, m3
+ pmaddubsw m3, m2
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ pshufhw m4, m3, q3333
+ punpckhqdq m4, m4
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad2
+ jmp .w32_wpad_done
+.w32_pad1:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m4, [yq+16]
+ mova m3, m4
+ punpcklbw m3, m3
+ pmaddubsw m3, m2
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ punpckhbw m4, m4
+ punpcklqdq m4, m4
+ pshufhw m4, m4, q3333
+ pmaddubsw m4, m2
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad1
+.w32_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg_32
+.w32_hpad_loop:
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova [acq+32], m3
+ mova [acq+48], m4
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ add acq, 64
+ sub hpadd, 1
+ jg .w32_hpad_loop
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
+%endif
+
+.calc_avg_32:
+ mova m5, [rsp]
+ mova m0, m5
+ psrld m5, 16
+ pslld m0, 16
+ psrld m0, 16
+ paddd m5, m0
+ mova m0, [rsp+16]
+ mova m3, m0
+ psrld m0, 16
+ pslld m3, 16
+ psrld m3, 16
+ paddd m0, m3
+ paddd m5, m0
+ mova m0, [rsp+32]
+ mova m3, m0
+ psrld m0, 16
+ pslld m3, 16
+ psrld m3, 16
+ paddd m0, m3
+ mova m1, [rsp+48]
+ mova m3, m1
+ psrld m1, 16
+ pslld m3, 16
+ psrld m3, 16
+ paddd m1, m3
+ paddd m1, m0
+ paddd m5, m1
+.calc_avg:
+ movd szd, m6
+ psrad m6, 1
+ tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height);
+ paddd m5, m6
+ movd m1, r1d
+ pshufd m0, m5, q2301
+ paddd m0, m5
+ pshufd m5, m0, q1032
+ paddd m0, m5
+ psrad m0, m1 ; sum >>= log2sz;
+ packssdw m0, m0
+ RELOAD_ACQ_32 acq ; ac = ac_orig
+.sub_loop:
+ mova m1, [acq]
+ psubw m1, m0
+ mova [acq], m1
+ add acq, 16
+ sub szd, 8
+ jg .sub_loop
+ RET
+
+; %1 simd register that hold the mask and will hold the result
+; %2 simd register that holds the "true" values
+; %3 location of the "false" values (simd register/memory)
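+; i.e. %1 = (%1 & %2) | (~%1 & %3)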
+%macro BLEND 3 ; mask, true, false
+ pand %2, %1
+ pandn %1, %3
+ por %1, %2
+%endmacro
+
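+; Paeth predictor: pick left, top or topleft, whichever is closest to
+; left + top - topleft (ties prefer left, then top). m3 holds the left pixels,
+; m5 the top-left pixel, m%1 the top row and m%2 the precomputed |top - topleft|.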
+%macro PAETH 2 ; top, ldiff
+ pavgb m1, m%1, m3
+ pxor m0, m%1, m3
+ pand m0, m4
+ psubusb m2, m5, m1
+ psubb m1, m0
+ psubusb m1, m5
+ por m1, m2
+ paddusb m1, m1
+ por m1, m0 ; min(tldiff, 255)
+ psubusb m2, m5, m3
+ psubusb m0, m3, m5
+ por m2, m0 ; tdiff
+%ifnum %2
+ pminub m2, m%2
+ pcmpeqb m0, m%2, m2 ; ldiff <= tdiff
+%else
+ mova m0, %2
+ pminub m2, m0
+ pcmpeqb m0, m2
+%endif
+ pminub m1, m2
+ pcmpeqb m1, m2 ; ldiff <= tldiff && tdiff <= tldiff
+ mova m2, m3
+ BLEND m0, m2, m%1
+ BLEND m1, m0, m5
+%endmacro
+
+cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h
+%define base r5-ipred_paeth_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ pxor m0, m0
+ movd m5, [tlq]
+ pshufb m5, m0
+ LEA r5, ipred_paeth_ssse3_table
+ movsxd wq, [r5+wq*4]
+ movddup m4, [base+ipred_paeth_shuf]
+ add wq, r5
+ jmp wq
+.w4:
+ movd m6, [tlq+1] ; top
+ pshufd m6, m6, q0000
+ lea r3, [strideq*3]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0 ; ldiff
+.w4_loop:
+ sub tlq, 4
+ movd m3, [tlq]
+ mova m1, [base+ipred_h_shuf]
+ pshufb m3, m1 ; left
+ PAETH 6, 7
+ movd [dstq ], m1
+ pshuflw m0, m1, q1032
+ movd [dstq+strideq ], m0
+ punpckhqdq m1, m1
+ movd [dstq+strideq*2], m1
+ psrlq m1, 32
+ movd [dstq+r3 ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ movddup m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w8_loop:
+ sub tlq, 2
+ movd m3, [tlq]
+ pshufb m3, [base+ipred_paeth_shuf]
+ PAETH 6, 7
+ movq [dstq ], m1
+ movhps [dstq+strideq], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ movu m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w16_loop:
+ sub tlq, 1
+ movd m3, [tlq]
+ pxor m1, m1
+ pshufb m3, m1
+ PAETH 6, 7
+ mova [dstq], m1
+ add dstq, strideq
+ sub hd, 1
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ movu m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp ], m6
+ mova [rsp+16], m7
+ movu m6, [tlq+17]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp+32], m6
+.w32_loop:
+ dec tlq
+ movd m3, [tlq]
+ pxor m1, m1
+ pshufb m3, m1
+ mova m6, [rsp]
+ PAETH 6, [rsp+16]
+ mova [dstq ], m1
+ mova m6, [rsp+32]
+ PAETH 6, 7
+ mova [dstq+16], m1
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ movu m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp ], m6
+ mova [rsp+16], m7
+ movu m6, [tlq+17]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp+32], m6
+ mova [rsp+48], m7
+ movu m6, [tlq+33]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp+64], m6
+ mova [rsp+80], m7
+ movu m6, [tlq+49]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp+96], m6
+.w64_loop:
+ dec tlq
+ movd m3, [tlq]
+ pxor m1, m1
+ pshufb m3, m1
+ mova m6, [rsp]
+ PAETH 6, [rsp+16]
+ mova [dstq ], m1
+ mova m6, [rsp+32]
+ PAETH 6, [rsp+48]
+ mova [dstq+16], m1
+ mova m6, [rsp+64]
+ PAETH 6, [rsp+80]
+ mova [dstq+32], m1
+ mova m6, [rsp+96]
+ PAETH 6, 7
+ mova [dstq+48], m1
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+
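+; filter_intra: each invocation produces one 4x2 block; the 7 source pixels
+; p0..p6 selected by the shuffle are weighted with the four tap rows in m2-m5
+; and rounded as (sum + 8) >> 4 before packing to bytes.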
+%macro FILTER 4 ;dst, src, tmp, shuf
+%ifnum %4
+ pshufb m%2, m%4
+%else
+ pshufb m%2, %4
+%endif
+ pshufd m%1, m%2, q0000 ;p0 p1
+ pmaddubsw m%1, m2
+ pshufd m%3, m%2, q1111 ;p2 p3
+ pmaddubsw m%3, m3
+ paddw m%1, [base+pw_8]
+ paddw m%1, m%3
+ pshufd m%3, m%2, q2222 ;p4 p5
+ pmaddubsw m%3, m4
+ paddw m%1, m%3
+ pshufd m%3, m%2, q3333 ;p6 __
+ pmaddubsw m%3, m5
+ paddw m%1, m%3
+ psraw m%1, 4
+ packuswb m%1, m%1
+%endmacro
+
+cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter
+%define base r6-$$
+ LEA r6, $$
+ tzcnt wd, wm
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ shl filterd, 6
+ lea filterq, [base+filter_intra_taps+filterq]
+ movq m0, [tlq-3] ;_ 6 5 0 1 2 3 4
+ movsxd wq, [base+ipred_filter_ssse3_table+wq*4]
+ mova m2, [filterq+16*0]
+ mova m3, [filterq+16*1]
+ mova m4, [filterq+16*2]
+ mova m5, [filterq+16*3]
+ lea wq, [base+ipred_filter_ssse3_table+wq]
+ mov hd, hm
+ jmp wq
+.w4:
+ mova m1, [base+filter_shuf1]
+ sub tlq, 3
+ sub tlq, hq
+ jmp .w4_loop_start
+.w4_loop:
+ movd m0, [tlq+hq]
+ punpckldq m0, m6
+ lea dstq, [dstq+strideq*2]
+.w4_loop_start:
+ FILTER 6, 0, 7, 1
+ movd [dstq+strideq*0], m6
+ pshuflw m6, m6, q1032
+ movd [dstq+strideq*1], m6
+ sub hd, 2
+ jg .w4_loop
+ RET
+
+ALIGN function_align
+.w8:
+ movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4
+ sub tlq, 5
+ sub tlq, hq
+
+.w8_loop:
+ FILTER 7, 0, 1, [base+filter_shuf1]
+ punpcklqdq m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ FILTER 0, 6, 1, [base+filter_shuf2]
+
+ punpckldq m6, m7, m0
+ movq [dstq+strideq*0], m6
+ punpckhqdq m6, m6
+ movq [dstq+strideq*1], m6
+
+ movd m0, [tlq+hq] ;_ 6 5 0
+ punpckldq m0, m6 ;_ 6 5 0 1 2 3 4
+
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+
+ALIGN function_align
+.w16:
+ movu m6, [tlq+1] ;top row
+ sub tlq, 5
+ sub tlq, hq
+
+.w16_loop:
+ FILTER 7, 0, 1, [base+filter_shuf1]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+4+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+
+ FILTER 7, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+8+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ movd [dstq+12+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+ mova [dstq+strideq*1], m6
+
+ movd m0, [tlq+hq] ;_ 6 5 0
+ punpckldq m0, m6 ;_ 6 5 0 1 2 3 4
+
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+
+ALIGN function_align
+.w32:
+ movu m6, [tlq+1] ;top row
+ lea filterq, [tlq+17]
+ sub tlq, 5
+ sub tlq, hq
+
+.w32_loop:
+ FILTER 7, 0, 1, [base+filter_shuf1]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+4+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+
+ FILTER 7, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+8+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ movu m1, [filterq]
+ punpckldq m0, m7, m1 ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _
+ punpcklqdq m0, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+12+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+ mova [dstq+strideq*1], m6
+
+ mova m6, m1
+
+ FILTER 7, 0, 6, [base+filter_shuf2]
+ punpcklqdq m0, m1, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+16+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m1, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+20+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+
+ FILTER 7, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+24+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ movd [dstq+28+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+ mova [dstq+16+strideq*1], m6
+
+ mova m6, [dstq+strideq*1]
+ movd m0, [tlq+hq] ;_ 6 5 0
+ punpckldq m0, m6 ;_ 6 5 0 1 2 3 4
+ lea filterq, [dstq+16+strideq*1]
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
diff --git a/third_party/dav1d/src/x86/itx.h b/third_party/dav1d/src/x86/itx.h
new file mode 100644
index 0000000000..346fde7d90
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx.h
@@ -0,0 +1,367 @@
+/*
+ * Copyright © 2018-2023, VideoLAN and dav1d authors
+ * Copyright © 2018-2023, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+#define BF_BPC(x, bits, suffix) x##_##bits##bpc_##suffix
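+/* e.g. BF_BPC(dav1d_inv_txfm_add_dct_dct_8x8, 10, avx2)
+ *  expands to dav1d_inv_txfm_add_dct_dct_8x8_10bpc_avx2 */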
+
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
+#define decl_itx_fns(ext) \
+decl_itx17_fns( 4, 4, ext); \
+decl_itx16_fns( 4, 8, ext); \
+decl_itx16_fns( 4, 16, ext); \
+decl_itx16_fns( 8, 4, ext); \
+decl_itx16_fns( 8, 8, ext); \
+decl_itx16_fns( 8, 16, ext); \
+decl_itx2_fns ( 8, 32, ext); \
+decl_itx16_fns(16, 4, ext); \
+decl_itx16_fns(16, 8, ext); \
+decl_itx12_fns(16, 16, ext); \
+decl_itx2_fns (16, 32, ext); \
+decl_itx2_fns (32, 8, ext); \
+decl_itx2_fns (32, 16, ext); \
+decl_itx2_fns (32, 32, ext); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, ext))
+
+
+#define decl_itx2_bpc_fns(w, h, bpc, opt) \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_identity_##w##x##h, bpc, opt))
+
+#define decl_itx12_bpc_fns(w, h, bpc, opt) \
+decl_itx2_bpc_fns(w, h, bpc, opt); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_dct_##w##x##h, bpc, opt))
+
+#define decl_itx16_bpc_fns(w, h, bpc, opt) \
+decl_itx12_bpc_fns(w, h, bpc, opt); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, bpc, opt))
+
+#define decl_itx_bpc_fns(bpc, ext) \
+decl_itx16_bpc_fns( 4, 4, bpc, ext); \
+decl_itx16_bpc_fns( 4, 8, bpc, ext); \
+decl_itx16_bpc_fns( 4, 16, bpc, ext); \
+decl_itx16_bpc_fns( 8, 4, bpc, ext); \
+decl_itx16_bpc_fns( 8, 8, bpc, ext); \
+decl_itx16_bpc_fns( 8, 16, bpc, ext); \
+decl_itx2_bpc_fns ( 8, 32, bpc, ext); \
+decl_itx16_bpc_fns(16, 4, bpc, ext); \
+decl_itx16_bpc_fns(16, 8, bpc, ext); \
+decl_itx12_bpc_fns(16, 16, bpc, ext); \
+decl_itx2_bpc_fns (16, 32, bpc, ext); \
+decl_itx2_bpc_fns (32, 8, bpc, ext); \
+decl_itx2_bpc_fns (32, 16, bpc, ext); \
+decl_itx2_bpc_fns (32, 32, bpc, ext); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_16x64, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_32x64, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x16, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x32, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x64, bpc, ext))
+
+decl_itx_fns(avx512icl);
+decl_itx_bpc_fns(10, avx512icl);
+decl_itx_fns(avx2);
+decl_itx_bpc_fns(10, avx2);
+decl_itx_bpc_fns(12, avx2);
+decl_itx_fns(sse4);
+decl_itx_fns(ssse3);
+decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2);
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2));
+
+static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
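+/* e.g. assign_itx16_fn(R, 4, 8, ssse3) below expands (among others) into
+ * c->itxfm_add[RTX_4X8][DCT_DCT] = BF(dav1d_inv_txfm_add_dct_dct_4x8, ssse3),
+ * with BF (defined elsewhere) appending the compile-time bitdepth suffix. */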
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+ assign_itx1_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+ assign_itx2_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
+ assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+ assign_itx12_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+ assign_itx16_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
+
+
+#define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext)
+
+#define assign_itx1_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx_bpc_fn(pfx, w, h, dct_dct, DCT_DCT, bpc, ext)
+
+#define assign_itx2_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx1_bpc_fn(pfx, w, h, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_identity, IDTX, bpc, ext)
+
+#define assign_itx12_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx2_bpc_fn(pfx, w, h, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, dct_adst, ADST_DCT, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, dct_identity, H_DCT, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_dct, DCT_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_adst, ADST_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_dct, V_DCT, bpc, ext)
+
+#define assign_itx16_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx12_bpc_fn(pfx, w, h, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_identity, H_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_identity, H_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_adst, V_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_flipadst, V_FLIPADST, bpc, ext)
+
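+    // CPU flags are tested in increasing ISA order; each test returns early when
+    // that level is unavailable, so assignments made by a later (faster) level
+    // overwrite the earlier fallbacks and the best supported version is kept.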
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+ assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, sse2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+#if BITDEPTH == 8
+ assign_itx16_fn(, 4, 4, ssse3);
+ assign_itx16_fn(R, 4, 8, ssse3);
+ assign_itx16_fn(R, 8, 4, ssse3);
+ assign_itx16_fn(, 8, 8, ssse3);
+ assign_itx16_fn(R, 4, 16, ssse3);
+ assign_itx16_fn(R, 16, 4, ssse3);
+ assign_itx16_fn(R, 8, 16, ssse3);
+ assign_itx16_fn(R, 16, 8, ssse3);
+ assign_itx12_fn(, 16, 16, ssse3);
+ assign_itx2_fn (R, 8, 32, ssse3);
+ assign_itx2_fn (R, 32, 8, ssse3);
+ assign_itx2_fn (R, 16, 32, ssse3);
+ assign_itx2_fn (R, 32, 16, ssse3);
+ assign_itx2_fn (, 32, 32, ssse3);
+ assign_itx1_fn (R, 16, 64, ssse3);
+ assign_itx1_fn (R, 32, 64, ssse3);
+ assign_itx1_fn (R, 64, 16, ssse3);
+ assign_itx1_fn (R, 64, 32, ssse3);
+    assign_itx1_fn (, 64, 64, ssse3);
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
+
+#if BITDEPTH == 16
+ if (bpc == 10) {
+ assign_itx16_fn(, 4, 4, sse4);
+ assign_itx16_fn(R, 4, 8, sse4);
+ assign_itx16_fn(R, 4, 16, sse4);
+ assign_itx16_fn(R, 8, 4, sse4);
+ assign_itx16_fn(, 8, 8, sse4);
+ assign_itx16_fn(R, 8, 16, sse4);
+ assign_itx16_fn(R, 16, 4, sse4);
+ assign_itx16_fn(R, 16, 8, sse4);
+ assign_itx12_fn(, 16, 16, sse4);
+ assign_itx2_fn (R, 8, 32, sse4);
+ assign_itx2_fn (R, 32, 8, sse4);
+ assign_itx2_fn (R, 16, 32, sse4);
+ assign_itx2_fn (R, 32, 16, sse4);
+ assign_itx2_fn (, 32, 32, sse4);
+ assign_itx1_fn (R, 16, 64, sse4);
+ assign_itx1_fn (R, 32, 64, sse4);
+ assign_itx1_fn (R, 64, 16, sse4);
+ assign_itx1_fn (R, 64, 32, sse4);
+ assign_itx1_fn (, 64, 64, sse4);
+ }
+#endif
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2);
+
+#if BITDEPTH == 8
+ assign_itx16_fn( , 4, 4, avx2);
+ assign_itx16_fn(R, 4, 8, avx2);
+ assign_itx16_fn(R, 4, 16, avx2);
+ assign_itx16_fn(R, 8, 4, avx2);
+ assign_itx16_fn( , 8, 8, avx2);
+ assign_itx16_fn(R, 8, 16, avx2);
+ assign_itx2_fn (R, 8, 32, avx2);
+ assign_itx16_fn(R, 16, 4, avx2);
+ assign_itx16_fn(R, 16, 8, avx2);
+ assign_itx12_fn( , 16, 16, avx2);
+ assign_itx2_fn (R, 16, 32, avx2);
+ assign_itx1_fn (R, 16, 64, avx2);
+ assign_itx2_fn (R, 32, 8, avx2);
+ assign_itx2_fn (R, 32, 16, avx2);
+ assign_itx2_fn ( , 32, 32, avx2);
+ assign_itx1_fn (R, 32, 64, avx2);
+ assign_itx1_fn (R, 64, 16, avx2);
+ assign_itx1_fn (R, 64, 32, avx2);
+ assign_itx1_fn ( , 64, 64, avx2);
+#else
+ if (bpc == 10) {
+ assign_itx16_bpc_fn( , 4, 4, 10, avx2);
+ assign_itx16_bpc_fn(R, 4, 8, 10, avx2);
+ assign_itx16_bpc_fn(R, 4, 16, 10, avx2);
+ assign_itx16_bpc_fn(R, 8, 4, 10, avx2);
+ assign_itx16_bpc_fn( , 8, 8, 10, avx2);
+ assign_itx16_bpc_fn(R, 8, 16, 10, avx2);
+ assign_itx2_bpc_fn (R, 8, 32, 10, avx2);
+ assign_itx16_bpc_fn(R, 16, 4, 10, avx2);
+ assign_itx16_bpc_fn(R, 16, 8, 10, avx2);
+ assign_itx12_bpc_fn( , 16, 16, 10, avx2);
+ assign_itx2_bpc_fn (R, 16, 32, 10, avx2);
+ assign_itx1_bpc_fn (R, 16, 64, 10, avx2);
+ assign_itx2_bpc_fn (R, 32, 8, 10, avx2);
+ assign_itx2_bpc_fn (R, 32, 16, 10, avx2);
+ assign_itx2_bpc_fn ( , 32, 32, 10, avx2);
+ assign_itx1_bpc_fn (R, 32, 64, 10, avx2);
+ assign_itx1_bpc_fn (R, 64, 16, 10, avx2);
+ assign_itx1_bpc_fn (R, 64, 32, 10, avx2);
+ assign_itx1_bpc_fn ( , 64, 64, 10, avx2);
+ } else {
+ assign_itx16_bpc_fn( , 4, 4, 12, avx2);
+ assign_itx16_bpc_fn(R, 4, 8, 12, avx2);
+ assign_itx16_bpc_fn(R, 4, 16, 12, avx2);
+ assign_itx16_bpc_fn(R, 8, 4, 12, avx2);
+ assign_itx16_bpc_fn( , 8, 8, 12, avx2);
+ assign_itx16_bpc_fn(R, 8, 16, 12, avx2);
+ assign_itx2_bpc_fn (R, 8, 32, 12, avx2);
+ assign_itx16_bpc_fn(R, 16, 4, 12, avx2);
+ assign_itx16_bpc_fn(R, 16, 8, 12, avx2);
+ assign_itx12_bpc_fn( , 16, 16, 12, avx2);
+ assign_itx2_bpc_fn (R, 32, 8, 12, avx2);
+ assign_itx_bpc_fn(R, 16, 32, identity_identity, IDTX, 12, avx2);
+ assign_itx_bpc_fn(R, 32, 16, identity_identity, IDTX, 12, avx2);
+ assign_itx_bpc_fn( , 32, 32, identity_identity, IDTX, 12, avx2);
+ }
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+#if BITDEPTH == 8
+ assign_itx16_fn( , 4, 4, avx512icl); // no wht
+ assign_itx16_fn(R, 4, 8, avx512icl);
+ assign_itx16_fn(R, 4, 16, avx512icl);
+ assign_itx16_fn(R, 8, 4, avx512icl);
+ assign_itx16_fn( , 8, 8, avx512icl);
+ assign_itx16_fn(R, 8, 16, avx512icl);
+ assign_itx2_fn (R, 8, 32, avx512icl);
+ assign_itx16_fn(R, 16, 4, avx512icl);
+ assign_itx16_fn(R, 16, 8, avx512icl);
+ assign_itx12_fn( , 16, 16, avx512icl);
+ assign_itx2_fn (R, 16, 32, avx512icl);
+ assign_itx1_fn (R, 16, 64, avx512icl);
+ assign_itx2_fn (R, 32, 8, avx512icl);
+ assign_itx2_fn (R, 32, 16, avx512icl);
+ assign_itx2_fn ( , 32, 32, avx512icl);
+ assign_itx1_fn (R, 32, 64, avx512icl);
+ assign_itx1_fn (R, 64, 16, avx512icl);
+ assign_itx1_fn (R, 64, 32, avx512icl);
+ assign_itx1_fn ( , 64, 64, avx512icl);
+#else
+ if (bpc == 10) {
+ assign_itx16_bpc_fn( , 8, 8, 10, avx512icl);
+ assign_itx16_bpc_fn(R, 8, 16, 10, avx512icl);
+ assign_itx2_bpc_fn (R, 8, 32, 10, avx512icl);
+ assign_itx16_bpc_fn(R, 16, 8, 10, avx512icl);
+ assign_itx12_bpc_fn( , 16, 16, 10, avx512icl);
+ assign_itx2_bpc_fn (R, 16, 32, 10, avx512icl);
+ assign_itx2_bpc_fn (R, 32, 8, 10, avx512icl);
+ assign_itx2_bpc_fn (R, 32, 16, 10, avx512icl);
+ assign_itx2_bpc_fn ( , 32, 32, 10, avx512icl);
+ assign_itx1_bpc_fn (R, 16, 64, 10, avx512icl);
+ assign_itx1_bpc_fn (R, 32, 64, 10, avx512icl);
+ assign_itx1_bpc_fn (R, 64, 16, 10, avx512icl);
+ assign_itx1_bpc_fn (R, 64, 32, 10, avx512icl);
+ assign_itx1_bpc_fn ( , 64, 64, 10, avx512icl);
+ }
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/x86/itx16_avx2.asm b/third_party/dav1d/src/x86/itx16_avx2.asm
new file mode 100644
index 0000000000..0da970a1c6
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx16_avx2.asm
@@ -0,0 +1,8599 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; Copyright © 2021, Matthias Dressel
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6
+ dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7
+idct4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7
+idct4_12_shuf2: dd 2, 0, 6, 4, 3, 1, 7, 5
+iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
+idct16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6
+iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5
+pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048
+idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11
+idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
+
+%macro COEF_PAIR 2-3 0
+pd_%1_%2: dd %1, %1, %2, %2
+%define pd_%1 (pd_%1_%2 + 4*0)
+%define pd_%2 (pd_%1_%2 + 4*2)
+%if %3
+dd -%2, -%2
+%define pd_%2_m%2 pd_%2
+%endif
+%endmacro
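+; e.g. "COEF_PAIR 2896, 3784, 1" emits dd 2896, 2896, 3784, 3784, -3784, -3784
+; and defines pd_2896, pd_3784 (with pd_3784_m3784 aliasing pd_3784), so one
+; vbroadcasti128 load yields a packed coefficient pair per 128-bit lane.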
+
+COEF_PAIR 201, 995
+COEF_PAIR 401, 1931
+COEF_PAIR 799, 3406
+COEF_PAIR 1380, 601
+COEF_PAIR 1751, 2440
+COEF_PAIR 2598, 1189
+COEF_PAIR 2751, 2106
+COEF_PAIR 2896, 1567, 1
+COEF_PAIR 2896, 3784, 1
+COEF_PAIR 3035, 3513
+COEF_PAIR 3166, 3920
+COEF_PAIR 3703, 3290
+COEF_PAIR 3857, 4052
+COEF_PAIR 4017, 2276
+COEF_PAIR 4076, 3612
+COEF_PAIR 4091, 3973
+
+pd_8: dd 8
+pd_m601: dd -601
+pd_m1189: dd -1189
+pd_m1380: dd -1380
+pd_m2106: dd -2106
+pd_m2598: dd -2598
+pd_m2751: dd -2751
+pd_m3344: dd -3344
+pd_1024: dd 1024
+pd_1321: dd 1321
+pd_1448: dd 1448
+pd_1697: dd 1697
+pd_2482: dd 2482
+pd_3072: dd 3072 ; 1024 + 2048
+pd_3803: dd 3803
+pd_5119: dd 5119 ; 1024 + 4096 - 1
+pd_5120: dd 5120 ; 1024 + 4096
+pd_5793: dd 5793
+pd_6144: dd 6144 ; 2048 + 4096
+pd_17408: dd 17408 ; 1024 + 16384
+
+pixel_10bpc_max: times 2 dw 0x03ff
+pixel_12bpc_max: times 2 dw 0x0fff
+dconly_10bpc: times 2 dw 0x7c00
+dconly_12bpc: times 2 dw 0x7000
+clip_18b_min: dd -0x20000
+clip_18b_max: dd 0x1ffff
+clip_20b_min: dd -0x80000
+clip_20b_max: dd 0x7ffff
+
+const idct64_mul_16bpc
+dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
+dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
+dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
+dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406
+
+cextern deint_shuf
+cextern idct64_mul
+cextern pw_1697x8
+cextern pw_1697x16
+cextern pw_1567_3784
+cextern pw_m1567_m3784
+cextern pw_m3784_1567
+cextern pw_2896_2896
+cextern pw_m2896_2896
+cextern pw_5
+cextern pw_2048
+cextern pw_4096
+cextern pw_8192
+cextern pw_16384
+cextern pw_2896x8
+cextern pd_2048
+
+cextern idct_4x8_internal_8bpc_avx2.main
+cextern idct_4x16_internal_8bpc_avx2.main
+cextern idct_8x8_internal_8bpc_avx2.main
+cextern idct_8x16_internal_8bpc_avx2.main
+cextern idct_16x4_internal_8bpc_avx2.main
+cextern idct_16x8_internal_8bpc_avx2.main
+cextern idct_16x16_internal_8bpc_avx2.main
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast
+cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf
+cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast
+cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1
+cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal
+
+cextern iadst_4x4_internal_8bpc_avx2.main
+cextern iadst_4x8_internal_8bpc_avx2.main_pass2
+cextern iadst_4x16_internal_8bpc_avx2.main2
+cextern iadst_8x4_internal_8bpc_avx2.main
+cextern iadst_8x8_internal_8bpc_avx2.main_pass2
+cextern iadst_8x16_internal_8bpc_avx2.main
+cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end
+cextern iadst_16x4_internal_8bpc_avx2.main
+cextern iadst_16x8_internal_8bpc_avx2.main
+cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end
+cextern iadst_16x16_internal_8bpc_avx2.main
+cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end
+
+SECTION .text
+
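+; m(x) expands to the mangled (prefixed and cpu-suffixed) symbol name for x,
+; used for calls and jumps between the transform functions in this file and
+; the external 8bpc AVX2 routines declared above.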
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
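+; assemble %1 with 128-bit (xmm) register semantics, then switch back to ymm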
+%macro WRAP_XMM 1+
+ INIT_XMM cpuname
+ %1
+ INIT_YMM cpuname
+%endmacro
+
+%macro IWHT4_1D_PACKED 0
+ ; m0 = in0 in2, m1 = in1 in3
+ psubd m2, m0, m1 ; t2
+ paddd xm0, xm1 ; t0
+ vpermq m2, m2, q3322
+ vpermq m0, m0, q1100
+ vpermq m1, m1, q3120
+ psubd m3, m0, m2
+ psrad m3, 1
+ psubd m3, m1 ; t1 t3
+ psubd m0, m3 ; ____ out0
+ paddd m2, m3 ; out3 ____
+%endmacro
+
+INIT_YMM avx2
+cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
+ mova xm0, [cq+16*0]
+ vinserti128 m0, [cq+16*2], 1
+ mova xm1, [cq+16*1]
+ vinserti128 m1, [cq+16*3], 1
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ lea r6, [dstq+strideq*2]
+ psrad m0, 2
+ psrad m1, 2
+ IWHT4_1D_PACKED
+ punpckhdq m0, m3
+ punpckldq m3, m2
+ punpckhqdq m1, m0, m3
+ punpcklqdq m0, m3
+ IWHT4_1D_PACKED
+ vpblendd m0, m2, 0x33
+ packssdw m0, m3
+ vextracti128 xm2, m0, 1
+ punpckhdq xm1, xm0, xm2 ; out2 out1
+ punpckldq xm0, xm2 ; out3 out0
+ movq xm2, [r6 +strideq*1]
+ movhps xm2, [dstq+strideq*0]
+ movq xm3, [r6 +strideq*0]
+ movhps xm3, [dstq+strideq*1]
+%ifidn bdmaxd, bdmaxm
+ movd xm5, bdmaxd
+ vpbroadcastw xm5, xm5
+%else ; win64: load from stack
+ vpbroadcastw xm5, bdmaxm
+%endif
+ paddsw xm0, xm2
+ paddsw xm1, xm3
+ pmaxsw xm0, xm4
+ pmaxsw xm1, xm4
+ pminsw xm0, xm5
+ pminsw xm1, xm5
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movq [r6 +strideq*0], xm1
+ movq [r6 +strideq*1], xm0
+ RET
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+; flags: 1 = packed, 2 = inv_dst2
+; skip round/shift if rnd is not a number
+%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
+%if %8 < 32
+ pmulld m%4, m%1, m%8
+ pmulld m%3, m%2, m%8
+%else
+%if %9 & 1
+ vbroadcasti128 m%3, [pd_%8]
+%else
+ vpbroadcastd m%3, [pd_%8]
+%endif
+ pmulld m%4, m%1, m%3
+ pmulld m%3, m%2
+%endif
+%if %7 < 32
+ pmulld m%1, m%7
+ pmulld m%2, m%7
+%else
+%if %9 & 1
+ vbroadcasti128 m%5, [pd_%7]
+%else
+ vpbroadcastd m%5, [pd_%7]
+%endif
+ pmulld m%1, m%5
+ pmulld m%2, m%5
+%endif
+%if %9 & 2
+ psubd m%4, m%6, m%4
+ psubd m%2, m%4, m%2
+%else
+%ifnum %6
+ paddd m%4, m%6
+%endif
+ paddd m%2, m%4
+%endif
+%ifnum %6
+ paddd m%1, m%6
+%endif
+ psubd m%1, m%3
+%ifnum %6
+ psrad m%2, 12
+ psrad m%1, 12
+%endif
+%endmacro
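+; e.g. "ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 1567, 3784" with m7 = pd_2048 (the
+; rotation used by IDCT4_1D below) computes, from the original register values,
+; m1 = (m1*1567 - m3*3784 + 2048) >> 12 and m3 = (m1*3784 + m3*1567 + 2048) >> 12
+; (1567 ~ 4096*sin(pi/8), 3784 ~ 4096*cos(pi/8)).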
+
+%macro INV_TXFM_FN 4-5 10 ; type1, type2, eob_offset, size, bitdepth
+cglobal inv_txfm_add_%1_%2_%4_%5bpc, 4, 5, 0, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%4_internal_%5bpc)
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(i%2_%4_internal_%5bpc).pass2]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+%if %3
+ add eobd, %3
+%endif
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 4x4, %3
+%ifidn %1_%2, dct_dct
+ vpbroadcastd xm2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 4
+.dconly2:
+ add r6d, 128
+ sar r6d, 8
+.dconly3:
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ movd xm0, r6d
+ paddsw xm0, xm2
+ vpbroadcastw xm0, xm0
+.dconly_loop:
+ movq xm1, [dstq+strideq*0]
+ movhps xm1, [dstq+strideq*1]
+ paddsw xm1, xm0
+ psubusw xm1, xm2
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ WRAP_XMM RET
+%else
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly
+%endif
+%endif
+%endmacro
+
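+; Packed 4-point IDCT: the paired constants place the even-part scale (2896) in
+; the low dwords of each 128-bit lane and the odd-part rotation (1567/3784) in
+; the high dwords, so a single ITX_MULSUB_2D produces t0/t1 and t2/t3 at once.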
+%macro IDCT4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd
+ ITX_MULSUB_2D %1, %2, %3, %4, %5, %6, 2896_1567, 2896_3784, 1
+ punpckhqdq m%3, m%2, m%1 ; t3 t2
+ punpcklqdq m%2, m%1 ; t0 t1
+ paddd m%1, m%2, m%3 ; out0 out1
+ psubd m%2, m%3 ; out3 out2
+%endmacro
+
+%macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd
+ vpbroadcastd m%5, [pw_m3784_1567]
+ punpckhwd m%3, m%2, m%1
+ vpbroadcastd m%4, [pw_1567_3784]
+ punpcklwd m%2, m%1
+ vpbroadcastd m%1, [pw_m2896_2896]
+ pmaddwd m%5, m%3
+ pmaddwd m%3, m%4
+ vpbroadcastd m%4, [pw_2896_2896]
+ pmaddwd m%1, m%2
+ pmaddwd m%2, m%4
+ REPX {paddd x, m%6}, m%5, m%3, m%1, m%2
+ REPX {psrad x, 12 }, m%5, m%3, m%1, m%2
+ packssdw m%3, m%5 ; t3 t2
+ packssdw m%2, m%1 ; t0 t1
+ paddsw m%1, m%2, m%3 ; out0 out1
+ psubsw m%2, m%3 ; out3 out2
+%endmacro
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, identity
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+
+cglobal idct_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
+ call .main
+ vbroadcasti128 m2, [idct4_shuf]
+ packssdw m0, m1
+ pshufb m0, m2
+ jmp tx2q
+.pass2:
+ vextracti128 xm1, m0, 1
+ WRAP_XMM IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5
+ packssdw xm5, xm5 ; pw_2048
+ pmulhrsw xm0, xm5
+ pmulhrsw xm1, xm5
+ movq xm2, [dstq+strideq*0]
+ movhps xm2, [dstq+strideq*1]
+ lea r6, [dstq+strideq*2]
+ movq xm3, [r6 +strideq*1]
+ movhps xm3, [r6 +strideq*0]
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ paddw xm0, xm2
+ paddw xm1, xm3
+ pmaxsw xm0, xm4
+ pmaxsw xm1, xm4
+ pminsw xm0, xm5
+ pminsw xm1, xm5
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movhps [r6 +strideq*0], xm1
+ movq [r6 +strideq*1], xm1
+ RET
+ALIGN function_align
+.main:
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m5, [pd_2048]
+.main2:
+ IDCT4_1D_PACKED 0, 1, 2, 3, 4, 5
+ ret
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
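+; 4-point inverse ADST; 1321, 2482, 3344 and 3803 are the AV1 sinpi[1-4]
+; constants (12-bit), with 3803 formed on the fly as 1321 + 2482.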
+%macro IADST4_1D 0
+ vpbroadcastd m5, [pd_1321]
+ vpbroadcastd m7, [pd_2482]
+ pmulld m4, m0, m5 ; 1321*in0
+ pmulld m6, m3, m7 ; 2482*in3
+ paddd m4, m6 ; 1321*in0 + 2482*in3
+ pmulld m6, m0, m7 ; 2482*in0
+ paddd m0, m3 ; in0 + in3
+ paddd m7, m5 ; pd_3803
+ pmulld m5, m2 ; 1321*in2
+ pmulld m3, m7 ; 3803*in3
+ pmulld m7, m2 ; 3803*in2
+ psubd m2, m0 ; in2 - in0 - in3
+ vpbroadcastd m0, [pd_m3344]
+ pmulld m1, m0 ; -t3
+ pmulld m2, m0 ; out2 (unrounded)
+ psubd m6, m5 ; 2482*in0 - 1321*in2
+ paddd m4, m7 ; t0
+ psubd m6, m3 ; t1
+ paddd m3, m4, m6
+ psubd m4, m1 ; out0 (unrounded)
+ psubd m6, m1 ; out1 (unrounded)
+ paddd m3, m1 ; out3 (unrounded)
+%endmacro
+
+cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
+ call .main
+ vinserti128 m0, m4, xm6, 1
+ vinserti128 m1, m2, xm3, 1
+.pass1_end:
+ vpbroadcastd m5, [pd_2048]
+ mova m2, [itx4_shuf]
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, 12
+ psrad m1, 12
+ packssdw m0, m1
+ vpermd m0, m2, m0
+ psrld m2, 4
+ pshufb m0, m2
+%if WIN64
+ movaps xmm6, [rsp+ 8]
+ movaps xmm7, [rsp+24]
+%endif
+ jmp tx2q
+.pass2:
+ lea r6, [deint_shuf+128]
+ vextracti128 xm1, m0, 1
+ call m(iadst_4x4_internal_8bpc).main
+.end:
+ vpbroadcastd xm4, [pw_2048]
+ movq xm2, [dstq+strideq*0]
+ movhps xm2, [dstq+strideq*1]
+ lea r6, [dstq+strideq*2]
+ movq xm3, [r6 +strideq*0]
+ movhps xm3, [r6 +strideq*1]
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pmulhrsw xm0, xm4
+ pmulhrsw xm1, xm4
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ paddw xm0, xm2
+ paddw xm1, xm3
+ pmaxsw xm0, xm4
+ pmaxsw xm1, xm4
+ pminsw xm0, xm5
+ pminsw xm1, xm5
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [r6 +strideq*0], xm1
+ movhps [r6 +strideq*1], xm1
+ RET
+ALIGN function_align
+.main:
+ mova xm0, [cq+16*0]
+ mova xm1, [cq+16*1]
+ mova xm2, [cq+16*2]
+ mova xm3, [cq+16*3]
+%if WIN64
+ movaps [rsp+16], xmm6
+ movaps [rsp+32], xmm7
+%endif
+.main2:
+ WRAP_XMM IADST4_1D
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
+ call m(iadst_4x4_internal_10bpc).main
+ vinserti128 m0, m3, xm2, 1
+ vinserti128 m1, m6, xm4, 1
+ jmp m(iadst_4x4_internal_10bpc).pass1_end
+.pass2:
+ lea r6, [deint_shuf+128]
+ vextracti128 xm1, m0, 1
+ call m(iadst_4x4_internal_8bpc).main
+ vpbroadcastd xm4, [pw_2048]
+ movq xm3, [dstq+strideq*1]
+ movhps xm3, [dstq+strideq*0]
+ lea r6, [dstq+strideq*2]
+ movq xm2, [r6 +strideq*1]
+ movhps xm2, [r6 +strideq*0]
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pmulhrsw xm0, xm4
+ pmulhrsw xm1, xm4
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ paddw xm0, xm2
+ paddw xm1, xm3
+ pmaxsw xm0, xm4
+ pmaxsw xm1, xm4
+ pminsw xm0, xm5
+ pminsw xm1, xm5
+ movhps [dstq+strideq*0], xm1
+ movq [dstq+strideq*1], xm1
+ movhps [r6 +strideq*0], xm0
+ movq [r6 +strideq*1], xm0
+ RET
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
+ vpbroadcastd m1, [pd_5793]
+ pmulld m0, m1, [cq+32*0]
+ pmulld m1, [cq+32*1]
+ vpbroadcastd m5, [pd_2048]
+ mova m3, [itx4_shuf]
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, 12
+ psrad m1, 12
+ packssdw m0, m1
+ vpermd m0, m3, m0
+ psrld m3, 4
+ pshufb m0, m3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m1, [pw_1697x8]
+ movq xm2, [dstq+strideq*0]
+ movhps xm2, [dstq+strideq*1]
+ lea r6, [dstq+strideq*2]
+ pmulhrsw m1, m0
+ paddsw m0, m1
+ movq xm3, [r6 +strideq*0]
+ movhps xm3, [r6 +strideq*1]
+ vpbroadcastd xm4, [pixel_10bpc_max]
+ packssdw m5, m5 ; pw_2048
+ pmulhrsw m0, m5
+ pxor m5, m5
+ mova [cq+32*0], m5
+ mova [cq+32*1], m5
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm2
+ paddw xm1, xm3
+ pmaxsw xm0, xm5
+ pmaxsw xm1, xm5
+ pminsw xm0, xm4
+ pminsw xm1, xm4
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [r6 +strideq*0], xm1
+ movhps [r6 +strideq*1], xm1
+ RET
+
+INV_TXFM_4X4_FN dct, dct, 12
+INV_TXFM_4X4_FN dct, identity, 12
+INV_TXFM_4X4_FN dct, adst, 12
+INV_TXFM_4X4_FN dct, flipadst, 12
+
+cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ call m(idct_4x4_internal_10bpc).main
+ mova m3, [idct4_12_shuf]
+ mova m4, [idct4_12_shuf2]
+ vpermd m2, m4, m1
+ vpermd m1, m3, m0
+ jmp m(iadst_4x4_internal_12bpc).pass1_end2
+.pass2:
+ vpbroadcastd m5, [pd_2048]
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+ call m(idct_4x4_internal_10bpc).main2
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ jmp m(iadst_4x4_internal_12bpc).end
+
+INV_TXFM_4X4_FN adst, dct, 12
+INV_TXFM_4X4_FN adst, adst, 12
+INV_TXFM_4X4_FN adst, flipadst, 12
+INV_TXFM_4X4_FN adst, identity, 12
+
+cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ call m(iadst_4x4_internal_10bpc).main
+ vinserti128 m1, m4, xm6, 1
+ vinserti128 m2, xm3, 1
+.pass1_end:
+ mova m3, [itx4_shuf]
+ vpbroadcastd m5, [pd_1024]
+ psrad m1, 1
+ psrad m2, 1
+ vpermd m1, m3, m1
+ vpermd m2, m3, m2
+ paddd m1, m5
+ paddd m2, m5
+ psrad m1, 11
+ psrad m2, 11
+.pass1_end2:
+ vpbroadcastd m3, [clip_18b_min]
+ vpbroadcastd m4, [clip_18b_max]
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ pmaxsd m0, m3
+ pmaxsd m1, m3
+ pminsd m0, m4
+ pminsd m1, m4
+ jmp tx2q
+.pass2:
+ call .main_pass2
+ vinserti128 m0, m4, xm6, 1
+ vinserti128 m1, m2, xm3, 1
+.pass2_end:
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, 12
+ psrad m1, 12
+.end:
+%if WIN64
+ WIN64_RESTORE_XMM_INTERNAL
+ %assign xmm_regs_used 6
+%endif
+.end2:
+ vpbroadcastd m4, [pw_16384]
+ movq xm2, [dstq+strideq*0]
+ movq xm3, [dstq+strideq*1]
+ lea r6, [dstq+strideq*2]
+ movhps xm2, [r6 +strideq*0] ; dst0 dst2
+ movhps xm3, [r6 +strideq*1] ; dst1 dst3
+ vpbroadcastd m5, [pixel_12bpc_max]
+ vinserti128 m2, xm3, 1
+ psrad m0, 3
+ psrad m1, 3
+ packssdw m0, m1 ; t0 t2 t1 t3
+ pmulhrsw m0, m4
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ paddw m0, m2 ; out0 out2 out1 out3
+ pmaxsw m0, m4
+ pminsw m0, m5
+ vextracti128 xm1, m0, 1 ; out1 out3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [r6 +strideq*0], xm0
+ movhps [r6 +strideq*1], xm1
+ RET
+.main_pass2:
+ vextracti128 xm3, m1, 1
+ mova xm2, xm1
+ vextracti128 xm1, m0, 1
+ jmp m(iadst_4x4_internal_10bpc).main2
+
+INV_TXFM_4X4_FN flipadst, dct, 12
+INV_TXFM_4X4_FN flipadst, adst, 12
+INV_TXFM_4X4_FN flipadst, flipadst, 12
+INV_TXFM_4X4_FN flipadst, identity, 12
+
+cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ call m(iadst_4x4_internal_10bpc).main
+ vinserti128 m1, m3, xm2, 1
+ vinserti128 m2, m6, xm4, 1
+ jmp m(iadst_4x4_internal_12bpc).pass1_end
+.pass2:
+ call m(iadst_4x4_internal_12bpc).main_pass2
+ vinserti128 m0, m3, xm2, 1
+ vinserti128 m1, m6, xm4, 1
+ jmp m(iadst_4x4_internal_12bpc).pass2_end
+
+INV_TXFM_4X4_FN identity, dct, 12
+INV_TXFM_4X4_FN identity, adst, 12
+INV_TXFM_4X4_FN identity, flipadst, 12
+INV_TXFM_4X4_FN identity, identity, 12
+
+cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ mova m2, [itx4_shuf]
+ vpbroadcastd m3, [pd_1697]
+ vpermd m0, m2, [cq+32*0]
+ vpermd m2, m2, [cq+32*1]
+ vpbroadcastd m5, [pd_2048]
+ pmulld m1, m3, m0
+ pmulld m3, m2
+ paddd m1, m5
+ paddd m3, m5
+ psrad m1, 12
+ psrad m3, 12
+ paddd m1, m0
+ paddd m2, m3
+ jmp m(iadst_4x4_internal_12bpc).pass1_end2
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+ vpbroadcastd m3, [pd_5793]
+ vpbroadcastd m5, [pd_2048]
+ pmulld m0, m3
+ pmulld m1, m3
+ paddd m0, m5 ; 2048
+ paddd m1, m5
+ psrad m0, 12
+ psrad m1, 12
+ jmp m(iadst_4x4_internal_12bpc).end
+
+%macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 4x8, %3
+%ifidn %1_%2, dct_dct
+ vpbroadcastd xm2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2
+%else
+ jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly
+%endif
+%endif
+%endmacro
+
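+; Unpacked 4-point IDCT: in0-in3 in m%1-m%4, out0-out3 returned in m%1-m%4.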
+%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
+ ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; t2, t3
+ vpbroadcastd m%5, [pd_2896]
+ pmulld m%1, m%5
+ pmulld m%3, m%5
+ paddd m%1, m%8
+ paddd m%5, m%1, m%3
+ psubd m%1, m%3
+ psrad m%5, 12 ; t0
+ psrad m%1, 12 ; t1
+ psubd m%3, m%1, m%2
+ paddd m%2, m%1
+ paddd m%1, m%5, m%4
+ psubd m%4, m%5, m%4
+%endmacro
+
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, identity
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+
+cglobal idct_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m3, [pd_2896]
+ pmulld m0, m3, [cq+32*0]
+ pmulld m1, m3, [cq+32*1]
+ pmulld m2, m3, [cq+32*2]
+ pmulld m3, m3, [cq+32*3]
+ vpbroadcastd m7, [pd_2048]
+ REPX {paddd x, m7}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7
+ jmp tx2q
+.pass2:
+ packssdw m0, m2
+ packssdw m1, m3
+ lea r6, [deint_shuf+128]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhdq m1, m0, m2 ; 2 3
+ punpckldq m0, m2 ; 0 1
+ vextracti128 xm2, m0, 1 ; 4 5
+ vextracti128 xm3, m1, 1 ; 6 7
+ call m(idct_4x8_internal_8bpc).main
+ vpbroadcastd xm4, [pw_2048]
+ REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
+ lea r3, [strideq*3]
+ lea r6, [dstq+strideq*4]
+ movq xm4, [dstq+strideq*0]
+ movhps xm4, [dstq+strideq*1]
+ movq xm5, [dstq+r3 ]
+ movhps xm5, [dstq+strideq*2]
+ movq xm6, [r6 +strideq*0]
+ movhps xm6, [r6 +strideq*1]
+ movq xm7, [r6 +r3 ]
+ movhps xm7, [r6 +strideq*2]
+ paddw xm0, xm4 ; 0 1
+ paddw xm1, xm5 ; 3 2
+ paddw xm2, xm6 ; 4 5
+ paddw xm3, xm7 ; 7 6
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
+ REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movhps [dstq+strideq*2], xm1
+ movq [dstq+r3 ], xm1
+ movq [r6 +strideq*0], xm2
+ movhps [r6 +strideq*1], xm2
+ movhps [r6 +strideq*2], xm3
+ movq [r6 +r3 ], xm3
+ RET
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ call m(iadst_8x4_internal_10bpc).main
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m4
+ paddd m1, m5, m6
+ paddd m2, m5
+ paddd m3, m5
+.pass1_end:
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ jmp tx2q
+.pass2:
+ call .pass2_main
+ mova xm4, [pw_2048_m2048]
+ REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
+.end:
+ lea r3, [strideq*3]
+ lea r6, [dstq+strideq*4]
+ movq xm4, [dstq+strideq*0]
+ movhps xm4, [dstq+strideq*1]
+ movq xm5, [dstq+strideq*2]
+ movhps xm5, [dstq+r3 ]
+ movq xm6, [r6 +strideq*0]
+ movhps xm6, [r6 +strideq*1]
+ movq xm7, [r6 +strideq*2]
+ movhps xm7, [r6 +r3 ]
+ paddw xm0, xm4 ; 0 1
+ paddw xm1, xm5 ; 2 3
+ paddw xm2, xm6 ; 4 5
+ paddw xm3, xm7 ; 6 7
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
+ REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ movq [r6 +strideq*0], xm2
+ movhps [r6 +strideq*1], xm2
+ movq [r6 +strideq*2], xm3
+ movhps [r6 +r3 ], xm3
+ RET
+ALIGN function_align
+.pass2_main:
+ packssdw m0, m2
+ packssdw m1, m3
+ lea r6, [deint_shuf+128]
+ punpcklwd m4, m0, m1
+ punpckhwd m0, m1
+ punpckhdq m5, m4, m0
+ punpckldq m4, m0
+ vextracti128 xm2, m4, 1 ; 4 5
+ vextracti128 xm3, m5, 1 ; 6 7
+ pshufd xm4, xm4, q1032 ; 1 0
+ pshufd xm5, xm5, q1032 ; 3 2
+ jmp m(iadst_4x8_internal_8bpc).main_pass2
+ALIGN function_align
+.main:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+.main2:
+ vbroadcasti128 m0, [cq+16*0]
+ vbroadcasti128 m2, [cq+16*2]
+ vbroadcasti128 m3, [cq+16*5]
+ vbroadcasti128 m1, [cq+16*7]
+ vpbroadcastd m6, [pd_2896]
+ shufpd m0, m2, 0x0c ; 0 2
+ shufpd m1, m3, 0x0c ; 7 5
+ vbroadcasti128 m2, [cq+16*4]
+ vbroadcasti128 m4, [cq+16*6]
+ vbroadcasti128 m5, [cq+16*1]
+ vbroadcasti128 m3, [cq+16*3]
+ vpbroadcastd m7, [pd_2048]
+ shufpd m2, m4, 0x0c ; 4 6
+ shufpd m3, m5, 0x0c ; 3 1
+ REPX {pmulld x, m6}, m0, m1, m2, m3
+ REPX {paddd x, m7}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+.main3:
+ ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 401_1931, 4076_3612, 1
+ ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1
+ psubd m4, m0, m2 ; t4 t6
+ paddd m0, m2 ; t0 t2
+ psubd m2, m1, m3 ; t5 t7
+ paddd m1, m3 ; t1 t3
+ REPX {pmaxsd x, m8}, m4, m2, m0, m1
+ REPX {pminsd x, m9}, m4, m2, m0, m1
+ pxor m5, m5
+ psubd m5, m4
+ vpblendd m4, m2, 0xcc ; t4 t7
+ vpblendd m2, m5, 0xcc ; t5 -t6
+ ITX_MULSUB_2D 4, 2, 3, 5, 6, 7, 1567, 3784
+ vpbroadcastd m5, [pd_2896]
+ vbroadcasti128 m6, [pw_2048_m2048] ; + + - -
+ punpckhqdq m3, m0, m1
+ punpcklqdq m0, m1
+ psubd m1, m0, m3 ; t2 t3
+ paddd m0, m3 ; out0 -out7
+ punpckhqdq m3, m4, m2 ; t7a t6a
+ punpcklqdq m4, m2 ; t5a t4a
+ psubd m2, m4, m3 ; t7 t6
+ paddd m4, m3 ; out6 -out1
+ REPX {pmaxsd x, m8}, m1, m2
+ REPX {pminsd x, m9}, m1, m2
+ vpblendd m3, m1, m2, 0xcc
+ shufpd m1, m2, 0x05
+ pmulld m3, m5
+ pmulld m5, m1
+ psignd m0, m6 ; out0 out7
+ psignd m4, m6 ; out6 out1
+ paddd m3, m7
+ psubd m2, m3, m5
+ paddd m5, m3
+ psrad m2, 12 ; out4 -out5
+ psrad m5, 12 ; -out3 out2
+ ret
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ call m(iadst_8x4_internal_10bpc).main
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m3
+ paddd m1, m5, m2
+ paddd m2, m5, m6
+ paddd m3, m5, m4
+ jmp m(iadst_4x8_internal_10bpc).pass1_end
+.pass2:
+ call m(iadst_4x8_internal_10bpc).pass2_main
+ mova xm4, [pw_2048_m2048]
+ REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0
+ lea r3, [strideq*3]
+ lea r6, [dstq+strideq*4]
+ movq xm4, [dstq+strideq*1]
+ movhps xm4, [dstq+strideq*0]
+ movq xm5, [dstq+r3 ]
+ movhps xm5, [dstq+strideq*2]
+ movq xm6, [r6 +strideq*1]
+ movhps xm6, [r6 +strideq*0]
+ movq xm7, [r6 +r3 ]
+ movhps xm7, [r6 +strideq*2]
+ paddw xm3, xm4 ; 1 0
+ paddw xm2, xm5 ; 3 2
+ paddw xm1, xm6 ; 5 4
+ paddw xm0, xm7 ; 7 6
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0
+ REPX {pminsw x, xm5}, xm3, xm2, xm1, xm0
+ movhps [dstq+strideq*0], xm3
+ movq [dstq+strideq*1], xm3
+ movhps [dstq+strideq*2], xm2
+ movq [dstq+r3 ], xm2
+ movhps [r6 +strideq*0], xm1
+ movq [r6 +strideq*1], xm1
+ movhps [r6 +strideq*2], xm0
+ movq [r6 +r3 ], xm0
+ RET
+
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m3, [pd_2896]
+ pmulld m0, m3, [cq+32*0]
+ pmulld m1, m3, [cq+32*1]
+ pmulld m2, m3, [cq+32*2]
+ pmulld m3, [cq+32*3]
+ vpbroadcastd m5, [pd_2048]
+ vpbroadcastd m4, [pd_5793]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ REPX {pmulld x, m4}, m0, m1, m2, m3
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m6, [pixel_10bpc_max]
+ call .pass2_end
+ RET
+ALIGN function_align
+.pass2_end:
+ vpbroadcastd m4, [pw_4096]
+ packssdw m0, m2
+ packssdw m1, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmulhrsw m2, m4
+ pmulhrsw m0, m4
+ punpckhdq m1, m0, m2 ; 2 3 6 7
+ punpckldq m0, m2 ; 0 1 4 5
+ lea r3, [strideq*3]
+ lea r6, [dstq+strideq*4]
+ movq xm2, [dstq+strideq*0]
+ movhps xm2, [dstq+strideq*1]
+ vpbroadcastq m4, [r6 +strideq*0]
+ vpbroadcastq m5, [r6 +strideq*1]
+ movq xm3, [dstq+strideq*2]
+ movhps xm3, [dstq+r3 ]
+ vpblendd m2, m4, 0x30
+ vpblendd m2, m5, 0xc0
+ vpbroadcastq m4, [r6 +strideq*2]
+ vpbroadcastq m5, [r6 +r3 ]
+ vpblendd m3, m4, 0x30
+ vpblendd m3, m5, 0xc0
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ paddw m0, m2 ; out0 out1 out4 out5
+ paddw m1, m3 ; out2 out3 out6 out7
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m6
+ pminsw m1, m6
+ vextracti128 xm2, m0, 1 ; out4 out5
+ vextracti128 xm3, m1, 1 ; out6 out7
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ movq [r6 +strideq*0], xm2
+ movhps [r6 +strideq*1], xm2
+ movq [r6 +strideq*2], xm3
+ movhps [r6 +r3 ], xm3
+ ret
+
+INV_TXFM_4X8_FN dct, dct, 12
+INV_TXFM_4X8_FN dct, identity, 12
+INV_TXFM_4X8_FN dct, adst, 12
+INV_TXFM_4X8_FN dct, flipadst, 12
+
+cglobal idct_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ jmp m(idct_4x8_internal_10bpc).pass1
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ ; transpose & interleave
+ pshufd m0, m0, q1320
+ pshufd m1, m1, q1320
+ pshufd m2, m2, q1320
+ pshufd m3, m3, q1320
+ punpckldq m4, m0, m1
+ punpckhdq m0, m1
+ punpckldq m5, m2, m3
+ punpckhdq m2, m3
+ vpermq m0, m0, q3102
+ vpermq m2, m2, q3102
+ vperm2i128 m1, m0, m2, 0x31 ; 1 5 (interleaved)
+ vperm2i128 m3, m0, m2, 0x20 ; 7 3 (interleaved)
+ vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved)
+ vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved)
+ vpbroadcastd m7, [pd_2048]
+ call m(idct_8x4_internal_10bpc).main
+ psubd m3, m0, m4 ; out7 out6
+ paddd m0, m4 ; out0 out1
+ paddd m1, m2, m5 ; out3 out2
+ psubd m2, m5 ; out4 out5
+ pshufd m1, m1, q1032
+ pshufd m3, m3, q1032
+ jmp m(iadst_4x8_internal_12bpc).end
+
+INV_TXFM_4X8_FN adst, dct, 12
+INV_TXFM_4X8_FN adst, adst, 12
+INV_TXFM_4X8_FN adst, flipadst, 12
+INV_TXFM_4X8_FN adst, identity, 12
+
+cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ call m(iadst_8x4_internal_10bpc).main
+ psrad m0, m4, 1
+ psrad m1, m6, 1
+ psrad m2, 1
+ psrad m3, 1
+.pass1_end:
+ vpbroadcastd m5, [pd_1024]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 11}, m0, m1, m2, m3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ call .pass2_main
+ vpblendd m3, m0, m4, 0x33 ; out6 out7
+ vpblendd m0, m4, 0xcc ; out0 out1
+ pshufd m1, m5, q1032
+ psignd m2, m6 ; out4 out5
+ psignd m1, m6 ; out2 out3
+.end:
+ vpbroadcastd m4, [pw_16384]
+ REPX {psrad x, 3}, m0, m1, m2, m3
+ packssdw m0, m2 ; 0 1 4 5 (interleaved)
+ packssdw m1, m3 ; 2 3 6 7 (interleaved)
+ mova m2, [iadst8_12_shuf]
+ vpermd m0, m2, m0 ; 0 1 4 5
+ vpermd m1, m2, m1 ; 2 3 6 7
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ lea r3, [strideq*3]
+ lea r6, [dstq+strideq*4]
+ movq xm4, [dstq+strideq*0]
+ movhps xm4, [dstq+strideq*1]
+ movq xm5, [dstq+strideq*2]
+ movhps xm5, [dstq+r3 ]
+ movq xm6, [r6 +strideq*0]
+ movhps xm6, [r6 +strideq*1]
+ vinserti128 m4, xm6, 1
+ movq xm7, [r6 +strideq*2]
+ movhps xm7, [r6 +r3 ]
+ vinserti128 m5, xm7, 1
+ paddw m0, m4 ; 0 1 4 5
+ paddw m1, m5 ; 2 3 6 7
+ vpbroadcastd m5, [pixel_12bpc_max]
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ REPX {pmaxsw x, m4}, m0, m1
+ REPX {pminsw x, m5}, m0, m1
+ vextracti128 xm2, m0, 1 ; out4 out5
+ vextracti128 xm3, m1, 1 ; out6 out7
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ movq [r6 +strideq*0], xm2
+ movhps [r6 +strideq*1], xm2
+ movq [r6 +strideq*2], xm3
+ movhps [r6 +r3 ], xm3
+ RET
+ALIGN function_align
+.pass2_main:
+ ; transpose & interleave
+ pshufd m0, m0, q1320
+ pshufd m1, m1, q1320
+ pshufd m2, m2, q1320
+ pshufd m3, m3, q1320
+ punpckldq m4, m0, m1
+ punpckhdq m0, m1
+ punpckldq m5, m2, m3
+ punpckhdq m2, m3
+ vperm2i128 m1, m0, m2, 0x31 ; 7 5 (interleaved)
+ vperm2i128 m3, m0, m2, 0x20 ; 3 1 (interleaved)
+ vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved)
+ vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved)
+ vpbroadcastd m7, [pd_2048]
+ jmp m(iadst_4x8_internal_10bpc).main3
+
+INV_TXFM_4X8_FN flipadst, dct, 12
+INV_TXFM_4X8_FN flipadst, adst, 12
+INV_TXFM_4X8_FN flipadst, flipadst, 12
+INV_TXFM_4X8_FN flipadst, identity, 12
+
+cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ call m(iadst_8x4_internal_10bpc).main
+ psrad m0, m3, 1
+ psrad m1, m2, 1
+ psrad m2, m6, 1
+ psrad m3, m4, 1
+ jmp m(iadst_4x8_internal_12bpc).pass1_end
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ call m(iadst_4x8_internal_12bpc).pass2_main
+ shufpd m3, m4, m0, 0x05 ; out1 out0
+ shufpd m0, m4, 0x05 ; out7 out6
+ psignd m2, m6
+ pshufd m6, m6, q1032
+ pshufd m1, m2, q1032 ; out5 out4
+ psignd m2, m5, m6 ; out3 out2
+ jmp m(iadst_4x8_internal_12bpc).end
+
+INV_TXFM_4X8_FN identity, dct, 12
+INV_TXFM_4X8_FN identity, adst, 12
+INV_TXFM_4X8_FN identity, flipadst, 12
+INV_TXFM_4X8_FN identity, identity, 12
+
+cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ jmp m(iidentity_4x8_internal_10bpc).pass1
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+ ; m2 = in4 in5
+ ; m3 = in6 in7
+ vpbroadcastd m6, [pixel_12bpc_max]
+ call m(iidentity_4x8_internal_10bpc).pass2_end
+ RET
+
+%macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 4x16, %3
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ vpbroadcastd xm2, [dconly_%3bpc]
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 384
+ sar r6d, 9
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3
+%endif
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, identity
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+
+cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m10, [pd_3072]
+ mova m1, [cq+32*2]
+ mova m3, [cq+32*6]
+ mova m5, [cq+32*3]
+ mova m7, [cq+32*7]
+ call .pass1_main
+ pmulld m0, m6, [cq+32*0]
+ pmulld m2, m6, [cq+32*4]
+ pmulld m4, m6, [cq+32*1]
+ pmulld m6, [cq+32*5]
+ call .pass1_main2
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp tx2q
+.pass2:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ lea r6, [deint_shuf+128]
+ punpcklwd m4, m2, m3
+ punpckhwd m2, m3
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhdq m1, m0, m4 ; 2 3
+ punpckldq m0, m4 ; 0 1
+ punpckldq m4, m5, m2 ; 8 9
+ punpckhdq m5, m2 ; a b
+ vextracti128 xm2, m0, 1 ; 4 5
+ vextracti128 xm3, m1, 1 ; 6 7
+ vextracti128 xm6, m4, 1 ; c d
+ vextracti128 xm7, m5, 1 ; e f
+ call m(idct_4x16_internal_8bpc).main
+ vpbroadcastd m9, [pw_2048]
+ vinserti128 m0, m0, xm1, 1 ; 0 1 3 2
+ vinserti128 m1, m2, xm3, 1 ; 4 5 7 6
+ vinserti128 m2, m4, xm5, 1 ; 8 9 b a
+ vinserti128 m3, m6, xm7, 1 ; c d f e
+ vpbroadcastd m8, [pixel_10bpc_max]
+ call .pass2_end
+ RET
+ALIGN function_align
+.pass1_main:
+ vpbroadcastd m4, [pd_3784]
+ vpbroadcastd m8, [pd_1567]
+ vpbroadcastd m9, [pd_2048]
+ vpbroadcastd m6, [pd_1448]
+ ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l
+ ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h
+ ret
+ALIGN function_align
+.pass1_main2:
+ paddd m0, m10
+ paddd m4, m10
+ paddd m8, m0, m2
+ psubd m0, m2
+ paddd m9, m4, m6
+ psubd m4, m6
+ REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h
+ psubd m2, m0, m1
+ paddd m1, m0
+ psubd m6, m4, m5
+ paddd m5, m4
+ paddd m0, m8, m3
+ psubd m3, m8, m3
+ paddd m4, m9, m7
+ psubd m7, m9, m7
+ ret
+ALIGN function_align
+.pass2_end:
+ lea r6, [strideq*3]
+ pxor m7, m7
+ pmulhrsw m0, m9
+ call .write_4x4
+ pmulhrsw m0, m1, m9
+ call .write_4x4
+ pmulhrsw m0, m2, m9
+ call .write_4x4
+ pmulhrsw m0, m3, m9
+ call .write_4x4
+ ret
+ALIGN function_align
+.write_4x4:
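+    ; adds the 4x4 block of residuals in m0 to dst, clamps to [0, m8] (pixel max),
+    ; zeroes the corresponding coefficients and advances cq and dstq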
+ movq xm4, [dstq+strideq*0]
+ movhps xm4, [dstq+strideq*1]
+ vpbroadcastq m5, [dstq+strideq*2]
+ vpbroadcastq m6, [dstq+r6 ]
+ mova [cq+32*0], m7
+ mova [cq+32*1], m7
+ add cq, 32*2
+ vpblendd m4, m5, 0xc0
+ vpblendd m4, m6, 0x30
+ paddw m4, m0
+ pmaxsw m4, m7
+ pminsw m4, m8
+ vextracti128 xm5, m4, 1
+ movq [dstq+strideq*0], xm4
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+strideq*2], xm5
+ movq [dstq+r6 ], xm5
+ lea dstq, [dstq+strideq*4]
+ ret
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
+ call m(iadst_16x4_internal_10bpc).main
+ vpbroadcastd m6, [pd_6144]
+ call m(iadst_16x4_internal_10bpc).main_end
+ psrad m0, m4, 13
+ psrad m1, m5, 13
+ psrad m2, 13
+ psrad m3, 13
+ psrad m4, m8, 13
+ psrad m5, m9, 13
+ psrad m6, 13
+ psrad m7, 13
+ jmp tx2q
+.pass2:
+ call .pass2_main
+ vpbroadcastd m5, [pw_2048]
+ vpbroadcastd m8, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1
+ pshufd m2, m2, q1032 ; -out11 out8 out10 -out9
+ vpblendd m3, m0, 0x33 ; -out15 out12 out14 -out13
+ pxor m7, m7
+ psubw m9, m7, m5
+ vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048
+ pmulhrsw m0, m4, m9
+ call .write_4x4
+ pmulhrsw m0, m1, m9
+ call .write_4x4
+ pmulhrsw m0, m2, m9
+ call .write_4x4
+ pmulhrsw m0, m3, m9
+ call .write_4x4
+ RET
+ALIGN function_align
+.write_4x4:
+ movq xm4, [dstq+r6 ]
+ movhps xm4, [dstq+strideq*0]
+ vpbroadcastq m5, [dstq+strideq*1]
+ vpbroadcastq m6, [dstq+strideq*2]
+ mova [cq+32*0], m7
+ mova [cq+32*1], m7
+ add cq, 32*2
+ vpblendd m4, m5, 0xc0
+ vpblendd m4, m6, 0x30
+ paddw m4, m0
+ pmaxsw m4, m7
+ pminsw m4, m8
+ vextracti128 xm5, m4, 1
+ movhps [dstq+strideq*0], xm4
+ movhps [dstq+strideq*1], xm5
+ movq [dstq+strideq*2], xm5
+ movq [dstq+r6 ], xm4
+ lea dstq, [dstq+strideq*4]
+ ret
+ALIGN function_align
+.pass2_main:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ lea r6, [deint_shuf+128]
+ punpcklwd m4, m2, m3
+ punpckhwd m2, m3
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhdq m1, m0, m4
+ punpckldq m0, m4
+ punpckldq m4, m5, m2
+ punpckhdq m5, m2
+ vpblendd m3, m0, m1, 0x33
+ vpblendd m0, m1, 0xcc
+ shufpd m2, m5, m4, 0x05
+ shufpd m4, m5, 0x05
+ vperm2i128 m1, m0, m3, 0x31 ; 4 7 6 5
+ vinserti128 m0, xm3, 1 ; 0 3 2 1
+ vperm2i128 m3, m2, m4, 0x31 ; c f e d ; ????
+ vinserti128 m2, xm4, 1 ; b 8 9 a
+ call m(iadst_4x16_internal_8bpc).main2
+ vpbroadcastd m5, [pw_2896x8]
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m5 ; -out7 out4 out6 -out5
+ pmulhrsw m2, m5 ; out8 -out11 -out9 out10
+ ret
+ALIGN function_align
+.main:
+ vbroadcasti128 m0, [cq+16* 0]
+ vbroadcasti128 m4, [cq+16* 2]
+ vbroadcasti128 m1, [cq+16*15]
+ vbroadcasti128 m5, [cq+16*13]
+ vbroadcasti128 m2, [cq+16* 4]
+ vbroadcasti128 m6, [cq+16* 6]
+ vbroadcasti128 m3, [cq+16*11]
+ vbroadcasti128 m7, [cq+16* 9]
+ shufpd m0, m4, 0x0c ; 0 2
+ shufpd m1, m5, 0x0c ; 15 13
+ shufpd m2, m6, 0x0c ; 4 6
+ shufpd m3, m7, 0x0c ; 11 9
+ vbroadcasti128 m4, [cq+16* 8]
+ vbroadcasti128 m6, [cq+16*10]
+ vbroadcasti128 m5, [cq+16* 7]
+ vbroadcasti128 m7, [cq+16* 5]
+ shufpd m4, m6, 0x0c ; 8 10
+ shufpd m5, m7, 0x0c ; 7 5
+ vbroadcasti128 m6, [cq+16*12]
+ vbroadcasti128 m7, [cq+16*14]
+ shufpd m6, m7, 0x0c ; 12 14
+ vbroadcasti128 m7, [cq+16* 3]
+ vbroadcasti128 m8, [cq+16* 1]
+ shufpd m7, m8, 0x0c ; 3 1
+.main2:
+    ; expects: m12 = clip_min, m13 = clip_max
+ vpbroadcastd m11, [pd_2048]
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857_4052, 1380_601, 1
+ psubd m8, m0, m4 ; t8a t10a
+ paddd m0, m4 ; t0a t2a
+ psubd m4, m1, m5 ; t9a t11a
+ paddd m1, m5 ; t1a t3a
+ psubd m5, m2, m6 ; t12a t14a
+ paddd m2, m6 ; t4a t6a
+ psubd m6, m3, m7 ; t13a t15a
+ paddd m3, m7 ; t5a t7a
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m8
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m8
+ ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 799_3406, 4017_2276, 1
+ ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 4017_2276, 10, 1
+ psubd m7, m0, m2 ; t4 t6
+ paddd m0, m2 ; t0 t2
+ psubd m2, m1, m3 ; t5 t7
+ paddd m1, m3 ; t1 t3
+ psubd m3, m4, m6 ; t12a t14a
+ paddd m4, m6 ; t8a t10a
+ psubd m6, m8, m5 ; t13a t15a
+ paddd m8, m5 ; t9a t11a
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m6, m7, m8
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m6, m7, m8
+ punpcklqdq m5, m3, m7 ; t12a t4
+ punpckhqdq m3, m7 ; t14a t6
+ punpckhqdq m7, m6, m2 ; t15a t7
+ punpcklqdq m6, m2 ; t13a t5
+ ITX_MULSUB_2D 7, 3, 2, 9, 10, 11, 3784, 1567
+ ITX_MULSUB_2D 5, 6, 2, 9, 10, 11, 1567, 10
+ vpbroadcastd m10, [pd_2896]
+ vbroadcasti128 m9, [pw_2048_m2048] ; + + - -
+ punpckhqdq m2, m4, m0 ; t10a t2
+ punpcklqdq m4, m0 ; t8a t0
+ punpckhqdq m0, m8, m1 ; t11a t3
+ punpcklqdq m8, m1 ; t9a t1
+ paddd m1, m6, m7 ; out2 -out3
+ psubd m6, m7 ; t14a t6
+ paddd m7, m5, m3 ; -out13 out12
+ psubd m5, m3 ; t15a t7
+ psubd m3, m8, m0 ; t11 t3a
+ paddd m8, m0 ; out14 -out15
+ paddd m0, m4, m2 ; -out1 out0
+ psubd m4, m2 ; t10 t2a
+ REPX {pmaxsd x, m12}, m6, m5, m3, m4
+ REPX {pminsd x, m13}, m6, m5, m3, m4
+ REPX {pmulld x, m10}, m6, m5, m3, m4
+ paddd m6, m11
+ paddd m4, m11
+ paddd m2, m6, m5 ; -out5 out4
+ psubd m6, m5 ; out10 -out11
+ psubd m5, m4, m3 ; -out9 out8
+ paddd m3, m4 ; out6 -out7
+ REPX {psrad x, 12}, m2, m3, m5, m6
+ REPX {psignd x, m9}, m1, m8, m3, m6
+ pshufd m9, m9, q1032
+ REPX {psignd x, m9}, m0, m7, m2, m5
+ ret
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
+.pass1:
+ call m(iadst_16x4_internal_10bpc).main
+ vpbroadcastd m6, [pd_6144]
+ call m(iadst_16x4_internal_10bpc).main_end
+ psrad m0, m3, 13
+ psrad m1, m2, 13
+ psrad m2, m5, 13
+ psrad m3, m4, 13
+ psrad m4, m7, 13
+ psrad m5, m6, 13
+ psrad m6, m9, 13
+ psrad m7, m8, 13
+ jmp tx2q
+.pass2:
+ call m(iadst_4x16_internal_10bpc).pass2_main
+ vpbroadcastd m5, [pw_2048]
+ vpbroadcastd m8, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2
+ pshufd m2, m2, q1032 ; -out11 out8 out10 -out9
+ vpblendd m3, m0, 0xcc ; -out12 out15 out13 -out14
+ pxor m7, m7
+ psubw m9, m7, m5
+ vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048
+ pmulhrsw m0, m4, m9
+ call .write_4x4
+ pmulhrsw m0, m2, m9
+ call .write_4x4
+ pmulhrsw m0, m1, m9
+ call .write_4x4
+ pmulhrsw m0, m3, m9
+ call .write_4x4
+ RET
+ALIGN function_align
+.write_4x4:
+ movq xm4, [dstq+strideq*0]
+ movhps xm4, [dstq+r6 ]
+ vpbroadcastq m5, [dstq+strideq*1]
+ vpbroadcastq m6, [dstq+strideq*2]
+ mova [cq+32*0], m7
+ mova [cq+32*1], m7
+ add cq, 32*2
+ vpblendd m4, m5, 0x30
+ vpblendd m4, m6, 0xc0
+ paddw m4, m0
+ pmaxsw m4, m7
+ pminsw m4, m8
+ vextracti128 xm5, m4, 1
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*1], xm5
+ movhps [dstq+strideq*2], xm5
+ movhps [dstq+r6 ], xm4
+ lea dstq, [dstq+strideq*4]
+ ret
+
+INV_TXFM_4X16_FN identity, dct
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
+cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
+ vpbroadcastd m7, [pd_5793]
+ pmulld m0, m7, [cq+32*0]
+ pmulld m4, m7, [cq+32*1]
+ pmulld m1, m7, [cq+32*2]
+ pmulld m5, m7, [cq+32*3]
+ pmulld m2, m7, [cq+32*4]
+ pmulld m6, m7, [cq+32*5]
+ pmulld m3, m7, [cq+32*6]
+ pmulld m7, [cq+32*7]
+ vpbroadcastd m8, [pd_6144]
+ REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7
+ REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7
+ jmp tx2q
+.pass2:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpbroadcastd m7, [pw_1697x16]
+ vpbroadcastd m8, [pw_2048]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ REPX {paddsw x, x}, m0, m1, m2, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ vpbroadcastd m4, [pixel_10bpc_max]
+ call .pass2_end
+ RET
+ALIGN function_align
+.pass2_end:
+ punpckhwd m7, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ lea r6, [strideq*5]
+ pxor m3, m3
+ punpckhdq m5, m0, m2 ; 2 3 6 7
+ punpckldq m0, m2 ; 0 1 4 5
+ punpckldq m6, m7, m1 ; 8 9 c d
+ punpckhdq m7, m1 ; a b e f
+ pmulhrsw m0, m8
+ call .write_2x4x2
+ pmulhrsw m0, m5, m8
+ call .write_2x4x2
+ pmulhrsw m0, m6, m8
+ lea dstq, [dstq+strideq*4]
+ call .write_2x4x2
+ pmulhrsw m0, m7, m8
+ call .write_2x4x2
+ ret
+ALIGN function_align
+.write_2x4x2:
+ movq xm1, [dstq+strideq*0]
+ movhps xm1, [dstq+strideq*1]
+ vpbroadcastq m2, [dstq+strideq*4]
+ vpblendd m1, m2, 0x30
+ vpbroadcastq m2, [dstq+r6 ]
+ vpblendd m1, m2, 0xc0
+ mova [cq+32*0], m3
+ mova [cq+32*1], m3
+ add cq, 32*2
+ paddw m1, m0
+ pmaxsw m1, m3
+ pminsw m1, m4
+ vextracti128 xm2, m1, 1
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ movq [dstq+strideq*4], xm2
+ movhps [dstq+r6 ], xm2
+ lea dstq, [dstq+strideq*2]
+ ret
+
+INV_TXFM_4X16_FN dct, dct, 12
+INV_TXFM_4X16_FN dct, identity, 12
+INV_TXFM_4X16_FN dct, adst, 12
+INV_TXFM_4X16_FN dct, flipadst, 12
+
+cglobal idct_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ jmp m(idct_4x16_internal_10bpc).pass1
+.pass2:
+ punpckldq m8, m0, m1
+ punpckhdq m0, m1
+ punpckldq m9, m2, m3
+ punpckhdq m2, m3
+ punpckldq m1, m4, m5
+ punpckhdq m4, m5
+ punpckldq m3, m6, m7
+ punpckhdq m6, m7
+ punpcklqdq m5, m0, m2 ; 2 6
+ punpckhqdq m12, m0, m2 ; 3 7
+ punpcklqdq m0, m8, m9 ; 0 4
+ punpckhqdq m10, m8, m9 ; 1 5
+ punpcklqdq m2, m1, m3 ; 8 12
+ punpckhqdq m13, m1, m3 ; 9 13
+ punpcklqdq m9, m4, m6 ; 10 14
+ punpckhqdq m4, m6 ; 11 15
+ vperm2i128 m1, m5, m9, 0x20 ; 2 10
+ vperm2i128 m3, m9, m5, 0x31 ; 14 6
+ vpermq m11, m4, q1302 ; 15 11
+ ; interleave
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m10
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3, m10, m11, m12, m13
+ REPX {pminsd x, m9}, m0, m1, m2, m3, m10, m11, m12, m13
+ call m(idct_16x4_internal_10bpc).pass1_main
+ vpermq m6, m12, q1302 ; 7 3
+ vpermq m5, m13, q3120 ; 9 13
+ call m(idct_16x4_internal_10bpc).pass1_main2
+ call m(idct_16x4_internal_10bpc).pass1_main3
+ REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ packssdw m2, m4, m5
+ packssdw m3, m6, m7
+ mova m4, [idct16_12_shuf]
+ REPX {vpermd x, m4, x}, m0, m1, m2, m3
+ vpbroadcastd m9, [pw_16384]
+ vpbroadcastd m8, [pixel_12bpc_max]
+ call m(idct_4x16_internal_10bpc).pass2_end
+ RET
+
+INV_TXFM_4X16_FN adst, dct, 12
+INV_TXFM_4X16_FN adst, adst, 12
+INV_TXFM_4X16_FN adst, flipadst, 12
+INV_TXFM_4X16_FN adst, identity, 12
+
+cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ call .main_pass1
+ psrad m0, m4, 12
+ psrad m1, m5, 12
+ psrad m2, 12
+ psrad m3, 12
+ psrad m4, m8, 12
+ psrad m5, m9, 12
+ psrad m6, 12
+ psrad m7, 12
+ jmp tx2q
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call .transpose_16x4
+ call m(iadst_4x16_internal_10bpc).main2
+ pshufd m4, m5, q1032
+ psrad m5, m6, 3
+ pshufd m6, m7, q1032
+ psrad m7, m8, 3
+ REPX {pshufd x, x, q1032}, m0, m2
+ REPX {psrad x, 3}, m0, m1, m2, m3, m4, m6
+.pass2_end:
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ packssdw m2, m4, m5
+ packssdw m3, m6, m7
+ mova m4, [iadst16_12_shuf]
+ REPX {vpermd x, m4, x}, m0, m1, m2, m3
+ vpbroadcastd m9, [pw_16384]
+ vpbroadcastd m8, [pixel_12bpc_max]
+ lea r6, [strideq*3]
+ pxor m7, m7
+ pmulhrsw m0, m9
+ call m(iadst_4x16_internal_10bpc).write_4x4
+ pmulhrsw m0, m9, m1
+ call m(iadst_4x16_internal_10bpc).write_4x4
+ pmulhrsw m0, m9, m2
+ call m(iadst_4x16_internal_10bpc).write_4x4
+ pmulhrsw m0, m9, m3
+ call m(iadst_4x16_internal_10bpc).write_4x4
+ RET
+ALIGN function_align
+.transpose_16x4:
+ ; transpose & interleave
+ punpckldq m8, m0, m1
+ punpckhdq m0, m1
+ punpckldq m9, m2, m3
+ punpckhdq m2, m3
+ punpckldq m1, m4, m5
+ punpckhdq m4, m5
+ punpckldq m3, m6, m7
+ punpckhdq m6, m7
+ punpcklqdq m10, m8, m0
+ punpckhqdq m0, m8
+ punpcklqdq m11, m9, m2
+ punpckhqdq m2, m9
+ punpcklqdq m8, m1, m4
+ punpckhqdq m4, m1
+ punpcklqdq m9, m3, m6
+ punpckhqdq m6, m3
+ vperm2i128 m5, m0, m2, 0x31 ; 7 5
+ vperm2i128 m7, m0, m2, 0x20 ; 3 1
+ vperm2i128 m0, m10, m11, 0x20 ; 0 2
+ vperm2i128 m2, m10, m11, 0x31 ; 4 6
+ vperm2i128 m1, m4, m6, 0x31 ; 15 13
+ vperm2i128 m3, m4, m6, 0x20 ; 11 9
+ vperm2i128 m4, m8, m9, 0x20 ; 8 10
+ vperm2i128 m6, m8, m9, 0x31 ; 12 14
+ ret
+ALIGN function_align
+.main_pass1:
+ call m(iadst_16x4_internal_10bpc).main
+ vpbroadcastd m6, [pd_3072]
+ paddd m10, m4, m5
+ psubd m4, m3
+ psubd m5, m3
+ paddd m3, m10
+ psubd m8, m7, m1
+ paddd m7, m9
+ psubd m9, m1
+ paddd m7, m1
+ REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7
+ REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7
+ paddd m6, m0
+ ret
+
+INV_TXFM_4X16_FN flipadst, dct, 12
+INV_TXFM_4X16_FN flipadst, adst, 12
+INV_TXFM_4X16_FN flipadst, flipadst, 12
+INV_TXFM_4X16_FN flipadst, identity, 12
+
+cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ call m(iadst_4x16_internal_12bpc).main_pass1
+ psrad m0, m3, 12
+ psrad m1, m2, 12
+ psrad m2, m5, 12
+ psrad m3, m4, 12
+ psrad m4, m7, 12
+ psrad m5, m6, 12
+ psrad m6, m9, 12
+ psrad m7, m8, 12
+ jmp tx2q
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(iadst_4x16_internal_12bpc).transpose_16x4
+ call m(iadst_4x16_internal_10bpc).main2
+ pshufd m4, m3, q1032
+ psrad m3, m5, 3
+ psrad m5, m2, 3
+ pshufd m2, m6, q1032
+ pshufd m6, m1, q1032
+ psrad m1, m7, 3
+ psrad m7, m0, 3
+ pshufd m0, m8, q1032
+ REPX {psrad x, 3}, m0, m2, m4, m6
+ jmp m(iadst_4x16_internal_12bpc).pass2_end
+
+INV_TXFM_4X16_FN identity, dct, 12
+INV_TXFM_4X16_FN identity, adst, 12
+INV_TXFM_4X16_FN identity, flipadst, 12
+INV_TXFM_4X16_FN identity, identity, 12
+
+cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [pd_1697]
+ mova m0, [cq+32*0]
+ mova m4, [cq+32*1]
+ mova m1, [cq+32*2]
+ mova m5, [cq+32*3]
+ vpbroadcastd m9, [pd_6144]
+ pmulld m2, m8, m0
+ pmulld m6, m8, m4
+ pmulld m3, m8, m1
+ pmulld m7, m8, m5
+ mova m10, [cq+32*4]
+ mova m11, [cq+32*5]
+ mova m12, [cq+32*6]
+ mova m13, [cq+32*7]
+ REPX {paddd x, m9}, m2, m6, m3, m7
+ REPX {psrad x, 12}, m2, m6, m3, m7
+ paddd m0, m2
+ pmulld m2, m8, m10
+ paddd m4, m6
+ pmulld m6, m8, m11
+ paddd m1, m3
+ pmulld m3, m8, m12
+ paddd m5, m7
+ pmulld m7, m8, m13
+ REPX {psrad x, 1 }, m0, m4, m1, m5
+ REPX {paddd x, m9}, m2, m6, m3, m7
+ REPX {psrad x, 12}, m2, m6, m3, m7
+ paddd m2, m10
+ paddd m6, m11
+ paddd m3, m12
+ paddd m7, m13
+ REPX {psrad x, 1 }, m2, m6, m3, m7
+ jmp tx2q
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m8, [pd_5793]
+ vpbroadcastd m9, [pd_1024]
+ REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpbroadcastd m8, [pw_16384]
+ vpbroadcastd m4, [pixel_12bpc_max]
+ call m(iidentity_4x16_internal_10bpc).pass2_end
+ RET
+
+%macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 8x4, %3
+%ifidn %1_%2, dct_dct
+ vpbroadcastd m2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 4
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
+%else
+ jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly
+%endif
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, identity
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+
+cglobal idct_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+.pass1:
+ vbroadcasti128 m1, [cq+16*1]
+ vbroadcasti128 m0, [cq+16*5]
+ vbroadcasti128 m2, [cq+16*3]
+ vbroadcasti128 m3, [cq+16*7]
+ vpbroadcastd m6, [pd_2896]
+ shufpd m1, m0, 0x0c ; 1 5
+ shufpd m3, m2, 0x0c ; 7 3
+ vbroadcasti128 m0, [cq+16*0]
+ vbroadcasti128 m4, [cq+16*2]
+ vbroadcasti128 m2, [cq+16*4]
+ vbroadcasti128 m5, [cq+16*6]
+ vpbroadcastd m7, [pd_2048]
+ shufpd m0, m4, 0x0c ; 0 2
+ shufpd m2, m5, 0x0c ; 4 6
+ REPX {pmulld x, m6}, m1, m3, m0, m2
+ REPX {paddd x, m7}, m1, m3, m0, m2
+ REPX {psrad x, 12}, m1, m3, m0, m2
+ call .main
+ psubd m3, m0, m4 ; out7 out6 (interleaved)
+ paddd m0, m4 ; out0 out1 (interleaved)
+ paddd m1, m2, m5 ; out3 out2 (interleaved)
+ psubd m2, m5 ; out4 out5 (interleaved)
+ pshufd m1, m1, q1032
+ pshufd m3, m3, q1032
+ jmp tx2q
+.pass2:
+ vbroadcasti128 m4, [deint_shuf]
+ packssdw m0, m1
+ packssdw m2, m3
+ vperm2i128 m1, m0, m2, 0x31
+ vinserti128 m0, xm2, 1
+ pshufb m0, m4
+ pshufb m1, m4
+ IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 7
+ vpermq m0, m0, q3120 ; out0 out1
+ vpermq m2, m1, q2031 ; out2 out3
+ jmp m(iadst_8x4_internal_10bpc).end
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 799_3406, 4017_2276, 1
+ IDCT4_1D_PACKED 0, 2, 4, 5, 6, 7
+ vpbroadcastd m6, [pd_2896]
+ punpcklqdq m4, m1, m3 ; t4a t7a
+ punpckhqdq m1, m3 ; t5a t6a
+ psubd m3, m4, m1 ; t5a t6a
+ paddd m4, m1 ; t4 t7
+ REPX {pmaxsd x, m8}, m3, m4, m0, m2
+ REPX {pminsd x, m9}, m3, m4, m0, m2
+ pmulld m3, m6
+ pshufd m1, m3, q1032
+ paddd m3, m7
+ psubd m5, m3, m1
+ paddd m1, m3
+ psrad m5, 12
+ psrad m1, 12
+ vpblendd m5, m4, 0x33 ; t4 t5
+ punpckhqdq m4, m1 ; t7 t6
+ ret
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ call m(iadst_4x8_internal_10bpc).main
+ vpblendd m3, m0, m4, 0x33 ; out6 out7
+ vpblendd m0, m4, 0xcc ; out0 out1
+ pshufd m1, m5, q1032
+ psignd m2, m6 ; out4 out5
+ psignd m1, m6 ; out2 out3
+ jmp tx2q
+.pass2:
+ call .pass2_main
+ vpermq m0, m0, q3120 ; out0 out1
+ vpermq m2, m1, q3120 ; out2 out3
+.end:
+ vpbroadcastd m1, [pw_2048]
+ pmulhrsw m0, m1
+ pmulhrsw m1, m2
+ vpbroadcastd m5, [pixel_10bpc_max]
+.end2:
+ mova xm2, [dstq+strideq*0]
+ vinserti128 m2, [dstq+strideq*1], 1
+ lea r6, [dstq+strideq*2]
+ mova xm3, [r6 +strideq*0]
+ vinserti128 m3, [r6 +strideq*1], 1
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [r6 +strideq*0], xm1
+ vextracti128 [r6 +strideq*1], m1, 1
+ RET
+ALIGN function_align
+.pass2_main:
+ vbroadcasti128 m4, [deint_shuf]
+ packssdw m0, m1
+ packssdw m2, m3
+ lea r6, [deint_shuf+128]
+ vperm2i128 m1, m0, m2, 0x31
+ vinserti128 m0, xm2, 1
+ pshufb m0, m4
+ pshufb m1, m4
+ jmp m(iadst_8x4_internal_8bpc).main
+ALIGN function_align
+.main:
+ vpbroadcastd m1, [pd_2896]
+ pmulld m0, m1, [cq+32*0]
+ pmulld m3, m1, [cq+32*3]
+ pmulld m2, m1, [cq+32*2]
+ pmulld m1, [cq+32*1]
+ vpbroadcastd m4, [pd_2048]
+ REPX {paddd x, m4}, m0, m3, m2, m1
+ REPX {psrad x, 12}, m0, m3, m2, m1
+.main2:
+ IADST4_1D
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_10bpc, 0, 5, 10, dst, stride, c, eob, tx2
+ call m(iadst_4x8_internal_10bpc).main
+ shufpd m3, m4, m0, 0x05
+ shufpd m0, m4, 0x05
+ psignd m2, m6
+ pshufd m6, m6, q1032
+ pshufd m1, m2, q1032
+ psignd m2, m5, m6
+ jmp tx2q
+.pass2:
+ call m(iadst_8x4_internal_10bpc).pass2_main
+ vpermq m2, m0, q2031
+ vpermq m0, m1, q2031
+ jmp m(iadst_8x4_internal_10bpc).end
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m4, [pd_2896]
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpermq m2, [cq+32*2], q3120
+ vpermq m3, [cq+32*3], q3120
+ vpbroadcastd m7, [pd_2048]
+ REPX {pmulld x, m4}, m0, m1, m2, m3
+ REPX {paddd x, m7}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ REPX {paddd x, x }, m0, m1, m2, m3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m5, [pixel_10bpc_max]
+ vpbroadcastd m4, [pw_1697x8]
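+ ; identity4 scaling: out = in + in*1697/4096 ~= in*sqrt(2)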
+ packssdw m0, m1
+ packssdw m2, m3
+ pmulhrsw m1, m4, m0
+ pmulhrsw m4, m2
+ paddsw m0, m1
+ paddsw m2, m4
+ packssdw m7, m7 ; pw_2048
+.pass2_end:
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ lea r6, [dstq+strideq*2]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmulhrsw m2, m7
+ pmulhrsw m0, m7
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ mova xm2, [dstq+strideq*0]
+ vinserti128 m2, [r6 +strideq*0], 1
+ mova xm3, [dstq+strideq*1]
+ vinserti128 m3, [r6 +strideq*1], 1
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ vextracti128 [r6 +strideq*0], m0, 1
+ vextracti128 [r6 +strideq*1], m1, 1
+ RET
+
+INV_TXFM_8X4_FN dct, dct, 12
+INV_TXFM_8X4_FN dct, identity, 12
+INV_TXFM_8X4_FN dct, adst, 12
+INV_TXFM_8X4_FN dct, flipadst, 12
+
+cglobal idct_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_20b_min]
+ vpbroadcastd m9, [clip_20b_max]
+ jmp m(idct_8x4_internal_10bpc).pass1
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ call m(iadst_8x4_internal_12bpc).transpose_4x8
+ IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7
+ jmp m(iadst_8x4_internal_12bpc).end
+
+INV_TXFM_8X4_FN adst, dct, 12
+INV_TXFM_8X4_FN adst, adst, 12
+INV_TXFM_8X4_FN adst, flipadst, 12
+INV_TXFM_8X4_FN adst, identity, 12
+
+cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_20b_min]
+ vpbroadcastd m9, [clip_20b_max]
+ call m(iadst_4x8_internal_10bpc).main2
+ vpblendd m3, m0, m4, 0x33 ; out6 out7
+ vpblendd m0, m4, 0xcc ; out0 out1
+ pshufd m1, m5, q1032
+ psignd m2, m6 ; out4 out5
+ psignd m1, m6 ; out2 out3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ call .pass2_main
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m4
+ paddd m1, m5, m6
+ paddd m2, m5
+ paddd m3, m5
+.pass2_end:
+ REPX {psrad x, 12}, m0, m1, m2, m3
+.end:
+ vpbroadcastd m4, [pw_16384]
+ REPX {psrad x, 3}, m0, m1, m2, m3
+ packssdw m0, m1
+ packssdw m2, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m2, m4
+ vpermq m0, m0, q3120 ; out0 out1
+ vpermq m1, m1, q3120 ; out2 out3
+ vpbroadcastd m5, [pixel_12bpc_max]
+ jmp m(iadst_8x4_internal_10bpc).end2
+ALIGN function_align
+.pass2_main:
+ call .transpose_4x8
+ jmp m(iadst_8x4_internal_10bpc).main2
+ALIGN function_align
+.transpose_4x8:
+ ; deinterleave
+ pshufd m0, m0, q3120
+ pshufd m1, m1, q3120
+ pshufd m2, m2, q3120
+ pshufd m3, m3, q3120
+ ; transpose
+ punpcklqdq m4, m0, m1
+ punpckhqdq m0, m1
+ punpcklqdq m5, m2, m3
+ punpckhqdq m2, m3
+ vperm2i128 m1, m0, m2, 0x20 ; out1
+ vperm2i128 m3, m0, m2, 0x31 ; out3
+ vperm2i128 m2, m4, m5, 0x31 ; out2
+ vperm2i128 m0, m4, m5, 0x20 ; out0
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct, 12
+INV_TXFM_8X4_FN flipadst, adst, 12
+INV_TXFM_8X4_FN flipadst, flipadst, 12
+INV_TXFM_8X4_FN flipadst, identity, 12
+
+cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_20b_min]
+ vpbroadcastd m9, [clip_20b_max]
+ call m(iadst_4x8_internal_10bpc).main2
+ shufpd m3, m4, m0, 0x05
+ shufpd m0, m4, 0x05
+ psignd m2, m6
+ pshufd m6, m6, q1032
+ pshufd m1, m2, q1032
+ psignd m2, m5, m6
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ call m(iadst_8x4_internal_12bpc).pass2_main
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m3
+ paddd m1, m5, m2
+ paddd m3, m5, m4
+ paddd m2, m5, m6
+ jmp m(iadst_8x4_internal_12bpc).pass2_end
+
+INV_TXFM_8X4_FN identity, dct, 12
+INV_TXFM_8X4_FN identity, adst, 12
+INV_TXFM_8X4_FN identity, flipadst, 12
+INV_TXFM_8X4_FN identity, identity, 12
+
+cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ jmp m(iidentity_8x4_internal_10bpc).pass1
+.pass2:
+ ; m0 = in0 in1 (interleaved)
+ ; m1 = in2 in3 (interleaved)
+ ; m2 = in4 in5 (interleaved)
+ ; m3 = in6 in7 (interleaved)
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ vpbroadcastd m4, [pd_5793]
+ REPX {pmulld x, m4}, m0, m1, m2, m3
+ REPX {paddd x, m7}, m0, m1, m2, m3
+ REPX {psrad x, 15}, m0, m1, m2, m3
+ vpbroadcastd m5, [pixel_12bpc_max]
+ vpbroadcastd m7, [pw_16384]
+ packssdw m0, m1
+ packssdw m2, m3
+ jmp m(iidentity_8x4_internal_10bpc).pass2_end
+
+%macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 8x8, %3
+%ifidn %1_%2, dct_dct
+ vpbroadcastd m2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
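+ ; dc-only shortcut: *181 >> 8 is exactly *2896 >> 12 (2896/4096 ~= 1/sqrt(2))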
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
+.dconly2:
+ add r6d, 384
+ sar r6d, 9
+.dconly3:
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ movd xm0, r6d
+ paddsw xm0, xm2
+ vpbroadcastw m0, xm0
+.dconly_loop:
+ mova xm1, [dstq+strideq*0]
+ vinserti128 m1, [dstq+strideq*1], 1
+ paddsw m1, m0
+ psubusw m1, m2
+ mova [dstq+strideq*0], xm1
+ vextracti128 [dstq+strideq*1], m1, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%else
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
+%endif
+%endif
+%endmacro
+
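+; note: IADST8_1D produces out1 and out7 negated and leaves the middle four
+; outputs scaled by 1448 (see the trailing comments below); the callers'
+; main_end applies the final rounding, shift and sign corrections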
+%macro IADST8_1D 14 ; src[1-8], tmp[1-3], pd_2048, clip[1-2]
+ ITX_MULSUB_2D %8, %1, %9, %10, %11, %12, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2D %2, %7, %9, %10, %11, %12, 3920, 1189 ; t7a, t6a
+ ITX_MULSUB_2D %6, %3, %9, %10, %11, %12, 1931, 3612 ; t3a, t2a
+ ITX_MULSUB_2D %4, %5, %9, %10, %11, %12, 3166, 2598 ; t5a, t4a
+ psubd m%9, m%3, m%7 ; t6
+ paddd m%3, m%7 ; t2
+ psubd m%7, m%1, m%5 ; t4
+ paddd m%1, m%5 ; t0
+ psubd m%5, m%6, m%2 ; t7
+ paddd m%6, m%2 ; t3
+ psubd m%2, m%8, m%4 ; t5
+ paddd m%8, m%4 ; t1
+ REPX {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8
+ REPX {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8
+ ITX_MULSUB_2D %7, %2, %4, %10, %11, %12, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a
+ psubd m%10, m%7, m%9 ; t7
+ paddd m%7, m%9 ; out6
+ vpbroadcastd m%9, [pd_1448]
+ psubd m%4, m%8, m%6 ; t3
+ paddd m%8, m%6 ; -out7
+ psubd m%6, m%1, m%3 ; t2
+ paddd m%1, m%3 ; out0
+ psubd m%3, m%2, m%5 ; t6
+ paddd m%2, m%5 ; -out1
+ REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10
+ REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10
+ REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10
+ psubd m%5, m%6, m%4 ; (t2 - t3) * 1448
+ paddd m%4, m%6 ; (t2 + t3) * 1448
+ psubd m%6, m%3, m%10 ; (t6 - t7) * 1448
+ paddd m%3, m%10 ; (t6 + t7) * 1448
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, identity
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+
+cglobal idct_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ mova m4, [cq+32*4]
+ mova m5, [cq+32*5]
+ mova m6, [cq+32*6]
+ mova m7, [cq+32*7]
+ vpbroadcastd m11, [pd_2048]
+ call .main
+ call .round_shift1
+ jmp tx2q
+.pass2:
+ call .transpose_8x8_packed
+ call m(idct_8x8_internal_8bpc).main
+ vpbroadcastd m12, [pw_2048]
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call .write_8x4_start
+ pmulhrsw m0, m2, m12
+ pmulhrsw m1, m3, m12
+ call .write_8x4
+ RET
+ALIGN function_align
+.write_8x4_start:
+ vpbroadcastd m11, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ pxor m10, m10
+.write_8x4:
+ mova xm8, [dstq+strideq*0]
+ vinserti128 m8, [dstq+strideq*1], 1
+ mova xm9, [dstq+strideq*2]
+ vinserti128 m9, [dstq+r6 ], 1
+ mova [cq+32*0], m10
+ mova [cq+32*1], m10
+ mova [cq+32*2], m10
+ mova [cq+32*3], m10
+ add cq, 32*4
+ paddw m0, m8
+ paddw m1, m9
+ pmaxsw m0, m10
+ pmaxsw m1, m10
+ pminsw m0, m11
+ pminsw m1, m11
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+r6 ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ ret
+ALIGN function_align
+.transpose_8x8_packed:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ lea r6, [deint_shuf+128]
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckhdq m2, m4, m1
+ punpckldq m4, m1
+ vinserti128 m1, m3, xm2, 1
+ vperm2i128 m3, m2, 0x31
+ vperm2i128 m2, m0, m4, 0x31
+ vinserti128 m0, xm4, 1
+ ret
+ALIGN function_align
+.main_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main:
+ ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a
+ ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 799, 4017 ; t4a t7a
+ ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 1567, 3784 ; t2 t3
+ paddd m8, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ paddd m9, m7, m3 ; t7
+ psubd m7, m3 ; t6a
+ vpbroadcastd m3, [pd_2896]
+ REPX {pmaxsd x, m12}, m1, m8, m7, m9
+ REPX {pminsd x, m13}, m1, m8, m7, m9
+ REPX {pmulld x, m3 }, m0, m4, m7, m1
+ paddd m0, m11
+ paddd m7, m11
+ psubd m5, m0, m4
+ paddd m0, m4
+ psubd m4, m7, m1
+ paddd m7, m1
+ REPX {psrad x, 12 }, m5, m0, m4, m7
+ psubd m3, m0, m6 ; dct4 out3
+ paddd m0, m6 ; dct4 out0
+ paddd m6, m5, m2 ; dct4 out1
+ psubd m5, m2 ; dct4 out2
+ REPX {pmaxsd x, m12}, m0, m6, m5, m3
+ REPX {pminsd x, m13}, m0, m6, m5, m3
+ ret
+ALIGN function_align
+.round_shift1:
+ pcmpeqd m1, m1
+ REPX {psubd x, m1}, m0, m6, m5, m3
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ psubd m7, m0, m9 ; out7
+ paddd m0, m9 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+ REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ call .main
+ call .main_end
+ jmp tx2q
+.pass2:
+ call m(idct_8x8_internal_10bpc).transpose_8x8_packed
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_8x8_internal_8bpc).main_pass2
+ vpbroadcastd m5, [pw_2048]
+ vpbroadcastd xm12, [pw_4096]
+ psubw m12, m5
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m0, m2, m12
+ pmulhrsw m1, m3, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+32*0]
+ mova m7, [cq+32*7]
+ mova m1, [cq+32*1]
+ mova m6, [cq+32*6]
+ mova m2, [cq+32*2]
+ mova m5, [cq+32*5]
+ mova m3, [cq+32*3]
+ mova m4, [cq+32*4]
+ vpbroadcastd m11, [pd_2048]
+.main2:
+ IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
+ psrld m8, 10 ; pd_1
+ vpbroadcastd m9, [pd_3072]
+ ret
+ALIGN function_align
+.main_end:
+ paddd m0, m8
+ psubd m1, m8, m1
+ paddd m6, m8
+ psubd m7, m8, m7
+ REPX {psrad x, 1 }, m0, m1, m6, m7
+ ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12
+ ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12
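+ ; i.e. the pd_1 bias of the final >> 1 and the 1024 bias of the preceding
+ ; 11-bit rounding are merged into a single add of 3072 (3071 when negating)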
+ psubd m8, m9, m8 ; pd_3071
+ paddd m2, m9
+ psubd m3, m8, m3
+ paddd m4, m9
+ psubd m5, m8, m5
+ REPX {psrad x, 12}, m2, m3, m4, m5
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ call m(iadst_8x8_internal_10bpc).main
+ call .main_end
+ jmp tx2q
+.pass2:
+ call m(idct_8x8_internal_10bpc).transpose_8x8_packed
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_8x8_internal_8bpc).main_pass2
+ vpbroadcastd m12, [pw_2048]
+ vpbroadcastd xm5, [pw_4096]
+ psubw m12, m5
+ vpermq m8, m3, q2031
+ vpermq m9, m2, q2031
+ vpermq m2, m1, q2031
+ vpermq m3, m0, q2031
+ pmulhrsw m0, m8, m12
+ pmulhrsw m1, m9, m12
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m0, m2, m12
+ pmulhrsw m1, m3, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+ALIGN function_align
+.main_end:
+ paddd m10, m8, m0
+ psubd m0, m8, m7
+ psubd m7, m8, m1
+ paddd m1, m8, m6
+ psrad m0, 1
+ psrad m1, 1
+ psrad m6, m7, 1
+ psrad m7, m10, 1
+ psubd m8, m9, m8 ; pd_3071
+ psubd m10, m8, m5
+ paddd m5, m9, m2
+ psubd m2, m8, m3
+ paddd m3, m9, m4
+ psrad m4, m2, 12
+ psrad m2, m10, 12
+ psrad m3, 12
+ psrad m5, 12
+ ret
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+.pass1:
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ mova m4, [cq+32*4]
+ mova m5, [cq+32*5]
+ mova m6, [cq+32*6]
+ mova m7, [cq+32*7]
+ jmp tx2q
+.pass2:
+ packssdw m3, m7
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass2_main:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ vpbroadcastd m12, [pw_4096]
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m1
+ punpckhdq m4, m1
+ punpckhqdq m1, m0, m2 ; 1 5
+ punpcklqdq m0, m2 ; 0 4
+ punpcklqdq m2, m3, m4 ; 2 6
+ punpckhqdq m3, m4 ; 3 7
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call .write_2x8x2_start
+ pmulhrsw m0, m2, m12
+ pmulhrsw m1, m3, m12
+ call .write_2x8x2_zero
+ RET
+.write_2x8x2_start:
+ lea r6, [strideq*5]
+ pxor m6, m6
+.write_2x8x2_zero:
+ mova [cq+32*0], m6
+ mova [cq+32*1], m6
+ mova [cq+32*2], m6
+ mova [cq+32*3], m6
+ add cq, 32*4
+.write_2x8x2:
+ mova xm4, [dstq+strideq*0]
+ vinserti128 m4, [dstq+strideq*4], 1
+ mova xm5, [dstq+strideq*1]
+ vinserti128 m5, [dstq+r6 ], 1
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m6
+ pmaxsw m1, m6
+ pminsw m0, m7
+ pminsw m1, m7
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ vextracti128 [dstq+strideq*4], m0, 1
+ vextracti128 [dstq+r6 ], m1, 1
+ lea dstq, [dstq+strideq*2]
+ ret
+
+%macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4]
+ punpckldq m%9, m%1, m%2 ; aibj emfn
+ punpckhdq m%1, m%2 ; ckdl gohp
+ punpckldq m%10, m%3, m%4 ; qyrz uCvD
+ punpckhdq m%3, m%4 ; sAtB wExF
+ punpckldq m%11, m%5, m%6 ; GOHP KSLT
+ punpckhdq m%5, m%6 ; IQJR MUNV
+ punpckldq m%12, m%7, m%8 ; WeXf aibj
+ punpckhdq m%7, m%8 ; YgZh ckdl
+ punpcklqdq m%2, m%9, m%10 ; aiqy emuC
+ punpckhqdq m%9, m%10 ; bjrz fnvD
+ punpcklqdq m%4, m%1, m%3 ; cksA gowE
+ punpckhqdq m%10, m%1, m%3 ; dltB hpxF
+ punpcklqdq m%6, m%11, m%12 ; GOWe KSai
+ punpckhqdq m%11, m%12 ; HPXf LTbj
+ punpcklqdq m%8, m%5, m%7 ; IQYg MUck
+ punpckhqdq m%12, m%5, m%7 ; JRZh NVdl
+ vperm2i128 m%1, m%2, m%6, 0x20 ; out0
+ vperm2i128 m%5, m%2, m%6, 0x31 ; out4
+ vperm2i128 m%2, m%9, m%11, 0x20 ; out1
+ vperm2i128 m%6, m%9, m%11, 0x31 ; out5
+ vperm2i128 m%3, m%4, m%8, 0x20 ; out2
+ vperm2i128 m%7, m%4, m%8, 0x31 ; out6
+ vperm2i128 m%4, m%10, m%12, 0x20 ; out3
+ vperm2i128 m%8, m%10, m%12, 0x31 ; out7
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct, 12
+INV_TXFM_8X8_FN dct, identity, 12
+INV_TXFM_8X8_FN dct, adst, 12
+INV_TXFM_8X8_FN dct, flipadst, 12
+
+cglobal idct_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(idct_8x8_internal_10bpc).pass1
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call .transpose_8x8
+ vpbroadcastd m11, [pd_2048]
+ call m(idct_8x8_internal_10bpc).main
+ call .round_shift4
+ jmp m(iadst_8x8_internal_12bpc).pass2_end
+ALIGN function_align
+.write_8x4_start:
+ vpbroadcastd m11, [pixel_12bpc_max]
+ lea r6, [strideq*3]
+ pxor m10, m10
+ ret
+ALIGN function_align
+.transpose_8x8:
+ TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ ret
+ALIGN function_align
+.round_shift4:
+ vpbroadcastd m1, [pd_8]
+ REPX {paddd x, m1}, m0, m6, m5, m3
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ psubd m7, m0, m9 ; out7
+ paddd m0, m9 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+ REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+
+INV_TXFM_8X8_FN adst, dct, 12
+INV_TXFM_8X8_FN adst, adst, 12
+INV_TXFM_8X8_FN adst, flipadst, 12
+INV_TXFM_8X8_FN adst, identity, 12
+
+cglobal iadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iadst_8x8_internal_10bpc).pass1
+.pass2:
+ call .pass2_main
+.pass2_end:
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ REPX {vpermq x, x, q3120}, m0, m1
+ call m(idct_8x8_internal_12bpc).write_8x4_start
+ call m(idct_8x8_internal_10bpc).write_8x4
+ packssdw m0, m4, m5
+ packssdw m1, m6, m7
+ REPX {vpermq x, x, q3120}, m0, m1
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+ALIGN function_align
+.pass2_main:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_12bpc).transpose_8x8
+ vpbroadcastd m11, [pd_2048]
+.pass2_main2:
+ call m(iadst_8x8_internal_10bpc).main2
+ pslld m9, m8, 3 ; pd_8
+ paddd m0, m9
+ psubd m1, m9, m1 ; 8+x
+ paddd m6, m9
+ psubd m7, m9, m7
+ REPX {psrad x, 4}, m0, m1, m6, m7
+ vpbroadcastd m9, [pd_17408]
+ psubd m8, m9, m8 ; 17407
+ paddd m2, m9
+ psubd m3, m8, m3
+ paddd m4, m9
+ psubd m5, m8, m5
+ REPX {psrad x, 15}, m2, m3, m4, m5
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct, 12
+INV_TXFM_8X8_FN flipadst, adst, 12
+INV_TXFM_8X8_FN flipadst, flipadst, 12
+INV_TXFM_8X8_FN flipadst, identity, 12
+
+cglobal iflipadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iflipadst_8x8_internal_10bpc).pass1
+.pass2:
+ call m(iadst_8x8_internal_12bpc).pass2_main
+ packssdw m7, m7, m6
+ packssdw m6, m1, m0
+ packssdw m1, m5, m4
+ vpermq m0, m7, q3120
+ vpermq m1, m1, q3120
+ call m(idct_8x8_internal_12bpc).write_8x4_start
+ call m(idct_8x8_internal_10bpc).write_8x4
+ packssdw m0, m3, m2
+ vpermq m0, m0, q3120
+ vpermq m1, m6, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+
+INV_TXFM_8X8_FN identity, dct, 12
+INV_TXFM_8X8_FN identity, adst, 12
+INV_TXFM_8X8_FN identity, flipadst, 12
+INV_TXFM_8X8_FN identity, identity, 12
+
+cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ jmp m(iidentity_8x8_internal_10bpc).pass1
+.pass2:
+ packssdw m3, m7
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(iidentity_8x8_internal_10bpc).pass2_main
+
+%macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
+ INV_TXFM_FN %1, %2, %3, 8x16, %4
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ vpbroadcastd m2, [dconly_%4bpc]
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity, 35
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+
+cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
+ cmp eobd, 43
+ jl .fast
+ add cq, 32
+ call .pass1_main
+ sub cq, 32
+ mova [cq+32* 1], m0
+ mova [cq+32* 3], m1
+ mova [cq+32* 5], m2
+ mova [cq+32* 7], m3
+ mova [cq+32* 9], m4
+ mova [cq+32*11], m5
+ mova [cq+32*13], m6
+ mova m15, m7
+ call .pass1_main
+ mova m8, [cq+32* 1]
+ mova m9, [cq+32* 3]
+ mova m10, [cq+32* 5]
+ mova m11, [cq+32* 7]
+ mova m12, [cq+32* 9]
+ mova m13, [cq+32*11]
+ mova m14, [cq+32*13]
+ jmp tx2q
+.fast:
+ call .pass1_main
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call .transpose
+ call m(idct_8x16_internal_8bpc).main
+ vpbroadcastd m12, [pw_2048]
+ REPX {vpermq x, x, q3120}, m0, m2, m4, m6
+ REPX {vpermq x, x, q2031}, m1, m3, m5, m7
+.end:
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m0, m2, m12
+ pmulhrsw m1, m3, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m0, m4, m12
+ pmulhrsw m1, m5, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m0, m6, m12
+ pmulhrsw m1, m7, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+ALIGN function_align
+.transpose:
+ packssdw m0, m8
+ packssdw m1, m9
+ packssdw m2, m10
+ packssdw m3, m11
+ packssdw m4, m12
+ packssdw m5, m13
+ packssdw m6, m14
+ packssdw m7, m15
+ lea r6, [deint_shuf+128]
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpcklwd m3, m4, m5
+ punpckhwd m4, m5
+ punpckhwd m5, m6, m7
+ punpcklwd m6, m7
+ punpckhdq m7, m3, m6
+ punpckldq m3, m6
+ punpckhdq m6, m4, m5
+ punpckldq m4, m5
+ punpckhdq m5, m8, m1
+ punpckldq m8, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ vperm2i128 m2, m0, m3, 0x31
+ vinserti128 m0, xm3, 1
+ vperm2i128 m3, m1, m7, 0x31
+ vinserti128 m1, xm7, 1
+ vperm2i128 m7, m5, m6, 0x31
+ vinserti128 m5, xm6, 1
+ vperm2i128 m6, m8, m4, 0x31
+ vinserti128 m4, m8, xm4, 1
+ ret
+ALIGN function_align
+.pass1_main:
+ pmulld m0, m14, [cq+32* 0]
+ pmulld m1, m14, [cq+32* 2]
+ pmulld m2, m14, [cq+32* 4]
+ pmulld m3, m14, [cq+32* 6]
+ pmulld m4, m14, [cq+32* 8]
+ pmulld m5, m14, [cq+32*10]
+ pmulld m6, m14, [cq+32*12]
+ pmulld m7, m14, [cq+32*14]
+ call m(idct_8x8_internal_10bpc).main_rect2
+ jmp m(idct_8x8_internal_10bpc).round_shift1
+ALIGN function_align
+.main_evenhalf:
+ paddd m1, m6, m7 ; idct8 out1
+ psubd m6, m7 ; idct8 out6
+ psubd m7, m0, m9 ; idct8 out7
+ paddd m0, m9 ; idct8 out0
+ paddd m2, m5, m4 ; idct8 out2
+ psubd m5, m4 ; idct8 out5
+ psubd m4, m3, m8 ; idct8 out4
+ paddd m3, m8 ; idct8 out3
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+.main_oddhalf_fast_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_oddhalf_fast: ; lower half zero
+ vpbroadcastd m7, [pd_4076]
+ vpbroadcastd m8, [pd_401]
+ vpbroadcastd m6, [pd_m1189]
+ vpbroadcastd m9, [pd_3920]
+ vpbroadcastd m5, [pd_3612]
+ vpbroadcastd m10, [pd_1931]
+ vpbroadcastd m4, [pd_m2598]
+ vpbroadcastd m15, [pd_3166]
+ pmulld m7, m0
+ pmulld m0, m8
+ pmulld m6, m1
+ pmulld m1, m9
+ pmulld m5, m2
+ pmulld m2, m10
+ pmulld m4, m3
+ pmulld m3, m15
+ jmp .main_oddhalf_fast2
+.main_oddhalf_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main_oddhalf:
+ ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
+ ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a
+.main_oddhalf_fast2:
+ REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
+ REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
+ psubd m8, m0, m4 ; t9
+ paddd m0, m4 ; t8
+ psubd m4, m6, m2 ; t10
+ paddd m2, m6 ; t11
+ psubd m6, m1, m5 ; t13
+ paddd m5, m1 ; t12
+ psubd m1, m7, m3 ; t14
+ paddd m7, m3 ; t15
+ REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
+ REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
+ vpbroadcastd m15, [pd_3784]
+ vpbroadcastd m10, [pd_1567]
+ ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 2
+ psubd m3, m1, m4 ; t10
+ paddd m1, m4 ; t9
+ psubd m4, m0, m2 ; t11a
+ paddd m0, m2 ; t8a
+ psubd m2, m8, m6 ; t13
+ paddd m6, m8 ; t14
+ psubd m8, m7, m5 ; t12a
+ paddd m7, m5 ; t15a
+ REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
+ REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
+ REPX {pmulld x, m14}, m2, m8, m3, m4
+ paddd m2, m11
+ paddd m8, m11
+ paddd m5, m2, m3 ; t13a
+ psubd m2, m3 ; t10a
+ psubd m3, m8, m4 ; t11
+ paddd m4, m8 ; t12
+ REPX {psrad x, 12}, m5, m2, m3, m4
+ mova [r6-32*4], m7
+ mova [r6-32*3], m6
+ mova [r6-32*2], m5
+ mova [r6-32*1], m4
+ mova [r6+32*0], m3
+ mova [r6+32*1], m2
+ mova [r6+32*2], m1
+ mova [r6+32*3], m0
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity, 35
+
+cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
+ cmp eobd, 43
+ jl .fast
+ add cq, 32
+ call .pass1_main
+ call m(iadst_8x8_internal_10bpc).main_end
+ sub cq, 32
+ mova [cq+32* 1], m0
+ mova [cq+32* 3], m1
+ mova [cq+32* 5], m2
+ mova [cq+32* 7], m3
+ mova [cq+32* 9], m4
+ mova [cq+32*11], m5
+ mova [cq+32*13], m6
+ mova m15, m7
+ call .pass1_main
+ call m(iadst_8x8_internal_10bpc).main_end
+ mova m8, [cq+32* 1]
+ mova m9, [cq+32* 3]
+ mova m10, [cq+32* 5]
+ mova m11, [cq+32* 7]
+ mova m12, [cq+32* 9]
+ mova m13, [cq+32*11]
+ mova m14, [cq+32*13]
+ jmp tx2q
+.fast:
+ call .pass1_main
+ call m(iadst_8x8_internal_10bpc).main_end
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call m(idct_8x16_internal_10bpc).transpose
+ call m(iadst_8x16_internal_8bpc).main
+ call m(iadst_8x16_internal_8bpc).main_pass2_end
+ vpbroadcastd m8, [pw_2048]
+ vpbroadcastd xm12, [pw_4096]
+ REPX {vpermq x, x, q2031}, m0, m1, m2, m3
+ REPX {vpermq x, x, q3120}, m4, m5, m6, m7
+ psubw m12, m8
+ jmp m(idct_8x16_internal_10bpc).end
+ALIGN function_align
+.pass1_main:
+ pmulld m0, m14, [cq+32* 0]
+ pmulld m7, m14, [cq+32*14]
+ pmulld m1, m14, [cq+32* 2]
+ pmulld m6, m14, [cq+32*12]
+ pmulld m2, m14, [cq+32* 4]
+ pmulld m5, m14, [cq+32*10]
+ pmulld m3, m14, [cq+32* 6]
+ pmulld m4, m14, [cq+32* 8]
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp m(iadst_8x8_internal_10bpc).main2
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity, 35
+
+cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
+ cmp eobd, 43
+ jl .fast
+ add cq, 32
+ call m(iadst_8x16_internal_10bpc).pass1_main
+ call m(iflipadst_8x8_internal_10bpc).main_end
+ sub cq, 32
+ mova [cq+32* 1], m0
+ mova [cq+32* 3], m1
+ mova [cq+32* 5], m2
+ mova [cq+32* 7], m3
+ mova [cq+32* 9], m4
+ mova [cq+32*11], m5
+ mova [cq+32*13], m6
+ mova m15, m7
+ call m(iadst_8x16_internal_10bpc).pass1_main
+ call m(iflipadst_8x8_internal_10bpc).main_end
+ mova m8, [cq+32* 1]
+ mova m9, [cq+32* 3]
+ mova m10, [cq+32* 5]
+ mova m11, [cq+32* 7]
+ mova m12, [cq+32* 9]
+ mova m13, [cq+32*11]
+ mova m14, [cq+32*13]
+ jmp tx2q
+.fast:
+ call m(iadst_8x16_internal_10bpc).pass1_main
+ call m(iflipadst_8x8_internal_10bpc).main_end
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call m(idct_8x16_internal_10bpc).transpose
+ call m(iadst_8x16_internal_8bpc).main
+ call m(iadst_8x16_internal_8bpc).main_pass2_end
+ vpbroadcastd m12, [pw_2048]
+ vpbroadcastd xm13, [pw_4096]
+ mova m11, m0
+ vpermq m0, m7, q2031
+ mova m10, m1
+ vpermq m1, m6, q2031
+ mova m9, m2
+ vpermq m2, m5, q2031
+ mova m8, m3
+ vpermq m3, m4, q2031
+ vpermq m4, m8, q3120
+ vpermq m5, m9, q3120
+ vpermq m6, m10, q3120
+ vpermq m7, m11, q3120
+ psubw m12, m13
+ jmp m(idct_8x16_internal_10bpc).end
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
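+; IDTX16 applies the identity16 scale: out = 2*in + in*1697/2048 ~= in*2*sqrt(2);
+; with the optional 4th argument the result is halved (~= in*sqrt(2))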
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
+ pmulhrsw m%2, m%3, m%1
+%if %0 == 4 ; if downshifting by 1
+%ifnum %4
+ pmulhrsw m%2, m%4
+%else ; without rounding
+ psraw m%2, 1
+%endif
+%else
+ paddsw m%1, m%1
+%endif
+ paddsw m%1, m%2
+%endmacro
+
+cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m15, [pd_2896]
+ pmulld m0, m15, [cq+32* 0]
+ pmulld m8, m15, [cq+32* 1]
+ pmulld m1, m15, [cq+32* 2]
+ pmulld m9, m15, [cq+32* 3]
+ pmulld m2, m15, [cq+32* 4]
+ pmulld m10, m15, [cq+32* 5]
+ pmulld m3, m15, [cq+32* 6]
+ pmulld m11, m15, [cq+32* 7]
+ pmulld m4, m15, [cq+32* 8]
+ pmulld m12, m15, [cq+32* 9]
+ pmulld m5, m15, [cq+32*10]
+ pmulld m13, m15, [cq+32*11]
+ pmulld m6, m15, [cq+32*12]
+ pmulld m14, m15, [cq+32*13]
+ pmulld m7, m15, [cq+32*14]
+ pmulld m15, [cq+32*15]
+ mova [cq], m7
+ vpbroadcastd m7, [pd_2048]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ paddd m7, [cq]
+ REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ packssdw m0, m8
+ packssdw m1, m9
+ packssdw m2, m10
+ packssdw m3, m11
+ packssdw m4, m12
+ packssdw m5, m13
+ packssdw m6, m14
+ packssdw m13, m7, m15
+ vpbroadcastd m8, [pw_1697x16]
+ REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13
+ vpbroadcastd m7, [pixel_10bpc_max]
+ vpbroadcastd m12, [pw_2048]
+ call .pass2_end
+ RET
+ALIGN function_align
+.pass2_end:
+ punpckhwd m9, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m13
+ punpcklwd m6, m13
+ punpckhwd m13, m4, m5
+ punpcklwd m4, m5
+ punpcklwd m5, m2, m3
+ punpckhwd m2, m3
+ punpckhdq m3, m0, m5
+ punpckldq m0, m5
+ punpckhdq m11, m9, m2
+ punpckldq m9, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckldq m6, m13, m1
+ punpckhdq m13, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m8, m9, m6
+ punpckhqdq m9, m6
+ punpcklqdq m10, m11, m13
+ punpckhqdq m11, m13
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(iidentity_8x8_internal_10bpc).write_2x8x2_start
+ pmulhrsw m0, m12, m2
+ pmulhrsw m1, m12, m3
+ call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
+ pmulhrsw m0, m12, m8
+ pmulhrsw m1, m12, m9
+ lea dstq, [dstq+strideq*4]
+ call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
+ pmulhrsw m0, m12, m10
+ pmulhrsw m1, m12, m11
+ call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
+ ret
+
+INV_TXFM_8X16_FN dct, dct, 0, 12
+INV_TXFM_8X16_FN dct, identity, 35, 12
+INV_TXFM_8X16_FN dct, adst, 0, 12
+INV_TXFM_8X16_FN dct, flipadst, 0, 12
+
+cglobal idct_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(idct_8x16_internal_10bpc).pass1
+.pass2:
+ lea r6, [rsp+32*4]
+ call .transpose
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ mova [cq+32* 8], m0
+ mova [cq+32*10], m2
+ mova [cq+32*12], m4
+ mova [cq+32*14], m6
+ pmaxsd m0, m12, [cq+32* 1]
+ pmaxsd m4, m12, m1
+ pmaxsd m1, m12, [cq+32* 3]
+ pmaxsd m2, m12, [cq+32* 5]
+ pmaxsd m6, m12, m5
+ pmaxsd m5, m12, m3
+ pmaxsd m3, m12, [cq+32* 7]
+ pmaxsd m7, m12
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ pmaxsd m0, m12, [cq+32* 0]
+ pmaxsd m1, m12, [cq+32* 2]
+ pmaxsd m2, m12, [cq+32* 4]
+ pmaxsd m3, m12, [cq+32* 6]
+ pmaxsd m4, m12, [cq+32* 8]
+ pmaxsd m5, m12, [cq+32*10]
+ pmaxsd m6, m12, [cq+32*12]
+ pmaxsd m7, m12, [cq+32*14]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ vpbroadcastd m11, [pd_8]
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_16x8_internal_10bpc).pass1_rotations
+ REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+.end:
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ packssdw m2, m4, m5
+ packssdw m3, m6, m7
+ packssdw m4, m8, m9
+ packssdw m5, m10, m11
+ packssdw m6, m12, m13
+ packssdw m7, m14, m15
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+ call m(idct_8x8_internal_12bpc).write_8x4_start
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m2, q3120
+ vpermq m1, m3, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m4, q3120
+ vpermq m1, m5, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m6, q3120
+ vpermq m1, m7, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+ALIGN function_align
+.transpose:
+ mova [cq+32* 8], m8
+ mova [cq+32* 9], m9
+ mova [cq+32*10], m10
+ mova [cq+32*11], m11
+ call m(idct_8x8_internal_12bpc).transpose_8x8
+ mova [cq+32* 0], m0
+ mova [cq+32* 1], m1
+ mova [cq+32* 2], m2
+ mova [cq+32* 3], m3
+ mova [cq+32* 4], m4
+ mova [cq+32* 5], m5
+ mova [cq+32* 6], m6
+ mova [cq+32* 7], m7
+ mova m0, [cq+32* 8]
+ mova m1, [cq+32* 9]
+ mova m2, [cq+32*10]
+ mova m3, [cq+32*11]
+ mova m4, m12
+ mova m5, m13
+ mova m6, m14
+ mova m7, m15
+ jmp m(idct_8x8_internal_12bpc).transpose_8x8
+
+INV_TXFM_8X16_FN adst, dct, 0, 12
+INV_TXFM_8X16_FN adst, adst, 0, 12
+INV_TXFM_8X16_FN adst, flipadst, 0, 12
+INV_TXFM_8X16_FN adst, identity, 35, 12
+
+cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iadst_8x16_internal_10bpc).pass1
+.pass2:
+ lea r6, [rsp+32*4]
+ call .pass2_main
+ call m(iadst_16x8_internal_10bpc).pass1_rotations
+.pass2_end:
+ REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
+ jmp m(idct_8x16_internal_12bpc).end
+ALIGN function_align
+.pass2_main:
+ call m(idct_8x16_internal_12bpc).transpose
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+ mova [cq+32* 8], m0
+ mova [cq+32*11], m3
+ mova [cq+32*12], m4
+ mova [cq+32*15], m7
+ pmaxsd m0, m13, [cq+32* 2] ; 2
+ pmaxsd m3, m13, m1 ; 9
+ pmaxsd m1, m13, m5 ; 13
+ pmaxsd m4, m13, m2 ; 10
+ pmaxsd m2, m13, [cq+32* 6] ; 6
+ pmaxsd m5, m13, [cq+32* 5] ; 5
+ pmaxsd m6, m13, m6 ; 14
+ pmaxsd m7, m13, [cq+32* 1] ; 1
+ REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m12, [pd_2048]
+ vpbroadcastd m15, [pd_2896]
+ call m(iadst_16x8_internal_10bpc).main_part1
+ pmaxsd m0, m13, [cq+32* 0] ; 0
+ pmaxsd m1, m13, [cq+32*15] ; 15
+ pmaxsd m2, m13, [cq+32* 4] ; 4
+ pmaxsd m3, m13, [cq+32*11] ; 11
+ pmaxsd m4, m13, [cq+32* 8] ; 8
+ pmaxsd m5, m13, [cq+32* 7] ; 7
+ pmaxsd m6, m13, [cq+32*12] ; 12
+ pmaxsd m7, m13, [cq+32* 3] ; 3
+ REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(iadst_16x8_internal_10bpc).main_part2
+ vpbroadcastd m14, [pd_17408]
+ psrld m15, 11 ; pd_1
+ psubd m13, m14, m15 ; pd_17407
+ pslld m15, 3 ; pd_8
+ ret
+
+INV_TXFM_8X16_FN flipadst, dct, 0, 12
+INV_TXFM_8X16_FN flipadst, adst, 0, 12
+INV_TXFM_8X16_FN flipadst, flipadst, 0, 12
+INV_TXFM_8X16_FN flipadst, identity, 35, 12
+
+cglobal iflipadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iflipadst_8x16_internal_10bpc).pass1
+.pass2:
+ lea r6, [rsp+32*4]
+ call m(iadst_8x16_internal_12bpc).pass2_main
+ call m(iflipadst_16x8_internal_10bpc).pass1_rotations
+ jmp m(iadst_8x16_internal_12bpc).pass2_end
+
+INV_TXFM_8X16_FN identity, dct, 0, 12
+INV_TXFM_8X16_FN identity, adst, 0, 12
+INV_TXFM_8X16_FN identity, flipadst, 0, 12
+INV_TXFM_8X16_FN identity, identity, 0, 12
+
+cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ jmp m(iidentity_8x16_internal_10bpc).pass1
+.pass2:
+ call .pass2_main
+ packssdw m0, m8
+ packssdw m1, m9
+ packssdw m2, m10
+ packssdw m3, m11
+ packssdw m4, m12
+ packssdw m5, m13
+ packssdw m6, m14
+ packssdw m13, m7, m15
+ vpbroadcastd m7, [pixel_12bpc_max]
+ vpbroadcastd m12, [pw_16384]
+ call m(iidentity_8x16_internal_10bpc).pass2_end
+ RET
+ALIGN function_align
+.pass2_main:
+ mova [cq], m7
+ vpbroadcastd m7, [clip_18b_min]
+ REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ pmaxsd m7, [cq]
+ mova [cq], m15
+ vpbroadcastd m15, [clip_18b_max]
+ REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ pminsd m15, [cq]
+ mova [cq], m7
+ vpbroadcastd m7, [pd_5793]
+ REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ pmulld m7, [cq]
+ mova [cq], m15
+ vpbroadcastd m15, [pd_1024]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [cq]
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ ret
+
+%macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 16x4, %3
+%ifidn %1_%2, dct_dct
+ vpbroadcastd m3, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 4
+.dconly2:
+ add r6d, 384
+ sar r6d, 9
+.dconly3:
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ movd xm0, r6d
+ paddsw xm0, xm3
+ vpbroadcastw m0, xm0
+.dconly_loop:
+ paddsw m1, m0, [dstq+strideq*0]
+ paddsw m2, m0, [dstq+strideq*1]
+ psubusw m1, m3
+ psubusw m2, m3
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%else
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly
+%endif
+%endif
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, identity
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+
+cglobal idct_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+.pass1:
+ vbroadcasti128 m0, [cq+16* 0]
+ vbroadcasti128 m4, [cq+16* 4]
+ vbroadcasti128 m1, [cq+16* 2]
+ vbroadcasti128 m7, [cq+16* 6]
+ vbroadcasti128 m5, [cq+16*10]
+ vbroadcasti128 m2, [cq+16* 8]
+ vbroadcasti128 m6, [cq+16*12]
+ vbroadcasti128 m3, [cq+16*14]
+ shufpd m0, m4, 0x0c ; 0 4
+ shufpd m1, m5, 0x0c ; 2 10
+ shufpd m2, m6, 0x0c ; 8 12
+ shufpd m3, m7, 0x0c ; 14 6
+ call .pass1_main
+ vbroadcasti128 m10, [cq+16* 1]
+ vbroadcasti128 m4, [cq+16* 5]
+ vbroadcasti128 m11, [cq+16*15]
+ vbroadcasti128 m5, [cq+16*11]
+ shufpd m10, m4, 0x0c ; 1 5
+ shufpd m11, m5, 0x0c ; 15 11
+ vbroadcasti128 m5, [cq+16* 9]
+ vbroadcasti128 m4, [cq+16*13]
+ shufpd m5, m4, 0x0c ; 9 13
+ vbroadcasti128 m6, [cq+16* 7]
+ vbroadcasti128 m4, [cq+16* 3]
+ shufpd m6, m4, 0x0c ; 7 3
+ call .pass1_main2
+ pcmpeqd m4, m4
+ REPX {psubd x, m4}, m0, m1, m2, m3
+ call .pass1_main3
+ REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp tx2q
+.pass2:
+ call .transpose_4x16_packed
+ lea r6, [deint_shuf+128]
+ call m(idct_16x4_internal_8bpc).main
+.end:
+ vpbroadcastd m4, [pw_2048]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ vpbroadcastd m5, [pixel_10bpc_max]
+.end2:
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+.end3:
+ lea r6, [dstq+strideq*2]
+ paddw m2, [r6 +strideq*0]
+ paddw m3, [r6 +strideq*1]
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
+ REPX {pmaxsw x, m4}, m0, m1, m2, m3
+ REPX {pminsw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [r6 +strideq*0], m2
+ mova [r6 +strideq*1], m3
+ RET
+ALIGN function_align
+.pass1_main:
+ vpbroadcastd m7, [pd_2048]
+ call m(idct_8x4_internal_10bpc).main
+ psubd m3, m0, m4 ; idct8 out7 out6
+ paddd m0, m4 ; idct8 out0 out1
+ paddd m1, m2, m5 ; idct8 out3 out2
+ psubd m2, m5 ; idct8 out4 out5
+ ret
+ALIGN function_align
+.pass1_main2:
+ ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1
+ ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1
+ vbroadcasti128 m12, [pd_3784_m3784]
+ psubd m4, m10, m5
+ paddd m10, m5 ; t8 t11
+ psignd m4, m12 ; t9 t10
+ psubd m5, m11, m6
+ paddd m11, m6 ; t15 t12
+ psignd m5, m12 ; t14 t13
+ vpbroadcastd m6, [pd_1567]
+ vpbroadcastd m13, [pd_3784]
+ REPX {pmaxsd x, m8}, m5, m4
+ REPX {pminsd x, m9}, m5, m4
+ pmulld m12, m5
+ pmulld m5, m6
+ vbroadcasti128 m6, [pd_1567_m1567]
+ pmulld m13, m4
+ pmulld m4, m6
+ REPX {pmaxsd x, m8}, m10, m11, m0, m1
+ REPX {pminsd x, m9}, m10, m11, m0, m1
+ paddd m12, m7
+ paddd m5, m7
+ paddd m4, m12
+ psubd m5, m13
+ psrad m4, 12 ; t14a t10a
+ psrad m5, 12 ; t9a t13a
+ vpbroadcastd m12, [pd_2896]
+ punpckhqdq m6, m11, m5
+ punpcklqdq m11, m4
+ punpckhqdq m4, m10, m4
+ punpcklqdq m10, m5
+ psubd m5, m11, m6 ; t12a t13
+ paddd m11, m6 ; t15a t14
+ psubd m6, m10, m4 ; t11a t10
+ paddd m10, m4 ; t8a t9
+ REPX {pmaxsd x, m8}, m5, m6
+ REPX {pminsd x, m9}, m5, m6
+ pmulld m5, m12
+ pmulld m6, m12
+ REPX {pmaxsd x, m8}, m2, m3, m11, m10
+ REPX {pminsd x, m9}, m2, m3, m11, m10
+ ret
+ALIGN function_align
+.pass1_main3:
+ paddd m5, m7
+ psubd m4, m5, m6
+ paddd m5, m6
+ psrad m4, 12 ; t11 t10a
+ psrad m5, 12 ; t12 t13a
+ psubd m7, m0, m11 ; out15 out14
+ paddd m0, m11 ; out0 out1
+ psubd m6, m1, m5 ; out12 out13
+ paddd m1, m5 ; out3 out2
+ psubd m5, m2, m4 ; out11 out10
+ paddd m2, m4 ; out4 out5
+ psubd m4, m3, m10 ; out8 out9
+ paddd m3, m10 ; out7 out6
+ REPX {pshufd x, x, q1032}, m1, m3, m5, m7
+ ret
+ALIGN function_align
+.transpose_4x16_packed:
+ vbroadcasti128 m8, [deint_shuf]
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ REPX {pshufb x, m8}, m0, m2, m4, m6
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpckhqdq m2, m4, m6
+ punpcklqdq m4, m6
+ vperm2i128 m3, m1, m2, 0x31
+ vinserti128 m1, xm2, 1
+ vperm2i128 m2, m0, m4, 0x31
+ vinserti128 m0, xm4, 1
+ ret
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ call m(iadst_4x16_internal_10bpc).main
+ psrad m11, 11 ; pd_1
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ paddd m4, m5, m11
+ paddd m5, m6, m11
+ paddd m6, m7, m11
+ paddd m7, m8, m11
+.pass1_end:
+ REPX {pshufd x, x, q1032}, m0, m2, m4, m6
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp tx2q
+.pass2:
+ call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+ lea r6, [deint_shuf+128]
+ call m(iadst_16x4_internal_8bpc).main
+ jmp m(idct_16x4_internal_10bpc).end
+ALIGN function_align
+.main:
+ vpbroadcastd m6, [pd_1321]
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ vpbroadcastd m7, [pd_2482]
+ mova m2, [cq+32*6]
+ mova m3, [cq+32*7]
+ pmulld m4, m0, m6
+ pmulld m5, m1, m6 ; 1321*in0
+ pmulld m9, m2, m7
+ pmulld m8, m3, m7 ; 2482*in3
+ paddd m4, m9
+ paddd m8, m5 ; 1321*in0 + 2482*in3
+ pmulld m5, m0, m7
+ pmulld m9, m1, m7 ; 2482*in0
+ paddd m0, m2
+ paddd m1, m3 ; in0 + in3
+ paddd m7, m6 ; pd_3803
+ pmulld m2, m7
+ pmulld m3, m7 ; 3803*in3
+ psubd m5, m2
+ psubd m9, m3 ; 2482*in0 - 3803*in3
+ mova m2, [cq+32*4]
+ pmulld m10, m7, m2
+ pmulld m3, m6, m2
+ psubd m2, m0
+ mova m0, [cq+32*5]
+ pmulld m7, m0 ; 3803*in2
+ pmulld m6, m0 ; 1321*in2
+ psubd m0, m1 ; in2 - in0 - in3
+ vpbroadcastd m1, [pd_m3344]
+ paddd m4, m10
+ paddd m7, m8 ; t0
+ psubd m5, m3
+ psubd m9, m6 ; t1
+ pmulld m2, m1
+ pmulld m0, m1 ; t2
+ pmulld m3, m1, [cq+32*2]
+ pmulld m1, [cq+32*3] ; -t3
+ ret
+ALIGN function_align
+.main_end:
+ ; expects: m6 = rnd
+ paddd m5, m6
+ paddd m9, m6
+ paddd m10, m4, m5
+ paddd m4, m6
+ paddd m8, m7, m6
+ paddd m7, m9
+ psubd m4, m3 ; out0 (unshifted)
+ psubd m5, m3 ; out1 (unshifted)
+ paddd m2, m6 ; out2 (unshifted)
+ paddd m3, m10 ; out3 (unshifted)
+ psubd m8, m1 ; out4 (unshifted)
+ psubd m9, m1 ; out5 (unshifted)
+ paddd m6, m0 ; out6 (unshifted)
+ paddd m7, m1 ; out7 (unshifted)
+ ret
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ call m(iadst_4x16_internal_10bpc).main
+ psrad m11, 11 ; pd_1
+ paddd m4, m3, m11
+ paddd m3, m5, m11
+ paddd m5, m2, m11
+ paddd m2, m6, m11
+ paddd m6, m1, m11
+ paddd m1, m7, m11
+ paddd m7, m0, m11
+ paddd m0, m8, m11
+ jmp m(iadst_16x4_internal_10bpc).pass1_end
+.pass2:
+ call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+ lea r6, [deint_shuf+128]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m4, [pw_2048]
+ pmulhrsw m5, m3, m4
+ pmulhrsw m6, m2, m4
+ pmulhrsw m2, m1, m4
+ pmulhrsw m3, m0, m4
+ paddw m0, m5, [dstq+strideq*0]
+ paddw m1, m6, [dstq+strideq*1]
+ vpbroadcastd m5, [pixel_10bpc_max]
+ jmp m(idct_16x4_internal_10bpc).end3
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [pd_5793]
+ vpermq m0, [cq+32*0], q3120 ; 0 1
+ vpermq m1, [cq+32*1], q3120 ; 2 3
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m4, [cq+32*4], q3120 ; 8 9
+ vpermq m5, [cq+32*5], q3120 ; a b
+ vpermq m6, [cq+32*6], q3120 ; c d
+ vpermq m7, [cq+32*7], q3120 ; e f
+ vpbroadcastd m9, [pd_3072]
+ REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp tx2q
+.pass2:
+ call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+ vpbroadcastd m7, [pw_1697x8]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(idct_16x4_internal_10bpc).end
+
+INV_TXFM_16X4_FN dct, dct, 12
+INV_TXFM_16X4_FN dct, identity, 12
+INV_TXFM_16X4_FN dct, adst, 12
+INV_TXFM_16X4_FN dct, flipadst, 12
+
+cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_20b_min]
+ vpbroadcastd m9, [clip_20b_max]
+ jmp m(idct_16x4_internal_10bpc).pass1
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ ; deinterleave
+ REPX {pshufd x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
+ ; transpose
+ punpcklqdq m8, m0, m1
+ punpckhqdq m0, m1
+ punpcklqdq m9, m2, m3
+ punpckhqdq m2, m3
+ punpcklqdq m10, m4, m5
+ punpckhqdq m4, m5
+ punpcklqdq m11, m6, m7
+ punpckhqdq m6, m7
+ vperm2i128 m3, m0, m2, 0x31 ; out6
+ vperm2i128 m1, m0, m2, 0x20 ; out2
+ vperm2i128 m7, m4, m6, 0x31 ; out7
+ vperm2i128 m5, m4, m6, 0x20 ; out3
+ vperm2i128 m13, m10, m11, 0x31 ; out5
+ vperm2i128 m12, m10, m11, 0x20 ; out1
+ vperm2i128 m11, m8, m9, 0x31 ; out4
+ vperm2i128 m10, m8, m9, 0x20 ; out0
+ call m(idct_4x16_internal_10bpc).pass1_main
+ pmulld m0, m6, m10
+ pmulld m2, m6, m11
+ pmulld m4, m6, m12
+ pmulld m6, m13
+ vpbroadcastd m10, [pd_17408]
+ call m(idct_4x16_internal_10bpc).pass1_main2
+ REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpbroadcastd m5, [pixel_12bpc_max]
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+ jmp m(idct_16x4_internal_10bpc).end2
+
+INV_TXFM_16X4_FN adst, dct, 12
+INV_TXFM_16X4_FN adst, adst, 12
+INV_TXFM_16X4_FN adst, flipadst, 12
+INV_TXFM_16X4_FN adst, identity, 12
+
+cglobal iadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iadst_16x4_internal_10bpc).pass1
+.pass2:
+ call .pass2_main
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ jmp m(idct_16x4_internal_10bpc).end2
+ALIGN function_align
+.pass2_main:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m6, m7
+ pmaxsd m8, m4, m12
+ pmaxsd m9, m5, m12
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ call m(iadst_8x4_internal_12bpc).transpose_4x8
+ mova [cq+32*0], m0
+ mova [cq+32*2], m1
+ mova [cq+32*4], m2
+ mova [cq+32*6], m3
+ pminsd m0, m8, m13
+ pminsd m1, m9, m13
+ pminsd m2, m6, m13
+ pminsd m3, m7, m13
+ call m(iadst_8x4_internal_12bpc).transpose_4x8
+ mova [cq+32*1], m0
+ mova [cq+32*3], m1
+ mova [cq+32*5], m2
+ mova [cq+32*7], m3
+ call m(iadst_16x4_internal_10bpc).main
+ vpbroadcastd m6, [pd_2048]
+ call m(iadst_16x4_internal_10bpc).main_end
+ psrad m0, m4, 15
+ psrad m1, m5, 15
+ psrad m2, 15
+ psrad m3, 15
+ psrad m4, m8, 15
+ psrad m5, m9, 15
+ psrad m6, 15
+ psrad m7, 15
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpbroadcastd m4, [pw_16384]
+ vpbroadcastd m5, [pixel_12bpc_max]
+ ret
+
+INV_TXFM_16X4_FN flipadst, dct, 12
+INV_TXFM_16X4_FN flipadst, adst, 12
+INV_TXFM_16X4_FN flipadst, flipadst, 12
+INV_TXFM_16X4_FN flipadst, identity, 12
+
+cglobal iflipadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iflipadst_16x4_internal_10bpc).pass1
+.pass2:
+ call m(iadst_16x4_internal_12bpc).pass2_main
+ vpermq m7, m0, q3120
+ vpermq m6, m1, q3120
+ vpermq m1, m2, q3120
+ vpermq m0, m3, q3120
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pmulhrsw m2, m6, m4
+ pmulhrsw m3, m7, m4
+ jmp m(idct_16x4_internal_10bpc).end2
+
+INV_TXFM_16X4_FN identity, dct, 12
+INV_TXFM_16X4_FN identity, adst, 12
+INV_TXFM_16X4_FN identity, flipadst, 12
+INV_TXFM_16X4_FN identity, identity, 12
+
+cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [pd_1697]
+ vpermq m0, [cq+32*0], q3120 ; 0 1
+ vpermq m1, [cq+32*1], q3120 ; 2 3
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpbroadcastd m9, [pd_3072]
+ pmulld m4, m8, m0
+ pmulld m5, m8, m1
+ pmulld m6, m8, m2
+ pmulld m7, m8, m3
+ vpermq m10, [cq+32*4], q3120 ; 8 9
+ vpermq m11, [cq+32*5], q3120 ; a b
+ vpermq m12, [cq+32*6], q3120 ; c d
+ vpermq m13, [cq+32*7], q3120 ; e f
+ REPX {paddd x, m9}, m4, m5, m6, m7
+ REPX {psrad x, 12}, m4, m5, m6, m7
+ paddd m0, m4
+ pmulld m4, m8, m10
+ paddd m1, m5
+ pmulld m5, m8, m11
+ paddd m2, m6
+ pmulld m6, m8, m12
+ paddd m3, m7
+ pmulld m7, m8, m13
+ REPX {paddd x, m9}, m4, m5, m6, m7
+ REPX {psrad x, 12}, m4, m5, m6, m7
+ paddd m4, m10
+ paddd m5, m11
+ paddd m6, m12
+ paddd m7, m13
+ jmp tx2q
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m8, [pd_5793]
+ vpbroadcastd m9, [pd_2048]
+ REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+ vpbroadcastd m4, [pw_16384]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ vpbroadcastd m5, [pixel_12bpc_max]
+ jmp m(idct_16x4_internal_10bpc).end2
+
+%macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 16x8, %3
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_%3bpc]
+ mov [cq], eobd ; 0
+ or r3d, 8
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m14, [pd_2896]
+ pmulld m0, m14, [cq+32* 1]
+ pmulld m1, m14, [cq+32* 3]
+ pmulld m2, m14, [cq+32* 5]
+ pmulld m3, m14, [cq+32* 7]
+ pmulld m4, m14, [cq+32* 9]
+ pmulld m5, m14, [cq+32*11]
+ pmulld m6, m14, [cq+32*13]
+ pmulld m7, m14, [cq+32*15]
+ vpbroadcastd m11, [pd_2048]
+ lea r6, [rsp+32*4]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
+ pmulld m0, m14, [cq+32* 0]
+ pmulld m1, m14, [cq+32* 2]
+ pmulld m2, m14, [cq+32* 4]
+ pmulld m3, m14, [cq+32* 6]
+ pmulld m4, m14, [cq+32* 8]
+ pmulld m5, m14, [cq+32*10]
+ pmulld m6, m14, [cq+32*12]
+ pmulld m7, m14, [cq+32*14]
+ call m(idct_8x8_internal_10bpc).main_rect2
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ psrld m11, 11 ; pd_1
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ call .pass1_rotations
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call .transpose
+ call m(idct_16x8_internal_8bpc).main
+ vpbroadcastd m10, [pw_2048]
+.end:
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ call .write_16x4_start
+.end2:
+ pmulhrsw m0, m4, m10
+ pmulhrsw m1, m5, m10
+ pmulhrsw m2, m6, m10
+ pmulhrsw m3, m7, m10
+ call .write_16x4_zero
+ RET
+ALIGN function_align
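+; combine the even-half results in m0-m7 with the odd-half results stored at [r6] into out0-out15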
+.pass1_rotations:
+ mova m14, [r6-32*4]
+ mova m13, [r6-32*3]
+ mova m12, [r6-32*2]
+ mova m11, [r6-32*1]
+ mova m10, [r6+32*0]
+ mova m9, [r6+32*1]
+ mova m8, [r6+32*2]
+ psubd m15, m0, m14 ; out15
+ paddd m0, m14 ; out0
+ psubd m14, m1, m13 ; out14
+ paddd m1, m13 ; out1
+ psubd m13, m2, m12 ; out13
+ paddd m2, m12 ; out2
+ psubd m12, m3, m11 ; out12
+ paddd m3, m11 ; out3
+ psubd m11, m4, m10 ; out11
+ paddd m4, m10 ; out4
+ psubd m10, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ psubd m9, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ psubd m8, m7, [r6+32*3] ; out8
+ paddd m7, [r6+32*3] ; out7
+ ret
+ALIGN function_align
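+; pack the 32-bit pass-1 output in m0-m15 to 16-bit words and transpose it for the second pass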
+.transpose:
+ lea r6, [deint_shuf+128]
+.transpose2:
+ packssdw m0, m8
+ packssdw m1, m9
+ packssdw m2, m10
+ packssdw m3, m11
+ packssdw m4, m12
+ packssdw m5, m13
+ packssdw m6, m14
+ packssdw m7, m15
+.transpose3:
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ punpckhwd m3, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m6, m7
+ punpcklwd m6, m7
+ punpckhdq m7, m4, m6
+ punpckldq m4, m6
+ punpckldq m6, m8, m2
+ punpckhdq m8, m2
+ punpckhdq m2, m0, m1
+ punpckldq m0, m1
+ punpckhdq m1, m3, m5
+ punpckldq m3, m5
+ punpcklqdq m5, m6, m3
+ punpckhqdq m6, m3
+ punpckhqdq m3, m2, m7
+ punpcklqdq m2, m7
+ punpcklqdq m7, m8, m1
+ punpckhqdq m8, m1
+ punpckhqdq m1, m0, m4
+ punpcklqdq m0, m4
+ vperm2i128 m4, m0, m5, 0x31
+ vinserti128 m0, xm5, 1
+ vperm2i128 m5, m1, m6, 0x31
+ vinserti128 m1, xm6, 1
+ vperm2i128 m6, m2, m7, 0x31
+ vinserti128 m2, xm7, 1
+ vperm2i128 m7, m3, m8, 0x31
+ vinserti128 m3, xm8, 1
+ ret
+ALIGN function_align
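+; add m0-m3 to four rows of dst, clamping to [0, pixel_max]; the _zero entry also clears eight coefficient rows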
+.write_16x4_start:
+ vpbroadcastd m9, [pixel_10bpc_max]
+ lea r3, [strideq*3]
+ pxor m8, m8
+.write_16x4_zero:
+ REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7
+ add cq, 32*8
+.write_16x4:
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r3 ]
+ REPX {pmaxsw x, m8}, m0, m1, m2, m3
+ REPX {pminsw x, m9}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r3 ], m3
+ lea dstq, [dstq+strideq*4]
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+.pass1:
+ lea r6, [rsp+32*4]
+ call .main
+ vpbroadcastd m14, [pd_3072]
+ psrld m15, 11 ; pd_1
+ psubd m13, m14, m15 ; pd_3071
+ call .pass1_rotations
+.pass1_end:
+ REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11
+ jmp tx2q
+.pass2:
+ call m(idct_16x8_internal_10bpc).transpose
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+ vpbroadcastd m10, [pw_2048]
+ pxor m11, m11
+ psubw m11, m10
+ pmulhrsw m0, m10
+ pmulhrsw m1, m11
+ pmulhrsw m2, m10
+ pmulhrsw m3, m11
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m4, m10
+ pmulhrsw m1, m5, m11
+ pmulhrsw m2, m6, m10
+ pmulhrsw m3, m7, m11
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+ALIGN function_align
+.pass1_rotations:
+ paddd m0, m15
+ psubd m1, m15, m1
+ paddd m2, m15
+ psubd m3, m15, m3
+ paddd m4, m14
+ psubd m5, m13, m5
+ paddd m6, m14
+ psubd m7, m13, m7
+ paddd m8, m14, m9
+ psubd m9, m13, m10
+ paddd m10, m14, m11
+ psubd m11, m13, m12
+ paddd m12, m15, [r6-32*1]
+ psubd m13, m15, [r6-32*2]
+ paddd m14, m15, [r6-32*3]
+ psubd m15, [r6-32*4]
+ ret
+ALIGN function_align
+.main:
+    ; expects: m13 = clip_min, m14 = clip_max
+ vpbroadcastd m15, [pd_2896]
+ pmulld m0, m15, [cq+32* 2]
+ pmulld m1, m15, [cq+32*13]
+ pmulld m2, m15, [cq+32* 6]
+ pmulld m3, m15, [cq+32* 9]
+ pmulld m4, m15, [cq+32*10]
+ pmulld m5, m15, [cq+32* 5]
+ pmulld m6, m15, [cq+32*14]
+ pmulld m7, m15, [cq+32* 1]
+ vpbroadcastd m12, [pd_2048]
+ REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ call .main_part1
+ pmulld m0, m15, [cq+32* 0]
+ pmulld m1, m15, [cq+32*15]
+ pmulld m2, m15, [cq+32* 4]
+ pmulld m3, m15, [cq+32*11]
+ pmulld m4, m15, [cq+32* 8]
+ pmulld m5, m15, [cq+32* 7]
+ pmulld m6, m15, [cq+32*12]
+ pmulld m7, m15, [cq+32* 3]
+ REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main_part2:
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 201, 4091
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 1751, 3703
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3035, 2751
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 3857, 1380
+ psubd m8, m0, m4 ; t8a
+ paddd m0, m4 ; t0a
+ psubd m4, m1, m5 ; t9a
+ paddd m1, m5 ; t1a
+ psubd m5, m2, m6 ; t12a
+ paddd m2, m6 ; t4a
+ psubd m6, m3, m7 ; t13a
+ paddd m7, m3 ; t5a
+ REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+ REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
+ vpbroadcastd m11, [pd_4017]
+ vpbroadcastd m10, [pd_799]
+ ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11
+ ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10
+ psubd m3, m0, m2 ; t4
+ paddd m0, m2 ; t0
+ psubd m2, m1, m7 ; t5
+ paddd m1, m7 ; t1
+ psubd m7, m4, m6 ; t12a
+ paddd m4, m6 ; t8a
+ psubd m6, m8, m5 ; t13a
+ paddd m5, m8 ; t9a
+ REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+ REPX {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5
+ vpbroadcastd m11, [pd_3784]
+ vpbroadcastd m10, [pd_1567]
+ ITX_MULSUB_2D 3, 2, 8, 9, _, 12, 10, 11
+ ITX_MULSUB_2D 7, 6, 8, 9, _, 12, 10, 11
+ pminsd m10, m14, [r6-32*4] ; t2
+ pminsd m8, m14, [r6-32*3] ; t3
+ psubd m9, m0, m10 ; t2a
+ paddd m0, m10 ; out0
+ psubd m10, m1, m8 ; t3a
+ paddd m1, m8 ; -out15
+ pmaxsd m9, m13
+ pmaxsd m10, m13
+ pminsd m9, m14
+ pminsd m10, m14
+ mova [r6-32*4], m1
+ mova m11, [r6-32*1] ; t7a
+ mova m1, [r6-32*2] ; t6a
+ psubd m8, m3, m11 ; t7
+ paddd m11, m3 ; out12
+ paddd m3, m2, m1 ; -out3
+ psubd m2, m1 ; t6
+ pmaxsd m8, m13
+ pmaxsd m2, m13
+ pminsd m8, m14
+ pminsd m2, m14
+ mova [r6-32*1], m11
+ mova [r6-32*3], m2
+ mova m1, [r6+32*3] ; t15
+ mova m2, [r6+32*2] ; t14
+ paddd m12, m7, m1 ; -out13
+ psubd m7, m1 ; t15a
+ psubd m11, m6, m2 ; t14a
+ paddd m2, m6 ; out2
+ pmaxsd m7, m13
+ pmaxsd m11, m13
+ pminsd m7, m14
+ pminsd m11, m14
+ mova [r6-32*2], m12
+ pminsd m1, m14, [r6+32*0] ; t10a
+ pminsd m12, m14, [r6+32*1] ; t11a
+ psubd m6, m4, m1 ; t10
+ paddd m1, m4 ; -out1
+ psubd m4, m5, m12 ; t11
+ paddd m5, m12 ; out14
+ vpbroadcastd m12, [pd_1448]
+ pmaxsd m6, m13
+ pmaxsd m4, m13
+ pminsd m6, m14
+ pminsd m4, m14
+ REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4
+ pmulld m12, [r6-32*3] ; t6
+ mova [r6-32*3], m5
+ paddd m5, m11, m7 ; -out5 (unshifted)
+ psubd m11, m7 ; out10 (unshifted)
+ paddd m7, m9, m10 ; -out7 (unshifted)
+ psubd m9, m10 ; out8 (unshifted)
+ psubd m10, m6, m4 ; -out9 (unshifted)
+ paddd m6, m4 ; out6 (unshifted)
+ paddd m4, m12, m8 ; out4 (unshifted)
+ psubd m12, m8 ; -out11 (unshifted)
+ ret
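+; first half of the 16-point iadst butterflies; intermediates are spilled to [r6-32*4..r6+32*3]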
+.main_part1:
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 995, 3973
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 2440, 3290
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3513, 2106
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 4052, 601
+ psubd m8, m0, m4 ; t10a
+ paddd m0, m4 ; t2a
+ psubd m4, m1, m5 ; t11a
+ paddd m1, m5 ; t3a
+ psubd m5, m2, m6 ; t14a
+ paddd m2, m6 ; t6a
+ psubd m6, m3, m7 ; t15a
+ paddd m7, m3 ; t7a
+ REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+ REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
+ vpbroadcastd m11, [pd_2276]
+ vpbroadcastd m10, [pd_3406]
+ ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11
+ ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10
+ psubd m3, m0, m2 ; t6
+ paddd m0, m2 ; t2
+ psubd m2, m1, m7 ; t7
+ paddd m1, m7 ; t3
+ psubd m7, m4, m6 ; t14a
+ paddd m4, m6 ; t10a
+ psubd m6, m8, m5 ; t15a
+ paddd m5, m8 ; t11a
+ REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+ REPX {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later
+ vpbroadcastd m11, [pd_1567]
+ vpbroadcastd m10, [pd_3784]
+ ITX_MULSUB_2D 2, 3, 8, 9, _, 12, 10, 11
+ ITX_MULSUB_2D 6, 7, 8, 9, _, 12, 10, 11
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6-32*2], m2
+ mova [r6-32*1], m3
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+.pass1:
+ lea r6, [rsp+32*4]
+ call m(iadst_16x8_internal_10bpc).main
+ vpbroadcastd m14, [pd_3072]
+ psrld m15, 11
+ psubd m13, m14, m15
+ call .pass1_rotations
+ jmp m(iadst_16x8_internal_10bpc).pass1_end
+.pass2:
+ call m(idct_16x8_internal_10bpc).transpose
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+ vpbroadcastd m10, [pw_2048]
+ pxor m11, m11
+ psubw m11, m10
+ mova m12, m0
+ pmulhrsw m0, m7, m11
+ mova m7, m1
+ pmulhrsw m1, m6, m10
+ mova m6, m2
+ pmulhrsw m2, m5, m11
+ mova m5, m3
+ pmulhrsw m3, m4, m10
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m5, m11
+ pmulhrsw m1, m6, m10
+ pmulhrsw m2, m7, m11
+ pmulhrsw m3, m12, m10
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+ALIGN function_align
+.pass1_rotations:
+ psubd m8, m13, m7
+ paddd m7, m14, m9
+ paddd m9, m14, m6
+ psubd m6, m13, m10
+ psubd m10, m13, m5
+ paddd m5, m14, m11
+ paddd m11, m14, m4
+ psubd m4, m13, m12
+ psubd m12, m15, m3
+ paddd m3, m15, [r6-32*1]
+ paddd m13, m15, m2
+ psubd m2, m15, [r6-32*2]
+ psubd m14, m15, m1
+ mova m1, m15
+ paddd m15, m0
+ psubd m0, m1, [r6-32*4]
+ paddd m1, [r6-32*3]
+ ret
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m15, [pd_2896]
+ pmulld m0, m15, [cq+32* 0]
+ pmulld m1, m15, [cq+32* 1]
+ pmulld m2, m15, [cq+32* 2]
+ pmulld m3, m15, [cq+32* 3]
+ pmulld m4, m15, [cq+32* 4]
+ pmulld m5, m15, [cq+32* 5]
+ pmulld m6, m15, [cq+32* 6]
+ pmulld m7, m15, [cq+32* 7]
+ pmulld m8, m15, [cq+32* 8]
+ pmulld m9, m15, [cq+32* 9]
+ pmulld m10, m15, [cq+32*10]
+ pmulld m11, m15, [cq+32*11]
+ pmulld m12, m15, [cq+32*12]
+ pmulld m13, m15, [cq+32*13]
+ pmulld m14, m15, [cq+32*14]
+ pmulld m15, [cq+32*15]
+ mova [rsp], m7
+ vpbroadcastd m7, [pd_2048]
+ REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ paddd m7, [rsp]
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ mova [rsp], m15
+ vpbroadcastd m15, [pd_5793]
+ REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ pmulld m15, [rsp]
+ mova [rsp], m7
+ vpbroadcastd m7, [pd_3072]
+ REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ paddd m7, [rsp]
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call m(idct_16x8_internal_10bpc).transpose
+ vpbroadcastd m10, [pw_4096]
+ jmp m(idct_16x8_internal_10bpc).end
+
+INV_TXFM_16X8_FN dct, dct, 12
+INV_TXFM_16X8_FN dct, identity, 12
+INV_TXFM_16X8_FN dct, adst, 12
+INV_TXFM_16X8_FN dct, flipadst, 12
+
+cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(idct_16x8_internal_10bpc).pass1
+.pass2:
+ call .pass2_main
+ RET
+ALIGN function_align
+.pass2_main:
+ call m(idct_8x16_internal_12bpc).transpose
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m11, [pd_2048]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x8_internal_12bpc).round_shift4
+ mova [cq+32* 8], m0
+ mova [cq+32* 9], m1
+ mova [cq+32*10], m2
+ mova [cq+32*11], m3
+ mova [cq+32*12], m4
+ mova [cq+32*13], m5
+ mova [cq+32*14], m6
+ mova [cq+32*15], m7
+ pmaxsd m0, m12, [cq+32*0]
+ pmaxsd m1, m12, [cq+32*1]
+ pmaxsd m2, m12, [cq+32*2]
+ pmaxsd m3, m12, [cq+32*3]
+ pmaxsd m4, m12, [cq+32*4]
+ pmaxsd m5, m12, [cq+32*5]
+ pmaxsd m6, m12, [cq+32*6]
+ pmaxsd m7, m12, [cq+32*7]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x8_internal_12bpc).round_shift4
+.end:
+ packssdw m0, [cq+32* 8]
+ packssdw m1, [cq+32* 9]
+ packssdw m2, [cq+32*10]
+ packssdw m3, [cq+32*11]
+ packssdw m4, [cq+32*12]
+ packssdw m5, [cq+32*13]
+ packssdw m6, [cq+32*14]
+ packssdw m7, [cq+32*15]
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+ call .write_16x4_start
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ vpermq m0, m4, q3120
+ vpermq m1, m5, q3120
+ vpermq m2, m6, q3120
+ vpermq m3, m7, q3120
+ jmp m(idct_16x8_internal_10bpc).write_16x4_zero
+ALIGN function_align
+.write_16x4_start:
+ vpbroadcastd m9, [pixel_12bpc_max]
+ lea r3, [strideq*3]
+ pxor m8, m8
+ ret
+
+INV_TXFM_16X8_FN adst, dct, 12
+INV_TXFM_16X8_FN adst, adst, 12
+INV_TXFM_16X8_FN adst, flipadst, 12
+INV_TXFM_16X8_FN adst, identity, 12
+
+cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_20b_min]
+ vpbroadcastd m14, [clip_20b_max]
+ jmp m(iadst_16x8_internal_10bpc).pass1
+.pass2:
+ call .pass2_main
+ call m(idct_16x8_internal_12bpc).end
+ RET
+ALIGN function_align
+.pass2_main:
+ call m(idct_8x16_internal_12bpc).transpose
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m11, [pd_2048]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(iadst_8x8_internal_12bpc).pass2_main2
+ mova [cq+32* 8], m0
+ mova [cq+32* 9], m1
+ mova [cq+32*10], m2
+ mova [cq+32*11], m3
+ mova [cq+32*12], m4
+ mova [cq+32*13], m5
+ mova [cq+32*14], m6
+ mova [cq+32*15], m7
+ pmaxsd m0, m12, [cq+32*0]
+ pmaxsd m1, m12, [cq+32*1]
+ pmaxsd m2, m12, [cq+32*2]
+ pmaxsd m3, m12, [cq+32*3]
+ pmaxsd m4, m12, [cq+32*4]
+ pmaxsd m5, m12, [cq+32*5]
+ pmaxsd m6, m12, [cq+32*6]
+ pmaxsd m7, m12, [cq+32*7]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(iadst_8x8_internal_12bpc).pass2_main2
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct, 12
+INV_TXFM_16X8_FN flipadst, adst, 12
+INV_TXFM_16X8_FN flipadst, flipadst, 12
+INV_TXFM_16X8_FN flipadst, identity, 12
+
+cglobal iflipadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_20b_min]
+ vpbroadcastd m14, [clip_20b_max]
+ jmp m(iflipadst_16x8_internal_10bpc).pass1
+.pass2:
+ call m(iadst_16x8_internal_12bpc).pass2_main
+ packssdw m13, m0, [cq+32* 8]
+ packssdw m12, m1, [cq+32* 9]
+ packssdw m11, m2, [cq+32*10]
+ packssdw m10, m3, [cq+32*11]
+ packssdw m3, m4, [cq+32*12]
+ packssdw m2, m5, [cq+32*13]
+ packssdw m1, m6, [cq+32*14]
+ packssdw m0, m7, [cq+32*15]
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+ call m(idct_16x8_internal_12bpc).write_16x4_start
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ vpermq m0, m10, q3120
+ vpermq m1, m11, q3120
+ vpermq m2, m12, q3120
+ vpermq m3, m13, q3120
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+
+INV_TXFM_16X8_FN identity, dct, 12
+INV_TXFM_16X8_FN identity, adst, 12
+INV_TXFM_16X8_FN identity, flipadst, 12
+INV_TXFM_16X8_FN identity, identity, 12
+
+cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ jmp m(iidentity_16x8_internal_10bpc).pass1
+.pass2:
+ call m(idct_16x8_internal_10bpc).transpose2
+ vpbroadcastd m10, [pw_4096]
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ call m(idct_16x8_internal_12bpc).write_16x4_start
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ jmp m(idct_16x8_internal_10bpc).end2
+
+%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
+ INV_TXFM_FN %1, %2, %3, 16x16, %4
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_%4bpc]
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity, 28
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+
+cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ add cq, 32
+ call .main
+ sub cq, 32
+ mova m10, [r6-32*4]
+ mova m9, [r6-32*3]
+ mova m8, [r6-32*2]
+ psubd m15, m0, m10 ; out15
+ paddd m0, m10 ; out0
+ psubd m10, m1, m9 ; out14
+ paddd m1, m9 ; out1
+ psubd m9, m2, m8 ; out13
+ paddd m2, m8 ; out2
+ REPX {psrad x, 2}, m0, m1, m2
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6-32*2], m2
+ mova m2, [r6-32*1]
+ mova m1, [r6+32*0]
+ mova m0, [r6+32*1]
+ REPX {psrad x, 2}, m9, m10, m15
+ psubd m8, m3, m2 ; out12
+ paddd m3, m2 ; out3
+ psubd m2, m4, m1 ; out11
+ paddd m4, m1 ; out4
+ psubd m1, m5, m0 ; out10
+ paddd m5, m0 ; out5
+ REPX {psrad x, 2}, m3, m4, m5
+ mova [r6-32*1], m3
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova m4, [r6+32*2]
+ mova m3, [r6+32*3]
+ REPX {psrad x, 2}, m1, m2, m8
+ psubd m5, m6, m4 ; out9
+ paddd m6, m4 ; out6
+ psubd m4, m7, m3 ; out8
+ paddd m7, m3 ; out7
+ REPX {psrad x, 2}, m6, m7, m4, m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ add r6, 32*8
+ mova [r6-32*4], m4
+ mova [r6-32*3], m5
+ mova [r6-32*2], m1
+ mova [r6-32*1], m2
+ mova [r6+32*0], m8
+ mova [r6+32*1], m9
+ mova [r6+32*2], m10
+ mova [r6+32*3], m15
+.fast:
+ add r6, 32*8
+ call .main
+ mova m14, [r6-32*4]
+ mova m13, [r6-32*3]
+ mova m12, [r6-32*2]
+ mova m11, [r6-32*1]
+ mova m10, [r6+32*0]
+ mova m9, [r6+32*1]
+ mova m8, [r6+32*2]
+ psubd m15, m0, m14 ; out15
+ paddd m0, m14 ; out0
+ psubd m14, m1, m13 ; out14
+ paddd m1, m13 ; out1
+ psubd m13, m2, m12 ; out13
+ paddd m2, m12 ; out2
+ psubd m12, m3, m11 ; out12
+ paddd m3, m11 ; out3
+ psubd m11, m4, m10 ; out11
+ paddd m4, m10 ; out4
+ psubd m10, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ psubd m9, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ psubd m8, m7, [r6+32*3] ; out8
+ paddd m7, [r6+32*3] ; out7
+ sub r6, 32*8
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call .transpose
+ lea r6, [pw_5+128]
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+.end:
+ call .write_16x16
+ RET
+ALIGN function_align
+.write_16x16:
+ mova [rsp+gprsize+32*0], m8
+ mova [rsp+gprsize+32*1], m9
+ mova [rsp+gprsize+32*2], m12
+ vpbroadcastd m12, [pw_2048]
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ pmulhrsw m2, m12
+ pmulhrsw m3, m12
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+.write_16x16_2:
+ pmulhrsw m0, m12, m4
+ pmulhrsw m1, m12, m5
+ pmulhrsw m2, m12, m6
+ pmulhrsw m3, m12, m7
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m12, [rsp+gprsize+32*0]
+ pmulhrsw m1, m12, [rsp+gprsize+32*1]
+ pmulhrsw m2, m12, m10
+ pmulhrsw m3, m12, m11
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m12, [rsp+gprsize+32*2]
+ pmulhrsw m1, m12, m13
+ pmulhrsw m2, m12, m14
+ pmulhrsw m3, m12, m15
+ jmp m(idct_16x8_internal_10bpc).write_16x4_zero
+ALIGN function_align
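+; pack and transpose the 16x16 pass-1 output; a small eob (left half only) takes the fast path below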
+.transpose:
+ test eobd, eobd
+ jl .transpose_fast
+ packssdw m8, [r6-32*4]
+ packssdw m9, [r6-32*3]
+ packssdw m10, [r6-32*2]
+ packssdw m11, [r6-32*1]
+ packssdw m12, [r6+32*0]
+ packssdw m13, [r6+32*1]
+ packssdw m14, [r6+32*2]
+ packssdw m15, [r6+32*3]
+ sub r6, 32*8
+ packssdw m0, [r6-32*4]
+ packssdw m1, [r6-32*3]
+ packssdw m2, [r6-32*2]
+ packssdw m3, [r6-32*1]
+ packssdw m4, [r6+32*0]
+ packssdw m5, [r6+32*1]
+ packssdw m6, [r6+32*2]
+ packssdw m7, [r6+32*3]
+ mova [r6], m8
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ punpckhwd m3, m6, m7
+ punpcklwd m6, m7
+ punpcklwd m7, m4, m5
+ punpckhwd m4, m5
+ punpckldq m5, m8, m2
+ punpckhdq m8, m2
+ punpckhdq m2, m0, m1
+ punpckldq m0, m1
+ punpckhdq m1, m7, m6
+ punpckldq m7, m6
+ punpckhdq m6, m4, m3
+ punpckldq m4, m3
+ punpckhqdq m3, m2, m1
+ punpcklqdq m2, m1
+ punpckhqdq m1, m0, m7
+ punpcklqdq m0, m7
+ punpcklqdq m7, m8, m6
+ punpckhqdq m8, m6
+ punpckhqdq m6, m5, m4
+ punpcklqdq m5, m4
+ mova m4, [r6]
+ mova [r6], m8
+ punpcklwd m8, m4, m9
+ punpckhwd m4, m9
+ punpcklwd m9, m10, m11
+ punpckhwd m10, m11
+ punpckhwd m11, m14, m15
+ punpcklwd m14, m15
+ punpckhwd m15, m12, m13
+ punpcklwd m12, m13
+ punpckldq m13, m4, m10
+ punpckhdq m4, m10
+ punpckhdq m10, m8, m9
+ punpckldq m8, m9
+ punpckhdq m9, m12, m14
+ punpckldq m12, m14
+ punpckhdq m14, m15, m11
+ punpckldq m15, m11
+ punpckhqdq m11, m10, m9
+ punpcklqdq m10, m9
+ punpckhqdq m9, m8, m12
+ punpcklqdq m8, m12
+ punpcklqdq m12, m13, m15
+ punpckhqdq m13, m15
+ punpckhqdq m15, m4, m14
+ punpcklqdq m14, m4, m14
+ vperm2i128 m4, m0, m8, 0x31
+ vinserti128 m0, xm8, 1
+ vinserti128 m8, m5, xm12, 1
+ vperm2i128 m12, m5, 0x13
+ vperm2i128 m5, m1, m9, 0x31
+ vinserti128 m1, xm9, 1
+ vinserti128 m9, m6, xm13, 1
+ vperm2i128 m13, m6, 0x13
+ vperm2i128 m6, m2, m10, 0x31
+ vinserti128 m2, xm10, 1
+ vinserti128 m10, m7, xm14, 1
+ vperm2i128 m14, m7, 0x13
+ vperm2i128 m7, m3, m11, 0x31
+ vinserti128 m3, xm11, 1
+ mova xm11, [r6]
+ vinserti128 m11, xm15, 1
+ vinserti128 m15, [r6+16], 0
+ ret
+.transpose_fast:
+ call m(idct_16x8_internal_10bpc).transpose2
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ ret
+ALIGN function_align
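+; 16-point idct pass 1 for one 8-column half, reusing the 8x8/8x16 idct helpers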
+.main:
+ mova m0, [cq+64* 1]
+ mova m1, [cq+64* 3]
+ mova m2, [cq+64* 5]
+ mova m3, [cq+64* 7]
+ mova m4, [cq+64* 9]
+ mova m5, [cq+64*11]
+ mova m6, [cq+64*13]
+ mova m7, [cq+64*15]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 2]
+ mova m2, [cq+64* 4]
+ mova m3, [cq+64* 6]
+ mova m4, [cq+64* 8]
+ mova m5, [cq+64*10]
+ mova m6, [cq+64*12]
+ mova m7, [cq+64*14]
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ psrld m10, m11, 10 ; pd_2
+ REPX {paddd x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+.pass1:
+ vpbroadcastd m15, [pd_2896]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ add cq, 32
+ call .main
+ sub cq, 32
+ vpbroadcastd m8, [pd_5120]
+ paddd m4, m8
+ paddd m6, m8
+ paddd m9, m8
+ paddd m11, m8
+ vpbroadcastd m8, [pd_5119]
+ psubd m5, m8, m5
+ psubd m7, m8, m7
+ psubd m10, m8, m10
+ psubd m12, m8, m12
+ REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ psrld m4, m15, 10 ; pd_2
+ paddd m0, m4
+ psubd m1, m4, m1
+ paddd m2, m4
+ psubd m3, m4, m3
+ psubd m7, m4, [r6-32*4]
+ paddd m6, m4, [r6-32*3]
+ psubd m5, m4, [r6-32*2]
+ paddd m4, [r6-32*1]
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6-32*2], m2
+ mova [r6-32*1], m3
+ add r6, 32*8
+ mova [r6-32*4], m9
+ mova [r6-32*3], m10
+ mova [r6-32*2], m11
+ mova [r6-32*1], m12
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+.fast:
+ add r6, 32*8
+ call .main
+ vpbroadcastd m14, [pd_5120]
+ vpbroadcastd m13, [pd_5119]
+ psrld m15, 10 ; pd_2
+ paddd m0, m15
+ psubd m1, m15, m1
+ paddd m2, m15
+ psubd m3, m15, m3
+ paddd m4, m14
+ psubd m5, m13, m5
+ paddd m6, m14
+ psubd m7, m13, m7
+ paddd m8, m14, m9
+ psubd m9, m13, m10
+ paddd m10, m14, m11
+ psubd m11, m13, m12
+ paddd m12, m15, [r6-32*1]
+ psubd m13, m15, [r6-32*2]
+ paddd m14, m15, [r6-32*3]
+ psubd m15, [r6-32*4]
+.pass1_end:
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
+ sub r6, 32*8
+ jmp tx2q
+.pass2:
+ call m(idct_16x16_internal_10bpc).transpose
+ lea r6, [pw_5+128]
+ mova [rsp], m15
+ call m(iadst_16x16_internal_8bpc).main
+ call m(iadst_16x16_internal_8bpc).main_pass2_end
+ mova [rsp+32*0], m8
+ mova [rsp+32*2], m12
+ mova [rsp+32*3], m13
+ vpbroadcastd m12, [pw_2048]
+ pxor m13, m13
+ psubw m13, m12
+ pmulhrsw m0, m12
+ pmulhrsw m1, m13, [rsp+32*1]
+ mova [rsp+32*1], m9
+ pmulhrsw m2, m12
+ pmulhrsw m3, m13
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m12, m4
+ pmulhrsw m1, m13, m5
+ pmulhrsw m2, m12, m6
+ pmulhrsw m3, m13, m7
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m12, [rsp+32*0]
+ pmulhrsw m1, m13, [rsp+32*1]
+ pmulhrsw m2, m12, m10
+ pmulhrsw m3, m13, m11
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m12, [rsp+32*2]
+ pmulhrsw m1, m13, [rsp+32*3]
+ pmulhrsw m2, m12, m14
+ pmulhrsw m3, m13, m15
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+64* 2]
+ mova m1, [cq+64*13]
+ mova m2, [cq+64* 6]
+ mova m3, [cq+64* 9]
+ mova m4, [cq+64*10]
+ mova m5, [cq+64* 5]
+ mova m6, [cq+64*14]
+ mova m7, [cq+64* 1]
+ vpbroadcastd m12, [pd_2048]
+ call m(iadst_16x8_internal_10bpc).main_part1
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64*15]
+ mova m2, [cq+64* 4]
+ mova m3, [cq+64*11]
+ mova m4, [cq+64* 8]
+ mova m5, [cq+64* 7]
+ mova m6, [cq+64*12]
+ mova m7, [cq+64* 3]
+ jmp m(iadst_16x8_internal_10bpc).main_part2
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+.pass1:
+ vpbroadcastd m15, [pd_2896]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ add cq, 32
+ call m(iadst_16x16_internal_10bpc).main
+ sub cq, 32
+ vpbroadcastd m8, [pd_5120]
+ paddd m11, m8
+ paddd m9, m8
+ paddd m6, m8
+ paddd m4, m8
+ vpbroadcastd m8, [pd_5119]
+ psubd m12, m8, m12
+ psubd m10, m8, m10
+ psubd m7, m8, m7
+ psubd m5, m8, m5
+ REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4
+ mova [r6+32*0], m12
+ mova [r6+32*1], m11
+ mova [r6+32*2], m10
+ mova [r6+32*3], m9
+ psrld m9, m15, 10 ; pd_2
+ psubd m3, m9, m3
+ paddd m2, m9
+ psubd m1, m9, m1
+ paddd m0, m9
+ psubd m12, m9, [r6-32*4]
+ paddd m11, m9, [r6-32*3]
+ psubd m10, m9, [r6-32*2]
+ paddd m9, [r6-32*1]
+ REPX {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0
+ mova [r6-32*4], m12
+ mova [r6-32*3], m11
+ mova [r6-32*2], m10
+ mova [r6-32*1], m9
+ add r6, 32*8
+ mova [r6-32*4], m7
+ mova [r6-32*3], m6
+ mova [r6-32*2], m5
+ mova [r6-32*1], m4
+ mova [r6+32*0], m3
+ mova [r6+32*1], m2
+ mova [r6+32*2], m1
+ mova [r6+32*3], m0
+.fast:
+ add r6, 32*8
+ call m(iadst_16x16_internal_10bpc).main
+ vpbroadcastd m14, [pd_5120]
+ vpbroadcastd m13, [pd_5119]
+ psrld m15, 10 ; pd_2
+ psubd m8, m13, m7
+ paddd m7, m14, m9
+ paddd m9, m14, m6
+ psubd m6, m13, m10
+ psubd m10, m13, m5
+ paddd m5, m14, m11
+ paddd m11, m14, m4
+ psubd m4, m13, m12
+ psubd m12, m15, m3
+ paddd m3, m15, [r6-32*1]
+ paddd m13, m15, m2
+ psubd m2, m15, [r6-32*2]
+ psubd m14, m15, m1
+ mova m1, m15
+ paddd m15, m0
+ psubd m0, m1, [r6-32*4]
+ paddd m1, [r6-32*3]
+ jmp m(iadst_16x16_internal_10bpc).pass1_end
+.pass2:
+ call m(idct_16x16_internal_10bpc).transpose
+ lea r6, [pw_5+128]
+ mova [rsp], m15
+ call m(iadst_16x16_internal_8bpc).main
+ call m(iadst_16x16_internal_8bpc).main_pass2_end
+ mova [rsp+32*3], m3
+ mova [rsp+32*2], m2
+ mova [rsp+32*0], m0
+ mova m2, m13
+ mova m3, m12
+ vpbroadcastd m12, [pw_2048]
+ pxor m13, m13
+ psubw m13, m12
+ pmulhrsw m0, m13, m15
+ pmulhrsw m1, m12, m14
+ pmulhrsw m2, m13
+ pmulhrsw m3, m12
+ mova m14, m8
+ mova m15, m9
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m13, m11
+ pmulhrsw m1, m12, m10
+ pmulhrsw m2, m13, m15
+ pmulhrsw m3, m12, m14
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m13, m7
+ pmulhrsw m1, m12, m6
+ pmulhrsw m2, m13, m5
+ pmulhrsw m3, m12, m4
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m13, [rsp+32*3]
+ pmulhrsw m1, m12, [rsp+32*2]
+ pmulhrsw m2, m13, [rsp+32*1]
+ pmulhrsw m3, m12, [rsp+32*0]
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+
+INV_TXFM_16X16_FN identity, dct, -92
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m15, [pd_5793]
+ vpbroadcastd m7, [pd_5120]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ mov r3, -32*8*4
+.righthalf:
+ pmulld m0, m15, [cq+r3+32*33]
+ pmulld m1, m15, [cq+r3+32*35]
+ pmulld m2, m15, [cq+r3+32*37]
+ pmulld m3, m15, [cq+r3+32*39]
+ add r6, 32*4
+ REPX {paddd x, m7}, m0, m1, m2, m3
+ REPX {psrad x, 13}, m0, m1, m2, m3
+ mova [r6+32*0], m0
+ mova [r6+32*1], m1
+ mova [r6+32*2], m2
+ mova [r6+32*3], m3
+ add r3, 32*8
+ jl .righthalf
+.fast:
+ pmulld m0, m15, [cq+64* 0]
+ pmulld m1, m15, [cq+64* 1]
+ pmulld m2, m15, [cq+64* 2]
+ pmulld m3, m15, [cq+64* 3]
+ pmulld m4, m15, [cq+64* 4]
+ pmulld m5, m15, [cq+64* 5]
+ pmulld m6, m15, [cq+64* 6]
+ pmulld m8, m15, [cq+64* 7]
+ mova [cq], m8
+ pmulld m8, m15, [cq+64* 8]
+ pmulld m9, m15, [cq+64* 9]
+ pmulld m10, m15, [cq+64*10]
+ pmulld m11, m15, [cq+64*11]
+ pmulld m12, m15, [cq+64*12]
+ pmulld m13, m15, [cq+64*13]
+ pmulld m14, m15, [cq+64*14]
+ pmulld m15, [cq+64*15]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ paddd m7, [cq]
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call m(idct_16x16_internal_10bpc).transpose
+
+ mova [cq+32*0], m15
+ mova [cq+32*1], m0
+ vpbroadcastd m15, [pw_1697x16]
+
+ REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14
+ mova m0, [cq+32*1]
+ mova [cq+32*1], m1
+ IDTX16 0, 1, 15
+ mova m1, [cq+32*0]
+ pmulhrsw m15, m1
+ paddsw m1, m1
+ paddsw m15, m1
+ mova m1, [cq+32*1]
+ jmp m(idct_16x16_internal_10bpc).end
+
+INV_TXFM_16X16_FN dct, dct, 0, 12
+INV_TXFM_16X16_FN dct, identity, 28, 12
+INV_TXFM_16X16_FN dct, adst, 0, 12
+INV_TXFM_16X16_FN dct, flipadst, 0, 12
+
+cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(idct_16x16_internal_10bpc).pass1
+.pass2:
+ mova [cq+32* 8], m8
+ mova [cq+32* 9], m9
+ mova [cq+32*10], m10
+ mova [cq+32*11], m11
+ mova [cq+32*12], m12
+ mova [cq+32*13], m13
+ mova [cq+32*14], m14
+ mova [cq+32*15], m15
+ call .pass2_main
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ packssdw m2, m4, m5
+ packssdw m3, m6, m7
+ packssdw m4, m8, m9
+ packssdw m5, m10, m11
+ packssdw m6, m12, m13
+ packssdw m7, m14, m15
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6-32*2], m2
+ mova [r6-32*1], m3
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ mova m0, [cq+32* 8]
+ mova m1, [cq+32* 9]
+ mova m2, [cq+32*10]
+ mova m3, [cq+32*11]
+ mova m4, [cq+32*12]
+ mova m5, [cq+32*13]
+ mova m6, [cq+32*14]
+ mova m7, [cq+32*15]
+ mov r5, r6
+ add r6, 32*16
+ call .pass2_main
+ jmp m(iadst_16x16_internal_12bpc).end
+ALIGN function_align
+.write_16x16:
+ mova [rsp+gprsize+32*0], m8
+ mova [rsp+gprsize+32*1], m9
+ mova [rsp+gprsize+32*2], m12
+ vpbroadcastd m12, [pw_16384]
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ pmulhrsw m2, m12
+ pmulhrsw m3, m12
+ call m(idct_16x8_internal_12bpc).write_16x4_start
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ jmp m(idct_16x16_internal_10bpc).write_16x16_2
+ALIGN function_align
+.pass2_main:
+ call m(idct_8x8_internal_12bpc).transpose_8x8
+ mova [cq+32* 0], m0
+ mova [cq+32* 1], m2
+ mova [cq+32* 2], m4
+ mova [cq+32* 3], m6
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ pmaxsd m0, m12, m1
+ pmaxsd m1, m12, m3
+ pmaxsd m2, m12, m5
+ pmaxsd m3, m12, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ test eobd, eobd
+ jge .pass2_slow
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ jmp .pass2_fast
+.pass2_slow:
+ sub r6, 32*8
+ mova m8, [r6-32*4]
+ mova m4, [r6-32*3]
+ mova m10, [r6-32*2]
+ mova m5, [r6-32*1]
+ mova m12, [r6+32*0]
+ mova m6, [r6+32*1]
+ mova m14, [r6+32*2]
+ mova m7, [r6+32*3]
+ TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15
+ mova [cq+32* 4], m8
+ mova [cq+32* 5], m10
+ mova [cq+32* 6], m12
+ mova [cq+32* 7], m14
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m4, m5, m6, m7
+.pass2_fast:
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ pmaxsd m0, m12, [cq+32* 0]
+ pmaxsd m1, m12, [cq+32* 1]
+ pmaxsd m2, m12, [cq+32* 2]
+ pmaxsd m3, m12, [cq+32* 3]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ test eobd, eobd
+ jge .pass2_slow2
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ jmp .pass2_fast2
+.pass2_slow2:
+ pmaxsd m4, m12, [cq+32* 4]
+ pmaxsd m5, m12, [cq+32* 5]
+ pmaxsd m6, m12, [cq+32* 6]
+ pmaxsd m7, m12, [cq+32* 7]
+ REPX {pminsd x, m13}, m4, m5, m6, m7
+.pass2_fast2:
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ psrad m11, 8 ; pd_8
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_16x8_internal_10bpc).pass1_rotations
+ REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ ret
+
+INV_TXFM_16X16_FN adst, dct, 0, 12
+INV_TXFM_16X16_FN adst, adst, 0, 12
+INV_TXFM_16X16_FN adst, flipadst, 0, 12
+
+cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_20b_min]
+ vpbroadcastd m14, [clip_20b_max]
+ jmp m(iadst_16x16_internal_10bpc).pass1
+.pass2:
+ call .pass2_part1
+ call m(iadst_16x8_internal_10bpc).pass1_rotations
+ call .pass2_part2
+ call m(iadst_16x8_internal_10bpc).pass1_rotations
+.pass2_part3:
+ REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
+.end:
+ packssdw m15, m14
+ packssdw m14, m13, m12
+ packssdw m13, m11, m10
+ packssdw m12, m9, m8
+ packssdw m11, m7, m6
+ packssdw m10, m5, m4
+ packssdw m7, m3, m2
+ packssdw m6, m1, m0
+ vpblendd m0, m6, [r5-32*4], 0x33
+ vpblendd m1, m6, [r5-32*4], 0xcc
+ vpblendd m2, m7, [r5-32*3], 0x33
+ vpblendd m3, m7, [r5-32*3], 0xcc
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ call m(idct_16x8_internal_12bpc).write_16x4_start
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ vpblendd m0, m10, [r5-32*2], 0x33
+ vpblendd m1, m10, [r5-32*2], 0xcc
+ vpblendd m2, m11, [r5-32*1], 0x33
+ vpblendd m3, m11, [r5-32*1], 0xcc
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ vpblendd m0, m12, [r5+32*0], 0x33
+ vpblendd m1, m12, [r5+32*0], 0xcc
+ vpblendd m2, m13, [r5+32*1], 0x33
+ vpblendd m3, m13, [r5+32*1], 0xcc
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ vpblendd m0, m14, [r5+32*2], 0x33
+ vpblendd m1, m14, [r5+32*2], 0xcc
+ vpblendd m2, m15, [r5+32*3], 0x33
+ vpblendd m3, m15, [r5+32*3], 0xcc
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+ALIGN function_align
+.pass2_part1:
+ mova [cq+32* 8], m8
+ mova [cq+32* 9], m9
+ mova [cq+32*10], m10
+ mova [cq+32*11], m11
+ mova [cq+32*12], m12
+ mova [cq+32*13], m13
+ mova [cq+32*14], m14
+ mova [cq+32*15], m15
+.pass2_main:
+ call m(idct_8x8_internal_12bpc).transpose_8x8
+ mova [cq+32* 0], m0
+ mova [cq+32* 1], m3
+ mova [cq+32* 2], m4
+ mova [cq+32* 3], m7
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+ pmaxsd m0, m13, m2
+ pmaxsd m2, m13, m6
+ pmaxsd m5, m13, m5
+ pmaxsd m7, m13, m1
+ REPX {pminsd x, m14}, m0, m2, m5, m7
+ test eobd, eobd
+ jge .pass2_slow
+ pxor m1, m1
+ REPX {mova x, m1}, m3, m4, m6
+ jmp .pass2_fast
+.pass2_slow:
+ sub r6, 32*8
+ mova m8, [r6-32*4]
+ mova m3, [r6-32*3]
+ mova m4, [r6-32*2]
+ mova m11, [r6-32*1]
+ mova m12, [r6+32*0]
+ mova m1, [r6+32*1]
+ mova m6, [r6+32*2]
+ mova m15, [r6+32*3]
+ TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14
+ mova [cq+32* 4], m8
+ mova [cq+32* 5], m11
+ mova [cq+32* 6], m12
+ mova [cq+32* 7], m15
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+ REPX {pmaxsd x, m13}, m1, m3, m4, m6
+ REPX {pminsd x, m14}, m1, m3, m4, m6
+.pass2_fast:
+ vpbroadcastd m12, [pd_2048]
+ vpbroadcastd m15, [pd_2896]
+ call m(iadst_16x8_internal_10bpc).main_part1
+ pmaxsd m0, m13, [cq+32* 0] ; 0
+ pmaxsd m7, m13, [cq+32* 1] ; 3
+ pmaxsd m2, m13, [cq+32* 2] ; 4
+ pmaxsd m5, m13, [cq+32* 3] ; 7
+ REPX {pminsd x, m14}, m0, m2, m5, m7
+ test eobd, eobd
+ jge .pass2_slow2
+ pxor m1, m1
+ REPX {mova x, m1}, m3, m4, m6
+ jmp .pass2_fast2
+.pass2_slow2:
+ pmaxsd m4, m13, [cq+32* 4] ; 8
+ pmaxsd m3, m13, [cq+32* 5] ; 11
+ pmaxsd m6, m13, [cq+32* 6] ; 12
+ pmaxsd m1, m13, [cq+32* 7] ; 15
+ REPX {pminsd x, m14}, m1, m3, m4, m6
+.pass2_fast2:
+ call m(iadst_16x8_internal_10bpc).main_part2
+ vpbroadcastd m14, [pd_17408]
+ psrld m15, 11 ; pd_1
+ psubd m13, m14, m15 ; pd_17407
+ pslld m15, 3 ; pd_8
+ ret
+ALIGN function_align
+.pass2_part2:
+ REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ packssdw m2, m4, m5
+ packssdw m3, m6, m7
+ packssdw m4, m8, m9
+ packssdw m5, m10, m11
+ packssdw m6, m12, m13
+ packssdw m7, m14, m15
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6-32*2], m2
+ mova [r6-32*1], m3
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ mova m0, [cq+32* 8]
+ mova m1, [cq+32* 9]
+ mova m2, [cq+32*10]
+ mova m3, [cq+32*11]
+ mova m4, [cq+32*12]
+ mova m5, [cq+32*13]
+ mova m6, [cq+32*14]
+ mova m7, [cq+32*15]
+ mov r5, r6
+ add r6, 32*16
+ jmp .pass2_main
+
+INV_TXFM_16X16_FN flipadst, dct, 0, 12
+INV_TXFM_16X16_FN flipadst, adst, 0, 12
+INV_TXFM_16X16_FN flipadst, flipadst, 0, 12
+
+cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_20b_min]
+ vpbroadcastd m14, [clip_20b_max]
+ jmp m(iflipadst_16x16_internal_10bpc).pass1
+.pass2:
+ call m(iadst_16x16_internal_12bpc).pass2_part1
+ call m(iflipadst_16x8_internal_10bpc).pass1_rotations
+ call m(iadst_16x16_internal_12bpc).pass2_part2
+ call m(iflipadst_16x8_internal_10bpc).pass1_rotations
+ jmp m(iadst_16x16_internal_12bpc).pass2_part3
+
+INV_TXFM_16X16_FN identity, dct, -92, 12
+INV_TXFM_16X16_FN identity, identity, 0, 12
+
+%macro IDTX16_12BPC 1 ; src
+ pmulld m6, m7, m%1
+ paddd m6, m15
+ psrad m6, 12
+ paddd m6, m%1
+ psrad m%1, m6, 1
+%endmacro
+
+cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m7, [pd_1697]
+ vpbroadcastd m15, [pd_5120]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ mov r3, -32*8*4
+.righthalf:
+ mova m10, [cq+r3+32*33]
+ mova m11, [cq+r3+32*35]
+ mova m12, [cq+r3+32*37]
+ mova m13, [cq+r3+32*39]
+ add r6, 32*4
+ pmulld m0, m7, m10
+ pmulld m1, m7, m11
+ pmulld m2, m7, m12
+ pmulld m3, m7, m13
+ REPX {paddd x, m15}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ paddd m0, m10
+ paddd m1, m11
+ paddd m2, m12
+ paddd m3, m13
+ REPX {psrad x, 1 }, m0, m1, m2, m3
+ mova [r6+32*0], m0
+ mova [r6+32*1], m1
+ mova [r6+32*2], m2
+ mova [r6+32*3], m3
+ add r3, 32*8
+ jl .righthalf
+.fast:
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 1]
+ mova m2, [cq+64* 2]
+ mova m3, [cq+64* 3]
+ mova m4, [cq+64* 4]
+ mova m5, [cq+64* 5]
+ mova m8, [cq+64* 6]
+ mova m9, [cq+64* 7]
+ REPX {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9
+ mova [cq+64*0], m8
+ mova [cq+64*1], m9
+ mova m8, [cq+64* 8]
+ mova m9, [cq+64* 9]
+ mova m10, [cq+64*10]
+ mova m11, [cq+64*11]
+ mova m12, [cq+64*12]
+ mova m13, [cq+64*13]
+ mova m14, [cq+64*14]
+ REPX {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14
+ mova m6, [cq+64*15]
+ pmulld m7, m6
+ paddd m7, m15
+ psrad m7, 12
+ paddd m7, m6
+ mova m6, [cq+64*0]
+ psrad m15, m7, 1
+ mova m7, [cq+64*1]
+ jmp tx2q
+.pass2:
+ call m(iidentity_8x16_internal_12bpc).pass2_main
+ call m(idct_16x16_internal_10bpc).transpose_fast
+ test eobd, eobd
+ jl .pass2_fast
+ mova [cq+32* 8], m0
+ mova [cq+32* 9], m1
+ mova [cq+32*10], m2
+ mova [cq+32*11], m3
+ mova [cq+32*12], m4
+ mova [cq+32*13], m5
+ mova [cq+32*14], m6
+ mova [cq+32*15], m7
+ mova m8, [r6-32*4]
+ mova m9, [r6-32*3]
+ mova m10, [r6-32*2]
+ mova m11, [r6-32*1]
+ mova m12, [r6+32*0]
+ mova m13, [r6+32*1]
+ mova m14, [r6+32*2]
+ mova m15, [r6+32*3]
+ sub r6, 32*8
+ mova m0, [r6-32*4]
+ mova m1, [r6-32*3]
+ mova m2, [r6-32*2]
+ mova m3, [r6-32*1]
+ mova m4, [r6+32*0]
+ mova m5, [r6+32*1]
+ mova m6, [r6+32*2]
+ mova m7, [r6+32*3]
+ call m(iidentity_8x16_internal_12bpc).pass2_main
+ call m(idct_16x8_internal_10bpc).transpose2
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+ mova m12, m4
+ mova m13, m5
+ mova m14, m6
+ mova m15, m7
+ mova m0, [cq+32* 8]
+ mova m1, [cq+32* 9]
+ mova m2, [cq+32*10]
+ mova m3, [cq+32*11]
+ mova m4, [cq+32*12]
+ mova m5, [cq+32*13]
+ mova m6, [cq+32*14]
+ mova m7, [cq+32*15]
+.pass2_fast:
+ call m(idct_16x16_internal_12bpc).write_16x16
+ RET
+
+%macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack
+ mova m%4, [r6+32*(%1-4)]
+ mova m%2, [r5+32*(3-%1)]
+ mova m%5, [r4+32*(%1-4)]
+ psubd m%3, m%1, m%4 ; idct16 out15 - n
+ paddd m%1, m%4 ; idct16 out0 + n
+ pmaxsd m%1, m12
+ pmaxsd m%3, m12
+ pminsd m%1, m13
+ pminsd m%3, m13
+ paddd m%1, m11
+ paddd m%3, m11
+ psubd m%4, m%1, m%2 ; out31 - n
+ paddd m%1, m%2 ; out0 + n
+ paddd m%2, m%3, m%5 ; out15 - n
+ psubd m%3, m%5 ; out16 + n
+ REPX {psrad x, %6}, m%1, m%3, m%2, m%4
+%if %7 & 1
+ packssdw m%1, m%3 ; out0 + n, out16 + n
+ packssdw m%2, m%4 ; out15 - n, out31 - n
+%endif
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 7, 16, 32*12, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vbroadcasti128 m14, [idct32_shuf]
+ mov r4, cq
+ call .pass1_main
+ mova [rsp+32*0], m2
+ mova [rsp+32*1], m3
+ cmp eobd, 43
+ jge .eob43
+ pxor m4, m4
+ REPX {mova x, m4}, [rsp+32*2], m2, m3, m11
+ jmp .pass1_end_fast
+.eob43:
+ lea r6, [rsp+32*8]
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ call .pass1_main
+ mova [rsp+32*2], m2
+ cmp eobd, 107
+ jge .eob107
+ mova m11, m3
+ mova m2, m0
+ mova m3, m1
+ mova m0, [r6-32*4]
+ mova m1, [r6-32*3]
+ pxor m4, m4
+.pass1_end_fast:
+ vpbroadcastd m10, [pw_2048]
+ lea r6, [deint_shuf+128]
+ REPX {mova x, m4}, m5, m6, m7
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
+ jmp .end
+.eob107:
+ mova [rsp+32*3], m3
+ mova [r6-32*2], m0
+ mova [r6-32*1], m1
+ call .pass1_main
+ cmp eobd, 171
+ jge .eob171
+ pshufd m12, m2, q1032
+ pshufd m13, m3, q1032
+ mova m4, m0
+ mova m5, m1
+ pxor m6, m6
+ REPX {mova x, m6}, m7, m14, m15
+ jmp .pass1_end
+.eob171:
+ mova [r6+32*0], m0
+ mova [r6+32*1], m1
+ mova [r6+32*2], m2
+ mova [r6+32*3], m3
+ call .pass1_main
+ pshufd m12, [r6+32*2], q1032 ; out19 out17
+ pshufd m13, [r6+32*3], q1032 ; out23 out21
+ mova m4, [r6+32*0] ; out16 out18
+ mova m5, [r6+32*1] ; out20 out22
+ pshufd m14, m2, q1032 ; out27 out25
+ pshufd m15, m3, q1032 ; out31 out29
+ mova m6, m0 ; out24 out26
+ mova m7, m1 ; out28 out30
+.pass1_end:
+ mova m0, [r6-32*4] ; out0 out2
+ mova m1, [r6-32*3] ; out4 out6
+ mova m2, [r6-32*2] ; out8 out10
+ mova m3, [r6-32*1] ; out12 out14
+ lea r6, [deint_shuf+128]
+ mova m11, [rsp+32*3] ; out13 out15
+ vpbroadcastd m10, [pw_2048]
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main
+.end: ; [rsp+0*32] = m12
+ vpbroadcastd m12, [pw_2048]
+ mov cq, r4
+ mova [rsp+32*1], m8
+ mova [rsp+32*2], m9
+ mova [rsp+32*3], m10
+ mova [rsp+32*4], m11
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ vpermq m0, m2, q3120
+ vpermq m1, m3, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m4, q3120
+ vpermq m1, m5, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m6, q3120
+ vpermq m1, m7, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [rsp+32*1], q3120
+ vpermq m1, [rsp+32*2], q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [rsp+32*3], q3120
+ vpermq m1, [rsp+32*4], q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [rsp+32*0], q3120
+ vpermq m1, m13, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m14, q3120
+ vpermq m1, m15, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m2, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
+ALIGN function_align
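+; 8-point idct on eight input rows, with pd_2 rounding and a 2-bit downshift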
+.pass1_main_part1:
+ mova m0, [cq+128*0]
+ mova m1, [cq+128*1]
+ mova m2, [cq+128*2]
+ mova m3, [cq+128*3]
+ mova m4, [cq+128*4]
+ mova m5, [cq+128*5]
+ mova m6, [cq+128*6]
+ mova m7, [cq+128*7]
+ call m(idct_8x8_internal_10bpc).main
+ psrld m1, m11, 10 ; pd_2
+ REPX {paddd x, m1}, m0, m6, m5, m3
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ psubd m7, m0, m9 ; out7
+ paddd m0, m9 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+ALIGN function_align
+.pass1_main:
+ call .pass1_main_part1
+ add cq, 32
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ pshufb m0, m14
+ pshufb m2, m14
+ pshufb m4, m14
+ pshufb m6, m14
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ vperm2i128 m1, m0, m2, 0x31 ; 4 6
+ vinserti128 m0, xm2, 1 ; 0 2
+ vinserti128 m2, m3, xm4, 1 ; 1 3
+ vperm2i128 m3, m4, 0x31 ; 5 7
+ ret
+.main_oddhalf_part1_fast_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_oddhalf_part1_fast: ; lower half zero
+ vpbroadcastd m7, [pd_4091]
+ vpbroadcastd m8, [pd_201]
+ vpbroadcastd m6, [pd_m1380]
+ vpbroadcastd m9, [pd_3857]
+ vpbroadcastd m5, [pd_3703]
+ vpbroadcastd m10, [pd_1751]
+ vpbroadcastd m4, [pd_m2751]
+ vpbroadcastd m15, [pd_3035]
+ pmulld m7, m0
+ pmulld m0, m8
+ pmulld m6, m1
+ pmulld m1, m9
+ pmulld m5, m2
+ pmulld m2, m10
+ pmulld m4, m3
+ pmulld m3, m15
+ jmp .main_oddhalf_part1_fast2
+.main_oddhalf_part1_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
+ ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
+.main_oddhalf_part1_fast2:
+ REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
+ REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
+ psubd m8, m0, m4 ; t17
+ paddd m0, m4 ; t16
+ psubd m4, m6, m2 ; t18
+ paddd m6, m2 ; t19
+ psubd m2, m1, m5 ; t29
+ paddd m1, m5 ; t28
+ psubd m5, m7, m3 ; t30
+ paddd m7, m3 ; t31
+ REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
+ REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
+ vpbroadcastd m15, [pd_4017]
+ vpbroadcastd m10, [pd_799]
+ ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a
+ ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a
+ psubd m3, m0, m6 ; t19a
+ paddd m0, m6 ; t16a
+ psubd m6, m7, m1 ; t28a
+ paddd m7, m1 ; t31a
+ psubd m1, m5, m4 ; t18
+ paddd m5, m4 ; t17
+ psubd m4, m8, m2 ; t29
+ paddd m8, m2 ; t30
+ REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
+ REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
+ vpbroadcastd m15, [pd_3784]
+ vpbroadcastd m10, [pd_1567]
+ ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
+ ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28
+ mova [r6-32*4], m0
+ mova [r6-32*3], m5
+ mova [r6-32*2], m4
+ mova [r6-32*1], m6
+ mova [r6+32*0], m3
+ mova [r6+32*1], m1
+ mova [r6+32*2], m8
+ mova [r6+32*3], m7
+ ret
+.main_oddhalf_part2_fast_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_oddhalf_part2_fast: ; lower half zero
+ vpbroadcastd m7, [pd_m601]
+ vpbroadcastd m8, [pd_4052]
+ vpbroadcastd m6, [pd_3973]
+ vpbroadcastd m9, [pd_995]
+ vpbroadcastd m5, [pd_m2106]
+ vpbroadcastd m10, [pd_3513]
+ vpbroadcastd m4, [pd_3290]
+ vpbroadcastd m15, [pd_2440]
+ pmulld m7, m0
+ pmulld m0, m8
+ pmulld m6, m1
+ pmulld m1, m9
+ pmulld m5, m2
+ pmulld m2, m10
+ pmulld m4, m3
+ pmulld m3, m15
+ jmp .main_oddhalf_part2_fast2
+.main_oddhalf_part2_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
+ ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a
+ ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
+.main_oddhalf_part2_fast2:
+ REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
+ REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
+ psubd m8, m0, m4 ; t25
+ paddd m0, m4 ; t24
+ psubd m4, m6, m2 ; t26
+ paddd m6, m2 ; t27
+ psubd m2, m1, m5 ; t21
+ paddd m1, m5 ; t20
+ psubd m5, m7, m3 ; t22
+ paddd m7, m3 ; t23
+ REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
+ REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
+ vpbroadcastd m15, [pd_2276]
+ vpbroadcastd m10, [pd_3406]
+ ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a
+ ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a
+ psubd m3, m0, m6 ; t27a
+ paddd m0, m6 ; t24a
+ psubd m6, m7, m1 ; t20a
+ paddd m7, m1 ; t23a
+ psubd m1, m5, m4 ; t21
+ paddd m5, m4 ; t22
+ psubd m4, m8, m2 ; t26
+ paddd m8, m2 ; t25
+ REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
+ REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
+ vpbroadcastd m15, [pd_3784]
+ vpbroadcastd m10, [pd_1567]
+ ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a
+ ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 2 ; t27, t20
+ mova m9, [r6-32*4] ; t16a
+ mova m10, [r6-32*3] ; t17
+ psubd m2, m9, m7 ; t23
+ paddd m9, m7 ; t16
+ psubd m7, m10, m5 ; t22a
+ paddd m10, m5 ; t17a
+ REPX {pmaxsd x, m12}, m9, m10, m2, m7
+ REPX {pminsd x, m13}, m9, m10, m2, m7
+ mova [r6-32*4], m9
+ mova [r6-32*3], m10
+ mova m9, [r6-32*2] ; t18a
+ mova m10, [r6-32*1] ; t19
+ psubd m5, m9, m1 ; t21
+ paddd m9, m1 ; t18
+ psubd m1, m10, m6 ; t20a
+ paddd m10, m6 ; t19a
+ REPX {pmaxsd x, m12}, m9, m10, m5, m1
+ REPX {pminsd x, m13}, m9, m10, m5, m1
+ mova [r6-32*2], m9
+ mova [r6-32*1], m10
+ mova m9, [r6+32*0] ; t28
+ mova m10, [r6+32*1] ; t29a
+ psubd m6, m9, m3 ; t27a
+ paddd m9, m3 ; t28a
+ psubd m3, m10, m4 ; t26
+ paddd m10, m4 ; t29
+ REPX {pmaxsd x, m12}, m9, m10, m6, m3
+ REPX {pminsd x, m13}, m9, m10, m6, m3
+ REPX {pmulld x, m14}, m6, m3, m1, m5
+ paddd m6, m11
+ paddd m3, m11
+ psubd m4, m6, m1 ; t20
+ paddd m6, m1 ; t27
+ psubd m1, m3, m5 ; t21a
+ paddd m3, m5 ; t26a
+ REPX {psrad x, 12 }, m4, m1, m3, m6
+ mova [r6+32*0], m4
+ mova [r6+32*1], m1
+ mova m4, [r6+32*2] ; t30
+ mova m1, [r6+32*3] ; t31a
+ psubd m5, m4, m8 ; t25a
+ paddd m4, m8 ; t30a
+ psubd m8, m1, m0 ; t24
+ paddd m1, m0 ; t31
+ REPX {pmaxsd x, m12}, m8, m5, m4, m1
+ REPX {pminsd x, m13}, m8, m5, m4, m1
+ REPX {pmulld x, m14}, m5, m8, m7, m2
+ paddd m5, m11
+ paddd m8, m11
+ psubd m0, m5, m7 ; t22
+ paddd m5, m7 ; t25
+ psubd m7, m8, m2 ; t23a
+ paddd m2, m8 ; t24a
+ REPX {psrad x, 12 }, m0, m7, m2, m5
+ mova [r6+32*2], m0
+ mova [r6+32*3], m7
+ mov r4, r6
+ add r6, 32*8
+ mova [r6-32*4], m2
+ mova [r6-32*3], m5
+ mova [r6-32*2], m3
+ mova [r6-32*1], m6
+ mova [r6+32*0], m9
+ mova [r6+32*1], m10
+ mova [r6+32*2], m4
+ mova [r6+32*3], m1
+ mov r5, r6
+ add r6, 32*8
+ ret
+ALIGN function_align
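+; IDCT32_END combines the idct16 output in m0-m7 with the odd-half rows stored at r4/r5/r6; the result is then packed and transposed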
+.main_end:
+ psrld m11, 10 ; pd_2
+ IDCT32_END 0, 15, 8, 9, 10, 2
+ IDCT32_END 1, 14, 8, 9, 10, 2
+ punpckhwd m8, m0, m1 ; 16 17
+ punpcklwd m0, m1 ; 0 1
+ punpcklwd m1, m14, m15 ; 14 15
+ punpckhwd m14, m15 ; 30 31
+ mova [r5+32*3], m8
+ mova [r5+32*2], m14
+ IDCT32_END 2, 15, 8, 9, 10, 2
+ IDCT32_END 3, 14, 8, 9, 10, 2
+ punpckhwd m8, m2, m3 ; 18 19
+ punpcklwd m2, m3 ; 2 3
+ punpcklwd m3, m14, m15 ; 12 13
+ punpckhwd m14, m15 ; 28 29
+ mova [r5+32*1], m8
+ mova [r5+32*0], m14
+ IDCT32_END 4, 15, 8, 9, 10, 2
+ IDCT32_END 5, 14, 8, 9, 10, 2
+ punpckhwd m8, m4, m5 ; 20 21
+ punpcklwd m4, m5 ; 4 5
+ punpcklwd m5, m14, m15 ; 10 11
+ punpckhwd m14, m15 ; 26 27
+ mova [r5-32*1], m8
+ mova [r5-32*2], m14
+ IDCT32_END 6, 15, 8, 9, 10, 2
+ IDCT32_END 7, 14, 8, 9, 10, 2
+ punpckhwd m8, m6, m7 ; 22 23
+ punpcklwd m6, m7 ; 6 7
+ punpcklwd m7, m14, m15 ; 8 9
+ punpckhwd m14, m15 ; 24 25
+ mova [r5-32*3], m8
+ mova [r5-32*4], m14
+.transpose:
+ punpckhdq m15, m3, m1
+ punpckldq m3, m1
+ punpckhdq m1, m4, m6
+ punpckldq m4, m6
+ punpckhdq m6, m0, m2
+ punpckldq m0, m2
+ punpckhdq m2, m7, m5
+ punpckldq m7, m5
+ punpcklqdq m5, m2, m15
+ punpckhqdq m2, m15
+ punpckhqdq m15, m7, m3
+ punpcklqdq m7, m3
+ punpckhqdq m3, m6, m1
+ punpcklqdq m6, m1
+ punpckhqdq m1, m0, m4
+ punpcklqdq m0, m4
+ vperm2i128 m4, m0, m7, 0x31
+ vinserti128 m0, xm7, 1
+ vperm2i128 m7, m3, m2, 0x31
+ vinserti128 m3, xm2, 1
+ vinserti128 m2, m6, xm5, 1
+ vperm2i128 m6, m5, 0x31
+ vperm2i128 m5, m1, m15, 0x31
+ vinserti128 m1, xm15, 1
+ ret
+
+cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m5, [pw_5]
+ pxor m6, m6
+ mov r6d, eobd
+ add eobb, 21
+ cmovc eobd, r6d ; 43, 107, 171 -> 64, 128, 192
+ lea r6, [strideq*3]
+ lea r5, [strideq*5]
+ lea r4, [strideq+r6*2] ; strideq*7
+.loop:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {paddsw x, m5}, m0, m1, m2, m3
+ REPX {psraw x, 3 }, m0, m1, m2, m3
+ call .main_zero
+ add cq, 32
+ lea dstq, [dstq+strideq*8]
+ sub eobd, 64
+ jge .loop
+ RET
+ALIGN function_align
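+; transpose the 8x8 block of words in m0-m3 and add it to eight rows of dst; the _zero entry first clears the current column of coefficients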
+.main_zero:
+ REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+.main:
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m4, m2, m1
+ punpcklwd m2, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ mova xm4, [dstq+strideq*0]
+ vinserti128 m4, [dstq+strideq*4], 1
+ paddw m0, m4
+ mova xm4, [dstq+strideq*1]
+ vinserti128 m4, [dstq+r5 ], 1
+ paddw m1, m4
+ mova xm4, [dstq+strideq*2]
+ vinserti128 m4, [dstq+r6*2 ], 1
+ paddw m2, m4
+ mova xm4, [dstq+r6 ]
+ vinserti128 m4, [dstq+r4 ], 1
+ paddw m3, m4
+ REPX {pmaxsw x, m6}, m0, m1, m2, m3
+ REPX {pminsw x, m7}, m0, m1, m2, m3
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*4], m0, 1
+ mova [dstq+strideq*1], xm1
+ vextracti128 [dstq+r5 ], m1, 1
+ mova [dstq+strideq*2], xm2
+ vextracti128 [dstq+r6*2 ], m2, 1
+ mova [dstq+r6 ], xm3
+ vextracti128 [dstq+r4 ], m3, 1
+ ret
+
+cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ mov r4, cq
+ lea r6, [rsp+32*4]
+ call .pass1_main
+ cmp eobd, 43
+ jge .eob43
+ jmp .pass2_fast
+.eob43:
+ call .pass1_main
+ cmp eobd, 107
+ jge .eob107
+.pass2_fast:
+ mov cq, r4
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ pmaxsd m0, m12, [cq+128*1+ 0]
+ pmaxsd m1, m12, [cq+128*7+ 0]
+ pmaxsd m2, m12, [cq+128*1+32]
+ pmaxsd m3, m12, [cq+128*7+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
+ pmaxsd m0, m12, [cq+128*3+ 0]
+ pmaxsd m1, m12, [cq+128*5+ 0]
+ pmaxsd m2, m12, [cq+128*3+32]
+ pmaxsd m3, m12, [cq+128*5+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
+ pmaxsd m0, m12, [cq+128*2+ 0]
+ pmaxsd m1, m12, [cq+128*6+ 0]
+ pmaxsd m2, m12, [cq+128*2+32]
+ pmaxsd m3, m12, [cq+128*6+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ call m(idct_8x16_internal_10bpc).main_oddhalf_fast
+ pmaxsd m0, m12, [cq+128*0+ 0]
+ pmaxsd m1, m12, [cq+128*4+ 0]
+ pmaxsd m2, m12, [cq+128*0+32]
+ pmaxsd m3, m12, [cq+128*4+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ jmp .pass2_end
+.eob107:
+ call .pass1_main
+ cmp eobd, 171
+ jge .eob171
+ jmp .pass2
+.eob171:
+ call .pass1_main
+.pass2:
+ mov cq, r4
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ pmaxsd m0, m12, [cq+128*1+ 0]
+ pmaxsd m1, m12, [cq+128*7+ 0]
+ pmaxsd m2, m12, [cq+128*1+32]
+ pmaxsd m3, m12, [cq+128*7+32]
+ pmaxsd m4, m12, [cq+128*1+64]
+ pmaxsd m5, m12, [cq+128*7+64]
+ pmaxsd m6, m12, [cq+128*1+96]
+ pmaxsd m7, m12, [cq+128*7+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
+ pmaxsd m0, m12, [cq+128*3+ 0]
+ pmaxsd m1, m12, [cq+128*5+ 0]
+ pmaxsd m2, m12, [cq+128*3+32]
+ pmaxsd m3, m12, [cq+128*5+32]
+ pmaxsd m4, m12, [cq+128*3+64]
+ pmaxsd m5, m12, [cq+128*5+64]
+ pmaxsd m6, m12, [cq+128*3+96]
+ pmaxsd m7, m12, [cq+128*5+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
+ pmaxsd m0, m12, [cq+128*2+ 0]
+ pmaxsd m1, m12, [cq+128*6+ 0]
+ pmaxsd m2, m12, [cq+128*2+32]
+ pmaxsd m3, m12, [cq+128*6+32]
+ pmaxsd m4, m12, [cq+128*2+64]
+ pmaxsd m5, m12, [cq+128*6+64]
+ pmaxsd m6, m12, [cq+128*2+96]
+ pmaxsd m7, m12, [cq+128*6+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ pmaxsd m0, m12, [cq+128*0+ 0]
+ pmaxsd m1, m12, [cq+128*4+ 0]
+ pmaxsd m2, m12, [cq+128*0+32]
+ pmaxsd m3, m12, [cq+128*4+32]
+ pmaxsd m4, m12, [cq+128*0+64]
+ pmaxsd m5, m12, [cq+128*4+64]
+ pmaxsd m6, m12, [cq+128*0+96]
+ pmaxsd m7, m12, [cq+128*4+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+.pass2_end:
+ psrld m11, 8 ; pd_8
+ IDCT32_END 0, 15, 8, 9, 10, 4
+ IDCT32_END 1, 14, 8, 9, 10, 4
+ punpckhqdq m8, m0, m1 ; 16 17 (interleaved)
+ punpcklqdq m0, m1 ; 0 1 (interleaved)
+ punpcklqdq m1, m14, m15 ; 14 15 (interleaved)
+ punpckhqdq m14, m15 ; 30 31 (interleaved)
+ mova [r5+32*3], m8
+ mova [r5+32*2], m14
+ IDCT32_END 2, 15, 8, 9, 10, 4
+ IDCT32_END 3, 14, 8, 9, 10, 4
+ punpckhqdq m8, m2, m3 ; 18 19 (interleaved)
+ punpcklqdq m2, m3 ; 2 3 (interleaved)
+ punpcklqdq m3, m14, m15 ; 12 13 (interleaved)
+ punpckhqdq m14, m15 ; 28 29 (interleaved)
+ mova [r5+32*1], m8
+ mova [r5+32*0], m14
+ IDCT32_END 4, 15, 8, 9, 10, 4
+ IDCT32_END 5, 14, 8, 9, 10, 4
+ punpckhqdq m8, m4, m5 ; 20 21 (interleaved)
+ punpcklqdq m4, m5 ; 4 5 (interleaved)
+ punpcklqdq m5, m14, m15 ; 10 11 (interleaved)
+ punpckhqdq m14, m15 ; 26 27 (interleaved)
+ mova [r5-32*1], m8
+ mova [r5-32*2], m14
+ IDCT32_END 6, 15, 8, 9, 10, 4
+ IDCT32_END 7, 14, 8, 9, 10, 4
+ punpckhqdq m8, m6, m7 ; 22 23 (interleaved)
+ punpcklqdq m6, m7 ; 6 7 (interleaved)
+ punpcklqdq m7, m14, m15 ; 8 9 (interleaved)
+ punpckhqdq m14, m15 ; 24 25 (interleaved)
+ mova [r5-32*3], m8
+ mova [r5-32*4], m14
+ mova m15, m1
+.end:
+ vpermq m0, m0, q3120
+ vpermq m1, m2, q3120
+ call m(idct_8x8_internal_12bpc).write_8x4_start
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m4, q3120
+ vpermq m1, m6, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m7, q3120
+ vpermq m1, m5, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m3, q3120
+ vpermq m1, m15, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5+32*3], q3120
+ vpermq m1, [r5+32*1], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5-32*1], q3120
+ vpermq m1, [r5-32*3], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5-32*4], q3120
+ vpermq m1, [r5-32*2], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5+32*0], q3120
+ vpermq m1, [r5+32*2], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m2, [dconly_12bpc]
+ mov [cq], eobd ; 0
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
+ALIGN function_align
+.pass1_main:
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1
+ TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15
+ mova [cq+128*0], m0
+ mova [cq+128*1], m1
+ mova [cq+128*2], m2
+ mova [cq+128*3], m3
+ mova [cq+128*4], m4
+ mova [cq+128*5], m5
+ mova [cq+128*6], m6
+ mova [cq+128*7], m7
+ add cq, 32
+ ret
+ALIGN function_align
+.main_end:
+ psrld m11, 10 ; pd_2
+ IDCT32_END 0, 15, 8, 9, 10, 2, 0
+ mova [cq+32*16], m8
+ mova [cq+32*31], m9
+ IDCT32_END 1, 14, 8, 9, 10, 2, 0
+ mova [cq+32*17], m8
+ mova [cq+32*30], m9
+ mova [cq+32*14], m14
+ IDCT32_END 2, 14, 8, 9, 10, 2, 0
+ mova [cq+32*18], m8
+ mova [cq+32*29], m9
+ mova [cq+32*13], m14
+ IDCT32_END 3, 14, 8, 9, 10, 2, 0
+ mova [cq+32*19], m8
+ mova [cq+32*28], m9
+ mova [cq+32*12], m14
+ IDCT32_END 4, 14, 8, 9, 10, 2, 0
+ mova [cq+32*20], m8
+ mova [cq+32*27], m9
+ mova [cq+32* 0], m0
+ mova [cq+32* 1], m1
+ mova [cq+32* 2], m2
+ IDCT32_END 5, 10, 0, 1, 2, 2, 0
+ mova [cq+32*21], m0
+ mova [cq+32*26], m1
+ IDCT32_END 6, 9, 0, 1, 2, 2, 0
+ mova [cq+32*22], m0
+ mova [cq+32*25], m1
+ IDCT32_END 7, 8, 0, 1, 2, 2, 0
+ mova [cq+32*23], m0
+ mova [cq+32*24], m1
+ mova m0, [cq+32* 0]
+ mova m1, [cq+32* 1]
+ mova m2, [cq+32* 2]
+ mova m11, m14
+ mova m12, [cq+32*12]
+ mova m13, [cq+32*13]
+ mova m14, [cq+32*14]
+ ret
+
+cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1
+
+cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jnz .full
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 8
+.dconly:
+ add r6d, 640
+ sar r6d, 10
+.dconly2:
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ movd xm0, r6d
+ paddsw xm0, xm3
+ vpbroadcastw m0, xm0
+.dconly_loop:
+ paddsw m1, m0, [dstq+32*0]
+ paddsw m2, m0, [dstq+32*1]
+ psubusw m1, m3
+ psubusw m2, m3
+ mova [dstq+32*0], m1
+ mova [dstq+32*1], m2
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+.full:
+ PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+ lea r6, [rsp+32*4]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ call .pass1
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
+ lea r6, [deint_shuf+128]
+ vpbroadcastd m11, [pw_2048]
+ mov r4, dstq
+ call .pass2
+ mova m0, [r5+32*3] ; 16 17
+ mova m1, [r5+32*2] ; 30 31
+ mova m2, [r5+32*1] ; 18 19
+ mova m3, [r5+32*0] ; 28 29
+ mova m4, [r5-32*1] ; 20 21
+ mova m5, [r5-32*2] ; 26 27
+ mova m6, [r5-32*3] ; 22 23
+ mova m7, [r5-32*4] ; 24 25
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ lea dstq, [r4+32]
+ call .pass2
+ RET
+ALIGN function_align
+.pass2:
+ call m(idct_16x8_internal_8bpc).main
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m11, m4
+ pmulhrsw m1, m11, m5
+ pmulhrsw m2, m11, m6
+ pmulhrsw m3, m11, m7
+ jmp m(idct_16x8_internal_10bpc).write_16x4_zero
+ALIGN function_align
+.pass1:
+ mova m0, [cq+32* 1]
+ mova m1, [cq+32* 7]
+ mova m2, [cq+32* 9]
+ mova m3, [cq+32*15]
+ mova m4, [cq+32*17]
+ mova m5, [cq+32*23]
+ mova m6, [cq+32*25]
+ mova m7, [cq+32*31]
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
+ mova m0, [cq+32* 3]
+ mova m1, [cq+32* 5]
+ mova m2, [cq+32*11]
+ mova m3, [cq+32*13]
+ mova m4, [cq+32*19]
+ mova m5, [cq+32*21]
+ mova m6, [cq+32*27]
+ mova m7, [cq+32*29]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
+ mova m0, [cq+32* 2]
+ mova m1, [cq+32* 6]
+ mova m2, [cq+32*10]
+ mova m3, [cq+32*14]
+ mova m4, [cq+32*18]
+ mova m5, [cq+32*22]
+ mova m6, [cq+32*26]
+ mova m7, [cq+32*30]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ mova m0, [cq+32* 0]
+ mova m1, [cq+32* 4]
+ mova m2, [cq+32* 8]
+ mova m3, [cq+32*12]
+ mova m4, [cq+32*16]
+ mova m5, [cq+32*20]
+ mova m6, [cq+32*24]
+ mova m7, [cq+32*28]
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m5, [pw_4096]
+ pxor m6, m6
+ mov r6d, eobd
+ add eobb, 21
+ cmovc eobd, r6d
+ lea r6, [strideq*3]
+ lea r5, [strideq*5]
+ lea r4, [strideq+r6*2] ; strideq*7
+.loop:
+ mova m0, [cq+32*0]
+ packssdw m0, [cq+32*1]
+ mova m1, [cq+32*2]
+ packssdw m1, [cq+32*3]
+ REPX {mova [cq+32*x], m6}, 0, 1, 2, 3
+ add cq, 32*8
+ mova m2, [cq-32*4]
+ packssdw m2, [cq-32*3]
+ mova m3, [cq-32*2]
+ packssdw m3, [cq-32*1]
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ REPX {mova [cq+32*x], m6}, -4, -3, -2, -1
+ call m(inv_txfm_add_identity_identity_8x32_10bpc).main
+ add dstq, 16
+ sub eobd, 64
+ jge .loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jnz .full
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_12bpc]
+ mov [cq], eobd ; 0
+ or r3d, 8
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
+.full:
+ PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+ lea r6, [rsp+32*4]
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1
+ call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end
+ mov r4, dstq
+ call m(idct_16x8_internal_12bpc).pass2_main
+ mova m0, [cq+32* 0] ; 16
+ mova m1, [cq+32* 1] ; 17
+ mova m2, [cq+32* 2] ; 18
+ mova m3, [cq+32* 3] ; 19
+ mova m4, [cq+32* 4] ; 20
+ mova m5, [cq+32* 5] ; 21
+ mova m6, [cq+32* 6] ; 22
+ mova m7, [cq+32* 7] ; 23
+ mova m8, [cq+32* 8] ; 24
+ mova m9, [cq+32* 9] ; 25
+ mova m10, [cq+32*10] ; 26
+ mova m11, [cq+32*11] ; 27
+ mova m12, [cq+32*12] ; 28
+ mova m13, [cq+32*13] ; 29
+ mova m14, [cq+32*14] ; 30
+ mova m15, [cq+32*15] ; 31
+ lea dstq, [r4+32]
+ call m(idct_16x8_internal_12bpc).pass2_main
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1
+
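+; Pass 2 finalization for a 32-row idct: %1 holds an even-half (idct16) output
+; register, [%2] the matching odd-half value kept on the stack. Their sum and
+; difference yield the mirrored top/bottom rows, which are rounded via pw_2048
+; (m15), added to the pixels at dstq+%5 and r2+%6, clamped to [0, pixel_max]
+; (m6/m7) and stored.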
+%macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2]
+ mova m%4, [%2]
+ paddsw m%3, m%1, m%4
+ psubsw m%1, m%4
+%if %1 == 0
+ pxor m6, m6
+%endif
+ pmulhrsw m%3, m15
+ pmulhrsw m%1, m15
+ paddw m%3, [dstq+%5]
+ paddw m%1, [r2+%6]
+ pmaxsw m%3, m6
+ pmaxsw m%1, m6
+ pminsw m%3, m7
+ pminsw m%1, m7
+ mova [dstq+%5], m%3
+ mova [r2+%6], m%1
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 8, 16, 32*36, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*16]
+ lea r4, [r6+32*8]
+ lea r5, [r6+32*16]
+ call .main
+ sub eobd, 44
+ jge .eob44
+ vperm2i128 m2, m0, m3, 0x31 ; 5
+ vinserti128 m0, xm3, 1 ; 1
+ vperm2i128 m3, m1, m4, 0x31 ; 7
+ vinserti128 m1, xm4, 1 ; 3
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ REPX {mova [r6+32*x], m4}, 0, 1, 2, 3
+ jmp .fast
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 32
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
+.eob44:
+ mova [r4+16*0], xm0
+ mova [r4+16*1], xm3
+ mova [r4+16*2], xm1
+ mova [r4+16*3], xm4
+ vextracti128 [r4+16*4], m0, 1
+ vextracti128 [r4+16*5], m3, 1
+ vextracti128 [r4+16*6], m1, 1
+ vextracti128 [r4+16*7], m4, 1
+ call .main
+ sub eobd, 107
+ jge .eob151
+ vperm2i128 m7, m1, m4, 0x31 ; 15
+ vinserti128 m5, m1, xm4, 1 ; 11
+ vperm2i128 m6, m0, m3, 0x31 ; 13
+ vinserti128 m4, m0, xm3, 1 ; 9
+ mova m0, [r4+32*0]
+ mova m1, [r4+32*1]
+ mova m2, [r4+32*2]
+ mova m3, [r4+32*3]
+.fast:
+ lea r6, [pw_5+128]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ jmp .idct16
+.eob151:
+ mova [r4-16*8], xm0
+ mova [r4-16*7], xm3
+ mova [r4-16*6], xm1
+ mova [r4-16*5], xm4
+ vextracti128 [r4-16*4], m0, 1
+ vextracti128 [r4-16*3], m3, 1
+ vextracti128 [r4-16*2], m1, 1
+ vextracti128 [r4-16*1], m4, 1
+ call .main
+ sub eobd, 128
+ jge .eob279
+ vperm2i128 m10, m0, m3, 0x31 ; 21
+ vinserti128 m8, m0, xm3, 1 ; 17
+ vperm2i128 m11, m1, m4, 0x31 ; 23
+ vinserti128 m9, m1, xm4, 1 ; 19
+ pxor m12, m12
+ REPX {mova x, m12}, m13, m14, m15
+ REPX {mova [r6+32*x], m12}, 0, 1, 2, 3
+ jmp .full
+.eob279:
+ mova [r5+16*0], xm0
+ mova [r5+16*1], xm3
+ mova [r5+16*2], xm1
+ mova [r5+16*3], xm4
+ vextracti128 [r5+16*4], m0, 1
+ vextracti128 [r5+16*5], m3, 1
+ vextracti128 [r5+16*6], m1, 1
+ vextracti128 [r5+16*7], m4, 1
+ call .main
+ vperm2i128 m14, m0, m3, 0x31 ; 29
+ vinserti128 m12, m0, xm3, 1 ; 25
+ vperm2i128 m15, m1, m4, 0x31 ; 31
+ vinserti128 m13, m1, xm4, 1 ; 27
+ mova m8, [r5+32*0]
+ mova m9, [r5+32*1]
+ mova m10, [r5+32*2]
+ mova m11, [r5+32*3]
+.full:
+ mova m0, [r4+32*0]
+ mova m1, [r4+32*1]
+ mova m2, [r4+32*2]
+ mova m3, [r4+32*3]
+ mova m4, [r4-32*4]
+ mova m5, [r4-32*3]
+ mova m6, [r4-32*2]
+ mova m7, [r4-32*1]
+ lea r6, [pw_5 + 128]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ lea r3, [rsp+32*8]
+ mova m8, [r3+32*0]
+ mova m9, [r3+32*1]
+ mova m10, [r3+32*2]
+ mova m11, [r3+32*3]
+ mova m12, [r3-32*4]
+ mova m13, [r3-32*3]
+ mova m14, [r3-32*2]
+ mova m15, [r3-32*1]
+.idct16:
+ lea r3, [rsp+32*16]
+ mova m0, [r3+32*0]
+ mova m1, [r3+32*1]
+ mova m2, [r3+32*2]
+ mova m3, [r3+32*3]
+ mova m4, [r3-32*4]
+ mova m5, [r3-32*3]
+ mova m6, [r3-32*2]
+ mova m7, [r3-32*1]
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+ call .pass2_end
+ RET
+ALIGN function_align
+.main:
+ pmulld m0, m14, [cq+128* 1]
+ pmulld m1, m14, [cq+128* 3]
+ pmulld m2, m14, [cq+128* 5]
+ pmulld m3, m14, [cq+128* 7]
+ pmulld m4, m14, [cq+128* 9]
+ pmulld m5, m14, [cq+128*11]
+ pmulld m6, m14, [cq+128*13]
+ pmulld m7, m14, [cq+128*15]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
+ pmulld m0, m14, [cq+128* 0]
+ pmulld m1, m14, [cq+128* 2]
+ pmulld m2, m14, [cq+128* 4]
+ pmulld m3, m14, [cq+128* 6]
+ pmulld m4, m14, [cq+128* 8]
+ pmulld m5, m14, [cq+128*10]
+ pmulld m6, m14, [cq+128*12]
+ pmulld m7, m14, [cq+128*14]
+ call m(idct_8x8_internal_10bpc).main_rect2
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ psrld m15, m11, 11 ; pd_1
+ mova m8, [r6-32*4]
+ mova m9, [r6-32*3]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m10, m0, m8 ; out15
+ paddd m0, m8 ; out0
+ mova m8, [r6-32*2]
+ paddd m15, m1, m9 ; out1
+ psubd m1, m9 ; out14
+ mova m9, [r6-32*1]
+ REPX {psrad x, 1}, m0, m15, m10, m1
+ packssdw m0, m15
+ packssdw m1, m10
+ psubd m10, m2, m8 ; out13
+ paddd m2, m8 ; out2
+ mova m8, [r6+32*0]
+ paddd m15, m3, m9 ; out3
+ psubd m3, m9 ; out12
+ mova m9, [r6+32*1]
+ REPX {psrad x, 1}, m2, m15, m10, m3
+ packssdw m2, m15
+ packssdw m3, m10
+ psubd m10, m4, m8 ; out11
+ paddd m4, m8 ; out4
+ mova m8, [r6+32*2]
+ paddd m15, m5, m9 ; out5
+ psubd m5, m9 ; out10
+ mova m9, [r6+32*3]
+ REPX {psrad x, 1}, m4, m10, m15, m5
+ packssdw m4, m15
+ packssdw m5, m10
+ psubd m10, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ paddd m15, m7, m9 ; out7
+ psubd m7, m9 ; out8
+ REPX {psrad x, 1}, m6, m10, m15, m7
+ packssdw m6, m15
+ packssdw m7, m10
+ punpckhwd m8, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m2, m3, m1
+ punpcklwd m3, m1
+ punpckhwd m1, m4, m6
+ punpcklwd m4, m6
+ punpcklwd m6, m7, m5
+ punpckhwd m7, m5
+ pxor m5, m5
+ mov r7d, 128*13
+.main_zero_loop:
+ mova [cq+r7-128*1], m5
+ mova [cq+r7+128*0], m5
+ mova [cq+r7+128*1], m5
+ mova [cq+r7+128*2], m5
+ sub r7d, 128*4
+ jg .main_zero_loop
+ add cq, 32
+ punpcklwd m5, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m1
+ punpckhwd m4, m1
+ punpckhwd m1, m0, m8
+ punpcklwd m0, m8
+ punpckhwd m8, m6, m7
+ punpcklwd m6, m7
+ punpcklqdq m7, m1, m4
+ punpckhqdq m1, m4
+ punpckhqdq m4, m8, m3
+ punpcklqdq m8, m3
+ punpckhqdq m3, m6, m5
+ punpcklqdq m6, m5
+ punpcklqdq m5, m0, m2
+ punpckhqdq m0, m2
+ mova [r6+16*0], xm5
+ mova [r6+16*1], xm6
+ mova [r6+16*2], xm7
+ mova [r6+16*3], xm8
+ vextracti128 [r6+16*4], m5, 1
+ vextracti128 [r6+16*5], m6, 1
+ vextracti128 [r6+16*6], m7, 1
+ vextracti128 [r6+16*7], m8, 1
+ sub r6, 32*4
+ ret
+ALIGN function_align
+.pass2_end:
+ mova [rsp+gprsize+32*0], m6
+ mova [rsp+gprsize+32*2], m7
+ mova [rsp+gprsize+32*3], m15
+ vpbroadcastd m15, [pw_2048]
+ vpbroadcastd m7, [pixel_10bpc_max]
+ IDCT32_PASS2_END 0, r5+32*3, 1, 6, strideq*0, r3*4
+ IDCT32_PASS2_END 4, r5-32*1, 0, 1, strideq*4, strideq*8
+ IDCT32_PASS2_END 8, r4+32*3, 0, 4, strideq*8, strideq*4
+ IDCT32_PASS2_END 12, r4-32*1, 0, 4, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m1, [rsp+gprsize+32*1]
+ IDCT32_PASS2_END 1, r5+32*2, 0, 4, strideq*0, r3*4
+ IDCT32_PASS2_END 5, r5-32*2, 0, 4, strideq*4, strideq*8
+ IDCT32_PASS2_END 9, r4+32*2, 0, 4, strideq*8, strideq*4
+ IDCT32_PASS2_END 13, r4-32*2, 0, 4, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m1, [rsp+gprsize+32*0]
+ IDCT32_PASS2_END 2, r5+32*1, 0, 4, strideq*0, r3*4
+ IDCT32_PASS2_END 1, r5-32*3, 0, 4, strideq*4, strideq*8
+ IDCT32_PASS2_END 10, r4+32*1, 0, 4, strideq*8, strideq*4
+ IDCT32_PASS2_END 14, r4-32*3, 0, 4, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m1, [rsp+gprsize+32*2]
+ mova m2, [rsp+gprsize+32*3]
+ IDCT32_PASS2_END 3, r5+32*0, 0, 4, strideq*0, r3*4
+ IDCT32_PASS2_END 1, r5-32*4, 0, 4, strideq*4, strideq*8
+ IDCT32_PASS2_END 11, r4+32*0, 0, 4, strideq*8, strideq*4
+ IDCT32_PASS2_END 2, r4-32*4, 0, 4, r3*4, strideq*0
+ ret
+
+cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m8, [pw_2896x8]
+ vpbroadcastd m9, [pw_1697x16]
+ vpbroadcastd m11, [pw_8192]
+ lea r6, [strideq*5]
+ pxor m6, m6
+ paddw m10, m11, m11 ; pw_16384
+ mov r5, dstq
+ call .main
+ sub eobd, 36
+ jl .ret
+ add cq, 128*8
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8-32
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 107 ; eob < 143
+ jl .ret
+ add cq, 128*8
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8-32
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 128 ; eob < 271
+ jl .ret
+ add cq, 128*8
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8-32
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 128 ; eob < 399
+ jl .ret
+ add cq, 128*8
+ lea dstq, [r5+16]
+ call .main
+.ret:
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+ REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+.main2:
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m4
+ punpcklwd m0, m4
+ punpcklwd m4, m2, m1
+ punpckhwd m2, m1
+ punpckhqdq m1, m0, m4
+ punpcklqdq m0, m4
+ call m(iidentity_8x8_internal_10bpc).write_2x8x2
+ punpcklqdq m0, m3, m2
+ punpckhqdq m1, m3, m2
+ jmp m(iidentity_8x8_internal_10bpc).write_2x8x2
+
+cglobal inv_txfm_add_identity_identity_16x32_12bpc, 4, 7, 12, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_16x32_10bpc).pass1
+
+cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ lea r6, [rsp+32*4]
+ call .main
+ cmp eobd, 36
+ jge .full
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
+ lea r6, [pw_5+128]
+ mov r7, dstq
+ call m(idct_16x16_internal_8bpc).main
+ call .write_16x16
+ mova m0, [r5+32*3]
+ mova m1, [r5+32*2]
+ mova m2, [r5+32*1]
+ mova m3, [r5+32*0]
+ mova m4, [r5-32*1]
+ mova m5, [r5-32*2]
+ mova m6, [r5-32*3]
+ mova m7, [r5-32*4]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
+ jmp .end
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
+.full:
+ add cq, 32
+ mova [r4+32*3], m0
+ mova [r4+32*2], m1
+ mova [r4+32*1], m2
+ mova [r4+32*0], m3
+ mova [r4-32*1], m4
+ mova [r4-32*2], m5
+ mova [r4-32*3], m6
+ mova [r4-32*4], m7
+ call .main
+ sub r4, 32*16 ; topleft 16x8
+ call .transpose_16x16
+ lea r6, [pw_5+128]
+ mov r7, dstq
+ call m(idct_16x16_internal_8bpc).main
+ call .write_16x16
+ mova m0, [r5+32*3]
+ mova m1, [r5+32*2]
+ mova m2, [r5+32*1]
+ mova m3, [r5+32*0]
+ mova m4, [r5-32*1]
+ mova m5, [r5-32*2]
+ mova m6, [r5-32*3]
+ mova m7, [r5-32*4]
+ add r4, 32*8 ; bottomleft 16x8
+ call .transpose_16x16
+.end:
+ lea dstq, [r7+32]
+ call m(idct_16x16_internal_8bpc).main
+ call .write_16x16
+ RET
+ALIGN function_align
+.transpose_16x16:
+ punpckhdq m8, m3, m1
+ punpckldq m3, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckhdq m2, m7, m5
+ punpckldq m7, m5
+ punpckhdq m5, m4, m6
+ punpckldq m4, m6
+ punpckhqdq m6, m0, m4
+ punpcklqdq m0, m4
+ punpckhqdq m4, m1, m5
+ punpcklqdq m1, m5
+ punpckhqdq m5, m7, m3
+ punpcklqdq m7, m3
+ punpckhqdq m3, m2, m8
+ punpcklqdq m2, m8
+ vinserti128 m8, m0, xm7, 1
+ vperm2i128 m12, m0, m7, 0x31
+ vinserti128 m9, m6, xm5, 1
+ vperm2i128 m13, m6, m5, 0x31
+ vinserti128 m10, m1, xm2, 1
+ vperm2i128 m14, m1, m2, 0x31
+ vinserti128 m11, m4, xm3, 1
+ vperm2i128 m15, m4, m3, 0x31
+ mova m0, [r4+32*3]
+ mova m1, [r4+32*2]
+ mova m2, [r4+32*1]
+ mova m3, [r4+32*0]
+ mova m4, [r4-32*1]
+ mova m5, [r4-32*2]
+ mova m6, [r4-32*3]
+ mova m7, [r4-32*4]
+ mova [rsp+gprsize], m15
+ jmp m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ALIGN function_align
+.main:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
+ pmulld m0, m14, [cq+64* 1]
+ pmulld m1, m14, [cq+64* 7]
+ pmulld m2, m14, [cq+64* 9]
+ pmulld m3, m14, [cq+64*15]
+ pmulld m4, m14, [cq+64*17]
+ pmulld m5, m14, [cq+64*23]
+ pmulld m6, m14, [cq+64*25]
+ pmulld m7, m14, [cq+64*31]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2
+ pmulld m0, m14, [cq+64* 3]
+ pmulld m1, m14, [cq+64* 5]
+ pmulld m2, m14, [cq+64*11]
+ pmulld m3, m14, [cq+64*13]
+ pmulld m4, m14, [cq+64*19]
+ pmulld m5, m14, [cq+64*21]
+ pmulld m6, m14, [cq+64*27]
+ pmulld m7, m14, [cq+64*29]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2
+ pmulld m0, m14, [cq+64* 2]
+ pmulld m1, m14, [cq+64* 6]
+ pmulld m2, m14, [cq+64*10]
+ pmulld m3, m14, [cq+64*14]
+ pmulld m4, m14, [cq+64*18]
+ pmulld m5, m14, [cq+64*22]
+ pmulld m6, m14, [cq+64*26]
+ pmulld m7, m14, [cq+64*30]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
+ pmulld m0, m14, [cq+64* 0]
+ pmulld m1, m14, [cq+64* 4]
+ pmulld m2, m14, [cq+64* 8]
+ pmulld m3, m14, [cq+64*12]
+ pmulld m4, m14, [cq+64*16]
+ pmulld m5, m14, [cq+64*20]
+ pmulld m6, m14, [cq+64*24]
+ pmulld m7, m14, [cq+64*28]
+ call m(idct_8x8_internal_10bpc).main_rect2
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ pxor m8, m8
+ mov r7d, 64*30
+.main_zero_loop:
+ mova [cq+r7-64*2], m8
+ mova [cq+r7-64*1], m8
+ mova [cq+r7+64*0], m8
+ mova [cq+r7+64*1], m8
+ sub r7d, 64*4
+ jg .main_zero_loop
+.main_end:
+ psrld m11, 11 ; pd_1
+ IDCT32_END 0, 15, 8, 9, 10, 1
+ IDCT32_END 1, 14, 8, 9, 10, 1
+ punpckhwd m8, m0, m1 ; 16 17
+ punpcklwd m0, m1 ; 0 1
+ punpcklwd m1, m14, m15 ; 14 15
+ punpckhwd m14, m15 ; 30 31
+ mova [r5+32*3], m8
+ mova [r5+32*2], m14
+ IDCT32_END 2, 15, 8, 9, 10, 1
+ IDCT32_END 3, 14, 8, 9, 10, 1
+ punpckhwd m8, m2, m3 ; 18 19
+ punpcklwd m2, m3 ; 2 3
+ punpcklwd m3, m14, m15 ; 12 13
+ punpckhwd m14, m15 ; 28 29
+ mova [r5+32*1], m8
+ mova [r5+32*0], m14
+ IDCT32_END 4, 15, 8, 9, 10, 1
+ IDCT32_END 5, 14, 8, 9, 10, 1
+ punpckhwd m8, m4, m5 ; 20 21
+ punpcklwd m4, m5 ; 4 5
+ punpcklwd m5, m14, m15 ; 10 11
+ punpckhwd m14, m15 ; 26 27
+ mova [r5-32*1], m8
+ mova [r5-32*2], m14
+ IDCT32_END 6, 15, 8, 9, 10, 1
+ IDCT32_END 7, 14, 8, 9, 10, 1
+ punpckhwd m8, m6, m7 ; 22 23
+ punpcklwd m6, m7 ; 6 7
+ punpcklwd m7, m14, m15 ; 8 9
+ punpckhwd m14, m15 ; 24 25
+ mova [r5-32*3], m8
+ mova [r5-32*4], m14
+ ret
+ALIGN function_align
+.write_16x16:
+ mova m1, [rsp+gprsize+32*1]
+ mova [rsp+gprsize+32*0], m8
+ mova [rsp+gprsize+32*1], m9
+ mova [rsp+gprsize+32*2], m12
+ vpbroadcastd m12, [pw_2048]
+ vpbroadcastd m9, [pixel_10bpc_max]
+ lea r3, [strideq*3]
+ pxor m8, m8
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ pmulhrsw m2, m12
+ pmulhrsw m3, m12
+ call m(idct_16x8_internal_10bpc).write_16x4
+ pmulhrsw m0, m12, m4
+ pmulhrsw m1, m12, m5
+ pmulhrsw m2, m12, m6
+ pmulhrsw m3, m12, m7
+ call m(idct_16x8_internal_10bpc).write_16x4
+ pmulhrsw m0, m12, [rsp+gprsize+32*0]
+ pmulhrsw m1, m12, [rsp+gprsize+32*1]
+ pmulhrsw m2, m12, m10
+ pmulhrsw m3, m12, m11
+ call m(idct_16x8_internal_10bpc).write_16x4
+ pmulhrsw m0, m12, [rsp+gprsize+32*2]
+ pmulhrsw m1, m12, m13
+ pmulhrsw m2, m12, m14
+ pmulhrsw m3, m12, m15
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+
+cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m8, [pw_2896x8]
+ vpbroadcastd m9, [pw_1697x16]
+ vpbroadcastd m10, [pw_4096]
+ lea r6, [strideq*5]
+ pxor m6, m6
+ mov r5, dstq
+ call .main
+ sub eobd, 36
+ jl .ret
+ add cq, 32
+ lea dstq, [dstq+strideq*4]
+ call .main
+ add cq, 64*8-32
+ lea dstq, [r5+16*1]
+ call .main
+ sub eobd, 107 ; eob < 143
+ jl .ret
+ add cq, 32
+ lea dstq, [dstq+strideq*4]
+ call .main
+ add cq, 64*8-32
+ lea dstq, [r5+16*2]
+ call .main
+ sub eobd, 128 ; eob < 271
+ jl .ret
+ add cq, 32
+ lea dstq, [dstq+strideq*4]
+ call .main
+ add cq, 64*8-32
+ lea dstq, [r5+16*3]
+ call .main
+ sub eobd, 128 ; eob < 399
+ jl .ret
+ add cq, 32
+ lea dstq, [dstq+strideq*4]
+ call .main
+.ret:
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+64*0]
+ packssdw m0, [cq+64*1]
+ mova m1, [cq+64*2]
+ packssdw m1, [cq+64*3]
+ mova m2, [cq+64*4]
+ packssdw m2, [cq+64*5]
+ mova m3, [cq+64*6]
+ packssdw m3, [cq+64*7]
+ REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+ REPX {paddsw x, x }, m0, m1, m2, m3
+ REPX {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3
+ REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
+
+cglobal inv_txfm_add_identity_identity_32x16_12bpc, 4, 7, 11, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_32x16_10bpc).pass1
+
+cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ lea r6, [rsp+32*7]
+ call .main
+ cmp eobd, 36
+ jl .fast
+ call .main
+ cmp eobd, 136
+ jl .fast
+ call .main
+ cmp eobd, 300
+ jl .fast
+ call .main
+ jmp .pass2
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
+.fast:
+ lea r4, [rsp+32*71]
+ pxor m0, m0
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ cmp r6, r4
+ jl .fast_loop
+.pass2:
+ lea r3, [rsp+32*3]
+ mov r4, r6
+ lea r5, [r6+32*8]
+ lea r6, [pw_5+128]
+ call .pass2_oddhalf
+ call .pass2_evenhalf
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+ call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
+ sub dstq, r3
+ lea r2, [r2+r3+32]
+ add dstq, 32
+ lea r3, [rsp+32*11]
+ call .pass2_oddhalf
+ call .pass2_evenhalf
+ lea r3, [strideq*3]
+ call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+128* 1]
+ mova m1, [cq+128* 7]
+ mova m2, [cq+128* 9]
+ mova m3, [cq+128*15]
+ mova m4, [cq+128*17]
+ mova m5, [cq+128*23]
+ mova m6, [cq+128*25]
+ mova m7, [cq+128*31]
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
+ mova m0, [cq+128* 3]
+ mova m1, [cq+128* 5]
+ mova m2, [cq+128*11]
+ mova m3, [cq+128*13]
+ mova m4, [cq+128*19]
+ mova m5, [cq+128*21]
+ mova m6, [cq+128*27]
+ mova m7, [cq+128*29]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
+ mova m0, [cq+128* 2]
+ mova m1, [cq+128* 6]
+ mova m2, [cq+128*10]
+ mova m3, [cq+128*14]
+ mova m4, [cq+128*18]
+ mova m5, [cq+128*22]
+ mova m6, [cq+128*26]
+ mova m7, [cq+128*30]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 4]
+ mova m2, [cq+128* 8]
+ mova m3, [cq+128*12]
+ mova m4, [cq+128*16]
+ mova m5, [cq+128*20]
+ mova m6, [cq+128*24]
+ mova m7, [cq+128*28]
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
+ pxor m15, m15
+ mov r7d, 128*29
+.main_zero_loop:
+ mova [cq+r7-128*1], m15
+ mova [cq+r7+128*0], m15
+ mova [cq+r7+128*1], m15
+ mova [cq+r7+128*2], m15
+ sub r7d, 128*4
+ jg .main_zero_loop
+ add cq, 32
+ mova [r4-32*4], m0
+ mova [r4-32*3], m1
+ mova [r4-32*2], m2
+ mova [r4-32*1], m3
+ mova [r4+32*0], m4
+ mova [r4+32*1], m5
+ mova [r4+32*2], m6
+ mova [r4+32*3], m7
+ mova m0, [r5+32*3]
+ mova m1, [r5+32*2]
+ mova m2, [r5+32*1]
+ mova m3, [r5+32*0]
+ mova m4, [r5-32*1]
+ mova m5, [r5-32*2]
+ mova m6, [r5-32*3]
+ mova m7, [r5-32*4]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ mova [r5-32*4], m0
+ mova [r5-32*3], m1
+ mova [r5-32*2], m2
+ mova [r5-32*1], m3
+ mova [r5+32*0], m4
+ mova [r5+32*1], m5
+ mova [r5+32*2], m6
+ mova [r5+32*3], m7
+ ret
+ALIGN function_align
+.pass2_oddhalf:
+ mova m0, [r3+32* 1] ; 1
+ mova m1, [r3+32* 3] ; 3
+ mova m2, [r3+32* 5] ; 5
+ mova m3, [r3+32* 7] ; 7
+ mova m4, [r3+32*17] ; 9
+ mova m5, [r3+32*19] ; 11
+ mova m6, [r3+32*21] ; 13
+ mova m7, [r3+32*23] ; 15
+ mova m8, [r3+32*33] ; 17
+ mova m9, [r3+32*35] ; 19
+ mova m10, [r3+32*37] ; 21
+ mova m11, [r3+32*39] ; 23
+ mova m12, [r3+32*49] ; 25
+ mova m13, [r3+32*51] ; 27
+ mova m14, [r3+32*53] ; 29
+ mova m15, [r3+32*55] ; 31
+ jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ALIGN function_align
+.pass2_evenhalf:
+ mova m0, [r3+32* 0] ; 0
+ mova m1, [r3+32* 2] ; 2
+ mova m2, [r3+32* 4] ; 4
+ mova m3, [r3+32* 6] ; 6
+ mova m4, [r3+32*16] ; 8
+ mova m5, [r3+32*18] ; 10
+ mova m6, [r3+32*20] ; 12
+ mova m7, [r3+32*22] ; 14
+ mova m8, [r3+32*32] ; 16
+ mova m9, [r3+32*34] ; 18
+ mova m10, [r3+32*36] ; 20
+ mova m11, [r3+32*38] ; 22
+ mova m12, [r3+32*48] ; 24
+ mova m13, [r3+32*50] ; 26
+ mova m14, [r3+32*52] ; 28
+ mova m15, [r3+32*54] ; 30
+ mova [rsp+gprsize], m15
+ jmp m(idct_16x16_internal_8bpc).main
+
+cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m5, [pw_8192]
+ pxor m6, m6
+ lea r6, [strideq*3]
+ lea r5, [strideq*5]
+ lea r4, [strideq+r6*2] ; strideq*7
+ call .main ; 0
+ cmp eobd, 36
+ jl .ret
+ add cq, 128*8 ; 0 1
+ mov r7, dstq ; 1
+ add dstq, 16
+ call .main
+ call .main2
+ cmp eobd, 136
+ jl .ret
+ add cq, 128*16-32 ; 0 1 2
+ lea dstq, [r7+16*2] ; 1 2
+ call .main ; 2
+ call .main2
+ call .main2
+ cmp eobd, 300
+ jl .ret
+ add cq, 128*24-64 ; 0 1 2 3
+ add r7, 16*3 ; 1 2 3
+ mov dstq, r7 ; 2 3
+ call .main ; 3
+ call .main2
+ call .main2
+ call .main2
+ cmp eobd, 535
+ jl .ret
+ add cq, 128*24-64 ; 0 1 2 3
+ lea dstq, [r7+strideq*8] ; 1 2 3 4
+ mov r7, dstq ; 2 3 4
+ call .main ; 3 4
+ call .main2
+ call .main2
+ cmp eobd, 755
+ jl .ret
+ add cq, 128*16-32 ; 0 1 2 3
+ lea dstq, [r7+strideq*8] ; 1 2 3 4
+ call .main ; 2 3 4 5
+ call .main2 ; 3 4 5
+ cmp eobd, 911
+ jl .ret
+ add cq, 128*8 ; 0 1 2 3
+ add dstq, 16 ; 1 2 3 4
+ call .main ; 2 3 4 5
+.ret: ; 3 4 5 6
+ RET
+ALIGN function_align
+.main2:
+ sub cq, 128*8-32
+ lea dstq, [dstq+strideq*8-16]
+.main:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero
+
+cglobal inv_txfm_add_identity_identity_32x32_12bpc, 4, 8, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_32x32_10bpc).pass1
+
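+; Final combining step of the idct64 second pass: the stacked idct16/idct32
+; intermediates are merged with the idct64 odd-half results in the two src
+; registers to produce rows 0+n, 31-n, 32+n and 63-n, which are rounded via
+; pw_2048 (m14), added to the destination pixels at the given offsets, clamped
+; to the 10 bpc range and stored.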
+%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
+%if %1 & 1
+ mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n
+ mova m%4, [r4-32*(14+%1)] ; idct32 out31-n
+%else
+ mova m%5, [r4-32*(45-%1)]
+ mova m%4, [r5-32*(20+%1)]
+%endif
+ paddsw m%6, m%5, m%4 ; idct32 out 0+n
+ psubsw m%5, m%4 ; idct32 out31-n
+ paddsw m%4, m%5, m%3 ; out31-n
+ psubsw m%5, m%3 ; out32+n
+ paddsw m%3, m%6, m%2 ; out 0+n
+ psubsw m%6, m%2 ; out63-n
+ REPX {pmulhrsw x, m14}, m%5, m%6, m%4, m%3
+%if %1 & 1
+ %define %%d0 r2
+ %define %%d1 dstq
+%else
+ %define %%d0 dstq
+ %define %%d1 r2
+%endif
+ paddw m%3, [%%d0+%7 ]
+ paddw m%4, [%%d1+%8 ]
+ paddw m%5, [%%d0+%9 ]
+ paddw m%6, [%%d1+%10]
+ pxor m%2, m%2
+ REPX {pmaxsw x, m%2}, m%3, m%4, m%5, m%6
+ vpbroadcastd m%2, [pixel_10bpc_max]
+ REPX {pminsw x, m%2}, m%3, m%4, m%5, m%6
+ mova [%%d0+%7 ], m%3
+ mova [%%d1+%8 ], m%4
+ mova [%%d0+%9 ], m%5
+ mova [%%d1+%10], m%6
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*6]
+ call .main
+ sub eobd, 44
+ jl .fast
+ call .main
+ sub eobd, 107
+ jl .fast
+ call .main
+ sub eobd, 128
+ jl .fast
+ call .main
+ jmp .pass2
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 64
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
+.fast:
+ lea r4, [rsp+32*38]
+ pxor m0, m0
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ cmp r6, r4
+ jl .fast_loop
+.pass2:
+ lea r6, [pw_5+128]
+ mova m0, [rsp+32* 2] ; in0
+ mova m1, [rsp+32* 6] ; in4
+ mova m2, [rsp+32*10] ; in8
+ mova m3, [rsp+32*14] ; in12
+ mova m4, [rsp+32*18] ; in16
+ mova m5, [rsp+32*22] ; in20
+ mova m6, [rsp+32*26] ; in24
+ mova m7, [rsp+32*30] ; in28
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ lea r4, [rsp+32*38]
+ mova [r4-32*4], m0
+ mova [r4-32*3], m1
+ mova [r4-32*2], m2
+ mova [r4-32*1], m3
+ mova [r4+32*0], m4
+ mova [r4+32*1], m5
+ mova [r4+32*2], m6
+ mova [r4+32*3], m7
+ add r4, 32*8
+ mova [r4-32*4], m8
+ mova [r4-32*3], m9
+ mova [r4-32*2], m10
+ mova [r4-32*1], m11
+ mova [r4+32*0], m12
+ mova [r4+32*1], m13
+ mova [r4+32*2], m14
+ mova [r4+32*3], m15
+ mova m0, [rsp+32* 4] ; in2
+ mova m1, [rsp+32* 8] ; in6
+ mova m2, [rsp+32*12] ; in10
+ mova m3, [rsp+32*16] ; in14
+ mova m4, [rsp+32*20] ; in18
+ mova m5, [rsp+32*24] ; in22
+ mova m6, [rsp+32*28] ; in26
+ mova m7, [rsp+32*32] ; in30
+ lea r5, [r4+32*16]
+ add r4, 32*8
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova m0, [rsp+32* 3] ; in1
+ mova m1, [rsp+32*33] ; in31
+ mova m2, [rsp+32*19] ; in17
+ mova m3, [rsp+32*17] ; in15
+ mova m4, [rsp+32*11] ; in9
+ mova m5, [rsp+32*25] ; in23
+ mova m6, [rsp+32*27] ; in25
+ mova m7, [rsp+32* 9] ; in7
+ lea r6, [idct64_mul - 8]
+ add r4, 32*16
+ add r5, 32*32
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ mova m0, [rsp+32* 7] ; in5
+ mova m1, [rsp+32*29] ; in27
+ mova m2, [rsp+32*23] ; in21
+ mova m3, [rsp+32*13] ; in11
+ mova m4, [rsp+32*15] ; in13
+ mova m5, [rsp+32*21] ; in19
+ mova m6, [rsp+32*31] ; in29
+ mova m7, [rsp+32* 5] ; in3
+ add r6, 8
+ add r4, 32*8
+ sub r5, 32*8
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ lea r8, [strideq*4]
+ lea r9, [strideq*5]
+ lea r3, [r9+strideq*1] ; stride*6
+ lea r7, [r9+strideq*2] ; stride*7
+ call .main_part2_pass2
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+128* 1]
+ mova m1, [cq+128* 3]
+ mova m2, [cq+128* 5]
+ mova m3, [cq+128* 7]
+ mova m4, [cq+128* 9]
+ mova m5, [cq+128*11]
+ mova m6, [cq+128*13]
+ mova m7, [cq+128*15]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 2]
+ mova m2, [cq+128* 4]
+ mova m3, [cq+128* 6]
+ mova m4, [cq+128* 8]
+ mova m5, [cq+128*10]
+ mova m6, [cq+128*12]
+ mova m7, [cq+128*14]
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ pxor m15, m15
+ mov r7d, 128*13
+.main_zero_loop:
+ mova [cq+r7-128*1], m15
+ mova [cq+r7+128*0], m15
+ mova [cq+r7+128*1], m15
+ mova [cq+r7+128*2], m15
+ sub r7d, 128*4
+ jg .main_zero_loop
+ add cq, 32
+ psrld m15, m11, 10 ; pd_2
+ mova m8, [r6-32*4]
+ mova m9, [r6+32*3]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m10, m0, m8 ; out15
+ paddd m0, m8 ; out0
+ mova m8, [r6-32*3]
+ psubd m15, m7, m9 ; out8
+ paddd m7, m9 ; out7
+ mova m9, [r6+32*2]
+ REPX {psrad x, 2}, m0, m15, m10, m7
+ packssdw m0, m15
+ packssdw m7, m10
+ psubd m10, m1, m8 ; out14
+ paddd m1, m8 ; out1
+ mova m8, [r6-32*2]
+ psubd m15, m6, m9 ; out9
+ paddd m6, m9 ; out6
+ mova m9, [r6+32*1]
+ REPX {psrad x, 2}, m1, m15, m10, m6
+ packssdw m1, m15
+ packssdw m6, m10
+ psubd m10, m2, m8 ; out13
+ paddd m2, m8 ; out2
+ mova m8, [r6-32*1]
+ psubd m15, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ mova m9, [r6+32*0]
+ REPX {psrad x, 2}, m2, m15, m10, m5
+ packssdw m2, m15
+ packssdw m5, m10
+ psubd m10, m3, m8 ; out12
+ paddd m3, m8 ; out3
+ psubd m15, m4, m9 ; out11
+ paddd m4, m9 ; out4
+ REPX {psrad x, 2}, m3, m15, m10, m4
+ packssdw m3, m15
+ packssdw m4, m10
+ call m(idct_16x8_internal_10bpc).transpose3
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6-32*2], m2
+ mova [r6-32*1], m3
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ add r6, 32*8
+ ret
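+; idct64 second pass: each iteration combines the buffered partial results via
+; main_part2_internal, then IDCT64_PART2_END adds the resulting mirrored rows to
+; the destination, stepping dstq forward and r2 backward by one stride until the
+; r4/r5 buffer pointers meet.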
+.main_part2_pass2:
+ vpbroadcastd m11, [pw_1567_3784]
+ vpbroadcastd m12, [pw_m3784_1567]
+ vpbroadcastd m13, [pw_2896_2896]
+ lea r6, [pw_5+128]
+ lea r2, [dstq+r7]
+.main_part2_pass2_loop:
+ vpbroadcastd m14, [pw_m2896_2896]
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal
+ vpbroadcastd m14, [pw_2048]
+ IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8
+ IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8
+ IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8
+ IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8
+ add dstq, strideq
+ sub r2, strideq
+ cmp r4, r5
+ jne .main_part2_pass2_loop
+ ret
+ALIGN function_align
+.main_part1_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_part1: ; idct64 steps 1-5
+ ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+ vpbroadcastd m7, [r5+4*0]
+ vpbroadcastd m8, [r5+4*1]
+ vpbroadcastd m6, [r5+4*2]
+ vpbroadcastd m9, [r5+4*3]
+ vpbroadcastd m5, [r5+4*4]
+ vpbroadcastd m10, [r5+4*5]
+ vpbroadcastd m4, [r5+4*6]
+ vpbroadcastd m15, [r5+4*7]
+ pmulld m7, m0 ; t63a
+ pmulld m0, m8 ; t32a
+ pmulld m6, m1 ; t62a
+ pmulld m1, m9 ; t33a
+ pmulld m5, m2 ; t61a
+ pmulld m2, m10 ; t34a
+ pmulld m4, m3 ; t60a
+ pmulld m3, m15 ; t35a
+ vpbroadcastd m10, [r5+4*8]
+ vpbroadcastd m15, [r5+4*9]
+ REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3
+ REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
+ psubd m8, m0, m1 ; t33
+ paddd m0, m1 ; t32
+ psubd m1, m7, m6 ; t62
+ paddd m7, m6 ; t63
+ psubd m6, m3, m2 ; t34
+ paddd m3, m2 ; t35
+ psubd m2, m4, m5 ; t61
+ paddd m4, m5 ; t60
+ REPX {pmaxsd x, m12}, m8, m1, m6, m2
+ REPX {pminsd x, m13}, m8, m1, m6, m2
+ ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a
+ ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a
+ REPX {pmaxsd x, m12}, m0, m3, m7, m4
+ REPX {pminsd x, m13}, m0, m3, m7, m4
+ vpbroadcastd m10, [r5+4*10]
+ vpbroadcastd m15, [r5+4*11]
+ psubd m5, m0, m3 ; t35a
+ paddd m0, m3 ; t32a
+ psubd m3, m7, m4 ; t60a
+ paddd m7, m4 ; t63a
+ psubd m4, m1, m6 ; t34
+ paddd m1, m6 ; t33
+ psubd m6, m8, m2 ; t61
+ paddd m8, m2 ; t62
+ REPX {pmaxsd x, m12}, m5, m3, m4, m6
+ REPX {pminsd x, m13}, m5, m3, m4, m6
+ ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60
+ ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
+ REPX {pmaxsd x, m12}, m0, m7, m1, m8
+ REPX {pminsd x, m13}, m0, m7, m1, m8
+ add r5, 4*12
+ mova [r6-32*4], m0
+ mova [r6+32*3], m7
+ mova [r6-32*3], m1
+ mova [r6+32*2], m8
+ mova [r6-32*2], m6
+ mova [r6+32*1], m4
+ mova [r6-32*1], m3
+ mova [r6+32*0], m5
+ add r6, 32*8
+ ret
+.main_part2: ; idct64 steps 6-9
+ lea r5, [r6+32*3]
+ sub r6, 32*4
+ vpbroadcastd m10, [pd_1567]
+ vpbroadcastd m15, [pd_3784]
+.main_part2_loop:
+ mova m0, [r6-32*32] ; t32a
+ mova m1, [r5-32*24] ; t39a
+ mova m2, [r5-32*32] ; t63a
+ mova m3, [r6-32*24] ; t56a
+ mova m4, [r6-32*16] ; t40a
+ mova m5, [r5-32* 8] ; t47a
+ mova m6, [r5-32*16] ; t55a
+ mova m7, [r6-32* 8] ; t48a
+ psubd m8, m0, m1 ; t39
+ paddd m0, m1 ; t32
+ psubd m1, m2, m3 ; t56
+ paddd m2, m3 ; t63
+ psubd m3, m5, m4 ; t40
+ paddd m5, m4 ; t47
+ psubd m4, m7, m6 ; t55
+ paddd m7, m6 ; t48
+ REPX {pmaxsd x, m12}, m8, m1, m3, m4
+ REPX {pminsd x, m13}, m8, m1, m3, m4
+ ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a
+ ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a
+ REPX {pmaxsd x, m12}, m0, m2, m5, m7
+ REPX {pminsd x, m13}, m0, m5, m2, m7
+ psubd m6, m2, m7 ; t48a
+ paddd m2, m7 ; t63a
+ psubd m7, m0, m5 ; t47a
+ paddd m0, m5 ; t32a
+ psubd m5, m8, m4 ; t55
+ paddd m8, m4 ; t56
+ psubd m4, m1, m3 ; t40
+ paddd m1, m3 ; t39
+ REPX {pmaxsd x, m12}, m6, m7, m5, m4
+ REPX {pminsd x, m13}, m6, m7, m5, m4
+ REPX {pmulld x, m14}, m6, m7, m5, m4
+ REPX {pmaxsd x, m12}, m2, m0, m8, m1
+ REPX {pminsd x, m13}, m2, m0, m8, m1
+ paddd m6, m11
+ paddd m5, m11
+ psubd m3, m6, m7 ; t47
+ paddd m6, m7 ; t48
+ psubd m7, m5, m4 ; t40a
+ paddd m5, m4 ; t55a
+ REPX {psrad x, 12}, m3, m6, m7, m5
+ mova [r5-32* 8], m2
+ mova [r6-32*32], m0
+ mova [r6-32* 8], m8
+ mova [r5-32*32], m1
+ mova [r5-32*24], m3
+ mova [r6-32*16], m6
+ mova [r6-32*24], m7
+ mova [r5-32*16], m5
+ add r6, 32
+ sub r5, 32
+ cmp r6, r5
+ jl .main_part2_loop
+ ret
+
+cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ lea r6, [rsp+32*6]
+ call .main
+ cmp eobd, 36
+ jl .fast
+ call .main
+ cmp eobd, 136
+ jl .fast
+ call .main
+ cmp eobd, 300
+ jl .fast
+ call .main
+ jmp .pass2
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 64
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
+.fast:
+ lea r4, [rsp+32*70]
+ pxor m0, m0
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ cmp r6, r4
+ jl .fast_loop
+.pass2:
+ lea r6, [pw_5+128]
+ mov r10, rsp
+ lea r8, [strideq*4]
+ lea r9, [strideq*5]
+ lea r3, [r9+strideq*1] ; stride*6
+ lea r7, [r9+strideq*2] ; stride*7
+.pass2_loop:
+ mova m0, [r10+32* 2] ; in0
+ mova m1, [r10+32* 6] ; in4
+ mova m2, [r10+32*18] ; in8
+ mova m3, [r10+32*22] ; in12
+ mova m4, [r10+32*34] ; in16
+ mova m5, [r10+32*38] ; in20
+ mova m6, [r10+32*50] ; in24
+ mova m7, [r10+32*54] ; in28
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ lea r4, [rsp+32*70]
+ mova [r4-32*4], m0
+ mova [r4-32*3], m1
+ mova [r4-32*2], m2
+ mova [r4-32*1], m3
+ mova [r4+32*0], m4
+ mova [r4+32*1], m5
+ mova [r4+32*2], m6
+ mova [r4+32*3], m7
+ add r4, 32*8
+ mova [r4-32*4], m8
+ mova [r4-32*3], m9
+ mova [r4-32*2], m10
+ mova [r4-32*1], m11
+ mova [r4+32*0], m12
+ mova [r4+32*1], m13
+ mova [r4+32*2], m14
+ mova [r4+32*3], m15
+ mova m0, [r10+32* 4] ; in2
+ mova m1, [r10+32* 8] ; in6
+ mova m2, [r10+32*20] ; in10
+ mova m3, [r10+32*24] ; in14
+ mova m4, [r10+32*36] ; in18
+ mova m5, [r10+32*40] ; in22
+ mova m6, [r10+32*52] ; in26
+ mova m7, [r10+32*56] ; in30
+ lea r5, [r4+32*16]
+ add r4, 32*8
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova m0, [r10+32* 3] ; in1
+ mova m1, [r10+32*57] ; in31
+ mova m2, [r10+32*35] ; in17
+ mova m3, [r10+32*25] ; in15
+ mova m4, [r10+32*19] ; in9
+ mova m5, [r10+32*41] ; in23
+ mova m6, [r10+32*51] ; in25
+ mova m7, [r10+32* 9] ; in7
+ lea r6, [idct64_mul - 8]
+ add r4, 32*16
+ add r5, 32*32
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ mova m0, [r10+32* 7] ; in5
+ mova m1, [r10+32*53] ; in27
+ mova m2, [r10+32*39] ; in21
+ mova m3, [r10+32*21] ; in11
+ mova m4, [r10+32*23] ; in13
+ mova m5, [r10+32*37] ; in19
+ mova m6, [r10+32*55] ; in29
+ mova m7, [r10+32* 5] ; in3
+ add r6, 8
+ add r4, 32*8
+ sub r5, 32*8
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2
+ add r10, 32*8
+ sub r4, 32*98 ; rsp+32*16
+ sub dstq, r8
+ add dstq, 32
+ cmp r10, r4
+ jl .pass2_loop
+ RET
+ALIGN function_align
+.main:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
+ pmulld m0, m14, [cq+128* 1]
+ pmulld m1, m14, [cq+128* 7]
+ pmulld m2, m14, [cq+128* 9]
+ pmulld m3, m14, [cq+128*15]
+ pmulld m4, m14, [cq+128*17]
+ pmulld m5, m14, [cq+128*23]
+ pmulld m6, m14, [cq+128*25]
+ pmulld m7, m14, [cq+128*31]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2
+ pmulld m0, m14, [cq+128* 3]
+ pmulld m1, m14, [cq+128* 5]
+ pmulld m2, m14, [cq+128*11]
+ pmulld m3, m14, [cq+128*13]
+ pmulld m4, m14, [cq+128*19]
+ pmulld m5, m14, [cq+128*21]
+ pmulld m6, m14, [cq+128*27]
+ pmulld m7, m14, [cq+128*29]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2
+ pmulld m0, m14, [cq+128* 2]
+ pmulld m1, m14, [cq+128* 6]
+ pmulld m2, m14, [cq+128*10]
+ pmulld m3, m14, [cq+128*14]
+ pmulld m4, m14, [cq+128*18]
+ pmulld m5, m14, [cq+128*22]
+ pmulld m6, m14, [cq+128*26]
+ pmulld m7, m14, [cq+128*30]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
+ pmulld m0, m14, [cq+128* 0]
+ pmulld m1, m14, [cq+128* 4]
+ pmulld m2, m14, [cq+128* 8]
+ pmulld m3, m14, [cq+128*12]
+ pmulld m4, m14, [cq+128*16]
+ pmulld m5, m14, [cq+128*20]
+ pmulld m6, m14, [cq+128*24]
+ pmulld m7, m14, [cq+128*28]
+ pxor m15, m15
+ mov r7d, 128*29
+.main_zero_loop:
+ mova [cq+r7-128*1], m15
+ mova [cq+r7+128*0], m15
+ mova [cq+r7+128*1], m15
+ mova [cq+r7+128*2], m15
+ sub r7d, 128*4
+ jg .main_zero_loop
+ add cq, 32
+ call m(idct_8x8_internal_10bpc).main_rect2
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main_end
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ mova [r4-32*4], m0
+ mova [r4-32*3], m1
+ mova [r4-32*2], m2
+ mova [r4-32*1], m3
+ mova [r4+32*0], m4
+ mova [r4+32*1], m5
+ mova [r4+32*2], m6
+ mova [r4+32*3], m7
+ mova m0, [r5+32*3]
+ mova m1, [r5+32*2]
+ mova m2, [r5+32*1]
+ mova m3, [r5+32*0]
+ mova m4, [r5-32*1]
+ mova m5, [r5-32*2]
+ mova m6, [r5-32*3]
+ mova m7, [r5-32*4]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ mova [r5-32*4], m0
+ mova [r5-32*3], m1
+ mova [r5-32*2], m2
+ mova [r5-32*1], m3
+ mova [r5+32*0], m4
+ mova [r5+32*1], m5
+ mova [r5+32*2], m6
+ mova [r5+32*3], m7
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jnz .normal
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 16
+.dconly:
+ add r6d, 640
+ sar r6d, 10
+.dconly2:
+ vpbroadcastd m5, [dconly_10bpc]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ movd xm0, r6d
+ paddsw xm0, xm5
+ vpbroadcastw m0, xm0
+.dconly_loop:
+ paddsw m1, m0, [dstq+32*0]
+ paddsw m2, m0, [dstq+32*1]
+ paddsw m3, m0, [dstq+32*2]
+ paddsw m4, m0, [dstq+32*3]
+ REPX {psubusw x, m5}, m1, m2, m3, m4
+ mova [dstq+32*0], m1
+ mova [dstq+32*1], m2
+ mova [dstq+32*2], m3
+ mova [dstq+32*3], m4
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+.normal:
+ PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*4]
+ call .main
+ call .shift_transpose
+ cmp eobd, 36
+ jl .fast
+ call .main
+ call .shift_transpose
+ jmp .pass2
+.fast:
+ pxor m0, m0
+ mov r3d, 4
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ dec r3d
+ jg .fast_loop
+.pass2:
+ lea r7, [r6-32*64]
+ lea r4, [r6-32*32]
+ lea r6, [pw_5+128]
+ mov r5, dstq
+.pass2_loop:
+ mova m0, [r7-32*4]
+ mova m1, [r7-32*3]
+ mova m2, [r7-32*2]
+ mova m3, [r7-32*1]
+ mova m4, [r7+32*0]
+ mova m5, [r7+32*1]
+ mova m6, [r7+32*2]
+ mova m7, [r7+32*3]
+ add r7, 32*32
+ mova m8, [r7-32*4]
+ mova m9, [r7-32*3]
+ mova m10, [r7-32*2]
+ mova m11, [r7-32*1]
+ mova m12, [r7+32*0]
+ mova m13, [r7+32*1]
+ mova m14, [r7+32*2]
+ mova m15, [r7+32*3]
+ sub r7, 32*24
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).write_16x16
+ add r5, 32
+ mov dstq, r5
+ cmp r7, r4
+ jl .pass2_loop
+ RET
+ALIGN function_align
+.main:
+ lea r5, [idct64_mul_16bpc]
+ mova m0, [cq+64* 1]
+ mova m1, [cq+64*31]
+ mova m2, [cq+64*17]
+ mova m3, [cq+64*15]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+64* 7]
+ mova m1, [cq+64*25]
+ mova m2, [cq+64*23]
+ mova m3, [cq+64* 9]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+64* 5]
+ mova m1, [cq+64*27]
+ mova m2, [cq+64*21]
+ mova m3, [cq+64*11]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+64* 3]
+ mova m1, [cq+64*29]
+ mova m2, [cq+64*19]
+ mova m3, [cq+64*13]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
+ mova m0, [cq+64* 2]
+ mova m1, [cq+64*14]
+ mova m2, [cq+64*18]
+ mova m3, [cq+64*30]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
+ mova m0, [cq+64* 6]
+ mova m1, [cq+64*10]
+ mova m2, [cq+64*22]
+ mova m3, [cq+64*26]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
+ mova m0, [cq+64* 4]
+ mova m1, [cq+64*12]
+ mova m2, [cq+64*20]
+ mova m3, [cq+64*28]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_fast
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 8]
+ mova m2, [cq+64*16]
+ mova m3, [cq+64*24]
+ pxor m15, m15
+ mov r7d, 64*30
+.main_zero_loop:
+ mova [cq+r7-64*2], m15
+ mova [cq+r7-64*1], m15
+ mova [cq+r7+64*0], m15
+ mova [cq+r7+64*1], m15
+ sub r7d, 64*4
+ jg .main_zero_loop
+.main_end:
+ psrld m15, m11, 10 ; pd_2
+.main_end2:
+ add cq, 32
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ add r6, 32*8
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ mova [r6+32*2], m1
+ mova [r6+32*1], m2
+ mova [r6+32*0], m3
+ mova [r6-32*1], m4
+ mova [r6-32*2], m5
+ mova [r6-32*3], m6
+ mova [r6-32*4], m7
+ jmp .main_end_loop_start
+.main_end_loop:
+ mova m0, [r6+32* 3] ; idct8 0 + n
+.main_end_loop_start:
+ mova m1, [r5+32* 4] ; idct16 15 - n
+ mova m2, [r5-32*12] ; idct32 16 + n
+ mova m3, [r6-32*13] ; idct32 31 - n
+ mova m4, [r6-32*29] ; idct64 63 - n
+ mova m5, [r5-32*28] ; idct64 48 + n
+ mova m6, [r6-32*45] ; idct64 47 - n
+ mova m7, [r5-32*44] ; idct64 32 + n
+ paddd m8, m0, m1 ; idct16 out0 + n
+ psubd m0, m1 ; idct16 out15 - n
+ REPX {pmaxsd x, m12}, m8, m0
+ REPX {pminsd x, m13}, m8, m0
+ paddd m1, m8, m3 ; idct32 out0 + n
+ psubd m8, m3 ; idct32 out31 - n
+ paddd m3, m0, m2 ; idct32 out15 - n
+ psubd m0, m2 ; idct32 out16 + n
+ REPX {pmaxsd x, m12}, m1, m8, m3, m0
+ REPX {pminsd x, m13}, m1, m3, m8, m0
+ REPX {paddd x, m15}, m1, m3, m0, m8
+ paddd m2, m1, m4 ; idct64 out0 + n (unshifted)
+ psubd m1, m4 ; idct64 out63 - n (unshifted)
+ paddd m4, m3, m5 ; idct64 out15 - n (unshifted)
+ psubd m3, m5 ; idct64 out48 + n (unshifted)
+ paddd m5, m0, m6 ; idct64 out16 + n (unshifted)
+ psubd m0, m6 ; idct64 out47 - n (unshifted)
+ paddd m6, m8, m7 ; idct64 out31 - n (unshifted)
+ psubd m8, m7 ; idct64 out32 + n (unshifted)
+ mova [r5-32*44], m2
+ mova [r6+32* 3], m1
+ mova [r6-32*45], m4
+ mova [r5+32* 4], m3
+ mova [r5-32*28], m5
+ mova [r6-32*13], m0
+ mova [r6-32*29], m6
+ mova [r5-32*12], m8
+ add r5, 32
+ sub r6, 32
+ cmp r5, r6
+ jl .main_end_loop
+ ret
+.shift_transpose:
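+; Shift the 32-bit idct64 intermediates right by the given amount (the rounding
+; offset was added earlier), pack them to 16 bits and transpose them back into
+; the scratch buffer via transpose3 for the second pass.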
+%macro IDCT64_SHIFT_TRANSPOSE 1 ; shift
+ sub r6, 32*48
+ mov r5, r6
+%%loop:
+ mova m0, [r6-32* 4]
+ mova m4, [r6+32* 4]
+ mova m1, [r6-32* 3]
+ mova m5, [r6+32* 5]
+ mova m2, [r6-32* 2]
+ mova m6, [r6+32* 6]
+ mova m3, [r6-32* 1]
+ mova m7, [r6+32* 7]
+ REPX {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ mova m4, [r6+32* 0]
+ mova m6, [r6+32* 8]
+ mova m5, [r6+32* 1]
+ mova m7, [r6+32* 9]
+ REPX {psrad x, %1}, m4, m6, m5, m7
+ packssdw m4, m6
+ packssdw m5, m7
+ mova m6, [r6+32* 2]
+ mova m8, [r6+32*10]
+ mova m7, [r6+32* 3]
+ mova m9, [r6+32*11]
+ REPX {psrad x, %1}, m6, m8, m7, m9
+ packssdw m6, m8
+ packssdw m7, m9
+ call m(idct_16x8_internal_10bpc).transpose3
+ mova [r5-32*4], m0
+ mova [r5-32*3], m1
+ mova [r5-32*2], m2
+ mova [r5-32*1], m3
+ mova [r5+32*0], m4
+ mova [r5+32*1], m5
+ mova [r5+32*2], m6
+ mova [r5+32*3], m7
+ add r6, 32*16
+ add r5, 32*8
+ cmp r5, r4
+ jl %%loop
+ mov r6, r4
+%endmacro
+ IDCT64_SHIFT_TRANSPOSE 2
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*7]
+ call .main
+ cmp eobd, 36
+ jl .fast
+ call .main
+ cmp eobd, 136
+ jl .fast
+ call .main
+ cmp eobd, 300
+ jl .fast
+ call .main
+ jmp .pass2
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 32
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+ jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2
+.fast:
+ pxor m0, m0
+ lea r4, [rsp+32*135]
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ cmp r6, r4
+ jl .fast_loop
+.pass2:
+ lea r7, [r6-32*32]
+ lea r5, [r6+32*8]
+ lea r6, [pw_5+128]
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+.pass2_loop:
+ mova m0, [r7-32*99]
+ mova m1, [r7-32*97]
+ mova m2, [r7-32*95]
+ mova m3, [r7-32*93]
+ mova m4, [r7-32*67]
+ mova m5, [r7-32*65]
+ mova m6, [r7-32*63]
+ mova m7, [r7-32*61]
+ mova m8, [r7-32*35]
+ mova m9, [r7-32*33]
+ mova m10, [r7-32*31]
+ mova m11, [r7-32*29]
+ mova m12, [r7-32* 3]
+ mova m13, [r7-32* 1]
+ mova m14, [r7+32* 1]
+ mova m15, [r7+32* 3]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ mova m0, [r7-32*100]
+ mova m1, [r7-32*98]
+ mova m2, [r7-32*96]
+ mova m3, [r7-32*94]
+ mova m4, [r7-32*68]
+ mova m5, [r7-32*66]
+ mova m6, [r7-32*64]
+ mova m7, [r7-32*62]
+ mova m8, [r7-32*36]
+ mova m9, [r7-32*34]
+ mova m10, [r7-32*32]
+ mova m11, [r7-32*30]
+ mova m12, [r7-32* 4]
+ mova m13, [r7-32* 2]
+ mova m14, [r7+32* 0]
+ mova m15, [r7+32* 2]
+ add r7, 32*8
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
+ sub dstq, r3
+ lea r2, [r2+r3+32]
+ add dstq, 32
+ cmp r7, r4
+ jl .pass2_loop
+ RET
+ALIGN function_align
+.main:
+ lea r5, [idct64_mul_16bpc]
+ pmulld m0, m14, [cq+128* 1]
+ pmulld m1, m14, [cq+128*31]
+ pmulld m2, m14, [cq+128*17]
+ pmulld m3, m14, [cq+128*15]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
+ pmulld m0, m14, [cq+128* 7]
+ pmulld m1, m14, [cq+128*25]
+ pmulld m2, m14, [cq+128*23]
+ pmulld m3, m14, [cq+128* 9]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
+ pmulld m0, m14, [cq+128* 5]
+ pmulld m1, m14, [cq+128*27]
+ pmulld m2, m14, [cq+128*21]
+ pmulld m3, m14, [cq+128*11]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
+ pmulld m0, m14, [cq+128* 3]
+ pmulld m1, m14, [cq+128*29]
+ pmulld m2, m14, [cq+128*19]
+ pmulld m3, m14, [cq+128*13]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
+ pmulld m0, m14, [cq+128* 2]
+ pmulld m1, m14, [cq+128*14]
+ pmulld m2, m14, [cq+128*18]
+ pmulld m3, m14, [cq+128*30]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast_rect2
+ pmulld m0, m14, [cq+128* 6]
+ pmulld m1, m14, [cq+128*10]
+ pmulld m2, m14, [cq+128*22]
+ pmulld m3, m14, [cq+128*26]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast_rect2
+ pmulld m0, m14, [cq+128* 4]
+ pmulld m1, m14, [cq+128*12]
+ pmulld m2, m14, [cq+128*20]
+ pmulld m3, m14, [cq+128*28]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_fast_rect2
+ pmulld m0, m14, [cq+128* 0]
+ pmulld m1, m14, [cq+128* 8]
+ pmulld m2, m14, [cq+128*16]
+ pmulld m3, m14, [cq+128*24]
+ pxor m15, m15
+ mov r7d, 128*29
+.main_zero_loop:
+ mova [cq+r7-128*1], m15
+ mova [cq+r7+128*0], m15
+ mova [cq+r7+128*1], m15
+ mova [cq+r7+128*2], m15
+ sub r7d, 128*4
+ jg .main_zero_loop
+ psrld m15, m11, 11 ; pd_1
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end2
+ IDCT64_SHIFT_TRANSPOSE 1
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*7]
+ call .main
+ cmp eobd, 36
+ jl .fast
+ call .main
+ cmp eobd, 136
+ jl .fast
+ call .main
+ cmp eobd, 300
+ jl .fast
+ call .main
+ jmp .pass2
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly
+.fast:
+ pxor m0, m0
+ lea r4, [rsp+32*135]
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ cmp r6, r4
+ jl .fast_loop
+.pass2:
+ lea r10, [r6-32*32]
+ lea r6, [pw_5+128]
+ lea r8, [strideq*4]
+ lea r9, [strideq*5]
+ lea r3, [r9+strideq*1] ; stride*6
+ lea r7, [r9+strideq*2] ; stride*7
+.pass2_loop:
+ mova m0, [r10-32*100] ; in0
+ mova m1, [r10-32*96] ; in4
+ mova m2, [r10-32*68] ; in8
+ mova m3, [r10-32*64] ; in12
+ mova m4, [r10-32*36] ; in16
+ mova m5, [r10-32*32] ; in20
+ mova m6, [r10-32* 4] ; in24
+ mova m7, [r10+32* 0] ; in28
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [r4-32*4], m0
+ mova [r4-32*3], m1
+ mova [r4-32*2], m2
+ mova [r4-32*1], m3
+ mova [r4+32*0], m4
+ mova [r4+32*1], m5
+ mova [r4+32*2], m6
+ mova [r4+32*3], m7
+ add r4, 32*8
+ mova [r4-32*4], m8
+ mova [r4-32*3], m9
+ mova [r4-32*2], m10
+ mova [r4-32*1], m11
+ mova [r4+32*0], m12
+ mova [r4+32*1], m13
+ mova [r4+32*2], m14
+ mova [r4+32*3], m15
+ mova m0, [r10-32*98] ; in2
+ mova m1, [r10-32*94] ; in6
+ mova m2, [r10-32*66] ; in10
+ mova m3, [r10-32*62] ; in14
+ mova m4, [r10-32*34] ; in18
+ mova m5, [r10-32*30] ; in22
+ mova m6, [r10-32* 2] ; in26
+ mova m7, [r10+32* 2] ; in30
+ lea r5, [r4+32*16]
+ add r4, 32*8
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova m0, [r10-32*99] ; in1
+ mova m1, [r10+32* 3] ; in31
+ mova m2, [r10-32*35] ; in17
+ mova m3, [r10-32*61] ; in15
+ mova m4, [r10-32*67] ; in9
+ mova m5, [r10-32*29] ; in23
+ mova m6, [r10-32* 3] ; in25
+ mova m7, [r10-32*93] ; in7
+ lea r6, [idct64_mul - 8]
+ add r4, 32*16
+ add r5, 32*32
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ mova m0, [r10-32*95] ; in5
+ mova m1, [r10-32* 1] ; in27
+ mova m2, [r10-32*31] ; in21
+ mova m3, [r10-32*65] ; in11
+ mova m4, [r10-32*63] ; in13
+ mova m5, [r10-32*33] ; in19
+ mova m6, [r10+32* 1] ; in29
+ mova m7, [r10-32*97] ; in3
+ add r6, 8
+ add r4, 32*8
+ sub r5, 32*8
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2
+ add r10, 32*8
+ sub dstq, r8
+ sub r4, 32*44
+ add dstq, 32
+ cmp r10, r4
+ jl .pass2_loop
+ RET
+ALIGN function_align
+.main:
+ lea r5, [idct64_mul_16bpc]
+ mova m0, [cq+128* 1]
+ mova m1, [cq+128*31]
+ mova m2, [cq+128*17]
+ mova m3, [cq+128*15]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+128* 7]
+ mova m1, [cq+128*25]
+ mova m2, [cq+128*23]
+ mova m3, [cq+128* 9]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+128* 5]
+ mova m1, [cq+128*27]
+ mova m2, [cq+128*21]
+ mova m3, [cq+128*11]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+128* 3]
+ mova m1, [cq+128*29]
+ mova m2, [cq+128*19]
+ mova m3, [cq+128*13]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
+ mova m0, [cq+128* 2]
+ mova m1, [cq+128*14]
+ mova m2, [cq+128*18]
+ mova m3, [cq+128*30]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
+ mova m0, [cq+128* 6]
+ mova m1, [cq+128*10]
+ mova m2, [cq+128*22]
+ mova m3, [cq+128*26]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
+ mova m0, [cq+128* 4]
+ mova m1, [cq+128*12]
+ mova m2, [cq+128*20]
+ mova m3, [cq+128*28]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_fast
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 8]
+ mova m2, [cq+128*16]
+ mova m3, [cq+128*24]
+ pxor m15, m15
+ mov r7d, 128*29
+.main_zero_loop:
+ mova [cq+r7-128*1], m15
+ mova [cq+r7+128*0], m15
+ mova [cq+r7+128*1], m15
+ mova [cq+r7+128*2], m15
+ sub r7d, 128*4
+ jg .main_zero_loop
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
+ jmp m(inv_txfm_add_dct_dct_64x16_10bpc).shift_transpose
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/itx16_avx512.asm b/third_party/dav1d/src/x86/itx16_avx512.asm
new file mode 100644
index 0000000000..9f5f909a5f
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx16_avx512.asm
@@ -0,0 +1,6056 @@
+; Copyright © 2022-2023, VideoLAN and dav1d authors
+; Copyright © 2022-2023, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+idct8x8p: db 0, 1, 4, 5, 2, 3, 6, 7, 16, 17, 20, 21, 18, 19, 22, 23
+ db 8, 9, 12, 13, 10, 11, 14, 15, 24, 25, 28, 29, 26, 27, 30, 31
+ db 32, 33, 36, 37, 34, 35, 38, 39, 48, 49, 52, 53, 50, 51, 54, 55
+ db 40, 41, 44, 45, 42, 43, 46, 47, 56, 57, 60, 61, 58, 59, 62, 63
+idtx8x8p: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
+ db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
+ db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
+ db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
+idct8x16p: db 54, 55, 2, 3, 22, 23, 34, 35, 38, 39, 18, 19, 6, 7, 50, 51
+ db 62, 63, 10, 11, 30, 31, 42, 43, 46, 47, 26, 27, 14, 15, 58, 59
+ db 52, 53, 4, 5, 20, 21, 36, 37, 32, 33, 0, 1, 48, 49, 16, 17
+ db 60, 61, 12, 13, 28, 29, 44, 45, 40, 41, 8, 9, 56, 57, 24, 25
+iadst8x16p: db 0, 1, 54, 55, 48, 49, 6, 7, 16, 17, 38, 39, 32, 33, 22, 23
+ db 8, 9, 62, 63, 56, 57, 14, 15, 24, 25, 46, 47, 40, 41, 30, 31
+ db 4, 5, 50, 51, 52, 53, 2, 3, 20, 21, 34, 35, 36, 37, 18, 19
+ db 12, 13, 58, 59, 60, 61, 10, 11, 28, 29, 42, 43, 44, 45, 26, 27
+permA: db 0, 1, 0, 8, 4, 5, 1, 9, 8, 9, 4, 12, 12, 13, 5, 13
+ db 16, 17, 16, 24, 20, 21, 17, 25, 24, 25, 20, 28, 28, 29, 21, 29
+ db 2, 3, 2, 10, 6, 7, 3, 11, 10, 11, 6, 14, 14, 15, 7, 15
+ db 18, 19, 18, 26, 22, 23, 19, 27, 26, 27, 22, 30, 30, 31, 23, 31
+permB: db 4, 2, 1, 8, 0, 0, 1, 0, 12, 3, 3, 10, 8, 1, 3, 2
+ db 5, 10, 5, 12, 1, 8, 5, 4, 13, 11, 7, 14, 9, 9, 7, 6
+ db 6, 6, 13, 4, 2, 4, 4, 5, 14, 7, 15, 6, 10, 5, 6, 7
+ db 7, 14, 9, 0, 3, 12, 0, 1, 15, 15, 11, 2, 11, 13, 2, 3
+permC: db 0, 9, 0, 0, 0, 1, 4, 4, 2, 11, 2, 2, 2, 3, 6, 6
+ db 1, 8, 1, 8, 4, 5, 5, 12, 3, 10, 3, 10, 6, 7, 7, 14
+ db 9, 1, 8, 1, 1, 0, 12, 5, 11, 3, 10, 3, 3, 2, 14, 7
+ db 8, 0, 9, 9, 5, 4, 13, 13, 10, 2, 11, 11, 7, 6, 15, 15
+idct8x32p: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
+ db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
+ db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
+ db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
+idct32x8p: db 2, 18, 0, 16, 3, 19, 1, 17, 10, 26, 8, 24, 11, 27, 9, 25
+ db 34, 50, 32, 48, 35, 51, 33, 49, 42, 58, 40, 56, 43, 59, 41, 57
+ db 6, 22, 4, 20, 7, 23, 5, 21, 14, 30, 12, 28, 15, 31, 13, 29
+ db 38, 54, 36, 52, 39, 55, 37, 53, 46, 62, 44, 60, 47, 63, 45, 61
+idtx32x8p: db 0, 8, 16, 24, 4, 12, 20, 28, 2, 10, 18, 26, 6, 14, 22, 30
+ db 32, 40, 48, 56, 36, 44, 52, 60, 34, 42, 50, 58, 38, 46, 54, 62
+ db 1, 9, 17, 25, 5, 13, 21, 29, 3, 11, 19, 27, 7, 15, 23, 31
+ db 33, 41, 49, 57, 37, 45, 53, 61, 35, 43, 51, 59, 39, 47, 55, 63
+
+pw_2048_m2048: times 16 dw 2048
+pw_m2048_2048: times 16 dw -2048
+pw_2048: times 16 dw 2048
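+; the three rows above overlap on purpose: wide loads from pw_2048_m2048 and
+; pw_m2048_2048 run on into the row(s) that follow, so the {+2048, -2048},
+; {-2048, +2048} and all-positive sign patterns share the same 96 bytes of
+; storage instead of needing a full 64-byte table each.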
+
+; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++-, 4 = --
+%macro COEF_PAIR 2-3 0 ; a, b, flags
+%if %3 == 1
+pd_%1_m%2: dd %1, %1, -%2, -%2
+%define pd_%1 (pd_%1_m%2 + 4*0)
+%define pd_m%2 (pd_%1_m%2 + 4*2)
+%elif %3 == 2
+pd_m%1_%2: dd -%1, -%1, %2, %2
+%define pd_m%1 (pd_m%1_%2 + 4*0)
+%define pd_%2 (pd_m%1_%2 + 4*2)
+%elif %3 == 4
+pd_m%1_m%2: dd -%1, -%1, -%2, -%2
+%define pd_m%1 (pd_m%1_m%2 + 4*0)
+%define pd_m%2 (pd_m%1_m%2 + 4*2)
+%else
+pd_%1_%2: dd %1, %1, %2, %2
+%define pd_%1 (pd_%1_%2 + 4*0)
+%define pd_%2 (pd_%1_%2 + 4*2)
+%if %3 == 3
+%define pd_%2_m%2 pd_%2
+dd -%2, -%2
+%endif
+%endif
+%endmacro
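+
+; e.g. COEF_PAIR 799, 2276, 1 expands to
+;   pd_799_m2276: dd 799, 799, -2276, -2276
+; with pd_799 and pd_m2276 aliased to offsets 0 and 8 of that row; flag 3
+; additionally appends {-b, -b} right after {b, b} so that pd_b_mb can be
+; read as a single 16-byte {b, b, -b, -b} pattern.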
+
+COEF_PAIR 101, 501
+COEF_PAIR 201, 601, 1
+COEF_PAIR 201, 995
+COEF_PAIR 401, 1189, 1
+COEF_PAIR 401, 1931
+COEF_PAIR 401, 3920
+COEF_PAIR 401, 4076
+COEF_PAIR 700, 301, 4
+COEF_PAIR 799, 2276, 1
+COEF_PAIR 799, 3406
+COEF_PAIR 799, 4017
+COEF_PAIR 1380, 601
+COEF_PAIR 1751, 2440
+COEF_PAIR 2598, 1189
+COEF_PAIR 2598, 1931, 2
+COEF_PAIR 2598, 3612
+COEF_PAIR 2751, 2106
+COEF_PAIR 2896, 1567, 3
+COEF_PAIR 2896, 3784, 3
+COEF_PAIR 3035, 3513
+COEF_PAIR 3166, 1931
+COEF_PAIR 3166, 3612
+COEF_PAIR 3166, 3920
+COEF_PAIR 3703, 3290
+COEF_PAIR 3857, 4052
+COEF_PAIR 4017, 2276
+COEF_PAIR 4017, 3406
+COEF_PAIR 4036, 4085
+COEF_PAIR 4076, 1189
+COEF_PAIR 4076, 3612
+COEF_PAIR 4076, 3920
+COEF_PAIR 4091, 3973
+COEF_PAIR 4091, 4052
+COEF_PAIR 4095, 4065
+
+pb_32: times 4 db 32
+pw_5: times 2 dw 5
+pw_4096: times 2 dw 4096
+pw_8192: times 2 dw 8192
+pw_1697x16: times 2 dw 1697*16
+pw_2896x8: times 2 dw 2896*8
+pixel_10bpc_max: times 2 dw 0x03ff
+dconly_10bpc: times 2 dw 0x7c00
+clip_18b_min: dd -0x20000
+clip_18b_max: dd 0x1ffff
+pd_1: dd 1
+pd_2: dd 2
+pd_1448: dd 1448
+pd_2048: dd 2048
+pd_3071: dd 3071 ; 1024 + 2048 - 1
+pd_3072: dd 3072 ; 1024 + 2048
+pd_5119: dd 5119 ; 1024 + 4096 - 1
+pd_5120: dd 5120 ; 1024 + 4096
+pd_5793: dd 5793
+
+cextern dup16_perm
+cextern int8_permA
+cextern idct64_mul_16bpc
+cextern idct_8x8_internal_8bpc_avx512icl.main
+cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2
+cextern idct_8x16_internal_8bpc_avx512icl.main
+cextern idct_8x16_internal_8bpc_avx512icl.main2
+cextern idct_8x16_internal_8bpc_avx512icl.main_fast
+cextern idct_8x16_internal_8bpc_avx512icl.main_fast2
+cextern iadst_8x16_internal_8bpc_avx512icl.main2
+cextern idct_16x8_internal_8bpc_avx512icl.main
+cextern iadst_16x8_internal_8bpc_avx512icl.main_pass2
+cextern idct_16x16_internal_8bpc_avx512icl.main
+cextern idct_16x16_internal_8bpc_avx512icl.main2
+cextern idct_16x16_internal_8bpc_avx512icl.main_fast
+cextern idct_16x16_internal_8bpc_avx512icl.main_fast2
+cextern iadst_16x16_internal_8bpc_avx512icl.main_pass2b
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast2
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_end
+cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf
+cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast
+cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast2
+cextern inv_txfm_add_dct_dct_32x8_8bpc_avx512icl.main
+cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf
+cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast
+cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast2
+cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast3
+cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf
+cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast
+cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast2
+cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast3
+cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf
+cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf_fast
+cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1
+cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast
+cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast2
+cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part2
+
+SECTION .text
+
+%define o_base (pw_2048+4*128)
+%define o_base_8bpc (int8_permA+64*18)
+%define o(x) (r5 - o_base + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+INIT_ZMM avx512icl
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+; flags: 1 = inv_dst1, 2 = inv_dst2
+; skip round/shift if rnd is not a number
+%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
+%if %8 < 32
+ pmulld m%4, m%1, m%8
+ pmulld m%3, m%2, m%8
+%else
+%if %8 < 4096
+ vpbroadcastd m%3, [o(pd_%8)]
+%else
+ vbroadcasti32x4 m%3, [o(pd_%8)]
+%endif
+ pmulld m%4, m%1, m%3
+ pmulld m%3, m%2
+%endif
+%if %7 < 32
+ pmulld m%1, m%7
+ pmulld m%2, m%7
+%else
+%if %7 < 4096
+ vpbroadcastd m%5, [o(pd_%7)]
+%else
+ vbroadcasti32x4 m%5, [o(pd_%7)]
+%endif
+ pmulld m%1, m%5
+ pmulld m%2, m%5
+%endif
+%if %9 & 2
+ psubd m%4, m%6, m%4
+ psubd m%2, m%4, m%2
+%else
+%ifnum %6
+ paddd m%4, m%6
+%endif
+ paddd m%2, m%4
+%endif
+%ifnum %6
+ paddd m%1, m%6
+%endif
+%if %9 & 1
+ psubd m%1, m%3, m%1
+%else
+ psubd m%1, m%3
+%endif
+%ifnum %6
+ psrad m%2, 12
+ psrad m%1, 12
+%endif
+%endmacro
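+
+; The coefficients are the 12-bit fixed-point cosines/sines of the AV1
+; transforms (e.g. 2896 ~ 4096/sqrt(2), 3784 ~ 4096*cos(pi/8)). A coef
+; argument < 32 names a register holding a pre-broadcast coefficient, a plain
+; constant < 4096 is broadcast as a single dword, and a fused a_b name (which
+; still compares as one large number here thanks to the _ digit separator)
+; broadcasts an {a, a, b, b} COEF_PAIR row, so the two 64-bit halves of every
+; 128-bit lane are rotated by different coefficient pairs in one invocation.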
+
+%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size
+cglobal inv_txfm_add_%1_%2_%4_10bpc, 4, 7, 0, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%4_internal_10bpc)
+ lea r5, [o_base]
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(i%2_%4_internal_10bpc).pass2]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+%if %3
+ add eobd, %3
+%endif
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
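+
+; The internal functions end their first pass with `jmp tx2q`, which is why
+; tx2q is preloaded with the second type's .pass2 entry above. For dct_dct a
+; zero eob falls through into the DC-only shortcut emitted by the per-size
+; macros below.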
+
+%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 8x8
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
+.dconly:
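+    ; eobd is 0 on this path, so the store above cleared the DC coefficient
+    ; and `or r3d, 8` turned the same register into the row counter for
+    ; .dconly_loop. 181 is 2896 >> 4 (~256/sqrt(2)), i.e. the usual 1/sqrt(2)
+    ; scale at reduced precision; the remaining rounding and normalization is
+    ; folded into the add/sar constants. In the loop below, biasing by
+    ; dconly_10bpc (0x7c00) before paddsw and removing it with psubusw clamps
+    ; the result to [0, 0x3ff] via saturating arithmetic.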
+ add r6d, 384
+ sar r6d, 9
+.dconly2:
+ vpbroadcastd ym2, [o(dconly_10bpc)]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ vpbroadcastw ym1, r6d
+ paddsw ym1, ym2
+.dconly_loop:
+ mova xm0, [dstq+strideq*0]
+ vinserti32x4 ym0, [dstq+strideq*1], 1
+ paddsw ym0, ym1
+ psubusw ym0, ym2
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
+
+cglobal idct_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call .load
+ vpermi2q m1, m0, m2 ; 1 5
+ vpermi2q m3, m6, m4 ; 7 3
+ vpermt2q m0, m5, m4 ; 0 2
+ vpermt2q m2, m5, m6 ; 4 6
+ call .main
+ call .main_end
+ mova m4, [o(idct8x8p)]
+ packssdw m0, m2 ; 0 1 4 5
+ packssdw m1, m3 ; 3 2 7 6
+ vpermb m0, m4, m0
+ vprolq m1, 32
+ vpermb m2, m4, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym2, m0, 1
+ vextracti32x8 ym3, m1, 1
+ call m(idct_8x8_internal_8bpc).main
+ mova m10, [permC]
+ vpbroadcastd m12, [pw_2048]
+.end:
+ vpermt2q m0, m10, m1
+ vpermt2q m2, m10, m3
+.end2:
+ vpbroadcastd m11, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ pxor m10, m10
+ pmulhrsw m8, m12, m0
+ call .write_8x4_start
+ pmulhrsw m8, m12, m2
+.write_8x4:
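+    ; .write_8x4(_start) adds the 4-row residual block in m8 to dst, clamps
+    ; to [0, pixel_10bpc_max] and stores two zeroed 64-byte rows back to cq,
+    ; so the coefficient buffer is cleared as a side effect of the writeback
+    ; rather than in a separate pass.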
+ lea dstq, [dstq+strideq*4]
+ add cq, 64*2
+.write_8x4_start:
+ mova xm9, [dstq+strideq*0]
+ vinserti32x4 ym9, [dstq+strideq*1], 1
+ vinserti32x4 m9, [dstq+strideq*2], 2
+ vinserti32x4 m9, [dstq+r6 ], 3
+ mova [cq+64*0], m10
+ mova [cq+64*1], m10
+ paddw m9, m8
+ pmaxsw m9, m10
+ pminsw m9, m11
+ mova [dstq+strideq*0], xm9
+ vextracti32x4 [dstq+strideq*1], ym9, 1
+ vextracti32x4 [dstq+strideq*2], m9, 2
+ vextracti32x4 [dstq+r6 ], m9, 3
+ ret
+ALIGN function_align
+.load:
+ mova m0, [cq+64*0] ; 0 1
+ mova m4, [cq+64*1] ; 2 3
+ mova m1, [o(permB)]
+ mova m2, [cq+64*2] ; 4 5
+ mova m6, [cq+64*3] ; 6 7
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ psrlq m5, m1, 32
+ vpbroadcastd m12, [o(pd_2896)]
+ mova m3, m1
+ vpbroadcastd m11, [o(pd_1)]
+ ret
+ALIGN function_align
+.main_fast: ; bottom half is zero
+ vbroadcasti32x4 m3, [o(pd_4017_3406)]
+ vbroadcasti32x4 m8, [o(pd_799_m2276)]
+ vbroadcasti32x4 m2, [o(pd_2896_3784)]
+ vbroadcasti32x4 m9, [o(pd_2896_1567)]
+ pmulld m3, m1 ; t4a t5a
+ pmulld m1, m8 ; t7a t6a
+ pmulld m2, m0 ; t0 t3
+ pmulld m0, m9 ; t1 t2
+ jmp .main2
+.main:
+ ITX_MULSUB_2D 1, 3, 8, 9, 10, _, 799_3406, 4017_2276
+ ITX_MULSUB_2D 0, 2, 8, 9, 10, _, 2896_1567, 2896_3784
+.main2:
+ REPX {paddd x, m13}, m1, m3, m0, m2
+ REPX {psrad x, 12 }, m1, m3, m0, m2
+ punpcklqdq m8, m1, m3 ; t4a t7a
+ punpckhqdq m1, m3 ; t5a t6a
+ psubd m3, m8, m1 ; t5a t6a
+ paddd m8, m1 ; t4 t7
+ pmaxsd m3, m14
+ punpckhqdq m1, m2, m0 ; t3 t2
+ pminsd m3, m15
+ punpcklqdq m2, m0 ; t0 t1
+ pmulld m3, m12
+ paddd m0, m2, m1 ; dct4 out0 out1
+ psubd m2, m1 ; dct4 out3 out2
+ REPX {pmaxsd x, m14}, m8, m0, m2
+ REPX {pminsd x, m15}, m8, m0, m2
+.main3:
+ pshufd m1, m3, q1032
+ paddd m3, m13
+ psubd m9, m3, m1
+ paddd m3, m1
+ psrad m9, 12
+ psrad m3, 12
+ punpckhqdq m1, m8, m3 ; t7 t6
+ shufpd m8, m9, 0xaa ; t4 t5
+ ret
+.main_end:
+ paddd m0, m11
+ paddd m2, m11
+ psubd m3, m0, m1 ; out7 out6
+ paddd m0, m1 ; out0 out1
+ paddd m1, m2, m8 ; out3 out2
+ psubd m2, m8 ; out4 out5
+ REPX {vpsravd x, m11}, m0, m2, m3, m1
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+INV_TXFM_8X8_FN adst, adst
+
+cglobal iadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x8_internal_10bpc).load
+ vpermi2q m1, m6, m2 ; 7 5
+ vpermi2q m3, m4, m0 ; 3 1
+ vpermt2q m0, m5, m4 ; 0 2
+ vpermt2q m2, m5, m6 ; 4 6
+ call .main
+ punpckldq m1, m2, m4 ; out4 out6
+ punpckhdq m2, m0 ; -out5 -out7
+ punpckldq m0, m3 ; out0 out2
+ punpckhdq m4, m3 ; -out1 -out3
+ paddd m1, m11
+ psubd m3, m11, m2
+ paddd m0, m11
+ psubd m4, m11, m4
+.pass1_end:
+ REPX {psrad x, 1}, m1, m0, m3, m4
+ packssdw m0, m1 ; 0 2 4 6
+ packssdw m4, m3 ; 1 3 5 7
+ psrlq m1, [o(permB)], 8
+ punpckhwd m3, m0, m4
+ punpcklwd m0, m4
+ psrlq m2, m1, 32
+ vpermi2q m1, m0, m3
+ vpermt2q m0, m2, m3
+ jmp tx2q
+.pass2:
+ call .main_pass2
+ movu m10, [permC+2]
+ vbroadcasti32x8 m12, [pw_2048_m2048+16]
+ jmp m(idct_8x8_internal_10bpc).end
+.main_pass2:
+ vextracti32x8 ym2, m0, 1
+ vextracti32x8 ym3, m1, 1
+ lea r5, [o_base_8bpc]
+ pshufd ym4, ym0, q1032
+ pshufd ym5, ym1, q1032
+ jmp m(iadst_8x8_internal_8bpc).main_pass2
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 1, 0, 4, 5, 6, 13, 401_1931, 4076_3612
+ ITX_MULSUB_2D 3, 2, 4, 5, 6, 13, 3166_3920, 2598_1189
+ psubd m4, m0, m2 ; t4 t6
+ paddd m0, m2 ; t0 t2
+ psubd m2, m1, m3 ; t5 t7
+ paddd m1, m3 ; t1 t3
+ REPX {pmaxsd x, m14}, m4, m2, m0, m1
+ REPX {pminsd x, m15}, m4, m2, m0, m1
+ pxor m5, m5
+ psubd m5, m4
+ shufpd m4, m2, 0xaa ; t4 t7
+ shufpd m2, m5, 0xaa ; t5 -t6
+ ITX_MULSUB_2D 4, 2, 3, 5, 6, 13, 1567, 3784
+ punpckhqdq m3, m0, m1
+ punpcklqdq m0, m1
+ psubd m1, m0, m3 ; t2 t3
+ paddd m0, m3 ; out0 -out7
+ punpckhqdq m3, m4, m2 ; t7a t6a
+ punpcklqdq m4, m2 ; t5a t4a
+ psubd m2, m4, m3 ; t7 t6
+ paddd m4, m3 ; out6 -out1
+ REPX {pmaxsd x, m14}, m1, m2
+ REPX {pminsd x, m15}, m1, m2
+ shufpd m3, m1, m2, 0xaa
+ shufpd m1, m2, 0x55
+ pmulld m3, m12
+ pmulld m1, m12
+ paddd m3, m13
+ psubd m2, m3, m1
+ paddd m3, m1
+ psrad m2, 12 ; out4 -out5
+ pshufd m3, m3, q1032
+ psrad m3, 12 ; out2 -out3
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, identity
+INV_TXFM_8X8_FN flipadst, flipadst
+
+cglobal iflipadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x8_internal_10bpc).load
+ vpermi2q m1, m6, m2 ; 7 5
+ vpermi2q m3, m4, m0 ; 3 1
+ vpermt2q m0, m5, m4 ; 0 2
+ vpermt2q m2, m5, m6 ; 4 6
+ call m(iadst_8x8_internal_10bpc).main
+ punpckhdq m1, m3, m4 ; -out3 -out1
+ punpckldq m3, m0 ; out2 out0
+ punpckhdq m0, m2 ; -out7 -out5
+ punpckldq m4, m2 ; out6 out4
+ psubd m1, m11, m1
+ paddd m3, m11
+ psubd m0, m11, m0
+ paddd m4, m11
+ jmp m(iadst_8x8_internal_10bpc).pass1_end
+.pass2:
+ call m(iadst_8x8_internal_10bpc).main_pass2
+ movu m10, [permC+1]
+ vbroadcasti32x8 m12, [pw_m2048_2048+16]
+ lea r6, [strideq*3]
+ vpermt2q m0, m10, m1 ; 7 6 5 4
+ vpbroadcastd m11, [pixel_10bpc_max]
+ vpermt2q m2, m10, m3 ; 3 2 1 0
+ pxor m10, m10
+ pmulhrsw m8, m12, m2
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m8, m12, m0
+ jmp m(idct_8x8_internal_10bpc).write_8x4
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ mova m1, [cq+64*0]
+ packssdw m1, [cq+64*2] ; 0 4 1 5
+ mova m2, [cq+64*1] ; 2 6 3 7
+ packssdw m2, [cq+64*3]
+ mova m0, [o(idtx8x8p)]
+ vpermb m1, m0, m1
+ vpermb m2, m0, m2
+ punpckldq m0, m1, m2 ; 0 1 4 5
+ punpckhdq m1, m2 ; 2 3 6 7
+ jmp tx2q
+.pass2:
+ movu m3, [o(permC+2)]
+ vpbroadcastd m12, [o(pw_4096)]
+ psrlq m2, m3, 32
+ vpermi2q m2, m0, m1
+ vpermt2q m0, m3, m1
+ jmp m(idct_8x8_internal_10bpc).end2
+
+%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 8x16
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity, 35
+INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, adst
+
+cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 43
+ jl .fast
+ call .load
+ call .main
+ call .main_end
+.pass1_end:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ jmp tx2q
+.pass2:
+ mova m8, [o(idct8x16p)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3
+ punpckhdq m5, m0, m1
+ punpckldq m0, m1
+ punpckhdq m4, m2, m3
+ punpckldq m2, m3
+ punpcklqdq m8, m0, m2 ; 15 1
+ punpckhqdq m0, m2 ; 7 9
+ punpckhqdq m1, m5, m4 ; 3 13
+ punpcklqdq m5, m4 ; 11 5
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym7, m8, 1 ; 14 2
+ vextracti32x8 ym3, m0, 1 ; 6 10
+ vextracti32x8 ym6, m1, 1 ; 12 4
+ vextracti32x8 ym9, m5, 1 ; 8 0
+ call m(idct_8x16_internal_8bpc).main2
+ mova m8, [permC]
+ vpbroadcastd m12, [pw_2048]
+ vpermt2q m0, m8, m1
+ lea r6, [strideq*3]
+ vpermt2q m2, m8, m3
+ vpbroadcastd m11, [pixel_10bpc_max]
+ vpermt2q m4, m8, m5
+ pxor m10, m10
+ vpermt2q m6, m8, m7
+ pmulhrsw m8, m12, m0
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m8, m12, m2
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m4
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m6
+ jmp m(idct_8x8_internal_10bpc).write_8x4
+.fast:
+ mova ym0, [cq+64*0]
+ mova ym4, [cq+64*2]
+ mova ym1, [cq+64*1]
+ mova ym5, [cq+64*5]
+ mova ym2, [cq+64*4]
+ mova ym6, [cq+64*6]
+ mova ym3, [cq+64*7]
+ mova ym7, [cq+64*3]
+ call .round_input_fast
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x8_internal_10bpc).main_end
+ movu m6, [o(permC+3)]
+ packssdw m3, m1, m3
+ packssdw m1, m0, m2
+ vprolq m3, 32
+ vpermd m1, m6, m1
+ vpermd m3, m6, m3
+ mova ym0, ym1 ; 0 4
+ vextracti32x8 ym1, m1, 1 ; 1 5
+ mova ym2, ym3 ; 2 6
+ vextracti32x8 ym3, m3, 1 ; 3 7
+ jmp tx2q
+ALIGN function_align
+.round_input_fast:
+ movshdup m8, [o(permB)]
+ vpbroadcastd m12, [o(pd_2896)]
+ vpermt2q m0, m8, m4
+ vpermt2q m1, m8, m5
+ vpermt2q m2, m8, m6
+ vpermt2q m3, m8, m7
+ vpbroadcastd m13, [o(pd_2048)]
+ REPX {pmulld x, m12}, m0, m1, m2, m3
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ REPX {paddd x, m13}, m0, m1, m2, m3
+ vpbroadcastd m11, [o(pd_1)]
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ ret
+ALIGN function_align
+.load:
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+.load2:
+ vpbroadcastd m12, [o(pd_2896)]
+ pmulld m0, m12, [cq+64*0]
+ pmulld m1, m12, [cq+64*1]
+ pmulld m2, m12, [cq+64*2]
+ pmulld m3, m12, [cq+64*3]
+ vpbroadcastd m13, [o(pd_2048)]
+ pmulld m4, m12, [cq+64*4]
+ pmulld m5, m12, [cq+64*5]
+ pmulld m6, m12, [cq+64*6]
+ pmulld m7, m12, [cq+64*7]
+.round:
+ REPX {paddd x, m13}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ REPX {paddd x, m13}, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m4, m5, m6, m7
+ ret
+ALIGN function_align
+.main_fast2_rect2:
+ REPX {paddd x, m13}, m0, m1
+ REPX {psrad x, 12 }, m0, m1
+.main_fast2:
+ pmulld m0, m12
+ pmulld m6, m1, [o(pd_4017)] {1to16} ; t7a
+ pmulld m8, m1, [o(pd_799)] {1to16} ; t4a
+ REPX {paddd x, m13}, m0, m6, m8
+ REPX {psrad x, 12 }, m0, m6, m8
+ pmulld m5, m6, m12
+ pmulld m1, m8, m12
+ paddd m5, m13
+ psubd m4, m5, m1
+ paddd m5, m1
+ REPX {psrad x, 12 }, m4, m5
+ REPX {mova x, m0 }, m1, m2, m3
+ ret
+.main_fast_rect2:
+ REPX {paddd x, m13}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_fast:
+ pmulld m0, m12
+ pmulld m5, m3, [o(pd_2276)] {1to16} ; t5a
+ pmulld m3, [o(pd_3406)] {1to16} ; t6a
+ pmulld m7, m1, [o(pd_4017)] {1to16} ; t7a
+ pmulld m1, [o(pd_799)] {1to16} ; t4a
+ pmulld m6, m2, [o(pd_3784)] {1to16} ; t3
+ pmulld m2, [o(pd_1567)] {1to16} ; t2
+ paddd m0, m13
+ psubd m5, m13, m5
+ psrad m0, 12 ; t0
+ mova m9, m0 ; t1
+ jmp .main2
+.main_rect2:
+ call .round
+.main:
+ pmulld m0, m12
+ ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
+ ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a
+ ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3
+ pmulld m4, m12
+ paddd m0, m13
+ paddd m5, m13
+ psubd m9, m0, m4 ; t1
+ paddd m0, m4 ; t0
+ psrad m9, 12
+ psrad m0, 12
+.main2:
+ REPX {paddd x, m13}, m3, m1, m7
+ REPX {psrad x, 12 }, m5, m1, m3, m7
+ paddd m8, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ psubd m5, m7, m3 ; t6a
+ paddd m7, m3 ; t7
+ pmaxsd m5, m14
+ pmaxsd m1, m14
+ paddd m2, m13
+ paddd m6, m13
+ pminsd m5, m15
+ pminsd m1, m15
+ pmulld m5, m12
+ pmulld m1, m12
+ pmaxsd m8, m14
+ pmaxsd m7, m14
+ pminsd m8, m15
+ paddd m5, m13
+ psubd m4, m5, m1
+ paddd m5, m1
+ REPX {psrad x, 12 }, m2, m6, m5, m4
+ paddd m1, m9, m2 ; dct4 out1
+ psubd m2, m9, m2 ; dct4 out2
+ psubd m3, m0, m6 ; dct4 out3
+ paddd m0, m6 ; dct4 out0
+ pminsd m6, m15, m7
+ REPX {pmaxsd x, m14}, m0, m1, m2, m3
+ REPX {pminsd x, m15}, m0, m1, m2, m3
+ ret
+.main_end:
+ vpbroadcastd m11, [o(pd_1)]
+.main_end2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ psubd m7, m0, m6 ; out7
+ paddd m0, m6 ; out0
+ psubd m6, m1, m5 ; out6
+ paddd m1, m5 ; out1
+ psubd m5, m2, m4 ; out5
+ paddd m2, m4 ; out2
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+ REPX {vpsravd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, identity, 35
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, adst
+
+cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 43
+ jl .fast
+ call m(idct_8x16_internal_10bpc).load
+ call .main
+ psrad m0, 1
+ psrad m1, 1
+ psrad m6, m10, 1
+ psrad m7, m11, 1
+ psrad m2, 12
+ psrad m3, 12
+ psrad m4, m8, 12
+ psrad m5, m9, 12
+ jmp m(idct_8x16_internal_10bpc).pass1_end
+.fast:
+ call .fast_main
+ punpcklqdq m1, m2, m4 ; out4 out6
+ punpckhqdq m2, m0 ; -out5 -out7
+ punpcklqdq m0, m3 ; out0 out2
+ punpckhqdq m4, m3 ; -out1 -out3
+ paddd m1, m11
+ psubd m3, m11, m2
+ paddd m0, m11
+ psubd m4, m11, m4
+.fast_end:
+ movu m5, [o(permC+3)]
+ REPX {psrad x, 1}, m1, m0, m3, m4
+ packssdw m2, m0, m1 ; 0 2 4 6
+ packssdw m3, m4, m3 ; 1 3 5 7
+ vpermd m2, m5, m2
+ vpermd m3, m5, m3
+ mova ym0, ym2
+ vextracti32x8 ym2, m2, 1
+ mova ym1, ym3
+ vextracti32x8 ym3, m3, 1
+ jmp tx2q
+.pass2:
+ call .pass2_main
+ movu m4, [permB+2]
+ vbroadcasti32x8 m12, [pw_2048_m2048+16]
+ psrlq m7, m4, 8
+ vpermi2q m4, m0, m3 ; 0 1 2 3
+ psrlq m5, m7, 24
+ vpermi2q m7, m0, m3 ; 12 13 14 15
+ psrlq m6, m5, 8
+ vpermq m5, m5, m1 ; 4 5 6 7
+ vpermq m6, m6, m2 ; 8 9 10 11
+.pass2_end:
+ vpbroadcastd m11, [pixel_10bpc_max]
+ pxor m10, m10
+ lea r6, [strideq*3]
+ pmulhrsw m8, m12, m4
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m8, m12, m5
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m6
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m7
+ jmp m(idct_8x8_internal_10bpc).write_8x4
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 7, 0, 8, 9, 10, 13, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2D 1, 6, 8, 9, 10, 13, 3920, 1189 ; t7a, t6a
+ ITX_MULSUB_2D 5, 2, 8, 9, 10, 13, 1931, 3612 ; t3a, t2a
+ ITX_MULSUB_2D 3, 4, 8, 9, 10, 13, 3166, 2598 ; t5a, t4a
+ psubd m8, m2, m6 ; t6
+ paddd m2, m6 ; t2
+ psubd m6, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ psubd m4, m5, m1 ; t7
+ paddd m5, m1 ; t3
+ psubd m1, m7, m3 ; t5
+ paddd m7, m3 ; t1
+ REPX {pmaxsd x, m14}, m6, m1, m8, m4, m2, m0, m5, m7
+ REPX {pminsd x, m15}, m6, m1, m8, m4, m2, m0, m5, m7
+ vpbroadcastd m10, [o(pd_1567)]
+ vpbroadcastd m11, [o(pd_3784)]
+ ITX_MULSUB_2D 6, 1, 3, 9, _, 13, 10, 11 ; t5a, t4a
+ ITX_MULSUB_2D 4, 8, 3, 9, _, 13, 11, 10 ; t6a, t7a
+ vpbroadcastd m12, [o(pd_1448)]
+ psubd m9, m6, m8 ; t7
+ paddd m6, m8 ; out6
+ psubd m3, m7, m5 ; t3
+ paddd m7, m5 ; -out7
+ psubd m5, m0, m2 ; t2
+ paddd m0, m2 ; out0
+ psubd m2, m1, m4 ; t6
+ paddd m1, m4 ; -out1
+ REPX {pmaxsd x, m14}, m5, m3, m2, m9
+ REPX {pminsd x, m15}, m5, m3, m2, m9
+ REPX {pmulld x, m12}, m5, m3, m2, m9
+ vpbroadcastd m4, [o(pd_1)]
+ psubd m8, m5, m3 ; (t2 - t3) * 1448
+ paddd m3, m5 ; (t2 + t3) * 1448
+ psubd m5, m2, m9 ; (t6 - t7) * 1448
+ paddd m2, m9 ; (t6 + t7) * 1448
+ vpbroadcastd m9, [o(pd_3072)]
+ paddd m0, m4
+ psubd m1, m4, m1
+ paddd m10, m6, m4
+ psubd m11, m4, m7
+ paddd m2, m9
+ paddd m8, m9
+ vpbroadcastd m9, [o(pd_3071)]
+ psubd m3, m9, m3
+ psubd m9, m5
+ ret
+ALIGN function_align
+.fast_main:
+ mova ym0, [cq+64*0]
+ mova ym4, [cq+64*2]
+ mova ym1, [cq+64*7]
+ mova ym5, [cq+64*5]
+ mova ym2, [cq+64*4]
+ mova ym6, [cq+64*6]
+ mova ym3, [cq+64*3]
+ mova ym7, [cq+64*1]
+ call m(idct_8x16_internal_10bpc).round_input_fast
+ jmp m(iadst_8x8_internal_10bpc).main
+ALIGN function_align
+.pass2_main:
+ mova m8, [o(iadst8x16p)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3
+ vpbroadcastd m10, [o(pw_2896x8)]
+ punpckhdq m5, m0, m1
+ punpckldq m0, m1
+ punpckhdq m1, m2, m3
+ punpckldq m2, m3
+ lea r5, [o_base_8bpc]
+ punpckhqdq m4, m0, m2 ; 12 3 14 1
+ punpcklqdq m0, m2 ; 0 15 2 13
+ punpckhqdq m6, m5, m1 ; 8 7 10 5
+ punpcklqdq m5, m1 ; 4 11 6 9
+ call m(iadst_8x16_internal_8bpc).main2
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m10 ; -out7 out4 out6 -out5
+ pmulhrsw m2, m10 ; out8 -out11 -out9 out10
+ ret
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, identity, 35
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+
+cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 43
+ jl .fast
+ call m(idct_8x16_internal_10bpc).load
+ call m(iadst_8x16_internal_10bpc).main
+ psrad m7, m0, 1
+ psrad m0, m11, 1
+ psrad m6, m1, 1
+ psrad m1, m10, 1
+ psrad m5, m2, 12
+ psrad m2, m9, 12
+ psrad m4, m3, 12
+ psrad m3, m8, 12
+ jmp m(idct_8x16_internal_10bpc).pass1_end
+.fast:
+ call m(iadst_8x16_internal_10bpc).fast_main
+ punpckhqdq m1, m3, m4 ; -out3 -out1
+ punpcklqdq m3, m0 ; out2 out0
+ punpckhqdq m0, m2 ; -out7 -out5
+ punpcklqdq m4, m2 ; out6 out4
+ psubd m1, m11, m1
+ paddd m3, m11
+ psubd m0, m11, m0
+ paddd m4, m11
+ jmp m(iadst_8x16_internal_10bpc).fast_end
+.pass2:
+ call m(iadst_8x16_internal_10bpc).pass2_main
+ movu m7, [permB+2]
+ vbroadcasti32x8 m12, [pw_m2048_2048+16]
+ psrlq m4, m7, 8
+ vpermi2q m7, m3, m0 ; 3 2 1 0
+ psrlq m5, m4, 24
+ vpermi2q m4, m3, m0 ; 15 14 13 12
+ psrlq m6, m5, 8
+ vpermq m5, m5, m2 ; 11 10 9 8
+ vpermq m6, m6, m1 ; 7 6 5 4
+ jmp m(iadst_8x16_internal_10bpc).pass2_end
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x16_internal_10bpc).load2
+ jmp m(idct_8x16_internal_10bpc).pass1_end
+.pass2:
+ vpbroadcastd m8, [o(pw_1697x16)]
+ pmulhrsw m4, m8, m0
+ pmulhrsw m5, m8, m1
+ pmulhrsw m6, m8, m2
+ pmulhrsw m7, m8, m3
+ REPX {paddsw x, x}, m0, m1, m2, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ vpbroadcastd m7, [o(pw_2048)]
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ vpbroadcastd m6, [o(pixel_10bpc_max)]
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m1
+ punpckhdq m4, m1
+ pxor m5, m5
+ punpckhqdq m1, m0, m2 ; 1 5 9 13
+ punpcklqdq m0, m2 ; 0 4 8 12
+ punpcklqdq m2, m3, m4 ; 2 6 10 14
+ punpckhqdq m3, m4 ; 3 7 11 15
+ lea r6, [strideq*3]
+ pmulhrsw m0, m7
+ call .write_8x4_start
+ pmulhrsw m0, m7, m1
+ call .write_8x4
+ pmulhrsw m0, m7, m2
+ call .write_8x4
+ pmulhrsw m0, m7, m3
+.write_8x4:
+ add dstq, strideq
+ add cq, 64*2
+.write_8x4_start:
+ mova xm4, [dstq+strideq*0]
+ vinserti32x4 ym4, [dstq+strideq*4], 1
+ vinserti32x4 m4, [dstq+strideq*8], 2
+ vinserti32x4 m4, [dstq+r6*4 ], 3
+ mova [cq+64*0], m5
+ mova [cq+64*1], m5
+ paddw m4, m0
+ pmaxsw m4, m5
+ pminsw m4, m6
+ mova [dstq+strideq*0], xm4
+ vextracti32x4 [dstq+strideq*4], ym4, 1
+ vextracti32x4 [dstq+strideq*8], m4, 2
+ vextracti32x4 [dstq+r6*4 ], m4, 3
+ ret
+
+%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 16x8
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
+.dconly:
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+.dconly2:
+ vpbroadcastd m2, [o(dconly_10bpc)]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ vpbroadcastw m1, r6d
+ paddsw m1, m2
+.dconly_loop:
+ mova ym0, [dstq+strideq*0]
+ vinserti32x8 m0, [dstq+strideq*1], 1
+ paddsw m0, m1
+ psubusw m0, m2
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity, -21
+INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, adst
+
+cglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m12, [o(pd_2896)]
+ pmulld m4, m12, [cq+64*0] ; 0 1
+ pmulld m9, m12, [cq+64*1] ; 2 3
+ pmulld m8, m12, [cq+64*2] ; 4 5
+ pmulld m7, m12, [cq+64*3] ; 6 7
+ vpbroadcastd m13, [o(pd_2048)]
+ pxor m2, m2
+ mova m15, [o(permB)]
+ REPX {mova [cq+64*x], m2}, 0, 1, 2, 3
+ psrlq m0, m15, 32
+ REPX {paddd x, m13}, m4, m9, m8, m7
+ vpbroadcastd m14, [o(clip_18b_min)]
+ REPX {psrad x, 12 }, m4, m8, m9, m7
+ mova m1, m0
+ vpermi2q m0, m4, m8 ; 0 4
+ cmp eobd, 43
+ jl .fast
+ pmulld m5, m12, [cq+64*4] ; 8 9
+ pmulld m10, m12, [cq+64*5] ; 10 11
+ pmulld m11, m12, [cq+64*6] ; 12 13
+ pmulld m6, m12, [cq+64*7] ; 14 15
+ REPX {mova [cq+64*x], m2}, 4, 5, 6, 7
+ REPX {paddd x, m13}, m5, m10, m11, m6
+ REPX {psrad x, 12 }, m10, m5, m11, m6
+ mova m2, m1
+ vpermi2q m1, m9, m10 ; 2 10
+ mova m3, m2
+ vpermi2q m2, m5, m11 ; 8 12
+ vpermi2q m3, m6, m7 ; 14 6
+ vpermt2q m4, m15, m11 ; 1 13
+ vpermt2q m6, m15, m9 ; 15 3
+ vpermt2q m5, m15, m8 ; 9 5
+ vpermt2q m7, m15, m10 ; 7 11
+ vpbroadcastd m15, [o(clip_18b_max)]
+ call m(idct_8x8_internal_10bpc).main
+ call .main
+ jmp .pass1_end
+.fast:
+ vpermi2q m1, m9, m7 ; 2 6
+ vpermt2q m4, m15, m9 ; 1 3
+ vpermt2q m7, m15, m8 ; 7 5
+ vpbroadcastd m15, [o(clip_18b_max)]
+ call m(idct_8x8_internal_10bpc).main_fast
+ call .main_fast
+.pass1_end:
+ call m(idct_8x16_internal_10bpc).main_end
+ mova m8, [o(permA)]
+ psrlq m9, m8, 8
+.pass1_end2:
+ mova m10, m9
+ mova m11, m8
+ call .transpose_16x8
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(idct_16x8_internal_8bpc).main
+ movshdup m4, [permC]
+ vpbroadcastd m11, [pw_2048]
+ psrlq m5, m4, 8
+.end:
+ vpbroadcastd m13, [pixel_10bpc_max]
+ pxor m12, m12
+ vpermq m8, m4, m0
+ vpermq m9, m5, m1
+ lea r6, [strideq*3]
+ call .write_16x4
+ vpermq m8, m4, m2
+ vpermq m9, m5, m3
+.write_16x4:
+ pmulhrsw m8, m11
+ pmulhrsw m9, m11
+.write_16x4_noround:
+ mova ym10, [dstq+strideq*0]
+ vinserti32x8 m10, [dstq+strideq*1], 1
+ paddw m8, m10
+ mova ym10, [dstq+strideq*2]
+ vinserti32x8 m10, [dstq+r6 ], 1
+ paddw m9, m10
+ pmaxsw m8, m12
+ pmaxsw m9, m12
+ pminsw m8, m13
+ pminsw m9, m13
+ mova [dstq+strideq*0], ym8
+ vextracti32x8 [dstq+strideq*1], m8, 1
+ mova [dstq+strideq*2], ym9
+ vextracti32x8 [dstq+r6 ], m9, 1
+ lea dstq, [dstq+strideq*4]
+ ret
+ALIGN function_align
+.main_fast: ; bottom half is zero
+ vbroadcasti32x4 m6, [o(pd_4076_3920)]
+ vbroadcasti32x4 m3, [o(pd_401_m1189)]
+ vbroadcasti32x4 m5, [o(pd_m2598_1931)]
+ vbroadcasti32x4 m9, [o(pd_3166_3612)]
+ pmulld m6, m4 ; t15a t12a
+ pmulld m4, m3 ; t8a t11a
+ pmulld m5, m7 ; t9a t10a
+ pmulld m7, m9 ; t14a t13a
+ jmp .main2
+.main:
+ ITX_MULSUB_2D 4, 6, 3, 9, 10, _, 401_3920, 4076_1189
+ ITX_MULSUB_2D 5, 7, 3, 9, 10, _, 3166_1931, 2598_3612
+.main2:
+ REPX {paddd x, m13}, m4, m6, m5, m7
+ REPX {psrad x, 12 }, m4, m5, m6, m7
+ paddd m9, m4, m5 ; t8 t11
+ psubd m4, m5 ; t9 t10
+ psubd m5, m6, m7 ; t14 t13
+ paddd m6, m7 ; t15 t12
+ REPX {pmaxsd x, m14}, m5, m4, m9, m6
+ REPX {pminsd x, m15}, m5, m4, m9, m6
+.main3:
+ psubd m3, m0, m1 ; dct8 out7 out6
+ paddd m0, m1 ; dct8 out0 out1
+ vbroadcasti32x4 m7, [o(pd_3784_m3784)]
+ pmulld m7, m5
+ vpmulld m5, [o(pd_1567)] {1to16}
+ paddd m1, m2, m8 ; dct8 out3 out2
+ psubd m2, m8 ; dct8 out4 out5
+ vbroadcasti32x4 m8, [o(pd_1567_m1567)]
+ pmulld m8, m4
+ vpmulld m4, [o(pd_3784)] {1to16}
+ REPX {pmaxsd x, m14}, m0, m1
+ REPX {pminsd x, m15}, m0, m1
+ paddd m7, m13
+ paddd m5, m13
+ paddd m7, m8
+ psubd m5, m4
+ psrad m7, 12 ; t14a t10a
+ psrad m5, 12 ; t9a t13a
+ punpckhqdq m4, m9, m7
+ punpcklqdq m8, m9, m5
+ punpckhqdq m5, m6, m5
+ punpcklqdq m6, m7
+ psubd m7, m8, m4 ; t11a t10
+ paddd m8, m4 ; t8a t9
+ psubd m4, m6, m5 ; t12a t13
+ paddd m6, m5 ; t15a t14
+ REPX {pmaxsd x, m14}, m4, m7
+ REPX {pminsd x, m15}, m4, m7
+ pmulld m4, m12
+ pmulld m7, m12
+ REPX {pmaxsd x, m14}, m2, m3, m6, m8
+ REPX {pminsd x, m15}, m2, m3, m6, m8
+ paddd m4, m13
+ paddd m5, m4, m7
+ psubd m4, m7
+ psrad m4, 12 ; t11 t10a
+ psrad m5, 12 ; t12 t13a
+ ret
+ALIGN function_align
+.transpose_16x8:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpermi2d m8, m0, m2
+ vpermt2d m0, m9, m2
+ vpermi2d m10, m1, m3
+ vpermi2d m11, m1, m3
+ punpckhwd m3, m8, m0
+ punpcklwd m1, m8, m0
+ punpckhwd m4, m10, m11
+ punpcklwd m2, m10, m11
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, identity, -21
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, adst
+
+cglobal iadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ call .main_pass1
+ vpbroadcastd m9, [o(pd_1)]
+ paddd m0, m9
+ psubd m1, m9, m1
+ paddd m2, m9
+ psubd m3, m9, m3
+ paddd m4, m9, m5
+ psubd m5, m9, m6
+ paddd m6, m9, m7
+ psubd m7, m9, m8
+.pass1_end:
+ mova m9, [o(permA)]
+ psrlq m8, m9, 8
+ REPX {psrad x, 1}, m0, m4, m1, m5, m2, m6, m3, m7
+ jmp m(idct_16x8_internal_10bpc).pass1_end2
+.pass2:
+ call .main_pass2
+ vpermq m8, m11, m0
+ vpermq m9, m11, m1
+ call m(idct_16x8_internal_10bpc).write_16x4_noround
+ vpermq m8, m11, m2
+ vpermq m9, m11, m3
+ jmp m(idct_16x8_internal_10bpc).write_16x4_noround
+ALIGN function_align
+.main_pass1:
+ vpbroadcastd m12, [o(pd_2896)]
+ pmulld m2, m12, [cq+64*0]
+ pmulld m7, m12, [cq+64*1]
+ pmulld m1, m12, [cq+64*2]
+ pmulld m5, m12, [cq+64*3]
+ vpbroadcastd m13, [o(pd_2048)]
+ pxor m4, m4
+ mova m10, [o(permB)]
+ REPX {mova [cq+64*x], m4}, 0, 1, 2, 3
+ REPX {paddd x, m13}, m2, m7, m1, m5
+ psrlq m6, m10, 32
+ REPX {psrad x, 12 }, m2, m7, m1, m5
+ mova m0, m6
+ vpermi2q m0, m2, m7 ; 0 2
+ vpermt2q m7, m10, m2 ; 3 1
+ mova m2, m6
+ vpermi2q m2, m1, m5 ; 4 6
+ vpermt2q m5, m10, m1 ; 7 5
+ cmp eobd, 43
+ jl .main_fast
+ pmulld m8, m12, [cq+64*4]
+ pmulld m3, m12, [cq+64*5]
+ pmulld m9, m12, [cq+64*6]
+ pmulld m1, m12, [cq+64*7]
+ REPX {mova [cq+64*x], m4}, 4, 5, 6, 7
+ REPX {paddd x, m13}, m8, m3, m9, m1
+ REPX {psrad x, 12 }, m8, m3, m9, m1
+ mova m4, m6
+ vpermi2q m4, m8, m3 ; 8 10
+ vpermt2q m3, m10, m8 ; 11 9
+ vpermi2q m6, m9, m1 ; 12 14
+ vpermt2q m1, m10, m9 ; 15 13
+.main:
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, _, 201_995, 4091_3973, 1
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, _, 1751_2440, 3703_3290, 1
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, _, 3035_3513, 2751_2106
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, _, 3857_4052, 1380_601
+ jmp .main2
+.main_fast:
+ vbroadcasti32x4 m1, [o(pd_4091_3973)]
+ vbroadcasti32x4 m8, [o(pd_201_995)]
+ vbroadcasti32x4 m3, [o(pd_3703_3290)]
+ vbroadcasti32x4 m9, [o(pd_1751_2440)]
+ vbroadcasti32x4 m4, [o(pd_2751_2106)]
+ vbroadcasti32x4 m10, [o(pd_3035_3513)]
+ vbroadcasti32x4 m6, [o(pd_1380_601)]
+ vbroadcasti32x4 m11, [o(pd_3857_4052)]
+ pmulld m1, m0
+ pmulld m0, m8
+ pmulld m3, m2
+ pmulld m2, m9
+ pmulld m4, m5
+ pmulld m5, m10
+ pmulld m6, m7
+ pmulld m7, m11
+.main2:
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ REPX {psubd x, m13, x}, m1, m3
+ REPX {paddd x, m13 }, m0, m2, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m4, m1, m5, m2, m6, m3, m7
+ psubd m8, m0, m4 ; t8a t10a
+ paddd m0, m4 ; t0a t2a
+ psubd m4, m1, m5 ; t9a t11a
+ paddd m1, m5 ; t1a t3a
+ psubd m5, m2, m6 ; t12a t14a
+ paddd m2, m6 ; t4a t6a
+ psubd m6, m3, m7 ; t13a t15a
+ paddd m3, m7 ; t5a t7a
+ REPX {pmaxsd x, m14}, m8, m4, m5, m6
+ REPX {pminsd x, m15}, m8, m4, m5, m6
+ vbroadcasti32x4 m11, [o(pd_4017_2276)]
+ vbroadcasti32x4 m10, [o(pd_799_3406)]
+ ITX_MULSUB_2D 8, 4, 7, 9, _, 13, 10, 11
+ ITX_MULSUB_2D 6, 5, 7, 9, _, 13, 11, 10
+ REPX {pmaxsd x, m14}, m0, m2, m1, m3
+ REPX {pminsd x, m15}, m0, m2, m1, m3
+ psubd m7, m0, m2 ; t4 t6
+ paddd m0, m2 ; t0 t2
+ psubd m2, m1, m3 ; t5 t7
+ paddd m1, m3 ; t1 t3
+ psubd m3, m4, m6 ; t12a t14a
+ paddd m4, m6 ; t8a t10a
+ psubd m6, m8, m5 ; t13a t15a
+ paddd m8, m5 ; t9a t11a
+ REPX {pmaxsd x, m14}, m7, m3, m2, m6
+ REPX {pminsd x, m15}, m7, m3, m2, m6
+ punpcklqdq m5, m3, m7 ; t12a t4
+ punpckhqdq m3, m7 ; t14a t6
+ punpckhqdq m7, m6, m2 ; t15a t7
+ punpcklqdq m6, m2 ; t13a t5
+ vpbroadcastd m11, [o(pd_1567)]
+ vpbroadcastd m10, [o(pd_3784)]
+ ITX_MULSUB_2D 7, 3, 2, 9, 10, 13, 10, 11
+ ITX_MULSUB_2D 5, 6, 2, 9, 10, 13, 11, 10
+ REPX {pmaxsd x, m14}, m0, m4, m1, m8
+ REPX {pminsd x, m15}, m0, m4, m1, m8
+ punpckhqdq m2, m4, m0 ; t10a t2
+ punpcklqdq m4, m0 ; t8a t0
+ punpckhqdq m0, m8, m1 ; t11a t3
+ punpcklqdq m8, m1 ; t9a t1
+ paddd m1, m6, m7 ; out2 -out3
+ psubd m6, m7 ; t14a t6
+ paddd m7, m5, m3 ; -out13 out12
+ psubd m5, m3 ; t15a t7
+ psubd m3, m8, m0 ; t11 t3a
+ paddd m8, m0 ; out14 -out15
+ paddd m0, m4, m2 ; -out1 out0
+ psubd m4, m2 ; t10 t2a
+ REPX {pmaxsd x, m14}, m6, m5, m3, m4
+ mov r6d, 0x3333
+ REPX {pminsd x, m15}, m6, m5, m3, m4
+ kmovw k1, r6d
+ REPX {pmulld x, m12}, m6, m5, m3, m4
+ pxor m9, m9
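+    ; 0x3333 selects the low two dwords of every 128-bit lane, so the
+    ; merge-masked vpsubd below negates only that half (0 - x) while leaving
+    ; the other half untouched, applying the sign flips the adst output
+    ; pattern requires; the same masked negation is reused after the final
+    ; rounding shift.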
+ REPX {vpsubd x{k1}, m9, x}, m0, m1, m7, m8
+ paddd m6, m13
+ paddd m4, m13
+ paddd m2, m6, m5 ; -out5 out4
+ psubd m6, m5 ; out10 -out11
+ psubd m5, m4, m3 ; -out9 out8
+ paddd m3, m4 ; out6 -out7
+ REPX {psrad x, 12}, m2, m3, m5, m6
+ REPX {vpsubd x{k1}, m9, x}, m2, m3, m5, m6
+ ret
+ALIGN function_align
+.main_pass2:
+ lea r5, [o_base_8bpc]
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_16x8_internal_8bpc).main_pass2
+ movshdup m11, [permC]
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ vpbroadcastd m13, [pixel_10bpc_max]
+ pxor m12, m12
+ lea r6, [strideq*3]
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, identity, -21
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+
+cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(iadst_16x8_internal_10bpc).main_pass1
+ vpbroadcastd m9, [o(pd_1)]
+ psubd m4, m9, m3
+ paddd m3, m9, m5
+ paddd m5, m9, m2
+ psubd m2, m9, m6
+ psubd m6, m9, m1
+ paddd m1, m9, m7
+ paddd m7, m9, m0
+ psubd m0, m9, m8
+ jmp m(iadst_16x8_internal_10bpc).pass1_end
+.pass2:
+ call m(iadst_16x8_internal_10bpc).main_pass2
+ psrlq m11, 8
+ vpermq m8, m11, m3
+ vpermq m9, m11, m2
+ call m(idct_16x8_internal_10bpc).write_16x4_noround
+ vpermq m8, m11, m1
+ vpermq m9, m11, m0
+ jmp m(idct_16x8_internal_10bpc).write_16x4_noround
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x16_internal_10bpc).load2
+ vpbroadcastd m8, [o(pd_5793)]
+ vpbroadcastd m13, [o(pd_3072)]
+ pxor m10, m10
+ REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {mova [cq+64*x], m10}, 0, 1, 2, 3, 4, 5, 6, 7
+ call m(idct_8x16_internal_10bpc).round
+ psrlq m8, [o(permA)], 16
+ psrlq m9, m8, 8
+ mova m10, m8
+ mova m11, m9
+ call m(idct_16x8_internal_10bpc).transpose_16x8
+ jmp tx2q
+.pass2:
+ movshdup m4, [o(permC)]
+ vpbroadcastd m11, [o(pw_4096)]
+ mova m5, m4
+ jmp m(idct_16x8_internal_10bpc).end
+
+%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 16x16
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity, 28
+INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, adst
+
+cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ cmp eobd, 36
+ jl .fast
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 2]
+ mova m2, [cq+64* 4]
+ mova m3, [cq+64* 6]
+ mova m4, [cq+64* 8]
+ mova m5, [cq+64*10]
+ mova m6, [cq+64*12]
+ mova m7, [cq+64*14]
+%if WIN64
+ movaps [cq+16*0], xmm6
+ movaps [cq+16*1], xmm7
+%endif
+ call m(idct_8x16_internal_10bpc).main
+ mova m16, [cq+64* 1]
+ mova m17, [cq+64* 3]
+ mova m18, [cq+64* 5]
+ mova m19, [cq+64* 7]
+ mova m20, [cq+64* 9]
+ mova m21, [cq+64*11]
+ mova m22, [cq+64*13]
+ mova m23, [cq+64*15]
+ call .main
+ call .main_end
+.pass1_end:
+%if WIN64
+ movaps xmm6, [cq+16*0]
+ movaps xmm7, [cq+16*1]
+%endif
+ vzeroupper
+.pass1_end2:
+ call .main_end3
+.pass1_end3:
+ mov r6d, 64*12
+ pxor m8, m8
+.zero_loop:
+ mova [cq+r6+64*3], m8
+ mova [cq+r6+64*2], m8
+ mova [cq+r6+64*1], m8
+ mova [cq+r6+64*0], m8
+ sub r6d, 64*4
+ jge .zero_loop
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(idct_16x16_internal_8bpc).main
+ movshdup m12, [permC]
+ vpbroadcastd m11, [pw_2048]
+ psrlq m13, m12, 8
+ vpermq m8, m12, m0
+ vpermq m0, m13, m7
+ vpermq m7, m13, m1
+ vpermq m1, m12, m6
+ vpermq m6, m12, m2
+ vpermq m2, m13, m5
+ vpermq m5, m13, m3
+ vpermq m3, m12, m4
+.pass2_end:
+ lea r6, [strideq*3]
+ vpbroadcastd m13, [pixel_10bpc_max]
+ pxor m12, m12
+ pmulhrsw m8, m11, m8
+ pmulhrsw m9, m11, m7
+ call m(idct_16x8_internal_10bpc).write_16x4_noround
+ pmulhrsw m8, m11, m6
+ pmulhrsw m9, m11, m5
+ call m(idct_16x8_internal_10bpc).write_16x4_noround
+ pmulhrsw m8, m11, m3
+ pmulhrsw m9, m11, m2
+ call m(idct_16x8_internal_10bpc).write_16x4_noround
+ pmulhrsw m8, m11, m1
+ pmulhrsw m9, m11, m0
+ jmp m(idct_16x8_internal_10bpc).write_16x4_noround
+.fast:
+ mova ym0, [cq+64*0]
+ mova ym2, [cq+64*4]
+ movshdup m8, [o(permB)]
+ mova ym1, [cq+64*2]
+ mova ym3, [cq+64*6]
+ mova ym4, [cq+64*1]
+ mova ym5, [cq+64*3]
+ mova ym6, [cq+64*5]
+ mova ym7, [cq+64*7]
+ vpermt2q m0, m8, m2 ; 0 4
+ vpermt2q m1, m8, m3 ; 2 6
+ vpermt2q m4, m8, m5 ; 1 3
+ vpermt2q m7, m8, m6 ; 7 5
+ call m(idct_8x8_internal_10bpc).main_fast
+ call m(idct_16x8_internal_10bpc).main_fast
+ vpbroadcastd m11, [o(pd_2)]
+ call m(idct_8x16_internal_10bpc).main_end2
+ mova m8, [o(permA)]
+ psrlq m9, m8, 8
+ jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2
+ALIGN function_align
+.main_fast2_rect2:
+ REPX {paddd x, m13}, m16, m17
+ REPX {psrad x, 12 }, m16, m17
+.main_fast2:
+ pmulld m22, m16, [o(pd_4076)] {1to16} ; t15a
+ pmulld m9, m16, [o(pd_401)] {1to16} ; t8a
+ pmulld m18, m17, [o(pd_1189)] {1to16} ; t11a
+ pmulld m17, [o(pd_3920)] {1to16} ; t12a
+ psubd m18, m13, m18
+ REPX {paddd x, m13}, m22, m9, m17
+ REPX {psrad x, 12 }, m18, m22, m9, m17
+
+ mova m20, m9
+ mova m16, m18
+ mova m23, m22
+ mova m19, m17
+ jmp .main3
+.main_fast_rect2:
+ REPX {paddd x, m13}, m16, m17, m18, m19
+ REPX {psrad x, 12 }, m16, m17, m18, m19
+.main_fast:
+ pmulld m23, m16, [o(pd_4076)] {1to16} ; t15a
+ pmulld m16, [o(pd_401)] {1to16} ; t8a
+ pmulld m20, m19, [o(pd_2598)] {1to16} ; t9a
+ pmulld m19, [o(pd_3166)] {1to16} ; t14a
+ pmulld m22, m17, [o(pd_1189)] {1to16} ; t11a
+ pmulld m17, [o(pd_3920)] {1to16} ; t12a
+ pmulld m21, m18, [o(pd_3612)] {1to16} ; t13a
+ pmulld m18, [o(pd_1931)] {1to16} ; t10a
+ psubd m20, m13, m20
+ psubd m22, m13, m22
+ call .round2
+ jmp .main2
+.main_rect2:
+ call .round
+.main:
+ ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3166, 2598 ; t9a, t14a
+ ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3920, 1189 ; t11a, t12a
+ ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1931, 3612 ; t10a, t13a
+ call .round
+.main2:
+ paddd m9, m20, m16 ; t8
+ psubd m20, m16, m20 ; t9
+ psubd m16, m22, m18 ; t10
+ paddd m18, m22 ; t11
+ paddd m22, m23, m19 ; t15
+ psubd m23, m19 ; t14
+ psubd m19, m17, m21 ; t13
+ paddd m17, m21 ; t12
+ REPX {pmaxsd x, m14}, m20, m23, m16, m19
+ REPX {pminsd x, m15}, m20, m23, m16, m19
+ REPX {pmaxsd x, m14}, m9, m18, m22, m17
+ REPX {pminsd x, m15}, m9, m18, m22, m17
+.main3:
+ vpbroadcastd m11, [o(pd_3784)]
+ vpbroadcastd m10, [o(pd_1567)]
+ ITX_MULSUB_2D 23, 20, 21, 7, _, 13, 10, 11
+ ITX_MULSUB_2D 19, 16, 21, 7, _, 13, 10, 11, 2
+ paddd m21, m20, m19 ; t14
+ psubd m20, m19 ; t13
+ psubd m19, m9, m18 ; t11a
+ paddd m9, m18 ; t8a
+ psubd m18, m23, m16 ; t10
+ paddd m16, m23 ; t9
+ psubd m23, m22, m17 ; t12a
+ paddd m22, m17 ; t15a
+ REPX {pmaxsd x, m14}, m20, m23, m18, m19
+ REPX {pminsd x, m15}, m20, m23, m18, m19
+ REPX {pmulld x, m12}, m20, m23, m18, m19
+ psubd m7, m0, m6 ; dct8 out7
+ paddd m0, m6 ; dct8 out0
+ psubd m6, m1, m5 ; dct8 out6
+ paddd m1, m5 ; dct8 out1
+ REPX {pmaxsd x, m14}, m7, m0, m6, m1
+ psubd m5, m2, m4 ; dct8 out5
+ paddd m2, m4 ; dct8 out2
+ REPX {pminsd x, m15}, m7, m0, m6, m1
+ psubd m4, m3, m8 ; dct8 out4
+ paddd m3, m8 ; dct8 out3
+ REPX {pmaxsd x, m14}, m5, m2, m4, m3
+ paddd m20, m13
+ paddd m23, m13
+ REPX {pminsd x, m15}, m5, m2, m4, m3
+ psubd m17, m20, m18 ; t10a
+ paddd m20, m18 ; t13a
+ REPX {pmaxsd x, m14}, m22, m21, m16, m9
+ psubd m18, m23, m19 ; t11
+ paddd m19, m23 ; t12
+ REPX {pminsd x, m15}, m22, m21, m16, m9
+ REPX {psrad x, 12 }, m20, m19, m18, m17
+ ret
+.main_end:
+ vpbroadcastd m11, [o(pd_2)]
+.main_end2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m23, m0, m22 ; out15
+ paddd m0, m22 ; out0
+ psubd m22, m1, m21 ; out14
+ paddd m1, m21 ; out1
+ psubd m21, m2, m20 ; out13
+ paddd m2, m20 ; out2
+ psubd m20, m3, m19 ; out12
+ paddd m3, m19 ; out3
+ psubd m19, m4, m18 ; out11
+ paddd m4, m18 ; out4
+ psubd m18, m5, m17 ; out10
+ paddd m5, m17 ; out5
+ psubd m17, m6, m16 ; out9
+ paddd m6, m16 ; out6
+ psubd m16, m7, m9 ; out8
+ paddd m7, m9 ; out7
+ REPX {vpsravd x, m11}, m0, m16, m1, m17, m2, m18, m3, m19, \
+ m4, m20, m5, m21, m6, m22, m7, m23
+ packssdw m0, m16
+ packssdw m1, m17
+ packssdw m2, m18
+ packssdw m3, m19
+ packssdw m4, m20
+ packssdw m5, m21
+ packssdw m6, m22
+ packssdw m7, m23
+ ret
+.main_end3:
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m4, m5
+ punpcklwd m4, m5
+ punpcklwd m5, m6, m7
+ punpckhwd m6, m7
+ punpckhdq m7, m0, m2
+ punpckldq m0, m2
+ punpckhdq m2, m8, m1
+ punpckldq m8, m1
+ punpckhdq m1, m4, m5
+ punpckldq m4, m5
+ punpckhdq m5, m3, m6
+ punpckldq m3, m6
+ vshufi32x4 m6, m0, m4, q3232
+ vinserti32x8 m0, ym4, 1
+ vinserti32x8 m4, m8, ym3, 1
+ vshufi32x4 m8, m3, q3232
+ vinserti32x8 m3, m7, ym1, 1
+ vshufi32x4 m7, m1, q3232
+ vshufi32x4 m1, m2, m5, q3232
+ vinserti32x8 m2, ym5, 1
+ vshufi32x4 m5, m7, m1, q2020 ; 10 11
+ vshufi32x4 m7, m1, q3131 ; 14 15
+ vshufi32x4 m1, m3, m2, q2020 ; 2 3
+ vshufi32x4 m3, m2, q3131 ; 6 7
+ vshufi32x4 m2, m0, m4, q3131 ; 4 5
+ vshufi32x4 m0, m4, q2020 ; 0 1
+ vshufi32x4 m4, m6, m8, q2020 ; 8 9
+ vshufi32x4 m6, m8, q3131 ; 12 13
+ ret
+ALIGN function_align
+.round:
+ paddd m20, m13
+ paddd m22, m13
+.round2:
+ paddd m16, m13
+ paddd m18, m13
+.round3:
+ REPX {psrad x, 12 }, m16, m18, m20, m22
+ REPX {paddd x, m13}, m17, m19, m21, m23
+ REPX {psrad x, 12 }, m17, m19, m21, m23
+ ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, flipadst
+INV_TXFM_16X16_FN adst, adst
+
+cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 36
+ jl .fast
+ call .main_pass1
+ packssdw m0, m16
+ packssdw m1, m17
+ packssdw m2, m18
+ packssdw m3, m19
+ packssdw m4, m5, m20
+ packssdw m5, m6, m21
+ packssdw m6, m7, m22
+ packssdw m7, m8, m23
+ jmp m(idct_16x16_internal_10bpc).pass1_end
+.fast:
+ call .main_pass1_fast
+ vpbroadcastd m9, [o(pd_2)]
+ paddd m0, m9
+ psubd m1, m9, m1
+ paddd m2, m9
+ psubd m3, m9, m3
+ paddd m4, m9, m5
+ psubd m5, m9, m6
+ paddd m6, m9, m7
+ psubd m7, m9, m8
+.pass1_fast_end:
+ mova m9, [o(permA)]
+ psrlq m8, m9, 8
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
+.pass1_fast_end2:
+ mova m10, m9
+ mova m11, m8
+ call m(idct_16x8_internal_10bpc).transpose_16x8
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ REPX {mova [cq+64*x], ym4}, 0, 1, 2, 3, 4, 5, 6, 7
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(iadst_16x16_internal_8bpc).main_pass2b
+ movshdup m12, [permC]
+ mova m11, [pw_2048_m2048]
+ psrlq m13, m12, 8
+ vpermq m8, m13, m0
+ vpermq m0, m12, m7
+ vpermq m7, m13, m1
+ vpermq m1, m12, m6
+ vpermq m6, m13, m2
+ vpermq m2, m12, m5
+ vpermq m5, m13, m3
+ vpermq m3, m12, m4
+ jmp m(idct_16x16_internal_10bpc).pass2_end
+ALIGN function_align
+.main_pass1:
+ mova m0, [cq+64* 0]
+%if WIN64
+ movaps [cq+16*0], xmm6
+ movaps [cq+16*1], xmm7
+%endif
+ mova m23, [cq+64*15]
+ vpbroadcastd m13, [o(pd_2048)]
+ ITX_MULSUB_2D 23, 0, 8, 9, 10, 13, 201, 4091 ; t1 t0
+ mova m7, [cq+64* 7]
+ mova m16, [cq+64* 8]
+ ITX_MULSUB_2D 7, 16, 8, 9, 10, 13, 3035, 2751 ; t9 t8
+ mova m2, [cq+64* 2]
+ mova m21, [cq+64*13]
+ ITX_MULSUB_2D 21, 2, 8, 9, 10, 13, 995, 3973 ; t3 t2
+ mova m5, [cq+64* 5]
+ mova m18, [cq+64*10]
+ ITX_MULSUB_2D 5, 18, 8, 9, 10, 13, 3513, 2106 ; t11 t10
+ mova m4, [cq+64* 4]
+ mova m19, [cq+64*11]
+ ITX_MULSUB_2D 19, 4, 8, 9, 10, 13, 1751, 3703 ; t5 t4
+ mova m3, [cq+64* 3]
+ mova m20, [cq+64*12]
+ ITX_MULSUB_2D 3, 20, 8, 9, 10, 13, 3857, 1380 ; t13 t12
+ mova m6, [cq+64* 6]
+ mova m17, [cq+64* 9]
+ ITX_MULSUB_2D 17, 6, 8, 9, 10, 13, 2440, 3290 ; t7 t6
+ mova m1, [cq+64* 1]
+ mova m22, [cq+64*14]
+ ITX_MULSUB_2D 1, 22, 8, 9, 10, 13, 4052, 601 ; t15 t14
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ psubd m9, m23, m7 ; t9a
+ paddd m23, m7 ; t1a
+ psubd m7, m2, m18 ; t10a
+ paddd m18, m2 ; t2a
+ REPX {pmaxsd x, m14}, m9, m23, m7, m18
+ psubd m2, m17, m1 ; t15a
+ paddd m17, m1 ; t7a
+ REPX {pminsd x, m15}, m9, m23, m7, m18
+ psubd m1, m21, m5 ; t11a
+ paddd m21, m5 ; t3a
+ REPX {pmaxsd x, m14}, m2, m17, m1, m21
+ psubd m5, m4, m20 ; t12a
+ paddd m4, m20 ; t4a
+ REPX {pminsd x, m15}, m2, m17, m1, m21
+ psubd m20, m19, m3 ; t13a
+ paddd m19, m3 ; t5a
+ REPX {pmaxsd x, m14}, m5, m4, m20, m19
+ psubd m8, m6, m22 ; t14a
+ paddd m6, m22 ; t6a
+ REPX {pminsd x, m15}, m5, m4, m20, m19
+ psubd m22, m0, m16 ; t8a
+ paddd m16, m0 ; t0a
+ REPX {pmaxsd x, m14}, m8, m6, m22, m16
+ vpbroadcastd m11, [o(pd_4017)]
+ vpbroadcastd m10, [o(pd_799)]
+ REPX {pminsd x, m15}, m8, m6, m22, m16
+ ITX_MULSUB_2D 22, 9, 0, 3, _, 13, 10, 11 ; t9 t8
+ ITX_MULSUB_2D 20, 5, 0, 3, _, 13, 11, 10 ; t12 t13
+ vpbroadcastd m11, [o(pd_2276)]
+ vpbroadcastd m10, [o(pd_3406)]
+ ITX_MULSUB_2D 7, 1, 0, 3, _, 13, 10, 11 ; t11 t10
+ ITX_MULSUB_2D 2, 8, 0, 3, _, 13, 11, 10 ; t14 t15
+ paddd m0, m16, m4 ; t0
+ psubd m16, m4 ; t4
+ psubd m3, m23, m19 ; t5
+ paddd m23, m19 ; t1
+ REPX {pmaxsd x, m14}, m0, m16, m3, m23
+ psubd m19, m18, m6 ; t6
+ paddd m18, m6 ; t2
+ REPX {pminsd x, m15}, m0, m16, m3, m23
+ psubd m6, m21, m17 ; t7
+ paddd m21, m17 ; t3
+ REPX {pmaxsd x, m14}, m19, m18, m6, m21
+ paddd m17, m9, m20 ; t8a
+ psubd m9, m20 ; t12a
+ REPX {pminsd x, m15}, m19, m18, m6, m21
+ psubd m20, m22, m5 ; t13a
+ paddd m22, m5 ; t9a
+ REPX {pmaxsd x, m14}, m17, m9, m20, m22
+ psubd m5, m1, m2 ; t14a
+ paddd m1, m2 ; t10a
+ REPX {pminsd x, m15}, m17, m9, m20, m22
+ psubd m2, m7, m8 ; t15a
+ paddd m7, m8 ; t11a
+ REPX {pmaxsd x, m14}, m5, m1, m2, m7
+ vpbroadcastd m11, [o(pd_3784)]
+ vpbroadcastd m10, [o(pd_1567)]
+ REPX {pminsd x, m15}, m5, m1, m2, m7
+ ITX_MULSUB_2D 16, 3, 4, 8, _, 13, 10, 11 ; t5a t4a
+ ITX_MULSUB_2D 6, 19, 4, 8, _, 13, 11, 10 ; t6a t7a
+ ITX_MULSUB_2D 9, 20, 4, 8, _, 13, 10, 11 ; t13 t12
+ ITX_MULSUB_2D 2, 5, 4, 8, _, 13, 11, 10 ; t14 t15
+ psubd m8, m0, m18 ; t2a
+ paddd m0, m18 ; out0
+ psubd m18, m23, m21 ; t3a
+ paddd m23, m21 ; -out15
+ paddd m21, m9, m5 ; -out13
+ psubd m9, m5 ; t15a
+ psubd m5, m3, m6 ; t6
+ paddd m3, m6 ; -out3
+ REPX {pmaxsd x, m14}, m8, m18, m9, m5
+ psubd m6, m20, m2 ; t14a
+ paddd m2, m20 ; out2
+ paddd m20, m16, m19 ; out12
+ psubd m16, m19 ; t7
+ REPX {pminsd x, m15}, m8, m18, m9, m5
+ psubd m19, m22, m7 ; t11
+ paddd m22, m7 ; out14
+ psubd m7, m17, m1 ; t10
+ paddd m1, m17 ; -out1
+ REPX {pmaxsd x, m14}, m6, m16, m19, m7
+ vpbroadcastd m12, [o(pd_1448)]
+ vpbroadcastd m4, [o(pd_2)]
+ vpbroadcastd m10, [o(pd_5120)]
+ vpbroadcastd m11, [o(pd_5119)]
+ REPX {pminsd x, m15}, m6, m16, m19, m7
+ psubd m17, m7, m19 ; -out9
+ paddd m7, m19 ; out6
+ psubd m19, m5, m16 ; -out11
+ paddd m5, m16 ; out4
+ REPX {pmulld x, m12}, m17, m7, m19, m5
+ psubd m16, m8, m18 ; out8
+ paddd m8, m18 ; -out7
+ psubd m18, m6, m9 ; out10
+ paddd m6, m9 ; -out5
+ REPX {pmulld x, m12}, m16, m8, m18, m6
+ REPX {paddd x, m4 }, m0, m2, m20, m22
+ REPX {psubd x, m4, x}, m1, m3, m21, m23
+ REPX {paddd x, m10 }, m7, m5, m16, m18
+ REPX {psubd x, m11, x}, m17, m19, m8, m6
+ REPX {psrad x, 2 }, m20, m22, m0, m2, m21, m23, m1, m3
+ REPX {psrad x, 13}, m17, m19, m5, m7, m16, m18, m6, m8
+ ret
+ALIGN function_align
+.main_pass1_fast:
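+ ; fast first pass (eob < 36): only the top-left 8x8 coefficients are loaded and processed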
+ mova ym0, [cq+64*0]
+ mova ym1, [cq+64*2]
+ movshdup m8, [o(permB)]
+ mova ym6, [cq+64*1]
+ mova ym7, [cq+64*3]
+ mova ym2, [cq+64*4]
+ mova ym3, [cq+64*6]
+ mova ym4, [cq+64*5]
+ mova ym5, [cq+64*7]
+ vpermt2q m0, m8, m1 ; 0 2
+ vpermt2q m7, m8, m6 ; 3 1
+ vpermt2q m2, m8, m3 ; 4 6
+ vpermt2q m5, m8, m4 ; 7 5
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m12, [o(pd_2896)]
+ jmp m(iadst_16x8_internal_10bpc).main_fast
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 36
+ jl .fast
+ call m(iadst_16x16_internal_10bpc).main_pass1
+ packssdw m4, m19, m3
+ packssdw m3, m20, m5
+ packssdw m5, m18, m2
+ packssdw m2, m21, m6
+ packssdw m6, m17, m1
+ packssdw m1, m22, m7
+ packssdw m7, m16, m0
+ packssdw m0, m23, m8
+ jmp m(idct_16x16_internal_10bpc).pass1_end
+.fast:
+ call m(iadst_16x16_internal_10bpc).main_pass1_fast
+ vpbroadcastd m9, [o(pd_2)]
+ psubd m4, m9, m3
+ paddd m3, m9, m5
+ paddd m5, m9, m2
+ psubd m2, m9, m6
+ psubd m6, m9, m1
+ paddd m1, m9, m7
+ paddd m7, m9, m0
+ psubd m0, m9, m8
+ jmp m(iadst_16x16_internal_10bpc).pass1_fast_end
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(iadst_16x16_internal_8bpc).main_pass2b
+ movshdup m12, [permC]
+ movu m11, [pw_m2048_2048]
+ psrlq m13, m12, 8
+ vpermq m8, m13, m7
+ vpermq m7, m13, m6
+ vpermq m6, m13, m5
+ vpermq m5, m13, m4
+ vpermq m3, m12, m3
+ vpermq m2, m12, m2
+ vpermq m1, m12, m1
+ vpermq m0, m12, m0
+ jmp m(idct_16x16_internal_10bpc).pass2_end
+
+INV_TXFM_16X16_FN identity, dct, -92
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m10, [o(pd_5793)]
+ vpbroadcastd m11, [o(pd_5120)]
+ mov r6, cq
+ cmp eobd, 36
+ jl .fast
+ call .pass1_main
+ packssdw m0, m6, m8
+ packssdw m1, m7, m9
+ call .pass1_main
+ packssdw m2, m6, m8
+ packssdw m3, m7, m9
+ call .pass1_main
+ packssdw m4, m6, m8
+ packssdw m5, m7, m9
+ call .pass1_main
+ packssdw m6, m8
+ packssdw m7, m9
+ jmp m(idct_16x16_internal_10bpc).pass1_end2
+.fast:
+ call .pass1_main_fast
+ packssdw m0, m6, m7
+ call .pass1_main_fast
+ packssdw m1, m6, m7
+ call .pass1_main_fast
+ packssdw m2, m6, m7
+ call .pass1_main_fast
+ packssdw m3, m6, m7
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckldq m3, m4, m1
+ punpckhdq m4, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ pxor m7, m7
+ vshufi32x4 m2, m0, m3, q3131
+ vshufi32x4 m0, m3, q2020
+ vshufi32x4 m3, m1, m4, q3131
+ vshufi32x4 m1, m4, q2020
+ REPX {mova x, m7}, m4, m5, m6
+ jmp m(idct_16x16_internal_10bpc).pass1_end3
+.pass2:
+ movshdup m14, [o(permC)]
+ vpbroadcastd m15, [o(pw_1697x16)]
+ lea r6, [strideq*3]
+ vpbroadcastd m11, [o(pw_2048)]
+ pxor m12, m12
+ vpbroadcastd m13, [pixel_10bpc_max]
+ vpermq m8, m14, m0
+ vpermq m9, m14, m1
+ call .pass2_main
+ vpermq m8, m14, m2
+ vpermq m9, m14, m3
+ call .pass2_main
+ vpermq m8, m14, m4
+ vpermq m9, m14, m5
+ call .pass2_main
+ vpermq m8, m14, m6
+ vpermq m9, m14, m7
+.pass2_main:
+ pmulhrsw m0, m15, m8
+ pmulhrsw m1, m15, m9
+ paddsw m8, m8
+ paddsw m9, m9
+ paddsw m8, m0
+ paddsw m9, m1
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+ALIGN function_align
+.pass1_main:
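+ ; identity16 first-pass scaling: multiply four coefficient rows by 5793, add the bias in m11 and shift right by 13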
+ pmulld m6, m10, [r6+64*0]
+ pmulld m7, m10, [r6+64*1]
+ pmulld m8, m10, [r6+64*8]
+ pmulld m9, m10, [r6+64*9]
+ add r6, 64*2
+ REPX {paddd x, m11}, m6, m7, m8, m9
+ REPX {psrad x, 13 }, m6, m8, m7, m9
+ ret
+ALIGN function_align
+.pass1_main_fast:
+ mova ym6, [r6+64* 0]
+ vinserti32x8 m6, [r6+64* 4], 1
+ mova ym7, [r6+64* 8]
+ vinserti32x8 m7, [r6+64*12], 1
+ add r6, 64
+ REPX {pmulld x, m10}, m6, m7
+ REPX {paddd x, m11}, m6, m7
+ REPX {psrad x, 13 }, m6, m7
+ ret
+
+cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 22, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ vpbroadcastd m11, [o(pd_2)]
+ mova m20, [o(idct8x32p)]
+ pxor m21, m21
+ cmp eobd, 43
+ jl .fast
+ call .pass1_main
+ punpcklwd m16, m0, m1
+ punpcklwd m17, m2, m3
+ punpckhwd m18, m0, m1
+ punpckhwd m19, m2, m3
+ cmp eobd, 107
+ jge .full
+ punpckldq m0, m16, m17 ; 0 2
+ punpckhdq m1, m16, m17 ; 4 6
+ punpckldq m2, m18, m19 ; 8 10
+ punpckhdq m3, m18, m19 ; 12 14
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym14, m0, 1
+ vextracti32x8 ym15, m1, 1
+ vextracti32x8 ym16, m2, 1
+ vextracti32x8 ym17, m3, 1
+ call m(idct_8x16_internal_8bpc).main_fast
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
+ jmp .end
+.full:
+ add cq, 64
+ call .pass1_main
+ punpcklwd m5, m0, m1
+ punpcklwd m6, m2, m3
+ punpckhwd m7, m0, m1
+ punpckhwd m8, m2, m3
+ punpckldq m0, m16, m17 ; 0 2
+ punpckhdq m1, m16, m17 ; 4 6
+ punpckldq m2, m18, m19 ; 8 10
+ punpckhdq m3, m18, m19 ; 12 14
+ punpckldq m4, m5, m6 ; 16 18
+ punpckhdq m5, m6 ; 20 22
+ punpckldq m6, m7, m8 ; 24 26
+ punpckhdq m7, m8 ; 28 30
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym14, m0, 1
+ vextracti32x8 ym15, m1, 1
+ vextracti32x8 ym16, m2, 1
+ vextracti32x8 ym17, m3, 1
+ vextracti32x8 ym18, m4, 1
+ vextracti32x8 ym19, m5, 1
+ vextracti32x8 ym20, m6, 1
+ vextracti32x8 ym21, m7, 1
+ call m(idct_8x16_internal_8bpc).main
+ REPX {pshufd x, x, q1032}, ym18, ym19, ym20, ym21
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main
+ jmp .end
+.fast:
+ movshdup m8, [o(permB)]
+ mova ym1, [cq+128*1]
+ mova ym5, [cq+128*5]
+ mova ym7, [cq+128*3]
+ mova ym3, [cq+128*7]
+ mova ym0, [cq+128*0]
+ mova ym4, [cq+128*2]
+ mova ym2, [cq+128*4]
+ mova ym6, [cq+128*6]
+ vpermt2q m1, m8, m5 ; 1 5
+ vpermt2q m3, m8, m7 ; 7 3
+ vpermt2q m0, m8, m4 ; 0 2
+ vpermt2q m2, m8, m6 ; 4 6
+ mova [cq+128*0], ym21
+ REPX {vmovdqa32 [cq+128*x], ym21}, 1, 2, 3, 4, 5, 6, 7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x8_internal_10bpc).main_end
+ packssdw m0, m2
+ packssdw m1, m3
+ vpermb m0, m20, m0
+ vprold m20, 16
+ vpermb m2, m20, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym14, m0, 1
+ vextracti32x8 ym15, m1, 1
+ call m(idct_8x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast2
+.end:
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_end ; performs vzeroupper
+ lea r3, [strideq*2]
+ vpbroadcastd m12, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ pxor m11, m11
+ lea r3, [dstq+r3*8]
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ call .write_8x4x2
+ pmulhrsw m0, m10, m2
+ pmulhrsw m1, m10, m3
+ call .write_8x4x2
+ pmulhrsw m0, m10, m4
+ pmulhrsw m1, m10, m5
+ call .write_8x4x2
+ pmulhrsw m0, m10, m6
+ pmulhrsw m1, m10, m7
+.write_8x4x2:
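+ ; add two groups of 8x4 residuals to dst (the second group in bottom-up order), clamp to the valid 10-bit pixel range and store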
+ mova xm8, [dstq+strideq*0]
+ vinserti32x4 ym8, [dstq+strideq*1], 1
+ vinserti32x4 m8, [dstq+strideq*2], 2
+ vinserti32x4 m8, [dstq+r6 ], 3
+ mova xm9, [r3 +r6 ]
+ vinserti32x4 ym9, [r3 +strideq*2], 1
+ vinserti32x4 m9, [r3 +strideq*1], 2
+ vinserti32x4 m9, [r3 +strideq*0], 3
+ paddw m8, m0
+ paddw m9, m1
+ pmaxsw m8, m11
+ pmaxsw m9, m11
+ pminsw m8, m12
+ pminsw m9, m12
+ mova [dstq+strideq*0], xm8
+ vextracti32x4 [dstq+strideq*1], ym8, 1
+ vextracti32x4 [dstq+strideq*2], m8, 2
+ vextracti32x4 [dstq+r6 ], m8, 3
+ lea dstq, [dstq+strideq*4]
+ vextracti32x4 [r3 +strideq*0], m9, 3
+ vextracti32x4 [r3 +strideq*1], m9, 2
+ vextracti32x4 [r3 +strideq*2], ym9, 1
+ mova [r3 +r6 ], xm9
+ lea r3, [r3+strideq*4]
+ ret
+.dconly:
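+ ; DC-only path: derive a single residual value from the DC coefficient and add it to all 8x32 pixels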
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
+ALIGN function_align
+.pass1_main:
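+ ; load eight coefficient vectors, clear them, run the shared 8x16 IDCT helpers, then pack to words and permute via idct8x32p (m20)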
+ mova m0, [cq+128*0]
+ mova m1, [cq+128*1]
+ mova m2, [cq+128*2]
+ mova m3, [cq+128*3]
+ mova m4, [cq+128*4]
+ mova m5, [cq+128*5]
+ mova m6, [cq+128*6]
+ mova m7, [cq+128*7]
+ REPX {mova [cq+128*x], m21}, 0, 1, 2, 3, 4, 5, 6, 7
+ call m(idct_8x16_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_end2
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ REPX {vpermb x, m20, x}, m0, m1, m2, m3
+ ret
+
+cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 8, 12, dst, stride, c, eob
+ vpbroadcastd m9, [pw_5]
+ lea r4, [strideq*3]
+ pxor m10, m10
+ lea r5, [strideq*5]
+ vpbroadcastd m11, [pixel_10bpc_max]
+ sub eobd, 107
+ lea r6, [strideq+r4*2]
+.loop:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ lea r7, [dstq+strideq*8]
+ REPX {mova [cq+128*x], m10}, 0, 1, 2, 3
+ REPX {paddsw x, m9}, m0, m1, m2, m3
+ REPX {mova [cq+128*x], m10}, 4, 5, 6, 7
+ REPX {psraw x, 3 }, m0, m1, m2, m3
+ add cq, 64
+ mova xm4, [dstq+strideq*0]
+ mova xm5, [dstq+strideq*1]
+ mova xm6, [dstq+strideq*2]
+ mova xm7, [dstq+r4 *1]
+ punpckhwd m8, m0, m1
+ vinserti32x4 ym4, [dstq+strideq*4], 1
+ punpcklwd m0, m1
+ vinserti32x4 ym5, [dstq+r5 *1], 1
+ punpckhwd m1, m2, m3
+ vinserti32x4 ym6, [dstq+r4 *2], 1
+ punpcklwd m2, m3
+ vinserti32x4 ym7, [dstq+r6 *1], 1
+ punpckhwd m3, m0, m8
+ vinserti32x4 m4, [r7 +strideq*0], 2
+ punpcklwd m0, m8
+ vinserti32x4 m5, [r7 +strideq*1], 2
+ punpckhwd m8, m2, m1
+ vinserti32x4 m6, [r7 +strideq*2], 2
+ punpcklwd m2, m1
+ vinserti32x4 m7, [r7 +r4 *1], 2
+ punpckhqdq m1, m0, m2
+ vinserti32x4 m4, [r7 +strideq*4], 3
+ punpcklqdq m0, m2
+ vinserti32x4 m5, [r7 +r5 *1], 3
+ punpcklqdq m2, m3, m8
+ vinserti32x4 m6, [r7 +r4 *2], 3
+ punpckhqdq m3, m8
+ vinserti32x4 m7, [r7 +r6 *1], 3
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ REPX {pmaxsw x, m10}, m0, m1, m2, m3
+ REPX {pminsw x, m11}, m0, m1, m2, m3
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ mova [dstq+strideq*2], xm2
+ mova [dstq+r4 *1], xm3
+ vextracti32x4 [dstq+strideq*4], ym0, 1
+ vextracti32x4 [dstq+r5 *1], ym1, 1
+ vextracti32x4 [dstq+r4 *2], ym2, 1
+ vextracti32x4 [dstq+r6 *1], ym3, 1
+ lea dstq, [r7+strideq*8]
+ vextracti32x4 [r7 +strideq*0], m0, 2
+ vextracti32x4 [r7 +strideq*1], m1, 2
+ vextracti32x4 [r7 +strideq*2], m2, 2
+ vextracti32x4 [r7 +r4 *1], m3, 2
+ vextracti32x4 [r7 +strideq*4], m0, 3
+ vextracti32x4 [r7 +r5 *1], m1, 3
+ vextracti32x4 [r7 +r4 *2], m2, 3
+ vextracti32x4 [r7 +r6 *1], m3, 3
+ add eobd, 0x80000000
+ jnc .loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ mova m11, [o(permB)]
+ mova m0, [cq+64* 0] ; 0 1
+ mova m4, [cq+64* 1] ; 2 3
+ mova m1, [cq+64* 2] ; 4 5
+ mova m8, [cq+64* 3] ; 6 7
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ psrlq m10, m11, 32
+%if WIN64
+ movaps [cq+16*0], xmm6
+ movaps [cq+16*1], xmm7
+%endif
+ mova m16, m11
+ vpermi2q m16, m0, m1 ; 1 5
+ mova m17, m11
+ vpermi2q m17, m8, m4 ; 7 3
+ cmp eobd, 43
+ jl .fast
+ mova m18, [cq+64* 4] ; 8 9
+ mova m20, [cq+64* 5] ; 10 11
+ mova m6, [cq+64* 6] ; 12 13
+ mova m7, [cq+64* 7] ; 14 15
+ vpermt2q m0, m10, m18 ; 0 8
+ vpermt2q m18, m11, m6 ; 9 13
+ mova m19, m11
+ vpermi2q m19, m7, m20 ; 15 11
+ cmp eobd, 107
+ jge .full
+ vpermt2q m1, m10, m6 ; 4 12
+ vpermt2q m4, m10, m8 ; 2 6
+ vpermt2q m7, m10, m20 ; 14 10
+ mov r6d, 64*1
+ call m(idct_8x8_internal_10bpc).main_fast
+ call m(idct_16x8_internal_10bpc).main_fast
+ call .main_fast
+ call m(idct_16x16_internal_10bpc).main_end
+ jmp .end
+.full:
+ mova m2, [cq+64* 8] ; 16 17
+ mova m5, [cq+64* 9] ; 18 19
+ mova m9, [cq+64*10] ; 20 21
+ mova m21, [cq+64*11] ; 22 23
+ vpermt2q m1, m10, m9 ; 4 20
+ vpermt2q m7, m10, m21 ; 14 22
+ vpermt2q m21, m11, m5 ; 23 19
+ vpermt2q m5, m10, m20 ; 18 10
+ mova m20, m11
+ vpermi2q m20, m2, m9 ; 17 21
+ mova m22, [cq+64*12] ; 24 25
+ mova m9, [cq+64*13] ; 26 27
+ mova m3, [cq+64*14] ; 28 29
+ mova m23, [cq+64*15] ; 30 31
+ vpermt2q m2, m10, m22 ; 16 24
+ vpermt2q m22, m11, m3 ; 25 29
+ vpermt2q m3, m10, m6 ; 28 12
+ vpermt2q m4, m10, m9 ; 2 26
+ mova m6, m10
+ vpermi2q m6, m23, m8 ; 30 6
+ vpermt2q m23, m11, m9 ; 31 27
+ mov r6d, 64*3
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_16x8_internal_10bpc).main
+ call .main
+ call m(idct_16x16_internal_10bpc).main_end
+ jmp .end
+.fast:
+ vpermq m0, m10, m0 ; 0 0
+ vpermq m1, m10, m1 ; 4 4
+ vpermt2q m4, m10, m8 ; 2 6
+ xor r6d, r6d
+ call .main_fast2
+ call m(idct_16x16_internal_10bpc).main_end
+.end:
+%if WIN64
+ movaps xmm6, [cq+16*0]
+ movaps xmm7, [cq+16*1]
+%endif
+ vzeroupper
+ call .transpose_8x32
+ pxor m14, m14
+.zero_loop:
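+ ; clear the consumed coefficient buffer before the second pass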
+ mova [cq+r6*4+64*3], m14
+ mova [cq+r6*4+64*2], m14
+ mova [cq+r6*4+64*1], m14
+ mova [cq+r6*4+64*0], m14
+ sub r6d, 64
+ jge .zero_loop
+ lea r5, [o_base_8bpc]
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m8
+ punpcklqdq m6, m8
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ pxor m12, m12
+.write_32x8_start:
+ vpbroadcastd m11, [pw_2048]
+ vpbroadcastd m13, [pixel_10bpc_max]
+ lea r3, [strideq*3]
+.write_32x8:
+ pmulhrsw m0, m11
+ pmulhrsw m1, m11
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+ call .write_32x4
+ pmulhrsw m0, m11, m4
+ pmulhrsw m1, m11, m5
+ pmulhrsw m2, m11, m6
+ pmulhrsw m3, m11, m7
+.write_32x4:
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r3 ]
+ REPX {pmaxsw x, m12}, m0, m1, m2, m3
+ REPX {pminsw x, m13}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r3 ], m3
+ lea dstq, [dstq+strideq*4]
+ ret
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 8
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
+ALIGN function_align
+.main_fast3:
+ ; assuming m0=in0 in0, m4=in2 in2, and m16=in1 in3
+ vbroadcasti32x4 m5, [o(pd_401_4076)]
+ pmulld m3, m0, m12
+ pmulld m4, m5
+ REPX {paddd x, m13}, m3, m4
+ REPX {psrad x, 12 }, m3, m4 ; m3=idct8:t0-7, m4=t8a t15a
+
+ ; t8a t15a -> t8/9 t14/15
+
+ vbroadcasti32x4 m5, [o(pd_3784_m3784)]
+ pshufd m7, m4, q1032
+ pmulld m6, m4, [o(pd_1567)]{bcstd}
+ pmulld m5, m7
+ paddd m6, m13
+ paddd m5, m6
+ psrad m5, 12 ; m5=t9a t14a
+
+ ; t14a t9a -> t13/14 t9/10 [m5] & t8 t15 -> t8/11a t12/15a [m4]
+
+ shufps m6, m4, m5, q1032 ; t12 t13
+ shufps m8, m4, m5, q3210 ; t11a t10
+ pmulld m9, m6, m12
+ pmulld m7, m8, m12
+ paddd m9, m13
+ paddd m5, m9, m7 ; t12 t13a
+ psubd m4, m9, m7 ; t11 t10a
+ REPX {psrad x, 12 }, m5, m4
+
+ psubd m7, m3, m6 ; dct16 out15 out14
+ paddd m0, m3, m6 ; dct16 out0 out1
+ psubd m6, m3, m5 ; dct16 out12 out13
+ paddd m1, m3, m5 ; dct16 out3 out2
+ psubd m5, m3, m4 ; dct16 out11 out10
+ paddd m2, m3, m4 ; dct16 out4 out5
+ psubd m4, m3, m8 ; dct16 out8 out9
+ paddd m3, m8 ; dct16 out7 out6
+ REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
+ REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
+
+ ; idct32_bottomhalf
+ vbroadcasti32x4 m18, [o(pd_201_m601)]
+ vbroadcasti32x4 m19, [o(pd_4091_4052)]
+ pmulld m17, m16, m19
+ pmulld m16, m18
+ REPX {paddd x, m13}, m17, m16
+ REPX {psrad x, 12 }, m17, m16
+
+ ; m17: t31a t24a -> t30/31 t24/25, m16: t16a t23a -> t16/17 t22/23 [step2]
+
+ vbroadcasti32x4 m10, [o(pd_799_m2276)]
+ vbroadcasti32x4 m11, [o(pd_4017_3406)]
+ pmulld m18, m17, m10
+ pmulld m19, m17, m11
+ pmulld m8, m16, m11
+ pmulld m9, m16, m10
+ REPX {paddd x, m13}, m18, m19
+ psubd m18, m8
+ paddd m19, m9
+ REPX {psrad x, 12 }, m18, m19
+
+ ; m17=t31 t24 -> t28/31a t24/27a, m16=t16 t23 -> t16/19a t20/23a
+ ; m18=t17a t22a -> t17/18 t21/22, m19=t30a t25a -> t29/30 t25/26
+
+ punpckhqdq m23, m17, m19 ; t24a t25 [or t27a t26]
+ punpcklqdq m20, m16, m18 ; t16a t17 [or t19a t18]
+ punpckhqdq m22, m16, m18 ; t23a t22 [or t20a t21]
+ punpcklqdq m16, m17, m19 ; t28a t29 [or t31a t30]
+ mova m21, m23
+ mova m18, m20
+ mova m17, m22
+ mova m19, m16
+
+ jmp .main4
+.main_fast2: ; bottom three-quarters are zero
+ vbroadcasti32x4 m8, [o(pd_799_4017)]
+ pmulld m8, m1 ; t4 t7
+ vpmulld m0, [o(pd_2896)] {1to16} ; t0 t1
+ REPX {paddd x, m13}, m8, m0
+ REPX {psrad x, 12 }, m8, m0
+ pmulld m3, m8, m12
+ mova m2, m0 ; t3 t2
+ call m(idct_8x8_internal_10bpc).main3
+ vbroadcasti32x4 m6, [o(pd_4076_3920)]
+ vbroadcasti32x4 m3, [o(pd_401_m1189)]
+ pmulld m6, m4 ; t15 t12
+ pmulld m4, m3 ; t9 t10
+ REPX {paddd x, m13}, m6, m4
+ REPX {psrad x, 12 }, m6, m4
+ mova m5, m6 ; t14 t13
+ mova m9, m4 ; t8 t11
+ call m(idct_16x8_internal_10bpc).main3
+ vbroadcasti32x4 m23, [o(pd_4091_3973)]
+ vbroadcasti32x4 m7, [o(pd_201_995)]
+ vbroadcasti32x4 m22, [o(pd_1380_601)]
+ vbroadcasti32x4 m9, [o(pd_3857_4052)]
+ pmulld m23, m16 ; t16 t20
+ pmulld m16, m7 ; t31 t27
+ pmulld m22, m17 ; -t19 -t25
+ pmulld m17, m9 ; t28 t24
+ REPX {paddd x, m13}, m23, m16, m17
+ psubd m22, m13, m22
+ REPX {psrad x, 12 }, m23, m16, m22, m17
+ mova m20, m23 ; t30 t26
+ mova m9, m16 ; t17 t21
+ mova m19, m22 ; t18 t22
+ mova m18, m17 ; t29 t25
+ jmp .main3
+.main_fast: ; bottom half is zero
+ vbroadcasti32x4 m23, [o(pd_4091_3973)]
+ vbroadcasti32x4 m7, [o(pd_201_995)]
+ vbroadcasti32x4 m20, [o(pd_2751_2106)]
+ vbroadcasti32x4 m9, [o(pd_3035_3513)]
+ vbroadcasti32x4 m21, [o(pd_3703_3290)]
+ vbroadcasti32x4 m10, [o(pd_1751_2440)]
+ vbroadcasti32x4 m22, [o(pd_1380_601)]
+ vbroadcasti32x4 m11, [o(pd_3857_4052)]
+ pmulld m23, m16 ; t16a t20a
+ pmulld m16, m7 ; t31a t27a
+ pmulld m20, m19 ; -t17a -t21a
+ pmulld m19, m9 ; t30a t26a
+ pmulld m21, m18 ; t18a t22a
+ pmulld m18, m10 ; t29a t25a
+ pmulld m22, m17 ; -t19a -t25a
+ pmulld m17, m11 ; t28a t24a
+ psubd m20, m13, m20
+ psubd m22, m13, m22
+ jmp .main2
+.main:
+ ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 201_995, 4091_3973
+ ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3035_3513, 2751_2106
+ ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1751_2440, 3703_3290
+ ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3857_4052, 1380_601
+ paddd m20, m13
+ paddd m22, m13
+.main2:
+ REPX {paddd x, m13}, m16, m23, m19
+ REPX {psrad x, 12 }, m16, m20, m23, m19
+ psubd m9, m16, m20 ; t17 t21
+ paddd m16, m20 ; t16 t20
+ psubd m20, m23, m19 ; t30 t26
+ paddd m23, m19 ; t31 t27
+ REPX {pmaxsd x, m14}, m9, m16, m20, m23
+ REPX {paddd x, m13}, m21, m18, m17
+ REPX {psrad x, 12 }, m18, m22, m21, m17
+ psubd m19, m22, m18 ; t18 t22
+ paddd m22, m18 ; t19 t23
+ psubd m18, m17, m21 ; t29 t25
+ paddd m17, m21 ; t28 t24
+ REPX {pmaxsd x, m14}, m19, m22, m18, m17
+ REPX {pminsd x, m15}, m20, m9, m18, m19, m16, m23, m22, m17
+.main3:
+ vbroadcasti32x4 m11, [o(pd_4017_2276)]
+ vbroadcasti32x4 m10, [o(pd_799_3406)]
+ psubd m7, m0, m6 ; dct16 out15 out14
+ paddd m0, m6 ; dct16 out0 out1
+ psubd m6, m1, m5 ; dct16 out12 out13
+ paddd m1, m5 ; dct16 out3 out2
+ psubd m5, m2, m4 ; dct16 out11 out10
+ paddd m2, m4 ; dct16 out4 out5
+ psubd m4, m3, m8 ; dct16 out8 out9
+ paddd m3, m8 ; dct16 out7 out6
+ ITX_MULSUB_2D 20, 9, 8, 21, _, 13, 10, 11
+ ITX_MULSUB_2D 18, 19, 8, 21, _, 13, 10, 11, 2
+ REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
+ punpckhqdq m21, m16, m20 ; t20 t21a
+ punpcklqdq m16, m20 ; t16 t17a
+ punpcklqdq m20, m22, m19 ; t19 t18a
+ punpckhqdq m22, m19 ; t23 t22a
+ REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
+ punpcklqdq m19, m23, m9 ; t31 t30a
+ punpckhqdq m23, m9 ; t27 t26a
+ punpckhqdq m9, m17, m18 ; t24 t25a
+ punpcklqdq m17, m18 ; t28 t29a
+ psubd m18, m16, m20 ; t19a t18
+ paddd m20, m16 ; t16a t17
+ psubd m16, m19, m17 ; t28a t29
+ paddd m19, m17 ; t31a t30
+ psubd m17, m22, m21 ; t20a t21
+ paddd m22, m21 ; t23a t22
+ psubd m21, m9, m23 ; t27a t26
+ paddd m23, m9 ; t24a t25
+ REPX {pmaxsd x, m14}, m18, m16, m17, m21
+ REPX {pminsd x, m15}, m16, m18, m21, m17
+ REPX {pmaxsd x, m14}, m20, m22, m19, m23
+ REPX {pminsd x, m15}, m20, m22, m19, m23
+.main4:
+ vpbroadcastd m11, [o(pd_3784)]
+ vpbroadcastd m10, [o(pd_1567)]
+ ITX_MULSUB_2D 16, 18, 8, 9, _, 13, 10, 11
+ ITX_MULSUB_2D 21, 17, 8, 9, _, 13, 10, 11, 2
+ paddd m9, m20, m22 ; t16 t17a
+ psubd m20, m22 ; t23 t22a
+ paddd m22, m19, m23 ; t31 t30a
+ psubd m19, m23 ; t24 t25a
+ psubd m23, m16, m17 ; t20a t21
+ paddd m16, m17 ; t19a t18
+ psubd m17, m18, m21 ; t27a t26
+ paddd m21, m18 ; t28a t29
+ REPX {pmaxsd x, m14}, m20, m19, m23, m17
+ REPX {pminsd x, m15}, m19, m20, m17, m23
+ REPX {pmulld x, m12}, m19, m20, m17, m23
+ REPX {pmaxsd x, m14}, m22, m21, m16, m9
+ paddd m19, m13
+ paddd m17, m13
+ REPX {pminsd x, m15}, m22, m21, m16, m9
+ psubd m18, m19, m20 ; t23a t22
+ paddd m19, m20 ; t24a t25
+ paddd m20, m17, m23 ; t27 t26a
+ psubd m17, m23 ; t20 t21a
+ REPX {psrad x, 12 }, m20, m19, m18, m17
+ ret
+.transpose_8x32:
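+ ; word-granularity transpose of the packed 8x32 results using the idct32x8p permute table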
+ mova m10, [o(idct32x8p)]
+ psrlw m8, m10, 8
+ mova m9, m8
+ vpermi2w m8, m1, m5
+ vpermt2w m1, m10, m5
+ vprold m5, m9, 16
+ vpermi2w m9, m3, m7
+ vpermt2w m3, m10, m7
+ vprold m10, 16
+ mova m7, m5
+ vpermi2w m5, m0, m4
+ vpermt2w m0, m10, m4
+ vpermi2w m7, m2, m6
+ vpermt2w m2, m10, m6
+ punpckhdq m6, m5, m8
+ punpckldq m5, m8
+ punpckhdq m8, m7, m9
+ punpckldq m7, m9
+ punpckhdq m4, m2, m3
+ punpckldq m2, m3
+ punpckhdq m3, m0, m1
+ punpckldq m0, m1
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 10, dst, stride, c, eob
+ vpbroadcastd m5, [pw_4096]
+ lea r4, [strideq*3]
+ mova m6, [idtx32x8p]
+ lea r5, [strideq*5]
+ vpbroadcastd m9, [pixel_10bpc_max]
+ lea r6, [strideq+r4*2]
+ pxor m8, m8
+ sub eobd, 107
+ psrlw m7, m6, 8
+.loop:
+ mova m0, [cq+64*0]
+ packssdw m0, [cq+64*1] ; 02 13
+ mova m1, [cq+64*2]
+ packssdw m1, [cq+64*3] ; 46 57
+ mova m2, [cq+64*4]
+ packssdw m2, [cq+64*5] ; 8a 9b
+ mova m3, [cq+64*6]
+ packssdw m3, [cq+64*7] ; ce df
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ REPX {mova [cq+64*x], m8}, 0, 1, 2, 3
+ mova m4, m6
+ vpermi2w m4, m1, m3
+ vpermt2w m1, m7, m3
+ REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
+ mova m3, m7
+ vpermi2w m3, m0, m2
+ vpermt2w m0, m6, m2
+ add cq, 64*8
+ punpcklqdq m2, m3, m1 ; 4 5
+ punpckhqdq m3, m1 ; 6 7
+ punpckhqdq m1, m0, m4 ; 2 3
+ punpcklqdq m0, m4 ; 0 1
+ mova ym4, [dstq+strideq*0]
+ vinserti32x8 m4, [dstq+strideq*1], 1
+ paddw m0, m4
+ mova ym4, [dstq+strideq*2]
+ vinserti32x8 m4, [dstq+r4 *1], 1
+ paddw m1, m4
+ mova ym4, [dstq+strideq*4]
+ vinserti32x8 m4, [dstq+r5 *1], 1
+ paddw m2, m4
+ mova ym4, [dstq+r4 *2]
+ vinserti32x8 m4, [dstq+r6 *1], 1
+ paddw m3, m4
+ REPX {pmaxsw x, m8}, m0, m1, m2, m3
+ REPX {pminsw x, m9}, m0, m1, m2, m3
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+r4 *1], m1, 1
+ mova [dstq+strideq*4], ym2
+ vextracti32x8 [dstq+r5 *1], m2, 1
+ mova [dstq+r4 *2], ym3
+ vextracti32x8 [dstq+r6 *1], m3, 1
+ add dstq, 32
+ add eobd, 0x80000000
+ jnc .loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+%if WIN64
+ movaps [rsp+ 8], xmm6
+ movaps [rsp+24], xmm7
+%endif
+ cmp eobd, 36
+ jl .fast
+ call .pass1
+ cmp eobd, 151
+ jge .full
+ lea r5, [o_base_8bpc]
+ pxor m9, m9
+ punpcklwd m8, m1, m1 ; 2
+ punpckhwd m14, m1, m1 ; 3
+ punpcklwd m1, m3, m3 ; 6
+ punpckhwd m15, m3, m3 ; 7
+ punpcklwd m3, m6, m6 ; 12
+ punpckhwd m19, m6, m6 ; 13
+ punpcklwd m6, m9, m4 ; __ 8
+ punpckhwd m20, m4, m4 ; 9
+ punpckhwd m16, m5, m5 ; 11
+ punpcklwd m5, m5 ; 10
+ punpcklwd m9, m0 ; __ 0
+ punpckhwd m21, m0, m0 ; 1
+ punpcklwd m0, m7, m7 ; 14
+ punpckhwd m17, m7, m7 ; 15
+ punpcklwd m7, m2, m2 ; 4
+ punpckhwd m18, m2, m2 ; 5
+ call m(idct_16x16_internal_8bpc).main_fast
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mov r6d, 64*3
+ pxor m8, m8
+.zero_loop:
+ REPX {mova [cq+r6*8+128*x], m8}, 3, 2, 1, 0
+ sub r6d, 64
+ jge .zero_loop
+ jmp .pass2_end
+.full:
+ mova [cq+128*0], m0
+ mova [cq+128*1], m1
+ mova [cq+128*2], m2
+ mova [cq+128*3], m3
+ mova [cq+128*4], m4
+ mova [cq+128*5], m5
+ mova [cq+128*6], m6
+ mova [cq+128*7], m7
+ add cq, 64
+ call .pass1
+ mova m9, [cq-64* 1] ; 0 1
+ mova m14, [cq+64* 1] ; 2 3
+ mova m18, [cq+64* 3] ; 4 5
+ mova m15, [cq+64* 5] ; 6 7
+ mova m20, [cq+64* 7] ; 8 9
+ mova m16, [cq+64* 9] ; 10 11
+ mova m22, [cq+64*11] ; 12 13
+ mova m19, [cq+64*13] ; 14 15
+ lea r5, [o_base_8bpc]
+ punpcklwd m8, m7, m14 ; 30 2
+ punpckhwd m21, m7, m9 ; 31 1
+ punpcklwd m7, m6, m18 ; 28 4
+ punpckhwd m14, m6 ; 3 29
+ punpcklwd m9, m0, m9 ; 16 0
+ punpckhwd m17, m19, m0 ; 15 17
+ punpcklwd m0, m19, m1 ; 14 18
+ punpckhwd m19, m1, m22 ; 19 13
+ punpcklwd m1, m15, m5 ; 6 26
+ punpckhwd m18, m5, m18 ; 27 5
+ punpcklwd m6, m4, m20 ; 24 8
+ punpckhwd m15, m4 ; 7 25
+ punpcklwd m5, m3, m16 ; 22 10
+ punpckhwd m20, m3, m20 ; 23 9
+ punpcklwd m3, m22, m2 ; 12 20
+ punpckhwd m16, m2 ; 11 21
+ call m(idct_16x16_internal_8bpc).main2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ mov r6d, 32*7
+ pxor m8, m8
+.full_zero_loop:
+ REPX {mova [cq+r6*8+64*x], m8}, 2, 1, 0, -1
+ sub r6d, 32
+ jge .full_zero_loop
+ jmp .pass2_end
+.fast:
+ mova ym0, [cq+128*0]
+ mova ym2, [cq+128*4]
+ movshdup m8, [o(permB)]
+ mova ym1, [cq+128*2]
+ mova ym3, [cq+128*6]
+ mova ym4, [cq+128*1]
+ mova ym5, [cq+128*3]
+ mova ym6, [cq+128*5]
+ mova ym7, [cq+128*7]
+ vpermt2q m0, m8, m2 ; 0 4
+ vpermt2q m1, m8, m3 ; 2 6
+ vpermt2q m4, m8, m5 ; 1 3
+ vpermt2q m7, m8, m6 ; 7 5
+ REPX {pmulld x, m12}, m0, m1, m4, m7
+ pxor ym16, ym16
+ mova [cq+128*0], ym16
+ REPX {vmovdqa32 [cq+128*x], ym16}, 1, 2, 3, 4, 5, 6, 7
+ REPX {paddd x, m13}, m0, m1, m4, m7
+ REPX {psrad x, 12 }, m0, m1, m4, m7
+ call m(idct_8x8_internal_10bpc).main_fast
+ call m(idct_16x8_internal_10bpc).main_fast
+ vpbroadcastd m11, [o(pd_1)]
+ call m(idct_8x16_internal_10bpc).main_end2
+ mova m8, [o(idct8x32p)]
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ mova m6, [dup16_perm]
+ vpermb m0, m8, m0
+ vpermb m2, m8, m2
+ vprold m8, 16
+ vpermb m1, m8, m1
+ vpermb m3, m8, m3
+ punpckldq m4, m0, m2
+ punpckhdq m0, m2
+ punpckldq m2, m1, m3
+ punpckhdq m1, m3
+ punpckldq m21, m4, m2
+ punpckhdq m14, m4, m2
+ punpckldq m18, m0, m1
+ punpckhdq m15, m0, m1
+ vpermb m8, m6, m14 ; 2
+ vpermb m1, m6, m15 ; 6
+ vpermb m7, m6, m18 ; 4
+ pmovzxwd m9, ym21 ; 0
+ vpord m6, [o(pb_32)] {1to16}
+ lea r5, [o_base_8bpc]
+ vpermb m21, m6, m21 ; 1
+ vpermb m15, m6, m15 ; 7
+ vpermb m18, m6, m18 ; 5
+ vpermb m14, m6, m14 ; 3
+ pslld m9, 16
+ call m(idct_16x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+.pass2_end:
+ movshdup m22, [permC]
+ vpbroadcastd m11, [pw_2048]
+ vpbroadcastd m13, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ pxor m12, m12
+ psrlq m23, m22, 8
+ vpermq m8, m22, m0
+ vpermq m9, m23, m1
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m2
+ vpermq m9, m23, m3
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m4
+ vpermq m9, m23, m5
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m6
+ vpermq m9, m23, m7
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m14
+ vpermq m9, m23, m15
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m16
+ vpermq m9, m23, m17
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m18
+ vpermq m9, m23, m19
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m20
+ vpermq m9, m23, m21
+%if WIN64
+ movaps xmm6, [rsp+ 8]
+ movaps xmm7, [rsp+24]
+%endif
+ vzeroupper
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+.pass1:
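+ ; first pass: load the 16 coefficient vectors pre-scaled by 2896 (rect2 scaling) and run the shared 16-point IDCT helpers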
+ pmulld m0, m12, [cq+128* 0]
+ pmulld m1, m12, [cq+128* 2]
+ pmulld m2, m12, [cq+128* 4]
+ pmulld m3, m12, [cq+128* 6]
+ pmulld m4, m12, [cq+128* 8]
+ pmulld m5, m12, [cq+128*10]
+ pmulld m6, m12, [cq+128*12]
+ pmulld m7, m12, [cq+128*14]
+ call m(idct_8x16_internal_10bpc).main_rect2
+ pmulld m16, m12, [cq+128* 1]
+ pmulld m17, m12, [cq+128* 3]
+ pmulld m18, m12, [cq+128* 5]
+ pmulld m19, m12, [cq+128* 7]
+ pmulld m20, m12, [cq+128* 9]
+ pmulld m21, m12, [cq+128*11]
+ pmulld m22, m12, [cq+128*13]
+ pmulld m23, m12, [cq+128*15]
+ call m(idct_16x16_internal_10bpc).main_rect2
+ vpbroadcastd m11, [o(pd_1)]
+ call m(idct_16x16_internal_10bpc).main_end2
+ jmp m(idct_16x16_internal_10bpc).main_end3
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly
+
+cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 16, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m10, [pw_2896x8]
+ vpbroadcastd m11, [pw_1697x16]
+ vpbroadcastd m13, [pw_8192]
+ vpbroadcastd m15, [pixel_10bpc_max]
+ lea r6, [strideq*9]
+ pxor m14, m14
+ paddw m12, m13, m13 ; pw_16384
+ cmp eobd, 151
+ jl .main
+ call .main
+ add cq, 64-128*4
+ lea dstq, [dstq+strideq*8]
+.main:
+ call .main_internal
+ add cq, 128*4
+ pmulhrsw m1, m13, m2
+ pmulhrsw m3, m13, m4
+ pmulhrsw m5, m13, m6
+ pmulhrsw m7, m13, m8
+ call .main_internal
+.main2:
+ pmulhrsw m2, m13
+ pmulhrsw m4, m13
+ pmulhrsw m6, m13
+ pmulhrsw m8, m13
+ punpcklqdq m0, m1, m2 ; 0 8
+ punpckhqdq m1, m2 ; 1 9
+ call .write_16x2x2
+ punpcklqdq m0, m3, m4 ; 2 10
+ punpckhqdq m1, m3, m4 ; 3 11
+ call .write_16x2x2
+ punpcklqdq m0, m5, m6 ; 4 12
+ punpckhqdq m1, m5, m6 ; 5 13
+ call .write_16x2x2
+ punpcklqdq m0, m7, m8 ; 6 14
+ punpckhqdq m1, m7, m8 ; 7 15
+.write_16x2x2:
+ mova ym2, [dstq+strideq*0]
+ vinserti32x8 m2, [dstq+strideq*8], 1
+ mova ym9, [dstq+strideq*1]
+ vinserti32x8 m9, [dstq+r6 ], 1
+ paddw m0, m2
+ paddw m1, m9
+ pmaxsw m0, m14
+ pmaxsw m1, m14
+ pminsw m0, m15
+ pminsw m1, m15
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*8], m0, 1
+ mova [dstq+strideq*1], ym1
+ vextracti32x8 [dstq+r6 ], m1, 1
+ lea dstq, [dstq+strideq*2]
+ ret
+.main_internal:
+ mova m8, [cq+128* 0]
+ packssdw m8, [cq+128* 8]
+ mova m6, [cq+128* 1]
+ packssdw m6, [cq+128* 9]
+ mova m0, [cq+128* 2]
+ packssdw m0, [cq+128*10]
+ mova m2, [cq+128* 3]
+ packssdw m2, [cq+128*11]
+ REPX {pmulhrsw x, m10}, m8, m6, m0, m2
+ REPX {vpermq x, x, q3120}, m8, m6, m0, m2
+ pmulhrsw m4, m11, m8
+ pmulhrsw m9, m11, m6
+ REPX {mova [cq+128*x], m14}, 0, 1, 2, 3
+ pmulhrsw m4, m12
+ pmulhrsw m9, m12
+ paddsw m8, m4
+ paddsw m6, m9
+ pmulhrsw m4, m11, m0
+ pmulhrsw m9, m11, m2
+ REPX {mova [cq+128*x], m14}, 8, 9, 10, 11
+ pmulhrsw m4, m12
+ pmulhrsw m9, m12
+ paddsw m0, m4
+ paddsw m2, m9
+ punpcklwd m4, m8, m6
+ punpckhwd m8, m6
+ punpcklwd m6, m0, m2
+ punpckhwd m0, m2
+ punpckldq m2, m4, m6 ; 0 1
+ punpckhdq m4, m6 ; 2 3
+ punpckldq m6, m8, m0 ; 4 5
+ punpckhdq m8, m0 ; 6 7
+ ret
+
+cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+%if WIN64
+ movaps [rsp+ 8], xmm6
+ movaps [rsp+24], xmm7
+%endif
+ mov r6d, 8*12
+ cmp eobd, 36
+ jl .fast
+ pmulld m0, m12, [cq+64* 0]
+ pmulld m1, m12, [cq+64* 4]
+ pmulld m2, m12, [cq+64* 8]
+ pmulld m3, m12, [cq+64*12]
+ pmulld m16, m12, [cq+64* 2]
+ pmulld m17, m12, [cq+64* 6]
+ pmulld m18, m12, [cq+64*10]
+ pmulld m19, m12, [cq+64*14]
+ cmp eobd, 151
+ jge .full
+ call m(idct_8x16_internal_10bpc).main_fast_rect2
+ call m(idct_16x16_internal_10bpc).main_fast_rect2
+ call .idct16_sumsub
+ call .pass1_load_spill
+ call .main_fast_rect2
+ jmp .pass1_end
+.full:
+ pmulld m4, m12, [cq+64*16]
+ pmulld m5, m12, [cq+64*20]
+ pmulld m6, m12, [cq+64*24]
+ pmulld m7, m12, [cq+64*28]
+ pmulld m20, m12, [cq+64*18]
+ pmulld m21, m12, [cq+64*22]
+ pmulld m22, m12, [cq+64*26]
+ pmulld m23, m12, [cq+64*30]
+ add r6d, 8*16
+ call m(idct_8x16_internal_10bpc).main_rect2
+ call m(idct_16x16_internal_10bpc).main_rect2
+ call .idct16_sumsub
+ call .pass1_load_spill
+ pmulld m16, m12, [cq+64*17]
+ pmulld m17, m12, [cq+64*19]
+ pmulld m18, m12, [cq+64*21]
+ pmulld m19, m12, [cq+64*23]
+ pmulld m20, m12, [cq+64*25]
+ pmulld m21, m12, [cq+64*27]
+ pmulld m22, m12, [cq+64*29]
+ pmulld m23, m12, [cq+64*31]
+ call .main_rect2
+.pass1_end:
+ vpbroadcastd m11, [o(pd_1)]
+ lea r4, [cq+64]
+ call .idct32_pass1_end
+ lea r5, [o_base_8bpc]
+ punpckhqdq m19, m5, m16 ; 11
+ punpcklqdq m5, m16 ; 10
+ punpckhqdq m16, m2, m1 ; 5
+ punpcklqdq m2, m1 ; 4
+ punpcklqdq m1, m15, m4 ; 2
+ punpckhqdq m15, m4 ; 3
+ punpcklqdq m4, m14, m18 ; 8
+ punpckhqdq m18, m14, m18 ; 9
+ punpckhqdq m14, m0, m20 ; 1
+ punpcklqdq m0, m20 ; 0
+ punpckhqdq m20, m6, m17 ; 13
+ punpcklqdq m6, m17 ; 12
+ punpckhqdq m17, m3, m21 ; 7
+ punpcklqdq m3, m21 ; 6
+ punpckhqdq m21, m7, m8 ; 15
+ punpcklqdq m7, m8 ; 14
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ jmp .end
+.fast:
+ pmulld ym0, ym12, [cq+64*0]
+ pmulld ym1, ym12, [cq+64*4]
+ movshdup m7, [o(permB)]
+ mova ym4, [cq+64*2]
+ mova ym5, [cq+64*6]
+ mova ym16, [cq+64*1]
+ mova ym2, [cq+64*5]
+ mova ym3, [cq+64*3]
+ mova ym17, [cq+64*7]
+ vpermt2q m4, m7, m5 ; 2 6
+ vpermt2q m16, m7, m2 ; 1 5
+ vpermt2q m17, m7, m3 ; 7 3
+ paddd ym0, ym13
+ paddd ym1, ym13
+ psrad ym0, 12
+ psrad ym1, 12
+ vpermq m0, m7, m0 ; 0 0
+ vpermq m1, m7, m1 ; 4 4
+ REPX {pmulld x, m12}, m4, m16, m17
+ REPX {paddd x, m13}, m4, m16, m17
+ REPX {psrad x, 12 }, m4, m16, m17
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
+ vpbroadcastd m11, [o(pd_1)]
+ call m(idct_16x16_internal_10bpc).main_end2
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
+ lea r5, [o_base_8bpc]
+ punpckhqdq m14, m0, m2 ; 1
+ punpcklqdq m0, m2 ; 0
+ punpcklqdq m1, m3, m4 ; 2
+ punpckhqdq m15, m3, m4 ; 3
+ punpcklqdq m2, m5, m7 ; 4
+ punpckhqdq m16, m5, m7 ; 5
+ punpcklqdq m3, m6, m8 ; 6
+ punpckhqdq m17, m6, m8 ; 7
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+.end:
+%if WIN64
+ movaps xmm6, [rsp+ 8]
+ movaps xmm7, [rsp+24]
+%endif
+ pxor m12, m12
+.zero_loop:
+ mova [cq+r6*8+64*3], m12
+ mova [cq+r6*8+64*2], m12
+ mova [cq+r6*8+64*1], m12
+ mova [cq+r6*8+64*0], m12
+ sub r6d, 8*4
+ jge .zero_loop
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start
+ pmulhrsw m0, m11, m14
+ pmulhrsw m1, m11, m15
+ pmulhrsw m2, m11, m16
+ pmulhrsw m3, m11, m17
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+ pmulhrsw m0, m11, m18
+ pmulhrsw m1, m11, m19
+ pmulhrsw m2, m11, m20
+ pmulhrsw m3, m11, m21
+ vzeroupper
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 16
+.dconly3:
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+.dconly2:
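+ ; broadcast the DC value and add it to every pixel; the saturating add/subtract pair doubles as the clamp to the valid range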
+ vpbroadcastd m3, [o(dconly_10bpc)]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ vpbroadcastw m2, r6d
+ paddsw m2, m3
+.dconly_loop:
+ paddsw m0, m2, [dstq+strideq*0]
+ paddsw m1, m2, [dstq+strideq*1]
+ psubusw m0, m3
+ psubusw m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+ALIGN function_align
+.idct16_sumsub:
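+ ; final idct16 butterflies: sums and differences of m0-m7 with m9/m16-m23, clipped to the 18-bit intermediate range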
+ psubd m23, m0, m22 ; t15
+ paddd m0, m22 ; t0
+ psubd m22, m1, m21 ; t14
+ paddd m1, m21 ; t1
+ REPX {pmaxsd x, m14}, m23, m0, m22, m1
+ psubd m21, m2, m20 ; t13
+ paddd m2, m20 ; t2
+ REPX {pminsd x, m15}, m23, m0, m22, m1
+ psubd m20, m3, m19 ; t12
+ paddd m3, m19 ; t3
+ REPX {pmaxsd x, m14}, m21, m2, m20, m3
+ psubd m19, m4, m18 ; t11
+ paddd m4, m18 ; t4
+ REPX {pminsd x, m15}, m21, m2, m20, m3
+ psubd m18, m5, m17 ; t10
+ paddd m5, m17 ; t5
+ REPX {pmaxsd x, m14}, m19, m4, m18, m5
+ psubd m17, m6, m16 ; t9
+ paddd m6, m16 ; t6
+ REPX {pminsd x, m15}, m19, m4, m18, m5
+ psubd m16, m7, m9 ; t8
+ paddd m7, m9 ; t7
+ REPX {pmaxsd x, m14}, m17, m6, m16, m7
+ REPX {pminsd x, m15}, m17, m6, m16, m7
+ ret
+.idct32_pass1_end:
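+ ; combine the spilled idct16 half (at r4/cq) with the idct32 odd half into 32 outputs; m11 serves as both rounding bias and shift amount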
+ psrlq m12, [o(permC)], 24 ; 0 2 8 10 1 3 9 11
+ psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15
+%macro IDCT32_PASS1_END 2 ; low, high
+ paddd m8, m11, [r4+128*%1]
+ paddd m9, m11, [cq+128*%1]
+ psubd m10, m8, m%1 ; out 16+n
+ paddd m8, m%1 ; out 15-n
+ paddd m%1, m9, m%2 ; out 0+n
+ psubd m9, m%2 ; out 31-n
+ REPX {vpsravd x, m11}, m10, m%1, m8, m9
+ packssdw m%1, m10 ; 0+n 16+n
+ packssdw m%2, m8, m9 ; 15-n 31-n
+%endmacro
+ IDCT32_PASS1_END 0, 23 ; 0 16, 15 31
+ IDCT32_PASS1_END 7, 16 ; 7 23, 8 24
+ IDCT32_PASS1_END 1, 22 ; 1 17, 14 30
+ IDCT32_PASS1_END 6, 17 ; 6 22, 9 25
+ IDCT32_PASS1_END 2, 21 ; 2 18, 13 29
+ IDCT32_PASS1_END 5, 18 ; 5 21, 10 26
+ IDCT32_PASS1_END 3, 20 ; 3 19, 12 28
+ IDCT32_PASS1_END 4, 19 ; 4 20, 11 27
+.transpose_16x32:
+ mova m14, m13
+ vpermi2q m14, m0, m16
+ vpermt2q m0, m12, m16
+ mova m15, m13
+ vpermi2q m15, m1, m17
+ vpermt2q m1, m12, m17
+ mova m16, m13
+ vpermi2q m16, m2, m18
+ vpermt2q m2, m12, m18
+ mova m17, m13
+ vpermi2q m17, m3, m19
+ vpermt2q m3, m12, m19
+ mova m18, m13
+ vpermi2q m18, m4, m20
+ vpermt2q m4, m12, m20
+ mova m19, m13
+ vpermi2q m19, m5, m21
+ vpermt2q m5, m12, m21
+ mova m20, m13
+ vpermi2q m20, m6, m22
+ vpermt2q m6, m12, m22
+ mova m21, m13
+ vpermi2q m21, m7, m23
+ vpermt2q m7, m12, m23
+ punpckhwd m8, m2, m3 ; c04 d04 c05 d05 c06 d06 c07 d07
+ punpcklwd m2, m3 ; c00 d00 c01 d01 c02 d02 c03 d03
+ punpckhwd m3, m0, m1 ; a04 b04 a05 b05 a06 b06 a07 b07
+ punpcklwd m0, m1 ; a00 b00 a01 b01 a02 b02 a03 b03
+ punpckhwd m1, m4, m5 ; e04 f04 e05 f05 e06 f06 e07 f07
+ punpcklwd m4, m5 ; e00 f00 e01 f01 e02 f02 e03 f03
+ punpckhwd m5, m6, m7 ; g04 h04 g05 h05 g06 h06 g07 h07
+ punpcklwd m6, m7 ; g00 h00 g01 h01 g02 h02 g03 h03
+ punpckhwd m7, m14, m15 ; a12 b12 a13 b13 a14 b14 a15 b15
+ punpcklwd m14, m15 ; a08 b08 a09 b09 a10 b10 a11 b11
+ punpckhwd m15, m16, m17 ; c12 d12 c13 d13 c14 d14 c15 d15
+ punpcklwd m16, m17 ; c08 d08 c09 d09 c10 d10 c11 d11
+ punpckhwd m17, m18, m19 ; e12 f12 e13 f13 e14 f14 e15 f15
+ punpcklwd m18, m19 ; e08 f08 e09 f09 e10 f10 e11 f11
+ punpckhwd m19, m20, m21 ; g12 h12 g13 h13 g14 h14 g15 h15
+ punpcklwd m20, m21 ; g08 h08 g09 h09 g10 h10 g11 h11
+ punpckhdq m21, m1, m5 ; e06 f06 g06 h06 e07 f07 g07 h07
+ punpckldq m1, m5 ; e04 f04 g04 h04 e05 f05 g05 h05
+ punpckhdq m5, m14, m16 ; a10 b10 c10 d10 a11 b11 c11 d11
+ punpckldq m14, m16 ; a08 b08 c08 d08 a09 b09 c09 d09
+ punpckhdq m16, m18, m20 ; e10 f10 g10 h10 e11 f11 g11 h11
+ punpckldq m18, m20 ; e08 f08 g08 h08 e09 f09 g09 h09
+ punpckldq m20, m4, m6 ; e00 f00 g00 h00 e01 f01 g01 h01
+ punpckhdq m4, m6 ; e02 f02 g02 h02 e03 f03 g03 h03
+ punpckldq m6, m7, m15 ; a12 b12 c12 d12 a13 b13 c13 d13
+ punpckhdq m7, m15 ; a14 b14 c14 d14 a15 b15 c15 d15
+ punpckhdq m15, m0, m2 ; a02 b02 c02 d02 a03 b03 c03 d03
+ punpckldq m0, m2 ; a00 b00 c00 d00 a01 b01 c01 d01
+ punpckldq m2, m3, m8 ; a04 b04 c04 d04 a05 b05 c05 d05
+ punpckhdq m3, m8 ; a06 b06 c06 d06 a07 b07 c07 d07
+ punpckhdq m8, m17, m19 ; e14 f14 g14 h14 e15 f15 g15 h15
+ punpckldq m17, m19 ; e12 f12 g12 h12 e13 f13 g13 h13
+ ret
+.pass1_load_spill:
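+ ; spill the idct16 results to the coefficient buffer and load the odd-numbered rows (scaled by 2896 for the rectangular transform) in their place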
+ mova [cq+64* 0], m0
+ mova [cq+64* 2], m1
+ mova [cq+64* 4], m2
+ mova [cq+64* 6], m3
+ mova [cq+64* 8], m4
+ mova [cq+64*10], m5
+ mova [cq+64*12], m6
+ mova [cq+64*14], m7
+ pmulld m0, m12, [cq+64* 1]
+ pmulld m1, m12, [cq+64* 3]
+ pmulld m2, m12, [cq+64* 5]
+ pmulld m3, m12, [cq+64* 7]
+ pmulld m4, m12, [cq+64* 9]
+ pmulld m5, m12, [cq+64*11]
+ pmulld m6, m12, [cq+64*13]
+ pmulld m7, m12, [cq+64*15]
+ mova [cq+64* 1], m23
+ mova [cq+64* 3], m22
+ mova [cq+64* 5], m21
+ mova [cq+64* 7], m20
+ mova [cq+64* 9], m19
+ mova [cq+64*11], m18
+ mova [cq+64*13], m17
+ mova [cq+64*15], m16
+ ret
+.main_fast2_rect2:
+ REPX {paddd x, m13}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_fast2: ; bottom 3/4 is zero
+ pmulld m23, m0, [o(pd_4091)] {1to16} ; t31a
+ pmulld m0, [o(pd_201)] {1to16} ; t16a
+ pmulld m20, m3, [o(pd_1380)] {1to16} ; t19a
+ pmulld m3, [o(pd_3857)] {1to16} ; t28a
+ pmulld m21, m2, [o(pd_3973)] {1to16} ; t27a
+ pmulld m2, [o(pd_995)] {1to16} ; t20a
+ pmulld m6, m1, [o(pd_601)] {1to16} ; t23a
+ pmulld m17, m1, [o(pd_4052)] {1to16} ; t24a
+ REPX {psubd x, m13, x}, m20, m6
+ REPX {paddd x, m13}, m23, m0, m3, m21, m2, m17
+ REPX {psrad x, 12 }, m20, m6, m23, m0, m3, m21, m2, m17
+ mova m8, m0
+ mova m16, m23
+ mova m7, m20
+ mova m4, m3
+ mova m19, m2
+ mova m18, m21
+ mova m5, m6
+ mova m22, m17
+ jmp .main3
+.main_fast_rect2:
+ call m(idct_8x16_internal_10bpc).round
+.main_fast: ; bottom half is zero
+ pmulld m23, m0, [o(pd_4091)] {1to16} ; t31a
+ pmulld m0, [o(pd_201)] {1to16} ; t16a
+ pmulld m16, m7, [o(pd_2751)] {1to16} ; t17a
+ pmulld m7, [o(pd_3035)] {1to16} ; t30a
+ pmulld m19, m4, [o(pd_3703)] {1to16} ; t29a
+ pmulld m4, [o(pd_1751)] {1to16} ; t18a
+ pmulld m20, m3, [o(pd_1380)] {1to16} ; t19a
+ pmulld m3, [o(pd_3857)] {1to16} ; t28a
+ pmulld m21, m2, [o(pd_3973)] {1to16} ; t27a
+ pmulld m2, [o(pd_995)] {1to16} ; t20a
+ pmulld m18, m5, [o(pd_2106)] {1to16} ; t21a
+ pmulld m5, [o(pd_3513)] {1to16} ; t26a
+ pmulld m17, m6, [o(pd_3290)] {1to16} ; t25a
+ pmulld m6, [o(pd_2440)] {1to16} ; t22a
+ pmulld m22, m1, [o(pd_601)] {1to16} ; t23a
+ pmulld m1, [o(pd_4052)] {1to16} ; t24a
+ REPX {psubd x, m13, x}, m16, m20, m18, m22
+ call m(idct_16x16_internal_10bpc).round3
+ jmp .main2
+.main_rect2:
+ call m(idct_8x16_internal_10bpc).round
+ call m(idct_16x16_internal_10bpc).round
+.main:
+ ITX_MULSUB_2D 0, 23, 8, 9, 10, _, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2D 16, 7, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
+ ITX_MULSUB_2D 4, 19, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2D 20, 3, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2D 2, 21, 8, 9, 10, _, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2D 18, 5, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2D 6, 17, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
+ ITX_MULSUB_2D 22, 1, 8, 9, 10, _, 4052, 601 ; t23a, t24a
+ call m(idct_16x16_internal_10bpc).round
+.main2:
+ call m(idct_8x16_internal_10bpc).round
+ psubd m8, m0, m16 ; t17
+ paddd m0, m16 ; t16
+ psubd m16, m23, m7 ; t30
+ paddd m23, m7 ; t31
+ REPX {pmaxsd x, m14}, m8, m0, m16, m23
+ paddd m7, m20, m4 ; t19
+ psubd m20, m4 ; t18
+ REPX {pminsd x, m15}, m8, m0, m16, m23
+ paddd m4, m3, m19 ; t28
+ psubd m3, m19 ; t29
+ REPX {pmaxsd x, m14}, m7, m20, m4, m3
+ psubd m19, m2, m18 ; t21
+ paddd m2, m18 ; t20
+ REPX {pminsd x, m15}, m7, m20, m4, m3
+ psubd m18, m21, m5 ; t26
+ paddd m21, m5 ; t27
+ REPX {pmaxsd x, m14}, m19, m2, m18, m21
+ psubd m5, m22, m6 ; t22
+ paddd m6, m22 ; t23
+ REPX {pminsd x, m15}, m19, m2, m18, m21
+ psubd m22, m1, m17 ; t25
+ paddd m17, m1 ; t24
+ REPX {pmaxsd x, m14}, m5, m6, m22, m17
+ REPX {pminsd x, m15}, m5, m6, m22, m17
+.main3:
+ vpbroadcastd m11, [o(pd_4017)]
+ vpbroadcastd m10, [o(pd_799)]
+ ITX_MULSUB_2D 16, 8, 9, 1, _, 13, 10, 11 ; t17a, t30a
+ ITX_MULSUB_2D 3, 20, 9, 1, _, 13, 10, 11, 2 ; t29a, t18a
+ vpbroadcastd m11, [o(pd_2276)]
+ vpbroadcastd m10, [o(pd_3406)]
+ ITX_MULSUB_2D 18, 19, 9, 1, _, 13, 10, 11 ; t21a, t26a
+ ITX_MULSUB_2D 22, 5, 9, 1, _, 13, 10, 11, 2 ; t25a, t22a
+ paddd m1, m6, m2 ; t23a
+ psubd m6, m2 ; t20a
+ psubd m2, m17, m21 ; t27a
+ paddd m17, m21 ; t24a
+ REPX {pmaxsd x, m14}, m1, m6, m2, m17
+ psubd m21, m23, m4 ; t28a
+ paddd m23, m4 ; t31a
+ REPX {pminsd x, m15}, m1, m6, m2, m17
+ psubd m4, m16, m20 ; t18
+ paddd m16, m20 ; t17
+ REPX {pmaxsd x, m14}, m21, m23, m4, m16
+ psubd m20, m0, m7 ; t19a
+ paddd m0, m7 ; t16a
+ REPX {pminsd x, m15}, m21, m23, m4, m16
+ psubd m7, m8, m3 ; t29
+ paddd m3, m8 ; t30
+ REPX {pmaxsd x, m14}, m20, m0, m7, m3
+ paddd m8, m5, m18 ; t22
+ psubd m5, m18 ; t21
+ REPX {pminsd x, m15}, m20, m0, m7, m3
+ psubd m18, m22, m19 ; t26
+ paddd m22, m19 ; t25
+ REPX {pmaxsd x, m14}, m8, m5, m18, m22
+ vpbroadcastd m11, [o(pd_3784)]
+ vpbroadcastd m10, [o(pd_1567)]
+ REPX {pminsd x, m15}, m8, m5, m18, m22
+ ITX_MULSUB_2D 21, 20, 9, 19, _, 13, 10, 11 ; t19, t28
+ ITX_MULSUB_2D 2, 6, 9, 19, _, 13, 10, 11, 2 ; t27, t20
+ ITX_MULSUB_2D 7, 4, 9, 19, _, 13, 10, 11 ; t18a, t29a
+ ITX_MULSUB_2D 18, 5, 9, 19, _, 13, 10, 11, 2 ; t26a, t21a
+ psubd m19, m0, m1 ; t23
+ paddd m0, m1 ; t16
+ paddd m1, m8, m16 ; t17a
+ psubd m8, m16, m8 ; t22a
+ REPX {pmaxsd x, m14}, m19, m0, m1, m8
+ psubd m16, m23, m17 ; t24
+ paddd m23, m17 ; t31
+ REPX {pminsd x, m15}, m19, m0, m1, m8
+ psubd m17, m3, m22 ; t25a
+ paddd m22, m3 ; t30a
+ REPX {pmaxsd x, m14}, m16, m23, m17, m22
+ paddd m3, m6, m21 ; t19a
+ psubd m6, m21, m6 ; t20a
+ REPX {pminsd x, m15}, m16, m23, m17, m22
+ paddd m21, m18, m4 ; t29
+ psubd m18, m4, m18 ; t26
+ REPX {pmaxsd x, m14}, m3, m6, m21, m18
+ psubd m4, m20, m2 ; t27a
+ paddd m20, m2 ; t28a
+ REPX {pminsd x, m15}, m3, m6, m21, m18
+ paddd m2, m7, m5 ; t18
+ psubd m7, m5 ; t21
+ REPX {pmaxsd x, m14}, m4, m20, m2, m7
+ REPX {pminsd x, m15}, m4, m20, m2, m7
+ REPX {pmulld x, m12}, m18, m16, m4, m17, m7, m19, m6, m8
+ REPX {paddd x, m13}, m18, m16, m4, m17
+ psubd m5, m18, m7 ; t21a
+ paddd m18, m7 ; t26a
+ psubd m7, m16, m19 ; t23a
+ paddd m16, m19 ; t24a
+ REPX {psrad x, 12 }, m5, m18, m7, m16
+ paddd m19, m4, m6 ; t27
+ psubd m4, m6 ; t20
+ psubd m6, m17, m8 ; t22
+ paddd m17, m8 ; t25
+ REPX {psrad x, 12 }, m19, m4, m6, m17
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 16, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m10, [pw_2896x8]
+ vpbroadcastd m11, [pw_1697x16]
+ vpbroadcastd m13, [pw_2048]
+ vpbroadcastd m15, [pixel_10bpc_max]
+ lea r6, [strideq*9]
+ pxor m14, m14
+ cmp eobd, 151
+ jl .main
+ mov r4, dstq
+ call .main
+ add cq, 64*12
+ lea dstq, [r4+32]
+.main:
+ call .main_internal
+ add cq, 64*4
+ pmulhrsw m1, m13, m2
+ pmulhrsw m3, m13, m4
+ pmulhrsw m5, m13, m6
+ pmulhrsw m7, m13, m8
+ call .main_internal
+ jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
+.main_internal:
+ mova m8, [cq+64* 0]
+ packssdw m8, [cq+64* 8]
+ mova m6, [cq+64* 1]
+ packssdw m6, [cq+64* 9]
+ mova m0, [cq+64* 2]
+ packssdw m0, [cq+64*10]
+ mova m2, [cq+64* 3]
+ packssdw m2, [cq+64*11]
+ REPX {pmulhrsw x, m10}, m8, m6, m0, m2
+ REPX {paddsw x, x }, m8, m6, m0, m2
+ REPX {vpermq x, x, q3120}, m8, m6, m0, m2
+ pmulhrsw m4, m11, m8
+ pmulhrsw m9, m11, m6
+ paddsw m8, m8
+ paddsw m6, m6
+ REPX {mova [cq+64*x], m14}, 0, 1, 2, 3
+ paddsw m8, m4
+ paddsw m6, m9
+ pmulhrsw m4, m11, m0
+ pmulhrsw m9, m11, m2
+ paddsw m0, m0
+ paddsw m2, m2
+ REPX {mova [cq+64*x], m14}, 8, 9, 10, 11
+ paddsw m0, m4
+ paddsw m2, m9
+ punpcklwd m4, m8, m6
+ punpckhwd m8, m6
+ punpcklwd m6, m0, m2
+ punpckhwd m0, m2
+ punpckldq m2, m4, m6 ; 0 1
+ punpckhdq m4, m6 ; 2 3
+ punpckldq m6, m8, m0 ; 4 5
+ punpckhdq m8, m0 ; 6 7
+ ret
+
+cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ WIN64_SPILL_XMM 30
+ cmp eobd, 136
+ jl .fast
+ add cq, 64
+ cmp eobd, 543
+ jge .full
+ call .pass1_fast ; bottomright 16x16 zero
+ mov r6d, 16*12
+ jmp .lefthalf
+.full:
+ call .pass1
+ mov r6d, 16*28
+.lefthalf:
+ mova [cq+128* 0], m0
+ mova [cq+128* 1], m1
+ mova [cq+128* 2], m2
+ mova [cq+128* 3], m3
+ mova [cq+128* 4], m14
+ mova [cq+128* 5], m15
+ mova [cq+128* 6], m16
+ mova [cq+128* 7], m17
+ mova [cq+128* 8], m22
+ mova [cq+128* 9], m23
+ mova [cq+128*10], m24
+ mova [cq+128*11], m25
+ mova [cq+128*12], m26
+ mova [cq+128*13], m27
+ mova [cq+128*14], m28
+ mova [cq+128*15], m29
+ sub cq, 64
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ call .pass1
+ lea r5, [o_base_8bpc]
+ call .pass2_start
+ pxor m12, m12
+.right_zero_loop:
+ mova [cq+r6*8+64+128*3], m12
+ mova [cq+r6*8+64+128*2], m12
+ mova [cq+r6*8+64+128*1], m12
+ mova [cq+r6*8+64+128*0], m12
+ sub r6d, 16*4
+ jge .right_zero_loop
+ mov r6d, 16*28
+ jmp .end2
+.pass2_start:
+ mova m4, [cq+64+128* 0]
+ mova m5, [cq+64+128* 1]
+ mova m6, [cq+64+128* 2]
+ mova m7, [cq+64+128* 3]
+ mova m18, [cq+64+128* 4]
+ mova m19, [cq+64+128* 5]
+ mova m20, [cq+64+128* 6]
+ mova m21, [cq+64+128* 7]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova [cq+128*0], m14
+ mova [cq+128*1], m15
+ mova [cq+128*2], m16
+ mova [cq+128*3], m17
+ mova [cq+128*4], m18
+ mova [cq+128*5], m19
+ mova [cq+128*6], m20
+ mova [cq+128*7], m21
+ mova m14, [cq+64+128* 8]
+ mova m15, [cq+64+128* 9]
+ mova m16, [cq+64+128*10]
+ mova m17, [cq+64+128*11]
+ mova m18, [cq+64+128*12]
+ mova m19, [cq+64+128*13]
+ mova m20, [cq+64+128*14]
+ mova m21, [cq+64+128*15]
+ jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+.fast: ; topleft 16x16 nonzero
+ cmp eobd, 36
+ jl .fast2
+ call .pass1_fast
+ lea r5, [o_base_8bpc]
+ call .pass2_fast_start
+ jmp .end
+.pass2_fast_start:
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ mova [cq+128*0], m14
+ mova [cq+128*1], m15
+ mova [cq+128*2], m16
+ mova [cq+128*3], m17
+ mova [cq+128*4], m18
+ mova [cq+128*5], m19
+ mova [cq+128*6], m20
+ mova [cq+128*7], m21
+ jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+.fast2: ; topleft 8x8 nonzero
+ movshdup m7, [o(permB)]
+ mova ym0, [cq+128*0]
+ mova ym1, [cq+128*4]
+ mova ym4, [cq+128*2]
+ mova ym5, [cq+128*6]
+ mova ym16, [cq+128*1]
+ mova ym2, [cq+128*5]
+ mova ym3, [cq+128*3]
+ mova ym17, [cq+128*7]
+ mov r6d, 16*4
+ vpermq m0, m7, m0 ; 0 0
+ vpermq m1, m7, m1 ; 4 4
+ vpermt2q m4, m7, m5 ; 2 6
+ vpermt2q m16, m7, m2 ; 1 5
+ vpermt2q m17, m7, m3 ; 7 3
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
+ call m(idct_16x16_internal_10bpc).main_end
+ call .pass2_fast2_start
+.end:
+ pxor m12, m12
+.end2:
+ call .pass2_end
+.zero_loop:
+ mova [cq+r6*8+128*3], m12
+ mova [cq+r6*8+128*2], m12
+ mova [cq+r6*8+128*1], m12
+ mova [cq+r6*8+128*0], m12
+ sub r6d, 16*4
+ jge .zero_loop
+ WIN64_RESTORE_XMM
+ vzeroupper
+ ret
+.pass2_fast2_start:
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
+ lea r5, [o_base_8bpc]
+ punpckhqdq m22, m0, m2 ; 1
+ punpcklqdq m0, m2 ; 0
+ punpcklqdq m1, m5, m7 ; 4
+ punpckhqdq m24, m5, m7 ; 5
+ punpcklqdq m14, m3, m4 ; 2
+ punpckhqdq m23, m3, m4 ; 3
+ punpcklqdq m15, m6, m8 ; 6
+ punpckhqdq m25, m6, m8 ; 7
+ mova m10, m13
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
+ mova [cq+128*0], m14
+ mova [cq+128*1], m15
+ mova [cq+128*2], m16
+ mova [cq+128*3], m17
+ mova [cq+128*4], m18
+ mova [cq+128*5], m19
+ mova [cq+128*6], m20
+ mova [cq+128*7], m21
+ jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
+.pass2_end:
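+ ; produce the 32 output rows as sums/differences of the second-pass even and odd halves (the stored half is reloaded from cq) and write them out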
+ psubsw m9, m0, m29 ; out31
+ paddsw m0, m29 ; out0
+ psubsw m29, m1, m28 ; out30
+ paddsw m1, m28 ; out1
+ psubsw m28, m2, m27 ; out29
+ paddsw m2, m27 ; out2
+ psubsw m27, m3, m26 ; out28
+ paddsw m3, m26 ; out3
+ psubsw m26, m4, m25 ; out27
+ paddsw m4, m25 ; out4
+ psubsw m25, m5, m24 ; out26
+ paddsw m5, m24 ; out5
+ psubsw m24, m6, m23 ; out25
+ paddsw m6, m23 ; out6
+ psubsw m23, m7, m22 ; out24
+ paddsw m7, m22 ; out7
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start
+ mova m0, [cq+128*0]
+ mova m1, [cq+128*1]
+ mova m2, [cq+128*2]
+ mova m3, [cq+128*3]
+ mova m4, [cq+128*4]
+ mova m5, [cq+128*5]
+ mova m6, [cq+128*6]
+ mova m7, [cq+128*7]
+ psubsw m22, m0, m21 ; out23
+ paddsw m0, m21 ; out8
+ psubsw m21, m1, m20 ; out22
+ paddsw m1, m20 ; out9
+ psubsw m20, m2, m19 ; out21
+ paddsw m2, m19 ; out10
+ psubsw m19, m3, m18 ; out20
+ paddsw m3, m18 ; out11
+ psubsw m18, m4, m17 ; out19
+ paddsw m4, m17 ; out12
+ psubsw m17, m5, m16 ; out18
+ paddsw m5, m16 ; out13
+ psubsw m16, m6, m15 ; out17
+ paddsw m6, m15 ; out14
+ psubsw m15, m7, m14 ; out16
+ paddsw m7, m14 ; out15
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8
+ pmulhrsw m0, m11, m15
+ pmulhrsw m1, m11, m16
+ pmulhrsw m2, m11, m17
+ pmulhrsw m3, m11, m18
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+ pmulhrsw m0, m11, m19
+ pmulhrsw m1, m11, m20
+ pmulhrsw m2, m11, m21
+ pmulhrsw m3, m11, m22
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+ pmulhrsw m0, m11, m23
+ pmulhrsw m1, m11, m24
+ pmulhrsw m2, m11, m25
+ pmulhrsw m3, m11, m26
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+ pmulhrsw m0, m11, m27
+ pmulhrsw m1, m11, m28
+ pmulhrsw m2, m11, m29
+ pmulhrsw m3, m11, m9
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
+.pass1_fast:
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 4]
+ mova m2, [cq+128* 8]
+ mova m3, [cq+128*12]
+ mov r6d, 16*12
+ call m(idct_8x16_internal_10bpc).main_fast
+ mova m16, [cq+128* 2]
+ mova m17, [cq+128* 6]
+ mova m18, [cq+128*10]
+ mova m19, [cq+128*14]
+ call m(idct_16x16_internal_10bpc).main_fast
+ call .pass1_load_spill
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
+ jmp .pass1_end
+.pass1:
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 4]
+ mova m2, [cq+128* 8]
+ mova m3, [cq+128*12]
+ mova m4, [cq+128*16]
+ mova m5, [cq+128*20]
+ mova m6, [cq+128*24]
+ mova m7, [cq+128*28]
+ call m(idct_8x16_internal_10bpc).main
+ mova m16, [cq+128* 2]
+ mova m17, [cq+128* 6]
+ mova m18, [cq+128*10]
+ mova m19, [cq+128*14]
+ mova m20, [cq+128*18]
+ mova m21, [cq+128*22]
+ mova m22, [cq+128*26]
+ mova m23, [cq+128*30]
+ call m(idct_16x16_internal_10bpc).main
+ call .pass1_load_spill
+ mova m16, [cq+128*17]
+ mova m17, [cq+128*19]
+ mova m18, [cq+128*21]
+ mova m19, [cq+128*23]
+ mova m20, [cq+128*25]
+ mova m21, [cq+128*27]
+ mova m22, [cq+128*29]
+ mova m23, [cq+128*31]
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main
+.pass1_end:
+ vpbroadcastd m11, [o(pd_2)]
+ lea r4, [cq+128*8]
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end
+ punpckhqdq m22, m0, m20 ; 1
+ punpcklqdq m0, m20 ; 0
+ punpckhqdq m24, m2, m1 ; 5
+ punpcklqdq m1, m2, m1 ; 4
+ punpcklqdq m2, m14, m18 ; 8
+ punpckhqdq m26, m14, m18 ; 9
+ punpcklqdq m14, m15, m4 ; 2
+ punpckhqdq m23, m15, m4 ; 3
+ punpckhqdq m25, m3, m21 ; 7
+ punpcklqdq m15, m3, m21 ; 6
+ punpckhqdq m28, m6, m17 ; 13
+ punpcklqdq m3, m6, m17 ; 12
+ punpckhqdq m27, m5, m16 ; 11
+ punpcklqdq m16, m5, m16 ; 10
+ punpckhqdq m29, m7, m8 ; 15
+ punpcklqdq m17, m7, m8 ; 14
+ ret
+.pass1_load_spill:
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
+ mova [cq+128* 0], m0
+ mova m0, [cq+128* 1]
+ mova [cq+128* 1], m1
+ mova [cq+128* 2], m2
+ mova m1, [cq+128* 3]
+ mova m2, [cq+128* 5]
+ mova [cq+128* 3], m3
+ mova [cq+128* 4], m4
+ mova m3, [cq+128* 7]
+ mova m4, [cq+128* 9]
+ mova [cq+128* 5], m5
+ mova [cq+128* 6], m6
+ mova [cq+128* 7], m7
+ mova m5, [cq+128*11]
+ mova m6, [cq+128*13]
+ mova m7, [cq+128*15]
+ mova [cq+128* 8], m23
+ mova [cq+128* 9], m22
+ mova [cq+128*10], m21
+ mova [cq+128*11], m20
+ mova [cq+128*12], m19
+ mova [cq+128*13], m18
+ mova [cq+128*14], m17
+ mova [cq+128*15], m16
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 7, 16, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m13, [pw_8192]
+ vpbroadcastd m15, [pixel_10bpc_max]
+ pxor m14, m14
+ lea r6, [strideq*9]
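+    ; the eob thresholds below select how many coefficient sub-blocks
+    ; actually need processing; regions known to be all-zero are skipped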
+ cmp eobd, 136
+ jl .main
+ mov r4, dstq
+ call .main
+ add cq, 64-128*4
+ lea dstq, [dstq+strideq*8]
+ call .main
+ add cq, 128*12-64
+ lea dstq, [r4+32]
+ cmp eobd, 543
+ jl .main
+ call .main
+ add cq, 64-128*4
+ lea dstq, [dstq+strideq*8]
+.main:
+ call .main_internal
+ add cq, 128*4
+ pmulhrsw m1, m13, m2
+ pmulhrsw m3, m13, m4
+ pmulhrsw m5, m13, m6
+ pmulhrsw m7, m13, m8
+ call .main_internal
+ jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
+.main_internal:
+ mova m8, [cq+128* 0]
+ packssdw m8, [cq+128* 8]
+ mova m6, [cq+128* 1]
+ packssdw m6, [cq+128* 9]
+ mova m0, [cq+128* 2]
+ packssdw m0, [cq+128*10]
+ mova m2, [cq+128* 3]
+ packssdw m2, [cq+128*11]
+ REPX {vpermq x, x, q3120}, m8, m6, m0, m2
+ REPX {mova [cq+128*x], m14}, 0, 1, 2, 3
+ punpcklwd m4, m8, m6
+ punpckhwd m8, m6
+ punpcklwd m6, m0, m2
+ punpckhwd m0, m2
+ REPX {mova [cq+128*x], m14}, 8, 9, 10, 11
+ punpckldq m2, m4, m6 ; 0 1
+ punpckhdq m4, m6 ; 2 3
+ punpckldq m6, m8, m0 ; 4 5
+ punpckhdq m8, m0 ; 6 7
+ ret
+
+cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+
+ PROLOGUE 4, 7, 32, -8*mmsize, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ cmp eobd, 36
+ jl .fast
+ call .pass1
+ cmp eobd, 151
+ jge .full
+ lea r5, [o_base_8bpc]
+
+ punpckhwd m22, m0, m0
+ punpckhwd m23, m1, m1
+ punpckhwd m24, m2, m2
+ punpckhwd m25, m3, m3
+ punpckhwd m26, m4, m4
+ punpckhwd m27, m5, m5
+ punpckhwd m28, m6, m6
+ punpckhwd m29, m7, m7
+ punpcklwd m21, m1, m1
+ punpcklwd m14, m3, m3
+ punpcklwd m18, m5, m5
+ punpcklwd m15, m7, m7
+ pxor m9, m9
+ punpcklwd m9, m9, m0
+ punpcklwd m8, m2, m2
+ punpcklwd m7, m4, m4
+ punpcklwd m1, m6, m6
+ call m(idct_16x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ mova [rsp+mmsize*0], m14
+ mova [rsp+mmsize*1], m15
+ mova [rsp+mmsize*2], m16
+ mova [rsp+mmsize*3], m17
+ mova [rsp+mmsize*4], m18
+ mova [rsp+mmsize*5], m19
+ mova [rsp+mmsize*6], m20
+ mova [rsp+mmsize*7], m21
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+
+ pxor m12, m12
+ mov r3d, 64*3
+.zero_loop:
+ REPX {mova [cq+r3*8+128*x], m12}, 0, 1, 2, 3
+ sub r3d, 64
+ jge .zero_loop
+
+ jmp .pass2_end
+.full:
+ mova [cq+128*0], m0
+ mova [cq+128*1], m1
+ mova [cq+128*2], m2
+ mova [cq+128*3], m3
+ mova [cq+128*4], m4
+ mova [cq+128*5], m5
+ mova [cq+128*6], m6
+ mova [cq+128*7], m7
+ add cq, 64
+ call .pass1
+ sub cq, 64
+ mova m22, [cq+128*0] ; 0 1
+ mova m23, [cq+128*1] ; 2 3
+ mova m24, [cq+128*2] ; 4 5
+ mova m25, [cq+128*3] ; 6 7
+ mova m26, [cq+128*4] ; 8 9
+ mova m27, [cq+128*5] ; 10 11
+ mova m28, [cq+128*6] ; 12 13
+ mova m29, [cq+128*7] ; 14 15
+ mova [cq+64* 8], m0
+ mova [cq+64* 9], m1
+ mova [cq+64*10], m2
+ mova [cq+64*11], m3
+ mova [cq+64*12], m4
+ mova [cq+64*13], m5
+ mova [cq+64*14], m6
+ mova [cq+64*15], m7
+ lea r5, [o_base_8bpc]
+
+ punpcklwd m20, m1, m1
+ punpcklwd m16, m3, m3
+ punpcklwd m19, m5, m5
+ punpcklwd m17, m7, m7
+ punpcklwd m8, m24, m24 ; 4
+ punpcklwd m5, m2, m2 ; 20
+ punpcklwd m1, m28, m28 ; 12
+ punpcklwd m7, m26, m26 ; 8
+ punpcklwd m3, m4, m4 ; 24
+ punpcklwd m4, m6, m6 ; 28
+ pxor m9, m9
+ punpcklwd m6, m9, m0 ; __ 16
+ mova m0, m4
+ punpcklwd m9, m9, m22 ; __ 0
+ call m(idct_16x16_internal_8bpc).main_fast
+ punpcklwd m21, m23, m23 ; 2
+ punpcklwd m15, m29, m29 ; 14
+ punpcklwd m18, m27, m27 ; 10
+ punpcklwd m14, m25, m25 ; 6
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova [rsp+mmsize*0], m14
+ mova [rsp+mmsize*1], m15
+ mova [rsp+mmsize*2], m16
+ mova [rsp+mmsize*3], m17
+ mova [rsp+mmsize*4], m18
+ mova [rsp+mmsize*5], m19
+ mova [rsp+mmsize*6], m20
+ mova [rsp+mmsize*7], m21
+ mova m21, [cq+64*15]
+ mova m14, [cq+64* 8]
+ mova m17, [cq+64*11]
+ mova m18, [cq+64*12]
+ mova m19, [cq+64*13]
+ mova m16, [cq+64*10]
+ mova m15, [cq+64* 9]
+ mova m20, [cq+64*14]
+ REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
+ m24, m19, m16, m27, m28, m15, m20, m23
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
+
+ pxor m12, m12
+ mov r3d, 32*7
+.full_zero_loop:
+ REPX {mova [cq+r3*8+64*x], m12}, 0, 1, 2, 3
+ sub r3d, 32
+ jge .full_zero_loop
+
+ jmp .pass2_end
+.fast:
+ mova ym0, [cq+128*0]
+ mova ym2, [cq+128*4]
+ movshdup m8, [o(permB)]
+ mova ym1, [cq+128*2]
+ mova ym3, [cq+128*6]
+ mova ym4, [cq+128*1]
+ mova ym5, [cq+128*3]
+ mova ym6, [cq+128*5]
+ mova ym7, [cq+128*7]
+ vpermt2q m0, m8, m2 ; 0 4
+ vpermt2q m1, m8, m3 ; 2 6
+ vpermt2q m4, m8, m5 ; 1 3
+ vpermt2q m7, m8, m6 ; 7 5
+ call m(idct_8x8_internal_10bpc).main_fast
+ call m(idct_16x8_internal_10bpc).main_fast
+ vpbroadcastd m11, [o(pd_2)]
+ call m(idct_8x16_internal_10bpc).main_end2
+ mova m8, [o(idct8x32p)]
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ mova m6, [dup16_perm]
+ vpermb m0, m8, m0
+ vpermb m2, m8, m2
+ vprold m8, 16
+ vpermb m1, m8, m1
+ vpermb m3, m8, m3
+ punpckldq m4, m0, m2
+ punpckhdq m0, m2
+ punpckldq m2, m1, m3
+ punpckhdq m1, m3
+ punpckldq m21, m4, m2
+ punpckhdq m14, m4, m2
+ punpckldq m18, m0, m1
+ punpckhdq m15, m0, m1
+ vpord m7, m6, [o(pb_32)] {1to16}
+ vpermb m22, m7, m21 ; 1
+ pmovzxwd m9, ym21 ; 0
+ vpermb m8, m6, m18 ; 4
+ vpermb m24, m7, m18 ; 5
+ vpermb m21, m6, m14 ; 2
+ vpermb m23, m7, m14 ; 3
+ vpermb m14, m6, m15 ; 6
+ vpermb m25, m7, m15 ; 7
+ lea r5, [o_base_8bpc]
+ pslld m9, 16
+
+ pxor m7, m7
+ REPX {mova x, m7}, m1, m18, m15, m26, m27, m28, m29
+
+ call m(idct_16x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ mova [rsp+mmsize*0], m14
+ mova [rsp+mmsize*1], m15
+ mova [rsp+mmsize*2], m16
+ mova [rsp+mmsize*3], m17
+ mova [rsp+mmsize*4], m18
+ mova [rsp+mmsize*5], m19
+ mova [rsp+mmsize*6], m20
+ mova [rsp+mmsize*7], m21
+
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+
+ pxor m12, m12
+ REPX {mova [cq+128*x], ym12}, 0, 1, 2, 3, 4, 5, 6, 7
+.pass2_end:
+ movshdup m30, [permC]
+ vpbroadcastd m11, [pw_2048]
+ vpbroadcastd m13, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ psrlq m31, m30, 8
+ vpermq m8, m30, m0
+ vpermq m9, m31, m1
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m2
+ vpermq m9, m31, m3
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m4
+ vpermq m9, m31, m5
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m6
+ vpermq m9, m31, m7
+ call m(idct_16x8_internal_10bpc).write_16x4
+
+ mova m1, [rsp+mmsize*0]
+ mova m2, [rsp+mmsize*1]
+ mova m3, [rsp+mmsize*2]
+ mova m4, [rsp+mmsize*3]
+ mova m5, [rsp+mmsize*4]
+ mova m6, [rsp+mmsize*5]
+ mova m7, [rsp+mmsize*6]
+ mova m8, [rsp+mmsize*7]
+
+ paddsw m0, m1, m21
+ psubsw m21, m1, m21
+ paddsw m1, m2, m20
+ psubsw m20, m2, m20
+ paddsw m2, m3, m19
+ psubsw m19, m3, m19
+ paddsw m3, m4, m18
+ psubsw m18, m4, m18
+ paddsw m4, m5, m17
+ psubsw m17, m5, m17
+ paddsw m5, m6, m16
+ psubsw m16, m6, m16
+ paddsw m6, m7, m15
+ psubsw m15, m7, m15
+ paddsw m7, m8, m14
+ psubsw m14, m8, m14
+
+ vpermq m8, m30, m0
+ vpermq m9, m31, m1
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m2
+ vpermq m9, m31, m3
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m4
+ vpermq m9, m31, m5
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m6
+ vpermq m9, m31, m7
+ call m(idct_16x8_internal_10bpc).write_16x4
+
+ vpermq m8, m30, m14
+ vpermq m9, m31, m15
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m16
+ vpermq m9, m31, m17
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m18
+ vpermq m9, m31, m19
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m20
+ vpermq m9, m31, m21
+ call m(idct_16x8_internal_10bpc).write_16x4
+
+ vpermq m8, m30, m22
+ vpermq m9, m31, m23
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m24
+ vpermq m9, m31, m25
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m26
+ vpermq m9, m31, m27
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m28
+ vpermq m9, m31, m29
+ call m(idct_16x8_internal_10bpc).write_16x4
+ RET
+.pass1:
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 2]
+ mova m2, [cq+128* 4]
+ mova m3, [cq+128* 6]
+ mova m4, [cq+128* 8]
+ mova m5, [cq+128*10]
+ mova m6, [cq+128*12]
+ mova m7, [cq+128*14]
+ call m(idct_8x16_internal_10bpc).main
+ mova m16, [cq+128* 1]
+ mova m17, [cq+128* 3]
+ mova m18, [cq+128* 5]
+ mova m19, [cq+128* 7]
+ mova m20, [cq+128* 9]
+ mova m21, [cq+128*11]
+ mova m22, [cq+128*13]
+ mova m23, [cq+128*15]
+ call m(idct_16x16_internal_10bpc).main
+ call m(idct_16x16_internal_10bpc).main_end
+ jmp m(idct_16x16_internal_10bpc).main_end3
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 64
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 4, 7, 32, -64*40, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ cmp eobd, 136
+ jl .fast
+ add cq, 64
+ cmp eobd, 543
+ jge .full
+ call .pass1_fast ; bottomright 16x16 zero
+ jmp .lefthalf
+.full:
+ call .pass1
+ mov r3d, 16*28
+.lefthalf:
+ mova [cq+128* 0], m27
+ mova [cq+128* 1], m14
+ mova [cq+128* 2], m28
+ mova [cq+128* 3], m15
+ mova [cq+128* 4], m22
+ mova [cq+128* 5], m23
+ mova [cq+128* 6], m24
+ mova [cq+128* 7], m25
+ mova [cq+128* 8], m0
+ mova [cq+128* 9], m26
+ mova [cq+128*10], m20
+ mova [cq+128*11], m21
+ mova [cq+128*12], m18
+ mova [cq+128*13], m16
+ mova [cq+128*14], m17
+ mova [cq+128*15], m3
+ sub cq, 64
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ call .pass1
+ call .pass2_start
+
+ pxor m31, m31
+.right_zero_loop:
+ REPX {mova [cq+r3*8+64+128*x], m31}, 0, 1, 2, 3
+ sub r3d, 16*4
+ jge .right_zero_loop
+ mov r3d, 16*28
+ jmp .left_zero_loop
+.pass2_start:
+ vpbroadcastd m10, [o(pd_2048)]
+ lea r5, [o_base_8bpc]
+
+ lea r4, [rsp+gprsize]
+ mova m1, [cq+128*15+64]
+ mova m2, [cq+128* 8+64]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ mova m0, m21
+ mova m1, [cq+128*12+64]
+ mova m2, [cq+128*11+64]
+ mova m3, m18
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ mova m0, m20
+ mova m1, [cq+128*13+64]
+ mova m2, [cq+128*10+64]
+ mova m3, m16
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ mova m0, m26
+ mova m1, [cq+128*14+64]
+ mova m2, [cq+128* 9+64]
+ mova m3, m17
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+
+ mova m0, m27
+ mova m1, m28
+ mova m2, [cq+128* 0+64]
+ mova m3, [cq+128* 2+64]
+ mova m16, [cq+128* 1+64]
+ mova m17, [cq+128* 3+64]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ mova m26, [cq+128* 4+64]
+ mova m27, [cq+128* 5+64]
+ mova m28, [cq+128* 6+64]
+ mova m29, [cq+128* 7+64]
+ mova [rsp+64*32+gprsize], m14
+ mova [rsp+64*33+gprsize], m15
+ mova [rsp+64*34+gprsize], m16
+ mova [rsp+64*35+gprsize], m17
+ mova [rsp+64*36+gprsize], m18
+ mova [rsp+64*37+gprsize], m19
+ mova [rsp+64*38+gprsize], m20
+ mova [rsp+64*39+gprsize], m21
+ jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+.fast: ; topleft 16x16 nonzero
+ cmp eobd, 36
+ jl .fast2
+ call .pass1_fast
+ vpbroadcastd m10, [o(pd_2048)]
+ call .pass2_fast_start
+ jmp .end
+.fast2: ; topleft 8x8 nonzero
+ movshdup m7, [o(permB)]
+ mova ym0, [cq+128*0]
+ mova ym1, [cq+128*4]
+ mova ym4, [cq+128*2]
+ mova ym5, [cq+128*6]
+ mova ym16, [cq+128*1]
+ mova ym2, [cq+128*5]
+ mova ym3, [cq+128*3]
+ mova ym17, [cq+128*7]
+ mov r3d, 16*4
+ vpermq m0, m7, m0 ; 0 0
+ vpermq m1, m7, m1 ; 4 4
+ vpermt2q m4, m7, m5 ; 2 6
+ vpermt2q m16, m7, m2 ; 1 5
+ vpermt2q m17, m7, m3 ; 7 3
+ REPX {pmulld x, m12}, m0, m1, m4, m16, m17
+ REPX {paddd x, m13}, m0, m1, m4, m16, m17
+ REPX {psrad x, 12 }, m0, m1, m4, m16, m17
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
+ vpbroadcastd m11, [o(pd_1)]
+ call m(idct_16x16_internal_10bpc).main_end2
+
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
+ punpcklqdq m27, m0, m2 ; 0
+ punpckhqdq m0, m2 ; 1
+ punpcklqdq m22, m3, m4 ; 2
+ punpckhqdq m26, m3, m4 ; 3
+ punpcklqdq m14, m5, m7 ; 4
+ punpckhqdq m20, m5, m7 ; 5
+ punpcklqdq m23, m6, m8 ; 6
+ punpckhqdq m21, m6, m8 ; 7
+
+ mova m10, m13
+ call .pass2_fast2_start
+.end:
+
+ pxor m31, m31
+
+.left_zero_loop:
+ REPX {mova [cq+r3*8+128*x], m31}, 0, 1, 2, 3
+ sub r3d, 16*4
+ jge .left_zero_loop
+
+ call .pass2_end
+ RET
+.pass2_end:
+ DEFINE_ARGS dst, stride, _, dst2, stride32, stklo, stkhi
+ vpbroadcastd m30, [pixel_10bpc_max]
+ vpbroadcastd m13, [pw_2048]
+
+ mov stride32q, strideq
+ shl stride32q, 5
+ lea stkhiq, [rsp+31*mmsize+gprsize]
+ lea dst2q, [dstq+stride32q]
+ lea stkloq, [rsp+gprsize]
+ sub dst2q, strideq ; dst31
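+    ; each sum/sub pair below, together with .end_sumsub_write, produces
+    ; four output rows: n, 31-n, 32+n and 63-n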
+
+ paddsw m8, m0, m29 ; t0[idct32]
+ psubsw m9, m0, m29 ; t31[idct32]
+ call .end_sumsub_write
+ paddsw m8, m1, m28 ; t1[idct32]
+ psubsw m9, m1, m28 ; t30[idct32]
+ call .end_sumsub_write
+ paddsw m8, m2, m27 ; t2[idct32]
+ psubsw m9, m2, m27 ; t29[idct32]
+ call .end_sumsub_write
+ paddsw m8, m3, m26 ; t3[idct32]
+ psubsw m9, m3, m26 ; t28[idct32]
+ call .end_sumsub_write
+ paddsw m8, m4, m25 ; t4[idct32]
+ psubsw m9, m4, m25 ; t27[idct32]
+ call .end_sumsub_write
+ paddsw m8, m5, m24 ; t5[idct32]
+ psubsw m9, m5, m24 ; t26[idct32]
+ call .end_sumsub_write
+ paddsw m8, m6, m23 ; t6[idct32]
+ psubsw m9, m6, m23 ; t25[idct32]
+ call .end_sumsub_write
+ paddsw m8, m7, m22 ; t7[idct32]
+ psubsw m9, m7, m22 ; t24[idct32]
+ call .end_sumsub_write
+ mova m0, [rsp+64*32+gprsize]
+ mova m1, [rsp+64*33+gprsize]
+ mova m2, [rsp+64*34+gprsize]
+ mova m3, [rsp+64*35+gprsize]
+ mova m4, [rsp+64*36+gprsize]
+ mova m5, [rsp+64*37+gprsize]
+ mova m6, [rsp+64*38+gprsize]
+ mova m7, [rsp+64*39+gprsize]
+ paddsw m8, m0, m21 ; t8[idct32]
+ psubsw m9, m0, m21 ; t23[idct32]
+ call .end_sumsub_write
+ paddsw m8, m1, m20 ; t9[idct32]
+ psubsw m9, m1, m20 ; t22[idct32]
+ call .end_sumsub_write
+ paddsw m8, m2, m19 ; t10[idct32]
+ psubsw m9, m2, m19 ; t21[idct32]
+ call .end_sumsub_write
+ paddsw m8, m3, m18 ; t11[idct32]
+ psubsw m9, m3, m18 ; t20[idct32]
+ call .end_sumsub_write
+ paddsw m8, m4, m17 ; t12[idct32]
+ psubsw m9, m4, m17 ; t19[idct32]
+ call .end_sumsub_write
+ paddsw m8, m5, m16 ; t13[idct32]
+ psubsw m9, m5, m16 ; t18[idct32]
+ call .end_sumsub_write
+ paddsw m8, m6, m15 ; t14[idct32]
+ psubsw m9, m6, m15 ; t17[idct32]
+ call .end_sumsub_write
+ paddsw m8, m7, m14 ; t15[idct32]
+ psubsw m9, m7, m14 ; t16[idct32]
+ ; fall-through
+.end_sumsub_write:
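+    ; combine the idct32 half (m8/m9) with the saved idct64 tail terms
+    ; (t32+n / t63-n), round, add to the destination and clamp to the
+    ; valid pixel range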
+ mova m10, [stkhiq] ; t63-n
+ mova m12, [stkloq] ; t32+n
+ psubsw m11, m8, m10 ; out63-n
+ paddsw m8, m10 ; out0 +n
+ psubsw m10, m9, m12 ; out32+n
+ paddsw m9, m12 ; out32-n
+ REPX {pmulhrsw x, m13}, m11, m8, m10, m9
+ paddw m8, [dstq]
+ paddw m9, [dst2q]
+ paddw m10, [dstq+stride32q]
+ paddw m11, [dst2q+stride32q]
+ REPX {pminsw x, m30}, m11, m8, m10, m9
+ REPX {pmaxsw x, m31}, m11, m8, m10, m9
+ mova [dstq ], m8
+ mova [dst2q ], m9
+ mova [dstq +stride32q], m10
+ mova [dst2q+stride32q], m11
+ add stkloq, mmsize
+ sub stkhiq, mmsize
+ add dstq, strideq
+ sub dst2q, strideq
+ ret
+.pass2_fast_start:
+ lea r5, [o_base_8bpc]
+ lea r4, [rsp+gprsize]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ mova m0, m21
+ mova m3, m18
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ mova m0, m20
+ mova m3, m16
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ mova m0, m26
+ mova m3, m17
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+
+ mova m0, m27
+ mova m1, m28
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
+ mova [rsp+64*32+gprsize], m14
+ mova [rsp+64*33+gprsize], m15
+ mova [rsp+64*34+gprsize], m16
+ mova [rsp+64*35+gprsize], m17
+ mova [rsp+64*36+gprsize], m18
+ mova [rsp+64*37+gprsize], m19
+ mova [rsp+64*38+gprsize], m20
+ mova [rsp+64*39+gprsize], m21
+ jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
+.pass2_fast2_start:
+ lea r5, [o_base_8bpc]
+ lea r4, [rsp+gprsize]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2
+ mova m0, m21
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2
+ mova m0, m20
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2
+ mova m0, m26
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+
+ mova m0, m27
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast3
+ mova [rsp+64*32+gprsize], m14
+ mova [rsp+64*33+gprsize], m15
+ mova [rsp+64*34+gprsize], m16
+ mova [rsp+64*35+gprsize], m17
+ mova [rsp+64*36+gprsize], m18
+ mova [rsp+64*37+gprsize], m19
+ mova [rsp+64*38+gprsize], m20
+ mova [rsp+64*39+gprsize], m21
+ jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast3
+.dconly:
+ DEFINE_ARGS dst, stride, c, eob
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly3
+.pass1_fast:
+ pmulld m0, m12, [cq+128* 0]
+ pmulld m1, m12, [cq+128* 4]
+ pmulld m2, m12, [cq+128* 8]
+ pmulld m3, m12, [cq+128*12]
+ mov r3d, 16*12
+ call m(idct_8x16_internal_10bpc).main_fast_rect2
+ pmulld m16, m12, [cq+128* 2]
+ pmulld m17, m12, [cq+128* 6]
+ pmulld m18, m12, [cq+128*10]
+ pmulld m19, m12, [cq+128*14]
+ call m(idct_16x16_internal_10bpc).main_fast_rect2
+ call .pass1_load_spill
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2
+ jmp .pass1_end
+.pass1:
+ pmulld m0, m12, [cq+128* 0]
+ pmulld m1, m12, [cq+128* 4]
+ pmulld m2, m12, [cq+128* 8]
+ pmulld m3, m12, [cq+128*12]
+ pmulld m4, m12, [cq+128*16]
+ pmulld m5, m12, [cq+128*20]
+ pmulld m6, m12, [cq+128*24]
+ pmulld m7, m12, [cq+128*28]
+ call m(idct_8x16_internal_10bpc).main_rect2
+ pmulld m16, m12, [cq+128* 2]
+ pmulld m17, m12, [cq+128* 6]
+ pmulld m18, m12, [cq+128*10]
+ pmulld m19, m12, [cq+128*14]
+ pmulld m20, m12, [cq+128*18]
+ pmulld m21, m12, [cq+128*22]
+ pmulld m22, m12, [cq+128*26]
+ pmulld m23, m12, [cq+128*30]
+ call m(idct_16x16_internal_10bpc).main_rect2
+ call .pass1_load_spill
+ pmulld m16, m12, [cq+128*17]
+ pmulld m17, m12, [cq+128*19]
+ pmulld m18, m12, [cq+128*21]
+ pmulld m19, m12, [cq+128*23]
+ pmulld m20, m12, [cq+128*25]
+ pmulld m21, m12, [cq+128*27]
+ pmulld m22, m12, [cq+128*29]
+ pmulld m23, m12, [cq+128*31]
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main_rect2
+.pass1_end:
+ vpbroadcastd m11, [o(pd_1)]
+ lea r4, [cq+128*8]
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end
+ punpcklqdq m27, m0, m20 ; 0
+ punpckhqdq m0, m20 ; 1
+ punpcklqdq m24, m5, m16 ; 10
+ punpckhqdq m16, m5, m16 ; 11
+ punpcklqdq m23, m3, m21 ; 6
+ punpckhqdq m21, m3, m21 ; 7
+ punpcklqdq m25, m7, m8 ; 14
+ punpckhqdq m3, m7, m8 ; 15
+ punpcklqdq m22, m15, m4 ; 2
+ punpckhqdq m26, m15, m4 ; 3
+ punpcklqdq m15, m6, m17 ; 12
+ punpckhqdq m17, m6, m17 ; 13
+ punpcklqdq m28, m14, m18 ; 8
+ punpckhqdq m18, m14, m18 ; 9
+ punpcklqdq m14, m2, m1 ; 4
+ punpckhqdq m20, m2, m1 ; 5
+ ret
+.pass1_load_spill:
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
+ mova [cq+128* 0], m0
+ pmulld m0, m12, [cq+128* 1]
+ mova [cq+128* 1], m1
+ mova [cq+128* 2], m2
+ pmulld m1, m12, [cq+128* 3]
+ pmulld m2, m12, [cq+128* 5]
+ mova [cq+128* 3], m3
+ mova [cq+128* 4], m4
+ pmulld m3, m12, [cq+128* 7]
+ pmulld m4, m12, [cq+128* 9]
+ mova [cq+128* 5], m5
+ mova [cq+128* 6], m6
+ mova [cq+128* 7], m7
+ pmulld m5, m12, [cq+128*11]
+ pmulld m6, m12, [cq+128*13]
+ pmulld m7, m12, [cq+128*15]
+ mova [cq+128* 8], m23
+ mova [cq+128* 9], m22
+ mova [cq+128*10], m21
+ mova [cq+128*11], m20
+ mova [cq+128*12], m19
+ mova [cq+128*13], m18
+ mova [cq+128*14], m17
+ mova [cq+128*15], m16
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+
+ PROLOGUE 4, 7, 32, -64*32, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ cmp eobd, 36
+ jl .fast ; 8x8
+ cmp eobd, 151
+ jge .full ; 16x16
+ lea r4, [idct64_mul_16bpc]
+ lea r6, [rsp+4*64]
+ mova m0, [cq+64* 1]
+ mova m3, [cq+64*15]
+ call .main_part1_fast
+ mova m0, [cq+64* 7]
+ mova m3, [cq+64* 9]
+ call .main_part1_fast
+ mova m0, [cq+64* 5]
+ mova m3, [cq+64*11]
+ call .main_part1_fast
+ mova m0, [cq+64* 3]
+ mova m3, [cq+64*13]
+ call .main_part1_fast
+ call .main_part2
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 8]
+ mova m16, [cq+64* 4]
+ mova m17, [cq+64*12]
+ call m(idct_8x16_internal_10bpc).main_fast2
+ call m(idct_16x16_internal_10bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
+ call .pass1_load_spill
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2
+ mov r6d, 12*8
+ jmp .idct64_end
+.full:
+ lea r4, [idct64_mul_16bpc]
+ lea r6, [rsp+4*64]
+ mova m0, [cq+64* 1]
+ mova m1, [cq+64*31]
+ mova m2, [cq+64*17]
+ mova m3, [cq+64*15]
+ call .main_part1
+ mova m0, [cq+64* 7]
+ mova m1, [cq+64*25]
+ mova m2, [cq+64*23]
+ mova m3, [cq+64* 9]
+ call .main_part1
+ mova m0, [cq+64* 5]
+ mova m1, [cq+64*27]
+ mova m2, [cq+64*21]
+ mova m3, [cq+64*11]
+ call .main_part1
+ mova m0, [cq+64* 3]
+ mova m1, [cq+64*29]
+ mova m2, [cq+64*19]
+ mova m3, [cq+64*13]
+ call .main_part1
+ call .main_part2
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 8]
+ mova m2, [cq+64*16]
+ mova m3, [cq+64*24]
+ mova m16, [cq+64* 4]
+ mova m17, [cq+64*12]
+ mova m18, [cq+64*20]
+ mova m19, [cq+64*28]
+ call m(idct_8x16_internal_10bpc).main_fast
+ call m(idct_16x16_internal_10bpc).main_fast
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
+ call .pass1_load_spill
+ mova m4, [cq+64*18]
+ mova m5, [cq+64*22]
+ mova m6, [cq+64*26]
+ mova m7, [cq+64*30]
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
+ mov r6d, 28*8
+ jmp .idct64_end
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 16
+.dconly1:
+ add r6d, 640
+ sar r6d, 10
+.dconly2:
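+    ; broadcast the rounded DC value and add it to each output row;
+    ; the paddsw/psubusw pair clips to the valid 10 bpc pixel range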
+ vpbroadcastd m3, [o(dconly_10bpc)]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ vpbroadcastw m2, r6d
+ paddsw m2, m3
+.dconly_loop:
+ paddsw m0, m2, [dstq+64*0]
+ paddsw m1, m2, [dstq+64*1]
+ psubusw m0, m3
+ psubusw m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ ret
+.pass1_load_spill:
+ mova [cq+64* 0], m0
+ mova m0, [cq+64* 2]
+ mova [cq+64* 2], m1
+ mova m1, [cq+64* 6]
+ mova [cq+64* 4], m2
+ mova [cq+64* 6], m3
+ mova m2, [cq+64*10]
+ mova m3, [cq+64*14]
+ mova [cq+64* 8], m4
+ mova [cq+64*10], m5
+ mova [cq+64*12], m6
+ mova [cq+64*14], m7
+ mova [cq+64* 1], m23
+ mova [cq+64* 3], m22
+ mova [cq+64* 5], m21
+ mova [cq+64* 7], m20
+ mova [cq+64* 9], m19
+ mova [cq+64*11], m18
+ mova [cq+64*13], m17
+ mova [cq+64*15], m16
+ ret
+ALIGN function_align
+.main_part1_fast_rect2:
+ REPX {paddd x, m13}, m0, m3
+ REPX {psrad x, 12 }, m0, m3
+.main_part1_fast:
+ pmulld m7, m0, [r4+4*0]{bcstd} ; t63a
+ pmulld m0, [r4+4*1]{bcstd} ; t32a
+ pmulld m4, m3, [r4+4*6]{bcstd} ; t60a
+ pmulld m3, [r4+4*7]{bcstd} ; t35a
+ vpbroadcastd m10, [r4+4*8]
+ vpbroadcastd m11, [r4+4*9]
+ REPX {paddd x, m13}, m7, m0, m4, m3
+ REPX {psrad x, 12 }, m7, m0, m4, m3
+ mova m8, m0
+ mova m1, m7
+ mova m6, m3
+ mova m2, m4
+ jmp .main_part1b
+.main_part1_rect2:
+ REPX {paddd x, m13}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_part1: ; idct64 steps 1-5
+ ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+ pmulld m7, m0, [r4+4*0]{bcstd} ; t63a
+ pmulld m0, [r4+4*1]{bcstd} ; t32a
+ pmulld m6, m1, [r4+4*2]{bcstd} ; t62a
+ pmulld m1, [r4+4*3]{bcstd} ; t33a
+ pmulld m5, m2, [r4+4*4]{bcstd} ; t61a
+ pmulld m2, [r4+4*5]{bcstd} ; t34a
+ pmulld m4, m3, [r4+4*6]{bcstd} ; t60a
+ pmulld m3, [r4+4*7]{bcstd} ; t35a
+ vpbroadcastd m10, [r4+4*8]
+ vpbroadcastd m11, [r4+4*9]
+ REPX {paddd x, m13}, m7, m0, m6, m1, m5, m2, m4, m3
+ REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
+ psubd m8, m0, m1 ; t33
+ paddd m0, m1 ; t32
+ psubd m1, m7, m6 ; t62
+ paddd m7, m6 ; t63
+ psubd m6, m3, m2 ; t34
+ paddd m3, m2 ; t35
+ psubd m2, m4, m5 ; t61
+ paddd m4, m5 ; t60
+.main_part1b:
+ REPX {pmaxsd x, m14}, m8, m1, m6, m2
+ REPX {pminsd x, m15}, m8, m1, m6, m2
+ ITX_MULSUB_2D 1, 8, 5, 9, _, 13, 10, 11 ; t33a, t62a
+ ITX_MULSUB_2D 2, 6, 5, 9, _, 13, 10, 11, 2 ; t61a, t34a
+ REPX {pmaxsd x, m14}, m0, m3, m7, m4
+ REPX {pminsd x, m15}, m0, m3, m7, m4
+ vpbroadcastd m10, [r4+4*10]
+ vpbroadcastd m11, [r4+4*11]
+ psubd m5, m0, m3 ; t35a
+ paddd m0, m3 ; t32a
+ psubd m3, m7, m4 ; t60a
+ paddd m7, m4 ; t63a
+ psubd m4, m1, m6 ; t34
+ paddd m1, m6 ; t33
+ psubd m6, m8, m2 ; t61
+ paddd m8, m2 ; t62
+ REPX {pmaxsd x, m14}, m5, m3, m4, m6
+ REPX {pminsd x, m15}, m5, m3, m4, m6
+ ITX_MULSUB_2D 3, 5, 2, 9, _, 13, 10, 11 ; t35, t60
+ ITX_MULSUB_2D 6, 4, 2, 9, _, 13, 10, 11 ; t34a, t61a
+ REPX {pmaxsd x, m14}, m0, m7, m1, m8
+ REPX {pminsd x, m15}, m0, m7, m1, m8
+ add r4, 4*12
+ mova [r6-64*4], m0
+ mova [r6+64*3], m7
+ mova [r6-64*3], m1
+ mova [r6+64*2], m8
+ mova [r6-64*2], m6
+ mova [r6+64*1], m4
+ mova [r6-64*1], m3
+ mova [r6+64*0], m5
+ add r6, 64*8
+ ret
+.main_part2: ; idct64 steps 6-9
+ lea r4, [r6+64*3]
+ sub r6, 64*4
+ vpbroadcastd m10, [pd_1567]
+ vpbroadcastd m11, [pd_3784]
+.main_part2_loop:
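+    ; each iteration combines one group of the t32..t63 terms written by
+    ; main_part1, with r6 walking forward and r4 walking backward through
+    ; the scratch buffer until the two pointers meet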
+ mova m0, [r6-64*32] ; t32a
+ mova m1, [r4-64*24] ; t39a
+ mova m2, [r4-64*32] ; t63a
+ mova m3, [r6-64*24] ; t56a
+ mova m4, [r6-64*16] ; t40a
+ mova m5, [r4-64* 8] ; t47a
+ mova m6, [r4-64*16] ; t55a
+ mova m7, [r6-64* 8] ; t48a
+ psubd m8, m0, m1 ; t39
+ paddd m0, m1 ; t32
+ psubd m1, m2, m3 ; t56
+ paddd m2, m3 ; t63
+ psubd m3, m5, m4 ; t40
+ paddd m5, m4 ; t47
+ psubd m4, m7, m6 ; t55
+ paddd m7, m6 ; t48
+ REPX {pmaxsd x, m14}, m8, m1, m3, m4
+ REPX {pminsd x, m15}, m8, m1, m3, m4
+ ITX_MULSUB_2D 1, 8, 6, 9, _, 13, 10, 11 ; t39a, t56a
+ ITX_MULSUB_2D 4, 3, 6, 9, _, 13, 10, 11, 2 ; t55a, t40a
+ REPX {pmaxsd x, m14}, m0, m2, m5, m7
+ REPX {pminsd x, m15}, m0, m5, m2, m7
+ psubd m6, m2, m7 ; t48a
+ paddd m2, m7 ; t63a
+ psubd m7, m0, m5 ; t47a
+ paddd m0, m5 ; t32a
+ psubd m5, m8, m4 ; t55
+ paddd m8, m4 ; t56
+ psubd m4, m1, m3 ; t40
+ paddd m1, m3 ; t39
+ REPX {pmaxsd x, m14}, m6, m7, m5, m4
+ REPX {pminsd x, m15}, m6, m7, m5, m4
+ REPX {pmulld x, m12}, m6, m7, m5, m4
+ REPX {pmaxsd x, m14}, m2, m0, m8, m1
+ REPX {pminsd x, m15}, m2, m0, m8, m1
+ paddd m6, m13
+ paddd m5, m13
+ psubd m3, m6, m7 ; t47
+ paddd m6, m7 ; t48
+ psubd m7, m5, m4 ; t40a
+ paddd m5, m4 ; t55a
+ REPX {psrad x, 12}, m3, m6, m7, m5
+ mova [r4-64* 8], m2
+ mova [r6-64*32], m0
+ mova [r6-64* 8], m8
+ mova [r4-64*32], m1
+ mova [r4-64*24], m3
+ mova [r6-64*16], m6
+ mova [r6-64*24], m7
+ mova [r4-64*16], m5
+ add r6, 64
+ sub r4, 64
+ cmp r6, r4
+ jl .main_part2_loop
+ ret
+.idct64_main_end:
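+    ; merge the idct32 outputs saved in cq/r4 with the idct64 t32..t63
+    ; terms kept on the stack (r3), then pack the 32-bit results to words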
+%macro IDCT64_PASS1_END 9
+ mova m%5, [%9+%1*128] ; t0+n [idct32] + idct64 rounding
+ psubd m%6, m%5, m%2 ; out31-n [idct32] = t31-n [idct64]
+ paddd m%5, m%2 ; out0+n [idct32] = t0+n [idct64]
+ REPX {pmaxsd x, m14}, m%6, m%5
+ REPX {pminsd x, m15}, m%6, m%5
+ REPX {paddd x, m11}, m%6, m%5
+ mova m%2, [r3+%3*64] ; t32+n [idct64]
+ mova m%7, [r3+%4*64] ; t63-n [idct64]
+ psubd m%8, m%5, m%7 ; out63-n
+ paddd m%5, m%7 ; out0+n
+ psubd m%7, m%6, m%2 ; out32+n
+ paddd m%6, m%2 ; out31-n
+ REPX {vpsravd x, m11}, m%8, m%5, m%7, m%6
+%endmacro
+
+%macro IDCT64_PASS1_ENDx4 1
+%assign %%m1 %1 ; t32+n
+%assign %%m2 (7-%1) ; t39-n
+%assign %%m3 (8+%1) ; t40+n
+%assign %%m4 (15-%1) ; t47-n
+%assign %%m5 (16+%1) ; t48+n
+%assign %%m6 (23-%1) ; t55-n
+%assign %%m7 (24+%1) ; t56+n
+%assign %%m8 (31-%1) ; t63-n
+
+%assign %%r1 %1 ; t16+n
+%assign %%r2 (7-%1) ; t23-n
+%assign %%r3 (16+%1) ; t24-n
+%assign %%r4 (23-%1) ; t31-n
+
+%assign %%c1 (%1) ; t0/8+n
+%assign %%c2 (7-%1) ; t7/15-n
+
+ IDCT64_PASS1_END %%c1, %%r4, %%m1, %%m8, 24, 25, 26, 27, cq ; out0/31/32/63
+ IDCT64_PASS1_END %%c1, %%r1, %%m4, %%m5, 28, 29, 30, 31, r4 ; out15/16/47/48
+ packssdw m %+ %%r1, m24, m29
+ packssdw m %+ %%r4, m28, m25
+ packssdw m26, m31
+ packssdw m30, m27
+ mova [r3+%%m5*mmsize], m26
+ mova [r3+%%m8*mmsize], m30
+ IDCT64_PASS1_END %%c2, %%r3, %%m2, %%m7, 24, 25, 26, 27, cq ; out7/24/39/56
+ IDCT64_PASS1_END %%c2, %%r2, %%m3, %%m6, 28, 29, 30, 31, r4 ; out8/23/40/55
+ packssdw m %+ %%r2, m24, m29
+ packssdw m %+ %%r3, m28, m25
+ packssdw m26, m31
+ packssdw m30, m27
+ mova [r3+%%m6*mmsize], m26
+ mova [r3+%%m7*mmsize], m30
+%endmacro
+ IDCT64_PASS1_ENDx4 0
+ IDCT64_PASS1_ENDx4 1
+ IDCT64_PASS1_ENDx4 2
+ IDCT64_PASS1_ENDx4 3
+ ret
+.idct64_end:
+ vpbroadcastd m11, [o(pd_2)]
+ lea r4, [cq+64]
+ mov r3, rsp
+ lea r5, [o_base_8bpc]
+ call .idct64_main_end
+
+ pxor m12, m12
+.zero_loop:
+ REPX {mova [cq+r6*8+64*x], m12}, 0, 1, 2, 3
+ sub r6d, 8*4
+ jge .zero_loop
+
+ lea r3, [strideq*3]
+ mov r4, dstq
+ call .pass2
+ mova m0, [rsp+16*mmsize]
+ mova m1, [rsp+17*mmsize]
+ mova m2, [rsp+18*mmsize]
+ mova m3, [rsp+19*mmsize]
+ mova m4, [rsp+20*mmsize]
+ mova m5, [rsp+21*mmsize]
+ mova m6, [rsp+22*mmsize]
+ mova m7, [rsp+23*mmsize]
+ mova m16, [rsp+24*mmsize]
+ mova m17, [rsp+25*mmsize]
+ mova m18, [rsp+26*mmsize]
+ mova m19, [rsp+27*mmsize]
+ mova m20, [rsp+28*mmsize]
+ mova m21, [rsp+29*mmsize]
+ mova m22, [rsp+30*mmsize]
+ mova m23, [rsp+31*mmsize]
+ lea dstq, [r4+64]
+ call .pass2
+ RET
+.pass2:
+ psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11
+ psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
+
+ punpckhqdq m19, m5, m16 ; 11
+ punpcklqdq m5, m16 ; 10
+ punpckhqdq m16, m2, m1 ; 5
+ punpcklqdq m2, m1 ; 4
+ punpcklqdq m1, m15, m4 ; 2
+ punpckhqdq m15, m4 ; 3
+ punpcklqdq m4, m14, m18 ; 8
+ punpckhqdq m18, m14, m18 ; 9
+ punpckhqdq m14, m0, m20 ; 1
+ punpcklqdq m0, m20 ; 0
+ punpckhqdq m20, m6, m17 ; 13
+ punpcklqdq m6, m17 ; 12
+ punpckhqdq m17, m3, m21 ; 7
+ punpcklqdq m3, m21 ; 6
+ punpckhqdq m21, m7, m8 ; 15
+ punpcklqdq m7, m8 ; 14
+
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+.write:
+ vpbroadcastd m11, [pw_2048]
+ pxor m12, m12
+ vpbroadcastd m13, [pixel_10bpc_max]
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8
+ pmulhrsw m0, m11, m14
+ pmulhrsw m1, m11, m15
+ pmulhrsw m2, m11, m16
+ pmulhrsw m3, m11, m17
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+ pmulhrsw m0, m11, m18
+ pmulhrsw m1, m11, m19
+ pmulhrsw m2, m11, m20
+ pmulhrsw m3, m11, m21
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+.fast: ; 8x8 packed
+ movshdup m7, [o(permB)]
+ mova ym0, [cq+64*1]
+ mova ym2, [cq+64*5]
+ mova ym3, [cq+64*3]
+ mova ym1, [cq+64*7]
+ vpermt2q m0, m7, m2 ; 1 5
+ vpermt2q m1, m7, m3 ; 7 3
+ call .main_oddhalf_packed
+ mova [rsp+ 0*mmsize], m0
+ mova [rsp+ 1*mmsize], m1
+ mova [rsp+ 2*mmsize], m2
+ mova [rsp+ 3*mmsize], m3
+ mova [rsp+ 4*mmsize], m4
+ mova [rsp+ 5*mmsize], m5
+ mova [rsp+ 6*mmsize], m6
+ mova [rsp+ 7*mmsize], m7
+ mova [rsp+ 8*mmsize], m16
+ mova [rsp+ 9*mmsize], m17
+ mova [rsp+10*mmsize], m18
+ mova [rsp+11*mmsize], m19
+ mova [rsp+12*mmsize], m20
+ mova [rsp+13*mmsize], m21
+ mova [rsp+14*mmsize], m22
+ mova [rsp+15*mmsize], m23
+
+ movshdup m7, [o(permB)]
+ mova ym0, [cq+64*0]
+ mova ym4, [cq+64*4]
+ mova ym16, [cq+64*2]
+ mova ym5, [cq+64*6]
+ vpermt2q m16, m7, m5 ; 2 6
+ vpermq m0, m7, m0 ; 0 0
+ vpermq m4, m7, m4 ; 4 4
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
+ ; m0-7,9,16-22 contain un-sumsub'ed dct32 output data
+
+ ; zero input coefs
+ pxor m12, m12
+ REPX {mova [cq+x*64], ym12}, 0, 1, 2, 3, 4, 5, 6, 7
+
+ vpbroadcastd m11, [o(pd_2)]
+ call .main_end
+ lea r3, [strideq*3]
+ mov r4, dstq
+ call .pass2_fast
+ mova m0, m24
+ mova m1, m25
+ mova m2, m26
+ mova m3, m27
+ mova m4, m28
+ mova m5, m29
+ mova m6, m30
+ mova m7, m31
+ lea dstq, [r4+64]
+ lea r5, [o_base]
+ call .pass2_fast
+ RET
+.pass2_fast:
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
+ lea r5, [o_base_8bpc]
+ punpckhqdq m14, m0, m2 ; 1
+ punpcklqdq m0, m2 ; 0
+ punpcklqdq m1, m3, m4 ; 2
+ punpckhqdq m15, m3, m4 ; 3
+ punpcklqdq m2, m5, m7 ; 4
+ punpckhqdq m16, m5, m7 ; 5
+ punpcklqdq m3, m6, m8 ; 6
+ punpckhqdq m17, m6, m8 ; 7
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ jmp .write
+.main_end:
+
+%macro IDCT64_PASS1_PACKED_END 7
+ psubd m%5, m%1, m%2 ; out31-n [idct32] = t31-n [idct64]
+ paddd m%1, m%2 ; out0+n [idct32] = t0+n [idct64]
+ REPX {pmaxsd x, m14}, m%5, m%1
+ REPX {pminsd x, m15}, m%5, m%1
+ REPX {paddd x, m11}, m%5, m%1
+ mova m%2, [rsp+%6*64+gprsize] ; t32+n [idct64]
+ mova m%3, [rsp+%7*64+gprsize] ; t63-n [idct64]
+ psubd m%4, m%1, m%3 ; out63-n
+ paddd m%1, m%3 ; out0+n
+ psubd m%3, m%5, m%2 ; out32+n
+ paddd m%2, m%5 ; out31-n
+ REPX {vpsravd x, m11}, m%4, m%1, m%3, m%2
+%endmacro
+
+ IDCT64_PASS1_PACKED_END 0, 22, 24, 10, 12, 0, 15 ; out0/1,31/30,32/33,63/62
+ IDCT64_PASS1_PACKED_END 7, 9, 31, 13, 12, 7, 8 ; out15/14,16/17,47/46,48/49
+ packssdw m0, m9
+ packssdw m7, m22
+ packssdw m24, m13
+ packssdw m31, m10
+ IDCT64_PASS1_PACKED_END 1, 21, 25, 10, 12, 1, 14 ; out3/2,28/29,35/34,60/61
+ IDCT64_PASS1_PACKED_END 6, 16, 30, 13, 12, 6, 9 ; out12/13,19/18,44/45,51/50
+ packssdw m1, m16
+ packssdw m6, m21
+ packssdw m25, m13
+ packssdw m30, m10
+ IDCT64_PASS1_PACKED_END 2, 20, 26, 10, 12, 2, 13 ; out4/5,27/26,36/37,59/58
+ IDCT64_PASS1_PACKED_END 5, 17, 29, 13, 12, 5, 10 ; out11/10,20/21,43/42,52/53
+ packssdw m2, m17
+ packssdw m5, m20
+ packssdw m26, m13
+ packssdw m29, m10
+ IDCT64_PASS1_PACKED_END 3, 19, 27, 10, 12, 3, 12 ; out7/6,24/25,39/38,56/57
+ IDCT64_PASS1_PACKED_END 4, 18, 28, 13, 12, 4, 11 ; out8/9,23/22,40/41,55/54
+ packssdw m3, m18
+ packssdw m4, m19
+ packssdw m27, m13
+ packssdw m28, m10
+ ret
+.main_oddhalf_packed_rect2:
+ REPX {paddd x, m13}, m0, m1
+ REPX {psrad x, 12 }, m0, m1
+.main_oddhalf_packed:
+ ; m0=in1 in5, m1=in7 in3
+ vbroadcasti32x4 m2, [o(pd_101_501)]
+ vbroadcasti32x4 m3, [o(pd_m700_m301)]
+ vbroadcasti32x4 m4, [o(pd_4095_4065)]
+ vbroadcasti32x4 m5, [o(pd_4036_4085)]
+ pmulld m2, m0
+ pmulld m3, m1
+ pmulld m0, m4
+ pmulld m1, m5
+ REPX {paddd x, m13}, m2, m3, m0, m1
+ REPX {psrad x, 12 }, m2, m3, m0, m1
+
+ ; m2=t32a t40a -> t32/33 t40/41, m3=t39a t47a -> t38/39 t46/47
+ ; m0=t63a t55a -> t62/63 t54/55, m1=t56a t48a -> t56/57 t48/49
+ ; end of step 1-2
+
+ vbroadcasti32x4 m10, [o(pd_401_1931)]
+ vbroadcasti32x4 m11, [o(pd_4076_3612)]
+ mova m4, m0
+ mova m5, m2
+ ITX_MULSUB_2D 4, 5, 8, 9, _, 13, 10, 11
+ vbroadcasti32x4 m10, [o(pd_3166_3920)]
+ vbroadcasti32x4 m11, [o(pd_2598_1189)]
+ mova m6, m3
+ mova m7, m1
+ ITX_MULSUB_2D 7, 6, 8, 9, _, 13, 10, 11, 2
+
+ ; m4=t33a t41a -> t41/42 t33/34, m5=t63a t54a -> t61/62 t53/54
+ ; m6=t38a t46a -> t37/38 t45/46, m7=t57a t49a -> t57/58 t49/50
+ ; and from earlier:
+ ; m0=t63 t55 -> t60/63a t52/55a, m1=t56 t48 -> t56/59a t48/51a
+ ; m2=t32 t40 -> t32/35a t40/43a, m3=t39 t47 -> t36/39a t44/47a
+ ; end of step 3-4
+
+ punpcklqdq m22, m2, m4 ; t32a/33 or t35a/34
+ punpcklqdq m21, m3, m6 ; t36a/37 or t39a/38
+ punpckhqdq m18, m2, m4 ; t40a/41 or t43a/42
+ punpckhqdq m17, m3, m6 ; t44a/45 or t47a/46
+ punpckhqdq m6, m1, m7 ; t48a/49 or t51a/50
+ punpckhqdq m19, m0, m5 ; t52a/53 or t55a/54
+ punpcklqdq m8, m1, m7 ; t56a/57 or t59a/58
+ punpcklqdq m23, m0, m5 ; t60a/61 or t63a/62
+ mova m0, m22
+ mova m7, m21
+ mova m3, m18
+ mova m16, m17
+ mova m5, m6
+ mova m4, m19
+ mova m2, m8
+ mova m1, m23
+ ; m0/22/7/21,18/3/17/16,6/5/19/4,2/8/1/23: t32-63[a]
+
+ ; step5
+ vpbroadcastd m10, [o(pd_799)]
+ vpbroadcastd m11, [o(pd_4017)]
+ ITX_MULSUB_2D 1, 22, 20, 9, _, 13, 10, 11 ; t35/34a, t60/61a
+ ITX_MULSUB_2D 8, 7, 20, 9, _, 13, 10, 11, 2 ; t59/58a, t36/37a
+ vpbroadcastd m10, [o(pd_3406)]
+ vpbroadcastd m11, [o(pd_2276)]
+ ITX_MULSUB_2D 19, 3, 20, 9, _, 13, 10, 11 ; t43/42a, t52/53a
+ ITX_MULSUB_2D 5, 17, 20, 9, _, 13, 10, 11, 2 ; t51/50a, t44/45a
+ ; m0-1/7/21: t32-39[a], m18-19/17-16: t40-47[a]
+ ; m6-5/3-4: t48-55[a], m2/8/22-23: t56-63[a]
+
+ ; step6
+ psubd m20, m0, m21 ; t39/38a
+ paddd m0, m21 ; t32/33a
+ psubd m21, m1, m7 ; t36a/37
+ paddd m1, m7 ; t35a/34
+ REPX {pmaxsd x, m14}, m20, m0, m21, m1
+ psubd m7, m16, m18 ; t40/41a
+ paddd m16, m18 ; t47/46a
+ REPX {pminsd x, m15}, m20, m0, m21, m1
+ psubd m18, m17, m19 ; t43a/42
+ paddd m17, m19 ; t44a/45
+ REPX {pmaxsd x, m14}, m7, m16, m18, m17
+ psubd m19, m6, m4 ; t55/54a
+ paddd m6, m4 ; t48/49a
+ REPX {pminsd x, m15}, m7, m16, m18, m17
+ psubd m4, m5, m3 ; t52a/53
+ paddd m5, m3 ; t51a/50
+ REPX {pmaxsd x, m14}, m19, m6, m4, m5
+ psubd m3, m23, m2 ; t56/57a
+ paddd m23, m2 ; t63/62a
+ REPX {pminsd x, m15}, m19, m6, m4, m5
+ psubd m2, m22, m8 ; t59a/58
+ paddd m22, m8 ; t60a/61
+ REPX {pmaxsd x, m14}, m3, m23, m2, m22
+ REPX {pminsd x, m15}, m3, m23, m2, m22
+ ; m0-1: t32-35[a], m17-16: t44-47[a], m6-5: t48-51[a], m22-23: t60-63[a]
+ ; m21-20: t36-39[a], m7/18: t40-43[a], m4/19: t52-55[a], m3-2: t56-59[a]
+
+ ; step7
+ vpbroadcastd m10, [o(pd_1567)]
+ vpbroadcastd m11, [o(pd_3784)]
+ ITX_MULSUB_2D 2, 21, 8, 9, _, 13, 10, 11 ; t36/37a, t59/58a
+ ITX_MULSUB_2D 3, 20, 8, 9, _, 13, 10, 11 ; t39a/38, t56a/57
+ ITX_MULSUB_2D 19, 7, 8, 9, _, 13, 10, 11, 2 ; t55a/54, t40a/41
+ ITX_MULSUB_2D 4, 18, 8, 9, _, 13, 10, 11, 2 ; t52/53a, t43/42a
+ ; m0-3: t32-39[a], m7,18-16: t40-47[a], m6-4,19: t48-55[a], m20-23: t56-63[a]
+
+ ; step8
+ psubd m8, m0, m16 ; t47a/46
+ paddd m0, m16 ; t32a/33
+ psubd m16, m1, m17 ; t44/45a
+ paddd m1, m17 ; t35/34a
+ REPX {pmaxsd x, m14}, m8, m0, m16, m1
+ psubd m17, m2, m18 ; t43a/42
+ paddd m2, m18 ; t36a/37
+ REPX {pminsd x, m15}, m8, m0, m16, m1
+ psubd m18, m3, m7 ; t40/41a
+ paddd m3, m7 ; t39/38a
+ REPX {pmaxsd x, m14}, m17, m2, m18, m3
+ psubd m7, m23, m6 ; t48a/49
+ paddd m23, m6 ; t63a/62
+ REPX {pminsd x, m15}, m17, m2, m18, m3
+ psubd m6, m22, m5 ; t51/50a
+ paddd m22, m5 ; t60/61a
+ REPX {pmaxsd x, m14}, m7, m23, m6, m22
+ psubd m5, m21, m4 ; t52a/53
+ paddd m21, m4 ; t59a/58
+ REPX {pminsd x, m15}, m7, m23, m6, m22
+ psubd m4, m20, m19 ; t55/54a
+ paddd m20, m19 ; t56/57a
+ REPX {pmaxsd x, m14}, m5, m21, m4, m20
+ REPX {pminsd x, m15}, m5, m21, m4, m20
+ ; m0-3=t32-39[a], m18-16,8: t40-47[a], m7-4=t48-55[a], m20-23=t56-63[a]
+
+ ; step9
+ REPX {pmulld x, m12}, m4, m18, m5, m17, m6, m16, m7, m8
+ REPX {paddd x, m13}, m4, m5, m6, m7
+ paddd m19, m4, m18 ; t55a/54
+ psubd m4, m18 ; t40a/41
+ paddd m18, m5, m17 ; t52/53a
+ psubd m5, m17 ; t43/42a
+ paddd m17, m6, m16 ; t51a/50
+ psubd m6, m16 ; t44a/45
+ paddd m16, m7, m8 ; t48/49a
+ psubd m7, m8 ; t47/46a
+ REPX {psrad x, 12 }, m19, m4, m18, m5, m17, m6, m16, m7
+ ; m4-7=t40-47[a], m16-19=t48-55[a]
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+
+ PROLOGUE 4, 8, 32, -64*32, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ cmp eobd, 136
+ jl .fast
+ add cq, 64
+ cmp eobd, 543
+ jge .full
+ call .pass1_fast ; bottomright 16x16 zero
+ mov r7d, 16*12
+ jmp .lefthalf
+.full:
+ call .pass1
+ mov r7d, 16*28
+.lefthalf:
+ mova [cq+128* 0], m0
+ mova [cq+128* 1], m1
+ mova [cq+128* 2], m2
+ mova [cq+128* 3], m3
+ mova [cq+128* 4], m14
+ mova [cq+128* 5], m15
+ mova [cq+128* 6], m16
+ mova [cq+128* 7], m17
+ mova [cq+128* 8], m22
+ mova [cq+128* 9], m23
+ mova [cq+128*10], m24
+ mova [cq+128*11], m25
+ mova [cq+128*12], m26
+ mova [cq+128*13], m27
+ mova [cq+128*14], m28
+ mova [cq+128*15], m29
+ sub cq, 64
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ sub rsp, 16*64
+ call .pass1
+ add rsp, 16*64
+ lea r5, [o_base_8bpc]
+ call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start
+ mov r4, dstq
+ pxor m12, m12
+ call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
+ lea dstq, [r4+64]
+ mova m0, [rsp+16*mmsize]
+ mova m1, [rsp+17*mmsize]
+ mova m2, [rsp+18*mmsize]
+ mova m3, [rsp+19*mmsize]
+ mova m4, [rsp+20*mmsize]
+ mova m5, [rsp+21*mmsize]
+ mova m6, [rsp+22*mmsize]
+ mova m7, [rsp+23*mmsize]
+ mova m16, [rsp+24*mmsize]
+ mova m17, [rsp+25*mmsize]
+ mova m18, [rsp+26*mmsize]
+ mova m19, [rsp+27*mmsize]
+ mova m20, [rsp+28*mmsize]
+ mova m21, [rsp+29*mmsize]
+ mova m22, [rsp+30*mmsize]
+ mova m23, [rsp+31*mmsize]
+ call .transpose
+ mova [cq+128* 0+64], m0
+ mova [cq+128* 1+64], m1
+ mova [cq+128* 2+64], m2
+ mova [cq+128* 3+64], m3
+ mova [cq+128* 4+64], m14
+ mova [cq+128* 5+64], m15
+ mova [cq+128* 6+64], m16
+ mova [cq+128* 7+64], m17
+ mova [cq+128* 8+64], m22
+ mova [cq+128* 9+64], m23
+ mova [cq+128*10+64], m24
+ mova [cq+128*11+64], m25
+ mova [cq+128*12+64], m26
+ mova [cq+128*13+64], m27
+ mova [cq+128*14+64], m28
+ mova [cq+128*15+64], m29
+ mova m0, [rsp+ 0*mmsize]
+ mova m1, [rsp+ 1*mmsize]
+ mova m2, [rsp+ 2*mmsize]
+ mova m3, [rsp+ 3*mmsize]
+ mova m4, [rsp+ 4*mmsize]
+ mova m5, [rsp+ 5*mmsize]
+ mova m6, [rsp+ 6*mmsize]
+ mova m7, [rsp+ 7*mmsize]
+ mova m16, [rsp+ 8*mmsize]
+ mova m17, [rsp+ 9*mmsize]
+ mova m18, [rsp+10*mmsize]
+ mova m19, [rsp+11*mmsize]
+ mova m20, [rsp+12*mmsize]
+ mova m21, [rsp+13*mmsize]
+ mova m22, [rsp+14*mmsize]
+ mova m23, [rsp+15*mmsize]
+ call .transpose
+ call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start
+ pxor m12, m12
+.right_zero_loop:
+ mova [cq+r7*8+64+128*3], m12
+ mova [cq+r7*8+64+128*2], m12
+ mova [cq+r7*8+64+128*1], m12
+ mova [cq+r7*8+64+128*0], m12
+ sub r7d, 16*4
+ jge .right_zero_loop
+ mov r7d, 16*28
+ jmp .end
+.fast: ; topleft 16x16 nonzero
+ cmp eobd, 36
+ jl .fast2
+ call .pass1_fast
+ lea r5, [o_base_8bpc]
+ call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start
+ mov r4, dstq
+ pxor m12, m12
+ call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
+ lea dstq, [r4+64]
+ mova m0, [rsp+16*mmsize]
+ mova m1, [rsp+17*mmsize]
+ mova m2, [rsp+18*mmsize]
+ mova m3, [rsp+19*mmsize]
+ mova m4, [rsp+20*mmsize]
+ mova m5, [rsp+21*mmsize]
+ mova m6, [rsp+22*mmsize]
+ mova m7, [rsp+23*mmsize]
+ mova m16, [rsp+24*mmsize]
+ mova m17, [rsp+25*mmsize]
+ mova m18, [rsp+26*mmsize]
+ mova m19, [rsp+27*mmsize]
+ mova m20, [rsp+28*mmsize]
+ mova m21, [rsp+29*mmsize]
+ mova m22, [rsp+30*mmsize]
+ mova m23, [rsp+31*mmsize]
+ call .transpose
+ call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start
+ mov r7d, 16*12
+ pxor m12, m12
+ jmp .end
+.fast2: ; topleft 8x8 nonzero
+ movshdup m7, [o(permB)]
+ mova ym0, [cq+128*1]
+ mova ym2, [cq+128*5]
+ mova ym3, [cq+128*3]
+ mova ym1, [cq+128*7]
+ vpermt2q m0, m7, m2 ; 1 5
+ vpermt2q m1, m7, m3 ; 7 3
+ REPX {pmulld x, m12}, m0, m1
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed_rect2
+ mova [rsp+ 0*mmsize], m0
+ mova [rsp+ 1*mmsize], m1
+ mova [rsp+ 2*mmsize], m2
+ mova [rsp+ 3*mmsize], m3
+ mova [rsp+ 4*mmsize], m4
+ mova [rsp+ 5*mmsize], m5
+ mova [rsp+ 6*mmsize], m6
+ mova [rsp+ 7*mmsize], m7
+ mova [rsp+ 8*mmsize], m16
+ mova [rsp+ 9*mmsize], m17
+ mova [rsp+10*mmsize], m18
+ mova [rsp+11*mmsize], m19
+ mova [rsp+12*mmsize], m20
+ mova [rsp+13*mmsize], m21
+ mova [rsp+14*mmsize], m22
+ mova [rsp+15*mmsize], m23
+
+ movshdup m7, [o(permB)]
+ pmulld ym0, ym12, [cq+128*0]
+ pmulld ym4, ym12, [cq+128*4]
+ mova ym16, [cq+128*2]
+ mova ym5, [cq+128*6]
+ REPX {paddd x, ym13}, ym0, ym4
+ REPX {psrad x, 12 }, ym0, ym4
+ vpermt2q m16, m7, m5 ; 2 6
+ vpermq m0, m7, m0 ; 0 0
+ vpermq m4, m7, m4 ; 4 4
+ pmulld m16, m12
+ paddd m16, m13
+ psrad m16, 12
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
+
+ vpbroadcastd m11, [o(pd_1)]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
+ mova [rsp+16*mmsize], m24
+ mova [rsp+17*mmsize], m25
+ mova [rsp+18*mmsize], m26
+ mova [rsp+19*mmsize], m27
+ mova [rsp+20*mmsize], m28
+ mova [rsp+21*mmsize], m29
+ mova [rsp+22*mmsize], m30
+ mova [rsp+23*mmsize], m31
+ vpbroadcastd m13, [o(pd_2048)]
+ call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start
+ mov r7d, 16*4
+ mov r4, dstq
+ pxor m12, m12
+ call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
+ lea dstq, [r4+64]
+ mova m0, [rsp+16*mmsize]
+ mova m1, [rsp+17*mmsize]
+ mova m2, [rsp+18*mmsize]
+ mova m3, [rsp+19*mmsize]
+ mova m4, [rsp+20*mmsize]
+ mova m5, [rsp+21*mmsize]
+ mova m6, [rsp+22*mmsize]
+ mova m7, [rsp+23*mmsize]
+ lea r5, [o_base]
+ vpbroadcastd m13, [o(pd_2048)]
+ call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start
+ pxor m12, m12
+.end:
+ call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
+.zero_loop:
+ mova [cq+r7*8+128*3], m12
+ mova [cq+r7*8+128*2], m12
+ mova [cq+r7*8+128*1], m12
+ mova [cq+r7*8+128*0], m12
+ sub r7d, 16*4
+ jge .zero_loop
+ RET
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 32
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+ jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2
+.pass1_fast:
+ lea r4, [idct64_mul_16bpc]
+ lea r6, [rsp+4*64+gprsize]
+ pmulld m0, m12, [cq+128* 1]
+ pmulld m3, m12, [cq+128*15]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
+ pmulld m0, m12, [cq+128* 7]
+ pmulld m3, m12, [cq+128* 9]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
+ pmulld m0, m12, [cq+128* 5]
+ pmulld m3, m12, [cq+128*11]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
+ pmulld m0, m12, [cq+128* 3]
+ pmulld m3, m12, [cq+128*13]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
+ pmulld m0, m12, [cq+128* 0]
+ pmulld m1, m12, [cq+128* 8]
+ pmulld m16, m12, [cq+128* 4]
+ pmulld m17, m12, [cq+128*12]
+ call m(idct_8x16_internal_10bpc).main_fast2_rect2
+ call m(idct_16x16_internal_10bpc).main_fast2_rect2
+ call .pass1_load_spill
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2_rect2
+ jmp .pass1_end
+.pass1:
+ lea r4, [idct64_mul_16bpc]
+ lea r6, [rsp+4*64+gprsize]
+ pmulld m0, m12, [cq+128* 1]
+ pmulld m1, m12, [cq+128*31]
+ pmulld m2, m12, [cq+128*17]
+ pmulld m3, m12, [cq+128*15]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
+ pmulld m0, m12, [cq+128* 7]
+ pmulld m1, m12, [cq+128*25]
+ pmulld m2, m12, [cq+128*23]
+ pmulld m3, m12, [cq+128* 9]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
+ pmulld m0, m12, [cq+128* 5]
+ pmulld m1, m12, [cq+128*27]
+ pmulld m2, m12, [cq+128*21]
+ pmulld m3, m12, [cq+128*11]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
+ pmulld m0, m12, [cq+128* 3]
+ pmulld m1, m12, [cq+128*29]
+ pmulld m2, m12, [cq+128*19]
+ pmulld m3, m12, [cq+128*13]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
+ pmulld m0, m12, [cq+128* 0]
+ pmulld m1, m12, [cq+128* 8]
+ pmulld m2, m12, [cq+128*16]
+ pmulld m3, m12, [cq+128*24]
+ pmulld m16, m12, [cq+128* 4]
+ pmulld m17, m12, [cq+128*12]
+ pmulld m18, m12, [cq+128*20]
+ pmulld m19, m12, [cq+128*28]
+ call m(idct_8x16_internal_10bpc).main_fast_rect2
+ call m(idct_16x16_internal_10bpc).main_fast_rect2
+ call .pass1_load_spill
+ pmulld m4, m12, [cq+128*18]
+ pmulld m5, m12, [cq+128*22]
+ pmulld m6, m12, [cq+128*26]
+ pmulld m7, m12, [cq+128*30]
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2
+.pass1_end:
+ vpbroadcastd m11, [o(pd_1)]
+ lea r3, [rsp+gprsize]
+ lea r4, [cq+8*128]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end
+    ; transpose one half immediately; the lower half can be transposed later
+.transpose:
+ ; transpose m0-7,16-23
+ psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11
+ psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
+ punpckhqdq m22, m0, m20 ; 1
+ punpcklqdq m0, m20 ; 0
+ punpckhqdq m24, m2, m1 ; 5
+ punpcklqdq m1, m2, m1 ; 4
+ punpcklqdq m2, m14, m18 ; 8
+ punpckhqdq m26, m14, m18 ; 9
+ punpcklqdq m14, m15, m4 ; 2
+ punpckhqdq m23, m15, m4 ; 3
+ punpckhqdq m25, m3, m21 ; 7
+ punpcklqdq m15, m3, m21 ; 6
+ punpckhqdq m28, m6, m17 ; 13
+ punpcklqdq m3, m6, m17 ; 12
+ punpckhqdq m27, m5, m16 ; 11
+ punpcklqdq m16, m5, m16 ; 10
+ punpckhqdq m29, m7, m8 ; 15
+ punpcklqdq m17, m7, m8 ; 14
+ ret
+.pass1_load_spill:
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
+ mova [cq+128* 0], m0
+ mova [cq+128* 1], m1
+ pmulld m0, m12, [cq+128* 2]
+ pmulld m1, m12, [cq+128* 6]
+ mova [cq+128* 2], m2
+ mova [cq+128* 3], m3
+ pmulld m2, m12, [cq+128*10]
+ pmulld m3, m12, [cq+128*14]
+ mova [cq+128* 4], m4
+ mova [cq+128* 5], m5
+ mova [cq+128* 6], m6
+ mova [cq+128* 7], m7
+ mova [cq+128* 8], m23
+ mova [cq+128* 9], m22
+ mova [cq+128*10], m21
+ mova [cq+128*11], m20
+ mova [cq+128*12], m19
+ mova [cq+128*13], m18
+ mova [cq+128*14], m17
+ mova [cq+128*15], m16
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+
+ PROLOGUE 4, 9, 32, -64*32, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ cmp eobd, 136
+ jl .fast
+ add cq, 64
+ cmp eobd, 543
+ jge .full
+ call .pass1_fast ; bottomright 16x16 zero
+ mov r7d, 16*12
+ jmp .lefthalf
+.full:
+ call .pass1
+ mov r7d, 16*28
+.lefthalf:
+ mova [cq+128* 0], m27
+ mova [cq+128* 1], m14
+ mova [cq+128* 2], m28
+ mova [cq+128* 3], m15
+ mova [cq+128* 4], m22
+ mova [cq+128* 5], m23
+ mova [cq+128* 6], m24
+ mova [cq+128* 7], m25
+ mova [cq+128* 8], m0
+ mova [cq+128* 9], m26
+ mova [cq+128*10], m20
+ mova [cq+128*11], m21
+ mova [cq+128*12], m18
+ mova [cq+128*13], m16
+ mova [cq+128*14], m17
+ mova [cq+128*15], m3
+ sub cq, 64
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ sub rsp, 16*64
+ call .pass1
+ sub rsp, 24*64
+ call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_start
+ mov r8, dstq
+ pxor m31, m31
+ call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
+ lea dstq, [r8+64]
+ mova m0, [rsp+56*mmsize]
+ mova m1, [rsp+57*mmsize]
+ mova m2, [rsp+58*mmsize]
+ mova m3, [rsp+59*mmsize]
+ mova m4, [rsp+60*mmsize]
+ mova m5, [rsp+61*mmsize]
+ mova m6, [rsp+62*mmsize]
+ mova m7, [rsp+63*mmsize]
+ mova m16, [rsp+64*mmsize]
+ mova m17, [rsp+65*mmsize]
+ mova m18, [rsp+66*mmsize]
+ mova m19, [rsp+67*mmsize]
+ mova m20, [rsp+68*mmsize]
+ mova m21, [rsp+69*mmsize]
+ mova m22, [rsp+70*mmsize]
+ mova m23, [rsp+71*mmsize]
+ call .transpose
+ mova [cq+128* 0+64], m27
+ mova [cq+128* 1+64], m14
+ mova [cq+128* 2+64], m28
+ mova [cq+128* 3+64], m15
+ mova [cq+128* 4+64], m22
+ mova [cq+128* 5+64], m23
+ mova [cq+128* 6+64], m24
+ mova [cq+128* 7+64], m25
+ mova [cq+128* 8+64], m0
+ mova [cq+128* 9+64], m26
+ mova [cq+128*10+64], m20
+ mova [cq+128*11+64], m21
+ mova [cq+128*12+64], m18
+ mova [cq+128*13+64], m16
+ mova [cq+128*14+64], m17
+ mova [cq+128*15+64], m3
+ mova m0, [rsp+40*mmsize]
+ mova m1, [rsp+41*mmsize]
+ mova m2, [rsp+42*mmsize]
+ mova m3, [rsp+43*mmsize]
+ mova m4, [rsp+44*mmsize]
+ mova m5, [rsp+45*mmsize]
+ mova m6, [rsp+46*mmsize]
+ mova m7, [rsp+47*mmsize]
+ mova m16, [rsp+48*mmsize]
+ mova m17, [rsp+49*mmsize]
+ mova m18, [rsp+50*mmsize]
+ mova m19, [rsp+51*mmsize]
+ mova m20, [rsp+52*mmsize]
+ mova m21, [rsp+53*mmsize]
+ mova m22, [rsp+54*mmsize]
+ mova m23, [rsp+55*mmsize]
+ add rsp, 32*64
+ call .transpose
+ lea r5, [o_base]
+ call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_start
+.right_zero_loop:
+ REPX {mova [cq+r7*8+64+128*x], m31}, 0, 1, 2, 3
+ sub r7d, 16*4
+ jge .right_zero_loop
+ mov r7d, 16*28
+ jmp .end
+.fast: ; topleft 16x16 nonzero
+ cmp eobd, 36
+ jl .fast2
+ call .pass1_fast
+ sub rsp, 24*64
+ vpbroadcastd m10, [o(pd_2048)]
+ call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast_start
+ mov r8, dstq
+ pxor m31, m31
+ call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
+ lea dstq, [r8+64]
+ mova m0, [rsp+40*mmsize]
+ mova m1, [rsp+41*mmsize]
+ mova m2, [rsp+42*mmsize]
+ mova m3, [rsp+43*mmsize]
+ mova m4, [rsp+44*mmsize]
+ mova m5, [rsp+45*mmsize]
+ mova m6, [rsp+46*mmsize]
+ mova m7, [rsp+47*mmsize]
+ mova m16, [rsp+48*mmsize]
+ mova m17, [rsp+49*mmsize]
+ mova m18, [rsp+50*mmsize]
+ mova m19, [rsp+51*mmsize]
+ mova m20, [rsp+52*mmsize]
+ mova m21, [rsp+53*mmsize]
+ mova m22, [rsp+54*mmsize]
+ mova m23, [rsp+55*mmsize]
+ add rsp, 16*64
+ call .transpose
+ lea r5, [o_base]
+ vpbroadcastd m10, [o(pd_2048)]
+ call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast_start
+ mov r7d, 16*12
+ jmp .end
+.fast2: ; topleft 8x8 nonzero
+ movshdup m7, [o(permB)]
+ mova ym0, [cq+128*1]
+ mova ym2, [cq+128*5]
+ mova ym3, [cq+128*3]
+ mova ym1, [cq+128*7]
+ vpermt2q m0, m7, m2 ; 1 5
+ vpermt2q m1, m7, m3 ; 7 3
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed
+ mova [rsp+ 0*mmsize], m0
+ mova [rsp+ 1*mmsize], m1
+ mova [rsp+ 2*mmsize], m2
+ mova [rsp+ 3*mmsize], m3
+ mova [rsp+ 4*mmsize], m4
+ mova [rsp+ 5*mmsize], m5
+ mova [rsp+ 6*mmsize], m6
+ mova [rsp+ 7*mmsize], m7
+ mova [rsp+ 8*mmsize], m16
+ mova [rsp+ 9*mmsize], m17
+ mova [rsp+10*mmsize], m18
+ mova [rsp+11*mmsize], m19
+ mova [rsp+12*mmsize], m20
+ mova [rsp+13*mmsize], m21
+ mova [rsp+14*mmsize], m22
+ mova [rsp+15*mmsize], m23
+
+ movshdup m7, [o(permB)]
+ mova ym0, [cq+128*0]
+ mova ym4, [cq+128*4]
+ mova ym16, [cq+128*2]
+ mova ym5, [cq+128*6]
+ vpermt2q m16, m7, m5 ; 2 6
+ vpermq m0, m7, m0 ; 0 0
+ vpermq m4, m7, m4 ; 4 4
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
+
+ vpbroadcastd m11, [o(pd_2)]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
+ sub rsp, 16*64
+ mova [rsp+40*mmsize], m24
+ mova [rsp+41*mmsize], m25
+ mova [rsp+42*mmsize], m26
+ mova [rsp+43*mmsize], m27
+ mova [rsp+44*mmsize], m28
+ mova [rsp+45*mmsize], m29
+ mova [rsp+46*mmsize], m30
+ mova [rsp+47*mmsize], m31
+ call .pass2_fast2_start
+ mov r7d, 16*4
+ mov r8, dstq
+ pxor m31, m31
+ call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
+ lea dstq, [r8+64]
+ mova m0, [rsp+40*mmsize]
+ mova m1, [rsp+41*mmsize]
+ mova m2, [rsp+42*mmsize]
+ mova m3, [rsp+43*mmsize]
+ mova m4, [rsp+44*mmsize]
+ mova m5, [rsp+45*mmsize]
+ mova m6, [rsp+46*mmsize]
+ mova m7, [rsp+47*mmsize]
+ add rsp, 8*64
+ lea r5, [o_base]
+ call .pass2_fast2_start
+.end:
+ pxor m31, m31
+.zero_loop:
+ REPX {mova [cq+r7*8+128*x], m31}, 0, 1, 2, 3
+ sub r7d, 16*4
+ jge .zero_loop
+ call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
+ add rsp, 8*64 ; FIXME adjust stack_size_padded instead?
+ RET
+.pass2_fast2_start:
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
+ punpcklqdq m27, m0, m2 ; 0
+ punpckhqdq m0, m2 ; 1
+ punpcklqdq m22, m3, m4 ; 2
+ punpckhqdq m26, m3, m4 ; 3
+ punpcklqdq m14, m5, m7 ; 4
+ punpckhqdq m20, m5, m7 ; 5
+ punpcklqdq m23, m6, m8 ; 6
+ punpckhqdq m21, m6, m8 ; 7
+ vpbroadcastd m10, [o(pd_2048)]
+ jmp m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast2_start
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly1
+.pass1_fast:
+ lea r4, [idct64_mul_16bpc]
+ lea r6, [rsp+4*64+gprsize]
+ mova m0, [cq+128* 1]
+ mova m3, [cq+128*15]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast
+ mova m0, [cq+128* 7]
+ mova m3, [cq+128* 9]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast
+ mova m0, [cq+128* 5]
+ mova m3, [cq+128*11]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast
+ mova m0, [cq+128* 3]
+ mova m3, [cq+128*13]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 8]
+ mova m16, [cq+128* 4]
+ mova m17, [cq+128*12]
+ call m(idct_8x16_internal_10bpc).main_fast2
+ call m(idct_16x16_internal_10bpc).main_fast2
+ call .pass1_load_spill
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2
+ jmp .pass1_end
+.pass1:
+ lea r4, [idct64_mul_16bpc]
+ lea r6, [rsp+4*64+gprsize]
+ mova m0, [cq+128* 1]
+ mova m1, [cq+128*31]
+ mova m2, [cq+128*17]
+ mova m3, [cq+128*15]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1
+ mova m0, [cq+128* 7]
+ mova m1, [cq+128*25]
+ mova m2, [cq+128*23]
+ mova m3, [cq+128* 9]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1
+ mova m0, [cq+128* 5]
+ mova m1, [cq+128*27]
+ mova m2, [cq+128*21]
+ mova m3, [cq+128*11]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1
+ mova m0, [cq+128* 3]
+ mova m1, [cq+128*29]
+ mova m2, [cq+128*19]
+ mova m3, [cq+128*13]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 8]
+ mova m2, [cq+128*16]
+ mova m3, [cq+128*24]
+ mova m16, [cq+128* 4]
+ mova m17, [cq+128*12]
+ mova m18, [cq+128*20]
+ mova m19, [cq+128*28]
+ call m(idct_8x16_internal_10bpc).main_fast
+ call m(idct_16x16_internal_10bpc).main_fast
+ call .pass1_load_spill
+ mova m4, [cq+128*18]
+ mova m5, [cq+128*22]
+ mova m6, [cq+128*26]
+ mova m7, [cq+128*30]
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
+.pass1_end:
+ vpbroadcastd m11, [o(pd_2)]
+ lea r3, [rsp+gprsize]
+ lea r4, [cq+8*128]
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end
+    ; transpose one half immediately; the lower half can be transposed later
+.transpose:
+ ; transpose m0-7,16-23
+ psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11
+ psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
+ punpcklqdq m27, m0, m20 ; 0
+ punpckhqdq m0, m20 ; 1
+ punpcklqdq m24, m5, m16 ; 10
+ punpckhqdq m16, m5, m16 ; 11
+ punpcklqdq m23, m3, m21 ; 6
+ punpckhqdq m21, m3, m21 ; 7
+ punpcklqdq m25, m7, m8 ; 14
+ punpckhqdq m3, m7, m8 ; 15
+ punpcklqdq m22, m15, m4 ; 2
+ punpckhqdq m26, m15, m4 ; 3
+ punpcklqdq m15, m6, m17 ; 12
+ punpckhqdq m17, m6, m17 ; 13
+ punpcklqdq m28, m14, m18 ; 8
+ punpckhqdq m18, m14, m18 ; 9
+ punpcklqdq m14, m2, m1 ; 4
+ punpckhqdq m20, m2, m1 ; 5
+ ret
+.pass1_load_spill:
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
+ mova [cq+128* 0], m0
+ mova [cq+128* 1], m1
+ mova m0, [cq+128* 2]
+ mova m1, [cq+128* 6]
+ mova [cq+128* 2], m2
+ mova [cq+128* 3], m3
+ mova m2, [cq+128*10]
+ mova m3, [cq+128*14]
+ mova [cq+128* 4], m4
+ mova [cq+128* 5], m5
+ mova [cq+128* 6], m6
+ mova [cq+128* 7], m7
+ mova [cq+128* 8], m23
+ mova [cq+128* 9], m22
+ mova [cq+128*10], m21
+ mova [cq+128*11], m20
+ mova [cq+128*12], m19
+ mova [cq+128*13], m18
+ mova [cq+128*14], m17
+ mova [cq+128*15], m16
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/itx16_sse.asm b/third_party/dav1d/src/x86/itx16_sse.asm
new file mode 100644
index 0000000000..3833e17c99
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx16_sse.asm
@@ -0,0 +1,8135 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; Copyright © 2017-2021, The rav1e contributors
+; Copyright © 2020, Nathan Egge
+; Copyright © 2021, Matthias Dressel
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+%macro COEF 1-2
+pd_%1: times 4 dd %1
+%if %0 == 2
+pd_m%1: times 4 dd -%1
+%endif
+%endmacro
+
+COEF 201
+COEF 401
+COEF 601, 1
+COEF 799
+COEF 995
+COEF 1189, 1
+COEF 1380, 1
+COEF 1567
+COEF 1751
+COEF 1931
+COEF 2106, 1
+COEF 2276, 1
+COEF 2440
+COEF 2598, 1
+COEF 2751, 1
+COEF 2896
+COEF 3035
+COEF 3166
+COEF 3290
+COEF 3406
+COEF 3513
+COEF 3612
+COEF 3703
+COEF 3784
+COEF 3857
+COEF 3920
+COEF 3973
+COEF 4017
+COEF 4052
+COEF 4076
+COEF 4091
+
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+
+%if ARCH_X86_32
+pd_1: times 4 dd 1
+%endif
+pd_2: times 4 dd 2
+pw_5: times 8 dw 5
+pd_1321: times 4 dd 1321
+pd_2482: times 4 dd 2482
+pd_m3344: times 4 dd -3344
+pd_2048: times 4 dd 2048
+pw_4x2048_4xm2048: times 4 dw 2048
+ times 4 dw -2048
+pw_4xm2048_4x2048: times 4 dw -2048
+ times 4 dw 2048
+pw_2048: times 8 dw 2048
+pw_m2048: times 8 dw -2048
+pd_3803: times 4 dd 3803
+pw_4096: times 8 dw 4096
+pd_5793: times 4 dd 5793
+pd_6144: times 4 dd 6144
+pw_8192: times 8 dw 8192
+pd_10240: times 4 dd 10240
+pd_11586: times 4 dd 11586
+pw_1697x8: times 8 dw 1697*8
+pw_2896x8: times 8 dw 2896*8
+pw_1697x16: times 8 dw 1697*16
+pw_16384: times 8 dw 16384
+pixel_10bpc_max: times 8 dw 0x03ff
+
+pw_1567_3784: times 4 dw 1567, 3784
+pw_m3784_1567: times 4 dw -3784, 1567
+pw_2896_2896: times 4 dw 2896, 2896
+pw_m2896_2896: times 4 dw -2896, 2896
+
+clip_18b_min: times 4 dd -0x20000
+clip_18b_max: times 4 dd 0x1ffff
+
+idct64_mul_16bpc:
+dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
+dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
+dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
+dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406
+
+cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3
+cextern iadst_4x4_internal_8bpc_ssse3.main
+cextern idct_4x8_internal_8bpc_ssse3.main
+cextern iadst_4x8_internal_8bpc_ssse3.main
+cextern idct_16x4_internal_8bpc_ssse3.main
+cextern iadst_16x4_internal_8bpc_ssse3.main
+cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end
+cextern idct_8x4_internal_8bpc_ssse3.main
+cextern iadst_8x4_internal_8bpc_ssse3.main
+cextern idct_8x8_internal_8bpc_ssse3.main
+cextern idct_8x8_internal_8bpc_ssse3.pass1_end3
+cextern iadst_8x8_internal_8bpc_ssse3.main
+cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end
+cextern idct_16x8_internal_8bpc_ssse3.main
+cextern iadst_16x8_internal_8bpc_ssse3.main
+cextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end
+cextern idct_8x32_internal_8bpc_ssse3.main
+cextern idct_8x32_internal_8bpc_ssse3.main_fast
+cextern idct_8x32_internal_8bpc_ssse3.main_veryfast
+cextern idct_16x64_internal_8bpc_ssse3.main
+cextern idct_16x64_internal_8bpc_ssse3.main_fast
+
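+; eob thresholds: the pass 1 .zero_loop code scans these backwards to decide
+; how many coefficient groups actually need to be processed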
+tbl_4x16_2d: db 0, 13, 29, 45
+tbl_4x16_h: db 0, 16, 32, 48
+tbl_4x16_v: db 0, 4, 8, 12
+
+tbl_8x16_2d: db 0, 14, 30, 46
+tbl_8x16_v: db 0, 4, 8, 12
+tbl_8x16_h: db 0, 32, 64, 96
+
+tbl_16x16_2d: db 0, 10, 36, 78
+tbl_16x16_v: db 0, 4, 8, 12
+tbl_16x16_h: db 0, 64, 128, 192
+
+tbl_8x32_2d: dw 0, 14, 43, 75, 107, 139, 171, 203
+
+tbl_16x32_2d: dw 0, 14, 44, 90, 151, 215, 279, 343
+
+tbl_32x16_2d: ; first 4 entries of 32x32 are identical to this one
+tbl_32x32_2d: dw 0, 10, 36, 78, 136, 210, 300, 406
+
+tbl_Nx32_odd_offset: db 2*16, 2*23
+ db 2*20, 2*19
+ db 2*18, 2*21
+ db 2*22, 2*17
+ db 2*30, 2*25
+ db 2*26, 2*29
+ db 2*28, 2*27
+ db 2*24, 2*31
+
+tbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46
+ db 2* 8, 2*40, 2*23, 2*38
+ db 2* 1, 2*36, 2*20, 2*42
+ db 2* 9, 2*44, 2*19, 2*34
+ db 2* 2, 2*60, 2*18, 2*50
+ db 2*10, 2*52, 2*21, 2*58
+ db 2* 3, 2*56, 2*22, 2*54
+ db 2*11, 2*48, 2*17, 2*62
+
+SECTION .text
+
+%define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx)
+%define m(x) m_suffix(x, SUFFIX)
+
+; This refers to the first function in itx_sse, i.e. the start of the text
+; section, which is needed as a base pointer for constants.
+%define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3)
+
+%if ARCH_X86_64
+%define o(x) x
+%else
+%define o(x) r6-$$+x ; PIC
+%endif
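+; on x86-32, r6 holds the load address of this object ($$), set up with LEA in
+; INV_TXFM_FN, so constants are addressed relative to it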
+
+%macro IWHT4_1D 0
+ ; m0 = in0, m1 = in1, m2 = in2, m3 = in3
+ paddd m0, m1 ; in0 += in1
+ psubd m4, m2, m3 ; tmp0 = in2 - in3
+ psubd m5, m0, m4 ; tmp1 = (in0 - tmp0) >> 1
+ psrad m5, 1
+ psubd m2, m5, m1 ; in2 = tmp1 - in1
+ psubd m5, m3 ; in1 = tmp1 - in3
+ psubd m0, m5 ; in0 -= in1
+ paddd m4, m2 ; in3 = tmp0 + in2
+ ; m0 = out0, m1 = in1, m2 = out2, m3 = in3
+ ; m4 = out3, m5 = out1
+%endmacro
+
+INIT_XMM sse2
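+; 4x4 inverse Walsh-Hadamard transform: coefficients are pre-shifted right by 2,
+; transformed along one dimension, transposed, transformed again, and the result
+; is added to dst with clamping to [0, bdmax]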
+cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ mova m2, [cq+16*2]
+ mova m3, [cq+16*3]
+ REPX {psrad x, 2}, m0, m1, m2, m3
+ IWHT4_1D
+ punpckldq m1, m0, m5
+ punpckhdq m3, m0, m5
+ punpckldq m5, m2, m4
+ punpckhdq m2, m4
+ punpcklqdq m0, m1, m5
+ punpckhqdq m1, m5
+ punpcklqdq m4, m3, m2
+ punpckhqdq m3, m2
+ mova m2, m4
+ IWHT4_1D
+ packssdw m0, m4 ; low: out3, high: out0
+ packssdw m2, m5 ; low: out2, high: out1
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova [cq+16*3], m4
+ lea r2, [dstq+strideq*2]
+ movq m1, [dstq+strideq*0]
+ movhps m1, [r2 +strideq*1]
+ movq m3, [r2 +strideq*0]
+ movhps m3, [dstq+strideq*1]
+ movd m5, bdmaxm
+ pshuflw m5, m5, q0000 ; broadcast
+ punpcklqdq m5, m5 ; broadcast
+ paddsw m0, m1
+ paddsw m2, m3
+ pmaxsw m0, m4
+ pmaxsw m2, m4
+ pminsw m0, m5
+ pminsw m2, m5
+ movhps [r2 +strideq*1], m0 ; write out0
+ movhps [dstq+strideq*1], m2 ; write out1
+ movq [r2 +strideq*0], m2 ; write out2
+ movq [dstq+strideq*0], m0 ; write out3
+ RET
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+; flags: 2 = inv_dst1, 4 = inv_dst2
+; skip round/shift if rnd is not a number
+%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
+; %1 dst/src[1]
+; %2 dst/src[2]
+; %3 tmp[1]
+; %4 tmp[2]
+; %5 tmp[3]
+; %6 rnd
+; %7 coef[1]
+; %8 coef[2]
+; %9 flags
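+; coefficient arguments smaller than 32 are register numbers holding the
+; coefficient; larger values load the corresponding pd_<coef> constant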
+%ifnidn %7,%8 ; optimize when coef1 == coef2
+%if %8 < 32
+ pmulld m%4, m%1, m%8
+ pmulld m%3, m%2, m%8
+%else
+ mova m%3, [o(pd_%8)]
+ pmulld m%4, m%1, m%3
+ pmulld m%3, m%2
+%endif
+%endif
+%if %7 < 32
+ pmulld m%1, m%7
+ pmulld m%2, m%7
+%else
+ mova m%5, [o(pd_%7)]
+ pmulld m%1, m%5
+ pmulld m%2, m%5
+%endif
+%if %9 & 4 ; invert dst2
+ paddd m%4, m%2
+ psubd m%2, m%6, m%4
+%else
+%ifnum %6
+%ifnidn %7,%8
+ paddd m%4, m%6
+%else
+ paddd m%1, m%6
+%endif
+%endif
+%ifnidn %7,%8
+ paddd m%2, m%4
+%else
+ mova m%3, m%2
+ paddd m%2, m%1
+%endif
+%endif
+%if %9 & 2 ; invert dst1
+ psubd m%3, m%1
+ paddd m%1, m%3, m%6
+%else
+%ifnum %6
+%ifnidn %7,%8
+ paddd m%1, m%6
+%endif
+%endif
+ psubd m%1, m%3
+%endif
+%ifnum %6
+ psrad m%2, 12
+ psrad m%1, 12
+%endif
+%endmacro
+
+%macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack
+cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%4_internal_16bpc)
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+%if has_epilogue
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jz %%end
+%endif
+ lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
+%ifnum %3
+%if %3
+ add eobd, %3
+%endif
+%else
+ lea r5, [o(%3)]
+%endif
+ call %%p1
+ RET
+%%end:
+%else
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
+%ifnum %3
+%if %3
+ add eobd, %3
+%endif
+%else
+ lea r5, [o(%3)]
+%endif
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 0, 4x4
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 4
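+    ; DC-only fast path (also reached via jmp from the 4x8/4x16 dct_dct cases):
+    ; turn the single DC coefficient into a uniform offset and add it to the
+    ; whole block, clamping to [0, pixel_max]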
+.dconly:
+ add r5d, 128
+ sar r5d, 8
+.dconly2:
+ imul r5d, 2896
+ mova m2, [o(pixel_10bpc_max)]
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ pxor m3, m3
+ punpcklqdq m0, m0
+.dconly_loop:
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ paddw m1, m0
+ pminsw m1, m2
+ pmaxsw m1, m3
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
+ ; butterfly rotation
+ ITX_MULSUB_2D %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1 %3 out0
+ ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2 %4 out3
+ ; Hadamard rotation
+ psubd m%5, m%1, m%2
+ paddd m%2, m%1
+ paddd m%1, m%3, m%4
+ psubd m%3, m%4
+ ; %1 (src1) = out0
+ ; %2 (src2) = out1
+ ; %3 (src3) = out3
+    ; %5 (tmp1) = out2
+%endmacro
+
+INIT_XMM sse4
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, identity
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+
+cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ mova m2, [cq+16*2]
+ mova m3, [cq+16*3]
+ mova m5, [o(pd_2048)]
+ call .pass1_main
+ packssdw m0, m1 ; out0 out1
+ packssdw m4, m2 ; out2 out3
+ ; transpose
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass1_main:
+ IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5
+ ret
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+ ; m5 = pd_2048
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ pmaddwd m4, m2, [o(pw_m3784_1567)]
+ pmaddwd m2, [o(pw_1567_3784)]
+ pmaddwd m0, m1, [o(pw_m2896_2896)]
+ pmaddwd m1, [o(pw_2896_2896)]
+ REPX {paddd x, m5}, m4, m2, m0, m1
+ packssdw m5, m5 ; pw_2048
+ REPX {psrad x, 12}, m4, m2, m0, m1
+ packssdw m2, m4 ; t3 t2
+ packssdw m1, m0 ; t0 t1
+ paddsw m0, m1, m2 ; out0 out1
+ psubsw m1, m2 ; out3 out2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ movq m2, [dstq+strideq*0]
+ movhps m2, [dstq+strideq*1]
+ lea r5, [dstq+strideq*2]
+ movq m3, [r5 +strideq*1]
+ movhps m3, [r5 +strideq*0]
+ mova m5, [o(pixel_10bpc_max)]
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova [cq+16*3], m4
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movhps [r5 +strideq*0], m1
+ movq [r5 +strideq*1], m1
+ RET
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ call .main
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m4 ; out2 out3
+ ; transpose
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
+.end:
+ mova m4, [o(pw_2048)]
+ movq m2, [dstq+strideq*0]
+ movhps m2, [dstq+strideq*1]
+ lea r5, [dstq+strideq*2]
+ movq m3, [r5 +strideq*0]
+ movhps m3, [r5 +strideq*1]
+ mova m5, [o(pixel_10bpc_max)]
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova [cq+16*3], m4
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [r5 +strideq*0], m1
+ movhps [r5 +strideq*1], m1
+ RET
+ALIGN function_align
+.main:
+ mova m1, [cq+16*2]
+ mova m3, [cq+16*3]
+ mova m5, [cq+16*0]
+ lea r3, [cq+16*1]
+.main2:
+ mova m0, [o(pd_1321)] ; SINPI_1_9
+ mova m2, [o(pd_2482)] ; SINPI_2_9
+ mova m6, [o(pd_3803)] ; SINPI_4_9
+ pmulld m4, m0, m1 ; s[4] = SINPI_1_9 * T[2]
+ pmulld m7, m3, m6 ; s[6] = SINPI_4_9 * T[3]
+ pmulld m6, m1 ; s[3] = SINPI_4_9 * T[2]
+ pmulld m0, m5 ; s[0] = SINPI_1_9 * T[0]
+ psubd m1, m3 ; T[2] - T[3]
+ pmulld m3, m2 ; s[5] = SINPI_2_9 * T[3]
+ pmulld m2, m5 ; s[1] = SINPI_2_9 * T[0]
+ paddd m0, m6 ; s[0] += s[3]
+ paddd m0, m3 ; s[0] += s[5]
+ mova m3, [o(pd_m3344)] ; -SINPI_3_9
+ psubd m2, m4 ; s[1] -= s[4]
+ psubd m2, m7 ; s[1] -= s[6]
+ psubd m1, m5 ; -b7 = (T[2] -T[3]) - T[0]
+ pmulld m1, m3 ; s[2] = -SINPI_3_9 * -b7
+ pmulld m3, [r3] ; -s[3] = -SINPI_3_9 * T[1]
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m0, m1 ; {s[0], s[2]} + 2048
+ paddd m4, m0, m2 ; x[3] = s[0] + s[1]
+ psubd m2, m3 ; x[1] = s[1] + s[3]
+ psubd m0, m3 ; x[0] = s[0] + s[3]
+ paddd m4, m3 ; x[3] -= s[3]
+ paddd m2, m5 ; x[1] + 2048
+ REPX {psrad x, 12}, m0, m2, m1, m4
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ call m(iadst_4x4_internal_16bpc).main
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m4 ; out2 out3
+ ; transpose
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
+ mova m4, [o(pw_2048)]
+ movq m3, [dstq+strideq*1]
+ movhps m3, [dstq+strideq*0]
+ lea r5, [dstq+strideq*2]
+ movq m2, [r5 +strideq*1]
+ movhps m2, [r5 +strideq*0]
+ mova m5, [o(pixel_10bpc_max)]
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova [cq+16*3], m4
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ movhps [dstq+strideq*0], m1
+ movq [dstq+strideq*1], m1
+ movhps [r5 +strideq*0], m0
+ movq [r5 +strideq*1], m0
+ RET
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
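+; the 4-point identity transform scales by sqrt(2); pd_5793 is round(sqrt(2)*4096)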
+cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ mova m3, [o(pd_5793)]
+ pmulld m0, m3, [cq+16*0]
+ pmulld m1, m3, [cq+16*1]
+ pmulld m2, m3, [cq+16*2]
+ pmulld m3, [cq+16*3]
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ packssdw m0, m1
+ packssdw m2, m3
+ ; transpose
+ punpckhwd m3, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+ ; m5 = pd_2048
+ mova m4, [o(pw_1697x8)]
+ movq m2, [dstq+strideq*0]
+ movhps m2, [dstq+strideq*1]
+ lea r5, [dstq+strideq*2]
+ pmulhrsw m3, m4, m0
+ pmulhrsw m4, m1
+ paddsw m0, m3
+ paddsw m1, m4
+ movq m3, [r5 +strideq*0]
+ movhps m3, [r5 +strideq*1]
+ mova m4, [o(pixel_10bpc_max)]
+ packssdw m5, m5 ; pw_2048
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ pxor m5, m5
+ mova [cq+16*0], m5
+ mova [cq+16*1], m5
+ mova [cq+16*2], m5
+ mova [cq+16*3], m5
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m5
+ pmaxsw m1, m5
+ pminsw m0, m4
+ pminsw m1, m4
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [r5 +strideq*0], m1
+ movhps [r5 +strideq*1], m1
+ RET
+
+%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 4x8
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 8
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, identity, 9
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+
+cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+ mova m5, [o(pd_2048)]
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 13
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 13
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+.loop_pass1:
+ mova m3, [o(pd_2896)]
+ pmulld m0, m3, [cq+32*0+r5]
+ pmulld m1, m3, [cq+32*1+r5]
+ pmulld m2, m3, [cq+32*2+r5]
+ pmulld m3, [cq+32*3+r5]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ call m(idct_4x4_internal_16bpc).pass1_main
+ packssdw m0, m1 ; out0 out1
+ packssdw m4, m2 ; out2 out3
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+32*0+16], m0
+ mova [cq+32*1+16], m4
+ xor r5d, r5d
+ jmp .loop_pass1
+.end_pass1:
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ mova m2, [cq+32*0+16]
+ mova m6, [cq+32*1+16]
+ punpckhwd m4, m2, m6
+ punpcklwd m2, m6
+ punpckhwd m3, m2, m4
+ punpcklwd m2, m4
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_4x8_internal_8bpc, _ssse3).main
+ ; m0-3 is now out0/1,3/2,4/5,7/6
+ mova m4, [o(pw_2048)]
+ shufps m1, m1, q1032
+ shufps m3, m3, q1032
+.end:
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ pxor m4, m4
+ REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
+ mova m7, [o(pixel_10bpc_max)]
+ lea r2, [strideq*3]
+ movq m5, [dstq+strideq*0]
+ movq m6, [dstq+strideq*2]
+ movhps m5, [dstq+strideq*1]
+ movhps m6, [dstq+r2]
+ lea r4, [dstq+strideq*4]
+ paddw m0, m5
+ paddw m1, m6
+ movq m5, [r4+strideq*0]
+ movq m6, [r4+strideq*2]
+ movhps m5, [r4+strideq*1]
+ movhps m6, [r4+r2]
+ paddw m2, m5
+ paddw m3, m6
+ REPX {pminsw x, m7}, m0, m1, m2, m3
+ REPX {pmaxsw x, m4}, m0, m1, m2, m3
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+r2 ], m1
+ movq [r4 +strideq*0], m2
+ movhps [r4 +strideq*1], m2
+ movq [r4 +strideq*2], m3
+ movhps [r4 +r2 ], m3
+ RET
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity, 9
+
+cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ call .pass1_main
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ mova m2, [cq+32*2+16]
+ mova m6, [cq+32*3+16]
+ punpckhwd m4, m2, m6
+ punpcklwd m2, m6
+ punpckhwd m3, m2, m4
+ punpcklwd m2, m4
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.pass1_main:
+%undef cmp
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 13
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 13
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+ lea r3, [cq+32*1+16]
+.loop_pass1:
+ mova m0, [o(pd_2048)]
+ mova m3, [o(pd_2896)]
+ pmulld m5, m3, [cq+32*0+r5]
+ pmulld m2, m3, [cq+32*1+r5]
+ pmulld m1, m3, [cq+32*2+r5]
+ pmulld m3, [cq+32*3+r5]
+ REPX {paddd x, m0}, m5, m2, m1, m3
+ REPX {psrad x, 12}, m5, m2, m1, m3
+ mova [r3], m2
+ call m(iadst_4x4_internal_16bpc).main2
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m4 ; out2 out3
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+32*2+16], m0
+ mova [cq+32*3+16], m1
+ xor r5d, r5d
+ jmp .loop_pass1
+.end_pass1:
+ ret
+.pass2:
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
+ mova m4, [o(pw_4x2048_4xm2048)]
+ jmp m(idct_4x8_internal_16bpc).end
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity, 9
+
+cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ call m(iadst_4x8_internal_16bpc).pass1_main
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ mova m6, [cq+32*2+16]
+ mova m2, [cq+32*3+16]
+ punpcklwd m4, m2, m6
+ punpckhwd m2, m6
+ punpckhwd m3, m2, m4
+ punpcklwd m2, m4
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.pass2:
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
+ mova m4, m0
+ mova m5, m1
+ pshufd m0, m3, q1032
+ pshufd m1, m2, q1032
+ pshufd m2, m5, q1032
+ pshufd m3, m4, q1032
+ mova m4, [o(pw_4xm2048_4x2048)]
+ jmp m(idct_4x8_internal_16bpc).end
+
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity, 3
+
+cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+ mova m5, [o(pd_2048)]
+ mova m4, [o(pd_2896)]
+ mova m6, [o(pd_5793)]
+ ; clear m7 in case we skip the bottom square
+ pxor m7, m7
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 16
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 16
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+.loop_pass1:
+ pmulld m0, m4, [cq+32*0+r5]
+ pmulld m1, m4, [cq+32*1+r5]
+ pmulld m2, m4, [cq+32*2+r5]
+ pmulld m3, m4, [cq+32*3+r5]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ REPX {pmulld x, m6}, m0, m1, m2, m3
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ packssdw m0, m1
+ packssdw m2, m3
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+32*0+16], m0
+ mova m7, m2
+ xor r5d, r5d
+ jmp .loop_pass1
+.end_pass1:
+ punpckhwd m4, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m1, m0, m4
+ punpcklwd m0, m4
+ mova m2, [cq+32*0+16]
+ punpckhwd m4, m2, m7
+ punpcklwd m2, m7
+ punpckhwd m3, m2, m4
+ punpcklwd m2, m4
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.pass2:
+ mova m4, [o(pw_4096)]
+ jmp m(idct_4x8_internal_16bpc).end
+
+%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
+ INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 16
+ add r5d, 384
+ sar r5d, 9
+ jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, identity, v
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+
+cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+ mov r5m, r6d
+%endif
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r5+r6]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, r5m
+%endif
+ mova m5, [o(pd_2048)]
+.loop_pass1:
+ mova m0, [cq+64*0+r5]
+ mova m1, [cq+64*1+r5]
+ mova m2, [cq+64*2+r5]
+ mova m3, [cq+64*3+r5]
+ call m(idct_4x4_internal_16bpc).pass1_main
+ pcmpeqd m3, m3
+ REPX {psubd x, m3}, m0, m1, m4, m2
+ REPX {psrad x, 1}, m0, m1, m4, m2
+ packssdw m0, m1 ; out0 out1
+ packssdw m4, m2 ; out2 out3
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+64*0+r5], m0
+ mova [cq+64*1+r5], m1
+ sub r5d, 16
+ jmp .loop_pass1
+.end_pass1:
+ mova m2, [cq+64*0+16]
+ mova m3, [cq+64*1+16]
+ mova m4, [cq+64*0+32]
+ mova m5, [cq+64*1+32]
+ mova m6, [cq+64*0+48]
+ mova m7, [cq+64*1+48]
+ ; m0-7 = packed & transposed output
+ jmp tx2q
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_16x4_internal_8bpc, _ssse3).main
+    ; m0-6 are out0-13 [with the two outputs in each odd register swapped]
+ ; [coeffq+16*7] has out15/14
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [cq+16*7]
+ REPX {shufps x, x, q1032}, m1, m3, m5, m7
+ mova [cq+16*0], m4
+ mova [cq+16*1], m5
+ mova [cq+16*2], m6
+ mova [cq+16*3], m7
+.end:
+ pxor m4, m4
+ REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ mova m7, [o(pixel_10bpc_max)]
+ mov r5d, 2
+ lea r3, [strideq*3]
+.loop:
+ movq m5, [dstq+strideq*0]
+ movq m6, [dstq+strideq*2]
+ movhps m5, [dstq+strideq*1]
+ movhps m6, [dstq+r3]
+ lea r4, [dstq+strideq*4]
+ paddw m0, m5
+ paddw m1, m6
+ movq m5, [r4+strideq*0]
+ movq m6, [r4+strideq*2]
+ movhps m5, [r4+strideq*1]
+ movhps m6, [r4+r3]
+ paddw m2, m5
+ paddw m3, m6
+ REPX {pminsw x, m7}, m0, m1, m2, m3
+ REPX {pmaxsw x, m4}, m0, m1, m2, m3
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+r3 ], m1
+ movq [r4 +strideq*0], m2
+ movhps [r4 +strideq*1], m2
+ movq [r4 +strideq*2], m3
+ movhps [r4 +r3 ], m3
+ dec r5d
+ jz .end2
+ lea dstq, [dstq+strideq*8]
+ mova m0, [cq+0*16]
+ mova m1, [cq+1*16]
+ mova m2, [cq+2*16]
+ mova m3, [cq+3*16]
+ REPX {mova [cq+x*16], m4}, 0, 1, 2, 3
+ jmp .loop
+.end2:
+ RET
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity, v
+
+cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+ mov r5m, r6d
+%endif
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r6+r5]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, r5m
+%endif
+.loop_pass1:
+ mova m5, [cq+64*0+r5]
+ lea r3, [cq+64*1+r5]
+ mova m1, [cq+64*2+r5]
+ mova m3, [cq+64*3+r5]
+ call m(iadst_4x4_internal_16bpc).main2
+ pcmpeqd m3, m3
+ REPX {psubd x, m3}, m0, m2, m1, m4
+ REPX {psrad x, 1}, m0, m2, m1, m4
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m4 ; out2 out3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ test r5d, r5d
+ jz m(idct_4x16_internal_16bpc).end_pass1
+ mova [cq+64*0+r5], m0
+ mova [cq+64*1+r5], m1
+ sub r5d, 16
+ jmp .loop_pass1
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
+ ; m7/5/2/4 = out4/-11,-5/10,6/-9,-7/8
+ ; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13
+ mova m1, [o(pw_4x2048_4xm2048)]
+ REPX {pmulhrsw x, m1}, m7, m2, m0
+ pshufd m6, m1, q1032 ; 4x-2048,4x2048
+ pmulhrsw m1, [cq+16*7]
+ REPX {pmulhrsw x, m6}, m5, m4, m3
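+; intermediate transform results are clamped to a signed 18-bit range
+; (bitdepth + 8 bits for 10 bpc) between stages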
+ pmulhrsw m6, [cq+16*6]
+ ; m7/5/2/4 = out4/11,5/10,6/9,7/8
+ ; m0/3/6/1 = out0/15,3/12,1/14,2/13
+    ; output should be in m0-3 for out0-7, and cq+0-3*16 for out8-15
+ movhps [cq+0*8], m4
+ movhps [cq+1*8], m2
+ movhps [cq+2*8], m5
+ movhps [cq+3*8], m7
+ movhps [cq+4*8], m3
+ movhps [cq+5*8], m1
+ movhps [cq+6*8], m6
+ movhps [cq+7*8], m0
+ punpcklqdq m0, m6
+ punpcklqdq m1, m3
+ punpcklqdq m3, m2, m4
+ punpcklqdq m2, m7, m5
+ jmp m(idct_4x16_internal_16bpc).end
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity, v
+
+cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+ mov r5m, r6d
+%endif
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r5+r6]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, r5m
+%endif
+.loop_pass1:
+ mova m5, [cq+64*0+r5]
+ lea r3, [cq+64*1+r5]
+ mova m1, [cq+64*2+r5]
+ mova m3, [cq+64*3+r5]
+ call m(iadst_4x4_internal_16bpc).main2
+ pcmpeqd m3, m3
+ REPX {psubd x, m3}, m0, m2, m1, m4
+ REPX {psrad x, 1}, m0, m2, m1, m4
+ packssdw m0, m2 ; out3 out2
+ packssdw m1, m4 ; out1 out0
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ test r5d, r5d
+ jz m(idct_4x16_internal_16bpc).end_pass1
+ mova [cq+64*0+r5], m0
+ mova [cq+64*1+r5], m1
+ sub r5d, 16
+ jmp .loop_pass1
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
+ ; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7
+ ; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2
+ mova m1, [o(pw_4x2048_4xm2048)]
+ REPX {pmulhrsw x, m1}, m7, m2, m0
+ pshufd m6, m1, q1032 ; 4x-2048,4x2048
+ pmulhrsw m1, [cq+16*7]
+ REPX {pmulhrsw x, m6}, m5, m4, m3
+ pmulhrsw m6, [cq+16*6]
+ ; m7/5/2/4 = out11/4,10/5,9/6,8/7
+ ; m0/3/6/1 = out15/0,12/3,14/1,13/2
+    ; output should be in m0-3 for out0-7, and cq+0-3*16 for out8-15
+ movq [cq+0*8], m4
+ movq [cq+1*8], m2
+ movq [cq+2*8], m5
+ movq [cq+3*8], m7
+ movq [cq+4*8], m3
+ movq [cq+5*8], m1
+ movq [cq+6*8], m6
+ movq [cq+7*8], m0
+ punpckhqdq m0, m6
+ punpckhqdq m1, m3
+ punpckhqdq m3, m2, m4
+ punpckhqdq m2, m7, m5
+ jmp m(idct_4x16_internal_16bpc).end
+
+INV_TXFM_4X16_FN identity, dct, h
+INV_TXFM_4X16_FN identity, adst, h
+INV_TXFM_4X16_FN identity, flipadst, h
+INV_TXFM_4X16_FN identity, identity
+
+cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+ mov r5m, r6d
+%endif
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r5+r6]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, r5m
+%endif
+ mova m5, [o(pd_6144)]
+ mova m4, [o(pd_5793)]
+.loop_pass1:
+ pmulld m0, m4, [cq+64*0+r5]
+ pmulld m1, m4, [cq+64*1+r5]
+ pmulld m2, m4, [cq+64*2+r5]
+ pmulld m3, m4, [cq+64*3+r5]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 13}, m0, m1, m2, m3
+ packssdw m0, m1
+ packssdw m2, m3
+ punpckhwd m3, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ test r5d, r5d
+ jz m(idct_4x16_internal_16bpc).end_pass1
+ mova [cq+64*0+r5], m0
+ mova [cq+64*1+r5], m1
+ sub r5d, 16
+ jmp .loop_pass1
+.pass2:
+ mova [cq+16*4], m0
+ mova [cq+16*5], m1
+ mova [cq+16*6], m2
+ mova [cq+16*7], m7
+ mova m0, [o(pw_1697x16)]
+ mova m7, [o(pw_2048)]
+ pmulhrsw m1, m0, m4
+ pmulhrsw m2, m0, m5
+ REPX {paddsw x, x}, m4, m5
+ paddsw m4, m1
+ paddsw m5, m2
+ REPX {pmulhrsw x, m7}, m4, m5
+ mova [cq+16*0], m4
+ mova [cq+16*1], m5
+ mova m4, [cq+16*7]
+ pmulhrsw m1, m0, m6
+ pmulhrsw m2, m0, m4
+ REPX {paddsw x, x}, m6, m4
+ paddsw m6, m1
+ paddsw m4, m2
+ REPX {pmulhrsw x, m7}, m6, m4
+ mova [cq+16*2], m6
+ mova [cq+16*3], m4
+ mova m4, [cq+16*4]
+ mova m1, [cq+16*5]
+ mova m2, [cq+16*6]
+ pmulhrsw m5, m0, m2
+ pmulhrsw m6, m0, m3
+ REPX {paddsw x, x}, m2, m3
+ paddsw m2, m5
+ paddsw m3, m6
+ pmulhrsw m6, m0, m1
+ pmulhrsw m0, m4
+ REPX {paddsw x, x}, m1, m4
+ paddsw m1, m6
+ paddsw m0, m4
+ REPX {pmulhrsw x, m7}, m2, m3, m1, m0
+ jmp m(idct_4x16_internal_16bpc).end
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, 0, 8x4, 15
+%else
+ INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m6, [o(pixel_10bpc_max)]
+ pxor m5, m5
+ lea r2, [strideq*3]
+ mova m1, [dstq+strideq*0]
+ mova m2, [dstq+strideq*1]
+ mova m3, [dstq+strideq*2]
+ mova m4, [dstq+r2]
+ REPX {paddw x, m0}, m1, m2, m3, m4
+ REPX {pmaxsw x, m5}, m1, m2, m3, m4
+ REPX {pminsw x, m6}, m1, m2, m3, m4
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ mova [dstq+strideq*2], m3
+ mova [dstq+r2 ], m4
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, identity
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+
+cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r5, [o(.main)]
+.pass1_entry:
+%if ARCH_X86_32
+ lea r3, [rsp+gprsize]
+%else
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+0*16]
+ mova m1, [cq+1*16]
+ mova m2, [cq+2*16]
+ mova m3, [cq+3*16]
+ mova m4, [cq+4*16]
+ mova m5, [cq+5*16]
+ mova m6, [cq+6*16]
+ mova m7, [cq+7*16]
+ call .rect2_mul
+ call r5
+ call .transpose4x8packed
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.transpose4x8packed:
+ ; transpose
+ punpcklwd m1, m2, m6
+ punpckhwd m2, m6
+ punpckhwd m6, m0, m4
+ punpcklwd m0, m4
+
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m4, m6, m2
+ punpcklwd m6, m2
+
+ punpcklwd m2, m3, m4
+ punpckhwd m3, m4
+ punpckhwd m1, m0, m6
+ punpcklwd m0, m6
+ ret
+.main:
+ call .main_pass1
+ call .round
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ ret
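+    ; rectangular (2:1 aspect) transforms pre-scale the input by 1/sqrt(2),
+    ; i.e. a rounded multiply by 2896/4096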
+.rect2_mul:
+%if ARCH_X86_64
+ REPX {pmulld x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+%else
+ mova [r3], m7
+ mova m7, [o(pd_2896)]
+ REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulld m7, [r3]
+ mova [r3], m7
+ mova m7, [o(pd_2048)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+%endif
+ REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+%if ARCH_X86_64
+.main_pass1_fast:
+ pmulld m5, m3, [o(pd_m2276)]
+ pmulld m3, [o(pd_3406)]
+ pmulld m7, m1, [o(pd_4017)]
+ pmulld m1, [o(pd_799)]
+ pmulld m6, m2, [o(pd_3784)]
+ pmulld m2, [o(pd_1567)]
+ pmulld m0, m14
+ pxor m4, m4
+ jmp .main_pass1_fast2
+.main_pass1:
+ ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
+ ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a
+ ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3
+ REPX {pmulld x, m14}, m0, m4
+.main_pass1_fast2:
+ REPX {paddd x, m11}, m1, m2, m3, m5, m6, m7
+ REPX {psrad x, 12 }, m1, m2, m3, m5, m6, m7
+ paddd m8, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ paddd m9, m7, m3 ; t7
+ psubd m7, m3 ; t6a
+ REPX {pmaxsd x, m12}, m1, m8, m7, m9
+ REPX {pminsd x, m13}, m1, m8, m7, m9
+ REPX {pmulld x, m14}, m7, m1
+ paddd m0, m11
+ paddd m7, m11
+ psubd m5, m0, m4
+ paddd m0, m4
+ psubd m4, m7, m1
+ paddd m7, m1
+ REPX {psrad x, 12 }, m5, m0, m4, m7
+ psubd m3, m0, m6 ; dct4 out3
+ paddd m0, m6 ; dct4 out0
+ paddd m6, m5, m2 ; dct4 out1
+ psubd m5, m2 ; dct4 out2
+ REPX {pmaxsd x, m12}, m0, m6, m5, m3
+ REPX {pminsd x, m13}, m0, m6, m5, m3
+ ret
+.round:
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ psubd m7, m0, m9 ; out7
+ paddd m0, m9 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+%else
+.main_pass1_fast:
+ pmulld m5, m3, [o(pd_m2276)]
+ pmulld m3, [o(pd_3406)]
+ pmulld m7, m1, [o(pd_4017)]
+ pmulld m1, [o(pd_799)]
+ pmulld m6, m2, [o(pd_3784)]
+ pmulld m2, [o(pd_1567)]
+ mova m4, [o(pd_2048)]
+ mova [r3+0*16], m2
+ REPX {paddd x, m4}, m5, m3, m7, m1
+ REPX {psrad x, 12}, m5, m3, m7, m1
+ paddd m2, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ pmulld m5, m0, [o(pd_2896)]
+ mova m0, m4
+ paddd m4, m7, m3 ; t7
+ psubd m7, m3 ; t6a
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3 }, m1, m2, m7, m4
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3 }, m1, m2, m7, m4
+ mova [r3+3*16], m2
+ mova [r3+1*16], m4
+ pxor m4, m4
+ mova m2, [r3+0*16]
+ mova m3, [o(pd_2896)]
+ jmp .main_pass1_fast2
+.main_pass1:
+ mova [r3+0*16], m0
+ mova [r3+1*16], m2
+ mova [r3+2*16], m4
+ mova [r3+3*16], m6
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a
+ ITX_MULSUB_2D 1, 7, 2, 4, 6, 0, 799, 4017 ; t4a t7a
+ paddd m2, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ paddd m4, m7, m3 ; t7
+ psubd m7, m3 ; t6a
+ mova m6, [o(clip_18b_min)]
+ REPX {pmaxsd x, m6 }, m1, m2, m7, m4
+ mova m6, [o(clip_18b_max)]
+ REPX {pminsd x, m6 }, m1, m2, m7, m4
+ mova m6, [r3+3*16]
+ mova [r3+3*16], m2
+ mova m2, [r3+1*16]
+ mova [r3+1*16], m4
+
+ ITX_MULSUB_2D 2, 6, 4, 3, 5, _, 1567, 3784 ; t2 t3
+ mova m3, [o(pd_2896)]
+ mova m5, [r3+0*16]
+ mova m4, [r3+2*16]
+ REPX {pmulld x, m3 }, m5, m4
+.main_pass1_fast2:
+ REPX {paddd x, m0 }, m2, m6
+ REPX {psrad x, 12 }, m2, m6
+ REPX {pmulld x, m3 }, m7, m1
+ paddd m7, m0
+ paddd m0, m5
+
+ psubd m5, m0, m4
+ paddd m0, m4
+ psubd m4, m7, m1
+ paddd m7, m1
+ REPX {psrad x, 12 }, m5, m0, m4, m7
+ psubd m3, m0, m6 ; dct4 out3
+ paddd m0, m6 ; dct4 out0
+ paddd m6, m5, m2 ; dct4 out1
+ psubd m5, m2 ; dct4 out2
+
+ mova m1, [o(clip_18b_min)]
+ REPX {pmaxsd x, m1 }, m0, m6, m5, m3
+ mova m1, [o(clip_18b_max)]
+ REPX {pminsd x, m1 }, m0, m6, m5, m3
+ ret
+.round:
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ mova [r3+0*16], m6
+ mova m6, [r3+1*16]
+ psubd m7, m0, m6 ; out7
+ paddd m0, m6 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ mova m6, [r3+3*16]
+ psubd m4, m3, m6 ; out4
+ paddd m3, m6 ; out3
+ mova m6, [r3+0*16]
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_8x4_internal_8bpc, _ssse3).main
+.end:
+ lea r3, [strideq*3]
+ call .round2_and_write_8x4
+ REPX {mova [cq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ RET
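+    ; shared 8x4 store helpers: .round2 loads the zero/pixel_max/pw_2048
+    ; constants, .round1 applies the rounding, and .write_8x4 adds the result
+    ; to dst with clamping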
+.round2_and_write_8x4:
+ pxor m6, m6
+ mova m5, [o(pixel_10bpc_max)]
+ mova m4, [o(pw_2048)]
+.round1_and_write_8x4:
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+.write_8x4:
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r3]
+ REPX {pminsw x, m5}, m0, m1, m2, m3
+ REPX {pmaxsw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r3 ], m3
+ ret
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r5, [o(.main)]
+ jmp m(idct_8x4_internal_16bpc).pass1_entry
+.main:
+ call .main_pass1
+ call .round
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ ret
+.main_pass1:
+%if ARCH_X86_64
+ ITX_MULSUB_2D 7, 0, 8, 9, 10, 11, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2D 1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a
+ ITX_MULSUB_2D 5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a
+ ITX_MULSUB_2D 3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a
+ psubd m8, m2, m6 ; t6
+ paddd m2, m6 ; t2
+ psubd m6, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ psubd m4, m5, m1 ; t7
+ paddd m5, m1 ; t3
+ psubd m1, m7, m3 ; t5
+ paddd m7, m3 ; t1
+ REPX {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7
+ REPX {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7
+ ITX_MULSUB_2D 6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2D 4, 8, 3, 9, 10, 11, 3784, 10 ; t6a, t7a
+ psubd m9, m6, m8 ; t7
+ paddd m6, m8 ; out6
+ mova m8, [o(pd_2896)]
+ psubd m3, m7, m5 ; t3
+ paddd m7, m5 ; -out7
+ psubd m5, m0, m2 ; t2
+ paddd m0, m2 ; out0
+ psubd m2, m1, m4 ; t6
+ paddd m1, m4 ; -out1
+ REPX {pmaxsd x, m12}, m5, m3, m2, m9
+ REPX {pminsd x, m13}, m5, m3, m2, m9
+ REPX {pmulld x, m14}, m5, m3, m2, m9
+ psubd m4, m5, m3 ; (t2 - t3) * 2896
+ paddd m3, m5 ; (t2 + t3) * 2896
+ psubd m5, m2, m9 ; (t6 - t7) * 2896
+ paddd m2, m9 ; (t6 + t7) * 2896
+ ret
+.round:
+
+ ; m0=out0,m1=-out1,m6=out6,m7=-out7
+
+ pcmpeqd m8, m8
+ REPX {pxor x, m8 }, m1, m7, m3, m5
+ REPX {psubd x, m8 }, m1, m7
+ REPX {paddd x, m11}, m2, m3, m4, m5
+ REPX {psrad x, 12 }, m2, m3, m4, m5
+%else
+ mova [r3+0*16], m2
+ mova [r3+1*16], m3
+ mova [r3+2*16], m4
+ mova [r3+3*16], m5
+ mova m5, [o(pd_2048)]
+
+ ITX_MULSUB_2D 7, 0, 2, 3, 4, 5, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2D 1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a
+ mova m2, [r3+0*16]
+ mova m3, [r3+1*16]
+ mova m4, [r3+2*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+2*16], m6
+ mova m1, [r3+3*16]
+ mova [r3+3*16], m7
+ ITX_MULSUB_2D 1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a
+ ITX_MULSUB_2D 3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a
+ mova m0, [r3+0*16]
+ mova m6, [r3+2*16]
+ psubd m7, m2, m6 ; t6
+ paddd m2, m6 ; t2
+ psubd m6, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ mova [r3+0*16], m7
+ mova m5, [r3+1*16]
+ mova m7, [r3+3*16]
+ psubd m4, m1, m5 ; t7
+ paddd m5, m1 ; t3
+ psubd m1, m7, m3 ; t5
+ paddd m7, m3 ; t1
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7
+ mova [r3+1*16], m7
+ mova m7, [o(clip_18b_max)]
+ pmaxsd m3, [r3+0*16]
+ REPX {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5
+ pminsd m7, [r3+1*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m2
+ mova [r3+2*16], m5
+ mova [r3+3*16], m7
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2D 4, 3, 2, 5, 7, 0, 3784, 7 ; t6a, t7a
+ mova m5, [r3+2*16]
+ mova m7, [r3+3*16]
+ psubd m2, m6, m3 ; t7
+ paddd m6, m3 ; out6
+ mova [r3+3*16], m6
+ mova m0, [r3+0*16]
+ mova m6, [r3+1*16]
+ psubd m3, m7, m5 ; t3
+ paddd m7, m5 ; -out7
+ psubd m5, m0, m6 ; t2
+ paddd m0, m6 ; out0
+ psubd m6, m1, m4 ; t6
+ paddd m1, m4 ; -out1
+ mova m4, [o(clip_18b_min)]
+ REPX {pmaxsd x, m4 }, m5, m3, m6, m2
+ mova m4, [o(clip_18b_max)]
+ REPX {pminsd x, m4 }, m5, m3, m6, m2
+ mova m4, [o(pd_2896)]
+ REPX {pmulld x, m4 }, m5, m3, m6, m2
+ psubd m4, m5, m3 ; (t2 - t3) * 2896
+ paddd m3, m5 ; (t2 + t3) * 2896
+ psubd m5, m6, m2 ; (t6 - t7) * 2896
+ paddd m2, m6 ; (t6 + t7) * 2896
+ ret
+.round:
+ mova [r3+2*16], m0
+
+ pcmpeqd m0, m0
+ mova m6, [o(pd_2048)]
+ REPX {pxor x, m0 }, m1, m7, m3, m5
+ REPX {psubd x, m0 }, m1, m7
+ REPX {paddd x, m6 }, m2, m3, m4, m5
+ REPX {psrad x, 12 }, m2, m3, m4, m5
+
+ mova m6, [r3+3*16]
+ mova m0, [r3+2*16]
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
+ jmp m(idct_8x4_internal_16bpc).end
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r5, [o(.main)]
+ jmp m(idct_8x4_internal_16bpc).pass1_entry
+.main:
+ call m(iadst_8x4_internal_16bpc).main_pass1
+ call m(iadst_8x4_internal_16bpc).round
+ packssdw m7, m6
+ packssdw m5, m4
+ packssdw m3, m2
+ packssdw m1, m0
+ mova m0, m7
+ mova m2, m5
+ mova m4, m3
+ mova m6, m1
+ ret
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
+ lea r3, [strideq*3]
+ add dstq, r3
+ neg strideq
+ jmp m(idct_8x4_internal_16bpc).end
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r5, [o(.main)]
+ jmp m(idct_8x4_internal_16bpc).pass1_entry
+.main:
+ REPX {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ ret
+.pass2:
+ mova m7, [o(pw_1697x8)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(idct_8x4_internal_16bpc).end
+
+%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, %3, 8x8, 15, 0-3*16
+%else
+ INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 2
+.end:
+ add r5d, 384
+ sar r5d, 9
+.end2:
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m6, [o(pixel_10bpc_max)]
+ pxor m5, m5
+ lea r2, [strideq*3]
+.loop:
+ mova m1, [dstq+strideq*0]
+ mova m2, [dstq+strideq*1]
+ mova m3, [dstq+strideq*2]
+ mova m4, [dstq+r2]
+ REPX {paddw x, m0}, m1, m2, m3, m4
+ REPX {pmaxsw x, m5}, m1, m2, m3, m4
+ REPX {pminsw x, m6}, m1, m2, m3, m4
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ mova [dstq+strideq*2], m3
+ mova [dstq+r2 ], m4
+ lea dstq, [dstq+strideq*4]
+ dec r3d
+ jg .loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, identity, 6
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+
+cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1
+ mov [rsp+4*16+1*gprsize], r1
+%else
+ DECLARE_REG_TMP 6
+%endif
+ lea t0, [o(.pass1_main)]
+
+.pass1_full:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+%undef cmp
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 10
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 10
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+%if ARCH_X86_32
+ lea r3, [rsp+gprsize]
+%endif
+.loop_pass1:
+ mova m0, [cq+0*32+r5]
+ mova m1, [cq+1*32+r5]
+ mova m2, [cq+2*32+r5]
+ mova m3, [cq+3*32+r5]
+ mova m4, [cq+4*32+r5]
+ mova m5, [cq+5*32+r5]
+ mova m6, [cq+6*32+r5]
+ mova m7, [cq+7*32+r5]
+ call t0
+
+ test r5d, r5d
+ jz .end_pass1
+
+ mova [cq+0*32+16], m0
+ mova [cq+1*32+16], m1
+ mova [cq+2*32+16], m2
+ mova [cq+3*32+16], m3
+
+ sub r5d, 16
+ jmp .loop_pass1
+.end_pass1:
+ mova m4, [cq+0*32+16]
+ mova m5, [cq+1*32+16]
+ mova m6, [cq+2*32+16]
+ mova m7, [cq+3*32+16]
+%if ARCH_X86_32
+ mov r1, [rsp+4*16+1*gprsize]
+%endif
+ jmp tx2q
+.pass1_main:
+ call m(idct_8x4_internal_16bpc).main_pass1
+ pcmpeqd m1, m1
+ REPX {psubd x, m1}, m0, m6, m5, m3
+ call m(idct_8x4_internal_16bpc).round
+ REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
+.pack_and_transpose:
+ packssdw m2, m3
+ packssdw m6, m7
+ packssdw m0, m1
+ packssdw m4, m5
+ jmp m(idct_8x4_internal_16bpc).transpose4x8packed
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ lea r3, [strideq*3]
+%if ARCH_X86_64
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+%endif
+ call .round3_and_write_8x8
+.zero:
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+%undef mzero
+ RET
+
+ ; round (rounded right-shift by 5) before writing
+ ; data in m0-7
+ ; on x86-64, pw_2048 is in m8
+ ; .round1 is for m0-7
+ ; .round2 is for m0-6 & [rsp+gprsize*2]
+    ; .round3 is the same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
+    ; .round4 is x86-32-only; it is similar to .round2 but with the constant already in m7
+%if ARCH_X86_32
+.round1_and_write_8x8:
+ mova [rsp+gprsize*2], m7
+.round2_and_write_8x8:
+%endif
+.round3_and_write_8x8:
+ mova m7, [o(pw_2048)]
+%if ARCH_X86_32
+.round4_and_write_8x8:
+%endif
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [rsp+gprsize*2]
+%if ARCH_X86_64
+ jmp .write_8x8
+.round2_and_write_8x8:
+ mova m7, [rsp+gprsize*2]
+.round1_and_write_8x8:
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+%endif
+
+ ; m0-7 have to-be-written data [pre-rounded]
+ ; on x86-64, m9-10 contain a zero/pixel_max
+ ; on x86-32, these are runtime-generated, and [rsp+gprsize*2] is scratch
+ ; r0,1,3 contain dstq/strideq/stride3q
+ ; r5 is a scratch register
+.write_8x8:
+ lea r5, [dstq+strideq*4]
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r3]
+ paddw m4, [r5 +strideq*0]
+ paddw m5, [r5 +strideq*1]
+ paddw m6, [r5 +strideq*2]
+ paddw m7, [r5 +r3]
+%if ARCH_X86_64
+ REPX {pmaxsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+%else
+ mova [rsp+gprsize*2], m7
+ pxor m7, m7
+ REPX {pmaxsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmaxsw m7, [rsp+gprsize*2]
+ mova [rsp+gprsize*2], m7
+ mova m7, [o(pixel_10bpc_max)]
+ REPX {pminsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsw m7, [rsp+gprsize*2]
+%endif
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r3 ], m3
+ mova [r5 +strideq*0], m4
+ mova [r5 +strideq*1], m5
+ mova [r5 +strideq*2], m6
+ mova [r5 +r3 ], m7
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity, 6
+
+cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+4*16+1*gprsize], r1
+%endif
+ lea t0, [o(.pass1_main)]
+ jmp m(idct_8x8_internal_16bpc).pass1_full
+.pass1_main:
+ call m(iadst_8x4_internal_16bpc).main_pass1
+ call .round
+ jmp m(idct_8x8_internal_16bpc).pack_and_transpose
+.round:
+%if ARCH_X86_64
+ pcmpeqd m8, m8 ; -1
+ REPX {psubd x, m8 }, m0, m6
+ REPX {pxor x, m8 }, m1, m7, m3, m5
+ REPX {psrad x, 1 }, m0, m1, m6, m7
+ REPX {psubd x, m8 }, m1, m7
+ mova m8, [o(pd_6144)]
+ REPX {paddd x, m8 }, m2, m3, m4, m5
+ REPX {psrad x, 13 }, m2, m3, m4, m5
+%else
+ mova [r3+2*16], m0
+
+ pcmpeqd m0, m0 ; -1
+ mova m6, [o(pd_6144)]
+ REPX {pxor x, m0 }, m1, m7, m3, m5
+ REPX {psrad x, 1 }, m1, m7
+ REPX {psubd x, m0 }, m1, m7
+ REPX {paddd x, m6 }, m2, m3, m4, m5
+ REPX {psrad x, 13 }, m2, m3, m4, m5
+
+ mova m0, [r3+2*16]
+ psrld m6, 12 ; +1
+ paddd m0, m6
+ paddd m6, [r3+3*16]
+ REPX {psrad x, 1 }, m0, m6
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
+ lea r3, [strideq*3]
+%if ARCH_X86_64
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+%endif
+ call .round3_and_write_8x8
+ jmp m(idct_8x8_internal_16bpc).zero
+
+ ; round (rounded right-shift by 5) before writing; odd registers are negated
+ ; data in m0-7
+ ; on x86-64, pw_2048 is in m8 and pw_m2048 is in m11
+ ; .round1 is for m0-7
+ ; .round2 is for m0-6 & [rsp+gprsize*2]
+    ; .round3 is the same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
+%if ARCH_X86_64
+.round2_and_write_8x8:
+ mova m7, [rsp+gprsize*2]
+.round1_and_write_8x8:
+ REPX {pmulhrsw x, m8 }, m0, m2, m4, m6
+ REPX {pmulhrsw x, m11}, m1, m3, m5, m7
+ jmp m(idct_8x8_internal_16bpc).write_8x8
+%else
+.round1_and_write_8x8:
+ mova [rsp+gprsize*2], m7
+.round2_and_write_8x8:
+%endif
+.round3_and_write_8x8:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova m7, [o(pw_m2048)]
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, [rsp+gprsize*2]
+ jmp m(idct_8x8_internal_16bpc).write_8x8
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity, 6
+
+cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+4*16+1*gprsize], r1
+%endif
+ lea t0, [o(.pass1_main)]
+ jmp m(idct_8x8_internal_16bpc).pass1_full
+.pass1_main:
+ call m(iadst_8x4_internal_16bpc).main_pass1
+ call m(iadst_8x8_internal_16bpc).round
+ ; invert registers
+ packssdw m7, m6
+ packssdw m5, m4
+ packssdw m3, m2
+ packssdw m1, m0
+ mova m0, m7
+ mova m2, m5
+ mova m4, m3
+ mova m6, m1
+ jmp m(idct_8x4_internal_16bpc).transpose4x8packed
+
+.pass2:
+ lea dstq, [dstq+strideq*8]
+ sub dstq, strideq
+ neg strideq
+ jmp m(iadst_8x8_internal_16bpc).pass2
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+0*32]
+ mova m1, [cq+1*32]
+ mova m2, [cq+2*32]
+ mova m3, [cq+3*32]
+ mova m4, [cq+4*32]
+ mova m5, [cq+5*32]
+ mova m6, [cq+6*32]
+ mova m7, [cq+7*32]
+ packssdw m0, [cq+0*32+16]
+ packssdw m1, [cq+1*32+16]
+ packssdw m2, [cq+2*32+16]
+ packssdw m3, [cq+3*32+16]
+ packssdw m4, [cq+4*32+16]
+ packssdw m5, [cq+5*32+16]
+ packssdw m6, [cq+6*32+16]
+ packssdw m7, [cq+7*32+16]
+ mova [rsp+gprsize+16*1], m6
+ jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ lea r3, [strideq*3]
+%if ARCH_X86_64
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+ mova m8, [o(pw_4096)]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+%else
+ mova [rsp+gprsize], m7
+ mova m7, [o(pw_4096)]
+ call m(idct_8x8_internal_16bpc).round4_and_write_8x8
+%endif
+ jmp m(idct_8x8_internal_16bpc).zero
+
+%macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 15, 0-16*16
+%else
+ INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ mov r3d, 4
+%if stack_size_padded > 0
+ ; adjust to caller's stack allocation
+ add rsp, (12+ARCH_X86_64)*16
+%endif
+ jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity, v
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+
+%if ARCH_X86_64
+DECLARE_REG_TMP 7
+%endif
+
+cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ PUSH r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(m(idct_8x8_internal_16bpc).pass1_main)]
+.pass1_full:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+%undef cmp
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r5+r6]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, [rsp+16*16+2*gprsize]
+ ; setup stack pointer
+ lea r3, [rsp+gprsize]
+%endif
+.loop_pass1:
+ mova m0, [cq+0*64+r5]
+ mova m1, [cq+1*64+r5]
+ mova m2, [cq+2*64+r5]
+ mova m3, [cq+3*64+r5]
+ mova m4, [cq+4*64+r5]
+ mova m5, [cq+5*64+r5]
+ mova m6, [cq+6*64+r5]
+ mova m7, [cq+7*64+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call t0
+
+ mova [cq+0*64+r5], m0
+ mova [cq+1*64+r5], m1
+ mova [cq+2*64+r5], m2
+ mova [cq+3*64+r5], m3
+ sub r5d, 16
+ jge .loop_pass1
+%if WIN64
+ POP r7
+%elif ARCH_X86_32
+ mov r1, [rsp+16*16+1*gprsize]
+%endif
+ jmp tx2q
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+
+    ; input is at cq+N*16, where N=0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15
+    ; some are still pre-loaded from the final loop iteration of pass 1
+
+ mova m1, m2
+ mova m2, [cq+ 1*16]
+ mova m3, [cq+ 9*16]
+ mova m4, [cq+ 2*16]
+ mova m5, [cq+10*16]
+ mova m6, [cq+ 3*16]
+ mova m7, [cq+11*16]
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ mova [rsp+gprsize+3*16], m0
+ mova [rsp+gprsize+4*16], m1
+ mova [rsp+gprsize+5*16], m2
+ mova [rsp+gprsize+6*16], m3
+ mova [rsp+gprsize+7*16], m4
+ mova [rsp+gprsize+8*16], m5
+ mova [rsp+gprsize+9*16], m6
+ ; m7 is already stored in [rsp+gprsize+0*16]
+ mova m0, [cq+ 4*16]
+ mova m1, [cq+12*16]
+ mova m2, [cq+ 5*16]
+ mova m3, [cq+13*16]
+ mova m4, [cq+ 6*16]
+ mova m5, [cq+14*16]
+ mova m6, [cq+ 7*16]
+ mova m7, [cq+15*16]
+ call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+
+ ; out0-7 is in rsp+gprsize+3-10*mmsize
+ ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
+
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+ mov r6, dstq
+%else
+ mov [rsp+16*16+gprsize*1], dstq
+%endif
+ lea r3, [strideq*3]
+ lea dstq, [dstq+strideq*8]
+ call m(idct_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+%undef mzero
+ mova m0, [rsp+gprsize+ 3*16]
+ mova m1, [rsp+gprsize+ 4*16]
+ mova m2, [rsp+gprsize+ 5*16]
+ mova m3, [rsp+gprsize+ 6*16]
+ mova m4, [rsp+gprsize+ 7*16]
+ mova m5, [rsp+gprsize+ 8*16]
+ mova m6, [rsp+gprsize+ 9*16]
+ mova m7, [rsp+gprsize+10*16]
+%if ARCH_X86_64
+ mov dstq, r6
+%else
+ mov dstq, [rsp+16*16+gprsize*1]
+%endif
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ RET
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity, v
+
+cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ PUSH r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)]
+ jmp m(idct_8x16_internal_16bpc).pass1_full
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m4, [cq+ 9*16]
+ mova m5, [cq+13*16]
+ mova [rsp+gprsize+7*16], m0
+ mova [rsp+gprsize+8*16], m1
+ mova [rsp+gprsize+5*16], m4
+ mova [rsp+gprsize+6*16], m5
+ mova m0, m2
+ mova m1, m3
+ mova m2, [cq+ 1*16]
+ mova m3, [cq+ 5*16]
+ mova m4, [cq+ 2*16]
+ mova m5, [cq+ 6*16]
+ mova m6, [cq+11*16]
+ mova m7, [cq+15*16]
+ mova [rsp+gprsize+ 3*16], m4
+ mova [rsp+gprsize+ 4*16], m5
+ mova [rsp+gprsize+ 9*16], m6
+ mova [rsp+gprsize+10*16], m7
+ mova m4, [cq+10*16]
+ mova m5, [cq+14*16]
+ mova m6, [cq+ 3*16]
+ mova m7, [cq+ 7*16]
+ call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
+
+%if ARCH_X86_64
+ mova m11, [o(pw_m2048)]
+ mova m8, [o(pw_2048)]
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+ mov r6, dstq
+%else
+ mov [rsp+16*16+gprsize*1], dstq
+%endif
+ lea r3, [strideq*3]
+ lea dstq, [dstq+strideq*8]
+ call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+%undef mzero
+ mova m0, [rsp+gprsize+ 3*16]
+ mova m1, [rsp+gprsize+ 4*16]
+ mova m2, [rsp+gprsize+ 5*16]
+ mova m3, [rsp+gprsize+ 6*16]
+ mova m4, [rsp+gprsize+ 7*16]
+ mova m5, [rsp+gprsize+ 8*16]
+ mova m6, [rsp+gprsize+ 9*16]
+ mova m7, [rsp+gprsize+10*16]
+%if ARCH_X86_64
+ mov dstq, r6
+%else
+ mov dstq, [rsp+16*16+gprsize*1]
+%endif
+ call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
+ RET
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity, v
+
+cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ PUSH r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(m(iflipadst_8x8_internal_16bpc).pass1_main)]
+ jmp m(idct_8x16_internal_16bpc).pass1_full
+
+.pass2:
+ lea r3, [strideq*3]
+ lea r3, [r3*5]
+ add dstq, r3
+ neg strideq
+ jmp m(iadst_8x16_internal_16bpc).pass2
+
+INV_TXFM_8X16_FN identity, dct, h
+INV_TXFM_8X16_FN identity, adst, h
+INV_TXFM_8X16_FN identity, flipadst, h
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ PUSH r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)]
+ jmp m(idct_8x16_internal_16bpc).pass1_full
+
+.pass2:
+%if ARCH_X86_64
+ mova m4, [o(pw_2048)]
+ mova m5, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ mova m7, [o(pw_1697x16)]
+%endif
+ mov r5d, 4
+ lea r3, [strideq*3]
+.pass2_loop:
+ call .main
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).round1_and_write_8x4
+%else
+ call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+%endif
+ REPX {mova [cq+x*16], m6}, 0, 4, 8, 12, 16, 20, 24, 28
+ dec r5d
+ jle .end
+ add cq, 16
+ lea dstq, [dstq+strideq*4]
+ mova m0, [cq+ 0*16]
+ mova m1, [cq+ 4*16]
+ mova m2, [cq+ 8*16]
+ mova m3, [cq+12*16]
+ jmp .pass2_loop
+.end:
+ RET
+.main:
+ ; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y)
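+    ; pw_1697x16 is 1697*16, so y ~= x*1697/2048 and the result below is
+    ; ~2.8284*x, i.e. the 2*sqrt(2) scale of the 16-point identity
+    ; transform, computed in saturating 16-bit arithmetic.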
+%if ARCH_X86_32
+ mova m7, [o(pw_1697x16)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+%else
+ pmulhrsw m8, m7, m0
+ pmulhrsw m9, m7, m1
+ pmulhrsw m10, m7, m2
+ pmulhrsw m11, m7, m3
+%endif
+ REPX {paddsw x, x}, m0, m1, m2, m3
+%if ARCH_X86_64
+ paddsw m0, m8
+ paddsw m1, m9
+ paddsw m2, m10
+ paddsw m3, m11
+%else
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+%endif
+ ret
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, 0, 16x4, 16, 0-8*16
+%else
+ INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 4
+.dconly:
+ add r5d, 384
+ sar r5d, 9
+.dconly2:
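+    ; r5d holds the dc term for the final stage; the x2896 multiply with
+    ; a +34816 (32768+2048) bias leaves a rounded pixel offset in the
+    ; high word, which is broadcast and added to each row below, clamped
+    ; to [0, pixel_10bpc_max].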
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m3, [o(pixel_10bpc_max)]
+ pxor m4, m4
+.loop:
+ mova m1, [dstq+ 0]
+ mova m2, [dstq+16]
+ REPX {paddw x, m0}, m1, m2
+ REPX {pminsw x, m3}, m1, m2
+ REPX {pmaxsw x, m4}, m1, m2
+ mova [dstq+ 0], m1
+ mova [dstq+16], m2
+ add dstq, strideq
+ dec r3d
+ jg .loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, identity
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+
+cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ ; setup stack pointer
+ lea r3, [rsp+gprsize]
+
+ mova m0, [cq+ 1*16]
+ mova m1, [cq+ 3*16]
+ mova m2, [cq+ 5*16]
+ mova m3, [cq+ 7*16]
+ mova m4, [cq+ 9*16]
+ mova m5, [cq+11*16]
+ mova m6, [cq+13*16]
+ mova m7, [cq+15*16]
+ call .main_oddhalf
+ mova m0, [cq+ 0*16]
+ mova m1, [cq+ 2*16]
+ mova m2, [cq+ 4*16]
+ mova m3, [cq+ 6*16]
+ mova m4, [cq+ 8*16]
+ mova m5, [cq+10*16]
+ mova m6, [cq+12*16]
+ mova m7, [cq+14*16]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ ; t0-7 is in m0-7
+
+ call .round
+
+%if ARCH_X86_64
+.pack_transpose:
+ ; transpose in two parts
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+.transpose:
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call .transpose4x8packed_hi
+%else
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+2*16], m2
+ mova [r3+3*16], m3
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+ 9*16]
+ mova m4, [r3+10*16]
+ mova m6, [r3+11*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ jmp tx2q
+%if ARCH_X86_64
+.transpose4x8packed_hi:
+ punpcklwd m9, m10, m14
+ punpckhwd m10, m14
+ punpckhwd m14, m8, m12
+ punpcklwd m8, m12
+
+ punpckhwd m11, m8, m9
+ punpcklwd m8, m9
+ punpckhwd m12, m14, m10
+ punpcklwd m14, m10
+
+ punpcklwd m10, m11, m12
+ punpckhwd m11, m12
+ punpckhwd m9, m8, m14
+ punpcklwd m8, m14
+ ret
+%endif
+.main_oddhalf_fast: ; lower half zero
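+    ; with the lower half of the inputs known to be zero, the first
+    ; butterfly stage degenerates into plain multiplies of m0-m3 by the
+    ; cos/sin constants, after which this joins the common path at
+    ; .main_oddhalf_fast2.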
+ pmulld m7, m0, [o(pd_4076)]
+ pmulld m0, [o(pd_401)]
+ pmulld m6, m1, [o(pd_m1189)]
+ pmulld m1, [o(pd_3920)]
+%if ARCH_X86_32
+ mova m4, [o(pd_2048)]
+ REPX {paddd x, m4}, m1, m6
+ REPX {psrad x, 12}, m1, m6
+ mova [r3+1*16], m1
+%endif
+ pmulld m5, m2, [o(pd_3612)]
+ pmulld m2, [o(pd_1931)]
+%if ARCH_X86_32
+ pmulld m1, m3, [o(pd_m2598)]
+%else
+ pmulld m4, m3, [o(pd_m2598)]
+%endif
+ pmulld m3, [o(pd_3166)]
+ jmp .main_oddhalf_fast2
+.main_oddhalf:
+%if ARCH_X86_64
+ ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
+ ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a
+.main_oddhalf_fast2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m8, m0, m4 ; t9
+ paddd m0, m4 ; t8
+ psubd m4, m6, m2 ; t10
+ paddd m2, m6 ; t11
+ psubd m6, m1, m5 ; t13
+ paddd m5, m1 ; t12
+ psubd m1, m7, m3 ; t14
+ paddd m7, m3 ; t15
+ REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
+ REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
+ mova m15, [o(pd_3784)]
+ mova m10, [o(pd_1567)]
+ ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4
+ psubd m3, m1, m4 ; t10
+ paddd m1, m4 ; t9
+ psubd m4, m0, m2 ; t11a
+ paddd m0, m2 ; t8a
+ psubd m2, m8, m6 ; t13
+ paddd m6, m8 ; t14
+ psubd m8, m7, m5 ; t12a
+ paddd m7, m5 ; t15a
+ REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
+ REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
+ REPX {pmulld x, m14}, m2, m8, m3, m4
+ paddd m2, m11
+ paddd m8, m11
+ paddd m5, m2, m3 ; t13a
+ psubd m2, m3 ; t10a
+ psubd m3, m8, m4 ; t11
+ paddd m4, m8 ; t12
+ REPX {psrad x, 12}, m5, m2, m3, m4
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+2*16], m2
+ mova [r3+3*16], m3
+ mova [r3+4*16], m4
+ mova [r3+5*16], m5
+ mova [r3+6*16], m6
+ mova [r3+7*16], m7
+%else
+ mova [r3+0*16], m2
+ mova [r3+1*16], m3
+ mova [r3+2*16], m4
+ mova [r3+3*16], m5
+ mova m4, [o(pd_2048)]
+
+ ITX_MULSUB_2D 0, 7, 2, 3, 5, _, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2D 6, 1, 2, 3, 5, 4, 3920, 1189 ; t11a, t12a
+
+ mova m2, [r3+0*16]
+ mova m3, [r3+1*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova m1, [r3+2*16]
+ mova m5, [r3+3*16]
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+
+ ITX_MULSUB_2D 2, 5, 0, 6, 7, _, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2D 1, 3, 0, 6, 7, _, 3166, 2598 ; t9a, t14a
+
+ mova m0, [r3+0*16]
+ mova m6, [r3+2*16]
+ mova m7, [r3+3*16]
+.main_oddhalf_fast2:
+ REPX {paddd x, m4}, m0, m7, m2, m5, m1, m3
+ REPX {psrad x, 12}, m0, m7, m2, m5, m1, m3
+ psubd m4, m0, m1 ; t9
+ paddd m0, m1 ; t8
+ mova m1, [r3+1*16]
+ mova [r3+0*16], m4
+ psubd m4, m6, m2 ; t10
+ paddd m2, m6 ; t11
+ psubd m6, m1, m5 ; t13
+ paddd m5, m1 ; t12
+ psubd m1, m7, m3 ; t14
+ paddd m7, m3 ; t15
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m1, m4, m6, m0, m2, m5, m7
+ pmaxsd m3, [r3+0*16]
+ mova [r3+0*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m1, m4, m6, m0, m2, m5, m7
+ pminsd m3, [r3+0*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m2
+ mova [r3+2*16], m5
+ mova [r3+3*16], m7
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2D 1, 3, 0, 2, 5, 7, 1567, 3784
+ ITX_MULSUB_2D 6, 4, 0, 2, _, 7, 5, 3784, 4
+ mova m0, [r3+0*16]
+ mova m2, [r3+1*16]
+ psubd m5, m1, m4 ; t10
+ mova [r3+1*16], m5
+ paddd m1, m4 ; t9
+ psubd m4, m0, m2 ; t11a
+ paddd m0, m2 ; t8a
+ mova m5, [r3+2*16]
+ mova m7, [r3+3*16]
+ psubd m2, m3, m6 ; t13
+ paddd m6, m3 ; t14
+ paddd m3, m7, m5 ; t15a
+ psubd m7, m5 ; t12a
+ mova [r3+0*16], m3
+ mova m3, [r3+1*16]
+ mova m5, [o(clip_18b_min)]
+ REPX {pmaxsd x, m5}, m2, m7, m3, m4, m0, m1, m6
+ pmaxsd m5, [r3+0*16]
+ mova [r3+0*16], m5
+ mova m5, [o(clip_18b_max)]
+ REPX {pminsd x, m5}, m2, m7, m3, m4, m0, m1, m6
+ pminsd m5, [r3+0*16]
+ mova [r3+0*16], m5
+ mova m5, [o(pd_2896)]
+ REPX {pmulld x, m5}, m2, m7, m3, m4
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m2, m7
+ paddd m5, m2, m3 ; t13a
+ psubd m2, m3 ; t10a
+ psubd m3, m7, m4 ; t11
+ paddd m4, m7 ; t12
+ REPX {psrad x, 12}, m5, m2, m3, m4
+ mova m7, [r3+0*16]
+ mova [r3+11*16], m0
+ mova [r3+10*16], m1
+ mova [r3+9*16], m2
+ mova [r3+8*16], m3
+ mova [r3+7*16], m4
+ mova [r3+6*16], m5
+ mova [r3+5*16], m6
+ mova [r3+4*16], m7
+%endif
+ ret
+.round:
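+    ; final idct16 stage: clip, add 1 (psubd by the all -1 register) so
+    ; the closing psrad 1 rounds, then form out_k = t_k +/- t_(15-k)
+    ; with the odd-half values reloaded from the stack.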
+%if ARCH_X86_64
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ pcmpeqd m8, m8
+ REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ mova m8, [r3+1*16]
+ mova m9, [r3+2*16]
+ mova m10, [r3+3*16]
+ mova m11, [r3+4*16]
+ mova m12, [r3+5*16]
+ mova m13, [r3+6*16]
+ mova m14, [r3+7*16]
+ psubd m15, m0, m14 ; out15
+ paddd m0, m14 ; out0
+ psubd m14, m1, m13 ; out14
+ paddd m1, m13 ; out1
+ psubd m13, m2, m12 ; out13
+ paddd m2, m12 ; out2
+ psubd m12, m3, m11 ; out12
+ paddd m3, m11 ; out3
+ psubd m11, m4, m10 ; out11
+ paddd m4, m10 ; out4
+ psubd m10, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ psubd m9, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ psubd m8, m7, [r3+0*16] ; out8
+ paddd m7, [r3+0*16] ; out7
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ ; and out0-15 is now in m0-15
+%else
+ mova [r3+ 0*16], m0
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ pmaxsd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m7
+ mova m7, [o(clip_18b_max)]
+ REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsd m7, [r3+ 0*16]
+ mova [r3+ 0*16], m0
+ pcmpeqd m0, m0
+ REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ mova [r3+ 1*16], m1
+ mova [r3+ 2*16], m2
+ mova m1, [r3+ 0*16]
+ psubd m1, m0
+ mova [r3+ 0*16], m1
+ mova m1, [r3+11*16]
+ mova m2, [r3+10*16]
+ psubd m0, m7, m1
+ paddd m7, m1
+ psubd m1, m6, m2
+ paddd m6, m2
+ REPX {psrad x, 1}, m0, m1, m6, m7
+ packssdw m0, m1 ; out8-9
+ packssdw m6, m7 ; out6-7
+ mova [r3+11*16], m6
+ mova m1, [r3+9*16]
+ mova m7, [r3+8*16]
+ psubd m2, m5, m1
+ paddd m5, m1
+ psubd m1, m4, m7
+ paddd m4, m7
+ REPX {psrad x, 1}, m2, m1, m4, m5
+ packssdw m2, m1 ; out10-11
+ packssdw m4, m5 ; out4-5
+ mova m1, [r3+2*16]
+ mova [r3+10*16], m4
+ mova m6, [r3+7*16]
+ mova m7, [r3+6*16]
+ psubd m4, m3, m6
+ paddd m3, m6
+ psubd m6, m1, m7
+ paddd m1, m7
+ REPX {psrad x, 1}, m4, m6, m1, m3
+ packssdw m4, m6 ; out12-13
+ packssdw m1, m3 ; out2-3
+ mova m3, [r3+1*16]
+ mova [r3+9*16], m1
+ mova m1, [r3+0*16]
+ mova m5, [r3+5*16]
+ mova m7, [r3+4*16]
+ psubd m6, m3, m5
+ paddd m3, m5
+ psubd m5, m1, m7
+ paddd m1, m7
+ REPX {psrad x, 1}, m6, m5, m1, m3
+ packssdw m6, m5 ; out14-15
+ packssdw m1, m3 ; out0-1
+ mova [r3+8*16], m1
+%endif
+ ret
+
+.pass2:
+ lea r4, [o(m_suffix(idct_8x4_internal_8bpc, _ssse3).main)]
+.pass2_loop:
+ lea r3, [strideq*3]
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call r4
+ call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+ REPX {mova [cq+x*16], m6}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+%if ARCH_X86_64
+ mova m0, m8
+ mova m1, m9
+ mova m2, m10
+ mova m3, m11
+%else
+ mova m0, [rsp+gprsize+0*16]
+ mova m1, [rsp+gprsize+1*16]
+ mova m2, [rsp+gprsize+2*16]
+ mova m3, [rsp+gprsize+3*16]
+%endif
+ add dstq, 16
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call r4
+ call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+ RET
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ ; setup stack pointer
+ lea r3, [rsp+gprsize]
+ call .main
+%if ARCH_X86_64
+ jmp m(idct_16x4_internal_16bpc).pack_transpose
+%else
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+gprsize+0*16], m0
+ mova [rsp+gprsize+1*16], m1
+ mova [rsp+gprsize+2*16], m2
+ mova [rsp+gprsize+3*16], m3
+ mova m0, [rsp+gprsize+ 8*16]
+ mova m2, [rsp+gprsize+ 9*16]
+ mova m4, [rsp+gprsize+10*16]
+ mova m6, [rsp+gprsize+11*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ jmp tx2q
+%endif
+
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 2*16]
+ mova m1, [cq+13*16]
+ mova m2, [cq+ 6*16]
+ mova m3, [cq+ 9*16]
+ mova m4, [cq+10*16]
+ mova m5, [cq+ 5*16]
+ mova m6, [cq+14*16]
+ mova m7, [cq+ 1*16]
+ call .main_part1
+ mova m0, [cq+ 0*16]
+ mova m1, [cq+15*16]
+ mova m2, [cq+ 4*16]
+ mova m3, [cq+11*16]
+ mova m4, [cq+ 8*16]
+ mova m5, [cq+ 7*16]
+ mova m6, [cq+12*16]
+ mova m7, [cq+ 3*16]
+ call .main_part2
+.round:
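+    ; adst16 rounding: out0-3 and out12-15 only need the 1-bit pass
+    ; downshift, while out4-11 still include the unshifted x2896 final
+    ; butterfly and take the +6144 bias with a 13-bit shift; the pxor
+    ; with -1 plus the +1 fixups negate the outputs that the transform
+    ; produces with inverted sign.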
+%if ARCH_X86_64
+ mova m15, [o(pd_6144)]
+ psrld m14, 11 ; pd_1
+ pcmpeqd m8, m8 ; -1
+ psubd m13, m15, m14 ; pd_6143
+ REPX {paddd x, m14}, m0, m2
+ REPX {paddd x, m15}, m4, m6
+ REPX {pxor x, m8 }, m1, m3, m5, m7
+ REPX {psrad x, 1 }, m1, m3
+ REPX {paddd x, m15}, m5, m7
+ REPX {psubd x, m8 }, m1, m3
+ paddd m8, m15, m9
+ psubd m9, m13, m10
+ paddd m10, m15, m11
+ psubd m11, m13, m12
+ paddd m12, m14, [r3+3*16]
+ psubd m13, m14, [r3+2*16]
+ psubd m15, m14, [r3+0*16]
+ paddd m14, [r3+1*16]
+ REPX {psrad x, 1 }, m0, m2, m12, m13, m14, m15
+ REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
+%else
+ mova [r3+8*16], m1
+ mova [r3+9*16], m3
+ mova m3, [o(pd_6144)]
+ pcmpeqd m1, m1
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m4, m5, m6, m7
+ REPX {psrad x, 13}, m4, m5, m6, m7
+ packssdw m4, m5
+ packssdw m6, m7
+ mova [r3+10*16], m4
+ mova [r3+11*16], m6
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ REPX {pxor x, m1}, m5, m7
+ REPX {psubd x, m1}, m4, m6
+ REPX {psrad x, 1 }, m4, m5, m6, m7
+ REPX {psubd x, m1}, m5, m7
+ packssdw m4, m5
+ packssdw m6, m7
+ mova m5, [r3+8*16]
+ mova m7, [r3+9*16]
+ mova [r3+8*16], m4
+ mova [r3+9*16], m6
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m0, m5, m2, m7
+ REPX {psrad x, 13}, m0, m5, m2, m7
+ packssdw m0, m5
+ packssdw m2, m7
+ mova m4, [r3+0*16]
+ mova m5, [r3+1*16]
+ mova m6, [r3+2*16]
+ mova m7, [r3+3*16]
+ REPX {psubd x, m1}, m4, m6
+ REPX {pxor x, m1}, m5, m7
+ REPX {psrad x, 1 }, m4, m5, m6, m7
+ REPX {psubd x, m1}, m5, m7
+ packssdw m4, m5
+ packssdw m6, m7
+%endif
+ ret
+
+.main_part2:
+%if ARCH_X86_64
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201, 4091
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751, 3703
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035, 2751
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857, 1380
+ psubd m8, m0, m4 ; t8a
+ paddd m0, m4 ; t0a
+ psubd m4, m1, m5 ; t9a
+ paddd m1, m5 ; t1a
+ psubd m5, m2, m6 ; t12a
+ paddd m2, m6 ; t4a
+ psubd m6, m3, m7 ; t13a
+ paddd m7, m3 ; t5a
+ REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
+ REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+ mova m15, [o(pd_4017)]
+ mova m10, [o(pd_799)]
+ ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10
+ psubd m3, m0, m2 ; t4
+ paddd m0, m2 ; t0
+ psubd m2, m1, m7 ; t5
+ paddd m1, m7 ; t1
+ psubd m7, m4, m6 ; t12a
+ paddd m4, m6 ; t8a
+ psubd m6, m8, m5 ; t13a
+ paddd m5, m8 ; t9a
+ REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
+ REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+ mova m15, [o(pd_3784)]
+ mova m10, [o(pd_1567)]
+ ITX_MULSUB_2D 3, 2, 8, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 7, 6, 8, 9, _, 11, 10, 15
+ mova m10, [r3+0*16] ; t2
+ mova m8, [r3+1*16] ; t3
+ psubd m9, m0, m10 ; t2a
+ paddd m0, m10 ; out0
+ psubd m10, m1, m8 ; t3a
+ paddd m1, m8 ; -out15
+ mova [r3+0*16], m1
+ mova m15, [r3+3*16] ; t7a
+ mova m1, [r3+2*16] ; t6a
+ psubd m8, m3, m15 ; t7
+ paddd m15, m3 ; out12
+ paddd m3, m2, m1 ; -out3
+ psubd m2, m1 ; t6
+ mova [r3+3*16], m15
+ mova [r3+1*16], m2
+ mova m1, [r3+7*16] ; t15
+ mova m2, [r3+6*16] ; t14
+ paddd m15, m7, m1 ; -out13
+ psubd m7, m1 ; t15a
+ psubd m11, m6, m2 ; t14a
+ paddd m2, m6 ; out2
+ mova [r3+2*16], m15
+ mova m1, [r3+4*16] ; t10a
+ mova m15, [r3+5*16] ; t11a
+ psubd m6, m4, m1 ; t10
+ paddd m1, m4 ; -out1
+ psubd m4, m5, m15 ; t11
+ paddd m5, m15 ; out14
+ REPX {pmaxsd x, m12}, m11, m7, m9, m10, m6, m4, m8
+ pmaxsd m12, [r3+1*16] ; t6
+ mova [r3+1*16], m5
+ REPX {pminsd x, m13}, m11, m7, m9, m10, m6, m4, m12, m8
+ REPX {pmulld x, m14}, m11, m7, m9, m10, m6, m4, m12, m8
+ paddd m5, m11, m7 ; -out5 (unshifted)
+ psubd m11, m7 ; out10 (unshifted)
+ paddd m7, m9, m10 ; -out7 (unshifted)
+ psubd m9, m10 ; out8 (unshifted)
+ psubd m10, m6, m4 ; -out9 (unshifted)
+ paddd m6, m4 ; out6 (unshifted)
+ paddd m4, m12, m8 ; out4 (unshifted)
+ psubd m12, m8 ; -out11 (unshifted)
+%else
+ mova [r3+8*16], m0
+ mova [r3+9*16], m1
+ mova [r3+10*16], m2
+ mova [r3+11*16], m3
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3035, 2751
+ ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 3857, 1380
+ mova m0, [r3+8*16]
+ mova m1, [r3+9*16]
+ mova [r3+8*16], m4
+ mova m4, [r3+10*16]
+ mova [r3+9*16], m5
+ mova [r3+10*16], m6
+ mova m5, [r3+11*16]
+ mova [r3+11*16], m7
+ ITX_MULSUB_2D 1, 0, 2, 6, 7, 3, 201, 4091
+ ITX_MULSUB_2D 5, 4, 2, 6, 7, 3, 1751, 3703
+ mova m2, [r3+8*16]
+ mova m6, [r3+9*16]
+ psubd m3, m0, m2 ; t8a
+ paddd m0, m2 ; t0a
+ mova [r3+8*16], m3
+ psubd m2, m1, m6 ; t9a
+ paddd m1, m6 ; t1a
+ mova m3, [r3+10*16]
+ psubd m6, m4, m3 ; t12a
+ paddd m4, m3 ; t4a
+ mova m3, [r3+11*16]
+ psubd m7, m5, m3 ; t13a
+ paddd m5, m3 ; t5a
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m2, m6, m7, m0, m1, m4, m5
+ pmaxsd m3, [r3+8*16]
+ mova [r3+8*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m2, m6, m7, m0, m1, m4, m5
+ pminsd m3, [r3+8*16]
+ mova [r3+8*16], m3
+ psubd m3, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ psubd m4, m1, m5 ; t5
+ paddd m1, m5 ; t1
+ mova m5, [o(pd_2048)]
+ mova [r3+9*16], m1
+ mova [r3+10*16], m4
+ mova [r3+11*16], m3
+ mova m3, [r3+8*16]
+ mova [r3+8*16], m0
+ ITX_MULSUB_2D 3, 2, 0, 1, 4, 5, 799, 4017
+ ITX_MULSUB_2D 7, 6, 0, 1, 4, 5, 4017, 4
+ psubd m5, m2, m7 ; t12a
+ paddd m2, m7 ; t8a
+ psubd m7, m3, m6 ; t13a
+ paddd m6, m3 ; t9a
+ mova m0, [r3+8*16]
+ mova m1, [r3+9*16]
+ mova m4, [r3+10*16]
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m4, m5, m7, m0, m1, m2, m6
+ pmaxsd m3, [r3+11*16]
+ mova [r3+8*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m4, m5, m7, m0, m1, m2, m6
+ pminsd m3, [r3+8*16]
+ mova [r3+8*16], m0
+ mova [r3+9*16], m1
+ mova [r3+10*16], m2
+ mova [r3+11*16], m6
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 3, 4, 1, 2, 6, 0, 1567, 3784
+ ITX_MULSUB_2D 5, 7, 1, 2, 6, 0, 6, 3784
+ mova m0, [r3+7*16] ; t7a
+ mova m2, [r3+6*16] ; t6a
+ psubd m1, m3, m0 ; t7
+ paddd m0, m3 ; out12
+ paddd m3, m4, m2 ; -out3
+ psubd m4, m2 ; t6
+ mova [r3+7*16], m3
+ mova m3, [r3+3*16] ; t15
+ mova m2, [r3+2*16] ; t14
+ paddd m6, m5, m3 ; -out13
+ psubd m5, m3 ; t15a
+ psubd m3, m7, m2 ; t14a
+ paddd m2, m7 ; out2
+ mova [r3+6*16], m2
+ mova m7, [r3+0*16] ; t10a
+ mova m2, [r3+1*16] ; t11a
+ mova [r3+0*16], m0
+ mova [r3+1*16], m6
+ mova m6, [r3+11*16]
+ psubd m0, m6, m2 ; t11
+ paddd m6, m2 ; out14
+ mova [r3+2*16], m6
+ mova m2, [r3+10*16]
+ psubd m6, m2, m7 ; t10
+ paddd m2, m7 ; -out1
+ mova m7, [r3+5*16] ; t3
+ mova [r3+5*16], m2
+ mova [r3+10*16], m1
+ mova m1, [r3+9*16]
+ psubd m2, m1, m7 ; t3a
+ paddd m1, m7 ; -out15
+ mova [r3+3*16], m1
+ mova m1, [r3+4*16] ; t2
+ mova m7, [r3+8*16]
+ psubd m7, m1 ; t2a
+ paddd m1, [r3+8*16] ; out0
+ mova [r3+4*16], m1
+ mova m1, [o(clip_18b_min)]
+ REPX {pmaxsd x, m1}, m0, m2, m3, m4, m5, m6, m7
+ pmaxsd m1, [r3+10*16]
+ mova [r3+10*16], m1
+ mova m1, [o(clip_18b_max)]
+ REPX {pminsd x, m1}, m0, m2, m3, m4, m5, m6, m7
+ pminsd m1, [r3+10*16]
+ mova [r3+10*16], m1
+ mova m1, [o(pd_2896)]
+ REPX {pmulld x, m1}, m0, m2, m3, m4, m5, m6, m7
+ pmulld m1, [r3+10*16]
+ mova [r3+11*16], m3
+ psubd m3, m4, m1 ; -out11 (unshifted)
+ paddd m4, m1 ; out4 (unshifted)
+ psubd m1, m6, m0 ; -out9 (unshifted)
+ paddd m6, m0 ; out6 (unshifted)
+ psubd m0, m7, m2 ; out8 (unshifted)
+ paddd m7, m2 ; -out7 (unshifted)
+ mova m2, [r3+11*16]
+ mova [r3+11*16], m5
+ paddd m5, m2 ; -out5 (unshifted)
+ psubd m2, [r3+11*16] ; out10 (unshifted)
+ ; m0-3 contain out8-11 (unshifted), m4-7 contain out4-7 (unshifted)
+ ; r[-4,3] contain out0-3 and out12-15
+%endif
+ ret
+.main_part1:
+%if ARCH_X86_64
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 995, 3973
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 2440, 3290
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3513, 2106
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 4052, 601
+ psubd m8, m0, m4 ; t10a
+ paddd m0, m4 ; t2a
+ psubd m4, m1, m5 ; t11a
+ paddd m1, m5 ; t3a
+ psubd m5, m2, m6 ; t14a
+ paddd m2, m6 ; t6a
+ psubd m6, m3, m7 ; t15a
+ paddd m7, m3 ; t7a
+ REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
+ REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+ mova m15, [o(pd_2276)]
+ mova m10, [o(pd_3406)]
+ ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10
+ psubd m3, m0, m2 ; t6
+ paddd m0, m2 ; t2
+ psubd m2, m1, m7 ; t7
+ paddd m1, m7 ; t3
+ psubd m7, m4, m6 ; t14a
+ paddd m4, m6 ; t10a
+ psubd m6, m8, m5 ; t15a
+ paddd m5, m8 ; t11a
+ REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
+ REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+ mova m15, [o(pd_1567)]
+ mova m10, [o(pd_3784)]
+ ITX_MULSUB_2D 2, 3, 8, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 7, 8, 9, _, 11, 10, 15
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+4*16], m4
+ mova [r3+5*16], m5
+ mova [r3+2*16], m2
+ mova [r3+3*16], m3
+ mova [r3+6*16], m6
+ mova [r3+7*16], m7
+%else
+ mova [r3+4*16], m0
+ mova [r3+5*16], m1
+ mova [r3+6*16], m2
+ mova [r3+7*16], m3
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3513, 2106
+ ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 4052, 601
+ mova [r3+0*16], m4
+ mova [r3+1*16], m5
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+ mova m0, [r3+4*16]
+ mova m1, [r3+5*16]
+ mova m2, [r3+6*16]
+ mova m7, [r3+7*16]
+ ITX_MULSUB_2D 1, 0, 4, 5, 6, 3, 995, 3973
+ ITX_MULSUB_2D 7, 2, 4, 5, 6, 3, 2440, 3290
+ mova m4, [r3+0*16]
+ mova m5, [r3+1*16]
+ psubd m6, m0, m4 ; t10a
+ paddd m0, m4 ; t2a
+ mova [r3+4*16], m6
+ mova m6, [r3+2*16]
+ mova m3, [r3+3*16]
+ psubd m4, m1, m5 ; t11a
+ paddd m1, m5 ; t3a
+ psubd m5, m2, m6 ; t14a
+ paddd m2, m6 ; t6a
+ psubd m6, m7, m3 ; t15a
+ paddd m7, m3 ; t7a
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m4, m5, m6, m0, m1, m2, m7
+ pmaxsd m3, [r3+4*16]
+ mova [r3+4*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m4, m5, m6, m0, m1, m2, m7
+ pminsd m3, [r3+4*16]
+ mova [r3+4*16], m3
+ psubd m3, m0, m2 ; t6
+ paddd m0, m2 ; t2
+ psubd m2, m1, m7 ; t7
+ paddd m1, m7 ; t3
+ mova [r3+5*16], m1
+ mova [r3+6*16], m3
+ mova [r3+7*16], m2
+ mova m1, [r3+4*16]
+ mova [r3+4*16], m0
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 1, 4, 0, 7, 2, 3, 3406, 2276
+ ITX_MULSUB_2D 6, 5, 0, 7, 2, 3, 2276, 2
+ psubd m7, m4, m6 ; t14a
+ paddd m4, m6 ; t10a
+ psubd m6, m1, m5 ; t15a
+ paddd m5, m1 ; t11a
+ mova m1, [r3+5*16]
+ mova m3, [r3+6*16]
+ mova m2, [r3+7*16]
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m3, m2, m7, m6, m1, m4, m5
+ pmaxsd m0, [r3+4*16]
+ mova [r3+4*16], m0
+ mova m0, [o(clip_18b_max)]
+ REPX {pminsd x, m0}, m3, m2, m7, m6, m1, m4, m5
+ pminsd m0, [r3+4*16]
+ mova [r3+4*16], m0
+ mova [r3+5*16], m1
+ mova [r3+0*16], m4
+ mova [r3+1*16], m5
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 2, 3, 1, 4, 5, 0, 3784, 1567
+ ITX_MULSUB_2D 6, 7, 1, 4, 5, 0, 5, 1567
+ mova [r3+6*16], m2
+ mova [r3+7*16], m3
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+%endif
+ ret
+
+.pass2:
+ lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
+ jmp m(idct_16x4_internal_16bpc).pass2_loop
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r3, [rsp+gprsize]
+ call m(iadst_16x4_internal_16bpc).main
+%if ARCH_X86_64
+ packssdw m1, m0
+ packssdw m3, m2
+ packssdw m5, m4
+ packssdw m7, m6
+ packssdw m9, m8
+ packssdw m11, m10
+ packssdw m13, m12
+ packssdw m15, m14
+ mova m0, m15
+ mova m2, m13
+ mova m4, m11
+ mova m6, m9
+ mova m8, m7
+ mova m10, m5
+ mova m12, m3
+ mova m14, m1
+ jmp m(idct_16x4_internal_16bpc).transpose
+%else
+ mova [rsp+gprsize+4*16], m0
+ mova [rsp+gprsize+5*16], m2
+ mova [rsp+gprsize+6*16], m4
+ mova [rsp+gprsize+7*16], m6
+ pshufd m6, [rsp+gprsize+ 8*16], q1032
+ pshufd m4, [rsp+gprsize+ 9*16], q1032
+ pshufd m2, [rsp+gprsize+10*16], q1032
+ pshufd m0, [rsp+gprsize+11*16], q1032
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+gprsize+0*16], m0
+ mova [rsp+gprsize+1*16], m1
+ mova [rsp+gprsize+2*16], m2
+ mova [rsp+gprsize+3*16], m3
+ pshufd m6, [rsp+gprsize+ 4*16], q1032
+ pshufd m4, [rsp+gprsize+ 5*16], q1032
+ pshufd m2, [rsp+gprsize+ 6*16], q1032
+ pshufd m0, [rsp+gprsize+ 7*16], q1032
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ jmp tx2q
+%endif
+
+.pass2:
+ lea r3, [strideq*3]
+ lea dstq, [dstq+r3]
+ neg strideq
+ lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
+ jmp m(idct_16x4_internal_16bpc).pass2_loop
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+ mova m15, [o(pd_11586)]
+ pmulld m0, m15, [cq+ 0*16]
+ pmulld m1, m15, [cq+ 1*16]
+ pmulld m2, m15, [cq+ 2*16]
+ pmulld m3, m15, [cq+ 3*16]
+ pmulld m4, m15, [cq+ 4*16]
+ pmulld m5, m15, [cq+ 5*16]
+ pmulld m6, m15, [cq+ 6*16]
+ pmulld m7, m15, [cq+ 7*16]
+ pmulld m8, m15, [cq+ 8*16]
+ pmulld m9, m15, [cq+ 9*16]
+ pmulld m10, m15, [cq+10*16]
+ pmulld m11, m15, [cq+11*16]
+ pmulld m12, m15, [cq+12*16]
+ pmulld m13, m15, [cq+13*16]
+ pmulld m14, m15, [cq+14*16]
+ pmulld m15, [cq+15*16]
+ mova [cq+ 0*16], m15
+ mova m15, [o(pd_6144)]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [cq+ 0*16]
+ REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp m(idct_16x4_internal_16bpc).pack_transpose
+%else
+ add cq, 8*16
+ mov r5d, 2
+.loop_pass1:
+ mova m7, [o(pd_11586)]
+ pmulld m0, m7, [cq+0*16]
+ pmulld m1, m7, [cq+1*16]
+ pmulld m2, m7, [cq+2*16]
+ pmulld m3, m7, [cq+3*16]
+ pmulld m4, m7, [cq+4*16]
+ pmulld m5, m7, [cq+5*16]
+ pmulld m6, m7, [cq+6*16]
+ pmulld m7, [cq+7*16]
+ mova [cq+7*16], m7
+ mova m7, [o(pd_6144)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [cq+7*16]
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ dec r5d
+ jz .end_pass1
+ mova [rsp+gprsize+0*16], m0
+ mova [rsp+gprsize+1*16], m1
+ mova [rsp+gprsize+2*16], m2
+ mova [rsp+gprsize+3*16], m3
+ sub cq, 8*16
+ jmp .loop_pass1
+.end_pass1:
+ jmp tx2q
+%endif
+
+.pass2:
+%if ARCH_X86_64
+ mova m12, [o(pw_1697x8)]
+%endif
+ lea r4, [o(.main)]
+ jmp m(idct_16x4_internal_16bpc).pass2_loop
+.main:
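+    ; identity4 scaling for the second pass: pw_1697x8 = 1697*8, so
+    ; pmulhrsw gives x*1697/4096 ~= (sqrt(2)-1)*x and the paddsw below
+    ; yields ~sqrt(2)*x, the 4-point identity scale.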
+%if ARCH_X86_64
+ pmulhrsw m4, m0, m12
+ pmulhrsw m5, m1, m12
+ pmulhrsw m6, m2, m12
+ pmulhrsw m7, m3, m12
+%else
+ mova m7, [o(pw_1697x8)]
+ pmulhrsw m4, m0, m7
+ pmulhrsw m5, m1, m7
+ pmulhrsw m6, m2, m7
+ pmulhrsw m7, m3
+%endif
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ ret
+
+%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, %3, 16x8, 16, 0-8*16
+%else
+ INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 8
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+%if ARCH_X86_32
+ add rsp, 1*16
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity, 6
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+ DECLARE_REG_TMP 6, 4, 6
+%else
+ mov [rsp+gprsize+12*16], r1
+ DECLARE_REG_TMP 1, 4, 3
+%endif
+ lea t0, [o(.main)]
+.loop_main:
+%undef cmp
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 10
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 10
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+
+ lea r3, [rsp+gprsize]
+.loop_pass1:
+ call t0
+%if ARCH_X86_64
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+4*32+r5], m8
+ mova [cq+5*32+r5], m9
+ mova [cq+6*32+r5], m10
+ mova [cq+7*32+r5], m11
+%else
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+4*32+r5], m0
+ mova [cq+5*32+r5], m1
+ mova [cq+6*32+r5], m2
+ mova [cq+7*32+r5], m3
+ mova m0, [rsp+gprsize+ 8*16]
+ mova m2, [rsp+gprsize+ 9*16]
+ mova m4, [rsp+gprsize+10*16]
+ mova m6, [rsp+gprsize+11*16]
+%endif
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ pxor m7, m7
+ REPX {mova [cq+x*32+r5], m7}, 8, 9, 10, 11, 12, 13, 14, 15
+ test r5d, r5d
+ jz .end
+ mova [cq+0*32+r5], m0
+ mova [cq+1*32+r5], m1
+ mova [cq+2*32+r5], m2
+ mova [cq+3*32+r5], m3
+ xor r5d, r5d
+ jmp .loop_pass1
+.end:
+
+ jmp tx2q
+.main:
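+    ; one pass-1 iteration: the odd-indexed inputs go through the
+    ; 16-point odd half, the even-indexed ones through the 8-point idct,
+    ; and the halves are merged in the shared idct_16x4 .round;
+    ; rect2_mul folds in the sqrt(2)-based scale required for the 2:1
+    ; rectangular transform. On x86-64 the 16 dword outputs are packed
+    ; to words for the transpose.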
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 1*32+r5]
+ mova m1, [cq+ 3*32+r5]
+ mova m2, [cq+ 5*32+r5]
+ mova m3, [cq+ 7*32+r5]
+ mova m4, [cq+ 9*32+r5]
+ mova m5, [cq+11*32+r5]
+ mova m6, [cq+13*32+r5]
+ mova m7, [cq+15*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+ 0*32+r5]
+ mova m1, [cq+ 2*32+r5]
+ mova m2, [cq+ 4*32+r5]
+ mova m3, [cq+ 6*32+r5]
+ mova m4, [cq+ 8*32+r5]
+ mova m5, [cq+10*32+r5]
+ mova m6, [cq+12*32+r5]
+ mova m7, [cq+14*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ call m(idct_16x4_internal_16bpc).round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize+12*16]
+%endif
+ mov r4d, 2
+.pass2_main:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%endif
+ lea r3, [strideq*3]
+ jmp .loop_pass2_entry
+.loop_pass2:
+ mova m0, [cq+0*32+ 0]
+ mova m1, [cq+1*32+ 0]
+ mova m2, [cq+2*32+ 0]
+ mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+ mova m4, [cq+0*32+16]
+ mova m5, [cq+1*32+16]
+ mova m6, [cq+2*32+16]
+ mova m7, [cq+3*32+16]
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ call m(idct_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add dstq, 16
+ add cq, 4*32
+ dec r4d
+ jg .loop_pass2
+ RET
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity, 6
+
+cglobal iadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+gprsize+12*16], r1
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x8_internal_16bpc).loop_main
+
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 2*32+r5]
+ mova m1, [cq+13*32+r5]
+ mova m2, [cq+ 6*32+r5]
+ mova m3, [cq+ 9*32+r5]
+ mova m4, [cq+10*32+r5]
+ mova m5, [cq+ 5*32+r5]
+ mova m6, [cq+14*32+r5]
+ mova m7, [cq+ 1*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(iadst_16x4_internal_16bpc).main_part1
+ mova m0, [cq+ 0*32+r5]
+ mova m1, [cq+15*32+r5]
+ mova m2, [cq+ 4*32+r5]
+ mova m3, [cq+11*32+r5]
+ mova m4, [cq+ 8*32+r5]
+ mova m5, [cq+ 7*32+r5]
+ mova m6, [cq+12*32+r5]
+ mova m7, [cq+ 3*32+r5]
+%if ARCH_X86_32
+ add r3, 8*16
+%endif
+ call m(idct_8x4_internal_16bpc).rect2_mul
+%if ARCH_X86_32
+ sub r3, 8*16
+%endif
+ call m(iadst_16x4_internal_16bpc).main_part2
+ call m(iadst_16x4_internal_16bpc).round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize+12*16]
+%endif
+ mov r4d, 2
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+ mova m11, [o(pw_m2048)]
+%endif
+ lea r3, [strideq*3]
+ jmp .loop_pass2_entry
+.loop_pass2:
+ mova m0, [cq+0*32+ 0]
+ mova m1, [cq+1*32+ 0]
+ mova m2, [cq+2*32+ 0]
+ mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+ mova m4, [cq+0*32+16]
+ mova m5, [cq+1*32+16]
+ mova m6, [cq+2*32+16]
+ mova m7, [cq+3*32+16]
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
+ call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add dstq, 16
+ add cq, 4*32
+ dec r4d
+ jg .loop_pass2
+ RET
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity, 6
+
+cglobal iflipadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+gprsize+12*16], r1
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x8_internal_16bpc).loop_main
+.main:
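+    ; flipadst pass 1 reuses the adst core and then reverses the packed
+    ; outputs: pshufd q1032 swaps the two outputs within each register,
+    ; while the register (or stack-slot) moves reverse the order across
+    ; registers.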
+ call m(iadst_16x8_internal_16bpc).main
+%if ARCH_X86_64
+ pshufd m1, m0, q1032
+ pshufd m3, m2, q1032
+ pshufd m5, m4, q1032
+ pshufd m7, m6, q1032
+ pshufd m0, m14, q1032
+ pshufd m2, m12, q1032
+ pshufd m4, m10, q1032
+ pshufd m6, m8, q1032
+ mova m14, m1
+ mova m12, m3
+ mova m10, m5
+ mova m8, m7
+%else
+ pshufd m1, m0, q1032
+ pshufd m3, m2, q1032
+ pshufd m5, m4, q1032
+ pshufd m7, m6, q1032
+ pshufd m0, [r3+11*16], q1032
+ pshufd m2, [r3+10*16], q1032
+ pshufd m4, [r3+9*16], q1032
+ pshufd m6, [r3+8*16], q1032
+ mova [r3+8*16], m7
+ mova [r3+9*16], m5
+ mova [r3+10*16], m3
+ mova [r3+11*16], m1
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize+12*16]
+%endif
+ lea dstq, [dstq+strideq*8]
+ neg strideq
+ add dstq, strideq
+%if ARCH_X86_32
+ mov [rsp+gprsize+12*16], strideq
+%endif
+ jmp m(iadst_16x8_internal_16bpc).pass2
+
+INV_TXFM_16X8_FN identity, dct, -54
+INV_TXFM_16X8_FN identity, adst, -54
+INV_TXFM_16X8_FN identity, flipadst, -54
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+gprsize+12*16], r1
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x8_internal_16bpc).loop_main
+.main:
+%if ARCH_X86_64
+ mova m15, [o(pd_2896)]
+ pmulld m0, m15, [cq+ 0*32+r5]
+ pmulld m1, m15, [cq+ 1*32+r5]
+ pmulld m2, m15, [cq+ 2*32+r5]
+ pmulld m3, m15, [cq+ 3*32+r5]
+ pmulld m4, m15, [cq+ 4*32+r5]
+ pmulld m5, m15, [cq+ 5*32+r5]
+ pmulld m6, m15, [cq+ 6*32+r5]
+ pmulld m7, m15, [cq+ 7*32+r5]
+ pmulld m8, m15, [cq+ 8*32+r5]
+ pmulld m9, m15, [cq+ 9*32+r5]
+ pmulld m10, m15, [cq+10*32+r5]
+ pmulld m11, m15, [cq+11*32+r5]
+ pmulld m12, m15, [cq+12*32+r5]
+ pmulld m13, m15, [cq+13*32+r5]
+ pmulld m14, m15, [cq+14*32+r5]
+ pmulld m15, [cq+15*32+r5]
+ mova [r3], m15
+ mova m15, [o(pd_2048)]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [r3]
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ mova [r3], m15
+ mova m15, [o(pd_11586)]
+ REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ pmulld m15, [r3]
+ mova [r3], m15
+ mova m15, [o(pd_6144)]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [r3]
+ REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%else
+ mova m0, [cq+ 0*32+r5]
+ mova m1, [cq+ 1*32+r5]
+ mova m2, [cq+ 2*32+r5]
+ mova m3, [cq+ 3*32+r5]
+ mova m4, [cq+ 4*32+r5]
+ mova m5, [cq+ 5*32+r5]
+ mova m6, [cq+ 6*32+r5]
+ mova m7, [cq+ 7*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ mova [r3], m7
+ mova m7, [o(pd_11586)]
+ REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulld m7, [r3]
+ mova [r3], m7
+ mova m7, [o(pd_6144)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ mova [r3+ 8*16], m0
+ mova [r3+ 9*16], m2
+ mova [r3+10*16], m4
+ mova [r3+11*16], m6
+ mova m0, [cq+ 8*32+r5]
+ mova m1, [cq+ 9*32+r5]
+ mova m2, [cq+10*32+r5]
+ mova m3, [cq+11*32+r5]
+ mova m4, [cq+12*32+r5]
+ mova m5, [cq+13*32+r5]
+ mova m6, [cq+14*32+r5]
+ mova m7, [cq+15*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ mova [r3], m7
+ mova m7, [o(pd_11586)]
+ REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulld m7, [r3]
+ mova [r3], m7
+ mova m7, [o(pd_6144)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+%endif
+ ret
+.pass2:
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize+12*16]
+%endif
+ mov r4d, 2
+%if ARCH_X86_64
+ mova m8, [o(pw_4096)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%endif
+ lea r3, [strideq*3]
+ jmp .loop_pass2_entry
+.loop_pass2:
+ mova m0, [cq+0*32+ 0]
+ mova m1, [cq+1*32+ 0]
+ mova m2, [cq+2*32+ 0]
+ mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+ mova m4, [cq+0*32+16]
+ mova m5, [cq+1*32+16]
+ mova m6, [cq+2*32+16]
+ mova m7, [cq+3*32+16]
+%if ARCH_X86_64
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+%else
+ mova [rsp+gprsize], m7
+ mova m7, [o(pw_4096)]
+ call m(idct_8x8_internal_16bpc).round4_and_write_8x8
+%endif
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add dstq, 16
+ add cq, 4*32
+ dec r4d
+ jg .loop_pass2
+ RET
+
+%macro INV_TXFM_16X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 16, 0-(16+WIN64)*16
+%else
+ INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 16
+ add r5d, 640
+ sar r5d, 10
+ add rsp, (5+ARCH_X86_64*3+WIN64)*16
+ jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity, v
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+
+cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+ DECLARE_REG_TMP 6, 7
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%endif
+%elif ARCH_X86_32
+ DECLARE_REG_TMP 1, 6
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(.main)]
+.pass1_full:
+%undef cmp
+ mov t1d, 4
+.zero_loop:
+ dec t1d
+ cmp eobb, byte [r5+t1]
+ jb .zero_loop
+ mov r5d, t1d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, [rsp+16*16+2*gprsize]
+%endif
+ ; setup stack pointer
+ lea r3, [rsp+gprsize]
+.loop_pass1:
+ call t0
+%if ARCH_X86_64
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+4*64+r5], m8
+ mova [cq+5*64+r5], m9
+ mova [cq+6*64+r5], m10
+ mova [cq+7*64+r5], m11
+%else
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+4*64+r5], m0
+ mova [cq+5*64+r5], m1
+ mova [cq+6*64+r5], m2
+ mova [cq+7*64+r5], m3
+ mova m0, [rsp+gprsize+ 8*16]
+ mova m2, [rsp+gprsize+ 9*16]
+ mova m4, [rsp+gprsize+10*16]
+ mova m6, [rsp+gprsize+11*16]
+%endif
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+0*64+r5], m0
+ mova [cq+1*64+r5], m1
+ mova [cq+2*64+r5], m2
+ mova [cq+3*64+r5], m3
+ pxor m0, m0
+ REPX {mova [cq+x*64+r5], m0}, 8, 9, 10, 11, 12, 13, 14, 15
+ sub r5d, 16
+ jge .loop_pass1
+
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r1, [rsp+16*16+1*gprsize]
+%endif
+ jmp tx2q
+.main:
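+    ; same pass-1 flow as 16x8, with a 64-byte stride between inputs and
+    ; no rect2 scaling since the 16x16 transform is square.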
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+
+ mova m0, [cq+ 1*64+r5]
+ mova m1, [cq+ 3*64+r5]
+ mova m2, [cq+ 5*64+r5]
+ mova m3, [cq+ 7*64+r5]
+ mova m4, [cq+ 9*64+r5]
+ mova m5, [cq+11*64+r5]
+ mova m6, [cq+13*64+r5]
+ mova m7, [cq+15*64+r5]
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+ 0*64+r5]
+ mova m1, [cq+ 2*64+r5]
+ mova m2, [cq+ 4*64+r5]
+ mova m3, [cq+ 6*64+r5]
+ mova m4, [cq+ 8*64+r5]
+ mova m5, [cq+10*64+r5]
+ mova m6, [cq+12*64+r5]
+ mova m7, [cq+14*64+r5]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ call .round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ ret
+.round:
+%if ARCH_X86_64
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ psrld m8, m11, 10 ; 2
+ REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ mova m8, [r3+1*16]
+ mova m9, [r3+2*16]
+ mova m10, [r3+3*16]
+ mova m11, [r3+4*16]
+ mova m12, [r3+5*16]
+ mova m13, [r3+6*16]
+ mova m14, [r3+7*16]
+ psubd m15, m0, m14 ; out15
+ paddd m0, m14 ; out0
+ psubd m14, m1, m13 ; out14
+ paddd m1, m13 ; out1
+ psubd m13, m2, m12 ; out13
+ paddd m2, m12 ; out2
+ psubd m12, m3, m11 ; out12
+ paddd m3, m11 ; out3
+ psubd m11, m4, m10 ; out11
+ paddd m4, m10 ; out4
+ psubd m10, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ psubd m9, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ psubd m8, m7, [r3+0*16] ; out8
+ paddd m7, [r3+0*16] ; out7
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ ; and out0-15 is now in m0-15
+%else
+ mova [r3+ 0*16], m0
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ pmaxsd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m7
+ mova m7, [o(clip_18b_max)]
+ REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsd m7, [r3+ 0*16]
+ mova [r3+ 0*16], m0
+ mova m0, [o(pd_2)]
+ REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ paddd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m0
+ mova [r3+ 1*16], m1
+ mova [r3+ 2*16], m2
+ mova m1, [r3+11*16]
+ mova m2, [r3+10*16]
+ psubd m0, m7, m1
+ paddd m7, m1
+ psubd m1, m6, m2
+ paddd m6, m2
+ REPX {psrad x, 2}, m0, m1, m6, m7
+ packssdw m0, m1 ; out8-9
+ packssdw m6, m7 ; out6-7
+ mova [r3+11*16], m6
+ mova m1, [r3+9*16]
+ mova m7, [r3+8*16]
+ psubd m2, m5, m1
+ paddd m5, m1
+ psubd m1, m4, m7
+ paddd m4, m7
+ REPX {psrad x, 2}, m2, m1, m4, m5
+ packssdw m2, m1 ; out10-11
+ packssdw m4, m5 ; out4-5
+ mova m1, [r3+2*16]
+ mova [r3+10*16], m4
+ mova m6, [r3+7*16]
+ mova m7, [r3+6*16]
+ psubd m4, m3, m6
+ paddd m3, m6
+ psubd m6, m1, m7
+ paddd m1, m7
+ REPX {psrad x, 2}, m4, m6, m1, m3
+ packssdw m4, m6 ; out12-13
+ packssdw m1, m3 ; out2-3
+ mova m3, [r3+1*16]
+ mova [r3+9*16], m1
+ mova m1, [r3+0*16]
+ mova m5, [r3+5*16]
+ mova m7, [r3+4*16]
+ psubd m6, m3, m5
+ paddd m3, m5
+ psubd m5, m1, m7
+ paddd m1, m7
+ REPX {psrad x, 2}, m6, m5, m1, m3
+ packssdw m6, m5 ; out14-15
+ packssdw m1, m3 ; out0-1
+ mova [r3+8*16], m1
+%endif
+ ret
+
+.pass2:
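+    ; pass 2 handles the two 8-column halves: each runs the packed
+    ; pass-1 output through the 8bpc ssse3 16-point idct, writes the
+    ; lower 8x8 block, restores out0-7 from the stack, writes the upper
+    ; 8x8 block, and clears the corresponding coefficients.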
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ lea r3, [strideq*3]
+ mov r4d, 2
+.loop_pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m0, [cq+0*64+ 0]
+ mova m1, [cq+2*64+ 0]
+ mova m2, [cq+0*64+16]
+ mova m3, [cq+2*64+16]
+ mova m4, [cq+0*64+32]
+ mova m5, [cq+2*64+32]
+ mova m6, [cq+0*64+48]
+ mova m7, [cq+2*64+48]
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ mova [rsp+gprsize+3*16], m0
+ mova [rsp+gprsize+4*16], m1
+ mova [rsp+gprsize+5*16], m2
+ mova [rsp+gprsize+6*16], m3
+ mova [rsp+gprsize+7*16], m4
+ mova [rsp+gprsize+8*16], m5
+ mova [rsp+gprsize+9*16], m6
+ ; m7 is already stored in [rsp+gprsize+0*16]
+ mova m0, [cq+1*64+ 0]
+ mova m1, [cq+3*64+ 0]
+ mova m2, [cq+1*64+16]
+ mova m3, [cq+3*64+16]
+ mova m4, [cq+1*64+32]
+ mova m5, [cq+3*64+32]
+ mova m6, [cq+1*64+48]
+ mova m7, [cq+3*64+48]
+ call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+
+ ; out0-7 is in rsp+gprsize+3-10*mmsize
+ ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
+
+%if ARCH_X86_64
+ lea dstq, [r7+strideq*8]
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+ lea dstq, [dstq+strideq*8]
+%endif
+ call m(idct_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+ mov dstq, r7
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+%endif
+ mova m0, [rsp+gprsize+ 3*16]
+ mova m1, [rsp+gprsize+ 4*16]
+ mova m2, [rsp+gprsize+ 5*16]
+ mova m3, [rsp+gprsize+ 6*16]
+ mova m4, [rsp+gprsize+ 7*16]
+ mova m5, [rsp+gprsize+ 8*16]
+ mova m6, [rsp+gprsize+ 9*16]
+ mova m7, [rsp+gprsize+10*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+%if ARCH_X86_64
+ add r7, 16
+%define mzero m9
+%else
+ add dword [rsp+2*gprsize+16*16], 16
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add cq, 64*4
+ REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
+%undef mzero
+ dec r4d
+ jg .loop_pass2
+%if WIN64
+ mov r7, [rsp+16*16+gprsize]
+%endif
+ RET
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x16_internal_16bpc).pass1_full
+
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 2*64+r5]
+ mova m1, [cq+13*64+r5]
+ mova m2, [cq+ 6*64+r5]
+ mova m3, [cq+ 9*64+r5]
+ mova m4, [cq+10*64+r5]
+ mova m5, [cq+ 5*64+r5]
+ mova m6, [cq+14*64+r5]
+ mova m7, [cq+ 1*64+r5]
+ call m(iadst_16x4_internal_16bpc).main_part1
+ mova m0, [cq+ 0*64+r5]
+ mova m1, [cq+15*64+r5]
+ mova m2, [cq+ 4*64+r5]
+ mova m3, [cq+11*64+r5]
+ mova m4, [cq+ 8*64+r5]
+ mova m5, [cq+ 7*64+r5]
+ mova m6, [cq+12*64+r5]
+ mova m7, [cq+ 3*64+r5]
+ call m(iadst_16x4_internal_16bpc).main_part2
+ call .round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ ret
+.round:
+%if ARCH_X86_64
+ pcmpeqd m8, m8 ; -1
+ mova m15, [o(pd_10240)]
+ psrld m14, 10 ; +2
+ psubd m13, m14, m8 ; +3
+ REPX {pxor x, m8 }, m1, m3, m5, m7
+ REPX {paddd x, m14}, m0, m2
+ REPX {paddd x, m13}, m1, m3
+ REPX {paddd x, m15}, m4, m5, m6, m7
+ paddd m13, m15, m8 ; +10239
+ paddd m8, m15, m9
+ psubd m9, m13, m10
+ paddd m10, m15, m11
+ psubd m11, m13, m12
+ paddd m12, m14, [r3+3*16]
+ psubd m13, m14, [r3+2*16]
+ psubd m15, m14, [r3+0*16]
+ paddd m14, [r3+1*16]
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11
+%else
+ mova [r3+8*16], m1
+ mova [r3+9*16], m3
+ mova m3, [o(pd_10240)]
+ pcmpeqd m1, m1
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m4, m5, m6, m7
+ REPX {psrad x, 14}, m4, m5, m6, m7
+ packssdw m4, m5
+ packssdw m6, m7
+ mova [r3+10*16], m4
+ mova [r3+11*16], m6
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ mova m3, [o(pd_2)]
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m4, m6
+ psubd m3, m1
+ REPX {paddd x, m3}, m5, m7
+ REPX {psrad x, 2 }, m4, m5, m6, m7
+ packssdw m4, m5
+ packssdw m6, m7
+ mova m5, [r3+8*16]
+ mova m7, [r3+9*16]
+ mova [r3+8*16], m4
+ mova [r3+9*16], m6
+ mova m3, [o(pd_10240)]
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m0, m5, m2, m7
+ REPX {psrad x, 14}, m0, m5, m2, m7
+ packssdw m0, m5
+ packssdw m2, m7
+ mova m4, [r3+0*16]
+ mova m5, [r3+1*16]
+ mova m6, [r3+2*16]
+ mova m7, [r3+3*16]
+ mova m3, [o(pd_2)]
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m4, m6
+ psubd m3, m1
+ REPX {paddd x, m3}, m5, m7
+ REPX {psrad x, 2 }, m4, m5, m6, m7
+ packssdw m4, m5
+ packssdw m6, m7
+%endif
+ ret
+.pass2:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ mova m11, [o(pw_m2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ lea r3, [strideq*3]
+ mov r4d, 2
+.loop_pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m0, [cq+0*64+32]
+ mova m1, [cq+1*64+32]
+ mova m2, [cq+2*64+16]
+ mova m3, [cq+3*64+16]
+ mova m4, [cq+0*64+ 0]
+ mova m5, [cq+1*64+ 0]
+ mova m6, [cq+2*64+48]
+ mova m7, [cq+3*64+48]
+ mova [rsp+gprsize+3*16], m0
+ mova [rsp+gprsize+4*16], m1
+ mova [rsp+gprsize+5*16], m2
+ mova [rsp+gprsize+6*16], m3
+ mova [rsp+gprsize+7*16], m4
+ mova [rsp+gprsize+8*16], m5
+ mova [rsp+gprsize+9*16], m6
+ mova [rsp+gprsize+10*16], m7
+ mova m0, [cq+2*64+ 0]
+ mova m1, [cq+3*64+ 0]
+ mova m2, [cq+0*64+16]
+ mova m3, [cq+1*64+16]
+ mova m4, [cq+2*64+32]
+ mova m5, [cq+3*64+32]
+ mova m6, [cq+0*64+48]
+ mova m7, [cq+1*64+48]
+ call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
+
+ ; out0-7 is in rsp+gprsize+3-10*mmsize
+ ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
+
+%if ARCH_X86_64
+ lea dstq, [r7+strideq*8]
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+ lea dstq, [dstq+strideq*8]
+%endif
+ call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+ mov dstq, r7
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+%endif
+ mova m0, [rsp+gprsize+ 3*16]
+ mova m1, [rsp+gprsize+ 4*16]
+ mova m2, [rsp+gprsize+ 5*16]
+ mova m3, [rsp+gprsize+ 6*16]
+ mova m4, [rsp+gprsize+ 7*16]
+ mova m5, [rsp+gprsize+ 8*16]
+ mova m6, [rsp+gprsize+ 9*16]
+ mova m7, [rsp+gprsize+10*16]
+ call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
+%if ARCH_X86_64
+ add r7, 16
+%define mzero m9
+%else
+ add dword [rsp+2*gprsize+16*16], 16
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add cq, 64*4
+ REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
+%undef mzero
+ dec r4d
+ jg .loop_pass2
+%if WIN64
+ mov r7, [rsp+16*16+gprsize]
+%endif
+ RET
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x16_internal_16bpc).pass1_full
+
+.main:
+ call m(iadst_16x16_internal_16bpc).main
+%if ARCH_X86_64
+ mova m1, m0
+ mova m3, m2
+ mova m5, m4
+ mova m7, m6
+ pshufd m0, m14, q1032
+ pshufd m2, m12, q1032
+ pshufd m4, m10, q1032
+ pshufd m6, m8, q1032
+ pshufd m8, m7, q1032
+ pshufd m10, m5, q1032
+ pshufd m12, m3, q1032
+ pshufd m14, m1, q1032
+%else
+ pshufd m1, m0, q1032
+ pshufd m3, m2, q1032
+ pshufd m5, m4, q1032
+ pshufd m7, m6, q1032
+ pshufd m0, [r3+11*16], q1032
+ pshufd m2, [r3+10*16], q1032
+ pshufd m4, [r3+9*16], q1032
+ pshufd m6, [r3+8*16], q1032
+ mova [r3+11*16], m1
+ mova [r3+10*16], m3
+ mova [r3+ 9*16], m5
+ mova [r3+ 8*16], m7
+%endif
+ ret
+
+.pass2:
+ lea r3, [strideq*3]
+ lea r3, [r3*5]
+ add dstq, r3
+ neg strideq
+ jmp m(iadst_16x16_internal_16bpc).pass2
+
+INV_TXFM_16X16_FN identity, dct, h
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x16_internal_16bpc).pass1_full
+
+.main:
+%if ARCH_X86_64
+ mova m15, [o(pd_11586)]
+ pmulld m0, m15, [cq+ 0*64+r5]
+ pmulld m1, m15, [cq+ 1*64+r5]
+ pmulld m2, m15, [cq+ 2*64+r5]
+ pmulld m3, m15, [cq+ 3*64+r5]
+ pmulld m4, m15, [cq+ 4*64+r5]
+ pmulld m5, m15, [cq+ 5*64+r5]
+ pmulld m6, m15, [cq+ 6*64+r5]
+ pmulld m7, m15, [cq+ 7*64+r5]
+ pmulld m8, m15, [cq+ 8*64+r5]
+ pmulld m9, m15, [cq+ 9*64+r5]
+ pmulld m10, m15, [cq+10*64+r5]
+ pmulld m11, m15, [cq+11*64+r5]
+ pmulld m12, m15, [cq+12*64+r5]
+ pmulld m13, m15, [cq+13*64+r5]
+ pmulld m14, m15, [cq+14*64+r5]
+ pmulld m15, [cq+15*64+r5]
+ mova [r3], m15
+ mova m15, [o(pd_10240)]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [r3]
+ REPX {psrad x, 14 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%else
+ mova m7, [o(pd_11586)]
+ pmulld m0, m7, [cq+ 0*64+r5]
+ pmulld m1, m7, [cq+ 1*64+r5]
+ pmulld m2, m7, [cq+ 2*64+r5]
+ pmulld m3, m7, [cq+ 3*64+r5]
+ pmulld m4, m7, [cq+ 4*64+r5]
+ pmulld m5, m7, [cq+ 5*64+r5]
+ pmulld m6, m7, [cq+ 6*64+r5]
+ pmulld m7, [cq+ 7*64+r5]
+ mova [r3], m7
+ mova m7, [o(pd_10240)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ mova [r3+8*16], m0
+ mova [r3+9*16], m2
+ mova [r3+10*16], m4
+ mova [r3+11*16], m6
+ mova m7, [o(pd_11586)]
+ pmulld m0, m7, [cq+ 8*64+r5]
+ pmulld m1, m7, [cq+ 9*64+r5]
+ pmulld m2, m7, [cq+10*64+r5]
+ pmulld m3, m7, [cq+11*64+r5]
+ pmulld m4, m7, [cq+12*64+r5]
+ pmulld m5, m7, [cq+13*64+r5]
+ pmulld m6, m7, [cq+14*64+r5]
+ pmulld m7, [cq+15*64+r5]
+ mova [r3], m7
+ mova m7, [o(pd_10240)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_64
+ mova m4, [o(pw_2048)]
+ mova m5, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ mova m7, [o(pw_1697x16)]
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ mov r5d, 4
+ lea r3, [strideq*3]
+.pass2_loop:
+ mova m0, [cq+0*64+0]
+ mova m1, [cq+1*64+0]
+ mova m2, [cq+2*64+0]
+ mova m3, [cq+3*64+0]
+ call m(iidentity_8x16_internal_16bpc).main
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).round1_and_write_8x4
+%else
+ call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+%endif
+ REPX {mova [cq+x*16], m6}, 0, 4, 8, 12
+ add cq, 16
+ lea dstq, [dstq+strideq*4]
+ dec r5w
+ jg .pass2_loop
+ add cq, 64*3
+ btc r5d, 16
+ jc .end
+%if ARCH_X86_64
+ lea dstq, [r7+16]
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+ add dstq, 16
+%endif
+ add r5d, 4
+ jmp .pass2_loop
+.end:
+%if WIN64
+ mov r7, [rsp+16*16+gprsize]
+%endif
+ RET
+
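+; Note on the identity_identity functions below: adding 21 to the low byte of
+; eob rounds the special eob values up to a multiple of 64 (43->64, 107->128,
+; 171->192); if the byte addition carries, cmovc keeps the saved original
+; value instead. The "sub eobd, 64 / jge" check can then step the loop in
+; 64-coefficient units, with "btc eobd, 16" pairing up the column iterations.
+; The shared .main helper transposes a 4x8 block of packed words, adds it to
+; four rows of dst and clamps the result to [0, pixel_10bpc_max].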
+cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+ mova m5, [o(pw_5)]
+ mova m7, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ mov r5d, eobd
+ add eobb, 21
+ cmovc eobd, r5d ; 43, 107, 171 -> 64, 128, 192
+ lea r4, [strideq*3]
+.loop:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {paddsw x, m5}, m0, m1, m2, m3
+ REPX {psraw x, 3 }, m0, m1, m2, m3
+ call .main_zero
+ add cq, 16
+ lea dstq, [dstq+strideq*4]
+ btc eobd, 16
+ jnc .loop
+ sub eobd, 64
+ jge .loop
+ RET
+ALIGN function_align
+.main_zero:
+ REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+.main:
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m4, m2, m1
+ punpcklwd m2, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r4 ]
+ REPX {pmaxsw x, m6}, m0, m1, m2, m3
+ REPX {pminsw x, m7}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r4 ], m3
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+ mova m5, [o(pw_4096)]
+ mova m7, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ mov r4d, eobd
+ add eobb, 21
+ cmovc eobd, r4d
+ lea r4, [strideq*3]
+ mov r5, dstq
+.loop:
+ mova m0, [cq+32*0]
+ packssdw m0, [cq+32*1]
+ mova m1, [cq+32*2]
+ packssdw m1, [cq+32*3]
+ mova m2, [cq+32*4]
+ packssdw m2, [cq+32*5]
+ mova m3, [cq+32*6]
+ packssdw m3, [cq+32*7]
+ REPX {mova [cq+32*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ call m(inv_txfm_add_identity_identity_8x32_16bpc).main
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ btc eobd, 16
+ jnc .loop
+ add cq, 32*8-32
+ add r5, 16
+ mov dstq, r5
+ sub eobd, 64
+ jge .loop
+ RET
+
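+; A sketch of the per-batch scaling in the 16x32 identity path below, assuming
+; the usual constant naming (pw_NxM == N*M) and pmulhrsw(x, c) =
+; (x*c + (1 << 14)) >> 15:
+;   x  = pmulhrsw(x, pw_2896x8)                      ; ~ x*2896/4096 (rect2 scale)
+;   x += pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384) ; ~ x*(1 + 1697/4096) ~ x*sqrt(2)
+;   x  = pmulhrsw(x, pw_8192)                        ; ~ x/4, rounded
+; after which the rows go through the shared 8x32 transpose/add/clamp helper.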
+cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%else
+ mova m8, [o(pw_2896x8)]
+ mova m9, [o(pw_1697x16)]
+ mova m11, [o(pw_8192)]
+%endif
+ mova m7, [o(pixel_10bpc_max)]
+ lea r4, [strideq*3]
+ pxor m6, m6
+%if ARCH_X86_64
+ paddw m10, m11, m11 ; pw_16384
+%endif
+ mov r5, dstq
+ call .main
+ sub eobd, 36
+ jl .ret
+ add cq, 128*8-32
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 107 ; eob < 143
+ jl .ret
+ add cq, 128*8-32
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 128 ; eob < 271
+ jl .ret
+ add cq, 128*8-32
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 128 ; eob < 399
+ jl .ret
+ add cq, 128*8-32
+ lea dstq, [r5+16]
+ call .main
+.ret:
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+ pmulhrsw m4, m9, m0
+ pmulhrsw m5, m9, m1
+ REPX {pmulhrsw x, m10}, m4, m5
+%else
+ mova m6, [o(pw_2896x8)]
+ REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+ mova m5, [o(pw_1697x16)]
+ pmulhrsw m4, m5, m0
+ pmulhrsw m5, m1
+ mova m6, [o(pw_16384)]
+ REPX {pmulhrsw x, m6 }, m4, m5
+%endif
+ paddsw m0, m4
+ paddsw m1, m5
+%if ARCH_X86_64
+ pmulhrsw m4, m9, m2
+ pmulhrsw m5, m9, m3
+ REPX {pmulhrsw x, m10}, m4, m5
+%else
+ mova m5, [o(pw_1697x16)]
+ pmulhrsw m4, m5, m2
+ pmulhrsw m5, m3
+ REPX {pmulhrsw x, m6 }, m4, m5
+%endif
+ paddsw m2, m4
+ paddsw m3, m5
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+%else
+ psrlw m6, 1 ; pw_8192
+ REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+ pxor m6, m6
+%endif
+ call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ btc eobd, 16
+ jnc .main
+ ret
+
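+; The 32x16 variant below follows the same layout as the 16x32 one above, but
+; what appear to be the identity32/identity16 gains are applied via in-place
+; doublings (paddsw x, x) plus the pw_1697x16 correction, with a final
+; pmulhrsw by pw_2048 (~ x/16, rounded) as the last rounding stage.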
+cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%else
+ mova m8, [o(pw_2896x8)]
+ mova m9, [o(pw_1697x16)]
+ mova m10, [o(pw_2048)]
+%endif
+ mova m7, [o(pixel_10bpc_max)]
+ lea r4, [strideq*3]
+ pxor m6, m6
+ mov r5, dstq
+ call .main
+ sub eobd, 36
+ jl .ret
+ call .main
+ add cq, 64*8-64
+ lea dstq, [r5+16*1]
+ call .main
+ sub eobd, 107 ; eob < 143
+ jl .ret
+ call .main
+ add cq, 64*8-64
+ lea dstq, [r5+16*2]
+ call .main
+ sub eobd, 128 ; eob < 271
+ jl .ret
+ call .main
+ add cq, 64*8-64
+ lea dstq, [r5+16*3]
+ call .main
+ sub eobd, 128 ; eob < 399
+ jl .ret
+ call .main
+.ret:
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+64*0]
+ packssdw m0, [cq+64*1]
+ mova m1, [cq+64*2]
+ packssdw m1, [cq+64*3]
+ mova m2, [cq+64*4]
+ packssdw m2, [cq+64*5]
+ mova m3, [cq+64*6]
+ packssdw m3, [cq+64*7]
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+%else
+ mova m6, [o(pw_2896x8)]
+ REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+%endif
+ REPX {paddsw x, x }, m0, m1, m2, m3
+%if ARCH_X86_64
+ pmulhrsw m4, m9, m0
+ pmulhrsw m5, m9, m1
+%else
+ mova m6, [o(pw_1697x16)]
+ pmulhrsw m4, m6, m0
+ pmulhrsw m5, m6, m1
+%endif
+ REPX {paddsw x, x }, m0, m1
+ paddsw m0, m4
+ paddsw m1, m5
+%if ARCH_X86_64
+ pmulhrsw m4, m9, m2
+ pmulhrsw m5, m9, m3
+%else
+ pmulhrsw m4, m6, m2
+ pmulhrsw m6, m3
+%endif
+ REPX {paddsw x, x }, m2, m3
+ paddsw m2, m4
+%if ARCH_X86_64
+ paddsw m3, m5
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3
+%else
+ paddsw m3, m6
+ mova m6, [o(pw_2048)]
+ REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+ pxor m6, m6
+%endif
+ REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ call m(inv_txfm_add_identity_identity_8x32_16bpc).main
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ btc eobd, 16
+ jnc .main
+ ret
+
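+; In the 32x32 identity path below, each .main call handles one 8x8 tile
+; (scaled only by pw_8192, ~ x/4 with pmulhrsw) and .main2 steps back one tile
+; column (cq -= 128*8, dstq -= 16 bytes = 8 pixels) before re-running it; the
+; eob thresholds (36/136/300/535/755/911) decide how many tiles are processed,
+; per the 0/1/2/3 column annotations on the right.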
+cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 7, 8, dst, stride, c, eob
+%undef cmp
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+ mova m5, [o(pw_8192)]
+ mova m7, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ lea r4, [strideq*3]
+ mov r5, dstq
+ call .main ; 0
+ cmp eobd, 36
+ jl .ret
+ add cq, 128*8-32 ; 0 1
+ lea dstq, [r5+16] ; 1
+ call .main
+ call .main2
+ cmp eobd, 136
+ jl .ret
+ add cq, 128*16-64 ; 0 1 2
+ lea dstq, [r5+16*2] ; 1 2
+ call .main ; 2
+ call .main2
+ call .main2
+ cmp eobd, 300
+ jl .ret
+ add cq, 128*24-96 ; 0 1 2 3
+ add r5, 16*3 ; 1 2 3
+ mov dstq, r5 ; 2 3
+ call .main ; 3
+ call .main2
+ call .main2
+ call .main2
+ cmp eobd, 535
+ jl .ret
+ add cq, 128*24-96 ; 0 1 2 3
+ lea dstq, [r5+strideq*8] ; 1 2 3 4
+ mov r5, dstq ; 2 3 4
+ call .main ; 3 4
+ call .main2
+ call .main2
+ cmp eobd, 755
+ jl .ret
+ add cq, 128*16-64 ; 0 1 2 3
+ lea dstq, [r5+strideq*8] ; 1 2 3 4
+ mov r5, dstq ; 2 3 4 5
+ call .main ; 3 4 5
+ call .main2
+ cmp eobd, 911
+ jl .ret
+ add cq, 128*8-32 ; 0 1 2 3
+ lea dstq, [r5+strideq*8] ; 1 2 3 4
+ call .main ; 2 3 4 5
+.ret: ; 3 4 5 6
+ RET
+ALIGN function_align
+.main2:
+ sub cq, 128*8
+ sub dstq, 16
+.main:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ btc eobd, 16
+ jnc .main
+ ret
+
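+; The dct_dct functions with 32 or 64 rows below share a two-pass layout:
+; pass 1 runs the width-N transform four lines at a time in 32-bit precision
+; (pre-zeroing the output slots of lines the tbl_*_2d eob tables mark as
+; all-zero), packs the results to 16 bit, and pass 2 then reuses the 8-bit
+; SSSE3 column kernels (idct_8x32 / idct_16x64 main variants, selected by
+; eob). The 32x8 and 32x16 variants instead finish through the 16 bpc
+; idct_16x8 / idct_16x16 pass-2 helpers.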
+cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
+ dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%define base $$
+ DECLARE_REG_TMP 0, 4
+%else
+ lea r6, [tbl_Nx32_odd_offset]
+%define base tbl_Nx32_odd_offset
+ DECLARE_REG_TMP 4, 7
+%if WIN64
+ mov [rsp+gprsize*1+35*16], r7
+%endif
+%endif
+%define o2(x) r6-base+x
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ mov [rsp+gprsize*1+35*16], r0
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 7*2
+ cmp eobw, word [o2(tbl_8x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ mova [rsp+ 3*16+r5*8], m0
+ mova [rsp+11*16+r5*8], m0
+ mova [rsp+ 3*16+t0*8], m0
+ mova [rsp+ 3*16+t1*8], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_8x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+35*16], eobd
+ mov r3, rsp
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+0*128+r5*8]
+ mova m1, [cq+1*128+r5*8]
+ mova m2, [cq+2*128+r5*8]
+ mova m3, [cq+3*128+r5*8]
+ mova m4, [cq+4*128+r5*8]
+ mova m5, [cq+5*128+r5*8]
+ mova m6, [cq+6*128+r5*8]
+ mova m7, [cq+7*128+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ mova m1, [o(pd_2)]
+ REPX {paddd x, m1}, m0, m6, m5, m3
+ call m(idct_8x4_internal_16bpc).round
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ mova [r3+ 3*16+r5*8], m0
+ mova [r3+11*16+r5*8], m2
+ mova [r3+ 3*16+t1*8], m1
+ mova [r3+ 3*16+t0*8], m3
+ pxor m7, m7
+ REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass 2 code starts here
+ ; m0 is already loaded from last iteration of first pass
+%if ARCH_X86_32
+ mov r0, [rsp+gprsize*1+35*16]
+%endif
+ mov eobd, [rsp+gprsize*0+35*16]
+ cmp eobd, 43
+ jl .load_veryfast
+ cmp eobd, 107
+ jl .load_fast
+ ; load normal
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+ jmp .run
+.load_fast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ jmp .run
+.load_veryfast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ ; fall-through
+.run:
+ call .pass2
+%if WIN64
+ mov r7, [rsp+gprsize*1+35*16]
+%endif
+ RET
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m1, [rsp+gprsize+16* 4]
+ mova m2, [rsp+gprsize+16* 5]
+ mova m3, [rsp+gprsize+16* 6]
+ mova m4, [rsp+gprsize+16* 7]
+ mova m5, [rsp+gprsize+16* 8]
+ mova m6, [rsp+gprsize+16* 9]
+ mova m7, [rsp+gprsize+16*10]
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ mova [rsp+gprsize+ 3*16], m0
+ mova [rsp+gprsize+ 4*16], m1
+ mova [rsp+gprsize+ 5*16], m2
+ mova [rsp+gprsize+ 6*16], m3
+ mova [rsp+gprsize+ 7*16], m4
+ mova [rsp+gprsize+ 8*16], m5
+ mova [rsp+gprsize+ 9*16], m6
+ mova m0, [rsp+gprsize+11*16]
+ mova m1, [rsp+gprsize+12*16]
+ mova m2, [rsp+gprsize+13*16]
+ mova m3, [rsp+gprsize+14*16]
+ mova m4, [rsp+gprsize+15*16]
+ mova m5, [rsp+gprsize+16*16]
+ mova m6, [rsp+gprsize+17*16]
+ mova m7, [rsp+gprsize+18*16]
+ call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+ mova m7, [rsp+gprsize+ 0*16]
+ mova [rsp+gprsize+11*16], m0
+ mova [rsp+gprsize+12*16], m1
+ mova [rsp+gprsize+13*16], m2
+ mova [rsp+gprsize+14*16], m3
+ mova [rsp+gprsize+15*16], m4
+ mova [rsp+gprsize+16*16], m5
+ mova [rsp+gprsize+17*16], m6
+ mova [rsp+gprsize+18*16], m7
+ call r4
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%endif
+ lea r3, [strideq*3]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ lea dstq, [dstq+strideq*8]
+ mova m0, [rsp+gprsize+11*16]
+ mova m1, [rsp+gprsize+12*16]
+ mova m2, [rsp+gprsize+13*16]
+ mova m3, [rsp+gprsize+14*16]
+ mova m4, [rsp+gprsize+15*16]
+ mova m5, [rsp+gprsize+16*16]
+ mova m6, [rsp+gprsize+17*16]
+ mova m7, [rsp+gprsize+18*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ lea dstq, [dstq+strideq*8]
+ mova m0, [rsp+gprsize+19*16]
+ mova m1, [rsp+gprsize+20*16]
+ mova m2, [rsp+gprsize+21*16]
+ mova m3, [rsp+gprsize+22*16]
+ mova m4, [rsp+gprsize+23*16]
+ mova m5, [rsp+gprsize+24*16]
+ mova m6, [rsp+gprsize+25*16]
+ mova m7, [rsp+gprsize+26*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ lea dstq, [dstq+strideq*8]
+ mova m0, [rsp+gprsize+27*16]
+ mova m1, [rsp+gprsize+28*16]
+ mova m2, [rsp+gprsize+29*16]
+ mova m3, [rsp+gprsize+30*16]
+ mova m4, [rsp+gprsize+31*16]
+ mova m5, [rsp+gprsize+32*16]
+ mova m6, [rsp+gprsize+33*16]
+ mova m7, [rsp+gprsize+34*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ ret
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 8
+ add r5d, 640
+ sar r5d, 10
+ add rsp, (31+2*ARCH_X86_64)*16
+ jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2
+
+cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ mov [rsp+gprsize*1+76*16], r0
+%elif WIN64
+ mov [rsp+gprsize*1+76*16], r7
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 7*2
+ cmp eobw, word [o2(tbl_16x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ mova [rsp+12*16+r5*8], m0
+ mova [rsp+20*16+r5*8], m0
+ mova [rsp+12*16+t0*8], m0
+ mova [rsp+12*16+t1*8], m0
+ mova [rsp+44*16+r5*8], m0
+ mova [rsp+52*16+r5*8], m0
+ mova [rsp+44*16+t0*8], m0
+ mova [rsp+44*16+t1*8], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_16x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+76*16], eobd
+ mov r3, rsp
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 1*128+r5*8]
+ mova m1, [cq+ 3*128+r5*8]
+ mova m2, [cq+ 5*128+r5*8]
+ mova m3, [cq+ 7*128+r5*8]
+ mova m4, [cq+ 9*128+r5*8]
+ mova m5, [cq+11*128+r5*8]
+ mova m6, [cq+13*128+r5*8]
+ mova m7, [cq+15*128+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+ 0*128+r5*8]
+ mova m1, [cq+ 2*128+r5*8]
+ mova m2, [cq+ 4*128+r5*8]
+ mova m3, [cq+ 6*128+r5*8]
+ mova m4, [cq+ 8*128+r5*8]
+ mova m5, [cq+10*128+r5*8]
+ mova m6, [cq+12*128+r5*8]
+ mova m7, [cq+14*128+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ call m(idct_16x4_internal_16bpc).round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+%if ARCH_X86_64
+ mova [rsp+12*16+r5*8], m0
+ mova [rsp+20*16+r5*8], m2
+ mova [rsp+12*16+t1*8], m1
+ mova [rsp+12*16+t0*8], m3
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+44*16+r5*8], m8
+ mova [rsp+52*16+r5*8], m10
+ mova [rsp+44*16+t1*8], m9
+ mova [rsp+44*16+t0*8], m11
+%else
+ mova [rsp+44*16+r5*8], m0
+ mova [rsp+52*16+r5*8], m2
+ mova [rsp+44*16+t1*8], m1
+ mova [rsp+44*16+t0*8], m3
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+ 9*16]
+ mova m4, [r3+10*16]
+ mova m6, [r3+11*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+12*16+r5*8], m0
+ mova [rsp+20*16+r5*8], m2
+ mova [rsp+12*16+t1*8], m1
+ mova [rsp+12*16+t0*8], m3
+%endif
+ pxor m7, m7
+ REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2
+ add rsp, 9*16
+%if ARCH_X86_64
+ mov r6, dstq
+%else
+ mov dstq, [rsp+gprsize*1+67*16]
+%endif
+ mov eobd, [rsp+gprsize*0+67*16]
+ cmp eobd, 44
+ jl .load_veryfast
+ cmp eobd, 151
+ jl .load_fast
+ ; load normal
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+ jmp .run
+.load_fast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ jmp .run
+.load_veryfast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ ; fall-through
+.run:
+%if ARCH_X86_64
+ lea r2, [dstq+32]
+ mov r7, -4
+%else
+ lea r2, [rsp+67*16]
+ mov dword [r2+0*gprsize], 2
+%endif
+ jmp .loop_pass2_entry
+.loop_pass2:
+ mova m0, [rsp+16* 3]
+.loop_pass2_entry:
+%if ARCH_X86_32
+ mov dstq, [r2+1*gprsize]
+%endif
+ call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2
+ add rsp, 32*16
+%if ARCH_X86_64
+ add r7, 2
+ lea dstq, [r2+r7*8]
+ jl .loop_pass2
+%if WIN64
+ mov r7, [rsp+gprsize*1+3*16]
+%endif
+%else
+ add dword [r2+1*gprsize], 16
+ dec dword [r2+0*gprsize]
+ jg .loop_pass2
+%endif
+%assign stack_size (stack_size-73*16)
+%if STACK_ALIGNMENT >= 16
+%assign stack_size_padded (stack_size_padded-73*16)
+%assign stack_offset (stack_offset-73*16)
+%else
+%xdefine rstkm [rsp + stack_size]
+%endif
+ RET
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 32
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add rsp, (65+4*ARCH_X86_64)*16
+ jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
+
+cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
+ dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+
+ ; remove entirely-zero iterations
+%undef cmp
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 10
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 10
+ sbb r5d, 0
+%endif
+ add r5d, r5d
+
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+ mova m0, [cq+32* 1+r5*8]
+ mova m1, [cq+32* 7+r5*8]
+ mova m2, [cq+32* 9+r5*8]
+ mova m3, [cq+32*15+r5*8]
+ mova m4, [cq+32*17+r5*8]
+ mova m5, [cq+32*23+r5*8]
+ mova m6, [cq+32*25+r5*8]
+ mova m7, [cq+32*31+r5*8]
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mov r3, rsp
+ call .main_oddhalf_part1
+ mova m0, [cq+32* 3+r5*8]
+ mova m1, [cq+32* 5+r5*8]
+ mova m2, [cq+32*11+r5*8]
+ mova m3, [cq+32*13+r5*8]
+ mova m4, [cq+32*19+r5*8]
+ mova m5, [cq+32*21+r5*8]
+ mova m6, [cq+32*27+r5*8]
+ mova m7, [cq+32*29+r5*8]
+ call .main_oddhalf_part2
+ mova m0, [cq+32* 2+r5*8]
+ mova m1, [cq+32* 6+r5*8]
+ mova m2, [cq+32*10+r5*8]
+ mova m3, [cq+32*14+r5*8]
+ mova m4, [cq+32*18+r5*8]
+ mova m5, [cq+32*22+r5*8]
+ mova m6, [cq+32*26+r5*8]
+ mova m7, [cq+32*30+r5*8]
+ add r3, 16*(16+4*ARCH_X86_32)
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+ mova m0, [cq+32* 0+r5*8]
+ mova m1, [cq+32* 4+r5*8]
+ mova m2, [cq+32* 8+r5*8]
+ mova m3, [cq+32*12+r5*8]
+ mova m4, [cq+32*16+r5*8]
+ mova m5, [cq+32*20+r5*8]
+ mova m6, [cq+32*24+r5*8]
+ mova m7, [cq+32*28+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ sub r3, 16*(16+4*ARCH_X86_32)
+ call .round_dct32
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+32* 8+r5*8], m8
+ mova [cq+32* 9+r5*8], m9
+ mova [cq+32*10+r5*8], m10
+ mova [cq+32*11+r5*8], m11
+ mova m8, [r3+16* 9] ; 8 9
+ mova m10, [r3+16*11] ; 10 11
+ mova m12, [r3+16*13] ; 12 13
+ mova m14, [r3+16*15] ; 14 15
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+32* 4+r5*8], m8
+ mova [cq+32* 5+r5*8], m9
+ mova [cq+32* 6+r5*8], m10
+ mova [cq+32* 7+r5*8], m11
+ mova m8, [r3+16* 8] ; 24 25
+ mova m10, [r3+16*10] ; 26 27
+ mova m12, [r3+16*12] ; 28 29
+ mova m14, [r3+16*14] ; 30 31
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+32*12+r5*8], m8
+ mova [cq+32*13+r5*8], m9
+ mova [cq+32*14+r5*8], m10
+ mova [cq+32*15+r5*8], m11
+%else
+ sub r3, 8*16
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+10*16]
+ mova m4, [r3+12*16]
+ mova m6, [r3+14*16]
+ packssdw m0, [r3+ 9*16]
+ packssdw m2, [r3+11*16]
+ packssdw m4, [r3+13*16]
+ packssdw m6, [r3+15*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+32* 4+r5*8], m0
+ mova [cq+32* 5+r5*8], m1
+ mova [cq+32* 6+r5*8], m2
+ mova [cq+32* 7+r5*8], m3
+ mova m0, [r3+16*16]
+ mova m2, [r3+18*16]
+ mova m4, [r3+20*16]
+ mova m6, [r3+22*16]
+ packssdw m0, [r3+17*16]
+ packssdw m2, [r3+19*16]
+ packssdw m4, [r3+21*16]
+ packssdw m6, [r3+23*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+32* 8+r5*8], m0
+ mova [cq+32* 9+r5*8], m1
+ mova [cq+32*10+r5*8], m2
+ mova [cq+32*11+r5*8], m3
+ mova m0, [r3+31*16]
+ mova m2, [r3+29*16]
+ mova m4, [r3+27*16]
+ mova m6, [r3+25*16]
+ packssdw m0, [r3+30*16]
+ packssdw m2, [r3+28*16]
+ packssdw m4, [r3+26*16]
+ packssdw m6, [r3+24*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+32*12+r5*8], m0
+ mova [cq+32*13+r5*8], m1
+ mova [cq+32*14+r5*8], m2
+ mova [cq+32*15+r5*8], m3
+ mova m0, [r3+ 0*16]
+ mova m2, [r3+ 2*16]
+ mova m4, [r3+ 4*16]
+ mova m6, [r3+ 6*16]
+ packssdw m0, [r3+ 1*16]
+ packssdw m2, [r3+ 3*16]
+ packssdw m4, [r3+ 5*16]
+ packssdw m6, [r3+ 7*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ pxor m7, m7
+ ; clear lower half of [cq]
+ REPX {mova [cq+x*32+r5*8], m7}, 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+32* 0+r5*8], m0
+ mova [cq+32* 1+r5*8], m1
+ mova [cq+32* 2+r5*8], m2
+ mova [cq+32* 3+r5*8], m3
+ sub r5d, 2
+ jmp .loop_pass1
+.end_pass1:
+
+ ; pass=2; this has to be entered via call, otherwise the stack pointer
+ ; has the wrong offset in the 8-bit code
+ mov r4d, 4
+ call m(idct_16x8_internal_16bpc).pass2_main
+ RET
+
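+; .main_oddhalf_part1 below computes the first half of the idct32 odd stage:
+; it takes in1/in7/in9/in15/in17/in23/in25/in31 and leaves eight intermediates
+; (the t16..t19 / t28..t31 range) in [r3+16*0]..[r3+16*7]. The _fast entry
+; assumes the lower half of the inputs is zero, so the initial ITX_MULSUB_2D
+; butterflies collapse into plain pmulld scalings.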
+.main_oddhalf_part1_fast: ; lower half zero
+ pmulld m7, m0, [o(pd_4091)]
+ pmulld m0, [o(pd_201)]
+ pmulld m4, m3, [o(pd_m2751)]
+%if ARCH_X86_32
+ pmulld m3, [o(pd_3035)]
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m0, m7
+ REPX {psrad x, 12}, m0, m7
+ mova [r3+3*16], m7
+ mova m7, m3
+ mova m3, m5
+%else
+ pmulld m3, [o(pd_3035)]
+%endif
+ pmulld m6, m1, [o(pd_m1380)]
+ pmulld m1, [o(pd_3857)]
+ pmulld m5, m2, [o(pd_3703)]
+ pmulld m2, [o(pd_1751)]
+ jmp .main_oddhalf_part1_fast2
+.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
+%if ARCH_X86_64
+ ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
+.main_oddhalf_part1_fast2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m8, m0, m4 ; t17
+ paddd m0, m4 ; t16
+ psubd m4, m6, m2 ; t18
+ paddd m6, m2 ; t19
+ psubd m2, m1, m5 ; t29
+ paddd m1, m5 ; t28
+ psubd m5, m7, m3 ; t30
+ paddd m7, m3 ; t31
+ REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
+ REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
+ mova m15, [o(pd_4017)]
+ mova m10, [o(pd_799)]
+ ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a
+ ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a
+ psubd m3, m0, m6 ; t19a
+ paddd m0, m6 ; t16a
+ psubd m6, m7, m1 ; t28a
+ paddd m7, m1 ; t31a
+ psubd m1, m5, m4 ; t18
+ paddd m5, m4 ; t17
+ psubd m4, m8, m2 ; t29
+ paddd m8, m2 ; t30
+ REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
+ REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
+ mova m15, [o(pd_3784)]
+ mova m10, [o(pd_1567)]
+ ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
+ ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28
+ mova [r3+16*0], m0
+ mova [r3+16*1], m5
+ mova [r3+16*2], m4
+ mova [r3+16*3], m6
+ mova [r3+16*4], m3
+ mova [r3+16*5], m1
+ mova [r3+16*6], m8
+ mova [r3+16*7], m7
+%else
+ mova [r3+0*16], m2
+ mova [r3+1*16], m3
+ mova [r3+2*16], m4
+ mova [r3+3*16], m5
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 0, 7, 2, 4, 5, 3, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2D 6, 1, 2, 4, 5, _, 3857, 1380 ; t19a, t28a
+ mova m4, [r3+2*16]
+ mova m5, [r3+3*16]
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+ mova m2, [r3+0*16]
+ mova m7, [r3+1*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ ITX_MULSUB_2D 2, 5, 0, 1, 6, _, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2D 4, 7, 0, 1, 6, _, 3035, 2751 ; t17a, t30a
+ mova m0, [r3+0*16]
+ mova m1, [r3+1*16]
+ mova m6, [r3+2*16]
+.main_oddhalf_part1_fast2:
+ REPX {paddd x, m3}, m1, m2, m4, m5, m6, m7
+ REPX {psrad x, 12}, m1, m2, m4, m5, m6, m7
+ psubd m3, m0, m4 ; t17
+ mova [r3+0*16], m3
+ mova m3, [r3+3*16]
+ paddd m0, m4 ; t16
+ psubd m4, m6, m2 ; t18
+ paddd m6, m2 ; t19
+ psubd m2, m1, m5 ; t29
+ paddd m1, m5 ; t28
+ psubd m5, m3, m7 ; t30
+ paddd m7, m3 ; t31
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+ pmaxsd m3, [r3+0*16]
+ mova [r3+0*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+ pminsd m3, [r3+0*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 5, 3, 1, 6, 7, 0, 799, 4017 ; t17a, t30a
+ ITX_MULSUB_2D 2, 4, 1, 6, _, 0, 7, 4017, 4 ; t29a, t18a
+ psubd m1, m5, m4 ; t18
+ paddd m5, m4 ; t17
+ psubd m4, m3, m2 ; t29
+ paddd m3, m2 ; t30
+ mova m0, [r3+0*16]
+ mova m2, [r3+1*16]
+ mova m6, [r3+2*16]
+ mova m7, [r3+3*16]
+ mova [r3+0*16], m3
+ psubd m3, m0, m6 ; t19a
+ paddd m0, m6 ; t16a
+ psubd m6, m7, m2 ; t28a
+ paddd m7, m2 ; t31a
+ mova m2, [o(clip_18b_min)]
+ REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+ pmaxsd m2, [r3+0*16]
+ mova [r3+0*16], m2
+ mova m2, [o(clip_18b_max)]
+ REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+ pminsd m2, [r3+0*16]
+ mova [r3+16*0], m0
+ mova [r3+16*1], m5
+ mova [r3+16*6], m2
+ mova [r3+16*7], m7
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2D 4, 1, 0, 5, 2, 7, 1567, 3784 ; t18a, t29a
+ ITX_MULSUB_2D 6, 3, 0, 5, 2, 7, 2, 3784 ; t19, t28
+ mova [r3+16*2], m4
+ mova [r3+16*3], m6
+ mova [r3+16*4], m3
+ mova [r3+16*5], m1
+%endif
+ ret
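+; .main_oddhalf_part2 below computes the second half of the odd stage
+; (in3/in5/in11/in13/in19/in21/in27/in29 -> the t20..t27 range), merges it
+; with part1's intermediates and leaves the complete 16-value odd half in
+; [r3+16*0]..[r3+16*15]; as above, the _fast entry assumes the lower half of
+; the inputs is zero.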
+.main_oddhalf_part2_fast: ; lower half zero
+ pmulld m7, m0, [o(pd_m601)]
+ pmulld m0, [o(pd_4052)]
+ pmulld m4, m3, [o(pd_3290)]
+%if ARCH_X86_32
+ pmulld m3, [o(pd_2440)]
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m0, m7
+ REPX {psrad x, 12}, m0, m7
+ mova [r3+11*16], m7
+ mova m7, m3
+ mova m3, m5
+%else
+ pmulld m3, [o(pd_2440)]
+%endif
+ pmulld m6, m1, [o(pd_3973)]
+ pmulld m1, [o(pd_995)]
+ pmulld m5, m2, [o(pd_m2106)]
+ pmulld m2, [o(pd_3513)]
+ jmp .main_oddhalf_part2_fast2
+.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
+%if ARCH_X86_64
+ ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a
+ ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
+.main_oddhalf_part2_fast2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m8, m0, m4 ; t25
+ paddd m0, m4 ; t24
+ psubd m4, m6, m2 ; t26
+ paddd m6, m2 ; t27
+ psubd m2, m1, m5 ; t21
+ paddd m1, m5 ; t20
+ psubd m5, m7, m3 ; t22
+ paddd m7, m3 ; t23
+ REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
+ REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
+ mova m15, [o(pd_2276)]
+ mova m10, [o(pd_3406)]
+ ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a
+ ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a
+ psubd m3, m0, m6 ; t27a
+ paddd m0, m6 ; t24a
+ psubd m6, m7, m1 ; t20a
+ paddd m7, m1 ; t23a
+ psubd m1, m5, m4 ; t21
+ paddd m5, m4 ; t22
+ psubd m4, m8, m2 ; t26
+ paddd m8, m2 ; t25
+ REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
+ REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
+ mova m15, [o(pd_3784)]
+ mova m10, [o(pd_1567)]
+ ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a
+ ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20
+ mova m9, [r3+16*0] ; t16a
+ mova m10, [r3+16*1] ; t17
+ psubd m2, m9, m7 ; t23
+ paddd m9, m7 ; t16
+ psubd m7, m10, m5 ; t22a
+ paddd m10, m5 ; t17a
+ REPX {pmaxsd x, m12}, m9, m10, m2, m7
+ REPX {pminsd x, m13}, m9, m10, m2, m7
+ mova [r3+16*0], m9
+ mova [r3+16*1], m10
+ mova m9, [r3+16*2] ; t18a
+ mova m10, [r3+16*3] ; t19
+ psubd m5, m9, m1 ; t21
+ paddd m9, m1 ; t18
+ psubd m1, m10, m6 ; t20a
+ paddd m10, m6 ; t19a
+ REPX {pmaxsd x, m12}, m9, m10, m5, m1
+ REPX {pminsd x, m13}, m9, m10, m5, m1
+ mova [r3+16*2], m9
+ mova [r3+16*3], m10
+ mova m9, [r3+16*4] ; t28
+ mova m10, [r3+16*5] ; t29a
+ psubd m6, m9, m3 ; t27a
+ paddd m9, m3 ; t28a
+ psubd m3, m10, m4 ; t26
+ paddd m10, m4 ; t29
+ REPX {pmaxsd x, m12}, m9, m10, m6, m3
+ REPX {pminsd x, m13}, m9, m10, m6, m3
+ REPX {pmulld x, m14}, m6, m3, m1, m5
+ paddd m6, m11
+ paddd m3, m11
+ psubd m4, m6, m1 ; t20
+ paddd m6, m1 ; t27
+ psubd m1, m3, m5 ; t21a
+ paddd m3, m5 ; t26a
+ REPX {psrad x, 12 }, m4, m1, m3, m6
+ mova [r3+16*4], m4
+ mova [r3+16*5], m1
+ mova m4, [r3+16*6] ; t30
+ mova m1, [r3+16*7] ; t31a
+ psubd m5, m4, m8 ; t25a
+ paddd m4, m8 ; t30a
+ psubd m8, m1, m0 ; t24
+ paddd m1, m0 ; t31
+ REPX {pmaxsd x, m12}, m8, m5, m4, m1
+ REPX {pminsd x, m13}, m8, m5, m4, m1
+ REPX {pmulld x, m14}, m5, m8, m7, m2
+ paddd m5, m11
+ paddd m8, m11
+ psubd m0, m5, m7 ; t22
+ paddd m5, m7 ; t25
+ psubd m7, m8, m2 ; t23a
+ paddd m2, m8 ; t24a
+ REPX {psrad x, 12 }, m0, m7, m2, m5
+ mova [r3+16*6], m0
+ mova [r3+16*7], m7
+ mova [r3+16*8], m2
+ mova [r3+16*9], m5
+ mova [r3+16*10], m3
+ mova [r3+16*11], m6
+ mova [r3+16*12], m9
+ mova [r3+16*13], m10
+ mova [r3+16*14], m4
+ mova [r3+16*15], m1
+%else
+ mova [r3+ 8*16], m2
+ mova [r3+ 9*16], m3
+ mova [r3+10*16], m4
+ mova [r3+11*16], m5
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 7, 0, 2, 4, 5, 3, 4052, 601 ; t23a, t24a
+ ITX_MULSUB_2D 1, 6, 2, 4, 5, _, 995, 3973 ; t20a, t27a
+ mova m2, [r3+ 8*16]
+ mova m4, [r3+10*16]
+ mova m5, [r3+11*16]
+ mova [r3+ 8*16], m0
+ mova [r3+10*16], m6
+ mova [r3+11*16], m7
+ mova m7, [r3+ 9*16]
+ mova [r3+ 9*16], m1
+ ITX_MULSUB_2D 5, 2, 0, 6, 1, _, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2D 7, 4, 0, 6, 1, _, 2440, 3290 ; t22a, t25a
+ mova m0, [r3+ 8*16]
+ mova m1, [r3+ 9*16]
+ mova m6, [r3+10*16]
+.main_oddhalf_part2_fast2:
+ REPX {paddd x, m3}, m1, m2, m7, m4, m5, m6
+ REPX {psrad x, 12}, m1, m2, m7, m4, m5, m6
+ psubd m3, m0, m4 ; t25
+ mova [r3+ 8*16], m3
+ mova m3, [r3+11*16]
+ paddd m0, m4 ; t24
+ psubd m4, m6, m2 ; t26
+ paddd m6, m2 ; t27
+ psubd m2, m1, m5 ; t21
+ paddd m1, m5 ; t20
+ psubd m5, m3, m7 ; t22
+ paddd m7, m3 ; t23
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+ pmaxsd m3, [r3+ 8*16]
+ mova [r3+ 8*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+ pminsd m3, [r3+ 8*16]
+ mova [r3+ 8*16], m0
+ mova [r3+ 9*16], m1
+ mova [r3+10*16], m6
+ mova [r3+11*16], m7
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2D 4, 2, 0, 1, 6, 7, 3406, 2276 ; t21a, t26a
+ ITX_MULSUB_2D 3, 5, 0, 1, _, 7, 6, 2276, 4 ; t25a, t22a
+ psubd m1, m5, m4 ; t21
+ paddd m5, m4 ; t22
+ psubd m4, m3, m2 ; t26
+ paddd m3, m2 ; t25
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+ 9*16]
+ mova m6, [r3+10*16]
+ mova m7, [r3+11*16]
+ mova [r3+ 8*16], m3
+ psubd m3, m0, m6 ; t27a
+ paddd m0, m6 ; t24a
+ psubd m6, m7, m2 ; t20a
+ paddd m7, m2 ; t23a
+ mova m2, [o(clip_18b_min)]
+ REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+ pmaxsd m2, [r3+ 8*16]
+ mova [r3+ 8*16], m2
+ mova m2, [o(clip_18b_max)]
+ REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+ pminsd m2, [r3+ 8*16]
+ mova [r3+ 8*16], m0
+ mova [r3+ 9*16], m2
+ mova [r3+14*16], m5
+ mova [r3+15*16], m7
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 4, 1, 2, 5, 7, 0, 1567, 3784, 4 ; t26a, t21a
+ ITX_MULSUB_2D 3, 6, 2, 5, _, 0, 7, 3784, 4 ; t27, t20
+ mova [r3+10*16], m3
+ mova m0, [o(clip_18b_min)]
+ mova m2, [o(clip_18b_max)]
+ mova m5, [r3+16*2] ; t18a
+ mova m7, [r3+16*3] ; t19
+ psubd m3, m5, m1 ; t21
+ paddd m5, m1 ; t18
+ psubd m1, m7, m6 ; t20a
+ paddd m7, m6 ; t19a
+ REPX {pmaxsd x, m0}, m5, m7, m3, m1
+ REPX {pminsd x, m2}, m5, m7, m3, m1
+ mova [r3+16*2], m5
+ mova [r3+16*3], m7
+ mova [r3+11*16], m3
+ mova m3, [r3+10*16]
+ mova m5, [r3+16*4] ; t28
+ mova m7, [r3+16*5] ; t29a
+ psubd m6, m5, m3 ; t27a
+ paddd m5, m3 ; t28a
+ psubd m3, m7, m4 ; t26
+ paddd m7, m4 ; t29
+ REPX {pmaxsd x, m0}, m5, m7, m6, m3
+ REPX {pminsd x, m2}, m5, m7, m6, m3
+ mova [r3+16*12], m5
+ mova [r3+16*13], m7
+ mova m5, [o(pd_2048)]
+ mova m7, [o(pd_2896)]
+ mova m4, [r3+11*16]
+ REPX {pmulld x, m7}, m6, m3, m1, m4
+ paddd m6, m5
+ paddd m3, m5
+ psubd m5, m6, m1 ; t20
+ paddd m6, m1 ; t27
+ psubd m1, m3, m4 ; t21a
+ paddd m3, m4 ; t26a
+ REPX {psrad x, 12}, m5, m1, m3, m6
+ mova [r3+16*4], m5
+ mova [r3+16*5], m1
+ mova [r3+16*10], m3
+ mova [r3+16*11], m6
+
+ mova m5, [r3+14*16]
+ mova m6, [r3+15*16]
+ mova m3, [r3+16*0] ; t16a
+ mova m4, [r3+16*1] ; t17
+ psubd m1, m3, m6 ; t23
+ paddd m3, m6 ; t16
+ psubd m6, m4, m5 ; t22a
+ paddd m4, m5 ; t17a
+ REPX {pmaxsd x, m0}, m3, m4, m1, m6
+ REPX {pminsd x, m2}, m3, m4, m1, m6
+ mova [r3+16*0], m3
+ mova [r3+16*1], m4
+ mova m5, [r3+ 8*16]
+ mova m3, [r3+ 9*16]
+ mova [r3+ 8*16], m1
+ mova [r3+ 9*16], m6
+ mova m4, [r3+16*6] ; t30
+ mova m1, [r3+16*7] ; t31a
+ psubd m6, m1, m5 ; t24
+ paddd m1, m5 ; t31
+ psubd m5, m4, m3 ; t25a
+ paddd m4, m3 ; t30a
+ REPX {pmaxsd x, m0}, m6, m5, m4, m1
+ REPX {pminsd x, m2}, m6, m5, m4, m1
+ mova [r3+16*14], m4
+ mova [r3+16*15], m1
+ mova m4, [o(pd_2048)]
+ mova m1, [r3+ 9*16]
+ mova m2, [r3+ 8*16]
+ REPX {pmulld x, m7}, m5, m6, m1, m2
+ paddd m5, m4
+ paddd m6, m4
+ psubd m0, m5, m1 ; t22
+ paddd m5, m1 ; t25
+ psubd m1, m6, m2 ; t23a
+ paddd m2, m6 ; t24a
+ REPX {psrad x, 12}, m0, m1, m2, m5
+ mova [r3+16*6], m0
+ mova [r3+16*7], m1
+ mova [r3+16*8], m2
+ mova [r3+16*9], m5
+%endif
+ ret
+
+ ; final sumsub for idct16 as well as idct32, plus final downshift
+%macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift
+ mova m%4, [r3+16*(23-%1)]
+ pmaxsd m%1, m12
+ pminsd m%1, m13
+ psubd m%3, m%1, m%4 ; idct16 out15 - n
+ paddd m%1, m%4 ; idct16 out0 + n
+ pmaxsd m%1, m12
+ pmaxsd m%3, m12
+ pminsd m%1, m13
+ pminsd m%3, m13
+ paddd m%1, m11
+ paddd m%3, m11
+ mova m%5, [r3+16*( 0+%1)]
+ mova m%2, [r3+16*(15-%1)]
+ psubd m%4, m%1, m%2 ; out31 - n
+ paddd m%1, m%2 ; out0 + n
+ paddd m%2, m%3, m%5 ; out15 - n
+ psubd m%3, m%5 ; out16 + n
+ REPX {psrad x, %6}, m%1, m%3, m%2, m%4
+%endmacro
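+; A sketch of what one IDCT32_END invocation computes for n = %1 (rnd = m11,
+; shift = %6), following the in-macro comments:
+;   out[n]    = (idct16[n]    + t[31-n] + rnd) >> shift
+;   out[31-n] = (idct16[n]    - t[31-n] + rnd) >> shift
+;   out[15-n] = (idct16[15-n] + t[16+n] + rnd) >> shift
+;   out[16+n] = (idct16[15-n] - t[16+n] + rnd) >> shift
+; where idct16[n] and idct16[15-n] are first formed (and clipped) from the
+; even-half value in m%1 and the odd-half value loaded from [r3+16*(23-%1)].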
+
+.round_dct32:
+%if ARCH_X86_64
+ psrld m11, 10 ; pd_2
+ IDCT32_END 0, 15, 8, 9, 10, 2 ; 0 15 16 31
+ mova [r3+ 0*16], m6
+ mova [r3+23*16], m7
+ IDCT32_END 1, 14, 6, 7, 10, 2 ; 1 14 17 30
+ packssdw m0, m1 ; 0 1
+ packssdw m14, m15 ; 14 15
+ packssdw m8, m6 ; 16 17
+ packssdw m7, m9 ; 30 31
+ mova [r3+16*15], m14
+ mova [r3+16*14], m7
+ IDCT32_END 2, 15, 10, 7, 6, 2 ; 2 13 18 29
+ IDCT32_END 3, 14, 1, 9, 6, 2 ; 3 12 19 28
+ packssdw m2, m3 ; 2 3
+ packssdw m14, m15 ; 12 13
+ packssdw m10, m1 ; 18 19
+ packssdw m9, m7 ; 28 29
+ mova [r3+16*13], m14
+ mova [r3+16*12], m9
+ IDCT32_END 4, 15, 1, 7, 6, 2 ; 4 11 20 27
+ IDCT32_END 5, 14, 3, 9, 6, 2 ; 5 10 21 26
+ packssdw m4, m5 ; 4 5
+ packssdw m14, m15 ; 10 11
+ packssdw m1, m3 ; 20 21
+ packssdw m9, m7 ; 26 27
+ mova [r3+16*11], m14
+ mova [r3+16*10], m9
+ mova m6, [r3+ 0*16]
+ mova m7, [r3+23*16]
+ IDCT32_END 6, 15, 14, 5, 3, 2 ; 6 9 22 25
+ IDCT32_END 7, 11, 3, 9, 13, 2 ; 7 8 23 24
+ packssdw m6, m7 ; 6 7
+ packssdw m11, m15 ; 8 9
+ packssdw m14, m3 ; 22 23
+ packssdw m9, m5 ; 24 25
+ mova [r3+16*9], m11
+ mova [r3+16*8], m9
+ mova m12, m1
+ ret
+%else
+ mova [r3+16*16], m0
+ mova [r3+17*16], m1
+ mova [r3+18*16], m2
+ mova [r3+19*16], m3
+ mova [r3+20*16], m4
+ mova [r3+21*16], m5
+ mova [r3+22*16], m6
+ mova [r3+23*16], m7
+ mova m1, [o(pd_2)]
+ mova m2, [o(clip_18b_min)]
+ mova m3, [o(clip_18b_max)]
+
+ mov r4, 15*16
+.loop_dct32_end:
+ mova m0, [r3+16*16]
+ mova m6, [r3+16*24]
+ pmaxsd m0, m2
+ pminsd m0, m3
+ psubd m5, m0, m6 ; idct16 out15 - n
+ paddd m0, m6 ; idct16 out0 + n
+ pmaxsd m0, m2
+ pmaxsd m5, m2
+ pminsd m0, m3
+ pminsd m5, m3
+ paddd m0, m1
+ paddd m5, m1
+ mova m7, [r3]
+ mova m4, [r3+r4]
+ psubd m6, m0, m4 ; out31 - n
+ paddd m0, m4 ; out0 + n
+ paddd m4, m5, m7 ; out15 - n
+ psubd m5, m7 ; out16 + n
+ REPX {psrad x, 2}, m0, m5, m4, m6
+ mova [r3], m0
+ mova [r3+r4], m4
+ mova [r3+16*16], m5
+ mova [r3+24*16], m6
+ add r3, 16
+ sub r4, 32
+ jg .loop_dct32_end
+ ret
+%endif
+
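+; The .dconly tail below (its .dconly1/.dconly2 entry points are shared by
+; several other sizes) adds a single flat DC term: the DC coefficient is
+; scaled by the 181 and 2896 factors with rounding, pshuflw q1111 picks the
+; high word of the product (an implicit >>16), punpcklqdq broadcasts it, and
+; the value is added to every pixel and clamped to [0, pixel_10bpc_max].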
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 8
+.dconly1:
+ add r5d, 640
+ sar r5d, 10
+.dconly2:
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m6, [o(pixel_10bpc_max)]
+ pxor m5, m5
+.dconly_loop:
+ mova m1, [dstq+16*0]
+ mova m2, [dstq+16*1]
+ mova m3, [dstq+16*2]
+ mova m4, [dstq+16*3]
+ REPX {paddw x, m0}, m1, m2, m3, m4
+ REPX {pminsw x, m6}, m1, m2, m3, m4
+ REPX {pmaxsw x, m5}, m1, m2, m3, m4
+ mova [dstq+16*0], m1
+ mova [dstq+16*1], m2
+ mova [dstq+16*2], m3
+ mova [dstq+16*3], m4
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+ ; remove entirely-zero iterations
+%undef cmp
+ mov r5d, 8
+.zero_loop:
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x16_2d)+r5]
+ jl .zero_loop
+
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+64* 1+r5*8]
+ mova m1, [cq+64* 7+r5*8]
+ mova m2, [cq+64* 9+r5*8]
+ mova m3, [cq+64*15+r5*8]
+ mova m4, [cq+64*17+r5*8]
+ mova m5, [cq+64*23+r5*8]
+ mova m6, [cq+64*25+r5*8]
+ mova m7, [cq+64*31+r5*8]
+ mov r3, rsp
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
+
+ mova m0, [cq+64* 3+r5*8]
+ mova m1, [cq+64* 5+r5*8]
+ mova m2, [cq+64*11+r5*8]
+ mova m3, [cq+64*13+r5*8]
+ mova m4, [cq+64*19+r5*8]
+ mova m5, [cq+64*21+r5*8]
+ mova m6, [cq+64*27+r5*8]
+ mova m7, [cq+64*29+r5*8]
+%if ARCH_X86_32
+ add r3, 16*8
+%endif
+ call m(idct_8x4_internal_16bpc).rect2_mul
+%if ARCH_X86_32
+ sub r3, 16*8
+%endif
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
+ add r3, 16*(16+4*ARCH_X86_32)
+
+ mova m0, [cq+64* 2+r5*8]
+ mova m1, [cq+64* 6+r5*8]
+ mova m2, [cq+64*10+r5*8]
+ mova m3, [cq+64*14+r5*8]
+ mova m4, [cq+64*18+r5*8]
+ mova m5, [cq+64*22+r5*8]
+ mova m6, [cq+64*26+r5*8]
+ mova m7, [cq+64*30+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+64* 0+r5*8]
+ mova m1, [cq+64* 4+r5*8]
+ mova m2, [cq+64* 8+r5*8]
+ mova m3, [cq+64*12+r5*8]
+ mova m4, [cq+64*16+r5*8]
+ mova m5, [cq+64*20+r5*8]
+ mova m6, [cq+64*24+r5*8]
+ mova m7, [cq+64*28+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ sub r3, 16*(16+4*ARCH_X86_32)
+ call .round_dct32
+
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+64* 8+r5*8], m8
+ mova [cq+64* 9+r5*8], m9
+ mova [cq+64*10+r5*8], m10
+ mova [cq+64*11+r5*8], m11
+ mova m8, [r3+16* 9] ; 8 9
+ mova m10, [r3+16*11] ; 10 11
+ mova m12, [r3+16*13] ; 12 13
+ mova m14, [r3+16*15] ; 14 15
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+64* 4+r5*8], m8
+ mova [cq+64* 5+r5*8], m9
+ mova [cq+64* 6+r5*8], m10
+ mova [cq+64* 7+r5*8], m11
+ mova m8, [r3+16* 8] ; 24 25
+ mova m10, [r3+16*10] ; 26 27
+ mova m12, [r3+16*12] ; 28 29
+ mova m14, [r3+16*14] ; 30 31
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+64*12+r5*8], m8
+ mova [cq+64*13+r5*8], m9
+ mova [cq+64*14+r5*8], m10
+ mova [cq+64*15+r5*8], m11
+%else
+ sub r3, 8*16
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+10*16]
+ mova m4, [r3+12*16]
+ mova m6, [r3+14*16]
+ packssdw m0, [r3+ 9*16]
+ packssdw m2, [r3+11*16]
+ packssdw m4, [r3+13*16]
+ packssdw m6, [r3+15*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+64* 4+r5*8], m0
+ mova [cq+64* 5+r5*8], m1
+ mova [cq+64* 6+r5*8], m2
+ mova [cq+64* 7+r5*8], m3
+ mova m0, [r3+16*16]
+ mova m2, [r3+18*16]
+ mova m4, [r3+20*16]
+ mova m6, [r3+22*16]
+ packssdw m0, [r3+17*16]
+ packssdw m2, [r3+19*16]
+ packssdw m4, [r3+21*16]
+ packssdw m6, [r3+23*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+64* 8+r5*8], m0
+ mova [cq+64* 9+r5*8], m1
+ mova [cq+64*10+r5*8], m2
+ mova [cq+64*11+r5*8], m3
+ mova m0, [r3+31*16]
+ mova m2, [r3+29*16]
+ mova m4, [r3+27*16]
+ mova m6, [r3+25*16]
+ packssdw m0, [r3+30*16]
+ packssdw m2, [r3+28*16]
+ packssdw m4, [r3+26*16]
+ packssdw m6, [r3+24*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+64*12+r5*8], m0
+ mova [cq+64*13+r5*8], m1
+ mova [cq+64*14+r5*8], m2
+ mova [cq+64*15+r5*8], m3
+ mova m0, [r3+ 0*16]
+ mova m2, [r3+ 2*16]
+ mova m4, [r3+ 4*16]
+ mova m6, [r3+ 6*16]
+ packssdw m0, [r3+ 1*16]
+ packssdw m2, [r3+ 3*16]
+ packssdw m4, [r3+ 5*16]
+ packssdw m6, [r3+ 7*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ mova [cq+64* 0+r5*8], m0
+ mova [cq+64* 1+r5*8], m1
+ mova [cq+64* 2+r5*8], m2
+ mova [cq+64* 3+r5*8], m3
+ pxor m0, m0
+ REPX {mova [cq+x*64+r5*8], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2; this has to be entered via call, otherwise the stack pointer
+ ; has the wrong offset in the 8-bit code
+ call .pass2
+ RET
+
+.pass2:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%endif
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ lea r3, [strideq*3]
+ mov r4d, 4
+ jmp m(idct_16x16_internal_16bpc).loop_pass2
+
+.round_dct32:
+%if ARCH_X86_64
+ psrld m11, 11 ; pd_1
+ IDCT32_END 0, 15, 8, 9, 10, 1 ; 0 15 16 31
+ mova [r3+ 0*16], m6
+ mova [r3+23*16], m7
+ IDCT32_END 1, 14, 6, 7, 10, 1 ; 1 14 17 30
+ packssdw m0, m1 ; 0 1
+ packssdw m14, m15 ; 14 15
+ packssdw m8, m6 ; 16 17
+ packssdw m7, m9 ; 30 31
+ mova [r3+16*15], m14
+ mova [r3+16*14], m7
+ IDCT32_END 2, 15, 10, 7, 6, 1 ; 2 13 18 29
+ IDCT32_END 3, 14, 1, 9, 6, 1 ; 3 12 19 28
+ packssdw m2, m3 ; 2 3
+ packssdw m14, m15 ; 12 13
+ packssdw m10, m1 ; 18 19
+ packssdw m9, m7 ; 28 29
+ mova [r3+16*13], m14
+ mova [r3+16*12], m9
+ IDCT32_END 4, 15, 1, 7, 6, 1 ; 4 11 20 27
+ IDCT32_END 5, 14, 3, 9, 6, 1 ; 5 10 21 26
+ packssdw m4, m5 ; 4 5
+ packssdw m14, m15 ; 10 11
+ packssdw m1, m3 ; 20 21
+ packssdw m9, m7 ; 26 27
+ mova [r3+16*11], m14
+ mova [r3+16*10], m9
+ mova m6, [r3+ 0*16]
+ mova m7, [r3+23*16]
+ IDCT32_END 6, 15, 14, 5, 3, 1 ; 6 9 22 25
+ IDCT32_END 7, 11, 3, 9, 13, 1 ; 7 8 23 24
+ packssdw m6, m7 ; 6 7
+ packssdw m11, m15 ; 8 9
+ packssdw m14, m3 ; 22 23
+ packssdw m9, m5 ; 24 25
+ mova [r3+16*9], m11
+ mova [r3+16*8], m9
+ mova m12, m1
+ ret
+%else
+ mova [r3+16*16], m0
+ mova [r3+17*16], m1
+ mova [r3+18*16], m2
+ mova [r3+19*16], m3
+ mova [r3+20*16], m4
+ mova [r3+21*16], m5
+ mova [r3+22*16], m6
+ mova [r3+23*16], m7
+ pcmpeqd m1, m1 ; -1
+ mova m2, [o(clip_18b_min)]
+ mova m3, [o(clip_18b_max)]
+
+ mov r4, 15*16
+.loop_dct32_end:
+ mova m0, [r3+16*16]
+ mova m6, [r3+16*24]
+ psubd m5, m0, m6 ; idct16 out15 - n
+ paddd m0, m6 ; idct16 out0 + n
+ pmaxsd m0, m2
+ pmaxsd m5, m2
+ pminsd m0, m3
+ pminsd m5, m3
+ psubd m0, m1
+ psubd m5, m1
+ mova m7, [r3]
+ mova m4, [r3+r4]
+ psubd m6, m0, m4 ; out31 - n
+ paddd m0, m4 ; out0 + n
+ paddd m4, m5, m7 ; out15 - n
+ psubd m5, m7 ; out16 + n
+ REPX {psrad x, 1}, m0, m5, m4, m6
+ mova [r3], m0
+ mova [r3+r4], m4
+ mova [r3+16*16], m5
+ mova [r3+24*16], m6
+ add r3, 16
+ sub r4, 32
+ jg .loop_dct32_end
+ ret
+%endif
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 16
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
+ jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+ ; remove entirely-zero iterations
+%if ARCH_X86_32
+ mov [rsp+5*32*16+1*gprsize], dstq
+%elif WIN64
+ mov [rsp+5*32*16+1*gprsize], r7
+%endif
+%undef cmp
+ mov r5d, 14
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ mova [rsp+32*16+r5*8+0*32*16], m0
+ mova [rsp+40*16+r5*8+0*32*16], m0
+ mova [rsp+32*16+t0*8+0*32*16], m0
+ mova [rsp+32*16+t1*8+0*32*16], m0
+ mova [rsp+32*16+r5*8+1*32*16], m0
+ mova [rsp+40*16+r5*8+1*32*16], m0
+ mova [rsp+32*16+t0*8+1*32*16], m0
+ mova [rsp+32*16+t1*8+1*32*16], m0
+ mova [rsp+32*16+r5*8+2*32*16], m0
+ mova [rsp+40*16+r5*8+2*32*16], m0
+ mova [rsp+32*16+t0*8+2*32*16], m0
+ mova [rsp+32*16+t1*8+2*32*16], m0
+ mova [rsp+32*16+r5*8+3*32*16], m0
+ mova [rsp+40*16+r5*8+3*32*16], m0
+ mova [rsp+32*16+t0*8+3*32*16], m0
+ mova [rsp+32*16+t1*8+3*32*16], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+5*32*16], eobd
+.loop_pass1:
+ mova m0, [cq+128* 1+r5*8]
+ mova m1, [cq+128* 7+r5*8]
+ mova m2, [cq+128* 9+r5*8]
+ mova m3, [cq+128*15+r5*8]
+ mova m4, [cq+128*17+r5*8]
+ mova m5, [cq+128*23+r5*8]
+ mova m6, [cq+128*25+r5*8]
+ mova m7, [cq+128*31+r5*8]
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mov r3, rsp
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
+ mova m0, [cq+128* 3+r5*8]
+ mova m1, [cq+128* 5+r5*8]
+ mova m2, [cq+128*11+r5*8]
+ mova m3, [cq+128*13+r5*8]
+ mova m4, [cq+128*19+r5*8]
+ mova m5, [cq+128*21+r5*8]
+ mova m6, [cq+128*27+r5*8]
+ mova m7, [cq+128*29+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
+ mova m0, [cq+128* 2+r5*8]
+ mova m1, [cq+128* 6+r5*8]
+ mova m2, [cq+128*10+r5*8]
+ mova m3, [cq+128*14+r5*8]
+ mova m4, [cq+128*18+r5*8]
+ mova m5, [cq+128*22+r5*8]
+ mova m6, [cq+128*26+r5*8]
+ mova m7, [cq+128*30+r5*8]
+ add r3, 16*(16+4*ARCH_X86_32)
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+ mova m0, [cq+128* 0+r5*8]
+ mova m1, [cq+128* 4+r5*8]
+ mova m2, [cq+128* 8+r5*8]
+ mova m3, [cq+128*12+r5*8]
+ mova m4, [cq+128*16+r5*8]
+ mova m5, [cq+128*20+r5*8]
+ mova m6, [cq+128*24+r5*8]
+ mova m7, [cq+128*28+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ sub r3, 16*(16+4*ARCH_X86_32)
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).round_dct32
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+32*16+r5*8+2*32*16], m8
+ mova [rsp+40*16+r5*8+2*32*16], m10
+ mova [rsp+32*16+t1*8+2*32*16], m9
+ mova [rsp+32*16+t0*8+2*32*16], m11
+ mova m8, [r3+16* 9] ; 8 9
+ mova m10, [r3+16*11] ; 10 11
+ mova m12, [r3+16*13] ; 12 13
+ mova m14, [r3+16*15] ; 14 15
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+32*16+r5*8+1*32*16], m8
+ mova [rsp+40*16+r5*8+1*32*16], m10
+ mova [rsp+32*16+t1*8+1*32*16], m9
+ mova [rsp+32*16+t0*8+1*32*16], m11
+ mova m8, [r3+16* 8] ; 24 25
+ mova m10, [r3+16*10] ; 26 27
+ mova m12, [r3+16*12] ; 28 29
+ mova m14, [r3+16*14] ; 30 31
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+32*16+r5*8+3*32*16], m8
+ mova [rsp+40*16+r5*8+3*32*16], m10
+ mova [rsp+32*16+t1*8+3*32*16], m9
+ mova [rsp+32*16+t0*8+3*32*16], m11
+%else
+ sub r3, 8*16
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+10*16]
+ mova m4, [r3+12*16]
+ mova m6, [r3+14*16]
+ packssdw m0, [r3+ 9*16]
+ packssdw m2, [r3+11*16]
+ packssdw m4, [r3+13*16]
+ packssdw m6, [r3+15*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+32*16+r5*8+1*32*16], m0
+ mova [rsp+40*16+r5*8+1*32*16], m2
+ mova [rsp+32*16+t1*8+1*32*16], m1
+ mova [rsp+32*16+t0*8+1*32*16], m3
+ mova m0, [r3+16*16]
+ mova m2, [r3+18*16]
+ mova m4, [r3+20*16]
+ mova m6, [r3+22*16]
+ packssdw m0, [r3+17*16]
+ packssdw m2, [r3+19*16]
+ packssdw m4, [r3+21*16]
+ packssdw m6, [r3+23*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+32*16+r5*8+2*32*16], m0
+ mova [rsp+40*16+r5*8+2*32*16], m2
+ mova [rsp+32*16+t1*8+2*32*16], m1
+ mova [rsp+32*16+t0*8+2*32*16], m3
+ mova m0, [r3+31*16]
+ mova m2, [r3+29*16]
+ mova m4, [r3+27*16]
+ mova m6, [r3+25*16]
+ packssdw m0, [r3+30*16]
+ packssdw m2, [r3+28*16]
+ packssdw m4, [r3+26*16]
+ packssdw m6, [r3+24*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+32*16+r5*8+3*32*16], m0
+ mova [rsp+40*16+r5*8+3*32*16], m2
+ mova [rsp+32*16+t1*8+3*32*16], m1
+ mova [rsp+32*16+t0*8+3*32*16], m3
+ mova m0, [r3+ 0*16]
+ mova m2, [r3+ 2*16]
+ mova m4, [r3+ 4*16]
+ mova m6, [r3+ 6*16]
+ packssdw m0, [r3+ 1*16]
+ packssdw m2, [r3+ 3*16]
+ packssdw m4, [r3+ 5*16]
+ packssdw m6, [r3+ 7*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ pxor m7, m7
+ ; clear lower half of [cq]
+ REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ mova [rsp+32*16+r5*8+0*32*16], m0
+ mova [rsp+40*16+r5*8+0*32*16], m2
+ mova [rsp+32*16+t1*8+0*32*16], m1
+ mova [rsp+32*16+t0*8+0*32*16], m3
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2 code starts here
+ mov eobd, [rsp+gprsize*0+5*32*16]
+ add rsp, 29*16
+ cmp eobd, 36
+ jl .load_veryfast
+ cmp eobd, 136
+ jl .load_fast
+ ; load normal
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+ jmp .run
+.load_fast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ jmp .run
+.load_veryfast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ ; fall-through
+.run:
+%if ARCH_X86_64
+ lea r2, [dstq+64]
+ mov r7, -8
+%else
+ lea r2, [rsp+(4*32+3)*16]
+ mov dword [r2+0*gprsize], 4
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 32
+ add rsp, (5*32+1-(24+8*ARCH_X86_32))*16
+ jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly1
+
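+; For the Nx64 sizes below, pass 1 scatters its packed output rows through the
+; tbl_Nx64_offset table (four destination slots per strip), and pass 2 builds
+; the 64-point column transform from two 8-bit SSSE3 kernels: an idct_8x32
+; main variant plus an idct_16x64 main variant, with the eob threshold
+; (eob < 151 for the 16x64 case) selecting the faster pair.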
+cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \
+ 0-(12+2*64)*16-(4+4*ARCH_X86_32)*gprsize, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 2, 0
+ mov [rsp+gprsize*1+(64*2+12)*16], r0
+ mov [rsp+gprsize*2+(64*2+12)*16], r1
+ mov [rsp+gprsize*3+(64*2+12)*16], r2
+%else
+ DECLARE_REG_TMP 8, 9, 4, 7
+ mov [rsp+gprsize*1+(64*2+12)*16], r9
+%if WIN64
+ mov [rsp+gprsize*2+(64*2+12)*16], r7
+ mov [rsp+gprsize*3+(64*2+12)*16], r8
+%endif
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 7*2
+ cmp eobw, word [o2(tbl_16x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+ mova [rsp+12*16+t0*8], m0
+ mova [rsp+12*16+t1*8], m0
+ mova [rsp+12*16+t2*8], m0
+ mova [rsp+12*16+t3*8], m0
+ mova [rsp+76*16+t0*8], m0
+ mova [rsp+76*16+t1*8], m0
+ mova [rsp+76*16+t2*8], m0
+ mova [rsp+76*16+t3*8], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_16x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+(64*2+12)*16], eobd
+ mov r3, rsp
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 6, 0
+ mov r2, [rsp+gprsize*3+(64*2+12)*16]
+ mov [rsp+gprsize*3+(64*2+12)*16], r6
+%endif
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 1*128+r5*8]
+ mova m1, [cq+ 3*128+r5*8]
+ mova m2, [cq+ 5*128+r5*8]
+ mova m3, [cq+ 7*128+r5*8]
+ mova m4, [cq+ 9*128+r5*8]
+ mova m5, [cq+11*128+r5*8]
+ mova m6, [cq+13*128+r5*8]
+ mova m7, [cq+15*128+r5*8]
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+ 0*128+r5*8]
+ mova m1, [cq+ 2*128+r5*8]
+ mova m2, [cq+ 4*128+r5*8]
+ mova m3, [cq+ 6*128+r5*8]
+ mova m4, [cq+ 8*128+r5*8]
+ mova m5, [cq+10*128+r5*8]
+ mova m6, [cq+12*128+r5*8]
+ mova m7, [cq+14*128+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ call m(idct_16x16_internal_16bpc).round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+%if ARCH_X86_64
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+76*16+t0*8], m8
+ mova [rsp+76*16+t1*8], m9
+ mova [rsp+76*16+t2*8], m10
+ mova [rsp+76*16+t3*8], m11
+%else
+ mova [rsp+76*16+t0*8], m0
+ mova [rsp+76*16+t1*8], m1
+ mova [rsp+76*16+t2*8], m2
+ mova [rsp+76*16+t3*8], m3
+ mova m0, [rsp+ 8*16]
+ mova m2, [rsp+ 9*16]
+ mova m4, [rsp+10*16]
+ mova m6, [rsp+11*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ mova [rsp+12*16+t0*8], m0
+ mova [rsp+12*16+t1*8], m1
+ mova [rsp+12*16+t2*8], m2
+ mova [rsp+12*16+t3*8], m3
+%if ARCH_X86_32
+ mov r6, [rsp+gprsize*3+(64*2+12)*16]
+%endif
+ pxor m7, m7
+ REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2
+ mov eobd, [rsp+gprsize*0+(64*2+12)*16]
+ cmp eobd, 151
+ jl .fast
+ ; fall-through
+%if ARCH_X86_64
+ DECLARE_REG_TMP 8, 9
+%else
+ DECLARE_REG_TMP 1, 5
+%endif
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
+ jmp .run
+.fast:
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
+.run:
+ add rsp, 9*16
+
+%if ARCH_X86_64
+ lea r2, [dstq+32]
+ mov r7, -4
+%else
+ lea r2, [rsp+(64*2+3)*16]
+ mov [r2+4*gprsize], t0
+ mov [r2+5*gprsize], t1
+ mov r1, [r2+2*gprsize]
+ mov dword [r2+0*gprsize], 2
+%endif
+.loop_pass2:
+%if ARCH_X86_32
+ mov dstq, [r2+1*gprsize]
+%endif
+ call .pass2
+ add rsp, 64*16
+%if ARCH_X86_64
+ add r7, 2
+ lea dstq, [r2+r7*8]
+ jl .loop_pass2
+%else
+ add dword [r2+1*gprsize], 16
+ dec dword [r2+0*gprsize]
+ jg .loop_pass2
+%endif
+%assign stack_size (stack_size-(64*2+9)*16)
+%if STACK_ALIGNMENT >= 16
+%assign stack_size_padded (stack_size_padded-(64*2+9)*16)
+%assign stack_offset (stack_offset-(64*2+9)*16)
+%else
+%xdefine rstkm [rsp + stack_size]
+%endif
+%if ARCH_X86_64
+ mov r9, [rsp+gprsize*1+3*16]
+%if WIN64
+ mov r7, [rsp+gprsize*2+3*16]
+ mov r8, [rsp+gprsize*3+3*16]
+%endif
+%endif
+ RET
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m0, [rsp+gprsize+16* 3]
+ mova m1, [rsp+gprsize+16* 4]
+ mova m2, [rsp+gprsize+16* 5]
+ mova m3, [rsp+gprsize+16* 6]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ mova [rsp+gprsize+ 3*16], m0
+ mova [rsp+gprsize+ 4*16], m1
+ mova [rsp+gprsize+ 5*16], m2
+ mova [rsp+gprsize+ 6*16], m3
+ mova [rsp+gprsize+ 7*16], m4
+ mova [rsp+gprsize+ 8*16], m5
+ mova [rsp+gprsize+ 9*16], m6
+ mova [rsp+gprsize+10*16], m7
+ mova m0, [rsp+gprsize+16*11]
+ mova m1, [rsp+gprsize+16*12]
+ mova m2, [rsp+gprsize+16*13]
+ mova m3, [rsp+gprsize+16*14]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+ mova m7, [rsp+gprsize+ 0*16]
+ mova [rsp+gprsize+11*16], m0
+ mova [rsp+gprsize+12*16], m1
+ mova [rsp+gprsize+13*16], m2
+ mova [rsp+gprsize+14*16], m3
+ mova [rsp+gprsize+15*16], m4
+ mova [rsp+gprsize+16*16], m5
+ mova [rsp+gprsize+17*16], m6
+ mova [rsp+gprsize+18*16], m7
+%if ARCH_X86_64
+ call r8
+%else
+ call [r2+4*gprsize]
+%endif
+ mova [rsp+gprsize+ 3*16], m0
+ mova [rsp+gprsize+ 5*16], m2
+ mova [rsp+gprsize+ 8*16], m5
+ mova [rsp+gprsize+10*16], m7
+%if ARCH_X86_64
+ call r9
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%else
+ call [r2+5*gprsize]
+%endif
+ lea r3, [strideq*3]
+ lea r4, [rsp+gprsize+ 3*16]
+%if ARCH_X86_64
+ mov r6d, 8
+%else
+ mov dword [r2+2*gprsize], 8
+%endif
+.loop_write:
+ mova m0, [r4+0*16]
+ mova m1, [r4+1*16]
+ mova m2, [r4+2*16]
+ mova m3, [r4+3*16]
+ mova m4, [r4+4*16]
+ mova m5, [r4+5*16]
+ mova m6, [r4+6*16]
+ mova m7, [r4+7*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ lea dstq, [dstq+strideq*8]
+ add r4, 8*16
+%if ARCH_X86_64
+ dec r6d
+%else
+ dec dword [r2+2*gprsize]
+%endif
+ jg .loop_write
+ ret
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 64
+ add r5d, 640
+ sar r5d, 10
+ add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16
+ jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \
+ 0-(32+4*64)*16-(4+4*ARCH_X86_32)*gprsize, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 2, 0
+ mov [rsp+gprsize*1+(64*4+32)*16], r0
+ mov [rsp+gprsize*2+(64*4+32)*16], r1
+ mov [rsp+gprsize*3+(64*4+32)*16], r2
+%else
+ DECLARE_REG_TMP 8, 9, 4, 7
+ mov [rsp+gprsize*1+(64*4+32)*16], r9
+%if WIN64
+ mov [rsp+gprsize*2+(64*4+32)*16], r7
+ mov [rsp+gprsize*3+(64*4+32)*16], r8
+%endif
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 7*2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+ mova [rsp+ 32*16+t0*8], m0
+ mova [rsp+ 32*16+t1*8], m0
+ mova [rsp+ 32*16+t2*8], m0
+ mova [rsp+ 32*16+t3*8], m0
+ mova [rsp+ 96*16+t0*8], m0
+ mova [rsp+ 96*16+t1*8], m0
+ mova [rsp+ 96*16+t2*8], m0
+ mova [rsp+ 96*16+t3*8], m0
+ mova [rsp+160*16+t0*8], m0
+ mova [rsp+160*16+t1*8], m0
+ mova [rsp+160*16+t2*8], m0
+ mova [rsp+160*16+t3*8], m0
+ mova [rsp+224*16+t0*8], m0
+ mova [rsp+224*16+t1*8], m0
+ mova [rsp+224*16+t2*8], m0
+ mova [rsp+224*16+t3*8], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+(64*4+32)*16], eobd
+ mov r3, rsp
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 6, 0
+ mov r2, [rsp+gprsize*3+(64*4+32)*16]
+ mov [rsp+gprsize*3+(64*4+32)*16], r6
+%endif
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+128* 1+r5*8]
+ mova m1, [cq+128* 7+r5*8]
+ mova m2, [cq+128* 9+r5*8]
+ mova m3, [cq+128*15+r5*8]
+ mova m4, [cq+128*17+r5*8]
+ mova m5, [cq+128*23+r5*8]
+ mova m6, [cq+128*25+r5*8]
+ mova m7, [cq+128*31+r5*8]
+ mov r3, rsp
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
+
+ mova m0, [cq+128* 3+r5*8]
+ mova m1, [cq+128* 5+r5*8]
+ mova m2, [cq+128*11+r5*8]
+ mova m3, [cq+128*13+r5*8]
+ mova m4, [cq+128*19+r5*8]
+ mova m5, [cq+128*21+r5*8]
+ mova m6, [cq+128*27+r5*8]
+ mova m7, [cq+128*29+r5*8]
+%if ARCH_X86_32
+ add r3, 16*8
+%endif
+ call m(idct_8x4_internal_16bpc).rect2_mul
+%if ARCH_X86_32
+ sub r3, 16*8
+%endif
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
+ add r3, 16*(16+4*ARCH_X86_32)
+
+ mova m0, [cq+128* 2+r5*8]
+ mova m1, [cq+128* 6+r5*8]
+ mova m2, [cq+128*10+r5*8]
+ mova m3, [cq+128*14+r5*8]
+ mova m4, [cq+128*18+r5*8]
+ mova m5, [cq+128*22+r5*8]
+ mova m6, [cq+128*26+r5*8]
+ mova m7, [cq+128*30+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+128* 0+r5*8]
+ mova m1, [cq+128* 4+r5*8]
+ mova m2, [cq+128* 8+r5*8]
+ mova m3, [cq+128*12+r5*8]
+ mova m4, [cq+128*16+r5*8]
+ mova m5, [cq+128*20+r5*8]
+ mova m6, [cq+128*24+r5*8]
+ mova m7, [cq+128*28+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ sub r3, 16*(16+4*ARCH_X86_32)
+ call m(inv_txfm_add_dct_dct_32x16_16bpc).round_dct32
+
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+160*16+t0*8], m8
+ mova [rsp+160*16+t1*8], m9
+ mova [rsp+160*16+t2*8], m10
+ mova [rsp+160*16+t3*8], m11
+ mova m8, [r3+16* 9] ; 8 9
+ mova m10, [r3+16*11] ; 10 11
+ mova m12, [r3+16*13] ; 12 13
+ mova m14, [r3+16*15] ; 14 15
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+ 96*16+t0*8], m8
+ mova [rsp+ 96*16+t1*8], m9
+ mova [rsp+ 96*16+t2*8], m10
+ mova [rsp+ 96*16+t3*8], m11
+ mova m8, [r3+16* 8] ; 24 25
+ mova m10, [r3+16*10] ; 26 27
+ mova m12, [r3+16*12] ; 28 29
+ mova m14, [r3+16*14] ; 30 31
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+224*16+t0*8], m8
+ mova [rsp+224*16+t1*8], m9
+ mova [rsp+224*16+t2*8], m10
+ mova [rsp+224*16+t3*8], m11
+%else
+ sub r3, 8*16
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+10*16]
+ mova m4, [r3+12*16]
+ mova m6, [r3+14*16]
+ packssdw m0, [r3+ 9*16]
+ packssdw m2, [r3+11*16]
+ packssdw m4, [r3+13*16]
+ packssdw m6, [r3+15*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+ 96*16+t0*8], m0
+ mova [rsp+ 96*16+t1*8], m1
+ mova [rsp+ 96*16+t2*8], m2
+ mova [rsp+ 96*16+t3*8], m3
+ mova m0, [r3+16*16]
+ mova m2, [r3+18*16]
+ mova m4, [r3+20*16]
+ mova m6, [r3+22*16]
+ packssdw m0, [r3+17*16]
+ packssdw m2, [r3+19*16]
+ packssdw m4, [r3+21*16]
+ packssdw m6, [r3+23*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+160*16+t0*8], m0
+ mova [rsp+160*16+t1*8], m1
+ mova [rsp+160*16+t2*8], m2
+ mova [rsp+160*16+t3*8], m3
+ mova m0, [r3+31*16]
+ mova m2, [r3+29*16]
+ mova m4, [r3+27*16]
+ mova m6, [r3+25*16]
+ packssdw m0, [r3+30*16]
+ packssdw m2, [r3+28*16]
+ packssdw m4, [r3+26*16]
+ packssdw m6, [r3+24*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+224*16+t0*8], m0
+ mova [rsp+224*16+t1*8], m1
+ mova [rsp+224*16+t2*8], m2
+ mova [rsp+224*16+t3*8], m3
+ mova m0, [r3+ 0*16]
+ mova m2, [r3+ 2*16]
+ mova m4, [r3+ 4*16]
+ mova m6, [r3+ 6*16]
+ packssdw m0, [r3+ 1*16]
+ packssdw m2, [r3+ 3*16]
+ packssdw m4, [r3+ 5*16]
+ packssdw m6, [r3+ 7*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ mova [rsp+ 32*16+t0*8], m0
+ mova [rsp+ 32*16+t1*8], m1
+ mova [rsp+ 32*16+t2*8], m2
+ mova [rsp+ 32*16+t3*8], m3
+ pxor m0, m0
+ REPX {mova [cq+x*128+r5*8], m0}, 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+%if ARCH_X86_32
+ mov r6, [rsp+gprsize*3+(64*4+32)*16]
+%endif
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2
+ mov eobd, [rsp+gprsize*0+(64*4+32)*16]
+ cmp eobd, 136
+ jl .fast
+ ; fall-through
+%if ARCH_X86_64
+ DECLARE_REG_TMP 8, 9
+%else
+ DECLARE_REG_TMP 1, 5
+%endif
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
+ jmp .run
+.fast:
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
+.run:
+ add rsp, 29*16
+
+%if ARCH_X86_64
+ lea r2, [dstq+64]
+ mov r7, -8
+%else
+ lea r2, [rsp+(64*4+3)*16]
+ mov [r2+4*gprsize], t0
+ mov [r2+5*gprsize], t1
+ mov r1, [r2+2*gprsize]
+ mov dword [r2+0*gprsize], 4
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 64
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
+ add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16
+ jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+ ; remove entirely-zero iterations
+%undef cmp
+ mov r5d, 8
+.zero_loop:
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x16_2d)+r5]
+ jl .zero_loop
+
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+
+ mov r3, rsp
+ lea r4, [o(idct64_mul_16bpc)]
+ mova m0, [cq+64* 1+r5*8]
+ mova m1, [cq+64*31+r5*8]
+ mova m2, [cq+64*17+r5*8]
+ mova m3, [cq+64*15+r5*8]
+ call .main_part1
+ mova m0, [cq+64* 7+r5*8]
+ mova m1, [cq+64*25+r5*8]
+ mova m2, [cq+64*23+r5*8]
+ mova m3, [cq+64* 9+r5*8]
+ call .main_part1
+ mova m0, [cq+64* 5+r5*8]
+ mova m1, [cq+64*27+r5*8]
+ mova m2, [cq+64*21+r5*8]
+ mova m3, [cq+64*11+r5*8]
+ call .main_part1
+ mova m0, [cq+64* 3+r5*8]
+ mova m1, [cq+64*29+r5*8]
+ mova m2, [cq+64*19+r5*8]
+ mova m3, [cq+64*13+r5*8]
+ call .main_part1
+ call .main_part2
+
+ mova m0, [cq+64* 2+r5*8]
+ mova m1, [cq+64*14+r5*8]
+ mova m2, [cq+64*18+r5*8]
+ mova m3, [cq+64*30+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
+
+ mova m0, [cq+64* 6+r5*8]
+ mova m1, [cq+64*10+r5*8]
+ mova m2, [cq+64*22+r5*8]
+ mova m3, [cq+64*26+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
+ add r3, 16*(24+4*ARCH_X86_32)
+
+ mova m0, [cq+64* 4+r5*8]
+ mova m1, [cq+64*12+r5*8]
+ mova m2, [cq+64*20+r5*8]
+ mova m3, [cq+64*28+r5*8]
+ call m(idct_16x4_internal_16bpc).main_oddhalf_fast
+
+ mova m0, [cq+64* 0+r5*8]
+ mova m1, [cq+64* 8+r5*8]
+ mova m2, [cq+64*16+r5*8]
+ mova m3, [cq+64*24+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1_fast
+ call m(idct_8x4_internal_16bpc).round
+ mova [r3-(7+4*ARCH_X86_32)*16], m1
+ mova [r3-(6+4*ARCH_X86_32)*16], m2
+ mova [r3-(5+4*ARCH_X86_32)*16], m3
+ mova [r3-(4+4*ARCH_X86_32)*16], m4
+ mova [r3-(3+4*ARCH_X86_32)*16], m5
+ mova [r3-(2+4*ARCH_X86_32)*16], m6
+ mova [r3-(1+4*ARCH_X86_32)*16], m7
+ sub r3, 16*(40+4*ARCH_X86_32-4)
+
+%if ARCH_X86_64
+ psrld m15, m11, 10 ; pd_2
+%else
+ mova m7, [o(pd_2)]
+%endif
+ call .main_end_loop_start
+
+ lea r3, [rsp+56*16]
+ lea r4, [cq+r5*8+64*28]
+ call .shift_transpose
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2, we need to call this otherwise the stack pointer has
+ ; the wrong offset in the 8-bit code
+ call .pass2
+ RET
+
+.pass2:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%endif
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ lea r3, [strideq*3]
+ mov r4d, 8
+ jmp m(idct_16x16_internal_16bpc).loop_pass2
+
+.main_part1: ; idct64 steps 1-5
+ ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+%if ARCH_X86_64
+ movd m7, [r4+4*0]
+ movd m8, [r4+4*1]
+ movd m6, [r4+4*2]
+ movd m9, [r4+4*3]
+ movd m5, [r4+4*4]
+ movd m10, [r4+4*5]
+ movd m4, [r4+4*6]
+ movd m15, [r4+4*7]
+ REPX {pshufd x, x, q0000}, m7, m8, m6, m9, m5, m10, m4, m15
+ pmulld m7, m0 ; t63a
+ pmulld m0, m8 ; t32a
+ pmulld m6, m1 ; t62a
+ pmulld m1, m9 ; t33a
+ pmulld m5, m2 ; t61a
+ pmulld m2, m10 ; t34a
+ pmulld m4, m3 ; t60a
+ pmulld m3, m15 ; t35a
+ movd m10, [r4+4*8]
+ movd m15, [r4+4*9]
+ REPX {pshufd x, x, q0000}, m10, m15
+ REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3
+ REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
+ psubd m8, m0, m1 ; t33
+ paddd m0, m1 ; t32
+ psubd m1, m7, m6 ; t62
+ paddd m7, m6 ; t63
+ psubd m6, m3, m2 ; t34
+ paddd m3, m2 ; t35
+ psubd m2, m4, m5 ; t61
+ paddd m4, m5 ; t60
+ REPX {pmaxsd x, m12}, m8, m1, m6, m2
+ REPX {pminsd x, m13}, m8, m1, m6, m2
+ ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a
+ ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a
+ REPX {pmaxsd x, m12}, m0, m3, m7, m4
+ REPX {pminsd x, m13}, m0, m3, m7, m4
+ movd m10, [r4+4*10]
+ movd m15, [r4+4*11]
+ REPX {pshufd x, x, q0000}, m10, m15
+ psubd m5, m0, m3 ; t35a
+ paddd m0, m3 ; t32a
+ psubd m3, m7, m4 ; t60a
+ paddd m7, m4 ; t63a
+ psubd m4, m1, m6 ; t34
+ paddd m1, m6 ; t33
+ psubd m6, m8, m2 ; t61
+ paddd m8, m2 ; t62
+ REPX {pmaxsd x, m12}, m5, m3, m4, m6
+ REPX {pminsd x, m13}, m5, m3, m4, m6
+ ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60
+ ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
+ REPX {pmaxsd x, m12}, m0, m7, m1, m8
+ REPX {pminsd x, m13}, m0, m7, m1, m8
+ add r4, 4*12
+ mova [r3+16*0], m0
+ mova [r3+16*7], m7
+ mova [r3+16*1], m1
+ mova [r3+16*6], m8
+ mova [r3+16*2], m6
+ mova [r3+16*5], m4
+ mova [r3+16*3], m3
+ mova [r3+16*4], m5
+%else
+ movd m7, [r4+4*0]
+ movd m6, [r4+4*2]
+ movd m5, [r4+4*4]
+ movd m4, [r4+4*6]
+ REPX {pshufd x, x, q0000}, m7, m6, m5, m4
+ pmulld m7, m0 ; t63a
+ pmulld m6, m1 ; t62a
+ pmulld m5, m2 ; t61a
+ pmulld m4, m3 ; t60a
+ mova [r3+0*16], m6
+ mova [r3+1*16], m7
+ movd m6, [r4+4*1]
+ movd m7, [r4+4*3]
+ REPX {pshufd x, x, q0000}, m7, m6
+ pmulld m0, m6 ; t32a
+ pmulld m1, m7 ; t33a
+ movd m6, [r4+4*5]
+ movd m7, [r4+4*7]
+ REPX {pshufd x, x, q0000}, m7, m6
+ pmulld m2, m6 ; t34a
+ pmulld m3, m7 ; t35a
+ mova m6, [r3+0*16]
+ mova m7, [o(pd_2048)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3+1*16]
+ REPX {psrad x, 12}, m0, m1, m7, m6, m2, m3, m5, m4
+ mova [r3+0*16], m5
+ psubd m5, m0, m1 ; t33
+ paddd m0, m1 ; t32
+ mova [r3+1*16], m0
+ mova m0, [r3+0*16]
+ psubd m1, m7, m6 ; t62
+ paddd m7, m6 ; t63
+ psubd m6, m3, m2 ; t34
+ paddd m3, m2 ; t35
+ psubd m2, m4, m0 ; t61
+ paddd m4, m0 ; t60
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m5, m1, m7, m6, m3, m2, m4
+ pmaxsd m0, [r3+1*16]
+ mova [r3+0*16], m0
+ mova m0, [o(clip_18b_max)]
+ REPX {pminsd x, m0}, m5, m1, m7, m6, m3, m2, m4
+ pminsd m0, [r3+0*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m3
+ mova [r3+2*16], m4
+ mova [r3+3*16], m7
+ mova m0, [o(pd_2048)]
+ movd m3, [r4+4*8]
+ movd m4, [r4+4*9]
+ REPX {pshufd x, x, q0000}, m3, m4
+ mova [r3+4*16], m2
+ ITX_MULSUB_2D 1, 5, 2, 7, _, 0, 3, 4 ; t33a, t62a
+ mova m2, [r3+4*16]
+ mova [r3+4*16], m5
+ ITX_MULSUB_2D 2, 6, 5, 7, _, 0, 3, 4, 4 ; t61a, t34a
+ mova m0, [r3+0*16]
+ mova m3, [r3+1*16]
+ mova m4, [r3+2*16]
+ mova m7, [r3+3*16]
+ psubd m5, m0, m3 ; t35a
+ paddd m0, m3 ; t32a
+ mova [r3+0*16], m5
+ mova m5, [r3+4*16]
+ psubd m3, m7, m4 ; t60a
+ paddd m7, m4 ; t63a
+ psubd m4, m1, m6 ; t34
+ paddd m1, m6 ; t33
+ psubd m6, m5, m2 ; t61
+ paddd m2, m5 ; t62
+ mova m5, [o(clip_18b_min)]
+ REPX {pmaxsd x, m5}, m0, m3, m7, m4, m1, m6, m2
+ pmaxsd m5, [r3+0*16]
+ mova [r3+0*16], m5
+ mova m5, [o(clip_18b_max)]
+ REPX {pminsd x, m5}, m0, m3, m7, m4, m1, m6, m2
+ pminsd m5, [r3+0*16]
+ mova [r3+16*0], m0
+ mova [r3+16*7], m7
+ mova [r3+16*1], m1
+ mova [r3+16*6], m2
+ mova [r3+16*2], m4
+ mova m7, [o(pd_2048)]
+ movd m0, [r4+4*10]
+ movd m1, [r4+4*11]
+ REPX {pshufd x, x, q0000}, m0, m1
+ ITX_MULSUB_2D 3, 5, 2, 4, _, 7, 0, 1 ; t35, t60
+ mova [r3+16*3], m3
+ mova [r3+16*4], m5
+ mova m4, [r3+2*16]
+ ITX_MULSUB_2D 6, 4, 2, 3, _, 7, 0, 1 ; t34a, t61a
+ add r4, 4*12
+ mova [r3+16*2], m6
+ mova [r3+16*5], m4
+%endif
+ add r3, 16*8
+ ret
+
+.main_part2: ; idct64 steps 6-9
+ lea r4, [r3+16*7]
+%if ARCH_X86_64
+ mova m10, [o(pd_1567)]
+ mova m15, [o(pd_3784)]
+.main_part2_loop:
+ mova m0, [r3-16*32] ; t32a
+ mova m1, [r4-16*24] ; t39a
+ mova m2, [r4-16*32] ; t63a
+ mova m3, [r3-16*24] ; t56a
+ mova m4, [r3-16*16] ; t40a
+ mova m5, [r4-16* 8] ; t47a
+ mova m6, [r4-16*16] ; t55a
+ mova m7, [r3-16* 8] ; t48a
+ psubd m8, m0, m1 ; t39
+ paddd m0, m1 ; t32
+ psubd m1, m2, m3 ; t56
+ paddd m2, m3 ; t63
+ psubd m3, m5, m4 ; t40
+ paddd m5, m4 ; t47
+ psubd m4, m7, m6 ; t55
+ paddd m7, m6 ; t48
+ REPX {pmaxsd x, m12}, m8, m1, m3, m4
+ REPX {pminsd x, m13}, m8, m1, m3, m4
+ ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a
+ ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a
+ REPX {pmaxsd x, m12}, m0, m2, m5, m7
+ REPX {pminsd x, m13}, m0, m5, m2, m7
+ psubd m6, m2, m7 ; t48a
+ paddd m2, m7 ; t63a
+ psubd m7, m0, m5 ; t47a
+ paddd m0, m5 ; t32a
+ psubd m5, m8, m4 ; t55
+ paddd m8, m4 ; t56
+ psubd m4, m1, m3 ; t40
+ paddd m1, m3 ; t39
+ REPX {pmaxsd x, m12}, m6, m7, m5, m4
+ REPX {pminsd x, m13}, m6, m7, m5, m4
+ REPX {pmulld x, m14}, m6, m7, m5, m4
+ REPX {pmaxsd x, m12}, m2, m0, m8, m1
+ REPX {pminsd x, m13}, m2, m0, m8, m1
+ paddd m6, m11
+ paddd m5, m11
+ psubd m3, m6, m7 ; t47
+ paddd m6, m7 ; t48
+ psubd m7, m5, m4 ; t40a
+ paddd m5, m4 ; t55a
+ REPX {psrad x, 12}, m3, m6, m7, m5
+ mova [r4-16* 8], m2
+ mova [r3-16*32], m0
+ mova [r3-16* 8], m8
+ mova [r4-16*32], m1
+ mova [r4-16*24], m3
+ mova [r3-16*16], m6
+ mova [r3-16*24], m7
+ mova [r4-16*16], m5
+%else
+.main_part2_loop:
+ mova m0, [r3-16*32] ; t32a
+ mova m1, [r4-16*24] ; t39a
+ mova m2, [r4-16*32] ; t63a
+ mova m3, [r3-16*24] ; t56a
+ mova m4, [r3-16*16] ; t40a
+ mova m5, [r4-16* 8] ; t47a
+ mova m6, [r4-16*16] ; t55a
+ psubd m7, m0, m1 ; t39
+ paddd m0, m1 ; t32
+ mova [r3+0*16], m7
+ mova m7, [r3-16* 8] ; t48a
+ psubd m1, m2, m3 ; t56
+ paddd m2, m3 ; t63
+ psubd m3, m5, m4 ; t40
+ paddd m5, m4 ; t47
+ psubd m4, m7, m6 ; t55
+ paddd m7, m6 ; t48
+ mova m6, [o(clip_18b_min)]
+ REPX {pmaxsd x, m6}, m0, m1, m2, m3, m5, m4, m7
+ pmaxsd m6, [r3+0*16]
+ mova [r3+0*16], m6
+ mova m6, [o(clip_18b_max)]
+ REPX {pminsd x, m6}, m0, m1, m2, m3, m5, m4, m7
+ pminsd m6, [r3+0*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m2
+ mova [r3+2*16], m5
+ mova [r3+3*16], m7
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 1, 6, 2, 5, 7, 0, 1567, 3784 ; t39a, t56a
+ ITX_MULSUB_2D 4, 3, 2, 5, _, 0, 7, 3784, 4 ; t55a, t40a
+ mova m2, [r3+1*16]
+ mova m7, [r3+3*16]
+ psubd m5, m2, m7 ; t48a
+ paddd m2, m7 ; t63a
+ mova [r3+1*16], m5
+ mova m0, [r3+0*16]
+ mova m5, [r3+2*16]
+ psubd m7, m0, m5 ; t47a
+ paddd m0, m5 ; t32a
+ psubd m5, m6, m4 ; t55
+ paddd m6, m4 ; t56
+ psubd m4, m1, m3 ; t40
+ paddd m1, m3 ; t39
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m2, m7, m0, m5, m6, m4, m1
+ pmaxsd m3, [r3+1*16]
+ mova [r3+0*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m2, m7, m0, m5, m6, m4, m1
+ pminsd m3, [r3+0*16]
+ mova [r4-16* 8], m2
+ mova [r3-16*32], m0
+ mova [r3-16* 8], m6
+ mova [r4-16*32], m1
+ mova m0, [o(pd_2896)]
+ mova m1, [o(pd_2048)]
+ REPX {pmulld x, m0}, m3, m7, m5, m4
+ REPX {paddd x, m1}, m3, m5
+ psubd m6, m3, m7 ; t47
+ paddd m3, m7 ; t48
+ psubd m7, m5, m4 ; t40a
+ paddd m5, m4 ; t55a
+ REPX {psrad x, 12}, m6, m3, m7, m5
+ mova [r4-16*24], m6
+ mova [r3-16*16], m3
+ mova [r3-16*24], m7
+ mova [r4-16*16], m5
+%endif
+ add r3, 16
+ sub r4, 16
+ cmp r3, r4
+ jl .main_part2_loop
+ sub r3, 4*16
+ ret
+
+.main_end_loop:
+ mova m0, [r3+16*28] ; idct8 0 + n
+.main_end_loop_start:
+ mova m2, [r3+16*12] ; idct32 16 + n
+ mova m3, [r4+16*12] ; idct32 31 - n
+%if ARCH_X86_64
+ mova m1, [r4+16*28] ; idct16 15 - n
+ mova m4, [r4-16* 4] ; idct64 63 - n
+ mova m5, [r3-16* 4] ; idct64 48 + n
+ mova m6, [r4-16*20] ; idct64 47 - n
+ mova m7, [r3-16*20] ; idct64 32 + n
+ pmaxsd m0, m12
+ pminsd m0, m13
+ paddd m8, m0, m1 ; idct16 out0 + n
+ psubd m0, m1 ; idct16 out15 - n
+ REPX {pmaxsd x, m12}, m8, m0
+ REPX {pminsd x, m13}, m8, m0
+ paddd m1, m8, m3 ; idct32 out0 + n
+ psubd m8, m3 ; idct32 out31 - n
+ paddd m3, m0, m2 ; idct32 out15 - n
+ psubd m0, m2 ; idct32 out16 + n
+ REPX {pmaxsd x, m12}, m1, m8, m3, m0
+ REPX {pminsd x, m13}, m1, m3, m8, m0
+ REPX {paddd x, m15}, m1, m3, m0, m8
+ paddd m2, m1, m4 ; idct64 out0 + n (unshifted)
+ psubd m1, m4 ; idct64 out63 - n (unshifted)
+ paddd m4, m3, m5 ; idct64 out15 - n (unshifted)
+ psubd m3, m5 ; idct64 out48 + n (unshifted)
+ paddd m5, m0, m6 ; idct64 out16 + n (unshifted)
+ psubd m0, m6 ; idct64 out47 - n (unshifted)
+ paddd m6, m8, m7 ; idct64 out31 - n (unshifted)
+ psubd m8, m7 ; idct64 out32 + n (unshifted)
+ mova [r3-16*20], m2
+ mova [r4+16*28], m1
+ mova [r4-16*20], m4
+ mova [r3+16*28], m3
+ mova [r3-16* 4], m5
+ mova [r4+16*12], m0
+ mova [r4-16* 4], m6
+ mova [r3+16*12], m8
+%else
+ mova m5, [o(clip_18b_min)]
+ mova m6, [o(clip_18b_max)]
+ mova m1, [r3+16*44] ; idct16 15 - n
+ pmaxsd m0, m5
+ pminsd m0, m6
+ paddd m4, m0, m1 ; idct16 out0 + n
+ psubd m0, m1 ; idct16 out15 - n
+ REPX {pmaxsd x, m5}, m4, m0
+ REPX {pminsd x, m6}, m4, m0
+ paddd m1, m4, m3 ; idct32 out0 + n
+ psubd m4, m3 ; idct32 out31 - n
+ paddd m3, m0, m2 ; idct32 out15 - n
+ psubd m0, m2 ; idct32 out16 + n
+ REPX {pmaxsd x, m5}, m1, m4, m3, m0
+ REPX {pminsd x, m6}, m1, m3, m4, m0
+ REPX {paddd x, m7}, m1, m3, m0, m4
+ mova m5, [r4-16* 4] ; idct64 63 - n
+ mova m6, [r3-16* 4] ; idct64 48 + n
+ paddd m2, m1, m5 ; idct64 out0 + n (unshifted)
+ psubd m1, m5 ; idct64 out63 - n (unshifted)
+ paddd m5, m3, m6 ; idct64 out15 - n (unshifted)
+ psubd m3, m6 ; idct64 out48 + n (unshifted)
+ mova [r4+16*28], m1
+ mova [r3+16*28], m3
+ mova m6, [r4-16*20] ; idct64 47 - n
+ mova m1, [r3-16*20] ; idct64 32 + n
+ mova [r3-16*20], m2
+ mova [r4-16*20], m5
+ paddd m5, m0, m6 ; idct64 out16 + n (unshifted)
+ psubd m0, m6 ; idct64 out47 - n (unshifted)
+ paddd m6, m4, m1 ; idct64 out31 - n (unshifted)
+ psubd m4, m1 ; idct64 out32 + n (unshifted)
+ mova [r3-16* 4], m5
+ mova [r4+16*12], m0
+ mova [r4-16* 4], m6
+ mova [r3+16*12], m4
+%endif
+ sub r4, 16
+ add r3, 16
+ cmp r3, r4
+ jl .main_end_loop
+ ret
+
+.shift_transpose:
+ mova m0, [r3+0*16]
+ mova m1, [r3+1*16]
+ mova m2, [r3+2*16]
+ mova m3, [r3+3*16]
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [r4+0*64], m0
+ mova [r4+1*64], m1
+ mova [r4+2*64], m2
+ mova [r4+3*64], m3
+ sub r4, 4*64
+ sub r3, 8*16
+ cmp r3, rsp
+ jg .shift_transpose
+ ret
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 16
+.dconly1:
+ add r5d, 640
+ sar r5d, 10
+.dconly2:
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m6, [o(pixel_10bpc_max)]
+ pxor m5, m5
+.dconly_loop:
+ paddw m1, m0, [dstq+16*0]
+ paddw m2, m0, [dstq+16*1]
+ paddw m3, m0, [dstq+16*2]
+ paddw m4, m0, [dstq+16*3]
+ REPX {pmaxsw x, m5}, m1, m2, m3, m4
+ REPX {pminsw x, m6}, m1, m2, m3, m4
+ mova [dstq+16*0], m1
+ mova [dstq+16*1], m2
+ mova [dstq+16*2], m3
+ mova [dstq+16*3], m4
+ add dstq, 64
+ btc r3d, 16
+ jnc .dconly_loop
+ lea dstq, [dstq+strideq-128]
+ dec r3d
+ jg .dconly_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \
+ 0-(1+64+8*ARCH_X86_32+8*32+1*WIN64)*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 0, 4, 1
+ mov [rsp+(8*32+64+8)*16+1*gprsize], dstq
+ mov [rsp+(8*32+64+8)*16+2*gprsize], strideq
+%else
+ DECLARE_REG_TMP 4, 7, 8
+%if WIN64
+ mov [rsp+(8*32+64+1)*16+1*gprsize], r7
+ mov [rsp+64*16+0*gprsize], r8
+%endif
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 14
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ lea t2, [rsp+7*32*16]
+.zero_loop_inner:
+ mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
+ mova [t2+(72+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
+ mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t0*8], m0
+ mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t1*8], m0
+ sub t2, 32*16
+ cmp t2, rsp
+ jge .zero_loop_inner
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ mov [rsp+(8*32+64+8*ARCH_X86_32+1*WIN64)*16+0*gprsize], eobd
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+
+ mov r3, rsp
+ lea r4, [o(idct64_mul_16bpc)]
+ mova m0, [cq+128* 1+r5*8]
+ mova m1, [cq+128*31+r5*8]
+ mova m2, [cq+128*17+r5*8]
+ mova m3, [cq+128*15+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 7+r5*8]
+ mova m1, [cq+128*25+r5*8]
+ mova m2, [cq+128*23+r5*8]
+ mova m3, [cq+128* 9+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 5+r5*8]
+ mova m1, [cq+128*27+r5*8]
+ mova m2, [cq+128*21+r5*8]
+ mova m3, [cq+128*11+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 3+r5*8]
+ mova m1, [cq+128*29+r5*8]
+ mova m2, [cq+128*19+r5*8]
+ mova m3, [cq+128*13+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
+
+ mova m0, [cq+128* 2+r5*8]
+ mova m1, [cq+128*14+r5*8]
+ mova m2, [cq+128*18+r5*8]
+ mova m3, [cq+128*30+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
+
+ mova m0, [cq+128* 6+r5*8]
+ mova m1, [cq+128*10+r5*8]
+ mova m2, [cq+128*22+r5*8]
+ mova m3, [cq+128*26+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
+ add r3, 16*(24+4*ARCH_X86_32)
+
+ mova m0, [cq+128* 4+r5*8]
+ mova m1, [cq+128*12+r5*8]
+ mova m2, [cq+128*20+r5*8]
+ mova m3, [cq+128*28+r5*8]
+ call .rect2_mul_fast
+ call m(idct_16x4_internal_16bpc).main_oddhalf_fast
+
+ mova m0, [cq+128* 0+r5*8]
+ mova m1, [cq+128* 8+r5*8]
+ mova m2, [cq+128*16+r5*8]
+ mova m3, [cq+128*24+r5*8]
+ call .rect2_mul_fast
+ call m(idct_8x4_internal_16bpc).main_pass1_fast
+ call m(idct_8x4_internal_16bpc).round
+ mova [r3-(7+4*ARCH_X86_32)*16], m1
+ mova [r3-(6+4*ARCH_X86_32)*16], m2
+ mova [r3-(5+4*ARCH_X86_32)*16], m3
+ mova [r3-(4+4*ARCH_X86_32)*16], m4
+ mova [r3-(3+4*ARCH_X86_32)*16], m5
+ mova [r3-(2+4*ARCH_X86_32)*16], m6
+ mova [r3-(1+4*ARCH_X86_32)*16], m7
+ sub r3, 16*(40+4*ARCH_X86_32-4)
+
+%if ARCH_X86_64
+ psrld m15, m11, 11 ; pd_1
+%else
+ mova m7, [o(pd_1)]
+%endif
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
+
+ lea r3, [rsp+56*16]
+ lea t2, [rsp+7*32*16+(64+8*ARCH_X86_32+1*WIN64)*16]
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ call .shift_transpose
+ ; zero cq
+ pxor m7, m7
+ lea r4, [cq+30*128+r5*8]
+.zero_cq_loop:
+ REPX {mova [r4+x*128], m7}, -2, -1, 0, 1
+ sub r4, 4*128
+ cmp r4, cq
+ jg .zero_cq_loop
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2 code starts here
+ mov eobd, [rsp+gprsize*0+(8*32+64+8*ARCH_X86_32+1*WIN64)*16]
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize*2+(8*32+64+8)*16]
+%elif WIN64
+ mov r8, [rsp+gprsize*0+64*16]
+%endif
+ add rsp, (64+8*ARCH_X86_32+1*WIN64-3)*16
+ cmp eobd, 36
+ jl .load_veryfast
+ cmp eobd, 136
+ jl .load_fast
+ ; load normal
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+ jmp .run
+.load_fast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ jmp .run
+.load_veryfast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ ; fall-through
+.run:
+%if ARCH_X86_64
+ lea r2, [dstq+128]
+ mov r7, -16
+%else
+ lea r2, [rsp+(8*32+3)*16]
+ mov dword [r2+0*gprsize], 8
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
+
+.rect2_mul_fast:
+%if ARCH_X86_64
+ REPX {pmulld x, m14}, m0, m1, m2, m3
+ REPX {paddd x, m11}, m0, m1, m2, m3
+%else
+ mova m4, [o(pd_2896)]
+ mova m5, [o(pd_2048)]
+ REPX {pmulld x, m4 }, m0, m1, m2, m3
+ REPX {paddd x, m5 }, m0, m1, m2, m3
+%endif
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ ret
+
+.shift_transpose:
+ mova m0, [r3+0*16]
+ mova m1, [r3+1*16]
+ mova m2, [r3+2*16]
+ mova m3, [r3+3*16]
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [t2+0*16+r5*8], m0
+ mova [t2+8*16+r5*8], m2
+ mova [t2+0*16+t0*8], m3
+ mova [t2+0*16+t1*8], m1
+ sub t2, 16*32
+ sub r3, 8*16
+ cmp r3, rsp
+ jg .shift_transpose
+ ret
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 32
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
+ add rsp, (1+8*32+1*WIN64)*16
+ jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \
+ 0-(64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16-(4+4*ARCH_X86_32)*gprsize, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 2, 0, 6
+ mov [rsp+gprsize*1+(64*9+8)*16], r0
+ mov [rsp+gprsize*2+(64*9+8)*16], r1
+ mov [rsp+gprsize*3+(64*9+8)*16], r2
+ mov [rsp+gprsize*4+(64*9+8)*16], r6
+%else
+ DECLARE_REG_TMP 8, 9, 4, 7, 0
+ mov [rsp+gprsize*1+(64*9+1)*16], r9
+ mov [rsp+gprsize*0+64*16], r0
+%if WIN64
+ mov [rsp+gprsize*2+(64*9+1)*16], r7
+ mov [rsp+gprsize*3+(64*9+1)*16], r8
+%endif
+%endif
+%undef cmp
+
+ ; remove entirely-zero iterations
+ mov r5d, 14
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+ lea t4, [rsp+7*64*16]
+.zero_loop_inner:
+ mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t0*8], m0
+ mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t1*8], m0
+ mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t2*8], m0
+ mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t3*8], m0
+ sub t4, 64*16
+ cmp t4, rsp
+ jge .zero_loop_inner
+%if ARCH_X86_32
+ mov r6, [rsp+gprsize*4+(64*9+8)*16]
+%endif
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ mov [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16], eobd
+%if ARCH_X86_32
+ mov cq, [rsp+gprsize*3+(64*9+8)*16]
+%endif
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+
+ mov r3, rsp
+ lea r4, [o(idct64_mul_16bpc)]
+ mova m0, [cq+128* 1+r5*8]
+ mova m1, [cq+128*31+r5*8]
+ mova m2, [cq+128*17+r5*8]
+ mova m3, [cq+128*15+r5*8]
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 7+r5*8]
+ mova m1, [cq+128*25+r5*8]
+ mova m2, [cq+128*23+r5*8]
+ mova m3, [cq+128* 9+r5*8]
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 5+r5*8]
+ mova m1, [cq+128*27+r5*8]
+ mova m2, [cq+128*21+r5*8]
+ mova m3, [cq+128*11+r5*8]
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 3+r5*8]
+ mova m1, [cq+128*29+r5*8]
+ mova m2, [cq+128*19+r5*8]
+ mova m3, [cq+128*13+r5*8]
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
+
+ mova m0, [cq+128* 2+r5*8]
+ mova m1, [cq+128*14+r5*8]
+ mova m2, [cq+128*18+r5*8]
+ mova m3, [cq+128*30+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
+
+ mova m0, [cq+128* 6+r5*8]
+ mova m1, [cq+128*10+r5*8]
+ mova m2, [cq+128*22+r5*8]
+ mova m3, [cq+128*26+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
+ add r3, 16*(24+4*ARCH_X86_32)
+
+ mova m0, [cq+128* 4+r5*8]
+ mova m1, [cq+128*12+r5*8]
+ mova m2, [cq+128*20+r5*8]
+ mova m3, [cq+128*28+r5*8]
+ call m(idct_16x4_internal_16bpc).main_oddhalf_fast
+
+ mova m0, [cq+128* 0+r5*8]
+ mova m1, [cq+128* 8+r5*8]
+ mova m2, [cq+128*16+r5*8]
+ mova m3, [cq+128*24+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1_fast
+ call m(idct_8x4_internal_16bpc).round
+ mova [r3-(7+4*ARCH_X86_32)*16], m1
+ mova [r3-(6+4*ARCH_X86_32)*16], m2
+ mova [r3-(5+4*ARCH_X86_32)*16], m3
+ mova [r3-(4+4*ARCH_X86_32)*16], m4
+ mova [r3-(3+4*ARCH_X86_32)*16], m5
+ mova [r3-(2+4*ARCH_X86_32)*16], m6
+ mova [r3-(1+4*ARCH_X86_32)*16], m7
+ sub r3, 16*(40+4*ARCH_X86_32-4)
+
+%if ARCH_X86_64
+ psrld m15, m11, 10 ; pd_2
+%else
+ mova m7, [o(pd_2)]
+%endif
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
+
+ lea r3, [rsp+56*16]
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+ lea t4, [rsp+7*64*16+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
+ call .shift_transpose
+ ; zero cq
+ pxor m7, m7
+%if ARCH_X86_32
+ mov cq, [rsp+gprsize*3+(64*9+8)*16]
+%endif
+ lea r4, [cq+30*128+r5*8]
+.zero_cq_loop:
+ REPX {mova [r4+x*128], m7}, -2, -1, 0, 1
+ sub r4, 4*128
+ cmp r4, cq
+ jg .zero_cq_loop
+%if ARCH_X86_32
+ mov r6, [rsp+gprsize*4+(64*9+8)*16]
+%endif
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2 code starts here
+ mov eobd, [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize*2+(9*64+8)*16]
+%else
+ mov r0, [rsp+gprsize*0+64*16]
+%endif
+ add rsp, (64+8*ARCH_X86_32+1*ARCH_X86_64-3)*16
+ cmp eobd, 151
+ jl .fast
+ ; fall-through
+%if ARCH_X86_64
+ DECLARE_REG_TMP 8, 9
+%else
+ DECLARE_REG_TMP 1, 5
+%endif
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
+ jmp .run
+.fast:
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
+.run:
+
+%if ARCH_X86_64
+ lea r2, [dstq+128]
+ mov r7, -16
+%else
+ lea r2, [rsp+(64*8+3)*16]
+ mov [r2+4*gprsize], t0
+ mov [r2+5*gprsize], t1
+ mov r1, [r2+2*gprsize]
+ mov dword [r2+0*gprsize], 8
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
+
+ ; copy of pass=1 tmp-regs
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 2, 0, 6
+%else
+ DECLARE_REG_TMP 8, 9, 4, 7, 0
+%endif
+
+.shift_transpose:
+ mova m0, [r3+0*16]
+ mova m1, [r3+1*16]
+ mova m2, [r3+2*16]
+ mova m3, [r3+3*16]
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [t4+t0*8], m0
+ mova [t4+t1*8], m1
+ mova [t4+t2*8], m2
+ mova [t4+t3*8], m3
+ sub t4, 16*64
+ sub r3, 8*16
+ cmp r3, rsp
+ jg .shift_transpose
+ ret
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 64
+ add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \
+ (4+4*ARCH_X86_32)*gprsize - (64+8*ARCH_X86_32)*16
+ jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly1
diff --git a/third_party/dav1d/src/x86/itx_avx2.asm b/third_party/dav1d/src/x86/itx_avx2.asm
new file mode 100644
index 0000000000..a67f053a61
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx_avx2.asm
@@ -0,0 +1,5542 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 16
+
+; Note: The order of (at least some of) those constants matters!
+
+const deint_shuf, db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+
+%macro COEF_PAIR 2
+pw_%1_%2: dw %1, %2
+pw_m%2_%1: dw -%2, %1
+%endmacro
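+; For example, "COEF_PAIR 799, 4017" (invoked further below) expands to the
+; two constant pairs
+;   pw_799_4017:  dw   799, 4017
+;   pw_m4017_799: dw -4017,  799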
+
+; ADST-only
+pw_3803_1321: dw 3803, 1321
+pw_m1321_2482: dw -1321, 2482
+pw_2482_3344: dw 2482, 3344
+pw_m3344_3344: dw -3344, 3344
+pw_m3803_3344: dw -3803, 3344
+pw_m3803_m6688: dw -3803, -6688
+pw_2896_m2896: dw 2896, -2896
+
+const pw_5, times 2 dw 5
+const pw_2048, times 2 dw 2048
+const pw_4096, times 2 dw 4096
+const pw_8192, times 2 dw 8192
+const pw_16384, times 2 dw 16384
+const pw_1697x16, times 2 dw 1697*16
+const pw_1697x8, times 2 dw 1697*8
+const pw_2896x8, times 2 dw 2896*8
+const pd_2048, dd 2048
+
+const pw_2896_2896, dw 2896, 2896
+const pw_m2896_2896, dw -2896, 2896
+const pw_1567_3784, dw 1567, 3784
+const pw_m3784_1567, dw -3784, 1567
+COEF_PAIR 3784, 1567
+COEF_PAIR 201, 4091
+COEF_PAIR 995, 3973
+COEF_PAIR 1751, 3703
+COEF_PAIR 2440, 3290
+COEF_PAIR 3035, 2751
+COEF_PAIR 3513, 2106
+COEF_PAIR 3857, 1380
+COEF_PAIR 4052, 601
+COEF_PAIR 401, 4076
+COEF_PAIR 1931, 3612
+COEF_PAIR 3166, 2598
+COEF_PAIR 3920, 1189
+COEF_PAIR 799, 4017
+COEF_PAIR 3406, 2276
+pw_m799_m4017: dw -799, -4017
+const pw_m1567_m3784, dw -1567, -3784
+pw_m3406_m2276: dw -3406, -2276
+pw_m401_m4076: dw -401, -4076
+pw_m3166_m2598: dw -3166, -2598
+pw_m1931_m3612: dw -1931, -3612
+pw_m3920_m1189: dw -3920, -1189
+COEF_PAIR 2276, 3406
+COEF_PAIR 4017, 799
+
+%macro COEF_X8 1-*
+%rep %0
+ dw %1*8, %1*8
+ %rotate 1
+%endrep
+%endmacro
+
+pw_3703x8: COEF_X8 3703
+pw_1751x8: COEF_X8 1751
+pw_m1380x8: COEF_X8 -1380
+pw_3857x8: COEF_X8 3857
+pw_3973x8: COEF_X8 3973
+pw_995x8: COEF_X8 995
+pw_m2106x8: COEF_X8 -2106
+pw_3513x8: COEF_X8 3513
+pw_3290x8: COEF_X8 3290
+pw_2440x8: COEF_X8 2440
+pw_m601x8: COEF_X8 -601
+pw_4052x8: COEF_X8 4052
+
+const idct64_mul
+COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520
+COEF_X8 3745, 1660, 3564, 2019, 3822, -1474, 3948, -1092
+COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842
+COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301
+
+pw_201_4091x8: dw 201*8, 4091*8
+pw_m601_4052x8: dw -601*8, 4052*8
+pw_995_3973x8: dw 995*8, 3973*8
+pw_m1380_3857x8: dw -1380*8, 3857*8
+pw_1751_3703x8: dw 1751*8, 3703*8
+pw_m2106_3513x8: dw -2106*8, 3513*8
+pw_2440_3290x8: dw 2440*8, 3290*8
+pw_m2751_3035x8: dw -2751*8, 3035*8
+
+%define o_idct64_offset idct64_mul - (o_base) - 8
+
+SECTION .text
+
+; Code size reduction trickery: Instead of using rip-relative loads with
+; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
+; single rip-relative lea and then address things relative to it with
+; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
+%define o_base deint_shuf + 128
+%define o(x) (r6 - (o_base) + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
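+; As an illustration (assuming r6 has been loaded with "lea r6, [o_base]", as
+; the functions below do), a load like "vpbroadcastd m4, [o(pw_2048)]" expands
+; to [r6 - (deint_shuf+128) + pw_2048]; the assembler folds the constant part
+; into a single signed-byte displacement off r6 since pw_2048 lies within
+; +-128 bytes of the base pointer.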
+
+; flags: 1 = swap, 2 = interleave, 4 = coef_regs
+%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
+%if %7 & 4
+ pmaddwd m%2, m%5, m%1
+ pmaddwd m%1, m%6
+%else
+%if %7 & 1
+ vpbroadcastd m%2, [o(pw_%5_%6)]
+ vpbroadcastd m%3, [o(pw_m%6_%5)]
+%else
+ vpbroadcastd m%2, [o(pw_m%6_%5)]
+ vpbroadcastd m%3, [o(pw_%5_%6)]
+%endif
+ pmaddwd m%2, m%1
+ pmaddwd m%1, m%3
+%endif
+ paddd m%2, m%4
+ paddd m%1, m%4
+%if %7 & 2
+ pslld m%2, 4
+ psrld m%1, 12
+ pblendw m%1, m%2, 0xaa
+%else
+ psrad m%2, 12
+ psrad m%1, 12
+ packssdw m%1, m%2
+%endif
+%endmacro
+
+; flags: 1 = swap, 2 = interleave, 4 = coef_regs
+%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags
+%if %10 & 1
+ vpbroadcastd m%3, [o(pw_%8_%9)]
+ vpbroadcastd m%4, [o(pw_m%9_%8)]
+ vpbroadcastd xm%2, [o(pw_%6_%7)]
+ vpblendd m%2, m%3, 0xf0
+ vpbroadcastd xm%3, [o(pw_m%7_%6)]
+%else
+ vpbroadcastd m%3, [o(pw_m%9_%8)]
+ vpbroadcastd m%4, [o(pw_%8_%9)]
+ vpbroadcastd xm%2, [o(pw_m%7_%6)]
+ vpblendd m%2, m%3, 0xf0
+ vpbroadcastd xm%3, [o(pw_%6_%7)]
+%endif
+ vpblendd m%3, m%4, 0xf0
+ ITX_MUL2X_PACK %1, %4, _, %5, %2, %3, (4|%10)
+%endmacro
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
+ punpckhwd m%3, m%2, m%1
+ punpcklwd m%2, m%1
+%if %7 < 32
+ pmaddwd m%1, m%7, m%2
+ pmaddwd m%4, m%7, m%3
+%else
+ vpbroadcastd m%1, [o(pw_m%7_%6)]
+ pmaddwd m%4, m%3, m%1
+ pmaddwd m%1, m%2
+%endif
+ paddd m%4, m%5
+ paddd m%1, m%5
+ psrad m%4, 12
+ psrad m%1, 12
+ packssdw m%1, m%4
+%if %7 < 32
+ pmaddwd m%3, m%6
+ pmaddwd m%2, m%6
+%else
+ vpbroadcastd m%4, [o(pw_%6_%7)]
+ pmaddwd m%3, m%4
+ pmaddwd m%2, m%4
+%endif
+ paddd m%3, m%5
+ paddd m%2, m%5
+ psrad m%3, 12
+ psrad m%2, 12
+%if %0 == 8
+ packssdw m%8, m%2, m%3
+%else
+ packssdw m%2, m%3
+%endif
+%endmacro
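+; In other words (a sketch, assuming the usual 12-bit AV1 cosine constants):
+; with coef1 = round(4096*cos(t)) and coef2 = round(4096*sin(t)), the dst1/dst2
+; formulas above perform a fixed-point rotation by t, e.g. the pair
+; (1567, 3784) corresponds to t = 3*pi/8, and (2896, 2896) to t = pi/4,
+; i.e. (src1 - src2)/sqrt(2) and (src1 + src2)/sqrt(2).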
+
+%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
+ ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3
+ ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0
+ psubsw m%3, m%1, m%2
+ paddsw m%2, m%1
+ paddsw m%1, m%4, m%5
+ psubsw m%4, m%5
+%endmacro
+
+%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
+ ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a
+ ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a
+ ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3
+ paddsw m%9, m%2, m%6 ; t4
+ psubsw m%2, m%6 ; t5a
+ paddsw m%10, m%8, m%4 ; t7
+ psubsw m%8, m%4 ; t6a
+ ITX_MULSUB_2W %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0
+ ITX_MULSUB_2W %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6
+ psubsw m%6, m%1, m%3 ; dct4 out2
+ paddsw m%3, m%1 ; dct4 out1
+ paddsw m%1, m%5, m%7 ; dct4 out0
+ psubsw m%5, m%7 ; dct4 out3
+ psubsw m%7, m%3, m%2 ; out6
+ paddsw m%2, m%3 ; out1
+ paddsw m%3, m%6, m%8 ; out2
+ psubsw m%6, m%8 ; out5
+ psubsw m%8, m%1, m%10 ; out7
+ paddsw m%1, m%10 ; out0
+ paddsw m%4, m%5, m%9 ; out3
+ psubsw m%5, m%9 ; out4
+%endmacro
+
+; in1 = %1, in3 = %2, in5 = %3, in7 = %4
+; in9 = %5, in11 = %6, in13 = %7, in15 = %8
+%macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048
+ ITX_MULSUB_2W %1, %8, %9, %10, %11, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a
+ ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2W %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a
+ psubsw m%9, m%2, m%6 ; t13
+ paddsw m%6, m%2 ; t12
+ psubsw m%2, m%8, m%4 ; t14
+ paddsw m%8, m%4 ; t15
+ psubsw m%4, m%7, m%3 ; t10
+ paddsw m%3, m%7 ; t11
+ psubsw m%7, m%1, m%5 ; t9
+ paddsw m%1, m%5 ; t8
+ ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a
+ ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
+ psubsw m%5, m%1, m%3 ; t11a
+ paddsw m%1, m%3 ; t8a
+ psubsw m%3, m%7, m%4 ; t13
+ paddsw m%7, m%4 ; t14
+ psubsw m%4, m%8, m%6 ; t12a
+ paddsw m%8, m%6 ; t15a
+ psubsw m%6, m%2, m%9 ; t10
+ paddsw m%2, m%9 ; t9
+ ITX_MULSUB_2W %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a
+ ITX_MULSUB_2W %4, %5, %9, %10, %11, 2896, 2896 ; t11, t12
+%endmacro
+
+%macro WRAP_XMM 1+
+ INIT_XMM cpuname
+ %1
+ INIT_YMM cpuname
+%endmacro
+
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd
+%if %5
+ vpbroadcastd m2, [o(pw_%5)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+%endif
+ lea r2, [dstq+strideq*2]
+%assign %%i 1
+%rep 4
+ %if %1 & 2
+ CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1)
+ %else
+ CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
+ %endif
+ %assign %%i %%i + 1
+ %rotate 1
+%endrep
+ movd m2, [%%row_adr1]
+ pinsrd m2, [%%row_adr2], 1
+ movd m3, [%%row_adr3]
+ pinsrd m3, [%%row_adr4], 1
+ pmovzxbw m2, m2
+ pmovzxbw m3, m3
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ movd [%%row_adr1], m0
+ pextrd [%%row_adr2], m0, 1
+ pextrd [%%row_adr3], m0, 2
+ pextrd [%%row_adr4], m0, 3
+ ret
+%endmacro
+
+%macro IWHT4_1D_PACKED 0
+ punpckhqdq m3, m0, m1 ; in1 in3
+ punpcklqdq m0, m1 ; in0 in2
+ psubw m2, m0, m3
+ paddw m0, m3
+ punpckhqdq m2, m2 ; t2 t2
+ punpcklqdq m0, m0 ; t0 t0
+ psubw m1, m0, m2
+ psraw m1, 1
+ psubw m1, m3 ; t1 t3
+ psubw m0, m1 ; ____ out0
+ paddw m2, m1 ; out3 ____
+%endmacro
+
+INIT_XMM avx2
+cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, c
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ pxor m2, m2
+ mova [cq+16*0], m2
+ mova [cq+16*1], m2
+ psraw m0, 2
+ psraw m1, 2
+ IWHT4_1D_PACKED
+ punpckhwd m0, m1
+ punpcklwd m3, m1, m2
+ punpckhdq m1, m0, m3
+ punpckldq m0, m3
+ IWHT4_1D_PACKED
+ vpblendd m0, m2, 0x03
+ ITX4_END 3, 0, 2, 1, 0
+
+%macro INV_TXFM_FN 3 ; type1, type2, size
+cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 5, 0, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%3_internal_8bpc)
+ lea r6, [o_base]
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(i%2_%3_internal_8bpc).pass2]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
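+; Sketch of one instantiation (using invocations found further below):
+; "INV_TXFM_4X4_FN dct, adst" creates inv_txfm_add_dct_adst_4x4_8bpc, which
+; points tx2q at iadst_4x4_internal_8bpc.pass2 and then reaches
+; idct_4x4_internal_8bpc for the first pass; that routine ends in "jmp tx2q",
+; the indirect jump to the 2nd txfm function mentioned above.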
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x4
+%ifidn %1_%2, dct_dct
+ vpbroadcastw m0, [cq]
+ vpbroadcastd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [cq], eobd ; 0
+ pmulhrsw m0, m1
+ mova m1, m0
+ jmp m(iadst_4x4_internal_8bpc).end2
+%endif
+%endmacro
+
+%macro IDCT4_1D_PACKED 0
+ vpbroadcastd m4, [o(pd_2048)]
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784
+ ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896
+ paddsw m0, m1, m2 ; out0 out1
+ psubsw m1, m2 ; out3 out2
+%endmacro
+
+%macro IADST4_1D_PACKED 0
+ punpcklwd m2, m1, m0
+ punpckhwd m3, m1, m0
+ vpbroadcastd m5, [o(pw_m3344_3344)]
+ vpbroadcastd m0, [o(pw_3803_1321)]
+ vpbroadcastd m4, [o(pw_m1321_2482)]
+ pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2
+ psrld m5, 16
+ pmaddwd m0, m2
+ pmaddwd m2, m4
+ pmaddwd m5, m3 ; 3344*in0
+ paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3
+ vpbroadcastd m4, [o(pw_2482_3344)]
+ vpbroadcastd m5, [o(pw_m3803_3344)]
+ pmaddwd m4, m3
+ pmaddwd m5, m3
+ paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3
+ vpbroadcastd m0, [o(pw_m3803_m6688)]
+ pmaddwd m3, m0
+ vpbroadcastd m0, [o(pd_2048)]
+ paddd m2, m0
+ paddd m1, m0
+ paddd m0, m4
+ paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3
+ paddd m2, m4
+ paddd m2, m3
+ REPX {psrad x, 12}, m1, m2, m0, m5
+ packssdw m0, m5 ; out0 out1
+ packssdw m1, m2 ; out2 out3
+%endmacro
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
+
+cglobal idct_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ IDCT4_1D_PACKED
+ mova m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m1, q0220
+ pshufb m0, m2
+ pshufb m1, m3, m2
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ pxor m2, m2
+ mova [cq+16*0], m2
+ mova [cq+16*1], m2
+ ITX4_END 0, 1, 3, 2
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ call .main
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ pxor m2, m2
+ mova [cq+16*0], m2
+ mova [cq+16*1], m2
+.end2:
+ ITX4_END 0, 1, 2, 3
+ALIGN function_align
+cglobal_label .main
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ call m(iadst_4x4_internal_8bpc).main
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ jmp tx2q
+.pass2:
+ call m(iadst_4x4_internal_8bpc).main
+.end:
+ pxor m2, m2
+ mova [cq+16*0], m2
+ mova [cq+16*1], m2
+.end2:
+ ITX4_END 3, 2, 1, 0
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_4x4_internal_8bpc).end
+
+%macro WRITE_4X8 2 ; coefs[1-2]
+ movd xm4, [dstq+strideq*0]
+ pinsrd xm4, [dstq+strideq*1], 1
+ movd xm5, [dstq+strideq*2]
+ pinsrd xm5, [dstq+r3 ], 1
+ pinsrd xm4, [r2 +strideq*0], 2
+ pinsrd xm4, [r2 +strideq*1], 3
+ pinsrd xm5, [r2 +strideq*2], 2
+ pinsrd xm5, [r2 +r3 ], 3
+ pmovzxbw m4, xm4
+ pmovzxbw m5, xm5
+ paddw m4, m%1
+ paddw m5, m%2
+ packuswb m4, m5
+ vextracti128 xm5, m4, 1
+ movd [dstq+strideq*0], xm4
+ pextrd [dstq+strideq*1], xm4, 1
+ pextrd [dstq+strideq*2], xm4, 2
+ pextrd [dstq+r3 ], xm4, 3
+ movd [r2 +strideq*0], xm5
+ pextrd [r2 +strideq*1], xm5, 1
+ pextrd [r2 +strideq*2], xm5, 2
+ pextrd [r2 +r3 ], xm5, 3
+%endmacro
+
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x8
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_2048)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ jmp m(iadst_4x8_internal_8bpc).end3
+%endif
+%endmacro
+
+%macro IDCT8_1D_PACKED 0
+ vpbroadcastd m6, [o(pd_2048)]
+ punpckhwd m5, m3, m0 ; in7 in1
+ punpckhwd m4, m1, m2 ; in3 in5
+ punpcklwd m3, m1 ; in6 in2
+ punpcklwd m2, m0 ; in4 in0
+ ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a
+ ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
+ ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2
+ psubsw m0, m5, m4 ; t5a t6a (interleaved)
+ paddsw m4, m5 ; t4 t7 (interleaved)
+ ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1
+ vpbroadcastd m1, [o(pw_m2896_2896)]
+ ITX_MUL2X_PACK 0, 1, _, 6, 1, 5, 4 ; t6 t5
+%if mmsize > 16
+ vbroadcasti128 m1, [o(deint_shuf)]
+ pshufb m4, m1
+%else
+ pshufb m4, [o(deint_shuf)]
+%endif
+ psubsw m1, m2, m3 ; tmp3 tmp2
+ paddsw m3, m2 ; tmp0 tmp1
+ shufps m2, m4, m0, q1032 ; t7 t6
+ vpblendd m4, m0, 0xcc ; t4 t5
+ paddsw m0, m3, m2 ; out0 out1
+ psubsw m3, m2 ; out7 out6
+ psubsw m2, m1, m4 ; out4 out5
+ paddsw m1, m4 ; out3 out2
+%endmacro
+
+%macro IADST8_1D_PACKED 1 ; pass
+ vpbroadcastd m6, [o(pd_2048)]
+ punpckhwd m0, m4, m3 ; 0 7
+ punpckhwd m1, m5, m2 ; 2 5
+ punpcklwd m2, m5 ; 4 3
+ punpcklwd m3, m4 ; 6 1
+%if %1 == 1
+ ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a
+ ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
+ ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
+ psubsw m4, m0, m2 ; t5 t4
+ paddsw m0, m2 ; t1 t0
+ psubsw m5, m1, m3 ; t6 t7
+ paddsw m1, m3 ; t2 t3
+ ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
+ ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
+%if mmsize > 16
+ vbroadcasti128 m2, [o(deint_shuf)]
+%else
+ mova m2, [o(deint_shuf)]
+%endif
+ pshuflw m1, m1, q2301
+ pshufhw m1, m1, q2301
+ psubsw m3, m0, m1 ; t3 t2
+ paddsw m0, m1 ; -out7 out0
+ psubsw m1, m4, m5 ; t7 t6
+ paddsw m4, m5 ; out6 -out1
+ pshufb m0, m2
+ pshufb m4, m2
+ vpbroadcastd m5, [o(pw_m2896_2896)]
+ pmaddwd m2, m5, m3
+ pmaddwd m5, m1
+ paddd m2, m6
+ paddd m5, m6
+ psrad m2, 12
+ psrad m5, 12
+ packssdw m2, m5 ; out4 -out5
+ vpbroadcastd m5, [o(pw_2896_2896)]
+ pmaddwd m3, m5
+ pmaddwd m1, m5
+ paddd m3, m6
+ paddd m1, m6
+ psrad m3, 12
+ psrad m1, 12
+ packssdw m1, m3 ; out2 -out3
+ punpcklqdq m3, m4, m0 ; out6 -out7
+ punpckhqdq m0, m4 ; out0 -out1
+%else
+ ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a
+ ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a
+ ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a
+ psubsw m4, m0, m2 ; t4 t5
+ paddsw m0, m2 ; t0 t1
+ psubsw m5, m1, m3 ; t6 t7
+ paddsw m1, m3 ; t2 t3
+ shufps m2, m5, m4, q1032
+ punpckhwd m4, m2
+ punpcklwd m5, m2
+ ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a
+ ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a
+ psubsw m2, m0, m1 ; t2 t3
+ paddsw m0, m1 ; out0 -out7
+ psubsw m1, m4, m5 ; t7 t6
+ paddsw m4, m5 ; out6 -out1
+ vpbroadcastd m5, [o(pw_2896x8)]
+ vpblendd m3, m0, m4, 0x33 ; out6 -out7
+ vpblendd m0, m4, 0xcc ; out0 -out1
+ shufps m4, m2, m1, q1032 ; t3 t7
+ vpblendd m1, m2, 0x33 ; t2 t6
+ psubsw m2, m1, m4 ; t2-t3 t6-t7
+ paddsw m1, m4 ; t2+t3 t6+t7
+ pmulhrsw m2, m5 ; out4 -out5
+ pshufd m1, m1, q1032
+ pmulhrsw m1, m5 ; out2 -out3
+%endif
+%endmacro
+
+INIT_YMM avx2
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+INV_TXFM_4X8_FN dct, identity
+
+cglobal idct_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ IDCT4_1D_PACKED
+ vbroadcasti128 m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m1, q0220
+ pshufb m0, m2
+ pshufb m1, m3, m2
+ jmp tx2q
+.pass2:
+ vextracti128 xm2, m0, 1
+ vextracti128 xm3, m1, 1
+ call .main
+ vpbroadcastd m4, [o(pw_2048)]
+ vinserti128 m0, xm2, 1
+ vinserti128 m1, xm3, 1
+ pshufd m1, m1, q1032
+ jmp m(iadst_4x8_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ WRAP_XMM IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ call m(iadst_8x4_internal_8bpc).main
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ vextracti128 xm2, m0, 1
+ vextracti128 xm3, m1, 1
+ pshufd xm4, xm0, q1032
+ pshufd xm5, xm1, q1032
+ call .main_pass2
+ vpbroadcastd m4, [o(pw_2048)]
+ vinserti128 m0, xm2, 1
+ vinserti128 m1, xm3, 1
+ pxor m5, m5
+ psubw m5, m4
+.end:
+ vpblendd m4, m5, 0xcc
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ WIN64_RESTORE_XMM
+ pxor m2, m2
+ mova [cq+32*0], m2
+ mova [cq+32*1], m2
+.end3:
+ lea r2, [dstq+strideq*4]
+ lea r3, [strideq*3]
+ WRITE_4X8 0, 1
+ RET
+ALIGN function_align
+.main_pass1:
+ WRAP_XMM IADST8_1D_PACKED 1
+ ret
+ALIGN function_align
+cglobal_label .main_pass2
+ WRAP_XMM IADST8_1D_PACKED 2
+ ret
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ call m(iadst_8x4_internal_8bpc).main
+ punpcklwd m3, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m3
+ punpckhwd m1, m3
+ jmp tx2q
+.pass2:
+ vextracti128 xm2, m0, 1
+ vextracti128 xm3, m1, 1
+ pshufd xm4, xm0, q1032
+ pshufd xm5, xm1, q1032
+ call m(iadst_4x8_internal_8bpc).main_pass2
+ vpbroadcastd m5, [o(pw_2048)]
+ vinserti128 m3, xm1, 1
+ vinserti128 m2, xm0, 1
+ pxor m4, m4
+ psubw m4, m5
+ pshufd m0, m3, q1032
+ pshufd m1, m2, q1032
+ jmp m(iadst_4x8_internal_8bpc).end
+
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m2, [cq+32*0], q3120
+ vpermq m0, [cq+32*1], q3120
+ vpbroadcastd m3, [o(pw_2896x8)]
+ vpbroadcastd m4, [o(pw_1697x8)]
+ punpcklwd m1, m2, m0
+ punpckhwd m2, m0
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ pmulhrsw m2, m4, m0
+ pmulhrsw m4, m1
+ paddsw m0, m2
+ paddsw m1, m4
+ jmp tx2q
+.pass2:
+ vpbroadcastd m4, [o(pw_4096)]
+ jmp m(iadst_4x8_internal_8bpc).end2
+
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x16
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ movd xm3, [o(pw_2048)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm2
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm3
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ jmp m(iadst_4x16_internal_8bpc).end3
+%endif
+%endmacro
+
+%macro IDCT16_1D_PACKED 0
+ vpbroadcastd m10, [o(pd_2048)]
+.main2:
+ punpckhwd m8, m7, m0 ; dct16 in15 in1
+ punpcklwd m9, m4, m0 ; dct4 in2 in0
+ punpckhwd m0, m3, m4 ; dct16 in7 in9
+ punpcklwd m7, m1 ; dct8 in7 in1
+ punpckhwd m1, m6 ; dct16 in3 in13
+ punpcklwd m3, m5 ; dct8 in3 in5
+ punpckhwd m5, m2 ; dct16 in11 in5
+ punpcklwd m6, m2 ; dct4 in3 in1
+ ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 3 ; t8a t15a
+ ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 3 ; t9a t14a
+ ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a
+ ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a
+ ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 3 ; t4a t7a
+ ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 3 ; t5a t6a
+ ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2
+ psubsw m2, m8, m0 ; t9 t14
+ paddsw m8, m0 ; t8 t15
+ psubsw m0, m1, m5 ; t10 t13
+ paddsw m1, m5 ; t11 t12
+ vpbroadcastd m5, [o(pw_m3784_1567)] ; reuse pw_1567_3784
+ ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 6 ; t9a t14a
+ vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
+ ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 6 ; t10a t13a
+ psubsw m4, m8, m1 ; t11a t12a
+ paddsw m8, m1 ; t8a t15a
+ psubsw m1, m7, m3 ; t5a t6a
+ paddsw m7, m3 ; t4 t7
+ paddsw m3, m2, m0 ; t9 t14
+ psubsw m2, m0 ; t10 t13
+%if mmsize > 16
+ vbroadcasti128 m0, [o(deint_shuf)]
+%else
+ mova m0, [o(deint_shuf)]
+%endif
+ pshufb m8, m0
+ pshufb m7, m0
+ pshufb m3, m0
+ ITX_MUL2X_PACK 9, 0, 5, 10, 2896, 2896 ; t0 t1
+ vpbroadcastd m0, [o(pw_m2896_2896)]
+ ITX_MUL2X_PACK 4, 5, _, 10, 5, 0, 4 ; t11 t12
+ vpbroadcastd m5, [o(pw_2896_2896)]
+ ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5
+ vpbroadcastd m0, [o(pw_m2896_2896)]
+ ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a
+ punpckhqdq m0, m8, m3 ; t15a t14
+ punpcklqdq m8, m3 ; t8a t9
+ shufps m5, m4, m2, q1032 ; t12 t13a
+ vpblendd m4, m2, 0xcc ; t11 t10a
+ shufps m2, m7, m1, q1032 ; t7 t6
+ vpblendd m7, m1, 0xcc ; t4 t5
+ psubsw m1, m9, m6 ; dct4 out3 out2
+ paddsw m9, m6 ; dct4 out0 out1
+ psubsw m3, m9, m2 ; dct8 out7 out6
+ paddsw m9, m2 ; dct8 out0 out1
+ psubsw m2, m1, m7 ; dct8 out4 out5
+ paddsw m1, m7 ; dct8 out3 out2
+ psubsw m7, m9, m0 ; out15 out14
+ paddsw m0, m9 ; out0 out1
+ psubsw m6, m1, m5 ; out12 out13
+ paddsw m1, m5 ; out3 out2
+ psubsw m5, m2, m4 ; out11 out10
+ paddsw m2, m4 ; out4 out5
+ psubsw m4, m3, m8 ; out8 out9
+ paddsw m3, m8 ; out7 out6
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+INV_TXFM_4X16_FN dct, identity
+
+cglobal idct_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ call m(idct_16x4_internal_8bpc).main
+ vpbroadcastd m5, [o(pw_16384)]
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ REPX {pmulhrsw x, m5}, m0, m4, m2, m3
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ vextracti128 xm4, m0, 1
+ vextracti128 xm5, m1, 1
+ vextracti128 xm6, m2, 1
+ vextracti128 xm7, m3, 1
+ call .main
+ vinserti128 m0, xm4, 1
+ vinserti128 m1, xm5, 1
+ vpbroadcastd m5, [o(pw_2048)]
+ vinserti128 m2, xm6, 1
+ vinserti128 m3, xm7, 1
+ pshufd m1, m1, q1032
+ pshufd m3, m3, q1032
+ jmp m(iadst_4x16_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ WRAP_XMM IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m5, [o(pw_16384)]
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ REPX {pmulhrsw x, m5}, m4, m2, m3, m0
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m5, [o(pw_2896x8)]
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m5 ; -out7 out4 out6 -out5
+ pmulhrsw m2, m5 ; out8 -out11 -out9 out10
+ vpbroadcastd m5, [o(pw_2048)]
+ pshufd m1, m1, q1032
+ vpblendd m4, m1, m0, 0x33
+ vpblendd m0, m2, 0x33
+ vpblendd m2, m3, 0x33
+ vpblendd m3, m1, 0x33
+ vpermq m0, m0, q2031
+ vpermq m1, m2, q1302
+ vpermq m2, m3, q3120
+ vpermq m3, m4, q0213
+ psubw m6, m7, m5
+.end:
+ vpblendd m5, m6, 0xcc
+.end2:
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ WIN64_RESTORE_XMM
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ mova [cq+32*2], m4
+ mova [cq+32*3], m4
+.end3:
+ lea r2, [dstq+strideq*8]
+ lea r3, [strideq*3]
+ WRITE_4X8 0, 1
+ lea dstq, [dstq+strideq*4]
+ lea r2, [r2 +strideq*4]
+ WRITE_4X8 2, 3
+ RET
+ALIGN function_align
+.main:
+ vpblendd m4, m1, m0, 0xcc
+ vpblendd m1, m0, 0x33
+ vpblendd m5, m2, m3, 0xcc
+ vpblendd m2, m3, 0x33
+ vperm2i128 m3, m5, m2, 0x31
+ vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1
+ vperm2i128 m4, m1, m4, 0x31
+ vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5
+ pshufd m3, m3, q1032 ; in15 in12 in13 in14
+ pshufd m2, m4, q1032 ; in11 in8 in9 in10
+cglobal_label .main2
+ vpbroadcastd m8, [o(pd_2048)]
+ pxor m7, m7
+ punpckhwd m4, m3, m0 ; in12 in3 in14 in1
+ punpcklwd m0, m3 ; in0 in15 in2 in13
+ punpckhwd m3, m2, m1 ; in8 in7 in10 in5
+ punpcklwd m1, m2 ; in4 in11 in6 in9
+ ITX_MUL4X_PACK 0, 2, 5, 6, 8, 201, 4091, 995, 3973, 3
+ ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3
+ ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3
+ ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3
+ psubsw m2, m0, m3 ; t9a t8a t11a t10a
+ paddsw m0, m3 ; t1a t0a t3a t2a
+ psubsw m3, m1, m4 ; t13a t12a t15a t14a
+ paddsw m1, m4 ; t5a t4a t7a t6a
+ ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3
+ psubw m6, m7, m5
+ ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6
+ vpbroadcastd m6, [o(pw_m3784_1567)]
+ vpbroadcastd m5, [o(pw_1567_3784)]
+ psubsw m4, m0, m1 ; t5 t4 t7 t6
+ paddsw m0, m1 ; t1 t0 t3 t2
+ psubsw m1, m2, m3 ; t13a t12a t15a t14a
+ paddsw m2, m3 ; t9a t8a t11a t10a
+ psubw m3, m7, m6 ; pw_3784_m1567
+ vpblendd m6, m3, 0xf0
+ ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
+ ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
+ vbroadcasti128 m5, [o(deint_shuf)]
+ pshufb m0, m5
+ pshufb m2, m5
+ vperm2i128 m3, m0, m2, 0x31 ; t3 t2 t11a t10a
+ vinserti128 m0, xm2, 1 ; t1 t0 t9a t8a
+ vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14
+ vinserti128 m4, xm1, 1 ; t4a t5a t12 t13
+ pshufd m2, m2, q1032 ; t6a t7a t14 t15
+ psubsw m1, m0, m3 ; t3a t2a t11 t10
+ paddsw m0, m3 ; -out15 out0 out14 -out1
+ paddsw m3, m4, m2 ; -out3 out12 out2 -out13
+ psubsw m4, m2 ; t6 t7 t14a t15a
+ shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a
+ vpblendd m4, m1, 0x33 ; t3a t7 t11 t15a
+ ret
+ALIGN function_align
+.main_pass1_end:
+ vpbroadcastd m5, [o(pw_m2896_2896)]
+ vpbroadcastd m6, [o(pw_2896_2896)]
+ punpcklwd m1, m4, m2
+ punpckhwd m4, m2
+ pmaddwd m2, m5, m4
+ pmaddwd m4, m6
+ pmaddwd m5, m1
+ pmaddwd m1, m6
+ REPX {paddd x, m8}, m5, m1, m2, m4
+ REPX {psrad x, 12}, m5, m2, m1, m4
+ packssdw m2, m5 ; -out11 out8 out10 -out9
+ packssdw m1, m4 ; -out7 out4 out6 -out5
+ ret
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m5, [o(pw_16384)]
+ punpcklwd m4, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m3, m2
+ punpckhwd m3, m2
+ REPX {pmulhrsw x, m5}, m4, m1, m0, m3
+ punpckldq m2, m3, m1
+ punpckhdq m3, m1
+ punpckhdq m1, m0, m4
+ punpckldq m0, m4
+ jmp tx2q
+.pass2:
+ call m(iadst_4x16_internal_8bpc).main
+ vpbroadcastd m5, [o(pw_2896x8)]
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m5 ; -out7 out4 out6 -out5
+ pmulhrsw m2, m5 ; out8 -out11 -out9 out10
+ vpbroadcastd m6, [o(pw_2048)]
+ pshufd m1, m1, q1032
+ vpblendd m4, m0, m2, 0x33
+ vpblendd m0, m1, 0xcc
+ vpblendd m1, m3, 0xcc
+ vpblendd m2, m3, 0x33
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q0213
+ vpermq m2, m2, q2031
+ vpermq m3, m4, q1302
+ psubw m5, m7, m6
+ jmp m(iadst_4x16_internal_8bpc).end
+
+INV_TXFM_4X16_FN identity, dct
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
+cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova m3, [cq+32*0]
+ mova m2, [cq+32*1]
+ mova m4, [cq+32*2]
+ mova m5, [cq+32*3]
+ vpbroadcastd m8, [o(pw_1697x8)]
+ pcmpeqw m0, m0 ; -1
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m5
+ punpckhwd m4, m5
+ pmulhrsw m5, m8, m1
+ pmulhrsw m6, m8, m2
+ pmulhrsw m7, m8, m3
+ pmulhrsw m8, m4
+ pcmpeqw m9, m0, m1 ; we want to do a signed avg, but pavgw is
+ pxor m1, m9 ; unsigned. as long as both signs are equal
+ pcmpeqw m9, m0, m2 ; it still works, but if the input is -1 the
+ pxor m2, m9 ; pmulhrsw result will become 0 which causes
+ pcmpeqw m9, m0, m3 ; pavgw to output -32768 instead of 0 unless
+ pxor m3, m9 ; we explicitly deal with that case here.
+ pcmpeqw m0, m4
+ pxor m4, m0
+ pavgw m1, m5
+ pavgw m2, m6
+ pavgw m3, m7
+ pavgw m4, m8
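+ ; e.g. for an input word of -1: pmulhrsw(-1, 1697*8) = (-13576+16384)>>15 = 0,
+ ; and unsigned pavgw(0xffff, 0) would give 0x8000 (-32768); flipping the -1
+ ; lanes to 0 first makes pavgw return 0, the correct signed average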
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [o(pw_1697x16)]
+ vpbroadcastd m5, [o(pw_2048)]
+ pmulhrsw m4, m8, m0
+ pmulhrsw m6, m8, m1
+ pmulhrsw m7, m8, m2
+ pmulhrsw m8, m3
+ REPX {paddsw x, x}, m0, m1, m2, m3
+ paddsw m0, m4
+ paddsw m1, m6
+ paddsw m2, m7
+ paddsw m3, m8
+ jmp m(iadst_4x16_internal_8bpc).end2
+
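+ ; adds two registers of 16-bit residuals (two 8-pixel rows each) to four
+ ; rows of dst and stores the clipped 8-bit result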
+%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3]
+ movq xm%3, [dstq ]
+ movhps xm%3, [dstq+%5]
+ movq xm%4, [dstq+%6]
+ movhps xm%4, [dstq+%7]
+ pmovzxbw m%3, xm%3
+ pmovzxbw m%4, xm%4
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ vextracti128 xm%4, m%3, 1
+ movq [dstq ], xm%3
+ movhps [dstq+%6], xm%3
+ movq [dstq+%5], xm%4
+ movhps [dstq+%7], xm%4
+%endmacro
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x4
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
+
+cglobal idct_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpbroadcastd xm3, [o(pw_2896x8)]
+ pmulhrsw xm0, xm3, [cq+16*0]
+ pmulhrsw xm1, xm3, [cq+16*1]
+ pmulhrsw xm2, xm3, [cq+16*2]
+ pmulhrsw xm3, [cq+16*3]
+ call m(idct_4x8_internal_8bpc).main
+ vbroadcasti128 m4, [o(deint_shuf)]
+ vinserti128 m3, m1, xm3, 1
+ vinserti128 m1, m0, xm2, 1
+ shufps m0, m1, m3, q0220
+ shufps m1, m3, q1331
+ pshufb m0, m4
+ pshufb m1, m4
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ jmp m(iadst_8x4_internal_8bpc).end2
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpbroadcastd xm0, [o(pw_2896x8)]
+ pshufd xm4, [cq+16*0], q1032
+ pmulhrsw xm3, xm0, [cq+16*3]
+ pshufd xm5, [cq+16*1], q1032
+ pmulhrsw xm2, xm0, [cq+16*2]
+ pmulhrsw xm4, xm0
+ pmulhrsw xm5, xm0
+ call m(iadst_4x8_internal_8bpc).main_pass1
+ vinserti128 m0, xm2, 1
+ vinserti128 m1, xm3, 1
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pxor m3, m3
+ psubsw m3, m2
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+.end2:
+ vpbroadcastd m2, [o(pw_2048)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ WIN64_RESTORE_XMM
+.end3:
+ pxor m2, m2
+ mova [cq+32*0], m2
+ mova [cq+32*1], m2
+ lea r3, [strideq*3]
+ WRITE_8X4 0, 1, 4, 5
+ RET
+ALIGN function_align
+cglobal_label .main
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpbroadcastd xm0, [o(pw_2896x8)]
+ pshufd xm4, [cq+16*0], q1032
+ pmulhrsw xm3, xm0, [cq+16*3]
+ pshufd xm5, [cq+16*1], q1032
+ pmulhrsw xm2, xm0, [cq+16*2]
+ pmulhrsw xm4, xm0
+ pmulhrsw xm5, xm0
+ call m(iadst_4x8_internal_8bpc).main_pass1
+ vinserti128 m3, xm1, 1
+ vinserti128 m2, xm0, 1
+ punpckhwd m1, m3, m2
+ punpcklwd m3, m2
+ pxor m0, m0
+ psubsw m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call m(iadst_8x4_internal_8bpc).main
+ mova m2, m1
+ vpermq m1, m0, q2031
+ vpermq m0, m2, q2031
+ jmp m(iadst_8x4_internal_8bpc).end2
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ mova xm2, [cq+16*0]
+ mova xm0, [cq+16*1]
+ vinserti128 m2, [cq+16*2], 1
+ vinserti128 m0, [cq+16*3], 1
+ vpbroadcastd m3, [o(pw_2896x8)]
+ punpcklwd m1, m2, m0
+ punpckhwd m2, m0
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ paddsw m0, m0
+ paddsw m1, m1
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_8x4_internal_8bpc).end
+
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x8
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ or r3d, 8
+.dconly:
+ pmulhrsw xm0, xm2
+.dconly2:
+ movd xm2, [pw_2048]
+ pmulhrsw xm0, xm1
+ lea r2, [strideq*3]
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+.dconly_loop:
+ WRITE_8X4 0, 0, 1, 2, strideq*1, strideq*2, r2
+ lea dstq, [dstq+strideq*4]
+ sub r3d, 4
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
+
+cglobal idct_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120 ; 0 1
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ vpermq m1, [cq+32*1], q3120 ; 2 3
+ call .main
+ shufps m4, m0, m1, q0220
+ shufps m5, m0, m1, q1331
+ shufps m1, m2, m3, q0220
+ shufps m3, m2, m3, q1331
+ vbroadcasti128 m0, [o(deint_shuf)]
+ vpbroadcastd m2, [o(pw_16384)]
+ REPX {pshufb x, m0}, m4, m5, m1, m3
+ REPX {pmulhrsw x, m2}, m4, m5, m1, m3
+ vinserti128 m0, m4, xm1, 1
+ vperm2i128 m2, m4, m1, 0x31
+ vinserti128 m1, m5, xm3, 1
+ vperm2i128 m3, m5, m3, 0x31
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m4, [o(pw_2048)]
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ jmp m(iadst_8x8_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m4, [cq+32*0], q1302 ; 1 0
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m5, [cq+32*1], q1302 ; 3 2
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ call .main_pass1
+ vpbroadcastd m5, [o(pw_16384)]
+ punpcklwd m4, m0, m1
+ punpckhwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ pxor m3, m3
+ psubw m3, m5 ; negate odd elements during rounding
+ pmulhrsw m4, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m5
+ pmulhrsw m2, m3
+ punpcklwd m3, m4, m0
+ punpckhwd m4, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ vperm2i128 m2, m3, m0, 0x31
+ vinserti128 m0, m3, xm0, 1
+ vperm2i128 m3, m4, m1, 0x31
+ vinserti128 m1, m4, xm1, 1
+ jmp tx2q
+.pass2:
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call .main_pass2
+ vpbroadcastd m5, [o(pw_2048)]
+ vpbroadcastd xm4, [o(pw_4096)]
+ psubw m4, m5 ; lower half = 2048, upper half = -2048
+.end:
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+.end3:
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ WIN64_RESTORE_XMM
+.end4:
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ mova [cq+32*2], m4
+ mova [cq+32*3], m4
+ lea r3, [strideq*3]
+ WRITE_8X4 0, 1, 4, 5
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 2, 3, 4, 5
+ RET
+ALIGN function_align
+.main_pass1:
+ IADST8_1D_PACKED 1
+ ret
+ALIGN function_align
+cglobal_label .main_pass2
+ IADST8_1D_PACKED 2
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m4, [cq+32*0], q1302 ; 1 0
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m5, [cq+32*1], q1302 ; 3 2
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ call m(iadst_8x8_internal_8bpc).main_pass1
+ vpbroadcastd m5, [o(pw_16384)]
+ punpckhwd m4, m3, m2
+ punpcklwd m3, m2
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ pxor m0, m0
+ psubw m0, m5
+ pmulhrsw m4, m0
+ pmulhrsw m3, m5
+ pmulhrsw m2, m0
+ pmulhrsw m1, m5
+ punpckhwd m0, m4, m3
+ punpcklwd m4, m3
+ punpckhwd m3, m2, m1
+ punpcklwd m2, m1
+ vinserti128 m1, m0, xm3, 1
+ vperm2i128 m3, m0, m3, 0x31
+ vinserti128 m0, m4, xm2, 1
+ vperm2i128 m2, m4, m2, 0x31
+ jmp tx2q
+.pass2:
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_8x8_internal_8bpc).main_pass2
+ vpbroadcastd m4, [o(pw_2048)]
+ vpbroadcastd xm5, [o(pw_4096)]
+ psubw m4, m5 ; lower half = -2048, upper half = 2048
+ vpermq m5, m3, q2031
+ vpermq m3, m0, q2031
+ vpermq m0, m2, q2031
+ vpermq m2, m1, q2031
+ pmulhrsw m1, m0, m4
+ pmulhrsw m0, m5, m4
+ jmp m(iadst_8x8_internal_8bpc).end3
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ mova xm3, [cq+16*0]
+ mova xm2, [cq+16*1]
+ vinserti128 m3, [cq+16*4], 1
+ vinserti128 m2, [cq+16*5], 1
+ mova xm4, [cq+16*2]
+ mova xm0, [cq+16*3]
+ vinserti128 m4, [cq+16*6], 1
+ vinserti128 m0, [cq+16*7], 1
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ vpbroadcastd m4, [o(pw_4096)]
+ jmp m(iadst_8x8_internal_8bpc).end
+
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x16
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 16
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
+%endif
+%endmacro
+
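+ ; loads the 16 rows of 8 coefficients and pre-multiplies them by 2896/4096
+ ; (~1/sqrt(2)), the extra scale factor applied to rectangular transform sizes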
+%macro ITX_8X16_LOAD_COEFS 0
+ vpbroadcastd m4, [o(pw_2896x8)]
+ pmulhrsw m0, m4, [cq+32*0]
+ add cq, 32*4
+ pmulhrsw m7, m4, [cq+32*3]
+ pmulhrsw m1, m4, [cq-32*3]
+ pmulhrsw m6, m4, [cq+32*2]
+ pmulhrsw m2, m4, [cq-32*2]
+ pmulhrsw m5, m4, [cq+32*1]
+ pmulhrsw m3, m4, [cq-32*1]
+ pmulhrsw m4, [cq+32*0]
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
+
+cglobal idct_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_8X16_LOAD_COEFS
+ call m(idct_16x8_internal_8bpc).main
+ vpbroadcastd m10, [o(pw_16384)]
+.pass1_end:
+ vperm2i128 m9, m3, m7, 0x31
+ vinserti128 m3, xm7, 1
+ vperm2i128 m8, m2, m6, 0x31
+ vinserti128 m2, xm6, 1
+ vperm2i128 m6, m1, m5, 0x31
+ vinserti128 m1, xm5, 1
+ vperm2i128 m5, m0, m4, 0x31
+ vinserti128 m0, xm4, 1
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+.pass1_end2:
+ punpckhwd m7, m5, m6
+ punpcklwd m5, m6
+ punpcklwd m6, m8, m9
+ punpckhwd m8, m9
+ REPX {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ punpckldq m4, m5, m6
+ punpckhdq m5, m6
+ punpckldq m6, m7, m8
+ punpckhdq m7, m8
+ jmp tx2q
+.pass2:
+ call .main
+ REPX {vpermq x, x, q3120}, m0, m2, m4, m6
+ REPX {vpermq x, x, q2031}, m1, m3, m5, m7
+.end:
+ vpbroadcastd m8, [o(pw_2048)]
+.end2:
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+.end3:
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
+ lea r3, [strideq*3]
+ WRITE_8X4 0, 1, 8, 9
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 2, 3, 0, 1
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 4, 5, 0, 1
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 6, 7, 0, 1
+ RET
+ALIGN function_align
+cglobal_label .main
+ IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_8X16_LOAD_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+ vpbroadcastd m10, [o(pw_16384)]
+ pslld m9, m10, 17
+ psubw m10, m9 ; 16384, -16384
+ jmp m(idct_8x16_internal_8bpc).pass1_end
+ALIGN function_align
+.pass2:
+ call .main
+ call .main_pass2_end
+ vpbroadcastd m9, [o(pw_2048)]
+ vpbroadcastd xm8, [o(pw_4096)]
+ psubw m8, m9
+ REPX {vpermq x, x, q2031}, m0, m1, m2, m3
+ REPX {vpermq x, x, q3120}, m4, m5, m6, m7
+ jmp m(idct_8x16_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ REPX {pshufd x, x, q1032}, m7, m1, m5, m3
+.main2:
+ vpbroadcastd m10, [o(pd_2048)]
+ punpckhwd m8, m7, m0 ; in14 in1
+ punpcklwd m0, m7 ; in0 in15
+ punpcklwd m7, m6, m1 ; in12 in3
+ punpckhwd m1, m6 ; in2 in13
+ punpckhwd m6, m5, m2 ; in10 in5
+ punpcklwd m2, m5 ; in4 in11
+ punpcklwd m5, m4, m3 ; in8 in7
+ punpckhwd m3, m4 ; in6 in9
+ ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 3 ; t0 t1
+ ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 3 ; t2 t3
+ ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 3 ; t4 t5
+ ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 3 ; t6 t7
+ ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 3 ; t8 t9
+ ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
+ ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
+ ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15
+ psubsw m4, m0, m5 ; t9a t8a
+ paddsw m0, m5 ; t1a t0a
+ psubsw m5, m1, m6 ; t11a t10a
+ paddsw m1, m6 ; t3a t2a
+ psubsw m6, m2, m7 ; t13a t12a
+ paddsw m2, m7 ; t5a t4a
+ psubsw m7, m3, m8 ; t15a t14a
+ paddsw m3, m8 ; t7a t6a
+ vpbroadcastd m11, [o(pw_m4017_799)]
+ vpbroadcastd m12, [o(pw_799_4017)]
+ pxor m9, m9
+ ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9
+ psubw m8, m9, m11 ; pw_4017_m799
+ ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13
+ vpbroadcastd m11, [o(pw_m2276_3406)]
+ vpbroadcastd m12, [o(pw_3406_2276)]
+ ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11
+ psubw m8, m9, m11 ; pw_2276_m3406
+ ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15
+ psubsw m8, m1, m3 ; t7 t6
+ paddsw m1, m3 ; t3 t2
+ psubsw m3, m0, m2 ; t5 t4
+ paddsw m0, m2 ; t1 t0
+ psubsw m2, m5, m7 ; t14a t15a
+ paddsw m7, m5 ; t10a t11a
+ psubsw m5, m4, m6 ; t12a t13a
+ paddsw m4, m6 ; t8a t9a
+ vpbroadcastd m11, [o(pw_m3784_1567)]
+ vpbroadcastd m12, [o(pw_1567_3784)]
+ ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a
+ psubw m6, m9, m11 ; pw_3784_m1567
+ ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a
+ vpbroadcastd m11, [o(pw_m1567_3784)]
+ vpbroadcastd m12, [o(pw_3784_1567)]
+ ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14
+ psubw m6, m9, m11 ; pw_1567_m3784
+ ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12
+ vbroadcasti128 m12, [o(deint_shuf)]
+ paddsw m6, m4, m7 ; -out1 out14
+ psubsw m4, m7 ; t10 t11
+ psubsw m11, m3, m8 ; t7 t6
+ paddsw m8, m3 ; out12 -out3
+ psubsw m3, m0, m1 ; t3a t2a
+ paddsw m0, m1 ; -out15 out0
+ paddsw m1, m2, m5 ; -out13 out2
+ psubsw m5, m2 ; t15a t14a
+ pshufb m0, m12
+ pshufb m6, m12
+ pshufb m8, m12
+ pshufb m1, m12
+ shufps m7, m6, m0, q1032 ; out14 -out15
+ vpblendd m0, m6, 0x33 ; -out1 out0
+ punpcklqdq m6, m8, m1 ; out12 -out13
+ punpckhqdq m1, m8, m1 ; -out3 out2
+ ret
+ALIGN function_align
+.main_pass1_end:
+ vpbroadcastd m8, [o(pw_m2896_2896)]
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ pmaddwd m9, m8, m11 ; -out11
+ pmaddwd m2, m12, m5 ; -out5
+ pmaddwd m5, m8 ; out10
+ pmaddwd m11, m12 ; out4
+ REPX {paddd x, m10}, m9, m5, m2, m11
+ REPX {psrad x, 12 }, m9, m5, m2, m11
+ packssdw m5, m9 ; out10 -out11
+ packssdw m2, m11 ; -out5 out4
+ pmaddwd m11, m8, m3 ; out8
+ vpbroadcastd m8, [o(pw_2896_m2896)]
+ pmaddwd m3, m12 ; -out7
+ pmaddwd m8, m4 ; -out9
+ pmaddwd m4, m12 ; out6
+ REPX {paddd x, m10}, m11, m3, m8, m4
+ REPX {psrad x, 12 }, m11, m3, m8, m4
+ packssdw m3, m4 ; -out7 out6
+ packssdw m4, m11, m8 ; out8 -out9
+ vpbroadcastd m10, [o(pw_16384)]
+ pxor m9, m9
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ vpbroadcastd m8, [o(pw_2896x8)]
+ pshufb m2, m11, m12
+ pshufb m5, m12
+ pshufb m3, m12
+ pshufb m4, m12
+ punpcklqdq m11, m5, m2 ; t15a t7
+ punpckhqdq m5, m2 ; t14a t6
+ shufps m2, m3, m4, q1032 ; t2a t10
+ vpblendd m3, m4, 0xcc ; t3a t11
+ psubsw m4, m2, m3 ; out8 -out9
+ paddsw m3, m2 ; -out7 out6
+ paddsw m2, m5, m11 ; -out5 out4
+ psubsw m5, m11 ; out10 -out11
+ REPX {pmulhrsw x, m8}, m2, m3, m4, m5
+ ret
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+cglobal iflipadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_8X16_LOAD_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+ vpbroadcastd m9, [o(pw_16384)]
+ pslld m10, m9, 17
+ psubw m10, m9 ; -16384, 16384
+ vperm2i128 m9, m4, m0, 0x31
+ vinserti128 m0, m4, xm0, 1
+ vperm2i128 m8, m5, m1, 0x31
+ vinserti128 m4, m5, xm1, 1
+ vperm2i128 m5, m7, m3, 0x31
+ vinserti128 m3, m7, xm3, 1
+ vinserti128 m1, m6, xm2, 1
+ vperm2i128 m6, m6, m2, 0x31
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0
+ punpcklwd m0, m3, m1
+ punpckhwd m3, m1
+ jmp m(idct_8x16_internal_8bpc).pass1_end2
+.pass2:
+ call m(iadst_8x16_internal_8bpc).main
+ call m(iadst_8x16_internal_8bpc).main_pass2_end
+ vpbroadcastd m8, [o(pw_2048)]
+ vpbroadcastd xm9, [o(pw_4096)]
+ psubw m8, m9
+ vpermq m9, m0, q3120
+ vpermq m0, m7, q2031
+ vpermq m7, m1, q3120
+ vpermq m1, m6, q2031
+ vpermq m6, m2, q3120
+ vpermq m2, m5, q2031
+ vpermq m5, m3, q3120
+ vpermq m3, m4, q2031
+ pmulhrsw m0, m8
+ pmulhrsw m1, m8
+ pmulhrsw m2, m8
+ pmulhrsw m3, m8
+ pmulhrsw m4, m5, m8
+ pmulhrsw m5, m6, m8
+ pmulhrsw m6, m7, m8
+ pmulhrsw m7, m9, m8
+ jmp m(idct_8x16_internal_8bpc).end3
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
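+ ; 16-point identity scaling: out = 2*in + in*1697/2048 (~2*sqrt(2)*in);
+ ; passing the optional pw_16384 constant halves the result to ~sqrt(2)*in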
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
+ pmulhrsw m%2, m%3, m%1
+%if %0 == 4 ; if downshifting by 1
+ pmulhrsw m%2, m%4
+%else
+ paddsw m%1, m%1
+%endif
+ paddsw m%1, m%2
+%endmacro
+
+cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ mova xm3, [cq+16*0]
+ mova xm2, [cq+16*2]
+ add cq, 16*8
+ vinserti128 m3, [cq+16*0], 1
+ vinserti128 m2, [cq+16*2], 1
+ vpbroadcastd m9, [o(pw_2896x8)]
+ mova xm4, [cq-16*4]
+ mova xm5, [cq-16*2]
+ vinserti128 m4, [cq+16*4], 1
+ vinserti128 m5, [cq+16*6], 1
+ mova xm7, [cq-16*7]
+ mova xm6, [cq-16*5]
+ vinserti128 m7, [cq+16*1], 1
+ vinserti128 m6, [cq+16*3], 1
+ mova xm8, [cq-16*3]
+ mova xm0, [cq-16*1]
+ vinserti128 m8, [cq+16*5], 1
+ vinserti128 m0, [cq+16*7], 1
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m5
+ punpckhwd m4, m5
+ punpcklwd m5, m7, m6
+ punpckhwd m7, m6
+ punpcklwd m6, m8, m0
+ punpckhwd m8, m0
+ REPX {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ punpckldq m4, m5, m6
+ punpckhdq m5, m6
+ punpckldq m6, m7, m8
+ punpckhdq m7, m8
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [o(pw_1697x16)]
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7
+ jmp m(idct_8x16_internal_8bpc).end
+
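+ ; adds two 16-pixel rows of 16-bit residuals to dst at the two given offsets
+ ; and stores the clipped 8-bit result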
+%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
+ pmovzxbw m%3, [dstq+%5]
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+ pmovzxbw m%4, [dstq+%6]
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ vpermq m%3, m%3, q3120
+ mova [dstq+%5], xm%3
+ vextracti128 [dstq+%6], m%3, 1
+%endmacro
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x4
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ or r3d, 4
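+ ; DC-only path: scale the single DC coefficient, broadcast it and add it to
+ ; every pixel of the block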
+.dconly:
+ pmulhrsw xm0, xm2
+ movd xm2, [pw_2048] ; intentionally rip-relative
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+ pxor m3, m3
+.dconly_loop:
+ mova xm1, [dstq+strideq*0]
+ vinserti128 m1, [dstq+strideq*1], 1
+ punpckhbw m2, m1, m3
+ punpcklbw m1, m3
+ paddw m2, m0
+ paddw m1, m0
+ packuswb m1, m2
+ mova [dstq+strideq*0], xm1
+ vextracti128 [dstq+strideq*1], m1, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
+
+cglobal idct_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova xm0, [cq+16*0]
+ mova xm1, [cq+16*1]
+ mova xm2, [cq+16*2]
+ mova xm3, [cq+16*3]
+ mova xm4, [cq+16*4]
+ mova xm5, [cq+16*5]
+ mova xm6, [cq+16*6]
+ mova xm7, [cq+16*7]
+ call m(idct_4x16_internal_8bpc).main
+ vinserti128 m6, m2, xm6, 1
+ vinserti128 m2, m0, xm4, 1
+ vinserti128 m0, m1, xm5, 1
+ vinserti128 m1, m3, xm7, 1
+ punpcklwd m3, m2, m6
+ punpckhwd m2, m6
+ vpbroadcastd m6, [o(pw_16384)]
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ mova m1, m6
+ jmp m(iadst_16x4_internal_8bpc).pass1_end
+.pass2:
+ call .main
+ jmp m(iadst_16x4_internal_8bpc).end
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m6, [o(pd_2048)]
+ IDCT4_1D 0, 1, 2, 3, 4, 5, 6
+ ret
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q1230
+ vpermq m3, [cq+32*3], q2103
+ vpermq m1, [cq+32*1], q1230
+ vpermq m2, [cq+32*2], q2103
+ call m(iadst_4x16_internal_8bpc).main2
+ call m(iadst_4x16_internal_8bpc).main_pass1_end
+ punpcklwd m4, m3, m1
+ punpcklwd m5, m2, m0
+ punpckhwd m0, m1
+ punpckhwd m2, m3
+ vpbroadcastd m1, [o(pw_16384)]
+ vinserti128 m3, m0, xm2, 1
+ vperm2i128 m2, m0, m2, 0x31
+ vinserti128 m0, m4, xm5, 1
+ vperm2i128 m4, m4, m5, 0x31
+ psubw m6, m7, m1
+.pass1_end:
+ pmulhrsw m3, m1
+ pmulhrsw m2, m6
+ pmulhrsw m4, m1
+ pmulhrsw m0, m6
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ vpbroadcastd m4, [o(pw_2048)]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ WIN64_RESTORE_XMM
+.end2:
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ mova [cq+32*2], m4
+ mova [cq+32*3], m4
+.end3:
+ WRITE_16X2 0, 1, 4, 5, strideq*0, strideq*1
+ lea dstq, [dstq+strideq*2]
+ WRITE_16X2 2, 3, 4, 5, strideq*0, strideq*1
+ RET
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m6, [o(pw_m3344_3344)]
+ vpbroadcastd m7, [o(pw_3803_1321)]
+ vpbroadcastd m8, [o(pw_m1321_2482)]
+ vpbroadcastd m9, [o(pw_2482_3344)]
+ punpcklwd m4, m2, m0 ; in2 in0 l
+ punpckhwd m2, m0 ; in2 in0 h
+ psrld m5, m6, 16
+ pmaddwd m10, m6, m4 ; t2:02 l
+ pmaddwd m6, m2 ; t2:02 h
+ pmaddwd m0, m7, m4 ; t0:02 l
+ pmaddwd m7, m2 ; t0:02 h
+ pmaddwd m4, m8 ; t1:02 l
+ pmaddwd m8, m2 ; t1:02 h
+ punpckhwd m2, m3, m1 ; in3 in1 h
+ punpcklwd m3, m1 ; in3 in1 l
+ pmaddwd m1, m5, m2 ; t2:3 h
+ pmaddwd m5, m3 ; t2:3 l
+ paddd m6, m1
+ vpbroadcastd m1, [o(pd_2048)]
+ paddd m10, m5
+ pmaddwd m5, m9, m3
+ pmaddwd m9, m2
+ paddd m0, m1
+ paddd m7, m1
+ paddd m0, m5 ; t0 + t3 + 2048 l
+ paddd m7, m9 ; t0 + t3 + 2048 h
+ vpbroadcastd m9, [o(pw_m3803_3344)]
+ pmaddwd m5, m9, m2
+ pmaddwd m9, m3
+ paddd m10, m1 ; t2 + 2048 l
+ paddd m6, m1 ; t2 + 2048 h
+ paddd m5, m1 ; t1:13 + 2048 h
+ paddd m1, m9 ; t1:13 + 2048 l
+ vpbroadcastd m9, [o(pw_m3803_m6688)]
+ pmaddwd m2, m9
+ pmaddwd m3, m9
+ paddd m5, m8 ; t1 + t3 + 2048 h
+ paddd m1, m4 ; t1 + t3 + 2048 l
+ paddd m8, m7
+ paddd m4, m0
+ paddd m2, m8 ; t0 + t1 - t3 + 2048 h
+ paddd m3, m4 ; t0 + t1 - t3 + 2048 l
+ REPX {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3
+ packssdw m0, m7
+ packssdw m1, m5
+ packssdw m3, m2
+ packssdw m2, m10, m6
+ ret
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q1230
+ vpermq m3, [cq+32*3], q2103
+ vpermq m1, [cq+32*1], q1230
+ vpermq m2, [cq+32*2], q2103
+ call m(iadst_4x16_internal_8bpc).main2
+ call m(iadst_4x16_internal_8bpc).main_pass1_end
+ punpckhwd m4, m3, m2
+ punpckhwd m5, m1, m0
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ vpbroadcastd m6, [o(pw_16384)]
+ vinserti128 m3, m0, xm1, 1
+ vperm2i128 m2, m0, m1, 0x31
+ vinserti128 m0, m4, xm5, 1
+ vperm2i128 m4, m4, m5, 0x31
+ psubw m1, m7, m6
+ jmp m(iadst_16x4_internal_8bpc).pass1_end
+ALIGN function_align
+.pass2:
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m4, [o(pw_2048)]
+ REPX {pmulhrsw x, m4}, m3, m2, m1, m0
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ mova [cq+32*2], m4
+ mova [cq+32*3], m4
+ WRITE_16X2 3, 2, 4, 5, strideq*0, strideq*1
+ lea dstq, [dstq+strideq*2]
+ WRITE_16X2 1, 0, 4, 5, strideq*0, strideq*1
+ RET
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova xm2, [cq+16*0]
+ mova xm4, [cq+16*1]
+ vinserti128 m2, [cq+16*4], 1
+ vinserti128 m4, [cq+16*5], 1
+ mova xm0, [cq+16*2]
+ mova xm1, [cq+16*3]
+ vinserti128 m0, [cq+16*6], 1
+ vinserti128 m1, [cq+16*7], 1
+ vpbroadcastd m7, [o(pw_1697x16)]
+ vpbroadcastd m8, [o(pw_16384)]
+ punpcklwd m3, m2, m4
+ punpckhwd m2, m4
+ punpcklwd m4, m0, m1
+ punpckhwd m0, m1
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0
+ pmulhrsw m0, m7, m1
+ pmulhrsw m5, m7, m2
+ pmulhrsw m6, m7, m3
+ pmulhrsw m7, m4
+ REPX {pmulhrsw x, m8}, m0, m5, m6, m7
+ paddsw m1, m0
+ paddsw m2, m5
+ paddsw m3, m6
+ paddsw m4, m7
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ jmp tx2q
+.pass2:
+ vpbroadcastd m7, [o(pw_1697x8)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(iadst_16x4_internal_8bpc).end
+
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x8
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 8
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+%endif
+%endmacro
+
+%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
+ vpbroadcastd m8, [o(pw_2896x8)]
+ vpermq m0, [cq+32*0], q3120
+ add cq, 32*4
+ vpermq m7, [cq+32*3], q%1
+ vpermq m1, [cq-32*3], q%1
+ vpermq m6, [cq+32*2], q3120
+ vpermq m2, [cq-32*2], q3120
+ vpermq m5, [cq+32*1], q%1
+ vpermq m3, [cq-32*1], q%1
+ vpermq m4, [cq+32*0], q3120
+ REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, identity
+
+cglobal idct_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_16X8_LOAD_COEFS 3120
+ call m(idct_8x16_internal_8bpc).main
+ vpbroadcastd m10, [o(pw_16384)]
+ punpckhwd m8, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m2, m1, m3
+ punpcklwd m1, m3
+ punpcklwd m9, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m5, m7
+ punpckhwd m5, m7
+ REPX {pmulhrsw x, m10}, m8, m1, m4, m6
+.pass1_end:
+ REPX {pmulhrsw x, m10}, m0, m2, m9, m5
+ punpckhwd m3, m0, m8
+ punpcklwd m0, m8
+ punpckhwd m8, m2, m1
+ punpcklwd m2, m1
+ punpcklwd m7, m9, m4
+ punpckhwd m9, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m8
+ punpckhdq m3, m8
+ punpckldq m6, m7, m4
+ punpckhdq m7, m4
+ punpckldq m8, m9, m5
+ punpckhdq m9, m5
+ vperm2i128 m4, m0, m6, 0x31
+ vinserti128 m0, xm6, 1
+ vperm2i128 m5, m1, m7, 0x31
+ vinserti128 m1, xm7, 1
+ vperm2i128 m6, m2, m8, 0x31
+ vinserti128 m2, xm8, 1
+ vperm2i128 m7, m3, m9, 0x31
+ vinserti128 m3, xm9, 1
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m8, [o(pw_2048)]
+.end:
+ REPX {pmulhrsw x, m8}, m0, m2, m4, m6
+.end2:
+ REPX {pmulhrsw x, m8}, m1, m3, m5, m7
+ lea r3, [strideq*3]
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r3
+.end3:
+ pxor m0, m0
+ REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+.end4:
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r3
+ RET
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m10, [o(pd_2048)]
+.main2:
+ IDCT8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_16X8_LOAD_COEFS 1302
+ call m(iadst_8x16_internal_8bpc).main2
+ call m(iadst_8x16_internal_8bpc).main_pass1_end
+ psubw m11, m9, m10
+ punpcklwd m8, m0, m2
+ punpckhwd m0, m2
+ punpckhwd m2, m1, m3
+ punpcklwd m1, m3
+ punpcklwd m9, m4, m6
+ punpckhwd m4, m6
+ punpckhwd m6, m5, m7
+ punpcklwd m5, m7
+ REPX {pmulhrsw x, m11}, m8, m1, m4, m6
+ jmp m(idct_16x8_internal_8bpc).pass1_end
+ALIGN function_align
+.pass2:
+ call .main
+ call .main_pass2_end
+ pxor m8, m8
+ psubw m8, m9
+ REPX {pmulhrsw x, m9}, m0, m2, m4, m6
+ jmp m(idct_16x8_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m10, [o(pd_2048)]
+ ITX_MULSUB_2W 7, 0, 8, 9, 10, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
+ ITX_MULSUB_2W 1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
+ ITX_MULSUB_2W 5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
+ psubsw m8, m2, m6 ; t6
+ paddsw m2, m6 ; t2
+ psubsw m6, m0, m4 ; t4
+ paddsw m0, m4 ; t0
+ psubsw m4, m5, m1 ; t7
+ paddsw m5, m1 ; t3
+ psubsw m1, m7, m3 ; t5
+ paddsw m7, m3 ; t1
+ ITX_MULSUB_2W 6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
+ psubsw m9, m6, m8 ; t7
+ paddsw m6, m8 ; out6
+ psubsw m3, m7, m5 ; t3
+ paddsw m7, m5 ; -out7
+ psubsw m5, m0, m2 ; t2
+ paddsw m0, m2 ; out0
+ psubsw m2, m1, m4 ; t6
+ paddsw m1, m4 ; -out1
+ ret
+ALIGN function_align
+.main_pass1_end:
+ vpbroadcastd m11, [o(pw_m2896_2896)]
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ punpckhwd m4, m3, m5
+ punpcklwd m3, m5
+ pmaddwd m5, m11, m4
+ pmaddwd m4, m12
+ pmaddwd m8, m11, m3
+ pmaddwd m3, m12
+ REPX {paddd x, m10}, m5, m4, m8, m3
+ REPX {psrad x, 12 }, m5, m8, m4, m3
+ packssdw m3, m4 ; -out3
+ packssdw m4, m8, m5 ; out4
+ punpcklwd m5, m9, m2
+ punpckhwd m9, m2
+ pmaddwd m2, m12, m5
+ pmaddwd m5, m11
+ pmaddwd m12, m9
+ pmaddwd m11, m9
+ REPX {paddd x, m10}, m2, m5, m12, m11
+ REPX {psrad x, 12 }, m2, m12, m5, m11
+ packssdw m2, m12 ; out2
+ packssdw m5, m11 ; -out5
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ vpbroadcastd m8, [o(pw_2896x8)]
+ psubsw m4, m5, m3
+ paddsw m3, m5
+ psubsw m5, m2, m9
+ paddsw m2, m9
+ pmulhrsw m2, m8 ; out2
+ pmulhrsw m3, m8 ; -out3
+ pmulhrsw m4, m8 ; out4
+ pmulhrsw m5, m8 ; -out5
+ vpbroadcastd m9, [o(pw_2048)]
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_16X8_LOAD_COEFS 1302
+ call m(iadst_8x16_internal_8bpc).main2
+ call m(iadst_8x16_internal_8bpc).main_pass1_end
+ psubw m9, m10
+ punpcklwd m8, m6, m4
+ punpckhwd m6, m4
+ punpcklwd m4, m7, m5
+ punpckhwd m7, m5
+ punpckhwd m5, m3, m1
+ punpcklwd m3, m1
+ punpckhwd m1, m2, m0
+ punpcklwd m2, m0
+ REPX {pmulhrsw x, m10}, m8, m4, m5, m1
+ REPX {pmulhrsw x, m9 }, m6, m7, m3, m2
+ punpcklwd m0, m7, m4
+ punpckhwd m7, m4
+ punpckhwd m4, m6, m8
+ punpcklwd m6, m8
+ punpckhwd m8, m3, m5
+ punpcklwd m3, m5
+ punpcklwd m5, m2, m1
+ punpckhwd m2, m1
+ punpckhdq m1, m0, m6
+ punpckldq m0, m6
+ punpckldq m6, m7, m4
+ punpckhdq m7, m4
+ punpckhdq m4, m3, m5
+ punpckldq m3, m5
+ punpckldq m5, m8, m2
+ punpckhdq m8, m2
+ vinserti128 m2, m6, xm5, 1
+ vperm2i128 m6, m5, 0x31
+ vperm2i128 m5, m1, m4, 0x31
+ vinserti128 m1, xm4, 1
+ vperm2i128 m4, m0, m3, 0x31
+ vinserti128 m0, xm3, 1
+ vinserti128 m3, m7, xm8, 1
+ vperm2i128 m7, m8, 0x31
+ jmp tx2q
+.pass2:
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+ pxor m8, m8
+ psubw m8, m9
+ pmulhrsw m10, m7, m8
+ pmulhrsw m7, m0, m9
+ pmulhrsw m0, m6, m9
+ pmulhrsw m6, m1, m8
+ pmulhrsw m1, m5, m8
+ pmulhrsw m5, m2, m9
+ pmulhrsw m2, m4, m9
+ pmulhrsw m4, m3, m8
+ lea r3, [strideq*3]
+ WRITE_16X2 10, 0, 8, 9, strideq*0, strideq*1
+ WRITE_16X2 1, 2, 0, 1, strideq*2, r3
+ jmp m(idct_16x8_internal_8bpc).end3
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ mova xm7, [cq+16*0]
+ mova xm2, [cq+16*1]
+ add cq, 16*8
+ vpbroadcastd m3, [o(pw_2896x8)]
+ vinserti128 m7, [cq+16*0], 1
+ vinserti128 m2, [cq+16*1], 1
+ mova xm6, [cq-16*6]
+ mova xm4, [cq-16*5]
+ vinserti128 m6, [cq+16*2], 1
+ vinserti128 m4, [cq+16*3], 1
+ mova xm8, [cq-16*4]
+ mova xm5, [cq-16*3]
+ vinserti128 m8, [cq+16*4], 1
+ vinserti128 m5, [cq+16*5], 1
+ mova xm0, [cq-16*2]
+ mova xm1, [cq-16*1]
+ vinserti128 m0, [cq+16*6], 1
+ vinserti128 m1, [cq+16*7], 1
+ vpbroadcastd m10, [o(pw_1697x16)]
+ vpbroadcastd m11, [o(pw_16384)]
+ REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1
+ punpcklwd m3, m7, m2
+ punpckhwd m7, m2
+ punpcklwd m2, m6, m4
+ punpckhwd m6, m4
+ punpcklwd m4, m8, m5
+ punpckhwd m8, m5
+ punpcklwd m5, m0, m1
+ punpckhwd m0, m1
+ punpckldq m1, m3, m2
+ punpckhdq m3, m2
+ punpckldq m2, m4, m5
+ punpckhdq m4, m5
+ punpckldq m5, m7, m6
+ punpckhdq m7, m6
+ punpckldq m6, m8, m0
+ punpckhdq m8, m0
+ REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m6
+ punpckhqdq m5, m6
+ punpcklqdq m6, m7, m8
+ punpckhqdq m7, m8
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [o(pw_4096)]
+ jmp m(idct_16x8_internal_8bpc).end
+
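+ ; rebase the o() constant addressing for the functions that follow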
+%define o_base pw_5 + 128
+
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x16
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 16
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+%endif
+%endmacro
+
+%macro ITX_16X16_LOAD_COEFS 0
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ add cq, 32*8
+ mova m4, [cq-32*4]
+ mova m5, [cq-32*3]
+ mova m6, [cq-32*2]
+ mova m7, [cq-32*1]
+ mova m8, [cq+32*0]
+ mova m9, [cq+32*1]
+ mova m10, [cq+32*2]
+ mova m11, [cq+32*3]
+ mova m12, [cq+32*4]
+ mova m13, [cq+32*5]
+ mova m14, [cq+32*6]
+ mova m15, [cq+32*7]
+ mova [rsp], m15
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, identity
+
+cglobal idct_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+ ITX_16X16_LOAD_COEFS
+ call .main
+.pass1_end:
+ vpbroadcastd m1, [o(pw_8192)]
+ REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+ vextracti128 [rsp+16*5], m8, 1
+ mova [rsp+16*1], xm8
+.pass1_end2:
+ vextracti128 [rsp+16*4], m0, 1
+ mova [rsp+16*0], xm0
+ REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
+ pmulhrsw m1, [rsp+32*1]
+ vperm2i128 m8, m1, m9, 0x31
+ vinserti128 m1, xm9, 1
+ vperm2i128 m9, m2, m10, 0x31
+ vinserti128 m2, xm10, 1
+ vperm2i128 m10, m3, m11, 0x31
+ vinserti128 m3, xm11, 1
+ vperm2i128 m11, m4, m12, 0x31
+ vinserti128 m4, xm12, 1
+ vperm2i128 m12, m5, m13, 0x31
+ vinserti128 m5, xm13, 1
+ vperm2i128 m13, m6, m14, 0x31
+ vinserti128 m6, xm14, 1
+ vperm2i128 m14, m7, m15, 0x31
+ vinserti128 m7, xm15, 1
+ mova m15, [rsp+32*2]
+.pass1_end3:
+ punpcklwd m0, m9, m10
+ punpckhwd m9, m10
+ punpcklwd m10, m15, m8
+ punpckhwd m15, m8
+ punpckhwd m8, m11, m12
+ punpcklwd m11, m12
+ punpckhwd m12, m13, m14
+ punpcklwd m13, m14
+ punpckhdq m14, m11, m13
+ punpckldq m11, m13
+ punpckldq m13, m15, m9
+ punpckhdq m15, m9
+ punpckldq m9, m10, m0
+ punpckhdq m10, m0
+ punpckhdq m0, m8, m12
+ punpckldq m8, m12
+ punpcklqdq m12, m13, m8
+ punpckhqdq m13, m8
+ punpcklqdq m8, m9, m11
+ punpckhqdq m9, m11
+ punpckhqdq m11, m10, m14
+ punpcklqdq m10, m14
+ punpcklqdq m14, m15, m0
+ punpckhqdq m15, m0
+ mova m0, [rsp]
+ mova [rsp], m15
+ punpckhwd m15, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m15, m1
+ punpckhdq m15, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m15
+ punpcklqdq m6, m15
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ vpbroadcastd m1, [o(pw_2048)]
+ REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+ mova [rsp], m6
+.end2:
+ REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
+ pmulhrsw m1, [rsp+32*1]
+ lea r3, [strideq*3]
+ WRITE_16X2 0, 1, 6, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 [rsp], 7, 0, 1, strideq*2, r3
+.end3:
+ pxor m2, m2
+ REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 8, 9, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 10, 11, 0, 1, strideq*2, r3
+ REPX {mova [cq+32*x], m2}, 0, 1, 2, 3, 4, 5, 6, 7
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 12, 13, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 14, 15, 0, 1, strideq*2, r3
+ RET
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m15, [o(pd_2048)]
+ mova [rsp+gprsize+32*1], m1
+ mova [rsp+gprsize+32*2], m9
+ IDCT8_1D 0, 2, 4, 6, 8, 10, 12, 14, 1, 9, 15
+ mova m1, [rsp+gprsize+32*2] ; in9
+ mova [rsp+gprsize+32*2], m14 ; tmp7
+ mova m9, [rsp+gprsize+32*1] ; in1
+ mova [rsp+gprsize+32*1], m10 ; tmp5
+ mova m14, [rsp+gprsize+32*0] ; in15
+ mova [rsp+gprsize+32*0], m6 ; tmp3
+ IDCT16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15
+ mova m6, [rsp+gprsize+32*1] ; tmp5
+ psubsw m15, m0, m14 ; out15
+ paddsw m0, m14 ; out0
+ psubsw m14, m2, m13 ; out14
+ paddsw m2, m13 ; out1
+ mova [rsp+gprsize+32*1], m2
+ psubsw m13, m4, m11 ; out13
+ paddsw m2, m4, m11 ; out2
+ psubsw m11, m8, m7 ; out11
+ paddsw m4, m8, m7 ; out4
+ mova m7, [rsp+gprsize+32*2] ; tmp7
+ psubsw m10, m6, m5 ; out10
+ paddsw m5, m6 ; out5
+ psubsw m8, m7, m9 ; out8
+ paddsw m7, m9 ; out7
+ psubsw m9, m12, m3 ; out9
+ paddsw m6, m12, m3 ; out6
+ mova m3, [rsp+gprsize+32*0] ; tmp3
+ psubsw m12, m3, m1 ; out12
+ paddsw m3, m1 ; out3
+ ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+ ITX_16X16_LOAD_COEFS
+ call .main
+ call .main_pass1_end
+ pmulhrsw m0, m1, [cq+32*0]
+ pmulhrsw m2, m1, [cq+32*1]
+ REPX {pmulhrsw x, m1}, m4, m6, m8, m10
+ pmulhrsw m12, m1, [cq+32*2]
+ pmulhrsw m14, m1, [cq+32*3]
+ vextracti128 [rsp+16*5], m8, 1
+ mova [rsp+16*1], xm8
+ pxor m8, m8
+ psubw m1, m8, m1
+ jmp m(idct_16x16_internal_8bpc).pass1_end2
+ALIGN function_align
+.pass2:
+ call .main
+ call .main_pass2_end
+ REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+ mova [rsp+32*0], m6
+ pxor m6, m6
+ psubw m1, m6, m1
+ jmp m(idct_16x16_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m15, [o(pd_2048)]
+ mova [rsp+gprsize+32*1], m0
+ mova [rsp+gprsize+32*2], m4
+ ITX_MULSUB_2W 13, 2, 0, 4, 15, 995, 3973 ; t3, t2
+ ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290 ; t7, t6
+ ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106 ; t11, t10
+ ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601 ; t15, t14
+ psubsw m0, m2, m10 ; t10a
+ paddsw m2, m10 ; t2a
+ psubsw m10, m13, m5 ; t11a
+ paddsw m13, m5 ; t3a
+ psubsw m5, m6, m14 ; t14a
+ paddsw m6, m14 ; t6a
+ psubsw m14, m9, m1 ; t15a
+ paddsw m9, m1 ; t7a
+ ITX_MULSUB_2W 0, 10, 1, 4, 15, 3406, 2276 ; t11, t10
+ ITX_MULSUB_2W 14, 5, 1, 4, 15, 2276, 3406 ; t14, t15
+ psubsw m1, m10, m14 ; t14a
+ paddsw m10, m14 ; t10a
+ psubsw m14, m0, m5 ; t15a
+ paddsw m0, m5 ; t11a
+ psubsw m5, m2, m6 ; t6
+ paddsw m2, m6 ; t2
+ psubsw m6, m13, m9 ; t7
+ paddsw m13, m9 ; t3
+ ITX_MULSUB_2W 6, 5, 4, 9, 15, 3784, 1567 ; t6a, t7a
+ ITX_MULSUB_2W 14, 1, 4, 9, 15, 3784, 1567 ; t14, t15
+ mova m9, [rsp+gprsize+32*0] ; in15
+ mova [rsp+gprsize+32*0], m10 ; t10a
+ mova m4, [rsp+gprsize+32*1] ; in0
+ mova [rsp+gprsize+32*1], m6 ; t6a
+ mova m6, [rsp+gprsize+32*2] ; in4
+ mova [rsp+gprsize+32*2], m2 ; t2
+ ITX_MULSUB_2W 9, 4, 2, 10, 15, 201, 4091 ; t1, t0
+ ITX_MULSUB_2W 11, 6, 2, 10, 15, 1751, 3703 ; t5, t4
+ ITX_MULSUB_2W 7, 8, 2, 10, 15, 3035, 2751 ; t9, t8
+ ITX_MULSUB_2W 3, 12, 2, 10, 15, 3857, 1380 ; t13, t12
+ psubsw m10, m4, m8 ; t8a
+ paddsw m8, m4 ; t0a
+ psubsw m4, m9, m7 ; t9a
+ paddsw m9, m7 ; t1a
+ psubsw m7, m6, m12 ; t12a
+ paddsw m6, m12 ; t4a
+ psubsw m12, m11, m3 ; t13a
+ paddsw m11, m3 ; t5a
+ ITX_MULSUB_2W 10, 4, 2, 3, 15, 799, 4017 ; t9, t8
+ ITX_MULSUB_2W 12, 7, 2, 3, 15, 4017, 799 ; t12, t13
+ psubsw m3, m9, m11 ; t5
+ paddsw m9, m11 ; t1
+ psubsw m11, m4, m12 ; t12a
+ paddsw m4, m12 ; t8a
+ paddsw m12, m8, m6 ; t0
+ psubsw m8, m6 ; t4
+ paddsw m6, m10, m7 ; t9a
+ psubsw m10, m7 ; t13a
+ ITX_MULSUB_2W 8, 3, 2, 7, 15, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2W 11, 10, 2, 7, 15, 1567, 3784 ; t13, t12
+ mova m7, [rsp+gprsize+32*0] ; t10a
+ mova m2, [rsp+gprsize+32*1] ; t6a
+ paddsw m15, m9, m13 ; -out15
+ psubsw m9, m13 ; t3a
+ paddsw m13, m11, m1 ; -out13
+ psubsw m11, m1 ; t15a
+ psubsw m1, m4, m7 ; t10
+ paddsw m7, m4 ; -out1
+ psubsw m4, m3, m2 ; t6
+ paddsw m3, m2 ; -out3
+ paddsw m2, m10, m14 ; out2
+ psubsw m10, m14 ; t14a
+ paddsw m14, m6, m0 ; out14
+ psubsw m6, m0 ; t11
+ mova m0, [rsp+gprsize+32*2] ; t2
+ mova [rsp+gprsize+32*1], m7
+ psubsw m7, m12, m0 ; t2a
+ paddsw m0, m12 ; out0
+ paddsw m12, m8, m5 ; out12
+ psubsw m8, m5 ; t7
+ ret
+ALIGN function_align
+.main_pass1_end:
+ mova [cq+32*0], m0
+ mova [cq+32*1], m2
+ mova [cq+32*2], m12
+ mova [cq+32*3], m14
+ vpbroadcastd m14, [pw_m2896_2896]
+ vpbroadcastd m12, [pw_2896_2896]
+ vpbroadcastd m2, [pd_2048]
+ punpcklwd m5, m11, m10
+ punpckhwd m11, m10
+ pmaddwd m10, m14, m5
+ pmaddwd m0, m14, m11
+ pmaddwd m5, m12
+ pmaddwd m11, m12
+ REPX {paddd x, m2}, m10, m0, m5, m11
+ REPX {psrad x, 12}, m10, m0, m5, m11
+ packssdw m10, m0 ; out10
+ packssdw m5, m11 ; -out5
+ punpcklwd m11, m8, m4
+ punpckhwd m8, m4
+ pmaddwd m4, m12, m11
+ pmaddwd m0, m12, m8
+ pmaddwd m11, m14
+ pmaddwd m8, m14
+ REPX {paddd x, m2}, m4, m0, m11, m8
+ REPX {psrad x, 12}, m4, m0, m11, m8
+ packssdw m4, m0 ; out4
+ packssdw m11, m8 ; -out11
+ punpcklwd m8, m9, m7
+ punpckhwd m9, m7
+ pmaddwd m7, m12, m8
+ pmaddwd m0, m12, m9
+ pmaddwd m8, m14
+ pmaddwd m9, m14
+ REPX {paddd x, m2}, m7, m0, m8, m9
+ REPX {psrad x, 12}, m7, m0, m8, m9
+ packssdw m7, m0 ; -out7
+ packssdw m8, m9 ; out8
+ punpckhwd m0, m6, m1
+ punpcklwd m6, m1
+ pmaddwd m1, m14, m0
+ pmaddwd m9, m14, m6
+ pmaddwd m0, m12
+ pmaddwd m6, m12
+ REPX {paddd x, m2}, m1, m9, m0, m6
+ REPX {psrad x, 12}, m1, m9, m0, m6
+ packssdw m9, m1 ; -out9
+ packssdw m6, m0 ; out6
+ vpbroadcastd m1, [o(pw_8192)]
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to
+ ; 16-bit here will produce the same result as using 32-bit intermediates.
+ paddsw m5, m10, m11 ; -out5
+ psubsw m10, m11 ; out10
+ psubsw m11, m4, m8 ; -out11
+ paddsw m4, m8 ; out4
+ psubsw m8, m7, m9 ; out8
+ paddsw m7, m9 ; -out7
+ psubsw m9, m1, m6 ; -out9
+ paddsw m6, m1 ; out6
+ vpbroadcastd m1, [o(pw_2896x8)]
+ REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11
+ vpbroadcastd m1, [o(pw_2048)]
+ ret
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+ ITX_16X16_LOAD_COEFS
+ call m(iadst_16x16_internal_8bpc).main
+ call m(iadst_16x16_internal_8bpc).main_pass1_end
+ pmulhrsw m6, m1
+ pmulhrsw m2, m1, m8
+ mova [rsp+32*2], m6
+ pmulhrsw m6, m1, m4
+ pmulhrsw m4, m1, m10
+ pmulhrsw m8, m1, [cq+32*3]
+ pmulhrsw m10, m1, [cq+32*2]
+ pmulhrsw m12, m1, [cq+32*1]
+ pmulhrsw m14, m1, [cq+32*0]
+ pxor m0, m0
+ psubw m0, m1
+ REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15
+ pmulhrsw m1, m0, m9
+ pmulhrsw m9, m0, m13
+ pmulhrsw m0, [rsp+32*1]
+ mova [rsp+16*0], xm15
+ mova [rsp+16*1], xm7
+ vperm2i128 m15, m15, m7, 0x31
+ vinserti128 m7, m2, xm14, 1
+ vperm2i128 m14, m2, m14, 0x31
+ vinserti128 m2, m9, xm5, 1
+ vperm2i128 m9, m9, m5, 0x31
+ vinserti128 m5, m4, xm12, 1
+ vperm2i128 m12, m4, m12, 0x31
+ vinserti128 m4, m11, xm3, 1
+ vperm2i128 m11, m11, m3, 0x31
+ vinserti128 m3, m10, xm6, 1
+ vperm2i128 m10, m10, m6, 0x31
+ vinserti128 m6, m1, xm0, 1
+ vperm2i128 m13, m1, m0, 0x31
+ vinserti128 m1, m8, [rsp+32*2], 1
+ vperm2i128 m8, m8, [rsp+32*2], 0x31
+ jmp m(idct_16x16_internal_8bpc).pass1_end3
+.pass2:
+ call m(iadst_16x16_internal_8bpc).main
+ call m(iadst_16x16_internal_8bpc).main_pass2_end
+ pmulhrsw m0, m1
+ pmulhrsw m8, m1
+ mova [rsp+32*0], m0
+ mova [rsp+32*2], m8
+ pxor m0, m0
+ psubw m0, m1
+ pmulhrsw m8, m0, m7
+ pmulhrsw m7, m0, m9
+ pmulhrsw m9, m1, m6
+ pmulhrsw m6, m1, m10
+ pmulhrsw m10, m0, m5
+ pmulhrsw m5, m0, m11
+ pmulhrsw m11, m1, m4
+ pmulhrsw m4, m1, m12
+ pmulhrsw m12, m0, m3
+ pmulhrsw m3, m0, m13
+ pmulhrsw m13, m1, m2
+ pmulhrsw m1, m14
+ pmulhrsw m14, m0, [rsp+32*1]
+ pmulhrsw m0, m15
+ lea r3, [strideq*3]
+ WRITE_16X2 0, 1, 2, 0, strideq*0, strideq*1
+ mova m15, [rsp+32*0]
+ WRITE_16X2 3, 4, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 5, 6, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3
+ jmp m(idct_16x16_internal_8bpc).end3
+
+%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
+ pmulhrsw m%2, m%3, m%1
+ psraw m%2, 1
+ pavgw m%1, m%2 ; signs are guaranteed to be equal
+%endmacro
+
+INV_TXFM_16X16_FN identity, dct
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+ vpbroadcastd m7, [o(pw_1697x16)]
+ mova xm0, [cq+16* 0]
+ vinserti128 m0, [cq+16*16], 1
+ mova xm15, [cq+16* 1]
+ vinserti128 m15, [cq+16*17], 1
+ mova xm1, [cq+16* 2]
+ vinserti128 m1, [cq+16*18], 1
+ mova xm8, [cq+16* 3]
+ vinserti128 m8, [cq+16*19], 1
+ mova xm2, [cq+16* 4]
+ vinserti128 m2, [cq+16*20], 1
+ mova xm9, [cq+16* 5]
+ vinserti128 m9, [cq+16*21], 1
+ mova xm3, [cq+16* 6]
+ vinserti128 m3, [cq+16*22], 1
+ mova xm10, [cq+16* 7]
+ add cq, 16*16
+ vinserti128 m10, [cq+16* 7], 1
+ mova xm4, [cq-16* 8]
+ vinserti128 m4, [cq+16* 8], 1
+ mova xm11, [cq-16* 7]
+ vinserti128 m11, [cq+16* 9], 1
+ mova xm5, [cq-16* 6]
+ vinserti128 m5, [cq+16*10], 1
+ mova xm12, [cq-16* 5]
+ vinserti128 m12, [cq+16*11], 1
+ mova xm13, [cq-16* 3]
+ vinserti128 m13, [cq+16*13], 1
+ mova xm14, [cq-16* 1]
+ vinserti128 m14, [cq+16*15], 1
+ REPX {IDTX16B x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \
+ 10, 4, 11, 5, 12, 13, 14
+ mova xm6, [cq-16* 4]
+ vinserti128 m6, [cq+16*12], 1
+ mova [rsp], m0
+ IDTX16B 6, 0, 7
+ mova xm0, [cq-16* 2]
+ vinserti128 m0, [cq+16*14], 1
+ pmulhrsw m7, m0
+ psraw m7, 1
+ pavgw m7, m0
+ jmp m(idct_16x16_internal_8bpc).pass1_end3
+ALIGN function_align
+.pass2:
+ vpbroadcastd m15, [o(pw_1697x16)]
+ mova [rsp+32*1], m0
+ REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14
+ mova m0, [rsp+32*1]
+ mova [rsp+32*1], m1
+ IDTX16 0, 1, 15
+ mova m1, [rsp+32*0]
+ pmulhrsw m15, m1
+ paddsw m1, m1
+ paddsw m15, m1
+ jmp m(idct_16x16_internal_8bpc).end
+
+%define o_base deint_shuf + 128
+
+%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
+%if %3
+ vpbroadcastd m15, [o(pw_2896x8)]
+ pmulhrsw m0, m15, [%1+%2*0]
+ pmulhrsw m1, m15, [%1+%2*1]
+ pmulhrsw m2, m15, [%1+%2*2]
+ pmulhrsw m3, m15, [%1+%2*3]
+ pmulhrsw m4, m15, [%1+%2*4]
+ pmulhrsw m5, m15, [%1+%2*5]
+ pmulhrsw m6, m15, [%1+%2*6]
+ pmulhrsw m7, m15, [%1+%2*7]
+%else
+ mova m0, [%1+%2*0]
+ mova m1, [%1+%2*1]
+ mova m2, [%1+%2*2]
+ mova m3, [%1+%2*3]
+ mova m4, [%1+%2*4]
+ mova m5, [%1+%2*5]
+ mova m6, [%1+%2*6]
+ mova m7, [%1+%2*7]
+%endif
+%endmacro
+
+%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2
+%if %3
+%if %3 == 1
+ vpbroadcastd m15, [o(pw_2896x8)]
+%endif
+ pmulhrsw m8, m15, [%1+%2*0]
+ pmulhrsw m9, m15, [%1+%2*1]
+ pmulhrsw m10, m15, [%1+%2*2]
+ pmulhrsw m11, m15, [%1+%2*3]
+ pmulhrsw m12, m15, [%1+%2*4]
+ pmulhrsw m13, m15, [%1+%2*5]
+ pmulhrsw m14, m15, [%1+%2*6]
+ pmulhrsw m15, [%1+%2*7]
+%else
+ mova m8, [%1+%2*0]
+ mova m9, [%1+%2*1]
+ mova m10, [%1+%2*2]
+ mova m11, [%1+%2*3]
+ mova m12, [%1+%2*4]
+ mova m13, [%1+%2*5]
+ mova m14, [%1+%2*6]
+ mova m15, [%1+%2*7]
+%endif
+%endmacro
+
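+; duplicate each source word and multiply the copies by two interleaved
+; constant pairs, producing packed pairs such as t16a/t31a for the odd half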
+%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]
+ vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%4_%5x8]
+ punpcklwd m%1, m%2, m%2
+ pmulhrsw m%1, m%3
+ vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%6_%7x8]
+ punpckhwd m%2, m%2
+ pmulhrsw m%2, m%3
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob
+ %undef cmp
+ cmp eobd, 106
+ jle .fast
+ LOAD_8ROWS cq+32*1, 32*2
+ call m(idct_16x8_internal_8bpc).main
+ vperm2i128 m11, m0, m4, 0x31
+ vinserti128 m0, xm4, 1
+ vperm2i128 m4, m1, m5, 0x31
+ vinserti128 m1, xm5, 1
+ vperm2i128 m5, m2, m6, 0x31
+ vinserti128 m2, xm6, 1
+ vperm2i128 m6, m3, m7, 0x31
+ vinserti128 m3, xm7, 1
+ pxor m7, m7
+ REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15
+ punpckhwd m7, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpcklwd m3, m11, m4
+ punpckhwd m11, m4
+ punpckhwd m4, m5, m6
+ punpcklwd m5, m6
+ punpckhdq m6, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m5
+ punpckhdq m3, m5
+ punpckhdq m5, m11, m4
+ punpckldq m11, m4
+ punpckldq m4, m7, m1
+ punpckhdq m7, m1
+ punpckhqdq m12, m6, m0
+ punpcklqdq m0, m6 ; out4
+ punpckhqdq m13, m7, m4
+ punpcklqdq m4, m7 ; out5
+ punpckhqdq m14, m3, m2
+ punpcklqdq m2, m3 ; out6
+ punpckhqdq m15, m5, m11
+ punpcklqdq m11, m5 ; out7
+ mova [rsp+32*0], m0
+ mova [rsp+32*1], m4
+ mova [rsp+32*2], m2
+.fast:
+ LOAD_8ROWS cq+32*0, 32*2
+ call m(idct_16x8_internal_8bpc).main
+ vperm2i128 m8, m0, m4, 0x31
+ vinserti128 m0, xm4, 1
+ vperm2i128 m4, m1, m5, 0x31
+ vinserti128 m1, xm5, 1
+ vperm2i128 m5, m2, m6, 0x31
+ vinserti128 m2, xm6, 1
+ vperm2i128 m6, m3, m7, 0x31
+ vinserti128 m3, xm7, 1
+ vpbroadcastd m9, [o(pw_8192)]
+ pxor m7, m7
+ REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14
+ punpckhwd m7, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m8, m4
+ punpcklwd m8, m4
+ punpckhwd m4, m5, m6
+ punpcklwd m5, m6
+ punpckhdq m6, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m8, m5
+ punpckhdq m8, m5
+ punpckhdq m5, m3, m4
+ punpckldq m3, m4
+ punpckhdq m4, m7, m1
+ punpckldq m7, m1
+ punpcklqdq m1, m7, m4
+ punpckhqdq m7, m4 ; out9
+ punpckhqdq m4, m2, m8 ; out10
+ punpcklqdq m2, m8
+ punpckhqdq m8, m3, m5
+ punpcklqdq m3, m5
+ punpckhqdq m5, m0, m6 ; out8
+ punpcklqdq m0, m6
+ REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7
+ cmp eobd, 106
+ jg .full
+ mova [rsp+32*0], m5
+ mova [rsp+32*1], m7
+ mova [rsp+32*2], m4
+ pmulhrsw m11, m9, m8
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call .main_fast
+ jmp .pass2
+.dconly:
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
+.full:
+ REPX {pmulhrsw x, m9}, m12, m13, m14, m15
+ pmulhrsw m6, m9, [rsp+32*2]
+ mova [rsp+32*2], m4
+ pmulhrsw m4, m9, [rsp+32*0]
+ mova [rsp+32*0], m5
+ pmulhrsw m5, m9, [rsp+32*1]
+ mova [rsp+32*1], m7
+ pmulhrsw m7, m9, m11
+ pmulhrsw m11, m9, m8
+ call .main
+.pass2:
+ vpbroadcastd m12, [o(pw_2048)]
+ REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m13, m14, m15
+ pmulhrsw m12, [rsp]
+ REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14
+ REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15
+ mova [rsp+32*0], m4
+ mova [rsp+32*1], m6
+ lea r3, [strideq*3]
+ WRITE_8X4 0, 1, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 2, 3, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 [rsp+32*0], 5, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 [rsp+32*1], 7, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 8, 9, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 10, 11, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 12, 13, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 14, 15, 4, 6
+ RET
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+ call m(idct_8x16_internal_8bpc).main
+ mova m8, [rsp+gprsize+0*32]
+ mova [rsp+gprsize+0*32], m0
+ mova m9, [rsp+gprsize+1*32]
+ mova [rsp+gprsize+1*32], m1
+ mova m0, [rsp+gprsize+2*32]
+ mova [rsp+gprsize+2*32], m6
+ lea r5, [r6-(o_base)+pw_201_4091x8]
+ ITX_UNPACK_MULHRSW 1, 8, 6, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
+ ITX_UNPACK_MULHRSW 15, 9, 6, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+ ITX_UNPACK_MULHRSW 14, 0, 6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
+ ITX_UNPACK_MULHRSW 13, 11, 6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
+ jmp .main2
+ALIGN function_align
+cglobal_label .main
+ call m(idct_8x16_internal_8bpc).main
+ mova m8, [rsp+gprsize+0*32]
+ mova [rsp+gprsize+0*32], m0
+ mova m9, [rsp+gprsize+1*32]
+ mova [rsp+gprsize+1*32], m1
+ mova m0, [rsp+gprsize+2*32]
+ mova [rsp+gprsize+2*32], m6
+ punpcklwd m1, m15, m8 ; in31 in1
+ punpckhwd m8, m15 ; in3 in29
+ punpcklwd m15, m14, m9 ; in27 in5
+ punpckhwd m9, m14 ; in7 in25
+ punpcklwd m14, m13, m0 ; in23 in9
+ punpckhwd m0, m13 ; in11 in21
+ punpcklwd m13, m12, m11 ; in19 in13
+ punpckhwd m11, m12 ; in15 in17
+ ITX_MUL2X_PACK 1, 6, 12, 10, 201, 4091, 3 ; t16a, t31a
+ ITX_MUL2X_PACK 8, 6, 12, 10, 4052, 601, 3 ; t23a, t24a
+ ITX_MUL2X_PACK 15, 6, 12, 10, 995, 3973, 3 ; t20a, t27a
+ ITX_MUL2X_PACK 9, 6, 12, 10, 3857, 1380, 3 ; t19a, t28a
+ ITX_MUL2X_PACK 14, 6, 12, 10, 1751, 3703, 3 ; t18a, t29a
+ ITX_MUL2X_PACK 0, 6, 12, 10, 3513, 2106, 3 ; t21a, t26a
+ ITX_MUL2X_PACK 13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a
+ ITX_MUL2X_PACK 11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a
+.main2:
+ psubsw m6, m1, m11 ; t17 t30
+ paddsw m1, m11 ; t16 t31
+ psubsw m11, m9, m14 ; t18 t29
+ paddsw m9, m14 ; t19 t28
+ psubsw m14, m15, m0 ; t21 t26
+ paddsw m15, m0 ; t20 t27
+ psubsw m0, m8, m13 ; t22 t25
+ paddsw m8, m13 ; t23 t24
+ ITX_MUL2X_PACK 6, 12, 13, 10, 799, 4017, 3 ; t17a t30a
+ ITX_MUL2X_PACK 11, 12, 13, 10, m4017, 799, 3 ; t18a t29a
+ ITX_MUL2X_PACK 14, 12, 13, 10, 3406, 2276, 3 ; t21a t26a
+ ITX_MUL2X_PACK 0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
+ psubsw m13, m1, m9 ; t19a t28a
+ paddsw m1, m9 ; t16a t31a
+ psubsw m9, m8, m15 ; t20a t27a
+ paddsw m8, m15 ; t23a t24a
+ psubsw m15, m6, m11 ; t18 t29
+ paddsw m6, m11 ; t17 t30
+ psubsw m11, m0, m14 ; t21 t26
+ paddsw m0, m14 ; t22 t25
+ ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 3 ; t18a t29a
+ ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 3 ; t19 t28
+ ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 3 ; t20 t27
+ ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a
+ vbroadcasti128 m12, [o(deint_shuf)]
+ psubsw m14, m1, m8 ; t23 t24
+ paddsw m1, m8 ; t16 t31
+ psubsw m8, m6, m0 ; t22a t25a
+ paddsw m6, m0 ; t17a t30a
+ psubsw m0, m15, m11 ; t21 t26
+ paddsw m15, m11 ; t18 t29
+ psubsw m11, m13, m9 ; t20a t27a
+ paddsw m13, m9 ; t19a t28a
+ REPX {pshufb x, m12}, m1, m6, m15, m13
+ ITX_MUL2X_PACK 14, 9, 12, 10, 2896, 2896 ; t24a t23a
+ vpbroadcastd m9, [o(pw_m2896_2896)]
+ ITX_MUL2X_PACK 8, 12, _, 10, 12, 9, 4 ; t22 t25
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ ITX_MUL2X_PACK 0, 12, _, 10, 12, 9, 4 ; t21a t26a
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ ITX_MUL2X_PACK 11, 9, _, 10, 9, 12, 4 ; t27 t20
+ shufps m9, m14, m8, q1032 ; t23a t22
+ vpblendd m14, m8, 0xcc ; t24a t25
+ shufps m8, m11, m0, q1032 ; t20 t21a
+ vpblendd m11, m0, 0xcc ; t27 t26a
+ punpcklqdq m0, m1, m6 ; t16 t17a
+ punpckhqdq m1, m6 ; t31 t30a
+ psubsw m10, m5, m8 ; out20 out21
+ paddsw m5, m8 ; out11 out10
+ psubsw m6, m3, m14 ; out24 out25
+ paddsw m3, m14 ; out7 out6
+ psubsw m8, m7, m0 ; out16 out17
+ paddsw m7, m0 ; out15 out14
+ mova m0, [rsp+gprsize+0*32]
+ punpcklqdq m12, m13, m15 ; t19a t18
+ punpckhqdq m13, m15 ; t28a t29
+ psubsw m15, m0, m1 ; out31 out30
+ paddsw m0, m1 ; out0 out1
+ mova m1, [rsp+gprsize+1*32]
+ mova [rsp+gprsize+0*32], m6
+ mova m6, [rsp+gprsize+2*32]
+ psubsw m14, m1, m13 ; out28 out29
+ paddsw m1, m13 ; out3 out2
+ psubsw m13, m2, m11 ; out27 out26
+ paddsw m2, m11 ; out4 out5
+ psubsw m11, m4, m9 ; out23 out22
+ paddsw m4, m9 ; out8 out9
+ psubsw m9, m6, m12 ; out19 out18
+ paddsw m6, m12 ; out12 out13
+ ret
+
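+; combine two 16-byte coefficient rows into one ymm register, with each
+; row contributing half of each 128-bit lane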
+%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
+ vbroadcasti128 m%1, [cq+16*%3]
+ vbroadcasti128 m%2, [cq+16*%4]
+ shufpd m%1, m%2, 0x0c
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 8
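+; dc-only path, shared with the other 32-column sizes: broadcast the
+; scaled dc value and add it to r3d rows of 32 pixels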
+.dconly:
+ pmulhrsw xm0, xm2
+ movd xm2, [pw_2048] ; intentionally rip-relative
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+ pxor m3, m3
+.dconly_loop:
+ mova m1, [dstq]
+ punpckhbw m2, m1, m3
+ punpcklbw m1, m3
+ paddw m2, m0
+ paddw m1, m0
+ packuswb m1, m2
+ mova [dstq], m1
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+.normal:
+ PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob
+ %undef cmp
+ LOAD_PACKED_16X2 0, 7, 0, 2 ; in0 in2
+ LOAD_PACKED_16X2 4, 7, 1, 3 ; in1 in3
+ LOAD_PACKED_16X2 1, 7, 4, 6 ; in4 in6
+ LOAD_PACKED_16X2 5, 7, 5, 7 ; in5 in7
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
+ add cq, 16*16
+ LOAD_PACKED_16X2 2, 7, -8, -6 ; in8 in10
+ LOAD_PACKED_16X2 6, 7, -7, -5 ; in9 in11
+ LOAD_PACKED_16X2 3, 7, -4, -2 ; in12 in14
+ LOAD_PACKED_16X2 11, 7, -3, -1 ; in13 in15
+ REPX {mova [cq+32*x], m8}, -4, -3, -2, -1
+ mova [rsp+32*0], m4
+ mova [rsp+32*1], m5
+ mova [rsp+32*2], m6
+ cmp eobd, 106
+ jg .full
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
+ jmp .pass2
+.full:
+ LOAD_PACKED_16X2 4, 7, 0, 2 ; in16 in18
+ LOAD_PACKED_16X2 12, 7, 3, 1 ; in19 in17
+ LOAD_PACKED_16X2 5, 7, 4, 6 ; in20 in22
+ LOAD_PACKED_16X2 13, 7, 7, 5 ; in23 in21
+ REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
+ add cq, 16*8
+ LOAD_PACKED_16X2 6, 7, 0, 2 ; in24 in26
+ LOAD_PACKED_16X2 14, 7, 3, 1 ; in27 in25
+ LOAD_PACKED_16X2 7, 8, 4, 6 ; in28 in30
+ LOAD_PACKED_16X2 15, 8, 7, 5 ; in31 in29
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main
+.pass2:
+ vpbroadcastd m12, [o(pw_8192)]
+ REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15
+ mova [rsp+32*1], m9
+ mova [rsp+32*2], m10
+ punpckhwd m9, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m2, m1, m3
+ punpcklwd m1, m3
+ punpcklwd m10, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m5, m7
+ punpckhwd m5, m7
+ punpckhwd m3, m0, m9
+ punpcklwd m0, m9
+ punpckhwd m9, m2, m1
+ punpcklwd m2, m1
+ punpcklwd m7, m10, m4
+ punpckhwd m10, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m9
+ punpckhdq m3, m9
+ punpckldq m6, m7, m4
+ punpckhdq m7, m4
+ punpckldq m9, m10, m5
+ punpckhdq m10, m5
+ REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10
+ pmulhrsw m12, [rsp+32*0]
+ mova [rsp+32*0], m8
+ vperm2i128 m4, m0, m6, 0x31
+ vinserti128 m0, xm6, 1
+ vperm2i128 m5, m1, m7, 0x31
+ vinserti128 m1, xm7, 1
+ vperm2i128 m6, m2, m9, 0x31
+ vinserti128 m2, xm9, 1
+ vperm2i128 m7, m3, m10, 0x31
+ vinserti128 m3, xm10, 1
+ call m(idct_16x8_internal_8bpc).main
+ vpbroadcastd m8, [o(pw_2048)]
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ lea r2, [strideq*3]
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r2
+ lea r3, [dstq+strideq*4]
+ %define dstq r3
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r2
+ mova m0, [rsp+32*0]
+ mova m1, [rsp+32*1]
+ mova m2, [rsp+32*2]
+ punpckhwd m7, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m2, m1, m11
+ punpcklwd m1, m11
+ punpckhwd m4, m12, m14
+ punpcklwd m12, m14
+ punpckhwd m5, m13, m15
+ punpcklwd m13, m15
+ punpckhwd m3, m0, m7
+ punpcklwd m0, m7
+ punpckhwd m9, m2, m1
+ punpcklwd m2, m1
+ punpcklwd m7, m12, m4
+ punpckhwd m12, m4
+ punpcklwd m4, m5, m13
+ punpckhwd m5, m13
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m9
+ punpckhdq m3, m9
+ punpckldq m6, m7, m4
+ punpckhdq m7, m4
+ punpckldq m9, m12, m5
+ punpckhdq m12, m5
+ vperm2i128 m4, m0, m6, 0x31
+ vinserti128 m0, xm6, 1
+ vperm2i128 m5, m1, m7, 0x31
+ vinserti128 m1, xm7, 1
+ vperm2i128 m6, m2, m9, 0x31
+ vinserti128 m2, xm9, 1
+ vperm2i128 m7, m3, m12, 0x31
+ vinserti128 m3, xm12, 1
+ call m(idct_16x8_internal_8bpc).main2
+ vpbroadcastd m8, [o(pw_2048)]
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ add r0, 16
+ add r3, 16
+ %define dstq r0
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r2
+ %define dstq r3
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r2
+ RET
+
+cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 5, 11, dst, stride, c, eob
+ vpbroadcastd m9, [pw_5]
+ lea r4, [strideq*3]
+ sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107)
+.loop:
+ mova xm0, [cq+16* 0]
+ mova xm1, [cq+16* 4]
+ vinserti128 m0, [cq+16* 1], 1
+ vinserti128 m1, [cq+16* 5], 1
+ pxor m8, m8
+ mova [cq+32*0], m8
+ mova [cq+32*2], m8
+ add cq, 16*16
+ mova xm2, [cq-16* 8]
+ mova xm3, [cq-16* 4]
+ vinserti128 m2, [cq-16* 7], 1
+ vinserti128 m3, [cq-16* 3], 1
+ mova xm4, [cq+16* 0]
+ mova xm5, [cq+16* 4]
+ vinserti128 m4, [cq+16* 1], 1
+ vinserti128 m5, [cq+16* 5], 1
+ mova xm6, [cq+16* 8]
+ mova xm7, [cq+16*12]
+ vinserti128 m6, [cq+16* 9], 1
+ vinserti128 m7, [cq+16*13], 1
+ REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6
+ REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ call .transpose8x8
+ REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
+ WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4
+ add dstq, strideq
+ WRITE_8X4 1, 5, 0, 4, strideq*8, strideq*4, r4*4
+ add dstq, strideq
+ WRITE_8X4 2, 6, 0, 4, strideq*8, strideq*4, r4*4
+ add dstq, strideq
+ WRITE_8X4 3, 7, 0, 4, strideq*8, strideq*4, r4*4
+ add dstq, strideq
+ sub cq, 16*16-32
+ lea dstq, [dstq+r4*4]
+ add eobd, 0x80000000
+ jnc .loop
+ RET
+ALIGN function_align
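+; in-register 8x8 word transpose of m0-m7 (clobbers m8)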
+.transpose8x8:
+ punpckhwd m8, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m8, m1
+ punpckhdq m8, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m8
+ punpcklqdq m6, m8
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 10, dst, stride, c, eob
+ add cq, 16*8
+ vpbroadcastd m9, [pw_4096]
+ lea r4, [strideq*3]
+ lea r5, [dstq+strideq*4]
+ sub eobd, 107
+.loop:
+ mova xm0, [cq-16*8]
+ mova xm1, [cq-16*7]
+ vinserti128 m0, [cq+16*0], 1
+ vinserti128 m1, [cq+16*1], 1
+ mova xm2, [cq-16*6]
+ mova xm3, [cq-16*5]
+ vinserti128 m2, [cq+16*2], 1
+ vinserti128 m3, [cq+16*3], 1
+ mova xm4, [cq-16*4]
+ mova xm5, [cq-16*3]
+ vinserti128 m4, [cq+16*4], 1
+ vinserti128 m5, [cq+16*5], 1
+ mova xm6, [cq-16*2]
+ mova xm7, [cq-16*1]
+ vinserti128 m6, [cq+16*6], 1
+ vinserti128 m7, [cq+16*7], 1
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r4
+ %define dstq r5
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r4
+ add cq, 16*16
+ add r0, 16
+ add r5, 16
+ add eobd, 0x80000000
+ jnc .loop
+ RET
+
+%define o_base pw_5 + 128
+
+%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs
+%if %3
+ vpbroadcastd m15, [o(pw_2896x8)]
+ pmulhrsw m0, m15, [%1+%2* 0]
+ pmulhrsw m1, m15, [%1+%2* 1]
+ pmulhrsw m2, m15, [%1+%2* 2]
+ pmulhrsw m3, m15, [%1+%2* 3]
+ pmulhrsw m4, m15, [%1+%2* 4]
+ pmulhrsw m5, m15, [%1+%2* 5]
+ pmulhrsw m6, m15, [%1+%2* 6]
+ pmulhrsw m7, m15, [%1+%2* 7]
+ pmulhrsw m8, m15, [%1+%2* 8]
+ pmulhrsw m9, m15, [%1+%2* 9]
+ pmulhrsw m10, m15, [%1+%2*10]
+ pmulhrsw m11, m15, [%1+%2*11]
+ pmulhrsw m12, m15, [%1+%2*12]
+ pmulhrsw m13, m15, [%1+%2*13]
+ pmulhrsw m14, m15, [%1+%2*14]
+ pmulhrsw m15, [%1+%2*15]
+%else
+ mova m0, [%1+%2* 0]
+ mova m1, [%1+%2* 1]
+ mova m2, [%1+%2* 2]
+ mova m3, [%1+%2* 3]
+ mova m4, [%1+%2* 4]
+ mova m5, [%1+%2* 5]
+ mova m6, [%1+%2* 6]
+ mova m7, [%1+%2* 7]
+ mova m8, [%1+%2* 8]
+ mova m9, [%1+%2* 9]
+ mova m10, [%1+%2*10]
+ mova m11, [%1+%2*11]
+ mova m12, [%1+%2*12]
+ mova m13, [%1+%2*13]
+ mova m14, [%1+%2*14]
+ mova m15, [%1+%2*15]
+%endif
+ mova [rsp], m15
+%if %4
+ pxor m15, m15
+ REPX {mova [%1+%2*x], m15}, 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15
+%endif
+%endmacro
+
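+; final idct32 pass 2 butterfly: sum/diff a register with a buffered
+; partial result, round, and add the two outputs to the pixels at
+; dstq+offset1 and r2+offset2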
+%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2]
+ mova m%4, [%2]
+ paddsw m%3, m%1, m%4
+ psubsw m%1, m%4
+ pmovzxbw m%4, [dstq+%6]
+ pmulhrsw m%3, m%5
+ pmulhrsw m%1, m%5
+ paddw m%3, m%4
+ pmovzxbw m%4, [r2+%7]
+ paddw m%1, m%4
+ packuswb m%3, m%1
+ vpermq m%3, m%3, q3120
+ mova [dstq+%6], xm%3
+ vextracti128 [r2+%7], m%3, 1
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \
+ base, tmp3
+ %undef cmp
+ LOAD_16ROWS cq, 64, 1
+ call m(idct_16x16_internal_8bpc).main
+ lea tmp1q, [rsp+32*7]
+ lea tmp2q, [tmp1q+32*8]
+ lea tmp3q, [tmp1q+32*16]
+ mova m1, [rsp+32*1]
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m7
+ vpbroadcastd m7, [o(pw_16384)]
+ call .transpose_2x8x8_round
+ mova m15, [rsp+32*0]
+ mova [tmp3q-32*4+ 0], xm0
+ vextracti128 [tmp3q+32*0+ 0], m0, 1
+ mova [tmp3q-32*3+ 0], xm2
+ vextracti128 [tmp3q+32*1+ 0], m2, 1
+ mova [tmp3q-32*2+ 0], xm4
+ vextracti128 [tmp3q+32*2+ 0], m4, 1
+ mova [tmp3q-32*1+ 0], xm6
+ vextracti128 [tmp3q+32*3+ 0], m6, 1
+ mova [tmp3q-32*4+16], xm8
+ vextracti128 [tmp3q+32*0+16], m8, 1
+ mova [tmp3q-32*3+16], xm10
+ vextracti128 [tmp3q+32*1+16], m10, 1
+ mova [tmp3q-32*2+16], xm12
+ vextracti128 [tmp3q+32*2+16], m12, 1
+ mova [tmp3q-32*1+16], xm14
+ vextracti128 [tmp3q+32*3+16], m14, 1
+ cmp eobd, 150
+ jg .full
+ vinserti128 m0, m1, xm9, 1
+ vperm2i128 m4, m1, m9, 0x31
+ vinserti128 m2, m5, xm13, 1
+ vperm2i128 m6, m5, m13, 0x31
+ vinserti128 m1, m3, xm11, 1
+ vperm2i128 m5, m3, m11, 0x31
+ vinserti128 m3, m7, xm15, 1
+ vperm2i128 m7, m7, m15, 0x31
+ call .main_oddhalf_fast
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ jmp .idct16
+.dconly:
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+.full:
+ mova [tmp1q-32*4], m1
+ mova [tmp1q-32*3], m3
+ mova [tmp1q-32*2], m5
+ mova [tmp1q-32*1], m7
+ mova [tmp1q+32*0], m9
+ mova [tmp1q+32*1], m11
+ mova [tmp1q+32*2], m13
+ mova [tmp1q+32*3], m15
+ LOAD_16ROWS cq+32, 64, 1
+ call m(idct_16x16_internal_8bpc).main
+ lea r2, [tmp3q+32*8]
+ mova m1, [rsp+32*1]
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m7
+ vpbroadcastd m7, [o(pw_16384)]
+ call .transpose_2x8x8_round
+ mova m15, [rsp+32*0]
+ mova [r2-32*4+ 0], xm0
+ vextracti128 [r2+32*0+ 0], m0, 1
+ mova [r2-32*3+ 0], xm2
+ vextracti128 [r2+32*1+ 0], m2, 1
+ mova [r2-32*2+ 0], xm4
+ vextracti128 [r2+32*2+ 0], m4, 1
+ mova [r2-32*1+ 0], xm6
+ vextracti128 [r2+32*3+ 0], m6, 1
+ mova [r2-32*4+16], xm8
+ vextracti128 [r2+32*0+16], m8, 1
+ mova [r2-32*3+16], xm10
+ vextracti128 [r2+32*1+16], m10, 1
+ mova [r2-32*2+16], xm12
+ vextracti128 [r2+32*2+16], m12, 1
+ mova [r2-32*1+16], xm14
+ vextracti128 [r2+32*3+16], m14, 1
+ vinserti128 m8, m1, xm9, 1
+ vperm2i128 m12, m1, m9, 0x31
+ mova xm0, [tmp1q-32*4]
+ mova xm1, [tmp1q-32*3]
+ vinserti128 m0, [tmp1q+32*0], 1
+ vinserti128 m1, [tmp1q+32*1], 1
+ vinserti128 m10, m5, xm13, 1
+ vperm2i128 m14, m5, m13, 0x31
+ mova xm4, [tmp1q-32*4+16]
+ mova xm5, [tmp1q-32*3+16]
+ vinserti128 m4, [tmp1q+32*0+16], 1
+ vinserti128 m5, [tmp1q+32*1+16], 1
+ vinserti128 m9, m3, xm11, 1
+ vperm2i128 m13, m3, m11, 0x31
+ mova xm2, [tmp1q-32*2]
+ mova xm3, [tmp1q-32*1]
+ vinserti128 m2, [tmp1q+32*2], 1
+ vinserti128 m3, [tmp1q+32*3], 1
+ vinserti128 m11, m7, xm15, 1
+ vperm2i128 m15, m7, m15, 0x31
+ mova xm6, [tmp1q-32*2+16]
+ mova xm7, [tmp1q-32*1+16]
+ vinserti128 m6, [tmp1q+32*2+16], 1
+ vinserti128 m7, [tmp1q+32*3+16], 1
+ call .main_oddhalf
+ LOAD_8ROWS_H r2-32*4, 32
+.idct16:
+ LOAD_8ROWS tmp3q-32*4, 32
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+ call .pass2_end
+ RET
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; lower half is zero
+ mova [rsp+gprsize+32*1], m7
+ pxor m7, m7
+ mova [rsp+gprsize+32*0], m7
+ mova [rsp+gprsize+32*2], m7
+ vpbroadcastd m11, [o(pw_3703x8)]
+ vpbroadcastd m7, [o(pw_1751x8)]
+ vpbroadcastd m12, [o(pw_m1380x8)]
+ vpbroadcastd m8, [o(pw_3857x8)]
+ vpbroadcastd m13, [o(pw_3973x8)]
+ vpbroadcastd m15, [o(pw_995x8)]
+ pmulhrsw m11, m4 ; t29a
+ pmulhrsw m4, m7 ; t18a
+ pmulhrsw m12, m3 ; t19a
+ pmulhrsw m3, m8 ; t28a
+ pmulhrsw m13, m2 ; t27a
+ pmulhrsw m2, m15 ; t20a
+ vpbroadcastd m10, [o(pw_m2106x8)]
+ vpbroadcastd m7, [o(pw_3513x8)]
+ vpbroadcastd m9, [o(pw_3290x8)]
+ vpbroadcastd m8, [o(pw_2440x8)]
+ vpbroadcastd m14, [o(pw_m601x8)]
+ vpbroadcastd m15, [o(pw_4052x8)]
+ pmulhrsw m10, m5 ; t21a
+ pmulhrsw m5, m7 ; t26a
+ pmulhrsw m9, m6 ; t25a
+ pmulhrsw m6, m8 ; t22a
+ pmulhrsw m14, m1 ; t23a
+ pmulhrsw m1, m15 ; t24a
+ vpbroadcastd m15, [o(pd_2048)]
+ jmp .main2
+ALIGN function_align
+cglobal_label .main_oddhalf
+ mova [rsp+gprsize+32*0], m15
+ mova [rsp+gprsize+32*1], m7
+ mova [rsp+gprsize+32*2], m8
+ vpbroadcastd m15, [o(pd_2048)]
+ ITX_MULSUB_2W 4, 11, 7, 8, 15, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2W 12, 3, 7, 8, 15, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2W 2, 13, 7, 8, 15, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2W 10, 5, 7, 8, 15, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2W 6, 9, 7, 8, 15, 2440, 3290 ; t22a, t25a
+ ITX_MULSUB_2W 14, 1, 7, 8, 15, 4052, 601 ; t23a, t24a
+.main2:
+ psubsw m7, m12, m4 ; t18
+ paddsw m12, m4 ; t19
+ psubsw m4, m2, m10 ; t21
+ paddsw m2, m10 ; t20
+ psubsw m10, m14, m6 ; t22
+ paddsw m14, m6 ; t23
+ psubsw m6, m1, m9 ; t25
+ paddsw m1, m9 ; t24
+ psubsw m9, m13, m5 ; t26
+ paddsw m13, m5 ; t27
+ psubsw m5, m3, m11 ; t29
+ paddsw m3, m11 ; t28
+ ITX_MULSUB_2W 5, 7, 8, 11, 15, m4017, 799 ; t18a, t29a
+ ITX_MULSUB_2W 9, 4, 8, 11, 15, 3406, 2276 ; t21a, t26a
+ ITX_MULSUB_2W 6, 10, 8, 11, 15, m2276, 3406 ; t22a, t25a
+ psubsw m8, m14, m2 ; t20a
+ paddsw m14, m2 ; t23a
+ psubsw m2, m1, m13 ; t27a
+ paddsw m1, m13 ; t24a
+ psubsw m13, m6, m9 ; t21
+ paddsw m6, m9 ; t22
+ psubsw m9, m10, m4 ; t26
+ paddsw m10, m4 ; t25
+ ITX_MULSUB_2W 2, 8, 4, 11, 15, m3784, 1567 ; t20, t27
+ ITX_MULSUB_2W 9, 13, 4, 11, 15, m3784, 1567 ; t21a, t26a
+ mova m4, [rsp+gprsize+32*0] ; in31
+ mova [rsp+gprsize+32*0], m6 ; t22
+ mova m6, [rsp+gprsize+32*1] ; in15
+ mova [rsp+gprsize+32*1], m14 ; t23a
+ mova m14, [rsp+gprsize+32*2] ; in17
+ mova [rsp+gprsize+32*2], m1 ; t24a
+ ITX_MULSUB_2W 0, 4, 1, 11, 15, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2W 14, 6, 1, 11, 15, 3035, 2751 ; t17a, t30a
+ psubsw m1, m0, m14 ; t17
+ paddsw m0, m14 ; t16
+ psubsw m14, m4, m6 ; t30
+ paddsw m4, m6 ; t31
+ ITX_MULSUB_2W 14, 1, 6, 11, 15, 799, 4017 ; t17a, t30a
+ psubsw m6, m0, m12 ; t19a
+ paddsw m0, m12 ; t16a
+ psubsw m12, m4, m3 ; t28a
+ paddsw m4, m3 ; t31a
+ psubsw m3, m14, m5 ; t18
+ paddsw m14, m5 ; t17
+ psubsw m5, m1, m7 ; t29
+ paddsw m1, m7 ; t30
+ ITX_MULSUB_2W 5, 3, 7, 11, 15, 1567, 3784 ; t18a, t29a
+ ITX_MULSUB_2W 12, 6, 7, 11, 15, 1567, 3784 ; t19, t28
+ psubsw m7, m1, m10 ; t25a
+ paddsw m1, m10 ; t30a
+ psubsw m10, m5, m9 ; t21
+ paddsw m5, m9 ; t18
+ psubsw m9, m12, m2 ; t20a
+ paddsw m12, m2 ; t19a
+ psubsw m2, m3, m13 ; t26
+ paddsw m3, m13 ; t29
+ psubsw m13, m6, m8 ; t27a
+ paddsw m6, m8 ; t28a
+ mova [tmp1q-32*2], m5
+ mova [tmp1q-32*1], m12
+ mova [tmp2q+32*0], m6
+ mova [tmp2q+32*1], m3
+ mova [tmp2q+32*2], m1
+ mova m5, [rsp+gprsize+32*0] ; t22
+ mova m6, [rsp+gprsize+32*1] ; t23
+ mova m3, [rsp+gprsize+32*2] ; t24a
+ psubsw m1, m14, m5 ; t22a
+ paddsw m14, m5 ; t17a
+ psubsw m5, m0, m6 ; t23
+ paddsw m0, m6 ; t16
+ psubsw m6, m4, m3 ; t24
+ paddsw m4, m3 ; t31
+ vpbroadcastd m8, [o(pw_m2896_2896)]
+ vpbroadcastd m3, [o(pw_2896_2896)]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m14
+ mova [tmp2q+32*3], m4
+ ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8 ; t20, t27
+ ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8 ; t21a, t26a
+ ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8 ; t22, t25
+ ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8 ; t23a, t24a
+ mova [tmp1q+32*0], m13
+ mova [tmp1q+32*1], m2
+ mova [tmp1q+32*2], m7
+ mova [tmp1q+32*3], m6
+ mova [tmp2q-32*4], m5
+ mova [tmp2q-32*3], m1
+ mova [tmp2q-32*2], m10
+ mova [tmp2q-32*1], m9
+ ret
+ALIGN function_align
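+; transpose two 8x8 blocks of words (m0-m7 and m8-m15), scaling
+; everything by the rounding factor in m7 along the way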
+.transpose_2x8x8_round:
+ punpckhwd m6, m12, m13
+ punpcklwd m12, m13
+ punpckhwd m13, m8, m9
+ punpcklwd m8, m9
+ punpckhwd m9, m14, m15
+ punpcklwd m14, m15
+ punpckhwd m15, m10, m11
+ punpcklwd m10, m11
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5
+ punpckhdq m11, m8, m10
+ punpckldq m8, m10
+ punpckldq m10, m12, m14
+ punpckhdq m12, m14
+ punpckhdq m14, m13, m15
+ punpckldq m13, m15
+ punpckldq m15, m6, m9
+ punpckhdq m6, m9
+ punpckhqdq m9, m8, m10
+ punpcklqdq m8, m10
+ punpcklqdq m10, m11, m12
+ punpckhqdq m11, m12
+ punpcklqdq m12, m13, m15
+ punpckhqdq m13, m15
+ punpckhqdq m15, m14, m6
+ punpcklqdq m14, m6
+ pmulhrsw m6, m7, [rsp+gprsize+32*0]
+ REPX {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15
+ pmulhrsw m7, [rsp+gprsize+32*1]
+ mova [rsp+gprsize+32*0], m15
+ punpckhwd m15, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m15, m1
+ punpckhdq m15, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m15
+ punpcklqdq m6, m15
+ ret
+ALIGN function_align
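+; write out the 32 output rows; sums go to the upper half via dstq,
+; differences to the lower half via r2, stepping towards each other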
+.pass2_end:
+ mova [rsp+gprsize+32*0], m7
+ mova [rsp+gprsize+32*2], m15
+ vpbroadcastd m15, [o(pw_2048)]
+ IDCT32_PASS2_END 0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4
+ IDCT32_PASS2_END 4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8
+ IDCT32_PASS2_END 8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4
+ IDCT32_PASS2_END 12, tmp1q-32*1, 0, 4, 15, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m1, [rsp+gprsize+32*1]
+ IDCT32_PASS2_END 1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4
+ IDCT32_PASS2_END 5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8
+ IDCT32_PASS2_END 9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4
+ IDCT32_PASS2_END 13, tmp1q-32*2, 0, 4, 15, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ IDCT32_PASS2_END 2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4
+ IDCT32_PASS2_END 6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8
+ IDCT32_PASS2_END 10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4
+ IDCT32_PASS2_END 14, tmp1q-32*3, 0, 4, 15, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m7, [rsp+gprsize+32*0]
+ mova m1, [rsp+gprsize+32*2]
+ IDCT32_PASS2_END 3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4
+ IDCT32_PASS2_END 7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8
+ IDCT32_PASS2_END 11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4
+ IDCT32_PASS2_END 1, tmp1q-32*4, 0, 4, 15, r3*4, strideq*0
+ ret
+
+; Perform the final sumsub step and YMM lane shuffling
+%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2]
+ mova m%3, [tmp2q+32*( 3-%1)]
+ psubsw m%4, m%1, m%3
+ paddsw m%1, m%3
+ mova m%3, [tmp1q+32*(11-%2)]
+ mova [tmp1q+32*(11-%2)+16], xm%4
+ vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1
+ paddsw m%4, m%2, m%3
+ psubsw m%2, m%3
+ mova [tmp1q+32*(11-%2)], xm%2
+ vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
+ vperm2i128 m%2, m%1, m%4, 0x31
+ vinserti128 m%1, xm%4, 1
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 16
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
+.normal:
+ PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
+ vpbroadcastd m15, [o(pw_2896x8)]
+ pmulhrsw m0, m15, [cq+32* 1]
+ pmulhrsw m1, m15, [cq+32* 3]
+ pmulhrsw m2, m15, [cq+32* 5]
+ pmulhrsw m3, m15, [cq+32* 7]
+ pmulhrsw m4, m15, [cq+32* 9]
+ pmulhrsw m5, m15, [cq+32*11]
+ pmulhrsw m6, m15, [cq+32*13]
+ pmulhrsw m7, m15, [cq+32*15]
+ pmulhrsw m8, m15, [cq+32*17]
+ pmulhrsw m9, m15, [cq+32*19]
+ pmulhrsw m10, m15, [cq+32*21]
+ pmulhrsw m11, m15, [cq+32*23]
+ pmulhrsw m12, m15, [cq+32*25]
+ pmulhrsw m13, m15, [cq+32*27]
+ pmulhrsw m14, m15, [cq+32*29]
+ pmulhrsw m15, [cq+32*31]
+ lea tmp1q, [rsp+32*7]
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ LOAD_16ROWS cq+32*0, 32*2, 1, 0
+ pxor m15, m15
+ mov r3d, 8
+.zero_loop:
+ mova [cq+32*0], m15
+ mova [cq+32*1], m15
+ mova [cq+32*2], m15
+ mova [cq+32*3], m15
+ add cq, 32*4
+ dec r3d
+ jg .zero_loop
+ call m(idct_16x16_internal_8bpc).main
+ call .pass1_end
+ lea r2, [strideq*3]
+ mov r3, dstq
+.pass2:
+ vpbroadcastd m7, [o(pw_16384)]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+ call m(idct_16x16_internal_8bpc).main
+ mova [rsp+32*2], m15
+ vpbroadcastd m15, [o(pw_2048)]
+ REPX {pmulhrsw x, m15}, m2, m3, m0
+ WRITE_16X2 2, 3, 1, 2, strideq*2, r2
+ pmulhrsw m1, m15, [rsp+32*1]
+ WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1
+ lea dstq, [dstq+strideq*4]
+ REPX {pmulhrsw x, m15}, m4, m5, m6, m7
+ WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 2, 3, strideq*2, r2
+ lea dstq, [dstq+strideq*4]
+ REPX {pmulhrsw x, m15}, m8, m9, m10, m11
+ WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 10, 11, 2, 3, strideq*2, r2
+ lea dstq, [dstq+strideq*4]
+ REPX {pmulhrsw x, m15}, m11, m12, m13, m14
+ pmulhrsw m15, [rsp+32*2]
+ WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 14, 15, 2, 3, strideq*2, r2
+ test r3, r3
+ jnz .right_half
+ RET
+.right_half:
+ LOAD_8ROWS tmp1q-32*4, 32
+ LOAD_8ROWS_H tmp2q-32*4, 32
+ lea dstq, [r3+16]
+ xor r3d, r3d
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m7
+ jmp .pass2
+ALIGN function_align
+.pass1_end:
+ mova [rsp+gprsize+32*0], m9
+ IDCT32_PASS1_END 0, 8, 1, 9
+ IDCT32_PASS1_END 2, 10, 1, 9
+ IDCT32_PASS1_END 3, 11, 1, 9
+ IDCT32_PASS1_END 4, 12, 1, 9
+ IDCT32_PASS1_END 5, 13, 1, 9
+ IDCT32_PASS1_END 6, 14, 1, 9
+ IDCT32_PASS1_END 7, 15, 1, 9
+ mova m1, [rsp+gprsize+32*1]
+ mova m9, [rsp+gprsize+32*0]
+ mova [rsp+gprsize+32*0], m6
+ mova [rsp+gprsize+32*1], m7
+ IDCT32_PASS1_END 1, 9, 6, 7
+ ret
+
+cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 5, 13, dst, stride, c, eob
+%undef cmp
+ lea r6, [o_base]
+ vpbroadcastd m9, [o(pw_2896x8)]
+ vpbroadcastd m10, [o(pw_1697x16)]
+ vpbroadcastd m12, [o(pw_8192)]
+ cmp eobd, 43 ; if (eob > 43)
+ setg r4b ; iteration_count++
+ cmp eobd, 150 ; if (eob > 150)
+ setg al ; iteration_count++
+ add eobd, -279 ; if (eob > 278)
+ adc r4b, al ; iteration_count++
+ lea r3, [strideq*3]
+ mov r6, cq
+ paddw m11, m12, m12 ; pw_16384
+.loop:
+ mova xm0, [cq+64* 0]
+ mova xm1, [cq+64* 1]
+ vinserti128 m0, [cq+64* 8], 1
+ vinserti128 m1, [cq+64* 9], 1
+ mova xm2, [cq+64* 2]
+ mova xm3, [cq+64* 3]
+ vinserti128 m2, [cq+64*10], 1
+ vinserti128 m3, [cq+64*11], 1
+ mova xm4, [cq+64* 4]
+ mova xm5, [cq+64* 5]
+ vinserti128 m4, [cq+64*12], 1
+ vinserti128 m5, [cq+64*13], 1
+ mova xm6, [cq+64* 6]
+ mova xm7, [cq+64* 7]
+ vinserti128 m6, [cq+64*14], 1
+ vinserti128 m7, [cq+64*15], 1
+ REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ dec r4b
+ jge .loop
+ sub cq, 32
+ pxor m0, m0
+ mov r0d, 8
+ cmp cq, r6
+ ja .zero_loop
+.zero_loop_half:
+ mova [r6+64*0], m0
+ mova [r6+64*1], m0
+ add r6, 64*4
+ mova [r6-64*2], m0
+ mova [r6-64*1], m0
+ sub r0d, 2
+ jg .zero_loop_half
+ RET
+.zero_loop:
+ mova [r6+32*0], m0
+ mova [r6+32*1], m0
+ mova [r6+32*2], m0
+ mova [r6+32*3], m0
+ add r6, 32*4
+ dec r0d
+ jg .zero_loop
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 12, dst, stride, c, eob
+%undef cmp
+ lea r6, [o_base]
+ vpbroadcastd m9, [o(pw_2896x8)]
+ vpbroadcastd m10, [o(pw_1697x16)]
+ vpbroadcastd m11, [o(pw_2048)]
+ cmp eobd, 35 ; if (eob > 35)
+ setg r4b ; iteration_count++
+ cmp eobd, 150 ; if (eob > 150)
+ setg r3b ; iteration_count += 2
+ lea r4d, [r4+r3*2]
+ lea r3, [strideq*3]
+ mov r5, dstq
+ mov r6, cq
+.loop:
+ mova xm0, [cq+32* 0]
+ mova xm1, [cq+32* 1]
+ vinserti128 m0, [cq+32* 8], 1
+ vinserti128 m1, [cq+32* 9], 1
+ mova xm2, [cq+32* 2]
+ mova xm3, [cq+32* 3]
+ vinserti128 m2, [cq+32*10], 1
+ vinserti128 m3, [cq+32*11], 1
+ mova xm4, [cq+32* 4]
+ mova xm5, [cq+32* 5]
+ vinserti128 m4, [cq+32*12], 1
+ vinserti128 m5, [cq+32*13], 1
+ mova xm6, [cq+32* 6]
+ mova xm7, [cq+32* 7]
+ vinserti128 m6, [cq+32*14], 1
+ vinserti128 m7, [cq+32*15], 1
+ REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ dec r4b
+ jl .ret
+ test r4b, 1
+ jz .loop
+ add cq, 32*15
+ lea dstq, [r5+16]
+ jmp .loop
+.ret:
+ sub cd, eax ; eax == r6d (the cq value saved above)
+ pxor m0, m0
+ add cd, 384
+.zero_loop:
+ mova [r6+32*0], m0
+ mova [r6+32*1], m0
+ mova [r6+32*2], m0
+ mova [r6+32*3], m0
+ add r6, 32*4
+ sub cd, 128
+ jge .zero_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
+.normal:
+ PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \
+ base, tmp3, tmp4
+ %undef cmp
+ lea tmp1q, [rsp+32*7]
+ lea tmp2q, [tmp1q+32*8]
+ sub eobd, 136
+ mov tmp4d, eobd
+.pass1_loop:
+ LOAD_8ROWS cq+64*1, 64*2
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
+ test tmp4d, tmp4d
+ jl .fast
+ LOAD_8ROWS_H cq+64*17, 64*2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ LOAD_8ROWS_H cq+64*16, 64*2
+ pxor m0, m0
+ REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ mova [rsp], m15
+ jmp .idct16
+.fast:
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+.idct16:
+ LOAD_8ROWS cq+64*0, 64*2
+ pxor m15, m15
+ REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
+ call m(idct_16x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end
+ vpbroadcastd m7, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+ lea tmp3q, [tmp1q+32*32]
+ mova m15, [rsp]
+ mova [tmp3q-32*4], m0
+ mova [tmp3q-32*3], m2
+ mova [tmp3q-32*2], m4
+ mova [tmp3q-32*1], m6
+ mova [tmp3q+32*0], m8
+ mova [tmp3q+32*1], m10
+ mova [tmp3q+32*2], m12
+ mova [tmp3q+32*3], m14
+ add tmp3q, 32*8
+ mova [tmp3q-32*4], m1
+ mova [tmp3q-32*3], m3
+ mova [tmp3q-32*2], m5
+ mova [tmp3q-32*1], m7
+ mova [tmp3q+32*0], m9
+ mova [tmp3q+32*1], m11
+ mova [tmp3q+32*2], m13
+ mova [tmp3q+32*3], m15
+ vpbroadcastd m9, [o(pw_8192)]
+ pmulhrsw m0, m9, [tmp1q-32*4]
+ pmulhrsw m1, m9, [tmp1q-32*3]
+ pmulhrsw m2, m9, [tmp1q-32*2]
+ pmulhrsw m3, m9, [tmp1q-32*1]
+ pmulhrsw m4, m9, [tmp1q+32*0]
+ pmulhrsw m5, m9, [tmp1q+32*1]
+ pmulhrsw m6, m9, [tmp1q+32*2]
+ pmulhrsw m7, m9, [tmp1q+32*3]
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova [tmp1q-32*4], m0
+ pmulhrsw m0, m9, [tmp2q-32*4]
+ mova [tmp2q-32*4], m1
+ pmulhrsw m1, m9, [tmp2q-32*3]
+ mova [tmp1q-32*3], m2
+ pmulhrsw m2, m9, [tmp2q-32*2]
+ mova [tmp2q-32*3], m3
+ pmulhrsw m3, m9, [tmp2q-32*1]
+ mova [tmp1q-32*2], m4
+ pmulhrsw m4, m9, [tmp2q+32*0]
+ mova [tmp2q-32*2], m5
+ pmulhrsw m5, m9, [tmp2q+32*1]
+ mova [tmp1q-32*1], m6
+ pmulhrsw m6, m9, [tmp2q+32*2]
+ mova [tmp2q-32*1], m7
+ pmulhrsw m7, m9, [tmp2q+32*3]
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova [tmp1q+32*0], m0
+ mova [tmp2q+32*0], m1
+ mova [tmp1q+32*1], m2
+ mova [tmp2q+32*1], m3
+ mova [tmp1q+32*2], m4
+ mova [tmp2q+32*2], m5
+ mova [tmp1q+32*3], m6
+ mova [tmp2q+32*3], m7
+ add cq, 32
+ add tmp1q, 32*16
+ add tmp2q, 32*16
+ add eobd, 0x80000000
+ jnc .pass1_loop
+ add tmp1q, 32*24
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+ test tmp4d, tmp4d
+ jge .pass2_loop
+ add tmp1q, 32*16
+ add tmp2q, 32*16
+ add tmp3q, 32*16
+.pass2_loop:
+ LOAD_8ROWS tmp2q-32*4, 32
+ test tmp4d, tmp4d
+ jl .fast2
+ LOAD_8ROWS_H tmp3q-32*4, 32
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ sub tmp3q, 32*8
+ LOAD_8ROWS_H tmp3q-32*4, 32
+ sub tmp3q, 32*16
+ jmp .pass2_loop_end
+.fast2:
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ sub tmp3q, 32*24
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+.pass2_loop_end:
+ LOAD_8ROWS tmp3q-32*4, 32
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end
+ lea tmp3q, [tmp1q-32*32]
+ cmp tmp2q, tmp3q
+ jb .ret
+ sub tmp2q, 32*32
+ sub dstq, r3
+ lea r2, [r2+r3+16]
+ add dstq, 16
+ jmp .pass2_loop
+.ret:
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 10, dst, stride, c, eob
+ %undef cmp
+ vpbroadcastd m9, [pw_8192]
+ sub eobd, 136 ; if (eob < 136)
+ shr eobd, 30 ; topleft 16x16 only
+ lea eobd, [eobq*2-8]
+ lea r4, [strideq*3]
+ mov r5, dstq
+ lea r6, [cq+32]
+.loop:
+ mova xm0, [cq+64* 0]
+ mova xm1, [cq+64* 1]
+ vinserti128 m0, [cq+64* 8], 1
+ vinserti128 m1, [cq+64* 9], 1
+ mova xm2, [cq+64* 2]
+ mova xm3, [cq+64* 3]
+ vinserti128 m2, [cq+64*10], 1
+ vinserti128 m3, [cq+64*11], 1
+ mova xm4, [cq+64* 4]
+ mova xm5, [cq+64* 5]
+ vinserti128 m4, [cq+64*12], 1
+ vinserti128 m5, [cq+64*13], 1
+ mova xm6, [cq+64* 6]
+ mova xm7, [cq+64* 7]
+ vinserti128 m6, [cq+64*14], 1
+ vinserti128 m7, [cq+64*15], 1
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r4
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r4
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ inc eobd
+ jz .ret
+ test eobd, 3
+ jnz .loop
+ add cq, 64*15
+ lea dstq, [r5+16]
+ jmp .loop
+.ret:
+ pxor m0, m0
+ mov r0d, 16
+ cmp cq, r6
+ jne .zero_loop
+.zero_loop_topleft:
+ mova [r6-32*1], m0
+ mova [r6+32*1], m0
+ mova [r6+32*3], m0
+ mova [r6+32*5], m0
+ add r6, 64*4
+ sub r0d, 4
+ jg .zero_loop_topleft
+ RET
+.zero_loop:
+ mova [r6-32*1], m0
+ mova [r6+32*0], m0
+ mova [r6+32*1], m0
+ mova [r6+32*2], m0
+ add r6, 32*4
+ dec r0d
+ jg .zero_loop
+ RET
+
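+; combine the buffered idct16/idct32 results with the idct64 odd half;
+; the 6-argument form (pass 1) writes the outputs back to the temporary
+; buffers, the 10-argument form (pass 2) rounds and adds them to dst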
+%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
+%if %1 & 1
+ mova m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n
+ mova m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n
+%else
+ mova m%5, [tmp1q-32*(45-%1)]
+ mova m%4, [tmp2q-32*(20+%1)]
+%endif
+ psubsw m%6, m%5, m%4 ; idct32 out31-n
+ paddsw m%5, m%4 ; idct32 out 0+n
+ psubsw m%4, m%6, m%3 ; out32+n
+ paddsw m%6, m%3 ; out31-n
+ psubsw m%3, m%5, m%2 ; out63-n
+ paddsw m%5, m%2 ; out 0+n
+%if %0 == 6 ; pass 1
+%if %1 & 1
+ mova [tmp2q-32*(19-%1)], m%4
+ mova [tmp1q-32*(14+%1)], m%6
+ mova [tmp1q+32*(18-%1)], m%3
+ mova [tmp2q-32*(51-%1)], m%5
+%else
+ mova [tmp1q-32*(13-%1)], m%4
+ mova [tmp2q-32*(20+%1)], m%6
+ mova [tmp2q+32*(12-%1)], m%3
+ mova [tmp1q-32*(45-%1)], m%5
+%endif
+%else ; pass 2
+ REPX {pmulhrsw x, m14}, m%4, m%6, m%3, m%5
+%if %1 & 1
+ %define %%d0 r2
+ %define %%d1 dstq
+%else
+ %define %%d0 dstq
+ %define %%d1 r2
+%endif
+ pmovzxbw m%2, [%%d0+%9 ]
+ paddw m%2, m%4
+ pmovzxbw m%4, [%%d1+%8 ]
+ paddw m%4, m%6
+ pmovzxbw m%6, [%%d1+%10]
+ paddw m%3, m%6
+ pmovzxbw m%6, [%%d0+%7 ]
+ paddw m%5, m%6
+ packuswb m%2, m%4
+ packuswb m%3, m%5
+ vpermq m%2, m%2, q3120
+ vpermq m%3, m%3, q3120
+ mova [%%d0+%9 ], xm%2
+ vextracti128 [%%d1+%8 ], m%2, 1
+ mova [%%d1+%10], xm%3
+ vextracti128 [%%d0+%7 ], m%3, 1
+%endif
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+.normal:
+ PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
+ %undef cmp
+ lea tmp1q, [rsp+32*23]
+ lea tmp2q, [tmp1q+32*24]
+ sub eobd, 151
+ mov r7d, eobd
+.pass1_loop:
+ LOAD_16ROWS cq, 64
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m7
+ vpbroadcastd m7, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+ mova m15, [rsp+32*0]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m2
+ mova [tmp1q-32*2], m4
+ mova [tmp1q-32*1], m6
+ mova [tmp1q+32*0], m8
+ mova [tmp1q+32*1], m10
+ mova [tmp1q+32*2], m12
+ mova [tmp1q+32*3], m14
+ mova [tmp2q-32*4], m1
+ mova [tmp2q-32*3], m3
+ mova [tmp2q-32*2], m5
+ mova [tmp2q-32*1], m7
+ mova [tmp2q+32*0], m9
+ mova [tmp2q+32*1], m11
+ mova [tmp2q+32*2], m13
+ mova [tmp2q+32*3], m15
+ add cq, 32
+ add tmp1q, 32*8
+ add tmp2q, 32*8
+ add eobd, 0x80000000
+ jnc .pass1_loop
+ lea r2, [rsp+32*23]
+ mova xm0, [r2-32*4+ 0]
+ mova xm1, [r2-32*2+ 0]
+ vinserti128 m0, [r2+32*0+ 0], 1
+ vinserti128 m1, [r2+32*2+ 0], 1
+ mova xm2, [r2-32*4+16]
+ mova xm3, [r2-32*2+16]
+ vinserti128 m2, [r2+32*0+16], 1
+ vinserti128 m3, [r2+32*2+16], 1
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+ test r7d, r7d
+ jl .fast
+ lea r3, [r2+32*8]
+ mova xm4, [r3-32*4+ 0]
+ mova xm5, [r3-32*2+ 0]
+ vinserti128 m4, [r3+32*0+ 0], 1
+ vinserti128 m5, [r3+32*2+ 0], 1
+ mova xm6, [r3-32*4+16]
+ mova xm7, [r3-32*2+16]
+ vinserti128 m6, [r3+32*0+16], 1
+ vinserti128 m7, [r3+32*2+16], 1
+.fast:
+ mova [rsp], m8
+ lea tmp1q, [rsp+32*7]
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ mova xm0, [r2-32*3+ 0]
+ mova xm1, [r2-32*1+ 0]
+ vinserti128 m0, [r2+32*1+ 0], 1
+ vinserti128 m1, [r2+32*3+ 0], 1
+ mova xm2, [r2-32*3+16]
+ mova xm3, [r2-32*1+16]
+ vinserti128 m2, [r2+32*1+16], 1
+ vinserti128 m3, [r2+32*3+16], 1
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ test r7d, r7d
+ jl .fast2
+ mova xm4, [r3-32*3+ 0]
+ mova xm5, [r3-32*1+ 0]
+ vinserti128 m4, [r3+32*1+ 0], 1
+ vinserti128 m5, [r3+32*3+ 0], 1
+ mova xm6, [r3-32*3+16]
+ mova xm7, [r3-32*1+16]
+ vinserti128 m6, [r3+32*1+16], 1
+ vinserti128 m7, [r3+32*3+16], 1
+.fast2:
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ add r2, 32*24
+ vpbroadcastd m15, [o(pd_2048)]
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ mova xm0, [r2-32*4+ 0]
+ mova xm3, [r2-32*1+16]
+ vinserti128 m0, [r2+32*0+ 0], 1
+ vinserti128 m3, [r2+32*3+16], 1
+ mova xm4, [r2-32*4+16]
+ mova xm7, [r2-32*1+ 0]
+ vinserti128 m4, [r2+32*0+16], 1
+ vinserti128 m7, [r2+32*3+ 0], 1
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r7d, r7d
+ jl .fast3
+ add r3, 32*24
+ mova xm1, [r3-32*1+16]
+ mova xm2, [r3-32*4+ 0]
+ vinserti128 m1, [r3+32*3+16], 1
+ vinserti128 m2, [r3+32*0+ 0], 1
+ mova xm5, [r3-32*1+ 0]
+ mova xm6, [r3-32*4+16]
+ vinserti128 m5, [r3+32*3+ 0], 1
+ vinserti128 m6, [r3+32*0+16], 1
+.fast3:
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ mova xm0, [r2-32*2+ 0]
+ mova xm3, [r2-32*3+16]
+ vinserti128 m0, [r2+32*2+ 0], 1
+ vinserti128 m3, [r2+32*1+16], 1
+ mova xm4, [r2-32*2+16]
+ mova xm7, [r2-32*3+ 0]
+ vinserti128 m4, [r2+32*2+16], 1
+ vinserti128 m7, [r2+32*1+ 0], 1
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r7d, r7d
+ jl .fast4
+ mova xm1, [r3-32*3+16]
+ mova xm2, [r3-32*2+ 0]
+ vinserti128 m1, [r3+32*1+16], 1
+ vinserti128 m2, [r3+32*2+ 0], 1
+ mova xm5, [r3-32*3+ 0]
+ mova xm6, [r3-32*2+16]
+ vinserti128 m5, [r3+32*1+ 0], 1
+ vinserti128 m6, [r3+32*2+16], 1
+.fast4:
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
+ RET
+ALIGN function_align
+%define o_base idct64_mul - 8
+cglobal_label .main_part1
+ ; idct64 steps 1-5:
+ ; in1/31/17/15/ 9/23/25/ 7 ->
+ ; t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a
+ ; in5/27/21/11/13/19/29/ 3 ->
+ ; t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a
+ vpbroadcastd m11, [o(idct64_mul+4* 0)]
+ vpbroadcastd m13, [o(idct64_mul+4* 1)]
+ vpbroadcastd m10, [o(idct64_mul+4* 4)]
+ vpbroadcastd m12, [o(idct64_mul+4* 5)]
+ pmulhrsw m11, m0 ; t63a
+ pmulhrsw m0, m13 ; t32a
+ pmulhrsw m10, m1 ; t62a
+ pmulhrsw m1, m12 ; t33a
+ vpbroadcastd m9, [o(idct64_mul+4* 8)]
+ vpbroadcastd m13, [o(idct64_mul+4* 9)]
+ vpbroadcastd m8, [o(idct64_mul+4*12)]
+ vpbroadcastd m12, [o(idct64_mul+4*13)]
+ pmulhrsw m9, m2 ; t61a
+ pmulhrsw m2, m13 ; t34a
+ pmulhrsw m8, m3 ; t60a
+ pmulhrsw m3, m12 ; t35a
+ psubsw m12, m0, m1 ; t33
+ paddsw m0, m1 ; t32
+ psubsw m1, m3, m2 ; t34
+ paddsw m3, m2 ; t35
+ psubsw m2, m8, m9 ; t61
+ paddsw m8, m9 ; t60
+ psubsw m9, m11, m10 ; t62
+ paddsw m11, m10 ; t63
+ ITX_MULSUB_2W 2, 1, 10, 13, 15, m4076, 401 ; t34a, t61a
+ vpbroadcastd m14, [o(pw_401_4076)]
+ ITX_MULSUB_2W 9, 12, 10, 13, 15, 14, 13 ; t33a, t62a
+ psubsw m10, m0, m3 ; t35a
+ paddsw m0, m3 ; t32a
+ psubsw m3, m11, m8 ; t60a
+ paddsw m11, m8 ; t63a
+ psubsw m8, m9, m2 ; t34
+ paddsw m9, m2 ; t33
+ psubsw m2, m12, m1 ; t61
+ paddsw m12, m1 ; t62
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m9
+ mova [tmp2q+32*2], m12
+ mova [tmp2q+32*3], m11
+ vpbroadcastd m13, [o(pw_m4017_799)]
+ vpbroadcastd m14, [o(pw_799_4017)]
+ ITX_MULSUB_2W 2, 8, 0, 1, 15, 14, 13 ; t34a, t61a
+ ITX_MULSUB_2W 3, 10, 0, 1, 15, 14, 13 ; t35, t60
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp2q+32*0], m10
+ mova [tmp2q+32*1], m8
+ vpbroadcastd m3, [o(idct64_mul+4*16)]
+ vpbroadcastd m11, [o(idct64_mul+4*17)]
+ vpbroadcastd m2, [o(idct64_mul+4*20)]
+ vpbroadcastd m10, [o(idct64_mul+4*21)]
+ vpbroadcastd m1, [o(idct64_mul+4*24)]
+ vpbroadcastd m9, [o(idct64_mul+4*25)]
+ vpbroadcastd m0, [o(idct64_mul+4*28)]
+ vpbroadcastd m8, [o(idct64_mul+4*29)]
+ pmulhrsw m3, m4 ; t59a
+ pmulhrsw m4, m11 ; t36a
+ pmulhrsw m2, m5 ; t58a
+ pmulhrsw m5, m10 ; t37a
+ pmulhrsw m1, m6 ; t57a
+ pmulhrsw m6, m9 ; t38a
+ pmulhrsw m0, m7 ; t56a
+ pmulhrsw m7, m8 ; t39a
+ psubsw m8, m4, m5 ; t37
+ paddsw m4, m5 ; t36
+ psubsw m5, m7, m6 ; t38
+ paddsw m7, m6 ; t39
+ psubsw m6, m0, m1 ; t57
+ paddsw m0, m1 ; t56
+ psubsw m1, m3, m2 ; t58
+ paddsw m3, m2 ; t59
+ ITX_MULSUB_2W 6, 5, 2, 9, 15, m2598, 3166 ; t38a, t57a
+ vpbroadcastd m10, [o(pw_3166_2598)]
+ ITX_MULSUB_2W 1, 8, 2, 9, 15, 10, 9 ; t37a, t58a
+ psubsw m2, m7, m4 ; t36a
+ paddsw m7, m4 ; t39a
+ psubsw m4, m0, m3 ; t59a
+ paddsw m0, m3 ; t56a
+ psubsw m3, m6, m1 ; t37
+ paddsw m6, m1 ; t38
+ psubsw m1, m5, m8 ; t58
+ paddsw m5, m8 ; t57
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ mova [tmp2q-32*4], m0
+ mova [tmp2q-32*3], m5
+ vpbroadcastd m6, [o(pw_m799_m4017)]
+ vpbroadcastd m7, [o(pw_m4017_799)]
+ ITX_MULSUB_2W 4, 2, 0, 5, 15, 7, 6 ; t36, t59
+ ITX_MULSUB_2W 1, 3, 0, 5, 15, 7, 6 ; t37a, t58a
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m1
+ mova [tmp2q-32*2], m3
+ mova [tmp2q-32*1], m2
+ ret
+%define o_base pw_5 + 128
+.main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub
+ sub r6, o_idct64_offset + 8
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ vpbroadcastd m13, [o(pw_2896_2896)]
+ vpbroadcastd m14, [o(pw_m2896_2896)]
+.main_part2_pass1_loop:
+ call .main_part2_internal
+ IDCT64_PART2_END 0, 7, 0, 6, 9, 10
+ IDCT64_PART2_END 7, 8, 5, 0, 6, 7
+ IDCT64_PART2_END 8, 2, 1, 0, 6, 7
+ IDCT64_PART2_END 15, 3, 4, 0, 6, 7
+ cmp tmp1q, tmp2q
+ jne .main_part2_pass1_loop
+ ret
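+; idct64 steps 6-9 for one group of eight odd-half coefficients,
+; shared by the pass 1 and pass 2 loops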
+cglobal_label .main_part2_internal
+ mova m0, [tmp1q-32*12] ; t32a
+ mova m6, [tmp2q-32*13] ; t39a
+ mova m1, [tmp1q-32* 4] ; t40a
+ mova m5, [tmp2q+32* 3] ; t55a
+ add tmp1q, 32
+ sub tmp2q, 32
+ mova m2, [tmp1q+32* 3] ; t48a
+ mova m4, [tmp2q-32* 4] ; t47a
+ mova m3, [tmp1q+32*11] ; t56a
+ mova m7, [tmp2q+32*12] ; t63a
+ psubsw m8, m0, m6 ; t39
+ paddsw m0, m6 ; t32
+ psubsw m6, m4, m1 ; t40
+ paddsw m4, m1 ; t47
+ psubsw m1, m2, m5 ; t55
+ paddsw m2, m5 ; t48
+ psubsw m5, m7, m3 ; t56
+ paddsw m7, m3 ; t63
+ ITX_MULSUB_2W 5, 8, 3, 9, 15, 11, 12 ; t39a, t56a
+ vpbroadcastd m9, [o(pw_m1567_m3784)]
+ ITX_MULSUB_2W 1, 6, 3, 9, 15, 12, 9 ; t40a, t55a
+ psubsw m3, m0, m4 ; t47a
+ paddsw m0, m4 ; t32a
+ psubsw m4, m7, m2 ; t48a
+ paddsw m7, m2 ; t63a
+ psubsw m2, m5, m1 ; t40
+ paddsw m5, m1 ; t39
+ psubsw m1, m8, m6 ; t55
+ paddsw m8, m6 ; t56
+ ITX_MULSUB_2W 4, 3, 6, 9, 15, 13, 14 ; t47, t48
+ ITX_MULSUB_2W 1, 2, 6, 9, 15, 13, 14 ; t40a, t55a
+ ret
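+; as .main_part2_pass1, but the outputs are scaled by pw_2048 and
+; added directly to the destination picture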
+.main_part2_pass2:
+ sub r6, o_idct64_offset + 8
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ vpbroadcastd m13, [o(pw_2896_2896)]
+ lea r9, [strideq*5] ; stride*5
+ lea r3, [r9+strideq*1] ; stride*6
+ lea r7, [r9+strideq*2] ; stride*7
+ lea r8, [r3+strideq*2] ; stride*8
+ lea r2, [dstq+r7]
+.main_part2_pass2_loop:
+ vpbroadcastd m14, [o(pw_m2896_2896)]
+ call .main_part2_internal
+ vpbroadcastd m14, [o(pw_2048)]
+ IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8
+ IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*4, r7*8
+ IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8
+ IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8
+ add dstq, strideq
+ sub r2, strideq
+ cmp tmp1q, tmp2q
+ jne .main_part2_pass2_loop
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 16
+.dconly:
+ pmulhrsw xm0, xm2
+ movd xm2, [o(pw_2048)]
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+ pxor m1, m1
+.dconly_loop:
+ mova m2, [dstq+32*0]
+ mova m3, [dstq+32*1]
+ punpckhbw m4, m2, m1
+ punpcklbw m2, m1
+ punpckhbw m5, m3, m1
+ punpcklbw m3, m1
+ paddw m4, m0
+ paddw m2, m0
+ paddw m5, m0
+ paddw m3, m0
+ packuswb m2, m4
+ packuswb m3, m5
+ mova [dstq+32*0], m2
+ mova [dstq+32*1], m3
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+.normal:
+ PROLOGUE 0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
+ LOAD_8ROWS cq+32*0, 32*4
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ lea tmp1q, [rsp+32*7]
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ LOAD_8ROWS cq+32*2, 32*4
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m15, [o(pd_2048)]
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ mova m0, [cq+32* 1]
+ mova m1, [cq+32*31]
+ mova m2, [cq+32*17]
+ mova m3, [cq+32*15]
+ mova m4, [cq+32* 9]
+ mova m5, [cq+32*23]
+ mova m6, [cq+32*25]
+ mova m7, [cq+32* 7]
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ mova m0, [cq+32* 5]
+ mova m1, [cq+32*27]
+ mova m2, [cq+32*21]
+ mova m3, [cq+32*11]
+ mova m4, [cq+32*13]
+ mova m5, [cq+32*19]
+ mova m6, [cq+32*29]
+ mova m7, [cq+32* 3]
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
+ sub tmp1q, 32*36
+ lea r2, [strideq*3]
+ mov tmp2d, 4
+.pass2_loop:
+ lea r3, [tmp1q-32*8]
+ mova xm0, [r3 -32*4]
+ mova xm1, [r3 -32*3]
+ vinserti128 m0, [tmp1q-32*4], 1
+ vinserti128 m1, [tmp1q-32*3], 1
+ mova xm2, [r3 -32*2]
+ mova xm3, [r3 -32*1]
+ vinserti128 m2, [tmp1q-32*2], 1
+ vinserti128 m3, [tmp1q-32*1], 1
+ mova xm4, [r3 +32*0]
+ mova xm5, [r3 +32*1]
+ vinserti128 m4, [tmp1q+32*0], 1
+ vinserti128 m5, [tmp1q+32*1], 1
+ mova xm6, [r3 +32*2]
+ mova xm7, [r3 +32*3]
+ vinserti128 m6, [tmp1q+32*2], 1
+ vinserti128 m7, [tmp1q+32*3], 1
+ mova xm8, [r3 -32*4+16]
+ mova xm9, [r3 -32*3+16]
+ vinserti128 m8, [tmp1q-32*4+16], 1
+ vinserti128 m9, [tmp1q-32*3+16], 1
+ mova xm10, [r3 -32*2+16]
+ mova xm11, [r3 -32*1+16]
+ vinserti128 m10, [tmp1q-32*2+16], 1
+ vinserti128 m11, [tmp1q-32*1+16], 1
+ mova xm12, [r3 +32*0+16]
+ mova xm13, [r3 +32*1+16]
+ vinserti128 m12, [tmp1q+32*0+16], 1
+ vinserti128 m13, [tmp1q+32*1+16], 1
+ mova xm14, [r3 +32*2+16]
+ mova xm15, [r3 +32*3+16]
+ vinserti128 m14, [tmp1q+32*2+16], 1
+ vinserti128 m15, [tmp1q+32*3+16], 1
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m7
+ vpbroadcastd m7, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+ call m(idct_16x16_internal_8bpc).main
+ mova [rsp+32*0], m15
+ vpbroadcastd m15, [o(pw_2048)]
+ REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7
+ WRITE_16X2 2, 3, 1, 2, strideq*2, r2
+ pmulhrsw m1, m15, [rsp+32*1]
+ WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1
+ lea r3, [dstq+strideq*4]
+ %define dstq r3
+ WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 2, 3, strideq*2, r2
+ REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
+ lea r3, [r3+strideq*4]
+ WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 10, 11, 2, 3, strideq*2, r2
+ pmulhrsw m15, [rsp+32*0]
+ lea r3, [r3+strideq*4]
+ WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 14, 15, 2, 3, strideq*2, r2
+ add tmp1q, 32*16
+ add r0, 16
+ dec tmp2d
+ jg .pass2_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
+.normal:
+ PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2
+ lea tmp1q, [rsp+32*7]
+ lea r10d, [eobq-136]
+ sar r10d, 31
+.pass1_loop:
+ lea tmp2q, [tmp1q+32*16]
+ LOAD_8ROWS cq+64*1, 64*2, 1
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
+ test r10b, r10b
+ jnz .fast
+ LOAD_8ROWS_H cq+64*17, 64*2, 2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ LOAD_8ROWS_H cq+64*16, 64*2, 1
+ mova [rsp], m15
+ pxor m15, m15
+ REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ jmp .idct16
+.fast:
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+.idct16:
+ LOAD_8ROWS cq+64*0, 64*2, 1
+ pxor m15, m15
+ REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
+ call m(idct_16x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end
+ vpbroadcastd m7, [o(pw_16384)]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+ lea r3, [tmp1q+32*48]
+ mova m15, [rsp]
+ mova [r3-32*4], m0
+ mova [r3-32*3], m2
+ mova [r3-32*2], m4
+ mova [r3-32*1], m6
+ mova [r3+32*0], m8
+ mova [r3+32*1], m10
+ mova [r3+32*2], m12
+ mova [r3+32*3], m14
+ add r3, 32*24
+ mova [r3-32*4], m1
+ mova [r3-32*3], m3
+ mova [r3-32*2], m5
+ mova [r3-32*1], m7
+ mova [r3+32*0], m9
+ mova [r3+32*1], m11
+ mova [r3+32*2], m13
+ mova [r3+32*3], m15
+ vpbroadcastd m9, [o(pw_16384)]
+ pmulhrsw m0, m9, [tmp1q-32*4]
+ pmulhrsw m1, m9, [tmp1q-32*3]
+ pmulhrsw m2, m9, [tmp1q-32*2]
+ pmulhrsw m3, m9, [tmp1q-32*1]
+ pmulhrsw m4, m9, [tmp1q+32*0]
+ pmulhrsw m5, m9, [tmp1q+32*1]
+ pmulhrsw m6, m9, [tmp1q+32*2]
+ pmulhrsw m7, m9, [tmp1q+32*3]
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova [tmp1q-32*4], m0
+ pmulhrsw m0, m9, [tmp2q-32*4]
+ mova [tmp2q-32*4], m1
+ pmulhrsw m1, m9, [tmp2q-32*3]
+ mova [tmp1q-32*3], m2
+ pmulhrsw m2, m9, [tmp2q-32*2]
+ mova [tmp2q-32*3], m3
+ pmulhrsw m3, m9, [tmp2q-32*1]
+ mova [tmp1q-32*2], m4
+ pmulhrsw m4, m9, [tmp2q+32*0]
+ mova [tmp2q-32*2], m5
+ pmulhrsw m5, m9, [tmp2q+32*1]
+ mova [tmp1q-32*1], m6
+ pmulhrsw m6, m9, [tmp2q+32*2]
+ mova [tmp2q-32*1], m7
+ pmulhrsw m7, m9, [tmp2q+32*3]
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova [tmp1q+32*0], m0
+ mova [tmp2q+32*0], m1
+ mova [tmp1q+32*1], m2
+ mova [tmp2q+32*1], m3
+ mova [tmp1q+32*2], m4
+ mova [tmp2q+32*2], m5
+ mova [tmp1q+32*3], m6
+ mova [tmp2q+32*3], m7
+ add cq, 32
+ add tmp1q, 32*8
+ add r10d, 0x80000000
+ jnc .pass1_loop
+ lea r2, [rsp+32*55]
+ lea r7, [r2+32*24]
+.pass2_loop:
+ lea r3, [r2+32*8]
+ lea r8, [r7+32*8]
+ mova m0, [r2-32*4]
+ mova m1, [r2-32*2]
+ mova m2, [r2+32*0]
+ mova m3, [r2+32*2]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+ test r10b, r10b
+ jnz .fast2
+ mova m4, [r3-32*4]
+ mova m5, [r3-32*2]
+ mova m6, [r3+32*0]
+ mova m7, [r3+32*2]
+.fast2:
+ mova [rsp], m8
+ lea tmp1q, [rsp+32*39]
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ mova m0, [r2-32*3]
+ mova m1, [r2-32*1]
+ mova m2, [r2+32*1]
+ mova m3, [r2+32*3]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ test r10b, r10b
+ jnz .fast3
+ mova m4, [r3-32*3]
+ mova m5, [r3-32*1]
+ mova m6, [r3+32*1]
+ mova m7, [r3+32*3]
+.fast3:
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m15, [o(pd_2048)]
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ mova m0, [r7-32*4]
+ mova m3, [r7+32*3]
+ mova m4, [r7+32*0]
+ mova m7, [r7-32*1]
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r10b, r10b
+ jnz .fast4
+ mova m1, [r8+32*3]
+ mova m2, [r8-32*4]
+ mova m5, [r8-32*1]
+ mova m6, [r8+32*0]
+.fast4:
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ mova m0, [r7-32*2]
+ mova m3, [r7+32*1]
+ mova m4, [r7+32*2]
+ mova m7, [r7-32*3]
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r10b, r10b
+ jnz .fast5
+ mova m1, [r8+32*1]
+ mova m2, [r8-32*2]
+ mova m5, [r8-32*3]
+ mova m6, [r8+32*2]
+.fast5:
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
+ add r10d, 0x80000000
+ jc .ret
+ lea r2, [rsp+32*7]
+ lea r7, [r2+32*16]
+ sub dstq, r8
+ lea dstq, [dstq+strideq*4+16]
+ jmp .pass2_loop
+.ret:
+ RET
+
+cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
+.normal:
+ PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \
+ base, tmp3, tmp4
+ lea tmp1q, [rsp+32*7]
+ lea tmp4d, [eobq-136]
+.pass1_loop:
+ LOAD_8ROWS cq+64*0, 64*4, 1
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ LOAD_8ROWS cq+64*2, 64*4, 1
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m15, [o(pd_2048)]
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ vpbroadcastd m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [cq+64* 1]
+ pmulhrsw m1, m7, [cq+64*31]
+ pmulhrsw m2, m7, [cq+64*17]
+ pmulhrsw m3, m7, [cq+64*15]
+ pmulhrsw m4, m7, [cq+64* 9]
+ pmulhrsw m5, m7, [cq+64*23]
+ pmulhrsw m6, m7, [cq+64*25]
+ pmulhrsw m7, [cq+64* 7]
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ vpbroadcastd m7, [o(pw_2896x8-(o_idct64_offset))]
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ pmulhrsw m0, m7, [cq+64* 5]
+ pmulhrsw m1, m7, [cq+64*27]
+ pmulhrsw m2, m7, [cq+64*21]
+ pmulhrsw m3, m7, [cq+64*11]
+ pmulhrsw m4, m7, [cq+64*13]
+ pmulhrsw m5, m7, [cq+64*19]
+ pmulhrsw m6, m7, [cq+64*29]
+ pmulhrsw m7, [cq+64* 3]
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
+ sub tmp1q, 32*44
+ vpbroadcastd m10, [o(pw_16384)]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave
+ add cq, 32
+ add tmp4d, 0x80000000
+ jnc .pass1_loop
+ lea tmp1q, [rsp+32*15]
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+ mov tmp4b, 4
+.pass2_loop:
+ lea tmp2q, [tmp1q+32*64]
+ LOAD_8ROWS tmp1q-32*4, 32
+ test tmp4d, 0x40000000
+ jnz .fast
+ LOAD_8ROWS_H tmp2q-32*4, 32
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ lea tmp3q, [tmp2q-32*8]
+ LOAD_8ROWS_H tmp3q-32*4, 32
+ mova [rsp], m15
+ jmp .idct16
+.fast:
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+.idct16:
+ lea tmp3q, [tmp1q-32*8]
+ LOAD_8ROWS tmp3q-32*4, 32
+ call m(idct_16x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end
+ add tmp1q, 32*16
+ sub dstq, r3
+ lea r2, [r2+r3+16]
+ add dstq, 16
+ dec tmp4b
+ jg .pass2_loop
+ RET
+ALIGN function_align
+.transpose_round_interleave:
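+; helper: transpose the pass-1 rows at tmp1q in 8x8 word blocks and rescale
+; them with the factor preloaded in m10 (pw_16384 or pw_8192 in the callers)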
+ mov tmp3d, 4
+.loop:
+ lea tmp2q, [tmp1q+32*8]
+ mova xm0, [tmp1q-32*4]
+ mova xm1, [tmp1q-32*3]
+ vinserti128 m0, [tmp2q-32*4], 1
+ vinserti128 m1, [tmp2q-32*3], 1
+ mova xm2, [tmp1q-32*2]
+ mova xm3, [tmp1q-32*1]
+ vinserti128 m2, [tmp2q-32*2], 1
+ vinserti128 m3, [tmp2q-32*1], 1
+ mova xm4, [tmp1q+32*0]
+ mova xm5, [tmp1q+32*1]
+ vinserti128 m4, [tmp2q+32*0], 1
+ vinserti128 m5, [tmp2q+32*1], 1
+ mova xm6, [tmp1q+32*2]
+ mova xm7, [tmp1q+32*3]
+ vinserti128 m6, [tmp2q+32*2], 1
+ vinserti128 m7, [tmp2q+32*3], 1
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova xm8, [tmp1q-32*4+16]
+ mova xm9, [tmp1q-32*3+16]
+ vinserti128 m8, [tmp2q-32*4+16], 1
+ vinserti128 m9, [tmp2q-32*3+16], 1
+ mova [tmp1q-32*4], m0
+ mova [tmp2q-32*4], m1
+ mova [tmp1q-32*3], m2
+ mova [tmp2q-32*3], m3
+ mova xm2, [tmp1q-32*2+16]
+ mova xm3, [tmp1q-32*1+16]
+ vinserti128 m2, [tmp2q-32*2+16], 1
+ vinserti128 m3, [tmp2q-32*1+16], 1
+ mova [tmp1q-32*2], m4
+ mova [tmp2q-32*2], m5
+ mova [tmp1q-32*1], m6
+ mova [tmp2q-32*1], m7
+ mova xm4, [tmp1q+32*0+16]
+ mova xm5, [tmp1q+32*1+16]
+ vinserti128 m4, [tmp2q+32*0+16], 1
+ vinserti128 m5, [tmp2q+32*1+16], 1
+ mova xm6, [tmp1q+32*2+16]
+ mova xm7, [tmp1q+32*3+16]
+ vinserti128 m6, [tmp2q+32*2+16], 1
+ vinserti128 m7, [tmp2q+32*3+16], 1
+ pmulhrsw m0, m8, m10
+ pmulhrsw m1, m9, m10
+ REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova [tmp1q+32*0], m0
+ mova [tmp2q+32*0], m1
+ mova [tmp1q+32*1], m2
+ mova [tmp2q+32*1], m3
+ mova [tmp1q+32*2], m4
+ mova [tmp2q+32*2], m5
+ mova [tmp1q+32*3], m6
+ mova [tmp2q+32*3], m7
+ add tmp1q, 32*16
+ dec tmp3d
+ jg .loop
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
+.normal:
+ PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2
+ lea tmp1q, [rsp+32*71]
+ lea r10d, [eobq-136]
+.pass1_loop:
+ LOAD_8ROWS cq+64*0, 64*4
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ LOAD_8ROWS cq+64*2, 64*4
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m15, [o(pd_2048)]
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ mova m0, [cq+64* 1]
+ mova m1, [cq+64*31]
+ mova m2, [cq+64*17]
+ mova m3, [cq+64*15]
+ mova m4, [cq+64* 9]
+ mova m5, [cq+64*23]
+ mova m6, [cq+64*25]
+ mova m7, [cq+64* 7]
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ mova m0, [cq+64* 5]
+ mova m1, [cq+64*27]
+ mova m2, [cq+64*21]
+ mova m3, [cq+64*11]
+ mova m4, [cq+64*13]
+ mova m5, [cq+64*19]
+ mova m6, [cq+64*29]
+ mova m7, [cq+64* 3]
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
+ sub tmp1q, 32*44
+ vpbroadcastd m10, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave
+ add cq, 32
+ add r10d, 0x80000000
+ jnc .pass1_loop
+ lea tmp1q, [rsp+32*7]
+ mov r10b, 4
+.pass2_loop:
+ lea r2, [tmp1q+32*64]
+ mova m0, [r2-32*4]
+ mova m1, [r2-32*2]
+ mova m2, [r2+32*0]
+ mova m3, [r2+32*2]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+ mova [rsp], m4
+ test r10d, 0x40000000
+ jnz .fast
+ lea r3, [r2+32*64]
+ mova m4, [r3-32*4]
+ mova m5, [r3-32*2]
+ mova m6, [r3+32*0]
+ mova m7, [r3+32*2]
+.fast:
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ mova m0, [r2-32*3]
+ mova m1, [r2-32*1]
+ mova m2, [r2+32*1]
+ mova m3, [r2+32*3]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ test r10d, 0x40000000
+ jnz .fast2
+ mova m4, [r3-32*3]
+ mova m5, [r3-32*1]
+ mova m6, [r3+32*1]
+ mova m7, [r3+32*3]
+.fast2:
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m15, [o(pd_2048)]
+ add r2, 32*8
+ add r3, 32*8
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ mova m0, [r2-32*4] ; 1
+ mova m3, [r2+32*3] ; 15
+ mova m4, [r2+32*0] ; 9
+ mova m7, [r2-32*1] ; 7
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r10d, 0x40000000
+ jnz .fast3
+ mova m1, [r3+32*3] ; 31
+ mova m2, [r3-32*4] ; 17
+ mova m5, [r3-32*1] ; 23
+ mova m6, [r3+32*0] ; 25
+.fast3:
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ mova m0, [r2-32*2] ; 5
+ mova m3, [r2+32*1] ; 11
+ mova m4, [r2+32*2] ; 13
+ mova m7, [r2-32*3] ; 3
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r10d, 0x40000000
+ jnz .fast4
+ mova m1, [r3+32*1] ; 27
+ mova m2, [r3-32*2] ; 21
+ mova m5, [r3-32*3] ; 19
+ mova m6, [r3+32*2] ; 29
+.fast4:
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
+ sub tmp1q, 32*28
+ sub dstq, r8
+ lea dstq, [dstq+strideq*4+16]
+ dec r10b
+ jg .pass2_loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/itx_avx512.asm b/third_party/dav1d/src/x86/itx_avx512.asm
new file mode 100644
index 0000000000..a3f25d37e5
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx_avx512.asm
@@ -0,0 +1,7507 @@
+; Copyright © 2020-2023, VideoLAN and dav1d authors
+; Copyright © 2020-2023, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+const \
+dup16_perm, db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7
+ db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15
+ db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23
+ db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31
+const \
+int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51
+ db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55
+ db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
+ db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
+int8_permB: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51
+ db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
+ db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55
+ db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
+int16_perm: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
+ db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
+ db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
+ db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
+idtx_16x4p: db 0, 1, 4, 5, 16, 17, 20, 21, 2, 3, 6, 7, 18, 19, 22, 23
+ db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55
+ db 8, 9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31
+ db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63
+idct_8x32p: db 60, 61, 4, 5, 32, 33, 0, 1, 28, 29, 36, 37, 56, 57, 8, 9
+ db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17
+ db 62, 63, 2, 3, 6, 7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51
+ db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35
+idct_16x32p: db 6, 7, 58, 59, 38, 39, 26, 27, 32, 33, 0, 1, 30, 31, 34, 35
+ db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21
+ db 62, 63, 2, 3, 48, 49, 16, 17, 56, 57, 8, 9, 14, 15, 50, 51
+ db 54, 55, 10, 11, 60, 61, 4, 5, 12, 13, 52, 53, 28, 29, 36, 37
+end_16x32p: db 0, 32, 1, 48, 2, 36, 3, 52, 16, 40, 17, 56, 18, 44, 19, 60
+ db 4, 33, 5, 49, 6, 37, 7, 53, 20, 41, 21, 57, 22, 45, 23, 61
+ db 8, 35, 9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63
+ db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62
+
+; packed 4-bit qword shuffle indices
+permA: dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262
+ dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373
+ dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb
+ dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea
+permB: dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604
+ dq 0xc824352d56128751, 0xd906171e74301e15
+ dq 0x6271604b03472d62, 0x735342782165b426
+ dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37
+permC: dq 0x9d409d041551c2e0, 0xbf62bf263773a486
+ dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597
+ dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e
+ dq 0x5115049dd9045b79, 0x733726bffb263d1f
+permD: dq 0x0cda098800041504, 0x0edb09b2028c3726
+ dq 0x0f11fa9c01150415, 0x0988f326039d2637
+ dq 0x05640f1108269d8c, 0x05290edb0aaebfae
+ dq 0x0005000509378c9d, 0xffffffff0bbfaebf
+
+pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11
+gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13
+gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10
+gather8d: dd 0, 19, 1, 18, 2, 17, 3, 16
+
+int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7
+int_shuf3: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
+int_shuf4: db 8, 9, 0, 1, 12, 13, 4, 5, 10, 11, 2, 3, 14, 15, 6, 7
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+int_mshift: db 12, 20, 0, 0, 44, 52, 0, 0
+
+pb_32: times 4 db 32
+pw_2048: times 2 dw 2048
+pw_4096: times 2 dw 4096
+pw_8192: times 2 dw 8192
+pw_16384: times 2 dw 16384
+pw_1697x16: times 2 dw 1697*16
+pw_1697x8: times 2 dw 1697*8
+pw_2896x8: times 2 dw 2896*8
+pd_2048: dd 2048
+
+%define pw_5 (permD+52)
+%define pd_m1 (permD+60)
+%define pw_3803_1321 (permD+44)
+%define pw_2482_3803 (permD+12)
+%define pw_2440_3290 (permD+ 4)
+%define pw_m3290_2440 (permD+28)
+%define pw_3857_1380 (permD+36)
+%define pw_m1380_3857 (permD+20)
+
+pw_8192_m8192: dw 8192, -8192
+pw_m8192_8192: dw -8192, 8192
+pw_16384_m16384: dw 16384, -16384
+pw_m16384_16384: dw -16384, 16384
+
+pw_m1321_2482: dw -1321, 2482
+pw_m3344_3344: dw -3344, 3344
+pw_2482_3344: dw 2482, 3344
+pw_m3803_3344: dw -3803, 3344
+pd_3344: dd 3344
+pw_m1321_m3344: dw -1321, -3344
+pw_2896_m2896: dw 2896, -2896
+
+pw_1567_m3784: dw 1567, -3784
+pw_3784_m1567: dw 3784, -1567
+pw_4017_m799: dw 4017, -799
+pw_2276_m3406: dw 2276, -3406
+pw_m799_m4017: dw -799, -4017
+pw_m3406_m2276: dw -3406, -2276
+
+%macro COEF_PAIR 2-3 0
+pw_%1_%2: dw %1, %2
+pw_m%2_%1: dw -%2, %1
+%if %3
+pw_m%1_m%2: dw -%1, -%2
+%endif
+%endmacro
+
+COEF_PAIR 2896, 2896
+COEF_PAIR 1567, 3784, 1
+COEF_PAIR 3784, 1567
+COEF_PAIR 201, 4091
+COEF_PAIR 995, 3973
+COEF_PAIR 1751, 3703
+COEF_PAIR 3035, 2751
+COEF_PAIR 3513, 2106
+COEF_PAIR 4052, 601
+COEF_PAIR 3166, 2598, 1
+COEF_PAIR 3920, 1189, 1
+COEF_PAIR 2276, 3406
+COEF_PAIR 4017, 799
+
+%macro COEF_X8 1-*
+%rep %0
+ dw %1*8, %1*8
+ %rotate 1
+%endrep
+%endmacro
+
+pw_m2276x8: COEF_X8 -2276
+pw_3406x8: COEF_X8 3406
+pw_4017x8: COEF_X8 4017
+pw_799x8: COEF_X8 799
+pw_3784x8: COEF_X8 3784
+pw_1567x8: COEF_X8 1567
+
+pw_4076x8: COEF_X8 4076
+pw_401x8: COEF_X8 401
+pw_m2598x8: COEF_X8 -2598
+pw_3166x8: COEF_X8 3166
+pw_3612x8: COEF_X8 3612
+pw_1931x8: COEF_X8 1931
+pw_m1189x8: COEF_X8 -1189
+pw_3920x8: COEF_X8 3920
+
+pw_4091x8: COEF_X8 4091
+pw_201x8: COEF_X8 201
+pw_m2751x8: COEF_X8 -2751
+pw_3035x8: COEF_X8 3035
+pw_3703x8: COEF_X8 3703
+pw_1751x8: COEF_X8 1751
+pw_m1380x8: COEF_X8 -1380
+pw_3857x8: COEF_X8 3857
+pw_3973x8: COEF_X8 3973
+pw_995x8: COEF_X8 995
+pw_m2106x8: COEF_X8 -2106
+pw_3513x8: COEF_X8 3513
+pw_3290x8: COEF_X8 3290
+pw_2440x8: COEF_X8 2440
+pw_m601x8: COEF_X8 -601
+pw_4052x8: COEF_X8 4052
+
+pw_401_4076x8: dw 401*8, 4076*8
+pw_m2598_3166x8: dw -2598*8, 3166*8
+pw_1931_3612x8: dw 1931*8, 3612*8
+pw_m1189_3920x8: dw -1189*8, 3920*8
+pw_799_4017x8: dw 799*8, 4017*8
+pw_m2276_3406x8: dw -2276*8, 3406*8
+
+pw_201_4091x8: dw 201*8, 4091*8
+pw_m601_4052x8: dw -601*8, 4052*8
+pw_995_3973x8: dw 995*8, 3973*8
+pw_m1380_3857x8: dw -1380*8, 3857*8
+pw_1751_3703x8: dw 1751*8, 3703*8
+pw_m2106_3513x8: dw -2106*8, 3513*8
+pw_2440_3290x8: dw 2440*8, 3290*8
+pw_m2751_3035x8: dw -2751*8, 3035*8
+
+pw_101_4095x8: dw 101*8, 4095*8
+pw_m2824_2967x8: dw -2824*8, 2967*8
+pw_1660_3745x8: dw 1660*8, 3745*8
+pw_m1474_3822x8: dw -1474*8, 3822*8
+pw_897_3996x8: dw 897*8, 3996*8
+pw_m2191_3461x8: dw -2191*8, 3461*8
+pw_2359_3349x8: dw 2359*8, 3349*8
+pw_m700_4036x8: dw -700*8, 4036*8
+pw_501_4065x8: dw 501*8, 4065*8
+pw_m2520_3229x8: dw -2520*8, 3229*8
+pw_2019_3564x8: dw 2019*8, 3564*8
+pw_m1092_3948x8: dw -1092*8, 3948*8
+pw_1285_3889x8: dw 1285*8, 3889*8
+pw_m1842_3659x8: dw -1842*8, 3659*8
+pw_2675_3102x8: dw 2675*8, 3102*8
+pw_m301_4085x8: dw -301*8, 4085*8
+
+idct64_mul: COEF_X8 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474
+COEF_PAIR 401, 4076, 1
+COEF_PAIR 799, 4017
+ COEF_X8 -700, 4036, 2359, 3349, -2191, 3461, 897, 3996
+dw -2598, -3166, 3166, -2598, 2598, 3166, -4017, -799, 799, -4017
+ COEF_X8 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092
+COEF_PAIR 1931, 3612, 1
+COEF_PAIR 3406, 2276
+ COEF_X8 -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889
+dw -1189, -3920, 3920, -1189, 1189, 3920, -2276, -3406, 3406, -2276
+
+SECTION .text
+
+%define o_base int8_permA+64*18
+%define o(x) (r5 - (o_base) + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
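+; o(x) addresses the constant x relative to r5, which the INV_TXFM_FN entry
+; points below point at o_base; m(x) expands to the mangled symbol name of
+; another transform function so it can be jumped to or called directly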
+
+; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
+; 16 = special_mul1, 32 = special_mul2
+%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
+ mova m%2, m%4
+%if %7 & 16
+ vpdpwssd m%2, m%1, [o(pw_%5)] {bcstd}
+ mova m%3, m%4
+%if %7 & 32
+ vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
+%else
+ vpdpwssd m%3, m%1, m%6
+%endif
+%elif %7 & 32
+ vpdpwssd m%2, m%1, m%5
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
+%elif %6 < 32
+ vpdpwssd m%2, m%1, m%5
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, m%6
+%elif %7 & 1
+ vpdpwssd m%2, m%1, [o(pw_%5_%6)] {bcstd}
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, [o(pw_m%6_%5)] {bcstd}
+%else
+ vpdpwssd m%2, m%1, [o(pw_m%6_%5)] {bcstd}
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, [o(pw_%5_%6)] {bcstd}
+%endif
+%if %7 & 2
+ psrld m%2, 12
+ pslld m%3, 4
+ vpshrdd m%1, m%3, m%2, 16
+%elif %7 & 4
+ ; compared to using shifts (as above) this has better throughput,
+ ; but worse latency and requires setting up the opmask/index
+ ; registers, so only use this method for the larger transforms
+ pslld m%1, m%2, 4
+ vpmultishiftqb m%1{k7}, m13, m%3
+%else
+ psrad m%2, 12
+ psrad m%3, 12
+%if %7 & 8 == 0
+ packssdw m%1, m%3, m%2
+%endif
+%endif
+%endmacro
+
+; flags: same as ITX_MUL2X_PACK
+%macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags
+%if %11 & 1
+ vpbroadcastd m%4, [o(pw_%9_%10)]
+ vpbroadcastd m%4{k1}, [o(pw_%7_%8)]
+ vpbroadcastd m%5, [o(pw_m%10_%9)]
+ vpbroadcastd m%5{k1}, [o(pw_m%8_%7)]
+%else
+ vpbroadcastd m%4, [o(pw_m%10_%9)]
+ vpbroadcastd m%4{k1}, [o(pw_m%8_%7)]
+ vpbroadcastd m%5, [o(pw_%9_%10)]
+ vpbroadcastd m%5{k1}, [o(pw_%7_%8)]
+%endif
+ ITX_MUL2X_PACK %1, %2, %3, %6, %4, %5, %11
+%endmacro
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
+ punpcklwd m%3, m%2, m%1
+ punpckhwd m%2, m%1
+%if %7 < 32
+ mova m%1, m%5
+ vpdpwssd m%1, m%3, m%7
+ mova m%4, m%5
+ vpdpwssd m%4, m%2, m%7
+%else
+ mova m%1, m%5
+ vpdpwssd m%1, m%3, [o(pw_m%7_%6)] {bcstd}
+ mova m%4, m%5
+ vpdpwssd m%4, m%2, [o(pw_m%7_%6)] {bcstd}
+%endif
+ psrad m%1, 12
+ psrad m%4, 12
+ packssdw m%1, m%4
+ mova m%4, m%5
+%if %7 < 32
+ vpdpwssd m%4, m%2, m%6
+ mova m%2, m%5
+ vpdpwssd m%2, m%3, m%6
+%else
+ vpdpwssd m%4, m%2, [o(pw_%6_%7)] {bcstd}
+ mova m%2, m%5
+ vpdpwssd m%2, m%3, [o(pw_%6_%7)] {bcstd}
+%endif
+ psrad m%4, 12
+ psrad m%2, 12
+%if %0 == 8
+ packssdw m%8, m%2, m%4
+%else
+ packssdw m%2, m%4
+%endif
+%endmacro
+
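+; run a packed-transform macro with xmm or ymm registers instead of the
+; currently initialized vector width, then restore the previous setup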
+%macro WRAP_XMM 1+
+ %xdefine %%reset RESET_MM_PERMUTATION
+ INIT_XMM cpuname
+ DEFINE_MMREGS xmm
+ AVX512_MM_PERMUTATION
+ %1
+ %%reset
+%endmacro
+
+%macro WRAP_YMM 1+
+ INIT_YMM cpuname
+ %1
+ INIT_ZMM cpuname
+%endmacro
+
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd
+%if %5
+ vpbroadcastd m2, [o(pw_%5)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+%endif
+ lea r2, [dstq+strideq*2]
+%assign %%i 1
+%rep 4
+ %if %1 & 2
+ CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1)
+ %else
+ CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
+ %endif
+ %assign %%i %%i + 1
+ %rotate 1
+%endrep
+ movd m2, [%%row_adr1]
+ pinsrd m2, [%%row_adr2], 1
+ movd m3, [%%row_adr3]
+ pinsrd m3, [%%row_adr4], 1
+ pmovzxbw m2, m2
+ pmovzxbw m3, m3
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ movd [%%row_adr1], m0
+ pextrd [%%row_adr2], m0, 1
+ pextrd [%%row_adr3], m0, 2
+ pextrd [%%row_adr4], m0, 3
+ ret
+%endmacro
+
+%macro INV_TXFM_FN 3 ; type1, type2, size
+cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base
+ %define %%p1 m(i%1_%3_internal_8bpc)
+ lea baseq, [o_base]
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(i%2_%3_internal_8bpc).pass2]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x4
+%ifidn %1_%2, dct_dct
+ vpbroadcastw m0, [cq]
+ vpbroadcastd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [cq], eobd
+ pmulhrsw m0, m1
+ mova m1, m0
+ jmp m(iadst_4x4_internal_8bpc).end2
+%endif
+%endmacro
+
+%macro IDCT4_1D_PACKED 0
+ vpbroadcastd m4, [o(pd_2048)]
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784
+ ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896
+ paddsw m0, m1, m2 ; out0 out1
+ psubsw m1, m2 ; out3 out2
+%endmacro
+
+%macro IADST4_1D_PACKED 0
+ punpcklwd m4, m1, m0 ; in2 in0
+ punpckhwd m5, m1, m0 ; in3 in1
+.main2:
+ vpbroadcastd m3, [o(pd_2048)]
+ mova m0, m3
+ vpdpwssd m0, m4, [o(pw_3803_1321)] {bcstd}
+ mova m2, m3
+ vpdpwssd m2, m4, [o(pw_m1321_2482)] {bcstd}
+ mova m1, m3
+ vpdpwssd m1, m4, [o(pw_m3344_3344)] {bcstd}
+ vpdpwssd m3, m4, [o(pw_2482_3803)] {bcstd}
+ vpdpwssd m0, m5, [o(pw_2482_3344)] {bcstd}
+ vpdpwssd m2, m5, [o(pw_m3803_3344)] {bcstd}
+ vpdpwssd m1, m5, [o(pd_3344)] {bcstd}
+ vpdpwssd m3, m5, [o(pw_m1321_m3344)] {bcstd}
+ REPX {psrad x, 12}, m0, m2, m1, m3
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m3 ; out2 out3
+%endmacro
+
+INIT_XMM avx512icl
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
+
+cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ IDCT4_1D_PACKED
+ mova m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m0, m1, q0220
+ pshufb m0, m2
+ pshufb m1, m3, m2
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ pxor ymm16, ymm16
+ mova [cq], ymm16
+ ITX4_END 0, 1, 3, 2
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ call .main
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ pxor ymm16, ymm16
+ mova [cq], ymm16
+.end2:
+ ITX4_END 0, 1, 2, 3
+ALIGN function_align
+.main:
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ call m(iadst_4x4_internal_8bpc).main
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ jmp tx2q
+.pass2:
+ call m(iadst_4x4_internal_8bpc).main
+.end:
+ pxor ymm16, ymm16
+ mova [cq], ymm16
+.end2:
+ ITX4_END 3, 2, 1, 0
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_4x4_internal_8bpc).end
+
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x8
+%ifidn %1_%2, dct_dct
+ movd xmm1, [o(pw_2896x8)]
+ pmulhrsw xmm0, xmm1, [cq]
+ movd xmm2, [o(pw_2048)]
+ pmulhrsw xmm0, xmm1
+ pmulhrsw xmm0, xmm1
+ pmulhrsw xmm0, xmm2
+ vpbroadcastw ym0, xmm0
+ mova ym1, ym0
+ jmp m(iadst_4x8_internal_8bpc).end3
+%endif
+%endmacro
+
+%macro IDCT8_1D_PACKED 0
+ punpckhwd m5, m3, m0 ; in7 in1
+ punpckhwd m4, m1, m2 ; in3 in5
+ punpcklwd m3, m1 ; in6 in2
+ punpcklwd m2, m0 ; in4 in0
+.main2:
+ vpbroadcastd m6, [o(pd_2048)]
+ ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a
+ ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
+ ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2
+ psubsw m0, m5, m4 ; t5a t6a (interleaved)
+ paddsw m4, m5 ; t4 t7 (interleaved)
+ ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1
+ ITX_MUL2X_PACK 0, 1, 5, 6, 2896, 2896, 1 ; t6 t5
+%if mmsize > 16
+ vbroadcasti32x4 m1, [o(deint_shuf)]
+ pshufb m4, m1
+%else
+ pshufb m4, [o(deint_shuf)]
+%endif
+ psubsw m1, m2, m3 ; tmp3 tmp2
+ paddsw m3, m2 ; tmp0 tmp1
+ punpckhqdq m2, m4, m0 ; t7 t6
+ punpcklqdq m4, m0 ; t4 t5
+ paddsw m0, m3, m2 ; out0 out1
+ psubsw m3, m2 ; out7 out6
+ psubsw m2, m1, m4 ; out4 out5
+ paddsw m1, m4 ; out3 out2
+%endmacro
+
+%macro IADST8_1D_PACKED 1 ; pass
+ vpbroadcastd m6, [o(pd_2048)]
+%if %1 == 1
+ ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a
+ ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
+ ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
+ psubsw m4, m0, m2 ; t5 t4
+ paddsw m0, m2 ; t1 t0
+ psubsw m5, m1, m3 ; t6 t7
+ paddsw m1, m3 ; t2 t3
+ ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
+ ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
+%if mmsize > 16
+ vbroadcasti32x4 m2, [o(deint_shuf)]
+%else
+ mova m2, [o(deint_shuf)]
+%endif
+ vprord m1, 16
+ psubsw m3, m0, m1 ; t3 t2
+ paddsw m0, m1 ; -out7 out0
+ psubsw m1, m4, m5 ; t7 t6
+ paddsw m4, m5 ; out6 -out1
+ pshufb m0, m2
+ pshufb m4, m2
+ mova m2, m6
+ vpdpwssd m2, m3, [o(pw_m2896_2896)] {bcstd}
+ mova m5, m6
+ vpdpwssd m5, m1, [o(pw_m2896_2896)] {bcstd}
+ psrad m2, 12
+ psrad m5, 12
+ packssdw m2, m5 ; out4 -out5
+ mova m5, m6
+ vpdpwssd m5, m3, [o(pw_2896_2896)] {bcstd}
+ mova m3, m6
+ vpdpwssd m3, m1, [o(pw_2896_2896)] {bcstd}
+ psrad m5, 12
+ psrad m3, 12
+ packssdw m1, m3, m5 ; out2 -out3
+%else
+ punpckhwd m0, m4, m3 ; 0 7
+ punpckhwd m1, m5, m2 ; 2 5
+ punpcklwd m2, m5 ; 4 3
+ punpcklwd m3, m4 ; 6 1
+ ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a
+ ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a
+ ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a
+ psubsw m4, m0, m2 ; t4 t5
+ paddsw m0, m2 ; t0 t1
+ psubsw m5, m1, m3 ; t6 t7
+ paddsw m1, m3 ; t2 t3
+ shufps m2, m5, m4, q1032
+ punpckhwd m4, m2
+ punpcklwd m5, m2
+ ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784 ; t4a t5a
+ ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a
+ psubsw m2, m0, m1 ; t2 t3
+ paddsw m0, m1 ; out0 -out7
+ psubsw m1, m4, m5 ; t6 t7
+ paddsw m4, m5 ; -out1 out6
+ vpbroadcastd m5, [o(pw_2896x8)]
+ punpckhqdq m3, m2, m1 ; t3 t7
+ punpcklqdq m2, m1 ; t2 t6
+ paddsw m1, m2, m3 ; t2+t3 t6+t7
+ psubsw m2, m3 ; t2-t3 t6-t7
+ punpckhqdq m3, m4, m0 ; out6 -out7
+ punpcklqdq m0, m4 ; out0 -out1
+ pmulhrsw m2, m5 ; out4 -out5
+ pshufd m1, m1, q1032
+ pmulhrsw m1, m5 ; out2 -out3
+%endif
+%endmacro
+
+INIT_YMM avx512icl
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, identity
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+
+cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ IDCT4_1D_PACKED
+ vbroadcasti32x4 m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m0, m1, q0220
+ pshufb m0, m2
+ pshufb m1, m3, m2
+ jmp tx2q
+.pass2:
+ vextracti32x4 xm2, m0, 1
+ vextracti32x4 xm3, m1, 1
+ call .main
+ vpbroadcastd m4, [o(pw_2048)]
+ vinserti32x4 m0, m0, xm2, 1
+ vinserti32x4 m1, m1, xm3, 1
+ pshufd m1, m1, q1032
+ jmp m(iadst_4x8_internal_8bpc).end2
+ALIGN function_align
+.main:
+ WRAP_XMM IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ call m(iadst_8x4_internal_8bpc).main
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ vextracti32x4 xm2, m0, 1
+ vextracti32x4 xm3, m1, 1
+ pshufd xm4, xm0, q1032
+ pshufd xm5, xm1, q1032
+ call .main_pass2
+ vpbroadcastd m4, [o(pw_2048)]
+ vinserti32x4 m0, xm2, 1
+ vinserti32x4 m1, xm3, 1
+ pxor m5, m5
+ psubw m5, m4
+.end:
+ punpcklqdq m4, m5
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+.end3:
+ vpbroadcastd m3, strided
+ pmulld m5, m3, [o(pd_0to15)]
+ kxnorb k1, k1, k1
+ kmovb k2, k1
+ vpgatherdd m3{k1}, [dstq+m5]
+ pxor m4, m4
+ mova [cq], zmm20
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ vpscatterdd [dstq+m5]{k2}, m0
+ RET
+ALIGN function_align
+.main_pass1:
+ punpckhwd xm0, xm4, xm3 ; 0 7
+ punpckhwd xm1, xm5, xm2 ; 2 5
+ punpcklwd xm2, xm5 ; 4 3
+ punpcklwd xm3, xm4 ; 6 1
+ WRAP_XMM IADST8_1D_PACKED 1
+ punpcklqdq xm3, xm4, xm0 ; out6 -out7
+ punpckhqdq xm0, xm4 ; out0 -out1
+ ret
+ALIGN function_align
+.main_pass2:
+ WRAP_XMM IADST8_1D_PACKED 2
+ ret
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ call m(iadst_8x4_internal_8bpc).main
+ punpcklwd m3, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m3
+ punpckhwd m1, m3
+ jmp tx2q
+.pass2:
+ vextracti32x4 xm2, m0, 1
+ vextracti32x4 xm3, m1, 1
+ pshufd xm4, xm0, q1032
+ pshufd xm5, xm1, q1032
+ call m(iadst_4x8_internal_8bpc).main_pass2
+ vpbroadcastd m5, [o(pw_2048)]
+ vinserti32x4 m3, xm1, 1
+ vinserti32x4 m2, xm0, 1
+ pxor m4, m4
+ psubw m4, m5
+ pshufd m0, m3, q1032
+ pshufd m1, m2, q1032
+ jmp m(iadst_4x8_internal_8bpc).end
+
+INIT_ZMM avx512icl
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd m0, [o(pw_2896x8)]
+ pmulhrsw m0, [cq]
+ mova m1, [o(int8_permB)]
+ vpbroadcastd m2, [o(pw_1697x8)]
+ vpermb m0, m1, m0
+ pmulhrsw m2, m0
+ paddsw m0, m2
+ vextracti32x8 ym1, m0, 1
+ jmp tx2q
+.pass2:
+ vpbroadcastd ym4, [o(pw_4096)]
+ jmp m(iadst_4x8_internal_8bpc).end2
+
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x16
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+ imul r6d, 181
+ add r6d, 128+2048
+ sar r6d, 8+4
+ vpbroadcastw m0, r6d
+ mova m1, m0
+ jmp m(iadst_4x16_internal_8bpc).end3
+%endif
+%endmacro
+
+%macro IDCT16_1D_PACKED 0
+ punpckhwd m8, m7, m0 ; dct16 in15 in1
+ punpcklwd m9, m4, m0 ; dct4 in2 in0
+ punpckhwd m0, m3, m4 ; dct16 in7 in9
+ punpcklwd m7, m1 ; dct8 in7 in1
+ punpckhwd m1, m6 ; dct16 in3 in13
+ punpcklwd m3, m5 ; dct8 in3 in5
+ punpckhwd m5, m2 ; dct16 in11 in5
+ punpcklwd m6, m2 ; dct4 in3 in1
+cglobal_label .main2
+ vpbroadcastd m10, [o(pd_2048)]
+.main3:
+ vpbroadcastq m13, [o(int_mshift)]
+ vpcmpub k7, m13, m10, 6 ; 0x33...
+ ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 5 ; t8a t15a
+ ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 5 ; t9a t14a
+ ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a
+ ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a
+ ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 5 ; t4a t7a
+ ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 5 ; t5a t6a
+.main4:
+ psubsw m2, m8, m0 ; t9 t14
+ paddsw m8, m0 ; t8 t15
+ psubsw m4, m1, m5 ; t10 t13
+ paddsw m1, m5 ; t11 t12
+ ITX_MUL2X_PACK 6, 0, 5, 10, 1567, 3784 ; t3 t2
+ psubsw m0, m8, m1 ; t11a t12a
+ paddsw m8, m1 ; t8a t15a
+ psubsw m1, m7, m3 ; t5a t6a
+ paddsw m7, m3 ; t4 t7
+.main5:
+ ITX_MUL2X_PACK 2, 3, 5, 10, 1567, 3784, 5 ; t9a t14a
+ ITX_MUL2X_PACK 4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a
+%if mmsize > 16
+ vbroadcasti32x4 m5, [o(deint_shuf)]
+%else
+ mova m5, [o(deint_shuf)]
+%endif
+ vpbroadcastd m11, [o(pw_m2896_2896)]
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ paddsw m3, m2, m4 ; t9 t14
+ psubsw m2, m4 ; t10 t13
+ pshufb m8, m5
+ pshufb m7, m5
+ pshufb m3, m5
+ ITX_MUL2X_PACK 9, 4, 5, 10, 11, 12 ; t0 t1
+ ITX_MUL2X_PACK 1, 4, 5, 10, 12, 11 ; t5 t6
+ ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8 ; t11 t12
+ ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8 ; t10a t13a
+ punpckhqdq m2, m7, m1 ; t7 t6
+ punpcklqdq m7, m1 ; t4 t5
+ psubsw m1, m9, m6 ; dct4 out3 out2
+ paddsw m9, m6 ; dct4 out0 out1
+ packssdw m5, m11 ; t12 t13a
+ packssdw m4, m0 ; t11 t10a
+ punpckhqdq m0, m8, m3 ; t15a t14
+ punpcklqdq m8, m3 ; t8a t9
+ psubsw m3, m9, m2 ; dct8 out7 out6
+ paddsw m9, m2 ; dct8 out0 out1
+ psubsw m2, m1, m7 ; dct8 out4 out5
+ paddsw m1, m7 ; dct8 out3 out2
+ psubsw m7, m9, m0 ; out15 out14
+ paddsw m0, m9 ; out0 out1
+ psubsw m6, m1, m5 ; out12 out13
+ paddsw m1, m5 ; out3 out2
+ psubsw m5, m2, m4 ; out11 out10
+ paddsw m2, m4 ; out4 out5
+ psubsw m4, m3, m8 ; out8 out9
+ paddsw m3, m8 ; out7 out6
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, identity
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+
+cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova ym1, [cq+32*2]
+ vinserti32x8 m1, [cq+32*0], 1
+ mova m0, [o(int16_perm)]
+ mova ym2, [cq+32*3]
+ vinserti32x8 m2, [cq+32*1], 1
+ vpbroadcastd m4, [o(pd_2048)]
+ vpermb m1, m0, m1 ; c0 a0 c1 a1 c2 a2 c3 a3
+ vpermb m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3
+ ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896, 2
+ ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784, 2
+ vpbroadcastd m4, [o(pw_16384)]
+ psubsw m3, m1, m2
+ paddsw m1, m2 ; out0 out1
+ vprord m3, 16 ; out2 out3
+ punpckldq m0, m1, m3
+ punpckhdq m1, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ jmp tx2q
+.pass2:
+ vextracti32x4 xm2, ym0, 1
+ vextracti32x4 xm3, ym1, 1
+ vextracti32x4 xm4, m0, 2
+ vextracti32x4 xm5, m1, 2
+ vextracti32x4 xm6, m0, 3
+ vextracti32x4 xm7, m1, 3
+ call .main
+ vinserti32x4 ym0, xm2, 1
+ vinserti32x4 ym1, xm3, 1
+ vinserti32x4 ym4, xm6, 1
+ vinserti32x4 ym5, xm7, 1
+ vinserti32x8 m0, ym4, 1
+ vinserti32x8 m1, ym5, 1
+ vpbroadcastd m5, [o(pw_2048)]
+ pshufd m1, m1, q1032
+ jmp m(iadst_4x16_internal_8bpc).end2
+ALIGN function_align
+.main:
+ WRAP_XMM IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m1, [o(permB)]
+ vpermq m0, m1, [cq+64*0]
+ vpermq m1, m1, [cq+64*1]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m3, [o(pw_16384)]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmulhrsw m2, m3
+ pmulhrsw m0, m3
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m5, [o(pw_2048)]
+ psrlq m10, 4
+ psubw m6, m8, m5
+.end:
+ vpbroadcastd m7, [o(pw_2896x8)]
+ paddsw ym1, ym2, ym4
+ psubsw ym2, ym4
+ vinserti32x8 m1, ym2, 1
+ pmulhrsw m1, m7 ; -out7 out4 out6 -out5 out8 -out11 -out9 out10
+ psrlq m0, m10, 4
+ vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d
+ vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f
+ punpcklqdq m5, m6
+.end2:
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+.end3:
+ vpbroadcastd m3, strided
+ pmulld m5, m3, [o(pd_0to15)]
+ kxnorw k1, k1, k1
+ kmovw k2, k1
+ vpgatherdd m3{k1}, [dstq+m5]
+ pxor m4, m4
+ mova [cq+64*0], m4
+ mova [cq+64*1], m4
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ vpscatterdd [dstq+m5]{k2}, m0
+ RET
+ALIGN function_align
+.main:
+ movu m3, [o(permB+1)]
+ psrlq m10, m3, 4
+.main2:
+ vpermi2q m3, m0, m1 ; in15 in12 in13 in14 in11 in8 in9 in10
+ vpermt2q m0, m10, m1 ; in0 in3 in2 in1 in4 in7 in6 in5
+ vpbroadcastd m9, [o(pd_2048)]
+ vpbroadcastq ym13, [o(int_mshift)]
+ kxnorb k1, k1, k1
+ punpckhwd m4, m3, m0 ; in12 in3 in14 in1
+ punpcklwd m0, m3 ; in0 in15 in2 in13
+ kshiftrb k1, k1, 4
+ vextracti32x8 ym3, m4, 1 ; in8 in7 in10 in5
+ vextracti32x8 ym1, m0, 1 ; in4 in11 in6 in9
+INIT_YMM avx512icl
+ vpcmpub k7, m13, m9, 6 ; 0x33...
+ pxor m8, m8
+ ITX_MUL4X_PACK 0, 2, 5, 6, 7, 9, 201, 4091, 995, 3973, 5
+ ITX_MUL4X_PACK 1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5
+ ITX_MUL4X_PACK 3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5
+ ITX_MUL4X_PACK 4, 2, 5, 6, 7, 9, 3857, 1380, 4052, 601, 5
+ psubsw m2, m0, m3 ; t9a t8a t11a t10a
+ paddsw m0, m3 ; t1a t0a t3a t2a
+ psubsw m3, m1, m4 ; t13a t12a t15a t14a
+ paddsw m4, m1 ; t5a t4a t7a t6a
+ ITX_MUL4X_PACK 2, 1, 5, 6, 7, 9, 799, 4017, 3406, 2276, 5
+ psubw m7, m8, m7
+ ITX_MUL2X_PACK 3, 1, 5, 9, 7, 6, 4
+ vpbroadcastd m6, [o(pw_3784_m1567)]
+ vpbroadcastd m6{k1}, [o(pw_m3784_1567)]
+ psubsw m1, m0, m4 ; t5 t4 t7 t6
+ paddsw m0, m4 ; t1 t0 t3 t2
+ psubsw m4, m2, m3 ; t13a t12a t15a t14a
+ paddsw m2, m3 ; t9a t8a t11a t10a
+ ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t4a t5a t7a t6a
+ ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14
+ vbroadcasti32x4 m5, [o(deint_shuf)]
+ pshufb m0, m5
+ pshufb m2, m5
+ vshufi32x4 m3, m0, m2, 0x03 ; t3 t2 t11a t10a
+ vinserti32x4 m0, xm2, 1 ; t1 t0 t9a t8a
+ vshufi32x4 m2, m1, m4, 0x03 ; t7a t6a t15 t14
+ vinserti32x4 m1, xm4, 1 ; t4a t5a t12 t13
+ pshufd m2, m2, q1032 ; t6a t7a t14 t15
+ psubsw m4, m0, m3 ; t3a t2a t11 t10
+ paddsw m0, m3 ; -out15 out0 out14 -out1
+ paddsw m3, m1, m2 ; out12 -out3 -out13 out2
+ psubsw m1, m2 ; t7 t6 t15a t14a
+ punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a
+ punpcklqdq m4, m1 ; t3a t7 t11 t15a
+INIT_ZMM avx512icl
+ vinserti32x8 m3, ym0, 1 ; out12 -out3 -out13 out2 -out15 out0 out14 -out1
+ ret
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m1, [o(permB)]
+ vpermq m0, m1, [cq+64*0]
+ vpermq m1, m1, [cq+64*1]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m3, [o(pw_16384)]
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ jmp tx2q
+.pass2:
+ call m(iadst_4x16_internal_8bpc).main
+ vpbroadcastd m6, [o(pw_2048)]
+ psrlq m10, 12
+ psubw m5, m8, m6
+ jmp m(iadst_4x16_internal_8bpc).end
+
+INV_TXFM_4X16_FN identity, dct
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
+cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m2, [o(int16_perm)]
+ vpermb m1, m2, [cq+64*0]
+ vpermb m2, m2, [cq+64*1]
+ vpbroadcastd m4, [o(pw_1697x8)]
+ vpbroadcastd m0, [o(pd_m1)]
+ pmulhrsw m3, m4, m1 ; we want to do a signed avg, but pavgw is
+ vpcmpw k1, m1, m0, 4 ; unsigned. as long as both signs are equal
+ pmulhrsw m4, m2 ; it still works, but if the input is -1 the
+ vpcmpw k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes
+ vpavgw m1{k1}{z}, m3 ; pavgw to output -32768 instead of 0 unless
+ vpavgw m2{k2}{z}, m4 ; we explicitly deal with that case here.
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x16)]
+ vpbroadcastd m5, [o(pw_2048)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m0
+ paddsw m1, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_4x16_internal_8bpc).end2
+
+%macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3]
+ movq xm%3, [dstq ]
+ movhps xm%3, [dstq+%5]
+ movq xm%4, [dstq+%6]
+ movhps xm%4, [dstq+%7]
+ pmovzxbw m%3, xm%3
+ pmovzxbw m%4, xm%4
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ vextracti32x4 xm%4, m%3, 1
+ movq [dstq ], xm%3
+ movhps [dstq+%6], xm%3
+ movq [dstq+%5], xm%4
+ movhps [dstq+%7], xm%4
+%endmacro
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x4
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_2048)]
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ jmp m(iadst_8x4_internal_8bpc).end3
+%endif
+%endmacro
+
+INIT_YMM avx512icl
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
+
+cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd xm3, [o(pw_2896x8)]
+ pmulhrsw xm0, xm3, [cq+16*0]
+ pmulhrsw xm1, xm3, [cq+16*1]
+ pmulhrsw xm2, xm3, [cq+16*2]
+ pmulhrsw xm3, [cq+16*3]
+ call m(idct_4x8_internal_8bpc).main
+ vbroadcasti32x4 m4, [o(deint_shuf)]
+ vinserti32x4 m3, m1, xm3, 1
+ vinserti32x4 m1, m0, xm2, 1
+ shufps m0, m1, m3, q0220
+ shufps m1, m3, q1331
+ pshufb m0, m4
+ pshufb m1, m4
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ jmp m(iadst_8x4_internal_8bpc).end2
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd xm0, [o(pw_2896x8)]
+ pshufd xm4, [cq+16*0], q1032
+ pmulhrsw xm3, xm0, [cq+16*3]
+ pshufd xm5, [cq+16*1], q1032
+ pmulhrsw xm2, xm0, [cq+16*2]
+ pmulhrsw xm4, xm0
+ pmulhrsw xm5, xm0
+ call m(iadst_4x8_internal_8bpc).main_pass1
+ vinserti32x4 m0, xm2, 1
+ vinserti32x4 m1, xm3, 1
+ pxor m3, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ psubsw m3, m2
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+.end2:
+ vpbroadcastd m2, [o(pw_2048)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+.end3:
+ pxor m2, m2
+ mova [cq], zmm18
+ lea r6, [strideq*3]
+ WRITE_8X4 0, 1, 4, 5
+ RET
+ALIGN function_align
+.main:
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd xm0, [o(pw_2896x8)]
+ pshufd xm4, [cq+16*0], q1032
+ pmulhrsw xm3, xm0, [cq+16*3]
+ pshufd xm5, [cq+16*1], q1032
+ pmulhrsw xm2, xm0, [cq+16*2]
+ pmulhrsw xm4, xm0
+ pmulhrsw xm5, xm0
+ call m(iadst_4x8_internal_8bpc).main_pass1
+ vinserti32x4 m3, m3, xm1, 1
+ vinserti32x4 m2, m2, xm0, 1
+ punpckhwd m1, m3, m2
+ punpcklwd m3, m2
+ pxor m0, m0
+ psubsw m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call m(iadst_8x4_internal_8bpc).main
+ mova m2, m1
+ vpermq m1, m0, q2031
+ vpermq m0, m2, q2031
+ jmp m(iadst_8x4_internal_8bpc).end2
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova xm2, [cq+16*0]
+ mova xm0, [cq+16*1]
+ vinserti32x4 m2, [cq+16*2], 1
+ vinserti32x4 m0, [cq+16*3], 1
+ vpbroadcastd m3, [o(pw_2896x8)]
+ punpcklwd m1, m2, m0
+ punpckhwd m2, m0
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ paddsw m0, m0
+ paddsw m1, m1
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_8x4_internal_8bpc).end
+
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x8
+%ifidn %1_%2, dct_dct
+INIT_ZMM avx512icl
+ movsx r6d, word [cq]
+ mov [cq], eobd
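+; dc-only path (also reached from other block sizes): scale the single dc
+; coefficient and add the result to every pixel using gather/scatter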
+.dconly:
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+.dconly2:
+ vpbroadcastd ym2, strided
+ imul r6d, 181
+ pmulld ym5, ym2, [o(pd_0to15)]
+ kxnorb k1, k1, k1
+ add r6d, 128+2048
+ sar r6d, 8+4
+ pxor m3, m3
+ vpbroadcastw m4, r6d
+.dconly_loop:
+ kmovb k2, k1
+ vpgatherdq m2{k1}, [dstq+ym5]
+ punpcklbw m0, m2, m3
+ punpckhbw m1, m2, m3
+ paddw m0, m4
+ paddw m1, m4
+ packuswb m0, m1
+ kmovb k1, k2
+ vpscatterdq [dstq+ym5]{k2}, m0
+ lea dstq, [dstq+strideq*8]
+ sub r3d, 8
+ jg .dconly_loop
+ RET
+INIT_YMM avx512icl
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, identity
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+
+cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120 ; 0 1
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ vpermq m1, [cq+32*1], q3120 ; 2 3
+ call .main
+ shufps m4, m0, m1, q0220
+ shufps m5, m0, m1, q1331
+ shufps m1, m2, m3, q0220
+ shufps m3, m2, m3, q1331
+ vbroadcasti32x4 m0, [o(deint_shuf)]
+ vpbroadcastd m2, [o(pw_16384)]
+ REPX {pshufb x, m0}, m4, m5, m1, m3
+ REPX {pmulhrsw x, m2}, m4, m5, m1, m3
+ vinserti32x4 m0, m4, xm1, 1
+ vshufi32x4 m2, m4, m1, 0x03
+ vinserti32x4 m1, m5, xm3, 1
+ vshufi32x4 m3, m5, m3, 0x03
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m4, [o(pw_2048)]
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ jmp m(iadst_8x8_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m4, [cq+32*0], q1302 ; 1 0
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m5, [cq+32*1], q1302 ; 3 2
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ call .main_pass1
+ vpbroadcastd m5, [o(pw_16384_m16384)]
+ punpcklwd m4, m0, m1
+ punpckhwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ punpcklwd m3, m4, m0
+ punpckhwd m4, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ REPX {pmulhrsw x, m5}, m3, m4, m0, m1
+ vshufi32x4 m2, m3, m0, 0x03
+ vinserti32x4 m0, m3, xm0, 1
+ vshufi32x4 m3, m4, m1, 0x03
+ vinserti32x4 m1, m4, xm1, 1
+ jmp tx2q
+.pass2:
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call .main_pass2
+ vpbroadcastd m5, [o(pw_2048)]
+ vpbroadcastd xm4, [o(pw_4096)]
+ psubw m4, m5 ; lower half = 2048, upper half = -2048
+.end:
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+.end3:
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+.end4:
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ mova [cq+32*2], m4
+ mova [cq+32*3], m4
+ lea r6, [strideq*3]
+ WRITE_8X4 0, 1, 4, 5
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 2, 3, 4, 5
+ RET
+ALIGN function_align
+.main_pass1:
+ punpckhwd m0, m4, m3 ; 0 7
+ punpckhwd m1, m5, m2 ; 2 5
+ punpcklwd m2, m5 ; 4 3
+ punpcklwd m3, m4 ; 6 1
+ IADST8_1D_PACKED 1
+ punpcklqdq m3, m4, m0 ; out6 -out7
+ punpckhqdq m0, m4 ; out0 -out1
+ ret
+ALIGN function_align
+cglobal_label .main_pass2
+ IADST8_1D_PACKED 2
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m4, [cq+32*0], q1302 ; 1 0
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m5, [cq+32*1], q1302 ; 3 2
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ call m(iadst_8x8_internal_8bpc).main_pass1
+ vpbroadcastd m5, [o(pw_m16384_16384)]
+ punpckhwd m4, m3, m2
+ punpcklwd m3, m2
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ punpckhwd m0, m4, m3
+ punpcklwd m4, m3
+ punpckhwd m3, m2, m1
+ punpcklwd m2, m1
+ REPX {pmulhrsw x, m5}, m0, m4, m3, m2
+ vinserti32x4 m1, m0, xm3, 1
+ vshufi32x4 m3, m0, m3, 0x03
+ vinserti32x4 m0, m4, xm2, 1
+ vshufi32x4 m2, m4, m2, 0x03
+ jmp tx2q
+.pass2:
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_8x8_internal_8bpc).main_pass2
+ vpbroadcastd m4, [o(pw_2048)]
+ vpbroadcastd xm5, [o(pw_4096)]
+ psubw m4, m5 ; lower half = -2048, upper half = 2048
+ vpermq m5, m3, q2031
+ vpermq m3, m0, q2031
+ vpermq m0, m2, q2031
+ vpermq m2, m1, q2031
+ pmulhrsw m1, m0, m4
+ pmulhrsw m0, m5, m4
+ jmp m(iadst_8x8_internal_8bpc).end3
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova xm3, [cq+16*0]
+ mova xm2, [cq+16*1]
+ vinserti32x4 m3, [cq+16*4], 1
+ vinserti32x4 m2, [cq+16*5], 1
+ mova xm4, [cq+16*2]
+ mova xm0, [cq+16*3]
+ vinserti32x4 m4, [cq+16*6], 1
+ vinserti32x4 m0, [cq+16*7], 1
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ vpbroadcastd m4, [o(pw_4096)]
+ jmp m(iadst_8x8_internal_8bpc).end
+
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x16
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 16
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
+%endif
+%endmacro
+
+%macro ITX_8X16_LOAD_COEFS 0
+ vpbroadcastd m4, [o(pw_2896x8)]
+ pmulhrsw m0, m4, [cq+32*0]
+ add cq, 32*4
+ pmulhrsw m7, m4, [cq+32*3]
+ pmulhrsw m1, m4, [cq-32*3]
+ pmulhrsw m6, m4, [cq+32*2]
+ pmulhrsw m2, m4, [cq-32*2]
+ pmulhrsw m5, m4, [cq+32*1]
+ pmulhrsw m3, m4, [cq-32*1]
+ pmulhrsw m4, [cq+32*0]
+%endmacro
+
+INIT_ZMM avx512icl
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+
+cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m3, [o(permB)]
+ vpermq m0, m3, [cq+64*0]
+ vpbroadcastd m4, [o(pw_2896x8)]
+ vpermq m1, m3, [cq+64*1]
+ vpermq m2, m3, [cq+64*2]
+ vpermq m3, m3, [cq+64*3]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ call m(idct_16x8_internal_8bpc).main
+ vpbroadcastd m5, [o(pw_16384)]
+ punpckhwd m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3
+ punpcklwd m0, m2 ; a0 e0 a1 e1 a2 e2 a3 e3
+ punpckhwd m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3
+ punpcklwd m1, m3 ; d0 h0 d1 h1 d2 h2 d3 h3
+ REPX {pmulhrsw x, m5}, m4, m0, m2, m1
+ punpckhwd m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3
+ punpcklwd m0, m4 ; a0 b0 e0 f0 a1 b1 e1 f1
+ punpckhwd m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 h3
+ punpcklwd m2, m1 ; c0 d0 g0 h0 c1 d1 g1 h1
+ punpckhdq m1, m0, m2 ; 1 5 9 13
+ punpckldq m0, m2 ; 0 4 8 12
+ punpckldq m2, m3, m4 ; 2 6 10 14
+ punpckhdq m3, m4 ; 3 7 11 15
+ jmp tx2q
+.pass2:
+ vprord m5, [o(int16_perm)], 16
+ vshufi32x4 m2, m2, q1320 ; 2 10 14 6
+ vshufi32x4 m4, m1, m3, q2310 ; 1 5 15 11
+ vshufi32x4 m1, m3, q0132 ; 9 13 7 3
+ vpermb m9, m5, m0
+ vpermb m7, m5, m2
+ vpermb m8, m5, m4
+ vpermb m0, m5, m1
+ vextracti32x8 ym6, m9, 1
+ vextracti32x8 ym3, m7, 1
+ vextracti32x8 ym5, m8, 1
+ vextracti32x8 ym1, m0, 1
+ call .main2
+ mova ym8, [o(gather8a)]
+ lea r3, [dstq+strideq*4]
+ pmovzxdq m9, ym8
+ pshufd ym8, ym8, q1230
+ vpermt2q m0, m9, m4
+ vpermt2q m1, m9, m5
+ vpermt2q m2, m9, m6
+ vpermt2q m3, m9, m7
+.end:
+ vpbroadcastd m7, [o(pw_2048)]
+.end2:
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+.end3:
+ pmulhrsw m2, m7
+ pmulhrsw m3, m7
+.end4:
+ vpbroadcastd ym6, strided
+ kxnorb k1, k1, k1
+ pxor m4, m4
+ pmulld ym8, ym6
+ kmovb k2, k1
+ vpgatherdq m6{k1}, [dstq+ym8]
+ kmovb k1, k2
+ vpgatherdq m7{k2}, [r3+ym8]
+ mova [cq+64*0], m4
+ mova [cq+64*1], m4
+ kmovb k2, k1
+ mova [cq+64*2], m4
+ mova [cq+64*3], m4
+ punpcklbw m5, m6, m4
+ punpckhbw m6, m4
+ paddw m0, m5
+ paddw m1, m6
+ packuswb m0, m1
+ vpscatterdq [dstq+ym8]{k1}, m0
+ punpcklbw m6, m7, m4
+ punpckhbw m7, m4
+ paddw m2, m6
+ paddw m3, m7
+ packuswb m2, m3
+ vpscatterdq [r3+ym8]{k2}, m2
+ RET
+ALIGN function_align
+cglobal_label .main_fast2 ; bottom three-quarters are zero
+ vpbroadcastd ym10, [o(pd_2048)]
+ vpbroadcastq ym13, [o(int_mshift)]
+ vpbroadcastd ym3, [o(pw_401_4076x8)]
+ vpbroadcastd ym5, [o(pw_799_4017x8)]
+ vpbroadcastd ym4, [o(pw_m1189_3920x8)]
+ pxor ym6, ym6
+ punpckhwd ym2, ym0, ym0
+ pmulhrsw ym2, ym3 ; t8a t15a
+ punpcklwd ym7, ym1, ym1
+ pmulhrsw ym7, ym5 ; t4a t7a
+ punpckhwd ym1, ym1
+ pmulhrsw ym4, ym1 ; t11a t12a
+ vpcmpub k7, ym13, ym10, 6
+ punpcklwd ym9, ym6, ym0
+ psubsw ym0, ym2, ym4 ; t11a t12a
+ paddsw ym8, ym2, ym4 ; t8a t15a
+ mova ym1, ym7
+ jmp .main5
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+ vpbroadcastd ym10, [o(pd_2048)]
+ vpbroadcastq ym13, [o(int_mshift)]
+ pxor ym6, ym6
+ punpckhwd ym8, ym0, ym0
+ punpckhwd ym4, ym3, ym3
+ punpckhwd ym5, ym2, ym2
+ punpcklwd ym7, ym1, ym1
+ punpckhwd ym1, ym1
+ punpcklwd ym3, ym3
+ punpcklwd ym9, ym6, ym0
+ punpcklwd ym6, ym2
+ vpbroadcastd ym2, [o(pw_401_4076x8)]
+ vpbroadcastd ym0, [o(pw_m2598_3166x8)]
+ vpbroadcastd ym11, [o(pw_1931_3612x8)]
+ vpbroadcastd ym12, [o(pw_m1189_3920x8)]
+ pmulhrsw ym8, ym2 ; t8a t15a
+ vpbroadcastd ym2, [o(pw_799_4017x8)]
+ pmulhrsw ym0, ym4 ; t9a t14a
+ vpbroadcastd ym4, [o(pw_m2276_3406x8)]
+ pmulhrsw ym5, ym11 ; t10a t13a
+ pmulhrsw ym1, ym12 ; t11a t12a
+ pmulhrsw ym7, ym2 ; t4a t7a
+ pmulhrsw ym3, ym4 ; t5a t6a
+ vpcmpub k7, ym13, ym10, 6
+ jmp .main4
+ALIGN function_align
+cglobal_label .main
+ WRAP_YMM IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_16x8_internal_8bpc).main_pass1
+ vbroadcasti32x4 m6, [o(int_shuf1)]
+ vpbroadcastd m7, [o(pw_16384_m16384)]
+ punpckhwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpcklwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3
+ pshufb m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
+ pshufb m2, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
+.pass1_end:
+ REPX {pmulhrsw x, m7}, m3, m5, m4, m2
+ punpckldq m0, m3, m5 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m3, m5 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckhdq m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m2, m4 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m5
+ punpckhqdq m3, m5
+ jmp tx2q
+.pass2:
+ call .main_pass2
+ vpbroadcastd m6, [o(pw_2048)]
+ psrlq m10, 4
+ psubw m7, m8, m6
+.pass2_end:
+ vpbroadcastd m5, [o(pw_2896x8)]
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m5 ; -out7 out4 out6 -out5
+ pmulhrsw m5, m2 ; out8 -out11 -out9 out10
+ mova ym8, [o(gather8c)]
+ lea r3, [dstq+strideq]
+ psrlq m2, m10, 4
+ vpermi2q m2, m0, m3 ; 1 3 13 15
+ vpermt2q m0, m10, m3 ; 0 2 12 14
+ psrlq m3, m10, 8
+ vpermi2q m3, m1, m5 ; 5 7 9 11
+ psrlq m10, 12
+ vpermt2q m1, m10, m5 ; 4 6 8 10
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ jmp m(idct_8x16_internal_8bpc).end3
+ALIGN function_align
+.main_pass1:
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m5, m2, [cq+64*0]
+ pmulhrsw m3, m2, [cq+64*3]
+ pmulhrsw m1, m2, [cq+64*1]
+ pmulhrsw m2, [cq+64*2]
+ movu m4, [o(permA+3)]
+ psrlq m10, m4, 4
+ mova m6, m4
+ vpermi2q m4, m5, m3 ; in0 in12 in2 in14
+ vpermt2q m5, m10, m3 ; in15 in3 in13 in1
+ vpermi2q m6, m1, m2 ; in4 in8 in6 in10
+ vpermt2q m1, m10, m2 ; in11 in7 in9 in5
+ jmp .main
+ALIGN function_align
+.main_pass2:
+ mova m4, [o(permC)]
+ psrlq m5, m4, 4
+ vpermi2q m4, m0, m2 ; in0 in12 in2 in14
+ psrlq m6, m5, 4
+ vpermi2q m5, m1, m3 ; in15 in3 in13 in1
+ psrlq m10, m6, 4
+ vpermi2q m6, m0, m2 ; in4 in8 in6 in10
+ vpermt2q m1, m10, m3 ; in11 in7 in9 in5
+.main:
+ punpcklwd m0, m4, m5 ; in0 in15 in2 in13
+ punpckhwd m4, m5 ; in12 in3 in14 in1
+ punpcklwd m5, m6, m1 ; in4 in11 in6 in9
+ punpckhwd m6, m1 ; in8 in7 in10 in5
+cglobal_label .main2
+ vpbroadcastd m9, [o(pd_2048)]
+ vpbroadcastq m13, [o(int_mshift)]
+ kxnorb k1, k1, k1
+ vpcmpub k7, m13, m9, 6 ; 0x33...
+ pxor m8, m8
+ ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5
+ ITX_MUL4X_PACK 6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5
+ ITX_MUL4X_PACK 4, 1, 2, 3, 7, 9, 3857, 1380, 4052, 601, 5
+ ITX_MUL4X_PACK 5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5
+ psubsw m2, m0, m6 ; t9a t8a t11a t10a
+ paddsw m0, m6 ; t1a t0a t3a t2a
+ psubsw m3, m5, m4 ; t13a t12a t15a t14a
+ paddsw m5, m4 ; t5a t4a t7a t6a
+ ITX_MUL4X_PACK 2, 4, 1, 6, 7, 9, 799, 4017, 3406, 2276, 5
+ psubw m7, m8, m7
+ ITX_MUL2X_PACK 3, 4, 1, 9, 7, 6, 4
+ vpbroadcastd m6, [o(pw_3784_m1567)]
+ vpbroadcastd m6{k1}, [o(pw_m3784_1567)]
+ psubsw m1, m0, m5 ; t5 t4 t7 t6
+ paddsw m0, m5 ; t1 t0 t3 t2
+ psubsw m4, m2, m3 ; t13a t12a t15a t14a
+ paddsw m2, m3 ; t9a t8a t11a t10a
+ ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a
+ ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15
+ vbroadcasti32x4 m5, [o(deint_shuf)]
+ pshufb m0, m5
+ pshufb m2, m5
+ vshufi32x4 m3, m0, m2, q3232 ; t3 t2 t11a t10a
+ vinserti32x8 m0, ym2, 1 ; t1 t0 t9a t8a
+ vshufi32x4 m2, m1, m4, q3232 ; t6a t7a t14 t15
+ vinserti32x8 m1, ym4, 1 ; t5a t4a t13 t12
+ pshufd m2, m2, q1032 ; t7a t6a t15 t14
+ psubsw m4, m0, m3 ; t3a t2a t11 t10
+ paddsw m0, m3 ; -out15 out0 out14 -out1
+ paddsw m3, m1, m2 ; out12 -out3 -out13 out2
+ psubsw m1, m2 ; t7 t6 t15a t14a
+ punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a
+ punpcklqdq m4, m1 ; t3a t7 t11 t15a
+ ret
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_16x8_internal_8bpc).main_pass1
+ vbroadcasti32x4 m6, [o(int_shuf2)]
+ vpbroadcastd m7, [o(pw_m16384_16384)]
+ punpcklwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3
+ pshufb m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
+ pshufb m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
+ jmp m(iadst_8x16_internal_8bpc).pass1_end
+.pass2:
+ call m(iadst_8x16_internal_8bpc).main_pass2
+ vpbroadcastd m7, [o(pw_2048)]
+ psrlq m10, 36
+ psubw m6, m8, m7
+ jmp m(iadst_8x16_internal_8bpc).pass2_end
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [o(int16_perm)]
+ vpermb m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
+ vpermb m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
+ vpermb m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
+ vpermb m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
+ vpbroadcastd m5, [o(pw_2896x8)]
+ punpckldq m1, m3, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m3, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m2, m4, m0 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m4, m0 ; e2 f2 g2 h2 e3 f3 g3 h3
+ REPX {pmulhrsw x, m5}, m1, m2, m3, m4
+ punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0
+ punpckhqdq m1, m2 ; a1 b1 c1 d1 e1 f1 g1 h1
+ punpcklqdq m2, m3, m4 ; a2 b2 c2 d2 e2 f2 g2 h2
+ punpckhqdq m3, m4 ; a3 b3 c3 d3 e3 f3 g3 h3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m7, [o(pw_1697x16)]
+ mova ym8, [o(gather8b)]
+ lea r3, [dstq+strideq*2]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ REPX {paddsw x, x}, m0, m1, m2, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(idct_8x16_internal_8bpc).end
+
+%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
+ pmovzxbw m%3, [dstq+%5]
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+ pmovzxbw m%4, [dstq+%6]
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ vpermq m%3, m%3, q3120
+ mova [dstq+%5], xm%3
+ vextracti32x4 [dstq+%6], m%3, 1
+%endmacro
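+; WRITE_16X2 adds two 16-pixel rows of residuals to the destination: it
+; zero-extends the dst bytes to words (pmovzxbw), adds the coefficients,
+; packs back to bytes with unsigned saturation and stores both rows.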
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x4
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2
+%endif
+%endmacro
+
+INIT_ZMM avx512icl
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
+
+cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova xm0, [cq+16*0]
+ mova xm1, [cq+16*1]
+ mova xm2, [cq+16*2]
+ mova xm3, [cq+16*3]
+ mova xm4, [cq+16*4]
+ mova xm5, [cq+16*5]
+ mova xm6, [cq+16*6]
+ mova xm7, [cq+16*7]
+ call m(idct_4x16_internal_8bpc).main
+ vpbroadcastd m8, [o(pw_16384)]
+ vinserti32x4 ym1, xm3, 1 ; 3 2 7 6
+ vinserti32x4 ym5, xm7, 1 ; b a f e
+ vinserti32x4 ym0, xm2, 1 ; 0 1 4 5
+ vinserti32x4 ym4, xm6, 1 ; 8 9 c d
+ vinserti32x8 m1, ym5, 1 ; 3 2 7 6 b a f e
+ vinserti32x8 m0, ym4, 1 ; 0 1 4 5 8 9 c d
+ pmulhrsw m1, m8
+ pmulhrsw m0, m8
+ pshufd m1, m1, q1032
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ mova m2, [o(permA)]
+ jmp m(iadst_16x4_internal_8bpc).end
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+64*0]
+ mova m1, [cq+64*1]
+ movshdup m3, [o(permB)]
+ psrlq m10, m3, 4
+ call m(iadst_4x16_internal_8bpc).main2
+ vpbroadcastd m6, [o(pw_16384_m16384)]
+ psrlq m0, m10, 4
+ psrlq m10, 8
+.pass1_end:
+ punpcklwd ym5, ym4, ym2
+ punpckhwd ym4, ym2
+ vinserti32x8 m5, ym4, 1
+ mova m1, m9
+ vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16}
+ mova m4, m9
+ vpdpwssd m4, m5, [o(pw_2896_2896)] {1to16}
+ psrad m1, 12
+ psrad m4, 12
+ packssdw m1, m4 ; out8 -out7 -out9 out6 -out11 out4 out10 -out5
+ vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d
+ vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ jmp tx2q
+.pass2:
+ call .main
+ movu m2, [o(permA+1)]
+.end:
+ vpbroadcastd m3, [o(pw_2048)]
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+.end2:
+ psrlq m3, m2, 4
+ vpermi2q m2, m0, m1
+ vpermi2q m3, m0, m1
+.end3:
+ lea r3, [dstq+strideq*2]
+ mova xm1, [dstq+strideq*0]
+ vinserti32x4 ym1, [dstq+strideq*1], 1
+ vinserti32x4 m1, [r3 +strideq*0], 2
+ vinserti32x4 m1, [r3 +strideq*1], 3
+ pxor m4, m4
+ mova [cq+64*0], m4
+ mova [cq+64*1], m4
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [r3 +strideq*0], m0, 2
+ vextracti32x4 [r3 +strideq*1], m0, 3
+ RET
+ALIGN function_align
+.main:
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+64*0]
+ mova m1, [cq+64*1]
+ movshdup m3, [o(permB)]
+ psrlq m10, m3, 4
+ call m(iadst_4x16_internal_8bpc).main2
+ vpbroadcastd m6, [o(pw_m16384_16384)]
+ psrlq m0, m10, 12
+ psrlq m10, 16
+ jmp m(iadst_16x4_internal_8bpc).pass1_end
+.pass2:
+ call m(iadst_16x4_internal_8bpc).main
+ movu m2, [o(permA+2)]
+ jmp m(iadst_16x4_internal_8bpc).end
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m1, [cq+64*0]
+ mova m2, [cq+64*1]
+ vpbroadcastd m3, [o(pw_1697x16)]
+ vpbroadcastd m4, [o(pw_16384)]
+ mova m5, [o(idtx_16x4p)]
+ shufps m0, m1, m2, q2020
+ shufps m1, m2, q3131
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddsw m0, m2
+ paddsw m1, m3
+ vpermb m0, m5, m0
+ vpermb m1, m5, m1
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ movu m2, [o(permA+1)]
+ jmp m(iadst_16x4_internal_8bpc).end
+
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x8
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 8
+.dconly:
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+.dconly2:
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+.dconly3:
+ imul r6d, 181
+ lea r2, [strideq*3]
+ add r6d, 128+2048
+ sar r6d, 8+4
+ pxor m2, m2
+ vpbroadcastw m3, r6d
+.dconly_loop:
+ mova xm1, [dstq+strideq*0]
+ vinserti32x4 ym1, [dstq+strideq*1], 1
+ vinserti32x4 m1, [dstq+strideq*2], 2
+ vinserti32x4 m1, [dstq+r2 ], 3
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ paddw m0, m3
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+r2 ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub r3d, 4
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
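+; In the .dconly chain above, each imul by 181 with a rounding shift of 8 is
+; a rounded multiply by 181/256 ~= 1/sqrt(2); the later entry points fold in
+; progressively larger final shifts, and .dconly_loop broadcasts the resulting
+; DC value and adds it to the destination four rows at a time (r3d = rows left).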
+
+%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
+ vpbroadcastd m8, [o(pw_2896x8)]
+ vpermq m0, [cq+32*0], q3120
+ add cq, 32*4
+ vpermq m7, [cq+32*3], q%1
+ vpermq m1, [cq-32*3], q%1
+ vpermq m6, [cq+32*2], q3120
+ vpermq m2, [cq-32*2], q3120
+ vpermq m5, [cq+32*1], q%1
+ vpermq m3, [cq-32*1], q%1
+ vpermq m4, [cq+32*0], q3120
+ REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd m1, [o(pw_2896x8)]
+ vpermq m0, [cq+64*0], q3120
+ vpermq m2, [cq+64*1], q3120
+ vpermq m4, [cq+64*2], q3120
+ vpermq m6, [cq+64*3], q3120
+ REPX {pmulhrsw x, m1}, m0, m2, m4, m6
+ vextracti32x8 ym1, m0, 1
+ vextracti32x8 ym3, m2, 1
+ vextracti32x8 ym5, m4, 1
+ vextracti32x8 ym7, m6, 1
+ call m(idct_8x16_internal_8bpc).main
+ vbroadcasti32x4 m8, [o(int_shuf1)]
+ vbroadcasti32x4 m9, [o(int_shuf2)]
+ vinserti32x8 m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3
+ vinserti32x8 m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3
+ vinserti32x8 m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3
+ vinserti32x8 m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 k3
+ vpbroadcastd m2, [o(pw_16384)]
+ pshufb m0, m8 ; a0 b0 a1 b1 a2 b2 a3 b3
+ pshufb m1, m9 ; c0 d0 c1 d1 c2 d2 c3 d3
+ pshufb m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3
+ pshufb m7, m5, m9 ; k0 l0 k1 l1 k2 l2 k3 l3
+ REPX {pmulhrsw x, m2}, m0, m1, m6, m7
+ punpckldq m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3
+ jmp tx2q
+.pass2:
+ vshufi32x4 m0, m2, m4, q2020 ; 0 1
+ vshufi32x4 m2, m4, q3131 ; 4 5
+ vshufi32x4 m1, m3, m5, q2020 ; 2 3
+ vshufi32x4 m3, m5, q3131 ; 6 7
+ call .main
+ movshdup m4, [o(permC)]
+ psrlq m6, m4, 4
+ vpermq m5, m4, q1032
+ vpermi2q m4, m0, m2 ; a2 a3 b2 b3 e2 e3 f2 f3
+ vpermt2q m0, m6, m2 ; a0 a1 b0 b1 e0 e1 f0 f1
+ psrlq m6, m5, 4
+ vpermi2q m5, m1, m3 ; c2 c3 d2 d3 g2 g3 h2 h3
+ vpermt2q m1, m6, m3 ; c0 c1 d0 d1 g0 g1 h0 h1
+ vpbroadcastd m6, [o(pw_2048)]
+.end:
+ REPX {pmulhrsw x, m6}, m0, m4, m1, m5
+.end2:
+ lea r3, [dstq+strideq*4]
+ lea r4, [strideq*3]
+ mova xm3, [dstq+strideq*0]
+ mova xm6, [dstq+strideq*2]
+ vinserti32x4 ym3, [dstq+strideq*1], 1
+ vinserti32x4 ym6, [dstq+r4 ], 1
+ vinserti32x4 m3, [r3 +strideq*0], 2
+ vinserti32x4 m6, [r3 +strideq*2], 2
+ vinserti32x4 m3, [r3 +strideq*1], 3
+ vinserti32x4 m6, [r3 +r4 ], 3
+ pxor m7, m7
+ mova [cq+64*0], m7
+ mova [cq+64*1], m7
+ mova [cq+64*2], m7
+ mova [cq+64*3], m7
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ paddw m0, m2
+ paddw m4, m3
+ packuswb m0, m4
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [r3 +strideq*0], m0, 2
+ vextracti32x4 [r3 +strideq*1], m0, 3
+ punpcklbw m3, m6, m7
+ punpckhbw m6, m7
+ paddw m1, m3
+ paddw m5, m6
+ packuswb m1, m5
+ mova [dstq+strideq*2], xm1
+ vextracti32x4 [dstq+r4 ], ym1, 1
+ vextracti32x4 [r3 +strideq*2], m1, 2
+ vextracti32x4 [r3 +r4 ], m1, 3
+ RET
+ALIGN function_align
+cglobal_label .main
+ IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_8x16_internal_8bpc).main_pass1
+ vpbroadcastd m7, [o(pw_16384_m16384)]
+ psrlq m10, 4
+.pass1_end:
+ punpcklwd m5, m4, m2
+ punpckhwd m4, m2
+ mova m1, m9
+ vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16}
+ mova m6, m9
+ vpdpwssd m6, m5, [o(pw_2896_2896)] {1to16}
+ mova m2, m9
+ vpdpwssd m2, m4, [o(pw_m2896_2896)] {1to16}
+ vpdpwssd m9, m4, [o(pw_2896_2896)] {1to16}
+ psrad m1, 12
+ psrad m6, 12
+ packssdw m1, m6 ; out8 -out7 -out9 out6
+ psrad m2, 12
+ psrad m9, 12
+ packssdw m2, m9 ; -out11 out4 out10 -out5
+ psrlq m4, m10, 4
+ vpermi2q m4, m0, m2
+ vpermt2q m0, m10, m2
+ psrlq m5, m10, 8
+ vpermi2q m5, m1, m3
+ psrlq m10, 12
+ vpermt2q m1, m10, m3
+ punpcklwd m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3
+ punpckhwd m4, m5 ; b0 d0 b1 d1 b2 d2 b3 d3
+ punpcklwd m5, m1, m0 ; i0 k0 i1 k1 i2 k2 i3 k3
+ punpckhwd m1, m0 ; j0 l0 j1 l1 j2 l2 j3 l3
+ punpcklwd m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhwd m3, m4 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpcklwd m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhwd m5, m1 ; i2 j2 k2 l2 i3 j3 k3 l3
+ REPX {pmulhrsw x, m7}, m2, m3, m4, m5
+ jmp tx2q
+.pass2:
+ vshufi32x4 m0, m2, m4, q2020
+ vshufi32x4 m2, m4, q3131 ; 4 5
+ vshufi32x4 m1, m3, m5, q2020
+ vshufi32x4 m3, m5, q3131 ; 6 7
+ pshufd m4, m0, q1032 ; 1 0
+ pshufd m5, m1, q1032 ; 3 2
+ call .main_pass2
+ movshdup m4, [o(permC)]
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ psrlq m6, m4, 4
+ mova m5, m4
+ vpermi2q m4, m0, m2
+ vpermt2q m0, m6, m2
+ vpermi2q m5, m1, m3
+ vpermt2q m1, m6, m3
+ jmp m(idct_16x8_internal_8bpc).end2
+ALIGN function_align
+.main_pass1:
+ vpbroadcastd m4, [o(pw_2896x8)]
+ pmulhrsw m3, m4, [cq+64*0]
+ pmulhrsw m1, m4, [cq+64*3]
+ pmulhrsw m2, m4, [cq+64*1]
+ pmulhrsw m4, [cq+64*2]
+ mova m5, [o(int16_perm)]
+ kxnorb k1, k1, k1
+ vpblendmd m0{k1}, m1, m3 ; 0 7
+ vmovdqa32 m3{k1}, m1 ; 6 1
+ vpblendmd m1{k1}, m4, m2 ; 2 5
+ vmovdqa32 m2{k1}, m4 ; 4 3
+ REPX {vpermb x, m5, x}, m0, m1, m2, m3
+ IADST8_1D_PACKED 1
+ ret
+ALIGN function_align
+cglobal_label .main_pass2
+ IADST8_1D_PACKED 2
+ pxor m5, m5
+ psubd m5, m6
+ packssdw m6, m5
+ pmulhrsw m2, m6
+ pmulhrsw m3, m6
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_8x16_internal_8bpc).main_pass1
+ vpbroadcastd m7, [o(pw_m16384_16384)]
+ psrlq m10, 20
+ jmp m(iadst_16x8_internal_8bpc).pass1_end
+.pass2:
+ vshufi32x4 m0, m2, m4, q2020
+ vshufi32x4 m2, m4, q3131 ; 4 5
+ vshufi32x4 m1, m3, m5, q2020
+ vshufi32x4 m3, m5, q3131 ; 6 7
+ pshufd m4, m0, q1032 ; 1 0
+ pshufd m5, m1, q1032 ; 3 2
+ call m(iadst_16x8_internal_8bpc).main_pass2
+ movshdup m4, [o(permC)]
+ pmulhrsw m5, m6, m0
+ pmulhrsw m0, m6, m1
+ psrlq m1, m4, 12
+ psrlq m4, 8
+ mova m7, m4
+ vpermi2q m4, m0, m3
+ vpermt2q m0, m1, m3
+ vpermi2q m1, m5, m2
+ vpermt2q m5, m7, m2
+ jmp m(idct_16x8_internal_8bpc).end2
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd m0, [o(pw_2896x8)]
+ pmulhrsw m3, m0, [cq+64*0]
+ pmulhrsw m4, m0, [cq+64*1]
+ pmulhrsw m5, m0, [cq+64*2]
+ pmulhrsw m0, [cq+64*3]
+ vpbroadcastd m7, [o(pw_1697x16)]
+ vpbroadcastd m8, [o(pw_16384)]
+ shufps m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5
+ shufps m3, m4, q3131 ; a2 a3 a6 a7 e2 e3 e6 e7
+ shufps m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5
+ shufps m5, m0, q3131 ; i2 i3 i6 i7 m2 m3 m6 m7
+ mova m9, [o(int8_permA)]
+ pmulhrsw m0, m7, m2
+ pmulhrsw m1, m7, m3
+ pmulhrsw m6, m7, m4
+ pmulhrsw m7, m5
+ REPX {pmulhrsw x, m8}, m0, m1, m6, m7
+ paddsw m2, m0
+ paddsw m3, m1
+ paddsw m4, m6
+ paddsw m5, m7
+ REPX {vpermb x, m9, x}, m2, m3, m4, m5
+ jmp tx2q
+.pass2:
+ mova m7, [o(permB)]
+ vpbroadcastd m6, [o(pw_4096)]
+ vpermq m0, m7, m2
+ vpermq m4, m7, m4
+ vpermq m1, m7, m3
+ vpermq m5, m7, m5
+ jmp m(idct_16x8_internal_8bpc).end
+
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x16
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 16
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+ jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+
+cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m7, [o(permB)]
+ vpermq m0, m7, [cq+64*0]
+ vpermq m1, m7, [cq+64*1]
+ vpermq m2, m7, [cq+64*2]
+ vpermq m3, m7, [cq+64*3]
+ vpermq m4, m7, [cq+64*4]
+ vpermq m5, m7, [cq+64*5]
+ vpermq m6, m7, [cq+64*6]
+ vpermq m7, m7, [cq+64*7]
+ call .main
+ vbroadcasti32x4 m12, [o(int_shuf1)]
+ vbroadcasti32x4 m11, [o(int_shuf2)]
+ vpbroadcastd m13, [o(pw_8192)]
+ pshufb m0, m12
+ pshufb m8, m1, m11
+ pshufb m2, m12
+ pshufb m9, m3, m11
+ pshufb m4, m12
+ pshufb m10, m5, m11
+ pshufb m6, m12
+ pshufb m11, m7, m11
+ REPX {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11
+ punpckhdq m1, m0, m8
+ punpckldq m0, m8
+ punpckhdq m3, m2, m9
+ punpckldq m2, m9
+ punpckhdq m5, m4, m10
+ punpckldq m4, m10
+ punpckhdq m7, m6, m11
+ punpckldq m6, m11
+ jmp tx2q
+.pass2:
+ vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc
+ vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4
+ vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec
+ vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4
+ vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me
+ vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6
+ vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee
+ vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6
+ vshufi32x4 m2, m0, m4, q3131 ; 4 5
+ vshufi32x4 m0, m4, q2020 ; 0 1
+ vshufi32x4 m4, m6, m8, q2020 ; 8 9
+ vshufi32x4 m6, m8, q3131 ; 12 13
+ vshufi32x4 m3, m1, m5, q3131 ; 6 7
+ vshufi32x4 m1, m5, q2020 ; 2 3
+ vshufi32x4 m5, m7, m9, q2020 ; 10 11
+ vshufi32x4 m7, m9, q3131 ; 14 15
+ call .main
+ mova m8, [o(permD)]
+ psrlq m12, m8, 4
+ psrlq m9, m8, 8
+ psrlq m13, m8, 12
+ mova m10, m8
+ vpermi2q m8, m0, m2 ; 0 1 4 5
+ vpermt2q m0, m12, m2
+ mova m11, m9
+ vpermi2q m9, m1, m3 ; 2 3 6 7
+ vpermt2q m1, m13, m3
+ vpermi2q m10, m4, m6 ; 8 9 12 13
+ vpermt2q m4, m12, m6
+ vpermi2q m11, m5, m7 ; 10 11 14 15
+ vpermt2q m5, m13, m7
+.end:
+ vpbroadcastd m12, [o(pw_2048)]
+.end2:
+ REPX {pmulhrsw x, m12}, m0, m1, m4, m5
+.end3:
+ REPX {pmulhrsw x, m12}, m8, m9, m10, m11
+ lea r3, [strideq*3]
+ lea r4, [dstq+strideq*4]
+ lea r5, [dstq+strideq*8]
+ lea r6, [r4 +strideq*8]
+ mova xm3, [dstq+strideq*0]
+ mova xm6, [dstq+strideq*2]
+ vinserti32x4 ym3, [dstq+strideq*1], 1
+ vinserti32x4 ym6, [dstq+r3 ], 1
+ vinserti32x4 m3, [r4+strideq*0], 2
+ vinserti32x4 m6, [r4+strideq*2], 2
+ vinserti32x4 m3, [r4+strideq*1], 3
+ vinserti32x4 m6, [r4+r3 ], 3
+ mova xm12, [r5+strideq*0]
+ mova xm13, [r5+strideq*2]
+ vinserti32x4 ym12, [r5+strideq*1], 1
+ vinserti32x4 ym13, [r5+r3 ], 1
+ vinserti32x4 m12, [r6+strideq*0], 2
+ vinserti32x4 m13, [r6+strideq*2], 2
+ vinserti32x4 m12, [r6+strideq*1], 3
+ vinserti32x4 m13, [r6+r3 ], 3
+ pxor m7, m7
+ REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ paddw m0, m2
+ paddw m8, m3
+ packuswb m0, m8
+ punpcklbw m2, m6, m7
+ punpckhbw m6, m7
+ paddw m1, m2
+ paddw m9, m6
+ packuswb m1, m9
+ punpcklbw m2, m12, m7
+ punpckhbw m12, m7
+ paddw m2, m4
+ paddw m10, m12
+ packuswb m2, m10
+ punpcklbw m3, m13, m7
+ punpckhbw m13, m7
+ paddw m3, m5
+ paddw m11, m13
+ packuswb m3, m11
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti32x4 [dstq+r3 ], ym1, 1
+ vextracti32x4 [r4+strideq*0], m0, 2
+ vextracti32x4 [r4+strideq*1], m0, 3
+ vextracti32x4 [r4+strideq*2], m1, 2
+ vextracti32x4 [r4+r3 ], m1, 3
+ mova [r5+strideq*0], xm2
+ vextracti32x4 [r5+strideq*1], ym2, 1
+ mova [r5+strideq*2], xm3
+ vextracti32x4 [r5+r3 ], ym3, 1
+ vextracti32x4 [r6+strideq*0], m2, 2
+ vextracti32x4 [r6+strideq*1], m2, 3
+ vextracti32x4 [r6+strideq*2], m3, 2
+ vextracti32x4 [r6+r3 ], m3, 3
+ RET
+ALIGN function_align
+cglobal_label .main_fast2 ; bottom three-quarters are zero
+ vpbroadcastd m10, [o(pd_2048)]
+ vpbroadcastq m13, [o(int_mshift)]
+ vpcmpub k7, m13, m10, 6
+.main_fast4:
+ vpbroadcastd m2, [o(pw_401_4076x8)]
+ vpbroadcastd m4, [o(pw_m1189_3920x8)]
+ vpbroadcastd m3, [o(pw_799_4017x8)]
+ pmulhrsw m2, m8 ; t8a t15a
+ pmulhrsw m4, m1 ; t11a t12a
+ pmulhrsw m7, m3 ; t4a t7a
+ pxor m6, m6
+ psubsw m0, m2, m4 ; t11a t12a
+ paddsw m8, m2, m4 ; t8a t15a
+ mova m1, m7
+ jmp .main5
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+ vpbroadcastd m10, [o(pd_2048)]
+.main_fast3:
+ vpbroadcastq m13, [o(int_mshift)]
+ vpcmpub k7, m13, m10, 6
+.main_fast5:
+ vpbroadcastd m2, [o(pw_401_4076x8)]
+ vpbroadcastd m4, [o(pw_m2598_3166x8)]
+ vpbroadcastd m11, [o(pw_1931_3612x8)]
+ vpbroadcastd m12, [o(pw_m1189_3920x8)]
+ pmulhrsw m8, m2 ; t8a t15a
+ vpbroadcastd m2, [o(pw_799_4017x8)]
+ pmulhrsw m0, m4 ; t9a t14a
+ vpbroadcastd m4, [o(pw_m2276_3406x8)]
+ pmulhrsw m5, m11 ; t10a t13a
+ pmulhrsw m1, m12 ; t11a t12a
+ pmulhrsw m7, m2 ; t4a t7a
+ pmulhrsw m3, m4 ; t5a t6a
+ jmp .main4
+ALIGN function_align
+cglobal_label .main
+ IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call .main_pass1
+ vpbroadcastd m10, [o(pw_8192_m8192)]
+ punpcklwd m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3
+ punpckhwd m0, m1 ; a0 c0 a1 c1 a2 c2 a3 c3
+ punpckhwd m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpcklwd m0, m8 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpcklwd m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3
+ punpckhwd m2, m3 ; e0 g0 e1 g1 e2 g2 e3 g3
+ punpckhwd m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpcklwd m2, m8 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhwd m8, m4, m5 ; i0 k0 i1 k1 i2 k2 i3 k3
+ punpcklwd m4, m5 ; j0 l0 j1 l1 j2 l2 j3 l3
+ punpckhwd m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpcklwd m4, m8 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhwd m8, m6, m7 ; m0 o0 m1 o1 m2 o2 m3 o3
+ punpcklwd m6, m7 ; n0 p0 n1 p1 n2 p2 n3 p3
+ punpckhwd m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpcklwd m6, m8 ; m0 n0 o0 p0 m1 n1 o1 p1
+.pass1_end:
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp tx2q
+.pass2:
+ call .main_pass2
+ mova m10, [o(permD)]
+ psrlq m8, m10, 8
+ psrlq m12, m10, 12
+ psrlq m13, m10, 4
+ mova m9, m8
+ vpermi2q m8, m0, m2 ; 0 1 4 5
+ vpermt2q m0, m12, m2
+ vpermi2q m9, m1, m3 ; 2 3 6 7
+ vpermt2q m1, m12, m3
+ vpbroadcastd m12, [o(pw_2048)]
+ mov r3d, 0xff00ff00
+ mova m11, m10
+ vpermi2q m10, m4, m6 ; 8 9 12 13
+ vpermt2q m4, m13, m6
+ kmovd k1, r3d
+ vpermi2q m11, m5, m7 ; 10 11 14 15
+ vpermt2q m5, m13, m7
+ pxor m7, m7
+ vpsubw m12{k1}, m7, m12
+ jmp m(idct_16x16_internal_8bpc).end2
+ALIGN function_align
+.main_pass1:
+ mova m4, [o(permB)]
+ psrlq m3, m4, 4
+ vpermq m0, m4, [cq+64*0]
+ vpermq m7, m3, [cq+64*7]
+ vpermq m6, m4, [cq+64*6]
+ vpermq m1, m3, [cq+64*1]
+ vpermq m2, m4, [cq+64*2]
+ vpermq m5, m3, [cq+64*5]
+ vpermq m4, m4, [cq+64*4]
+ vpermq m3, m3, [cq+64*3]
+ call .main
+ vpbroadcastd m13, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ mova m2, m10
+ vpdpwssd m2, m5, m13 ; -out5
+ mova m8, m10
+ vpdpwssd m8, m11, m13 ; out4
+ mova m9, m10
+ vpdpwssd m9, m5, m12 ; out10
+ mova m5, m10
+ vpdpwssd m5, m11, m12 ; -out11
+ mova m11, m10
+ vpdpwssd m11, m3, m13 ; -out7
+ mova m14, m10
+ vpdpwssd m14, m4, m13 ; out6
+ mova m13, m10
+ vpdpwssd m13, m3, m12 ; out8
+ vpdpwssd m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9
+ REPX {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10
+ packssdw m2, m8 ; -out5 out4
+ packssdw m5, m9, m5 ; out10 -out11
+ packssdw m3, m11, m14 ; -out7 out6
+ packssdw m4, m13, m10 ; out8 -out9
+ ret
+ALIGN function_align
+.main_pass2:
+ vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc
+ vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4
+ vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec
+ vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4
+ vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me
+ vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6
+ vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee
+ vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6
+ vshufi32x4 m2, m0, m4, q3131 ; 4 5
+ vshufi32x4 m0, m4, q2020 ; 0 1
+ vshufi32x4 m4, m6, m8, q2020 ; 8 9
+ vshufi32x4 m6, m8, q3131 ; 12 13
+ vshufi32x4 m3, m1, m5, q3131 ; 6 7
+ vshufi32x4 m1, m5, q2020 ; 2 3
+ vshufi32x4 m5, m7, m9, q2020 ; 10 11
+ vshufi32x4 m7, m9, q3131 ; 14 15
+cglobal_label .main_pass2b
+ REPX {pshufd x, x, q1032}, m1, m3, m5, m7
+ call .main
+ vpbroadcastd m8, [o(pw_2896x8)]
+ pshufb m2, m11, m12
+ pshufb m5, m12
+ pshufb m3, m12
+ pshufb m4, m12
+ punpcklqdq m9, m5, m2 ; t15a t7
+ punpckhqdq m5, m2 ; t14a t6
+ shufps m2, m3, m4, q1032 ; t2a t10
+ shufps m3, m4, q3210 ; t3a t11
+ psubsw m4, m2, m3 ; out8 -out9
+ paddsw m3, m2 ; -out7 out6
+ paddsw m2, m5, m9 ; -out5 out4
+ psubsw m5, m9 ; out10 -out11
+ REPX {pmulhrsw x, m8}, m2, m3, m4, m5
+ ret
+ALIGN function_align
+.main:
+ vpbroadcastd m10, [o(pd_2048)]
+ vpbroadcastq m13, [o(int_mshift)]
+ punpckhwd m8, m7, m0 ; in14 in1
+ punpcklwd m0, m7 ; in0 in15
+ punpcklwd m7, m6, m1 ; in12 in3
+ punpckhwd m1, m6 ; in2 in13
+ punpckhwd m6, m5, m2 ; in10 in5
+ punpcklwd m2, m5 ; in4 in11
+ punpcklwd m5, m4, m3 ; in8 in7
+ punpckhwd m3, m4 ; in6 in9
+ vpcmpub k7, m13, m10, 6 ; 0x33...
+ ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 5 ; t0 t1
+ ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 5 ; t2 t3
+ ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 5 ; t4 t5
+ ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 5 ; t6 t7
+ ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 5 ; t8 t9
+ ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 5 ; t10 t11
+ ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 5 ; t12 t13
+ ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 5 ; t14 t15
+ psubsw m4, m0, m5 ; t9a t8a
+ paddsw m0, m5 ; t1a t0a
+ psubsw m5, m1, m6 ; t11a t10a
+ paddsw m1, m6 ; t3a t2a
+ psubsw m6, m2, m7 ; t13a t12a
+ paddsw m2, m7 ; t5a t4a
+ psubsw m7, m3, m8 ; t15a t14a
+ paddsw m3, m8 ; t7a t6a
+ ITX_MUL2X_PACK 4, 8, 9, 10, 799, 4017, 4 ; t8 t9
+ ITX_MUL2X_PACK 6, 8, 9, 10, 799_4017, 4017_m799, 52 ; t12 t13
+ ITX_MUL2X_PACK 5, 8, 9, 10, 3406, 2276, 4 ; t10 t11
+ ITX_MUL2X_PACK 7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15
+ psubsw m8, m1, m3 ; t7 t6
+ paddsw m1, m3 ; t3 t2
+ psubsw m3, m0, m2 ; t5 t4
+ paddsw m0, m2 ; t1 t0
+ psubsw m2, m5, m7 ; t14a t15a
+ paddsw m7, m5 ; t10a t11a
+ psubsw m5, m4, m6 ; t12a t13a
+ paddsw m4, m6 ; t8a t9a
+ ITX_MUL2X_PACK 3, 6, 9, 10, 1567, 3784, 5 ; t5a t4a
+ ITX_MUL2X_PACK 8, 6, 9, 10, 3784_m1567, 1567_3784, 52 ; t7a t6a
+ ITX_MUL2X_PACK 2, 6, 9, 10, 3784, 1567, 4 ; t15 t14
+ ITX_MUL2X_PACK 5, 6, 9, 10, 3784_1567, 1567_m3784, 52 ; t13 t12
+ vbroadcasti32x4 m12, [o(deint_shuf)]
+ paddsw m6, m4, m7 ; -out1 out14
+ psubsw m4, m7 ; t10 t11
+ psubsw m11, m3, m8 ; t7 t6
+ paddsw m8, m3 ; out12 -out3
+ psubsw m3, m0, m1 ; t3a t2a
+ paddsw m0, m1 ; -out15 out0
+ paddsw m1, m2, m5 ; -out13 out2
+ psubsw m5, m2 ; t15a t14a
+ pshufb m0, m12
+ pshufb m6, m12
+ pshufb m8, m12
+ pshufb m1, m12
+ shufps m7, m6, m0, q1032 ; out14 -out15
+ shufps m0, m6, m0, q3210 ; -out1 out0
+ punpcklqdq m6, m8, m1 ; out12 -out13
+ punpckhqdq m1, m8, m1 ; -out3 out2
+ ret
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_16x16_internal_8bpc).main_pass1
+ vpbroadcastd m10, [o(pw_m8192_8192)]
+ punpcklwd m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3
+ punpckhwd m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3
+ punpckhwd m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3
+ punpcklwd m7, m6 ; b0 d0 b1 d1 b2 d2 b3 d3
+ punpcklwd m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhwd m1, m7 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpcklwd m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1
+ punpckhwd m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpcklwd m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3
+ punpckhwd m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3
+ punpckhwd m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3
+ punpcklwd m5, m4 ; f0 h0 f1 h1 f2 h2 f3 h3
+ punpcklwd m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhwd m3, m5 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpcklwd m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhwd m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3
+ jmp m(iadst_16x16_internal_8bpc).pass1_end
+.pass2:
+ call m(iadst_16x16_internal_8bpc).main_pass2
+ mova m10, [o(permD)]
+ psrlq m8, m10, 8
+ psrlq m12, m10, 12
+ psrlq m13, m10, 4
+ mova m9, m8
+ vpermi2q m8, m7, m5 ; 0 1 4 5
+ vpermt2q m7, m12, m5
+ vpermi2q m9, m6, m4 ; 2 3 6 7
+ vpermt2q m6, m12, m4
+ vpbroadcastd m12, [o(pw_2048)]
+ mov r3d, 0x00ff00ff
+ mova m11, m10
+ vpermi2q m10, m3, m1 ; 8 9 12 13
+ vpermt2q m3, m13, m1
+ kmovd k1, r3d
+ vpermi2q m11, m2, m0 ; 10 11 14 15
+ vpermt2q m2, m13, m0
+ pxor m0, m0
+ vpsubw m12{k1}, m0, m12
+ pmulhrsw m0, m7, m12
+ pmulhrsw m1, m6, m12
+ pmulhrsw m4, m3, m12
+ pmulhrsw m5, m2, m12
+ jmp m(idct_16x16_internal_8bpc).end3
+
+INV_TXFM_16X16_FN identity, dct
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m8, [o(int16_perm)]
+ vpermb m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
+ vpermb m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
+ vpbroadcastd m0, [o(pw_1697x16)]
+ vpermb m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
+ vpermb m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
+ vpermb m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3
+ vpermb m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3
+ vpermb m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3
+ vpermb m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3
+ pmulhrsw m9, m0, m1
+ pmulhrsw m10, m0, m2
+ pmulhrsw m11, m0, m3
+ pmulhrsw m12, m0, m4
+ pmulhrsw m13, m0, m5
+ pmulhrsw m14, m0, m6
+ pmulhrsw m15, m0, m7
+ pmulhrsw m0, m8
+ REPX {psraw x, 1}, m9, m10, m11, m12
+ pavgw m1, m9
+ pavgw m2, m10
+ pavgw m3, m11
+ pavgw m4, m12
+ REPX {psraw x, 1}, m13, m14, m15, m0
+ pavgw m5, m13
+ pavgw m6, m14
+ pavgw m7, m15
+ pavgw m8, m0
+ punpckldq m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m1, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m3, m4 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m5, m6 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpckldq m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1
+ punpckhdq m7, m8 ; m2 n2 o2 p2 m3 n3 o3 p3
+ jmp tx2q
+ALIGN function_align
+.pass2:
+ vpbroadcastd m11, [o(pw_1697x16)]
+ pmulhrsw m12, m11, m0
+ pmulhrsw m13, m11, m1
+ pmulhrsw m14, m11, m2
+ pmulhrsw m15, m11, m3
+ pmulhrsw m8, m11, m4
+ pmulhrsw m9, m11, m5
+ pmulhrsw m10, m11, m6
+ pmulhrsw m11, m7
+ REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
+ paddsw m0, m12
+ paddsw m1, m13
+ paddsw m2, m14
+ paddsw m3, m15
+ paddsw m8, m4
+ movu m4, [o(permD+2)]
+ paddsw m9, m5
+ paddsw m6, m10
+ paddsw m7, m11
+ psrlq m12, m4, 4
+ mova m5, m4
+ mova m10, m4
+ mova m11, m4
+ vpermi2q m4, m0, m2 ; 8 9 12 13
+ vpermt2q m0, m12, m2 ; 0 1 4 5
+ vpermi2q m5, m1, m3 ; 10 11 14 15
+ vpermt2q m1, m12, m3 ; 2 3 6 7
+ vpermi2q m10, m8, m6
+ vpermt2q m8, m12, m6
+ vpermi2q m11, m9, m7
+ vpermt2q m9, m12, m7
+ jmp m(idct_16x16_internal_8bpc).end
+
+%macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4]
+ vpbroadcastd m%4, [o(pw_%5_%6x8)]
+ punpcklwd m%1, m%3, m%3
+ pmulhrsw m%1, m%4
+ vpbroadcastd m%4, [o(pw_%7_%8x8)]
+ punpckhwd m%2, m%3, m%3
+ pmulhrsw m%2, m%4
+%endmacro
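+; ITX_UNPACK_MULHRSW duplicates each source word (punpck{l,h}wd with itself)
+; and does a rounded multiply by a broadcast pair of constants, yielding both
+; packed outputs of a butterfly whose other input is zero (e.g. t16a/t31a).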
+
+cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ cmp eobd, 107
+ jb .fast
+ mova m5, [cq+64*5]
+ mova m3, [cq+64*3]
+ mova m1, [cq+64*1]
+ mova m7, [cq+64*7]
+ mova m2, [cq+64*2]
+ mova m6, [cq+64*6]
+ mova m0, [cq+64*0]
+ mova m4, [cq+64*4]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ mova m8, [o(idct_8x32p)]
+ vpbroadcastd m9, [o(pw_8192)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7
+ punpckldq m8, m0, m1 ; ab
+ punpckhdq m0, m1
+ punpckldq m1, m2, m3 ; cd
+ punpckhdq m2, m3
+ punpckldq m3, m4, m5 ; ef
+ punpckhdq m4, m5
+ punpckldq m5, m6, m7 ; gh
+ punpckhdq m6, m7
+ REPX {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6
+ punpcklqdq m18, m8, m1 ; 30 2 6 26 31 1 23 9
+ punpckhqdq m14, m8, m1 ; 16 0 12 20 3 29 11 21
+ punpcklqdq m21, m0, m2 ; 14 18 22 10 27 5 19 13
+ punpckhqdq m15, m0, m2 ; 28 4 24 8 7 25 15 17
+ punpcklqdq m20, m3, m5
+ punpckhqdq m16, m3, m5
+ punpcklqdq m19, m4, m6
+ punpckhqdq m17, m4, m6
+ vinserti32x4 ym8, ym18, xm20, 1
+ vshufi32x4 ym1, ym18, ym20, 0x03
+ vinserti32x4 ym9, ym14, xm16, 1
+ vshufi32x4 ym3, ym14, ym16, 0x03
+ vinserti32x4 ym0, ym21, xm19, 1
+ vshufi32x4 ym5, ym21, ym19, 0x03
+ vinserti32x4 ym7, ym15, xm17, 1
+ vshufi32x4 ym6, ym15, ym17, 0x03
+ call m(idct_8x16_internal_8bpc).main2
+ psrlq m12, [o(permB)], 60
+ vpermt2q m14, m12, m16
+ vpermt2q m21, m12, m19
+ vpermt2q m15, m12, m17
+ vpermi2q m12, m18, m20
+ vextracti32x8 ym16, m14, 1
+ vextracti32x8 ym19, m21, 1
+ vextracti32x8 ym17, m15, 1
+ vextracti32x8 ym20, m12, 1
+ call .main2
+ jmp .end
+.fast: ; right half is zero
+ mova m0, [o(int16_perm)]
+ mova ym2, [cq+64*4]
+ vinserti32x8 m2, [cq+64*0], 1
+ mova ym3, [cq+64*6]
+ vinserti32x8 m3, [cq+64*2], 1
+ mova ym4, [cq+64*3]
+ vinserti32x8 m4, [cq+64*5], 1
+ mova ym5, [cq+64*7]
+ vinserti32x8 m5, [cq+64*1], 1
+ REPX {vpermb x, m0, x}, m2, m3, m4, m5
+ call m(idct_16x8_internal_8bpc).main2
+ vbroadcasti32x4 m4, [o(int_shuf3)]
+ vbroadcasti32x4 m5, [o(int_shuf4)]
+ pshufb m2, m4 ; e0 f0 e2 f2 e1 f1 e3 f3
+ pshufb m3, m5 ; g0 h0 g2 h2 g1 h1 g3 h3
+ pshufb m0, m4 ; a0 b0 a2 b2 a1 b1 a3 b3
+ pshufb m1, m5 ; c0 d0 c2 d2 c1 d1 c3 d3
+ vpbroadcastd m4, [o(pw_8192)]
+ psrlq m5, [o(permB)], 60
+ punpckldq m6, m2, m3 ; e0 f0 g0 h0 e2 f2 g2 h2
+ punpckhdq m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3
+ punpckldq m2, m0, m1 ; a0 b0 c0 d0 a2 b2 c2 d2
+ punpckhdq m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3
+ REPX {pmulhrsw x, m4}, m6, m17, m2, m16
+ vinserti32x4 ym0, ym2, xm6, 1 ; 0 2
+ vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6
+ vinserti32x4 ym14, ym16, xm17, 1 ; 1 3
+ vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7
+ vpermt2q m2, m5, m6 ; 8 10
+ vpermt2q m16, m5, m17 ; 9 11
+ vextracti32x8 ym3, m2, 1 ; 12 14
+ vextracti32x8 ym17, m16, 1 ; 13 15
+ call m(idct_8x16_internal_8bpc).main_fast
+ call .main_fast
+.end:
+ vpbroadcastd ym8, strided
+ pmulld ym8, [o(gather8d)]
+ call .main_end
+ lea r3, [dstq+strideq*4]
+ kxnorb k1, k1, k1
+ lea r4, [dstq+strideq*8]
+ pxor m9, m9
+ lea r1, [r3+strideq*8]
+ kmovb k2, k1
+ vpgatherdq m12{k1}, [r0+ym8]
+ kmovb k1, k2
+ vpgatherdq m13{k2}, [r3+ym8]
+ kmovb k2, k1
+ vpgatherdq m14{k1}, [r4+ym8]
+ kmovb k1, k2
+ vpgatherdq m15{k2}, [r1+ym8]
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklbw m11, m12, m9
+ punpckhbw m12, m9
+ paddw m0, m11
+ paddw m1, m12
+ packuswb m0, m1
+ kmovb k2, k1
+ vpscatterdq [r0+ym8]{k1}, m0
+ punpcklbw m12, m13, m9
+ punpckhbw m13, m9
+ paddw m2, m12
+ paddw m3, m13
+ packuswb m2, m3
+ kmovb k1, k2
+ vpscatterdq [r3+ym8]{k2}, m2
+ punpcklbw m13, m14, m9
+ punpckhbw m14, m9
+ paddw m4, m13
+ paddw m5, m14
+ packuswb m4, m5
+ kmovb k2, k1
+ vpscatterdq [r4+ym8]{k1}, m4
+ punpcklbw m14, m15, m9
+ punpckhbw m15, m9
+ paddw m6, m14
+ paddw m7, m15
+ packuswb m6, m7
+ vpscatterdq [r1+ym8]{k2}, m6
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 32
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
+INIT_YMM avx512icl
+ALIGN function_align
+cglobal_label .main_fast2 ; bottom three-quarters are zero
+ ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
+ ITX_UNPACK_MULHRSW 21, 20, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+ mova m11, m12
+ mova m17, m20
+ mova m15, m21
+ mova m16, m14
+ jmp .main4
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+ ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
+ ITX_UNPACK_MULHRSW 21, 15, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+ ITX_UNPACK_MULHRSW 20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
+ ITX_UNPACK_MULHRSW 19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
+ jmp .main3
+ALIGN function_align
+cglobal_label .main
+ punpcklwd m12, m21, m14 ; in31 in1
+ punpckhwd m14, m21 ; in3 in29
+ punpcklwd m21, m20, m15 ; in27 in5
+ punpckhwd m15, m20 ; in7 in25
+ punpcklwd m20, m19, m16 ; in23 in9
+ punpckhwd m16, m19 ; in11 in21
+ punpcklwd m19, m18, m17 ; in19 in13
+ punpckhwd m17, m18 ; in15 in17
+.main2:
+ ITX_MUL2X_PACK 12, 8, 9, 10, 201, 4091, 5 ; t16a, t31a
+ ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a
+ ITX_MUL2X_PACK 21, 8, 9, 10, 995, 3973, 5 ; t20a, t27a
+ ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
+ ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
+ ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
+ ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
+ ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
+.main3:
+ psubsw m11, m12, m17 ; t17 t30
+ paddsw m12, m17 ; t16 t31
+ psubsw m17, m15, m20 ; t18 t29
+ paddsw m20, m15 ; t19 t28
+ psubsw m15, m21, m16 ; t21 t26
+ paddsw m21, m16 ; t20 t27
+ psubsw m16, m14, m19 ; t22 t25
+ paddsw m14, m19 ; t23 t24
+.main4:
+ ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a
+ ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a
+ ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a
+ ITX_MUL2X_PACK 16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a
+ vpbroadcastd m8, [o(pw_m3784_1567)]
+ psubsw m19, m12, m20 ; t19a t28a
+ paddsw m20, m12 ; t16a t31a
+ psubsw m12, m14, m21 ; t20a t27a
+ paddsw m14, m21 ; t23a t24a
+ psubsw m21, m11, m17 ; t18 t29
+ paddsw m11, m17 ; t17 t30
+ psubsw m17, m16, m15 ; t21 t26
+ paddsw m16, m15 ; t22 t25
+ ITX_MUL2X_PACK 21, 18, 15, 10, 1567_3784, 8, 20 ; t18a t29a
+ ITX_MUL2X_PACK 19, 18, 15, 10, 1567_3784, 8, 20 ; t19 t28
+ ITX_MUL2X_PACK 12, 18, 15, 10, 8, m1567_m3784, 36 ; t20 t27
+ ITX_MUL2X_PACK 17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a
+ vbroadcasti32x4 m18, [o(deint_shuf)]
+ vpbroadcastd m8, [o(pw_m2896_2896)]
+ vpbroadcastd m9, [o(pw_2896_2896)]
+ psubsw m15, m20, m14 ; t23 t24
+ paddsw m20, m14 ; t16 t31
+ psubsw m14, m11, m16 ; t22a t25a
+ paddsw m11, m16 ; t17a t30a
+ psubsw m16, m21, m17 ; t21 t26
+ paddsw m21, m17 ; t18 t29
+ psubsw m17, m19, m12 ; t20a t27a
+ paddsw m19, m12 ; t19a t28a
+ REPX {pshufb x, m18}, m20, m11, m21, m19
+ ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a
+ ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25
+ packssdw m18, m13 ; t23a t22
+ packssdw m12, m15 ; t24a t25
+ ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a
+ ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27
+ packssdw m16, m13 ; t20 t21a
+ packssdw m14, m15 ; t27 t26a
+ punpcklqdq m13, m19, m21 ; t19a t18
+ punpckhqdq m19, m21 ; t28a t29
+ punpcklqdq m21, m20, m11 ; t16 t17a
+ punpckhqdq m20, m11 ; t31 t30a
+INIT_ZMM avx512icl
+ mova m15, [o(permA)]
+ ret
+cglobal_label .main_end
+ vpbroadcastd m10, [o(pw_2048)]
+ vpermt2q m0, m15, m1 ; t0 t1 t2 t3
+ vpermt2q m20, m15, m19 ; t31 t30a t29 t28a
+ vpermt2q m2, m15, m3 ; t4 t5 t6 t7
+ vpermt2q m14, m15, m12 ; t27 t26a t25 t24a
+ vpermt2q m4, m15, m5 ; t8 t9 t10 t11
+ vpermt2q m18, m15, m16 ; t23a t22 t21a t20
+ vpermt2q m6, m15, m7 ; t12 t13 t14 t15
+ vpermt2q m13, m15, m21 ; t19a t18 t17a t16
+ psubsw m7, m0, m20 ; out31 out30 out29 out28
+ paddsw m0, m20 ; out0 out1 out2 out3
+ psubsw m5, m2, m14 ; out27 out26 out25 out24
+ paddsw m2, m14 ; out4 out5 out6 out7
+ psubsw m3, m4, m18 ; out23 out22 out21 out20
+ paddsw m4, m18 ; out8 out9 out10 out11
+ psubsw m1, m6, m13 ; out19 out18 out17 out16
+ paddsw m6, m13 ; out12 out13 out14 out15
+ vzeroupper
+ ret
+
+%macro LOAD_PACKED_16X2 3 ; dst, row[1-2]
+ vbroadcasti32x4 ym%1, [cq+16*%2]
+ vbroadcasti32x4 ym8, [cq+16*%3]
+ shufpd ym%1, ym8, 0x0c
+%endmacro
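+; LOAD_PACKED_16X2 packs two 8-coefficient rows into one ymm register
+; (interleaved in 64-bit groups via shufpd) so the packed first pass below
+; can operate on row pairs.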
+
+cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
+%undef cmp
+ test eobd, eobd
+ jz .dconly
+ lea r5, [o_base]
+ LOAD_PACKED_16X2 0, 0, 2 ; in0 in2
+ LOAD_PACKED_16X2 1, 4, 6 ; in4 in6
+ LOAD_PACKED_16X2 2, 8, 10 ; in8 in10
+ LOAD_PACKED_16X2 3, 12, 14 ; in12 in14
+ LOAD_PACKED_16X2 14, 1, 3 ; in1 in3
+ LOAD_PACKED_16X2 15, 5, 7 ; in5 in7
+ LOAD_PACKED_16X2 16, 9, 11 ; in9 in11
+ LOAD_PACKED_16X2 17, 13, 15 ; in13 in15
+ pxor m4, m4
+ REPX {mova [cq+64*x], m4}, 0, 1, 2, 3
+ cmp eobd, 107
+ jb .fast
+ LOAD_PACKED_16X2 4, 16, 18 ; in16 in18
+ LOAD_PACKED_16X2 5, 20, 22 ; in20 in22
+ LOAD_PACKED_16X2 6, 24, 26 ; in24 in26
+ LOAD_PACKED_16X2 7, 28, 30 ; in28 in30
+ call m(idct_8x16_internal_8bpc).main
+ LOAD_PACKED_16X2 18, 19, 17 ; in19 in17
+ LOAD_PACKED_16X2 19, 23, 21 ; in23 in21
+ LOAD_PACKED_16X2 20, 27, 25 ; in27 in25
+ LOAD_PACKED_16X2 21, 31, 29 ; in31 in29
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main
+ jmp .pass2
+.fast: ; bottom half is zero
+ mova ym5, ym4
+ mova ym6, ym4
+ mova ym7, ym4
+ call m(idct_8x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
+.pass2:
+ vpbroadcastd m10, [o(pw_8192)]
+ vpermt2q m0, m15, m4 ; t0 t1 t9 t8
+ vpermt2q m20, m15, m18 ; t31 t30a t23a t22
+ vpermt2q m3, m15, m7 ; t7 t6 t14 t15
+ vpermt2q m12, m15, m21 ; t25 t24a t17a t16
+ vpermt2q m2, m15, m6 ; t4 t5 t13 t12
+ vpermt2q m14, m15, m13 ; t23a t22 t21a t20
+ vpermt2q m1, m15, m5 ; t3 t2 t10 t11
+ vpermt2q m19, m15, m16 ; t27 t26a t19a t18
+ psubsw m8, m0, m20 ; out31 out30 out22 out23
+ paddsw m0, m20 ; out0 out1 out9 out8
+ paddsw m6, m3, m12 ; out7 out6 out14 out15
+ psubsw m3, m12 ; out24 out25 out17 out16
+ psubsw m5, m2, m14 ; out27 out26 out18 out19
+ paddsw m4, m2, m14 ; out4 out5 out13 out12
+ psubsw m7, m1, m19 ; out28 out29 out21 out20
+ paddsw m2, m1, m19 ; out3 out2 out10 out11
+ vzeroupper
+ vshufi32x4 m1, m0, m3, q1221 ; out1 out9 out17 out25
+ vshufi32x4 m0, m3, q0330 ; out0 out8 out16 out24
+ vshufi32x4 m3, m2, m5, q0330 ; out3 out11 out19 out27
+ vshufi32x4 m2, m5, q1221 ; out2 out10 out18 out26
+ vshufi32x4 m5, m4, m7, q1221 ; out5 out13 out21 out29
+ vshufi32x4 m4, m7, q0330 ; out4 out12 out20 out28
+ vshufi32x4 m7, m6, m8, q0330 ; out7 out15 out23 out31
+ vshufi32x4 m6, m8, q1221 ; out6 out14 out22 out30
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
+ call .main
+ vpbroadcastd m8, [o(pw_2048)]
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ lea r2, [strideq*3]
+ lea r3, [dstq+strideq*4]
+ movshdup m12, [o(permD)]
+ pmovzxbw m8, [dstq+strideq*0]
+ pmovzxbw m9, [dstq+strideq*1]
+ pmovzxbw m10, [dstq+strideq*2]
+ pmovzxbw m11, [dstq+r2 ]
+ paddw m0, m8
+ paddw m1, m9
+ paddw m2, m10
+ paddw m3, m11
+ pmovzxbw m8, [r3+strideq*0]
+ pmovzxbw m9, [r3+strideq*1]
+ pmovzxbw m10, [r3+strideq*2]
+ pmovzxbw m11, [r3+r2 ]
+ paddw m4, m8
+ paddw m5, m9
+ paddw m6, m10
+ paddw m7, m11
+ packuswb m0, m1
+ packuswb m2, m3
+ vpermq m0, m12, m0
+ vpermq m2, m12, m2
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym2
+ vextracti32x8 [dstq+r2 ], m2, 1
+ packuswb m4, m5
+ packuswb m6, m7
+ vpermq m4, m12, m4
+ vpermq m6, m12, m6
+ mova [r3+strideq*0], ym4
+ vextracti32x8 [r3+strideq*1], m4, 1
+ mova [r3+strideq*2], ym6
+ vextracti32x8 [r3+r2 ], m6, 1
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 8
+.dconly2:
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+.dconly3:
+ imul r6d, 181
+ add r6d, 128+2048
+ sar r6d, 8+4
+ pxor m2, m2
+ vpbroadcastw m3, r6d
+.dconly_loop:
+ mova ym1, [dstq+strideq*0]
+ vinserti32x8 m1, [dstq+strideq*1], 1
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ paddw m0, m3
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m10, [o(pd_2048)]
+.main2:
+ ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a
+ ITX_MULSUB_2W 1, 7, 8, 9, 10, 799, 4017 ; t4a, t7a
+ ITX_MULSUB_2W 2, 6, 8, 9, 10, 1567, 3784 ; t2, t3
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ ITX_MULSUB_2W 0, 4, 8, 9, 10, 11, 12 ; t1, t0
+.main3:
+ paddsw m8, m1, m5 ; t4
+ psubsw m1, m5 ; t5a
+ paddsw m9, m7, m3 ; t7
+ psubsw m7, m3 ; t6a
+ ITX_MULSUB_2W 7, 1, 3, 5, 10, 11, 12 ; t5, t6
+ psubsw m5, m0, m2 ; dct4 out2
+ paddsw m2, m0 ; dct4 out1
+ paddsw m0, m4, m6 ; dct4 out0
+ psubsw m4, m6 ; dct4 out3
+ psubsw m6, m2, m1 ; out6
+ paddsw m1, m2 ; out1
+ paddsw m2, m5, m7 ; out2
+ psubsw m5, m7 ; out5
+ psubsw m7, m0, m9 ; out7
+ paddsw m0, m9 ; out0
+ paddsw m3, m4, m8 ; out3
+ psubsw m4, m8 ; out4
+ ret
+
+cglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c
+ vpbroadcastd m7, [pw_5]
+ paddsw m0, m7, [cq+64*0]
+ paddsw m1, m7, [cq+64*1]
+ vpbroadcastd ym9, strided
+ paddsw m2, m7, [cq+64*2]
+ paddsw m3, m7, [cq+64*3]
+ paddsw m4, m7, [cq+64*4]
+ paddsw m5, m7, [cq+64*5]
+ paddsw m6, m7, [cq+64*6]
+ paddsw m7, [cq+64*7]
+ pmulld ym14, ym9, [pd_0to15]
+ lea r3, [dstq+strideq*1]
+ lea r4, [dstq+strideq*2]
+ kxnorb k1, k1, k1
+ pxor m13, m13
+ add r1, r4 ; dstq+strideq*3
+ kmovb k2, k1
+ vpgatherdq m9{k1}, [r0+ym14*4]
+ kmovb k1, k2
+ vpgatherdq m10{k2}, [r3+ym14*4]
+ kmovb k2, k1
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
+ REPX {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpgatherdq m11{k1}, [r4+ym14*4]
+ kmovb k1, k2
+ vpgatherdq m12{k2}, [r1+ym14*4]
+ REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklbw m8, m9, m13 ; 0 8 16 24
+ punpckhbw m9, m13 ; 4 12 20 28
+ paddw m0, m8
+ paddw m4, m9
+ packuswb m0, m4
+ kmovb k2, k1
+ vpscatterdq [r0+ym14*4]{k1}, m0
+ punpcklbw m8, m10, m13 ; 1 9 17 25
+ punpckhbw m10, m13 ; 5 13 21 29
+ paddw m1, m8
+ paddw m5, m10
+ packuswb m1, m5
+ kmovb k1, k2
+ vpscatterdq [r3+ym14*4]{k2}, m1
+ punpcklbw m8, m11, m13 ; 2 10 18 26
+ punpckhbw m11, m13 ; 6 14 22 30
+ paddw m2, m8
+ paddw m6, m11
+ packuswb m2, m6
+ kmovb k2, k1
+ vpscatterdq [r4+ym14*4]{k1}, m2
+ punpcklbw m8, m12, m13 ; 3 11 19 27
+ punpckhbw m12, m13 ; 7 15 23 31
+ paddw m3, m8
+ paddw m7, m12
+ packuswb m3, m7
+ vpscatterdq [r1+ym14*4]{k2}, m3
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c
+ vpbroadcastd m0, [pw_4096]
+ pmulhrsw m3, m0, [cq+64*0]
+ pmulhrsw m4, m0, [cq+64*4]
+ pmulhrsw m6, m0, [cq+64*1]
+ pmulhrsw m5, m0, [cq+64*5]
+ pmulhrsw m7, m0, [cq+64*2]
+ pmulhrsw m2, m0, [cq+64*6]
+ pmulhrsw m8, m0, [cq+64*3]
+ pmulhrsw m0, [cq+64*7]
+ mova m13, [int8_permA]
+ lea r3, [strideq*3]
+ lea r4, [dstq+strideq*4]
+ punpckldq m1, m3, m4
+ punpckhdq m3, m4
+ punpckldq m4, m6, m5
+ punpckhdq m6, m5
+ punpckldq m5, m7, m2
+ punpckhdq m7, m2
+ punpckldq m2, m8, m0
+ punpckhdq m8, m0
+ mova ym9, [dstq+strideq*0]
+ vinserti32x8 m9, [dstq+strideq*2], 1
+ mova ym10, [dstq+strideq*1]
+ vinserti32x8 m10, [dstq+r3 ], 1
+ mova ym11, [r4+strideq*0]
+ vinserti32x8 m11, [r4+strideq*2], 1
+ mova ym12, [r4+strideq*1]
+ vinserti32x8 m12, [r4+r3 ], 1
+ REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8
+ pxor m13, m13
+ REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklqdq m0, m1, m4 ; a0 a2 c0 c2
+ punpckhqdq m1, m4 ; b0 b2 d0 d2
+ punpcklqdq m4, m5, m2 ; a1 a3 c1 c3
+ punpckhqdq m5, m2 ; b1 b3 d1 d3
+ punpcklqdq m2, m3, m6 ; e0 e2 g0 g2
+ punpckhqdq m3, m6 ; f0 f2 h0 h2
+ punpcklqdq m6, m7, m8 ; e1 e3 g1 g3
+ punpckhqdq m7, m8 ; f1 f3 h1 h3
+ punpcklbw m8, m9, m13
+ punpckhbw m9, m13
+ paddw m0, m8
+ paddw m4, m9
+ packuswb m0, m4
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*2], m0, 1
+ punpcklbw m8, m10, m13
+ punpckhbw m10, m13
+ paddw m1, m8
+ paddw m5, m10
+ packuswb m1, m5
+ mova [dstq+strideq*1], ym1
+ vextracti32x8 [dstq+r3 ], m1, 1
+ punpcklbw m8, m11, m13
+ punpckhbw m11, m13
+ paddw m2, m8
+ paddw m6, m11
+ packuswb m2, m6
+ mova [r4+strideq*0], ym2
+ vextracti32x8 [r4+strideq*2], m2, 1
+ punpcklbw m8, m12, m13
+ punpckhbw m12, m13
+ paddw m3, m8
+ paddw m7, m12
+ packuswb m3, m7
+ mova [r4+strideq*1], ym3
+ vextracti32x8 [r4+r3 ], m3, 1
+ RET
+
+%macro IDCT_16x32_END 3 ; src[1-2], row
+ mova xm8, [dstq+strideq*0]
+ vinserti32x4 ym8, [dstq+strideq*1], 1
+ mova xm9, [dstq+r3 ]
+ vinserti32x4 ym9, [dstq+strideq*2], 1
+ pmulhrsw m%1, m10
+ pmulhrsw m%2, m10
+ vpermb m8, m11, m8
+ vpermb m9, m11, m9
+ mova [cq+64*(%3*2+0)], m13
+ mova [cq+64*(%3*2+1)], m13
+ paddw m8, m%1
+ paddw m9, m%2
+ packuswb m8, m9
+ vpermd m8, m12, m8
+ mova [dstq+strideq*0], xm8
+ vextracti32x4 [dstq+strideq*1], ym8, 1
+ vextracti32x4 [dstq+strideq*2], m8, 2
+ vextracti32x4 [dstq+r3 ], m8, 3
+%if %1 != 20
+ lea dstq, [dstq+strideq*4]
+%endif
+%endmacro
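+; IDCT_16x32_END stores four output rows: it loads four destination lines,
+; uses vpermb to reorder (and effectively zero-extend) the dst bytes, adds the
+; pw_2048-scaled coefficients, packs with unsigned saturation, writes the four
+; lines back and clears the corresponding coefficient rows in cq.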
+
+cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m15, [o(pw_2896x8)]
+ cmp eobd, 151
+ jb .fast
+ pmulhrsw m5, m15, [cq+64*10]
+ pmulhrsw m3, m15, [cq+64* 6]
+ pmulhrsw m1, m15, [cq+64* 2]
+ pmulhrsw m7, m15, [cq+64*14]
+ pmulhrsw m2, m15, [cq+64* 4]
+ pmulhrsw m6, m15, [cq+64*12]
+ pmulhrsw m0, m15, [cq+64* 0]
+ pmulhrsw m4, m15, [cq+64* 8]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ pmulhrsw m14, m15, [cq+64* 1]
+ pmulhrsw m21, m15, [cq+64*15]
+ pmulhrsw m18, m15, [cq+64* 9]
+ pmulhrsw m17, m15, [cq+64* 7]
+ pmulhrsw m16, m15, [cq+64* 5]
+ pmulhrsw m19, m15, [cq+64*11]
+ pmulhrsw m20, m15, [cq+64*13]
+ pmulhrsw m15, [cq+64* 3]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova m8, [o(idct_16x32p)]
+ vpbroadcastd m9, [o(pw_16384)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m14, m15, m16, m17, m18, m19, m20, m21
+ punpckldq m8, m0, m1
+ punpckhdq m0, m1
+ punpckldq m1, m2, m3
+ punpckhdq m2, m3
+ REPX {pmulhrsw x, m9}, m8, m0, m1, m2
+ punpckldq m3, m4, m5
+ punpckhdq m4, m5
+ punpckldq m5, m6, m7
+ punpckhdq m6, m7
+ REPX {pmulhrsw x, m9}, m3, m4, m5, m6
+ punpckldq m7, m14, m15
+ punpckhdq m14, m15
+ punpckldq m15, m16, m17
+ punpckhdq m16, m17
+ REPX {pmulhrsw x, m9}, m7, m14, m15, m16
+ punpckldq m17, m18, m19
+ punpckhdq m18, m19
+ punpckldq m19, m20, m21
+ punpckhdq m20, m21
+ REPX {pmulhrsw x, m9}, m17, m18, m19, m20
+ punpcklqdq m21, m8, m1
+ punpckhqdq m8, m1
+ punpcklqdq m1, m0, m2
+ punpckhqdq m0, m2
+ punpcklqdq m2, m3, m5
+ punpckhqdq m3, m5
+ punpcklqdq m5, m4, m6
+ punpckhqdq m4, m6
+ punpcklqdq m6, m7, m15
+ punpckhqdq m7, m15
+ punpcklqdq m15, m14, m16
+ punpckhqdq m14, m16
+ punpcklqdq m16, m17, m19
+ punpckhqdq m17, m19
+ punpcklqdq m19, m18, m20
+ punpckhqdq m18, m20
+ vinserti32x8 m20, m21, ym2, 1
+ vshufi32x4 m21, m2, q3232
+ vinserti32x8 m2, m8, ym3, 1
+ vshufi32x4 m8, m3, q3232
+ vinserti32x8 m3, m1, ym5, 1
+ vshufi32x4 m1, m5, q3232
+ vinserti32x8 m5, m0, ym4, 1
+ vshufi32x4 m0, m4, q3232
+ vinserti32x8 m4, m6, ym16, 1
+ vshufi32x4 m6, m16, q3232
+ vinserti32x8 m16, m7, ym17, 1
+ vshufi32x4 m7, m17, q3232
+ vinserti32x8 m17, m15, ym19, 1
+ vshufi32x4 m15, m19, q3232
+ vinserti32x8 m19, m14, ym18, 1
+ vshufi32x4 m14, m18, q3232
+ vshufi32x4 m18, m21, m6, q3131 ; 27 5
+ vshufi32x4 m21, m6, q2020 ; 31 1
+ vshufi32x4 m6, m8, m7, q2020 ; 24 8
+ vshufi32x4 m8, m7, q3131 ; 30 2
+ vshufi32x4 m7, m1, m15, q2020 ; 28 4
+ vshufi32x4 m1, m15, q3131 ; 6 26
+ vshufi32x4 m15, m0, m14, q2020 ; 7 25
+ vshufi32x4 m0, m14, q3131 ; 14 18
+ vshufi32x4 m14, m20, m4, q2020 ; 3 29
+ vshufi32x4 m20, m4, q3131 ; 23 9
+ vshufi32x4 m9, m3, m17, q2020 ; 16 0
+ vshufi32x4 m3, m17, q3131 ; 12 20
+ vshufi32x4 m17, m5, m19, q2020 ; 15 17
+ vshufi32x4 m5, m19, q3131 ; 22 10
+ vshufi32x4 m19, m2, m16, q2020 ; 19 13
+ vshufi32x4 m16, m2, m16, q3131 ; 11 21
+ call m(idct_16x16_internal_8bpc).main3
+ call .main_oddhalf
+ jmp .pass2
+.fast: ; right half is zero
+ mova ym8, [cq+64*15]
+ vinserti32x8 m8, [cq+64* 1], 1
+ mova m2, [o(int16_perm)]
+ mova ym9, [cq+64* 8]
+ vinserti32x8 m9, [cq+64* 0], 1
+ mova ym0, [cq+64* 7]
+ vinserti32x8 m0, [cq+64* 9], 1
+ mova ym7, [cq+64*14]
+ vinserti32x8 m7, [cq+64* 2], 1
+ mova ym1, [cq+64* 3]
+ vinserti32x8 m1, [cq+64*13], 1
+ mova ym3, [cq+64* 6]
+ vinserti32x8 m3, [cq+64*10], 1
+ mova ym5, [cq+64*11]
+ vinserti32x8 m5, [cq+64* 5], 1
+ mova ym6, [cq+64*12]
+ vinserti32x8 m6, [cq+64* 4], 1
+ REPX {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6
+ REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
+ call m(idct_16x16_internal_8bpc).main2
+ vbroadcasti32x4 m8, [o(int_shuf3)]
+ vbroadcasti32x4 m9, [o(int_shuf4)]
+ vpbroadcastd m11, [o(pw_16384)]
+ pshufb m0, m8
+ pshufb m1, m9
+ pshufb m2, m8
+ pshufb m3, m9
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ pshufb m4, m8
+ pshufb m5, m9
+ pshufb m6, m8
+ pshufb m7, m9
+ REPX {pmulhrsw x, m11}, m4, m5, m6, m7
+ punpckhdq m17, m0, m1
+ punpckldq m0, m1
+ punpckhdq m16, m2, m3
+ punpckldq m2, m3
+ punpckhdq m18, m4, m5
+ punpckldq m4, m5
+ punpckhdq m5, m6, m7
+ punpckldq m6, m7
+ vinserti32x8 m1, m0, ym2, 1
+ vshufi32x4 m3, m0, m2, q3232
+ vinserti32x8 m2, m4, ym6, 1
+ vshufi32x4 m4, m6, q3232
+ vinserti32x8 m15, m17, ym16, 1
+ vshufi32x4 m17, m16, q3232
+ vinserti32x8 m16, m18, ym5, 1
+ vshufi32x4 m18, m5, q3232
+ vshufi32x4 m0, m1, m2, q2020 ; 0 2
+ vshufi32x4 m1, m2, q3131 ; 4 6
+ vshufi32x4 m2, m3, m4, q2020 ; 8 10
+ vshufi32x4 m3, m4, q3131 ; 12 14
+ vshufi32x4 m14, m15, m16, q2020 ; 1 3
+ vshufi32x4 m15, m16, q3131 ; 5 7
+ vshufi32x4 m16, m17, m18, q2020 ; 9 11
+ vshufi32x4 m17, m18, q3131 ; 13 15
+ pxor m6, m6
+ punpckhwd m8, m0, m0
+ punpcklwd m9, m6, m0
+ punpckhwd m0, m3, m3
+ punpckhwd m5, m2, m2
+ punpcklwd m7, m1, m1
+ punpckhwd m1, m1
+ punpcklwd m3, m3
+ punpcklwd m6, m2
+ call m(idct_16x16_internal_8bpc).main_fast5
+ punpcklwd m21, m14, m14
+ punpckhwd m14, m14
+ punpcklwd m18, m15, m15
+ punpckhwd m15, m15
+ punpcklwd m20, m16, m16
+ punpckhwd m16, m16
+ punpcklwd m19, m17, m17
+ punpckhwd m17, m17
+ call .main_oddhalf_fast
+.pass2:
+ vpbroadcastd m10, [o(pw_2048)]
+ mova m11, [o(end_16x32p)]
+ lea r3, [strideq*3]
+ pxor m13, m13
+ psrld m12, m11, 8
+ IDCT_16x32_END 0, 1, 0
+ IDCT_16x32_END 2, 3, 1
+ IDCT_16x32_END 4, 5, 2
+ IDCT_16x32_END 6, 7, 3
+ IDCT_16x32_END 14, 15, 4
+ IDCT_16x32_END 16, 17, 5
+ IDCT_16x32_END 18, 19, 6
+ IDCT_16x32_END 20, 21, 7
+ RET
+ALIGN function_align
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly
+ALIGN function_align
+cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
+ vpbroadcastd m8, [o(pw_201_4091x8)]
+ vpbroadcastd m20, [o(pw_m1380_3857x8)]
+ vpbroadcastd m9, [o(pw_995_3973x8)]
+ vpbroadcastd m16, [o(pw_m601_4052x8)]
+ pmulhrsw m21, m8 ; t16a, t31a
+ pmulhrsw m20, m15 ; t19a, t28a
+ pmulhrsw m18, m9 ; t20a, t27a
+ pmulhrsw m14, m16 ; t23a, t24a
+ mova m8, m21
+ mova m17, m20
+ mova m15, m18
+ mova m16, m14
+ jmp .main3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; bottom half is zero
+ vpbroadcastd m8, [o(pw_201_4091x8)]
+ vpbroadcastd m9, [o(pw_m2751_3035x8)]
+ vpbroadcastd m11, [o(pw_1751_3703x8)]
+ vpbroadcastd m12, [o(pw_m1380_3857x8)]
+ pmulhrsw m21, m8 ; t16a, t31a
+ vpbroadcastd m8, [o(pw_995_3973x8)]
+ pmulhrsw m17, m9 ; t17a, t30a
+ vpbroadcastd m9, [o(pw_m2106_3513x8)]
+ pmulhrsw m20, m11 ; t18a, t29a
+ vpbroadcastd m11, [o(pw_2440_3290x8)]
+ pmulhrsw m15, m12 ; t19a, t28a
+ vpbroadcastd m12, [o(pw_m601_4052x8)]
+ pmulhrsw m18, m8 ; t20a, t27a
+ pmulhrsw m16, m9 ; t21a, t26a
+ pmulhrsw m19, m11 ; t22a, t25a
+ pmulhrsw m14, m12 ; t23a, t24a
+ jmp .main2
+ALIGN function_align
+cglobal_label .main_oddhalf
+ ITX_MUL2X_PACK 21, 8, 9, 10, 201, 4091, 5 ; t16a, t31a
+ ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
+ ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
+ ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
+ ITX_MUL2X_PACK 18, 8, 9, 10, 995, 3973, 5 ; t20a, t27a
+ ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
+ ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
+ ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a
+.main2:
+ psubsw m8, m21, m17 ; t17 t30
+ paddsw m21, m17 ; t16 t31
+ psubsw m17, m15, m20 ; t18 t29
+ paddsw m20, m15 ; t19 t28
+ psubsw m15, m18, m16 ; t21 t26
+ paddsw m18, m16 ; t20 t27
+ psubsw m16, m14, m19 ; t22 t25
+ paddsw m14, m19 ; t23 t24
+.main3:
+ ITX_MUL2X_PACK 8, 9, 19, 10, 799, 4017, 5 ; t17a t30a
+ ITX_MUL2X_PACK 17, 9, 19, 10, m4017, 799, 5 ; t18a t29a
+ ITX_MUL2X_PACK 15, 9, 19, 10, 3406, 2276, 5 ; t21a t26a
+ ITX_MUL2X_PACK 16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a
+ vpbroadcastd m11, [o(pw_m3784_1567)]
+ psubsw m19, m21, m20 ; t19a t28a
+ paddsw m21, m20 ; t16a t31a
+ psubsw m20, m14, m18 ; t20a t27a
+ paddsw m14, m18 ; t23a t24a
+ psubsw m18, m8, m17 ; t18 t29
+ paddsw m8, m17 ; t17 t30
+ psubsw m17, m16, m15 ; t21 t26
+ paddsw m15, m16 ; t22 t25
+ ITX_MUL2X_PACK 18, 9, 16, 10, 1567_3784, 11, 20 ; t18a t29a
+ ITX_MUL2X_PACK 19, 9, 16, 10, 1567_3784, 11, 20 ; t19 t28
+ ITX_MUL2X_PACK 20, 9, 16, 10, 11, m1567_m3784, 36 ; t20 t27
+ ITX_MUL2X_PACK 17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a
+ vbroadcasti32x4 m9, [o(deint_shuf)]
+ psubsw m16, m21, m14 ; t23 t24
+ paddsw m14, m21 ; t16 t31
+ psubsw m21, m8, m15 ; t22a t25a
+ paddsw m15, m8 ; t17a t30a
+ psubsw m8, m18, m17 ; t21 t26
+ paddsw m18, m17 ; t18 t29
+ paddsw m17, m19, m20 ; t19a t28a
+ psubsw m19, m20 ; t20a t27a
+ vpbroadcastd m11, [o(pw_m2896_2896)]
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ REPX {pshufb x, m9}, m14, m15, m18, m17
+ mova m9, m10
+ vpdpwssd m9, m16, m11
+ mova m20, m10
+ vpdpwssd m20, m21, m11
+ psrad m9, 12
+ psrad m20, 12
+ packssdw m9, m20 ; t23a t22
+ mova m20, m10
+ vpdpwssd m20, m16, m12
+ mova m16, m10
+ vpdpwssd m16, m21, m12
+ psrad m20, 12
+ psrad m16, 12
+ packssdw m16, m20, m16 ; t24a t25
+ ITX_MUL2X_PACK 8, 21, 20, 10, 11, 12, 8 ; t21a t26a
+ ITX_MUL2X_PACK 19, 8, 11, 10, 11, 12, 8 ; t20 t27
+ packssdw m11, m20 ; t27 t26a
+ packssdw m8, m21 ; t20 t21a
+ punpcklqdq m20, m14, m15 ; t16 t17a
+ punpckhqdq m14, m15 ; t31 t30a
+ punpckhqdq m15, m17, m18 ; t28a t29
+ punpcklqdq m17, m18 ; t19a t18
+ psubsw m21, m0, m14 ; out31 out30
+ paddsw m0, m14 ; out0 out1
+ psubsw m14, m7, m20 ; out16 out17
+ paddsw m7, m20 ; out15 out14
+ psubsw m20, m1, m15 ; out28 out29
+ paddsw m1, m15 ; out3 out2
+ psubsw m15, m6, m17 ; out19 out18
+ paddsw m6, m17 ; out12 out13
+ psubsw m17, m4, m9 ; out23 out22
+ paddsw m4, m9 ; out8 out9
+ psubsw m18, m3, m16 ; out24 out25
+ paddsw m3, m16 ; out7 out6
+ psubsw m16, m5, m8 ; out20 out21
+ paddsw m5, m8 ; out11 out10
+ psubsw m19, m2, m11 ; out27 out26
+ paddsw m2, m11 ; out4 out5
+ ret
+
+cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ mova m21, [o(permB)]
+ vpermq m1, m21, [cq+64* 0] ; 0 1
+ vpermq m14, m21, [cq+64* 1] ; 2 3
+ vpermq m20, m21, [cq+64* 2] ; 4 5
+ vpermq m15, m21, [cq+64* 3] ; 6 7
+ vpbroadcastd m8, [o(pw_2896x8)]
+ vpermq m2, m21, [cq+64* 4] ; 8 9
+ vpermq m16, m21, [cq+64* 5] ; 10 11
+ vpermq m3, m21, [cq+64* 6] ; 12 13
+ vpermq m17, m21, [cq+64* 7] ; 14 15
+ REPX {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17
+ pxor m12, m12
+ REPX {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7
+ cmp eobd, 151
+ jb .fast
+ vpermq m9, m21, [cq+64* 8] ; 16 17
+ vpermq m19, m21, [cq+64* 9] ; 18 19
+ vpermq m4, m21, [cq+64*10] ; 20 21
+ vpermq m5, m21, [cq+64*11] ; 22 23
+ vpermq m6, m21, [cq+64*12] ; 24 25
+ vpermq m18, m21, [cq+64*13] ; 26 27
+ vpermq m7, m21, [cq+64*14] ; 28 29
+ vpermq m21, m21, [cq+64*15] ; 30 31
+ REPX {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21
+ REPX {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15
+ punpcklwd m8, m21, m14 ; 30 2
+ punpckhwd m21, m1 ; 31 1
+ punpcklwd m0, m17, m19 ; 14 18
+ punpckhwd m17, m9 ; 15 17
+ punpcklwd m9, m1 ; 16 0
+ punpckhwd m14, m7 ; 3 29
+ punpcklwd m1, m15, m18 ; 6 26
+ punpckhwd m15, m6 ; 7 25
+ punpcklwd m6, m2 ; 24 8
+ punpckhwd m19, m3 ; 19 13
+ punpcklwd m3, m4 ; 12 20
+ punpckhwd m18, m20 ; 27 5
+ punpcklwd m7, m20 ; 28 4
+ punpckhwd m20, m5, m2 ; 23 9
+ punpcklwd m5, m16 ; 22 10
+ punpckhwd m16, m4 ; 11 21
+ call m(idct_16x16_internal_8bpc).main2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ jmp .pass2
+.fast: ; bottom half zero
+ punpcklwd m8, m14, m14 ; 2
+ punpcklwd m0, m17, m17 ; 14
+ punpcklwd m5, m16, m16 ; 10
+ punpcklwd m9, m12, m1 ; __ 0
+ punpckhwd m21, m1, m1 ; 1
+ punpcklwd m1, m15, m15 ; 6
+ punpcklwd m7, m20, m20 ; 4
+ punpckhwd m19, m3, m3 ; 13
+ punpcklwd m3, m3 ; 12
+ punpcklwd m6, m12, m2 ; __ 8
+ punpckhwd m18, m20, m20 ; 5
+ punpckhwd m20, m2, m2 ; 9
+ call m(idct_16x16_internal_8bpc).main_fast
+ punpckhwd m15, m15 ; 7
+ punpckhwd m14, m14 ; 3
+ punpckhwd m16, m16 ; 11
+ punpckhwd m17, m17 ; 15
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+.pass2:
+ vpbroadcastd m9, [o(pw_16384)]
+ call .transpose_round
+ vshufi32x4 m16, m14, m2, q3131 ; 5
+ vshufi32x4 m14, m2, q2020 ; 1
+ vshufi32x4 m2, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m1, m18, q3131 ; 6
+ vshufi32x4 m1, m18, q2020 ; 2
+ vshufi32x4 m18, m20, m6, q2020 ; 9
+ vshufi32x4 m20, m6, q3131 ; 13
+ vshufi32x4 m6, m21, m4, q3131 ; 12
+ vshufi32x4 m4, m21, m4, q2020 ; 8
+ vshufi32x4 m21, m19, m7, q3131 ; 15
+ vshufi32x4 m19, m7, q2020 ; 11
+ vshufi32x4 m7, m5, m15, q3131 ; 14
+ vshufi32x4 m5, m15, q2020 ; 10
+ vshufi32x4 m15, m17, m9, q2020 ; 3
+ vshufi32x4 m17, m9, q3131 ; 7
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
+ call .main_oddhalf
+ vpbroadcastd m12, [o(pw_2048)]
+ movshdup m13, [o(permD)]
+ lea r2, [strideq*3]
+ pmovzxbw m8, [dstq+strideq*0]
+ pmovzxbw m9, [dstq+strideq*1]
+ pmovzxbw m10, [dstq+strideq*2]
+ pmovzxbw m11, [dstq+r2 ]
+ REPX {pmulhrsw x, m12}, m0, m1, m2, m3
+ lea r3, [dstq+strideq*4]
+ paddw m0, m8
+ paddw m1, m9
+ paddw m2, m10
+ paddw m3, m11
+ pmovzxbw m8, [r3+strideq*0]
+ pmovzxbw m9, [r3+strideq*1]
+ pmovzxbw m10, [r3+strideq*2]
+ pmovzxbw m11, [r3+r2 ]
+ REPX {pmulhrsw x, m12}, m4, m5, m6, m7
+ lea r4, [dstq+strideq*8]
+ packuswb m0, m1
+ paddw m4, m8
+ paddw m5, m9
+ packuswb m2, m3
+ paddw m6, m10
+ paddw m7, m11
+ pmovzxbw m8, [r4+strideq*0]
+ pmovzxbw m9, [r4+strideq*1]
+ pmovzxbw m10, [r4+strideq*2]
+ pmovzxbw m11, [r4+r2 ]
+ REPX {pmulhrsw x, m12}, m14, m15, m16, m17
+ lea r5, [r3+strideq*8]
+ packuswb m4, m5
+ paddw m14, m8
+ paddw m15, m9
+ packuswb m6, m7
+ paddw m16, m10
+ paddw m17, m11
+ pmovzxbw m8, [r5+strideq*0]
+ pmovzxbw m9, [r5+strideq*1]
+ pmovzxbw m10, [r5+strideq*2]
+ pmovzxbw m11, [r5+r2 ]
+ REPX {pmulhrsw x, m12}, m18, m19, m20, m21
+ packuswb m14, m15
+ paddw m18, m8
+ paddw m19, m9
+ packuswb m16, m17
+ paddw m20, m10
+ paddw m21, m11
+ packuswb m18, m19
+ packuswb m20, m21
+ REPX {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym2
+ vextracti32x8 [dstq+r2 ], m2, 1
+ mova [r3+strideq*0], ym4
+ vextracti32x8 [r3+strideq*1], m4, 1
+ mova [r3+strideq*2], ym6
+ vextracti32x8 [r3+r2 ], m6, 1
+ mova [r4+strideq*0], ym14
+ vextracti32x8 [r4+strideq*1], m14, 1
+ mova [r4+strideq*2], ym16
+ vextracti32x8 [r4+r2 ], m16, 1
+ mova [r5+strideq*0], ym18
+ vextracti32x8 [r5+strideq*1], m18, 1
+ mova [r5+strideq*2], ym20
+ vextracti32x8 [r5+r2 ], m20, 1
+ RET
+ALIGN function_align
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 16
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast3 ; bottom seven-eighths are zero
+ vpbroadcastd m8, [o(pw_2896x8)]
+ vpbroadcastd m4, [o(pw_4076x8)]
+ vpbroadcastd m3, [o(pw_401x8)]
+ pmulhrsw m8, m0 ; t0
+ pmulhrsw m4, m14 ; t15a
+ pmulhrsw m3, m14 ; t8a
+ punpcklwd m9, m3, m4
+ punpckhwd m5, m3, m4
+ mova m2, m10
+ vpdpwssd m2, m9, [o(pw_m3784_1567)] {bcstd}
+ mova m1, m10
+ vpdpwssd m1, m5, [o(pw_m3784_1567)] {bcstd}
+ mova m6, m10
+ vpdpwssd m6, m5, [o(pw_1567_3784)] {bcstd}
+ mova m5, m10
+ vpdpwssd m5, m9, [o(pw_1567_3784)] {bcstd}
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ psubsw m21, m8, m4 ; out15
+ paddsw m0, m8, m4 ; out0
+ psubsw m14, m8, m3 ; out8
+ paddsw m7, m8, m3 ; out7
+ REPX {psrad x, 12}, m2, m1, m6, m5
+ packssdw m2, m1 ; t9a
+ packssdw m5, m6 ; t14a
+ ITX_MULSUB_2W 4, 3, 16, 17, 10, 11, 12 ; t11, t12
+ psubsw m20, m8, m5 ; out14
+ paddsw m1, m8, m5 ; out1
+ psubsw m15, m8, m2 ; out9
+ paddsw m6, m8, m2 ; out6
+ ITX_MULSUB_2W 5, 2, 16, 17, 10, 11, 12 ; t10a, t13a
+ psubsw m18, m8, m3 ; out12
+ paddsw m3, m8 ; out3
+ psubsw m17, m8, m4 ; out11
+ paddsw m4, m8 ; out4
+ psubsw m19, m8, m2 ; out13
+ paddsw m2, m8 ; out2
+ psubsw m16, m8, m5 ; out10
+ paddsw m5, m8 ; out5
+ ret
+cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
+ vpbroadcastd m9, [o(pw_2896x8)]
+ vpbroadcastd m2, [o(pw_4017x8)]
+ vpbroadcastd m3, [o(pw_799x8)]
+ vpbroadcastd m18, [o(pw_4076x8)]
+ vpbroadcastd m19, [o(pw_401x8)]
+ vpbroadcastd m20, [o(pw_m1189x8)]
+ vpbroadcastd m16, [o(pw_3920x8)]
+ pmulhrsw m9, m0 ; t0
+ pmulhrsw m2, m1 ; t7a
+ pmulhrsw m1, m3 ; t4a
+ pmulhrsw m18, m14 ; t15a
+ pmulhrsw m14, m19 ; t8a
+ pmulhrsw m20, m15 ; t11a
+ pmulhrsw m15, m16 ; t12a
+ psubsw m7, m9, m2 ; idct8 out7
+ paddsw m0, m9, m2 ; idct8 out0
+ psubsw m4, m9, m1 ; idct8 out4
+ paddsw m3, m9, m1 ; idct8 out3
+ ITX_MULSUB_2W 2, 1, 5, 6, 10, 2896, 2896 ; t5, t6
+ mova m21, m18
+ mova m19, m14
+ mova m16, m15
+ mova m8, m20
+ psubsw m6, m9, m1 ; idct8 out6
+ paddsw m1, m9 ; idct8 out1
+ psubsw m5, m9, m2 ; idct8 out5
+ paddsw m2, m9 ; idct8 out2
+ jmp .main3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; bottom half is zero
+ vpbroadcastd m5, [o(pw_m2276x8)]
+ vpbroadcastd m11, [o(pw_3406x8)]
+ vpbroadcastd m7, [o(pw_4017x8)]
+ vpbroadcastd m12, [o(pw_799x8)]
+ vpbroadcastd m6, [o(pw_3784x8)]
+ vpbroadcastd m10, [o(pw_1567x8)]
+ vpbroadcastd m4, [o(pw_2896x8)]
+ pmulhrsw m5, m3 ; t5a
+ pmulhrsw m3, m11 ; t6a
+ pmulhrsw m7, m1 ; t7a
+ pmulhrsw m1, m12 ; t4a
+ pmulhrsw m6, m2 ; t3
+ pmulhrsw m2, m10 ; t2
+ pmulhrsw m4, m0 ; t0
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ vpbroadcastd m10, [o(pd_2048)]
+ mova m0, m4 ; t1
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main3
+ vpbroadcastd m21, [o(pw_4076x8)]
+ vpbroadcastd m8, [o(pw_401x8)]
+ vpbroadcastd m18, [o(pw_m2598x8)]
+ vpbroadcastd m9, [o(pw_3166x8)]
+ vpbroadcastd m19, [o(pw_3612x8)]
+ vpbroadcastd m11, [o(pw_1931x8)]
+ vpbroadcastd m20, [o(pw_m1189x8)]
+ vpbroadcastd m12, [o(pw_3920x8)]
+ pmulhrsw m21, m14 ; t15a
+ pmulhrsw m14, m8 ; t8a
+ pmulhrsw m18, m17 ; t9a
+ pmulhrsw m17, m9 ; t14a
+ pmulhrsw m19, m16 ; t13a
+ pmulhrsw m16, m11 ; t10a
+ pmulhrsw m20, m15 ; t11a
+ pmulhrsw m15, m12 ; t12a
+ jmp .main2
+ALIGN function_align
+cglobal_label .main_oddhalf
+ ITX_MULSUB_2W 14, 21, 8, 9, 10, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2W 18, 17, 8, 9, 10, 3166, 2598 ; t9a, t14a
+ ITX_MULSUB_2W 16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2W 20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a
+.main2:
+ paddsw m8, m20, m16 ; t11
+ psubsw m20, m16 ; t10
+ paddsw m16, m15, m19 ; t12
+ psubsw m15, m19 ; t13
+ psubsw m19, m14, m18 ; t9
+ paddsw m14, m18 ; t8
+ psubsw m18, m21, m17 ; t14
+ paddsw m21, m17 ; t15
+.main3:
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12 ; t9a, t14a
+ vpbroadcastd m11, [o(pw_m1567_m3784)]
+ ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11 ; t10a, t13a
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ psubsw m17, m14, m8 ; t11a
+ paddsw m8, m14 ; t8a
+ paddsw m14, m18, m15 ; t9
+ psubsw m18, m15 ; t10
+ psubsw m15, m19, m20 ; t13
+ paddsw m19, m20 ; t14
+ paddsw m20, m21, m16 ; t15a
+ psubsw m16, m21, m16 ; t12a
+ ITX_MULSUB_2W 15, 18, 9, 21, 10, 11, 12 ; t10a, t13a
+ ITX_MULSUB_2W 16, 17, 9, 21, 10, 11, 12 ; t11, t12
+ psubsw m21, m0, m20 ; out15
+ paddsw m0, m20 ; out0
+ psubsw m20, m1, m19 ; out14
+ paddsw m1, m19 ; out1
+ psubsw m19, m2, m18 ; out13
+ paddsw m2, m18 ; out2
+ psubsw m18, m3, m17 ; out12
+ paddsw m3, m17 ; out3
+ psubsw m17, m4, m16 ; out11
+ paddsw m4, m16 ; out4
+ psubsw m16, m5, m15 ; out10
+ paddsw m5, m15 ; out5
+ psubsw m15, m6, m14 ; out9
+ paddsw m6, m14 ; out6
+ psubsw m14, m7, m8 ; out8
+ paddsw m7, m8 ; out7
+ ret
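+; .transpose_round: interleaves the 16 row registers into the a/b/c/d column
+; groupings annotated below, rounding each lane by the constant the caller
+; preloads into m9.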
+.transpose_round:
+ punpcklwd m8, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m1, m3
+ punpckhwd m1, m3
+ punpcklwd m3, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m5, m7
+ punpckhwd m5, m7
+ punpcklwd m7, m14, m16
+ punpckhwd m14, m16
+ punpcklwd m16, m15, m17
+ punpckhwd m15, m17
+ punpcklwd m17, m19, m21
+ punpckhwd m19, m21
+ punpckhwd m21, m18, m20
+ punpcklwd m18, m20
+ punpcklwd m20, m8, m1
+ punpckhwd m8, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ REPX {pmulhrsw x, m9}, m20, m8, m1, m0
+ punpcklwd m6, m7, m15
+ punpckhwd m7, m15
+ punpcklwd m15, m14, m16
+ punpckhwd m14, m16
+ REPX {pmulhrsw x, m9}, m2, m3, m5, m4
+ punpckhwd m16, m18, m19
+ punpcklwd m18, m19
+ punpcklwd m19, m21, m17
+ punpckhwd m21, m17
+ REPX {pmulhrsw x, m9}, m6, m7, m15, m14
+ punpcklwd m17, m8, m0 ; a2 a6 aa ae
+ punpckhwd m8, m0 ; a3 a7 ab af
+ punpcklwd m0, m20, m1 ; a0 a4 a8 ac
+ punpckhwd m20, m1 ; a1 a5 a9 ad
+ REPX {pmulhrsw x, m9}, m16, m18, m19, m21
+ punpcklwd m1, m2, m5 ; b0 b4 b8 bc
+ punpckhwd m2, m5 ; b1 b5 b9 bd
+ punpcklwd m5, m3, m4 ; b2 b6 ba be
+ punpckhwd m3, m4 ; b3 b7 bb bf
+ punpcklwd m4, m6, m15 ; c0 c4 c8 cc
+ punpckhwd m6, m15 ; c1 c5 c9 cd
+ punpcklwd m15, m7, m14 ; c2 c6 ca ce
+ punpckhwd m7, m14 ; c3 c7 cb cf
+ punpcklwd m14, m18, m19 ; d0 d4 d8 dc
+ punpckhwd m18, m19 ; d1 d5 d9 dd
+ punpcklwd m9, m16, m21 ; d2 d6 da de
+ punpckhwd m16, m21 ; d3 d7 db df
+ vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc
+ vinserti32x8 m0, ym1, 1 ; a0 a4 b0 b4
+ vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6
+ vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be
+ vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7
+ vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf
+ vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4
+ vshufi32x4 m4, m14, q3232 ; c8 cc d8 dc
+ vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5
+ vshufi32x4 m20, m2, q3232 ; a9 ad b9 bd
+ vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5
+ vshufi32x4 m6, m18, q3232 ; c9 cd d9 dd
+ vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6
+ vshufi32x4 m15, m9, q3232 ; ca ce da de
+ vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7
+ vshufi32x4 m7, m16, q3232 ; cb cf db df
+ ret
+
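+; IDTX_16x32: identity pass over four coefficient rows at a time. Each row is
+; scaled by pw_2896x8 (m15), then a copy scaled by pw_1697x16 (m16) and halved
+; via pw_16384 (m17) is added back, leaving the results in m%1-m%4.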
+%macro IDTX_16x32 4 ; src/dst[1-4]
+ pmulhrsw m%1, m15, [cq+64*%1]
+ pmulhrsw m%2, m15, [cq+64*%2]
+ pmulhrsw m%3, m15, [cq+64*%3]
+ pmulhrsw m%4, m15, [cq+64*%4]
+ pmulhrsw m18, m16, m%1
+ pmulhrsw m19, m16, m%2
+ pmulhrsw m20, m16, m%3
+ pmulhrsw m21, m16, m%4
+ REPX {pmulhrsw x, m17}, m18, m19, m20, m21
+ paddsw m%1, m18
+ paddsw m%2, m19
+ paddsw m%3, m20
+ paddsw m%4, m21
+%endmacro
+
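+; IDTX_16x32_STORE: loads four 16-byte destination rows spaced strideq*8 apart,
+; clears the matching coefficient rows in cq (m18 is zero here), widens the
+; pixels to words, adds m%1/m%2, packs and writes the rows back, then advances
+; dstq by one stride except after the final pair.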
+%macro IDTX_16x32_STORE 2 ; src[1-2]
+ mova xm17, [dstq+r3*0]
+ vinserti128 ym17, [dstq+r3*4], 1
+ vinserti32x4 m17, [dstq+r3*8], 2
+ vinserti32x4 m17, [dstq+r4*8], 3
+ mova [cq+64*(%1*2+0)], m18
+ mova [cq+64*(%1*2+1)], m18
+ punpcklbw m16, m17, m18
+ punpckhbw m17, m18
+ paddw m16, m%1
+ paddw m17, m%2
+ packuswb m16, m17
+ mova [dstq+r3*0], xm16
+ vextracti128 [dstq+r3*4], ym16, 1
+ vextracti32x4 [dstq+r3*8], m16, 2
+ vextracti32x4 [dstq+r4*8], m16, 3
+%if %1 != 7
+ add dstq, strideq
+%endif
+%endmacro
+
+cglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c
+ vpbroadcastd m15, [pw_2896x8]
+ vpbroadcastd m16, [pw_1697x16]
+ vpbroadcastd m17, [pw_16384]
+ IDTX_16x32 0, 1, 2, 3
+ IDTX_16x32 4, 5, 6, 7
+ IDTX_16x32 8, 9, 10, 11
+ IDTX_16x32 12, 13, 14, 15
+ vpbroadcastd m16, [pw_8192]
+ call .transpose_2x8x8_round
+ lea r3, [strideq*2]
+ lea r4, [strideq*3]
+ pxor m18, m18
+ IDTX_16x32_STORE 0, 8
+ IDTX_16x32_STORE 1, 9
+ IDTX_16x32_STORE 2, 10
+ IDTX_16x32_STORE 3, 11
+ IDTX_16x32_STORE 4, 12
+ IDTX_16x32_STORE 5, 13
+ IDTX_16x32_STORE 6, 14
+ IDTX_16x32_STORE 7, 15
+ RET
+ALIGN function_align
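+; .transpose_2x8x8_round: transposes two independent 8x8 word blocks (m0-m7
+; and m8-m15), rounding every lane by the constant the caller preloads in m16;
+; also reused by the 32x16 and 32x32 identity functions.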
+.transpose_2x8x8_round:
+ punpckhwd m17, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m17, m1
+ punpckhdq m17, m1
+ REPX {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m17
+ punpcklqdq m6, m17
+ punpckhwd m17, m12, m13
+ punpcklwd m12, m13
+ punpckhwd m13, m8, m9
+ punpcklwd m8, m9
+ punpckhwd m9, m14, m15
+ punpcklwd m14, m15
+ punpckhwd m15, m10, m11
+ punpcklwd m10, m11
+ punpckhdq m11, m8, m10
+ punpckldq m8, m10
+ punpckldq m10, m12, m14
+ punpckhdq m12, m14
+ punpckhdq m14, m13, m15
+ punpckldq m13, m15
+ punpckldq m15, m17, m9
+ punpckhdq m17, m9
+ REPX {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17
+ punpckhqdq m9, m8, m10
+ punpcklqdq m8, m10
+ punpcklqdq m10, m11, m12
+ punpckhqdq m11, m12
+ punpcklqdq m12, m13, m15
+ punpckhqdq m13, m15
+ punpckhqdq m15, m14, m17
+ punpcklqdq m14, m17
+ ret
+
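+; IDTX_32x16: loads four 64-byte coefficient blocks (two low/high pairs)
+; scaled by pw_2896x8 (m12), doubles them, recombines each pair with the
+; m14/m16 qword permutations, then adds a pw_1697x16-scaled copy (m17) on top
+; of the doubled values.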
+%macro IDTX_32x16 4 ; dst[1-4]
+ pmulhrsw m%2, m12, [cq+32*(%1+ 0)]
+ pmulhrsw m18, m12, [cq+32*(%1+16)]
+ pmulhrsw m%4, m12, [cq+32*(%3+ 0)]
+ pmulhrsw m19, m12, [cq+32*(%3+16)]
+ REPX {paddsw x, x}, m%2, m18, m%4, m19
+ mova m%1, m14
+ vpermi2q m%1, m%2, m18
+ vpermt2q m%2, m16, m18
+%if %3 != 14
+ mova m%3, m14
+%endif
+ vpermi2q m%3, m%4, m19
+ vpermt2q m%4, m16, m19
+ pmulhrsw m18, m17, m%1
+ pmulhrsw m19, m17, m%2
+ pmulhrsw m20, m17, m%3
+ pmulhrsw m21, m17, m%4
+ REPX {paddsw x, x}, m%1, m%2, m%3, m%4
+ paddsw m%1, m18
+ paddsw m%2, m19
+ paddsw m%3, m20
+ paddsw m%4, m21
+%endmacro
+
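+; IDTX_32x16_STORE: adds m%1/m%2 to two 32-byte destination rows (the current
+; row and the one strideq*8 below). The cq rows are cleared here only in the
+; 32x16 case (%3 == 0); the 32x32 identity function zeroes cq in its own loop.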
+%macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32
+ mova ym19, [dstq+strideq*0]
+ vinserti32x8 m19, [dstq+strideq*8], 1
+%if %3 == 0
+ mova [cq+64*(%1*2+0)], m20
+ mova [cq+64*(%1*2+1)], m20
+%endif
+ punpcklbw m18, m19, m20
+ punpckhbw m19, m20
+ paddw m18, m%1
+ paddw m19, m%2
+ packuswb m18, m19
+ mova [dstq+strideq*0], ym18
+ vextracti32x8 [dstq+strideq*8], m18, 1
+%if %3 || %1 != 7
+ add dstq, strideq
+%endif
+%endmacro
+
+cglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c
+ vpbroadcastd m12, [pw_2896x8]
+ movu m14, [permB+7]
+ vpbroadcastd m17, [pw_1697x16]
+ psrlq m16, m14, 4
+ IDTX_32x16 0, 1, 2, 3
+ IDTX_32x16 4, 5, 6, 7
+ IDTX_32x16 8, 9, 10, 11
+ IDTX_32x16 12, 13, 14, 15
+ vpbroadcastd m16, [pw_2048]
+ call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
+ pxor m20, m20
+ IDTX_32x16_STORE 0, 8
+ IDTX_32x16_STORE 1, 9
+ IDTX_32x16_STORE 2, 10
+ IDTX_32x16_STORE 3, 11
+ IDTX_32x16_STORE 4, 12
+ IDTX_32x16_STORE 5, 13
+ IDTX_32x16_STORE 6, 14
+ IDTX_32x16_STORE 7, 15
+ RET
+
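+; IDCT_32x32_END: butterflies the even-half term (kept in a register for
+; %2 < 8, reloaded from cq otherwise) with the odd-half term m%1 into row n
+; and row 31-n, rounds by pw_2048 (m12), then adds and stores one row in the
+; top half of the block (dstq) and its mirror in the bottom half (r3),
+; clearing the spilled cq rows as they are consumed.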
+%macro IDCT_32x32_END 4 ; src, mem, stride[1-2]
+ pmovzxbw m10, [dstq+%3]
+ pmovzxbw m11, [r3 +%4]
+%if %2 < 8
+ paddsw m8, m%2, m%1
+ psubsw m9, m%2, m%1
+%else
+ mova m9, [cq+64*(%2*2-16)]
+ paddsw m8, m9, m%1
+ psubsw m9, m%1
+%endif
+ pmulhrsw m8, m12
+ pmulhrsw m9, m12
+%if %2 >= 8
+%if %2 == 8
+ pxor m0, m0
+%endif
+ mova [cq+64*(%2*2-16)], m0
+ mova [cq+64*(%2*2-15)], m0
+%endif
+ paddw m8, m10
+ paddw m9, m11
+ packuswb m8, m9
+ vpermq m8, m13, m8
+ mova [dstq+%3], ym8
+ vextracti32x8 [r3 +%4], m8, 1
+%if %2 == 3 || %2 == 7 || %2 == 11
+ add dstq, r5
+ sub r3, r5
+%endif
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ WIN64_SPILL_XMM 30
+ cmp eobd, 136
+ jb .fast
+ mova m5, [cq+64*20]
+ mova m3, [cq+64*12]
+ mova m1, [cq+64* 4]
+ mova m7, [cq+64*28]
+ mova m2, [cq+64* 8]
+ mova m6, [cq+64*24]
+ mova m0, [cq+64* 0]
+ mova m4, [cq+64*16]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ mova m14, [cq+64* 2]
+ mova m21, [cq+64*30]
+ mova m18, [cq+64*18]
+ mova m17, [cq+64*14]
+ mova m16, [cq+64*10]
+ mova m19, [cq+64*22]
+ mova m20, [cq+64*26]
+ mova m15, [cq+64* 6]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ mova m22, [cq+64* 1]
+ mova m21, [cq+64*31]
+ mova m14, [cq+64*17]
+ mova m29, [cq+64*15]
+ mova m26, [cq+64* 9]
+ mova m17, [cq+64*23]
+ mova m18, [cq+64*25]
+ mova m25, [cq+64* 7]
+ mova m24, [cq+64* 5]
+ mova m19, [cq+64*27]
+ mova m16, [cq+64*21]
+ mova m27, [cq+64*11]
+ mova m28, [cq+64*13]
+ mova m15, [cq+64*19]
+ mova m20, [cq+64*29]
+ mova m23, [cq+64* 3]
+ call .main_oddhalf
+ vpbroadcastd m10, [o(pw_8192)]
+ psubsw m13, m0, m29 ; 31
+ paddsw m0, m29 ; 0
+ psubsw m29, m1, m28 ; 30
+ paddsw m1, m28 ; 1
+ psubsw m28, m2, m27 ; 29
+ paddsw m2, m27 ; 2
+ psubsw m27, m3, m26 ; 28
+ paddsw m3, m26 ; 3
+ psubsw m26, m4, m25 ; 27
+ paddsw m4, m25 ; 4
+ psubsw m25, m5, m24 ; 26
+ paddsw m5, m24 ; 5
+ psubsw m24, m6, m23 ; 25
+ paddsw m6, m23 ; 6
+ psubsw m23, m7, m22 ; 24
+ paddsw m7, m22 ; 7
+ pxor m9, m9
+ punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3
+ REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
+ punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
+ punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3
+ punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
+ punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3
+ REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
+ punpckhwd m3, m23, m24
+ punpcklwd m23, m24
+ punpckhwd m24, m25, m26
+ punpcklwd m25, m26
+ REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
+ punpckhwd m26, m27, m28
+ punpcklwd m27, m28
+ punpckhwd m28, m29, m13
+ punpcklwd m29, m13
+ REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
+ punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7
+ punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
+    punpckldq            m22, m5     ; e4 f4 g4 h4 e5 f5 g5 h5
+ REPX {pmulhrsw x, m10}, m0, m4, m8, m22
+ punpckhdq m13, m23, m25
+ punpckldq m23, m25
+ punpckhdq m25, m27, m29
+ punpckldq m27, m29
+ REPX {pmulhrsw x, m10}, m13, m23, m25, m27
+ punpckhdq m9, m3, m24
+ punpckldq m3, m24
+ punpckhdq m24, m26, m28
+ punpckldq m26, m28
+ punpcklqdq m5, m23, m27 ; d00 d08 d16 d24
+ punpckhqdq m23, m27 ; d01 d09 d17 d25
+ punpckhqdq m27, m13, m25 ; d03 d11 d19 d27
+ punpcklqdq m13, m25 ; d02 d10 d18 d26
+ punpckhqdq m25, m3, m26 ; d05 d13 d21 d29
+ punpcklqdq m3, m26 ; d04 d12 d20 d28
+ punpckhqdq m26, m9, m24 ; d07 d15 d23 d31
+ punpcklqdq m9, m24 ; d06 d14 d22 d30
+ REPX {pmulhrsw x, m10}, m25, m3, m26
+ mova [cq+64* 9], m23
+ mova [cq+64*11], m27
+ mova [cq+64*13], m25
+ mova [cq+64*15], m26
+ punpckhqdq m24, m8, m22 ; a05 a13 a21 a29
+ punpcklqdq m8, m22 ; a04 a12 a20 a28
+ punpckhqdq m22, m0, m4 ; a01 a09 a17 a25
+ punpcklqdq m0, m4 ; a00 a08 a16 a24
+ punpckhqdq m23, m7, m2 ; a03 a11 a19 a27
+ punpcklqdq m7, m2 ; a02 a10 a18 a26
+ punpckhqdq m25, m6, m1 ; a07 a15 a23 a31
+ punpcklqdq m6, m1 ; a06 a14 a22 a30
+ mova m2, [cq+64* 0]
+ mova m11, [cq+64* 2]
+ mova m12, [cq+64* 4]
+ mova m29, [cq+64* 6]
+ mova m27, [cq+64* 8]
+ mova m26, [cq+64*10]
+ mova m4, [cq+64*12]
+ mova m28, [cq+64*14]
+ psubsw m1, m2, m21 ; 23
+ paddsw m2, m21 ; 8
+ psubsw m21, m11, m20 ; 22
+ paddsw m11, m20 ; 9
+ psubsw m20, m12, m19 ; 21
+ paddsw m12, m19 ; 10
+ psubsw m19, m29, m18 ; 20
+ paddsw m29, m18 ; 11
+ psubsw m18, m27, m17 ; 19
+ paddsw m27, m17 ; 12
+ psubsw m17, m26, m16 ; 18
+ paddsw m26, m16 ; 13
+ paddsw m16, m4, m15 ; 14
+ psubsw m4, m15 ; 17
+ pmulhrsw m15, m6, m10
+ psubsw m6, m28, m14 ; 16
+ paddsw m28, m14 ; 15
+ pmulhrsw m14, m7, m10
+ punpcklwd m7, m6, m4
+ punpckhwd m6, m4
+ punpckhwd m4, m17, m18
+ punpcklwd m17, m18
+ punpckhwd m18, m19, m20
+ punpcklwd m19, m20
+ punpckhwd m20, m21, m1
+ punpcklwd m21, m1
+ punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7
+    punpcklwd            m2, m11     ; i0 j0 i1 j1 i2 j2 i3 j3
+ punpckhwd m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7
+ punpcklwd m12, m29 ; k0 l0 k1 l1 k2 l2 k3 l3
+ punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
+ punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3
+ punpckhwd m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7
+ punpcklwd m16, m28 ; o0 p0 o1 p1 o2 p2 o3 p3
+ pmulhrsw m23, m10
+ pmulhrsw m25, m10
+ punpckhdq m28, m2, m12 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpckldq m2, m12 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpckldq m27, m16 ; m0 n0 o0 p0 m1 n1 o1 p1
+ REPX {pmulhrsw x, m10}, m28, m2, m12, m27
+ punpckhdq m16, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7
+ punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5
+ punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
+ punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5
+ REPX {pmulhrsw x, m10}, m16, m1, m11, m29
+ punpckhdq m26, m19, m21
+ punpckldq m19, m21
+ punpckhdq m21, m6, m4
+ punpckldq m6, m4
+ REPX {pmulhrsw x, m10}, m26, m19, m21, m6
+ punpckhdq m4, m18, m20
+ punpckldq m18, m20
+ punpckhdq m20, m7, m17
+ punpckldq m7, m17
+ REPX {pmulhrsw x, m10}, m4, m18, m20, m7
+ punpcklqdq m17, m28, m12 ; b02 b10 b18 b26
+ punpckhqdq m28, m12 ; b03 b11 b19 b27
+ punpckhqdq m12, m2, m27 ; b01 b09 b17 b25
+ punpcklqdq m2, m27 ; b00 b08 b16 b24
+ punpckhqdq m27, m1, m29 ; b05 b13 b21 b29
+ punpcklqdq m1, m29 ; b04 b12 b20 b28
+ punpckhqdq m29, m16, m11 ; b07 b15 b23 b31
+ punpcklqdq m16, m11 ; b06 b14 b22 b30
+ mova [cq+64* 1], m12
+ mova [cq+64* 3], m28
+ mova [cq+64* 5], m27
+ mova [cq+64* 7], m29
+ punpckhqdq m27, m20, m26 ; c03 c11 c19 c27
+ punpcklqdq m20, m26 ; c02 c10 c18 c26
+ punpckhqdq m26, m7, m19 ; c01 c09 c17 c25
+ punpcklqdq m7, m19 ; c00 c08 c16 c24
+ punpckhqdq m28, m6, m18 ; c05 c13 c21 c29
+ punpcklqdq m6, m18 ; c04 c12 c20 c28
+ punpckhqdq m29, m21, m4 ; c07 c15 c23 c31
+ punpcklqdq m21, m4 ; c06 c14 c22 c30
+ pmulhrsw m19, m9, m10
+ vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24
+ vinserti32x8 m0, ym2, 1 ; a00 a08 b00 b08
+ vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24
+ vinserti32x8 m7, ym5, 1 ; c00 c08 d00 d08
+ vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28
+ vinserti32x8 m1, m8, ym1, 1 ; a04 a12 b04 b12
+ vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28
+ vinserti32x8 m6, ym3, 1 ; c04 c12 d04 d12
+ vshufi32x4 m3, m1, m6, q3131 ; 12
+ vshufi32x4 m1, m6, q2020 ; 4
+ vshufi32x4 m6, m4, m2, q3131 ; 24
+ vshufi32x4 m4, m2, q2020 ; 16
+ vshufi32x4 m2, m0, m7, q3131 ; 8
+ vshufi32x4 m0, m7, q2020 ; 0
+ vshufi32x4 m7, m5, m8, q3131 ; 28
+ vshufi32x4 m5, m8, q2020 ; 20
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26
+ vinserti32x8 m14, ym17, 1 ; a02 a10 b02 b10
+ vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26
+ vinserti32x8 m20, ym13, 1 ; c02 c10 d02 d10
+ vshufi32x4 m13, m21, m19, q3232 ; c22 c30 d22 d30
+ vinserti32x8 m21, ym19, 1 ; c06 c14 d06 d14
+ vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30
+ vinserti32x8 m15, ym16, 1 ; a06 a14 b06 b14
+ vshufi32x4 m16, m14, m20, q3131 ; 10
+ vshufi32x4 m14, m20, q2020 ; 2
+ vshufi32x4 m20, m18, m17, q3131 ; 26
+ vshufi32x4 m18, m17, q2020 ; 18
+ vshufi32x4 m17, m15, m21, q3131 ; 14
+ vshufi32x4 m15, m21, q2020 ; 6
+ vshufi32x4 m21, m19, m13, q3131 ; 30
+ vshufi32x4 m19, m13, q2020 ; 22
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ mova m15, [cq+64* 1]
+ mova m16, [cq+64* 3]
+ mova m17, [cq+64* 5]
+ mova m19, [cq+64* 7]
+ mova m20, [cq+64* 9]
+ mova m21, [cq+64*11]
+ mova m13, [cq+64*13]
+ mova m18, [cq+64*15]
+ vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25
+ vinserti32x8 m22, ym15, 1 ; a01 a09 b01 b09
+ vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27
+ vinserti32x8 m23, ym16, 1 ; a03 a11 b03 b11
+ vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29
+ vinserti32x8 m24, ym17, 1 ; a05 a13 b05 b13
+ vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31
+ vinserti32x8 m25, ym19, 1 ; a07 a15 b07 b15
+ vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09
+ vshufi32x4 m26, m20, q3232 ; c17 c25 d17 d25
+ vinserti32x8 m9, m27, ym21, 1 ; c03 c11 d03 d11
+ vshufi32x4 m27, m21, q3232 ; c19 c27 d19 d27
+ vinserti32x8 m11, m28, ym13, 1 ; c05 c13 d05 d13
+ vshufi32x4 m28, m13, q3232 ; c21 c29 d21 d29
+ vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15
+ vshufi32x4 m29, m18, q3232 ; c23 c31 d23 d31
+ vshufi32x4 m18, m14, m26, q3131 ; 25
+ vshufi32x4 m14, m26, q2020 ; 17
+ vshufi32x4 m19, m15, m27, q3131 ; 27
+ vshufi32x4 m15, m27, q2020 ; 19
+ vshufi32x4 m20, m16, m28, q3131 ; 29
+ vshufi32x4 m16, m28, q2020 ; 21
+ vshufi32x4 m21, m17, m29, q3131 ; 31
+ vshufi32x4 m17, m29, q2020 ; 23
+ vshufi32x4 m26, m22, m8, q3131 ; 9
+ vshufi32x4 m22, m8, q2020 ; 1
+ vshufi32x4 m27, m23, m9, q3131 ; 11
+ vshufi32x4 m23, m9, q2020 ; 3
+ vshufi32x4 m28, m24, m11, q3131 ; 13
+ vshufi32x4 m24, m11, q2020 ; 5
+ vshufi32x4 m29, m25, m12, q3131 ; 15
+ vshufi32x4 m25, m12, q2020 ; 7
+ call .main_oddhalf
+ jmp .end
+.fast: ; bottom/right halves are zero
+ mova m14, [o(dup16_perm)]
+ pmovzxwd m9, [cq+64* 0]
+ pmovzxwd m6, [cq+64* 8]
+ vpermb m8, m14, [cq+64* 2]
+ vpermb ym0, ym14, [cq+64*14]
+ vpermb ym5, ym14, [cq+64*10]
+ vpermb m1, m14, [cq+64* 6]
+ vpermb m7, m14, [cq+64* 4]
+ vpermb ym3, ym14, [cq+64*12]
+ pslld m9, 16
+ pslld m6, 16
+ call m(idct_16x16_internal_8bpc).main_fast
+ vpermb m21, m14, [cq+64* 1]
+ vpermb ym17, ym14, [cq+64*15]
+ vpermb ym20, ym14, [cq+64* 9]
+ vpermb m15, m14, [cq+64* 7]
+ vpermb m18, m14, [cq+64* 5]
+ vpermb ym16, ym14, [cq+64*11]
+ vpermb ym19, ym14, [cq+64*13]
+ vpermb m14, m14, [cq+64* 3]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m9, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
+ vshufi32x4 m22, m14, m2, q2020 ; 1
+ vshufi32x4 m24, m14, m2, q3131 ; 5
+ vshufi32x4 m23, m17, m9, q2020 ; 3
+ vshufi32x4 m25, m17, m9, q3131 ; 7
+ vshufi32x4 m16, m5, m15, q2020 ; 10
+ vshufi32x4 m17, m5, m15, q3131 ; 14
+ vshufi32x4 m14, m1, m18, q2020 ; 2
+ vshufi32x4 m15, m1, m18, q3131 ; 6
+ vshufi32x4 m1, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m21, m4, q3131 ; 12
+ vshufi32x4 m2, m21, m4, q2020 ; 8
+ vshufi32x4 m26, m20, m6, q2020 ; 9
+ vshufi32x4 m28, m20, m6, q3131 ; 13
+ vshufi32x4 m27, m19, m7, q2020 ; 11
+ vshufi32x4 m29, m19, m7, q3131 ; 15
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ call .main_oddhalf_fast
+.end:
+ lea r4, [strideq*3]
+ vpbroadcastd m12, [o(pw_2048)]
+ movshdup m13, [o(permD)]
+ lea r3, [dstq+r4*8]
+ lea r5, [strideq+r4] ; stride*4
+ add r3, r5 ; dst+stride*28
+ IDCT_32x32_END 29, 0, strideq*0, r4
+ IDCT_32x32_END 28, 1, strideq*1, strideq*2
+ IDCT_32x32_END 27, 2, strideq*2, strideq*1
+ IDCT_32x32_END 26, 3, r4 , strideq*0
+ IDCT_32x32_END 25, 4, strideq*0, r4
+ IDCT_32x32_END 24, 5, strideq*1, strideq*2
+ IDCT_32x32_END 23, 6, strideq*2, strideq*1
+ IDCT_32x32_END 22, 7, r4 , strideq*0
+ IDCT_32x32_END 21, 8, strideq*0, r4
+ IDCT_32x32_END 20, 9, strideq*1, strideq*2
+ IDCT_32x32_END 19, 10, strideq*2, strideq*1
+ IDCT_32x32_END 18, 11, r4 , strideq*0
+ IDCT_32x32_END 17, 12, strideq*0, r4
+ IDCT_32x32_END 16, 13, strideq*1, strideq*2
+ IDCT_32x32_END 15, 14, strideq*2, strideq*1
+ IDCT_32x32_END 14, 15, r4 , strideq*0
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2
+ALIGN function_align
+cglobal_label .main_oddhalf_fast3 ; bottom seven-eighths are zero
+ vpbroadcastd m21, [o(pw_4091x8)]
+ vpbroadcastd m8, [o(pw_201x8)]
+ vpbroadcastd m24, [o(pw_m601x8)]
+ vpbroadcastd m12, [o(pw_4052x8)]
+ pmulhrsw m21, m22 ; t31a
+ pmulhrsw m22, m8 ; t16a
+ pmulhrsw m24, m23 ; t23a
+ pmulhrsw m23, m12 ; t24a
+
+ punpcklwd m9, m22, m21
+ punpckhwd m8, m22, m21
+ mova m15, m10
+ vpdpwssd m15, m9, [o(pw_m4017_799)] {bcstd}
+ mova m17, m10
+ vpdpwssd m17, m8, [o(pw_m4017_799)] {bcstd}
+ REPX {psrad x, 12}, m15, m17
+ packssdw m15, m17
+ mova m17, m10
+ vpdpwssd m17, m8, [o(pw_799_4017)] {bcstd}
+ mova m8, m10
+ vpdpwssd m8, m9, [o(pw_799_4017)] {bcstd}
+ REPX {psrad x, 12}, m17, m8
+ packssdw m8, m17
+
+ punpcklwd m9, m24, m23
+ punpckhwd m16, m24, m23
+ mova m20, m10
+ vpdpwssd m20, m9, [o(pw_m3406_m2276)] {bcstd}
+ mova m17, m10
+ vpdpwssd m17, m16, [o(pw_m3406_m2276)] {bcstd}
+ REPX {psrad x, 12}, m20, m17
+ packssdw m20, m17
+ mova m17, m10
+ vpdpwssd m17, m16, [o(pw_m2276_3406)] {bcstd}
+ mova m16, m10
+ vpdpwssd m16, m9, [o(pw_m2276_3406)] {bcstd}
+ REPX {psrad x, 12}, m17, m16
+ packssdw m16, m17
+
+ mova m17, m21
+ mova m27, m15
+ mova m25, m20
+ mova m29, m8
+ mova m18, m22
+ mova m14, m24
+ mova m28, m16
+ mova m26, m23
+ jmp .main4
+cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
+ vpbroadcastd m21, [o(pw_4091x8)]
+ vpbroadcastd m8, [o(pw_201x8)]
+ vpbroadcastd m18, [o(pw_m1380x8)]
+ vpbroadcastd m9, [o(pw_3857x8)]
+ vpbroadcastd m19, [o(pw_3973x8)]
+ vpbroadcastd m11, [o(pw_995x8)]
+ vpbroadcastd m28, [o(pw_m601x8)]
+ vpbroadcastd m12, [o(pw_4052x8)]
+ pmulhrsw m21, m22 ; t31a
+ pmulhrsw m22, m8 ; t16a
+ pmulhrsw m18, m25 ; t19a
+ pmulhrsw m25, m9 ; t28a
+ pmulhrsw m19, m24 ; t27a
+ pmulhrsw m24, m11 ; t20a
+ pmulhrsw m28, m23 ; t23a
+ pmulhrsw m23, m12 ; t24a
+ mova m15, m21
+ mova m8, m22
+ mova m14, m18
+ mova m27, m25
+ mova m29, m19
+ mova m26, m24
+ mova m16, m28
+ mova m20, m23
+ jmp .main3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; bottom half is zero
+ vpbroadcastd m21, [o(pw_4091x8)]
+ vpbroadcastd m8, [o(pw_201x8)]
+ vpbroadcastd m14, [o(pw_m2751x8)]
+ vpbroadcastd m9, [o(pw_3035x8)]
+ vpbroadcastd m17, [o(pw_3703x8)]
+ vpbroadcastd m11, [o(pw_1751x8)]
+ vpbroadcastd m18, [o(pw_m1380x8)]
+ vpbroadcastd m12, [o(pw_3857x8)]
+ pmulhrsw m21, m22 ; t31a
+ vpbroadcastd m19, [o(pw_3973x8)]
+ pmulhrsw m22, m8 ; t16a
+ vpbroadcastd m8, [o(pw_995x8)]
+ pmulhrsw m14, m29 ; t30a
+ vpbroadcastd m16, [o(pw_m2106x8)]
+ pmulhrsw m29, m9 ; t17a
+ vpbroadcastd m9, [o(pw_3513x8)]
+ pmulhrsw m17, m26 ; t29a
+ vpbroadcastd m15, [o(pw_3290x8)]
+ pmulhrsw m26, m11 ; t18a
+ vpbroadcastd m11, [o(pw_2440x8)]
+ pmulhrsw m18, m25 ; t19a
+ vpbroadcastd m20, [o(pw_m601x8)]
+ pmulhrsw m25, m12 ; t28a
+ vpbroadcastd m12, [o(pw_4052x8)]
+ pmulhrsw m19, m24 ; t27a
+ pmulhrsw m24, m8 ; t20a
+ pmulhrsw m16, m27 ; t21a
+ pmulhrsw m27, m9 ; t26a
+ pmulhrsw m15, m28 ; t25a
+ pmulhrsw m28, m11 ; t22a
+ pmulhrsw m20, m23 ; t23a
+ pmulhrsw m23, m12 ; t24a
+ jmp .main2
+ALIGN function_align
+cglobal_label .main_oddhalf
+ ITX_MULSUB_2W 22, 21, 8, 9, 10, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2W 14, 29, 8, 9, 10, 3035, 2751 ; t17a, t30a
+ ITX_MULSUB_2W 26, 17, 8, 9, 10, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2W 18, 25, 8, 9, 10, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2W 24, 19, 8, 9, 10, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2W 16, 27, 8, 9, 10, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2W 28, 15, 8, 9, 10, 2440, 3290 ; t22a, t25a
+ ITX_MULSUB_2W 20, 23, 8, 9, 10, 4052, 601 ; t23a, t24a
+.main2:
+ psubsw m8, m22, m14 ; t17
+ paddsw m22, m14 ; t16
+ paddsw m14, m18, m26 ; t19
+ psubsw m18, m26 ; t18
+ psubsw m26, m24, m16 ; t21
+ paddsw m24, m16 ; t20
+ psubsw m16, m20, m28 ; t22
+ paddsw m28, m20 ; t23
+ psubsw m20, m23, m15 ; t25
+ paddsw m23, m15 ; t24
+ psubsw m15, m21, m29 ; t30
+ paddsw m21, m29 ; t31
+ psubsw m29, m19, m27 ; t26
+ paddsw m19, m27 ; t27
+ paddsw m27, m25, m17 ; t28
+ psubsw m25, m17 ; t29
+.main3:
+ ITX_MULSUB_2W 15, 8, 9, 17, 10, 799, 4017 ; t17a, t30a
+ ITX_MULSUB_2W 25, 18, 9, 17, 10, m4017, 799 ; t18a, t29a
+ ITX_MULSUB_2W 29, 26, 9, 17, 10, 3406, 2276 ; t21a, t26a
+ ITX_MULSUB_2W 20, 16, 9, 17, 10, m2276, 3406 ; t22a, t25a
+ psubsw m17, m21, m27 ; t28a
+ paddsw m21, m27 ; t31a
+ psubsw m27, m15, m25 ; t18
+ paddsw m15, m25 ; t17
+ psubsw m25, m20, m29 ; t21
+ paddsw m20, m29 ; t22
+ psubsw m29, m8, m18 ; t29
+ paddsw m8, m18 ; t30
+ psubsw m18, m22, m14 ; t19a
+ paddsw m22, m14 ; t16a
+ psubsw m14, m28, m24 ; t20a
+ paddsw m24, m28 ; t23a
+ paddsw m28, m16, m26 ; t25
+ psubsw m16, m26 ; t26
+ psubsw m26, m23, m19 ; t27a
+ paddsw m23, m19 ; t24a
+.main4:
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a
+ ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19, t28
+ vpbroadcastd m11, [o(pw_m1567_m3784)]
+ ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11 ; t21a, t26a
+ ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11 ; t20, t27
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ psubsw m19, m27, m25 ; t26
+ paddsw m27, m25 ; t29
+ psubsw m25, m17, m26 ; t20a
+ paddsw m17, m26 ; t19a
+ paddsw m26, m18, m14 ; t28a
+ psubsw m18, m14 ; t27a
+ paddsw m14, m22, m24 ; t16
+ psubsw m22, m24 ; t23
+ psubsw m24, m29, m16 ; t21
+ paddsw m16, m29 ; t18
+ paddsw m29, m21, m23 ; t31
+ psubsw m21, m23 ; t24
+ psubsw m23, m15, m20 ; t22a
+ paddsw m15, m20 ; t17a
+ psubsw m20, m8, m28 ; t25a
+ paddsw m28, m8 ; t30a
+ ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12 ; t20, t27
+ ITX_MULSUB_2W 19, 24, 8, 9, 10, 11, 12 ; t21a, t26a
+ ITX_MULSUB_2W 21, 22, 8, 9, 10, 11, 12 ; t23a, t24a
+ ITX_MULSUB_2W 20, 23, 8, 9, 10, 11, 12 ; t22, t25
+ ret
+
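+; IDTX_32x32: loads the low and high 32-byte halves of two coefficient rows
+; and merges each pair into a full register with the m21 qword permutation;
+; the rounding (pw_8192 in m16) happens in the shared transpose call below.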
+%macro IDTX_32x32 2 ; dst[1-2]
+ vmovdqa32 ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which
+ vmovdqa32 ym17, [cq+64*(%1+16)] ; reduces code size due to
+ vmovdqa32 ym%2, [cq+64*(%2+ 0)] ; compressed displacements
+ vmovdqa32 ym18, [cq+64*(%2+16)]
+ vpermt2q m%1, m21, m17
+ vpermt2q m%2, m21, m18
+%endmacro
+
+cglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c
+ movu m21, [permB+7]
+ vpbroadcastd m16, [pw_8192]
+ pxor m20, m20
+.loop:
+ IDTX_32x32 0, 1
+ IDTX_32x32 2, 3
+ IDTX_32x32 4, 5
+ IDTX_32x32 6, 7
+ IDTX_32x32 8, 9
+ IDTX_32x32 10, 11
+ IDTX_32x32 12, 13
+ IDTX_32x32 14, 15
+ call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
+ IDTX_32x16_STORE 0, 8, 1
+ IDTX_32x16_STORE 1, 9, 1
+ IDTX_32x16_STORE 2, 10, 1
+ IDTX_32x16_STORE 3, 11, 1
+ IDTX_32x16_STORE 4, 12, 1
+ IDTX_32x16_STORE 5, 13, 1
+ IDTX_32x16_STORE 6, 14, 1
+ IDTX_32x16_STORE 7, 15, 1
+ lea dstq, [dstq+strideq*8]
+ btc cq, 5
+ jnc .loop
+ mov r0d, 8
+.zero_loop:
+ mova [cq+64*0], m20
+ mova [cq+64*1], m20
+ mova [cq+64*2], m20
+ mova [cq+64*3], m20
+ add cq, 64*4
+ dec r0d
+ jg .zero_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ WIN64_SPILL_XMM 30
+ cmp eobd, 151
+ jb .fast
+ mova m5, [cq+64*10]
+ mova m3, [cq+64* 6]
+ mova m1, [cq+64* 2]
+ mova m7, [cq+64*14]
+ mova m2, [cq+64* 4]
+ mova m6, [cq+64*12]
+ mova m0, [cq+64* 0]
+ mova m4, [cq+64* 8]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ mova m14, [cq+64* 1]
+ mova m21, [cq+64*15]
+ mova m18, [cq+64* 9]
+ mova m17, [cq+64* 7]
+ mova m16, [cq+64* 5]
+ mova m19, [cq+64*11]
+ mova m20, [cq+64*13]
+ mova m15, [cq+64* 3]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ vpbroadcastd m9, [o(pw_8192)]
+%macro TRANSPOSE_8x4_ROUND 4
+ punpckhwd m8, m%3, m%4 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpcklwd m%3, m%4 ; c0 d0 c1 d1 c2 d2 c3 d3
+ punpckhwd m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m%1, m%2 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhdq m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m%1, m%3 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckldq m%3, m%4, m8 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m%4, m8 ; a6 b6 c6 d6 a7 b7 c7 d7
+ REPX {pmulhrsw x, m9}, m%2, m%1, m%3, m%4
+%endmacro
+ TRANSPOSE_8x4_ROUND 0, 1, 2, 3
+ TRANSPOSE_8x4_ROUND 4, 5, 6, 7
+ TRANSPOSE_8x4_ROUND 14, 15, 16, 17
+ TRANSPOSE_8x4_ROUND 18, 19, 20, 21
+ vinserti32x8 m26, m0, ym4, 1 ; a0 a4 b0 b4
+ vshufi32x4 m0, m4, q3232 ; a8 a12 b8 b12
+ vinserti32x8 m27, m1, ym5, 1 ; a1 a5 b1 b5
+ vshufi32x4 m1, m5, q3232 ; a9 a13 b9 b13
+ vinserti32x8 m28, m2, ym6, 1 ; a2 a6 b2 b6
+ vshufi32x4 m2, m6, q3232 ; a10 a14 b10 b14
+ vinserti32x8 m29, m3, ym7, 1 ; a3 a7 b3 b7
+ vshufi32x4 m8, m3, m7, q3232 ; a11 a15 b11 b15
+ vinserti32x8 m4, m14, ym18, 1 ; c0 c4 d0 d4
+ vshufi32x4 m14, m18, q3232 ; c8 c12 d8 d12
+ vinserti32x8 m5, m15, ym19, 1 ; c1 c5 d1 d5
+ vshufi32x4 m15, m19, q3232 ; c9 c13 d9 d13
+ vinserti32x8 m6, m16, ym20, 1 ; c2 c6 d2 d6
+ vshufi32x4 m16, m20, q3232 ; c10 c14 d10 d14
+ vinserti32x8 m7, m17, ym21, 1 ; c3 c7 d3 d7
+ vshufi32x4 m17, m21, q3232 ; c11 c15 d11 d15
+ vshufi32x4 m22, m26, m4, q2020 ; 0 1
+ vshufi32x4 m26, m4, q3131 ; 8 9
+ vshufi32x4 m23, m27, m5, q2020 ; 2 3
+ vshufi32x4 m27, m5, q3131 ; 10 11
+ vshufi32x4 m24, m28, m6, q2020 ; 4 5
+ vshufi32x4 m28, m6, q3131 ; 12 13
+ vshufi32x4 m25, m29, m7, q2020 ; 6 7
+ vshufi32x4 m29, m7, q3131 ; 14 15
+ vshufi32x4 m4, m0, m14, q2020 ; 16 17
+ vshufi32x4 m3, m0, m14, q3131 ; 24 25
+ vshufi32x4 m20, m1, m15, q2020 ; 18 19
+ vshufi32x4 m19, m1, m15, q3131 ; 26 27
+ vshufi32x4 m5, m2, m16, q2020 ; 20 21
+ vshufi32x4 m0, m2, m16, q3131 ; 28 29
+ vshufi32x4 m16, m8, m17, q2020 ; 22 23
+ vshufi32x4 m17, m8, m17, q3131 ; 30 31
+ pxor m6, m6
+ mova [cq+64* 0], m4
+ mova [cq+64* 2], m5
+ mova [cq+64* 4], m3
+ mova [cq+64* 6], m0
+ punpcklwd m8, m24, m24 ; 4
+ punpcklwd m0, m0 ; 28
+ punpcklwd m5, m5 ; 20
+ punpcklwd m1, m28, m28 ; 12
+ punpcklwd m7, m26, m26 ; 8
+ punpcklwd m3, m3 ; 24
+ punpcklwd m9, m6, m22 ; __ 0
+ punpcklwd m6, m4 ; __ 16
+ call m(idct_16x16_internal_8bpc).main_fast3
+ mova [cq+64* 1], m20
+ mova [cq+64* 3], m16
+ mova [cq+64* 5], m19
+ mova [cq+64* 7], m17
+ punpcklwd m21, m23, m23 ; 2
+ punpcklwd m17, m17 ; 30
+ punpcklwd m20, m20 ; 18
+ punpcklwd m15, m29, m29 ; 14
+ punpcklwd m18, m27, m27 ; 10
+ punpcklwd m16, m16 ; 22
+ punpcklwd m19, m19 ; 26
+ punpcklwd m14, m25, m25 ; 6
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova [cq+64* 8], m14
+ mova [cq+64* 9], m15
+ mova [cq+64*10], m16
+ mova [cq+64*11], m17
+ mova [cq+64*12], m18
+ mova [cq+64*13], m19
+ mova [cq+64*14], m20
+ mova [cq+64*15], m21
+ mova m21, [cq+64* 7]
+ mova m14, [cq+64* 0]
+ mova m17, [cq+64* 3]
+ mova m18, [cq+64* 4]
+ mova m19, [cq+64* 5]
+ mova m16, [cq+64* 2]
+ mova m15, [cq+64* 1]
+ mova m20, [cq+64* 6]
+ REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
+ m24, m19, m16, m27, m28, m15, m20, m23
+ call .main_oddhalf
+ jmp .end
+.fast: ; right half is zero
+ mova ym8, [cq+64*15]
+ vinserti32x8 m8, [cq+64* 1], 1
+ mova m2, [o(int16_perm)]
+ mova ym9, [cq+64* 8]
+ vinserti32x8 m9, [cq+64* 0], 1
+ mova ym0, [cq+64* 7]
+ vinserti32x8 m0, [cq+64* 9], 1
+ mova ym7, [cq+64*14]
+ vinserti32x8 m7, [cq+64* 2], 1
+ mova ym1, [cq+64* 3]
+ vinserti32x8 m1, [cq+64*13], 1
+ mova ym3, [cq+64* 6]
+ vinserti32x8 m3, [cq+64*10], 1
+ mova ym5, [cq+64*11]
+ vinserti32x8 m5, [cq+64* 5], 1
+ mova ym6, [cq+64*12]
+ vinserti32x8 m6, [cq+64* 4], 1
+ REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
+ call m(idct_16x16_internal_8bpc).main2
+ vbroadcasti32x4 m8, [o(int_shuf3)]
+ vbroadcasti32x4 m9, [o(int_shuf4)]
+ vpbroadcastd m11, [o(pw_8192)]
+ pshufb m0, m8
+ pshufb m1, m9
+ pshufb m2, m8
+ pshufb m3, m9
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ pshufb m4, m8
+ pshufb m5, m9
+ pshufb m6, m8
+ pshufb m7, m9
+ REPX {pmulhrsw x, m11}, m4, m5, m6, m7
+ punpckhdq m28, m0, m1
+ punpckldq m0, m1
+ punpckhdq m27, m2, m3
+ punpckldq m2, m3
+ punpckhdq m22, m4, m5
+ punpckldq m4, m5
+ punpckhdq m23, m6, m7
+ punpckldq m6, m7
+ vinserti32x8 m14, m0, ym2, 1
+ vshufi32x4 m15, m0, m2, q3232
+ vinserti32x8 m2, m4, ym6, 1
+ vshufi32x4 m4, m6, q3232
+ vshufi32x4 m21, m14, m2, q2020 ; 0 2
+ vshufi32x4 m14, m2, q3131 ; 4 6
+ vshufi32x4 m18, m15, m4, q2020 ; 8 10
+ vshufi32x4 m15, m4, q3131 ; 12 14
+ pxor m9, m9
+ punpcklwd m8, m14, m14 ; 4
+ punpcklwd m1, m15, m15 ; 12
+ punpcklwd m7, m18, m18 ; 8
+ punpcklwd m9, m21 ; __ 0
+ call m(idct_16x16_internal_8bpc).main_fast4
+ punpckhwd m21, m21 ; 2
+ punpckhwd m15, m15 ; 14
+ punpckhwd m18, m18 ; 10
+ punpckhwd m14, m14 ; 6
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ vinserti32x8 m24, m28, ym27, 1
+ vshufi32x4 m28, m27, q3232
+ vinserti32x8 m27, m22, ym23, 1
+ vshufi32x4 m22, m23, q3232
+ vshufi32x4 m23, m24, m27, q2020 ; 1 3
+ vshufi32x4 m24, m27, q3131 ; 5 7
+ vshufi32x4 m27, m28, m22, q2020 ; 9 11
+ vshufi32x4 m28, m22, q3131 ; 13 15
+ punpcklwd m22, m23, m23 ; 1
+ punpckhwd m29, m28, m28 ; 15
+ punpcklwd m26, m27, m27 ; 9
+ punpckhwd m25, m24, m24 ; 7
+ mova [cq+64* 8], m14
+ mova [cq+64* 9], m15
+ mova [cq+64*10], m16
+ mova [cq+64*11], m17
+ punpcklwd m24, m24 ; 5
+ punpckhwd m27, m27 ; 11
+ punpcklwd m28, m28 ; 13
+ punpckhwd m23, m23 ; 3
+ mova [cq+64*12], m18
+ mova [cq+64*13], m19
+ mova [cq+64*14], m20
+ mova [cq+64*15], m21
+ call .main_oddhalf_fast
+.end:
+ imul r6, strideq, 60
+ mova m10, [o(end_16x32p)]
+ vpbroadcastd m11, [o(pw_2048)]
+ lea r3, [strideq*3]
+ pxor m12, m12
+ add r6, dstq ; dst+stride*60
+ psrldq m13, m10, 1
+ lea r4, [strideq+r3] ; stride*4
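+; IDCT_16x64_END: for %1 < 8 both register arguments already hold combined
+; output pairs from .main_oddhalf and only need the pw_2048 rounding (m11);
+; for %1 >= 8 the even-half rows spilled to cq are butterflied with the
+; odd-half values still in m14-m21 first. Each invocation writes two rows near
+; the top of the block (dstq) and the two mirrored rows near the bottom (r6).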
+%macro IDCT_16x64_END 3 ; idct32, idct64, tmp
+%if %1 & 1
+ %define %%s0 r3
+ %define %%s1 strideq*2
+ %define %%s2 strideq*1
+ %define %%s3 strideq*0
+%else
+ %define %%s0 strideq*0
+ %define %%s1 strideq*1
+ %define %%s2 strideq*2
+ %define %%s3 r3
+%if %1
+ add dstq, r4
+ sub r6, r4
+%endif
+%endif
+%if %1 < 8
+ pmulhrsw m8, m11, m%1
+ pmulhrsw m9, m11, m%2
+%else
+ mova m9, [cq+64*%1]
+ paddsw m8, m9, m%2 ; out 0+n, 1+n
+ psubsw m9, m%2 ; out 63-n, 62-n
+ pmulhrsw m8, m11
+ pmulhrsw m9, m11
+%endif
+ mova xm29, [dstq+%%s0]
+ vinserti128 ym29, [dstq+%%s1], 1
+ mova xm%3, [r6 +%%s3]
+ vinserti128 ym%3, [r6 +%%s2], 1
+ vpermb m29, m10, m29
+ vpermb m%3, m10, m%3
+ mova [cq+64*%1], m12
+ paddw m29, m8
+ paddw m%3, m9
+ packuswb m29, m%3
+ vpermd m29, m13, m29
+ mova [dstq+%%s0], xm29
+ vextracti128 [dstq+%%s1], ym29, 1
+ vextracti32x4 [r6 +%%s2], m29, 2
+ vextracti32x4 [r6 +%%s3], m29, 3
+%endmacro
+ IDCT_16x64_END 0, 29, 0
+ IDCT_16x64_END 1, 28, 28
+ IDCT_16x64_END 2, 27, 28
+ IDCT_16x64_END 3, 26, 28
+ IDCT_16x64_END 4, 25, 28
+ IDCT_16x64_END 5, 24, 28
+ IDCT_16x64_END 6, 23, 28
+ IDCT_16x64_END 7, 22, 28
+ IDCT_16x64_END 8, 21, 28
+ IDCT_16x64_END 9, 20, 28
+ IDCT_16x64_END 10, 19, 28
+ IDCT_16x64_END 11, 18, 28
+ IDCT_16x64_END 12, 17, 28
+ IDCT_16x64_END 13, 16, 28
+ IDCT_16x64_END 14, 15, 28
+ IDCT_16x64_END 15, 14, 28
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 64
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+ jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; bottom three-quarters are zero
+ vpbroadcastd m8, [o(pw_101_4095x8)]
+ vpbroadcastd m21, [o(pw_m1474_3822x8)]
+ vpbroadcastd m14, [o(pw_897_3996x8)]
+ vpbroadcastd m17, [o(pw_m700_4036x8)]
+ vpbroadcastd m18, [o(pw_501_4065x8)]
+ vpbroadcastd m19, [o(pw_m1092_3948x8)]
+ vpbroadcastd m16, [o(pw_1285_3889x8)]
+ vpbroadcastd m15, [o(pw_m301_4085x8)]
+ pmulhrsw m8, m22 ; t32a t63a
+ pmulhrsw m21, m29 ; t35a t60a
+ pmulhrsw m14, m26 ; t36a t59a
+    pmulhrsw            m17, m25    ; t39a t56a
+ pmulhrsw m18, m24 ; t40a t55a
+ pmulhrsw m19, m27 ; t43a t52a
+ pmulhrsw m16, m28 ; t44a t51a
+ pmulhrsw m15, m23 ; t47a t48a
+ mova m22, m8
+ mova m29, m21
+ mova m26, m14
+ mova m25, m17
+ mova m24, m18
+ mova m27, m19
+ mova m28, m16
+ mova m20, m15
+ jmp .main_oddhalf2
+ALIGN function_align
+cglobal_label .main_oddhalf
+ vpbroadcastd m8, [o(pw_101_4095x8)]
+ vpbroadcastd m9, [o(pw_m2824_2967x8)]
+ vpbroadcastd m11, [o(pw_1660_3745x8)]
+ vpbroadcastd m12, [o(pw_m1474_3822x8)]
+ pmulhrsw m22, m8 ; t32a t63a
+ vpbroadcastd m8, [o(pw_897_3996x8)]
+ pmulhrsw m21, m9 ; t33a t62a
+ vpbroadcastd m9, [o(pw_m2191_3461x8)]
+ pmulhrsw m14, m11 ; t34a t61a
+ vpbroadcastd m11, [o(pw_2359_3349x8)]
+ pmulhrsw m29, m12 ; t35a t60a
+ vpbroadcastd m12, [o(pw_m700_4036x8)]
+ pmulhrsw m26, m8 ; t36a t59a
+ vpbroadcastd m8, [o(pw_501_4065x8)]
+ pmulhrsw m17, m9 ; t37a t58a
+ vpbroadcastd m9, [o(pw_m2520_3229x8)]
+ pmulhrsw m18, m11 ; t38a t57a
+ vpbroadcastd m11, [o(pw_2019_3564x8)]
+ pmulhrsw m25, m12 ; t39a t56a
+ vpbroadcastd m12, [o(pw_m1092_3948x8)]
+ pmulhrsw m24, m8 ; t40a t55a
+ vpbroadcastd m8, [o(pw_1285_3889x8)]
+ pmulhrsw m19, m9 ; t41a t54a
+ vpbroadcastd m9, [o(pw_m1842_3659x8)]
+ pmulhrsw m16, m11 ; t42a t53a
+ vpbroadcastd m11, [o(pw_2675_3102x8)]
+ pmulhrsw m27, m12 ; t43a t52a
+ vpbroadcastd m12, [o(pw_m301_4085x8)]
+ pmulhrsw m28, m8 ; t44a t51a
+ pmulhrsw m15, m9 ; t45a t50a
+ pmulhrsw m20, m11 ; t46a t49a
+ pmulhrsw m23, m12 ; t47a t48a
+ psubsw m8, m22, m21 ; t33 t62
+ paddsw m22, m21 ; t32 t63
+ psubsw m21, m29, m14 ; t34 t61
+ paddsw m29, m14 ; t35 t60
+ psubsw m14, m26, m17 ; t37 t58
+ paddsw m26, m17 ; t36 t59
+ psubsw m17, m25, m18 ; t38 t57
+ paddsw m25, m18 ; t39 t56
+ psubsw m18, m24, m19 ; t41 t54
+ paddsw m24, m19 ; t40 t55
+ psubsw m19, m27, m16 ; t42 t53
+ paddsw m27, m16 ; t43 t52
+ psubsw m16, m28, m15 ; t45 t50
+ paddsw m28, m15 ; t44 t51
+ psubsw m15, m23, m20 ; t46 t49
+ paddsw m20, m23 ; t47 t48
+.main_oddhalf2:
+ ITX_MUL2X_PACK 8, 9, 23, 10, 401, 4076, 5 ; t33a t62a
+ ITX_MUL2X_PACK 21, 9, 23, 10, m4076, 401, 5 ; t34a t61a
+ ITX_MUL2X_PACK 14, 9, 23, 10, 3166, 2598, 5 ; t37a t58a
+ ITX_MUL2X_PACK 17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a
+ ITX_MUL2X_PACK 18, 9, 23, 10, 1931, 3612, 5 ; t41a t54a
+ ITX_MUL2X_PACK 19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a
+ ITX_MUL2X_PACK 16, 9, 23, 10, 3920, 1189, 5 ; t45a t50a
+ ITX_MUL2X_PACK 15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a
+ vpbroadcastd m11, [o(pw_m4017_799)]
+ psubsw m23, m25, m26 ; t36a t59a
+ paddsw m25, m26 ; t39a t56a
+ psubsw m26, m24, m27 ; t43a t52a
+ paddsw m27, m24 ; t40a t55a
+ psubsw m24, m20, m28 ; t44a t51a
+ paddsw m20, m28 ; t47a t48a
+ psubsw m28, m8, m21 ; t34 t61
+ paddsw m8, m21 ; t33 t62
+ psubsw m21, m17, m14 ; t37 t58
+ paddsw m17, m14 ; t38 t57
+ psubsw m14, m18, m19 ; t42 t53
+ paddsw m18, m19 ; t41 t54
+ psubsw m19, m15, m16 ; t45 t50
+ paddsw m15, m16 ; t46 t49
+ psubsw m16, m22, m29 ; t35a t60a
+ paddsw m22, m29 ; t32a t63a
+ ITX_MUL2X_PACK 16, 9, 29, 10, 799_4017, 11, 20 ; t35 t60
+ ITX_MUL2X_PACK 28, 9, 29, 10, 799_4017, 11, 20 ; t34a t61a
+ ITX_MUL2X_PACK 23, 9, 29, 10, 11, m799_m4017, 36 ; t36 t59
+ ITX_MUL2X_PACK 21, 9, 29, 10, 11, m799_m4017, 36 ; t37a t58a
+ vpbroadcastd m11, [o(pw_m2276_3406)]
+ ITX_MUL2X_PACK 26, 9, 29, 10, 3406_2276, 11, 20 ; t43 t52
+ ITX_MUL2X_PACK 14, 9, 29, 10, 3406_2276, 11, 20 ; t42a t53a
+ ITX_MUL2X_PACK 24, 9, 29, 10, 11, m3406_m2276, 36 ; t44 t51
+ ITX_MUL2X_PACK 19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ psubsw m29, m22, m25 ; t39 t56
+ paddsw m22, m25 ; t32 t63
+ psubsw m25, m20, m27 ; t40 t55
+ paddsw m20, m27 ; t47 t48
+ psubsw m27, m8, m17 ; t38a t57a
+ paddsw m8, m17 ; t33a t62a
+ psubsw m17, m15, m18 ; t41a t54a
+ paddsw m15, m18 ; t46a t49a
+ paddsw m18, m16, m23 ; t35a t60a
+ psubsw m16, m23 ; t36a t59a
+ psubsw m23, m24, m26 ; t43a t52a
+ paddsw m24, m26 ; t44a t51a
+ paddsw m26, m28, m21 ; t34 t61
+ psubsw m28, m21 ; t37 t58
+ psubsw m21, m19, m14 ; t42 t53
+ paddsw m19, m14 ; t45 t50
+ ITX_MUL2X_PACK 29, 9, 14, 10, 11, 12, 4 ; t39a t56a
+ ITX_MUL2X_PACK 27, 9, 14, 10, 11, 12, 4 ; t38 t57
+ ITX_MUL2X_PACK 16, 9, 14, 10, 11, 12, 4 ; t36 t59
+ ITX_MUL2X_PACK 28, 9, 14, 10, 11, 12, 4 ; t37a t58a
+ vpbroadcastd m11, [o(pw_m1567_m3784)]
+ ITX_MUL2X_PACK 25, 9, 14, 10, 12, 11, 4 ; t40a t55a
+ ITX_MUL2X_PACK 17, 9, 14, 10, 12, 11, 4 ; t41 t54
+ ITX_MUL2X_PACK 23, 9, 14, 10, 12, 11, 4 ; t43 t52
+ ITX_MUL2X_PACK 21, 9, 14, 10, 12, 11, 4 ; t42a t53a
+ vbroadcasti32x4 m13, [o(deint_shuf)]
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ paddsw m14, m22, m20 ; t32a t63a
+ psubsw m22, m20 ; t47a t48a
+ psubsw m20, m8, m15 ; t46 t49
+ paddsw m8, m15 ; t33 t62
+ paddsw m15, m18, m24 ; t35 t60
+ psubsw m18, m24 ; t44 t51
+ psubsw m24, m26, m19 ; t45a t50a
+ paddsw m26, m19 ; t34a t61a
+ REPX {pshufb x, m13}, m14, m8, m15, m26
+ psubsw m19, m29, m25 ; t40 t55
+ paddsw m25, m29 ; t39 t56
+ psubsw m29, m27, m17 ; t41a t54a
+ paddsw m27, m17 ; t38a t57a
+ psubsw m17, m16, m23 ; t43a t52a
+ paddsw m16, m23 ; t36a t59a
+ psubsw m9, m28, m21 ; t42 t53
+ paddsw m28, m21 ; t37 t58
+ REPX {pshufb x, m13}, m25, m27, m16, m28
+ ITX_MUL2X_PACK 22, 13, 21, 10, 11, 12, 8 ; t47 t48
+ ITX_MUL2X_PACK 20, 23, 22, 10, 11, 12, 8 ; t46a t49a
+ packssdw m21, m22 ; t47 t46a
+ packssdw m13, m23 ; t48 t49a
+ ITX_MUL2X_PACK 18, 22, 20, 10, 11, 12, 8 ; t44a t51a
+ ITX_MUL2X_PACK 24, 23, 18, 10, 11, 12, 8 ; t45 t50
+ packssdw m20, m18 ; t44a t45
+ packssdw m22, m23 ; t51a t50
+ ITX_MUL2X_PACK 19, 24, 18, 10, 11, 12, 8 ; t40a t55a
+ ITX_MUL2X_PACK 29, 23, 19, 10, 11, 12, 8 ; t41 t54
+ packssdw m18, m19 ; t40a t41
+ packssdw m24, m23 ; t55a t54
+ ITX_MUL2X_PACK 17, 23, 19, 10, 11, 12, 8 ; t43 t52
+ ITX_MUL2X_PACK 9, 29, 17, 10, 11, 12, 8 ; t42a t53a
+ packssdw m19, m17 ; t43 t42a
+ packssdw m23, m29 ; t52 t53a
+ punpcklqdq m17, m25, m27 ; t39 t38a
+ punpckhqdq m25, m27 ; t56 t57a
+ punpckhqdq m27, m15, m26 ; t60 t61a
+ punpcklqdq m15, m26 ; t35 t34a
+ punpckhqdq m26, m16, m28 ; t59a t58
+ punpcklqdq m16, m28 ; t36a t37
+ punpckhqdq m28, m14, m8 ; t63a t62
+ punpcklqdq m14, m8 ; t32a t33
+ psubsw m29, m0, m28 ; out63 out62
+ paddsw m0, m28 ; out0 out1
+ psubsw m28, m1, m27 ; out60 out61
+ paddsw m1, m27 ; out3 out2
+ psubsw m27, m2, m26 ; out59 out58
+ paddsw m2, m26 ; out4 out5
+ psubsw m26, m3, m25 ; out56 out57
+ paddsw m3, m25 ; out7 out6
+ psubsw m25, m4, m24 ; out55 out54
+ paddsw m4, m24 ; out8 out9
+ psubsw m24, m5, m23 ; out52 out53
+ paddsw m5, m23 ; out11 out10
+ psubsw m23, m6, m22 ; out51 out50
+ paddsw m6, m22 ; out12 out13
+ psubsw m22, m7, m13 ; out48 out49
+ paddsw m7, m13 ; out15 out14
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 16
+.dconly:
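+ ; note: 181 = 2896 >> 4 and 2896/4096 ~= 1/sqrt(2), so each imul/add/sar
+ ; group below is a rounded multiply by 1/sqrt(2), with the extra >>2 (here)
+ ; or >>4 (.dconly2) output shift folded into the rounding constant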
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+.dconly2:
+ imul r6d, 181
+ add r6d, 128+2048
+ sar r6d, 8+4
+ pxor m2, m2
+ vpbroadcastw m3, r6d
+.dconly_loop:
+ mova m1, [dstq]
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ paddw m0, m3
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+.normal:
+ WIN64_SPILL_XMM 31
+ mova m19, [o(dup16_perm)]
+ mova m24, [cq+64* 2]
+ mova m28, [cq+64* 6]
+ mova m26, [cq+64* 4]
+ mova m22, [cq+64* 0]
+ mova m23, [cq+64* 1]
+ mova m29, [cq+64* 7]
+ mova m27, [cq+64* 5]
+ mova m25, [cq+64* 3]
+ vpermb m8, m19, m24 ; 4
+ vpermb m1, m19, m28 ; 12
+ vpermb m7, m19, m26 ; 8
+ vpermb m9, m19, m22 ; __ 0
+ vpermb m21, m19, m23 ; 2
+ vpermb m15, m19, m29 ; 14
+ vpermb m18, m19, m27 ; 10
+ vpermb m14, m19, m25 ; 6
+ pslld m9, 16
+ vpord m30, m19, [o(pb_32)] {1to16}
+ REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23
+ cmp eobd, 151
+ jb .fast
+ vpermb m0, m19, [cq+64*14] ; 28
+ vpermb m5, m19, [cq+64*10] ; 20
+ vpermb m3, m19, [cq+64*12] ; 24
+ vpermb m6, m19, [cq+64* 8] ; __ 16
+ pslld m6, 16
+ call m(idct_16x16_internal_8bpc).main_fast
+ vpermb m17, m19, [cq+64*15] ; 30
+ vpermb m20, m19, [cq+64* 9] ; 18
+ vpermb m16, m19, [cq+64*11] ; 22
+ vpermb m19, m19, [cq+64*13] ; 26
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ vpermb m21, m30, [cq+64*15]
+ vpermb m14, m30, [cq+64* 8]
+ vpermb m17, m30, [cq+64*11]
+ vpermb m18, m30, [cq+64*12]
+ vpermb m19, m30, [cq+64*13]
+ vpermb m16, m30, [cq+64*10]
+ vpermb m15, m30, [cq+64* 9]
+ vpermb m20, m30, [cq+64*14]
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
+ jmp .end
+.fast: ; bottom half is zero
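+ ; note: eob < 151 implies coefficient rows 8-15 ([cq+64*8] onward) are all
+ ; zero, so the second batch of loads is skipped and the reduced _fast/_fast2
+ ; kernels are used instead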
+ call m(idct_16x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+.end:
+ mova [cq+64* 8], m4
+ mova [cq+64* 9], m5
+ mova [cq+64*10], m6
+ mova [cq+64*11], m7
+ mova [cq+64*12], m26
+ mova [cq+64*13], m27
+ mova [cq+64*14], m28
+ mova [cq+64*15], m29
+ vpbroadcastd m13, [o(pw_8192)]
+ call .pass1_end
+ call .pass2
+ mova [cq+64* 0], m0
+ mova [cq+64* 1], m1
+ mova [cq+64* 2], m2
+ mova [cq+64* 3], m3
+ mova [cq+64* 4], m4
+ mova [cq+64* 5], m5
+ mova [cq+64* 6], m6
+ mova [cq+64* 7], m7
+ pmulhrsw m0, m13, [cq+64* 8]
+ pmulhrsw m1, m13, [cq+64* 9]
+ pmulhrsw m2, m13, [cq+64*10]
+ pmulhrsw m3, m13, [cq+64*11]
+ vpbroadcastd m30, [o(pw_2048)]
+ pmulhrsw m4, m13, m22
+ pmulhrsw m5, m13, m23
+ pmulhrsw m6, m13, m24
+ pmulhrsw m7, m13, m25
+ pmulhrsw m22, m30, m14
+ pmulhrsw m14, m13, m26
+ pmulhrsw m23, m30, m15
+ pmulhrsw m15, m13, m27
+ pmulhrsw m24, m30, m16
+ pmulhrsw m16, m13, m28
+ pmulhrsw m25, m30, m17
+ pmulhrsw m17, m13, m29
+ pmulhrsw m26, m30, m18
+ pmulhrsw m18, m13, [cq+64*12]
+ pmulhrsw m27, m30, m19
+ pmulhrsw m19, m13, [cq+64*13]
+ pmulhrsw m28, m30, m20
+ pmulhrsw m20, m13, [cq+64*14]
+ pmulhrsw m29, m30, m21
+ pmulhrsw m21, m13, [cq+64*15]
+ call .transpose_round
+ call .pass2
+ pxor m10, m10
+ lea r3, [strideq*3]
+%macro IDCT_64x16_END 4
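+ ; %1 = coefficient row (also zeroed in cq), %2/%3 = the two word vectors
+ ; holding this 64-pixel output row, %4 = dst row offset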
+ mova m9, [dstq+%4]
+%if %1 < 8
+ pmulhrsw m%3, m30, [cq+64*%1]
+%endif
+ pmulhrsw m%2, m30
+ mova [cq+64*%1], m10
+ punpcklbw m8, m9, m10
+ punpckhbw m9, m10
+ paddw m8, m%3
+ paddw m9, m%2
+ packuswb m8, m9
+ mova [dstq+%4], m8
+%if %1 == 3 || %1 == 7 || %1 == 11
+ lea dstq, [dstq+strideq*4]
+%endif
+%endmacro
+ IDCT_64x16_END 0, 0, 11, strideq*0
+ IDCT_64x16_END 1, 1, 11, strideq*1
+ IDCT_64x16_END 2, 2, 11, strideq*2
+ IDCT_64x16_END 3, 3, 11, r3
+ IDCT_64x16_END 4, 4, 11, strideq*0
+ IDCT_64x16_END 5, 5, 11, strideq*1
+ IDCT_64x16_END 6, 6, 11, strideq*2
+ IDCT_64x16_END 7, 7, 11, r3
+ IDCT_64x16_END 8, 14, 22, strideq*0
+ IDCT_64x16_END 9, 15, 23, strideq*1
+ IDCT_64x16_END 10, 16, 24, strideq*2
+ IDCT_64x16_END 11, 17, 25, r3
+ IDCT_64x16_END 12, 18, 26, strideq*0
+ IDCT_64x16_END 13, 19, 27, strideq*1
+ IDCT_64x16_END 14, 20, 28, strideq*2
+ IDCT_64x16_END 15, 21, 29, r3
+ RET
+ALIGN function_align
+.pass1_end:
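+ ; combine the rows stashed in cq with the 16x64 odd-half output still in
+ ; registers to form rows 16-47 (see the out## comments), scale by the
+ ; caller-supplied m13 (pw_8192 here, pw_16384 from the 64x32 fast path),
+ ; then fall through into .transpose_round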
+ mova m4, [cq+64* 0]
+ mova m5, [cq+64* 1]
+ mova m6, [cq+64* 2]
+ mova m7, [cq+64* 3]
+ mova m8, [cq+64* 4]
+ mova m9, [cq+64* 5]
+ mova m11, [cq+64* 6]
+ mova m12, [cq+64* 7]
+ psubsw m29, m4, m21 ; out47 out46
+ paddsw m4, m21 ; out16 out17
+ psubsw m28, m5, m20 ; out44 out45
+ paddsw m5, m20 ; out19 out18
+ REPX {pmulhrsw x, m13}, m0, m1, m2, m3
+ psubsw m27, m6, m19 ; out43 out42
+ paddsw m6, m19 ; out20 out21
+ psubsw m26, m7, m18 ; out40 out41
+ paddsw m7, m18 ; out23 out22
+ pmulhrsw m18, m13, m22
+ pmulhrsw m19, m13, m23
+ pmulhrsw m20, m13, m24
+ pmulhrsw m21, m13, m25
+ paddsw m25, m12, m14 ; out31 out30
+ psubsw m14, m12, m14 ; out32 out33
+ paddsw m24, m11, m15 ; out28 out29
+ psubsw m15, m11, m15 ; out35 out34
+ REPX {pmulhrsw x, m13}, m4, m5, m6, m7
+ paddsw m23, m9, m16 ; out27 out26
+ psubsw m16, m9, m16 ; out36 out37
+ paddsw m22, m8, m17 ; out24 out25
+ psubsw m17, m8, m17 ; out39 out38
+ REPX {pmulhrsw x, m13}, m14, m15, m16, m17
+.transpose_round:
+%macro TRANSPOSE_8x4_PACKED 4
+ punpckhwd m8, m%1, m%3 ; b0 f0 b1 f1 b2 f2 b3 f3
+ punpcklwd m%1, m%3 ; a0 e0 a1 e1 a2 e2 a3 e3
+ punpcklwd m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3
+ punpckhwd m%2, m%4 ; c0 g0 c1 g1 c2 g2 c3 g3
+ punpckhwd m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3
+ punpcklwd m%1, m%2 ; a0 c0 e0 g0 a1 c1 e1 g1
+ punpckhwd m%2, m8, m%3 ; b2 d2 f2 h2 b3 d3 f3 h3
+ punpcklwd m8, m%3 ; b0 d0 f0 h0 b1 d1 f1 h1
+ punpcklwd m%3, m%4, m%2 ; 2
+ punpckhwd m%4, m%2 ; 3
+ punpckhwd m%2, m%1, m8 ; 1
+ punpcklwd m%1, m8 ; 0
+%endmacro
+ TRANSPOSE_8x4_PACKED 0, 1, 2, 3
+ TRANSPOSE_8x4_PACKED 18, 19, 20, 21
+ TRANSPOSE_8x4_PACKED 4, 5, 6, 7
+ TRANSPOSE_8x4_PACKED 14, 15, 16, 17
+ vshufi32x4 m8, m0, m4, q3232 ; a02 a03 b02 b03
+ vinserti32x8 m0, ym4, 1 ; a00 a01 b00 b01
+ vshufi32x4 m4, m1, m5, q3232 ; a12 a13 b12 b13
+ vinserti32x8 m9, m1, ym5, 1 ; a10 a11 b10 b11
+ vshufi32x4 m5, m2, m6, q3232 ; a22 a23 b22 b23
+ vinserti32x8 m1, m2, ym6, 1 ; a20 a21 b20 b21
+ vshufi32x4 m6, m3, m7, q3232 ; a32 a33 b32 b33
+ vinserti32x8 m11, m3, ym7, 1 ; a30 a31 b30 b31
+ vshufi32x4 m2, m14, m18, q3232 ; c02 c03 d02 d03
+ vinserti32x8 m3, m14, ym18, 1 ; c00 c01 d00 d01
+ vshufi32x4 m18, m15, m19, q3232 ; c12 c13 d12 d13
+ vinserti32x8 m15, ym19, 1 ; c10 c11 d10 d11
+ vshufi32x4 m19, m16, m20, q3232 ; c22 c23 d22 d23
+ vinserti32x8 m16, ym20, 1 ; c20 c21 d20 d21
+ vshufi32x4 m20, m17, m21, q3232 ; c32 c33 d32 d33
+ vinserti32x8 m17, ym21, 1 ; c30 c31 d30 d31
+ ret
+.pass2:
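+ ; regroup the transposed blocks so that m0-m7 hold the even-numbered rows
+ ; and m14-m21 the odd ones (see the per-line indices) before handing off to
+ ; the shared second-pass kernels below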
+ vshufi32x4 m7, m5, m19, q3131 ; 14
+ vshufi32x4 m5, m19, q2020 ; 10
+ vshufi32x4 m21, m6, m20, q3131 ; 15
+ vshufi32x4 m19, m6, m20, q2020 ; 11
+ vshufi32x4 m20, m4, m18, q3131 ; 13
+ vshufi32x4 m18, m4, m18, q2020 ; 9
+ vshufi32x4 m6, m8, m2, q3131 ; 12
+ vshufi32x4 m4, m8, m2, q2020 ; 8
+ vshufi32x4 m2, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m1, m16, q3131 ; 6
+ vshufi32x4 m1, m16, q2020 ; 2
+ vshufi32x4 m16, m9, m15, q3131 ; 5
+ vshufi32x4 m14, m9, m15, q2020 ; 1
+ vshufi32x4 m15, m11, m17, q2020 ; 3
+ vshufi32x4 m17, m11, m17, q3131 ; 7
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
+ jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+
+cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 9, 30, 64*32, dst, stride, c, eob
+ vpbroadcastd m23, [o(pw_2896x8)]
+%undef cmp
+ cmp eobd, 136
+ jb .fast
+ pmulhrsw m5, m23, [cq+64*20]
+ pmulhrsw m3, m23, [cq+64*12]
+ pmulhrsw m1, m23, [cq+64* 4]
+ pmulhrsw m7, m23, [cq+64*28]
+ pmulhrsw m2, m23, [cq+64* 8]
+ pmulhrsw m6, m23, [cq+64*24]
+ pmulhrsw m0, m23, [cq+64* 0]
+ pmulhrsw m4, m23, [cq+64*16]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ pmulhrsw m14, m23, [cq+64* 2]
+ pmulhrsw m21, m23, [cq+64*30]
+ pmulhrsw m18, m23, [cq+64*18]
+ pmulhrsw m17, m23, [cq+64*14]
+ pmulhrsw m16, m23, [cq+64*10]
+ pmulhrsw m19, m23, [cq+64*22]
+ pmulhrsw m20, m23, [cq+64*26]
+ pmulhrsw m15, m23, [cq+64* 6]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ pmulhrsw m22, m23, [cq+64* 1]
+ pmulhrsw m21, m23, [cq+64*31]
+ pmulhrsw m14, m23, [cq+64*17]
+ pmulhrsw m29, m23, [cq+64*15]
+ pmulhrsw m26, m23, [cq+64* 9]
+ pmulhrsw m17, m23, [cq+64*23]
+ pmulhrsw m18, m23, [cq+64*25]
+ pmulhrsw m25, m23, [cq+64* 7]
+ pmulhrsw m24, m23, [cq+64* 5]
+ pmulhrsw m19, m23, [cq+64*27]
+ pmulhrsw m16, m23, [cq+64*21]
+ pmulhrsw m27, m23, [cq+64*11]
+ pmulhrsw m28, m23, [cq+64*13]
+ pmulhrsw m15, m23, [cq+64*19]
+ pmulhrsw m20, m23, [cq+64*29]
+ pmulhrsw m23, [cq+64* 3]
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+ vpbroadcastd m12, [o(pw_16384)]
+ psubsw m13, m0, m29 ; 31
+ paddsw m0, m29 ; 0
+ psubsw m29, m1, m28 ; 30
+ paddsw m1, m28 ; 1
+ psubsw m28, m2, m27 ; 29
+ paddsw m2, m27 ; 2
+ psubsw m27, m3, m26 ; 28
+ paddsw m3, m26 ; 3
+ psubsw m26, m4, m25 ; 27
+ paddsw m4, m25 ; 4
+ psubsw m25, m5, m24 ; 26
+ paddsw m5, m24 ; 5
+ psubsw m24, m6, m23 ; 25
+ paddsw m6, m23 ; 6
+ psubsw m23, m7, m22 ; 24
+ paddsw m7, m22 ; 7
+ pxor m9, m9
+ punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3
+ REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
+ punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
+ punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3
+ punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
+ punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3
+ REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
+ punpckhwd m3, m23, m24
+ punpcklwd m23, m24
+ punpckhwd m24, m25, m26
+ punpcklwd m25, m26
+ REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
+ punpckhwd m26, m27, m28
+ punpcklwd m27, m28
+ punpckhwd m28, m29, m13
+ punpcklwd m29, m13
+ REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
+ punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1
+ REPX {pmulhrsw x, m12}, m7, m0, m2, m4
+ punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7
+ punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
+ punpckldq m22, m5 ; e4 f4 g4 h4 e5 f5 g5 h5
+ REPX {pmulhrsw x, m12}, m6, m8, m1, m22
+ punpckhdq m13, m23, m25
+ punpckldq m23, m25
+ punpckhdq m25, m27, m29
+ punpckldq m27, m29
+ REPX {pmulhrsw x, m12}, m13, m23, m25, m27
+ punpckhdq m9, m3, m24
+ punpckldq m3, m24
+ punpckhdq m24, m26, m28
+ punpckldq m26, m28
+ REPX {pmulhrsw x, m12}, m9, m3, m24, m26
+ punpckhqdq m5, m23, m27 ; d01 d09 d17 d25
+ punpcklqdq m23, m27 ; d00 d08 d16 d24
+ punpcklqdq m27, m13, m25 ; d02 d10 d18 d26
+ punpckhqdq m13, m25 ; d03 d11 d19 d27
+ punpcklqdq m25, m3, m26 ; d04 d12 d20 d28
+ punpckhqdq m3, m26 ; d05 d13 d21 d29
+ punpcklqdq m26, m9, m24 ; d06 d14 d22 d30
+ punpckhqdq m9, m24 ; d07 d15 d23 d31
+ mova [cq+64* 3], m23
+ mova [cq+64*13], m27
+ mova [cq+64* 7], m25
+ mova [cq+64*15], m26
+ punpckhqdq m24, m8, m22 ; a05 a13 a21 a29
+ punpcklqdq m8, m22 ; a04 a12 a20 a28
+ punpckhqdq m22, m0, m4 ; a01 a09 a17 a25
+ punpcklqdq m0, m4 ; a00 a08 a16 a24
+ punpckhqdq m23, m7, m2 ; a03 a11 a19 a27
+ punpcklqdq m7, m2 ; a02 a10 a18 a26
+ punpckhqdq m25, m6, m1 ; a07 a15 a23 a31
+ punpcklqdq m6, m1 ; a06 a14 a22 a30
+ mova [cq+64* 1], m0
+ mova [cq+64* 9], m7
+ mova [cq+64* 5], m8
+ mova [cq+64*11], m6
+ mova m2, [cq+64* 0]
+ mova m11, [cq+64* 2]
+ mova m8, [cq+64* 4]
+ mova m29, [cq+64* 6]
+ mova m27, [cq+64* 8]
+ mova m26, [cq+64*10]
+ mova m4, [cq+64*12]
+ mova m28, [cq+64*14]
+ psubsw m1, m2, m21 ; 23
+ paddsw m2, m21 ; 8
+ psubsw m21, m11, m20 ; 22
+ paddsw m11, m20 ; 9
+ psubsw m20, m8, m19 ; 21
+ paddsw m8, m19 ; 10
+ psubsw m19, m29, m18 ; 20
+ paddsw m29, m18 ; 11
+ psubsw m18, m27, m17 ; 19
+ paddsw m27, m17 ; 12
+ psubsw m17, m26, m16 ; 18
+ paddsw m26, m16 ; 13
+ psubsw m16, m4, m15 ; 17
+ paddsw m4, m15 ; 14
+ psubsw m15, m28, m14 ; 16
+ paddsw m28, m14 ; 15
+ punpcklwd m14, m15, m16
+ punpckhwd m15, m16
+ punpckhwd m16, m17, m18
+ punpcklwd m17, m18
+ punpckhwd m18, m19, m20
+ punpcklwd m19, m20
+ punpckhwd m20, m21, m1
+ punpcklwd m21, m1
+ punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7
+ punpcklwd m2, m11 ; i0 j0 i1 j1 i2 j2 i3 j3
+ punpckhwd m11, m8, m29 ; k4 l4 k5 l5 k6 l6 k7 l7
+ punpcklwd m8, m29 ; k0 l0 k1 l1 k2 l2 k3 l3
+ punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
+ punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3
+ punpckhwd m26, m4, m28 ; o4 p4 o5 p5 o6 p6 o7 p7
+ punpcklwd m4, m28 ; o0 p0 o1 p1 o2 p2 o3 p3
+ punpckhdq m28, m2, m8 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpckldq m2, m8 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m8, m27, m4 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpckldq m27, m4 ; m0 n0 o0 p0 m1 n1 o1 p1
+ REPX {pmulhrsw x, m12}, m28, m2, m8, m27
+ punpckhdq m4, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7
+ punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5
+ punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
+ punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5
+ REPX {pmulhrsw x, m12}, m4, m1, m11, m29
+ punpckhdq m26, m19, m21
+ punpckldq m19, m21
+ punpckhdq m21, m15, m16
+ punpckldq m15, m16
+ REPX {pmulhrsw x, m12}, m26, m19, m21, m15
+ punpckhdq m16, m18, m20
+ punpckldq m18, m20
+ punpckhdq m20, m14, m17
+ punpckldq m14, m17
+ REPX {pmulhrsw x, m12}, m16, m18, m20, m14
+ punpckhqdq m17, m28, m8 ; b03 b11 b19 b27
+ punpcklqdq m28, m8 ; b02 b10 b18 b26
+ punpckhqdq m8, m2, m27 ; b01 b09 b17 b25
+ punpcklqdq m2, m27 ; b00 b08 b16 b24
+ punpcklqdq m27, m1, m29 ; b04 b12 b20 b28
+ punpckhqdq m1, m29 ; b05 b13 b21 b29
+ punpcklqdq m29, m4, m11 ; b06 b14 b22 b30
+ punpckhqdq m4, m11 ; b07 b15 b23 b31
+ mova [cq+64* 0], m2
+ mova [cq+64* 8], m28
+ mova [cq+64* 4], m27
+ mova [cq+64*10], m29
+ punpckhqdq m27, m20, m26 ; c03 c11 c19 c27
+ punpcklqdq m20, m26 ; c02 c10 c18 c26
+ punpckhqdq m26, m14, m19 ; c01 c09 c17 c25
+ punpcklqdq m14, m19 ; c00 c08 c16 c24
+ punpckhqdq m28, m15, m18 ; c05 c13 c21 c29
+ punpcklqdq m15, m18 ; c04 c12 c20 c28
+ punpckhqdq m29, m21, m16 ; c07 c15 c23 c31
+ punpcklqdq m21, m16 ; c06 c14 c22 c30
+ mova [cq+64* 2], m14
+ mova [cq+64*12], m20
+ mova [cq+64* 6], m15
+ mova [cq+64*14], m21
+ vshufi32x4 m14, m22, m8, q3232 ; a17 a25 b17 b25
+ vinserti32x8 m22, ym8, 1 ; a01 a09 b01 b09
+ vshufi32x4 m15, m23, m17, q3232 ; a19 a27 b19 b27
+ vinserti32x8 m23, ym17, 1 ; a03 a11 b03 b11
+ vshufi32x4 m16, m24, m1, q3232 ; a21 a29 b21 b29
+ vinserti32x8 m24, ym1, 1 ; a05 a13 b05 b13
+ vshufi32x4 m17, m25, m4, q3232 ; a23 a31 b23 b31
+ vinserti32x8 m25, ym4, 1 ; a07 a15 b07 b15
+ vinserti32x8 m19, m26, ym5, 1 ; c01 c09 d01 d09
+ vshufi32x4 m26, m5, q3232 ; c17 c25 d17 d25
+ vinserti32x8 m20, m27, ym13, 1 ; c03 c11 d03 d11
+ vshufi32x4 m27, m13, q3232 ; c19 c27 d19 d27
+ vinserti32x8 m21, m28, ym3, 1 ; c05 c13 d05 d13
+ vshufi32x4 m28, m3, q3232 ; c21 c29 d21 d29
+ vinserti32x8 m18, m29, ym9, 1 ; c07 c15 d07 d15
+ vshufi32x4 m29, m9, q3232 ; c23 c31 d23 d31
+ mov r4, rsp
+ vshufi32x4 m0, m22, m19, q2020 ; 1
+ vshufi32x4 m1, m17, m29, q3131 ; 31
+ vshufi32x4 m2, m14, m26, q2020 ; 17
+ vshufi32x4 m3, m25, m18, q3131 ; 15
+ call .main_part1
+ vshufi32x4 m0, m25, m18, q2020 ; 7
+ vshufi32x4 m1, m14, m26, q3131 ; 25
+ vshufi32x4 m2, m17, m29, q2020 ; 23
+ vshufi32x4 m3, m22, m19, q3131 ; 9
+ call .main_part1
+ vshufi32x4 m0, m24, m21, q2020 ; 5
+ vshufi32x4 m1, m15, m27, q3131 ; 27
+ vshufi32x4 m2, m16, m28, q2020 ; 21
+ vshufi32x4 m3, m23, m20, q3131 ; 11
+ call .main_part1
+ vshufi32x4 m0, m23, m20, q2020 ; 3
+ vshufi32x4 m1, m16, m28, q3131 ; 29
+ vshufi32x4 m2, m15, m27, q2020 ; 19
+ vshufi32x4 m3, m24, m21, q3131 ; 13
+ call .main_part1
+ call .main_part2
+ mova m0, [cq+64* 1] ; a0
+ mova m15, [cq+64* 0] ; b0
+ mova m3, [cq+64* 2] ; c0
+ mova m16, [cq+64* 3] ; d0
+ mova m14, [cq+64* 5] ; a4
+ mova m8, [cq+64* 4] ; b4
+ mova m17, [cq+64* 6] ; c4
+ mova m1, [cq+64* 7] ; d4
+ vshufi32x4 m2, m0, m15, q3232 ; a16 a24 b16 b24
+ vinserti32x8 m0, ym15, 1 ; a00 a08 b00 b08
+ vshufi32x4 m15, m3, m16, q3232 ; c16 c24 d16 d24
+ vinserti32x8 m3, ym16, 1 ; c00 c08 d00 d08
+ vshufi32x4 m16, m14, m8, q3232 ; a20 a28 b20 b28
+ vinserti32x8 m14, ym8, 1 ; a04 a12 b04 b12
+ vshufi32x4 m8, m17, m1, q3232 ; c20 c28 d20 d28
+ vinserti32x8 m17, ym1, 1 ; c04 c12 d04 d12
+ vshufi32x4 m1, m0, m3, q3131 ; 8
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m2, m15, q3131 ; 24
+ vshufi32x4 m2, m15, q2020 ; 16
+ vshufi32x4 m15, m14, m17, q3131 ; 12
+ vshufi32x4 m14, m17, q2020 ; 4
+ vshufi32x4 m17, m16, m8, q3131 ; 28
+ vshufi32x4 m16, m8, q2020 ; 20
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ mova m8, [cq+64* 8]
+ mova m9, [cq+64*12]
+ mova m11, [cq+64*10]
+ mova m12, [cq+64*14]
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ mova m22, [cq+64* 9]
+ mova m27, [cq+64*13]
+ mova m23, [cq+64*11]
+ mova m24, [cq+64*15]
+ vshufi32x4 m26, m22, m8, q3232 ; a18 a26 b18 b26
+ vinserti32x8 m22, ym8, 1 ; a02 a10 b02 b10
+ vshufi32x4 m8, m9, m27, q3232 ; c18 c26 d18 d26
+ vinserti32x8 m9, ym27, 1 ; c02 c10 d02 d10
+ vshufi32x4 m27, m23, m11, q3232 ; a22 a30 b22 b30
+ vinserti32x8 m23, ym11, 1 ; a06 a14 b06 b14
+ vshufi32x4 m11, m12, m24, q3232 ; c22 c30 d22 d30
+ vinserti32x8 m12, ym24, 1 ; c06 c14 d06 d14
+ vshufi32x4 m28, m26, m8, q3131 ; 26
+ vshufi32x4 m26, m8, q2020 ; 18
+ vshufi32x4 m24, m22, m9, q3131 ; 10
+ vshufi32x4 m22, m9, q2020 ; 2
+ vshufi32x4 m29, m27, m11, q3131 ; 30
+ vshufi32x4 m27, m11, q2020 ; 22
+ vshufi32x4 m25, m23, m12, q3131 ; 14
+ vshufi32x4 m23, m12, q2020 ; 6
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ jmp .end
+.fast: ; bottom/right halves are zero
+ pmulhrsw ym9, ym23, [cq+64* 0]
+ pmulhrsw ym6, ym23, [cq+64* 8]
+ mova m14, [o(dup16_perm)]
+ pmulhrsw ym8, ym23, [cq+64* 2]
+ pmulhrsw xm0, xm23, [cq+64*14]
+ pmulhrsw xm5, xm23, [cq+64*10]
+ pmulhrsw ym1, ym23, [cq+64* 6]
+ pmulhrsw ym7, ym23, [cq+64* 4]
+ pmulhrsw xm3, xm23, [cq+64*12]
+ pmovzxwd m9, ym9
+ pmovzxwd m6, ym6
+ vpermb m8, m14, m8
+ punpcklwd xm0, xm0
+ vpermb ym5, ym14, ym5
+ vpermb m1, m14, m1
+ vpermb m7, m14, m7
+ punpcklwd xm3, xm3
+ pslld m9, 16
+ pslld m6, 16
+ call m(idct_16x16_internal_8bpc).main_fast
+ vpmulhrsw ym21, ym23, [cq+64* 1]
+ {evex}vpmulhrsw xm17, xm23, [cq+64*15] ; force EVEX encoding, which
+ {evex}vpmulhrsw xm20, xm23, [cq+64* 9] ; reduces code size due to
+ {evex}vpmulhrsw ym15, ym23, [cq+64* 7] ; compressed displacements
+ {evex}vpmulhrsw ym18, ym23, [cq+64* 5]
+ {evex}vpmulhrsw xm16, xm23, [cq+64*11]
+ {evex}vpmulhrsw xm19, xm23, [cq+64*13]
+ {evex}vpmulhrsw ym23, [cq+64* 3]
+ vpermb m21, m14, m21
+ punpcklwd xm17, xm17
+ vpermb ym20, ym14, ym20
+ vpermb m15, m14, m15
+ vpermb m18, m14, m18
+ vpermb ym16, ym14, ym16
+ punpcklwd xm19, xm19
+ vpermb m14, m14, m23
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m9, [o(pw_16384)]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
+ vshufi32x4 m16, m0, m3, q2020 ; 0
+ vshufi32x4 m26, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m14, m2, q2020 ; 1
+ vshufi32x4 m14, m2, q3131 ; 5
+ vshufi32x4 m3, m19, m7, q3131 ; 15
+ vshufi32x4 m19, m7, q2020 ; 11
+ vshufi32x4 m27, m17, m9, q2020 ; 3
+ vshufi32x4 m17, m9, q3131 ; 7
+ vshufi32x4 m28, m20, m6, q2020 ; 9
+ vshufi32x4 m20, m6, q3131 ; 13
+ vshufi32x4 m22, m1, m18, q2020 ; 2
+ vshufi32x4 m23, m1, m18, q3131 ; 6
+ vshufi32x4 m24, m5, m15, q2020 ; 10
+ vshufi32x4 m25, m5, m15, q3131 ; 14
+ vshufi32x4 m15, m21, m4, q3131 ; 12
+ vshufi32x4 m21, m21, m4, q2020 ; 8
+ mov r4, rsp
+ call .main_part1_fast
+ mova m0, m17
+ mova m3, m28
+ call .main_part1_fast
+ mova m0, m14
+ mova m3, m19
+ call .main_part1_fast
+ mova m0, m27
+ mova m3, m20
+ call .main_part1_fast
+ call .main_part2
+ mova m0, m16
+ mova m1, m21
+ mova m14, m26
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
+ mova [cq+64*14], m21
+ mova [cq+64* 0], m14
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64* 4], m16
+ mova [cq+64* 2], m15
+ mova [cq+64*12], m20
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
+.end:
+ lea r4, [strideq*3]
+ vpbroadcastd m12, [o(pw_2048)]
+ movshdup m13, [o(permD)]
+ lea r5, [r4+strideq] ; stride*4
+ lea r3, [dstq+r4*8]
+ lea r6, [strideq+r5*8] ; stride*33
+ lea r8, [r4+r5*8] ; stride*35
+ add r3, r5 ; dst+stride*28
+ lea r7, [r6+strideq] ; stride*34
+%macro IDCT_32x64_END 6 ; src, mem, stride[1-4]
+%if %2 < 8
+ paddsw m10, m%2, m%1
+ psubsw m11, m%2, m%1
+%else
+ mova m11, [cq+64*(%2*2-16)]
+ paddsw m10, m11, m%1
+ psubsw m11, m%1
+%endif
+ mova m9, [rsp+64*(31-%2)]
+ mova m%1, [rsp+64*%2]
+ paddsw m8, m10, m9
+ psubsw m10, m9
+ paddsw m9, m11, m%1
+ pmovzxbw m0, [dstq+%3]
+ psubsw m11, m%1
+ pmovzxbw m%1, [r3 +%4]
+ REPX {pmulhrsw x, m12}, m8, m10, m9, m11
+ paddw m8, m0
+ pmovzxbw m0, [r3 +%5]
+ paddw m10, m%1
+ pmovzxbw m%1, [dstq+%6]
+ paddw m9, m0
+ paddw m11, m%1
+%if %2 >= 8
+%if %2 == 8
+ pxor m1, m1
+%endif
+ mova [cq+64*(%2*2-16)], m1
+ mova [cq+64*(%2*2-15)], m1
+%endif
+ packuswb m8, m10
+ packuswb m9, m11
+ vpermq m8, m13, m8
+ vpermq m9, m13, m9
+ mova [dstq+%3], ym8
+ vextracti32x8 [r3 +%4], m8, 1
+ mova [r3 +%5], ym9
+ vextracti32x8 [dstq+%6], m9, 1
+%if %2 == 3 || %2 == 7 || %2 == 11
+ add dstq, r5
+ sub r3, r5
+%endif
+%endmacro
+ IDCT_32x64_END 29, 0, strideq*0, r8, r4 , r5*8
+ IDCT_32x64_END 28, 1, strideq*1, r7, strideq*2, r6
+ IDCT_32x64_END 27, 2, strideq*2, r6, strideq*1, r7
+ IDCT_32x64_END 26, 3, r4 , r5*8, strideq*0, r8
+ IDCT_32x64_END 25, 4, strideq*0, r8, r4 , r5*8
+ IDCT_32x64_END 24, 5, strideq*1, r7, strideq*2, r6
+ IDCT_32x64_END 23, 6, strideq*2, r6, strideq*1, r7
+ IDCT_32x64_END 22, 7, r4 , r5*8, strideq*0, r8
+ IDCT_32x64_END 21, 8, strideq*0, r8, r4 , r5*8
+ IDCT_32x64_END 20, 9, strideq*1, r7, strideq*2, r6
+ IDCT_32x64_END 19, 10, strideq*2, r6, strideq*1, r7
+ IDCT_32x64_END 18, 11, r4 , r5*8, strideq*0, r8
+ IDCT_32x64_END 17, 12, strideq*0, r8, r4 , r5*8
+ IDCT_32x64_END 16, 13, strideq*1, r7, strideq*2, r6
+ IDCT_32x64_END 15, 14, strideq*2, r6, strideq*1, r7
+ IDCT_32x64_END 14, 15, r4 , r5*8, strideq*0, r8
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 64
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
+ALIGN function_align ; bottom three-quarters are zero
+cglobal_label .main_part1_fast2
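+ ; only the first input of the quartet is non-zero, so the stage-1 add/sub
+ ; pairs drop out and the remaining rotation is done directly with vpdpwssd
+ ; dot products before joining the common .main_part1c tail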
+ vpbroadcastd m7, [o(idct64_mul+4*0)]
+ vpbroadcastd m8, [o(idct64_mul+4*1)]
+ pmulhrsw m7, m0 ; t63a
+ pmulhrsw m0, m8 ; t32a
+
+ punpcklwd m4, m0, m7
+ punpckhwd m6, m0, m7
+ mova m1, m10
+ vpdpwssd m1, m4, [o(idct64_mul+4*9)] {bcstd}
+ mova m9, m10
+ vpdpwssd m9, m6, [o(idct64_mul+4*9)] {bcstd}
+ REPX {psrad x, 12}, m1, m9
+ packssdw m1, m9
+ mova m9, m10
+ vpdpwssd m9, m6, [o(idct64_mul+4*8)] {bcstd}
+ mova m6, m10
+ vpdpwssd m6, m4, [o(idct64_mul+4*8)] {bcstd}
+ REPX {psrad x, 12}, m9, m6
+ packssdw m6, m9
+
+ mova m4, m0
+ mova m3, m7
+ mova m5, m1
+ mova m2, m6
+ jmp .main_part1c
+cglobal_label .main_part1_fast
+ vpbroadcastd m1, [o(idct64_mul+4*0)]
+ vpbroadcastd m8, [o(idct64_mul+4*1)]
+ vpbroadcastd m2, [o(idct64_mul+4*6)]
+ vpbroadcastd m9, [o(idct64_mul+4*7)]
+ pmulhrsw m1, m0 ; t63a
+ pmulhrsw m0, m8 ; t32a
+ pmulhrsw m2, m3 ; t60a
+ pmulhrsw m3, m9 ; t35a
+ mova m8, m0
+ mova m7, m1
+ mova m6, m3
+ mova m5, m2
+ jmp .main_part1b
+cglobal_label .main_part1
+ ; idct64 steps 1-5:
+ ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
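+ ; called once per quartet above: r5 steps 4*13 bytes through the idct64_mul
+ ; table (in .main_part1c) and r4 steps 8 rows through the scratch buffer,
+ ; so that .main_part2 can later walk all 32 stored rows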
+ vpbroadcastd m7, [o(idct64_mul+4*0)]
+ vpbroadcastd m8, [o(idct64_mul+4*1)]
+ vpbroadcastd m6, [o(idct64_mul+4*2)]
+ vpbroadcastd m9, [o(idct64_mul+4*3)]
+ pmulhrsw m7, m0 ; t63a
+ vpbroadcastd m5, [o(idct64_mul+4*4)]
+ pmulhrsw m0, m8 ; t32a
+ vpbroadcastd m8, [o(idct64_mul+4*5)]
+ pmulhrsw m6, m1 ; t62a
+ vpbroadcastd m4, [o(idct64_mul+4*6)]
+ pmulhrsw m1, m9 ; t33a
+ vpbroadcastd m9, [o(idct64_mul+4*7)]
+ pmulhrsw m5, m2 ; t61a
+ pmulhrsw m2, m8 ; t34a
+ pmulhrsw m4, m3 ; t60a
+ pmulhrsw m3, m9 ; t35a
+ psubsw m8, m0, m1 ; t33
+ paddsw m0, m1 ; t32
+ psubsw m1, m7, m6 ; t62
+ paddsw m7, m6 ; t63
+ psubsw m6, m3, m2 ; t34
+ paddsw m3, m2 ; t35
+ psubsw m2, m4, m5 ; t61
+ paddsw m5, m4 ; t60
+.main_part1b:
+ vpbroadcastd m11, [o(idct64_mul+4*8)]
+ vpbroadcastd m12, [o(idct64_mul+4*9)]
+ ITX_MULSUB_2W 1, 8, 4, 9, 10, 11, 12 ; t33a, t62a
+ vpbroadcastd m11, [o(idct64_mul+4*10)]
+ ITX_MULSUB_2W 2, 6, 4, 9, 10, 12, 11 ; t34a, t61a
+ psubsw m4, m0, m3 ; t35a
+ paddsw m0, m3 ; t32a
+ psubsw m3, m7, m5 ; t60a
+ paddsw m7, m5 ; t63a
+ psubsw m5, m1, m2 ; t34
+ paddsw m1, m2 ; t33
+ psubsw m2, m8, m6 ; t61
+ paddsw m6, m8 ; t62
+.main_part1c:
+ vpbroadcastd m11, [o(idct64_mul+4*11)]
+ vpbroadcastd m12, [o(idct64_mul+4*12)]
+ add r5, 4*13
+ ITX_MULSUB_2W 3, 4, 8, 9, 10, 11, 12 ; t35, t60
+ ITX_MULSUB_2W 2, 5, 8, 9, 10, 11, 12 ; t34a, t61a
+ mova [r4+64*0], m0
+ mova [r4+64*7], m7
+ mova [r4+64*1], m1
+ mova [r4+64*6], m6
+ mova [r4+64*3], m3
+ mova [r4+64*4], m4
+ mova [r4+64*2], m2
+ mova [r4+64*5], m5
+ add r4, 64*8
+ ret
+cglobal_label .main_part2
+ vpbroadcastd m11, [o(pw_1567_3784 -16*13)]
+ vpbroadcastd m12, [o(pw_m3784_1567 -16*13)]
+ lea r6, [r4+64*7]
+ vpbroadcastd m17, [o(pw_m1567_m3784-16*13)]
+ vpbroadcastd m18, [o(pw_2896_2896 -16*13)]
+ vpbroadcastd m19, [o(pw_m2896_2896 -16*13)]
+ sub r5, 16*13
+.main_part2_loop:
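+ ; walk r4 forward and r6 backward through the rows stored by .main_part1,
+ ; combining each mirrored pair (t32+n with t63-n, t40+n with t55-n, ...)
+ ; and applying the final rotations in place until the pointers meet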
+ mova m0, [r4-64*32] ; t32a
+ mova m1, [r6-64*24] ; t39a
+ mova m2, [r6-64*32] ; t63a
+ mova m3, [r4-64*24] ; t56a
+ mova m4, [r4-64*16] ; t40a
+ mova m5, [r6-64* 8] ; t47a
+ mova m6, [r6-64*16] ; t55a
+ mova m7, [r4-64* 8] ; t48a
+ psubsw m8, m0, m1 ; t39
+ paddsw m0, m1 ; t32
+ psubsw m1, m2, m3 ; t56
+ paddsw m2, m3 ; t63
+ psubsw m3, m5, m4 ; t40
+ paddsw m5, m4 ; t47
+ psubsw m4, m7, m6 ; t55
+ paddsw m7, m6 ; t48
+ ITX_MULSUB_2W 1, 8, 6, 9, 10, 11, 12 ; t39a, t56a
+ ITX_MULSUB_2W 4, 3, 6, 9, 10, 12, 17 ; t40a, t55a
+ psubsw m6, m2, m7 ; t48a
+ paddsw m2, m7 ; t63a
+ psubsw m7, m0, m5 ; t47a
+ paddsw m0, m5 ; t32a
+ psubsw m5, m8, m3 ; t55
+ paddsw m8, m3 ; t56
+ psubsw m3, m1, m4 ; t40
+ paddsw m1, m4 ; t39
+ ITX_MULSUB_2W 6, 7, 4, 9, 10, 18, 19 ; t47, t48
+ ITX_MULSUB_2W 5, 3, 4, 9, 10, 18, 19 ; t40a, t55a
+ mova [r6-64* 8], m2
+ mova [r4-64*32], m0
+ mova [r4-64* 8], m8
+ mova [r6-64*32], m1
+ mova [r6-64*24], m6
+ mova [r4-64*16], m7
+ mova [r4-64*24], m5
+ mova [r6-64*16], m3
+ add r4, 64
+ sub r6, 64
+ cmp r4, r6
+ jb .main_part2_loop
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 7, 30, 64*32, dst, stride, c, eob
+ vpbroadcastd m23, [o(pw_2896x8)]
+%undef cmp
+ cmp eobd, 136
+ jb .fast
+ pmulhrsw m0, m23, [cq+64* 1]
+ pmulhrsw m1, m23, [cq+64*31]
+ pmulhrsw m2, m23, [cq+64*17]
+ pmulhrsw m3, m23, [cq+64*15]
+ vpbroadcastd m10, [o(pd_2048)]
+ mov r4, rsp
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ pmulhrsw m0, m23, [cq+64* 7]
+ pmulhrsw m1, m23, [cq+64*25]
+ pmulhrsw m2, m23, [cq+64*23]
+ pmulhrsw m3, m23, [cq+64* 9]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ pmulhrsw m0, m23, [cq+64* 5]
+ pmulhrsw m1, m23, [cq+64*27]
+ pmulhrsw m2, m23, [cq+64*21]
+ pmulhrsw m3, m23, [cq+64*11]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ pmulhrsw m0, m23, [cq+64* 3]
+ pmulhrsw m1, m23, [cq+64*29]
+ pmulhrsw m2, m23, [cq+64*19]
+ pmulhrsw m3, m23, [cq+64*13]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+ pmulhrsw m3, m23, [cq+64*24]
+ pmulhrsw m1, m23, [cq+64* 8]
+ pmulhrsw m2, m23, [cq+64*16]
+ pmulhrsw m0, m23, [cq+64* 0]
+ pmulhrsw m14, m23, [cq+64* 4]
+ pmulhrsw m17, m23, [cq+64*28]
+ pmulhrsw m16, m23, [cq+64*20]
+ pmulhrsw m15, m23, [cq+64*12]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ pmulhrsw m22, m23, [cq+64* 2]
+ pmulhrsw m29, m23, [cq+64*30]
+ pmulhrsw m26, m23, [cq+64*18]
+ pmulhrsw m25, m23, [cq+64*14]
+ pmulhrsw m24, m23, [cq+64*10]
+ pmulhrsw m27, m23, [cq+64*22]
+ pmulhrsw m28, m23, [cq+64*26]
+ pmulhrsw m23, [cq+64* 6]
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m13, [o(pw_16384)]
+ call .pass1_end_part1
+ mova [cq+64*16], m1
+ mova [cq+64*17], m3
+ mova [cq+64*18], m5
+ mova [cq+64*19], m7
+ mova [cq+64*24], m23
+ mova [cq+64*25], m25
+ mova [cq+64*26], m27
+ mova [cq+64*27], m29
+ pmulhrsw m23, m13, m0 ; a0
+ pmulhrsw m25, m13, m2 ; a2
+ pmulhrsw m27, m13, m4 ; a4
+ pmulhrsw m29, m13, m6 ; a6
+ REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6
+ call .pass1_end_part2
+ mova [cq+64*20], m15
+ mova [cq+64*21], m17
+ mova [cq+64*22], m19
+ mova [cq+64*23], m21
+ mova [cq+64*28], m1
+ mova [cq+64*29], m3
+ mova [cq+64*30], m5
+ mova [cq+64*31], m7
+ REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6
+ REPX {pmulhrsw x, m13}, m0, m2, m4, m6 ; g0 g2 g4 g6
+ vinserti32x8 m3, m23, ym14, 1 ; a00 a01 c00 c01
+ vshufi32x4 m23, m14, q3232 ; a02 a03 c02 c03
+ vinserti32x8 m15, m22, ym0, 1 ; e00 e01 g00 g01
+ vshufi32x4 m22, m0, q3232 ; e02 e03 g02 g03
+ vinserti32x8 m1, m27, ym18, 1 ; a40 a41 c40 c41
+ vshufi32x4 m27, m18, q3232 ; a42 a43 c42 c43
+ vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41
+ vshufi32x4 m26, m4, q3232 ; e42 e43 g42 g43
+ vinserti32x8 m14, m25, ym16, 1 ; a20 a21 c20 c21
+ vshufi32x4 m25, m16, q3232 ; a22 a23 c22 c23
+ vinserti32x8 m17, m24, ym2, 1 ; e20 e21 g20 g21
+ vshufi32x4 m24, m2, q3232 ; e22 e23 g22 g23
+ vinserti32x8 m19, m29, ym20, 1 ; a60 a61 c60 c61
+ vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63
+ vinserti32x8 m20, m28, ym6, 1 ; e60 e61 g60 g61
+ vshufi32x4 m28, m6, q3232 ; e62 e63 g62 g63
+ vshufi32x4 m2, m3, m15, q3131 ; 8
+ vshufi32x4 m0, m3, m15, q2020 ; 0
+ vshufi32x4 m6, m23, m22, q3131 ; 24
+ vshufi32x4 m4, m23, m22, q2020 ; 16
+ vshufi32x4 m3, m1, m18, q3131 ; 12
+ vshufi32x4 m1, m18, q2020 ; 4
+ vshufi32x4 m7, m27, m26, q3131 ; 28
+ vshufi32x4 m5, m27, m26, q2020 ; 20
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ vshufi32x4 m16, m14, m17, q3131 ; 10
+ vshufi32x4 m14, m17, q2020 ; 2
+ vshufi32x4 m17, m19, m20, q3131 ; 14
+ vshufi32x4 m15, m19, m20, q2020 ; 6
+ vshufi32x4 m20, m25, m24, q3131 ; 26
+ vshufi32x4 m18, m25, m24, q2020 ; 18
+ vshufi32x4 m21, m29, m28, q3131 ; 30
+ vshufi32x4 m19, m29, m28, q2020 ; 22
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ pmulhrsw m22, m13, [cq+64*16] ; a1
+ pmulhrsw m23, m13, [cq+64*20] ; c1
+ pmulhrsw m24, m13, [cq+64*24] ; e1
+ pmulhrsw m25, m13, [cq+64*28] ; g1
+ pmulhrsw m26, m13, [cq+64*17] ; a3
+ pmulhrsw m27, m13, [cq+64*21] ; c3
+ pmulhrsw m28, m13, [cq+64*25] ; e3
+ pmulhrsw m29, m13, [cq+64*29] ; g3
+ mova [cq+64* 8], m14
+ mova [cq+64* 9], m15
+ mova [cq+64*10], m16
+ mova [cq+64*11], m17
+ mova [cq+64*12], m18
+ mova [cq+64*13], m19
+ mova [cq+64*14], m20
+ mova [cq+64*15], m21
+ pmulhrsw m14, m13, [cq+64*18] ; a5
+ pmulhrsw m15, m13, [cq+64*22] ; c5
+ pmulhrsw m16, m13, [cq+64*26] ; e5
+ pmulhrsw m17, m13, [cq+64*30] ; g5
+ pmulhrsw m18, m13, [cq+64*19] ; a7
+ pmulhrsw m19, m13, [cq+64*23] ; c7
+ pmulhrsw m20, m13, [cq+64*27] ; e7
+ pmulhrsw m21, m13, [cq+64*31] ; g7
+ vinserti32x8 m8, m22, ym23, 1 ; a10 a11 c10 c11
+ vshufi32x4 m22, m23, q3232 ; a12 a13 c12 c13
+ vinserti32x8 m9, m24, ym25, 1 ; e10 e11 g10 g11
+ vshufi32x4 m24, m25, q3232 ; e12 e13 g12 g13
+ vinserti32x8 m23, m26, ym27, 1 ; a30 a31 c30 c31
+ vshufi32x4 m26, m27, q3232 ; a32 a33 c32 c33
+ vinserti32x8 m11, m28, ym29, 1 ; e30 e31 g30 g31
+ vshufi32x4 m28, m29, q3232 ; e32 e33 g32 g33
+ mova [cq+64* 0], m0
+ mova [cq+64* 1], m1
+ mova [cq+64* 2], m2
+ mova [cq+64* 3], m3
+ mova [cq+64* 4], m4
+ mova [cq+64* 5], m5
+ mova [cq+64* 6], m6
+ mova [cq+64* 7], m7
+ vinserti32x8 m12, m14, ym15, 1 ; a50 a51 c50 c51
+ vshufi32x4 m14, m15, q3232 ; a52 a53 c52 c53
+ vinserti32x8 m13, m16, ym17, 1 ; e50 e51 g50 g51
+ vshufi32x4 m16, m17, q3232 ; e52 e53 g52 g53
+ vinserti32x8 m25, m18, ym19, 1 ; a70 a71 c70 c71
+ vshufi32x4 m18, m19, q3232 ; a72 a73 c72 c73
+ vinserti32x8 m17, m20, ym21, 1 ; e70 e71 g70 g71
+ vshufi32x4 m20, m21, q3232 ; e72 e73 g72 g73
+ vshufi32x4 m27, m23, m11, q3131 ; 11 m27
+ vshufi32x4 m23, m11, q2020 ; 3 m23
+ vshufi32x4 m19, m26, m28, q3131 ; 27 m19
+ vshufi32x4 m15, m26, m28, q2020 ; 19 m15
+ vshufi32x4 m29, m25, m17, q3131 ; 15 m29
+ vshufi32x4 m25, m17, q2020 ; 7 m25
+ vshufi32x4 m21, m18, m20, q3131 ; 31 m21
+ vshufi32x4 m17, m18, m20, q2020 ; 23 m17
+ vshufi32x4 m20, m14, m16, q3131 ; 29 m20
+ vshufi32x4 m16, m14, m16, q2020 ; 21 m16
+ vshufi32x4 m18, m22, m24, q3131 ; 25 m18
+ vshufi32x4 m14, m22, m24, q2020 ; 17 m14
+ vshufi32x4 m26, m8, m9, q3131 ; 9 m26
+ vshufi32x4 m22, m8, m9, q2020 ; 1 m22
+ vshufi32x4 m28, m12, m13, q3131 ; 13 m28
+ vshufi32x4 m24, m12, m13, q2020 ; 5 m24
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+ vpbroadcastd m13, [o(pw_16384)]
+ pmulhrsw m0, m13, [r4-64*21]
+ pmulhrsw m1, m13, [r4-64*22]
+ pmulhrsw m2, m13, [r4-64*23]
+ pmulhrsw m3, m13, [r4-64*24]
+ pmulhrsw m4, m13, [r4-64*25]
+ pmulhrsw m5, m13, [r4-64*26]
+ pmulhrsw m6, m13, [r4-64*27]
+ pmulhrsw m7, m13, [r4-64*28]
+ mova [cq+64*16], m14
+ mova [cq+64*17], m15
+ mova [cq+64*18], m16
+ mova [cq+64*19], m17
+ mova [cq+64*20], m18
+ mova [cq+64*21], m19
+ mova [cq+64*22], m20
+ mova [cq+64*23], m21
+ pmulhrsw m14, m13, [r4-64*12]
+ pmulhrsw m15, m13, [r4-64*11]
+ pmulhrsw m16, m13, [r4-64*10]
+ pmulhrsw m17, m13, [r4-64* 9]
+ pmulhrsw m18, m13, [r4-64* 8]
+ pmulhrsw m19, m13, [r4-64* 7]
+ pmulhrsw m20, m13, [r4-64* 6]
+ pmulhrsw m21, m13, [r4-64* 5]
+ mova [cq+64*24], m22
+ mova [cq+64*25], m23
+ mova [cq+64*26], m24
+ mova [cq+64*27], m25
+ mova [cq+64*28], m26
+ mova [cq+64*29], m27
+ mova [cq+64*30], m28
+ mova [cq+64*31], m29
+ call .transpose_2x8x8_lo
+ mova [r4-64*12], m1
+ mova [r4-64*11], m3
+ mova [r4-64*10], m5
+ mova [r4-64* 9], m7
+ mova [r4-64* 8], m15
+ mova [r4-64* 7], m17
+ mova [r4-64* 6], m19
+ mova [r4-64* 5], m21
+ vinserti32x8 m22, m0, ym14, 1 ; f00 f01 h00 h01
+ vshufi32x4 m23, m0, m14, q3232 ; f02 f03 h02 h03
+ vinserti32x8 m24, m2, ym16, 1 ; f20 f21 h20 h21
+ vshufi32x4 m25, m2, m16, q3232 ; f22 f23 h22 h23
+ vinserti32x8 m26, m4, ym18, 1 ; f40 f41 h40 h41
+ vshufi32x4 m27, m4, m18, q3232 ; f42 f43 h42 h43
+ vinserti32x8 m28, m6, ym20, 1 ; f60 f61 h60 h61
+ vshufi32x4 m29, m6, m20, q3232 ; f62 f63 h62 h63
+ pmulhrsw m0, m13, [r4-64*20]
+ pmulhrsw m1, m13, [r4-64*19]
+ pmulhrsw m2, m13, [r4-64*18]
+ pmulhrsw m3, m13, [r4-64*17]
+ pmulhrsw m4, m13, [r4-64*16]
+ pmulhrsw m5, m13, [r4-64*15]
+ pmulhrsw m6, m13, [r4-64*14]
+ pmulhrsw m7, m13, [r4-64*13]
+ pmulhrsw m14, m13, [r4-64*29]
+ pmulhrsw m15, m13, [r4-64*30]
+ pmulhrsw m16, m13, [r4-64*31]
+ pmulhrsw m17, m13, [r4-64*32]
+ pmulhrsw m18, m13, [r4-64*33]
+ pmulhrsw m19, m13, [r4-64*34]
+ pmulhrsw m20, m13, [r4-64*35]
+ pmulhrsw m21, m13, [r4-64*36]
+ call .transpose_2x8x8_lo
+ mova [r4-64*20], m1
+ mova [r4-64*19], m3
+ mova [r4-64*18], m5
+ mova [r4-64*17], m7
+ mova [r4-64*16], m15
+ mova [r4-64*15], m17
+ mova [r4-64*14], m19
+ mova [r4-64*13], m21
+ vinserti32x8 m1, m4, ym18, 1 ; b40 b41 d40 d41
+ vshufi32x4 m5, m4, m18, q3232 ; b42 b43 d42 d43
+ vshufi32x4 m4, m0, m14, q3232 ; b02 b03 d02 d03
+ vinserti32x8 m0, ym14, 1 ; b00 b01 d00 d01
+ vinserti32x8 m14, m2, ym16, 1 ; b20 b21 d20 d21
+ vshufi32x4 m18, m2, m16, q3232 ; b22 b23 d22 d23
+ vinserti32x8 m15, m6, ym20, 1 ; b60 b61 d60 d61
+ vshufi32x4 m19, m6, m20, q3232 ; b62 b63 d62 d63
+ vshufi32x4 m2, m0, m22, q3131 ; 8
+ vshufi32x4 m0, m22, q2020 ; 0
+ vshufi32x4 m3, m1, m26, q3131 ; 12
+ vshufi32x4 m1, m26, q2020 ; 4
+ vshufi32x4 m6, m4, m23, q3131 ; 24
+ vshufi32x4 m4, m23, q2020 ; 16
+ vshufi32x4 m7, m5, m27, q3131 ; 28
+ vshufi32x4 m5, m27, q2020 ; 20
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ vshufi32x4 m16, m14, m24, q3131 ; 10
+ vshufi32x4 m14, m24, q2020 ; 2
+ vshufi32x4 m17, m15, m28, q3131 ; 14
+ vshufi32x4 m15, m28, q2020 ; 6
+ vshufi32x4 m20, m18, m25, q3131 ; 26
+ vshufi32x4 m18, m25, q2020 ; 18
+ vshufi32x4 m21, m19, m29, q3131 ; 30
+ vshufi32x4 m19, m29, q2020 ; 22
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova m22, [r4-64*20]
+ mova m26, [r4-64*16]
+ mova m23, [r4-64*19]
+ mova m27, [r4-64*15]
+ mova m24, [r4-64*18]
+ mova m28, [r4-64*14]
+ mova m25, [r4-64*17]
+ mova m29, [r4-64*13]
+ mova [r4-64*20], m14
+ mova [r4-64*19], m15
+ mova [r4-64*18], m16
+ mova [r4-64*17], m17
+ mova [r4-64*16], m18
+ mova [r4-64*15], m19
+ mova [r4-64*14], m20
+ mova [r4-64*13], m21
+ mova m19, [r4-64*12]
+ mova m11, [r4-64* 8]
+ mova m20, [r4-64*11]
+ mova m12, [r4-64* 7]
+ mova m21, [r4-64*10]
+ mova m8, [r4-64* 6]
+ mova m9, [r4-64* 9]
+ mova m18, [r4-64* 5]
+ vshufi32x4 m14, m22, m26, q3232 ; b12 b13 d12 d13
+ vinserti32x8 m22, ym26, 1 ; b10 b11 d10 d11
+ vshufi32x4 m15, m23, m27, q3232 ; b32 b33 d32 d33
+ vinserti32x8 m23, ym27, 1 ; b30 b31 d30 d31
+ vshufi32x4 m16, m24, m28, q3232 ; b52 b53 d52 d53
+ vinserti32x8 m24, ym28, 1 ; b50 b51 d50 d51
+ vshufi32x4 m17, m25, m29, q3232 ; b72 b73 d72 d73
+ vinserti32x8 m25, ym29, 1 ; b70 b71 d70 d71
+ vinserti32x8 m27, m19, ym11, 1 ; f10 f11 h10 h11
+ vshufi32x4 m19, m11, q3232 ; f12 f13 h12 h13
+ vinserti32x8 m28, m20, ym12, 1 ; f30 f31 h30 h31
+ vshufi32x4 m20, m12, q3232 ; f32 f33 h32 h33
+ vinserti32x8 m29, m21, ym8, 1 ; f50 f51 h50 h51
+ vshufi32x4 m21, m8, q3232 ; f52 f53 h52 h53
+ vinserti32x8 m8, m9, ym18, 1 ; f70 f71 h70 h71
+ vshufi32x4 m9, m18, q3232 ; f72 f73 h72 h73
+ vshufi32x4 m26, m22, m27, q3131 ; 9
+ vshufi32x4 m22, m27, q2020 ; 1
+ vshufi32x4 m27, m23, m28, q3131 ; 11
+ vshufi32x4 m23, m28, q2020 ; 3
+ vshufi32x4 m28, m24, m29, q3131 ; 13
+ vshufi32x4 m24, m29, q2020 ; 5
+ vshufi32x4 m29, m25, m8, q3131 ; 15
+ vshufi32x4 m25, m8, q2020 ; 7
+ vshufi32x4 m18, m14, m19, q3131 ; 25
+ vshufi32x4 m14, m19, q2020 ; 17
+ vshufi32x4 m19, m15, m20, q3131 ; 27
+ vshufi32x4 m15, m20, q2020 ; 19
+ vshufi32x4 m20, m16, m21, q3131 ; 29
+ vshufi32x4 m16, m21, q2020 ; 21
+ vshufi32x4 m21, m17, m9, q3131 ; 31
+ vshufi32x4 m17, m9, q2020 ; 23
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+ jmp .end
+.fast: ; bottom/right halves are zero
+ {evex}vpmulhrsw ym8, ym23, [cq+64* 4]
+ {evex}vpmulhrsw xm1, xm23, [cq+64*12]
+ mova m28, [o(dup16_perm)]
+ {evex}vpmulhrsw ym7, ym23, [cq+64* 8]
+ vpmulhrsw ym22, ym23, [cq+64* 0]
+ vpermb m8, m28, m8
+ vpermb ym1, ym28, ym1
+ vpermb m7, m28, m7
+ pmovzxwd m9, ym22
+ pslld m9, 16
+ call m(idct_16x16_internal_8bpc).main_fast2
+ {evex}vpmulhrsw ym21, ym23, [cq+64* 2]
+ {evex}vpmulhrsw xm15, xm23, [cq+64*14]
+ {evex}vpmulhrsw xm18, xm23, [cq+64*10]
+ {evex}vpmulhrsw ym14, ym23, [cq+64* 6]
+ vpermb m21, m28, m21
+ punpcklwd xm15, xm15
+ vpermb ym18, ym28, ym18
+ vpermb m14, m28, m14
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ vpmulhrsw ym22, ym23, [cq+64* 1]
+ {evex}vpmulhrsw xm29, xm23, [cq+64*15]
+ {evex}vpmulhrsw xm26, xm23, [cq+64* 9]
+ {evex}vpmulhrsw ym25, ym23, [cq+64* 7]
+ {evex}vpmulhrsw ym24, ym23, [cq+64* 5]
+ {evex}vpmulhrsw xm27, xm23, [cq+64*11]
+ {evex}vpmulhrsw xm8, xm23, [cq+64*13]
+ {evex}vpmulhrsw ym23, [cq+64* 3]
+ vpermb m22, m28, m22
+ punpcklwd xm29, xm29
+ vpermb ym26, ym28, ym26
+ vpermb m25, m28, m25
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ REPX {vpermb x, m28, x}, m24, m27, m23
+ punpcklwd xm28, xm8, xm8
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+ mov r4, rsp
+ vpbroadcastd m13, [o(pw_16384)]
+ mova [r4+64*16], m4
+ mova [r4+64*17], m5
+ mova [r4+64*18], m6
+ mova [r4+64*19], m7
+ mova [r4+64*28], m26
+ mova [r4+64*29], m27
+ mova [r4+64*30], m28
+ mova [r4+64*31], m29
+ call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
+ mova [r4+64*20], m22
+ mova [r4+64*21], m23
+ mova [r4+64*22], m24
+ mova [r4+64*23], m25
+ mova [r4+64*24], m26
+ mova [r4+64*25], m27
+ mova [r4+64*26], m28
+ mova [r4+64*27], m29
+ call .pass2_fast
+ mova [cq+64* 8], m14
+ mova [cq+64* 9], m15
+ mova [cq+64*10], m16
+ mova [cq+64*11], m17
+ mova [cq+64*12], m18
+ mova [cq+64*13], m19
+ mova [cq+64*14], m20
+ mova [cq+64*15], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ mova [cq+64* 0], m0
+ mova [cq+64* 1], m1
+ mova [cq+64* 2], m2
+ mova [cq+64* 3], m3
+ mova [cq+64* 4], m4
+ mova [cq+64* 5], m5
+ mova [cq+64* 6], m6
+ mova [cq+64* 7], m7
+ pmulhrsw m0, m13, [r4+64*16]
+ pmulhrsw m1, m13, [r4+64*17]
+ pmulhrsw m2, m13, [r4+64*18]
+ pmulhrsw m3, m13, [r4+64*19]
+ pmulhrsw m4, m13, [r4+64*20]
+ pmulhrsw m5, m13, [r4+64*21]
+ pmulhrsw m6, m13, [r4+64*22]
+ pmulhrsw m7, m13, [r4+64*23]
+ mova [cq+64*16], m14
+ mova [cq+64*17], m15
+ mova [cq+64*18], m16
+ mova [cq+64*19], m17
+ mova [cq+64*20], m18
+ mova [cq+64*21], m19
+ mova [cq+64*22], m20
+ mova [cq+64*23], m21
+ pmulhrsw m14, m13, [r4+64*24]
+ pmulhrsw m15, m13, [r4+64*25]
+ pmulhrsw m16, m13, [r4+64*26]
+ pmulhrsw m17, m13, [r4+64*27]
+ pmulhrsw m18, m13, [r4+64*28]
+ pmulhrsw m19, m13, [r4+64*29]
+ pmulhrsw m20, m13, [r4+64*30]
+ pmulhrsw m21, m13, [r4+64*31]
+ mova [cq+64*24], m22
+ mova [cq+64*25], m23
+ mova [cq+64*26], m24
+ mova [cq+64*27], m25
+ mova [cq+64*28], m26
+ mova [cq+64*29], m27
+ mova [cq+64*30], m28
+ mova [cq+64*31], m29
+ call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
+ call .pass2_fast
+ mova [r4+64*16], m14
+ mova [r4+64*17], m15
+ mova [r4+64*18], m16
+ mova [r4+64*19], m17
+ mova [r4+64*20], m18
+ mova [r4+64*21], m19
+ mova [r4+64*22], m20
+ mova [r4+64*23], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+.end:
+ vpbroadcastd m13, [o(pw_2048)]
+ lea r5, [strideq*3]
+ pxor m12, m12
+ lea r3, [dstq+r5*8]
+ lea r6, [strideq+r5] ; stride*4
+ add r3, r6 ; dst+stride*28
+%macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi
+ mova m11, [cq+64*( %3)] ; 0
+ mova m9, [cq+64*(31-%3)] ; 31
+%if %3 >= 8
+ mova m%1, [rsp+64*(%1+16)]
+%endif
+ mova m10, [dstq+%4]
+ paddsw m8, m11, m9
+ psubsw m11, m9
+ paddsw m9, m%1, m%2
+ psubsw m%1, m%2
+ punpcklbw m%2, m10, m12
+ punpckhbw m10, m12
+ pmulhrsw m8, m13
+ pmulhrsw m9, m13
+ paddw m8, m%2
+ paddw m9, m10
+ mova m10, [r3+%5]
+ pmulhrsw m11, m13
+ pmulhrsw m%1, m13
+ mova [cq+64*( %3)], m12
+ mova [cq+64*(31-%3)], m12
+ punpcklbw m%2, m10, m12
+ punpckhbw m10, m12
+ packuswb m8, m9
+ paddw m11, m%2
+ paddw m%1, m10
+ packuswb m11, m%1
+ mova [dstq+%4], m8
+ mova [r3 +%5], m11
+%if %3 == 3 || %3 == 7 || %3 == 11
+ add dstq, r6
+ sub r3, r6
+%endif
+%endmacro
+ IDCT_64x32_END 0, 29, 0, strideq*0, r5
+ IDCT_64x32_END 1, 28, 1, strideq*1, strideq*2
+ IDCT_64x32_END 2, 27, 2, strideq*2, strideq*1
+ IDCT_64x32_END 3, 26, 3, r5 , strideq*0
+ IDCT_64x32_END 4, 25, 4, strideq*0, r5
+ IDCT_64x32_END 5, 24, 5, strideq*1, strideq*2
+ IDCT_64x32_END 6, 23, 6, strideq*2, strideq*1
+ IDCT_64x32_END 7, 22, 7, r5 , strideq*0
+ IDCT_64x32_END 0, 21, 8, strideq*0, r5
+ IDCT_64x32_END 1, 20, 9, strideq*1, strideq*2
+ IDCT_64x32_END 2, 19, 10, strideq*2, strideq*1
+ IDCT_64x32_END 3, 18, 11, r5 , strideq*0
+ IDCT_64x32_END 4, 17, 12, strideq*0, r5
+ IDCT_64x32_END 5, 16, 13, strideq*1, strideq*2
+ IDCT_64x32_END 6, 15, 14, strideq*2, strideq*1
+ IDCT_64x32_END 7, 14, 15, r5 , strideq*0
+ RET
+ALIGN function_align
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 32
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2
+ALIGN function_align
+.pass1_end_part1:
+%macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64
+%if %1 != %3
+ mova m%1, [cq+64*%1]
+%endif
+ mova m9, [r4+64*(%3-36)] ; idct64 32+n
+ mova m11, [r4+64*(-5-%3)] ; idct64 63-n
+ psubsw m8, m%1, m%2 ; idct32 31-n
+ paddsw m%1, m%2 ; idct32 0+n
+%if %1 == %3
+ psubsw m%2, m8, m9 ; out 32+n e
+ paddsw m8, m9 ; out 31-n d
+ psubsw m9, m%1, m11 ; out 63-n h
+ paddsw m%1, m11 ; out 0+n a
+%else
+ paddsw m%2, m8, m9 ; out 23-n c
+ psubsw m8, m9 ; out 40+n f
+ paddsw m9, m%1, m11 ; out 8+n b
+ psubsw m%1, m11 ; out 55-n g
+%endif
+ mova [r4+64*(%3-36)], m8
+ mova [r4+64*(-5-%3)], m9
+%endmacro
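+ ; %1 == %3 only holds for the eight part1 expansions (a/d/e/h outputs);
+ ; .pass1_end_part2 passes %3 = %1+8 and takes the other branch (b/c/f/g)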
+ IDCT_64x32_PASS1_END 0, 29, 0
+ IDCT_64x32_PASS1_END 1, 28, 1
+ IDCT_64x32_PASS1_END 2, 27, 2
+ IDCT_64x32_PASS1_END 3, 26, 3
+ IDCT_64x32_PASS1_END 4, 25, 4
+ IDCT_64x32_PASS1_END 5, 24, 5
+ IDCT_64x32_PASS1_END 6, 23, 6
+ IDCT_64x32_PASS1_END 7, 22, 7
+.transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted)
+ punpcklwd m8, m25, m24 ; e0 f0 e1 f1 e2 f2 e3 f3
+ punpckhwd m25, m24 ; e4 f4 e5 f5 e6 f6 e7 f7
+ punpcklwd m24, m23, m22 ; g0 h0 g1 h1 g2 h2 g3 h3
+ punpckhwd m23, m22 ; g4 h4 g5 h5 g6 h6 g7 h7
+ punpcklwd m22, m29, m28 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m29, m28 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m28, m27, m26 ; c0 d0 c1 d1 c2 d2 c3 d3
+ punpckhwd m27, m26 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpckldq m26, m29, m27 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m29, m27 ; a6 b6 c6 d6 a7 b7 c7 d7
+ punpckldq m27, m8, m24 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m8, m24 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckhdq m24, m22, m28 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m22, m28 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckldq m28, m25, m23 ; e4 f4 g4 h4 e5 f5 g5 h5
+ punpckhdq m25, m23 ; e6 f6 g6 h6 e7 f7 g7 h7
+ punpckhqdq m23, m22, m27 ; 1 23
+ punpcklqdq m22, m27 ; 0 22
+ punpckhqdq m27, m26, m28 ; 5 27
+ punpcklqdq m26, m28 ; 4 26
+ punpcklqdq m28, m29, m25 ; 6 28
+ punpckhqdq m29, m25 ; 7 29
+ punpckhqdq m25, m24, m8 ; 3 25
+ punpcklqdq m24, m8 ; 2 24
+.transpose_8x8:
+ punpckhwd m8, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m8, m1
+ punpckhdq m8, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m8
+ punpcklqdq m6, m8
+ ret
+.pass1_end_part2:
+ IDCT_64x32_PASS1_END 0, 21, 8
+ IDCT_64x32_PASS1_END 1, 20, 9
+ IDCT_64x32_PASS1_END 2, 19, 10
+ IDCT_64x32_PASS1_END 3, 18, 11
+ IDCT_64x32_PASS1_END 4, 17, 12
+ IDCT_64x32_PASS1_END 5, 16, 13
+ IDCT_64x32_PASS1_END 6, 15, 14
+ IDCT_64x32_PASS1_END 7, 14, 15
+.transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21
+ punpcklwd m8, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m7, m6
+ punpckhwd m7, m6
+ punpcklwd m6, m5, m4
+ punpckhwd m5, m4
+ punpckldq m4, m7, m5
+ punpckhdq m7, m5
+ punpckldq m5, m8, m2
+ punpckhdq m8, m2
+ punpckhdq m2, m0, m6
+ punpckldq m0, m6
+ punpckldq m6, m3, m1
+ punpckhdq m3, m1
+ punpckhqdq m1, m0, m5
+ punpcklqdq m0, m5
+ punpckhqdq m5, m4, m6
+ punpcklqdq m4, m6
+ punpcklqdq m6, m7, m3
+ punpckhqdq m7, m3
+ punpckhqdq m3, m2, m8
+ punpcklqdq m2, m8
+ punpckhwd m8, m18, m19
+ punpcklwd m18, m19
+ punpckhwd m19, m14, m15
+ punpcklwd m14, m15
+ punpckhwd m15, m20, m21
+ punpcklwd m20, m21
+ punpckhwd m21, m16, m17
+ punpcklwd m16, m17
+ punpckhdq m17, m14, m16
+ punpckldq m14, m16
+ punpckldq m16, m18, m20
+ punpckhdq m18, m20
+ punpckhdq m20, m19, m21
+ punpckldq m19, m21
+ punpckldq m21, m8, m15
+ punpckhdq m8, m15
+ punpckhqdq m15, m14, m16
+ punpcklqdq m14, m16
+ punpcklqdq m16, m17, m18
+ punpckhqdq m17, m18
+ punpcklqdq m18, m19, m21
+ punpckhqdq m19, m21
+ punpckhqdq m21, m20, m8
+ punpcklqdq m20, m8
+ ret
+.pass2_fast:
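+ ; deinterleave the transposed 128-bit blocks into rows 0-15 (even rows in
+ ; m0-m3/m14-m17, odd rows in m22-m29, see the indices on the right) and
+ ; tail-call the 32x16 odd-half fast path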
+ vshufi32x4 m24, m9, m15, q3131 ; 5
+ vshufi32x4 m22, m9, m15, q2020 ; 1
+ vshufi32x4 m15, m1, m16, q3131 ; 6
+ vshufi32x4 m14, m1, m16, q2020 ; 2
+ vshufi32x4 m1, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m8, m2, q3131 ; 12
+ vshufi32x4 m2, m8, m2, q2020 ; 8
+ vshufi32x4 m25, m11, m17, q3131 ; 7
+ vshufi32x4 m23, m11, m17, q2020 ; 3
+ vshufi32x4 m17, m5, m19, q3131 ; 14
+ vshufi32x4 m16, m5, m19, q2020 ; 10
+ vshufi32x4 m29, m6, m20, q3131 ; 15
+ vshufi32x4 m27, m6, m20, q2020 ; 11
+ vshufi32x4 m28, m4, m18, q3131 ; 13
+ vshufi32x4 m26, m4, m18, q2020 ; 9
+ jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+
+cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 7, 30, 64*96, dst, stride, c, eob
+%undef cmp
+ cmp eobd, 136
+ jb .fast
+ mova m0, [cq+64* 1]
+ mova m1, [cq+64*31]
+ mova m2, [cq+64*17]
+ mova m3, [cq+64*15]
+ vpbroadcastd m10, [o(pd_2048)]
+ mov r4, rsp
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ mova m0, [cq+64* 7]
+ mova m1, [cq+64*25]
+ mova m2, [cq+64*23]
+ mova m3, [cq+64* 9]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ mova m0, [cq+64* 5]
+ mova m1, [cq+64*27]
+ mova m2, [cq+64*21]
+ mova m3, [cq+64*11]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ mova m0, [cq+64* 3]
+ mova m1, [cq+64*29]
+ mova m2, [cq+64*19]
+ mova m3, [cq+64*13]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 8]
+ mova m2, [cq+64*16]
+ mova m3, [cq+64*24]
+ mova m14, [cq+64* 4]
+ mova m15, [cq+64*12]
+ mova m16, [cq+64*20]
+ mova m17, [cq+64*28]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ mova m22, [cq+64* 2]
+ mova m29, [cq+64*30]
+ mova m26, [cq+64*18]
+ mova m25, [cq+64*14]
+ mova m24, [cq+64*10]
+ mova m27, [cq+64*22]
+ mova m28, [cq+64*26]
+ mova m23, [cq+64* 6]
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m13, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1
+ mova [r4+64*36], m1
+ mova [r4+64*37], m3
+ mova [r4+64*38], m5
+ mova [r4+64*39], m7
+ mova [r4+64*44], m23
+ mova [r4+64*45], m25
+ mova [r4+64*46], m27
+ mova [r4+64*47], m29
+ pmulhrsw m23, m13, m0 ; a0
+ pmulhrsw m25, m13, m2 ; a2
+ pmulhrsw m27, m13, m4 ; a4
+ pmulhrsw m29, m13, m6 ; a6
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2
+ lea r6, [r4-64*4]
+ add r4, 64*28
+ call .pass2_end
+ mov r4, rsp
+ mova m0, [r4+64*23]
+ mova m1, [r4+64*22]
+ mova m2, [r4+64*21]
+ mova m3, [r4+64*20]
+ mova m4, [r4+64*19]
+ mova m5, [r4+64*18]
+ mova m6, [r4+64*17]
+ mova m7, [r4+64*16]
+ mova m22, [r4+64*15]
+ mova m23, [r4+64*14]
+ mova m24, [r4+64*13]
+ mova m25, [r4+64*12]
+ mova m26, [r4+64*11]
+ mova m27, [r4+64*10]
+ mova m28, [r4+64* 9]
+ mova m29, [r4+64* 8]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi
+ vpbroadcastd m13, [o(pw_8192)]
+ mova [r4+64* 8], m1
+ mova [r4+64* 9], m3
+ mova [r4+64*10], m5
+ mova [r4+64*11], m7
+ mova [r4+64*16], m23
+ mova [r4+64*17], m25
+ mova [r4+64*18], m27
+ mova [r4+64*19], m29
+ pmulhrsw m23, m13, m0 ; b0
+ pmulhrsw m25, m13, m2 ; b2
+ pmulhrsw m27, m13, m4 ; b4
+ pmulhrsw m29, m13, m6 ; b6
+ mova m0, [r4+64*31]
+ mova m1, [r4+64*30]
+ mova m2, [r4+64*29]
+ mova m3, [r4+64*28]
+ mova m4, [r4+64*27]
+ mova m5, [r4+64*26]
+ mova m6, [r4+64*25]
+ mova m7, [r4+64*24]
+ mova m14, [r4+64* 7]
+ mova m15, [r4+64* 6]
+ mova m16, [r4+64* 5]
+ mova m17, [r4+64* 4]
+ mova m18, [r4+64* 3]
+ mova m19, [r4+64* 2]
+ mova m20, [r4+64* 1]
+ mova m21, [r4+64* 0]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo
+ mov r6, cq
+ call .pass2_end
+ jmp .end
+.fast: ; bottom/right halves are zero
+ mova m28, [o(dup16_perm)]
+ pmovzxwd m9, [cq+64* 0]
+ vpermb m8, m28, [cq+64* 4]
+ vpermb ym1, ym28, [cq+64*12]
+ vpermb m7, m28, [cq+64* 8]
+ pslld m9, 16
+ call m(idct_16x16_internal_8bpc).main_fast2
+ vpermb m21, m28, [cq+64* 2]
+ vpermb ym15, ym28, [cq+64*14]
+ vpermb ym18, ym28, [cq+64*10]
+ vpermb m14, m28, [cq+64* 6]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ vpermb m22, m28, [cq+64* 1]
+ vpermb ym29, ym28, [cq+64*15]
+ vpermb ym26, ym28, [cq+64* 9]
+ vpermb m25, m28, [cq+64* 7]
+ vpermb m24, m28, [cq+64* 5]
+ vpermb ym27, ym28, [cq+64*11]
+ vpermb m23, m28, [cq+64* 3]
+ vpermb ym28, ym28, [cq+64*13]
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+ vpbroadcastd m13, [o(pw_8192)]
+ mova [cq+64*16], m4
+ mova [cq+64*17], m5
+ mova [cq+64*18], m6
+ mova [cq+64*19], m7
+ mova [cq+64*28], m26
+ mova [cq+64*29], m27
+ mova [cq+64*30], m28
+ mova [cq+64*31], m29
+ call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
+ mova [cq+64*20], m22
+ mova [cq+64*21], m23
+ mova [cq+64*22], m24
+ mova [cq+64*23], m25
+ mova [cq+64*24], m26
+ mova [cq+64*25], m27
+ mova [cq+64*26], m28
+ mova [cq+64*27], m29
+ lea r4, [rsp+64*64]
+ lea r3, [rsp+64*32]
+ call .pass2_fast
+ pmulhrsw m0, m13, [cq+64*16]
+ pmulhrsw m1, m13, [cq+64*17]
+ pmulhrsw m2, m13, [cq+64*18]
+ pmulhrsw m3, m13, [cq+64*19]
+ pmulhrsw m4, m13, [cq+64*20]
+ pmulhrsw m5, m13, [cq+64*21]
+ pmulhrsw m6, m13, [cq+64*22]
+ pmulhrsw m7, m13, [cq+64*23]
+ pmulhrsw m14, m13, [cq+64*24]
+ pmulhrsw m15, m13, [cq+64*25]
+ pmulhrsw m16, m13, [cq+64*26]
+ pmulhrsw m17, m13, [cq+64*27]
+ pmulhrsw m18, m13, [cq+64*28]
+ pmulhrsw m19, m13, [cq+64*29]
+ pmulhrsw m20, m13, [cq+64*30]
+ pmulhrsw m21, m13, [cq+64*31]
+ call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
+ mov r4, rsp
+ mov r3, cq
+ call .pass2_fast
+.end:
+ vpbroadcastd m17, [o(pw_2048)]
+ lea r5, [strideq*8]
+ mov r3, dstq
+ pxor m16, m16
+ sub r4, 64*5 ; rsp+64*31
+ mov r6, rsp
+.end_loop:
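+ ; each iteration rebuilds four 64-pixel output rows (0+n, 31-n, 32+n, 63-n)
+ ; from the idct16/idct32/idct64 partial results in cq and on the stack (see
+ ; the comments on the loads), adds them to dst and clears the used cq rows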
+ mova m2, [r6+64*32] ; idct16 0+n lo
+ mova m7, [r6+64*48] ; idct32 31-n lo
+ mova m6, [cq+64* 0] ; idct16 0+n hi
+ mova m0, [cq+64*16] ; idct32 31-n hi
+ mova m4, [r4+64*64] ; idct64 63-n lo
+ mova m1, [r4+64* 0] ; idct64 63-n hi
+ mova m5, [r6+64*64] ; idct64 32+n lo
+ mova m8, [r6+64* 0] ; idct64 32+n hi
+ sub r3, strideq
+ paddsw m3, m2, m7 ; idct32 0+n lo
+ mova m12, [dstq+r5*0]
+ psubsw m2, m7 ; idct32 31-n lo
+ mova m15, [r3 +r5*8]
+ paddsw m7, m6, m0 ; idct32 0+n hi
+ mova m13, [r3 +r5*4]
+ psubsw m6, m0 ; idct32 31-n hi
+ mova m14, [dstq+r5*4]
+ paddsw m0, m3, m4 ; out 0+n lo
+ add r6, 64
+ psubsw m3, m4 ; out 63-n lo
+ sub r4, 64
+ paddsw m4, m7, m1 ; out 0+n hi
+ mova [cq+64* 0], m16
+ psubsw m7, m1 ; out 63-n hi
+ mova [cq+64*16], m16
+ paddsw m1, m2, m5 ; out 31-n lo
+ add cq, 64
+ psubsw m2, m5 ; out 32+n lo
+ paddsw m5, m6, m8 ; out 31-n hi
+ psubsw m6, m8 ; out 32+n hi
+ pmulhrsw m0, m17
+ punpcklbw m8, m12, m16
+ pmulhrsw m4, m17
+ punpckhbw m12, m16
+ pmulhrsw m3, m17
+ punpcklbw m11, m15, m16
+ pmulhrsw m7, m17
+ punpckhbw m15, m16
+ pmulhrsw m1, m17
+ punpcklbw m9, m13, m16
+ pmulhrsw m5, m17
+ punpckhbw m13, m16
+ pmulhrsw m2, m17
+ punpcklbw m10, m14, m16
+ pmulhrsw m6, m17
+ punpckhbw m14, m16
+ paddw m0, m8
+ paddw m4, m12
+ packuswb m0, m4
+ paddw m3, m11
+ paddw m7, m15
+ packuswb m3, m7
+ paddw m1, m9
+ paddw m5, m13
+ packuswb m1, m5
+ paddw m2, m10
+ paddw m6, m14
+ packuswb m2, m6
+ mova [dstq+r5*0], m0
+ mova [r3 +r5*8], m3
+ mova [r3 +r5*4], m1
+ mova [dstq+r5*4], m2
+ add dstq, strideq
+ cmp r6, r4
+ jb .end_loop
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
+ALIGN function_align
+.pass2_end:
+ REPX {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6
+ mova [r4+64*20], m1
+ mova [r4+64*21], m3
+ mova [r4+64*22], m5
+ mova [r4+64*23], m7
+ vinserti32x8 m1, m23, ym14, 1 ; a00 a01 c00 c01
+ vshufi32x4 m3, m23, m14, q3232 ; a02 a03 c02 c03
+ vinserti32x8 m5, m22, ym0, 1 ; e00 e01 g00 g01
+ vshufi32x4 m14, m22, m0, q3232 ; e02 e03 g02 g03
+ mova [r4+64*12], m15
+ mova [r4+64*13], m17
+ mova [r4+64*14], m19
+ mova [r4+64*15], m21
+ vinserti32x8 m15, m27, ym18, 1 ; a40 a41 c40 c41
+ vshufi32x4 m17, m27, m18, q3232 ; a42 a43 c42 c43
+ vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41
+ vshufi32x4 m19, m26, m4, q3232 ; e42 e43 g42 g43
+ vinserti32x8 m22, m25, ym16, 1 ; a20 a21 c20 c21
+ vshufi32x4 m26, m25, m16, q3232 ; a22 a23 c22 c23
+ vinserti32x8 m25, m24, ym2, 1 ; e20 e21 g20 g21
+ vshufi32x4 m27, m24, m2, q3232 ; e22 e23 g22 g23
+ vinserti32x8 m23, m29, ym20, 1 ; a60 a61 c60 c61
+ vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63
+ vshufi32x4 m13, m28, m6, q3232 ; e62 e63 g62 g63
+ vinserti32x8 m28, ym6, 1 ; e60 e61 g60 g61
+ vshufi32x4 m0, m1, m5, q2020 ; 0
+ vshufi32x4 m1, m5, q3131 ; 8
+ vshufi32x4 m2, m3, m14, q2020 ; 16
+ vshufi32x4 m3, m14, q3131 ; 24
+ vshufi32x4 m14, m15, m18, q2020 ; 4
+ vshufi32x4 m15, m18, q3131 ; 12
+ vshufi32x4 m16, m17, m19, q2020 ; 20
+ vshufi32x4 m17, m19, q3131 ; 28
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ vshufi32x4 m24, m22, m25, q3131 ; 10
+ vshufi32x4 m22, m25, q2020 ; 2
+ vshufi32x4 m25, m23, m28, q3131 ; 14
+ vshufi32x4 m23, m28, q2020 ; 6
+ vshufi32x4 m28, m26, m27, q3131 ; 26
+ vshufi32x4 m26, m27, q2020 ; 18
+ vshufi32x4 m27, m29, m13, q2020 ; 22
+ vshufi32x4 m29, m13, q3131 ; 30
+ mova [r6+64* 0], m0
+ mova [r6+64* 1], m1
+ mova [r6+64* 2], m2
+ mova [r6+64* 3], m3
+ mova [r6+64* 4], m4
+ mova [r6+64* 5], m5
+ mova [r6+64* 6], m6
+ mova [r6+64* 7], m7
+ mova [r6+64* 8], m14
+ mova [r6+64* 9], m15
+ mova [r6+64*10], m16
+ mova [r6+64*11], m17
+ mova [r6+64*12], m18
+ mova [r6+64*13], m19
+ mova [r6+64*14], m20
+ mova [r6+64*15], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m13, [o(pw_8192)]
+ mova [r6+64*16], m29
+ mova [r6+64*17], m28
+ mova [r6+64*18], m27
+ mova [r6+64*19], m26
+ mova [r6+64*20], m25
+ mova [r6+64*21], m24
+ mova [r6+64*22], m23
+ mova [r6+64*23], m22
+ mova [r6+64*24], m21
+ mova [r6+64*25], m20
+ mova [r6+64*26], m19
+ mova [r6+64*27], m18
+ mova [r6+64*28], m17
+ mova [r6+64*29], m16
+ mova [r6+64*30], m15
+ mova [r6+64*31], m14
+ pmulhrsw m15, m13, [r4+64* 8] ; 1 9 17 25
+ pmulhrsw m16, m13, [r4+64*12]
+ pmulhrsw m17, m13, [r4+64*16]
+ pmulhrsw m18, m13, [r4+64*20]
+ pmulhrsw m19, m13, [r4+64*11] ; 7 15 23 31
+ pmulhrsw m20, m13, [r4+64*15]
+ pmulhrsw m21, m13, [r4+64*19]
+ pmulhrsw m22, m13, [r4+64*23]
+ vinserti32x8 m14, m15, ym16, 1 ; a1 a9 c1 c9
+ vshufi32x4 m15, m16, q3232 ; a17 a25 c17 c25
+ vinserti32x8 m16, m17, ym18, 1 ; e1 e9 g1 g9
+ vshufi32x4 m17, m18, q3232 ; e17 e25 g17 g25
+ pmulhrsw m23, m13, [r4+64*10] ; 5 13 21 29
+ pmulhrsw m24, m13, [r4+64*14]
+ pmulhrsw m25, m13, [r4+64*18]
+ pmulhrsw m26, m13, [r4+64*22]
+ vinserti32x8 m18, m19, ym20, 1 ; a7 a15 c7 c15
+ vshufi32x4 m19, m20, q3232 ; a23 a31 c23 c31
+ vinserti32x8 m20, m21, ym22, 1 ; e7 e15 g7 g15
+ vshufi32x4 m21, m22, q3232 ; e23 e31 g23 g31
+ pmulhrsw m27, m13, [r4+64* 9] ; 3 11 19 27
+ pmulhrsw m28, m13, [r4+64*13]
+ pmulhrsw m29, m13, [r4+64*17]
+ pmulhrsw m13, [r4+64*21]
+ vshufi32x4 m0, m14, m16, q2020 ; 1
+ vshufi32x4 m1, m19, m21, q3131 ; 31
+ vshufi32x4 m2, m15, m17, q2020 ; 17
+ vshufi32x4 m3, m18, m20, q3131 ; 15
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ vshufi32x4 m0, m18, m20, q2020 ; 7
+ vshufi32x4 m1, m15, m17, q3131 ; 25
+ vshufi32x4 m2, m19, m21, q2020 ; 23
+ vshufi32x4 m3, m14, m16, q3131 ; 9
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ vinserti32x8 m22, m23, ym24, 1 ; a5 a13 c5 c13
+ vshufi32x4 m23, m24, q3232 ; a21 a29 c21 c29
+ vinserti32x8 m24, m25, ym26, 1 ; e5 e13 g5 g13
+ vshufi32x4 m25, m26, q3232 ; e21 e29 g21 g29
+ vinserti32x8 m26, m27, ym28, 1 ; a3 a11 c3 c11
+ vshufi32x4 m27, m28, q3232 ; a19 a27 c19 c27
+ vinserti32x8 m28, m29, ym13, 1 ; e3 e11 g3 g11
+ vshufi32x4 m29, m13, q3232 ; e19 e27 g19 g27
+ vshufi32x4 m0, m22, m24, q2020 ; 5
+ vshufi32x4 m1, m27, m29, q3131 ; 27
+ vshufi32x4 m2, m23, m25, q2020 ; 21
+ vshufi32x4 m3, m26, m28, q3131 ; 11
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ vshufi32x4 m0, m26, m28, q2020 ; 3
+ vshufi32x4 m1, m23, m25, q3131 ; 29
+ vshufi32x4 m2, m27, m29, q2020 ; 19
+ vshufi32x4 m3, m22, m24, q3131 ; 13
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+ALIGN function_align
+.pass2_fast:
+ vshufi32x4 m23, m1, m16, q3131 ; 6
+ vshufi32x4 m22, m1, m16, q2020 ; 2
+ vshufi32x4 m14, m0, m3, q3131 ; 4
+ vshufi32x4 m26, m0, m3, q2020 ; 0
+ vshufi32x4 m28, m9, m15, q3131 ; 5
+ vshufi32x4 m0, m9, m15, q2020 ; 1
+ vshufi32x4 m16, m11, m17, q3131 ; 7
+ vshufi32x4 m29, m11, m17, q2020 ; 3
+ vshufi32x4 m15, m8, m2, q3131 ; 12
+ vshufi32x4 m27, m8, m2, q2020 ; 8
+ vshufi32x4 m25, m5, m19, q3131 ; 14
+ vshufi32x4 m24, m5, m19, q2020 ; 10
+ vshufi32x4 m3, m6, m20, q3131 ; 15
+ vshufi32x4 m19, m6, m20, q2020 ; 11
+ vshufi32x4 m17, m4, m18, q3131 ; 13
+ vshufi32x4 m18, m4, m18, q2020 ; 9
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ mova m0, m16
+ mova m3, m18
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ mova m0, m28
+ mova m3, m19
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ mova m0, m29
+ mova m3, m17
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+ mova m0, m26
+ mova m1, m27
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
+ mova [r3+64* 0], m0
+ mova [r3+64* 1], m1
+ mova [r3+64* 2], m2
+ mova [r3+64* 3], m3
+ mova [r3+64* 4], m4
+ mova [r3+64* 5], m5
+ mova [r3+64* 6], m6
+ mova [r3+64* 7], m7
+ mova [r3+64* 8], m14
+ mova [r3+64* 9], m15
+ mova [r3+64*10], m16
+ mova [r3+64*11], m17
+ mova [r3+64*12], m18
+ mova [r3+64*13], m19
+ mova [r3+64*14], m20
+ mova [r3+64*15], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
+ mova [r3+64*16], m29
+ mova [r3+64*17], m28
+ mova [r3+64*18], m27
+ mova [r3+64*19], m26
+ mova [r3+64*20], m25
+ mova [r3+64*21], m24
+ mova [r3+64*22], m23
+ mova [r3+64*23], m22
+ mova [r3+64*24], m21
+ mova [r3+64*25], m20
+ mova [r3+64*26], m19
+ mova [r3+64*27], m18
+ mova [r3+64*28], m17
+ mova [r3+64*29], m16
+ mova [r3+64*30], m15
+ mova [r3+64*31], m14
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/itx_sse.asm b/third_party/dav1d/src/x86/itx_sse.asm
new file mode 100644
index 0000000000..ec7e3a52f4
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx_sse.asm
@@ -0,0 +1,6533 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+
+SECTION_RODATA 16
+
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+
+deint_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+deint_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7
+
+%macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1
+pw_%1_m%2: times 4 dw %1, -%2
+%if %3 != 2
+pw_%2_%1: times 4 dw %2, %1
+%endif
+%if %3
+pw_m%1_m%2: times 4 dw -%1, -%2
+%endif
+%endmacro
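+; e.g. "COEF_PAIR 1567, 3784" emits the interleaved rows
+;   pw_1567_m3784: times 4 dw 1567, -3784
+;   pw_3784_1567:  times 4 dw 3784,  1567
+; consumed by ITX_MUL2X_PACK/ITX_MULSUB_2W; a non-zero third argument also
+; emits pw_m1567_m3784, and passing 2 drops the pw_3784_1567 row.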
+
+;adst4
+pw_1321_3803: times 4 dw 1321, 3803
+pw_2482_m1321: times 4 dw 2482, -1321
+pw_3344_2482: times 4 dw 3344, 2482
+pw_3344_m3803: times 4 dw 3344, -3803
+pw_3344_m3344: times 4 dw 3344, -3344
+pw_0_3344: times 4 dw 0, 3344
+pw_m6688_m3803: times 4 dw -6688, -3803
+
+COEF_PAIR 2896, 2896
+COEF_PAIR 1567, 3784
+COEF_PAIR 799, 4017
+COEF_PAIR 3406, 2276
+COEF_PAIR 401, 4076
+COEF_PAIR 1931, 3612
+COEF_PAIR 3166, 2598
+COEF_PAIR 3920, 1189
+COEF_PAIR 3784, 1567, 1
+COEF_PAIR 995, 3973
+COEF_PAIR 1751, 3703
+COEF_PAIR 3513, 2106
+COEF_PAIR 3857, 1380
+COEF_PAIR 4017, 799, 1
+COEF_PAIR 201, 4091
+COEF_PAIR 2440, 3290
+COEF_PAIR 3035, 2751
+COEF_PAIR 4052, 601
+COEF_PAIR 2276, 3406, 1
+COEF_PAIR 4076, 401, 2
+COEF_PAIR 2598, 3166, 2
+COEF_PAIR 3612, 1931, 2
+COEF_PAIR 1189, 3920, 2
+
+pd_2048: times 4 dd 2048
+pw_2048: times 8 dw 2048
+pw_m2048: times 8 dw -2048
+pw_4096: times 8 dw 4096
+pw_16384: times 8 dw 16384
+pw_m16384: times 8 dw -16384
+pw_1697x16: times 8 dw 1697*16
+pw_1697x8: times 8 dw 1697*8
+pw_2896x8: times 8 dw 2896*8
+pw_3344x8: times 8 dw 3344*8
+pw_8192: times 8 dw 8192
+pw_m8192: times 8 dw -8192
+pw_5: times 8 dw 5
+pw_201x8: times 8 dw 201*8
+pw_4091x8: times 8 dw 4091*8
+pw_m2751x8: times 8 dw -2751*8
+pw_3035x8: times 8 dw 3035*8
+pw_1751x8: times 8 dw 1751*8
+pw_3703x8: times 8 dw 3703*8
+pw_m1380x8: times 8 dw -1380*8
+pw_3857x8: times 8 dw 3857*8
+pw_995x8: times 8 dw 995*8
+pw_3973x8: times 8 dw 3973*8
+pw_m2106x8: times 8 dw -2106*8
+pw_3513x8: times 8 dw 3513*8
+pw_2440x8: times 8 dw 2440*8
+pw_3290x8: times 8 dw 3290*8
+pw_m601x8: times 8 dw -601*8
+pw_4052x8: times 8 dw 4052*8
+
+pw_4095x8: times 8 dw 4095*8
+pw_101x8: times 8 dw 101*8
+pw_2967x8: times 8 dw 2967*8
+pw_m2824x8: times 8 dw -2824*8
+pw_3745x8: times 8 dw 3745*8
+pw_1660x8: times 8 dw 1660*8
+pw_3822x8: times 8 dw 3822*8
+pw_m1474x8: times 8 dw -1474*8
+pw_3996x8: times 8 dw 3996*8
+pw_897x8: times 8 dw 897*8
+pw_3461x8: times 8 dw 3461*8
+pw_m2191x8: times 8 dw -2191*8
+pw_3349x8: times 8 dw 3349*8
+pw_2359x8: times 8 dw 2359*8
+pw_4036x8: times 8 dw 4036*8
+pw_m700x8: times 8 dw -700*8
+pw_4065x8: times 8 dw 4065*8
+pw_501x8: times 8 dw 501*8
+pw_3229x8: times 8 dw 3229*8
+pw_m2520x8: times 8 dw -2520*8
+pw_3564x8: times 8 dw 3564*8
+pw_2019x8: times 8 dw 2019*8
+pw_3948x8: times 8 dw 3948*8
+pw_m1092x8: times 8 dw -1092*8
+pw_3889x8: times 8 dw 3889*8
+pw_1285x8: times 8 dw 1285*8
+pw_3659x8: times 8 dw 3659*8
+pw_m1842x8: times 8 dw -1842*8
+pw_3102x8: times 8 dw 3102*8
+pw_2675x8: times 8 dw 2675*8
+pw_4085x8: times 8 dw 4085*8
+pw_m301x8: times 8 dw -301*8
+
+SECTION .text
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+%if ARCH_X86_64
+%define o(x) x
+%else
+%define o(x) r5-$$+x ; PIC
+%endif
+
+%macro WRITE_4X4 9 ;src[1-2], tmp[1-3], row[1-4]
+ lea r2, [dstq+strideq*2]
+%assign %%i 1
+%rotate 5
+%rep 4
+ %if %1 & 2
+ CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1)
+ %else
+ CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
+ %endif
+ %assign %%i %%i + 1
+ %rotate 1
+%endrep
+
+ movd m%3, [%%row_adr1] ;dst0
+ movd m%5, [%%row_adr2] ;dst1
+ punpckldq m%3, m%5 ;high: dst1 :low: dst0
+ movd m%4, [%%row_adr3] ;dst2
+ movd m%5, [%%row_adr4] ;dst3
+ punpckldq m%4, m%5 ;high: dst3 :low: dst2
+
+ pxor m%5, m%5
+ punpcklbw m%3, m%5 ;extend byte to word
+ punpcklbw m%4, m%5 ;extend byte to word
+
+ paddw m%3, m%1 ;high: dst1 + out1 ;low: dst0 + out0
+ paddw m%4, m%2 ;high: dst3 + out3 ;low: dst2 + out2
+
+ packuswb m%3, m%4 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
+
+ movd [%%row_adr1], m%3 ;store dst0 + out0
+ pshuflw m%4, m%3, q1032
+ movd [%%row_adr2], m%4 ;store dst1 + out1
+ punpckhqdq m%3, m%3
+ movd [%%row_adr3], m%3 ;store dst2 + out2
+ psrlq m%3, 32
+ movd [%%row_adr4], m%3 ;store dst3 + out3
+%endmacro
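+; i.e. add the residual rows 0-1 (src1) and 2-3 (src2) to a 4x4 block of dst
+; pixels and store the clipped bytes back to the four row addresses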
+
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd
+%if %5
+ mova m2, [o(pw_%5)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+%endif
+
+ WRITE_4X4 0, 1, 2, 3, 4, %1, %2, %3, %4
+ ret
+%endmacro
+
+; flags: 1 = swap, 2 = coef_regs, 4 = no_pack
+%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
+%if %6 & 2
+ pmaddwd m%2, m%4, m%1
+ pmaddwd m%1, m%5
+%elif %6 & 1
+ pmaddwd m%2, m%1, [o(pw_%5_%4)]
+ pmaddwd m%1, [o(pw_%4_m%5)]
+%else
+ pmaddwd m%2, m%1, [o(pw_%4_m%5)]
+ pmaddwd m%1, [o(pw_%5_%4)]
+%endif
+ paddd m%2, m%3
+ paddd m%1, m%3
+ psrad m%2, 12
+ psrad m%1, 12
+%if %6 & 4 == 0
+ packssdw m%1, m%2
+%endif
+%endmacro
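+; without flags this computes, per dword lane holding the word pair (a, b),
+;   t = (a*coef1 - b*coef2 + rnd) >> 12 and d = (a*coef2 + b*coef1 + rnd) >> 12,
+; then packs d into the low half and t into the high half of dst/src;
+; rnd is a register preloaded with pd_2048 by the callers.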
+
+%macro IDCT4_1D_PACKED 0-1 ;pw_2896x8
+ mova m3, [o(pd_2048)]
+ punpckhwd m2, m0, m1 ;unpacked in1 in3
+ punpcklwd m0, m1 ;unpacked in0 in2
+ ITX_MUL2X_PACK 2, 1, 3, 1567, 3784
+ ITX_MUL2X_PACK 0, 1, 3, 2896, 2896
+ psubsw m1, m0, m2 ;high: out2 ;low: out3
+ paddsw m0, m2 ;high: out1 ;low: out0
+%endmacro
+
+%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack
+cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2
+ %define %%p1 m(i%1_%3_internal_8bpc)
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+%if has_epilogue
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jz %%end
+%endif
+ lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
+ call %%p1
+ RET
+%%end:
+%else
+ lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endif
+%endmacro
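+; each wrapper sets tx2q to the type2 internal function's .pass2 entry (the
+; second 1-D transform and store) before entering the type1 pass 1 code;
+; for dct_dct a zero eob skips both passes and runs the dc-only shortcut
+; that every INV_TXFM_*_FN macro places right after invoking this one.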
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x4, 6
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklqdq m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [coeffq], eobd ;0
+ pmulhrsw m0, m1
+ mova m1, m0
+ TAIL_CALL m(iadst_4x4_internal_8bpc).end2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+; itx16 relies on dct_dct being the first function. If you change the order, adjust `itx8_start` in itx16.
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
+
+cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0] ;high: in1 ;low: in0
+ mova m1, [coeffq+16*1] ;high: in3 ;low in2
+
+ IDCT4_1D_PACKED
+
+ mova m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m1, q0220
+ pshufb m0, m2 ;high: in1 ;low: in0
+ pshufb m1, m3, m2 ;high: in3 ;low :in2
+ jmp tx2q
+
+.pass2:
+ IDCT4_1D_PACKED
+
+ pxor m2, m2
+ mova [coeffq+16*0], m2
+ mova [coeffq+16*1], m2 ;memset(coeff, 0, sizeof(*coeff) * sh * sw);
+
+ ITX4_END 0, 1, 3, 2
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ call .main
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2 ;high: in3 ;low :in2
+ punpcklwd m0, m2 ;high: in1 ;low: in0
+ jmp tx2q
+
+.pass2:
+ call .main
+
+.end:
+ pxor m2, m2
+ mova [coeffq+16*0], m2
+ mova [coeffq+16*1], m2
+
+.end2:
+ ITX4_END 0, 1, 2, 3
+
+ALIGN function_align
+cglobal_label .main
+ punpcklwd m2, m0, m1 ;unpacked in0 in2
+ punpckhwd m0, m1 ;unpacked in1 in3
+ mova m3, m0
+ pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2
+ pmaddwd m0, [o(pw_0_3344)] ;3344 * in3
+ paddd m1, m0 ;t2
+ pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+ pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
+ pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+ pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
+ paddd m4, m0 ;t0 + t3
+ pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
+ mova m0, [o(pd_2048)]
+ paddd m1, m0 ;t2 + 2048
+ paddd m2, m0
+ paddd m0, m4 ;t0 + t3 + 2048
+ paddd m5, m2 ;t1 + t3 + 2048
+ paddd m2, m4
+ paddd m2, m3 ;t0 + t1 - t3 + 2048
+ REPX {psrad x, 12}, m1, m0, m5, m2
+ packssdw m0, m5 ;high: out1 ;low: out0
+ packssdw m1, m2 ;high: out3 ;low: out2
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ call m(iadst_4x4_internal_8bpc).main
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2 ;high: in3 ;low :in2
+ punpckhwd m1, m2 ;high: in1 ;low: in0
+ jmp tx2q
+
+.pass2:
+ call m(iadst_4x4_internal_8bpc).main
+
+.end:
+ pxor m2, m2
+ mova [coeffq+16*0], m2
+ mova [coeffq+16*1], m2
+
+.end2:
+ ITX4_END 3, 2, 1, 0
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ mova m3, [o(pw_1697x8)]
+ pmulhrsw m2, m0, m3
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2 ;high: in3 ;low :in2
+ punpcklwd m0, m2 ;high: in1 ;low: in0
+ jmp tx2q
+
+.pass2:
+ mova m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_4x4_internal_8bpc).end
+
+%macro IWHT4_1D_PACKED 0
+ punpckhqdq m3, m0, m1 ;low: in1 high: in3
+ punpcklqdq m0, m1 ;low: in0 high: in2
+ psubw m2, m0, m3 ;low: in0 - in1 high: in2 - in3
+ paddw m0, m3 ;low: in0 + in1 high: in2 + in3
+ punpckhqdq m2, m2 ;t2 t2
+ punpcklqdq m0, m0 ;t0 t0
+ psubw m1, m0, m2
+ psraw m1, 1 ;t4 t4
+ psubw m1, m3 ;low: t1/out2 high: t3/out1
+ psubw m0, m1 ;high: out0
+ paddw m2, m1 ;low: out3
+%endmacro
+
+INIT_XMM sse2
+cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ pxor m2, m2
+ mova [coeffq+16*0], m2
+ mova [coeffq+16*1], m2
+ psraw m0, 2
+ psraw m1, 2
+ IWHT4_1D_PACKED
+ punpckhwd m0, m1
+ punpcklwd m3, m1, m2
+ punpckhdq m1, m0, m3
+ punpckldq m0, m3
+ IWHT4_1D_PACKED
+ shufpd m0, m2, 0x01
+ ITX4_END 0, 3, 2, 1, 0
+
+%macro IDCT8_1D_PACKED 0
+ mova m6, [o(pd_2048)]
+ punpckhwd m4, m0, m3 ;unpacked in1 in7
+ punpcklwd m0, m2 ;unpacked in0 in4
+ punpckhwd m2, m1 ;unpacked in5 in3
+ punpcklwd m1, m3 ;unpacked in2 in6
+ ITX_MUL2X_PACK 4, 3, 6, 799, 4017 ;low: t7a high: t4a
+ ITX_MUL2X_PACK 2, 3, 6, 3406, 2276 ;low: t6a high: t5a
+ ITX_MUL2X_PACK 1, 3, 6, 1567, 3784 ;low: t3 high: t2
+ psubsw m3, m4, m2 ;low: t6a high: t5a
+ paddsw m4, m2 ;low: t7 high: t4
+ pshufb m3, [o(deint_shuf1)]
+ ITX_MUL2X_PACK 0, 2, 6, 2896, 2896 ;low: t0 high: t1
+ ITX_MUL2X_PACK 3, 2, 6, 2896, 2896 ;low: t6 high: t5
+ psubsw m2, m0, m1 ;low: tmp3 high: tmp2
+ paddsw m0, m1 ;low: tmp0 high: tmp1
+ punpcklqdq m1, m4, m3 ;low: t7 high: t6
+ punpckhqdq m4, m3 ;low: t4 high: t5
+ psubsw m3, m0, m1 ;low: out7 high: out6
+ paddsw m0, m1 ;low: out0 high: out1
+ paddsw m1, m2, m4 ;low: out3 high: out2
+ psubsw m2, m4 ;low: out4 high: out5
+%endmacro
+
+;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
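+;with coef1 = coef2 = 2896 this reduces to the ~1/sqrt(2)-scaled sum/difference
+;dst1 = ((src1 - src2) * 2896 + rnd) >> 12, dst2 = ((src1 + src2) * 2896 + rnd) >> 12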
+%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1
+ punpckhwd m%4, m%1, m%2
+ punpcklwd m%1, m%2
+%if %7 < 8
+ pmaddwd m%2, m%7, m%1
+ pmaddwd m%3, m%7, m%4
+%else
+ mova m%2, [o(pw_%7_%6)]
+%if %8
+ pmaddwd m%3, m%1, m%2
+ pmaddwd m%2, m%4
+%else
+ pmaddwd m%3, m%4, m%2
+ pmaddwd m%2, m%1
+%endif
+%endif
+ paddd m%3, m%5
+ paddd m%2, m%5
+ psrad m%3, 12
+ psrad m%2, 12
+%if %8
+ packssdw m%3, m%2
+%else
+ packssdw m%2, m%3 ;dst2
+%endif
+%if %7 < 8
+ pmaddwd m%4, m%6
+ pmaddwd m%1, m%6
+%elif %8
+ mova m%2, [o(pw_%6_m%7)]
+ pmaddwd m%4, m%2
+ pmaddwd m%1, m%2
+%else
+ mova m%3, [o(pw_%6_m%7)]
+ pmaddwd m%4, m%3
+ pmaddwd m%1, m%3
+%endif
+ paddd m%4, m%5
+ paddd m%1, m%5
+ psrad m%4, 12
+ psrad m%1, 12
+ packssdw m%1, m%4 ;dst1
+%endmacro
+
+%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
+ ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3
+ ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0
+ psubsw m%3, m%1, m%2 ;out2
+ paddsw m%2, m%1 ;out1
+ paddsw m%1, m%5, m%4 ;out0
+ psubsw m%4, m%5 ;out3
+%endmacro
+
+%macro WRITE_4X8 4 ;row[1-4]
+ WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X4 2, 3, 4, 5, 6, %1, %2, %3, %4
+%endmacro
+
+%macro INV_4X8 0
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhdq m1, m0, m2 ;low: in2 high: in3
+ punpckldq m0, m2 ;low: in0 high: in1
+ punpckldq m2, m3, m4 ;low: in4 high: in5
+ punpckhdq m3, m4 ;low: in6 high: in7
+%endmacro
+
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x8, 8
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklqdq m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+ pmulhrsw m0, [o(pw_2048)]
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ TAIL_CALL m(iadst_4x8_internal_8bpc).end3
+%endif
+%endmacro
+
+INIT_XMM ssse3
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+INV_TXFM_4X8_FN dct, identity
+
+cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+.pass1:
+ call m(idct_8x4_internal_8bpc).main
+ jmp m(iadst_4x8_internal_8bpc).pass1_end
+
+.pass2:
+ call .main
+ shufps m1, m1, q1032
+ shufps m3, m3, q1032
+ mova m4, [o(pw_2048)]
+ jmp m(iadst_4x8_internal_8bpc).end2
+
+ALIGN function_align
+cglobal_label .main
+ IDCT8_1D_PACKED
+ ret
+
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+.pass1:
+ call m(iadst_8x4_internal_8bpc).main
+
+.pass1_end:
+ INV_4X8
+ jmp tx2q
+
+.pass2:
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+ call .main
+ mova m4, [o(pw_2048)]
+ pxor m5, m5
+ psubw m5, m4
+
+.end:
+ punpcklqdq m4, m5
+
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ pxor m5, m5
+ mova [coeffq+16*0], m5
+ mova [coeffq+16*1], m5
+ mova [coeffq+16*2], m5
+ mova [coeffq+16*3], m5
+
+.end3:
+ WRITE_4X8 0, 1, 2, 3
+ RET
+
+ALIGN function_align
+cglobal_label .main
+ mova m6, [o(pd_2048)]
+ punpckhwd m4, m3, m0 ;unpacked in7 in0
+ punpckhwd m5, m2, m1 ;unpacked in5 in2
+ punpcklwd m1, m2 ;unpacked in3 in4
+ punpcklwd m0, m3 ;unpacked in1 in6
+ ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a
+ ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a
+ ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a
+ ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a
+
+ psubsw m3, m4, m1 ;low: t4 high: t5
+ paddsw m4, m1 ;low: t0 high: t1
+ psubsw m2, m5, m0 ;low: t6 high: t7
+ paddsw m5, m0 ;low: t2 high: t3
+
+ shufps m1, m3, m2, q1032
+ punpckhwd m2, m1
+ punpcklwd m3, m1
+ ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a
+ ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a
+
+ psubsw m1, m4, m5 ;low: t2 high: t3
+ paddsw m4, m5 ;low: out0 high: -out7
+ psubsw m5, m3, m2 ;low: t7 high: t6
+ paddsw m3, m2 ;low: out6 high: -out1
+ shufps m0, m4, m3, q3210 ;low: out0 high: -out1
+ shufps m3, m4, q3210 ;low: out6 high: -out7
+
+ mova m2, [o(pw_2896_m2896)]
+ mova m7, [o(pw_2896_2896)]
+ shufps m4, m1, m5, q1032 ;low: t3 high: t7
+ shufps m1, m5, q3210 ;low: t2 high: t6
+ punpcklwd m5, m1, m4
+ punpckhwd m1, m4
+ pmaddwd m4, m2, m1 ;-out5
+ pmaddwd m2, m5 ; out4
+ pmaddwd m1, m7 ; out2
+ pmaddwd m5, m7 ;-out3
+ REPX {paddd x, m6}, m4, m2, m1, m5
+ REPX {psrad x, 12}, m4, m2, m1, m5
+ packssdw m1, m5 ;low: out2 high: -out3
+ packssdw m2, m4 ;low: out4 high: -out5
+ ret
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+.pass1:
+ call m(iadst_8x4_internal_8bpc).main
+
+ punpcklwd m4, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m5, m1, m0
+ punpckhwd m1, m0
+ punpckldq m2, m3, m1 ;low: in4 high: in5
+ punpckhdq m3, m1 ;low: in6 high: in7
+ punpckldq m0, m4, m5 ;low: in0 high: in1
+ punpckhdq m1, m4, m5 ;low: in2 high: in3
+ jmp tx2q
+
+.pass2:
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+ call m(iadst_4x8_internal_8bpc).main
+
+ mova m4, m0
+ mova m5, m1
+ pshufd m0, m3, q1032
+ pshufd m1, m2, q1032
+ pshufd m2, m5, q1032
+ pshufd m3, m4, q1032
+ mova m5, [o(pw_2048)]
+ pxor m4, m4
+ psubw m4, m5
+ jmp m(iadst_4x8_internal_8bpc).end
+
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+.pass1:
+ mova m7, [o(pw_1697x8)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(iadst_4x8_internal_8bpc).pass1_end
+
+.pass2:
+ mova m4, [o(pw_4096)]
+ jmp m(iadst_4x8_internal_8bpc).end2
+
+
+%macro WRITE_8X2 5 ;coefs[1-2], tmp[1-3]
+ movq m%3, [dstq ]
+ movq m%4, [dstq+strideq]
+ pxor m%5, m%5
+ punpcklbw m%3, m%5 ;extend byte to word
+ punpcklbw m%4, m%5 ;extend byte to word
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ movq [dstq ], m%3
+ punpckhqdq m%3, m%3
+ movq [dstq+strideq], m%3
+%endmacro
+
+%macro WRITE_8X4 7 ;coefs[1-4], tmp[1-3]
+ WRITE_8X2 %1, %2, %5, %6, %7
+ lea dstq, [dstq+strideq*2]
+ WRITE_8X2 %3, %4, %5, %6, %7
+%endmacro
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x4, 8
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklqdq m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+ mova m2, [o(pw_2048)]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ TAIL_CALL m(iadst_8x4_internal_8bpc).end2
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
+
+cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+ call m(idct_4x8_internal_8bpc).main
+
+ mova m4, [o(deint_shuf1)]
+ mova m5, [o(deint_shuf2)]
+ pshufb m0, m4
+ pshufb m1, m5
+ pshufb m2, m4
+ pshufb m3, m5
+ punpckhdq m4, m0, m1
+ punpckldq m0, m1
+ punpckhdq m5, m2, m3
+ punpckldq m2, m3
+ punpckhqdq m1, m0, m2 ;in1
+ punpcklqdq m0, m2 ;in0
+ punpckhqdq m3, m4, m5 ;in3
+ punpcklqdq m2, m4, m5 ;in2
+ jmp tx2q
+
+.pass2:
+ call .main
+ jmp m(iadst_8x4_internal_8bpc).end
+
+ALIGN function_align
+cglobal_label .main
+ mova m6, [o(pd_2048)]
+ IDCT4_1D 0, 1, 2, 3, 4, 5, 6
+ ret
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+ call m(iadst_4x8_internal_8bpc).main
+
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ pxor m5, m5
+ psubsw m3, m5, m1
+ psubsw m5, m4
+ punpckhdq m4, m5, m3
+ punpckldq m5, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckhwd m1, m0, m5 ;in1
+ punpcklwd m0, m5 ;in0
+ punpcklwd m2, m3, m4 ;in2
+ punpckhwd m3, m4 ;in3
+ jmp tx2q
+
+.pass2:
+ call .main
+
+.end:
+ mova m4, [o(pw_2048)]
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+
+.end2:
+ pxor m6, m6
+ mova [coeffq+16*0], m6
+ mova [coeffq+16*1], m6
+ mova [coeffq+16*2], m6
+ mova [coeffq+16*3], m6
+.end3:
+ WRITE_8X4 0, 1, 2, 3, 4, 5, 6
+ RET
+
+ALIGN function_align
+cglobal_label .main
+ punpckhwd m6, m0, m2 ;unpacked in0 in2
+ punpcklwd m0, m2 ;unpacked in0 in2
+ punpckhwd m7, m1, m3 ;unpacked in1 in3
+ punpcklwd m1, m3 ;unpacked in1 in3
+
+ mova m2, [o(pw_3344_m3344)]
+ mova m4, [o(pw_0_3344)]
+ pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2
+ pmaddwd m5, m4, m7 ;3344 * in3
+ pmaddwd m2, m0
+ pmaddwd m4, m1
+ paddd m3, m5
+ paddd m2, m4
+ mova m4, [o(pd_2048)]
+ paddd m3, m4 ;t2 + 2048
+ paddd m2, m4
+ psrad m3, 12
+ psrad m2, 12
+ packssdw m2, m3 ;out2
+
+ pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+ pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
+ pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+ pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
+ paddd m3, m4 ;t0 + t3
+
+ pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
+ mova m4, [o(pd_2048)]
+ paddd m0, m4
+ paddd m4, m3 ;t0 + t3 + 2048
+ paddd m5, m0 ;t1 + t3 + 2048
+ paddd m3, m0
+ paddd m3, m1 ;t0 + t1 - t3 + 2048
+
+ psrad m4, 12 ;out0
+ psrad m5, 12 ;out1
+ psrad m3, 12 ;out3
+ packssdw m0, m4, m5 ;low: out0 high: out1
+
+ pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+ pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
+ pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+ pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
+ paddd m1, m4 ;t0 + t3
+ pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
+
+ mova m4, [o(pd_2048)]
+ paddd m6, m4
+ paddd m4, m1 ;t0 + t3 + 2048
+ paddd m5, m6 ;t1 + t3 + 2048
+ paddd m1, m6
+ paddd m1, m7 ;t0 + t1 - t3 + 2048
+
+ psrad m4, 12 ;out0
+ psrad m5, 12 ;out1
+ psrad m1, 12 ;out3
+ packssdw m3, m1 ;out3
+ packssdw m4, m5 ;low: out0 high: out1
+
+ punpckhqdq m1, m0, m4 ;out1
+ punpcklqdq m0, m4 ;out0
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+ call m(iadst_4x8_internal_8bpc).main
+
+ punpckhwd m5, m3, m2
+ punpcklwd m3, m2
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+
+ pxor m0, m0
+ psubsw m4, m0, m2
+ psubsw m0, m5
+ punpckhdq m2, m0, m4
+ punpckldq m0, m4
+ punpckhdq m4, m3, m1
+ punpckldq m3, m1
+ punpckhwd m1, m0, m3 ;in1
+ punpcklwd m0, m3 ;in0
+ punpckhwd m3, m2, m4 ;in3
+ punpcklwd m2, m4 ;in2
+ jmp tx2q
+
+.pass2:
+ call m(iadst_8x4_internal_8bpc).main
+ mova m4, m0
+ mova m5, m1
+ mova m0, m3
+ mova m1, m2
+ mova m2, m5
+ mova m3, m4
+ jmp m(iadst_8x4_internal_8bpc).end
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+ paddsw m0, m0
+ paddsw m1, m1
+ paddsw m2, m2
+ paddsw m3, m3
+
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m5, m4, m1
+ punpckldq m4, m1
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckhwd m1, m0, m4 ;in1
+ punpcklwd m0, m4 ;in0
+ punpcklwd m2, m3, m5 ;in2
+ punpckhwd m3, m5 ;in3
+ jmp tx2q
+
+.pass2:
+ mova m7, [o(pw_1697x8)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(iadst_8x4_internal_8bpc).end
+
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x8, 8, 16*4
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklwd m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mova m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m2
+ psrlw m2, 3
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+.end:
+ mov r3d, 2
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)]
+.loop:
+ WRITE_8X4 0, 0, 0, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*2]
+ dec r3d
+ jg .loop
+ jmp tx2q
+.end3:
+ RET
+%endif
+%endmacro
+
+%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
+%if %3
+ mova m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [%1+%2*0]
+ pmulhrsw m1, m7, [%1+%2*1]
+ pmulhrsw m2, m7, [%1+%2*2]
+ pmulhrsw m3, m7, [%1+%2*3]
+ pmulhrsw m4, m7, [%1+%2*4]
+ pmulhrsw m5, m7, [%1+%2*5]
+ pmulhrsw m6, m7, [%1+%2*6]
+ pmulhrsw m7, [%1+%2*7]
+%else
+ mova m0, [%1+%2*0]
+ mova m1, [%1+%2*1]
+ mova m2, [%1+%2*2]
+ mova m3, [%1+%2*3]
+ mova m4, [%1+%2*4]
+ mova m5, [%1+%2*5]
+ mova m6, [%1+%2*6]
+ mova m7, [%1+%2*7]
+%endif
+%endmacro
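+; is_rect2 pre-multiplies each loaded row by pw_2896x8, i.e. scales by
+; 2896/4096 (~1/sqrt(2)), the extra factor applied to rectangular transform sizes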
+
+%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
+ ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a
+ ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a
+ psubsw m%2, m%4, m%5 ;t6a
+ paddsw m%4, m%5 ;t7
+ psubsw m%5, m%1, m%3 ;t5a
+ paddsw m%1, m%3 ;t4
+ ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
+
+cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq, 16
+
+.pass1:
+ call .main
+
+.pass1_end:
+ mova m7, [o(pw_16384)]
+
+.pass1_end1:
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+
+.pass1_end2:
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, [rsp+gprsize+16*0]
+
+cglobal_label .pass1_end3
+ punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53
+ punpckhwd m1, m5 ;14 54 15 55 16 56 17 57
+ punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47
+ punpcklwd m0, m4 ;00 40 01 41 02 42 03 43
+ punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77
+ punpcklwd m3, m7 ;30 70 31 71 32 72 33 73
+ punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77
+ punpcklwd m1, m4 ;14 34 54 74 15 35 55 75
+ punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73
+ punpcklwd m6, m3 ;10 30 50 70 11 31 51 71
+ mova [rsp+gprsize+16*2], m6
+ mova m6, [rsp+gprsize+16*1]
+ punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67
+ punpcklwd m2, m6 ;20 60 21 61 22 62 23 63
+ punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67
+ punpcklwd m5, m3 ;04 24 44 64 05 25 45 65
+ punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63
+ punpcklwd m0, m2 ;00 20 40 60 01 21 41 61
+
+ punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77
+ punpcklwd m6, m7 ;06 16 26 36 46 56 66 76
+ mova [rsp+gprsize+16*0], m2
+ punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72
+ punpckhwd m3, m4 ;03 13 23 33 43 53 63 73
+ punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74
+ punpckhwd m5, m1 ;05 15 25 35 45 55 65 75
+ mova m7, [rsp+gprsize+16*2]
+ punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71
+ punpcklwd m0, m7 ;00 10 20 30 40 50 60 70
+ mova m7, [rsp+gprsize+16*0]
+ jmp tx2q
+
+.pass2:
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+
+.pass2_main:
+ call .main
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+
+.end2:
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, [rsp+gprsize+16*0]
+ mova [rsp+gprsize+16*2], m5
+ mova [rsp+gprsize+16*0], m7
+
+.end3:
+ WRITE_8X4 0, 1, 2, 3, 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ WRITE_8X4 4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7
+ jmp tx2q
+
+.end4:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ ret
+
+ALIGN function_align
+cglobal_label .main
+ mova [rsp+gprsize*2+16*0], m7
+ mova [rsp+gprsize*2+16*1], m3
+ mova [rsp+gprsize*2+16*2], m1
+ mova m7, [o(pd_2048)]
+ IDCT4_1D 0, 2, 4, 6, 1, 3, 7
+ mova m3, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*2], m2
+ mova m2, [rsp+gprsize*2+16*1]
+ mova [rsp+gprsize*2+16*1], m4
+ mova m4, [rsp+gprsize*2+16*0]
+ mova [rsp+gprsize*2+16*0], m6
+ IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7
+ mova m6, [rsp+gprsize*2+16*0]
+ psubsw m7, m0, m4 ;out7
+ paddsw m0, m4 ;out0
+ mova [rsp+gprsize*2+16*0], m7
+ mova m1, [rsp+gprsize*2+16*2]
+ psubsw m4, m6, m3 ;out4
+ paddsw m3, m6 ;out3
+ mova m7, [rsp+gprsize*2+16*1]
+ psubsw m6, m1, m5 ;out6
+ paddsw m1, m5 ;out1
+ psubsw m5, m7, m2 ;out5
+ paddsw m2, m7 ;out2
+ ret
+
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq, 16
+
+.pass1:
+ call .main
+ call .main_pass1_end
+
+.pass1_end:
+ mova m7, [o(pw_16384)]
+
+.pass1_end1:
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+ pxor m6, m6
+ psubw m6, m7
+ mova m7, m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end2
+
+ALIGN function_align
+.pass2:
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+
+.pass2_main:
+ call .main
+ call .main_pass2_end
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+ pxor m6, m6
+ psubw m6, m7
+ mova m7, m6
+ jmp m(idct_8x8_internal_8bpc).end2
+
+ALIGN function_align
+cglobal_label .main
+ mova [rsp+gprsize*2+16*0], m7
+ mova [rsp+gprsize*2+16*1], m3
+ mova [rsp+gprsize*2+16*2], m4
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a
+ ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a
+ paddsw m3, m2, m6 ;t2
+ psubsw m2, m6 ;t6
+ paddsw m4, m5, m1 ;t3
+ psubsw m5, m1 ;t7
+ ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a
+
+ mova m6, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*2], m5
+ mova m1, [rsp+gprsize*2+16*1]
+ mova [rsp+gprsize*2+16*1], m2
+ mova m5, [rsp+gprsize*2+16*0]
+ mova [rsp+gprsize*2+16*0], m3
+ ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a
+ ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a
+ psubsw m2, m0, m6 ;t4
+ paddsw m0, m6 ;t0
+ paddsw m3, m5, m1 ;t1
+ psubsw m5, m1 ;t5
+ ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a
+
+ mova m7, [rsp+gprsize*2+16*0]
+ paddsw m1, m3, m4 ;-out7
+ psubsw m3, m4 ;t3
+ mova [rsp+gprsize*2+16*0], m1
+ psubsw m4, m0, m7 ;t2
+ paddsw m0, m7 ;out0
+ mova m6, [rsp+gprsize*2+16*2]
+ mova m7, [rsp+gprsize*2+16*1]
+ paddsw m1, m5, m6 ;-out1
+ psubsw m5, m6 ;t6
+ paddsw m6, m2, m7 ;out6
+ psubsw m2, m7 ;t7
+ ret
+ALIGN function_align
+.main_pass1_end:
+ mova [rsp+gprsize*2+16*1], m1
+ mova [rsp+gprsize*2+16*2], m6
+ punpckhwd m1, m4, m3
+ punpcklwd m4, m3
+ punpckhwd m7, m5, m2
+ punpcklwd m5, m2
+ mova m2, [o(pw_2896_2896)]
+ mova m6, [o(pd_2048)]
+ pmaddwd m3, m2, m7
+ pmaddwd m2, m5
+ paddd m3, m6
+ paddd m2, m6
+ psrad m3, 12
+ psrad m2, 12
+ packssdw m2, m3 ;out2
+ mova m3, [o(pw_2896_m2896)]
+ pmaddwd m7, m3
+ pmaddwd m5, m3
+ paddd m7, m6
+ paddd m5, m6
+ psrad m7, 12
+ psrad m5, 12
+ packssdw m5, m7 ;-out5
+ mova m3, [o(pw_2896_2896)]
+ pmaddwd m7, m3, m1
+ pmaddwd m3, m4
+ paddd m7, m6
+ paddd m3, m6
+ psrad m7, 12
+ psrad m3, 12
+ packssdw m3, m7 ;-out3
+ mova m7, [o(pw_2896_m2896)]
+ pmaddwd m1, m7
+ pmaddwd m4, m7
+ paddd m1, m6
+ paddd m4, m6
+ psrad m1, 12
+ psrad m4, 12
+ packssdw m4, m1 ;out4
+ mova m1, [rsp+gprsize*2+16*1]
+ mova m6, [rsp+gprsize*2+16*2]
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ paddsw m7, m4, m3 ;t2 + t3
+ psubsw m4, m3 ;t2 - t3
+ paddsw m3, m5, m2 ;t6 + t7
+ psubsw m5, m2 ;t6 - t7
+ mova m2, [o(pw_2896x8)]
+ pmulhrsw m4, m2 ;out4
+ pmulhrsw m5, m2 ;-out5
+ pmulhrsw m7, m2 ;-out3
+ pmulhrsw m2, m3 ;out2
+ mova m3, m7
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq, 16
+
+.pass1:
+ call m(iadst_8x8_internal_8bpc).main
+ call m(iadst_8x8_internal_8bpc).main_pass1_end
+
+.pass1_end:
+ mova m7, [o(pw_m16384)]
+
+.pass1_end1:
+ pmulhrsw m1, m7
+ mova [rsp+gprsize+16*1], m1
+ mova m1, m6
+ mova m6, m2
+ pmulhrsw m2, m5, m7
+ mova m5, m6
+ mova m6, m4
+ pmulhrsw m4, m3, m7
+ mova m3, m6
+ mova m6, m0
+ mova m0, m7
+ pxor m7, m7
+ psubw m7, m0
+ pmulhrsw m0, [rsp+gprsize+16*0]
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+ALIGN function_align
+.pass2:
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+
+.pass2_main:
+ call m(iadst_8x8_internal_8bpc).main
+ call m(iadst_8x8_internal_8bpc).main_pass2_end
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*2], m2
+ mova m2, m0
+ pxor m0, m0
+ psubw m0, m7
+ mova m7, m2
+ pmulhrsw m1, m0
+ pmulhrsw m2, m5, m0
+ mova [rsp+gprsize+16*1], m1
+ mova m5, m4
+ mova m1, m6
+ pmulhrsw m4, m3, m0
+ pmulhrsw m0, [rsp+gprsize+16*0]
+ mova m3, m5
+ mova [rsp+gprsize+16*0], m7
+ jmp m(idct_8x8_internal_8bpc).end3
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq, 16
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+ALIGN function_align
+.pass2:
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+
+.end:
+ pmulhrsw m7, [o(pw_4096)]
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_4096)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ mova [rsp+gprsize+16*2], m5
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal_8bpc).end3
+
+
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x16, 8
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklwd m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [coeffq], eobd
+ pmulhrsw m0, [o(pw_16384)]
+ pmulhrsw m0, m1
+ pmulhrsw m0, [o(pw_2048)]
+.end:
+ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
+ RET
+%endif
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+INV_TXFM_4X16_FN dct, identity
+
+cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(idct_4x8_internal_8bpc).pass1)]
+
+.pass1:
+ mova m0, [coeffq+16*1]
+ mova m1, [coeffq+16*3]
+ mova m2, [coeffq+16*5]
+ mova m3, [coeffq+16*7]
+ push tx2q
+ lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)]
+ jmp r3
+
+.pass1_2:
+ mova [coeffq+16*1], m0
+ mova [coeffq+16*3], m1
+ mova [coeffq+16*5], m2
+ mova [coeffq+16*7], m3
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*2]
+ mova m2, [coeffq+16*4]
+ mova m3, [coeffq+16*6]
+ lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_end)]
+ jmp r3
+
+.pass1_end:
+ pop tx2q
+
+ mova m4, [coeffq+16*1]
+ mova m5, [coeffq+16*3]
+ mova m6, [coeffq+16*5]
+ mova m7, [o(pw_16384)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+
+ pmulhrsw m7, [coeffq+16*7]
+ mova [coeffq+16*7], m7
+ jmp tx2q
+
+.pass2:
+ call m(idct_16x4_internal_8bpc).main
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [coeffq+16*7]
+ mova [coeffq+16*4], m4
+
+.end1:
+ mova [coeffq+16*5], m5
+ mova [coeffq+16*6], m6
+ mov r3, coeffq
+ WRITE_4X8 0, 1, 3, 2
+
+ mova m0, [r3+16*4]
+ mova m1, [r3+16*5]
+ mova m2, [r3+16*6]
+ mova m3, m7
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X8 0, 1, 3, 2
+
+.end2:
+ pxor m7, m7
+ REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ ret
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(iadst_4x8_internal_8bpc).pass1)]
+ jmp m(idct_4x16_internal_8bpc).pass1
+
+.pass2:
+ call m(iadst_16x4_internal_8bpc).main
+ call m(iadst_16x4_internal_8bpc).main_pass2_end
+
+ punpcklqdq m6, m5, m4 ;low: -out5 high: -out7
+ punpckhqdq m4, m5 ;low: out8 high: out10
+ punpcklqdq m5, m7, m2 ;low: out4 high: out6
+ punpckhqdq m2, m7 ;low: -out9 high: -out11
+ mova [coeffq+16*4], m2
+ mova [coeffq+16*5], m6
+ mova m2, [coeffq+16*6]
+ mova m6, [coeffq+16*7]
+ punpckhqdq m1, m6, m0 ;low: -out13 high: -out15
+ punpcklqdq m0, m6 ;low: out0 high: out2
+ punpckhqdq m6, m3, m2 ;low: out12 high: out14
+ punpcklqdq m2, m3 ;low: -out1 high: -out3
+
+ mova m7, [o(pw_2048)]
+
+.end1:
+ REPX {pmulhrsw x, m7}, m0, m5, m4, m6
+ pxor m3, m3
+ psubw m3, m7
+ mova m7, [coeffq+16*4]
+ REPX {pmulhrsw x, m3}, m2, m7, m1
+ pmulhrsw m3, [coeffq+16*5]
+ mova [coeffq+16*7], m5
+
+ punpckhqdq m5, m4, m7 ;low: out10 high: out11
+ punpcklqdq m4, m7 ;low: out8 high: out9
+ punpckhqdq m7, m6, m1 ;low: out14 high: out15
+ punpcklqdq m6, m1 ;low: out12 high: out13
+ punpckhqdq m1, m0, m2 ;low: out2 high: out3
+ punpcklqdq m0, m2 ;low: out0 high: out1
+ mova [coeffq+16*4], m4
+ mova m4, [coeffq+16*7]
+ punpcklqdq m2, m4, m3 ;low: out4 high: out5
+ punpckhqdq m4, m3 ;low: out6 high: out7
+ mova m3, m4
+
+.end2:
+ mova [coeffq+16*5], m5
+ mova [coeffq+16*6], m6
+ mov r3, coeffq
+ WRITE_4X8 0, 1, 2, 3
+
+ mova m0, [r3+16*4]
+ mova m1, [r3+16*5]
+ mova m2, [r3+16*6]
+ mova m3, m7
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X8 0, 1, 2, 3
+
+.end3:
+ pxor m7, m7
+ REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ ret
+
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)]
+ jmp m(idct_4x16_internal_8bpc).pass1
+
+.pass2:
+ call m(iadst_16x4_internal_8bpc).main
+ call m(iadst_16x4_internal_8bpc).main_pass2_end
+
+ punpckhqdq m6, m5, m4 ;low: out5 high: out7
+ punpcklqdq m4, m5 ;low: -out8 high: -out10
+ punpckhqdq m5, m7, m2 ;low: -out4 high: -out6
+ punpcklqdq m2, m7 ;low: out9 high: out11
+ mova [coeffq+16*4], m2
+ mova [coeffq+16*5], m6
+ mova m2, [coeffq+16*6]
+ mova m6, [coeffq+16*7]
+ punpcklqdq m1, m6, m0 ;low: out13 high: out15
+ punpckhqdq m0, m6 ;low: -out0 high: -out2
+ punpcklqdq m6, m3, m2 ;low: -out12 high: -out14
+ punpckhqdq m2, m3 ;low: out1 high: out3
+
+ mova m7, [o(pw_m2048)]
+ jmp m(iadst_4x16_internal_8bpc).end1
+
+
+INV_TXFM_4X16_FN identity, dct
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
+ pmulhrsw m%2, m%3, m%1
+%if %0 == 4 ; if downshifting by 1
+ pmulhrsw m%2, m%4
+%else
+ paddsw m%1, m%1
+%endif
+ paddsw m%1, m%2
+%endmacro
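+; scales x by 1697/2048 + 2 ~= 2*sqrt(2), the identity16 per-pass factor; with
+; the optional pw_16384 argument the 1697/2048 term is halved and x is not
+; doubled, giving ~= sqrt(2)*x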
+
+cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*1]
+ mova m6, [o(pw_1697x8)]
+ mova m1, [coeffq+16*3]
+ mova m2, [coeffq+16*5]
+ mova m3, [coeffq+16*7]
+ pcmpeqw m7, m7
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_2)]
+.pass1:
+ pmulhrsw m4, m6, m0
+ pmulhrsw m5, m6, m1
+ pavgw m4, m0
+ pcmpeqw m0, m7
+ pavgw m5, m1
+ pcmpeqw m1, m7
+ pandn m0, m4
+ pmulhrsw m4, m6, m2
+ pandn m1, m5
+ pmulhrsw m5, m6, m3
+ pavgw m4, m2
+ pcmpeqw m2, m7
+ pavgw m5, m3
+ pcmpeqw m3, m7
+ pandn m2, m4
+ pandn m3, m5
+ jmp m(iadst_4x8_internal_8bpc).pass1_end
+.pass1_2:
+ mova [coeffq+16*1], m0
+ mova [coeffq+16*3], m1
+ mova [coeffq+16*5], m2
+ mova [coeffq+16*7], m3
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*2]
+ mova m2, [coeffq+16*4]
+ mova m3, [coeffq+16*6]
+ lea tx2q, [o(.pass1_end)]
+ jmp .pass1
+.pass1_end:
+ mova m4, [coeffq+16*1]
+ mova m5, [coeffq+16*3]
+ mova m6, [coeffq+16*5]
+ jmp r3
+.pass2:
+ mova m7, [o(pw_1697x16)]
+ mova [coeffq+16*6], m6
+ REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
+ mova m6, [coeffq+16*7]
+ IDTX16 6, 7, 7
+ mova [coeffq+16*7], m6
+ mova m6, [coeffq+16*6]
+ pmulhrsw m7, m6, [o(pw_1697x16)]
+ paddsw m6, m6
+ paddsw m6, m7
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [coeffq+16*7]
+ mova [coeffq+16*4], m4
+ jmp m(iadst_4x16_internal_8bpc).end2
+
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x4, 8
+%ifidn %1_%2, dct_dct
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ mov r2d, 2
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)]
+.dconly:
+ pmulhrsw m0, m2
+ movd m2, [o(pw_2048)] ;intentionally rip-relative
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ pshuflw m0, m0, q0000
+ punpcklwd m0, m0
+ pxor m5, m5
+.dconly_loop:
+ mova m1, [dstq]
+ mova m3, [dstq+strideq]
+ punpckhbw m2, m1, m5
+ punpcklbw m1, m5
+ punpckhbw m4, m3, m5
+ punpcklbw m3, m5
+ paddw m2, m0
+ paddw m1, m0
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m1, m2
+ packuswb m3, m4
+ mova [dstq], m1
+ mova [dstq+strideq], m3
+ lea dstq, [dstq+strideq*2]
+ dec r2d
+ jg .dconly_loop
+ jmp tx2q
+.end:
+ RET
+%endif
+%endmacro
+
+%macro LOAD_7ROWS 2 ;src, stride
+ mova m0, [%1+%2*0]
+ mova m1, [%1+%2*1]
+ mova m2, [%1+%2*2]
+ mova m3, [%1+%2*3]
+ mova m4, [%1+%2*4]
+ mova m5, [%1+%2*5]
+ mova m6, [%1+%2*6]
+%endmacro
+
+%macro SAVE_7ROWS 2 ;src, stride
+ mova [%1+%2*0], m0
+ mova [%1+%2*1], m1
+ mova [%1+%2*2], m2
+ mova [%1+%2*3], m3
+ mova [%1+%2*4], m4
+ mova [%1+%2*5], m5
+ mova [%1+%2*6], m6
+%endmacro
+
+%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3]
+ punpckhwd m%5, m%4, m%1 ;packed in13 in3
+ punpcklwd m%1, m%4 ;packed in1 in15
+ punpcklwd m%4, m%3, m%2 ;packed in9 in7
+ punpckhwd m%2, m%3 ;packed in5 in11
+ mova m%7, [o(pd_2048)]
+ ITX_MUL2X_PACK %1, %6, %7, 401, 4076, 1 ;low: t8a high: t15a
+ ITX_MUL2X_PACK %4, %6, %7, 3166, 2598, 1 ;low: t9a high: t14a
+ ITX_MUL2X_PACK %2, %6, %7, 1931, 3612, 1 ;low: t10a high: t13a
+ ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1 ;low: t11a high: t12a
+ psubsw m%6, m%1, m%4 ;low: t9 high: t14
+ paddsw m%1, m%4 ;low: t8 high: t15
+ psubsw m%4, m%5, m%2 ;low: t10 high: t13
+ paddsw m%5, m%2 ;low: t11 high: t12
+ mova m%2, [o(deint_shuf2)]
+ pshufb m%6, m%2
+ pshufb m%4, m%2
+ ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a
+ ITX_MUL2X_PACK %4, %3, %7, m3784, 1567, 1 ;low: t10a high: t13a
+ psubsw m%3, m%1, m%5 ;low: t11a high: t12a
+ paddsw m%1, m%5 ;low: t8a high: t15a
+ psubsw m%5, m%6, m%4 ;low: t10 high: t13
+ paddsw m%6, m%4 ;low: t9 high: t14
+ pshufb m%3, m%2
+ pshufb m%5, m%2
+ ITX_MUL2X_PACK %3, %2, %7, 2896, 2896, 4 ;t12, t11
+ ITX_MUL2X_PACK %5, %4, %7, 2896, 2896, 4 ;t13a, t10a
+ packssdw m%2, m%4 ;low: t11 high: t10a
+ packssdw m%3, m%5 ;low: t12 high: t13a
+ punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14
+ punpcklqdq m%1, m%6 ;low: t8a high: t9
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
+
+cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_7ROWS coeffq, 16
+ call .main
+
+.pass1_end:
+ punpckhwd m7, m0, m2 ;packed out1, out5
+ punpcklwd m0, m2 ;packed out0, out4
+ punpcklwd m2, m1, m3 ;packed out3, out7
+ punpckhwd m1, m3 ;packed out2, out6
+ mova [coeffq+16*6], m7
+ mova m7, [coeffq+16*7]
+ punpckhwd m3, m4, m6 ;packed out9, out13
+ punpcklwd m4, m6 ;packed out8, out12
+ punpcklwd m6, m5, m7 ;packed out11, out15
+ punpckhwd m5, m7 ;packed out10, out14
+
+.pass1_end2:
+ mova m7, [o(pw_16384)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [coeffq+16*6]
+ mova [coeffq+16*6], m7
+
+.pass1_end3:
+ punpckhwd m7, m3, m6 ;packed 9, 11, 13, 15 high
+ punpcklwd m3, m6 ;packed 9, 11, 13, 15 low
+ punpckhwd m6, m4, m5 ;packed 8, 10, 12, 14 high
+ punpcklwd m4, m5 ;packed 8, 10, 12, 14 low
+ punpckhwd m5, m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(1)
+ punpcklwd m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(0)
+ punpckhwd m3, m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(3)
+ punpcklwd m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(2)
+ mova [coeffq+16*7], m3
+ mova m3, [coeffq+16*6]
+ punpckhwd m7, m3, m2 ;packed 1, 3, 5, 7 high
+ punpcklwd m3, m2 ;packed 1, 3, 5, 7 low
+ punpckhwd m2, m0, m1 ;packed 0, 2, 4, 6 high
+ punpcklwd m0, m1 ;packed 0, 2, 4, 6 low
+ punpckhwd m1, m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(1)
+ punpcklwd m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(0)
+ punpckhwd m3, m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(3)
+ punpcklwd m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(2)
+ jmp tx2q
+
+.pass2:
+ lea tx2q, [o(m(idct_8x4_internal_8bpc).pass2)]
+
+.pass2_end:
+ mova [coeffq+16*4], m4
+ mova [coeffq+16*5], m5
+ mova [coeffq+16*6], m6
+ lea r3, [dstq+8]
+ call tx2q
+
+ add coeffq, 16*4
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ mova m2, [coeffq+16*2]
+ mova m3, [coeffq+16*3]
+ mov dstq, r3
+ jmp tx2q
+
+ALIGN function_align
+cglobal_label .main
+ punpckhqdq m7, m0, m1 ;low:in1 high:in3
+ punpcklqdq m0, m1
+ punpcklqdq m1, m2, m3
+ punpckhqdq m3, m2 ;low:in7 high:in5
+ mova [coeffq+16*4], m7
+ mova [coeffq+16*5], m3
+ mova m7, [coeffq+16*7]
+ punpcklqdq m2, m4, m5
+ punpckhqdq m4, m5 ;low:in9 high:in11
+ punpcklqdq m3, m6, m7
+ punpckhqdq m7, m6 ;low:in15 high:in13
+ mova [coeffq+16*6], m4
+ IDCT8_1D_PACKED
+ mova m6, [coeffq+16*4]
+ mova m4, [coeffq+16*5]
+ mova m5, [coeffq+16*6]
+ mova [coeffq+16*4], m1
+ mova [coeffq+16*5], m2
+ mova [coeffq+16*6], m3
+
+ IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3
+
+ mova m1, [coeffq+16*4]
+ psubsw m3, m0, m7 ;low:out15 high:out14
+ paddsw m0, m7 ;low:out0 high:out1
+ psubsw m7, m1, m5 ;low:out12 high:out13
+ paddsw m1, m5 ;low:out3 high:out2
+ mova [coeffq+16*7], m3
+ mova m2, [coeffq+16*5]
+ mova m3, [coeffq+16*6]
+ psubsw m5, m2, m4 ;low:out11 high:out10
+ paddsw m2, m4 ;low:out4 high:out5
+ psubsw m4, m3, m6 ;low:out8 high:out9
+ paddsw m3, m6 ;low:out7 high:out6
+ mova m6, m7
+ ret
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_7ROWS coeffq, 16
+ call .main
+ call .main_pass1_end
+
+ punpckhwd m6, m7, m0 ;packed -out11, -out15
+ punpcklwd m0, m7 ;packed out0, out4
+ punpcklwd m7, m3, m4 ;packed -out3, -out7
+ punpckhwd m4, m3 ;packed out8, out12
+ mova m1, [coeffq+16*6]
+ punpcklwd m3, m1, m5 ;packed -out1, -out5
+ punpckhwd m5, m1 ;packed out10, out14
+ mova m1, [coeffq+16*7]
+ mova [coeffq+16*6], m3
+ mova [coeffq+16*7], m7
+ punpckhwd m3, m2, m1 ;packed -out9, -out13
+ punpcklwd m1, m2 ;packed out2, out6
+
+ mova m7, [o(pw_16384)]
+
+.pass1_end:
+ REPX {pmulhrsw x, m7}, m0, m1, m4, m5
+ pxor m2, m2
+ psubw m2, m7
+ mova m7, [coeffq+16*6]
+ REPX {pmulhrsw x, m2}, m7, m3, m6
+ pmulhrsw m2, [coeffq+16*7]
+ mova [coeffq+16*6], m7
+ jmp m(idct_16x4_internal_8bpc).pass1_end3
+
+.pass2:
+ lea tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)]
+ jmp m(idct_16x4_internal_8bpc).pass2_end
+
+ALIGN function_align
+cglobal_label .main
+ mova [coeffq+16*6], m0
+ pshufd m0, m1, q1032
+ pshufd m2, m2, q1032
+ punpckhwd m1, m6, m0 ;packed in13, in2
+ punpcklwd m0, m6 ;packed in3, in12
+ punpckhwd m7, m5, m2 ;packed in11, in4
+ punpcklwd m2, m5 ;packed in5, in10
+ mova m6, [o(pd_2048)]
+ ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3
+ ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5
+ ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11
+ ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13
+ psubsw m5, m1, m2 ;low:t10a high:t11a
+ paddsw m1, m2 ;low:t2a high:t3a
+ psubsw m2, m7, m0 ;low:t12a high:t13a
+ paddsw m7, m0 ;low:t4a high:t5a
+ punpcklqdq m0, m5
+ punpckhwd m0, m5 ;packed t10a, t11a
+ punpcklqdq m5, m2
+ punpckhwd m2, m5 ;packed t13a, t12a
+ ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11
+ ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13
+ mova [coeffq+16*4], m1
+ mova [coeffq+16*5], m7
+ mova m1, [coeffq+16*6]
+ mova m7, [coeffq+16*7]
+ pshufd m1, m1, q1032
+ pshufd m3, m3, q1032
+ punpckhwd m5, m7, m1 ;packed in15, in0
+ punpcklwd m1, m7 ;packed in1, in14
+ punpckhwd m7, m4, m3 ;packed in9, in6
+ punpcklwd m3, m4 ;packed in7, in8
+ ITX_MUL2X_PACK 5, 4, 6, 201, 4091 ;low:t0 high:t1
+ ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7
+ ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9
+ ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15
+ psubsw m4, m5, m3 ;low:t8a high:t9a
+ paddsw m5, m3 ;low:t0a high:t1a
+ psubsw m3, m7, m1 ;low:t14a high:t15a
+ paddsw m7, m1 ;low:t6a high:t7a
+ punpcklqdq m1, m4
+ punpckhwd m1, m4 ;packed t8a, t9a
+ punpcklqdq m4, m3
+ punpckhwd m3, m4 ;packed t15a, t14a
+ ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9
+ ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1 ;low:t14 high:t15
+ paddsw m4, m1, m2 ;low:t12a high:t13a
+ psubsw m1, m2 ;low:t8a high:t9a
+ psubsw m2, m0, m3 ;low:t14a high:t15a
+ paddsw m0, m3 ;low:t10a high:t11a
+ punpcklqdq m3, m1
+ punpckhwd m3, m1 ;packed t12a, t13a
+ punpcklqdq m1, m2
+ punpckhwd m2, m1 ;packed t15a, t14a
+ ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13
+ ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15
+ psubsw m1, m3, m2 ;low:t14a high:t15a
+ paddsw m3, m2 ;low:out2 high:-out13
+ psubsw m2, m4, m0 ;low:t10 high:t11
+ paddsw m0, m4 ;low:-out1 high:out14
+ mova [coeffq+16*6], m0
+ mova [coeffq+16*7], m3
+ mova m0, [coeffq+16*4]
+ mova m3, [coeffq+16*5]
+ psubsw m4, m5, m3 ;low:t4 high:t5
+ paddsw m5, m3 ;low:t0 high:t1
+ psubsw m3, m0, m7 ;low:t6 high:t7
+ paddsw m0, m7 ;low:t2 high:t3
+ punpcklqdq m7, m4
+ punpckhwd m7, m4 ;packed t4, t5
+ punpcklqdq m4, m3
+ punpckhwd m3, m4 ;packed t7, t6
+ ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a
+ ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a
+ psubsw m4, m5, m0 ;low:t2a high:t3a
+ paddsw m0, m5 ;low:out0 high:-out15
+ psubsw m5, m7, m3 ;low:t6 high:t7
+ paddsw m3, m7 ;low:-out3 high:out12
+ ret
+ALIGN function_align
+.main_pass1_end:
+ mova m7, [o(deint_shuf1)]
+ mova [coeffq+16*4], m0
+ mova [coeffq+16*5], m3
+ mova m0, [o(pw_2896_m2896)]
+ mova m3, [o(pw_2896_2896)]
+ pshufb m1, m7 ;t14a t15a
+ pshufb m2, m7 ;t10 t11
+ pshufb m4, m7 ;t2a t3a
+ pshufb m5, m7 ;t6 t7
+ pmaddwd m7, m0, m2
+ pmaddwd m2, m3
+ paddd m7, m6
+ paddd m2, m6
+ psrad m7, 12
+ psrad m2, 12
+ packssdw m2, m7 ;low:out6 high:-out9
+ pmaddwd m7, m0, m4
+ pmaddwd m4, m3
+ paddd m7, m6
+ paddd m4, m6
+ psrad m7, 12
+ psrad m4, 12
+ packssdw m4, m7 ;low:-out7 high:out8
+ pmaddwd m7, m3, m5
+ pmaddwd m5, m0
+ paddd m7, m6
+ paddd m5, m6
+ psrad m7, 12
+ psrad m5, 12
+ packssdw m7, m5 ;low:out4 high:-out11
+ pmaddwd m5, m3, m1
+ pmaddwd m1, m0
+ paddd m5, m6
+ paddd m1, m6
+ psrad m5, 12
+ psrad m1, 12
+ packssdw m5, m1 ;low:-out5 high:out10
+ mova m0, [coeffq+16*4]
+ mova m3, [coeffq+16*5]
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ mova m7, [o(pw_2896x8)]
+ punpckhqdq m6, m2, m1 ;low:t11 high:t15a
+ punpcklqdq m2, m1 ;low:t10 high:t14a
+ psubsw m1, m2, m6
+ paddsw m2, m6
+ punpckhqdq m6, m4, m5 ;low:t3a high:t7
+ punpcklqdq m4, m5 ;low:t2a high:t6
+ psubsw m5, m4, m6
+ paddsw m4, m6
+ pmulhrsw m1, m7 ;low:-out9 high:out10
+ pmulhrsw m2, m7 ;low:out6 high:-out5
+ pmulhrsw m5, m7 ;low:out8 high:-out11
+ pmulhrsw m4, m7 ;low:-out7 high:out4
+ punpckhqdq m7, m4, m5 ;low:out4 high:-out11
+ punpcklqdq m4, m5 ;low:-out7 high:out8
+ punpckhqdq m5, m2, m1 ;low:-out5 high:out10
+ punpcklqdq m2, m1 ;low:out6 high:-out9
+ ret
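+; Note on the scaling above: pw_2896x8 holds 2896*8, and pmulhrsw with it
+; computes (x*2896 + 2048) >> 12 ~= x*2896/4096 ~= x/sqrt(2), since
+; 2896 ~= 4096/sqrt(2) ~= 2048*sqrt(2). .main_pass2_end takes this cheap
+; 16-bit path, whereas .main_pass1_end above widens to 32 bits with pmaddwd
+; against the pw_2896_2896 / pw_2896_m2896 pairs (giving 2896*(a+b) and
+; 2896*(a-b) in one step) before rounding, presumably to keep extra
+; precision for data that still has to go through the second pass.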
+
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_7ROWS coeffq, 16
+ call m(iadst_16x4_internal_8bpc).main
+ call m(iadst_16x4_internal_8bpc).main_pass1_end
+
+ punpcklwd m6, m7, m0 ;packed out11, out15
+ punpckhwd m0, m7 ;packed -out0, -out4
+ punpckhwd m7, m3, m4 ;packed out3, out7
+ punpcklwd m4, m3 ;packed -out8, -out12
+ mova m1, [coeffq+16*6]
+ punpckhwd m3, m1, m5 ;packed out1, out5
+ punpcklwd m5, m1 ;packed -out10, -out14
+ mova m1, [coeffq+16*7]
+ mova [coeffq+16*6], m3
+ mova [coeffq+16*7], m7
+ punpcklwd m3, m2, m1 ;packed out9, out13
+ punpckhwd m1, m2 ;packed -out2, -out6
+
+ mova m7, [o(pw_m16384)]
+ jmp m(iadst_16x4_internal_8bpc).pass1_end
+
+.pass2:
+ lea tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)]
+ jmp m(idct_16x4_internal_8bpc).pass2_end
+
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m1, [coeffq+16*6]
+ mova m0, [coeffq+16*5]
+ mova m2, [coeffq+16*7]
+ mova m6, [o(pw_1697x16)]
+ mova m7, [o(pw_16384)]
+ pmulhrsw m4, m6, m1
+ pmulhrsw m3, m6, m0
+ pmulhrsw m5, m6, m2
+ pmulhrsw m4, m7
+ pmulhrsw m3, m7
+ pmulhrsw m5, m7
+ paddsw m1, m4
+ paddsw m0, m3
+ paddsw m5, m2
+ mova m2, [coeffq+16*2]
+ mova m3, [coeffq+16*3]
+ mova m4, [coeffq+16*4]
+ mova [coeffq+16*6], m1
+ mova [coeffq+16*5], m0
+ mova [coeffq+16*7], m5
+ pmulhrsw m0, m6, m2
+ pmulhrsw m1, m6, m3
+ pmulhrsw m5, m6, m4
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+ pmulhrsw m5, m7
+ paddsw m2, m0
+ paddsw m3, m1
+ paddsw m4, m5
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ pmulhrsw m5, m6, m0
+ pmulhrsw m6, m1
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ paddsw m0, m5
+ paddsw m1, m6
+ mova m6, [coeffq+16*6]
+ mova m5, [coeffq+16*5]
+ punpckhwd m7, m0, m2 ;packed out1, out5
+ punpcklwd m0, m2 ;packed out0, out4
+ punpckhwd m2, m1, m3 ;packed out3, out7
+ punpcklwd m1, m3 ;packed out2, out6
+ mova [coeffq+16*6], m7
+ mova m7, [coeffq+16*7]
+ punpckhwd m3, m4, m6 ;packed out9, out13
+ punpcklwd m4, m6 ;packed out8, out12
+ punpckhwd m6, m5, m7 ;packed out11, out15
+ punpcklwd m5, m7 ;packed out10, out14
+ jmp m(idct_16x4_internal_8bpc).pass1_end3
+
+.pass2:
+ lea tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)]
+ jmp m(idct_16x4_internal_8bpc).pass2_end
+
+
+%macro SAVE_8ROWS 2 ;src, stride
+ mova [%1+%2*0], m0
+ mova [%1+%2*1], m1
+ mova [%1+%2*2], m2
+ mova [%1+%2*3], m3
+ mova [%1+%2*4], m4
+ mova [%1+%2*5], m5
+ mova [%1+%2*6], m6
+ mova [%1+%2*7], m7
+%endmacro
+
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x16, 8, 16*16
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklwd m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mova m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ psrlw m2, 3 ; pw_2048
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ mov r3d, 4
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
+.end:
+ RET
+%endif
+%endmacro
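+; The dct_dct case above is the DC-only shortcut for 8x16: the DC coefficient
+; is broadcast to every word lane (pshuflw q0000 + punpcklwd), scaled by
+; repeated pmulhrsw with pw_2896x8 (~= x/sqrt(2) each time) plus the
+; pw_16384 and derived pw_2048 factors (16384 >> 3 == 2048), and then added
+; to the destination by looping r3d = 4 times through the shared
+; m(inv_txfm_add_dct_dct_8x8_8bpc).loop add routine.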
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
+
+cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(idct_8x8_internal_8bpc).pass1)]
+
+.pass1:
+ LOAD_8ROWS coeffq+16*1, 32, 1
+ mov [rsp+gprsize+16*11], tx2q
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)]
+ jmp r3
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS coeffq+16*0, 32, 1
+ mov tx2q, [rsp+gprsize+16*11]
+ jmp r3
+
+.pass2:
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end)]
+
+.pass2_pre:
+ mova [coeffq+16*2 ], m1
+ mova [coeffq+16*6 ], m3
+ mova [coeffq+16*10], m5
+ mova [coeffq+16*14], m7
+ mova m1, m2
+ mova m2, m4
+ mova m3, m6
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*5 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*13]
+
+.pass2_main:
+ call m(idct_8x8_internal_8bpc).main
+
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ mova m0, [coeffq+16*2 ]
+ mova m1, [coeffq+16*6 ]
+ mova m2, [coeffq+16*10]
+ mova m3, [coeffq+16*14]
+ mova m4, [coeffq+16*3 ]
+ mova m5, [coeffq+16*7 ]
+ mova m6, [coeffq+16*11]
+ mova m7, [coeffq+16*15]
+ call m(idct_16x8_internal_8bpc).main
+
+ mov r3, dstq
+ lea dstq, [dstq+strideq*8]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(iadst_8x8_internal_8bpc).pass1)]
+ jmp m(idct_8x16_internal_8bpc).pass1
+
+.pass2:
+ lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)]
+
+.pass2_pre:
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+ mova m0, m2
+ mova m1, m3
+ mova m2, m4
+ mova m3, m5
+
+.pass2_main:
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*3 ]
+ mova m6, [coeffq+16*13]
+ mova m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*3], m4
+ mova [rsp+gprsize+16*4], m5
+ mova [rsp+gprsize+16*9], m6
+ mova [rsp+gprsize+32*5], m7
+ mova m4, [coeffq+16*5 ]
+ mova m5, [coeffq+16*7 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*11]
+
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+
+ mov r3, dstq
+ lea dstq, [dstq+strideq*8]
+ jmp m(iadst_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iadst_8x8_internal_8bpc).end
+
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)]
+ jmp m(idct_8x16_internal_8bpc).pass1
+
+.pass2:
+ lea tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)]
+ lea r3, [dstq+strideq*8]
+
+.pass2_pre:
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+ mova m0, m2
+ mova m1, m3
+ mova m2, m4
+ mova m3, m5
+
+.pass2_main:
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*3 ]
+ mova m6, [coeffq+16*13]
+ mova m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*3], m4
+ mova [rsp+gprsize+16*4], m5
+ mova [rsp+gprsize+16*9], m6
+ mova [rsp+gprsize+32*5], m7
+ mova m4, [coeffq+16*5 ]
+ mova m5, [coeffq+16*7 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*11]
+
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+ jmp m(iflipadst_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iflipadst_8x8_internal_8bpc).end
+
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq+16*1, 32, 1
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS coeffq+16*0, 32, 1
+ mov tx2q, r3
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+.pass2:
+ lea tx2q, [o(.end1)]
+
+.end:
+ mova [rsp+gprsize+16*0], m7
+ mova [rsp+gprsize+16*1], m6
+ mova m7, [o(pw_1697x16)]
+ REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
+ mova m6, [rsp+gprsize+16*1]
+ mova [rsp+gprsize+16*2], m5
+ IDTX16 6, 5, 7
+ mova m5, [rsp+gprsize+16*0]
+ IDTX16 5, 7, 7
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [rsp+gprsize+16*2]
+ mova [rsp+gprsize+16*0], m5
+ mova [rsp+gprsize+16*1], m6
+ mova [rsp+gprsize+16*2], m7
+ jmp m(idct_8x8_internal_8bpc).end3
+
+.end1:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ lea dstq, [dstq+strideq*2]
+ jmp .end
+
+
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x8, 8, 16*16
+%ifidn %1_%2, dct_dct
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ mov r2d, 4
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+.end:
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, identity
+
+cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq+16*0, 32, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+16*1, 32, 1
+ call .main
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass2:
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+
+ALIGN function_align
+cglobal_label .main
+ mova [rsp+gprsize*2+16*1], m2
+ mova [rsp+gprsize*2+16*2], m6
+ mova [rsp+gprsize*2+32*5], m5
+
+ mova m6, [o(pd_2048)]
+ ITX_MULSUB_2W 0, 7, 2, 5, 6, 401, 4076 ;t8a, t15a
+ ITX_MULSUB_2W 4, 3, 2, 5, 6, 3166, 2598 ;t9a, t14a
+ psubsw m2, m0, m4 ;t9
+ paddsw m0, m4 ;t8
+ psubsw m4, m7, m3 ;t14
+ paddsw m7, m3 ;t15
+ ITX_MULSUB_2W 4, 2, 3, 5, 6, 1567, 3784 ;t9a, t14a
+ mova m3, [rsp+gprsize*2+16*1]
+ mova m5, [rsp+gprsize*2+32*5]
+ mova [rsp+gprsize*2+16*1], m2
+ mova [rsp+gprsize*2+32*5], m4
+ mova m2, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*2], m7
+ ITX_MULSUB_2W 3, 5, 7, 4, 6, 1931, 3612 ;t10a, t13a
+ ITX_MULSUB_2W 2, 1, 7, 4, 6, 3920, 1189 ;t11a, t12a
+ psubsw m4, m2, m3 ;t10
+ paddsw m2, m3 ;t11
+ psubsw m3, m1, m5 ;t13
+ paddsw m1, m5 ;t12
+ ITX_MULSUB_2W 3, 4, 7, 5, 6, m3784, 1567 ;t10a, t13a
+ mova m7, [rsp+gprsize*2+32*5]
+ psubsw m6, m0, m2 ;t11a
+ paddsw m0, m2 ;t8a
+ paddsw m2, m7, m3 ;t9
+ psubsw m7, m3 ;t10
+ mova m5, [rsp+gprsize*2+16*0]
+ psubsw m3, m5, m0 ;out8
+ paddsw m0, m5 ;out7
+ mova [rsp+gprsize*2+32*5], m0
+ mova m5, [rsp+gprsize*2+16*9]
+ psubsw m0, m5, m2 ;out9
+ paddsw m2, m5 ;out6
+ mova [rsp+gprsize*2+16*0], m0
+ mova [rsp+gprsize*2+16*9], m2
+ mova m0, [rsp+gprsize*2+16*1]
+ mova m2, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*1], m3
+ psubsw m5, m0, m4 ;t13
+ paddsw m0, m4 ;t14
+ mova m3, [o(pd_2048)]
+ psubsw m4, m2, m1 ;t12a
+ paddsw m1, m2 ;t15a
+ mova [rsp+gprsize*2+16*2], m1
+ ITX_MULSUB_2W 5, 7, 1, 2, 3, 2896, 2896 ;t10a, t13a
+ ITX_MULSUB_2W 4, 6, 1, 2, 3, 2896, 2896 ;t11, t12
+ mova m3, [rsp+gprsize*2+16*8]
+ psubsw m2, m3, m5 ;out10
+ paddsw m3, m5 ;out5
+ mova m5, [rsp+gprsize*2+16*7]
+ mova [rsp+gprsize*2+16*8], m3
+ psubsw m3, m5, m4 ;out11
+ paddsw m5, m4 ;out4
+ mova m4, [rsp+gprsize*2+16*6]
+ mova [rsp+gprsize*2+16*7], m5
+ paddsw m5, m4, m6 ;out3
+ psubsw m4, m6 ;out12
+ mova m6, [rsp+gprsize*2+16*5]
+ mova [rsp+gprsize*2+16*6], m5
+ psubsw m5, m6, m7 ;out13
+ paddsw m6, m7 ;out2
+ mova m7, [rsp+gprsize*2+16*4]
+ mova [rsp+gprsize*2+16*5], m6
+ psubsw m6, m7, m0 ;out14
+ paddsw m7, m0 ;out1
+ mova m1, [rsp+gprsize*2+16*2]
+ mova m0, [rsp+gprsize*2+16*3]
+ mova [rsp+gprsize*2+16*4], m7
+ psubsw m7, m0, m1 ;out15
+ paddsw m0, m1 ;out0
+ mova [rsp+gprsize*2+16*3], m0
+ mova m1, [rsp+gprsize*2+16*0]
+ mova m0, [rsp+gprsize*2+16*1]
+ mova [rsp+gprsize*2+16*0], m7
+ ret
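+; Rough key to the fixed-point helpers used in .main above (the macros are
+; defined earlier in this file): ITX_MULSUB_2W rotates a register pair,
+; effectively producing round((a*c1 +/- b*c2)/4096) per output, with pd_2048
+; supplying the rounding bias before the >>12. The coefficient pairs are the
+; AV1 rotation constants round(4096*sin(a)) / round(4096*cos(a)), e.g.
+;   401/4076 ~= 4096*(sin,cos)(pi/32),  799/4017 ~= 4096*(sin,cos)(pi/16).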
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [coeffq+16*0 ]
+ pmulhrsw m1, m7, [coeffq+16*1 ]
+ pmulhrsw m2, m7, [coeffq+16*14]
+ pmulhrsw m3, m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3
+ pmulhrsw m0, m7, [coeffq+16*6 ]
+ pmulhrsw m1, m7, [coeffq+16*7 ]
+ pmulhrsw m2, m7, [coeffq+16*8 ]
+ pmulhrsw m3, m7, [coeffq+16*9 ]
+ mova [rsp+gprsize+16*3], m2
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ pmulhrsw m0, m7, [coeffq+16*2 ]
+ pmulhrsw m1, m7, [coeffq+16*3 ]
+ pmulhrsw m2, m7, [coeffq+16*4 ]
+ pmulhrsw m3, m7, [coeffq+16*5 ]
+ pmulhrsw m4, m7, [coeffq+16*10]
+ pmulhrsw m5, m7, [coeffq+16*11]
+ pmulhrsw m6, m7, [coeffq+16*12]
+ pmulhrsw m7, [coeffq+16*13]
+
+ call .main
+ call .main_pass1_end
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ jmp m(iadst_8x8_internal_8bpc).pass1_end
+
+.pass2:
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8]
+ jmp m(iadst_8x8_internal_8bpc).pass2_main
+
+.end:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iadst_8x8_internal_8bpc).pass2_main
+
+ALIGN function_align
+cglobal_label .main
+ mova [rsp+gprsize*2+16*0], m1
+ mova [rsp+gprsize*2+16*1], m2
+ mova [rsp+gprsize*2+16*2], m6
+
+ mova m6, [o(pd_2048)]
+ ITX_MULSUB_2W 7, 0, 1, 2, 6, 995, 3973 ;t3, t2
+ ITX_MULSUB_2W 3, 4, 1, 2, 6, 3513, 2106 ;t11, t10
+ psubsw m1, m0, m4 ;t10a
+ paddsw m0, m4 ;t2a
+ psubsw m4, m7, m3 ;t11a
+ paddsw m3, m7 ;t3a
+ ITX_MULSUB_2W 1, 4, 7, 2, 6, 3406, 2276 ;t11, t10
+ mova m2, [rsp+gprsize*2+16*0] ;in3
+ mova m7, [rsp+gprsize*2+16*1] ;in4
+ mova [rsp+gprsize*2+16*0], m1 ;t11
+ mova [rsp+gprsize*2+16*1], m4 ;t10
+ mova m1, [rsp+gprsize*2+16*2] ;in12
+ mova [rsp+gprsize*2+16*2], m0 ;t2a
+ ITX_MULSUB_2W 5, 7, 0, 4, 6, 1751, 3703 ;t5, t4
+ ITX_MULSUB_2W 2, 1, 0, 4, 6, 3857, 1380 ;t13, t12
+ psubsw m0, m7, m1 ;t12a
+ paddsw m1, m7 ;t4a
+ psubsw m4, m5, m2 ;t13a
+ paddsw m5, m2 ;t5a
+ ITX_MULSUB_2W 4, 0, 7, 2, 6, 4017, 799 ;t12, t13
+ mova m2, [rsp+gprsize*2+16*8] ;in1
+ mova m7, [rsp+gprsize*2+16*9] ;in14
+ mova [rsp+gprsize*2+16*8], m4 ;t12
+ mova [rsp+gprsize*2+16*9], m0 ;t13
+ mova m4, [rsp+gprsize*2+16*4] ;in9
+ mova m0, [rsp+gprsize*2+16*5] ;in6
+ mova [rsp+gprsize*2+16*4], m1 ;t4a
+ mova [rsp+gprsize*2+16*5], m5 ;t5a
+ ITX_MULSUB_2W 2, 7, 1, 5, 6, 4052, 601 ;t15, t14
+ ITX_MULSUB_2W 4, 0, 1, 5, 6, 2440, 3290 ;t7, t6
+ psubsw m1, m0, m7 ;t14a
+ paddsw m0, m7 ;t6a
+ psubsw m5, m4, m2 ;t15a
+ paddsw m4, m2 ;t7a
+ ITX_MULSUB_2W 5, 1, 7, 2, 6, 2276, 3406 ;t14, t15
+ mova m2, [rsp+gprsize*2+16*2] ;t2a
+ mova [rsp+gprsize*2+16*2], m5 ;t14
+ psubsw m7, m2, m0 ;t6
+ paddsw m2, m0 ;t2
+ psubsw m0, m3, m4 ;t7
+ paddsw m3, m4 ;t3
+ ITX_MULSUB_2W 0, 7, 4, 5, 6, 3784, 1567 ;t6a, t7a
+ mova m4, [rsp+gprsize*2+16*7] ;in0
+ mova m5, [rsp+gprsize*2+32*5] ;in15
+ mova [rsp+gprsize*2+16*7], m3 ;t3
+ mova [rsp+gprsize*2+32*5], m1 ;t15
+ mova m1, [rsp+gprsize*2+16*6] ;in7
+ mova m3, [rsp+gprsize*2+16*3] ;in8
+ mova [rsp+gprsize*2+16*6], m7 ;t7a
+ mova [rsp+gprsize*2+16*3], m0 ;t6a
+ ITX_MULSUB_2W 5, 4, 0, 7, 6, 201, 4091 ;t1, t0
+ ITX_MULSUB_2W 1, 3, 0, 7, 6, 3035, 2751 ;t9, t8
+ psubsw m0, m4, m3 ;t8a
+ paddsw m4, m3 ;t0a
+ psubsw m3, m5, m1 ;t9a
+ paddsw m5, m1 ;t1a
+ ITX_MULSUB_2W 0, 3, 1, 7, 6, 799, 4017 ;t9, t8
+ mova m1, [rsp+gprsize*2+16*4] ;t4a
+ mova m7, [rsp+gprsize*2+16*5] ;t5a
+ mova [rsp+gprsize*2+16*4], m3 ;t8
+ mova [rsp+gprsize*2+16*5], m0 ;t9
+ psubsw m0, m4, m1 ;t4
+ paddsw m4, m1 ;t0
+ psubsw m3, m5, m7 ;t5
+ paddsw m5, m7 ;t1
+ ITX_MULSUB_2W 0, 3, 1, 7, 6, 1567, 3784 ;t5a, t4a
+ mova m7, [rsp+gprsize*2+16*3] ;t6a
+ psubsw m1, m4, m2 ;t2a
+ paddsw m4, m2 ;out0
+ mova [rsp+gprsize*2+16*3], m4 ;out0
+ mova m4, [rsp+gprsize*2+16*6] ;t7a
+ psubsw m2, m3, m7 ;t6
+ paddsw m3, m7 ;-out3
+ mova [rsp+gprsize*2+16*6], m3 ;-out3
+ psubsw m3, m0, m4 ;t7
+ paddsw m0, m4 ;out12
+ mova [rsp+gprsize*2+16*12], m3
+ mova m3, [rsp+gprsize*2+16*7] ;t3
+ mova [rsp+gprsize*2+16* 7], m2 ;out4
+ psubsw m2, m5, m3 ;t3a
+ paddsw m5, m3 ;-out15
+ mova [rsp+gprsize*2+16*11], m2
+ mova m2, [rsp+gprsize*2+32*5] ;t15
+ mova [rsp+gprsize*2+16*10], m1 ;-out7
+ mova m1, [rsp+gprsize*2+16*0] ;t11
+ mova [rsp+gprsize*2+16*0 ], m5 ;-out15
+ mova m3, [rsp+gprsize*2+16*1] ;t10
+ mova [rsp+gprsize*2+16*1 ], m4 ;-out11
+ mova m4, [rsp+gprsize*2+16*2] ;t14
+ mova [rsp+gprsize*2+16*2 ], m0 ;out12
+ psubsw m0, m3, m4 ;t14a
+ paddsw m3, m4 ;t10a
+ psubsw m5, m1, m2 ;t15a
+ paddsw m1, m2 ;t11a
+ ITX_MULSUB_2W 5, 0, 2, 4, 6, 3784, 1567 ;t14, t15
+ mova m2, [rsp+gprsize*2+16*4] ;t8
+ mova m4, [rsp+gprsize*2+16*5] ;t9
+ mova [rsp+gprsize*2+16*4], m3 ;t10a
+ mova [rsp+gprsize*2+16*5], m1 ;t11a
+ mova m3, [rsp+gprsize*2+16*8] ;t12
+ mova m1, [rsp+gprsize*2+16*9] ;t13
+ mova [rsp+gprsize*2+16*8], m5 ;t14
+ mova [rsp+gprsize*2+16*9], m0 ;t15
+ psubsw m5, m2, m3 ;t12a
+ paddsw m2, m3 ;t8a
+ psubsw m0, m4, m1 ;t13a
+ paddsw m4, m1 ;t9a
+ ITX_MULSUB_2W 5, 0, 1, 3, 6, 1567, 3784 ;t13, t12
+ mova m6, [rsp+gprsize*2+16*4] ;t10a
+ mova m1, [rsp+gprsize*2+16*5] ;t11a
+ psubsw m3, m2, m6 ;t10
+ paddsw m2, m6 ;-out1
+ paddsw m6, m4, m1 ;out14
+ psubsw m4, m1 ;t11
+ mova [rsp+gprsize*2+16*14], m4
+ mova [rsp+gprsize*2+16* 4], m2 ;-out1
+ mova m4, [rsp+gprsize*2+16*8] ;t14
+ mova m2, [rsp+gprsize*2+16*9] ;t15
+ mova [rsp+gprsize*2+16* 9], m3 ;out6
+ psubsw m3, m0, m4 ;t14a
+ paddsw m0, m4 ;out2
+ psubsw m4, m5, m2 ;t15a
+ paddsw m5, m2 ;-out13
+ mova [rsp+gprsize*2+16* 5], m0 ;out2
+ ret
+ALIGN function_align
+.main_pass1_end:
+ mova m0, [rsp+gprsize*2+16*14]
+ mova [rsp+gprsize*2+16*14], m5
+ mova [rsp+gprsize*2+16*15], m6
+ mova m5, [o(pw_2896_2896)]
+ mova m6, [o(pw_2896_m2896)]
+ mova m7, [o(pd_2048)]
+ punpcklwd m2, m3, m4
+ punpckhwd m3, m4
+ pmaddwd m4, m5, m2
+ pmaddwd m2, m6
+ pmaddwd m1, m5, m3
+ pmaddwd m3, m6
+ REPX {paddd x, m7}, m4, m2, m1, m3
+ REPX {psrad x, 12}, m4, m1, m2, m3
+ packssdw m4, m1 ;-out5
+ packssdw m2, m3 ;out10
+ mova [rsp+gprsize*2+16* 8], m4
+ mova m3, [rsp+gprsize*2+16* 9]
+ punpcklwd m1, m3, m0
+ punpckhwd m3, m0
+ pmaddwd m0, m5, m1
+ pmaddwd m1, m6
+ pmaddwd m4, m5, m3
+ pmaddwd m3, m6
+ REPX {paddd x, m7}, m0, m1, m4, m3
+ REPX {psrad x, 12}, m0, m4, m1, m3
+ packssdw m0, m4 ;out6
+ packssdw m1, m3 ;-out9
+ mova [rsp+gprsize*2+16* 9], m0
+ mova m0, [rsp+gprsize*2+16* 7]
+ mova m4, [rsp+gprsize*2+16*12]
+ punpcklwd m3, m0, m4
+ punpckhwd m0, m4
+ pmaddwd m4, m5, m3
+ pmaddwd m3, m6
+ pmaddwd m5, m0
+ pmaddwd m0, m6
+ REPX {paddd x, m7}, m4, m3, m5, m0
+ REPX {psrad x, 12}, m4, m5, m3, m0
+ packssdw m4, m5 ;out4
+ packssdw m3, m0 ;-out11
+ mova [rsp+gprsize*2+16* 7], m4
+ mova m4, [rsp+gprsize*2+16*10]
+ mova m5, [rsp+gprsize*2+16*11]
+ punpcklwd m0, m4, m5
+ punpckhwd m4, m5
+ pmaddwd m5, m0, [o(pw_2896_2896)]
+ pmaddwd m0, m6
+ pmaddwd m6, m4
+ pmaddwd m4, [o(pw_2896_2896)]
+ REPX {paddd x, m7}, m5, m0, m6, m4
+ REPX {psrad x, 12}, m0, m6, m5, m4
+ packssdw m0, m6 ;out8
+ packssdw m5, m4 ;-out7
+ mova [rsp+gprsize*2+16*10], m5
+ mova m4, [rsp+gprsize*2+16* 2] ;out12
+ mova m5, [rsp+gprsize*2+16*14] ;-out13
+ mova m6, [rsp+gprsize*2+16*15] ;out14
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ mova m7, [o(pw_2896x8)]
+ mova m1, [rsp+gprsize*2+16* 9]
+ mova m2, [rsp+gprsize*2+16*14]
+ paddsw m0, m1, m2
+ psubsw m1, m2
+ pmulhrsw m0, m7 ;out6
+ pmulhrsw m1, m7 ;-out9
+ mova [rsp+gprsize*2+16* 9], m0
+ psubsw m2, m3, m4
+ paddsw m3, m4
+ pmulhrsw m2, m7 ;out10
+ pmulhrsw m3, m7 ;-out5
+ mova [rsp+gprsize*2+16* 8], m3
+ mova m3, [rsp+gprsize*2+16* 7]
+ mova m4, [rsp+gprsize*2+16*12]
+ paddsw m0, m3, m4
+ psubsw m3, m4
+ pmulhrsw m0, m7 ;out4
+ pmulhrsw m3, m7 ;-out11
+ mova [rsp+gprsize*2+16* 7], m0
+ mova m0, [rsp+gprsize*2+16*10]
+ paddsw m4, m0, [rsp+gprsize*2+16*11]
+ psubsw m0, [rsp+gprsize*2+16*11]
+ pmulhrsw m4, m7 ;-out7
+ pmulhrsw m0, m7 ;out8
+ mova [rsp+gprsize*2+16*10], m4
+ mova m4, [rsp+gprsize*2+16*2 ] ;out12
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [coeffq+16*0 ]
+ pmulhrsw m1, m7, [coeffq+16*1 ]
+ pmulhrsw m2, m7, [coeffq+16*14]
+ pmulhrsw m3, m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3
+ pmulhrsw m0, m7, [coeffq+16*6 ]
+ pmulhrsw m1, m7, [coeffq+16*7 ]
+ pmulhrsw m2, m7, [coeffq+16*8 ]
+ pmulhrsw m3, m7, [coeffq+16*9 ]
+ mova [rsp+gprsize+16*3], m2
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ pmulhrsw m0, m7, [coeffq+16*2 ]
+ pmulhrsw m1, m7, [coeffq+16*3 ]
+ pmulhrsw m2, m7, [coeffq+16*4 ]
+ pmulhrsw m3, m7, [coeffq+16*5 ]
+ pmulhrsw m4, m7, [coeffq+16*10]
+ pmulhrsw m5, m7, [coeffq+16*11]
+ pmulhrsw m6, m7, [coeffq+16*12]
+ pmulhrsw m7, [coeffq+16*13]
+
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS coeffq+16*0, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS coeffq+16*0, 32
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end
+
+.pass2:
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8]
+ jmp m(iflipadst_8x8_internal_8bpc).pass2_main
+
+.end:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iflipadst_8x8_internal_8bpc).pass2_main
+
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ add coeffq, 16*16
+ mova m4, [coeffq-16*7]
+ mova m5, [coeffq-16*5]
+ mova m6, [coeffq-16*3]
+ mova m7, [coeffq-16*1]
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+
+.pass1:
+ mova m0, [o(pw_2896x8)]
+ mova m2, [o(pw_1697x16)]
+ mova m3, [o(pw_16384)]
+ sub coeffq, 8*16
+ REPX {pmulhrsw x, m0}, m4, m5, m6, m7
+ pmulhrsw m1, m2, m4
+ pmulhrsw m1, m3
+ paddsw m1, m4 ; 1
+ pmulhrsw m4, m2, m5
+ pmulhrsw m4, m3
+ paddsw m4, m5 ; 3
+ pmulhrsw m5, m2, m6
+ pmulhrsw m5, m3
+ paddsw m5, m6 ; 5
+ pmulhrsw m6, m2, m7
+ pmulhrsw m6, m3
+ paddsw m7, m6 ; 7
+ pmulhrsw m6, m0, [coeffq+16*6]
+ mova [rsp+gprsize+16*0], m4
+ pmulhrsw m4, m2, m6
+ pmulhrsw m4, m3
+ paddsw m6, m4 ; 6
+ pmulhrsw m4, m0, [coeffq+16*4]
+ mova [rsp+gprsize+16*1], m6
+ pmulhrsw m6, m2, m4
+ pmulhrsw m6, m3
+ paddsw m4, m6 ; 4
+ pmulhrsw m6, m0, [coeffq+16*2]
+ pmulhrsw m0, [coeffq+16*0]
+ pmulhrsw m2, m6
+ pmulhrsw m2, m3
+ paddsw m2, m6 ; 2
+ pmulhrsw m6, m0, [o(pw_1697x16)]
+ pmulhrsw m6, m3
+ mova m3, [rsp+gprsize+16*0]
+ paddsw m0, m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+.pass1_end:
+ mova [coeffq+16*1], m4
+ mova [coeffq+16*3], m5
+ mova [coeffq+16*5], m6
+ mova [coeffq+16*7], m7
+ mova m4, [coeffq-16*7]
+ mova m5, [coeffq-16*5]
+ mova m6, [coeffq-16*3]
+ mova m7, [coeffq-16*1]
+ mova [coeffq-16*7], m0
+ mova [coeffq-16*5], m1
+ mova [coeffq-16*3], m2
+ mova [coeffq-16*1], m3
+ mov tx2q, r3
+ jmp .pass1
+
+.pass2:
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8]
+ jmp m(iidentity_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iidentity_8x8_internal_8bpc).end
+
+
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x16, 8, 16*16
+%ifidn %1_%2, dct_dct
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r2d, 8
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+.end:
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, identity
+
+cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq+16*1, 64
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*3, 64
+ call m(idct_16x8_internal_8bpc).main
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ mova m7, [o(pw_8192)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*17, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ mova m7, [o(pw_8192)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS coeffq+16*0, 64
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*2, 64
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end2)]
+ mova m7, [o(pw_8192)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*16, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ mova m7, [o(pw_8192)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass2:
+ lea tx2q, [o(.end)]
+ jmp m(idct_8x16_internal_8bpc).pass2_pre
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end1)]
+ mov dstq, r3
+ lea r3, [dstq+8]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 32*8
+ mov dstq, r3
+
+ mova m0, [coeffq+16*0 ]
+ mova m1, [coeffq+16*4 ]
+ mova m2, [coeffq+16*8 ]
+ mova m3, [coeffq+16*12]
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*5 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*13]
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end)]
+ jmp m(idct_8x16_internal_8bpc).pass2_main
+
+
+%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0
+ mova m0, [coeffq+16*1 ]
+ mova m1, [coeffq+16*3 ]
+ mova m2, [coeffq+16*29]
+ mova m3, [coeffq+16*31]
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3
+ mova m0, [coeffq+16*13]
+ mova m1, [coeffq+16*15]
+ mova m2, [coeffq+16*17]
+ mova m3, [coeffq+16*19]
+ mova [rsp+gprsize+16*3], m2
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ mova m0, [coeffq+16*5 ]
+ mova m1, [coeffq+16*7 ]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*11]
+ mova m4, [coeffq+16*21]
+ mova m5, [coeffq+16*23]
+ mova m6, [coeffq+16*25]
+ mova m7, [coeffq+16*27]
+%endmacro
+
+%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0
+ mova m0, [coeffq+16*0 ]
+ mova m1, [coeffq+16*2 ]
+ mova m2, [coeffq+16*28]
+ mova m3, [coeffq+16*30]
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3
+ mova m0, [coeffq+16*12]
+ mova m1, [coeffq+16*14]
+ mova m2, [coeffq+16*16]
+ mova m3, [coeffq+16*18]
+ mova [rsp+gprsize+16*3], m2
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ mova m0, [coeffq+16*4 ]
+ mova m1, [coeffq+16*6 ]
+ mova m2, [coeffq+16*8 ]
+ mova m3, [coeffq+16*10]
+ mova m4, [coeffq+16*20]
+ mova m5, [coeffq+16*22]
+ mova m6, [coeffq+16*24]
+ mova m7, [coeffq+16*26]
+%endmacro
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ ITX_16X16_ADST_LOAD_ODD_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ mova m7, [o(pw_8192)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*17, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ mova m7, [o(pw_8192)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+16*1, 32
+ ITX_16X16_ADST_LOAD_EVEN_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ lea tx2q, [o(.pass1_end2)]
+ mova m7, [o(pw_8192)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*16, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ mova m7, [o(pw_8192)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end1
+
+.pass2:
+ lea tx2q, [o(.end)]
+ jmp m(iadst_8x16_internal_8bpc).pass2_pre
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end1)]
+ mov dstq, r3
+ lea r3, [dstq+8]
+ jmp m(iadst_8x8_internal_8bpc).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 32*8
+ mov dstq, r3
+
+ mova m4, [coeffq+16*0 ]
+ mova m5, [coeffq+16*2 ]
+ mova m0, [coeffq+16*4 ]
+ mova m1, [coeffq+16*6 ]
+ mova m2, [coeffq+16*8 ]
+ mova m3, [coeffq+16*10]
+ mova m6, [coeffq+16*12]
+ mova m7, [coeffq+16*14]
+ mova [rsp+gprsize+16*7], m4
+ mova [rsp+gprsize+16*8], m5
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+ lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)]
+ jmp m(iadst_8x16_internal_8bpc).pass2_main
+
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ ITX_16X16_ADST_LOAD_ODD_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ mova m7, [o(pw_m8192)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ mova m7, [o(pw_m8192)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+16*17, 32
+ ITX_16X16_ADST_LOAD_EVEN_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS coeffq+16*0, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end2)]
+ mova m7, [o(pw_m8192)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*16, 32
+ LOAD_8ROWS coeffq+16* 0, 32
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ mova m7, [o(pw_m8192)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
+
+.pass2:
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8]
+ jmp m(iflipadst_8x16_internal_8bpc).pass2_pre
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end1)]
+ lea dstq, [dstq+strideq*2]
+ jmp m(iflipadst_8x8_internal_8bpc).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 32*8
+
+ mova m4, [coeffq+16*0 ]
+ mova m5, [coeffq+16*2 ]
+ mova m0, [coeffq+16*4 ]
+ mova m1, [coeffq+16*6 ]
+ mova m2, [coeffq+16*8 ]
+ mova m3, [coeffq+16*10]
+ mova m6, [coeffq+16*12]
+ mova m7, [coeffq+16*14]
+ mova [rsp+gprsize+16*7], m4
+ mova [rsp+gprsize+16*8], m5
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+
+ lea tx2q, [o(.end2)]
+ mov dstq, r3
+ jmp m(iflipadst_8x16_internal_8bpc).pass2_main
+
+.end2:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ lea dstq, [dstq+strideq*2]
+ jmp m(iflipadst_8x8_internal_8bpc).end
+
+
+%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
+ pmulhrsw m%2, m%3, m%1
+ psraw m%2, 1
+ pavgw m%1, m%2
+%endmacro
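+; IDTX16B above is a down-scaled 16-point identity step: pmulhrsw with
+; pw_1697x16 gives round(x*1697/2048) ~= 0.829*x, the psraw halves that, and
+; pavgw averages the result back into x, so overall
+;   (x + 0.414*x) / 2 ~= 0.707*x ~= x * 2*sqrt(2) / 4,
+; i.e. the usual 2*sqrt(2) identity-16 gain with a /4 already folded in
+; (presumably to keep the 16x16 pass-1 intermediates inside 16-bit range).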
+
+INV_TXFM_16X16_FN identity, dct
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ add coeffq, 16*17
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+
+.pass1:
+ mova m6, [o(pw_1697x16)]
+ mova m7, [coeffq+32*6]
+ mova m0, [coeffq+32*0]
+ mova m1, [coeffq+32*1]
+ mova m2, [coeffq+32*2]
+ mova m3, [coeffq+32*3]
+ mova m4, [coeffq+32*4]
+ REPX {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4
+ mova m5, [coeffq+32*5]
+ mova [rsp+gprsize+16*1], m7
+ IDTX16B 5, 7, 6
+ mova m7, [coeffq+32*7]
+ IDTX16B 7, 6, 6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+.pass1_end:
+ SAVE_8ROWS coeffq, 32
+ sub coeffq, 16
+ lea tx2q, [o(.pass1_end1)]
+ jmp .pass1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq, 32
+ sub coeffq, 15*16
+ lea tx2q, [o(.pass1_end2)]
+ jmp .pass1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq, 32
+ sub coeffq, 16
+ mov tx2q, r3
+ jmp .pass1
+
+.pass2:
+ lea r3, [dstq+8]
+ lea tx2q, [o(.end1)]
+
+.end:
+ mova [rsp+gprsize+16*0], m7
+ mova [rsp+gprsize+16*1], m4
+ mova m7, [o(pw_1697x16)]
+ REPX {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3
+ mova m4, [o(pw_2048)]
+ pmulhrsw m5, m4
+ pmulhrsw m6, m4
+ mova [rsp+gprsize+16*2], m5
+ mova m5, [rsp+gprsize+16*1]
+ mova [rsp+gprsize+16*1], m6
+ IDTX16 5, 6, 7
+ mova m6, [rsp+gprsize+16*0]
+ IDTX16 6, 7, 7
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3, m6
+ pmulhrsw m4, m5
+ mova [rsp+gprsize+16*0], m6
+ jmp m(idct_8x8_internal_8bpc).end3
+
+.end1:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(.end2)]
+ lea dstq, [dstq+strideq*2]
+ jmp .end
+
+.end2:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 32*8
+ LOAD_8ROWS coeffq, 32
+ lea tx2q, [o(.end3)]
+ mov dstq, r3
+ jmp .end
+
+.end3:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ lea dstq, [dstq+strideq*2]
+ jmp .end
+
+
+cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_8x32_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m2
+ psrlw m2, 2 ;pw_2048
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ pshuflw m0, m0, q0000
+ punpcklwd m0, m0
+ mov r3d, 8
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
+
+.end:
+ RET
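+; .dconly above is the eob == 0 shortcut (see the jz after the eob test):
+; at most the DC coefficient is nonzero, so the whole 8x32 inverse transform
+; collapses to adding one constant to every pixel. The DC value is scaled
+; (pw_2896x8 twice, ~= /sqrt(2) each, plus the pw_8192/pw_2048 rounding
+; factors, where 8192 >> 2 == 2048), broadcast to all lanes, and fed to the
+; shared 8x8 add loop for r3d = 8 iterations. "mov [coeffq], eobd" stores 0
+; on this path, which leaves the DC slot of the coefficient buffer cleared.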
+
+
+
+cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ cmp eobd, 106
+ jle .fast
+
+ LOAD_8ROWS coeffq+16*3, 64
+ call m(idct_8x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1:
+ mova [rsp+gprsize+16*9 ], m0 ;in24
+ mova [rsp+gprsize+16*10], m4 ;in28
+ mova [rsp+gprsize+16*17], m2 ;in26
+ mova [rsp+gprsize+16*18], m6 ;in30
+ mova [rsp+gprsize+16*31], m1 ;in25
+ mova [rsp+gprsize+16*30], m3 ;in27
+ mova [rsp+gprsize+16*27], m5 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+ LOAD_8ROWS coeffq+16*2, 64
+ call m(idct_8x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_1:
+ mova [rsp+gprsize+16*7 ], m0 ;in16
+ mova [rsp+gprsize+16*8 ], m4 ;in20
+ mova [rsp+gprsize+16*15], m2 ;in18
+ mova [rsp+gprsize+16*16], m6 ;in22
+ mova [rsp+gprsize+16*33], m1 ;in17
+ mova [rsp+gprsize+16*28], m3 ;in19
+ mova [rsp+gprsize+16*29], m5 ;in21
+ mova [rsp+gprsize+16*32], m7 ;in23
+
+.fast:
+ LOAD_8ROWS coeffq+16*1, 64
+ call m(idct_8x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ mova [rsp+gprsize+16*5 ], m0 ;in8
+ mova [rsp+gprsize+16*6 ], m4 ;in12
+ mova [rsp+gprsize+16*13], m2 ;in10
+ mova [rsp+gprsize+16*14], m6 ;in14
+ mova [rsp+gprsize+16*21], m1 ;in9
+ mova [rsp+gprsize+16*24], m3 ;in11
+ mova [rsp+gprsize+16*25], m5 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+ LOAD_8ROWS coeffq+16*0, 64
+ call m(idct_8x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ mova [rsp+gprsize+16*11], m2 ;in2
+ mova [rsp+gprsize+16*12], m6 ;in6
+ mova [rsp+gprsize+16*19], m1 ;in1
+ mova [rsp+gprsize+16*26], m3 ;in3
+ mova [rsp+gprsize+16*23], m5 ;in5
+ mova [rsp+gprsize+16*22], m7 ;in7
+ mova m1, m4 ;in4
+ mova m2, [rsp+gprsize+16*5 ] ;in8
+ mova m3, [rsp+gprsize+16*6 ] ;in12
+
+ cmp eobd, 106
+ jg .full
+
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3 , 16
+ mova m0, [rsp+gprsize+16*11]
+ mova m1, [rsp+gprsize+16*12]
+ mova m2, [rsp+gprsize+16*13]
+ mova m3, [rsp+gprsize+16*14]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call .main_fast
+ jmp .pass2
+
+.full:
+ mova m4, [rsp+gprsize+16*7 ] ;in16
+ mova m5, [rsp+gprsize+16*8 ] ;in20
+ mova m6, [rsp+gprsize+16*9 ] ;in24
+ mova m7, [rsp+gprsize+16*10] ;in28
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3 , 16
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+ call .main
+
+.pass2:
+ lea r3, [o(.end6)]
+
+.end:
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.end2)]
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+
+ jmp tx2q
+
+.end2:
+ lea tx2q, [o(.end3)]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end3:
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea dstq, [dstq+strideq*2]
+ lea tx2q, [o(.end4)]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end4:
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea dstq, [dstq+strideq*2]
+ lea tx2q, [o(.end5)]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end5:
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea dstq, [dstq+strideq*2]
+ mov tx2q, r3
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end6:
+ ret
+
+ALIGN function_align
+cglobal_label .main_veryfast
+ mova m0, [rsp+gprsize*2+16*19] ;in1
+ pmulhrsw m3, m0, [o(pw_4091x8)] ;t30,t31
+ pmulhrsw m0, [o(pw_201x8)] ;t16,t17
+ mova m7, [o(pd_2048)]
+ mova [rsp+gprsize*2+16*19], m0 ;t16
+ mova [rsp+gprsize*2+16*34], m3 ;t31
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 799, 4017 ;t17a, t30a
+ mova [rsp+gprsize*2+16*20], m3 ;t17a
+ mova [rsp+gprsize*2+16*33], m0 ;t30a
+ mova m1, [rsp+gprsize*2+16*22] ;in7
+ pmulhrsw m2, m1, [o(pw_3857x8)] ;t28,t29
+ pmulhrsw m1, [o(pw_m1380x8)] ;t18,t19
+ mova [rsp+gprsize*2+16*22], m1 ;t19
+ mova [rsp+gprsize*2+16*31], m2 ;t28
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m4017, 799 ;t18a, t29a
+ mova [rsp+gprsize*2+16*21], m2 ;t18a
+ mova [rsp+gprsize*2+16*32], m1 ;t29a
+ mova m0, [rsp+gprsize*2+16*23] ;in5
+ pmulhrsw m3, m0, [o(pw_3973x8)] ;t26, t27
+ pmulhrsw m0, [o(pw_995x8)] ;t20, t21
+ mova [rsp+gprsize*2+16*23], m0 ;t20
+ mova [rsp+gprsize*2+16*30], m3 ;t27
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 3406, 2276 ;t21a, t26a
+ mova [rsp+gprsize*2+16*24], m3 ;t21a
+ mova [rsp+gprsize*2+16*29], m0 ;t26a
+ mova m2, [rsp+gprsize*2+16*26] ;in3
+ pxor m0, m0
+ mova m3, m0
+ pmulhrsw m1, m2, [o(pw_4052x8)]
+ pmulhrsw m2, [o(pw_m601x8)]
+ jmp .main2
+
+ALIGN function_align
+cglobal_label .main_fast ;bottom half is zero
+ mova m0, [rsp+gprsize*2+16*19] ;in1
+ mova m1, [rsp+gprsize*2+16*20] ;in15
+ pmulhrsw m3, m0, [o(pw_4091x8)] ;t31a
+ pmulhrsw m0, [o(pw_201x8)] ;t16a
+ pmulhrsw m2, m1, [o(pw_3035x8)] ;t30a
+ pmulhrsw m1, [o(pw_m2751x8)] ;t17a
+ mova m7, [o(pd_2048)]
+ psubsw m4, m0, m1 ;t17
+ paddsw m0, m1 ;t16
+ psubsw m5, m3, m2 ;t30
+ paddsw m3, m2 ;t31
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a
+ mova [rsp+gprsize*2+16*19], m0 ;t16
+ mova [rsp+gprsize*2+16*20], m5 ;t17a
+ mova [rsp+gprsize*2+16*33], m4 ;t30a
+ mova [rsp+gprsize*2+16*34], m3 ;t31
+ mova m0, [rsp+gprsize*2+16*21] ;in9
+ mova m1, [rsp+gprsize*2+16*22] ;in7
+ pmulhrsw m3, m0, [o(pw_3703x8)]
+ pmulhrsw m0, [o(pw_1751x8)]
+ pmulhrsw m2, m1, [o(pw_3857x8)]
+ pmulhrsw m1, [o(pw_m1380x8)]
+ psubsw m4, m1, m0 ;t18
+ paddsw m0, m1 ;t19
+ psubsw m5, m2, m3 ;t29
+ paddsw m3, m2 ;t28
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a
+ mova [rsp+gprsize*2+16*21], m5 ;t18a
+ mova [rsp+gprsize*2+16*22], m0 ;t19
+ mova [rsp+gprsize*2+16*31], m3 ;t28
+ mova [rsp+gprsize*2+16*32], m4 ;t29a
+ mova m0, [rsp+gprsize*2+16*23] ;in5
+ mova m1, [rsp+gprsize*2+16*24] ;in11
+ pmulhrsw m3, m0, [o(pw_3973x8)]
+ pmulhrsw m0, [o(pw_995x8)]
+ pmulhrsw m2, m1, [o(pw_3513x8)]
+ pmulhrsw m1, [o(pw_m2106x8)]
+ psubsw m4, m0, m1 ;t21
+ paddsw m0, m1 ;t20
+ psubsw m5, m3, m2 ;t26
+ paddsw m3, m2 ;t27
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a
+ mova [rsp+gprsize*2+16*23], m0 ;t20
+ mova [rsp+gprsize*2+16*24], m5 ;t21a
+ mova [rsp+gprsize*2+16*29], m4 ;t26a
+ mova [rsp+gprsize*2+16*30], m3 ;t27
+ mova m0, [rsp+gprsize*2+16*25] ;in13
+ mova m2, [rsp+gprsize*2+16*26] ;in3
+ pmulhrsw m3, m0, [o(pw_3290x8)]
+ pmulhrsw m0, [o(pw_2440x8)]
+ pmulhrsw m1, m2, [o(pw_4052x8)]
+ pmulhrsw m2, [o(pw_m601x8)]
+ jmp .main2
+
+ALIGN function_align
+cglobal_label .main
+ mova m7, [o(pd_2048)]
+ mova m0, [rsp+gprsize*2+16*19] ;in1
+ mova m1, [rsp+gprsize*2+16*20] ;in15
+ mova m2, [rsp+gprsize*2+16*33] ;in17
+ mova m3, [rsp+gprsize*2+16*34] ;in31
+ ITX_MULSUB_2W 0, 3, 4, 5, 7, 201, 4091 ;t16a, t31a
+ ITX_MULSUB_2W 2, 1, 4, 5, 7, 3035, 2751 ;t17a, t30a
+ psubsw m4, m0, m2 ;t17
+ paddsw m0, m2 ;t16
+ psubsw m5, m3, m1 ;t30
+ paddsw m3, m1 ;t31
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a
+ mova [rsp+gprsize*2+16*19], m0 ;t16
+ mova [rsp+gprsize*2+16*20], m5 ;t17a
+ mova [rsp+gprsize*2+16*33], m4 ;t30a
+ mova [rsp+gprsize*2+16*34], m3 ;t31
+ mova m0, [rsp+gprsize*2+16*21] ;in9
+ mova m1, [rsp+gprsize*2+16*22] ;in7
+ mova m2, [rsp+gprsize*2+16*31] ;in25
+ mova m3, [rsp+gprsize*2+16*32] ;in23
+ ITX_MULSUB_2W 0, 3, 4, 5, 7, 1751, 3703 ;t18a, t29a
+ ITX_MULSUB_2W 2, 1, 4, 5, 7, 3857, 1380 ;t19a, t28a
+ psubsw m4, m2, m0 ;t18
+ paddsw m0, m2 ;t19
+ psubsw m5, m1, m3 ;t29
+ paddsw m3, m1 ;t28
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a
+ mova [rsp+gprsize*2+16*21], m5 ;t18a
+ mova [rsp+gprsize*2+16*22], m0 ;t19
+ mova [rsp+gprsize*2+16*31], m3 ;t28
+ mova [rsp+gprsize*2+16*32], m4 ;t29a
+ mova m0, [rsp+gprsize*2+16*23] ;in5
+ mova m1, [rsp+gprsize*2+16*24] ;in11
+ mova m2, [rsp+gprsize*2+16*29] ;in21
+ mova m3, [rsp+gprsize*2+16*30] ;in27
+ ITX_MULSUB_2W 0, 3, 4, 5, 7, 995, 3973 ;t20a, t27a
+ ITX_MULSUB_2W 2, 1, 4, 5, 7, 3513, 2106 ;t21a, t26a
+ psubsw m4, m0, m2 ;t21
+ paddsw m0, m2 ;t20
+ psubsw m5, m3, m1 ;t26
+ paddsw m3, m1 ;t27
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a
+ mova [rsp+gprsize*2+16*23], m0 ;t20
+ mova [rsp+gprsize*2+16*24], m5 ;t21a
+ mova [rsp+gprsize*2+16*29], m4 ;t26a
+ mova [rsp+gprsize*2+16*30], m3 ;t27
+ mova m0, [rsp+gprsize*2+16*25] ;in13
+ mova m1, [rsp+gprsize*2+16*26] ;in3
+ mova m2, [rsp+gprsize*2+16*27] ;in29
+ mova m3, [rsp+gprsize*2+16*28] ;in19
+ ITX_MULSUB_2W 0, 3, 4, 5, 7, 2440, 3290 ;t22a, t25a
+ ITX_MULSUB_2W 2, 1, 4, 5, 7, 4052, 601 ;t23a, t24a
+
+.main2:
+ psubsw m4, m2, m0 ;t22
+ paddsw m0, m2 ;t23
+ psubsw m5, m1, m3 ;t25
+ paddsw m3, m1 ;t24
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m2276, 3406 ;t22a, t25a
+ mova m2, [rsp+gprsize*2+16*24] ;t21a
+ psubsw m1, m5, m2 ;t21
+ paddsw m5, m2 ;t22
+ mova [rsp+gprsize*2+16*25], m5 ;t22
+ mova m2, [rsp+gprsize*2+16*29] ;t26a
+ psubsw m5, m4, m2 ;t26
+ paddsw m4, m2 ;t25
+ mova [rsp+gprsize*2+16*28], m4 ;t25
+ ITX_MULSUB_2W 5, 1, 2, 4, 7, m3784, 1567 ;t21a, t26a
+ mova [rsp+gprsize*2+16*24], m5 ;t21a
+ mova [rsp+gprsize*2+16*29], m1 ;t26a
+
+ mova m1, [rsp+gprsize*2+16*23] ;t20
+ mova m5, [rsp+gprsize*2+16*30] ;t27
+ psubsw m2, m0, m1 ;t20a
+ paddsw m0, m1 ;t23a
+ psubsw m6, m3, m5 ;t27a
+ paddsw m3, m5 ;t24a
+ ITX_MULSUB_2W 6, 2, 1, 5, 7, m3784, 1567 ;t20, t27
+ mova [rsp+gprsize*2+16*26], m0 ;t23a
+ mova [rsp+gprsize*2+16*27], m3 ;t24a
+ mova [rsp+gprsize*2+16*30], m2 ;t27
+
+ mova m0, [rsp+gprsize*2+16*20] ;t17a
+ mova m1, [rsp+gprsize*2+16*21] ;t18a
+ mova m2, [rsp+gprsize*2+16*32] ;t29a
+ mova m3, [rsp+gprsize*2+16*33] ;t30a
+ psubsw m4, m0, m1 ;t18
+ paddsw m0, m1 ;t17
+ psubsw m5, m3, m2 ;t29
+ paddsw m3, m2 ;t30
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t18a, t29a
+ mova [rsp+gprsize*2+16*20], m0 ;t17
+ mova [rsp+gprsize*2+16*21], m5 ;t18a
+ mova [rsp+gprsize*2+16*32], m4 ;t29a
+ mova [rsp+gprsize*2+16*33], m3 ;t30
+ mova m0, [rsp+gprsize*2+16*19] ;t16
+ mova m1, [rsp+gprsize*2+16*22] ;t19
+ mova m2, [rsp+gprsize*2+16*31] ;t28
+ mova m3, [rsp+gprsize*2+16*34] ;t31
+ psubsw m4, m0, m1 ;t19a
+ paddsw m0, m1 ;t16a
+ psubsw m5, m3, m2 ;t28a
+ paddsw m3, m2 ;t31a
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t19, t28
+ mova m2, [rsp+gprsize*2+16*15] ;tmp12
+ psubsw m1, m5, m6 ;t20a
+ paddsw m5, m6 ;t19a
+ psubsw m6, m2, m5 ;out19
+ paddsw m2, m5 ;out12
+ mova m5, [rsp+gprsize*2+16*30] ;t27
+ mova [rsp+gprsize*2+16*22], m6 ;out19
+ mova [rsp+gprsize*2+16*15], m2 ;out12
+ psubsw m6, m4, m5 ;t27a
+ paddsw m4, m5 ;t28a
+ ITX_MULSUB_2W 6, 1, 2, 5, 7, 2896, 2896 ;t20, t27
+ mova m2, [rsp+gprsize*2+16*6 ] ;tmp3
+ psubsw m5, m2, m4 ;out28
+ paddsw m2, m4 ;out3
+ mova m4, [rsp+gprsize*2+16*14] ;tmp11
+ mova [rsp+gprsize*2+16*31], m5 ;out28
+ mova [rsp+gprsize*2+16*6 ], m2 ;out3
+ psubsw m5, m4, m6 ;out20
+ paddsw m4, m6 ;out11
+ mova m2, [rsp+gprsize*2+16*7 ] ;tmp4
+ mova [rsp+gprsize*2+16*23], m5 ;out20
+ mova [rsp+gprsize*2+16*14], m4 ;out11
+ psubsw m5, m2, m1 ;out27
+ paddsw m2, m1 ;out4
+ mova m1, [rsp+gprsize*2+16*26] ;t23a
+ mova m4, [rsp+gprsize*2+16*27] ;t24a
+ mova [rsp+gprsize*2+16*30], m5 ;out27
+ mova [rsp+gprsize*2+16*7 ], m2 ;out4
+ psubsw m5, m0, m1 ;t23
+ paddsw m0, m1 ;t16
+ psubsw m2, m3, m4 ;t24
+ paddsw m3, m4 ;t31
+ ITX_MULSUB_2W 2, 5, 4, 6, 7, 2896, 2896 ;t23a, t24a
+ mova m6, [rsp+gprsize*2+16*18] ;tmp15
+ psubsw m4, m6, m0 ;out16
+ paddsw m6, m0 ;out15
+ mova m0, [rsp+gprsize*2+16*3 ] ;tmp0
+ mova m1, [rsp+gprsize*2+16*11] ;tmp8
+ mova [rsp+gprsize*2+16*18], m6 ;out15
+ mova [rsp+gprsize*2+16*19], m4 ;out16
+ psubsw m6, m0, m3 ;out31
+ paddsw m0, m3 ;out0
+ psubsw m4, m1, m2 ;out23
+ paddsw m1, m2 ;out8
+ mova m3, [rsp+gprsize*2+16*10] ;tmp7
+ mova [rsp+gprsize*2+16*34], m6 ;out31
+ mova [rsp+gprsize*2+16*11], m1 ;out8
+ mova [rsp+gprsize*2+16*26], m4 ;out23
+ paddsw m6, m3, m5 ;out7
+ psubsw m3, m5 ;out24
+ mova m1, [rsp+gprsize*2+16*20] ;t17
+ mova m5, [rsp+gprsize*2+16*25] ;t22
+ mova m2, [rsp+gprsize*2+16*17] ;tmp14
+ mova [rsp+gprsize*2+16*27], m3 ;out24
+ psubsw m4, m1, m5 ;t22a
+ paddsw m1, m5 ;t17a
+ psubsw m3, m2, m1 ;out17
+ paddsw m2, m1 ;out14
+ mova m5, [rsp+gprsize*2+16*28] ;t25
+ mova m1, [rsp+gprsize*2+16*33] ;t30
+ mova [rsp+gprsize*2+16*17], m2 ;out14
+ mova [rsp+gprsize*2+16*20], m3 ;out17
+ psubsw m2, m1, m5 ;t25a
+ paddsw m1, m5 ;t30a
+ ITX_MULSUB_2W 2, 4, 3, 5, 7, 2896, 2896 ;t22, t25
+ mova m5, [rsp+gprsize*2+16*4 ] ;tmp1
+ psubsw m3, m5, m1 ;out30
+ paddsw m5, m1 ;out1
+ mova m1, [rsp+gprsize*2+16*12] ;tmp9
+ mova [rsp+gprsize*2+16*33], m3 ;out30
+ mova [rsp+gprsize*2+16*4 ], m5 ;out1
+ psubsw m3, m1, m2 ;out22
+ paddsw m1, m2 ;out9
+ mova m5, [rsp+gprsize*2+16*9 ] ;tmp6
+ mova [rsp+gprsize*2+16*25], m3 ;out22
+ mova [rsp+gprsize*2+16*12], m1 ;out9
+ psubsw m3, m5, m4 ;out25
+ paddsw m5, m4 ;out6
+ mova m4, [rsp+gprsize*2+16*21] ;t18a
+ mova m1, [rsp+gprsize*2+16*24] ;t21a
+ mova m2, [rsp+gprsize*2+16*16] ;tmp13
+ mova [rsp+gprsize*2+16*28], m3 ;out25
+ mova [rsp+gprsize*2+16*9 ], m5 ;out6
+ paddsw m3, m4, m1 ;t18
+ psubsw m4, m1 ;t21
+ psubsw m5, m2, m3 ;out18
+ paddsw m2, m3 ;out13
+ mova m1, [rsp+gprsize*2+16*29] ;t26a
+ mova m3, [rsp+gprsize*2+16*32] ;t29a
+ mova [rsp+gprsize*2+16*21], m5 ;out18
+ mova [rsp+gprsize*2+16*16], m2 ;out13
+ psubsw m5, m3, m1 ;t26
+ paddsw m3, m1 ;t29
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 2896, 2896 ;t21a, t26a
+ mova m2, [rsp+gprsize*2+16*5 ] ;tmp2
+ psubsw m1, m2, m3 ;out29
+ paddsw m2, m3 ;out2
+ mova m3, [rsp+gprsize*2+16*13] ;tmp10
+ mova [rsp+gprsize*2+16*32], m1 ;out29
+ psubsw m7, m3, m5 ;out21
+ paddsw m3, m5 ;out10
+ mova m5, [rsp+gprsize*2+16*8 ] ;tmp5
+ mova [rsp+gprsize*2+16*24], m7 ;out21
+ mova [rsp+gprsize*2+16*13], m3 ;out10
+ psubsw m1, m5, m4 ;out26
+ paddsw m5, m4 ;out5
+ mova m7, m6 ;out7
+ mova m3, [rsp+gprsize*2+16*6 ] ;out3
+ mova m4, [rsp+gprsize*2+16*7 ] ;out4
+ mova [rsp+gprsize*2+16*29], m1 ;out26
+ mova m6, [rsp+gprsize*2+16*9 ] ;out6
+ mova m1, [rsp+gprsize*2+16*4 ] ;out1
+ ret
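+; The 8x32 odd-half comes in three flavours selected by the callers' eob
+; checks (cmp eobd, 106 above and in idct_32x8 below): .main handles all 16
+; odd input rows, .main_fast assumes the bottom half of the input is zero,
+; and .main_veryfast assumes only in1/in3/in5/in7 are present. In the fast
+; paths each first-stage butterfly degenerates because one operand is zero,
+; so a single pmulhrsw with a pw_*x8 constant (round(x*c/4096)) replaces the
+; full ITX_MULSUB_2W rotation.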
+
+
+cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_32x8_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r3d, 8
+ lea tx2q, [o(.end)]
+
+.body:
+ pmulhrsw m0, m2
+ movd m2, [o(pw_2048)] ;intentionally rip-relative
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ pshuflw m0, m0, q0000
+ punpcklwd m0, m0
+ pxor m5, m5
+
+.loop:
+ mova m1, [dstq+16*0]
+ mova m3, [dstq+16*1]
+ punpckhbw m2, m1, m5
+ punpcklbw m1, m5
+ punpckhbw m4, m3, m5
+ punpcklbw m3, m5
+ paddw m2, m0
+ paddw m1, m0
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m1, m2
+ packuswb m3, m4
+ mova [dstq+16*0], m1
+ mova [dstq+16*1], m3
+ add dstq, strideq
+ dec r3d
+ jg .loop
+ jmp tx2q
+
+.end:
+ RET
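+; .body/.loop above is the generic "add a broadcast DC value to a 32-pixel
+; row" tail: each 16-byte half is widened to words against a zero register
+; (punpcklbw/punpckhbw), offset by the DC word, and re-packed with unsigned
+; saturation, so e.g. a pixel of 250 plus a DC of +20 clamps to 255, and
+; 5 plus -20 clamps to 0 (packuswb saturates to the 0..255 range).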
+
+
+cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq+16*0, 64
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+16*2, 64
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+16*1, 32
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ cmp eobd, 106
+ jg .full
+ call m(idct_8x32_internal_8bpc).main_fast
+ jmp .pass2
+
+.full:
+ LOAD_8ROWS coeffq+16*17, 32
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+ call m(idct_8x32_internal_8bpc).main
+
+.pass2:
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.end)]
+ jmp m(idct_8x32_internal_8bpc).end1
+
+.end:
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.end1:
+ lea r3, [dstq+8]
+ lea tx2q, [o(.end2)]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end2:
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0 ], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.end3:
+ mov dstq, r3
+ add r3, 8
+ lea tx2q, [o(.end4)]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end4:
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0 ], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.end5:
+ mov dstq, r3
+ add r3, 8
+ lea tx2q, [o(.end6)]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end6:
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0 ], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.end7:
+ mov dstq, r3
+ lea tx2q, [o(.end8)]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end8:
+ ret
+
+
+cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+ mov r5d, 4
+ mov tx2d, 2
+ cmp eobd, 107
+ cmovns tx2d, r5d
+ mov r3d, tx2d
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
+.loop:
+ LOAD_8ROWS coeffq+16*0, 64
+ paddsw m6, [o(pw_5)]
+ mova [rsp+16*1], m6
+ mova m6, [o(pw_5)]
+ REPX {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7
+ call m(idct_8x8_internal_8bpc).pass1_end3
+ REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
+ mova [rsp+16*2], m5
+ mova [rsp+16*1], m6
+ mova [rsp+16*0], m7
+ call m(idct_8x8_internal_8bpc).end3
+ lea dstq, [dstq+strideq*2]
+ pxor m7, m7
+ REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ add coeffq, 16
+ dec r3d
+ jg .loop
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+ mov r5d, 4
+ mov tx2d, 2
+ cmp eobd, 107
+ cmovns tx2d, r5d
+ mov r3d, tx2d
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+.loop:
+ LOAD_8ROWS coeffq+16*0, 16
+ pmulhrsw m6, [o(pw_4096)]
+ mova [rsp+16*1], m6
+ mova m6, [o(pw_4096)]
+ REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
+ lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
+ call m(idct_8x8_internal_8bpc).pass1_end3
+
+ mov [rsp+16*3], dstq
+ mova [rsp+16*2], m5
+ mova [rsp+16*1], m6
+ mova [rsp+16*0], m7
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+ call m(idct_8x8_internal_8bpc).end3
+
+ add coeffq, 16*8
+ mov dstq, [rsp+16*3]
+ lea dstq, [dstq+8]
+ dec r3d
+ jg .loop
+ RET
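+; Final rounding in the two identity_identity helpers above: the 8x32 variant
+; adds pw_5 and shifts right by 3, i.e. (x + 5) >> 3, while the 32x8 variant
+; uses pmulhrsw with pw_4096, which works out to (x*4096 + 16384) >> 15 ==
+; (x + 4) >> 3, i.e. round(x/8). Both appear to be the combined per-pass
+; downscale for these block sizes, just with slightly different bias choices.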
+
+
+cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_16x32_internal_8bpc)
+.end:
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ mov r2d, 16
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+
+
+cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq+16*1, 128, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*5, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*33, 64 ;in8~in15
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end1:
+ mova [coeffq+16*1 ], m0 ;in8
+ mova [coeffq+16*5 ], m4 ;in12
+ mova [rsp+gprsize+16*13], m2 ;in10
+ mova [rsp+gprsize+16*14], m6 ;in14
+ mova [rsp+gprsize+16*21], m1 ;in9
+ mova [rsp+gprsize+16*24], m3 ;in11
+ mova [rsp+gprsize+16*25], m5 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+ LOAD_8ROWS coeffq+16*0, 128, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*4, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*32, 64 ;in0~in7
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end3:
+ mova [rsp+gprsize+16*11], m2 ;in2
+ mova [rsp+gprsize+16*12], m6 ;in6
+ mova [rsp+gprsize+16*19], m1 ;in1
+ mova [rsp+gprsize+16*26], m3 ;in3
+ mova [rsp+gprsize+16*23], m5 ;in5
+ mova [rsp+gprsize+16*22], m7 ;in7
+
+ cmp eobd, 150
+ jg .full
+
+ mova m1, m4 ;in4
+ mova m2, [coeffq+16*1 ] ;in8
+ mova m3, [coeffq+16*5 ] ;in12
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ mova m0, [rsp+gprsize+16*11] ;in2
+ mova m1, [rsp+gprsize+16*12] ;in6
+ mova m2, [rsp+gprsize+16*13] ;in10
+ mova m3, [rsp+gprsize+16*14] ;in14
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ jmp .pass2
+
+.full:
+ mova [coeffq+16*0 ], m0 ;in0
+ mova [coeffq+16*4 ], m4 ;in4
+
+ LOAD_8ROWS coeffq+16*2, 128, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*6, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end4:
+ SAVE_8ROWS coeffq+16*34, 64 ;in16~in23
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end5:
+ mova [coeffq+16*2 ], m0 ;in16
+ mova [coeffq+16*6 ], m4 ;in20
+ mova [rsp+gprsize+16*15], m2 ;in18
+ mova [rsp+gprsize+16*16], m6 ;in22
+ mova [rsp+gprsize+16*33], m1 ;in17
+ mova [rsp+gprsize+16*28], m3 ;in19
+ mova [rsp+gprsize+16*29], m5 ;in21
+ mova [rsp+gprsize+16*32], m7 ;in23
+
+ LOAD_8ROWS coeffq+16*3, 128, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*7, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end6)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end6:
+ SAVE_8ROWS coeffq+16*35, 64 ;in24~in31
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end7:
+ mova [rsp+gprsize+16*17], m2 ;in26
+ mova [rsp+gprsize+16*18], m6 ;in30
+ mova [rsp+gprsize+16*31], m1 ;in25
+ mova [rsp+gprsize+16*30], m3 ;in27
+ mova [rsp+gprsize+16*27], m5 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ mova m6, m0 ;in24
+ mova m7, m4 ;in28
+ mova m0, [coeffq+16*0 ] ;in0
+ mova m1, [coeffq+16*4 ] ;in4
+ mova m2, [coeffq+16*1 ] ;in8
+ mova m3, [coeffq+16*5 ] ;in12
+ mova m4, [coeffq+16*2 ] ;in16
+ mova m5, [coeffq+16*6 ] ;in20
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3 , 16
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main
+
+.pass2:
+ mov [rsp+gprsize*1+16*35], eobd
+ lea r3, [dstq+8]
+ mov [rsp+gprsize*2+16*35], r3
+ lea r3, [o(.end)]
+ jmp m(idct_8x32_internal_8bpc).end
+
+.end:
+ mov dstq, [rsp+gprsize*2+16*35]
+ mov eobd, [rsp+gprsize*1+16*35]
+ add coeffq, 16*32
+
+ mova m0, [coeffq+16*4 ] ;in1
+ mova m1, [coeffq+16*12] ;in3
+ mova m2, [coeffq+16*20] ;in5
+ mova m3, [coeffq+16*28] ;in7
+ mova m4, [coeffq+16*5 ] ;in9
+ mova m5, [coeffq+16*13] ;in11
+ mova m6, [coeffq+16*21] ;in13
+ mova m7, [coeffq+16*29] ;in15
+
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ mova m0, [coeffq+16*0 ] ;in0
+ mova m1, [coeffq+16*16] ;in4
+ mova m2, [coeffq+16*1 ] ;in8
+ mova m3, [coeffq+16*17] ;in12
+
+ cmp eobd, 150
+ jg .full1
+
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*8 ] ;in2
+ mova m1, [coeffq+16*24] ;in6
+ mova m2, [coeffq+16*9 ] ;in10
+ mova m3, [coeffq+16*25] ;in14
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ jmp m(idct_8x32_internal_8bpc).pass2
+
+.full1:
+ mova m4, [coeffq+16*2 ] ;in16
+ mova m5, [coeffq+16*18] ;in20
+ mova m6, [coeffq+16*3 ] ;in24
+ mova m7, [coeffq+16*19] ;in28
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*8 ] ;in2
+ mova m1, [coeffq+16*24] ;in6
+ mova m2, [coeffq+16*9 ] ;in10
+ mova m3, [coeffq+16*25] ;in14
+ mova m4, [coeffq+16*10] ;in18
+ mova m5, [coeffq+16*26] ;in22
+ mova m6, [coeffq+16*11] ;in26
+ mova m7, [coeffq+16*27] ;in30
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ mova m0, [coeffq+16*6 ] ;in17
+ mova m1, [coeffq+16*14] ;in19
+ mova m2, [coeffq+16*22] ;in21
+ mova m3, [coeffq+16*30] ;in23
+ mova m4, [coeffq+16*7 ] ;in25
+ mova m5, [coeffq+16*15] ;in27
+ mova m6, [coeffq+16*23] ;in29
+ mova m7, [coeffq+16*31] ;in31
+
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ call m(idct_8x32_internal_8bpc).main
+ jmp m(idct_8x32_internal_8bpc).pass2
+
+
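+; dct_dct 32x16: the 32-point row pass is done in idct_32x16_internal below,
+; then the 16-point column pass (idct_8x16 pass2) runs four times, stepping
+; dstq by 8 pixels and coeffq by 16*16 between runs.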
+cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+
+ call m(idct_32x16_internal_8bpc)
+ call m(idct_8x16_internal_8bpc).pass2
+
+ add coeffq, 16*16
+ lea dstq, [r3+8]
+ LOAD_8ROWS rsp+16*11, 16
+ mova [rsp+16*0], m7
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end
+ call m(idct_8x16_internal_8bpc).pass2
+
+ add coeffq, 16*16
+ lea dstq, [r3+8]
+ LOAD_8ROWS rsp+16*19, 16
+ mova [rsp+16*0], m7
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end
+ call m(idct_8x16_internal_8bpc).pass2
+
+ add coeffq, 16*16
+ lea dstq, [r3+8]
+ LOAD_8ROWS rsp+16*27, 16
+ mova [rsp+16*0], m7
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end
+ call m(idct_8x16_internal_8bpc).pass2
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ mov r3d, 16
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
+
+
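+; the pass-1 body below is executed twice (note the add/sub of 16 on coeffq),
+; with r3 holding the continuation address: .pass1_end1 for the first run,
+; .end for the second.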
+cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ add coeffq, 16
+ lea r3, [o(.pass1_end1)]
+.pass1:
+ LOAD_8ROWS coeffq+16*0, 128, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+16*4, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+16*2, 64, 1
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ LOAD_8ROWS coeffq+16*34, 64, 1
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+ call m(idct_8x32_internal_8bpc).main
+
+.pass1_end:
+ mova [rsp+gprsize+16*0 ], m7
+ mov tx2q, r3
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+16*0, 32
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*16, 32
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+16*32, 32
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end4:
+ SAVE_8ROWS coeffq+16*48, 32
+
+ sub coeffq, 16
+ lea r3, [o(.end)]
+ jmp .pass1
+
+.end:
+ ret
+
+
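+; branchless iteration count from eob: the three cmp/sbb pairs below leave
+; r3d = 1, 2, 3 or 4 for eob < 43, < 150, < 278 and >= 278 respectively.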
+cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+ mov r4d, eobd
+ cmp eobd, 43 ;if (eob > 43)
+ sbb r3d, r3d ; iteration_count++
+ cmp r4d, 150 ;if (eob > 150)
+ sbb r3d, 0 ; iteration_count++
+ cmp r4d, 278 ;if (eob > 278)
+ sbb r3d, -4 ; iteration_count++
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ lea r4, [dstq+8]
+ mov [rsp+16*3], r4
+ mov [rsp+gprsize+16*3], r3d
+ mov [rsp+gprsize*2+16*3], coeffq
+
+.loop:
+ LOAD_8ROWS coeffq, 64, 1
+ mova [rsp+16*1], m6
+ pxor m6, m6
+ REPX {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end3
+ mova [rsp+16*0], m2
+ mova [rsp+16*1], m3
+ mova [rsp+16*2], m4
+ mova m3, [o(pw_1697x16)]
+ mova m4, [o(pw_16384)]
+ REPX {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1
+ mova m2, [o(pw_8192)]
+ REPX {pmulhrsw x, m2}, m5, m6, m7, m0, m1
+ mova m2, [rsp+16*0]
+ mova [rsp+16*0], m7
+ IDTX16 2, 7, 3, 4
+ mova m7, [rsp+16*2]
+ mova [rsp+16*2], m5
+ IDTX16 7, 5, 3, 4
+ mova m5, [rsp+16*1]
+ mova [rsp+16*1], m6
+ pmulhrsw m3, m5
+ pmulhrsw m3, m4
+ psrlw m4, 1 ; pw_8192
+ paddsw m3, m5
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ pmulhrsw m4, m7
+ call m(idct_8x8_internal_8bpc).end3
+ lea dstq, [dstq+strideq*2]
+ add coeffq, 16
+ dec r3d
+ jg .loop
+ mov coeffq, [rsp+gprsize*2+16*3]
+ add coeffq, 64*8
+ mov r3d, [rsp+gprsize+16*3]
+ xor dstq, dstq
+ mov [rsp+gprsize+16*3], dstq
+ mov dstq, [rsp+16*3]
+ test r3d, r3d
+ jnz .loop
+ RET
+
+
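+; r3d acts as a shift register, consumed two bits per 8x8 tile in .loop_end:
+; a set bit keeps the loop in the current 8-pixel column group, a clear bit
+; steps dstq/coeffq to the next one, and an empty register ends the loop.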
+cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+ mov r4d, 12 ;1100b
+ mov r5d, 136 ;1000 1000b
+ cmp eobd, 44 ;if (eob > 43)
+ cmovns r4d, r5d ; iteration_count+2
+ cmp eobd, 151 ;if (eob > 150)
+ mov r3d, 34952 ;1000 1000 1000 1000b
+ cmovs r3d, r4d ; iteration_count += 4
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ lea r4, [dstq+8]
+ mov [rsp+16*3], r4
+
+.loop:
+ LOAD_8ROWS coeffq, 32, 1
+ REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
+ mova [rsp+16*1], m6
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end3
+ mova [rsp+16*1], m5
+ mova [rsp+16*2], m6
+ mova m6, [o(pw_1697x16)]
+ REPX {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4
+ pmulhrsw m7, [o(pw_2048)]
+ mova m5, [rsp+16*1]
+ mova [rsp+16*0], m7
+ IDTX16 5, 7, 6
+ mova m7, [rsp+16*2]
+ IDTX16 7, 6, 6
+ mova m6, [o(pw_2048)]
+ REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
+ mova [rsp+16*2], m5
+ mova [rsp+16*1], m7
+ call m(idct_8x8_internal_8bpc).end3
+ lea dstq, [dstq+strideq*2]
+ pxor m7, m7
+ REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+
+.loop_end:
+ add coeffq, 16
+ shr r3d, 2
+ jz .ret
+ test r3d, 2
+ jnz .loop
+ mov r4d, r3d
+ and r4d, 1
+ lea coeffq, [coeffq+r4*8+32*7]
+ mov dstq, [rsp+16*3]
+ lea r4, [dstq+8]
+ mov [rsp+16*3], r4
+ jmp .loop
+
+.ret:
+ RET
+
+
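+; dct_dct 32x32: pass 1 handles 2 or 4 column strips depending on eob (biased
+; by 136); the sign of the biased eob is saved and reused in both passes to
+; choose between the full and the fast (upper-half-zero) butterfly paths.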
+cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+
+ call m(idct_32x32_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r3d, 32
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
+
+
+cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r4d, 2
+ sub eobd, 136
+ mov [rsp+gprsize*1+16*35], eobd
+ mov r3d, 4
+ cmovs r3d, r4d
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*2+16*35], coeffq
+
+.pass1_loop:
+ LOAD_8ROWS coeffq+64*1, 64*2
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ mov tx2d, [rsp+gprsize*1+16*35]
+ test tx2d, tx2d
+ jl .fast
+
+.full:
+ LOAD_8ROWS coeffq+64*0, 64*4
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+64*2, 64*4
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+64*17, 64*2
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ call m(idct_8x32_internal_8bpc).main
+ jmp .pass1_end
+
+.fast:
+ mova m0, [coeffq+256*0]
+ mova m1, [coeffq+256*1]
+ mova m2, [coeffq+256*2]
+ mova m3, [coeffq+256*3]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ mova m0, [coeffq+128*1]
+ mova m1, [coeffq+128*3]
+ mova m2, [coeffq+128*5]
+ mova m3, [coeffq+128*7]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+
+.pass1_end:
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end4:
+ SAVE_8ROWS coeffq+64*24, 64
+
+ add coeffq, 16
+ dec r3d
+ jg .pass1_loop
+
+
+.pass2:
+ mov coeffq, [rsp+gprsize*2+16*35]
+ mov r3d, 4
+ lea tx2q, [o(.pass2_end)]
+
+.pass2_loop:
+ mov [rsp+gprsize*3+16*35], r3d
+ lea r3, [dstq+8]
+ mov [rsp+gprsize*2+16*35], r3
+
+ mova m0, [coeffq+16*4 ]
+ mova m1, [coeffq+16*12]
+ mova m2, [coeffq+16*20]
+ mova m3, [coeffq+16*28]
+ mova m4, [coeffq+16*5 ]
+ mova m5, [coeffq+16*13]
+ mova m6, [coeffq+16*21]
+ mova m7, [coeffq+16*29]
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ mov eobd, [rsp+gprsize*1+16*35]
+ test eobd, eobd
+ jl .fast1
+
+.full1:
+ mova m0, [coeffq+16*0 ]
+ mova m1, [coeffq+16*16]
+ mova m2, [coeffq+16*1 ]
+ mova m3, [coeffq+16*17]
+ mova m4, [coeffq+16*2 ]
+ mova m5, [coeffq+16*18]
+ mova m6, [coeffq+16*3 ]
+ mova m7, [coeffq+16*19]
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*8 ]
+ mova m1, [coeffq+16*24]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*25]
+ mova m4, [coeffq+16*10]
+ mova m5, [coeffq+16*26]
+ mova m6, [coeffq+16*11]
+ mova m7, [coeffq+16*27]
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ mova m0, [coeffq+16*6 ]
+ mova m1, [coeffq+16*14]
+ mova m2, [coeffq+16*22]
+ mova m3, [coeffq+16*30]
+ mova m4, [coeffq+16*7 ]
+ mova m5, [coeffq+16*15]
+ mova m6, [coeffq+16*23]
+ mova m7, [coeffq+16*31]
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ call m(idct_8x32_internal_8bpc).main
+ jmp tx2q
+
+.fast1:
+ mova m0, [coeffq+16*0 ]
+ mova m1, [coeffq+16*16]
+ mova m2, [coeffq+16*1 ]
+ mova m3, [coeffq+16*17]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*8 ]
+ mova m1, [coeffq+16*24]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*25]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ jmp tx2q
+
+.pass2_end:
+ lea r3, [o(.pass2_end1)]
+ jmp m(idct_8x32_internal_8bpc).end
+
+.pass2_end1:
+ lea tx2q, [o(.pass2_end)]
+ add coeffq, 16*32
+ mov dstq, [rsp+gprsize*2+16*35]
+ mov r3d, [rsp+gprsize*3+16*35]
+ dec r3d
+ jg .pass2_loop
+
+ ret
+
+
+cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2
+ mov r4d, 2
+ cmp eobd, 136
+ mov r3d, 4
+ cmovs r3d, r4d
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*0+16*3], r4
+ mov [rsp+gprsize*1+16*3], r3d
+ mov [rsp+gprsize*2+16*3], r3d
+ mov [rsp+gprsize*3+16*3], coeffq
+
+.loop:
+ LOAD_8ROWS coeffq, 64
+ mova [rsp+16*1], m6
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end3
+ pmulhrsw m7, [o(pw_8192)]
+ mova [rsp+16*0], m7
+ mova m7, [o(pw_8192)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ mova [rsp+16*1], m6
+ mova [rsp+16*2], m5
+ call m(idct_8x8_internal_8bpc).end3
+ lea dstq, [dstq+strideq*2]
+
+ pxor m7, m7
+ REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+
+ add coeffq, 16
+ dec r3d
+ jg .loop
+
+ mov r4d, [rsp+gprsize*2+16*3]
+ dec r4d
+ jle .ret
+
+ mov dstq, [rsp+gprsize*0+16*3]
+ mov coeffq, [rsp+gprsize*3+16*3]
+ mov [rsp+gprsize*2+16*3], r4
+ lea r3, [dstq+8]
+ add coeffq, 64*8
+ mov [rsp+gprsize*0+16*3], r3
+ mov r3d, [rsp+gprsize*1+16*3]
+ mov [rsp+gprsize*3+16*3], coeffq
+ jmp .loop
+
+.ret:
+ RET
+
+
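+; dct_dct 16x64: the even half of the 64-point column transform reuses the
+; 8x32 main/main_fast code; the odd half (t32..t63) is computed by the .main
+; and .main_fast labels further down in idct_16x64_internal.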
+cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_16x64_internal_8bpc)
+.end:
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r2d, 32
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+
+
+cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r4d, 2
+ sub eobd, 151
+ mov [rsp+gprsize*1+16*67], eobd
+ mov r3d, 4
+ cmovs r3d, r4d
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*2+16*67], coeffq
+
+.pass1_loop:
+ LOAD_8ROWS coeffq+64*0, 64*2
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+64*1, 64*2
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*0, 64
+
+ add coeffq, 16
+ dec r3d
+ jg .pass1_loop
+
+ mov coeffq, [rsp+gprsize*2+16*67]
+ mov r3d, 2
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(.end1)]
+
+.pass2_loop:
+ mov [rsp+gprsize*3+16*67], r3d
+ mov eobd, [rsp+gprsize*1+16*67]
+
+ mova m0, [coeffq+16*4 ] ;in1
+ mova m1, [coeffq+16*12] ;in3
+ mova m2, [coeffq+16*20] ;in5
+ mova m3, [coeffq+16*28] ;in7
+ mova m4, [coeffq+16*5 ] ;in9
+ mova m5, [coeffq+16*13] ;in11
+ mova m6, [coeffq+16*21] ;in13
+ mova m7, [coeffq+16*29] ;in15
+ mova [rsp+gprsize+16*35], m0 ;in1
+ mova [rsp+gprsize+16*49], m1 ;in3
+ mova [rsp+gprsize+16*43], m2 ;in5
+ mova [rsp+gprsize+16*41], m3 ;in7
+ mova [rsp+gprsize+16*39], m4 ;in9
+ mova [rsp+gprsize+16*45], m5 ;in11
+ mova [rsp+gprsize+16*47], m6 ;in13
+ mova [rsp+gprsize+16*37], m7 ;in15
+
+ pxor m4, m4
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+
+ test eobd, eobd
+ jl .fast
+
+.full:
+ mova m2, [coeffq+16*2]
+ mova m3, [coeffq+16*3]
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ mova m0, [coeffq+16*16]
+ mova m1, [coeffq+16*17]
+ mova m2, [coeffq+16*18]
+ mova m3, [coeffq+16*19]
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ mova m0, [coeffq+16*8 ]
+ mova m1, [coeffq+16*24]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*25]
+ mova m4, [coeffq+16*10]
+ mova m5, [coeffq+16*26]
+ mova m6, [coeffq+16*11]
+ mova m7, [coeffq+16*27]
+ mova [rsp+gprsize+16*19], m0
+ mova [rsp+gprsize+16*26], m1
+ mova [rsp+gprsize+16*23], m2
+ mova [rsp+gprsize+16*22], m3
+ mova [rsp+gprsize+16*21], m4
+ mova [rsp+gprsize+16*24], m5
+ mova [rsp+gprsize+16*25], m6
+ mova [rsp+gprsize+16*20], m7
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*6 ] ;in17
+ mova m1, [coeffq+16*14] ;in19
+ mova m2, [coeffq+16*22] ;in21
+ mova m3, [coeffq+16*30] ;in23
+ mova m4, [coeffq+16*7 ] ;in25
+ mova m5, [coeffq+16*15] ;in27
+ mova m6, [coeffq+16*23] ;in29
+ mova m7, [coeffq+16*31] ;in31
+ mova [rsp+gprsize+16*63], m0 ;in17
+ mova [rsp+gprsize+16*53], m1 ;in19
+ mova [rsp+gprsize+16*55], m2 ;in21
+ mova [rsp+gprsize+16*61], m3 ;in23
+ mova [rsp+gprsize+16*59], m4 ;in25
+ mova [rsp+gprsize+16*57], m5 ;in27
+ mova [rsp+gprsize+16*51], m6 ;in29
+ mova [rsp+gprsize+16*65], m7 ;in31
+
+ call .main
+ jmp .end
+
+.fast:
+ REPX {mova x, m4}, m2, m3, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ mova m0, [coeffq+16*16]
+ mova m1, [coeffq+16*17]
+
+ REPX {mova x, m4}, m2, m3, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ mova m0, [coeffq+16*8 ]
+ mova m1, [coeffq+16*24]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*25]
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+
+ call m(idct_8x32_internal_8bpc).main_veryfast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ call .main_fast
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov r3, r4
+ jmp m(idct_8x32_internal_8bpc).end2
+
+.end1:
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ lea dstq, [dstq+strideq*2]
+ lea r3, [rsp+16*32+gprsize]
+ call .write
+ mov dstq, [rsp+gprsize*2+16*67]
+ mov r3d, [rsp+gprsize*3+16*67]
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(.end1)]
+
+ dec r3d
+ jg .pass2_loop
+ ret
+.write:
+ mova [r3+16*0], m7
+ mov r4, -16*32
+ pxor m7, m7
+ sub coeffq, r4
+.zero_loop:
+ mova [coeffq+r4+16*0], m7
+ mova [coeffq+r4+16*1], m7
+ add r4, 16*2
+ jl .zero_loop
+ call .write_main2
+ LOAD_8ROWS r3+16*11, 16
+ call .write_main
+ LOAD_8ROWS r3+16*19, 16
+ call .write_main
+ LOAD_8ROWS r3+16*27, 16
+.write_main:
+ mova [r3+16*0], m7
+.write_main2:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [r3+16*0]
+ mova [r3+16*2], m5
+ mova [r3+16*1], m6
+ mova [r3+16*0], m7
+ WRITE_8X4 0, 1, 2, 3, 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ WRITE_8X4 4, [r3+16*2], [r3+16*1], [r3+16*0], 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ ret
+
+
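+; .main_fast assumes only the first 8 odd inputs (in1, in3, ..., in15) are
+; nonzero, so every (t, t') pair is formed directly from one input scaled by
+; two constants; it then joins the full path at .main2.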
+ALIGN function_align
+cglobal_label .main_fast
+ mova m0, [rsp+gprsize*2+16*35] ;in1
+ pmulhrsw m3, m0, [o(pw_4095x8)] ;t62,t63
+ pmulhrsw m0, [o(pw_101x8)] ;t32,t33
+ mova m7, [o(pd_2048)]
+ mova [rsp+gprsize*2+16*35], m0 ;t32
+ mova [rsp+gprsize*2+16*66], m3 ;t63
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 401, 4076 ;t33a, t62a
+ mova [rsp+gprsize*2+16*36], m3 ;t33a
+ mova [rsp+gprsize*2+16*65], m0 ;t62a
+
+ mova m1, [rsp+gprsize*2+16*37] ;in15
+ pmulhrsw m2, m1, [o(pw_3822x8)] ;t60,t61
+ pmulhrsw m1, [o(pw_m1474x8)] ;t34,t35
+ mova [rsp+gprsize*2+16*38], m1 ;t35
+ mova [rsp+gprsize*2+16*63], m2 ;t60
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m4076, 401 ;t34a, t61a
+ mova [rsp+gprsize*2+16*37], m2 ;t34a
+ mova [rsp+gprsize*2+16*64], m1 ;t61a
+
+ mova m0, [rsp+gprsize*2+16*39] ;in9
+ pmulhrsw m3, m0, [o(pw_3996x8)] ;t58,t59
+ pmulhrsw m0, [o(pw_897x8)] ;t36,t37
+ mova [rsp+gprsize*2+16*39], m0 ;t36
+ mova [rsp+gprsize*2+16*62], m3 ;t59
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 3166, 2598 ;t37a, t58a
+ mova [rsp+gprsize*2+16*40], m3 ;t37a
+ mova [rsp+gprsize*2+16*61], m0 ;t58a
+
+ mova m1, [rsp+gprsize*2+16*41] ;in7
+ pmulhrsw m2, m1, [o(pw_4036x8)] ;t56,t57
+ pmulhrsw m1, [o(pw_m700x8)] ;t38,t39
+ mova [rsp+gprsize*2+16*42], m1 ;t39
+ mova [rsp+gprsize*2+16*59], m2 ;t56
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m2598, 3166 ;t38a, t57a
+ mova [rsp+gprsize*2+16*41], m2 ;t38a
+ mova [rsp+gprsize*2+16*60], m1 ;t57a
+
+ mova m0, [rsp+gprsize*2+16*43] ;in5
+ pmulhrsw m3, m0, [o(pw_4065x8)] ;t54,t55
+ pmulhrsw m0, [o(pw_501x8)] ;t40,t41
+ mova [rsp+gprsize*2+16*43], m0 ;t40
+ mova [rsp+gprsize*2+16*58], m3 ;t55
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 1931, 3612 ;t41a, t54a
+ mova [rsp+gprsize*2+16*44], m3 ;t41a
+ mova [rsp+gprsize*2+16*57], m0 ;t54a
+
+ mova m1, [rsp+gprsize*2+16*45] ;in11
+ pmulhrsw m2, m1, [o(pw_3948x8)] ;t52,t53
+ pmulhrsw m1, [o(pw_m1092x8)] ;t42,t43
+ mova [rsp+gprsize*2+16*46], m1 ;t43
+ mova [rsp+gprsize*2+16*55], m2 ;t52
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m3612, 1931 ;t42a, t53a
+ mova [rsp+gprsize*2+16*45], m2 ;t42a
+ mova [rsp+gprsize*2+16*56], m1 ;t53a
+
+ mova m0, [rsp+gprsize*2+16*47] ;in13
+ pmulhrsw m3, m0, [o(pw_3889x8)] ;t50,t51
+ pmulhrsw m0, [o(pw_1285x8)] ;t44,t45
+ mova m6, m0
+ mova [rsp+gprsize*2+16*54], m3 ;t51
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 3920, 1189 ;t45a, t50a
+ mova [rsp+gprsize*2+16*48], m3 ;t45a
+ mova [rsp+gprsize*2+16*53], m0 ;t50a
+
+ mova m0, [rsp+gprsize*2+16*49] ;in3
+ pmulhrsw m3, m0, [o(pw_4085x8)] ;t48,t49
+ pmulhrsw m0, [o(pw_m301x8)] ;t46,t47
+ mova m4, m3
+ mova m5, m0
+
+ jmp .main2
+
+ALIGN function_align
+cglobal_label .main
+ mova m0, [rsp+gprsize*2+16*35] ;in1
+ mova m1, [rsp+gprsize*2+16*65] ;in31
+ pmulhrsw m3, m0, [o(pw_4095x8)] ;t63a
+ pmulhrsw m0, [o(pw_101x8)] ;t32a
+ pmulhrsw m2, m1, [o(pw_2967x8)] ;t62a
+ pmulhrsw m1, [o(pw_m2824x8)] ;t33a
+ mova m7, [o(pd_2048)]
+ psubsw m4, m0, m1 ;t33
+ paddsw m0, m1 ;t32
+ psubsw m5, m3, m2 ;t62
+ paddsw m3, m2 ;t63
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 401, 4076 ;t33a, t62a
+ mova [rsp+gprsize*2+16*35], m0 ;t32
+ mova [rsp+gprsize*2+16*36], m5 ;t33a
+ mova [rsp+gprsize*2+16*65], m4 ;t62a
+ mova [rsp+gprsize*2+16*66], m3 ;t63
+
+ mova m0, [rsp+gprsize*2+16*63] ;in17
+ mova m1, [rsp+gprsize*2+16*37] ;in15
+ pmulhrsw m3, m0, [o(pw_3745x8)] ;t61a
+ pmulhrsw m0, [o(pw_1660x8)] ;t34a
+ pmulhrsw m2, m1, [o(pw_3822x8)] ;t60a
+ pmulhrsw m1, [o(pw_m1474x8)] ;t35a
+ psubsw m4, m1, m0 ;t34
+ paddsw m0, m1 ;t35
+ psubsw m5, m2, m3 ;t61
+ paddsw m3, m2 ;t60
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m4076, 401 ;t34a, t61a
+ mova [rsp+gprsize*2+16*37], m5 ;t34a
+ mova [rsp+gprsize*2+16*38], m0 ;t35
+ mova [rsp+gprsize*2+16*63], m3 ;t60
+ mova [rsp+gprsize*2+16*64], m4 ;t61a
+
+ mova m0, [rsp+gprsize*2+16*39] ;in9
+ mova m1, [rsp+gprsize*2+16*61] ;in23
+ pmulhrsw m3, m0, [o(pw_3996x8)] ;t59a
+ pmulhrsw m0, [o(pw_897x8)] ;t36a
+ pmulhrsw m2, m1, [o(pw_3461x8)] ;t58a
+ pmulhrsw m1, [o(pw_m2191x8)] ;t37a
+ psubsw m4, m0, m1 ;t37
+ paddsw m0, m1 ;t36
+ psubsw m5, m3, m2 ;t58
+ paddsw m3, m2 ;t59
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 3166, 2598 ;t37a, t58a
+ mova [rsp+gprsize*2+16*39], m0 ;t36
+ mova [rsp+gprsize*2+16*40], m5 ;t37a
+ mova [rsp+gprsize*2+16*61], m4 ;t58a
+ mova [rsp+gprsize*2+16*62], m3 ;t59
+
+ mova m0, [rsp+gprsize*2+16*59] ;in25
+ mova m1, [rsp+gprsize*2+16*41] ;in7
+ pmulhrsw m3, m0, [o(pw_3349x8)] ;t57a
+ pmulhrsw m0, [o(pw_2359x8)] ;t38a
+ pmulhrsw m2, m1, [o(pw_4036x8)] ;t56a
+ pmulhrsw m1, [o(pw_m700x8)] ;t39a
+ psubsw m4, m1, m0 ;t38
+ paddsw m0, m1 ;t39
+ psubsw m5, m2, m3 ;t57
+ paddsw m3, m2 ;t56
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m2598, 3166 ;t38a, t57a
+ mova [rsp+gprsize*2+16*41], m5 ;t38a
+ mova [rsp+gprsize*2+16*42], m0 ;t39
+ mova [rsp+gprsize*2+16*59], m3 ;t56
+ mova [rsp+gprsize*2+16*60], m4 ;t57a
+
+ mova m0, [rsp+gprsize*2+16*43] ;in5
+ mova m1, [rsp+gprsize*2+16*57] ;in27
+ pmulhrsw m3, m0, [o(pw_4065x8)] ;t55a
+ pmulhrsw m0, [o(pw_501x8)] ;t40a
+ pmulhrsw m2, m1, [o(pw_3229x8)] ;t54a
+ pmulhrsw m1, [o(pw_m2520x8)] ;t41a
+ psubsw m4, m0, m1 ;t41
+ paddsw m0, m1 ;t40
+ psubsw m5, m3, m2 ;t54
+ paddsw m3, m2 ;t55
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 1931, 3612 ;t41a, t54a
+ mova [rsp+gprsize*2+16*43], m0 ;t40
+ mova [rsp+gprsize*2+16*44], m5 ;t41a
+ mova [rsp+gprsize*2+16*57], m4 ;t54a
+ mova [rsp+gprsize*2+16*58], m3 ;t55
+
+ mova m0, [rsp+gprsize*2+16*55] ;in21
+ mova m1, [rsp+gprsize*2+16*45] ;in11
+ pmulhrsw m3, m0, [o(pw_3564x8)] ;t53a
+ pmulhrsw m0, [o(pw_2019x8)] ;t42a
+ pmulhrsw m2, m1, [o(pw_3948x8)] ;t52a
+ pmulhrsw m1, [o(pw_m1092x8)] ;t43a
+ psubsw m4, m1, m0 ;t42
+ paddsw m0, m1 ;t43
+ psubsw m5, m2, m3 ;t53
+ paddsw m3, m2 ;t52
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m3612, 1931 ;t42a, t53a
+ mova [rsp+gprsize*2+16*45], m5 ;t42a
+ mova [rsp+gprsize*2+16*46], m0 ;t43
+ mova [rsp+gprsize*2+16*55], m3 ;t52
+ mova [rsp+gprsize*2+16*56], m4 ;t53a
+
+ mova m0, [rsp+gprsize*2+16*47] ;in13
+ mova m1, [rsp+gprsize*2+16*53] ;in19
+ pmulhrsw m3, m0, [o(pw_3889x8)] ;t51a
+ pmulhrsw m0, [o(pw_1285x8)] ;t44a
+ pmulhrsw m2, m1, [o(pw_3659x8)] ;t50a
+ pmulhrsw m1, [o(pw_m1842x8)] ;t45a
+ psubsw m4, m0, m1 ;t45
+ paddsw m0, m1 ;t44
+ psubsw m5, m3, m2 ;t50
+ paddsw m3, m2 ;t51
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 3920, 1189 ;t45a, t50a
+ mova m6, m0
+ mova [rsp+gprsize*2+16*48], m5 ;t45a
+ mova [rsp+gprsize*2+16*53], m4 ;t50a
+ mova [rsp+gprsize*2+16*54], m3 ;t51
+
+ mova m0, [rsp+gprsize*2+16*51] ;in29
+ mova m1, [rsp+gprsize*2+16*49] ;in3
+ pmulhrsw m3, m0, [o(pw_3102x8)] ;t49a
+ pmulhrsw m0, [o(pw_2675x8)] ;t46a
+ pmulhrsw m2, m1, [o(pw_4085x8)] ;t48a
+ pmulhrsw m1, [o(pw_m301x8)] ;t47a
+ psubsw m5, m1, m0 ;t46
+ paddsw m0, m1 ;t47
+ psubsw m4, m2, m3 ;t49
+ paddsw m3, m2 ;t48
+
+ALIGN function_align
+.main2:
+ ITX_MULSUB_2W 4, 5, 1, 2, 7, m1189, 3920 ;t46a, t49a
+ mova m1, [rsp+gprsize*2+16*54] ;t51
+ psubsw m2, m0, m6 ;t44a
+ paddsw m0, m6 ;t47a
+ psubsw m6, m3, m1 ;t51a
+ paddsw m3, m1 ;t48a
+ mova [rsp+gprsize*2+16*50], m0 ;t47a
+ mova [rsp+gprsize*2+16*51], m3 ;t48a
+ ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t44, t51
+ mova [rsp+gprsize*2+16*47], m6 ;t44
+ mova [rsp+gprsize*2+16*54], m2 ;t51
+
+ mova m0, [rsp+gprsize*2+16*48] ;t45a
+ mova m3, [rsp+gprsize*2+16*53] ;t50a
+ psubsw m2, m4, m0 ;t45
+ paddsw m4, m0 ;t46
+ psubsw m6, m5, m3 ;t50
+ paddsw m5, m3 ;t49
+ ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t45a, t50a
+ mova [rsp+gprsize*2+16*48], m6 ;t45a
+ mova [rsp+gprsize*2+16*49], m4 ;t46
+ mova [rsp+gprsize*2+16*52], m5 ;t49
+ mova [rsp+gprsize*2+16*53], m2 ;t50a
+
+ mova m0, [rsp+gprsize*2+16*43] ;t40
+ mova m2, [rsp+gprsize*2+16*46] ;t43
+ mova m3, [rsp+gprsize*2+16*55] ;t52
+ mova m1, [rsp+gprsize*2+16*58] ;t55
+ psubsw m4, m0, m2 ;t43a
+ paddsw m0, m2 ;t40a
+ psubsw m5, m1, m3 ;t52a
+ paddsw m1, m3 ;t55a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t43, t52
+ mova [rsp+gprsize*2+16*43], m0 ;t40a
+ mova [rsp+gprsize*2+16*46], m5 ;t43
+ mova [rsp+gprsize*2+16*55], m4 ;t52
+ mova [rsp+gprsize*2+16*58], m1 ;t55a
+
+ mova m0, [rsp+gprsize*2+16*44] ;t41a
+ mova m2, [rsp+gprsize*2+16*45] ;t42a
+ mova m3, [rsp+gprsize*2+16*56] ;t53a
+ mova m1, [rsp+gprsize*2+16*57] ;t54a
+ psubsw m4, m0, m2 ;t42
+ paddsw m0, m2 ;t41
+ psubsw m5, m1, m3 ;t53
+ paddsw m1, m3 ;t54
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t42a, t53a
+ mova [rsp+gprsize*2+16*44], m0 ;t41
+ mova [rsp+gprsize*2+16*45], m5 ;t42a
+ mova [rsp+gprsize*2+16*56], m4 ;t53a
+ mova [rsp+gprsize*2+16*57], m1 ;t54
+
+ mova m0, [rsp+gprsize*2+16*41] ;t38a
+ mova m2, [rsp+gprsize*2+16*40] ;t37a
+ mova m3, [rsp+gprsize*2+16*61] ;t58a
+ mova m1, [rsp+gprsize*2+16*60] ;t57a
+ psubsw m4, m0, m2 ;t37
+ paddsw m0, m2 ;t38
+ psubsw m5, m1, m3 ;t58
+ paddsw m1, m3 ;t57
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t37a, t58a
+ mova [rsp+gprsize*2+16*41], m0 ;t38
+ mova [rsp+gprsize*2+16*40], m5 ;t37a
+ mova [rsp+gprsize*2+16*61], m4 ;t58a
+ mova [rsp+gprsize*2+16*60], m1 ;t57
+
+ mova m0, [rsp+gprsize*2+16*42] ;t39
+ mova m2, [rsp+gprsize*2+16*39] ;t36
+ mova m3, [rsp+gprsize*2+16*62] ;t59
+ mova m1, [rsp+gprsize*2+16*59] ;t56
+ psubsw m4, m0, m2 ;t36a
+ paddsw m0, m2 ;t39a
+ psubsw m5, m1, m3 ;t59a
+ paddsw m1, m3 ;t56a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t36, t59
+ mova [rsp+gprsize*2+16*42], m0 ;t39a
+ mova [rsp+gprsize*2+16*39], m5 ;t36
+ mova [rsp+gprsize*2+16*62], m4 ;t59
+ mova [rsp+gprsize*2+16*59], m1 ;t56a
+
+ mova m0, [rsp+gprsize*2+16*35] ;t32
+ mova m2, [rsp+gprsize*2+16*38] ;t35
+ mova m3, [rsp+gprsize*2+16*63] ;t60
+ mova m1, [rsp+gprsize*2+16*66] ;t63
+ psubsw m4, m0, m2 ;t35a
+ paddsw m0, m2 ;t32a
+ psubsw m5, m1, m3 ;t60a
+ paddsw m1, m3 ;t63a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t35, t60
+ mova [rsp+gprsize*2+16*35], m0 ;t32a
+ mova [rsp+gprsize*2+16*38], m5 ;t35
+ mova [rsp+gprsize*2+16*63], m4 ;t60
+ mova [rsp+gprsize*2+16*66], m1 ;t63a
+
+ mova m0, [rsp+gprsize*2+16*36] ;t33a
+ mova m2, [rsp+gprsize*2+16*37] ;t34a
+ mova m3, [rsp+gprsize*2+16*64] ;t61a
+ mova m1, [rsp+gprsize*2+16*65] ;t62a
+ psubsw m4, m0, m2 ;t34
+ paddsw m0, m2 ;t33
+ psubsw m5, m1, m3 ;t61
+ paddsw m1, m3 ;t62
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t34a, t61a
+
+ mova m2, [rsp+gprsize*2+16*41] ;t38
+ mova m3, [rsp+gprsize*2+16*60] ;t57
+ psubsw m6, m0, m2 ;t38a
+ paddsw m0, m2 ;t33a
+ psubsw m2, m1, m3 ;t57a
+ paddsw m1, m3 ;t62a
+ mova [rsp+gprsize*2+16*36], m0 ;t33a
+ mova [rsp+gprsize*2+16*65], m1 ;t62a
+ ITX_MULSUB_2W 2, 6, 0, 3, 7, 1567, 3784 ;t38, t57
+ mova [rsp+gprsize*2+16*41], m2 ;t38
+ mova [rsp+gprsize*2+16*60], m6 ;t57
+
+ mova m2, [rsp+gprsize*2+16*40] ;t37
+ mova m3, [rsp+gprsize*2+16*61] ;t58
+ psubsw m0, m5, m2 ;t37
+ paddsw m5, m2 ;t34
+ psubsw m1, m4, m3 ;t58
+ paddsw m4, m3 ;t61
+ ITX_MULSUB_2W 1, 0, 2, 3, 7, 1567, 3784 ;t37a, t58a
+ mova [rsp+gprsize*2+16*37], m5 ;t34
+ mova [rsp+gprsize*2+16*64], m4 ;t61
+ mova [rsp+gprsize*2+16*40], m1 ;t37a
+ mova [rsp+gprsize*2+16*61], m0 ;t58a
+
+ mova m0, [rsp+gprsize*2+16*38] ;t35
+ mova m2, [rsp+gprsize*2+16*39] ;t36
+ mova m3, [rsp+gprsize*2+16*62] ;t59
+ mova m1, [rsp+gprsize*2+16*63] ;t60
+ psubsw m4, m0, m2 ;t36a
+ paddsw m0, m2 ;t35a
+ psubsw m5, m1, m3 ;t59a
+ paddsw m1, m3 ;t60a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t36, t59
+ mova [rsp+gprsize*2+16*38], m0 ;t35a
+ mova [rsp+gprsize*2+16*39], m5 ;t36
+ mova [rsp+gprsize*2+16*62], m4 ;t59
+ mova [rsp+gprsize*2+16*63], m1 ;t60a
+
+ mova m0, [rsp+gprsize*2+16*35] ;t32a
+ mova m2, [rsp+gprsize*2+16*42] ;t39a
+ mova m3, [rsp+gprsize*2+16*59] ;t56a
+ mova m1, [rsp+gprsize*2+16*66] ;t63a
+ psubsw m4, m0, m2 ;t39
+ paddsw m0, m2 ;t32
+ psubsw m5, m1, m3 ;t56
+ paddsw m1, m3 ;t63
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t39a, t56a
+ mova [rsp+gprsize*2+16*35], m0 ;t32
+ mova [rsp+gprsize*2+16*42], m5 ;t39a
+ mova [rsp+gprsize*2+16*59], m4 ;t56a
+ mova [rsp+gprsize*2+16*66], m1 ;t63
+
+ mova m0, [rsp+gprsize*2+16*50] ;t47a
+ mova m2, [rsp+gprsize*2+16*43] ;t40a
+ mova m3, [rsp+gprsize*2+16*58] ;t55a
+ mova m1, [rsp+gprsize*2+16*51] ;t48a
+ psubsw m4, m0, m2 ;t40
+ paddsw m0, m2 ;t47
+ psubsw m5, m1, m3 ;t55
+ paddsw m1, m3 ;t48
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t40a, t55a
+ mova [rsp+gprsize*2+16*50], m0 ;t47
+ mova [rsp+gprsize*2+16*43], m5 ;t40a
+ mova [rsp+gprsize*2+16*58], m4 ;t55a
+ mova [rsp+gprsize*2+16*51], m1 ;t48
+
+ mova m0, [rsp+gprsize*2+16*49] ;t46
+ mova m2, [rsp+gprsize*2+16*44] ;t41
+ mova m3, [rsp+gprsize*2+16*57] ;t54
+ mova m1, [rsp+gprsize*2+16*52] ;t49
+ psubsw m4, m0, m2 ;t41a
+ paddsw m0, m2 ;t46a
+ psubsw m5, m1, m3 ;t54a
+ paddsw m1, m3 ;t49a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t41, t54
+ mova [rsp+gprsize*2+16*49], m0 ;t46a
+ mova [rsp+gprsize*2+16*44], m5 ;t41
+ mova [rsp+gprsize*2+16*57], m4 ;t54
+ mova [rsp+gprsize*2+16*52], m1 ;t49a
+
+ mova m0, [rsp+gprsize*2+16*48] ;t45a
+ mova m2, [rsp+gprsize*2+16*45] ;t42a
+ mova m3, [rsp+gprsize*2+16*56] ;t53a
+ mova m1, [rsp+gprsize*2+16*53] ;t50a
+ psubsw m4, m0, m2 ;t42
+ paddsw m0, m2 ;t45
+ psubsw m5, m1, m3 ;t53
+ paddsw m1, m3 ;t50
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t42a, t53a
+ mova [rsp+gprsize*2+16*48], m0 ;t45
+ mova [rsp+gprsize*2+16*45], m5 ;t42a
+ mova [rsp+gprsize*2+16*56], m4 ;t53a
+ mova [rsp+gprsize*2+16*53], m1 ;t50
+
+ mova m0, [rsp+gprsize*2+16*47] ;t44
+ mova m2, [rsp+gprsize*2+16*46] ;t43
+ mova m3, [rsp+gprsize*2+16*55] ;t52
+ mova m1, [rsp+gprsize*2+16*54] ;t51
+ psubsw m4, m0, m2 ;t43a
+ paddsw m0, m2 ;t44a
+ psubsw m5, m1, m3 ;t52a
+ paddsw m1, m3 ;t51a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t43, t52
+
+ mova m2, [rsp+gprsize*2+16*38] ;t35a
+ mova m3, [rsp+gprsize*2+16*31] ;tmp[28]
+ psubsw m6, m2, m0 ;t44
+ paddsw m2, m0 ;t35
+ psubsw m0, m3, m2 ;out35
+ paddsw m2, m3 ;out28
+ mova m3, [rsp+gprsize*2+16*63] ;t60a
+ mova [rsp+gprsize*2+16*38], m0 ;out35
+ mova [rsp+gprsize*2+16*31], m2 ;out28
+ psubsw m0, m3, m1 ;t51
+ paddsw m3, m1 ;t60
+ ITX_MULSUB_2W 0, 6, 1, 2, 7, 2896, 2896 ;t44a, t51a
+ mova m2, [rsp+gprsize*2+16*6 ] ;tmp[3]
+ psubsw m1, m2, m3 ;out60
+ paddsw m2, m3 ;out3
+ mova m3, [rsp+gprsize*2+16*22] ;tmp[19]
+ mova [rsp+gprsize*2+16*63], m1 ;out60
+ mova [rsp+gprsize*2+16*6 ], m2 ;out3
+ psubsw m1, m3, m0 ;out44
+ paddsw m3, m0 ;out19
+ mova m2, [rsp+gprsize*2+16*15] ;tmp[12]
+
+ mova m0, [rsp+gprsize*2+16*39] ;t36
+ mova [rsp+gprsize*2+16*47], m1 ;out44
+ mova [rsp+gprsize*2+16*22], m3 ;out19
+ mova m1, [rsp+gprsize*2+16*62] ;t59
+ psubsw m3, m2, m6 ;out51
+ paddsw m2, m6 ;out12
+ mova [rsp+gprsize*2+16*54], m3 ;out51
+ mova [rsp+gprsize*2+16*15], m2 ;out12
+ psubsw m2, m0, m5 ;t43a
+ paddsw m0, m5 ;t36a
+ mova m5, [rsp+gprsize*2+16*30] ;tmp[27]
+ psubsw m3, m1, m4 ;t52a
+ paddsw m1, m4 ;t59a
+ ITX_MULSUB_2W 3, 2, 4, 6, 7, 2896, 2896 ;t43, t52
+ mova m4, [rsp+gprsize*2+16*7 ] ;tmp[4 ]
+ psubsw m6, m5, m0 ;out36
+ paddsw m5, m0 ;out27
+ psubsw m0, m4, m1 ;out59
+ paddsw m4, m1 ;out4
+ mova [rsp+gprsize*2+16*39], m6 ;out36
+ mova [rsp+gprsize*2+16*30], m5 ;out27
+ mova [rsp+gprsize*2+16*62], m0 ;out59
+ mova [rsp+gprsize*2+16*7 ], m4 ;out4
+ mova m0, [rsp+gprsize*2+16*23] ;tmp[20]
+ mova m5, [rsp+gprsize*2+16*14] ;tmp[11]
+ psubsw m4, m0, m3 ;out43
+ paddsw m0, m3 ;out20
+ psubsw m6, m5, m2 ;out52
+ paddsw m5, m2 ;out11
+ mova [rsp+gprsize*2+16*46], m4 ;out43
+ mova [rsp+gprsize*2+16*23], m0 ;out20
+ mova [rsp+gprsize*2+16*55], m6 ;out52
+ mova [rsp+gprsize*2+16*14], m5 ;out11
+
+ mova m0, [rsp+gprsize*2+16*40] ;t37a
+ mova m5, [rsp+gprsize*2+16*45] ;t42a
+ mova m3, [rsp+gprsize*2+16*56] ;t53a
+ mova m1, [rsp+gprsize*2+16*61] ;t58a
+ mova m2, [rsp+gprsize*2+16*29] ;tmp[26]
+ psubsw m4, m0, m5 ;t42
+ paddsw m0, m5 ;t37
+ psubsw m5, m1, m3 ;t53
+ paddsw m1, m3 ;t58
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t43, t52
+ mova m3, [rsp+gprsize*2+16*8 ] ;tmp[5 ]
+ psubsw m6, m2, m0 ;out37
+ paddsw m2, m0 ;out26
+ psubsw m0, m3, m1 ;out58
+ paddsw m3, m1 ;out5
+ mova [rsp+gprsize*2+16*40], m6 ;out37
+ mova [rsp+gprsize*2+16*29], m2 ;out26
+ mova [rsp+gprsize*2+16*61], m0 ;out58
+ mova [rsp+gprsize*2+16*8 ], m3 ;out5
+ mova m0, [rsp+gprsize*2+16*24] ;tmp[21]
+ mova m1, [rsp+gprsize*2+16*13] ;tmp[10]
+ psubsw m2, m0, m5 ;out42
+ paddsw m0, m5 ;out21
+ psubsw m3, m1, m4 ;out53
+ paddsw m1, m4 ;out10
+ mova [rsp+gprsize*2+16*45], m2 ;out42
+ mova [rsp+gprsize*2+16*24], m0 ;out21
+ mova [rsp+gprsize*2+16*56], m3 ;out53
+ mova [rsp+gprsize*2+16*13], m1 ;out10
+
+ mova m0, [rsp+gprsize*2+16*41] ;t38
+ mova m5, [rsp+gprsize*2+16*44] ;t41
+ mova m3, [rsp+gprsize*2+16*57] ;t54
+ mova m1, [rsp+gprsize*2+16*60] ;t57
+ mova m2, [rsp+gprsize*2+16*28] ;tmp[25]
+ psubsw m4, m0, m5 ;t41a
+ paddsw m0, m5 ;t38a
+ psubsw m5, m1, m3 ;t54a
+ paddsw m1, m3 ;t57a
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t41a, t54a
+ mova m3, [rsp+gprsize*2+16*9 ] ;tmp[6 ]
+ psubsw m6, m2, m0 ;out38
+ paddsw m2, m0 ;out25
+ psubsw m0, m3, m1 ;out57
+ paddsw m3, m1 ;out6
+ mova [rsp+gprsize*2+16*41], m6 ;out38
+ mova [rsp+gprsize*2+16*28], m2 ;out25
+ mova [rsp+gprsize*2+16*60], m0 ;out57
+ mova [rsp+gprsize*2+16*9 ], m3 ;out6
+ mova m0, [rsp+gprsize*2+16*25] ;tmp[22]
+ mova m1, [rsp+gprsize*2+16*12] ;tmp[9 ]
+ psubsw m2, m0, m5 ;out41
+ paddsw m0, m5 ;out22
+ psubsw m3, m1, m4 ;out54
+ paddsw m1, m4 ;out9
+ mova [rsp+gprsize*2+16*44], m2 ;out41
+ mova [rsp+gprsize*2+16*25], m0 ;out22
+ mova [rsp+gprsize*2+16*57], m3 ;out54
+ mova [rsp+gprsize*2+16*12], m1 ;out9
+
+ mova m0, [rsp+gprsize*2+16*42] ;t39a
+ mova m5, [rsp+gprsize*2+16*43] ;t40a
+ mova m3, [rsp+gprsize*2+16*58] ;t55a
+ mova m1, [rsp+gprsize*2+16*59] ;t56a
+ mova m2, [rsp+gprsize*2+16*27] ;tmp[24]
+ psubsw m4, m0, m5 ;t40
+ paddsw m0, m5 ;t39
+ psubsw m5, m1, m3 ;t55
+ paddsw m1, m3 ;t56
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t40a, t55a
+ mova m3, [rsp+gprsize*2+16*10] ;tmp[7 ]
+ psubsw m6, m2, m0 ;out39
+ paddsw m2, m0 ;out24
+ psubsw m0, m3, m1 ;out56
+ paddsw m3, m1 ;out7
+ mova [rsp+gprsize*2+16*42], m6 ;out39
+ mova [rsp+gprsize*2+16*27], m2 ;out24
+ mova [rsp+gprsize*2+16*59], m0 ;out56
+ mova [rsp+gprsize*2+16*10], m3 ;out7
+ mova m0, [rsp+gprsize*2+16*26] ;tmp[23]
+ mova m1, [rsp+gprsize*2+16*11] ;tmp[8 ]
+ psubsw m2, m0, m5 ;out40
+ paddsw m0, m5 ;out23
+ psubsw m3, m1, m4 ;out55
+ paddsw m1, m4 ;out8
+ mova [rsp+gprsize*2+16*43], m2 ;out40
+ mova [rsp+gprsize*2+16*26], m0 ;out23
+ mova [rsp+gprsize*2+16*58], m3 ;out55
+ mova [rsp+gprsize*2+16*11], m1 ;out8
+
+ mova m0, [rsp+gprsize*2+16*37] ;t34
+ mova m5, [rsp+gprsize*2+16*48] ;t45
+ mova m3, [rsp+gprsize*2+16*53] ;t50
+ mova m1, [rsp+gprsize*2+16*64] ;t61
+ mova m2, [rsp+gprsize*2+16*32] ;tmp[29]
+ psubsw m4, m0, m5 ;t45a
+ paddsw m0, m5 ;t34a
+ psubsw m5, m1, m3 ;t50a
+ paddsw m1, m3 ;t61a
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50
+ mova m3, [rsp+gprsize*2+16*5 ] ;tmp[2 ]
+ psubsw m6, m2, m0 ;out34
+ paddsw m2, m0 ;out29
+ psubsw m0, m3, m1 ;out61
+ paddsw m3, m1 ;out2
+ mova [rsp+gprsize*2+16*37], m6 ;out34
+ mova [rsp+gprsize*2+16*32], m2 ;out29
+ mova [rsp+gprsize*2+16*64], m0 ;out61
+ mova [rsp+gprsize*2+16*5 ], m3 ;out2
+ mova m0, [rsp+gprsize*2+16*21] ;tmp[18]
+ mova m1, [rsp+gprsize*2+16*16] ;tmp[13]
+ psubsw m2, m0, m5 ;out45
+ paddsw m0, m5 ;out18
+ psubsw m3, m1, m4 ;out50
+ paddsw m1, m4 ;out13
+ mova [rsp+gprsize*2+16*48], m2 ;out45
+ mova [rsp+gprsize*2+16*21], m0 ;out18
+ mova [rsp+gprsize*2+16*53], m3 ;out50
+ mova [rsp+gprsize*2+16*16], m1 ;out13
+
+ mova m0, [rsp+gprsize*2+16*36] ;t33a
+ mova m5, [rsp+gprsize*2+16*49] ;t46a
+ mova m3, [rsp+gprsize*2+16*52] ;t49a
+ mova m1, [rsp+gprsize*2+16*65] ;t62a
+ mova m2, [rsp+gprsize*2+16*33] ;tmp[30]
+ psubsw m4, m0, m5 ;t46
+ paddsw m0, m5 ;t33
+ psubsw m5, m1, m3 ;t49
+ paddsw m1, m3 ;t62
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50
+ mova m3, [rsp+gprsize*2+16*4 ] ;tmp[1 ]
+ psubsw m6, m2, m0 ;out33
+ paddsw m2, m0 ;out30
+ psubsw m0, m3, m1 ;out62
+ paddsw m3, m1 ;out1
+ mova [rsp+gprsize*2+16*36], m6 ;out33
+ mova [rsp+gprsize*2+16*33], m2 ;out30
+ mova [rsp+gprsize*2+16*65], m0 ;out62
+ mova [rsp+gprsize*2+16*4 ], m3 ;out1
+ mova m0, [rsp+gprsize*2+16*20] ;tmp[17]
+ mova m1, [rsp+gprsize*2+16*17] ;tmp[14]
+ psubsw m2, m0, m5 ;out46
+ paddsw m0, m5 ;out17
+ psubsw m3, m1, m4 ;out49
+ paddsw m1, m4 ;out14
+ mova [rsp+gprsize*2+16*49], m2 ;out46
+ mova [rsp+gprsize*2+16*20], m0 ;out17
+ mova [rsp+gprsize*2+16*52], m3 ;out49
+ mova [rsp+gprsize*2+16*17], m1 ;out14
+
+ mova m0, [rsp+gprsize*2+16*35] ;t32
+ mova m5, [rsp+gprsize*2+16*50] ;t47
+ mova m3, [rsp+gprsize*2+16*51] ;t48
+ mova m1, [rsp+gprsize*2+16*66] ;t63
+ mova m2, [rsp+gprsize*2+16*34] ;tmp[31]
+ psubsw m4, m0, m5 ;t47a
+ paddsw m0, m5 ;t32a
+ psubsw m5, m1, m3 ;t48a
+ paddsw m1, m3 ;t63a
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t47, t48
+ mova m3, [rsp+gprsize*2+16*3 ] ;tmp[0 ]
+ psubsw m6, m2, m0 ;out32
+ paddsw m2, m0 ;out31
+ psubsw m0, m3, m1 ;out63
+ paddsw m3, m1 ;out0
+ mova [rsp+gprsize*2+16*35], m6 ;out32
+ mova [rsp+gprsize*2+16*34], m2 ;out31
+ mova [rsp+gprsize*2+16*66], m0 ;out63
+ mova [rsp+gprsize*2+16*3 ], m3 ;out0
+ mova m0, [rsp+gprsize*2+16*19] ;tmp[16]
+ mova m1, [rsp+gprsize*2+16*18] ;tmp[15]
+ psubsw m2, m0, m5 ;out47
+ paddsw m0, m5 ;out16
+ psubsw m3, m1, m4 ;out48
+ paddsw m1, m4 ;out15
+ mova [rsp+gprsize*2+16*50], m2 ;out47
+ mova [rsp+gprsize*2+16*19], m0 ;out16
+ mova [rsp+gprsize*2+16*51], m3 ;out48
+ mova [rsp+gprsize*2+16*18], m1 ;out15
+ ret
+
+
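+; dct_dct 64x16: the .body label below is shared by the other 64-wide dc-only
+; paths; it broadcasts the scaled dc value and adds it to r3d rows of 64
+; pixels (four xmm registers) per iteration.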
+cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+
+ call m(idct_64x16_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r3d, 16
+ lea tx2q, [o(.end)]
+
+.body:
+ pmulhrsw m0, m2
+ movd m2, [o(pw_2048)]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ pshuflw m0, m0, q0000
+ punpcklwd m0, m0
+ pxor m7, m7
+
+.loop:
+ mova m1, [dstq+16*0]
+ mova m3, [dstq+16*1]
+ mova m5, [dstq+16*2]
+ mova m6, [dstq+16*3]
+ punpckhbw m2, m1, m7
+ punpcklbw m1, m7
+ punpckhbw m4, m3, m7
+ punpcklbw m3, m7
+ paddw m2, m0
+ paddw m1, m0
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m1, m2
+ packuswb m3, m4
+ punpckhbw m2, m5, m7
+ punpcklbw m5, m7
+ punpckhbw m4, m6, m7
+ punpcklbw m6, m7
+ paddw m2, m0
+ paddw m5, m0
+ paddw m4, m0
+ paddw m6, m0
+ packuswb m5, m2
+ packuswb m6, m4
+ mova [dstq+16*0], m1
+ mova [dstq+16*1], m3
+ mova [dstq+16*2], m5
+ mova [dstq+16*3], m6
+ add dstq, strideq
+ dec r3d
+ jg .loop
+ jmp tx2q
+
+.end:
+ RET
+
+
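+; helpers for the 64-wide transforms: LOAD_4ROWS optionally pre-multiplies by
+; pw_2896x8 (the rect2 scale), LOAD_4ROWS_H loads the high registers m4-m7.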
+%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2
+
+%if %3
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [%1+%2*0]
+ pmulhrsw m1, m3, [%1+%2*1]
+ pmulhrsw m2, m3, [%1+%2*2]
+ pmulhrsw m3, [%1+%2*3]
+%else
+ mova m0, [%1+%2*0]
+ mova m1, [%1+%2*1]
+ mova m2, [%1+%2*2]
+ mova m3, [%1+%2*3]
+%endif
+%endmacro
+
+%macro LOAD_4ROWS_H 2 ;src, stride
+ mova m4, [%1+%2*0]
+ mova m5, [%1+%2*1]
+ mova m6, [%1+%2*2]
+ mova m7, [%1+%2*3]
+%endmacro
+
+cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r3d, 2
+ mov [rsp+gprsize*2+16*67], dstq
+ lea dstq, [rsp+gprsize+16*68]
+
+.pass1_loop:
+ LOAD_4ROWS coeffq+32*0, 32*8
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ LOAD_4ROWS coeffq+32*4, 32*8
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+32*2, 32*4
+ mova [rsp+gprsize+16*19], m0
+ mova [rsp+gprsize+16*26], m1
+ mova [rsp+gprsize+16*23], m2
+ mova [rsp+gprsize+16*22], m3
+ mova [rsp+gprsize+16*21], m4
+ mova [rsp+gprsize+16*24], m5
+ mova [rsp+gprsize+16*25], m6
+ mova [rsp+gprsize+16*20], m7
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+32*1, 32*2
+ mova [rsp+gprsize+16*35], m0 ;in1
+ mova [rsp+gprsize+16*49], m1 ;in3
+ mova [rsp+gprsize+16*43], m2 ;in5
+ mova [rsp+gprsize+16*41], m3 ;in7
+ mova [rsp+gprsize+16*39], m4 ;in9
+ mova [rsp+gprsize+16*45], m5 ;in11
+ mova [rsp+gprsize+16*47], m6 ;in13
+ mova [rsp+gprsize+16*37], m7 ;in15
+
+ LOAD_8ROWS coeffq+32*17, 32*2
+ mova [rsp+gprsize+16*63], m0 ;in17
+ mova [rsp+gprsize+16*53], m1 ;in19
+ mova [rsp+gprsize+16*55], m2 ;in21
+ mova [rsp+gprsize+16*61], m3 ;in23
+ mova [rsp+gprsize+16*59], m4 ;in25
+ mova [rsp+gprsize+16*57], m5 ;in27
+ mova [rsp+gprsize+16*51], m6 ;in29
+ mova [rsp+gprsize+16*65], m7 ;in31
+
+ call m(idct_16x64_internal_8bpc).main
+
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+32*0, 32
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+32*8, 32
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+32*16, 32
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+32*24, 32
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end4:
+ SAVE_8ROWS dstq+32*0, 32
+ LOAD_8ROWS rsp+gprsize+16*43, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end5:
+ SAVE_8ROWS dstq+32*8, 32
+ LOAD_8ROWS rsp+gprsize+16*51, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end6)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end6:
+ SAVE_8ROWS dstq+32*16, 32
+ LOAD_8ROWS rsp+gprsize+16*59, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end7:
+ SAVE_8ROWS dstq+32*24, 32
+
+ add coeffq, 16
+ add dstq, 16
+ dec r3d
+ jg .pass1_loop
+
+.pass2:
+ mov dstq, [rsp+gprsize*2+16*67]
+ sub coeffq, 32
+ mov r3d, 4
+
+.pass2_loop:
+ mov [rsp+gprsize*1+16*67], r3d
+
+ LOAD_4ROWS coeffq+16*0, 32*2
+ LOAD_4ROWS_H coeffq+16*1, 32*2
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_4ROWS coeffq+16*2, 32*2
+ LOAD_4ROWS_H coeffq+16*3, 32*2
+ call m(idct_16x8_internal_8bpc).main
+
+ mov r3, dstq
+ lea tx2q, [o(.end)]
+ lea dstq, [dstq+strideq*8]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end1)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 16*16
+ mov r3d, [rsp+gprsize*1+16*67]
+ mov dstq, [rsp+gprsize*2+16*67]
+ add dstq, 8
+ mov [rsp+gprsize*2+16*67], dstq
+ dec r3d
+ jg .pass2_loop
+
+ mov r3d, 4
+ lea coeffq, [rsp+gprsize+16*68]
+.pass2_loop2:
+ mov [rsp+gprsize*1+16*67], r3d
+
+ LOAD_4ROWS coeffq+16*0, 32*2
+ LOAD_4ROWS_H coeffq+16*1, 32*2
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_4ROWS coeffq+16*2, 32*2
+ LOAD_4ROWS_H coeffq+16*3, 32*2
+ call m(idct_16x8_internal_8bpc).main
+
+ mov r3, dstq
+ lea tx2q, [o(.end2)]
+ lea dstq, [dstq+strideq*8]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end2:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end3)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end3:
+
+ add coeffq, 16*16
+ mov r3d, [rsp+gprsize*1+16*67]
+ mov dstq, [rsp+gprsize*2+16*67]
+ add dstq, 8
+ mov [rsp+gprsize*2+16*67], dstq
+ dec r3d
+ jg .pass2_loop2
+ ret
+
+
+cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_32x64_internal_8bpc)
+.end:
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ mov r3d, 64
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
+
+
+cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r4d, 2
+ sub eobd, 136
+ mov [rsp+gprsize*1+16*67], eobd
+ mov r3d, 4
+ cmovs r3d, r4d
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*2+16*67], coeffq
+
+.pass1_loop:
+ LOAD_8ROWS coeffq+64*1, 64*2, 1
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ mov tx2d, [rsp+gprsize*1+16*67]
+ test tx2d, tx2d
+ jl .fast
+
+.full:
+ LOAD_8ROWS coeffq+64*0, 64*4, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+64*2, 64*4, 1
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+64*17, 64*2, 1
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ call m(idct_8x32_internal_8bpc).main
+ jmp .pass1_end
+
+.fast:
+ LOAD_4ROWS coeffq, 256, 1
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_4ROWS coeffq+128*1, 256, 1
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+
+.pass1_end:
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end4:
+ SAVE_8ROWS coeffq+64*24, 64
+
+ add coeffq, 16
+ dec r3d
+ jg .pass1_loop
+
+.pass2:
+ mov coeffq, [rsp+gprsize*2+16*67]
+ mov r3d, 4
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(m(idct_16x64_internal_8bpc).end1)]
+ jmp m(idct_16x64_internal_8bpc).pass2_loop
+
+
+cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_64x32_internal_8bpc)
+.end:
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ pmulhrsw m0, m1
+ mov [coeffq], eobd
+ mov r3d, 32
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
+
+
+cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r4d, 2
+ sub eobd, 136
+ mov [rsp+gprsize*1+16*67], eobd
+ mov r3d, 4
+ cmovs r3d, r4d
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*2+16*67], coeffq
+ mov [rsp+gprsize*3+16*67], dstq
+ lea dstq, [rsp+gprsize+16*69]
+ mov [rsp+gprsize*4+16*67], dstq
+
+.pass1_loop:
+ LOAD_4ROWS coeffq+64*0, 64*8, 1
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ LOAD_4ROWS coeffq+64*4, 64*8, 1
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+64*2, 64*4, 1
+ mova [rsp+gprsize+16*19], m0
+ mova [rsp+gprsize+16*26], m1
+ mova [rsp+gprsize+16*23], m2
+ mova [rsp+gprsize+16*22], m3
+ mova [rsp+gprsize+16*21], m4
+ mova [rsp+gprsize+16*24], m5
+ mova [rsp+gprsize+16*25], m6
+ mova [rsp+gprsize+16*20], m7
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+64*1, 64*2, 1
+ mova [rsp+gprsize+16*35], m0 ;in1
+ mova [rsp+gprsize+16*49], m1 ;in3
+ mova [rsp+gprsize+16*43], m2 ;in5
+ mova [rsp+gprsize+16*41], m3 ;in7
+ mova [rsp+gprsize+16*39], m4 ;in9
+ mova [rsp+gprsize+16*45], m5 ;in11
+ mova [rsp+gprsize+16*47], m6 ;in13
+ mova [rsp+gprsize+16*37], m7 ;in15
+
+ LOAD_8ROWS coeffq+64*17, 64*2, 1
+ mova [rsp+gprsize+16*63], m0 ;in17
+ mova [rsp+gprsize+16*53], m1 ;in19
+ mova [rsp+gprsize+16*55], m2 ;in21
+ mova [rsp+gprsize+16*61], m3 ;in23
+ mova [rsp+gprsize+16*59], m4 ;in25
+ mova [rsp+gprsize+16*57], m5 ;in27
+ mova [rsp+gprsize+16*51], m6 ;in29
+ mova [rsp+gprsize+16*65], m7 ;in31
+
+ call m(idct_16x64_internal_8bpc).main
+
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+64*24, 64
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end4:
+ SAVE_8ROWS dstq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*43, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end5:
+ SAVE_8ROWS dstq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*51, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end6)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end6:
+ SAVE_8ROWS dstq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*59, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end7:
+ SAVE_8ROWS dstq+64*24, 64
+
+ add coeffq, 16
+ add dstq, 16
+ dec r3d
+ jg .pass1_loop
+
+.pass2:
+ mov coeffq, [rsp+gprsize*4+16*67]
+ mov dstq, [rsp+gprsize*3+16*67]
+ mov eobd, [rsp+gprsize*1+16*67]
+ lea dstq, [dstq+32]
+ mov [rsp+gprsize*1+16*35], eobd
+ lea tx2q, [o(.pass2_end)]
+ mov r3d, 4
+ jmp m(idct_32x32_internal_8bpc).pass2_loop
+
+.pass2_end:
+ mova [rsp+gprsize+16*0], m7
+ lea r3, [o(.pass2_end1)]
+ jmp m(idct_8x32_internal_8bpc).end2
+
+.pass2_end1:
+ lea tx2q, [o(.pass2_end)]
+ add coeffq, 16*32
+ mov dstq, [rsp+gprsize*2+16*35]
+ mov r3d, [rsp+gprsize*3+16*35]
+ dec r3d
+ jg m(idct_32x32_internal_8bpc).pass2_loop
+
+.pass2_end2:
+ mov dstq, [rsp+gprsize*3+16*67]
+ mov coeffq, [rsp+gprsize*2+16*67]
+ lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]
+ mov r3d, 4
+ jmp m(idct_32x32_internal_8bpc).pass2_loop
+
+
+cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+
+ call m(idct_64x64_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r3d, 64
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
+
+cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r5d, 4
+ mov r4d, 2
+ sub eobd, 136
+ cmovns r4d, r5d
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*1+16*67], eobd
+ mov r3d, r4d
+ mov [rsp+gprsize*4+16*67], coeffq
+ mov [rsp+gprsize*3+16*67], dstq
+ lea dstq, [rsp+gprsize+16*69]
+ mov [rsp+gprsize*2+16*67], dstq
+
+.pass1_loop:
+ LOAD_4ROWS coeffq+64*0, 64*8
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ LOAD_4ROWS coeffq+64*4, 64*8
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+64*2, 64*4
+ mova [rsp+gprsize+16*19], m0
+ mova [rsp+gprsize+16*26], m1
+ mova [rsp+gprsize+16*23], m2
+ mova [rsp+gprsize+16*22], m3
+ mova [rsp+gprsize+16*21], m4
+ mova [rsp+gprsize+16*24], m5
+ mova [rsp+gprsize+16*25], m6
+ mova [rsp+gprsize+16*20], m7
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+64*1, 64*2
+ mova [rsp+gprsize+16*35], m0 ;in1
+ mova [rsp+gprsize+16*49], m1 ;in3
+ mova [rsp+gprsize+16*43], m2 ;in5
+ mova [rsp+gprsize+16*41], m3 ;in7
+ mova [rsp+gprsize+16*39], m4 ;in9
+ mova [rsp+gprsize+16*45], m5 ;in11
+ mova [rsp+gprsize+16*47], m6 ;in13
+ mova [rsp+gprsize+16*37], m7 ;in15
+
+ LOAD_8ROWS coeffq+64*17, 64*2
+ mova [rsp+gprsize+16*63], m0 ;in17
+ mova [rsp+gprsize+16*53], m1 ;in19
+ mova [rsp+gprsize+16*55], m2 ;in21
+ mova [rsp+gprsize+16*61], m3 ;in23
+ mova [rsp+gprsize+16*59], m4 ;in25
+ mova [rsp+gprsize+16*57], m5 ;in27
+ mova [rsp+gprsize+16*51], m6 ;in29
+ mova [rsp+gprsize+16*65], m7 ;in31
+
+ call m(idct_16x64_internal_8bpc).main
+
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+64*24, 64
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end4:
+ SAVE_8ROWS dstq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*43, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end5:
+ SAVE_8ROWS dstq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*51, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end6)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end6:
+ SAVE_8ROWS dstq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*59, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end7:
+ SAVE_8ROWS dstq+64*24, 64
+
+ add coeffq, 16
+ add dstq, 16
+ dec r3d
+ jg .pass1_loop
+
+.pass2:
+ mov dstq, [rsp+gprsize*3+16*67]
+ mov coeffq, [rsp+gprsize*2+16*67]
+ lea dstq, [dstq+32]
+ mov r3d, 4
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(.pass2_end)]
+ jmp m(idct_16x64_internal_8bpc).pass2_loop
+
+.pass2_end:
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ lea dstq, [dstq+strideq*2]
+ lea r3, [rsp+16*32+gprsize]
+ mova [rsp+gprsize+16*0], m7
+ call m(idct_16x64_internal_8bpc).write
+ mov dstq, [rsp+gprsize*2+16*67]
+ mov r3d, [rsp+gprsize*3+16*67]
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(.pass2_end)]
+
+ dec r3d
+ jg m(idct_16x64_internal_8bpc).pass2_loop
+
+.pass2_end2:
+ mov coeffq, [rsp+gprsize*4+16*67]
+ mov dstq, [rsp+gprsize*2+16*67]
+ mov r3d, 4
+ sub dstq, 72
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(m(idct_16x64_internal_8bpc).end1)]
+ jmp m(idct_16x64_internal_8bpc).pass2_loop
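The .dconly paths above, and the flat filters in the loop-filter files that follow, do their fixed-point rounding with pmulhrsw. A scalar model of the instruction, for reference only (not part of the patch; pw_2896x8 is assumed to hold 2896*8, as its name suggests):

    #include <stdint.h>

    /* pmulhrsw: signed 16x16 multiply, keep the rounded high half. */
    static inline int16_t pmulhrsw_scalar(int16_t a, int16_t b)
    {
        return (int16_t)(((int32_t)a * b + (1 << 14)) >> 15);
    }

With b = 2896*8 the result is roughly a*2896/4096, i.e. about a/sqrt(2), the scale used by the DC-only paths above; with b = 4096 it is (a + 4) >> 3; with b = 16384 it is (a + 1) >> 1. The loop-filter code below relies on the latter two forms to divide its tap sums with rounding.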
diff --git a/third_party/dav1d/src/x86/loopfilter.h b/third_party/dav1d/src/x86/loopfilter.h
new file mode 100644
index 0000000000..9535c753fb
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/loopfilter.h"
+
+#define decl_loopfilter_sb_fns(ext) \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, ext))
+
+decl_loopfilter_sb_fns(ssse3);
+decl_loopfilter_sb_fns(avx2);
+decl_loopfilter_sb_fns(avx512icl);
+
+static ALWAYS_INLINE void loop_filter_dsp_init_x86(Dav1dLoopFilterDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, ssse3);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, ssse3);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, ssse3);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, ssse3);
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx2);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx2);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx2);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx512icl);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx512icl);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx512icl);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx512icl);
+ }
+#endif
+}
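The header only provides the initializer; judging by the assignments, loop_filter_sb is indexed [luma/chroma][h/v], and each ISA level overwrites the previous one until a required CPU flag is missing. A self-contained sketch of that fill-then-early-return pattern (names and flag values below are made up; only the table shape mirrors the header):

    #include <stdio.h>

    typedef void (*lpf_sb_fn)(void);
    static void lpf_c(void)     { puts("C fallback"); }
    static void lpf_ssse3(void) { puts("SSSE3"); }
    static void lpf_avx2(void)  { puts("AVX2"); }

    enum { CPU_SSSE3 = 1 << 0, CPU_AVX2 = 1 << 1 };

    /* tab[0][*] = luma h/v, tab[1][*] = chroma h/v, as in the header above. */
    static void lpf_init_sketch(lpf_sb_fn tab[2][2], unsigned flags)
    {
        tab[0][0] = tab[0][1] = tab[1][0] = tab[1][1] = lpf_c;     /* baseline */
        if (!(flags & CPU_SSSE3)) return;
        tab[0][0] = tab[0][1] = tab[1][0] = tab[1][1] = lpf_ssse3;
        if (!(flags & CPU_AVX2)) return;
        tab[0][0] = tab[0][1] = tab[1][0] = tab[1][1] = lpf_avx2;
    }

    int main(void) { lpf_sb_fn t[2][2]; lpf_init_sketch(t, CPU_SSSE3); t[0][0](); return 0; }

The AVX-512 block at the end of the real initializer additionally leaves the horizontal variants on AVX2 when the CPU is flagged as having slow gathers, presumably because those paths rely on gather loads (the AVX-512 16bpc horizontal path later in this patch uses vpgatherdq).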
diff --git a/third_party/dav1d/src/x86/loopfilter16_avx2.asm b/third_party/dav1d/src/x86/loopfilter16_avx2.asm
new file mode 100644
index 0000000000..ed83000ac2
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter16_avx2.asm
@@ -0,0 +1,1161 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8
+pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
+ times 4 db 8, 9
+ times 4 db 0, 1
+ times 4 db 8, 9
+
+pw_1: times 16 dw 1
+pw_2: times 16 dw 2
+pw_3: times 16 dw 3
+pw_4096: times 2 dw 4096
+
+; 10bpc/12bpc:
+pw_4: times 2 dw 4
+ times 2 dw 16
+clip_max: times 2 dw 511
+ times 2 dw 2047
+clip_min: times 2 dw -512
+ times 2 dw -2048
+
+SECTION .text
+
+; in: out:
+; mm%1 a b c d a e i m
+; mm%2 e f g h b f j n
+; mm%3 i j k l -> c g k o
+; mm%4 m n o p d h l p
+%macro TRANSPOSE4X4W 5
+ punpcklwd m%5, m%1, m%2
+ punpckhwd m%1, m%2
+ punpcklwd m%2, m%3, m%4
+ punpckhwd m%3, m%4
+ punpckldq m%4, m%5, m%2
+ punpckhdq m%5, m%2
+ punpckldq m%2, m%1, m%3
+ punpckhdq m%1, m%3
+
+ SWAP %1, %4
+ SWAP %2, %5, %3
+%endmacro
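The diagram above is a 4x4 transpose of 16-bit lanes built from word and dword unpacks; the macro applies it to full registers, handling a pair of 4x4 tiles per 128-bit lane via the low and high unpacks. A minimal SSE2 intrinsics sketch of one tile (assumes each input row sits in the low 64 bits of a register; illustrative only):

    #include <emmintrin.h>

    /* Transpose one 4x4 tile of 16-bit values using the same
     * unpack-word / unpack-dword sequence as TRANSPOSE4X4W. */
    static void transpose4x4_w(__m128i *r0, __m128i *r1, __m128i *r2, __m128i *r3)
    {
        __m128i t0  = _mm_unpacklo_epi16(*r0, *r1); /* a e b f c g d h */
        __m128i t1  = _mm_unpacklo_epi16(*r2, *r3); /* i m j n k o l p */
        __m128i c01 = _mm_unpacklo_epi32(t0, t1);   /* a e i m | b f j n */
        __m128i c23 = _mm_unpackhi_epi32(t0, t1);   /* c g k o | d h l p */
        *r0 = c01;                                  /* column 0 in the low half */
        *r1 = _mm_unpackhi_epi64(c01, c01);         /* column 1 */
        *r2 = c23;                                  /* column 2 */
        *r3 = _mm_unpackhi_epi64(c23, c23);         /* column 3 */
    }

TRANSPOSE8X8W below chains two such 4x4 transposes and recombines the halves with punpcklqdq/punpckhqdq to form a full 8x8 word transpose.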
+
+; in: out:
+; xmm%1 a b c d e f g h a i q y 6 E M U
+; xmm%2 i j k l m n o p b j r z 7 F N V
+; xmm%3 q r s t u v w x c k s 0 8 G O W
+; xmm%4 y z 0 1 2 3 4 5 d l t 1 9 H P X
+; xmm%5 6 7 8 9 A B C D -> e m u 2 A I Q Y
+; xmm%6 E F G H I J K L f n v 3 B J R Z
+; xmm%7 M N O P Q R S T g o w 4 C K S +
+; xmm%8 U V W X Y Z + = h p x 5 D L T =
+%macro TRANSPOSE8X8W 9
+ ; xmm%1 a b c d e f g h a i q y b j r z
+ ; xmm%2 i j k l m n o p c k s 0 d l t 1
+ ; xmm%3 q r s t u v w x -> e m u 2 f n v 3
+ ; xmm%4 y z 0 1 2 3 4 5 g o w 4 h p x 5
+ TRANSPOSE4X4W %1, %2, %3, %4, %9
+
+ ; xmm%5 6 7 8 9 A B C D 6 E M U 7 F N V
+ ; xmm%6 E F G H I J K L 8 G O W 9 H P X
+ ; xmm%7 M N O P Q R S T -> A I Q Y B J R Z
+ ; xmm%8 U V W X Y Z + = C K S + D L T =
+ TRANSPOSE4X4W %5, %6, %7, %8, %9
+
+ ; xmm%1 a i q y b j r z a i q y 6 E M U
+ ; xmm%2 c k s 0 d l t 1 b j r z 7 F N V
+ ; xmm%3 e m u 2 f n v 3 c k s 0 8 G O W
+ ; xmm%4 g o w 4 h p x 5 d l t 1 9 H P X
+ ; xmm%5 6 E M U 7 F N V -> e m u 2 A I Q Y
+ ; xmm%6 8 G O W 9 H P X f n v 3 B J R Z
+ ; xmm%7 A I Q Y B J R Z g o w 4 C K S +
+ ; xmm%8 C K S + D L T = h p x 5 D L T =
+ punpckhqdq m%9, m%1, m%5
+ punpcklqdq m%1, m%5
+ punpckhqdq m%5, m%2, m%6
+ punpcklqdq m%2, m%6
+ punpckhqdq m%6, m%3, m%7
+ punpcklqdq m%3, m%7
+ punpckhqdq m%7, m%4, m%8
+ punpcklqdq m%4, m%8
+
+ SWAP %8, %7, %4, %5, %3, %2, %9
+%endmacro
+
+; transpose and write m3-6, everything else is scratch
+%macro TRANSPOSE_8x4_AND_WRITE_4x16 0
+ ; transpose 8x4
+ punpcklwd m0, m3, m4
+ punpckhwd m3, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpckldq m6, m0, m4
+ punpckhdq m0, m4
+ punpckldq m4, m3, m5
+ punpckhdq m3, m5
+
+ ; write out
+ movq [dstq+strideq*0-4], xm6
+ movhps [dstq+strideq*1-4], xm6
+ movq [dstq+strideq*2-4], xm0
+ movhps [dstq+stride3q -4], xm0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm4
+ movhps [dstq+strideq*1-4], xm4
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+
+ vextracti128 xm6, m6, 1
+ vextracti128 xm0, m0, 1
+ vextracti128 xm4, m4, 1
+ vextracti128 xm3, m3, 1
+
+ movq [dstq+strideq*0-4], xm6
+ movhps [dstq+strideq*1-4], xm6
+ movq [dstq+strideq*2-4], xm0
+ movhps [dstq+stride3q -4], xm0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm4
+ movhps [dstq+strideq*1-4], xm4
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+ ; load data
+%ifidn %2, v
+%if %1 == 4
+ lea tmpq, [dstq+mstrideq*2]
+ mova m3, [tmpq+strideq*0] ; p1
+ mova m4, [tmpq+strideq*1] ; p0
+ mova m5, [tmpq+strideq*2] ; q0
+ mova m6, [tmpq+stride3q] ; q1
+%else
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+ lea tmpq, [dstq+mstrideq*4]
+ ; we load p3 later
+ mova m13, [tmpq+strideq*1] ; p2
+ mova m3, [tmpq+strideq*2] ; p1
+ mova m4, [tmpq+stride3q] ; p0
+ mova m5, [dstq+strideq*0] ; q0
+ mova m6, [dstq+strideq*1] ; q1
+ mova m14, [dstq+strideq*2] ; q2
+%if %1 != 6
+ mova m15, [dstq+stride3q] ; q3
+%endif
+%endif
+%else
+ ; load lines
+%if %1 == 4
+ movq xm3, [dstq+strideq*0-4]
+ movq xm4, [dstq+strideq*1-4]
+ movq xm5, [dstq+strideq*2-4]
+ movq xm6, [dstq+stride3q -4]
+ lea tmpq, [dstq+strideq*4]
+ movq xm11, [tmpq+strideq*0-4]
+ movq xm13, [tmpq+strideq*1-4]
+ movq xm14, [tmpq+strideq*2-4]
+ movq xm15, [tmpq+stride3q -4]
+ lea tmpq, [tmpq+strideq*4]
+ ; this overreads by 8 bytes but the buffers are padded
+ ; so that should be ok
+ vinserti128 m3, [tmpq+strideq*0-4], 1
+ vinserti128 m4, [tmpq+strideq*1-4], 1
+ vinserti128 m5, [tmpq+strideq*2-4], 1
+ vinserti128 m6, [tmpq+stride3q -4], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m11, [tmpq+strideq*0-4], 1
+ vinserti128 m13, [tmpq+strideq*1-4], 1
+ vinserti128 m14, [tmpq+strideq*2-4], 1
+ vinserti128 m15, [tmpq+stride3q -4], 1
+
+ ; transpose 4x8
+ ; xm3: A-D0,A-D4
+ ; xm4: A-D1,A-D5
+ ; xm5: A-D2,A-D6
+ ; xm6: A-D3,A-D7
+ punpcklwd m7, m3, m4
+ punpcklwd m3, m11, m13
+ punpcklwd m4, m5, m6
+ punpcklwd m5, m14, m15
+ ; xm7: A0-1,B0-1,C0-1,D0-1
+ ; xm3: A4-5,B4-5,C4-5,D4-5
+ ; xm4: A2-3,B2-3,C2-3,D2-3
+ ; xm5: A6-7,B6-7,C6-7,D6-7
+ punpckldq m6, m7, m4
+ punpckhdq m7, m4
+ punpckldq m8, m3, m5
+ punpckhdq m5, m3, m5
+ ; xm6: A0-3,B0-3
+ ; xm7: C0-3,D0-3
+ ; xm8: A4-7,B4-7
+ ; xm5: C4-7,D4-7
+ punpcklqdq m3, m6, m8
+ punpckhqdq m4, m6, m8
+ punpckhqdq m6, m7, m5
+ punpcklqdq m5, m7, m5
+ ; xm3: A0-7
+ ; xm4: B0-7
+ ; xm5: C0-7
+ ; xm6: D0-7
+%elif %1 == 6 || %1 == 8
+ movu xm3, [dstq+strideq*0-8]
+ movu xm4, [dstq+strideq*1-8]
+ movu xm5, [dstq+strideq*2-8]
+ movu xm6, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4]
+ movu xm11, [tmpq+strideq*0-8]
+ movu xm13, [tmpq+strideq*1-8]
+ movu xm14, [tmpq+strideq*2-8]
+ movu xm15, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m3, [tmpq+strideq*0-8], 1
+ vinserti128 m4, [tmpq+strideq*1-8], 1
+ vinserti128 m5, [tmpq+strideq*2-8], 1
+ vinserti128 m6, [tmpq+stride3q -8], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m11, [tmpq+strideq*0-8], 1
+ vinserti128 m13, [tmpq+strideq*1-8], 1
+ vinserti128 m14, [tmpq+strideq*2-8], 1
+ vinserti128 m15, [tmpq+stride3q -8], 1
+
+ ; transpose 8x16
+ ; xm3: A-H0,A-H8
+ ; xm4: A-H1,A-H9
+ ; xm5: A-H2,A-H10
+ ; xm6: A-H3,A-H11
+ ; xm11: A-H4,A-H12
+ ; xm13: A-H5,A-H13
+ ; xm14: A-H6,A-H14
+ ; xm15: A-H7,A-H15
+ punpcklwd m7, m3, m4
+ punpckhwd m3, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpcklwd m6, m11, m13
+ punpckhwd m11, m13
+ punpcklwd m13, m14, m15
+ punpckhwd m14, m15
+ ; xm7: A0-1,B0-1,C0-1,D0-1
+ ; xm3: E0-1,F0-1,G0-1,H0-1
+ ; xm4: A2-3,B2-3,C2-3,D2-3
+ ; xm5: E2-3,F2-3,G2-3,H2-3
+ ; xm6: A4-5,B4-5,C4-5,D4-5
+ ; xm11: E4-5,F4-5,G4-5,H4-5
+ ; xm13: A6-7,B6-7,C6-7,D6-7
+ ; xm14: E6-7,F6-7,G6-7,H6-7
+ punpckldq m15, m7, m4
+ punpckhdq m7, m4
+ punpckldq m9, m3, m5
+ punpckhdq m8, m3, m5
+ punpckldq m3, m6, m13
+ punpckhdq m6, m13
+ punpckldq m10, m11, m14
+ punpckhdq m11, m14
+ ; xm15: A0-3,B0-3
+ ; xm7: C0-3,D0-3
+ ; xm9: E0-3,F0-3
+ ; xm8: G0-3,H0-3
+ ; xm3: A4-7,B4-7
+ ; xm6: C4-7,D4-7
+ ; xm10: E4-7,F4-7
+ ; xm11: G4-7,H4-7
+%if %1 != 6
+ punpcklqdq m0, m15, m3
+%endif
+ punpckhqdq m13, m15, m3
+ punpcklqdq m3, m7, m6
+ punpckhqdq m4, m7, m6
+ punpcklqdq m5, m9, m10
+ punpckhqdq m6, m9, m10
+ punpcklqdq m14, m8, m11
+%if %1 != 6
+ punpckhqdq m15, m8, m11
+ mova [rsp+5*32], m0
+%endif
+%else
+ ; We only use 14 pixels but we'll need the remainder at the end for
+ ; the second transpose
+ mova xm0, [dstq+strideq*0-16]
+ mova xm1, [dstq+strideq*1-16]
+ mova xm2, [dstq+strideq*2-16]
+ mova xm3, [dstq+stride3q -16]
+ lea tmpq, [dstq+strideq*4]
+ mova xm4, [tmpq+strideq*0-16]
+ mova xm5, [tmpq+strideq*1-16]
+ mova xm6, [tmpq+strideq*2-16]
+ mova xm7, [tmpq+stride3q -16]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m0, m0, [tmpq+strideq*0-16], 1
+ vinserti128 m1, m1, [tmpq+strideq*1-16], 1
+ vinserti128 m2, m2, [tmpq+strideq*2-16], 1
+ vinserti128 m3, m3, [tmpq+stride3q -16], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m4, m4, [tmpq+strideq*0-16], 1
+ vinserti128 m5, m5, [tmpq+strideq*1-16], 1
+ vinserti128 m6, m6, [tmpq+strideq*2-16], 1
+ vinserti128 m7, m7, [tmpq+stride3q -16], 1
+
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+
+ mova [rsp+6*32], m0
+ mova [rsp+7*32], m1
+ mova [rsp+8*32], m2
+ mova [rsp+9*32], m3
+ mova [rsp+5*32], m4
+
+ mova xm0, [dstq+strideq*0]
+ mova xm1, [dstq+strideq*1]
+ mova xm2, [dstq+strideq*2]
+ mova xm3, [dstq+stride3q ]
+ lea tmpq, [dstq+strideq*4]
+ mova xm8, [tmpq+strideq*0]
+ mova xm9, [tmpq+strideq*1]
+ mova xm10, [tmpq+strideq*2]
+ mova xm11, [tmpq+stride3q ]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m0, m0, [tmpq+strideq*0], 1
+ vinserti128 m1, m1, [tmpq+strideq*1], 1
+ vinserti128 m2, m2, [tmpq+strideq*2], 1
+ vinserti128 m3, m3, [tmpq+stride3q ], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m8, m8, [tmpq+strideq*0], 1
+ vinserti128 m9, m9, [tmpq+strideq*1], 1
+ vinserti128 m10, m10, [tmpq+strideq*2], 1
+ vinserti128 m11, m11, [tmpq+stride3q ], 1
+
+ TRANSPOSE8X8W 0, 1, 2, 3, 8, 9, 10, 11, 4
+
+ mova [rsp+10*32], m8
+ mova [rsp+11*32], m9
+ mova [rsp+12*32], m10
+ mova [rsp+13*32], m11
+
+ ; 5,6,7,0,1,2,3 -> 13,3,4,5,6,14,15
+ SWAP 13, 5, 0
+ SWAP 3, 6, 1, 15
+ SWAP 4, 7
+ SWAP 2, 14
+%endif
+%endif
+
+ ; load L/E/I/H
+%ifidn %2, v
+ pmovzxbw m1, [lq]
+ pmovzxbw m0, [lq+l_strideq]
+ pxor m2, m2
+%else
+ vpbroadcastq m0, [lq] ; l0, l1
+ vpbroadcastq m1, [lq+l_strideq] ; l2, l3
+ vpbroadcastq m2, [lq+l_strideq*2] ; l4, l5
+ vpbroadcastq m10, [lq+l_stride3q] ; l6, l7
+ punpckldq m0, m1 ; l0, l2, l1, l3 [2x]
+ punpckldq m2, m10 ; l4, l6, l5, l7 [2x]
+ vpblendd m0, m0, m2, 11110000b ; l0, l2, l1, l3, l4, l6, l5, l7
+ pxor m2, m2
+ punpcklbw m1, m0, m2 ; l0, l2, l4, l6
+ punpckhbw m0, m2 ; l1, l3, l5, l7
+%endif
+ pcmpeqw m10, m2, m0
+ pand m1, m10
+ por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1]
+ pcmpeqw m10, m2, m0 ; !L
+ psrlw m10, 1
+ psrlw m2, m0, [lutq+128]
+ vpbroadcastw m1, [lutq+136]
+ pminuw m2, m1
+ pmaxuw m2, [pw_1] ; I
+ psrlw m1, m0, 4 ; H
+ paddw m0, [pw_2]
+ vpbroadcastd m8, [r11] ; 1 << (bitdepth - 8): 4 or 16
+ paddw m0, m0
+ paddw m0, m2 ; E
+ REPX {pmullw x, m8}, m0, m1, m2
+
+ psubw m8, m3, m4 ; p1-p0
+ psubw m9, m5, m6 ; q1-q0
+ REPX {pabsw x, x}, m8, m9
+ pmaxuw m8, m10
+ pmaxuw m8, m9
+ pcmpgtw m7, m8, m1 ; hev
+%if %1 != 4
+ psubw m9, m13, m4 ; p2-p0
+ pabsw m9, m9
+ pmaxuw m9, m8
+%if %1 != 6
+%ifidn %2, v
+ mova m11, [tmpq+strideq*0] ; p3
+%else
+ mova m11, [rsp+5*32] ; p3
+%endif
+ psubw m10, m11, m4 ; p3-p0
+ pabsw m10, m10
+ pmaxuw m9, m10
+%endif
+ psubw m10, m5, m14 ; q2-q0
+ pabsw m10, m10
+ pmaxuw m9, m10
+%if %1 != 6
+ psubw m10, m5, m15 ; q3-q0
+ pabsw m10, m10
+ pmaxuw m9, m10
+%endif
+ vpbroadcastd m10, [r11]
+ pcmpgtw m9, m10 ; !flat8in
+
+ psubw m10, m13, m3 ; p2-p1
+ pabsw m10, m10
+%if %1 != 6
+ psubw m11, m13 ; p3-p2
+ pabsw m11, m11
+ pmaxuw m10, m11
+ psubw m11, m14, m15 ; q3-q2
+ pabsw m11, m11
+ pmaxuw m10, m11
+%endif
+ psubw m11, m14, m6 ; q2-q1
+ pabsw m11, m11
+ pmaxuw m10, m11
+
+%if %1 == 16
+ vpbroadcastd m11, [maskq+8]
+ vpbroadcastd m1, [maskq+4]
+ por m11, m1
+ pand m11, m12
+ pcmpeqd m11, m12
+ pand m10, m11
+%else
+ vpbroadcastd m11, [maskq+4]
+ pand m11, m12
+ pcmpeqd m11, m12
+ pand m10, m11 ; only apply fm-wide to wd>4 blocks
+%endif
+ pmaxuw m8, m10
+%endif
+ pcmpgtw m8, m2
+
+ psubw m10, m3, m6 ; p1-q1
+ psubw m11, m4, m5 ; p0-q0
+ REPX {pabsw x, x}, m10, m11
+ paddw m11, m11
+ psrlw m10, 1
+ paddw m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ pcmpgtw m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+ por m8, m10
+
+%if %1 == 16
+
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1]
+ mova m1, [tmpq+strideq*2]
+ mova m2, [tmpq+stride3q]
+%else
+ mova m0, [rsp+7*32]
+ mova m1, [rsp+8*32]
+ mova m2, [rsp+9*32]
+%endif
+ REPX {psubw x, m4}, m0, m1, m2
+ REPX {pabsw x, x}, m0, m1, m2
+ pmaxuw m1, m0
+ pmaxuw m1, m2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+ mova m0, [tmpq+strideq*0]
+ mova m2, [tmpq+strideq*1]
+ mova m10, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+10*32]
+ mova m2, [rsp+11*32]
+ mova m10, [rsp+12*32]
+%endif
+ REPX {psubw x, m5}, m0, m2, m10
+ REPX {pabsw x, x}, m0, m2, m10
+ pmaxuw m0, m2
+ pmaxuw m1, m10
+ pmaxuw m1, m0
+ vpbroadcastd m0, [r11]
+ pcmpgtw m1, m0 ; !flat8out
+ por m1, m9 ; !flat8in | !flat8out
+ vpbroadcastd m2, [maskq+8]
+ pand m10, m2, m12
+ pcmpeqd m10, m12
+ pandn m1, m10 ; flat16
+ pandn m1, m8, m1 ; flat16 & fm
+
+ vpbroadcastd m10, [maskq+4]
+ por m10, m2
+ pand m2, m10, m12
+ pcmpeqd m2, m12
+ pandn m9, m2 ; flat8in
+ pandn m9, m8, m9
+ vpbroadcastd m2, [maskq+0]
+ por m2, m10
+ pand m2, m12
+ pcmpeqd m2, m12
+ pandn m8, m2
+ pandn m8, m9, m8 ; fm & !flat8 & !flat16
+ pandn m9, m1, m9 ; flat8 & !flat16
+%elif %1 != 4
+ vpbroadcastd m0, [maskq+4]
+ pand m2, m0, m12
+ pcmpeqd m2, m12
+ pandn m9, m2
+ pandn m9, m8, m9 ; flat8 & fm
+ vpbroadcastd m2, [maskq+0]
+ por m0, m2
+ pand m0, m12
+ pcmpeqd m0, m12
+ pandn m8, m0
+ pandn m8, m9, m8 ; fm & !flat8
+%else
+ vpbroadcastd m0, [maskq+0]
+ pand m0, m12
+ pcmpeqd m0, m12
+ pandn m8, m0 ; fm
+%endif
+
+ ; short filter
+ vpbroadcastd m0, [r11+8*1] ; 511 or 2047
+ vpbroadcastd m2, [r11+8*2] ; -512 or -2048
+ psubw m10, m5, m4
+ paddw m11, m10, m10
+ paddw m11, m10
+ psubw m10, m3, m6 ; iclip_diff(p1-q1)
+ pminsw m10, m0
+ pmaxsw m10, m2
+ pand m10, m7 ; f=iclip_diff(p1-q1)&hev
+ paddw m10, m11 ; f=iclip_diff(3*(q0-p0)+f)
+ pminsw m10, m0
+ pmaxsw m10, m2
+ pand m8, m10 ; f&=fm
+ vpbroadcastd m10, [pw_4]
+ paddw m10, m8
+ paddw m8, [pw_3]
+ REPX {pminsw x, m0}, m10, m8
+ psraw m10, 3 ; f2
+ psraw m8, 3 ; f1
+ psubw m5, m10
+ paddw m4, m8
+
+ paddw m10, [pw_1]
+ psraw m10, 1 ; f=(f1+1)>>1
+ pandn m8, m7, m10 ; f&=!hev
+ paddw m3, m8
+ psubw m6, m8
+ pxor m8, m8
+ psubw m0, m2 ; 1023 or 4095
+ REPX {pminsw x, m0}, m3, m4, m5, m6
+ REPX {pmaxsw x, m8}, m3, m4, m5, m6
+
+%if %1 == 16
+
+; m3-6 = p1/p0/q0/q1, m9=flat8, m1=flat16
+; m12=filter bits mask
+; m13-15=p2/q2/q3
+; m0,2,7-8,10-11 = free
+
+ ; flat16 filter
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1] ; p6
+ mova m2, [tmpq+strideq*2] ; p5
+ mova m7, [tmpq+stride3q] ; p4
+ mova m11, [tmpq+strideq*4] ; p3
+%else
+ mova m0, [rsp+7*32]
+ mova m2, [rsp+8*32]
+ mova m7, [rsp+9*32]
+ mova m11, [rsp+5*32]
+%endif
+
+ mova [rsp+ 0*32], m9
+
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ paddw m8, m0, [pw_1]
+ psllw m8, 3 ; p6*8+8
+ paddw m10, m2, m7 ; p5+p4
+ psubw m8, m0
+ paddw m10, m10 ; (p5+p4)*2
+ paddw m8, m11 ; p6*7+p3
+ paddw m10, m13 ; (p5+p4)*2+p2
+ paddw m8, m3 ; p6*7+p3+p1
+ paddw m10, m4 ; (p5+p4)*2+p2+p0
+ paddw m8, m5 ; p6*7+p3+p1+q0
+ paddw m8, m10 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ psrlw m10, m8, 4
+ vpblendvb m10, m2, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*2], m10 ; p5
+%else
+ mova [rsp+8*32], m10
+%endif
+
+ ; sub p6*2, add p3/q1
+ paddw m8, m11
+ paddw m10, m0, m0
+ paddw m8, m6
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m7, m10, m1
+%ifidn %2, v
+ mova [tmpq+stride3q], m10 ; p4
+%else
+ mova [rsp+9*32], m10
+%endif
+
+ ; sub p6/p5, add p2/q2
+ psubw m8, m0
+ paddw m10, m13, m14
+ psubw m8, m2
+ paddw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m11, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*4], m10 ; p3
+ lea tmpq, [dstq+strideq*4]
+%else
+ mova [rsp+5*32], m10
+%endif
+
+ ; sub p6/p4, add p1/q3
+ paddw m8, m3
+ paddw m10, m0, m7
+ paddw m8, m15
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m13, m10, m1
+ mova [rsp+1*32], m10 ; don't clobber p2/m13
+
+ ; sub p6/p3, add p0/q4
+ paddw m8, m4
+ paddw m10, m0, m11
+%ifidn %2, v
+ paddw m8, [tmpq+strideq*0]
+%else
+ paddw m8, [rsp+10*32]
+%endif
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m3, m10, m1
+ mova [rsp+2*32], m10 ; don't clobber p1/m3
+
+ ; sub p6/p2, add q0/q5
+ paddw m8, m5
+ paddw m10, m0, m13
+%ifidn %2, v
+ paddw m8, [tmpq+strideq*1]
+%else
+ paddw m8, [rsp+11*32]
+%endif
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m4, m10, m1
+ mova [rsp+3*32], m10 ; don't clobber p0/m4
+
+ ; sub p6/p1, add q1/q6
+ paddw m8, m6
+ paddw m10, m0, m3
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2] ; q6
+%else
+ mova m0, [rsp+12*32] ; q6
+%endif
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m5, m10, m1
+ mova [rsp+4*32], m10 ; don't clobber q0/m5
+
+ ; sub p5/p0, add q2/q6
+ paddw m8, m14
+ paddw m10, m2, m4
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m2, m6, m10, m1 ; don't clobber q1/m6
+
+ ; sub p4/q0, add q3/q6
+ paddw m8, m15
+ paddw m10, m7, m5
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m7, m14, m10, m1 ; don't clobber q2/m14
+
+ ; sub p3/q1, add q4/q6
+%ifidn %2, v
+ paddw m8, [tmpq+strideq*0]
+%else
+ paddw m8, [rsp+10*32]
+%endif
+ paddw m10, m11, m6
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m15, m10, m1
+%ifidn %2, v
+ mova [tmpq+mstrideq], m10 ; q3
+%else
+ mova [rsp+14*32], m10
+%endif
+
+ ; sub p2/q2, add q5/q6
+%ifidn %2, v
+ paddw m8, [tmpq+strideq*1]
+%else
+ paddw m8, [rsp+11*32]
+%endif
+ paddw m10, m13, m14
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+%ifidn %2, v
+ mova m9, [tmpq+strideq*0]
+%else
+ mova m9, [rsp+10*32]
+%endif
+ vpblendvb m10, m9, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*0], m10 ; q4
+%else
+ mova [rsp+10*32], m10
+%endif
+
+ ; sub p1/q3, add q6*2
+ psubw m8, m3
+ paddw m0, m0
+ psubw m8, m15
+ paddw m8, m0
+ psrlw m10, m8, 4
+%ifidn %2, v
+ mova m9, [tmpq+strideq*1]
+%else
+ mova m9, [rsp+11*32]
+%endif
+ vpblendvb m10, m9, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*1], m10 ; q5
+%else
+ mova [rsp+11*32], m10
+%endif
+
+ mova m9, [rsp+0*32]
+ mova m13, [rsp+1*32]
+ mova m3, [rsp+2*32]
+ mova m4, [rsp+3*32]
+ mova m5, [rsp+4*32]
+ SWAP 2, 6
+ SWAP 7, 14
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*4]
+%else
+ mova m15, [rsp+14*32]
+%endif
+%endif
+
+%if %1 >= 8
+ ; flat8 filter
+ vpbroadcastd m7, [pw_4096]
+%ifidn %2, v
+ mova m0, [tmpq+strideq*0] ; p3
+%else
+ mova m0, [rsp+5*32] ; p3
+%endif
+ paddw m1, m0, m13 ; p3+p2
+ paddw m2, m3, m4 ; p1+p0
+ paddw m8, m1, m1 ; 2*(p3+p2)
+ paddw m2, m0 ; p1+p0+p3
+ paddw m8, m5 ; 2*(p3+p2)+q0
+ paddw m2, m8 ; 3*p3+2*p2+p1+p0+q0
+ pmulhrsw m10, m2, m7
+
+ paddw m8, m3, m6
+ psubw m2, m1
+ paddw m2, m8
+ pmulhrsw m8, m2, m7
+
+ paddw m11, m0, m3
+ paddw m1, m4, m14
+ psubw m2, m11
+ paddw m2, m1
+ pmulhrsw m1, m2, m7
+
+ paddw m11, m0, m4
+ pblendvb m4, m1, m9
+ paddw m1, m5, m15
+ psubw m2, m11
+ paddw m2, m1
+ pmulhrsw m11, m2, m7
+
+ paddw m2, m6
+ paddw m2, m15
+ paddw m1, m13, m5
+ pblendvb m5, m11, m9
+ pblendvb m13, m10, m9
+ psubw m2, m1
+ pmulhrsw m1, m2, m7
+
+ psubw m2, m3
+ pblendvb m3, m8, m9
+ psubw m2, m6
+ pblendvb m6, m1, m9
+ paddw m1, m15, m14
+ paddw m2, m1
+ pmulhrsw m2, m7
+
+ pblendvb m14, m2, m9
+
+%ifidn %2, v
+ mova [tmpq+strideq*1], m13 ; p2
+ mova [tmpq+strideq*2], m3 ; p1
+ mova [tmpq+stride3q ], m4 ; p0
+ mova [dstq+strideq*0], m5 ; q0
+ mova [dstq+strideq*1], m6 ; q1
+ mova [dstq+strideq*2], m14 ; q2
+%elif %1 == 8
+ TRANSPOSE8X8W 0, 13, 3, 4, 5, 6, 14, 15, 1
+
+ ; write 8x16
+ movu [dstq+strideq*0-8], xm0
+ movu [dstq+strideq*1-8], xm13
+ movu [dstq+strideq*2-8], xm3
+ movu [dstq+stride3q -8], xm4
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm5
+ movu [dstq+strideq*1-8], xm6
+ movu [dstq+strideq*2-8], xm14
+ movu [dstq+stride3q -8], xm15
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m0, 1
+ vextracti128 [dstq+strideq*1-8], m13, 1
+ vextracti128 [dstq+strideq*2-8], m3, 1
+ vextracti128 [dstq+stride3q -8], m4, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m5, 1
+ vextracti128 [dstq+strideq*1-8], m6, 1
+ vextracti128 [dstq+strideq*2-8], m14, 1
+ vextracti128 [dstq+stride3q -8], m15, 1
+ lea dstq, [dstq+strideq*4]
+%else
+ mova m8, [rsp+6*32]
+ mova m1, [rsp+7*32]
+ mova m2, [rsp+8*32]
+ mova m7, [rsp+9*32]
+ TRANSPOSE8X8W 8, 1, 2, 7, 0, 13, 3, 4, 9
+
+ mova [dstq+strideq*0-16], xm8
+ mova [dstq+strideq*1-16], xm1
+ mova [dstq+strideq*2-16], xm2
+ mova [dstq+stride3q -16], xm7
+ lea tmpq, [dstq+strideq*4]
+ mova [tmpq+strideq*0-16], xm0
+ mova [tmpq+strideq*1-16], xm13
+ mova [tmpq+strideq*2-16], xm3
+ mova [tmpq+stride3q -16], xm4
+ lea tmpq, [tmpq+strideq*4]
+ vextracti128 [tmpq+strideq*0-16], m8, 1
+ vextracti128 [tmpq+strideq*1-16], m1, 1
+ vextracti128 [tmpq+strideq*2-16], m2, 1
+ vextracti128 [tmpq+stride3q -16], m7, 1
+ lea tmpq, [tmpq+strideq*4]
+ vextracti128 [tmpq+strideq*0-16], m0, 1
+ vextracti128 [tmpq+strideq*1-16], m13, 1
+ vextracti128 [tmpq+strideq*2-16], m3, 1
+ vextracti128 [tmpq+stride3q -16], m4, 1
+
+ mova m0, [rsp+10*32]
+ mova m1, [rsp+11*32]
+ mova m2, [rsp+12*32]
+ mova m3, [rsp+13*32]
+ TRANSPOSE8X8W 5, 6, 14, 15, 0, 1, 2, 3, 4
+ mova [dstq+strideq*0], xm5
+ mova [dstq+strideq*1], xm6
+ mova [dstq+strideq*2], xm14
+ mova [dstq+stride3q ], xm15
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ mova [dstq+strideq*2], xm2
+ mova [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0], m5, 1
+ vextracti128 [dstq+strideq*1], m6, 1
+ vextracti128 [dstq+strideq*2], m14, 1
+ vextracti128 [dstq+stride3q ], m15, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0], m0, 1
+ vextracti128 [dstq+strideq*1], m1, 1
+ vextracti128 [dstq+strideq*2], m2, 1
+ vextracti128 [dstq+stride3q ], m3, 1
+ lea dstq, [dstq+strideq*4]
+%endif
+%elif %1 == 6
+ ; flat6 filter
+ vpbroadcastd m7, [pw_4096]
+ paddw m8, m3, m4
+ paddw m8, m13 ; p2+p1+p0
+ paddw m11, m13, m5
+ paddw m8, m8
+ paddw m8, m11 ; p2+2*(p2+p1+p0)+q0
+ pmulhrsw m2, m8, m7
+
+ paddw m8, m5
+ paddw m11, m13, m13
+ paddw m8, m6
+ psubw m8, m11
+ pmulhrsw m10, m8, m7
+
+ paddw m8, m6
+ paddw m11, m13, m3
+ paddw m8, m14
+ psubw m8, m11
+ pmulhrsw m11, m8, m7
+
+ psubw m8, m3
+ paddw m14, m14
+ psubw m8, m4
+ paddw m8, m14
+ pmulhrsw m8, m7
+
+ pblendvb m3, m2, m9
+ pblendvb m4, m10, m9
+ pblendvb m5, m11, m9
+ pblendvb m6, m8, m9
+
+%ifidn %2, v
+ mova [tmpq+strideq*2], m3 ; p1
+ mova [tmpq+stride3q ], m4 ; p0
+ mova [dstq+strideq*0], m5 ; q0
+ mova [dstq+strideq*1], m6 ; q1
+%else
+ TRANSPOSE_8x4_AND_WRITE_4x16
+%endif
+%else
+%ifidn %2, v
+ mova [tmpq+strideq*0], m3 ; p1
+ mova [tmpq+strideq*1], m4 ; p0
+ mova [tmpq+strideq*2], m5 ; q0
+ mova [tmpq+stride3q ], m6 ; q1
+%else
+ TRANSPOSE_8x4_AND_WRITE_4x16
+%endif
+%endif
+%endmacro
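Near the top of the macro, the "load L/E/I/H" block turns the per-edge filter level L and the two sharpness values stored in the lut (offsets 128 and 136) into the three decision thresholds, scaled by the 4-or-16 bitdepth factor loaded through r11. A scalar sketch of that derivation and of the conditions the masks encode (helper and parameter names are illustrative, not from the patch):

    /* E/I/H as derived in the FILTER macro; "scale" is 1 << (bitdepth - 8). */
    static void lpf_limits(int L, int sharp_shift, int sharp_limit, int scale,
                           int *E, int *I, int *H)
    {
        int i = L >> sharp_shift;
        if (i > sharp_limit) i = sharp_limit;
        if (i < 1) i = 1;
        *I = i * scale;                 /* limit on pixel-to-pixel gaps near the edge */
        *E = (2 * (L + 2) + i) * scale; /* limit on the edge step itself              */
        *H = (L >> 4) * scale;          /* high-edge-variance (hev) threshold         */
    }

The edge is then filtered only when max(|p1-p0|, |q1-q0|) <= I and 2*|p0-q0| + (|p1-q1| >> 1) <= E, matching the inline comments; hev, the comparison against H, decides whether the p1-q1 term enters the short filter and whether p1/q1 themselves get adjusted.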
+
+INIT_YMM avx2
+cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+ mov r6d, r7m
+ lea r11, [pw_4]
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+ mov mask_bitsd, 0xf
+ mova m12, [pb_mask]
+
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+
+ call .v4
+
+.end:
+ pslld m12, 4
+ add lq, 16
+ add dstq, 32
+ shl mask_bitsd, 4
+ sub wd, 4
+ jg .loop
+ RET
+ALIGN function_align
+.v4:
+ FILTER 4, v
+ ret
+
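Each bit of vmask[0..2] (maskq+0/4/8 above) describes one 4-pixel stretch of the superblock edge; the function tests four of them per iteration and runs the widest filter requested, while the pb_mask/m12 bits refine the choice per lane inside FILTER. Conceptually, per 4-pixel unit (a sketch; the filter callbacks are placeholders):

    #include <stddef.h>
    #include <stdint.h>

    typedef void (*lpf_unit_fn)(uint16_t *dst, ptrdiff_t stride);

    /* Pick the widest filter whose vmask bit is set for each 4-pixel unit. */
    static void lpf_v_edge_select(uint16_t *dst, ptrdiff_t stride,
                                  const uint32_t vmask[3], int w4,
                                  lpf_unit_fn wd16, lpf_unit_fn wd8, lpf_unit_fn wd4)
    {
        for (int i = 0; i < w4; i++, dst += 4) {
            const uint32_t bit = 1u << i;
            if (vmask[2] & bit)      wd16(dst, stride); /* luma only */
            else if (vmask[1] & bit) wd8(dst, stride);
            else if (vmask[0] & bit) wd4(dst, stride);
            /* no bit set: this edge is left untouched */
        }
    }

The chroma entry points further down follow the same scheme but top out at wd=6 and reuse the luma .v4/.h4 tails for the narrow case.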
+INIT_YMM avx2
+cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+ mov r6d, r7m
+ lea r11, [pw_4]
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
+ mov hd, hm
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+ mov mask_bitsd, 0xf
+ mova m12, [pb_mask]
+
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .no_filter
+
+ call .h4
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+ lea dstq, [dstq+strideq*8]
+.end:
+ pslld m12, 4
+ lea lq, [lq+l_strideq*4]
+ shl mask_bitsd, 4
+ sub hd, 4
+ jg .loop
+ RET
+ALIGN function_align
+.h4:
+ FILTER 4, h
+ ret
+
+INIT_YMM avx2
+cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+ mov r6d, r7m
+ lea r11, [pw_4]
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+ mov mask_bitsd, 0xf
+ mova m12, [pb_mask]
+
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+
+ call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx2).v4
+
+.end:
+ pslld m12, 4
+ add lq, 16
+ add dstq, 32
+ shl mask_bitsd, 4
+ sub wd, 4
+ jg .loop
+ RET
+
+INIT_YMM avx2
+cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+ mov r6d, r7m
+ lea r11, [pw_4]
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
+ mov hd, hm
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+ mov mask_bitsd, 0xf
+ mova m12, [pb_mask]
+
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .no_filter
+
+ call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx2).h4
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+ lea dstq, [dstq+strideq*8]
+.end:
+ pslld m12, 4
+ lea lq, [lq+l_strideq*4]
+ shl mask_bitsd, 4
+ sub hd, 4
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64
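For reference, the "flat8 filter" block inside FILTER above replaces p2..q2, on lanes where the flat8 mask is set, with weight-8 averages of the surrounding p3..q3 samples; pmulhrsw against pw_4096 is the (x + 4) >> 3 rounding step. A scalar sketch of the "v" layout used above, where the p samples sit stride apart above dst and the q samples at and below it (per-lane blending omitted; not part of the patch):

    #include <stddef.h>
    #include <stdint.h>

    /* px points at q0; p-samples are at negative stride offsets. */
    static void flat8_sketch(uint16_t *px, ptrdiff_t stride)
    {
        const int p3 = px[-4*stride], p2 = px[-3*stride], p1 = px[-2*stride];
        const int p0 = px[-1*stride], q0 = px[0],          q1 = px[ 1*stride];
        const int q2 = px[ 2*stride], q3 = px[ 3*stride];

        px[-3*stride] = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3;          /* p2 */
        px[-2*stride] = (2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3;     /* p1 */
        px[-1*stride] = (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3;  /* p0 */
        px[ 0       ] = (p2 + p1 + p0 + 2*q0 + q1 + q2 + q3 + 4) >> 3;  /* q0 */
        px[ 1*stride] = (p1 + p0 + q0 + 2*q1 + q2 + 2*q3 + 4) >> 3;     /* q1 */
        px[ 2*stride] = (p0 + q0 + q1 + 2*q2 + 3*q3 + 4) >> 3;          /* q2 */
    }

The assembly reaches the same six outputs with a single running sum, subtracting the taps that drop out and adding the ones that enter between each pmulhrsw, which is why the intermediate comments read like "3*p3+2*p2+p1+p0+q0" followed by incremental adjustments.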
diff --git a/third_party/dav1d/src/x86/loopfilter16_avx512.asm b/third_party/dav1d/src/x86/loopfilter16_avx512.asm
new file mode 100644
index 0000000000..b7bc3aa106
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter16_avx512.asm
@@ -0,0 +1,912 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+l_shuf_v: times 2 db 0, 32
+pw_1: times 2 dw 1
+ times 2 db 4, 36
+pw_3: times 2 dw 3
+ times 2 db 8, 40
+pw_4: times 2 dw 4
+ times 2 db 12, 44
+pw_16: times 2 dw 16
+ times 2 db 16, 48
+pw_4096: times 2 dw 4096
+ times 2 db 20, 52
+pw_16384: times 2 dw 16384
+ times 2 db 24, 56
+pw_32767: times 2 dw 32767
+ times 2 db 28, 60
+ times 2 dw 0
+filter_mask: dd 1, 2, 4, 8, 16, 32, 64,128
+stride_mul: dd 0, 1, 8, 9, 16, 17, 24, 25
+l_shuf_h: db 4, -1, 4, -1, 4, -1, 4, -1, 12, -1, 12, -1, 12, -1, 12, -1
+clip_max: dw 511, 511, 2047, 2047
+clip_min: dw -512, -512, -2048, -2048
+
+SECTION .text
+
+%macro TRANSPOSE8X8W 9 ; src/dst[1-8], tmp
+ punpckhwd m%9, m%5, m%6
+ punpcklwd m%5, m%6
+ punpckhwd m%6, m%1, m%2
+ punpcklwd m%1, m%2
+ punpckhwd m%2, m%7, m%8
+ punpcklwd m%7, m%8
+ punpckhwd m%8, m%3, m%4
+ punpcklwd m%3, m%4
+ punpckhdq m%4, m%1, m%3
+ punpckldq m%1, m%3
+ punpckldq m%3, m%5, m%7
+ punpckhdq m%5, m%7
+ punpckhdq m%7, m%6, m%8
+ punpckldq m%6, m%8
+ punpckldq m%8, m%9, m%2
+ punpckhdq m%9, m%2
+ punpckhqdq m%2, m%1, m%3
+ punpcklqdq m%1, m%3
+ punpcklqdq m%3, m%4, m%5
+ punpckhqdq m%4, m%5
+ punpcklqdq m%5, m%6, m%8
+ punpckhqdq m%6, m%8
+ punpckhqdq m%8, m%7, m%9
+ punpcklqdq m%7, m%9
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+%ifidn %2, v
+%if %1 == 16
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1 ] ; p6
+ mova m1, [tmpq+strideq*2 ] ; p5
+ mova m2, [tmpq+stride3q ] ; p4
+ mova m3, [tmpq+strideq*4 ] ; p3
+ mova m4, [tmpq+stride5q ] ; p2
+%elif %1 == 6 || %1 == 8
+ lea tmpq, [dstq+mstrideq*4]
+%if %1 == 8
+ mova m3, [tmpq+strideq*0 ]
+%endif
+ mova m4, [tmpq+strideq*1 ]
+%endif
+ mova m5, [dstq+mstrideq*2] ; p1
+ mova m6, [dstq+mstrideq*1] ; p0
+ mova m7, [dstq+strideq*0 ] ; q0
+ mova m8, [dstq+strideq*1 ] ; q1
+%if %1 != 4
+ mova m9, [dstq+strideq*2 ] ; q2
+%endif
+%if %1 == 8 || %1 == 16
+ mova m10, [dstq+stride3q ] ; q3
+%endif
+%if %1 == 16
+ mova m11, [dstq+strideq*4 ] ; q4
+ mova m22, [dstq+stride5q ] ; q5
+ mova m23, [dstq+stride3q*2] ; q6
+%endif
+%else ; h
+%if %1 == 16
+ movu ym16, [dstq+strideq*0 -16]
+ movu ym17, [dstq+strideq*1 -16]
+ movu ym18, [dstq+strideq*2 -16]
+ movu ym19, [dstq+stride3q -16]
+ movu ym20, [dstq+strideq*4 -16]
+ movu ym22, [dstq+stride5q -16]
+ movu ym23, [dstq+stride3q*2-16]
+ movu ym28, [dstq+stride7q -16]
+ lea tmpq, [dstq+strideq*8 -16]
+ vinserti32x8 m7, m16, [tmpq+strideq*0 ], 1
+ vinserti32x8 m8, m17, [tmpq+strideq*1 ], 1
+ vinserti32x8 m9, m18, [tmpq+strideq*2 ], 1
+ vinserti32x8 m10, m19, [tmpq+stride3q ], 1
+ vinserti32x8 m11, m20, [tmpq+strideq*4 ], 1
+ vinserti32x8 m22, m22, [tmpq+stride5q ], 1
+ vinserti32x8 m23, m23, [tmpq+stride3q*2], 1
+ vinserti32x8 m28, m28, [tmpq+stride7q ], 1
+ lea tmpq, [tmpq+strideq*8]
+ TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 27
+ movu ym16, [tmpq+strideq*0 ]
+ movu ym17, [tmpq+strideq*1 ]
+ movu ym18, [tmpq+strideq*2 ]
+ movu ym19, [tmpq+stride3q ]
+ movu ym24, [tmpq+strideq*4 ]
+ movu ym25, [tmpq+stride5q ]
+ movu ym26, [tmpq+stride3q*2]
+ movu ym20, [tmpq+stride7q ]
+ lea tmpq, [tmpq+strideq*8]
+ vinserti32x8 m0, m16, [tmpq+strideq*0 ], 1
+ vinserti32x8 m1, m17, [tmpq+strideq*1 ], 1
+ vinserti32x8 m2, m18, [tmpq+strideq*2 ], 1
+ vinserti32x8 m3, m19, [tmpq+stride3q ], 1
+ vinserti32x8 m4, m24, [tmpq+strideq*4 ], 1
+ vinserti32x8 m5, m25, [tmpq+stride5q ], 1
+ vinserti32x8 m6, m26, [tmpq+stride3q*2], 1
+ vinserti32x8 m20, m20, [tmpq+stride7q ], 1
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 20, 27
+ vshufi32x4 m27, m7, m0, q2020
+ vshufi32x4 m7, m0, q3131
+ vshufi32x4 m0, m8, m1, q2020
+ vshufi32x4 m8, m1, q3131
+ vshufi32x4 m1, m9, m2, q2020
+ vshufi32x4 m9, m2, q3131
+ vshufi32x4 m2, m10, m3, q2020
+ vshufi32x4 m10, m3, q3131
+ vshufi32x4 m3, m11, m4, q2020
+ vshufi32x4 m11, m4, q3131
+ vshufi32x4 m4, m22, m5, q2020
+ vshufi32x4 m22, m5, q3131
+ vshufi32x4 m5, m23, m6, q2020
+ vshufi32x4 m23, m6, q3131
+ vshufi32x4 m6, m28, m20, q2020
+ vshufi32x4 m28, m20, q3131
+%elif %1 == 6 || %1 == 8
+%if %1 == 8
+ sub dstq, 8
+ movu xm16, [dstq+strideq*0 ]
+ movu xm17, [dstq+strideq*1 ]
+ movu xm18, [dstq+strideq*2 ]
+ movu xm19, [dstq+stride3q ]
+ movu xm24, [dstq+strideq*4 ]
+ movu xm25, [dstq+stride5q ]
+ movu xm26, [dstq+stride3q*2]
+ movu xm27, [dstq+stride7q ]
+ lea tmpq, [dstq+strideq*8 ]
+ vinserti128 ym16, [tmpq+strideq*0 ], 1
+ vinserti128 ym17, [tmpq+strideq*1 ], 1
+ vinserti128 ym18, [tmpq+strideq*2 ], 1
+ vinserti128 ym19, [tmpq+stride3q ], 1
+ vinserti128 ym24, [tmpq+strideq*4 ], 1
+ vinserti128 ym25, [tmpq+stride5q ], 1
+ vinserti128 ym26, [tmpq+stride3q*2], 1
+ vinserti128 ym27, [tmpq+stride7q ], 1
+ lea tmpq, [tmpq+strideq*8 ]
+ vinserti32x4 m10, m16, [tmpq+strideq*0 ], 2
+ vinserti32x4 m8, m17, [tmpq+strideq*1 ], 2
+ vinserti32x4 m5, m18, [tmpq+strideq*2 ], 2
+ vinserti32x4 m7, m19, [tmpq+stride3q ], 2
+ vinserti32x4 m2, m24, [tmpq+strideq*4 ], 2
+ vinserti32x4 m9, m25, [tmpq+stride5q ], 2
+ vinserti32x4 m3, m26, [tmpq+stride3q*2], 2
+ vinserti32x4 m4, m27, [tmpq+stride7q ], 2
+ lea tmpq, [tmpq+strideq*8 ]
+ vinserti32x4 m10, [tmpq+strideq*0 ], 3
+ vinserti32x4 m8, [tmpq+strideq*1 ], 3
+ vinserti32x4 m5, [tmpq+strideq*2 ], 3
+ vinserti32x4 m7, [tmpq+stride3q ], 3
+ vinserti32x4 m2, [tmpq+strideq*4 ], 3
+ vinserti32x4 m9, [tmpq+stride5q ], 3
+ vinserti32x4 m3, [tmpq+stride3q*2], 3
+ vinserti32x4 m4, [tmpq+stride7q ], 3
+%else ; %1 == 6
+ movu xm16, [dstq+strideq*0-8]
+ movu xm17, [dstq+strideq*1-8]
+ movu xm18, [dstq+strideq*2-8]
+ movu xm19, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4-8]
+ movu xm2, [tmpq+strideq*0]
+ movu xm9, [tmpq+strideq*1]
+ movu xm3, [tmpq+strideq*2]
+ movu xm4, [tmpq+stride3q ]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 ym16, [tmpq+strideq*0], 1
+ vinserti128 ym17, [tmpq+strideq*1], 1
+ vinserti128 ym18, [tmpq+strideq*2], 1
+ vinserti128 ym19, [tmpq+stride3q ], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 ym2, [tmpq+strideq*0], 1
+ vinserti128 ym9, [tmpq+strideq*1], 1
+ vinserti128 ym3, [tmpq+strideq*2], 1
+ vinserti128 ym4, [tmpq+stride3q ], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m10, m16, [tmpq+strideq*0], 2
+ vinserti32x4 m8, m17, [tmpq+strideq*1], 2
+ vinserti32x4 m5, m18, [tmpq+strideq*2], 2
+ vinserti32x4 m7, m19, [tmpq+stride3q ], 2
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m2, [tmpq+strideq*0], 2
+ vinserti32x4 m9, [tmpq+strideq*1], 2
+ vinserti32x4 m3, [tmpq+strideq*2], 2
+ vinserti32x4 m4, [tmpq+stride3q ], 2
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m10, [tmpq+strideq*0], 3
+ vinserti32x4 m8, [tmpq+strideq*1], 3
+ vinserti32x4 m5, [tmpq+strideq*2], 3
+ vinserti32x4 m7, [tmpq+stride3q ], 3
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m2, [tmpq+strideq*0], 3
+ vinserti32x4 m9, [tmpq+strideq*1], 3
+ vinserti32x4 m3, [tmpq+strideq*2], 3
+ vinserti32x4 m4, [tmpq+stride3q ], 3
+%endif
+ punpcklwd m6, m10, m8
+ punpckhwd m10, m8
+ punpcklwd m8, m5, m7
+ punpckhwd m5, m7
+ punpcklwd m7, m2, m9
+ punpckhwd m2, m9
+ punpcklwd m9, m3, m4
+ punpckhwd m3, m4
+ punpckldq m4, m6, m8
+ punpckhdq m6, m8
+ punpckldq m8, m10, m5
+ punpckhdq m10, m5
+ punpckldq m5, m7, m9
+ punpckhdq m7, m9
+ punpckldq m9, m2, m3
+ punpckhdq m2, m3
+%if %1 == 8
+ punpcklqdq m3, m4, m5
+%endif
+ punpckhqdq m4, m5
+ punpcklqdq m5, m6, m7
+ punpckhqdq m6, m7
+ punpcklqdq m7, m8, m9
+ punpckhqdq m8, m9
+ punpcklqdq m9, m10, m2
+%if %1 == 8
+ punpckhqdq m10, m2
+%endif
+%else ; %1 == 4
+ kxnorb k1, k1, k1
+ kmovb k2, k1
+ vpgatherdq m7{k1}, [dstq+ym12-4]
+ lea tmpq, [dstq+strideq*2-4]
+ kmovb k1, k2
+ vpgatherdq m4{k2}, [tmpq+ym12]
+ lea tmpq, [tmpq+strideq*2]
+ kmovb k2, k1
+ vpgatherdq m5{k1}, [tmpq+ym12]
+ lea tmpq, [tmpq+strideq*2]
+ vpgatherdq m6{k2}, [tmpq+ym12]
+ punpcklwd m8, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpcklwd m6, m8, m7
+ punpckhwd m8, m7
+ punpcklwd m7, m4, m5
+ punpckhwd m4, m5
+ punpcklqdq m5, m6, m7
+ punpckhqdq m6, m7
+ punpcklqdq m7, m8, m4
+ punpckhqdq m8, m4
+%endif
+%endif
+
+ ; load L/E/I/H
+%ifidn %2, v
+ movu ym16, [lq+l_strideq*1]
+ movsldup m17, [l_shuf_v]
+ vptestnmb k1, ym16, ym16
+ vmovdqu8 ym16{k1}, [lq+l_strideq*0] ; l[x][] ? l[x][] : l[x-stride][]
+ vpermb m16, m17, m16 ; l[x][1]
+%else
+ movq xm16, [lq+l_strideq*0]
+ movq xm17, [lq+l_strideq*1]
+ vinserti128 ym16, [lq+l_strideq*2], 1
+ vinserti128 ym17, [lq+l_stride3q ], 1
+ lea tmpq, [lq+l_strideq*4]
+ vinserti32x4 m16, [tmpq+l_strideq*0], 2
+ vinserti32x4 m17, [tmpq+l_strideq*1], 2
+ vinserti32x4 m16, [tmpq+l_strideq*2], 3
+ vinserti32x4 m17, [tmpq+l_stride3q ], 3
+ punpcklqdq m16, m17
+ vbroadcasti32x4 m17, [l_shuf_h]
+ vptestnmb k1, m16, m16
+ vpalignr m16{k1}, m16, 12
+ pshufb m16, m17 ; l[x][1]
+%endif
+ vpbroadcastd m20, [pw_32767]
+ psubw m17, m5, m6 ; p1-p0
+ psubw m18, m7, m8 ; q1-q0
+ vptestmw k1, m16, m16 ; L
+ pabsw m17, m17
+ pabsw m18, m18
+ vpmaxuw m20{k1}, m17, m18
+ vpbroadcastw m17, [lutq+136]
+ psrlw m18, m16, [lutq+128]
+ vpbroadcastd m19, [pw_1]
+ pminuw m18, m17
+ psrlw m17, m16, 4 ; H
+ paddw m16, m16
+ pmaxuw m18, m19 ; I
+ vpaddd m16, [pw_4] {1to16}
+ paddw m16, m18 ; E
+ REPX {pmullw x, m13}, m17, m18, m16
+ vpcmpw k4, m20, m17, 6 ; hev
+%if %1 != 4
+ psubw m19, m4, m5 ; p2-p1
+ pabsw m19, m19
+%if %1 == 8 || %1 == 16
+ psubw m17, m3, m4 ; p3-p2
+ pabsw m17, m17
+ pmaxuw m19, m17
+ psubw m17, m9, m10 ; q3-q2
+ pabsw m17, m17
+ pmaxuw m19, m17
+%endif
+ psubw m17, m9, m8 ; q2-q1
+ pabsw m17, m17
+ pmaxuw m19, m17
+%if %1 == 16
+ vpbroadcastd ym17, [maskq+4]
+ vpord ym17, [maskq+8] {1to8}
+ vptestmd k1, ym17, ym21
+%else
+ vptestmd k1, ym21, [maskq+4] {1to8}
+%endif
+ pmaxuw m19, m20
+ psubw m17, m4, m6 ; p2-p0
+ pabsw m17, m17
+ pmaxuw m17, m20
+ vmovdqa64 m20{k1}, m19 ; only apply fm-wide to wd>4 blocks
+%if %1 == 8 || %1 == 16
+ psubw m19, m3, m6 ; p3-p0
+ pabsw m19, m19
+ pmaxuw m17, m19
+ psubw m19, m7, m10 ; q3-q0
+ pabsw m19, m19
+ pmaxuw m17, m19
+%endif
+ psubw m19, m7, m9 ; q2-q0
+ pabsw m19, m19
+ pmaxuw m17, m19
+%endif
+ vpcmpw k1, m20, m18, 2
+ psubw m18, m5, m8 ; p1-q1
+ psubw m19, m6, m7 ; p0-q0
+ pabsw m18, m18
+ pabsw m19, m19
+ psrlw m18, 1
+ paddw m19, m19
+ paddw m18, m19 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ vpcmpw k1{k1}, m18, m16, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E
+%if %1 != 4
+ vpcmpw k2{k1}, m17, m13, 2 ; flat8in
+%endif
+%if %1 == 16
+ psubw m20, m0, m6
+ psubw m16, m1, m6
+ pabsw m20, m20
+ psubw m17, m2, m6
+ pabsw m16, m16
+ psubw m18, m11, m7
+ pabsw m17, m17
+ psubw m19, m22, m7
+ pabsw m18, m18
+ pmaxuw m20, m16
+ psubw m16, m23, m7
+ pabsw m19, m19
+ pmaxuw m17, m18
+ pabsw m16, m16
+ vpandd ym18, ym21, [maskq+8] {1to8}
+ pmaxuw m20, m17
+ pmaxuw m19, m16
+ pcmpeqd ym16, ym21, ym18
+ vpternlogd ym18, ym21, [maskq+4] {1to8}, 0xc8
+ pmaxuw m20, m19
+ pcmpeqd ym17, ym21, ym18
+ vpternlogd ym18, ym21, [maskq+0] {1to8}, 0xc8
+ vpcmpw k3{k2}, m20, m13, 2 ; flat8in & flat8out
+ pcmpeqd ym18, ym21
+ vptestmb k3{k3}, ym16, ym16 ; flat8 & fm
+ vptestmb k2{k2}, ym17, ym17 ; flat8in
+ vptestmb k1{k1}, ym18, ym18
+ kandnd k1, k2, k1 ; fm & !flat8 & !flat16
+ kandnd k2, k3, k2 ; flat8 & !flat16
+%elif %1 == 6 || %1 == 8
+ vpandd ym17, ym21, [maskq+4] {1to8}
+ pcmpeqd ym16, ym21, ym17
+ vpternlogd ym17, ym21, [maskq+0] {1to8}, 0xc8
+ pcmpeqd ym17, ym21
+ vptestmb k2{k2}, ym16, ym16 ; flat8 & fm
+ vptestmb k1{k1}, ym17, ym17
+ kandnd k1, k2, k1 ; fm & !flat8
+%else ; %1 == 4
+ vpandd ym16, ym21, [maskq+0] {1to8}
+ pcmpeqd ym16, ym21
+ vptestmb k1{k1}, ym16, ym16
+%endif
+
+ ; short filter
+ psubw m16, m7, m6
+ vpbroadcastd m17, [pw_3]
+ paddw m18, m16, m16
+ paddw m18, m16
+ psubw m16, m5, m8 ; iclip_diff(p1-q1)
+ pminsw m16, m14
+ vpmaxsw m16{k4}{z}, m15 ; f=iclip_diff(p1-q1)&hev
+ knotd k4, k4 ; !hev
+ paddw m16, m18 ; f=iclip_diff(3*(q0-p0)+f)
+ vpbroadcastd m18, [pw_4]
+ pminsw m16, m14
+ vpmaxsw m16{k1}{z}, m15 ; f&=fm
+ paddw m17, m16
+ paddw m16, m18
+ vpbroadcastd m18, [pw_16384]
+ pminsw m17, m14
+ pminsw m16, m14
+ psraw m17, 3 ; f2
+ psraw m16, 3 ; f1
+ paddw m6, m17
+ psubw m7, m16
+ vpmulhrsw m16{k4}{z}, m18 ; (f=(f1+1)>>1) & !hev
+ psubw m17, m14, m15 ; 1023 or 4095
+ pxor m18, m18
+ paddw m5, m16
+ psubw m8, m16
+ REPX {pminsw x, m17}, m6, m7, m5, m8
+ REPX {pmaxsw x, m18}, m6, m7, m5, m8
+
+%if %1 == 16 ; flat16 filter
+ vpaddd m19, m0, [pw_1] {1to16}
+ paddw m16, m1, m2 ; p5+p4
+ paddw m26, m1, m6 ; p5+p0
+ paddw m24, m2, m7 ; p4+q0
+ paddw m16, m4 ; p5+p4+p3
+ paddw m17, m3, m5 ; p2+p1
+ psllw m19, 3
+ paddw m16, m26 ; p5*2+p4+p3+p0
+ paddw m17, m24 ; p4+p2+p1+q0
+ psubw m19, m0 ; p6*7+8
+ paddw m16, m17 ; p5*2+p4*2+p3+p2+p1+q0
+ paddw m18, m3, m8
+ paddw m19, m16 ; p6*7+p5+p4*2+p3+p2+p1+p0+q0
+ paddw m25, m1, m0
+ paddw m16, m0, m0
+ psrlw m1{k3}, m19, 4
+ paddw m19, m18
+ psubw m19, m16 ; +p3+q1-p6*2
+ paddw m16, m2, m0
+ psrlw m2{k3}, m19, 4
+ psubw m19, m25
+ paddw m25, m4, m9
+ paddw m20, m10, m5
+ paddw m19, m25 ; +p2+q2-p6-p5
+ paddw m17, m0, m3
+ psubw m16, m20, m16
+ psrlw m3{k3}, m19, 4
+ paddw m19, m16 ; +p1+q3-p6-p4
+ paddw m16, m11, m6
+ psubw m16, m17
+ paddw m17, m0, m4
+ psrlw m4{k3}, m19, 4
+ paddw m19, m16 ; +p0+q4-p6-p3
+ paddw m16, m22, m7
+ psubw m16, m17
+ paddw m17, m0, m5
+ psrlw m5{k3}, m19, 4
+ paddw m19, m16 ; +q0+q5-p6-p2
+ paddw m16, m23, m8
+ psrlw m6{k3}, m19, 4
+ psubw m16, m17
+ paddw m19, m16 ; +q1+q6-p6-p1
+ paddw m16, m23, m9
+ psrlw m7{k3}, m19, 4
+ psubw m16, m26
+ paddw m19, m16 ; +q2+q6-p5-p0
+ paddw m16, m23, m10
+ psrlw m8{k3}, m19, 4
+ psubw m16, m24
+ paddw m19, m16 ; +q3+q6-p4-p0
+ paddw m16, m23, m11
+ psrlw m9{k3}, m19, 4
+ psubw m16, m18
+ paddw m19, m16 ; +q4+q6-p3-q1
+ paddw m16, m23, m22
+ psrlw m10{k3}, m19, 4
+ psubw m16, m25
+ paddw m19, m16 ; +q5+q6-p2-q2
+ paddw m16, m23, m23
+ psrlw m11{k3}, m19, 4
+ psubw m16, m20
+ paddw m19, m16 ; +q6*2-p1-q3
+ psrlw m22{k3}, m19, 4
+%endif
+%if %1 == 8 || %1 == 16 ; flat8 filter
+ vpbroadcastd m20, [pw_4096]
+ paddw m16, m3, m4 ; p3+p2
+ paddw m19, m5, m6 ; p1+p0
+ paddw m17, m16, m16 ; 2*(p3+p2)
+ paddw m19, m3 ; p1+p0+p3
+ paddw m17, m7 ; 2*(p3+p2)+q0
+ paddw m19, m17 ; 3*p3+2*p2+p1+p0+q0
+ paddw m18, m4, m7
+ pmulhrsw m4{k2}, m19, m20
+ psubw m19, m16
+ paddw m17, m5, m8
+ paddw m16, m3, m5
+ paddw m19, m17
+ pmulhrsw m5{k2}, m19, m20
+ psubw m19, m16
+ paddw m16, m6, m9
+ paddw m19, m16
+ paddw m16, m3, m6
+ pmulhrsw m6{k2}, m19, m20
+ paddw m19, m10
+ psubw m16, m7, m16
+ paddw m19, m16
+ psubw m16, m10, m18
+ pmulhrsw m7{k2}, m19, m20
+ paddw m16, m8
+ paddw m19, m16
+ psubw m16, m10, m17
+ pmulhrsw m8{k2}, m19, m20
+ paddw m16, m9
+ paddw m19, m16
+ pmulhrsw m9{k2}, m19, m20
+%elif %1 == 6 ; flat6 filter
+ vpbroadcastd m10, [pw_4096]
+ paddw m2, m5, m6
+ paddw m0, m4, m7
+ paddw m1, m2, m4 ; p2+p1+p0
+ paddw m3, m4, m4
+ paddw m1, m1
+ paddw m4, m5
+ paddw m1, m0 ; p2+2*(p2+p1+p0)+q0
+ psubw m3, m7, m3
+ pmulhrsw m5{k2}, m1, m10
+ paddw m3, m8
+ psubw m4, m8, m4
+ paddw m1, m3
+ pmulhrsw m6{k2}, m1, m10
+ paddw m4, m9
+ paddw m9, m9
+ paddw m1, m4
+ pmulhrsw m7{k2}, m1, m10
+ psubw m9, m2
+ paddw m1, m9
+ pmulhrsw m8{k2}, m1, m10
+%endif
+
+%ifidn %2, v
+%if %1 == 16
+ mova [tmpq+strideq*2 ], m1 ; p5
+ mova [tmpq+stride3q ], m2 ; p4
+ mova [tmpq+strideq*4 ], m3 ; p3
+ mova [tmpq+stride5q ], m4 ; p2
+%elif %1 == 8
+ mova [tmpq+strideq*1 ], m4 ; p2
+%endif
+ mova [dstq+mstrideq*2], m5 ; p1
+ mova [dstq+mstrideq ], m6 ; p0
+ mova [dstq+strideq*0 ], m7 ; q0
+ mova [dstq+strideq*1 ], m8 ; q1
+%if %1 == 8 || %1 == 16
+ mova [dstq+strideq*2 ], m9 ; q2
+%endif
+%if %1 == 16
+ mova [dstq+stride3q ], m10 ; q3
+ mova [dstq+strideq*4 ], m11 ; q4
+ mova [dstq+stride5q ], m22 ; q5
+%endif
+%else
+%if %1 == 16
+ TRANSPOSE8X8W 27, 0, 1, 2, 3, 4, 5, 6, 20
+ TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 20
+ mova [dstq+strideq*0 -16], xm27
+ mova [dstq+strideq*0 ], xm7
+ mova [dstq+strideq*1 -16], xm0
+ mova [dstq+strideq*1 ], xm8
+ mova [dstq+strideq*2 -16], xm1
+ mova [dstq+strideq*2 ], xm9
+ mova [dstq+stride3q -16], xm2
+ mova [dstq+stride3q ], xm10
+ mova [dstq+strideq*4 -16], xm3
+ mova [dstq+strideq*4 ], xm11
+ mova [dstq+stride5q -16], xm4
+ mova [dstq+stride5q ], xm22
+ mova [dstq+stride3q*2-16], xm5
+ mova [dstq+stride3q*2 ], xm23
+ mova [dstq+stride7q -16], xm6
+ mova [dstq+stride7q ], xm28
+ lea dstq, [dstq+strideq*8]
+ vextracti128 [dstq+strideq*0 -16], ym27, 1
+ vextracti128 [dstq+strideq*0 ], ym7, 1
+ vextracti128 [dstq+strideq*1 -16], ym0, 1
+ vextracti128 [dstq+strideq*1 ], ym8, 1
+ vextracti128 [dstq+strideq*2 -16], ym1, 1
+ vextracti128 [dstq+strideq*2 ], ym9, 1
+ vextracti128 [dstq+stride3q -16], ym2, 1
+ vextracti128 [dstq+stride3q ], ym10, 1
+ vextracti128 [dstq+strideq*4 -16], ym3, 1
+ vextracti128 [dstq+strideq*4 ], ym11, 1
+ vextracti128 [dstq+stride5q -16], ym4, 1
+ vextracti128 [dstq+stride5q ], ym22, 1
+ vextracti128 [dstq+stride3q*2-16], ym5, 1
+ vextracti128 [dstq+stride3q*2 ], ym23, 1
+ vextracti128 [dstq+stride7q -16], ym6, 1
+ vextracti128 [dstq+stride7q ], ym28, 1
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 -16], m27, 2
+ vextracti32x4 [dstq+strideq*0 ], m7, 2
+ vextracti32x4 [dstq+strideq*1 -16], m0, 2
+ vextracti32x4 [dstq+strideq*1 ], m8, 2
+ vextracti32x4 [dstq+strideq*2 -16], m1, 2
+ vextracti32x4 [dstq+strideq*2 ], m9, 2
+ vextracti32x4 [dstq+stride3q -16], m2, 2
+ vextracti32x4 [dstq+stride3q ], m10, 2
+ vextracti32x4 [dstq+strideq*4 -16], m3, 2
+ vextracti32x4 [dstq+strideq*4 ], m11, 2
+ vextracti32x4 [dstq+stride5q -16], m4, 2
+ vextracti32x4 [dstq+stride5q ], m22, 2
+ vextracti32x4 [dstq+stride3q*2-16], m5, 2
+ vextracti32x4 [dstq+stride3q*2 ], m23, 2
+ vextracti32x4 [dstq+stride7q -16], m6, 2
+ vextracti32x4 [dstq+stride7q ], m28, 2
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 -16], m27, 3
+ vextracti32x4 [dstq+strideq*0 ], m7, 3
+ vextracti32x4 [dstq+strideq*1 -16], m0, 3
+ vextracti32x4 [dstq+strideq*1 ], m8, 3
+ vextracti32x4 [dstq+strideq*2 -16], m1, 3
+ vextracti32x4 [dstq+strideq*2 ], m9, 3
+ vextracti32x4 [dstq+stride3q -16], m2, 3
+ vextracti32x4 [dstq+stride3q ], m10, 3
+ vextracti32x4 [dstq+strideq*4 -16], m3, 3
+ vextracti32x4 [dstq+strideq*4 ], m11, 3
+ vextracti32x4 [dstq+stride5q -16], m4, 3
+ vextracti32x4 [dstq+stride5q ], m22, 3
+ vextracti32x4 [dstq+stride3q*2-16], m5, 3
+ vextracti32x4 [dstq+stride3q*2 ], m23, 3
+ vextracti32x4 [dstq+stride7q -16], m6, 3
+ vextracti32x4 [dstq+stride7q ], m28, 3
+%elif %1 == 8
+ TRANSPOSE8X8W 3, 4, 5, 6, 7, 8, 9, 10, 2
+ movu [dstq+strideq*0 ], xm3
+ movu [dstq+strideq*1 ], xm4
+ movu [dstq+strideq*2 ], xm5
+ movu [dstq+stride3q ], xm6
+ movu [dstq+strideq*4 ], xm7
+ movu [dstq+stride5q ], xm8
+ movu [dstq+stride3q*2], xm9
+ movu [dstq+stride7q ], xm10
+ lea dstq, [dstq+strideq*8]
+ vextracti128 [dstq+strideq*0 ], ym3, 1
+ vextracti128 [dstq+strideq*1 ], ym4, 1
+ vextracti128 [dstq+strideq*2 ], ym5, 1
+ vextracti128 [dstq+stride3q ], ym6, 1
+ vextracti128 [dstq+strideq*4 ], ym7, 1
+ vextracti128 [dstq+stride5q ], ym8, 1
+ vextracti128 [dstq+stride3q*2], ym9, 1
+ vextracti128 [dstq+stride7q ], ym10, 1
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 ], m3, 2
+ vextracti32x4 [dstq+strideq*1 ], m4, 2
+ vextracti32x4 [dstq+strideq*2 ], m5, 2
+ vextracti32x4 [dstq+stride3q ], m6, 2
+ vextracti32x4 [dstq+strideq*4 ], m7, 2
+ vextracti32x4 [dstq+stride5q ], m8, 2
+ vextracti32x4 [dstq+stride3q*2], m9, 2
+ vextracti32x4 [dstq+stride7q ], m10, 2
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 ], m3, 3
+ vextracti32x4 [dstq+strideq*1 ], m4, 3
+ vextracti32x4 [dstq+strideq*2 ], m5, 3
+ vextracti32x4 [dstq+stride3q ], m6, 3
+ vextracti32x4 [dstq+strideq*4 ], m7, 3
+ vextracti32x4 [dstq+stride5q ], m8, 3
+ vextracti32x4 [dstq+stride3q*2], m9, 3
+ vextracti32x4 [dstq+stride7q ], m10, 3
+ lea dstq, [dstq+strideq*8+8]
+%else ; %1 == 4 || %1 == 6
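+    ; narrow filters only rewrite p1..q1: re-interleave them into 8-byte rows
+    ; and scatter them using the per-row offsets held in ym12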
+ punpcklwd m9, m5, m6
+ punpckhwd m5, m6
+ kxnorb k1, k1, k1
+ punpcklwd m6, m7, m8
+ punpckhwd m7, m8
+ kmovb k2, k1
+ punpckldq m8, m9, m6
+ vpscatterdq [dstq+ym12-4]{k1}, m8
+ punpckhdq m9, m6
+ lea tmpq, [dstq+strideq*2-4]
+ kmovb k1, k2
+ vpscatterdq [tmpq+ym12]{k2}, m9
+ punpckldq m6, m5, m7
+ lea tmpq, [tmpq+strideq*2]
+ kmovb k2, k1
+ vpscatterdq [tmpq+ym12]{k1}, m6
+ punpckhdq m5, m7
+ lea tmpq, [tmpq+strideq*2]
+ vpscatterdq [tmpq+ym12]{k2}, m5
+%endif
+%endif
+%endmacro
+
+INIT_ZMM avx512icl
+cglobal lpf_v_sb_y_16bpc, 6, 12, 26, dst, stride, mask, l, l_stride, \
+ lut, w, stride3, mstride, tmp, \
+ mask_bits, stride5
+%define base tmpq-filter_mask
+ SWAP 12, 26 ; avoids clobbering xmm10 on WIN64
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ lea stride3q, [strideq*3]
+ shl l_strideq, 2
+ lea stride5q, [strideq*5]
+ shr r6d, 11 ; is_12bpc
+ mova ym21, [base+filter_mask]
+ mov mstrideq, strideq
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ sub lq, l_strideq
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ neg mstrideq
+ mov wd, wm
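+    ; each iteration covers a 64-byte (32-pixel) strip of the edge; the 8 bits
+    ; in mask_bits select the filter width per 4-pixel unit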
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+ FILTER 16, v
+ jmp .end
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 8, v
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call .v4
+.end:
+ shl mask_bitsd, 8
+ add dstq, 64
+ pslld ym21, 8
+ add lq, 32
+ sub wd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.v4: ; called by both luma and chroma
+ FILTER 4, v
+ ret
+
+cglobal lpf_h_sb_y_16bpc, 6, 13, 29, dst, stride, mask, l, l_stride, \
+ lut, h, stride3, l_stride3, tmp, \
+ mask_bits, stride5, stride7
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ lea stride3q, [strideq*3]
+ vpbroadcastd ym12, strided
+ shl l_strideq, 2
+ lea stride5q, [strideq*5]
+ shr r6d, 11 ; is_12bpc
+ pmulld ym12, [base+stride_mul]
+ lea stride7q, [strideq+stride3q*2]
+ mova ym21, [base+filter_mask]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ sub lq, 4
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ lea l_stride3q, [l_strideq*3]
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ mov hd, hm
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+ FILTER 16, h
+ jmp .end
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 8, h
+ jmp .end2
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .no_filter
+ call .h4
+.no_filter:
+ lea dstq, [dstq+stride3q*8]
+.end:
+ lea dstq, [dstq+strideq*8]
+.end2:
+ shl mask_bitsd, 8
+ pslld ym21, 8
+ lea lq, [lq+l_strideq*8]
+ sub hd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.h4: ; called by both luma and chroma
+ FILTER 4, h
+ ret
+
+cglobal lpf_v_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ shl l_strideq, 2
+ lea stride3q, [strideq*3]
+ shr r6d, 11 ; is_12bpc
+ mova ym21, [base+filter_mask]
+ mov mstrideq, strideq
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ sub lq, l_strideq
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ neg mstrideq
+ mov wd, wm
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 6, v
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx512icl).v4
+.end:
+ shl mask_bitsd, 8
+ add dstq, 64
+ pslld ym21, 8
+ add lq, 32
+ sub wd, 8
+ jg .loop
+ RET
+
+cglobal lpf_h_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ vpbroadcastd ym12, strided
+ shl l_strideq, 2
+ shr r6d, 11 ; is_12bpc
+ pmulld ym12, [base+stride_mul]
+ lea stride3q, [strideq*3]
+ mova ym21, [base+filter_mask]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ sub lq, 4
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ lea l_stride3q, [l_strideq*3]
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ mov hd, hm
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 6, h
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx512icl).h4
+.end:
+ lea tmpq, [strideq+stride3q]
+ shl mask_bitsd, 8
+ pslld ym21, 8
+ lea dstq, [dstq+tmpq*8]
+ lea lq, [lq+l_strideq*8]
+ sub hd, 8
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/loopfilter16_sse.asm b/third_party/dav1d/src/x86/loopfilter16_sse.asm
new file mode 100644
index 0000000000..c486b57a21
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter16_sse.asm
@@ -0,0 +1,1793 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+%if ARCH_X86_64
+%define PIC_sym(a) a
+%else
+%define PIC_base $$
+%define PIC_sym(a) pic_regq+a-PIC_base
+%endif
+
+pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
+ times 4 db 8, 9
+
+pw_1: times 8 dw 1
+pw_2: times 8 dw 2
+pw_3: times 8 dw 3
+; 4 and 16 need to be next to each other since they are used as alternates
+; depending on whether bitdepth is 10 or 12
+pw_4: times 8 dw 4
+pw_16: times 8 dw 16
+pw_8: times 8 dw 8
+pw_4096: times 8 dw 4096
+
+pb_mask: dd 1, 1, 2, 2
+
+SECTION .text
+
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < 16
+%define extra_stack 2
+%else
+%define extra_stack 0
+%endif
+%endif
+
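+; x86-32 only: stash the stack arguments in known slots (or alias them to r*m
+; when the stack is already aligned) so the FILTER macro can refer to them by
+; name despite the small GPR count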
+%macro RELOC_ARGS 2 ; h/v, off
+ASSERT ARCH_X86_32
+%if STACK_ALIGNMENT < 16
+ mov r5d, [rstk + stack_offset + 4*4 + 4]
+%define lstridem [esp+%2+0*gprsize]
+ mov lstridem, r5d
+ mov r5d, [rstk + stack_offset + 4*5 + 4]
+%define lutm [esp+%2+1*gprsize]
+ mov lutm, r5d
+ mov r5d, [rstk + stack_offset + 4*6 + 4]
+%ifidn %1, v
+%define wm [esp+%2+2*gprsize]
+ mov wm, r5d
+ mov r5d, [rstk + stack_offset + 4*3 + 4]
+%define lm [esp+%2+3*gprsize]
+ mov lm, r5d
+%else ; %1 == h
+%define hm [esp+%2+2*gprsize]
+ mov hm, r5d
+%endif ; %1==v
+ mov r5d, r7m
+%define bdmulm [esp+%2+4*gprsize]
+ mov bdmulm, r5d
+%else
+%define lstridem r4m
+%define lutm r5m
+%ifidn %1, v
+%define wm r6m
+%define lm r3m
+%else
+%define hm r6m
+%endif
+%define bdmulm r7m
+%endif ; STACK_ALIGNMENT
+%endmacro
+
+%macro UNRELOC_ARGS 0
+%if ARCH_X86_32
+%undef lm
+%undef lstridem
+%undef wm
+%undef hm
+%undef lutm
+%endif
+%endmacro
+
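+; broadcast a dword (SPLATD) or word (SPLATW) operand to all lanes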
+%macro SPLATD 2
+ movd %1, %2
+ pshufd %1, %1, q0000
+%endmacro
+
+%macro SPLATW 2
+ movd %1, %2
+ pshuflw %1, %1, q0000
+ punpcklqdq %1, %1
+%endmacro
+
+; in: out:
+; mm%1 a b c d a e i m
+; mm%2 e f g h b f j n
+; mm%3 i j k l -> c g k o
+; mm%4 m n o p d h l p
+%macro TRANSPOSE4X4W 5
+ punpcklwd m%5, m%1, m%2
+ punpckhwd m%1, m%2
+ punpcklwd m%2, m%3, m%4
+ punpckhwd m%3, m%4
+ punpckldq m%4, m%5, m%2
+ punpckhdq m%5, m%2
+ punpckldq m%2, m%1, m%3
+ punpckhdq m%1, m%3
+
+ SWAP %1, %4
+ SWAP %2, %5, %3
+%endmacro
+
+; in: out:
+; m%1 a b c d e f g h a i q y 6 E M U
+; m%2 i j k l m n o p b j r z 7 F N V
+; m%3 q r s t u v w x c k s 0 8 G O W
+; m%4 y z 0 1 2 3 4 5 d l t 1 9 H P X
+; m%5 6 7 8 9 A B C D -> e m u 2 A I Q Y
+; m%6 E F G H I J K L f n v 3 B J R Z
+; m%7 M N O P Q R S T g o w 4 C K S +
+; m%8 U V W X Y Z + = h p x 5 D L T =
+%if ARCH_X86_64
+%macro TRANSPOSE8X8W 9
+ ; m%1 a b c d e f g h a i q y b j r z
+ ; m%2 i j k l m n o p c k s 0 d l t 1
+ ; m%3 q r s t u v w x -> e m u 2 f n v 3
+ ; m%4 y z 0 1 2 3 4 5 g o w 4 h p x 5
+ TRANSPOSE4X4W %1, %2, %3, %4, %9
+
+ ; m%5 6 7 8 9 A B C D 6 E M U 7 F N V
+ ; m%6 E F G H I J K L 8 G O W 9 H P X
+ ; m%7 M N O P Q R S T -> A I Q Y B J R Z
+ ; m%8 U V W X Y Z + = C K S + D L T =
+ TRANSPOSE4X4W %5, %6, %7, %8, %9
+
+ ; m%1 a i q y b j r z a i q y 6 E M U
+ ; m%2 c k s 0 d l t 1 b j r z 7 F N V
+ ; m%3 e m u 2 f n v 3 c k s 0 8 G O W
+ ; m%4 g o w 4 h p x 5 d l t 1 9 H P X
+ ; m%5 6 E M U 7 F N V -> e m u 2 A I Q Y
+ ; m%6 8 G O W 9 H P X f n v 3 B J R Z
+ ; m%7 A I Q Y B J R Z g o w 4 C K S +
+ ; m%8 C K S + D L T = h p x 5 D L T =
+ punpckhqdq m%9, m%1, m%5
+ punpcklqdq m%1, m%5
+ punpckhqdq m%5, m%2, m%6
+ punpcklqdq m%2, m%6
+ punpckhqdq m%6, m%3, m%7
+ punpcklqdq m%3, m%7
+ punpckhqdq m%7, m%4, m%8
+ punpcklqdq m%4, m%8
+
+ SWAP %8, %7, %4, %5, %3, %2, %9
+%endmacro
+%else ; x86-32
+; input: 1-7 in registers, 8 in first memory [read-only]
+; second memory is scratch, and may overlap with first or third memory
+; output: 1-5,7-8 in registers, 6 in third memory [write-only]
+%macro TRANSPOSE8X8W 13 ; regs [8x], mem [3x], in/out alignment (a/u) [2x]
+ TRANSPOSE4X4W %1, %2, %3, %4, %8
+%ifnidn %9, ""
+ mov%12 m%8, %9
+%else
+ mova m%8, %10
+%endif
+ mova %10, m%4
+ TRANSPOSE4X4W %5, %6, %7, %8, %4
+ punpckhqdq m%4, m%1, m%5
+ punpcklqdq m%1, m%5
+ punpckhqdq m%5, m%2, m%6
+ punpcklqdq m%2, m%6
+ punpckhqdq m%6, m%3, m%7
+ punpcklqdq m%3, m%7
+ mova m%7, %10
+%ifnidn %11, ""
+ mov%13 %11, m%6
+%else
+ mova %10, m%6
+%endif
+ punpckhqdq m%6, m%7, m%8
+ punpcklqdq m%7, m%8
+
+ ; 1,4,2,5,3,8,7,6 -> 1,2,3,4,5,6,7,8
+ SWAP %2, %4, %5, %3
+ SWAP %6, %8
+%endmacro
+%endif ; x86-32/64
+
+; transpose and write m8-11, everything else is scratch
+%macro TRANSPOSE_8x4_AND_WRITE_4x8 5 ; p1, p0, q0, q1, tmp
+ ; transpose 8x4
+ punpcklwd %5, %1, %2
+ punpckhwd %1, %2
+ punpcklwd %2, %3, %4
+ punpckhwd %3, %4
+ punpckldq %4, %5, %2
+ punpckhdq %5, %2
+ punpckldq %2, %1, %3
+ punpckhdq %1, %3
+
+ ; write out
+ movq [dstq+strideq*0-4], %4
+ movhps [dstq+strideq*1-4], %4
+ movq [dstq+strideq*2-4], %5
+ movhps [dstq+stride3q -4], %5
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], %2
+ movhps [dstq+strideq*1-4], %2
+ movq [dstq+strideq*2-4], %1
+ movhps [dstq+stride3q -4], %1
+ lea dstq, [dstq+strideq*4]
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+ ; load data
+%ifidn %2, v
+%if %1 == 4
+%if ARCH_X86_64
+%define P1 m8
+%define P0 m9
+%define Q0 m10
+%define Q1 m11
+ mova P1, [dstq+mstrideq*2] ; p1
+ mova P0, [dstq+mstrideq*1] ; p0
+ mova Q0, [dstq+strideq*0] ; q0
+ mova Q1, [dstq+strideq*1] ; q1
+%else ; x86-32
+%define P1 [dstq+mstrideq*2]
+%define P0 [dstq+mstrideq*1]
+%define Q0 [dstq+strideq*0]
+%define Q1 [dstq+strideq*1]
+%endif ; x86-32/64
+%else ; %1 != 4
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+ lea tmpq, [dstq+mstrideq*4]
+%if ARCH_X86_64
+ ; we load p3 later
+%define P2 m13
+%define P1 m8
+%define P0 m9
+%define Q0 m10
+%define Q1 m11
+%define Q2 m14
+ mova P2, [tmpq+strideq*1]
+ mova P1, [tmpq+strideq*2]
+ mova P0, [tmpq+stride3q]
+ mova Q0, [dstq+strideq*0]
+ mova Q1, [dstq+strideq*1]
+ mova Q2, [dstq+strideq*2]
+%if %1 != 6
+%define P3 [tmpq+strideq*0]
+%define Q3 m15
+ mova Q3, [dstq+stride3q]
+%endif ; %1 != 6
+%else ; x86-32
+%define P2 [tmpq+strideq*1]
+%define P1 [dstq+mstrideq*2]
+%define P0 [dstq+mstrideq*1]
+%define Q0 [dstq+strideq*0]
+%define Q1 [dstq+strideq*1]
+%define Q2 [dstq+strideq*2]
+%if %1 != 6
+%define P3 [dstq+mstrideq*4]
+%define Q3 [dstq+stride3q]
+%endif ; %1 != 6
+%endif ; x86-32/64
+%endif ; %1 ==/!= 4
+%else ; %2 != v
+ ; load lines
+%if %1 == 4
+ movq m0, [dstq+strideq*0-4]
+ movq m2, [dstq+strideq*1-4]
+ movq m4, [dstq+strideq*2-4]
+ movq m5, [dstq+stride3q -4]
+ lea tmpq, [dstq+strideq*4]
+ movq m3, [tmpq+strideq*0-4]
+ movq m6, [tmpq+strideq*1-4]
+ movq m1, [tmpq+strideq*2-4]
+ movq m7, [tmpq+stride3q -4]
+
+ ; transpose 4x8
+ ; m0: A-D0
+ ; m2: A-D1
+ ; m4: A-D2
+ ; m5: A-D3
+ ; m3: A-D4
+ ; m6: A-D5
+ ; m1: A-D6
+ ; m7: A-D7
+ punpcklwd m0, m2
+ punpcklwd m4, m5
+ punpcklwd m3, m6
+ punpcklwd m1, m7
+ ; m0: A0-1,B0-1,C0-1,D0-1
+ ; m4: A2-3,B2-3,C2-3,D2-3
+ ; m3: A4-5,B4-5,C4-5,D4-5
+ ; m1: A6-7,B6-7,C6-7,D6-7
+ punpckhdq m2, m0, m4
+ punpckldq m0, m4
+ punpckhdq m4, m3, m1
+ punpckldq m3, m1
+ ; m0: A0-3,B0-3
+ ; m2: C0-3,D0-3
+ ; m3: A4-7,B4-7
+ ; m4: C4-7,D4-7
+ punpckhqdq m1, m0, m3
+ punpcklqdq m0, m3
+ punpckhqdq m3, m2, m4
+ punpcklqdq m2, m4
+ ; m0: A0-7
+ ; m1: B0-7
+ ; m2: C0-7
+ ; m3: D0-7
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+%define P1 m8
+%define P0 m9
+%define Q0 m10
+%define Q1 m11
+%else
+%define P1 [esp+3*mmsize]
+%define P0 [esp+4*mmsize]
+%define Q0 [esp+5*mmsize]
+%define Q1 [esp+6*mmsize]
+ mova P1, m0
+ mova P0, m1
+ mova Q0, m2
+ mova Q1, m3
+%endif
+%elif %1 == 6 || %1 == 8
+ movu m0, [dstq+strideq*0-8]
+ movu m1, [dstq+strideq*1-8]
+ movu m2, [dstq+strideq*2-8]
+ movu m3, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4]
+ movu m4, [tmpq+strideq*0-8]
+ movu m5, [tmpq+strideq*1-8]
+ movu m6, [tmpq+strideq*2-8]
+%if ARCH_X86_64
+ movu m7, [tmpq+stride3q -8]
+%endif
+
+ ; transpose 8x16
+ ; m0: A-H0,A-H8
+ ; m1: A-H1,A-H9
+ ; m2: A-H2,A-H10
+ ; m3: A-H3,A-H11
+ ; m4: A-H4,A-H12
+ ; m5: A-H5,A-H13
+ ; m6: A-H6,A-H14
+ ; m7: A-H7,A-H15
+%if ARCH_X86_64
+ punpcklwd m8, m0, m1
+%else
+ punpcklwd m7, m0, m1
+%endif
+ punpckhwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ punpcklwd m3, m4, m5
+ punpckhwd m4, m5
+%if ARCH_X86_64
+ punpcklwd m5, m6, m7
+ punpckhwd m6, m7
+%else
+ mova [rsp+3*16], m4
+ movu m4, [tmpq+stride3q -8]
+ punpcklwd m5, m6, m4
+ punpckhwd m6, m4
+%endif
+ ; m8: A0-1,B0-1,C0-1,D0-1 [m7 on x86-32]
+ ; m0: E0-1,F0-1,G0-1,H0-1
+ ; m1: A2-3,B2-3,C2-3,D2-3
+ ; m2: E2-3,F2-3,G2-3,H2-3
+ ; m3: A4-5,B4-5,C4-5,D4-5
+ ; m4: E4-5,F4-5,G4-5,H4-5 [r3 on x86-32]
+ ; m5: A6-7,B6-7,C6-7,D6-7
+ ; m6: E6-7,F6-7,G6-7,H6-7
+%if ARCH_X86_64
+ punpckldq m7, m8, m1
+ punpckhdq m8, m1
+%else
+ punpckldq m4, m7, m1
+ punpckhdq m7, m1
+%endif
+ punpckldq m1, m0, m2
+ punpckhdq m0, m2
+ punpckldq m2, m3, m5
+ punpckhdq m3, m5
+%if ARCH_X86_64
+ punpckldq m5, m4, m6
+ punpckhdq m4, m6
+%else
+ mova [rsp+4*16], m3
+ mova m3, [rsp+3*16]
+ punpckldq m5, m3, m6
+ punpckhdq m3, m6
+%endif
+ ; m7: A0-3,B0-3 [m4 on x86-32]
+ ; m8: C0-3,D0-3 [m7 on x86-32]
+ ; m1: E0-3,F0-3
+ ; m0: G0-3,H0-3
+ ; m2: A4-7,B4-7
+ ; m3: C4-7,D4-7 [r4 on x86-32]
+ ; m5: E4-7,F4-7
+ ; m4: G4-7,H4-7 [m3 on x86-32]
+%if ARCH_X86_64
+%if %1 != 6
+ punpcklqdq m6, m7, m2
+%endif
+ punpckhqdq m7, m2
+ punpcklqdq m2, m8, m3
+ punpckhqdq m8, m3
+ punpcklqdq m3, m1, m5
+ punpckhqdq m1, m5
+%if %1 != 6
+ punpckhqdq m5, m0, m4
+%endif
+ punpcklqdq m0, m4
+%if %1 == 8
+ mova [rsp+1*16], m6
+%define P3 [rsp+1*16]
+%endif
+ ; 7,2,8,3,1,0,5 -> 13,8,9,10,11,14,15
+ SWAP 7, 13
+ SWAP 8, 2, 9
+ SWAP 3, 10
+ SWAP 1, 11
+ SWAP 0, 14
+ SWAP 5, 15
+%define P2 m13
+%define P1 m8
+%define P0 m9
+%define Q0 m10
+%define Q1 m11
+%define Q2 m14
+%if %1 == 8
+%define Q3 m15
+%endif
+%else ; x86-32
+%if %1 == 8
+%define P3 [rsp+ 6*16]
+ punpcklqdq m6, m4, m2
+ mova P3, m6
+%endif
+ mova m6, [rsp+4*16]
+ punpckhqdq m4, m2
+ punpcklqdq m2, m7, m6
+ punpckhqdq m7, m6
+ punpcklqdq m6, m1, m5
+ punpckhqdq m1, m5
+%if %1 == 8
+%define Q3 [rsp+24*16]
+ punpckhqdq m5, m0, m3
+ mova Q3, m5
+%endif
+ punpcklqdq m0, m3
+%if %1 == 8
+%define P2 [rsp+18*16]
+%define P1 [rsp+19*16]
+%define P0 [rsp+20*16]
+%define Q0 [rsp+21*16]
+%define Q1 [rsp+22*16]
+%define Q2 [rsp+23*16]
+%else
+%define P2 [rsp+3*16]
+%define P1 [rsp+4*16]
+%define P0 [rsp+5*16]
+%define Q0 [rsp+6*16]
+%define Q1 [rsp+7*16]
+%define Q2 [rsp+8*16]
+%endif
+ mova P2, m4
+ mova P1, m2
+ mova P0, m7
+ mova Q0, m6
+ mova Q1, m1
+ mova Q2, m0
+%endif ; x86-32/64
+%else ; %1 == 16
+ ; We only use 14 pixels but we'll need the remainder at the end for
+ ; the second transpose
+ mova m0, [dstq+strideq*0-16]
+ mova m1, [dstq+strideq*1-16]
+ mova m2, [dstq+strideq*2-16]
+ mova m3, [dstq+stride3q -16]
+ lea tmpq, [dstq+strideq*4]
+ mova m4, [tmpq+strideq*0-16]
+ mova m5, [tmpq+strideq*1-16]
+ mova m6, [tmpq+strideq*2-16]
+%if ARCH_X86_64
+ mova m7, [tmpq+stride3q -16]
+
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+ SWAP 5, 13
+ SWAP 6, 8
+ SWAP 7, 9
+%define P2 m13
+%define P1 m8
+%define P0 m9
+%else ; x86-32
+%define P2 [esp+18*16]
+%define P1 [esp+19*16]
+%define P0 [esp+20*16]
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \
+ [tmpq+stride3q -16], P2, "", a, a
+ mova P1, m6
+ mova P0, m7
+%endif ; x86-32/64
+ mova [rsp+ 7*16], m0
+ mova [rsp+ 8*16], m1
+ mova [rsp+ 9*16], m2
+ mova [rsp+10*16], m3
+%define P3 [rsp+6*16]
+ mova P3, m4
+
+ mova m0, [dstq+strideq*0]
+ mova m1, [dstq+strideq*1]
+ mova m2, [dstq+strideq*2]
+ mova m3, [dstq+stride3q ]
+ lea tmpq, [dstq+strideq*4]
+ mova m4, [tmpq+strideq*0]
+ mova m5, [tmpq+strideq*1]
+ mova m6, [tmpq+strideq*2]
+%if ARCH_X86_64
+ mova m7, [tmpq+stride3q ]
+
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 10
+ SWAP 0, 10
+ SWAP 1, 11
+ SWAP 2, 14
+ SWAP 3, 15
+%define Q0 m10
+%define Q1 m11
+%define Q2 m14
+%define Q3 m15
+%else ; x86-32
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \
+ [tmpq+stride3q ], [rsp+12*16], "", a, a
+%define Q0 [esp+21*16]
+%define Q1 [esp+22*16]
+%define Q2 [esp+23*16]
+%define Q3 [esp+24*16]
+ mova Q0, m0
+ mova Q1, m1
+ mova Q2, m2
+ mova Q3, m3
+%endif ; x86-32/64
+
+ mova [rsp+11*16], m4
+%if ARCH_X86_64
+ mova [rsp+12*16], m5
+%endif
+ mova [rsp+13*16], m6
+ mova [rsp+14*16], m7
+%endif ; %1 == 4/6/8/16
+%endif ; %2 ==/!= v
+
+ ; load L/E/I/H
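+    ; L is the per-edge filter level; the thresholds derived from it are
+    ;   I = clamp(L>>sharp[0], 1, sharp[1]), H = L>>4, E = 2*(L+2)+I
+    ; (sharp[0]/sharp[1] read from [lutq+128]/[lutq+136]), all scaled by
+    ; bdmul (1 << (bitdepth-8)) to match the pixel range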
+%if ARCH_X86_32
+%define l_strideq r5
+ mov l_strideq, dword lstridem
+%ifidn %2, v
+%define lq r3
+ mov lq, dword lm
+%endif
+%endif
+%ifidn %2, v
+%if cpuflag(sse4)
+ pmovzxbw m1, [lq]
+ pmovzxbw m0, [lq+l_strideq]
+ pxor m2, m2
+%else ; ssse3
+ movq m1, [lq]
+ movq m0, [lq+l_strideq]
+ pxor m2, m2
+ REPX {punpcklbw x, m2}, m1, m0
+%endif ; ssse3/sse4
+%else ; %2 != v
+ movq m0, [lq] ; l0, l1
+ movq m1, [lq+l_strideq] ; l2, l3
+ punpckldq m0, m1 ; l0, l2, l1, l3
+ pxor m2, m2
+ punpcklbw m1, m0, m2 ; l0, l2
+ punpckhbw m0, m2 ; l1, l3
+%endif ; %2==/!=v
+%if ARCH_X86_32
+%ifidn %2, v
+%undef lq
+ mov mstrideq, mstridem
+%endif
+%endif
+ pcmpeqw m5, m2, m0
+ pand m1, m5
+ por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, [PIC_sym(pb_4x1_4x5_4x9_4x13)] ; l[x][1]
+ pcmpeqw m5, m2, m0 ; !L
+ psrlw m5, 1
+%if ARCH_X86_64
+ psrlw m2, m0, [lutq+128]
+ SPLATW m1, [lutq+136]
+%else ; x86-32
+ mov r5, lutm
+ psrlw m2, m0, [r5+128]
+ SPLATW m1, [r5+136]
+%endif ; x86-32/64
+ pminsw m2, m1
+ pmaxsw m2, [PIC_sym(pw_1)] ; I
+ psrlw m1, m0, 4 ; H
+ paddw m0, [PIC_sym(pw_2)]
+ paddw m0, m0
+ paddw m0, m2 ; E
+ REPX {pmullw x, [bdmulq]}, m0, m1, m2
+%if ARCH_X86_32
+%undef l_strideq
+ lea stride3q, [strideq*3]
+%endif
+
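+    ; filter mask and high-edge-variance: m7 (hev) is set where
+    ; max(|p1-p0|,|q1-q0|) > H; m3 ends up as !fm, set where that maximum
+    ; exceeds I, where 2*|p0-q0|+(|p1-q1|>>1) exceeds E, or where L == 0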
+ psubw m3, P1, P0 ; p1-p0
+ psubw m4, Q0, Q1 ; q0-q1
+ REPX {pabsw x, x}, m3, m4
+ pmaxsw m3, m5
+ pmaxsw m3, m4
+ pcmpgtw m7, m3, m1 ; hev
+%if %1 != 4
+ psubw m4, P2, P0 ; p2-p0
+ pabsw m4, m4
+ pmaxsw m4, m3
+%if %1 != 6
+ mova m6, P3 ; p3
+ psubw m5, m6, P0 ; p3-p0
+ pabsw m5, m5
+ pmaxsw m4, m5
+%endif ; %1 != 6
+ psubw m5, Q0, Q2 ; q0-q2
+ pabsw m5, m5
+ pmaxsw m4, m5
+%if %1 != 6
+ psubw m5, Q0, Q3 ; q0-q3
+ pabsw m5, m5
+ pmaxsw m4, m5
+%endif ; %1 != 6
+ pcmpgtw m4, [bdmulq] ; !flat8in
+
+ psubw m5, P2, P1 ; p2-p1
+ pabsw m5, m5
+%if %1 != 6
+ psubw m6, P2 ; p3-p2
+ pabsw m6, m6
+ pmaxsw m5, m6
+ psubw m6, Q2, Q3 ; q2-q3
+ pabsw m6, m6
+ pmaxsw m5, m6
+%endif ; %1 != 6
+ psubw m6, Q2, Q1 ; q2-q1
+ pabsw m6, m6
+ pmaxsw m5, m6
+
+%if %1 == 16
+ SPLATD m6, [maskq+8]
+ SPLATD m1, [maskq+4]
+ por m6, m1
+ pand m6, m12
+ pcmpeqd m6, m12
+ pand m5, m6
+%else ; %1 != 16
+ SPLATD m6, [maskq+4]
+ pand m6, m12
+ pcmpeqd m6, m12
+ pand m5, m6 ; only apply fm-wide to wd>4 blocks
+%endif ; %1==/!=16
+ pmaxsw m3, m5
+%endif ; %1 != 4
+ pcmpgtw m3, m2
+
+ psubw m5, P1, Q1 ; p1-q1
+ psubw m6, P0, Q0 ; p0-q0
+ REPX {pabsw x, x}, m5, m6
+ paddw m6, m6
+ psrlw m5, 1
+ paddw m5, m6 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ pcmpgtw m5, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+ por m3, m5
+
+%if %1 == 16
+
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1]
+ mova m1, [tmpq+strideq*2]
+ mova m2, [tmpq+stride3q]
+%else ; %2 != v
+ mova m0, [rsp+ 8*16]
+ mova m1, [rsp+ 9*16]
+ mova m2, [rsp+10*16]
+%endif ; %2==/!=v
+ REPX {psubw x, P0}, m0, m1, m2
+ REPX {pabsw x, x}, m0, m1, m2
+ pmaxsw m1, m0
+ pmaxsw m1, m2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+ mova m0, [tmpq+strideq*0]
+ mova m2, [tmpq+strideq*1]
+ mova m5, [tmpq+strideq*2]
+%else ; %2 != v
+ mova m0, [rsp+11*16]
+ mova m2, [rsp+12*16]
+ mova m5, [rsp+13*16]
+%endif ; %2==/!=v
+ REPX {psubw x, Q0}, m0, m2, m5
+ REPX {pabsw x, x}, m0, m2, m5
+ pmaxsw m0, m2
+ pmaxsw m1, m5
+ pmaxsw m1, m0
+ pcmpgtw m1, [bdmulq] ; !flat8out
+ por m1, m4 ; !flat8in | !flat8out
+ SPLATD m2, [maskq+8]
+ pand m5, m2, m12
+ pcmpeqd m5, m12
+ pandn m1, m5 ; flat16
+ pandn m5, m3, m1 ; flat16 & fm
+ SWAP 1, 5
+
+ SPLATD m5, [maskq+4]
+ por m5, m2
+ pand m2, m5, m12
+ pcmpeqd m2, m12
+ pandn m4, m2 ; flat8in
+ pandn m2, m3, m4
+ SWAP 2, 4
+ SPLATD m2, [maskq+0]
+ por m2, m5
+ pand m2, m12
+ pcmpeqd m2, m12
+ pandn m3, m2
+ pandn m0, m4, m3 ; fm & !flat8 & !flat16
+ SWAP 0, 3
+ pandn m0, m1, m4 ; flat8 & !flat16
+ SWAP 0, 4
+%elif %1 != 4
+ SPLATD m0, [maskq+4]
+ pand m2, m0, m12
+ pcmpeqd m2, m12
+ pandn m4, m2
+ pandn m2, m3, m4 ; flat8 & fm
+ SWAP 2, 4
+ SPLATD m2, [maskq+0]
+ por m0, m2
+ pand m0, m12
+ pcmpeqd m0, m12
+ pandn m3, m0
+ pandn m0, m4, m3 ; fm & !flat8
+ SWAP 0, 3
+%else ; %1 == 4
+ SPLATD m0, [maskq+0]
+ pand m0, m12
+ pcmpeqd m0, m12
+ pandn m3, m0 ; fm
+%endif ; %1==/!=4
+
+ ; short filter
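+    ; 4-tap filter: f = iclip_diff(3*(q0-p0) + (iclip_diff(p1-q1) & hev)) & fm,
+    ; then p0 += (f+3)>>3, q0 -= (f+4)>>3, and where !hev
+    ; p1/q1 are adjusted by ((f+4)>>3 + 1) >> 1; all results are clipped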
+%if ARCH_X86_64
+ SPLATW m0, r7m
+%else
+ SPLATW m0, bdmulm
+%endif
+ pcmpeqw m2, m2
+ psrlw m0, 1 ; 511 or 2047
+ pxor m2, m0 ; -512 or -2048
+
+ psubw m5, Q0, P0 ; q0-p0
+ paddw m6, m5, m5
+ paddw m6, m5 ; 3*(q0-p0)
+ psubw m5, P1, Q1 ; iclip_diff(p1-q1)
+ pminsw m5, m0
+ pmaxsw m5, m2
+ pand m5, m7 ; f=iclip_diff(p1-q1)&hev
+ paddw m5, m6 ; f=iclip_diff(3*(q0-p0)+f)
+ pminsw m5, m0
+ pmaxsw m5, m2
+ pand m3, m5 ; f&=fm
+ paddw m5, m3, [PIC_sym(pw_3)]
+ paddw m3, [PIC_sym(pw_4)]
+ REPX {pminsw x, m0}, m5, m3
+ psraw m5, 3 ; f2
+ psraw m3, 3 ; f1
+ psubw m0, m2 ; 1023 or 4095
+ pxor m2, m2
+%if ARCH_X86_64
+ paddw P0, m5
+ psubw Q0, m3
+%else
+ paddw m5, P0
+ psubw m6, Q0, m3
+ REPX {pminsw x, m0}, m5, m6
+ REPX {pmaxsw x, m2}, m5, m6
+%endif
+
+ paddw m3, [PIC_sym(pw_1)]
+ psraw m3, 1 ; f=(f1+1)>>1
+ pandn m7, m3 ; f&=!hev
+ SWAP 7, 3
+%if ARCH_X86_64
+ paddw P1, m3
+ psubw Q1, m3
+ REPX {pminsw x, m0}, P1, P0, Q0, Q1
+ REPX {pmaxsw x, m2}, P1, P0, Q0, Q1
+%else
+ psubw m7, Q1, m3
+ paddw m3, P1
+ REPX {pminsw x, m0}, m7, m3
+ REPX {pmaxsw x, m2}, m7, m3
+%if %1 > 4
+ mova P1, m3
+ mova P0, m5
+ mova Q0, m6
+ mova Q1, m7
+%endif
+%endif
+
+%if %1 == 16
+
+; m8-11 = p1/p0/q0/q1, m4=flat8, m1=flat16
+; m12=filter bits mask
+; m13-15=p2/q2/q3
+; m0,2-3,5-7 = free
+
+ ; flat16 filter
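+    ; 15-tap wide filter: keep a running rounded sum of 16 taps in m3 (add the
+    ; incoming pixels, subtract the outgoing ones) and take sum>>4 per output,
+    ; blending with the unfiltered pixel via the flat16 mask in m1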
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1] ; p6
+ mova m2, [tmpq+strideq*2] ; p5
+ mova m7, [tmpq+stride3q] ; p4
+ mova m6, [tmpq+strideq*4] ; p3
+ lea tmpq, [dstq+mstrideq*4]
+%else ; %2 != v
+ mova m0, [rsp+ 8*16]
+ mova m2, [rsp+ 9*16]
+ mova m7, [rsp+10*16]
+ mova m6, [rsp+ 6*16]
+%endif ; %2==/!=v
+
+ mova [rsp+ 0*16], m4
+
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ psllw m3, m0, 3 ; p6*8
+ paddw m3, [PIC_sym(pw_8)]
+ paddw m5, m2, m7 ; p5+p4
+ psubw m3, m0
+ paddw m5, m5 ; (p5+p4)*2
+ paddw m3, m6 ; p6*7+p3
+ paddw m5, P2 ; (p5+p4)*2+p2
+ paddw m3, P1 ; p6*7+p3+p1
+ paddw m5, P0 ; (p5+p4)*2+p2+p0
+ paddw m3, Q0 ; p6*7+p3+p1+q0
+ paddw m3, m5 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, m2
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+mstrideq*2], m5 ; p5
+%else ; %2 != v
+ mova [rsp+9*16], m5
+%endif ; %2==/!=v
+
+ ; sub p6*2, add p3/q1
+ paddw m3, m6
+ paddw m5, m0, m0
+ paddw m3, Q1
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, m7
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+mstrideq*1], m5 ; p4
+%else ; %2 != v
+ mova [rsp+10*16], m5
+%endif ; %2==/!=v
+
+ ; sub p6/p5, add p2/q2
+ psubw m3, m0
+ paddw m5, P2, Q2
+ psubw m3, m2
+ paddw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, m6
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+strideq*0], m5 ; p3
+%else ; %2 != v
+ mova [rsp+6*16], m5
+%endif ; %2==/!=v
+
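+    ; on x86-64 the vertical filter can store p2..q0 directly; otherwise the
+    ; results are staged on the stack so p2-q0 remain available for the
+    ; flat8 filter below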
+%define WRITE_IN_PLACE 0
+%ifidn %2, v
+%if ARCH_X86_64
+%define WRITE_IN_PLACE 1
+%endif
+%endif
+
+ ; sub p6/p4, add p1/q3
+ paddw m3, P1
+ paddw m5, m0, m7
+ paddw m3, Q3
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, P2
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [tmpq+strideq*1], m5
+%else
+ mova [rsp+1*16], m5 ; don't clobber p2/m13
+%endif
+
+ ; sub p6/p3, add p0/q4
+ paddw m3, P0
+ paddw m5, m0, m6
+%ifidn %2, v
+ paddw m3, [dstq+strideq*4]
+%else ; %2 != v
+ paddw m3, [rsp+11*16]
+%endif ; %2==/!=v
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, P1
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [dstq+mstrideq*2], m5
+%else
+ mova [rsp+2*16], m5 ; don't clobber p1/m3
+%endif
+
+ ; sub p6/p2, add q0/q5
+ paddw m3, Q0
+ paddw m5, m0, P2
+%ifidn %2, v
+%if ARCH_X86_32
+ lea r4, P2
+%endif
+ lea tmpq, [dstq+strideq*4]
+ paddw m3, [tmpq+strideq*1]
+%else ; %2 != v
+ paddw m3, [rsp+12*16]
+%endif ; %2==/!=v
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, P0
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [dstq+mstrideq*1], m5
+%else
+ mova [rsp+3*16], m5 ; don't clobber p0/m4
+%endif
+
+ ; sub p6/p1, add q1/q6
+ paddw m3, Q1
+ paddw m5, m0, P1
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2] ; q6
+%else ; %2 != v
+ mova m0, [rsp+13*16] ; q6
+%endif ; %2==/!=v
+ paddw m3, m0
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, Q0
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [dstq], m5
+%else
+ mova [rsp+4*16], m5 ; don't clobber q0/m5
+%endif
+
+ ; sub p5/p0, add q2/q6
+ paddw m3, Q2
+ paddw m5, m2, P0
+ paddw m3, m0
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, Q1
+ por m2, m5, m4 ; don't clobber q1/m6
+
+ ; sub p4/q0, add q3/q6
+ paddw m3, Q3
+ paddw m7, Q0
+ paddw m3, m0
+ psubw m3, m7
+ psrlw m7, m3, 4
+ pand m7, m1
+ pandn m4, m1, Q2
+ por m7, m4 ; don't clobber q2/m14
+
+ ; sub p3/q1, add q4/q6
+%ifidn %2, v
+ paddw m3, [tmpq+strideq*0]
+%else ; %2 != v
+ paddw m3, [rsp+11*16]
+%endif ; %2==/!=v
+ paddw m6, Q1
+ paddw m3, m0
+ psubw m3, m6
+ psrlw m6, m3, 4
+ pand m6, m1
+ pandn m4, m1, Q3
+ por m6, m4
+%if WRITE_IN_PLACE
+ mova [tmpq+mstrideq], m6 ; q3
+%else ; %2 != v
+ mova [rsp+5*16], m6
+%endif ; %2==/!=v
+
+ ; sub p2/q2, add q5/q6
+%ifidn %2, v
+ paddw m3, [tmpq+strideq*1]
+%if ARCH_X86_64
+ paddw m5, P2, Q2
+%else
+    ; tmpq is clobbered here, so we use a backup pointer for P2 instead
+ paddw m5, [r4], Q2
+ mov pic_regq, pic_regm
+%endif
+%else ; %2 != v
+ paddw m3, [rsp+12*16]
+ paddw m5, P2, Q2
+%endif ; %2==/!=v
+ paddw m3, m0
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+%ifidn %2, v
+ pandn m4, m1, [tmpq+strideq*0]
+%else ; %2 != v
+ pandn m4, m1, [rsp+11*16]
+%endif ; %2==/!=v
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+strideq*0], m5 ; q4
+%else ; %2 != v
+ mova [rsp+11*16], m5
+%endif ; %2==/!=v
+
+ ; sub p1/q3, add q6*2
+ psubw m3, P1
+ paddw m0, m0
+ psubw m3, Q3
+ paddw m3, m0
+ psrlw m5, m3, 4
+ pand m5, m1
+%ifidn %2, v
+ pandn m4, m1, [tmpq+strideq*1]
+%else ; %2 != v
+ pandn m4, m1, [rsp+12*16]
+%endif ; %2==/!=v
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+strideq*1], m5 ; q5
+%else ; %2 != v
+ mova [rsp+12*16], m5
+%endif ; %2==/!=v
+
+ mova m4, [rsp+0*16]
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*4]
+%endif
+%if ARCH_X86_64
+ SWAP 2, 11
+ SWAP 7, 14
+ SWAP 6, 15
+%else ; x86-32
+ mova Q1, m2
+ mova Q2, m7
+%endif ; x86-32/64
+%if WRITE_IN_PLACE
+ mova P2, [tmpq+strideq*1]
+ mova P1, [tmpq+strideq*2]
+ mova P0, [tmpq+stride3q]
+ mova Q0, [dstq]
+%elif ARCH_X86_64
+ mova P2, [rsp+1*16]
+ mova P1, [rsp+2*16]
+ mova P0, [rsp+3*16]
+ mova Q0, [rsp+4*16]
+%else ; !WRITE_IN_PLACE & x86-32
+ mova m0, [rsp+1*16]
+ mova m1, [rsp+2*16]
+ mova m2, [rsp+3*16]
+ mova m3, [rsp+4*16]
+ mova m7, [rsp+5*16]
+ mova P2, m0
+ mova P1, m1
+ mova P0, m2
+ mova Q0, m3
+ mova Q3, m7
+%endif ; WRITE_IN_PLACE / x86-32/64
+%undef WRITE_IN_PLACE
+%endif ; %1 == 16
+
+%if %1 >= 8
+
+ ; flat8 filter
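+    ; 7-tap filter: each output is an 8-tap weighted sum rounded via
+    ; pmulhrsw(x, 4096) == (x+4)>>3, then merged with the original pixel as
+    ; orig + ((filtered - orig) & flat8 mask)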
+ mova m0, P3 ; p3
+ paddw m1, m0, P2 ; p3+p2
+ paddw m2, P1, P0 ; p1+p0
+ paddw m3, m1, m1 ; 2*(p3+p2)
+ paddw m2, m0 ; p1+p0+p3
+ paddw m3, Q0 ; 2*(p3+p2)+q0
+ paddw m2, m3 ; 3*p3+2*p2+p1+p0+q0
+ pmulhrsw m7, m2, [PIC_sym(pw_4096)]
+ psubw m7, P2
+ pand m7, m4
+
+ paddw m3, P1, Q1 ; p1+q1
+ psubw m2, m1 ; 2*p3+p2+p1+p0+q0
+ paddw m2, m3 ; 2*p3+p2+2*p1+p0+q0+q1
+ pmulhrsw m3, m2, [PIC_sym(pw_4096)]
+ psubw m3, P1
+ pand m3, m4
+
+ paddw m5, m0, P1 ; p3+p1
+ paddw m6, P0, Q2 ; p0+q2
+ psubw m2, m5 ; p3+p2+p1+p0+q0+q1
+ paddw m2, m6 ; p3+p2+p1+2*p0+q0+q1+q2
+ pmulhrsw m5, m2, [PIC_sym(pw_4096)]
+ psubw m5, P0
+ pand m5, m4
+
+ paddw m6, m0, P0 ; p3+p0
+ paddw m1, Q0, Q3 ; q0+q3
+ psubw m2, m6 ; p2+p1+p0+q0+q1+q2
+ paddw m2, m1 ; p2+p1+p0+2*q0+q1+q2+q3
+ pmulhrsw m6, m2, [PIC_sym(pw_4096)]
+ psubw m6, Q0
+ pand m6, m4
+
+ paddw m2, Q1 ; p2+p1+p0+2*q0+2*q1+q2+q3
+ paddw m2, Q3 ; p2+p1+p0+2*q0+2*q1+q2+2*q3
+ paddw m1, P2, Q0 ; p2+q0
+ psubw m2, m1 ; p1+p0+q0+2*q1+q2+2*q3
+ pmulhrsw m1, m2, [PIC_sym(pw_4096)]
+ psubw m1, Q1
+ pand m1, m4
+
+ psubw m2, P1 ; p0+q0+2*q1+q2+2*q3
+ psubw m2, Q1 ; p0+q0+q1+q2+2*q3
+ paddw m0, Q3, Q2 ; q3+q2
+ paddw m2, m0 ; p0+q0+q1+2*q2+3*q3
+ pmulhrsw m2, [PIC_sym(pw_4096)]
+ psubw m2, Q2
+ pand m2, m4
+
+ paddw m7, P2
+ paddw m3, P1
+ paddw m5, P0
+ paddw m6, Q0
+ paddw m1, Q1
+ paddw m2, Q2
+
+%ifidn %2, v
+ mova [tmpq+strideq*1], m7 ; p2
+ mova [tmpq+strideq*2], m3 ; p1
+ mova [tmpq+stride3q ], m5 ; p0
+ mova [dstq+strideq*0], m6 ; q0
+ mova [dstq+strideq*1], m1 ; q1
+ mova [dstq+strideq*2], m2 ; q2
+%else ; %2 != v
+ mova m0, P3
+
+%if %1 == 8
+ lea tmpq, [dstq+strideq*4]
+%if ARCH_X86_64
+ SWAP 4, 15
+ TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, 8
+%else
+ TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, "", \
+ Q3, [tmpq+strideq*1-8], a, u
+%endif
+
+ ; write 8x8
+ movu [dstq+strideq*0-8], m0
+ movu [dstq+strideq*1-8], m7
+ movu [dstq+strideq*2-8], m3
+ movu [dstq+stride3q -8], m5
+ movu [tmpq+strideq*0-8], m6
+%if ARCH_X86_64
+ movu [tmpq+strideq*1-8], m1
+%endif
+ movu [tmpq+strideq*2-8], m2
+ movu [tmpq+stride3q -8], m4
+ lea dstq, [dstq+strideq*8]
+%else ; %1 != 8
+%if ARCH_X86_64
+ SWAP 6, 8
+ SWAP 1, 9
+ SWAP 2, 10
+%else
+ mova [rsp+1*16], m6
+ mova [rsp+2*16], m1
+ mova [rsp+3*16], m2
+%endif
+
+ mova m1, [rsp+ 7*16]
+ mova m2, [rsp+ 8*16]
+ mova m4, [rsp+ 9*16]
+ mova m6, [rsp+10*16]
+ lea tmpq, [dstq+strideq*4]
+%if ARCH_X86_64
+ TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, 11
+%else
+ mova [rsp+7*16], m5
+ TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, "", \
+ [rsp+7*16], [tmpq+strideq*1-16], a, a
+%endif
+
+ mova [dstq+strideq*0-16], m1
+ mova [dstq+strideq*1-16], m2
+ mova [dstq+strideq*2-16], m4
+ mova [dstq+stride3q -16], m6
+ mova [tmpq+strideq*0-16], m0
+%if ARCH_X86_64
+ mova [tmpq+strideq*1-16], m7
+%endif
+ mova [tmpq+strideq*2-16], m3
+ mova [tmpq+stride3q -16], m5
+
+%if ARCH_X86_64
+ SWAP 6, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 4, 15
+%else
+ mova m6, [rsp+1*16]
+ mova m1, [rsp+2*16]
+ mova m2, [rsp+3*16]
+ mova m4, Q3
+%endif
+ mova m0, [rsp+11*16]
+ mova m3, [rsp+12*16]
+ mova m5, [rsp+13*16]
+%if ARCH_X86_64
+ mova m7, [rsp+14*16]
+ TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, 8
+%else
+ TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, "", \
+ [rsp+14*16], [tmpq+strideq*1], a, a
+%endif
+ mova [dstq+strideq*0], m6
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m4
+ mova [tmpq+strideq*0], m0
+%if ARCH_X86_64
+ mova [tmpq+strideq*1], m3
+%endif
+ mova [tmpq+strideq*2], m5
+ mova [tmpq+stride3q ], m7
+ lea dstq, [dstq+strideq*8]
+%endif ; %1==/!=8
+%endif ; %2==/!=v
+%elif %1 == 6
+ ; flat6 filter
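+    ; 5-tap filter for 6-pixel edges: same rounded-sum and masked-blend scheme
+    ; as flat8, but only p1/p0/q0/q1 are rewritten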
+ paddw m3, P1, P0 ; p1+p0
+ paddw m3, P2 ; p2+p1+p0
+ paddw m6, P2, Q0 ; p2+q0
+ paddw m3, m3 ; 2*(p2+p1+p0)
+ paddw m3, m6 ; p2+2*(p2+p1+p0)+q0
+ pmulhrsw m2, m3, [PIC_sym(pw_4096)]
+ psubw m2, P1
+ pand m2, m4
+
+ paddw m3, Q0 ; p2+2*(p2+p1+p0+q0)
+ paddw m6, P2, P2 ; 2*p2
+ paddw m3, Q1 ; p2+2*(p2+p1+p0+q0)+q1
+ psubw m3, m6 ; p2+2*(p1+p0+q0)+q1
+ pmulhrsw m5, m3, [PIC_sym(pw_4096)]
+ psubw m5, P0
+ pand m5, m4
+
+ paddw m3, Q1 ; p2+2*(p1+p0+q0+q1)
+ paddw m6, P2, P1 ; p2+p1
+ paddw m3, Q2 ; p2+2*(p1+p0+q0+q1)+q2
+ psubw m3, m6 ; p1+2*(p0+q0+q1)+q2
+ pmulhrsw m6, m3, [PIC_sym(pw_4096)]
+ psubw m6, Q0
+ pand m6, m4
+
+ psubw m3, P1 ; 2*(p0+q0+q1)+q2
+%if ARCH_X86_64
+ paddw Q2, Q2 ; q2*2
+%else
+ mova m0, Q2
+ paddw m0, m0
+%endif
+ psubw m3, P0 ; p0+2*(q0+q1)+q2
+%if ARCH_X86_64
+    paddw           m3, Q2                  ; p0+2*(q0+q1+q2)+q2
+%else
+ paddw m3, m0
+%endif
+ pmulhrsw m3, [PIC_sym(pw_4096)]
+ psubw m3, Q1
+ pand m3, m4
+
+ paddw m2, P1
+ paddw m5, P0
+ paddw m6, Q0
+ paddw m3, Q1
+
+%ifidn %2, v
+ mova [dstq+mstrideq*2], m2 ; p1
+ mova [dstq+mstrideq*1], m5 ; p0
+ mova [dstq+strideq*0], m6 ; q0
+ mova [dstq+strideq*1], m3 ; q1
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 m2, m5, m6, m3, m0
+%endif ; %2==/!=v
+%else ; %1 == 4
+%if ARCH_X86_64
+%ifidn %2, v
+ mova [dstq+mstrideq*2], P1 ; p1
+ mova [dstq+mstrideq*1], P0 ; p0
+ mova [dstq+strideq*0], Q0 ; q0
+ mova [dstq+strideq*1], Q1 ; q1
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 P1, P0, Q0, Q1, m0
+%endif ; %2==/!=v
+%else ; x86-32
+%ifidn %2, v
+ mova [dstq+mstrideq*2], m3
+ mova [dstq+mstrideq*1], m5
+ mova [dstq+strideq*0], m6
+ mova [dstq+strideq*1], m7
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 m3, m5, m6, m7, m0
+%endif ; %2==/!=v
+%endif ; x86-32/64
+%endif ; %1
+%undef P3
+%undef P2
+%undef P1
+%undef P0
+%undef Q0
+%undef Q1
+%undef Q2
+%undef Q3
+%endmacro
+
+INIT_XMM ssse3
+; stack layout:
+; r0 - flat8 backup inside flat16 code
+%if ARCH_X86_64
+cglobal lpf_v_sb_y_16bpc, 6, 12, 16, -16 * 1, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+%else
+; stack layout [32bit only]:
+; r1-4 - p2-q0 post-filter16
+; r5 - p3
+; r6 - q3 post-filter16
+; r7 - GPRs [mask_bitsm, mstridem]
+; r8 - m12/pb_mask
+; r9 - bdmulq
+cglobal lpf_v_sb_y_16bpc, 4, 7, 8, -16 * (10 + extra_stack), \
+ dst, stride, mask, mstride, pic_reg, stride3, tmp
+ RELOC_ARGS v, 10*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+%define pic_regm dword [esp+7*16+2*gprsize]
+ mov pic_regm, pic_regq
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+9*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+ sub r3, dword lstridem
+ mov dword lm, r3
+%endif
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mstridem dword [esp+7*16+1*gprsize]
+ mov mstridem, mstrideq
+%define mask_bitsm dword [esp+7*16+0*gprsize]
+ mov mask_bitsm, 0x3
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+8*16]
+ mova m12, m0
+%endif
+
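+    ; each iteration covers a 16-byte (8-pixel) strip; the two bits in
+    ; mask_bits select the filter width per 4-pixel unit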
+.loop:
+%if ARCH_X86_64
+ test [maskq+8], mask_bitsd ; vmask[2]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+8], r6d
+%endif
+ jz .no_flat16
+
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .end
+
+ FILTER 4, v
+
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ add lq, 8
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add dword lm, 8
+%endif
+ add dstq, 16
+%if ARCH_X86_64
+ shl mask_bitsd, 2
+ sub wd, 2
+%else
+ shl mask_bitsm, 2
+ sub dword wm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
+
+INIT_XMM ssse3
+; stack layout:
+; r0 - flat8 backup inside flat16
+; r1-4 - p2-q0 post-filter16 backup
+; r5 - q3 post-filter16 backup
+; r6 - p3
+; r7-10 - p7-4
+; r11-14 - q4-7
+%if ARCH_X86_64
+cglobal lpf_h_sb_y_16bpc, 6, 11, 16, -16 * 15, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov hd, hm
+ shl l_strideq, 2
+%else
+; stack layout [32bit only]:
+; r15 - GPRs [mask_bitsm]
+; r16 - m12/pb_mask
+; r17 - bdmulq
+; r18-24 - p2-q3
+cglobal lpf_h_sb_y_16bpc, 4, 7, 8, -16 * (25 + extra_stack), \
+ dst, stride, mask, l, pic_reg, stride3, tmp
+ RELOC_ARGS h, 25*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+17*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+%endif
+ sub lq, 4
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mask_bitsm dword [esp+15*16+0*gprsize]
+ mov mask_bitsm, 0x3
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+16*16]
+ mova m12, m0
+%endif
+
+.loop:
+%if ARCH_X86_64
+ test [maskq+8], mask_bitsd ; vmask[2]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+8], r6d
+%endif
+ jz .no_flat16
+
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .no_filter
+
+ FILTER 4, h
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ lea lq, [lq+l_strideq*2]
+ shl mask_bitsd, 2
+ sub hd, 2
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add lq, dword lstridem
+ add lq, dword lstridem
+ shl mask_bitsm, 2
+ sub dword hm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+%else
+; stack layout [32bit only]:
+; r0 - GPRs [mask_bitsm, mstridem]
+; r1 - m12/pb_mask
+; r2 - bdmulq
+cglobal lpf_v_sb_uv_16bpc, 4, 7, 8, -16 * (3 + extra_stack), \
+ dst, stride, mask, mstride, pic_reg, stride3, tmp
+ RELOC_ARGS v, 3*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+2*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+ sub r3, dword lstridem
+ mov dword lm, r3
+%endif
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mask_bitsm dword [esp+0*gprsize]
+%define mstridem dword [esp+1*gprsize]
+ mov mask_bitsm, 0x3
+ mov mstridem, mstrideq
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+1*16]
+ mova m12, m0
+%endif
+
+.loop:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .end
+
+ FILTER 4, v
+
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ add lq, 8
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add dword lm, 8
+%endif
+ add dstq, 16
+%if ARCH_X86_64
+ shl mask_bitsd, 2
+ sub wd, 2
+%else
+ shl mask_bitsm, 2
+ sub dword wm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_h_sb_uv_16bpc, 6, 11, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov hd, hm
+ shl l_strideq, 2
+%else
+; stack layout [32bit only]:
+; r0 - GPRs [mask_bitsm]
+; r1 - m12/pb_mask
+; r2 - bdmulq
+; r3-8 - p2-q2
+cglobal lpf_h_sb_uv_16bpc, 4, 7, 8, -16 * (9 + extra_stack), \
+ dst, stride, mask, l, pic_reg, stride3, tmp
+ RELOC_ARGS h, 9*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+2*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+%endif
+ sub lq, 4
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mask_bitsm dword [esp+0*gprsize]
+ mov mask_bitsm, 0x3
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+1*16]
+ mova m12, m0
+%endif
+
+.loop:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .no_filter
+
+ FILTER 4, h
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ lea lq, [lq+l_strideq*2]
+ shl mask_bitsd, 2
+ sub hd, 2
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add lq, dword lstridem
+ add lq, dword lstridem
+ shl mask_bitsm, 2
+ sub dword hm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
diff --git a/third_party/dav1d/src/x86/loopfilter_avx2.asm b/third_party/dav1d/src/x86/loopfilter_avx2.asm
new file mode 100644
index 0000000000..84696c758a
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter_avx2.asm
@@ -0,0 +1,1569 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+pb_4x1_4x5_4x9_4x13: times 2 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+pb_7_1: times 16 db 7, 1
+pb_3_1: times 16 db 3, 1
+pb_2_1: times 16 db 2, 1
+pb_m1_0: times 16 db -1, 0
+pb_m1_1: times 16 db -1, 1
+pb_m1_2: times 16 db -1, 2
+pb_1: times 32 db 1
+pb_2: times 32 db 2
+pb_3: times 32 db 3
+pb_4: times 32 db 4
+pb_16: times 32 db 16
+pb_63: times 32 db 63
+pb_64: times 32 db 64
+pb_128: times 32 db 0x80
+pb_129: times 32 db 0x81
+pb_240: times 32 db 0xf0
+pb_248: times 32 db 0xf8
+pb_254: times 32 db 0xfe
+
+pw_2048: times 16 dw 2048
+pw_4096: times 16 dw 4096
+
+pb_mask: dd 1, 2, 4, 8, 16, 32, 64, 128
+
+SECTION .text
+
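+; dst = |a - b| for unsigned bytes: saturating subtract in both directions, OR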
+%macro ABSSUB 4 ; dst, a, b, tmp
+ psubusb %1, %2, %3
+ psubusb %4, %3, %2
+ por %1, %4
+%endmacro
+
+%macro TRANSPOSE_16x4_AND_WRITE_4x32 5
+ ; transpose 16x4
+ punpcklbw m%5, m%1, m%2
+ punpckhbw m%1, m%2
+ punpcklbw m%2, m%3, m%4
+ punpckhbw m%3, m%4
+ punpcklwd m%4, m%5, m%2
+ punpckhwd m%5, m%2
+ punpcklwd m%2, m%1, m%3
+ punpckhwd m%1, m%3
+
+ ; write out
+ movd [dstq+strideq*0-2], xm%4
+ pextrd [dstq+strideq*1-2], xm%4, 1
+ pextrd [dstq+strideq*2-2], xm%4, 2
+ pextrd [dstq+stride3q-2], xm%4, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%5
+ pextrd [dstq+strideq*1-2], xm%5, 1
+ pextrd [dstq+strideq*2-2], xm%5, 2
+ pextrd [dstq+stride3q-2], xm%5, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%2
+ pextrd [dstq+strideq*1-2], xm%2, 1
+ pextrd [dstq+strideq*2-2], xm%2, 2
+ pextrd [dstq+stride3q-2], xm%2, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%1
+ pextrd [dstq+strideq*1-2], xm%1, 1
+ pextrd [dstq+strideq*2-2], xm%1, 2
+ pextrd [dstq+stride3q-2], xm%1, 3
+ lea dstq, [dstq+strideq*4]
+
+ vextracti128 xm%4, m%4, 1
+ vextracti128 xm%5, m%5, 1
+ vextracti128 xm%2, m%2, 1
+ vextracti128 xm%1, m%1, 1
+
+ movd [dstq+strideq*0-2], xm%4
+ pextrd [dstq+strideq*1-2], xm%4, 1
+ pextrd [dstq+strideq*2-2], xm%4, 2
+ pextrd [dstq+stride3q-2], xm%4, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%5
+ pextrd [dstq+strideq*1-2], xm%5, 1
+ pextrd [dstq+strideq*2-2], xm%5, 2
+ pextrd [dstq+stride3q-2], xm%5, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%2
+ pextrd [dstq+strideq*1-2], xm%2, 1
+ pextrd [dstq+strideq*2-2], xm%2, 2
+ pextrd [dstq+stride3q-2], xm%2, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%1
+ pextrd [dstq+strideq*1-2], xm%1, 1
+ pextrd [dstq+strideq*2-2], xm%1, 2
+ pextrd [dstq+stride3q-2], xm%1, 3
+ lea dstq, [dstq+strideq*4]
+%endmacro
+
+%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
+%if %1 == 0
+ mova %3, m15
+%endif
+
+ ; input in m0-15
+ punpcklbw m15, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+ punpcklbw m5, m6, m7
+ punpckhbw m6, m7
+ punpcklbw m7, m8, m9
+ punpckhbw m8, m9
+ punpcklbw m9, m10, m11
+ punpckhbw m10, m11
+ punpcklbw m11, m12, m13
+ punpckhbw m12, m13
+ mova m13, %3
+ mova %3, m12
+ punpcklbw m12, m14, m13
+ punpckhbw m13, m14, m13
+
+ ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13
+ punpcklwd m14, m15, m1
+ punpckhwd m15, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m7, m9
+ punpckhwd m7, m9
+ punpcklwd m9, m8, m10
+ punpckhwd m8, m10
+ punpcklwd m10, m11, m12
+ punpckhwd m11, m12
+ mova m12, %3
+ mova %3, m11
+ punpcklwd m11, m12, m13
+ punpckhwd m12, m13
+
+ ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12
+ punpckldq m13, m14, m2
+ punpckhdq m14, m2
+ punpckldq m2, m15, m3
+ punpckhdq m15, m3
+ punpckldq m3, m1, m5
+ punpckhdq m1, m5
+ punpckldq m5, m0, m4
+ punpckhdq m0, m4
+ punpckldq m4, m6, m10
+ punpckhdq m6, m10
+ punpckldq m10, m9, m11
+ punpckhdq m9, m11
+ punpckldq m11, m8, m12
+ punpckhdq m8, m12
+ mova m12, %3
+ mova %3, m8
+ punpckldq m8, m7, m12
+ punpckhdq m7, m12
+
+ ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3
+ punpcklqdq m12, m13, m4
+ punpckhqdq m13, m4
+ punpcklqdq m4, m14, m6
+ punpckhqdq m14, m6
+ punpcklqdq m6, m2, m8
+ punpckhqdq m2, m8
+ punpcklqdq m8, m15, m7
+ punpckhqdq m15, m7
+ punpcklqdq m7, m3, m10
+ punpckhqdq m3, m10
+ punpcklqdq m10, m1, m9
+ punpckhqdq m1, m9
+ punpcklqdq m9, m5, m11
+ punpckhqdq m5, m11
+ mova m11, %3
+ mova %3, m12
+ punpcklqdq m12, m0, m11
+ punpckhqdq m0, m11
+%if %2 == 0
+ mova m11, %3
+%endif
+
+ ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0
+ SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15
+ SWAP 3, 14, 12, 9
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+ ; load data
+%ifidn %2, v
+%if %1 == 4
+ lea tmpq, [dstq+mstrideq*2]
+ mova m3, [tmpq+strideq*0] ; p1
+ mova m4, [tmpq+strideq*1] ; p0
+ mova m5, [tmpq+strideq*2] ; q0
+ mova m6, [tmpq+stride3q] ; q1
+%else
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+ lea tmpq, [dstq+mstrideq*4]
+%if %1 != 6
+ mova m12, [tmpq+strideq*0]
+%endif
+ mova m13, [tmpq+strideq*1]
+ mova m3, [tmpq+strideq*2]
+ mova m4, [tmpq+stride3q]
+ mova m5, [dstq+strideq*0]
+ mova m6, [dstq+strideq*1]
+ mova m14, [dstq+strideq*2]
+%if %1 != 6
+ mova m15, [dstq+stride3q]
+%endif
+%endif
+%else
+ ; load lines
+%if %1 == 4
+ movd xm3, [dstq+strideq*0-2]
+ movd xm4, [dstq+strideq*1-2]
+ movd xm5, [dstq+strideq*2-2]
+ movd xm6, [dstq+stride3q -2]
+ lea tmpq, [dstq+strideq*4]
+ pinsrd xm3, [tmpq+strideq*0-2], 2
+ pinsrd xm4, [tmpq+strideq*1-2], 2
+ pinsrd xm5, [tmpq+strideq*2-2], 2
+ pinsrd xm6, [tmpq+stride3q -2], 2
+ lea tmpq, [tmpq+strideq*4]
+ pinsrd xm3, [tmpq+strideq*0-2], 1
+ pinsrd xm4, [tmpq+strideq*1-2], 1
+ pinsrd xm5, [tmpq+strideq*2-2], 1
+ pinsrd xm6, [tmpq+stride3q -2], 1
+ lea tmpq, [tmpq+strideq*4]
+ pinsrd xm3, [tmpq+strideq*0-2], 3
+ pinsrd xm4, [tmpq+strideq*1-2], 3
+ pinsrd xm5, [tmpq+strideq*2-2], 3
+ pinsrd xm6, [tmpq+stride3q -2], 3
+ lea tmpq, [tmpq+strideq*4]
+ movd xm12, [tmpq+strideq*0-2]
+ movd xm13, [tmpq+strideq*1-2]
+ movd xm14, [tmpq+strideq*2-2]
+ movd xm15, [tmpq+stride3q -2]
+ lea tmpq, [tmpq+strideq*4]
+ pinsrd xm12, [tmpq+strideq*0-2], 2
+ pinsrd xm13, [tmpq+strideq*1-2], 2
+ pinsrd xm14, [tmpq+strideq*2-2], 2
+ pinsrd xm15, [tmpq+stride3q -2], 2
+ lea tmpq, [tmpq+strideq*4]
+ pinsrd xm12, [tmpq+strideq*0-2], 1
+ pinsrd xm13, [tmpq+strideq*1-2], 1
+ pinsrd xm14, [tmpq+strideq*2-2], 1
+ pinsrd xm15, [tmpq+stride3q -2], 1
+ lea tmpq, [tmpq+strideq*4]
+ pinsrd xm12, [tmpq+strideq*0-2], 3
+ pinsrd xm13, [tmpq+strideq*1-2], 3
+ pinsrd xm14, [tmpq+strideq*2-2], 3
+ pinsrd xm15, [tmpq+stride3q -2], 3
+ vinserti128 m3, xm12, 1
+ vinserti128 m4, xm13, 1
+ vinserti128 m5, xm14, 1
+ vinserti128 m6, xm15, 1
+
+ ; transpose 4x16
+ ; xm3: A-D0,A-D8,A-D4,A-D12
+ ; xm4: A-D1,A-D9,A-D5,A-D13
+ ; xm5: A-D2,A-D10,A-D6,A-D14
+ ; xm6: A-D3,A-D11,A-D7,A-D15
+ punpcklbw m7, m3, m4
+ punpckhbw m3, m4
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9
+ ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13
+ ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11
+ ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15
+ punpcklwd m6, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m3, m5
+ punpckhwd m3, m5
+ ; xm6: A0-3,B0-3,C0-3,D0-3
+ ; xm7: A8-11,B8-11,C8-11,D8-11
+ ; xm4: A4-7,B4-7,C4-7,D4-7
+ ; xm3: A12-15,B12-15,C12-15,D12-15
+ punpckldq m5, m6, m4
+ punpckhdq m6, m4
+ punpckldq m4, m7, m3
+ punpckhdq m7, m3
+ ; xm5: A0-7,B0-7
+ ; xm6: C0-7,D0-7
+ ; xm4: A8-15,B8-15
+ ; xm7: C8-15,D8-15
+ punpcklqdq m3, m5, m4
+ punpckhqdq m4, m5, m4
+ punpcklqdq m5, m6, m7
+ punpckhqdq m6, m7
+ ; xm3: A0-15
+ ; xm5: B0-15
+ ; xm4: C0-15
+ ; xm6: D0-15
+%elif %1 == 6 || %1 == 8
+ movq xm3, [dstq+strideq*0-%1/2]
+ movq xm4, [dstq+strideq*1-%1/2]
+ movq xm5, [dstq+strideq*2-%1/2]
+ movq xm6, [dstq+stride3q -%1/2]
+ lea tmpq, [dstq+strideq*8]
+ movhps xm3, [tmpq+strideq*0-%1/2]
+ movhps xm4, [tmpq+strideq*1-%1/2]
+ movhps xm5, [tmpq+strideq*2-%1/2]
+ movhps xm6, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ movq xm7, [tmpq+strideq*0-%1/2]
+ movq xm8, [tmpq+strideq*1-%1/2]
+ movq xm9, [tmpq+strideq*2-%1/2]
+ movq xm11, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ movhps xm7, [tmpq+strideq*0-%1/2]
+ movhps xm8, [tmpq+strideq*1-%1/2]
+ movhps xm9, [tmpq+strideq*2-%1/2]
+ movhps xm11, [tmpq+stride3q -%1/2]
+ vinserti128 m3, xm7, 1
+ vinserti128 m4, xm8, 1
+ vinserti128 m5, xm9, 1
+ vinserti128 m6, xm11, 1
+ lea tmpq, [dstq+strideq*4]
+ movq xm12, [tmpq+strideq*0-%1/2]
+ movq xm13, [tmpq+strideq*1-%1/2]
+ movq xm14, [tmpq+strideq*2-%1/2]
+ movq xm15, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ movhps xm12, [tmpq+strideq*0-%1/2]
+ movhps xm13, [tmpq+strideq*1-%1/2]
+ movhps xm14, [tmpq+strideq*2-%1/2]
+ movhps xm15, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ movq xm7, [tmpq+strideq*0-%1/2]
+ movq xm8, [tmpq+strideq*1-%1/2]
+ movq xm9, [tmpq+strideq*2-%1/2]
+ movq xm11, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ movhps xm7, [tmpq+strideq*0-%1/2]
+ movhps xm8, [tmpq+strideq*1-%1/2]
+ movhps xm9, [tmpq+strideq*2-%1/2]
+ movhps xm11, [tmpq+stride3q -%1/2]
+ vinserti128 m12, xm7, 1
+ vinserti128 m13, xm8, 1
+ vinserti128 m14, xm9, 1
+ vinserti128 m15, xm11, 1
+
+ ; transpose 8x16
+ ; xm3: A-H0,A-H8
+ ; xm4: A-H1,A-H9
+ ; xm5: A-H2,A-H10
+ ; xm6: A-H3,A-H11
+ ; xm12: A-H4,A-H12
+ ; xm13: A-H5,A-H13
+ ; xm14: A-H6,A-H14
+ ; xm15: A-H7,A-H15
+ punpcklbw m7, m3, m4
+ punpckhbw m3, m4
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ punpcklbw m6, m12, m13
+ punpckhbw m12, m13
+ punpcklbw m13, m14, m15
+ punpckhbw m14, m15
+ ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
+ ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
+ ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
+ ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
+ ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
+ ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
+ ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
+ ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
+ punpcklwd m15, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m6, m13
+ punpckhwd m6, m13
+ punpcklwd m13, m12, m14
+ punpckhwd m12, m14
+ ; xm15: A0-3,B0-3,C0-3,D0-3
+ ; xm7: E0-3,F0-3,G0-3,H0-3
+ ; xm4: A8-11,B8-11,C8-11,D8-11
+ ; xm3: E8-11,F8-11,G8-11,H8-11
+ ; xm5: A4-7,B4-7,C4-7,D4-7
+ ; xm6: E4-7,F4-7,G4-7,H4-7
+ ; xm13: A12-15,B12-15,C12-15,D12-15
+ ; xm12: E12-15,F12-15,G12-15,H12-15
+ punpckldq m14, m15, m5
+ punpckhdq m15, m5
+ punpckldq m5, m7, m6
+%if %1 != 6
+ punpckhdq m7, m6
+%endif
+ punpckldq m6, m4, m13
+ punpckhdq m4, m13
+ punpckldq m13, m3, m12
+%if %1 != 6
+ punpckhdq m12, m3, m12
+%endif
+ ; xm14: A0-7,B0-7
+ ; xm15: C0-7,D0-7
+ ; xm5: E0-7,F0-7
+ ; xm7: G0-7,H0-7
+ ; xm6: A8-15,B8-15
+ ; xm4: C8-15,D8-15
+ ; xm13: E8-15,F8-15
+ ; xm12: G8-15,H8-15
+ punpcklqdq m3, m14, m6
+ punpckhqdq m14, m6
+ punpckhqdq m6, m15, m4
+ punpcklqdq m15, m4
+ punpcklqdq m4, m5, m13
+ punpckhqdq m13, m5, m13
+%if %1 == 8
+ punpcklqdq m5, m7, m12
+ punpckhqdq m12, m7, m12
+ ; xm3: A0-15
+ ; xm14: B0-15
+ ; xm15: C0-15
+ ; xm6: D0-15
+ ; xm4: E0-15
+ ; xm13: F0-15
+ ; xm5: G0-15
+ ; xm12: H0-15
+ SWAP 12, 3, 15
+ SWAP 13, 14, 5, 4, 6
+ ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15
+%else
+ SWAP 13, 3, 14
+ SWAP 6, 4, 15, 5
+ ; 3,14,15,6,4,13 -> 13,3,4,5,6,14
+%endif
+%else
+ ; load and 16x16 transpose. We only use 14 pixels but we'll need the
+ ; remainder at the end for the second transpose
+ movu xm0, [dstq+strideq*0-8]
+ movu xm1, [dstq+strideq*1-8]
+ movu xm2, [dstq+strideq*2-8]
+ movu xm3, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4]
+ movu xm4, [tmpq+strideq*0-8]
+ movu xm5, [tmpq+strideq*1-8]
+ movu xm6, [tmpq+strideq*2-8]
+ movu xm7, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ movu xm8, [tmpq+strideq*0-8]
+ movu xm9, [tmpq+strideq*1-8]
+ movu xm10, [tmpq+strideq*2-8]
+ movu xm11, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ movu xm12, [tmpq+strideq*0-8]
+ movu xm13, [tmpq+strideq*1-8]
+ movu xm14, [tmpq+strideq*2-8]
+ movu xm15, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m0, [tmpq+strideq*0-8], 1
+ vinserti128 m1, [tmpq+strideq*1-8], 1
+ vinserti128 m2, [tmpq+strideq*2-8], 1
+ vinserti128 m3, [tmpq+stride3q -8], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m4, [tmpq+strideq*0-8], 1
+ vinserti128 m5, [tmpq+strideq*1-8], 1
+ vinserti128 m6, [tmpq+strideq*2-8], 1
+ vinserti128 m7, [tmpq+stride3q -8], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m8, [tmpq+strideq*0-8], 1
+ vinserti128 m9, [tmpq+strideq*1-8], 1
+ vinserti128 m10, [tmpq+strideq*2-8], 1
+ vinserti128 m11, [tmpq+stride3q -8], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m12, [tmpq+strideq*0-8], 1
+ vinserti128 m13, [tmpq+strideq*1-8], 1
+ vinserti128 m14, [tmpq+strideq*2-8], 1
+ vinserti128 m15, [tmpq+stride3q -8], 1
+
+ TRANSPOSE_16X16B 0, 1, [rsp+11*32]
+ mova [rsp+12*32], m1
+ mova [rsp+13*32], m2
+ mova [rsp+14*32], m3
+ mova [rsp+15*32], m12
+ mova [rsp+16*32], m13
+ mova [rsp+17*32], m14
+ mova [rsp+18*32], m15
+ ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
+ SWAP 12, 4, 7
+ SWAP 13, 5, 8
+ SWAP 3, 6, 9
+ SWAP 10, 14
+ SWAP 11, 15
+%endif
+%endif
+
+ ; load L/E/I/H
+%ifidn %2, v
+ movu m1, [lq]
+ movu m0, [lq+l_strideq]
+%else
+ movq xm1, [lq]
+ movq xm2, [lq+l_strideq*2]
+ movhps xm1, [lq+l_strideq]
+ movhps xm2, [lq+l_stride3q]
+ lea lq, [lq+l_strideq*4]
+ movq xm10, [lq]
+ movq xm0, [lq+l_strideq*2]
+ movhps xm10, [lq+l_strideq]
+ movhps xm0, [lq+l_stride3q]
+ lea lq, [lq+l_strideq*4]
+ vinserti128 m1, xm10, 1
+ vinserti128 m2, xm0, 1
+ shufps m0, m1, m2, q3131
+ shufps m1, m2, q2020
+%endif
+ pxor m2, m2
+ pcmpeqb m10, m2, m0
+ pand m1, m10
+ por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1]
+ pcmpeqb m10, m2, m0 ; !L
+ psrlq m2, m0, [lutq+128]
+ pand m2, [pb_63]
+ vpbroadcastb m1, [lutq+136]
+ pminub m2, m1
+ pmaxub m2, [pb_1] ; I
+ pand m1, m0, [pb_240]
+ psrlq m1, 4 ; H
+ paddb m0, [pb_2]
+ paddb m0, m0
+ paddb m0, m2 ; E
+ pxor m1, [pb_128]
+ pxor m2, [pb_128]
+ pxor m0, [pb_128]
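+ ; In scalar terms the thresholds computed above are (per 4-px column,
+ ; with sharp0/sharp1 read from lut+128/lut+136):
+ ;   I = clip((L >> sharp0) & 63, 1, sharp1)
+ ;   E = (L + 2) * 2 + I
+ ;   H = L >> 4
+ ; The pb_128 xors only re-bias the unsigned bytes so that the signed
+ ; byte compares below can be used.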
+
+ ABSSUB m8, m3, m4, m9 ; abs(p1-p0)
+ pmaxub m8, m10
+ ABSSUB m9, m5, m6, m10 ; abs(q1-q0)
+ pmaxub m8, m9
+%if %1 == 4
+ pxor m8, [pb_128]
+ pcmpgtb m7, m8, m1 ; hev
+%else
+ pxor m7, m8, [pb_128]
+ pcmpgtb m7, m1 ; hev
+
+%if %1 == 6
+ ABSSUB m9, m13, m4, m10 ; abs(p2-p0)
+ pmaxub m9, m8
+%else
+ ABSSUB m9, m12, m4, m10 ; abs(p3-p0)
+ pmaxub m9, m8
+ ABSSUB m10, m13, m4, m11 ; abs(p2-p0)
+ pmaxub m9, m10
+%endif
+ ABSSUB m10, m5, m14, m11 ; abs(q2-q0)
+ pmaxub m9, m10
+%if %1 != 6
+ ABSSUB m10, m5, m15, m11 ; abs(q3-q0)
+ pmaxub m9, m10
+%endif
+ pxor m9, [pb_128]
+ pcmpgtb m9, [pb_129] ; !flat8in
+
+%if %1 == 6
+ ABSSUB m10, m13, m3, m1 ; abs(p2-p1)
+%else
+ ABSSUB m10, m12, m13, m11 ; abs(p3-p2)
+ ABSSUB m11, m13, m3, m1 ; abs(p2-p1)
+ pmaxub m10, m11
+ ABSSUB m11, m14, m15, m1 ; abs(q3-q2)
+ pmaxub m10, m11
+%endif
+ ABSSUB m11, m14, m6, m1 ; abs(q2-q1)
+ pmaxub m10, m11
+%if %1 == 16
+ vpbroadcastd m11, [maskq+8]
+ vpbroadcastd m1, [maskq+4]
+ por m11, m1
+ pand m11, [pb_mask]
+ pcmpeqd m11, [pb_mask]
+ pand m10, m11
+%else
+ vpbroadcastd m11, [maskq+4]
+ pand m11, [pb_mask]
+ pcmpeqd m11, [pb_mask]
+ pand m10, m11 ; only apply fm-wide to wd>4 blocks
+%endif
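+ ; vmask has one bit per 4-px column; broadcasting the 32-bit word and
+ ; comparing it with pb_mask (one bit per dword lane) expands those
+ ; bits into a per-column byte mask, so the wide-filter terms only
+ ; contribute for columns that actually use wd > 4.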
+ pmaxub m8, m10
+
+ pxor m8, [pb_128]
+%endif
+ pcmpgtb m8, m2
+
+ ABSSUB m10, m3, m6, m11 ; abs(p1-q1)
+ ABSSUB m11, m4, m5, m2 ; abs(p0-q0)
+ paddusb m11, m11
+ pand m10, [pb_254]
+ psrlq m10, 1
+ paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ pxor m10, [pb_128]
+ pcmpgtb m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+ por m8, m10
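+ ; m8 now holds the inverted filter mask: a column passes fm only if
+ ; max(|p1-p0|, |q1-q0|, wide diffs) <= I and
+ ; |p0-q0|*2 + (|p1-q1|>>1) <= E.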
+
+%if %1 == 16
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1]
+%else
+ mova m0, [rsp+12*32]
+%endif
+ ABSSUB m1, m0, m4, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+13*32]
+%endif
+ ABSSUB m2, m0, m4, m10
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+stride3q]
+%else
+ mova m0, [rsp+14*32]
+%endif
+ ABSSUB m2, m0, m4, m10
+ pmaxub m1, m2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+ mova m0, [tmpq+strideq*0]
+%else
+ mova m0, [rsp+15*32]
+%endif
+ ABSSUB m2, m0, m5, m10
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*1]
+%else
+ mova m0, [rsp+16*32]
+%endif
+ ABSSUB m2, m0, m5, m10
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+17*32]
+%endif
+ ABSSUB m2, m0, m5, m10
+ pmaxub m1, m2
+ pxor m1, [pb_128]
+ pcmpgtb m1, [pb_129] ; !flat8out
+ por m1, m9 ; !flat8in | !flat8out
+ vpbroadcastd m2, [maskq+8]
+ pand m10, m2, [pb_mask]
+ pcmpeqd m10, [pb_mask]
+ pandn m1, m10 ; flat16
+ pandn m1, m8, m1 ; flat16 & fm
+
+ vpbroadcastd m10, [maskq+4]
+ por m10, m2
+ pand m2, m10, [pb_mask]
+ pcmpeqd m2, [pb_mask]
+ pandn m9, m2 ; flat8in
+ pandn m9, m8, m9
+ vpbroadcastd m2, [maskq+0]
+ por m2, m10
+ pand m2, [pb_mask]
+ pcmpeqd m2, [pb_mask]
+ pandn m8, m2
+ pandn m8, m9, m8 ; fm & !flat8 & !flat16
+ pandn m9, m1, m9 ; flat8 & !flat16
+%elif %1 != 4
+ vpbroadcastd m0, [maskq+4]
+ pand m2, m0, [pb_mask]
+ pcmpeqd m2, [pb_mask]
+ pandn m9, m2
+ pandn m9, m8, m9 ; flat8 & fm
+ vpbroadcastd m2, [maskq+0]
+ por m0, m2
+ pand m0, [pb_mask]
+ pcmpeqd m0, [pb_mask]
+ pandn m8, m0
+ pandn m8, m9, m8 ; fm & !flat8
+%else
+ vpbroadcastd m0, [maskq+0]
+ pand m0, [pb_mask]
+ pcmpeqd m0, [pb_mask]
+ pandn m8, m0 ; fm
+%endif
+
+ ; short filter
+
+ pxor m3, [pb_128]
+ pxor m6, [pb_128]
+ psubsb m10, m3, m6 ; iclip_diff(p1-q1)
+ pand m10, m7 ; f=iclip_diff(p1-q1)&hev
+ pxor m4, [pb_128]
+ pxor m5, [pb_128]
+ psubsb m11, m5, m4
+ paddsb m10, m11
+ paddsb m10, m11
+ paddsb m10, m11 ; f=iclip_diff(3*(q0-p0)+f)
+ pand m8, m10 ; f&=fm
+ paddsb m10, m8, [pb_3]
+ paddsb m8, [pb_4]
+ pand m10, [pb_248]
+ pand m8, [pb_248]
+ psrlq m10, 3
+ psrlq m8, 3
+ pxor m10, [pb_16]
+ pxor m8, [pb_16]
+ psubb m10, [pb_16] ; f2
+ psubb m8, [pb_16] ; f1
+ paddsb m4, m10
+ psubsb m5, m8
+ pxor m4, [pb_128]
+ pxor m5, [pb_128]
+
+ pxor m8, [pb_128]
+ pxor m10, m10
+ pavgb m8, m10 ; f=(f1+1)>>1
+ psubb m8, [pb_64]
+ pandn m8, m7, m8 ; f&=!hev
+ paddsb m3, m8
+ psubsb m6, m8
+ pxor m3, [pb_128]
+ pxor m6, [pb_128]
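+ ; Scalar sketch of the short filter applied above (all arithmetic on
+ ; signed bytes, hence the pb_128 biasing; adds/subs saturate):
+ ;   f  = iclip_diff(p1 - q1) & hev
+ ;   f  = iclip_diff(3 * (q0 - p0) + f) & fm
+ ;   f1 = iclip_diff(f + 4) >> 3, f2 = iclip_diff(f + 3) >> 3
+ ;   p0 += f2, q0 -= f1
+ ;   if (!hev) { f = (f1 + 1) >> 1; p1 += f; q1 -= f; }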
+
+%if %1 == 16
+ ; flat16 filter
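+ ; Every flat16 output below is produced from one 16-weight running
+ ; sum per register half: each step emits (sum + 8) >> 4 (pmulhrsw by
+ ; pw_2048) and then updates the sum by dropping the oldest tap and
+ ; adding the newest one (pmaddubsw by pb_m1_1), starting from
+ ;   sum = p6*7 + p5*2 + p4*2 + p3 + p2 + p1 + p0 + q0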
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1] ; p6
+ mova m2, [tmpq+strideq*2] ; p5
+ mova m7, [tmpq+stride3q] ; p4
+%else
+ mova m0, [rsp+12*32]
+ mova m2, [rsp+13*32]
+ mova m7, [rsp+14*32]
+%endif
+
+ mova [rsp+0*32], m9
+ mova [rsp+1*32], m14
+ mova [rsp+2*32], m15
+
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
+ ; write -6
+ punpcklbw m14, m0, m12
+ punpckhbw m15, m0, m12
+ pmaddubsw m10, m14, [pb_7_1]
+ pmaddubsw m11, m15, [pb_7_1] ; p6*7+p3
+ punpcklbw m8, m2, m7
+ punpckhbw m9, m2, m7
+ pmaddubsw m8, [pb_2]
+ pmaddubsw m9, [pb_2]
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3
+ punpcklbw m8, m13, m3
+ punpckhbw m9, m13, m3
+ pmaddubsw m8, [pb_1]
+ pmaddubsw m9, [pb_1]
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1
+ punpcklbw m8, m4, m5
+ punpckhbw m9, m4, m5
+ pmaddubsw m8, [pb_1]
+ pmaddubsw m9, [pb_1]
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ pand m8, m1
+ pandn m9, m1, m2
+ por m8, m9
+%ifidn %2, v
+ mova [tmpq+strideq*2], m8 ; p5
+%else
+ mova [rsp+13*32], m8
+%endif
+
+ ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
+ ; write -5
+ pmaddubsw m14, [pb_m1_1]
+ pmaddubsw m15, [pb_m1_1]
+ paddw m10, m14
+ paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+ punpcklbw m8, m0, m6
+ punpckhbw m9, m0, m6
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ mova [rsp+3*32], m8
+ mova [rsp+4*32], m9
+ paddw m10, m8
+ paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m7, m8, m1
+%ifidn %2, v
+ mova [tmpq+stride3q], m8 ; p4
+%else
+ mova [rsp+14*32], m8
+%endif
+
+ ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
+ ; write -4
+ mova m14, [rsp+1*32]
+ punpcklbw m8, m0, m13
+ punpckhbw m9, m0, m13
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
+ punpcklbw m8, m2, m14
+ punpckhbw m2, m14
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m2, [pb_m1_1]
+ mova [rsp+1*32], m8
+ paddw m10, m8
+ paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m12, m8, m1
+%ifidn %2, v
+ mova [tmpq+strideq*4], m8 ; p3
+%else
+ mova [rsp+19*32], m8
+%endif
+
+ ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
+ ; write -3
+ mova m15, [rsp+2*32]
+ punpcklbw m8, m0, m3
+ punpckhbw m9, m0, m3
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
+ punpcklbw m8, m7, m15
+ punpckhbw m7, m15
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m7, [pb_m1_1]
+ mova [rsp+2*32], m8
+ paddw m10, m8
+ paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m13, m8, m1
+ mova [rsp+6*32], m8 ; don't clobber p2/m13 since we need it in F
+
+ ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
+ ; write -2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+%endif
+ punpcklbw m8, m0, m4
+ punpckhbw m9, m0, m4
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
+%ifidn %2, v
+ mova m9, [tmpq+strideq*0] ; q4
+%else
+ mova m9, [rsp+15*32]
+%endif
+ punpcklbw m8, m12, m9
+ punpckhbw m9, m12, m9
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ mova [rsp+7*32], m8
+ mova [rsp+5*32], m9
+ paddw m10, m8
+ paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m3, m8, m1
+ mova [rsp+8*32], m8 ; don't clobber p1/m3 since we need it in G
+
+ ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
+ ; write -1
+%ifidn %2, v
+ mova m9, [tmpq+strideq*1] ; q5
+%else
+ mova m9, [rsp+16*32]
+%endif
+ punpcklbw m8, m0, m5
+ punpckhbw m0, m5
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m0, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+ punpcklbw m0, m13, m9
+ punpckhbw m9, m13, m9
+ mova m13, [rsp+6*32]
+ pmaddubsw m0, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ mova [rsp+ 9*32], m0
+ mova [rsp+10*32], m9
+ paddw m10, m0
+ paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
+ pmulhrsw m0, m10, [pw_2048]
+ pmulhrsw m8, m11, [pw_2048]
+ packuswb m0, m8
+ vpblendvb m0, m4, m0, m1
+ mova [rsp+6*32], m0 ; don't clobber p0/m4 since we need it in H
+
+ ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
+ ; write +0
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2] ; q6
+%else
+ mova m0, [rsp+17*32]
+%endif
+ paddw m10, [rsp+3*32]
+ paddw m11, [rsp+4*32] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
+ punpcklbw m8, m3, m0
+ punpckhbw m9, m3, m0
+ mova m3, [rsp+8*32]
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ mova [rsp+3*32], m8
+ mova [rsp+4*32], m9
+ paddw m10, m8
+ paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m5, m8, m1
+ mova [rsp+8*32], m8 ; don't clobber q0/m5 since we need it in I
+
+ ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
+ ; write +1
+ paddw m10, [rsp+1*32]
+ paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+ punpcklbw m8, m4, m0
+ punpckhbw m2, m4, m0
+ mova m4, [rsp+6*32]
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m2, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
+ pmulhrsw m2, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m2, m9
+ vpblendvb m2, m6, m2, m1 ; don't clobber q1/m6 since we need it in K
+
+ ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
+ ; write +2
+ paddw m10, [rsp+2*32]
+ paddw m11, m7 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ punpcklbw m8, m5, m0
+ punpckhbw m9, m5, m0
+ mova m5, [rsp+8*32]
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
+ pmulhrsw m7, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m7, m9
+ vpblendvb m7, m14, m7, m1 ; don't clobber q2/m14 since we need it in K
+
+ ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
+ ; write +3
+ paddw m10, [rsp+7*32]
+ paddw m11, [rsp+5*32] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+ punpcklbw m8, m6, m0
+ punpckhbw m9, m6, m0
+ SWAP 2, 6
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m15, m8, m1
+%ifidn %2, v
+ mova [tmpq+mstrideq], m8 ; q3
+%else
+ mova [rsp+20*32], m8
+%endif
+
+ ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
+ ; write +4
+ paddw m10, [rsp+ 9*32]
+ paddw m11, [rsp+10*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+ punpcklbw m8, m14, m0
+ punpckhbw m9, m14, m0
+ SWAP 14, 7
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+%ifidn %2, v
+ mova m9, [tmpq+strideq*0]
+%else
+ mova m9, [rsp+15*32]
+%endif
+ vpblendvb m8, m9, m8, m1
+%ifidn %2, v
+ mova [tmpq+strideq*0], m8 ; q4
+%else
+ mova [rsp+15*32], m8
+%endif
+
+ ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
+ ; write +5
+ paddw m10, [rsp+3*32]
+ paddw m11, [rsp+4*32] ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
+ punpcklbw m8, m15, m0
+ punpckhbw m9, m15, m0
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
+ pmulhrsw m10, [pw_2048]
+ pmulhrsw m11, [pw_2048]
+ packuswb m10, m11
+%ifidn %2, v
+ mova m11, [tmpq+strideq*1]
+%else
+ mova m11, [rsp+16*32]
+%endif
+ vpblendvb m10, m11, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*1], m10 ; q5
+%else
+ mova [rsp+16*32], m10
+%endif
+
+ mova m9, [rsp+0*32]
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*4]
+%endif
+%endif
+%if %1 >= 8
+ ; flat8 filter
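+ ; Scalar reference for the flat8 outputs computed below, each being
+ ; (sum + 4) >> 3 with the sum updated incrementally:
+ ;   p2' = 3*p3 + 2*p2 + p1 + p0 + q0
+ ;   p1' = 2*p3 + p2 + 2*p1 + p0 + q0 + q1
+ ;   p0' = p3 + p2 + p1 + 2*p0 + q0 + q1 + q2
+ ;   q0' = p2 + p1 + p0 + 2*q0 + q1 + q2 + q3
+ ;   q1' = p1 + p0 + q0 + 2*q1 + q2 + 2*q3
+ ;   q2' = p0 + q0 + q1 + 2*q2 + 3*q3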
+ punpcklbw m0, m12, m3
+ punpckhbw m1, m12, m3
+ pmaddubsw m2, m0, [pb_3_1]
+ pmaddubsw m7, m1, [pb_3_1] ; 3 * p3 + p1
+ punpcklbw m8, m13, m4
+ punpckhbw m11, m13, m4
+ pmaddubsw m8, [pb_2_1]
+ pmaddubsw m11, [pb_2_1]
+ paddw m2, m8
+ paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0
+ punpcklbw m8, m5, [pb_4]
+ punpckhbw m11, m5, [pb_4]
+ pmaddubsw m8, [pb_1]
+ pmaddubsw m11, [pb_1]
+ paddw m2, m8
+ paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendvb m10, m13, m8, m9 ; p2
+%ifidn %2, v
+ mova [tmpq+strideq*1], m10 ; p2
+%endif
+
+ pmaddubsw m8, m0, [pb_m1_1]
+ pmaddubsw m11, m1, [pb_m1_1]
+ paddw m2, m8
+ paddw m7, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m11, [pb_m1_1]
+ paddw m2, m8
+ paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendvb m8, m3, m8, m9 ; p1
+%ifidn %2, v
+ mova [tmpq+strideq*2], m8 ; p1
+%else
+ mova [rsp+0*32], m8
+%endif
+
+ pmaddubsw m0, [pb_1]
+ pmaddubsw m1, [pb_1]
+ psubw m2, m0
+ psubw m7, m1
+ punpcklbw m8, m4, m14
+ punpckhbw m11, m4, m14
+ pmaddubsw m8, [pb_1]
+ pmaddubsw m11, [pb_1]
+ paddw m2, m8
+ paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendvb m8, m4, m8, m9 ; p0
+%ifidn %2, v
+ mova [tmpq+stride3q ], m8 ; p0
+%else
+ mova [rsp+1*32], m8
+%endif
+
+ punpcklbw m0, m5, m15
+ punpckhbw m1, m5, m15
+ pmaddubsw m8, m0, [pb_1]
+ pmaddubsw m11, m1, [pb_1]
+ paddw m2, m8
+ paddw m7, m11
+ punpcklbw m8, m4, m12
+ punpckhbw m11, m4, m12
+ pmaddubsw m8, [pb_1]
+ pmaddubsw m11, [pb_1]
+ psubw m2, m8
+ psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendvb m11, m5, m8, m9 ; q0
+%ifidn %2, v
+ mova [dstq+strideq*0], m11 ; q0
+%endif
+
+ pmaddubsw m0, [pb_m1_1]
+ pmaddubsw m1, [pb_m1_1]
+ paddw m2, m0
+ paddw m7, m1
+ punpcklbw m8, m13, m6
+ punpckhbw m13, m6
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m13, [pb_m1_1]
+ paddw m2, m8
+ paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
+ psrlw m8, m2, 3
+ psrlw m13, m7, 3
+ packuswb m8, m13
+ vpblendvb m13, m6, m8, m9 ; q1
+%ifidn %2, v
+ mova [dstq+strideq*1], m13 ; q1
+%endif
+
+ punpcklbw m0, m3, m6
+ punpckhbw m1, m3, m6
+ pmaddubsw m0, [pb_1]
+ pmaddubsw m1, [pb_1]
+ psubw m2, m0
+ psubw m7, m1
+ punpcklbw m0, m14, m15
+ punpckhbw m1, m14, m15
+ pmaddubsw m0, [pb_1]
+ pmaddubsw m1, [pb_1]
+ paddw m2, m0
+ paddw m7, m1 ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
+ psrlw m2, 3
+ psrlw m7, 3
+ packuswb m2, m7
+ vpblendvb m2, m14, m2, m9 ; q2
+%ifidn %2, v
+ mova [dstq+strideq*2], m2 ; q2
+%else
+ mova m0, [rsp+0*32]
+ mova m1, [rsp+1*32]
+%if %1 == 8
+ ; 16x8 transpose
+ punpcklbw m3, m12, m10
+ punpckhbw m12, m10
+ punpcklbw m10, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m11, m13
+ punpckhbw m11, m13
+ punpcklbw m13, m2, m15
+ punpckhbw m2, m15
+
+ punpcklwd m15, m3, m10
+ punpckhwd m3, m10
+ punpcklwd m10, m12, m0
+ punpckhwd m12, m0
+ punpcklwd m0, m1, m13
+ punpckhwd m1, m13
+ punpcklwd m13, m11, m2
+ punpckhwd m11, m2
+
+ punpckldq m2, m15, m0
+ punpckhdq m15, m0
+ punpckldq m0, m3, m1
+ punpckhdq m3, m1
+ punpckldq m1, m10, m13
+ punpckhdq m10, m13
+ punpckldq m13, m12, m11
+ punpckhdq m12, m11
+
+ ; write 8x32
+ movq [dstq+strideq*0-4], xm2
+ movhps [dstq+strideq*1-4], xm2
+ movq [dstq+strideq*2-4], xm15
+ movhps [dstq+stride3q -4], xm15
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm0
+ movhps [dstq+strideq*1-4], xm0
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm1
+ movhps [dstq+strideq*1-4], xm1
+ movq [dstq+strideq*2-4], xm10
+ movhps [dstq+stride3q -4], xm10
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm13
+ movhps [dstq+strideq*1-4], xm13
+ movq [dstq+strideq*2-4], xm12
+ movhps [dstq+stride3q -4], xm12
+ lea dstq, [dstq+strideq*4]
+
+ vextracti128 xm2, m2, 1
+ vextracti128 xm15, m15, 1
+ vextracti128 xm0, m0, 1
+ vextracti128 xm3, m3, 1
+ vextracti128 xm1, m1, 1
+ vextracti128 xm10, m10, 1
+ vextracti128 xm13, m13, 1
+ vextracti128 xm12, m12, 1
+
+ movq [dstq+strideq*0-4], xm2
+ movhps [dstq+strideq*1-4], xm2
+ movq [dstq+strideq*2-4], xm15
+ movhps [dstq+stride3q -4], xm15
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm0
+ movhps [dstq+strideq*1-4], xm0
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm1
+ movhps [dstq+strideq*1-4], xm1
+ movq [dstq+strideq*2-4], xm10
+ movhps [dstq+stride3q -4], xm10
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm13
+ movhps [dstq+strideq*1-4], xm13
+ movq [dstq+strideq*2-4], xm12
+ movhps [dstq+stride3q -4], xm12
+ lea dstq, [dstq+strideq*4]
+%else
+ ; 16x16 transpose and store
+ SWAP 5, 10, 2
+ SWAP 6, 0
+ SWAP 7, 1
+ SWAP 8, 11
+ SWAP 9, 13
+ mova m0, [rsp+11*32]
+ mova m1, [rsp+12*32]
+ mova m2, [rsp+13*32]
+ mova m3, [rsp+14*32]
+ mova m4, [rsp+19*32]
+ mova m11, [rsp+20*32]
+ mova m12, [rsp+15*32]
+ mova m13, [rsp+16*32]
+ mova m14, [rsp+17*32]
+ TRANSPOSE_16X16B 1, 0, [rsp+18*32]
+ movu [dstq+strideq*0-8], xm0
+ movu [dstq+strideq*1-8], xm1
+ movu [dstq+strideq*2-8], xm2
+ movu [dstq+stride3q -8], xm3
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm4
+ movu [dstq+strideq*1-8], xm5
+ movu [dstq+strideq*2-8], xm6
+ movu [dstq+stride3q -8], xm7
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm8
+ movu [dstq+strideq*1-8], xm9
+ movu [dstq+strideq*2-8], xm10
+ movu [dstq+stride3q -8], xm11
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm12
+ movu [dstq+strideq*1-8], xm13
+ movu [dstq+strideq*2-8], xm14
+ movu [dstq+stride3q -8], xm15
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m0, 1
+ vextracti128 [dstq+strideq*1-8], m1, 1
+ vextracti128 [dstq+strideq*2-8], m2, 1
+ vextracti128 [dstq+stride3q -8], m3, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m4, 1
+ vextracti128 [dstq+strideq*1-8], m5, 1
+ vextracti128 [dstq+strideq*2-8], m6, 1
+ vextracti128 [dstq+stride3q -8], m7, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m8, 1
+ vextracti128 [dstq+strideq*1-8], m9, 1
+ vextracti128 [dstq+strideq*2-8], m10, 1
+ vextracti128 [dstq+stride3q -8], m11, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m12, 1
+ vextracti128 [dstq+strideq*1-8], m13, 1
+ vextracti128 [dstq+strideq*2-8], m14, 1
+ vextracti128 [dstq+stride3q -8], m15, 1
+ lea dstq, [dstq+strideq*4]
+%endif
+%endif
+%elif %1 == 6
+ ; flat6 filter
+
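+ ; Scalar reference for the flat6 outputs computed below, each being
+ ; (sum + 4) >> 3 (pmulhrsw by pw_4096):
+ ;   p1' = 3*p2 + 2*p1 + 2*p0 + q0
+ ;   p0' = p2 + 2*p1 + 2*p0 + 2*q0 + q1
+ ;   q0' = p1 + 2*p0 + 2*q0 + 2*q1 + q2
+ ;   q1' = p0 + 2*q0 + 2*q1 + 3*q2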
+ punpcklbw m8, m13, m5
+ punpckhbw m11, m13, m5
+ pmaddubsw m0, m8, [pb_3_1]
+ pmaddubsw m1, m11, [pb_3_1]
+ punpcklbw m7, m4, m3
+ punpckhbw m10, m4, m3
+ pmaddubsw m2, m7, [pb_2]
+ pmaddubsw m12, m10, [pb_2]
+ paddw m0, m2
+ paddw m1, m12
+ pmulhrsw m2, m0, [pw_4096]
+ pmulhrsw m12, m1, [pw_4096]
+ packuswb m2, m12
+ vpblendvb m2, m3, m2, m9
+%ifidn %2, v
+ mova [tmpq+strideq*2], m2 ; p1
+%endif
+
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m11, [pb_m1_1]
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m11, [pb_m1_1]
+ paddw m0, m8
+ paddw m1, m11
+ pmulhrsw m12, m0, [pw_4096]
+ pmulhrsw m13, m1, [pw_4096]
+ packuswb m12, m13
+ vpblendvb m12, m4, m12, m9
+%ifidn %2, v
+ mova [tmpq+stride3q], m12 ; p0
+%endif
+
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m3, m14
+ punpckhbw m11, m3, m14
+ pmaddubsw m14, m8, [pb_m1_1]
+ pmaddubsw m13, m11, [pb_m1_1]
+ paddw m0, m14
+ paddw m1, m13
+ pmulhrsw m14, m0, [pw_4096]
+ pmulhrsw m13, m1, [pw_4096]
+ packuswb m14, m13
+ vpblendvb m14, m5, m14, m9
+%ifidn %2, v
+ mova [dstq+strideq*0], m14 ; q0
+%endif
+
+ pmaddubsw m8, [pb_m1_2]
+ pmaddubsw m11, [pb_m1_2]
+ paddw m0, m8
+ paddw m1, m11
+ pmaddubsw m7, [pb_m1_0]
+ pmaddubsw m10, [pb_m1_0]
+ paddw m0, m7
+ paddw m1, m10
+ pmulhrsw m0, [pw_4096]
+ pmulhrsw m1, [pw_4096]
+ packuswb m0, m1
+ vpblendvb m0, m6, m0, m9
+%ifidn %2, v
+ mova [dstq+strideq*1], m0 ; q1
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
+%endif
+%else
+%ifidn %2, v
+ mova [tmpq+strideq*0], m3 ; p1
+ mova [tmpq+strideq*1], m4 ; p0
+ mova [tmpq+strideq*2], m5 ; q0
+ mova [tmpq+stride3q ], m6 ; q1
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
+%endif
+%endif
+%endmacro
+
+INIT_YMM avx2
+cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+
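+ ; Each iteration covers a 32-px-wide group (8 4-px units, one vmask
+ ; byte per flatness level). The widest filter needed by any unit in
+ ; the group is instantiated; narrower units inside it are handled by
+ ; the per-unit masks FILTER derives from vmask.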
+.loop:
+ cmp byte [maskq+8], 0 ; vmask[2]
+ je .no_flat16
+
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+ cmp byte [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+ cmp byte [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call .v4
+
+.end:
+ add lq, 32
+ add dstq, 32
+ add maskq, 1
+ sub wd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.v4:
+ FILTER 4, v
+ ret
+
+INIT_YMM avx2
+cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+
+.loop:
+ cmp byte [maskq+8], 0 ; vmask[2]
+ je .no_flat16
+
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+ cmp byte [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+ cmp byte [maskq+0], 0 ; vmask[0]
+ je .no_filter
+
+ call .h4
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+stride3q*8]
+ lea lq, [lq+l_strideq*8]
+ lea dstq, [dstq+strideq*8]
+.end:
+ add maskq, 1
+ sub hd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.h4:
+ FILTER 4, h
+ ret
+
+INIT_YMM avx2
+cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+
+.loop:
+ cmp byte [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+ cmp byte [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx2).v4
+
+.end:
+ add lq, 32
+ add dstq, 32
+ add maskq, 1
+ sub wd, 8
+ jg .loop
+ RET
+
+INIT_YMM avx2
+cglobal lpf_h_sb_uv_8bpc, 7, 10, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+
+.loop:
+ cmp byte [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+ cmp byte [maskq+0], 0 ; vmask[0]
+ je .no_filter
+
+ call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx2).h4
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+stride3q*8]
+ lea lq, [lq+l_strideq*8]
+ lea dstq, [dstq+strideq*8]
+.end:
+ add maskq, 1
+ sub hd, 8
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/loopfilter_avx512.asm b/third_party/dav1d/src/x86/loopfilter_avx512.asm
new file mode 100644
index 0000000000..202a612aac
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter_avx512.asm
@@ -0,0 +1,1529 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+pb_4x0_4x4_4x8_4x12: times 4 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+
+pb_mask: dd 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080
+ dd 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000
+
+hmulA: dd 0, 8, 16, 24, 32, 40, 48, 56, 4, 12, 20, 28, 36, 44, 52, 60
+hmulB: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+hmulC: dd 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51
+hmulD: dd 0, 1, 16, 17, 32, 33, 48, 49
+hshuf4:db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+shift1: dq 0x0204081020408000
+shift3: dq 0x0810204080000000
+shift4: dq 0x1020408000000000
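+; shift1/shift3/shift4 are GF(2^8) affine matrices: gf2p8affineqb with
+; one of them performs a per-byte logical right shift by 1, 3 or 4,
+; replacing the psrlq+pand pairs of the AVX2 version. For the 3-bit
+; case the non-zero imm8 is xored into each result byte, which with
+; the following psubb turns it into an arithmetic (signed) shift.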
+
+pb_1: times 4 db 1
+pb_2: times 4 db 2
+pb_3: times 4 db 3
+pb_4: times 4 db 4
+pb_16: times 4 db 16
+pb_63: times 4 db 63
+pb_64: times 4 db 64
+pb_128: times 4 db 0x80
+pb_2_1: times 2 db 2, 1
+pb_3_1: times 2 db 3, 1
+pb_7_1: times 2 db 7, 1
+pb_m1_0: times 2 db -1, 0
+pb_m1_1: times 2 db -1, 1
+pb_m1_2: times 2 db -1, 2
+pw_2048: times 2 dw 2048
+pw_4096: times 2 dw 4096
+
+SECTION .text
+
+%macro ABSSUB 4 ; dst, a, b, tmp
+ psubusb %1, %2, %3
+ psubusb %4, %3, %2
+ por %1, %4
+%endmacro
+
+%macro TRANSPOSE_16x4_AND_WRITE_4x32 5
+ punpcklbw m%5, m%1, m%2
+ punpckhbw m%1, m%2
+ punpcklbw m%2, m%3, m%4
+ punpckhbw m%3, m%4
+ punpcklwd m%4, m%5, m%2
+ punpckhwd m%5, m%2
+ punpcklwd m%2, m%1, m%3
+ punpckhwd m%1, m%3
+ kmovw k1, k6
+ lea t0, [dstq+strideq*4]
+ vpscatterdd [dstq+m19-2]{k1}, m%4
+ kmovw k1, k6
+ lea t1, [dstq+strideq*8]
+ vpscatterdd [t0 +m19-2]{k1}, m%5
+ kmovw k1, k6
+ lea t2, [t0 +strideq*8]
+ vpscatterdd [t1 +m19-2]{k1}, m%2
+ kmovw k1, k6
+ vpscatterdd [t2 +m19-2]{k1}, m%1
+%endmacro
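+; The macro above stores the transposed 4-byte p1..q1 groups with
+; vpscatterdd; m19 is assumed to hold the per-lane row offsets (it is
+; set up by the h entry points, outside this hunk), so each scatter
+; writes one 4-byte group at dst-2 for 16 rows at a time.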
+
+%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
+%if %1 == 0
+ SWAP m16, m22
+%endif
+ punpcklbw m22, m24, m26
+ punpckhbw m24, m26
+ punpcklbw m26, m2, m3
+ punpckhbw m2, m3
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+ punpcklbw m5, m6, m7
+ punpckhbw m6, m7
+ punpcklbw m7, m8, m9
+ punpckhbw m8, m9
+ punpcklbw m9, m10, m11
+ punpckhbw m10, m11
+ punpcklbw m11, m25, m13
+ punpckhbw m25, m13
+%if %1 == 0
+ SWAP m13, m16
+%else
+ mova m13, %3
+%endif
+ SWAP m16, m25
+ punpcklbw m25, m14, m13
+ punpckhbw m13, m14, m13
+ ; interleaved in m22,24,26,2,3,4,5,6,7,8,9,10,11,rsp%3,25,13
+ punpcklwd m14, m22, m26
+ punpckhwd m22, m26
+ punpcklwd m26, m24, m2
+ punpckhwd m24, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m7, m9
+ punpckhwd m7, m9
+ punpcklwd m9, m8, m10
+ punpckhwd m8, m10
+ punpcklwd m10, m11, m25
+ punpckhwd m11, m25
+ SWAP m25, m16, m11
+ punpcklwd m11, m25, m13
+ punpckhwd m25, m13
+ ; interleaved in m14,15,26,24,2,3,5,4,6,7,9,8,10,rsp%3,11,25
+ punpckldq m13, m14, m2
+ punpckhdq m14, m2
+ punpckldq m2, m22, m3
+ punpckhdq m22, m3
+ punpckldq m3, m26, m5
+ punpckhdq m26, m5
+ punpckldq m5, m24, m4
+ punpckhdq m24, m4
+ punpckldq m4, m6, m10
+ punpckhdq m6, m10
+ punpckldq m10, m9, m11
+ punpckhdq m9, m11
+ punpckldq m11, m8, m25
+ punpckhdq m8, m25
+ SWAP m25, m16, m8
+ punpckldq m8, m7, m25
+ punpckhdq m7, m25
+ ; interleaved in m13,14,2,15,3,26,5,24,4,6,8,7,10,9,11,rsp%3
+ punpcklqdq m25, m13, m4
+ punpckhqdq m13, m4
+ punpcklqdq m4, m14, m6
+ punpckhqdq m14, m6
+ punpcklqdq m6, m2, m8
+ punpckhqdq m2, m8
+ punpcklqdq m8, m22, m7
+ punpckhqdq m22, m7
+ punpcklqdq m7, m3, m10
+ punpckhqdq m3, m10
+ punpcklqdq m10, m26, m9
+ punpckhqdq m26, m9
+ punpcklqdq m9, m5, m11
+ punpckhqdq m5, m11
+ SWAP m11, m16
+%if %2 == 0
+ SWAP m16, m25
+%else
+ mova %3, m25
+%endif
+ punpcklqdq m25, m24, m11
+ punpckhqdq m24, m11
+%if %2 == 0
+ SWAP m11, m16
+%endif
+ ; interleaved m11,13,4,14,6,2,8,15,7,3,10,26,9,5,25,24
+ SWAP 24, 11, 26, 13, 5, 2, 4, 6, 8, 7, 22
+ SWAP 3, 14, 25, 9
+%endmacro
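+; Like the AVX2 version, the macro above transposes 16 rows of 16
+; bytes within each 128-bit lane via four rounds of interleaves
+; (byte, word, dword, qword); the trailing SWAPs only rename the
+; registers into the order FILTER expects, with %3 as spill space for
+; the row that is loaded from / stored to memory.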
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+ ; load data
+%ifidn %2, v
+%define is_h 0
+%if %1 == 4
+ lea t0, [dstq+mstrideq*2]
+ mova m3, [t0 +strideq*0] ; p1
+ mova m4, [t0 +strideq*1] ; p0
+ mova m5, [t0 +strideq*2] ; q0
+ mova m6, [t0 +stride3q ] ; q1
+%else
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+%if %1 == 16
+ lea t0, [dstq+mstrideq*8]
+ mova m16, [t0 +strideq*1]
+ mova m17, [t0 +strideq*2]
+ mova m18, [t0 +stride3q ]
+%endif
+ lea t0, [dstq+mstrideq*4]
+%if %1 != 6
+ mova m25, [t0 +strideq*0]
+%endif
+ mova m13, [t0 +strideq*1]
+ mova m3, [t0 +strideq*2]
+ mova m4, [t0 +stride3q ]
+ mova m5, [dstq+strideq*0]
+ mova m6, [dstq+strideq*1]
+ mova m14, [dstq+strideq*2]
+%if %1 != 6
+ mova m22, [dstq+stride3q ]
+%endif
+%if %1 == 16
+ lea t0, [dstq+strideq*4]
+ mova m29, [t0 +strideq*0]
+ mova m30, [t0 +strideq*1]
+ mova m31, [t0 +strideq*2]
+%endif
+%endif
+%else ; h
+%define is_h 1
+ ; load lines
+%if %1 == 4
+ vbroadcasti32x4 m0, [hshuf4]
+ kmovw k1, k6
+ lea t0, [dstq+strideq*4]
+ vpgatherdd m3{k1}, [dstq+m19-2]
+ kmovw k1, k6
+ lea t1, [dstq+strideq*8]
+ vpgatherdd m4{k1}, [t0 +m19-2]
+ kmovw k1, k6
+ lea t2, [t0 +strideq*8]
+ vpgatherdd m5{k1}, [t1 +m19-2]
+ kmovw k1, k6
+ vpgatherdd m6{k1}, [t2 +m19-2]
+ pshufb m3, m0
+ pshufb m4, m0
+ pshufb m5, m0
+ pshufb m6, m0
+ punpckldq m7, m3, m4
+ punpckhdq m3, m4
+ punpckldq m4, m5, m6
+ punpckhdq m5, m6
+ punpcklqdq m6, m7, m4
+ punpckhqdq m7, m4
+ punpcklqdq m4, m3, m5
+ punpckhqdq m3, m5
+ SWAP 3, 6
+ SWAP 5, 4, 7
+ ; 6,7,4,3 -> 3,4,5,6
+%elif %1 == 6 || %1 == 8
+ kmovb k1, k7
+ lea t0, [dstq+strideq*1]
+ vpgatherdq m3{k1}, [dstq+ym21-%1/2]
+ kmovb k1, k7
+ lea t1, [dstq+strideq*2]
+ vpgatherdq m4{k1}, [t0 +ym21-%1/2]
+ kmovb k1, k7
+ lea t2, [dstq+stride3q ]
+ vpgatherdq m5{k1}, [t1 +ym21-%1/2]
+ kmovb k1, k7
+ vextracti32x8 ym0, m21, 1
+ vpgatherdq m6{k1}, [t2 +ym21-%1/2]
+ kmovb k1, k7
+ vpgatherdq m12{k1}, [dstq+ym0 -%1/2]
+ kmovb k1, k7
+ vpgatherdq m13{k1}, [t0 +ym0 -%1/2]
+ kmovb k1, k7
+ vpgatherdq m14{k1}, [t1 +ym0 -%1/2]
+ kmovb k1, k7
+ vpgatherdq m15{k1}, [t2 +ym0 -%1/2]
+ ; transpose 8x16
+ ; xm3: A-H0,A-H8
+ ; xm4: A-H1,A-H9
+ ; xm5: A-H2,A-H10
+ ; xm6: A-H3,A-H11
+ ; xm12: A-H4,A-H12
+ ; xm13: A-H5,A-H13
+ ; xm14: A-H6,A-H14
+ ; xm15: A-H7,A-H15
+ punpcklbw m7, m3, m4
+ punpckhbw m3, m4
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ punpcklbw m6, m12, m13
+ punpckhbw m12, m13
+ punpcklbw m13, m14, m15
+ punpckhbw m14, m15
+ ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
+ ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
+ ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
+ ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
+ ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
+ ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
+ ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
+ ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
+ punpcklwd m15, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m6, m13
+ punpckhwd m6, m13
+ punpcklwd m13, m12, m14
+ punpckhwd m12, m14
+ ; xm15: A0-3,B0-3,C0-3,D0-3
+ ; xm7: E0-3,F0-3,G0-3,H0-3
+ ; xm4: A8-11,B8-11,C8-11,D8-11
+ ; xm3: E8-11,F8-11,G8-11,H8-11
+ ; xm5: A4-7,B4-7,C4-7,D4-7
+ ; xm6: E4-7,F4-7,G4-7,H4-7
+ ; xm13: A12-15,B12-15,C12-15,D12-15
+ ; xm12: E12-15,F12-15,G12-15,H12-15
+ punpckldq m14, m15, m5
+ punpckhdq m15, m5
+ punpckldq m5, m7, m6
+ %if %1 != 6
+ punpckhdq m7, m6
+ %endif
+ punpckldq m6, m4, m13
+ punpckhdq m4, m13
+ punpckldq m13, m3, m12
+ %if %1 != 6
+ punpckhdq m12, m3, m12
+ %endif
+ ; xm14: A0-7,B0-7
+ ; xm15: C0-7,D0-7
+ ; xm5: E0-7,F0-7
+ ; xm7: G0-7,H0-7
+ ; xm6: A8-15,B8-15
+ ; xm4: C8-15,D8-15
+ ; xm13: E8-15,F8-15
+ ; xm12: G8-15,H8-15
+ punpcklqdq m3, m14, m6
+ punpckhqdq m14, m6
+ punpckhqdq m6, m15, m4
+ punpcklqdq m15, m4
+ punpcklqdq m4, m5, m13
+ punpckhqdq m13, m5, m13
+ %if %1 == 8
+ punpcklqdq m5, m7, m12
+ punpckhqdq m25, m7, m12
+ ; xm3: A0-15
+ ; xm14: B0-15
+ ; xm15: C0-15
+ ; xm6: D0-15
+ ; xm4: E0-15
+ ; xm13: F0-15
+ ; xm5: G0-15
+ ; xm25: H0-15
+ SWAP 25, 3, 15
+ SWAP 13, 14, 5, 4, 6
+ SWAP 15, 22
+ ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,22
+ %else
+ SWAP 13, 3, 14
+ SWAP 6, 4, 15, 5
+ ; 3,14,15,6,4,13 -> 13,3,4,5,6,14
+ %endif
+%else ; 16, h
+ ; load and 16x16 transpose. We only use 14 pixels but we'll need the
+ ; remainder at the end for the second transpose
+ movu xm24, [dstq+strideq*0-8]
+ movu xm26, [dstq+strideq*1-8]
+ movu xm2, [dstq+strideq*2-8]
+ movu xm3, [dstq+stride3q -8]
+ lea t0, [dstq+strideq*4]
+ movu xm4, [t0 +strideq*0-8]
+ movu xm5, [t0 +strideq*1-8]
+ movu xm6, [t0 +strideq*2-8]
+ movu xm7, [t0 +stride3q -8]
+ lea t0, [t0 +strideq*4]
+ movu xm8, [t0 +strideq*0-8]
+ movu xm9, [t0 +strideq*1-8]
+ movu xm10, [t0 +strideq*2-8]
+ movu xm11, [t0 +stride3q -8]
+ lea t0, [t0 +strideq*4]
+ movu xm25, [t0 +strideq*0-8]
+ movu xm13, [t0 +strideq*1-8]
+ movu xm14, [t0 +strideq*2-8]
+ movu xm22, [t0 +stride3q -8]
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 ym24, [t0 +strideq*0-8], 1
+ vinserti32x4 ym26, [t0 +strideq*1-8], 1
+ vinserti32x4 ym2, [t0 +strideq*2-8], 1
+ vinserti32x4 ym3, [t0 +stride3q -8], 1
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 ym4, [t0 +strideq*0-8], 1
+ vinserti32x4 ym5, [t0 +strideq*1-8], 1
+ vinserti32x4 ym6, [t0 +strideq*2-8], 1
+ vinserti32x4 ym7, [t0 +stride3q -8], 1
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 ym8, [t0 +strideq*0-8], 1
+ vinserti32x4 ym9, [t0 +strideq*1-8], 1
+ vinserti32x4 ym10, [t0 +strideq*2-8], 1
+ vinserti32x4 ym11, [t0 +stride3q -8], 1
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 ym25, [t0 +strideq*0-8], 1
+ vinserti32x4 ym13, [t0 +strideq*1-8], 1
+ vinserti32x4 ym14, [t0 +strideq*2-8], 1
+ vinserti32x4 ym22, [t0 +stride3q -8], 1
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m24, [t0 +strideq*0-8], 2
+ vinserti32x4 m26, [t0 +strideq*1-8], 2
+ vinserti32x4 m2, [t0 +strideq*2-8], 2
+ vinserti32x4 m3, [t0 +stride3q -8], 2
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m4, [t0 +strideq*0-8], 2
+ vinserti32x4 m5, [t0 +strideq*1-8], 2
+ vinserti32x4 m6, [t0 +strideq*2-8], 2
+ vinserti32x4 m7, [t0 +stride3q -8], 2
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m8, [t0 +strideq*0-8], 2
+ vinserti32x4 m9, [t0 +strideq*1-8], 2
+ vinserti32x4 m10, [t0 +strideq*2-8], 2
+ vinserti32x4 m11, [t0 +stride3q -8], 2
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m25, [t0 +strideq*0-8], 2
+ vinserti32x4 m13, [t0 +strideq*1-8], 2
+ vinserti32x4 m14, [t0 +strideq*2-8], 2
+ vinserti32x4 m22, [t0 +stride3q -8], 2
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m24, [t0 +strideq*0-8], 3
+ vinserti32x4 m26, [t0 +strideq*1-8], 3
+ vinserti32x4 m2, [t0 +strideq*2-8], 3
+ vinserti32x4 m3, [t0 +stride3q -8], 3
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m4, [t0 +strideq*0-8], 3
+ vinserti32x4 m5, [t0 +strideq*1-8], 3
+ vinserti32x4 m6, [t0 +strideq*2-8], 3
+ vinserti32x4 m7, [t0 +stride3q -8], 3
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m8, [t0 +strideq*0-8], 3
+ vinserti32x4 m9, [t0 +strideq*1-8], 3
+ vinserti32x4 m10, [t0 +strideq*2-8], 3
+ vinserti32x4 m11, [t0 +stride3q -8], 3
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m25, [t0 +strideq*0-8], 3
+ vinserti32x4 m13, [t0 +strideq*1-8], 3
+ vinserti32x4 m14, [t0 +strideq*2-8], 3
+ vinserti32x4 m22, [t0 +stride3q -8], 3
+ ;
+ TRANSPOSE_16X16B 0, 1, [rsp+0*64]
+ SWAP m16, m26
+ SWAP m17, m2
+ SWAP m18, m3
+ SWAP m29, m25
+ SWAP m30, m13
+ SWAP m31, m14
+ mova [rsp+4*64], m22
+ ; 4,5,6,7,8,9,10,11 -> 25,13,3,4,5,6,14,22
+ SWAP 25, 4, 7
+ SWAP 13, 5, 8
+ SWAP 3, 6, 9
+ SWAP 10, 14
+ SWAP 11, 22
+%endif
+%endif
+
+ ; load L/E/I/H
+ vpbroadcastd m15, [pb_1]
+%ifidn %2, v
+ movu m1, [lq]
+ movu m0, [lq+l_strideq]
+%else
+ kmovw k1, k6
+ vpgatherdd m0{k1}, [lq+m20+4]
+ kmovw k1, k6
+ vpgatherdd m1{k1}, [lq+m20+0]
+%endif
+ pxor m2, m2
+ pcmpeqb k1, m0, m2
+ vmovdqu8 m0{k1}, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, pbshuf ; l[x][0]
+ vpcmpub k3, m0, m2, 4 ; neq ; L
+ psrlq m2, m0, [lutq+128]
+ pand m2, [pb_63]{bcstd}
+ vpbroadcastb m1, [lutq+136]
+ pminub m2, m1
+ pmaxub m2, m15 ; I
+ gf2p8affineqb m1, m0, [shift4]{bcstq}, 0 ; H
+ paddd m0, [pb_2]{bcstd}
+ paddb m0, m0
+ paddb m0, m2 ; E
+
+ ABSSUB m8, m3, m4, m9 ; abs(p1-p0)
+ ABSSUB m9, m5, m6, m10 ; abs(q1-q0)
+ pmaxub m8, m9
+ vpcmpub k1, m8, m1, 6 ; gt ; hev
+%if %1 != 4
+ %if %1 == 6
+ ABSSUB m9, m13, m4, m10 ; abs(p2-p0)
+ pmaxub m9, m8
+ %else
+ ABSSUB m9, m25, m4, m10 ; abs(p3-p0)
+ pmaxub m9, m8
+ ABSSUB m10, m13, m4, m11 ; abs(p2-p0)
+ pmaxub m9, m10
+ %endif
+ ABSSUB m10, m5, m14, m11 ; abs(q2-q0)
+ pmaxub m9, m10
+ %if %1 != 6
+ ABSSUB m10, m5, m22, m11 ; abs(q3-q0)
+ pmaxub m9, m10
+ %endif
+ vpcmpub k2{k3}, m9, m15, 2 ; le ; flat8in
+ %if %1 == 6
+ ABSSUB m10, m13, m3, m1 ; abs(p2-p1)
+ %else
+ ABSSUB m10, m25, m13, m11 ; abs(p3-p2)
+ ABSSUB m11, m13, m3, m1 ; abs(p2-p1)
+ pmaxub m10, m11
+ ABSSUB m11, m14, m22, m1 ; abs(q3-q2)
+ pmaxub m10, m11
+ %endif
+ ABSSUB m11, m14, m6, m1 ; abs(q2-q1)
+ pmaxub m10, m11
+ %if %1 == 16
+ vpbroadcastd m11, [maskq+8]
+ por m11, [maskq+4]{bcstd}
+ %else
+ vpbroadcastd m11, [maskq+4]
+ %endif
+ vptestmd k4, m11, pbmask
+ vmovdqa32 m10{k4}{z}, m10 ; only apply fm-wide to wd>4 blocks
+ pmaxub m8, m10
+%endif
+ vpcmpub k3{k3}, m8, m2, 2 ; le
+ ABSSUB m10, m3, m6, m11 ; abs(p1-q1)
+ ABSSUB m11, m4, m5, m2 ; abs(p0-q0)
+ paddusb m11, m11
+ gf2p8affineqb m10, m10, [shift1]{bcstq}, 0
+ paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ vpcmpub k3{k3}, m10, m0, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E
+
+%if %1 == 16
+ ABSSUB m1, m16, m4, m2
+ ABSSUB m2, m17, m4, m10
+ pmaxub m1, m2
+ ABSSUB m2, m18, m4, m10
+ pmaxub m1, m2
+ ABSSUB m2, m29, m5, m10
+ pmaxub m1, m2
+ ABSSUB m2, m30, m5, m10
+ pmaxub m1, m2
+ ABSSUB m2, m31, m5, m10
+ pmaxub m1, m2
+ kandq k2, k2, k3
+ vpcmpub k4{k2}, m1, m15, 2 ; flat8in & flat8out
+ vpbroadcastd m2, [maskq+8]
+ vptestmd k5, m2, pbmask
+ vpmovm2d m7, k5
+ vptestmb k4{k4}, m7, m7 ; flat16 & fm
+ por m10, m2, [maskq+4]{bcstd}
+ vptestmd k5, m10, pbmask
+ vpmovm2d m7, k5
+ vptestmb k2{k2}, m7, m7 ; flat8in
+ por m2, m10, [maskq+0]{bcstd}
+ vptestmd k5, m2, pbmask
+ vpmovm2d m7, k5
+ vptestmb k3{k3}, m7, m7
+ kandnq k3, k2, k3 ; fm & !flat8 & !flat16
+ kandnq k2, k4, k2 ; flat8 & !flat16
+%elif %1 != 4
+ vpbroadcastd m0, [maskq+4]
+ vptestmd k4, m0, pbmask
+ vpmovm2d m7, k4
+ vptestmb k2{k2}, m7, m7
+ kandq k2, k2, k3 ; flat8 & fm
+ por m0, [maskq+0]{bcstd}
+ vptestmd k4, m0, pbmask
+ vpmovm2d m7, k4
+ vptestmb k3{k3}, m7, m7
+ kandnq k3, k2, k3 ; fm & !flat8
+%else
+ %ifidn %2, v
+ vptestmd k4, pbmask, [maskq+0]{bcstd}
+ %else
+ vpbroadcastd m0, [maskq+0]
+ vptestmd k4, m0, pbmask
+ %endif
+ vpmovm2d m7, k4
+ vptestmb k3{k3}, m7, m7 ; fm
+%endif
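+ ; Unlike the AVX2 path, the decision masks now live in k registers:
+ ; k1 = hev, k3 = the short-filter lanes (fm minus any wider lanes),
+ ; k2 = flat8 lanes (wd >= 8) and k4 = flat16 lanes (wd == 16), so the
+ ; blends below use vpblendmb/zero-masking instead of pand/pandn.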
+
+ ; short filter
+%if %1 >= 8
+ SWAP m23, m15
+%endif
+ vpbroadcastd m15, [pb_3]
+ vpbroadcastd m0, [pb_4]
+ vpbroadcastd m12, [pb_16]
+ vpbroadcastd m1, [pb_64]
+ pxor m3, pb128
+ pxor m6, pb128
+ psubsb m10{k1}{z}, m3, m6 ; f=iclip_diff(p1-q1)&hev
+ pxor m4, pb128
+ pxor m5, pb128
+ psubsb m11, m5, m4
+ paddsb m10, m11
+ paddsb m10, m11
+ paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm
+ paddsb m8, m10, m15
+ paddsb m10, m0
+ gf2p8affineqb m8, m8, [shift3]{bcstq}, 16
+ gf2p8affineqb m10, m10, [shift3]{bcstq}, 16
+ psubb m8, m12 ; f2
+ psubb m10, m12 ; f1
+ paddsb m4, m8
+ psubsb m5, m10
+ pxor m4, pb128
+ pxor m5, pb128
+ ;
+ pxor m10, pb128
+ pxor m8, m8
+ pavgb m8, m10 ; f=(f1+1)>>1
+ psubb m8, m1
+ knotq k1, k1
+ paddsb m3{k1}, m3, m8
+ psubsb m6{k1}, m6, m8
+ pxor m3, pb128
+ pxor m6, pb128
+
+%if %1 == 16
+ ; flat16 filter
+%ifidn %2, v
+ lea t0, [dstq+mstrideq*8]
+%endif
+ SWAP m24, m16, m14
+ SWAP m2, m17, m22
+ SWAP m7, m18
+
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
+ ; write -6
+ vpbroadcastd m1, [pb_7_1]
+ vpbroadcastd m12, [pb_2]
+ punpcklbw m14, m24, m25
+ punpckhbw m22, m24, m25
+ pmaddubsw m10, m14, m1
+ pmaddubsw m11, m22, m1 ; p6*7+p3
+ punpcklbw m8, m2, m7
+ punpckhbw m9, m2, m7
+ pmaddubsw m8, m12
+ pmaddubsw m9, m12
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3
+%ifidn %2, h
+ vpbroadcastd m27, [pw_2048]
+ vpbroadcastd m1, [pb_m1_1]
+ %define pw2048 m27
+ %define pbm1_1 m1
+%endif
+ punpcklbw m8, m13, m3
+ punpckhbw m9, m13, m3
+ pmaddubsw m8, m23
+ pmaddubsw m9, m23
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1
+ punpcklbw m8, m4, m5
+ punpckhbw m9, m4, m5
+ pmaddubsw m8, m23
+ pmaddubsw m9, m23
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+strideq*2]{k4}, m8 ; p5
+%else
+ vpblendmb m8{k4}, m2, m8
+ mova [rsp+1*64], m8
+%endif
+
+ ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
+ ; write -5
+ pmaddubsw m14, pbm1_1
+ pmaddubsw m22, pbm1_1
+ paddw m10, m14
+ paddw m11, m22 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+ punpcklbw m8, m24, m6
+ punpckhbw m9, m24, m6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
+ SWAP m18, m8
+ SWAP m23, m9
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+stride3q]{k4}, m8 ; p4
+%else
+ vpblendmb m8{k4}, m7, m8
+ mova [rsp+2*64], m8
+%endif
+
+ ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
+ ; write -4
+ SWAP m14, m16
+ punpcklbw m8, m24, m13
+ punpckhbw m9, m24, m13
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
+ punpcklbw m8, m2, m14
+ punpckhbw m2, m14
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m2, pbm1_1
+ paddw m10, m8
+ paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
+ SWAP m16, m8
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+strideq*4]{k4}, m8 ; p3
+%else
+ vpblendmb m8{k4}, m25, m8
+ mova [rsp+3*64], m8
+%endif
+
+ ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
+ ; write -3
+ SWAP m22, m17
+ punpcklbw m8, m24, m3
+ punpckhbw m9, m24, m3
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
+ punpcklbw m8, m7, m22
+ punpckhbw m7, m22
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m7, pbm1_1
+ paddw m10, m8
+ paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
+ SWAP m17, m8
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+ vpblendmb m15{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F
+
+ ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
+ ; write -2
+%ifidn %2, v
+ lea t0, [dstq+strideq*4]
+%endif
+ punpcklbw m8, m24, m4
+ punpckhbw m9, m24, m4
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
+ punpcklbw m8, m25, m29
+ punpckhbw m9, m25, m29
+ SWAP m26, m29
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
+ SWAP m29, m8
+ SWAP m0, m9
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+ vpblendmb m12{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G
+
+ ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
+ ; write -1
+%ifidn %2, h
+ SWAP m28, m24
+ punpcklbw m8, m28, m5
+ punpckhbw m24, m28, m5
+%else
+ punpcklbw m8, m24, m5
+ punpckhbw m24, m5
+%endif
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m24, pbm1_1
+ paddw m10, m8
+ paddw m11, m24 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+ punpcklbw m24, m13, m30
+ punpckhbw m9, m13, m30
+%ifidn %2, h
+ SWAP m27, m30
+%endif
+ SWAP m13, m15
+ pmaddubsw m24, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m24
+ paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
+ SWAP m30, m24
+ SWAP m15, m9
+%ifidn %2, h
+ SWAP m9, m24
+ %define pw2048 m9
+%endif
+ pmulhrsw m24, m10, pw2048
+ pmulhrsw m8, m11, pw2048
+ paddw m10, m18 ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
+ paddw m11, m23
+ packuswb m24, m8
+ punpcklbw m8, m3, m31
+ pmaddubsw m8, pbm1_1
+ paddw m10, m8 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+ SWAP m18, m8
+ pmulhrsw m8, m10, pw2048
+ paddw m10, m16 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+%ifidn %2, h
+ SWAP m16, m9
+ %define pw2048 m16
+%endif
+ punpckhbw m9, m3, m31
+ SWAP m3, m12
+ pmaddubsw m9, pbm1_1
+ paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+ SWAP m23, m9
+ pmulhrsw m9, m11, pw2048
+ paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+%ifidn %2, h
+ SWAP m2, m1
+ %define pbm1_1 m2
+%endif
+ vpblendmb m1{k4}, m4, m24 ; don't clobber p0/m4 since we need it in H
+
+ ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
+ ; write +0
+ SWAP m24, m31 ; q6
+ packuswb m8, m9
+%ifidn %2, h
+ SWAP m31, m2
+ %define pbm1_1 m31
+%endif
+ vpblendmb m12{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I
+
+ ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
+ ; write +1
+ punpcklbw m8, m4, m24
+ punpckhbw m2, m4, m24
+ SWAP m4, m1
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m2, pbm1_1
+ paddw m10, m8
+ paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
+ pmulhrsw m2, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m2, m9
+ vpblendmb m2{k4}, m6, m2 ; don't clobber q1/m6 since we need it in K
+
+ ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
+ ; write +2
+ paddw m10, m17 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ paddw m11, m7
+ punpcklbw m8, m5, m24
+ punpckhbw m9, m5, m24
+ SWAP m5, m12
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
+ pmulhrsw m7, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m7, m9
+ vpblendmb m7{k4}, m14, m7 ; don't clobber q2/m14 since we need it in K
+
+ ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
+ ; write +3
+ paddw m10, m29 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+ paddw m11, m0
+ punpcklbw m8, m6, m24
+ punpckhbw m9, m6, m24
+ SWAP 2, 6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+mstrideq]{k4}, m8
+%else
+ SWAP m29, m16
+ %define pw2048 m29
+ vpblendmb m16{k4}, m22, m8
+%endif
+
+ ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
+ ; write +4
+ paddw m10, m30 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+ paddw m11, m15
+%ifidn %2, h
+ SWAP m15, m8
+%endif
+ punpcklbw m8, m14, m24
+ punpckhbw m9, m14, m24
+ SWAP 14, 7
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+strideq*0]{k4}, m8 ; q4
+%else
+ vpblendmb m17{k4}, m26, m8
+%endif
+
+ ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
+ ; write +5
+ paddw m10, m18 ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
+ paddw m11, m23
+ punpcklbw m8, m22, m24
+ punpckhbw m9, m22, m24
+ SWAP m30, m24
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
+ pmulhrsw m10, pw2048
+ pmulhrsw m11, pw2048
+ packuswb m10, m11
+%ifidn %2, v
+ vmovdqu8 [t0+strideq*1]{k4}, m10 ; q5
+%else
+ vmovdqu8 m27{k4}, m10
+%endif
+
+%ifidn %2, v
+ lea t0, [dstq+mstrideq*4]
+%endif
+%endif
+
+%if %1 >= 8
+ ; flat8 filter
+ vpbroadcastd m9, [pb_3_1]
+ vpbroadcastd m10, [pb_2_1]
+%if %1 == 16
+ vpbroadcastd m23, [pb_1]
+ vpbroadcastd m0, [pb_4]
+%elifidn %2, h
+ vpbroadcastd m31, [pb_m1_1]
+ %define pbm1_1 m31
+%endif
+ punpcklbw m24, m25, m3
+ punpckhbw m26, m25, m3
+ pmaddubsw m2, m24, m9
+ pmaddubsw m7, m26, m9 ; 3 * p3 + p1
+ punpcklbw m8, m13, m4
+ punpckhbw m11, m13, m4
+ pmaddubsw m8, m10
+ pmaddubsw m11, m10
+ paddw m2, m8
+ paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0
+ punpcklbw m8, m5, m0
+ punpckhbw m11, m5, m0
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
+ paddw m2, m8
+ paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+%if is_h || %1 == 16
+ vpblendmb m10{k2}, m13, m8 ; p2
+%endif
+%ifidn %2, v
+ %if %1 == 8
+ vmovdqu8 [t0+strideq*1]{k2}, m8
+ %else
+ mova [t0+strideq*1], m10
+ %endif
+%endif
+
+ pmaddubsw m8, m24, pbm1_1
+ pmaddubsw m11, m26, pbm1_1
+ paddw m2, m8
+ paddw m7, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m11, pbm1_1
+ paddw m2, m8
+ paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendmb m8{k2}, m3, m8 ; p1
+%ifidn %2, v
+ mova [t0+strideq*2], m8
+%else
+ SWAP m18, m8
+%endif
+
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ psubw m2, m24
+ psubw m7, m26
+ punpcklbw m8, m4, m14
+ punpckhbw m11, m4, m14
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
+ paddw m2, m8
+ paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendmb m8{k2}, m4, m8 ; p0
+%ifidn %2, v
+ mova [t0+stride3q], m8
+%else
+ SWAP m29, m8
+%endif
+
+ punpcklbw m24, m5, m22
+ punpckhbw m26, m5, m22
+ pmaddubsw m8, m24, m23
+ pmaddubsw m11, m26, m23
+ paddw m2, m8
+ paddw m7, m11
+ punpcklbw m8, m4, m25
+ punpckhbw m11, m4, m25
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
+ psubw m2, m8
+ psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendmb m11{k2}, m5, m8 ; q0
+%ifidn %2, v
+ mova [dstq+strideq*0], m11
+%endif
+
+ pmaddubsw m24, pbm1_1
+ pmaddubsw m26, pbm1_1
+ paddw m2, m24
+ paddw m7, m26
+ punpcklbw m8, m13, m6
+ punpckhbw m13, m6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m13, pbm1_1
+ paddw m2, m8
+ paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
+ psrlw m8, m2, 3
+ psrlw m13, m7, 3
+ packuswb m8, m13
+ vpblendmb m13{k2}, m6, m8 ; q1
+%ifidn %2, v
+ mova [dstq+strideq*1], m13
+%endif
+
+ punpcklbw m24, m3, m6
+ punpckhbw m26, m3, m6
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ psubw m2, m24
+ psubw m7, m26
+ punpcklbw m24, m14, m22
+ punpckhbw m26, m14, m22
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ paddw m2, m24
+ paddw m7, m26 ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
+ psrlw m2, 3
+ psrlw m7, 3
+ packuswb m2, m7
+%if is_h || %1 == 16
+ vpblendmb m2{k2}, m14, m2 ; q2
+%endif
+%ifidn %2, v
+ %if %1 == 8
+ vmovdqu8 [dstq+strideq*2]{k2}, m2
+ %else
+ mova [dstq+strideq*2], m2
+ %endif
+%endif
+
+%ifidn %2, h
+ SWAP m24, m18
+ SWAP m26, m29
+%if %1 == 8
+ ; 16x8 transpose
+ punpcklbw m3, m25, m10
+ punpckhbw m25, m10
+ punpcklbw m10, m24, m26
+ punpckhbw m24, m26
+ punpcklbw m26, m11, m13
+ punpckhbw m11, m13
+ punpcklbw m13, m2, m22
+ punpckhbw m2, m22
+ ;
+ punpcklwd m22, m3, m10
+ punpckhwd m3, m10
+ punpcklwd m10, m25, m24
+ punpckhwd m25, m24
+ punpcklwd m24, m26, m13
+ punpckhwd m26, m13
+ punpcklwd m13, m11, m2
+ punpckhwd m11, m2
+ ;
+ punpckldq m2, m22, m24
+ punpckhdq m22, m24
+ punpckldq m24, m3, m26
+ punpckhdq m3, m26
+ punpckldq m26, m10, m13
+ punpckhdq m10, m13
+ punpckldq m13, m25, m11
+ punpckhdq m25, m11
+ ; write 8x32
+ vpbroadcastd ym16, strided
+ pmulld ym16, [hmulD]
+ lea t1, [dstq+strideq*2]
+ lea t2, [dstq+strideq*4]
+ lea t3, [t1 +strideq*4]
+ lea t0, [dstq+strideq*8]
+ kmovb k1, k6
+ kmovb k2, k6
+ kmovb k3, k6
+ kmovb k4, k6
+ vpscatterdq [dstq+ym16-4]{k1}, m2
+ vpscatterdq [t1 +ym16-4]{k2}, m22
+ vpscatterdq [t2 +ym16-4]{k3}, m24
+ vpscatterdq [t3 +ym16-4]{k4}, m3
+ lea t1, [t0+strideq*2]
+ lea t2, [t0+strideq*4]
+ lea t3, [t1+strideq*4]
+ kmovb k1, k6
+ kmovb k2, k6
+ kmovb k3, k6
+ kmovb k4, k6
+ vpscatterdq [t0+ym16-4]{k1}, m26
+ vpscatterdq [t1+ym16-4]{k2}, m10
+ vpscatterdq [t2+ym16-4]{k3}, m13
+ vpscatterdq [t3+ym16-4]{k4}, m25
+%else
+ ; 16x16 transpose and store
+ SWAP 5, 10, 2
+ SWAP 6, 24
+ SWAP 7, 26
+ SWAP 8, 11
+ SWAP 9, 13
+ mova m24, [rsp+0*64]
+ SWAP m26, m28
+ mova m2, [rsp+1*64]
+ mova m3, [rsp+2*64]
+ mova m4, [rsp+3*64]
+ SWAP m11, m16
+ SWAP m25, m17
+ SWAP m13, m27
+ SWAP m14, m30
+ TRANSPOSE_16X16B 1, 0, [rsp+4*64]
+ movu [dstq+strideq*0-8], xm24
+ movu [dstq+strideq*1-8], xm26
+ movu [dstq+strideq*2-8], xm2
+ movu [dstq+stride3q -8], xm3
+ lea t0, [dstq+strideq*4]
+ movu [t0+strideq*0-8], xm4
+ movu [t0+strideq*1-8], xm5
+ movu [t0+strideq*2-8], xm6
+ movu [t0+stride3q -8], xm7
+ lea t0, [t0+strideq*4]
+ movu [t0+strideq*0-8], xm8
+ movu [t0+strideq*1-8], xm9
+ movu [t0+strideq*2-8], xm10
+ movu [t0+stride3q -8], xm11
+ lea t0, [t0+strideq*4]
+ movu [t0+strideq*0-8], xm25
+ movu [t0+strideq*1-8], xm13
+ movu [t0+strideq*2-8], xm14
+ movu [t0+stride3q -8], xm22
+ lea t0, [t0+strideq*4]
+ vextracti128 [t0+strideq*0-8], ym24, 1
+ vextracti128 [t0+strideq*1-8], ym26, 1
+ vextracti128 [t0+strideq*2-8], ym2, 1
+ vextracti128 [t0+stride3q -8], ym3, 1
+ lea t0, [t0+strideq*4]
+ vextracti128 [t0+strideq*0-8], ym4, 1
+ vextracti128 [t0+strideq*1-8], ym5, 1
+ vextracti128 [t0+strideq*2-8], ym6, 1
+ vextracti128 [t0+stride3q -8], ym7, 1
+ lea t0, [t0+strideq*4]
+ vextracti128 [t0+strideq*0-8], ym8, 1
+ vextracti128 [t0+strideq*1-8], ym9, 1
+ vextracti128 [t0+strideq*2-8], ym10, 1
+ vextracti128 [t0+stride3q -8], ym11, 1
+ lea t0, [t0+strideq*4]
+ vextracti128 [t0+strideq*0-8], ym25, 1
+ vextracti128 [t0+strideq*1-8], ym13, 1
+ vextracti128 [t0+strideq*2-8], ym14, 1
+ vextracti128 [t0+stride3q -8], ym22, 1
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m24, 2
+ vextracti32x4 [t0+strideq*1-8], m26, 2
+ vextracti32x4 [t0+strideq*2-8], m2, 2
+ vextracti32x4 [t0+stride3q -8], m3, 2
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m4, 2
+ vextracti32x4 [t0+strideq*1-8], m5, 2
+ vextracti32x4 [t0+strideq*2-8], m6, 2
+ vextracti32x4 [t0+stride3q -8], m7, 2
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m8, 2
+ vextracti32x4 [t0+strideq*1-8], m9, 2
+ vextracti32x4 [t0+strideq*2-8], m10, 2
+ vextracti32x4 [t0+stride3q -8], m11, 2
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m25, 2
+ vextracti32x4 [t0+strideq*1-8], m13, 2
+ vextracti32x4 [t0+strideq*2-8], m14, 2
+ vextracti32x4 [t0+stride3q -8], m22, 2
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m24, 3
+ vextracti32x4 [t0+strideq*1-8], m26, 3
+ vextracti32x4 [t0+strideq*2-8], m2, 3
+ vextracti32x4 [t0+stride3q -8], m3, 3
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m4, 3
+ vextracti32x4 [t0+strideq*1-8], m5, 3
+ vextracti32x4 [t0+strideq*2-8], m6, 3
+ vextracti32x4 [t0+stride3q -8], m7, 3
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m8, 3
+ vextracti32x4 [t0+strideq*1-8], m9, 3
+ vextracti32x4 [t0+strideq*2-8], m10, 3
+ vextracti32x4 [t0+stride3q -8], m11, 3
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m25, 3
+ vextracti32x4 [t0+strideq*1-8], m13, 3
+ vextracti32x4 [t0+strideq*2-8], m14, 3
+ vextracti32x4 [t0+stride3q -8], m22, 3
+%endif
+%endif
+
+%elif %1 == 6
+ ; flat6 filter
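+ ; scalar equivalent of the outputs computed below:
+ ;   p1' = (3*p2 + 2*p1 + 2*p0 +   q0        + 4) >> 3
+ ;   p0' = (  p2 + 2*p1 + 2*p0 + 2*q0 +   q1 + 4) >> 3
+ ;   q0' = (  p1 + 2*p0 + 2*q0 + 2*q1 +   q2 + 4) >> 3
+ ;   q1' = (  p0 + 2*q0 + 2*q1 + 3*q2        + 4) >> 3
+ ; (m16 holds pw_4096 here, so the pmulhrsw performs the +4 rounding and >>3)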
+ vpbroadcastd m15, [pb_3_1]
+ vpbroadcastd m12, [pb_2]
+ punpcklbw m8, m13, m5
+ punpckhbw m11, m13, m5
+ pmaddubsw m0, m8, m15
+ pmaddubsw m1, m11, m15
+ punpcklbw m7, m4, m3
+ punpckhbw m10, m4, m3
+ pmaddubsw m2, m7, m12
+ pmaddubsw m12, m10, m12
+%ifidn %2, h
+ vpbroadcastd m15, [pb_m1_1]
+ %define pbm1_1 m15
+%endif
+ paddw m0, m2
+ paddw m1, m12
+ pmulhrsw m2, m0, m16
+ pmulhrsw m12, m1, m16
+ packuswb m2, m12
+ vpblendmb m2{k2}, m3, m2 ; p1
+%ifidn %2, v
+ mova [t0+strideq*2], m2
+%endif
+
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m11, pbm1_1
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m11, pbm1_1
+ paddw m0, m8
+ paddw m1, m11
+ pmulhrsw m12, m0, m16
+ pmulhrsw m13, m1, m16
+ packuswb m12, m13
+ vpblendmb m12{k2}, m4, m12 ; p0
+%ifidn %2, v
+ mova [t0+stride3q], m12
+%endif
+
+ vpbroadcastd m9, [pb_m1_2]
+ vpbroadcastd m4, [pb_m1_0]
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m3, m14
+ punpckhbw m11, m3, m14
+ pmaddubsw m14, m8, pbm1_1
+ pmaddubsw m13, m11, pbm1_1
+ paddw m0, m14
+ paddw m1, m13
+ pmulhrsw m14, m0, m16
+ pmulhrsw m13, m1, m16
+ packuswb m14, m13
+ vpblendmb m14{k2}, m5, m14 ; q0
+%ifidn %2, v
+ mova [dstq+strideq*0], m14
+%endif
+
+ pmaddubsw m8, m9
+ pmaddubsw m11, m9
+ paddw m0, m8
+ paddw m1, m11
+ pmaddubsw m7, m4
+ pmaddubsw m10, m4
+ paddw m0, m7
+ paddw m1, m10
+ pmulhrsw m0, m16
+ pmulhrsw m1, m16
+ packuswb m0, m1
+ vpblendmb m0{k2}, m6, m0 ; q1
+%ifidn %2, v
+ mova [dstq+strideq*1], m0
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
+%endif
+%else ; %1 == 4
+%ifidn %2, v
+ mova [t0+strideq*0], m3 ; p1
+ mova [t0+strideq*1], m4 ; p0
+ mova [t0+strideq*2], m5 ; q0
+ mova [t0+stride3q ], m6 ; q1
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
+%endif
+%endif
+%endmacro
+
+%define k7 k6
+
+INIT_ZMM avx512icl
+cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
+ lut, w, stride3, mstride
+ DECLARE_REG_TMP 9
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+ mova m21, [pb_4x0_4x4_4x8_4x12]
+ mova m20, [pb_mask]
+ vpbroadcastd m19, [pb_128]
+ vpbroadcastd m28, [pb_m1_1]
+ vpbroadcastd m27, [pw_2048]
+ %define pbshuf m21
+ %define pbmask m20
+ %define pb128 m19
+ %define pbm1_1 m28
+ %define pw2048 m27
+
+.loop:
+ cmp word [maskq+8], 0 ; vmask[2]
+ je .no_flat16
+
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call .v4
+
+.end:
+ add lq, 64
+ add dstq, 64
+ add maskq, 2
+ sub wd, 16
+ jg .loop
+ RET
+ALIGN function_align
+RESET_MM_PERMUTATION
+.v4:
+ FILTER 4, v
+ ret
+
+cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
+ lut, h, stride3, stride8
+ DECLARE_REG_TMP 9, 10, 11, 12
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea stride8q, [strideq*8]
+ kxnorw k6, k6, k6
+ vpbroadcastd m19, strided
+ vpbroadcastd m20, l_strided
+ pmulld m21, m19, [hmulA]
+ pmulld m20, [hmulB]
+ pmulld m19, [hmulC]
+ %define pbshuf [pb_4x0_4x4_4x8_4x12]
+ %define pbmask [pb_mask]
+ %define pb128 [pb_128]{bcstd}
+ shl l_strideq, 1
+
+.loop:
+ cmp word [maskq+8], 0 ; vmask[2]
+ je .no_flat16
+
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call .h4
+
+.end:
+ lea lq, [lq+l_strideq*8]
+ lea dstq, [dstq+stride8q*8]
+ add maskq, 2
+ sub hd, 16
+ jg .loop
+ RET
+ALIGN function_align
+RESET_MM_PERMUTATION
+.h4:
+ FILTER 4, h
+ ret
+
+cglobal lpf_v_sb_uv_8bpc, 7, 10, 22, dst, stride, mask, l, l_stride, \
+ lut, w, stride3, mstride
+ DECLARE_REG_TMP 9
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+ mova m21, [pb_4x0_4x4_4x8_4x12]
+ mova m20, [pb_mask]
+ vpbroadcastd m19, [pb_128]
+ vpbroadcastd m17, [pb_m1_1]
+ vpbroadcastd m16, [pw_4096]
+ %define pbshuf m21
+ %define pbmask m20
+ %define pb128 m19
+ %define pbm1_1 m17
+
+.loop:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx512icl).v4
+
+.end:
+ add lq, 64
+ add dstq, 64
+ add maskq, 2
+ sub wd, 16
+ jg .loop
+ RET
+
+%undef k7
+cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \
+ lut, h, stride3, stride8
+ DECLARE_REG_TMP 9, 10, 11
+ mov r7d, 0xffff
+ movzx r8d, r7b
+ cmp hd, 9
+ cmovb r7d, r8d
+ kmovw k6, r7d ; h > 8 ? 0xffff : 0x00ff
+ shl l_strideq, 2
+ sub lq, 4
+ kshiftrw k7, k6, 4 ; h > 8 ? 0xff : 0x0f

+ lea stride3q, [strideq*3]
+ lea stride8q, [strideq*8]
+ vpbroadcastd m19, strided
+ vpbroadcastd m20, l_strided
+ pmulld m21, m19, [hmulA]
+ pmulld m20, [hmulB]
+ pmulld m19, [hmulC]
+ mova m18, [pb_mask]
+ vpbroadcastd m17, [pb_128]
+ vpbroadcastd m16, [pw_4096]
+ %define pbshuf [pb_4x0_4x4_4x8_4x12]
+ %define pbmask m18
+ %define pb128 m17
+ add l_strideq, l_strideq
+
+.loop:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx512icl).h4
+
+.end:
+ lea lq, [lq+l_strideq*8]
+ lea dstq, [dstq+stride8q*8]
+ add maskq, 2
+ sub hd, 16
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/loopfilter_sse.asm b/third_party/dav1d/src/x86/loopfilter_sse.asm
new file mode 100644
index 0000000000..cd0eb54702
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter_sse.asm
@@ -0,0 +1,2348 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+pb_4x0_4x4_4x8_4x12: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+pb_7_1: times 8 db 7, 1
+pb_3_1: times 8 db 3, 1
+pb_2_1: times 8 db 2, 1
+pb_m1_0: times 8 db -1, 0
+pb_m1_1: times 8 db -1, 1
+pb_m1_2: times 8 db -1, 2
+pb_1: times 16 db 1
+pb_2: times 16 db 2
+pb_3: times 16 db 3
+pb_4: times 16 db 4
+pb_16: times 16 db 16
+pb_63: times 16 db 63
+pb_64: times 16 db 64
+pb_128: times 16 db 0x80
+pb_129: times 16 db 0x81
+pb_240: times 16 db 0xf0
+pb_248: times 16 db 0xf8
+pb_254: times 16 db 0xfe
+
+pw_2048: times 8 dw 2048
+pw_4096: times 8 dw 4096
+
+pd_mask: dd 1, 2, 4, 8
+
+SECTION .text
+
+%macro ABSSUB 4 ; dst, a, b, tmp
+ psubusb %1, %2, %3
+ psubusb %4, %3, %2
+ por %1, %4
+%endmacro
+
+%macro TRANSPOSE_16x4_AND_WRITE_4x16 5
+ ; transpose 16x4
+ punpcklbw m%5, m%1, m%2
+ punpckhbw m%1, m%2
+ punpcklbw m%2, m%3, m%4
+ punpckhbw m%3, m%4
+ punpcklwd m%4, m%5, m%2
+ punpckhwd m%5, m%2
+ punpcklwd m%2, m%1, m%3
+ punpckhwd m%1, m%3
+
+ ; write out
+%assign %%n 0
+%rep 4
+ movd [dstq+strideq *0-2], xm%4
+ movd [dstq+strideq *4-2], xm%5
+ movd [dstq+strideq *8-2], xm%2
+ movd [dstq+stride3q*4-2], xm%1
+ add dstq, strideq
+%if %%n < 3
+ psrldq xm%4, 4
+ psrldq xm%5, 4
+ psrldq xm%2, 4
+ psrldq xm%1, 4
+%endif
+%assign %%n (%%n+1)
+%endrep
+ lea dstq, [dstq+stride3q*4]
+%endmacro
+
+%macro TRANSPOSE_16X16B 2 ; output_transpose, mem
+%if %1 == 0
+ mova %2, m15 ; m7 in 32-bit
+%endif
+
+ ; input in m0-7
+ punpcklbw m15, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+%if ARCH_X86_64
+ SWAP 4, 5, 7
+%else
+ %if %1 == 0
+ mova m5, %2
+ %else
+ mova m5, [esp+1*16]
+ %endif
+ mova %2, m4
+%endif
+ punpcklbw m4, m6, m5
+ punpckhbw m6, m5
+
+ ; interleaved in m15,0,1,2,3,7,4,6
+ punpcklwd m5, m15, m1
+ punpckhwd m15, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m4
+ punpckhwd m3, m4
+%if ARCH_X86_64
+ SWAP 3, 4, 7
+%else
+ mova m4, %2
+ mova %2, m3
+%endif
+ punpcklwd m3, m4, m6
+ punpckhwd m4, m6
+
+ ; interleaved in m5,15,1,0,2,7,3,4
+ punpckldq m6, m5, m2
+ punpckhdq m5, m2
+%if ARCH_X86_64
+ SWAP 2, 7, 5
+%else
+ mova m2, %2
+ mova [esp+1*16], m5
+%endif
+ punpckldq m5, m15, m2
+ punpckhdq m15, m2
+ punpckldq m2, m1, m3
+ punpckhdq m1, m3
+ punpckldq m3, m0, m4
+ punpckhdq m0, m4
+
+%if ARCH_X86_32
+ mova [esp+0*16], m6
+ mova [esp+2*16], m5
+ mova [esp+3*16], m15
+ mova [esp+4*16], m2
+ mova [esp+5*16], m1
+ mova [esp+6*16], m3
+ mova [esp+7*16], m0
+ mova m8, [esp+ 8*16]
+ mova m9, [esp+ 9*16]
+ mova m10, [esp+10*16]
+ %if %1 == 0
+ mova m11, [esp+11*16]
+ mova m12, [esp+12*16]
+ mova m13, [esp+13*16]
+ mova m14, [esp+14*16]
+ %else
+ mova m11, [esp+20*16]
+ mova m12, [esp+15*16]
+ mova m13, [esp+16*16]
+ mova m14, [esp+17*16]
+ %endif
+%endif
+
+ ; input in m8-m15
+%if ARCH_X86_64
+ SWAP 7, 4
+%endif
+ punpcklbw m7, m8, m9
+ punpckhbw m8, m9
+ punpcklbw m9, m10, m11
+ punpckhbw m10, m11
+ punpcklbw m11, m12, m13
+ punpckhbw m12, m13
+%if ARCH_X86_64
+ mova m13, %2
+%else
+ %if %1 == 0
+ mova m13, [esp+15*16]
+ %else
+ mova m13, [esp+18*16]
+ %endif
+%endif
+ mova %2, m12
+ punpcklbw m12, m14, m13
+ punpckhbw m14, m14, m13
+
+ ; interleaved in m7,8,9,10,11,rsp%2,12,14
+ punpcklwd m13, m7, m9
+ punpckhwd m7, m9
+ punpcklwd m9, m8, m10
+ punpckhwd m8, m10
+ punpcklwd m10, m11, m12
+ punpckhwd m11, m12
+ mova m12, %2
+ mova %2, m11
+ punpcklwd m11, m12, m14
+ punpckhwd m12, m14
+
+ ; interleaved in m13,7,9,8,10,rsp%2,11,12
+ punpckldq m14, m13, m10
+ punpckhdq m13, m10
+ punpckldq m10, m9, m11
+ punpckhdq m9, m11
+ punpckldq m11, m8, m12
+ punpckhdq m8, m12
+ mova m12, %2
+ mova %2, m8
+ punpckldq m8, m7, m12
+ punpckhdq m7, m12
+
+%if ARCH_X86_32
+ mova [esp+ 8*16], m10
+ mova [esp+ 9*16], m9
+ mova [esp+10*16], m11
+ SWAP 6, 1
+ SWAP 4, 2
+ SWAP 5, 3
+ mova m6, [esp+0*16]
+ mova m4, [esp+1*16]
+ mova m5, [esp+2*16]
+%endif
+
+ ; interleaved in m6,7,5,15,2,1,3,0,14,13,10,9,11,rsp%2,8,7
+ punpcklqdq m12, m6, m14
+ punpckhqdq m6, m14
+ punpcklqdq m14, m4, m13
+ punpckhqdq m4, m13
+ punpcklqdq m13, m5, m8
+ punpckhqdq m5, m8
+%if ARCH_X86_64
+ SWAP 8, 5
+%else
+ mova m8, [esp+3*16]
+ mova [esp+27*16], m5
+ %define m15 m8
+%endif
+ punpcklqdq m5, m15, m7
+ punpckhqdq m15, m7
+
+%if ARCH_X86_32
+ mova [esp+11*16], m12
+ mova [esp+12*16], m6
+ mova [esp+13*16], m14
+ mova [esp+14*16], m4
+ mova [esp+26*16], m13
+ mova [esp+ 0*16], m5
+ mova [esp+ 1*16], m15
+ mova m2, [esp+ 4*16]
+ mova m10, [esp+ 8*16]
+ mova m1, [esp+ 5*16]
+ mova m9, [esp+ 9*16]
+ mova m3, [esp+ 6*16]
+ mova m11, [esp+10*16]
+ mova m0, [esp+ 7*16]
+%endif
+
+ punpcklqdq m7, m2, m10
+ punpckhqdq m2, m10
+ punpcklqdq m10, m1, m9
+ punpckhqdq m1, m9
+ punpcklqdq m9, m3, m11
+ punpckhqdq m3, m11
+ mova m11, %2
+%if ARCH_X86_32
+ %define m12 m3
+%endif
+ mova %2, m12
+ punpcklqdq m12, m0, m11
+ punpckhqdq m0, m11
+%if %1 == 1
+ mova m11, %2
+%endif
+
+%if ARCH_X86_64
+ ; interleaved m11,6,14,4,13,8,5,15,7,2,10,1,9,3,12,0
+ SWAP 0, 11, 1, 6, 5, 8, 7, 15
+ SWAP 2, 14, 12, 9
+ SWAP 3, 4, 13
+%else
+ %if %1 == 0
+ mova [esp+15*16], m9
+ mova [esp+17*16], m12
+ mova [esp+18*16], m0
+ mova [esp+28*16], m10
+ mova [esp+29*16], m1
+ mova m3, [esp+0*16]
+ mova m4, [esp+1*16]
+ SWAP m5, m7
+ SWAP m6, m2
+ %else
+ SWAP 0, 7
+ SWAP 3, 1, 2, 4, 6
+ %endif
+%endif
+%endmacro
+
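+; FILTER processes one 16-pixel-wide edge segment:
+;  1) load the p*/q* rows (for dir=h the segment is transposed first),
+;  2) build the E/I/H thresholds from the per-block filter level and derive
+;     the fm / flat8 / flat16 decision masks,
+;  3) apply the narrow filter, then flat6/flat8/flat16 smoothing where selected,
+;  4) store the result (transposing back for dir=h).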
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+%if ARCH_X86_64
+ %define %%flat8mem [rsp+0*16]
+ %define %%q2mem [rsp+1*16]
+ %define %%q3mem [rsp+2*16]
+%else
+ %if %1 == 4 || %1 == 6
+ %define %%p2mem [esp+ 8*16]
+ %define %%q2mem [esp+ 9*16]
+ %define %%flat8mem [esp+10*16]
+ %else
+ %ifidn %2, v
+ %define %%p2mem [esp+16*16]
+ %define %%q2mem [esp+ 1*16]
+ %define %%q3mem [esp+18*16]
+ %define %%flat8mem [esp+ 0*16]
+ %define %%flat16mem [esp+20*16]
+ %else
+ %define %%p2mem [esp+27*16]
+ %define %%q2mem [esp+28*16]
+ %define %%q3mem [esp+29*16]
+ %define %%flat8mem [esp+21*16]
+ %define %%flat16mem [esp+30*16]
+ %endif
+ %endif
+ %xdefine m12reg m12
+%endif
+
+%if ARCH_X86_32
+ lea stride3q, [strideq*3]
+%endif
+ ; load data
+%ifidn %2, v
+%if ARCH_X86_32
+ mov mstrideq, strideq
+ neg mstrideq
+%endif
+%if %1 == 4
+ lea tmpq, [dstq+mstrideq*2]
+ mova m3, [tmpq+strideq*0] ; p1
+ mova m4, [tmpq+strideq*1] ; p0
+ mova m5, [tmpq+strideq*2] ; q0
+ mova m6, [tmpq+stride3q] ; q1
+%else
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+ lea tmpq, [dstq+mstrideq*4]
+ ; we load p3 later
+%define %%p3mem [dstq+mstrideq*4]
+ %if ARCH_X86_32
+ %define m13 m0
+ %define m14 m1
+ %define m15 m2
+ %endif
+ mova m13, [tmpq+strideq*1]
+ mova m3, [tmpq+strideq*2]
+ mova m4, [tmpq+stride3q]
+ mova m5, [dstq+strideq*0]
+ mova m6, [dstq+strideq*1]
+ mova m14, [dstq+strideq*2]
+%if %1 != 6
+ mova m15, [dstq+stride3q]
+%endif
+ %if ARCH_X86_32
+ mova %%p2mem, m13
+ mova %%q2mem, m14
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ %if %1 != 6
+ mova %%q3mem, m15
+ %define m15 %%q3mem
+ %endif
+ %endif
+%endif
+%else ; %2 == h
+ ; load lines
+%if %1 == 4
+ ; transpose 4x16
+ movd m7, [dstq+strideq*0-2]
+ movd m3, [dstq+strideq*1-2]
+ movd m4, [dstq+strideq*2-2]
+ movd m5, [dstq+stride3q -2]
+ lea tmpq, [dstq+strideq*4]
+ punpcklbw m7, m3
+ punpcklbw m4, m5
+ movd m3, [tmpq+strideq*0-2]
+ movd m1, [tmpq+strideq*1-2]
+ movd m5, [tmpq+strideq*2-2]
+ movd m6, [tmpq+stride3q -2]
+ lea tmpq, [tmpq+strideq*4]
+ punpcklbw m3, m1
+ punpcklbw m5, m6
+ movd m0, [tmpq+strideq*0-2]
+ movd m1, [tmpq+strideq*1-2]
+ punpcklbw m0, m1
+ movd m1, [tmpq+strideq*2-2]
+ movd m2, [tmpq+stride3q -2]
+ punpcklbw m1, m2
+ punpcklqdq m7, m0
+ punpcklqdq m4, m1
+ lea tmpq, [tmpq+strideq*4]
+ movd m0, [tmpq+strideq*0-2]
+ movd m1, [tmpq+strideq*1-2]
+ punpcklbw m0, m1
+ movd m1, [tmpq+strideq*2-2]
+ movd m2, [tmpq+stride3q -2]
+ punpcklbw m1, m2
+ punpcklqdq m3, m0
+ punpcklqdq m5, m1
+ ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9
+ ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13
+ ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11
+ ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15
+ punpcklwd m6, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m3, m5
+ punpckhwd m3, m5
+ ; xm6: A0-3,B0-3,C0-3,D0-3
+ ; xm7: A8-11,B8-11,C8-11,D8-11
+ ; xm4: A4-7,B4-7,C4-7,D4-7
+ ; xm3: A12-15,B12-15,C12-15,D12-15
+ punpckldq m5, m6, m4
+ punpckhdq m6, m4
+ punpckldq m4, m7, m3
+ punpckhdq m7, m3
+ ; xm5: A0-7,B0-7
+ ; xm6: C0-7,D0-7
+ ; xm4: A8-15,B8-15
+ ; xm7: C8-15,D8-15
+ punpcklqdq m3, m5, m4
+ punpckhqdq m5, m5, m4
+ punpcklqdq m4, m6, m7
+ punpckhqdq m6, m7
+ ; xm3: A0-15
+ ; xm5: B0-15
+ ; xm4: C0-15
+ ; xm6: D0-15
+ SWAP 4, 5
+%elif %1 == 6 || %1 == 8
+ ; transpose 8x16
+ movq m7, [dstq+strideq*0-%1/2]
+ movq m3, [dstq+strideq*1-%1/2]
+ movq m4, [dstq+strideq*2-%1/2]
+ movq m5, [dstq+stride3q -%1/2]
+ lea tmpq, [dstq+strideq*8]
+ punpcklbw m7, m3
+ punpcklbw m4, m5
+ movq m3, [tmpq+strideq*0-%1/2]
+ movq m1, [tmpq+strideq*1-%1/2]
+ movq m5, [tmpq+strideq*2-%1/2]
+ movq m6, [tmpq+stride3q -%1/2]
+ lea tmpq, [dstq+strideq*4]
+ punpcklbw m3, m1
+ punpcklbw m5, m6
+ movq m6, [tmpq+strideq*0-%1/2]
+ movq m0, [tmpq+strideq*1-%1/2]
+ movq m1, [tmpq+strideq*2-%1/2]
+ movq m2, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ punpcklbw m6, m0
+ punpcklbw m1, m2
+ movq m2, [tmpq+strideq*2-%1/2]
+ movq m0, [tmpq+stride3q -%1/2]
+ punpcklbw m2, m0
+%if ARCH_X86_64
+ SWAP m15, m2
+%else
+ %define m15 [esp+3*16]
+ mova m15, m2
+%endif
+ movq m0, [tmpq+strideq*0-%1/2]
+ movq m2, [tmpq+strideq*1-%1/2]
+ punpcklbw m0, m2
+ ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
+ ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
+ ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
+ ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
+ ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
+ ; xm0: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
+ ; xm1: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
+ ; xm2: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
+ punpcklwd m2, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m6, m1
+ punpckhwd m6, m1
+ punpcklwd m1, m0, m15
+ punpckhwd m0, m15
+%if ARCH_X86_64
+ SWAP m15, m0
+%else
+ mova m15, m0
+%endif
+ ; xm2: A0-3,B0-3,C0-3,D0-3
+ ; xm7: E0-3,F0-3,G0-3,H0-3
+ ; xm4: A8-11,B8-11,C8-11,D8-11
+ ; xm3: E8-11,F8-11,G8-11,H8-11
+ ; xm5: A4-7,B4-7,C4-7,D4-7
+ ; xm6: E4-7,F4-7,G4-7,H4-7
+ ; xm1: A12-15,B12-15,C12-15,D12-15
+ ; xm0: E12-15,F12-15,G12-15,H12-15
+ punpckldq m0, m2, m5
+ punpckhdq m2, m5
+ punpckldq m5, m7, m6
+%if %1 != 6
+ punpckhdq m7, m6
+%endif
+ punpckldq m6, m4, m1
+ punpckhdq m4, m1
+ punpckldq m1, m3, m15
+%if %1 != 6
+ punpckhdq m3, m15
+ %if ARCH_X86_64
+ SWAP m15, m3
+ %else
+ mova m15, m3
+ %endif
+%endif
+ ; xm0: A0-7,B0-7
+ ; xm2: C0-7,D0-7
+ ; xm5: E0-7,F0-7
+ ; xm7: G0-7,H0-7
+ ; xm6: A8-15,B8-15
+ ; xm4: C8-15,D8-15
+ ; xm1: E8-15,F8-15
+ ; xm3: G8-15,H8-15
+ punpcklqdq m3, m0, m6
+ punpckhqdq m0, m6
+ punpckhqdq m6, m2, m4
+ punpcklqdq m2, m4
+ punpcklqdq m4, m5, m1
+ punpckhqdq m5, m1
+%if %1 == 8
+ punpcklqdq m1, m7, m15
+ punpckhqdq m7, m15
+ ; xm3: A0-15
+ ; xm0: B0-15
+ ; xm2: C0-15
+ ; xm6: D0-15
+ ; xm4: E0-15
+ ; xm5: F0-15
+ ; xm1: G0-15
+ ; xm7: H0-15
+%if ARCH_X86_64
+ SWAP 11, 3, 2
+ SWAP 13, 0
+ SWAP 6, 5, 4
+ SWAP 14, 1
+ SWAP 15, 7
+ ; 3,0,2,6,4,5,1,7 -> 11,13,3,4,5,6,14,15
+ mova [rsp+21*16], m11
+ %define %%p3mem [rsp+21*16]
+%else
+ %define m11 [esp+26*16]
+ %define m13 [esp+27*16]
+ %define m14 [esp+28*16]
+ %define m15 [esp+29*16]
+ mova m11, m3
+ mova m13, m0
+ SWAP 3, 2
+ SWAP 6, 5, 4
+ mova m14, m1
+ mova m15, m7
+ %define %%p3mem [esp+26*16]
+%endif
+%else
+ %if ARCH_X86_64
+ SWAP 13, 3, 0
+ SWAP 14, 5, 6, 4, 2
+ ; 3,0,2,6,4,5 -> 13,3,4,5,6,14
+ %else
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ mova m13, m3
+ mova m14, m5
+ SWAP 3, 0
+ SWAP 5, 6, 4, 2
+ ; 0,2,6,4 -> 3,4,5,6
+ %endif
+%endif
+%else
+%if ARCH_X86_64
+ mova [rsp+20*16], m12
+%endif
+ ; load and 16x16 transpose. We only use 14 pixels but we'll need the
+ ; remainder at the end for the second transpose
+%if ARCH_X86_32
+ %xdefine m8 m0
+ %xdefine m9 m1
+ %xdefine m10 m2
+ %xdefine m11 m3
+ %xdefine m12 m4
+ %xdefine m13 m5
+ %xdefine m14 m6
+ %xdefine m15 m7
+ lea tmpq, [dstq+strideq*8]
+ movu m8, [tmpq+strideq*0-8]
+ movu m9, [tmpq+strideq*1-8]
+ movu m10, [tmpq+strideq*2-8]
+ movu m11, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ movu m12, [tmpq+strideq*0-8]
+ movu m13, [tmpq+strideq*1-8]
+ movu m14, [tmpq+strideq*2-8]
+ movu m15, [tmpq+stride3q -8]
+ mova [esp+ 8*16], m8
+ mova [esp+ 9*16], m9
+ mova [esp+10*16], m10
+ mova [esp+11*16], m11
+ mova [esp+12*16], m12
+ mova [esp+13*16], m13
+ mova [esp+14*16], m14
+ mova [esp+15*16], m15
+%endif
+ movu m0, [dstq+strideq*0-8]
+ movu m1, [dstq+strideq*1-8]
+ movu m2, [dstq+strideq*2-8]
+ movu m3, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4]
+ movu m4, [tmpq+strideq*0-8]
+ movu m5, [tmpq+strideq*1-8]
+ movu m6, [tmpq+strideq*2-8]
+ movu m7, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+%if ARCH_X86_64
+ movu m8, [tmpq+strideq*0-8]
+ movu m9, [tmpq+strideq*1-8]
+ movu m10, [tmpq+strideq*2-8]
+ movu m11, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ movu m12, [tmpq+strideq*0-8]
+ movu m13, [tmpq+strideq*1-8]
+ movu m14, [tmpq+strideq*2-8]
+ movu m15, [tmpq+stride3q -8]
+%endif
+
+%if ARCH_X86_64
+ TRANSPOSE_16X16B 0, [rsp+11*16]
+ mova [rsp+12*16], m1
+ mova [rsp+13*16], m2
+ mova [rsp+14*16], m3
+ mova [rsp+15*16], m12
+ mova [rsp+16*16], m13
+ mova [rsp+17*16], m14
+ mova [rsp+18*16], m15
+ ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
+ SWAP 12, 4, 7
+ SWAP 13, 5, 8
+ SWAP 3, 6, 9
+ SWAP 10, 14
+ SWAP 11, 15
+ mova [rsp+21*16], m12
+ %define %%p3mem [rsp+21*16]
+ mova m12, [rsp+20*16]
+%else
+ TRANSPOSE_16X16B 0, [esp+16*16]
+ %define %%p3mem [esp+26*16]
+ %define m11 %%p3mem
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ %define m15 %%q3mem
+%endif
+%endif ; if 4 elif 6 or 8 else 16
+%endif ; if v else h
+
+ ; load L/E/I/H
+%if ARCH_X86_32
+ mov l_strideq, l_stridem
+%endif
+%ifidn %2, v
+ movu m1, [lq]
+ movu m0, [lq+l_strideq]
+%else
+ %if ARCH_X86_32
+ lea l_stride3q, [l_strideq*3]
+ %endif
+ movq xm1, [lq]
+ movq xm2, [lq+l_strideq*2]
+ movhps xm1, [lq+l_strideq]
+ movhps xm2, [lq+l_stride3q]
+ shufps m0, m1, m2, q3131
+ shufps m1, m2, q2020
+ %if ARCH_X86_32
+ lea stride3q, [strideq*3]
+ %endif
+%endif
+
+%if ARCH_X86_32
+ %ifidn %2, v
+ mov lutd, lutm
+ %endif
+%endif
+ pxor m2, m2
+ pcmpeqb m7, m2, m0
+ pand m1, m7
+ por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, [PIC_sym(pb_4x0_4x4_4x8_4x12)] ; l[x][1]
+ pcmpeqb m2, m0 ; !L
+ psrlq m7, m0, [lutq+128]
+ pand m7, [PIC_sym(pb_63)]
+ pminub m7, minlvl
+ pmaxub m7, [PIC_sym(pb_1)] ; I
+ pand m1, m0, [PIC_sym(pb_240)]
+ psrlq m1, 4 ; H
+ paddb m0, [PIC_sym(pb_2)]
+ paddb m0, m0
+ paddb m0, m7 ; E
+ pxor m1, [PIC_sym(pb_128)]
+ pxor m7, [PIC_sym(pb_128)]
+ pxor m0, [PIC_sym(pb_128)]
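+ ; at this point: I = clip(L >> sharp[0], 1, sharp[1]) (pb_63 drops bits shifted
+ ; in from the neighbouring byte), H = L >> 4 and E = 2*L + I + 4; the pb_128
+ ; XORs above bias them so unsigned values can be compared with pcmpgtb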
+ SWAP 2, 7
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 2, 10
+%else
+ %ifidn %2, v
+ mov mstrideq, strideq
+ neg mstrideq
+ %if %1 == 4
+ lea tmpq, [dstq+mstrideq*2]
+ %elif %1 == 6 || %1 == 8
+ lea tmpq, [dstq+mstrideq*4]
+ %endif
+ %endif
+ mova [esp+3*16], m0
+ mova [esp+4*16], m2
+%endif
+
+ ABSSUB m0, m3, m4, m2 ; abs(p1-p0)
+ pmaxub m0, m7
+ ABSSUB m2, m5, m6, m7 ; abs(q1-q0)
+ pmaxub m0, m2
+%if %1 == 4
+ pxor m0, [PIC_sym(pb_128)]
+ pcmpgtb m7, m0, m1 ; hev
+ %if ARCH_X86_64
+ SWAP 7, 11
+ %else
+ mova [esp+5*16], m7
+ %endif
+%else
+ pxor m7, m0, [PIC_sym(pb_128)]
+ pcmpgtb m7, m1 ; hev
+%if ARCH_X86_64
+ SWAP 7, 11
+%else
+ mova [esp+5*16], m7
+%endif
+
+%if %1 == 6
+ ABSSUB m1, m13, m4, m7 ; abs(p2-p0)
+ pmaxub m1, m0
+%else
+ mova m2, %%p3mem
+ ABSSUB m1, m2, m4, m7 ; abs(p3-p0)
+ pmaxub m1, m0
+ ABSSUB m7, m13, m4, m2 ; abs(p2-p0)
+ pmaxub m1, m7
+%endif
+ ABSSUB m7, m5, m14, m2 ; abs(q2-q0)
+ pmaxub m1, m7
+%if %1 != 6
+ ABSSUB m7, m5, m15, m2 ; abs(q3-q0)
+ pmaxub m1, m7
+%endif
+ pxor m1, [PIC_sym(pb_128)]
+ pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8in
+%if ARCH_X86_64
+ SWAP 1, 9
+%else
+ mova [esp+6*16], m1
+%endif
+
+%if %1 == 6
+ ABSSUB m7, m13, m3, m1 ; abs(p2-p1)
+%else
+ mova m2, %%p3mem
+ ABSSUB m7, m2, m13, m1 ; abs(p3-p2)
+ ABSSUB m2, m13, m3, m1 ; abs(p2-p1)
+ pmaxub m7, m2
+ ABSSUB m2, m14, m15, m1 ; abs(q3-q2)
+ pmaxub m7, m2
+%endif
+ ABSSUB m2, m14, m6, m1 ; abs(q2-q1)
+ pmaxub m7, m2
+%if ARCH_X86_32
+ %define m12 m1
+ mova m12, maskmem
+%endif
+ pand m2, m12, mask1
+ pcmpeqd m2, m12
+ pand m7, m2 ; only apply fm-wide to wd>4 blocks
+ pmaxub m0, m7
+
+ pxor m0, [PIC_sym(pb_128)]
+%endif ; %if %1 == 4 else
+%if ARCH_X86_64
+ SWAP 2, 10
+ pcmpgtb m0, m2
+%else
+ pcmpgtb m0, [esp+4*16]
+%endif
+
+ ABSSUB m1, m3, m6, m7 ; abs(p1-q1)
+ ABSSUB m7, m4, m5, m2 ; abs(p0-q0)
+ paddusb m7, m7
+ pand m1, [PIC_sym(pb_254)]
+ psrlq m1, 1
+ paddusb m1, m7 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ pxor m1, [PIC_sym(pb_128)]
+%if ARCH_X86_64
+ pcmpgtb m1, m8 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+%else
+ pcmpgtb m1, [esp+3*16]
+%endif
+ por m0, m1
+
+%if %1 == 16
+%if ARCH_X86_64
+ SWAP 0, 8
+%else
+ mova [esp+3*16], m0
+%endif
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1]
+%else
+ mova m0, [rsp+12*16]
+%endif
+ ABSSUB m1, m0, m4, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+13*16]
+%endif
+ ABSSUB m2, m0, m4, m7
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+stride3q]
+%else
+ mova m0, [rsp+14*16]
+%endif
+ ABSSUB m2, m0, m4, m7
+ pmaxub m1, m2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+ mova m0, [tmpq+strideq*0]
+%else
+ mova m0, [rsp+15*16]
+%endif
+ ABSSUB m2, m0, m5, m7
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*1]
+%else
+ mova m0, [rsp+16*16]
+%endif
+ ABSSUB m2, m0, m5, m7
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+17*16]
+%endif
+ ABSSUB m2, m0, m5, m7
+ pmaxub m1, m2
+ pxor m1, [PIC_sym(pb_128)]
+ pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8out
+%if ARCH_X86_64
+ por m1, m9 ; !flat8in | !flat8out
+%else
+ por m1, [esp+6*16]
+ %define m12 m7
+ mova m12, maskmem
+%endif
+ pand m2, m12, mask2
+ pcmpeqd m2, m12
+ pandn m1, m2 ; flat16
+%if ARCH_X86_64
+ pandn m2, m8, m1 ; flat16 & fm
+%else
+ pandn m2, [esp+3*16], m1 ; flat16 & fm
+ mova %%flat16mem, m2
+%endif
+ SWAP 1, 2
+
+ pand m2, m12, mask1
+ pcmpeqd m2, m12
+%if ARCH_X86_64
+ pandn m9, m2 ; flat8in
+ pandn m2, m8, m9
+ SWAP 2, 9
+%else
+ pandn m0, [esp+6*16], m2
+ pandn m2, [esp+3*16], m0
+ mova [esp+6*16], m2
+%endif
+ pand m2, m12, mask0
+ pcmpeqd m2, m12
+%if ARCH_X86_64
+ pandn m8, m2
+ pandn m2, m9, m8 ; fm & !flat8 & !flat16
+ SWAP 2, 8
+ pandn m2, m1, m9 ; flat8 & !flat16
+ SWAP 2, 9
+ SWAP 0, 8
+ SWAP 1, 10
+%else
+ pandn m0, [esp+3*16], m2
+ pandn m2, [esp+6*16], m0
+ SWAP 2, 0
+ pandn m2, m1, [esp+6*16]
+ mova %%flat8mem, m2
+%endif
+%elif %1 != 4
+ %if ARCH_X86_64
+ SWAP 1, 9
+ %else
+ %define m12 m7
+ mova m12, maskmem
+ mova m1, [esp+6*16]
+ %endif
+ pand m2, m12, mask1
+ pcmpeqd m2, m12
+ pandn m1, m2
+ pandn m2, m0, m1 ; flat8 & fm
+ pand m1, m12, mask0
+ pcmpeqd m1, m12
+ pandn m0, m1
+ pandn m1, m2, m0 ; fm & !flat8
+ SWAP 1, 2, 0
+ %if ARCH_X86_64
+ SWAP 1, 9
+ %else
+ mova %%flat8mem, m1
+ %endif
+%else
+%if ARCH_X86_32
+ %define m12 m1
+ mova m12, maskmem
+%endif
+ pand m2, m12, mask0
+ pcmpeqd m2, m12
+ pandn m0, m2 ; fm
+%endif
+
+ ; short filter
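+ ; scalar equivalent (in the 0x80-biased signed domain used below):
+ ;   f  = iclip_diff(p1 - q1) & hev
+ ;   f  = iclip_diff(3 * (q0 - p0) + f) & fm
+ ;   f1 = iclip_diff(f + 4) >> 3, f2 = iclip_diff(f + 3) >> 3
+ ;   p0 += f2, q0 -= f1; if !hev: p1 += (f1+1)>>1, q1 -= (f1+1)>>1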
+
+ mova m1, [PIC_sym(pb_128)]
+%if ARCH_X86_64
+ SWAP 7, 11
+%else
+ mova m7, [esp+5*16]
+%endif
+ pxor m3, m1
+ pxor m6, m1
+ pxor m4, m1
+ pxor m5, m1
+ psubsb m1, m3, m6 ; iclip_diff(p1-q1)
+ pand m1, m7 ; f=iclip_diff(p1-q1)&hev
+ psubsb m2, m5, m4
+ paddsb m1, m2
+ paddsb m1, m2
+ paddsb m1, m2 ; f=iclip_diff(3*(q0-p0)+f)
+ mova m2, [PIC_sym(pb_16)]
+ pand m0, m1 ; f&=fm
+ paddsb m1, m0, [PIC_sym(pb_3)]
+ paddsb m0, [PIC_sym(pb_4)]
+ pand m1, [PIC_sym(pb_248)]
+ pand m0, [PIC_sym(pb_248)]
+ psrlq m1, 3
+ psrlq m0, 3
+ pxor m1, m2
+ pxor m0, m2
+ psubb m1, m2 ; f2
+ psubb m0, m2 ; f1
+ mova m2, [PIC_sym(pb_128)]
+ paddsb m4, m1
+ psubsb m5, m0
+ pxor m4, m2
+ pxor m5, m2
+
+ pxor m0, m2
+ pxor m1, m1
+ pavgb m0, m1 ; f=(f1+1)>>1
+ psubb m0, [PIC_sym(pb_64)]
+ pandn m7, m0 ; f&=!hev
+ paddsb m3, m7
+ psubsb m6, m7
+ pxor m3, m2
+ pxor m6, m2
+
+%if %1 == 16
+ ; flat16 filter
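+ ; each flat16 output is a 16-tap running sum, starting from
+ ; p5' = (7*p6 + 2*p5 + 2*p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4; steps A-L
+ ; below slide the window by adding/subtracting one pixel pair per output (the
+ ; comments track the current sum), with pmulhrsw by pw_2048 doing the +8, >>4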
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1] ; p6
+ mova m2, [tmpq+strideq*2] ; p5
+ mova m7, [tmpq+stride3q] ; p4
+%else
+ mova m0, [rsp+12*16]
+ mova m2, [rsp+13*16]
+ mova m7, [rsp+14*16]
+%endif
+
+%if ARCH_X86_64
+ SWAP 1, 10
+ mova %%flat8mem, m9
+ mova %%q2mem, m14
+ mova %%q3mem, m15
+ SWAP 0, 8
+ SWAP 1, 9
+%else
+ %ifidn %2, v
+ mova [esp+17*16], m0
+ mova [esp+19*16], m3
+ mova [esp+21*16], m4
+ mova [esp+22*16], m5
+ mova [esp+23*16], m6
+ %xdefine m11 m3
+ %xdefine m14 m4
+ %xdefine m15 m5
+ %xdefine m10 m6
+ %define m13 %%p2mem
+ %define m8 [esp+17*16]
+ %define m9 %%flat16mem
+ %define m3 [esp+19*16]
+ %define m4 [esp+21*16]
+ %define m5 [esp+22*16]
+ %define m6 [esp+23*16]
+ %else
+ mova [esp+31*16], m0
+ mova [esp+32*16], m3
+ mova [esp+33*16], m4
+ mova [esp+34*16], m5
+ mova [esp+35*16], m6
+ %xdefine m11 m3
+ %xdefine m14 m4
+ %xdefine m15 m5
+ %xdefine m10 m6
+ %define m13 %%p2mem
+ %define m8 [esp+31*16]
+ %define m9 %%flat16mem
+ %define m3 [esp+32*16]
+ %define m4 [esp+33*16]
+ %define m5 [esp+34*16]
+ %define m6 [esp+35*16]
+ %endif
+%endif
+
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
+ ; write -6
+ mova m11, %%p3mem
+%if ARCH_X86_64
+ punpcklbw m14, m8, m11
+ punpckhbw m15, m8, m11
+%else
+ punpcklbw m14, m0, m11
+ punpckhbw m15, m0, m11
+%endif
+%ifidn %2, v
+ mova [rsp+5*16], m11
+%endif
+ pmaddubsw m10, m14, [PIC_sym(pb_7_1)]
+ pmaddubsw m11, m15, [PIC_sym(pb_7_1)] ; p6*7+p3
+ punpcklbw m0, m2, m7
+ punpckhbw m1, m2, m7
+ pmaddubsw m0, [PIC_sym(pb_2)]
+ pmaddubsw m1, [PIC_sym(pb_2)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*7+p5*2+p4*2+p3
+ punpcklbw m0, m13, m3
+ punpckhbw m1, m13, m3
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1
+ punpcklbw m0, m4, m5
+ punpckhbw m1, m4, m5
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m2
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+strideq*2], m0 ; p5
+%else
+ mova [rsp+13*16], m0
+%endif
+
+ ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
+ ; write -5
+ pmaddubsw m14, [PIC_sym(pb_m1_1)]
+ pmaddubsw m15, [PIC_sym(pb_m1_1)]
+ paddw m10, m14
+ paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+ punpcklbw m0, m8, m6
+ punpckhbw m1, m8, m6
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ mova [rsp+3*16], m0
+ mova [rsp+4*16], m1
+ paddw m10, m0
+ paddw m11, m1 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m7
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+stride3q], m0 ; p4
+%else
+ mova [rsp+14*16], m0
+%endif
+
+ ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
+ ; write -4
+ mova m14, %%q2mem
+ punpcklbw m0, m8, m13
+ punpckhbw m1, m8, m13
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
+ punpcklbw m0, m2, m14
+ punpckhbw m2, m14
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m2, [PIC_sym(pb_m1_1)]
+ mova [rsp+1*16], m0
+ paddw m10, m0
+ paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, %%p3mem
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+strideq*4], m0 ; p3
+%else
+ mova [rsp+19*16], m0
+%endif
+
+ ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
+ ; write -3
+ mova m15, %%q3mem
+ punpcklbw m0, m8, m3
+ punpckhbw m1, m8, m3
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
+ punpcklbw m0, m7, m15
+ punpckhbw m7, m15
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m7, [PIC_sym(pb_m1_1)]
+ mova [rsp+2*16], m0
+%if ARCH_X86_32
+ %ifidn %2, v
+ mova [esp+24*16], m7
+ %else
+ mova [esp+36*16], m7
+ %endif
+%endif
+ paddw m10, m0
+ paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m13
+ por m0, m1
+ mova [rsp+6*16], m0 ; don't clobber p2/m13 since we need it in F
+
+ ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
+ ; write -2
+ punpcklbw m0, m8, m4
+ punpckhbw m1, m8, m4
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
+%if ARCH_X86_64
+ SWAP 7, 8
+%endif
+%ifidn %2, v
+ mova m1, [dstq+strideq*4] ; q4
+ mova m7, [rsp+5*16] ; (pre-filter) p3
+%else
+ mova m1, [rsp+15*16]
+ mova m7, %%p3mem ; (pre-filter) p3
+%endif
+ punpcklbw m0, m1, m7
+ punpckhbw m1, m1, m7
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ mova [rsp+7*16], m0
+ mova [rsp+5*16], m1
+ psubw m10, m0
+ psubw m11, m1 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m3
+ por m0, m1
+ mova [rsp+8*16], m0 ; don't clobber p1/m3 since we need it in G
+
+ ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
+ ; write -1
+%ifidn %2, v
+ mova m7, [tmpq+strideq*1] ; p6
+ lea tmpq, [dstq+strideq*4]
+ mova m1, [tmpq+strideq*1] ; q5
+%else
+ mova m7, [rsp+12*16] ; p6
+ mova m1, [rsp+16*16]
+%endif
+ punpcklbw m0, m7, m5
+ punpckhbw m7, m5
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m7, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m7 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+ punpcklbw m7, m13, m1
+ pmaddubsw m7, [PIC_sym(pb_m1_1)]
+ mova [rsp+9*16], m7
+ paddw m10, m7
+%if ARCH_X86_64
+ punpckhbw m13, m1
+ mova m1, [rsp+6*16]
+ SWAP 1, 13
+%else
+ punpckhbw m7, m13, m1
+ mova m1, [esp+6*16]
+ mova m13, m1
+ SWAP 1, 7
+%endif
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ mova [rsp+10*16], m1
+ paddw m11, m1 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
+ pmulhrsw m7, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m0, m11, [PIC_sym(pw_2048)]
+ packuswb m7, m0
+ pand m7, m9
+ pandn m0, m9, m4
+ por m7, m0
+ mova [rsp+6*16], m7 ; don't clobber p0/m4 since we need it in H
+
+ ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
+ ; write +0
+%ifidn %2, v
+ mova m7, [tmpq+strideq*2] ; q6
+%else
+ mova m7, [rsp+17*16]
+%endif
+ paddw m10, [rsp+3*16]
+ paddw m11, [rsp+4*16] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
+ punpcklbw m0, m3, m7
+ punpckhbw m1, m3, m7
+%if ARCH_X86_64
+ mova m3, [rsp+8*16]
+%endif
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ mova [rsp+3*16], m0
+ mova [rsp+4*16], m1
+ paddw m10, m0
+ paddw m11, m1 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m5
+ por m0, m1
+%if ARCH_X86_32
+ mova m1, [esp+8*16]
+ mova m3, m1
+%endif
+ mova [rsp+8*16], m0 ; don't clobber q0/m5 since we need it in I
+
+ ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
+ ; write +1
+ paddw m10, [rsp+1*16]
+ paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+ punpcklbw m0, m4, m7
+ punpckhbw m2, m4, m7
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m2, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
+%if ARCH_X86_64
+ mova m4, [rsp+6*16]
+%else
+ %define m4 [esp+6*16]
+%endif
+ pmulhrsw m2, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m2, m1
+ pand m2, m9
+ pandn m1, m9, m6
+ por m2, m1 ; don't clobber q1/m6 since we need it in K
+
+ ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
+ ; write +2
+ paddw m10, [rsp+2*16]
+%if ARCH_X86_64
+ SWAP 7, 8
+ paddw m11, m7
+%else
+ mova m8, m7
+ %ifidn %2, v
+ paddw m11, [esp+24*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ %else
+ paddw m11, [esp+36*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ %endif
+%endif
+ punpcklbw m0, m5, m8
+ punpckhbw m1, m5, m8
+%if ARCH_X86_64
+ mova m5, [rsp+8*16]
+%else
+ %define m5 [esp+8*16]
+%endif
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
+ pmulhrsw m7, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m7, m1
+ pand m7, m9
+ pandn m1, m9, m14
+ por m7, m1 ; don't clobber q2/m14 since we need it in K
+
+ ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
+ ; write +3
+ psubw m10, [rsp+7*16]
+ psubw m11, [rsp+5*16] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+ punpcklbw m0, m6, m8
+ punpckhbw m1, m6, m8
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m15
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+mstrideq], m0 ; q3
+%else
+ mova [rsp+20*16], m0
+%endif
+
+ ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
+ ; write +4
+ paddw m10, [rsp+ 9*16]
+ paddw m11, [rsp+10*16] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+ punpcklbw m0, m14, m8
+ punpckhbw m1, m14, m8
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+%ifidn %2, v
+ pandn m1, m9, [tmpq+strideq*0]
+%else
+ pandn m1, m9, [rsp+15*16]
+%endif
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+strideq*0], m0 ; q4
+%else
+ mova [rsp+15*16], m0
+%endif
+
+ ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
+ ; write +5
+ paddw m10, [rsp+3*16]
+ paddw m11, [rsp+4*16] ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
+ punpcklbw m0, m15, m8
+ punpckhbw m1, m15, m8
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
+ pmulhrsw m10, [PIC_sym(pw_2048)]
+ pmulhrsw m11, [PIC_sym(pw_2048)]
+ packuswb m10, m11
+ pand m10, m9
+%ifidn %2, v
+ pandn m11, m9, [tmpq+strideq*1]
+%else
+ pandn m11, m9, [rsp+16*16]
+%endif
+ por m10, m11
+%ifidn %2, v
+ mova [tmpq+strideq*1], m10 ; q5
+%else
+ mova [rsp+16*16], m10
+%endif
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 14, 7
+%else
+ %xdefine m3 m11
+ %xdefine m4 m14
+ %xdefine m5 m15
+ %xdefine m6 m10
+ mova %%q2mem, m7
+ %ifidn %2, v
+ mova m3, [esp+19*16]
+ %else
+ mova m3, [esp+32*16]
+ %endif
+ mova m4, [esp+ 6*16]
+ mova m5, [esp+ 8*16]
+%endif
+ SWAP m6, m2
+
+%if ARCH_X86_64
+ mova m9, %%flat8mem
+%endif
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*4]
+%endif
+%endif ; if %1 == 16
+%if %1 >= 8
+ ; flat8 filter
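+ ; scalar equivalent: 8-tap running sums, starting from
+ ; p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3; each following output
+ ; adds/subtracts one pixel pair (the comments track the sum before the >>3)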
+%if ARCH_X86_32
+ %define m9 %%flat8mem
+ %define m11 m1
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ %define m15 %%q3mem
+%endif
+ mova m11, %%p3mem
+ punpcklbw m0, m11, m3
+ punpcklbw m7, m13, m4
+ pmaddubsw m2, m0, [PIC_sym(pb_3_1)] ; 3 * p3 + p1
+ pmaddubsw m7, [PIC_sym(pb_2_1)]
+ paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0
+ punpcklbw m7, m5, [PIC_sym(pb_4)]
+ pmaddubsw m7, [PIC_sym(pb_1)]
+ paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+ punpckhbw m1, m11, m3
+ pmaddubsw m7, m1, [PIC_sym(pb_3_1)] ; 3 * p3 + p1
+ punpckhbw m0, m13, m4
+ pmaddubsw m0, [PIC_sym(pb_2_1)]
+ paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0
+ punpckhbw m0, m5, [PIC_sym(pb_4)]
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+ psrlw m0, m2, 3
+ psrlw m1, m7, 3
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m13
+ por m0, m1 ; p2
+%ifidn %2, v
+ mova [tmpq+strideq*1], m0
+%else
+ %if ARCH_X86_64
+ SWAP 0, 10
+ %else
+ mova [esp+2*16], m0
+ %endif
+%endif
+
+%if ARCH_X86_32
+ mova m11, %%p3mem
+%endif
+ punpcklbw m0, m11, m3
+ punpckhbw m1, m11, m3
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m2, m0
+ paddw m7, m1
+ punpcklbw m0, m13, m6
+ punpckhbw m1, m13, m6
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m2, m0
+ paddw m7, m1 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
+ psrlw m0, m2, 3
+ psrlw m1, m7, 3
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m3
+ por m0, m1 ; p1
+%ifidn %2, v
+ mova [tmpq+strideq*2], m0
+%else
+ mova [rsp+0*16], m0
+%endif
+
+%if ARCH_X86_32
+ mova m11, %%p3mem
+%endif
+ punpcklbw m0, m11, m3
+ punpckhbw m1, m11, m3
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ psubw m2, m0
+ psubw m7, m1
+ punpcklbw m0, m4, m14
+ punpckhbw m1, m4, m14
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m2, m0
+ paddw m7, m1 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
+ psrlw m0, m2, 3
+ psrlw m1, m7, 3
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m4
+ por m0, m1 ; p0
+%ifidn %2, v
+ mova [tmpq+stride3q], m0
+%else
+ mova [rsp+1*16], m0
+%endif
+
+ punpcklbw m0, m5, m15
+ punpckhbw m1, m5, m15
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m2, m0
+ paddw m7, m1
+%if ARCH_X86_32
+ mova m11, %%p3mem
+%endif
+ punpcklbw m0, m11, m4
+ punpckhbw m11, m11, m4
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m11, [PIC_sym(pb_1)]
+ psubw m2, m0
+ psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
+ psrlw m0, m2, 3
+ psrlw m11, m7, 3
+ packuswb m0, m11
+ pand m0, m9
+ pandn m11, m9, m5
+ por m11, m0 ; q0
+%ifidn %2, v
+ mova [dstq+strideq*0], m11
+%elif ARCH_X86_32
+ mova [esp+8*16], m11
+%endif
+
+ punpcklbw m0, m5, m15
+ punpckhbw m1, m5, m15
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m2, m0
+ paddw m7, m1
+ punpcklbw m0, m13, m6
+ punpckhbw m1, m13, m6
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m2, m0
+ paddw m7, m1 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
+ psrlw m0, m2, 3
+ psrlw m1, m7, 3
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m6
+ por m0, m1 ; q1
+%ifidn %2, v
+ mova [dstq+strideq*1], m0
+%else
+ %if ARCH_X86_64
+ SWAP 0, 13
+ %else
+ mova [esp+9*16], m0
+ %endif
+%endif
+
+ punpcklbw m0, m3, m6
+ punpckhbw m1, m3, m6
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ psubw m2, m0
+ psubw m7, m1
+ punpcklbw m0, m14, m15
+ punpckhbw m1, m14, m15
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m2, m0
+ paddw m7, m1 ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
+ psrlw m2, 3
+ psrlw m7, 3
+ packuswb m2, m7
+ pand m2, m9
+ pandn m7, m9, m14
+ por m2, m7 ; q2
+%ifidn %2, v
+ mova [dstq+strideq*2], m2
+%else
+ mova m0, [rsp+0*16]
+%if %1 == 8
+ mova m1, [rsp+1*16]
+ mova m4, %%p3mem
+
+%if ARCH_X86_32
+ %define m10 [esp+2*16]
+ %define m11 [esp+8*16]
+ %define m13 [esp+9*16]
+%endif
+
+ ; 16x8 transpose
+ punpcklbw m3, m4, m10
+ punpckhbw m4, m10
+ punpcklbw m5, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m11, m13
+ punpckhbw m6, m11, m13
+ punpcklbw m7, m2, m15
+ punpckhbw m2, m15
+%if ARCH_X86_64
+ SWAP 2, 15
+%else
+ mova m15, m2
+%endif
+
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m0
+ punpckhwd m4, m0
+ punpcklwd m0, m1, m7
+ punpckhwd m1, m7
+ punpcklwd m7, m6, m15
+ punpckhwd m6, m15
+%if ARCH_X86_64
+ SWAP 6, 15
+%else
+ mova m15, m6
+%endif
+
+ punpckldq m6, m2, m0
+ punpckhdq m2, m0
+ punpckldq m0, m3, m1
+ punpckhdq m3, m1
+ punpckldq m1, m5, m7
+ punpckhdq m5, m7
+ punpckldq m7, m4, m15
+ punpckhdq m4, m15
+
+ ; write 8x16
+ movq [dstq+strideq*0-4], xm6
+ movhps [dstq+strideq*1-4], xm6
+ movq [dstq+strideq*2-4], xm2
+ movhps [dstq+stride3q -4], xm2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm0
+ movhps [dstq+strideq*1-4], xm0
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm1
+ movhps [dstq+strideq*1-4], xm1
+ movq [dstq+strideq*2-4], xm5
+ movhps [dstq+stride3q -4], xm5
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm7
+ movhps [dstq+strideq*1-4], xm7
+ movq [dstq+strideq*2-4], xm4
+ movhps [dstq+stride3q -4], xm4
+ lea dstq, [dstq+strideq*4]
+%else
+ ; 16x16 transpose and store
+ SWAP 6, 0
+ SWAP 7, 1
+ %if ARCH_X86_64
+ SWAP 5, 10, 2
+ SWAP 8, 11
+ SWAP 9, 13
+ mova [rsp+21*16], m12
+ %else
+ mova [esp+10*16], m2
+ %xdefine m8 m0
+ %xdefine m9 m1
+ %xdefine m10 m2
+ %xdefine m11 m3
+ %xdefine m12 m4
+ %xdefine m13 m5
+ %xdefine m14 m6
+ %xdefine m15 m7
+ %endif
+ mova m0, [rsp+11*16]
+ mova m1, [rsp+12*16]
+ mova m2, [rsp+13*16]
+ mova m3, [rsp+14*16]
+ mova m4, [rsp+19*16]
+%if ARCH_X86_64
+ mova m7, [rsp+ 1*16]
+ mova m11, [rsp+20*16]
+ mova m12, [rsp+15*16]
+ mova m13, [rsp+16*16]
+ mova m14, [rsp+17*16]
+ TRANSPOSE_16X16B 1, [rsp+18*16]
+%else
+ mova m5, [esp+ 2*16]
+ TRANSPOSE_16X16B 1, [esp+32*16]
+ mov tmpq, dstq
+ lea dstq, [dstq+strideq*8]
+%endif
+ movu [dstq+strideq*0-8], xm0
+ movu [dstq+strideq*1-8], xm1
+ movu [dstq+strideq*2-8], xm2
+ movu [dstq+stride3q -8], xm3
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm4
+ movu [dstq+strideq*1-8], xm5
+ movu [dstq+strideq*2-8], xm6
+ movu [dstq+stride3q -8], xm7
+%if ARCH_X86_64
+ lea dstq, [dstq+strideq*4]
+%else
+ %xdefine m8 m0
+ %xdefine m9 m1
+ %xdefine m10 m2
+ %xdefine m11 m3
+ %xdefine m12 m4
+ %xdefine m13 m5
+ %xdefine m14 m6
+ %xdefine m15 m7
+ mova m8, [esp+11*16]
+ mova m9, [esp+12*16]
+ mova m10, [esp+13*16]
+ mova m11, [esp+14*16]
+ mova m12, [esp+26*16]
+ mova m13, [esp+27*16]
+ mova m14, [esp+ 0*16]
+ mova m15, [esp+ 1*16]
+ mov dstq, tmpq
+%endif
+ movu [dstq+strideq*0-8], xm8
+ movu [dstq+strideq*1-8], xm9
+ movu [dstq+strideq*2-8], xm10
+ movu [dstq+stride3q -8], xm11
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm12
+ movu [dstq+strideq*1-8], xm13
+ movu [dstq+strideq*2-8], xm14
+ movu [dstq+stride3q -8], xm15
+ lea dstq, [dstq+strideq*4]
+%if ARCH_X86_32
+ lea dstq, [dstq+strideq*8]
+%else
+ mova m12, [rsp+21*16]
+%endif
+
+%endif ; if %1 == 8
+%endif ; ifidn %2, v
+%elif %1 == 6
+ ; flat6 filter
+%if ARCH_X86_32
+ mova [esp+3*16], m3
+ mova [esp+4*16], m4
+ mova [esp+5*16], m5
+ mova [esp+6*16], m6
+ %xdefine m8 m3
+ %xdefine m10 m4
+ %xdefine m11 m5
+ %xdefine m15 m6
+ %define m3 [esp+3*16]
+ %define m4 [esp+4*16]
+ %define m5 [esp+5*16]
+ %define m6 [esp+6*16]
+ %define m9 %%flat8mem
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+%endif
+
+ punpcklbw m8, m13, m5
+ punpckhbw m11, m13, m5
+ pmaddubsw m0, m8, [PIC_sym(pb_3_1)]
+ pmaddubsw m1, m11, [PIC_sym(pb_3_1)]
+ punpcklbw m7, m4, m3
+ punpckhbw m10, m4, m3
+ pmaddubsw m2, m7, [PIC_sym(pb_2)]
+ pmaddubsw m15, m10, [PIC_sym(pb_2)]
+ paddw m0, m2
+ paddw m1, m15
+ pmulhrsw m2, m0, [PIC_sym(pw_4096)]
+ pmulhrsw m15, m1, [PIC_sym(pw_4096)]
+ packuswb m2, m15
+ pand m2, m9
+ pandn m15, m9, m3
+ por m2, m15
+%ifidn %2, v
+ mova [tmpq+strideq*2], m2 ; p1
+%elif ARCH_X86_32
+ mova [esp+11*16], m2
+%endif
+
+ pmaddubsw m8, [PIC_sym(pb_m1_1)]
+ pmaddubsw m11, [PIC_sym(pb_m1_1)]
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+%if ARCH_X86_64
+ SWAP 2, 13
+%endif
+ pmaddubsw m8, [PIC_sym(pb_m1_1)]
+ pmaddubsw m11, [PIC_sym(pb_m1_1)]
+ paddw m0, m8
+ paddw m1, m11
+ pmulhrsw m2, m0, [PIC_sym(pw_4096)]
+ pmulhrsw m15, m1, [PIC_sym(pw_4096)]
+ packuswb m2, m15
+ pand m2, m9
+ pandn m15, m9, m4
+ por m2, m15
+%ifidn %2, v
+ mova [tmpq+stride3q], m2 ; p0
+%elif ARCH_X86_32
+ mova [esp+8*16], m2
+%endif
+
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m3, m14
+ punpckhbw m11, m3, m14
+%if ARCH_X86_64
+ SWAP 2, 14
+%endif
+ pmaddubsw m2, m8, [PIC_sym(pb_m1_1)]
+ pmaddubsw m15, m11, [PIC_sym(pb_m1_1)]
+ paddw m0, m2
+ paddw m1, m15
+ pmulhrsw m2, m0, [PIC_sym(pw_4096)]
+ pmulhrsw m15, m1, [PIC_sym(pw_4096)]
+ packuswb m2, m15
+ pand m2, m9
+ pandn m15, m9, m5
+ por m2, m15
+%ifidn %2, v
+ mova [dstq+strideq*0], m2 ; q0
+%endif
+
+ pmaddubsw m8, [PIC_sym(pb_m1_2)]
+ pmaddubsw m11, [PIC_sym(pb_m1_2)]
+ paddw m0, m8
+ paddw m1, m11
+ pmaddubsw m7, [PIC_sym(pb_m1_0)]
+ pmaddubsw m10, [PIC_sym(pb_m1_0)]
+ paddw m0, m7
+ paddw m1, m10
+ pmulhrsw m0, [PIC_sym(pw_4096)]
+ pmulhrsw m1, [PIC_sym(pw_4096)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m6
+ por m0, m1
+%if ARCH_X86_32
+ %xdefine m3 m8
+ %xdefine m4 m10
+ %xdefine m5 m11
+ %xdefine m6 m15
+%endif
+%ifidn %2, v
+ mova [dstq+strideq*1], m0 ; q1
+%else
+ %if ARCH_X86_64
+ SWAP 3, 13
+ SWAP 4, 14
+ %else
+ mova m3, [esp+11*16]
+ mova m4, [esp+ 8*16]
+ %endif
+ SWAP 5, 2
+ SWAP 6, 0
+ TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7
+%endif
+%else ; if %1 == 4
+%ifidn %2, v
+ mova [tmpq+strideq*0], m3 ; p1
+ mova [tmpq+strideq*1], m4 ; p0
+ mova [tmpq+strideq*2], m5 ; q0
+ mova [tmpq+stride3q ], m6 ; q1
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7
+%endif
+%endif
+%if ARCH_X86_32
+ %define m12 m12reg
+%endif
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 32-bit PIC helpers ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
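+; on x86-32 the RODATA constants are addressed relative to a register holding
+; the section base ($$) via PIC_sym(), avoiding absolute relocations in PIC
+; builds; XCHG_PIC_REG temporarily reuses that register for the mask pointer
+; and restores it afterwards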
+
+%if ARCH_X86_32
+ %define PIC_base_offset $$
+
+ %macro SETUP_PIC 0 ; PIC_reg
+ %define PIC_reg r2
+ %assign PIC_reg_stk_offset stack_size-gprsize*(1+copy_args*4)
+ LEA PIC_reg, $$
+ %endmacro
+
+ %macro XCHG_PIC_REG 1 ; 0=mask 1=PIC_base
+ %if %1 == 0
+ mov [esp+PIC_reg_stk_offset], PIC_reg
+ mov PIC_reg, maskm
+ %else
+ mov PIC_reg, [esp+PIC_reg_stk_offset]
+ %endif
+ %endmacro
+
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+
+%else
+ %macro XCHG_PIC_REG 1
+ %endmacro
+ %define PIC_sym(sym) (sym)
+%endif
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < required_stack_alignment
+ %assign copy_args 1
+ %else
+ %assign copy_args 0
+ %endif
+%endif
+
+%macro RELOC_ARGS 1
+ %if copy_args
+ %define maskm [esp+stack_size-gprsize*1]
+ %define l_stridem [esp+stack_size-gprsize*2]
+ %define lutm [esp+stack_size-gprsize*3]
+ %define %1m [esp+stack_size-gprsize*4]
+ mov r6d, r6m
+ mov maskm, maskd
+ mov lutm, lutd
+ mov %1m, r6d
+ %else
+ %define %1m r6m
+ %endif
+%endmacro
+
+%if ARCH_X86_32
+ %define tmpq r4
+ %define mstrideq r5
+ %define stride3q r6
+ %define l_stride3q r6
+%endif
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_v_sb_y_8bpc, 7, 11, 16, 16 * 15, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+%else
+cglobal lpf_v_sb_y_8bpc, 6, 7, 8, -16 * (26 + copy_args), \
+ dst, stride, mask, l, l_stride, lut, mask_bits
+ RELOC_ARGS w
+ SETUP_PIC
+ %define m12 m5
+%endif
+ shl l_strideq, 2
+ sub lq, l_strideq
+%if ARCH_X86_64
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+%else
+ mov l_stridem, l_strided
+%endif
+ mov mask_bitsd, 0xf
+ mova m12, [PIC_sym(pd_mask)]
+ XCHG_PIC_REG 0
+ movu m0, [maskq]
+ pxor m4, m4
+ movd m3, [lutq+136]
+ pshufb m3, m4
+ pshufd m2, m0, q2222
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ por m1, m2
+ por m0, m1
+ mova [rsp+11*16], m0
+ mova [rsp+12*16], m1
+ mova [rsp+13*16], m2
+ mova [rsp+14*16], m3
+
+%define maskmem [esp+15*16]
+%define mask0 [rsp+11*16]
+%define mask1 [rsp+12*16]
+%define mask2 [rsp+13*16]
+%define minlvl [rsp+14*16]
+
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ je .no_flat16
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+25*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ je .no_flat
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+25*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ XCHG_PIC_REG 1
+ je .no_filter
+
+%if ARCH_X86_32
+ mov [esp+25*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 4, v
+
+.end:
+%if ARCH_X86_32
+ mova m12, maskmem
+ mov mask_bitsd, [esp+25*16]
+%endif
+.no_filter:
+ pslld m12, 4
+ shl mask_bitsd, 4
+ add lq, 16
+ add dstq, 16
+%if ARCH_X86_64
+ sub wd, 4
+%else
+ sub dword wm, 4
+%endif
+ XCHG_PIC_REG 0
+ jg .loop
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_h_sb_y_8bpc, 7, 11, 16, 16 * 26, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+%else
+cglobal lpf_h_sb_y_8bpc, 6, 7, 8, -16 * (39 + copy_args), \
+ dst, stride, mask, l, l_stride, lut, mask_bits
+ RELOC_ARGS h
+ SETUP_PIC
+ %define m12 m5
+%endif
+ sub lq, 4
+ shl l_strideq, 2
+%if ARCH_X86_64
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+%else
+ mov l_stridem, l_strided
+%endif
+ mov mask_bitsd, 0xf
+ mova m12, [PIC_sym(pd_mask)]
+ XCHG_PIC_REG 0
+ movu m0, [maskq]
+ pxor m4, m4
+ movd m3, [lutq+136]
+ pshufb m3, m4
+ pshufd m2, m0, q2222
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ por m1, m2
+ por m0, m1
+ mova [rsp+22*16], m0
+ mova [rsp+23*16], m1
+ mova [rsp+24*16], m2
+ mova [rsp+25*16], m3
+
+%define maskmem [esp+37*16]
+%define mask0 [rsp+22*16]
+%define mask1 [rsp+23*16]
+%define mask2 [rsp+24*16]
+%define minlvl [rsp+25*16]
+
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ je .no_flat16
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+38*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ je .no_flat
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+38*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ XCHG_PIC_REG 1
+ je .no_filter
+
+%if ARCH_X86_32
+ mov [esp+38*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 4, h
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+ lea dstq, [dstq+strideq*8]
+%if ARCH_X86_32
+ jmp .end_noload
+.end:
+ mova m12, maskmem
+ mov l_strideq, l_stridem
+ mov mask_bitsd, [esp+38*16]
+.end_noload:
+%else
+.end:
+%endif
+ lea lq, [lq+l_strideq*4]
+ pslld m12, 4
+ shl mask_bitsd, 4
+%if ARCH_X86_64
+ sub hd, 4
+%else
+ sub dword hm, 4
+%endif
+ XCHG_PIC_REG 0
+ jg .loop
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_v_sb_uv_8bpc, 7, 11, 16, 3 * 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+%else
+cglobal lpf_v_sb_uv_8bpc, 6, 7, 8, -16 * (12 + copy_args), \
+ dst, stride, mask, l, l_stride, lut, mask_bits
+ RELOC_ARGS w
+ SETUP_PIC
+ %define m12 m4
+%endif
+ shl l_strideq, 2
+ sub lq, l_strideq
+%if ARCH_X86_64
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+%else
+ mov l_stridem, l_strided
+%endif
+ mov mask_bitsd, 0xf
+ mova m12, [PIC_sym(pd_mask)]
+ XCHG_PIC_REG 0
+ movq m0, [maskq]
+ pxor m3, m3
+ movd m2, [lutq+136]
+ pshufb m2, m3
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ por m0, m1
+ mova [rsp+0*16], m0
+ mova [rsp+1*16], m1
+ mova [rsp+2*16], m2
+
+%define maskmem [esp+7*16]
+%define mask0 [rsp+0*16]
+%define mask1 [rsp+1*16]
+%define minlvl [rsp+2*16]
+
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ je .no_flat
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+11*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ XCHG_PIC_REG 1
+ je .no_filter
+
+%if ARCH_X86_32
+ mov [esp+11*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 4, v
+
+.end:
+%if ARCH_X86_32
+ mova m12, maskmem
+ mov mask_bitsd, [esp+11*16]
+%endif
+.no_filter:
+ pslld m12, 4
+ shl mask_bitsd, 4
+ add lq, 16
+ add dstq, 16
+%if ARCH_X86_64
+ sub wd, 4
+%else
+ sub dword wm, 4
+%endif
+ XCHG_PIC_REG 0
+ jg .loop
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_h_sb_uv_8bpc, 7, 11, 16, 16 * 3, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+%else
+cglobal lpf_h_sb_uv_8bpc, 6, 7, 8, -16 * (13 + copy_args), \
+ dst, stride, mask, l, l_stride, lut, mask_bits
+ RELOC_ARGS h
+ SETUP_PIC
+ %define m12 m4
+%endif
+ sub lq, 4
+ shl l_strideq, 2
+%if ARCH_X86_64
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+%else
+ mov l_stridem, l_strided
+%endif
+ mov mask_bitsd, 0xf
+ mova m12, [PIC_sym(pd_mask)]
+ XCHG_PIC_REG 0
+ movq m0, [maskq]
+ pxor m3, m3
+ movd m2, [lutq+136]
+ pshufb m2, m3
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ por m0, m1
+ mova [rsp+0*16], m0
+ mova [rsp+1*16], m1
+ mova [rsp+2*16], m2
+
+%define maskmem [esp+7*16]
+%define mask0 [rsp+0*16]
+%define mask1 [rsp+1*16]
+%define minlvl [rsp+2*16]
+
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ je .no_flat
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+12*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ XCHG_PIC_REG 1
+ je .no_filter
+
+%if ARCH_X86_32
+ mov [esp+12*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 4, h
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+ lea dstq, [dstq+strideq*8]
+%if ARCH_X86_32
+ jmp .end_noload
+.end:
+ mova m12, maskmem
+ mov l_strided, l_stridem
+ mov mask_bitsd, [esp+12*16]
+.end_noload:
+%else
+.end:
+%endif
+ lea lq, [lq+l_strideq*4]
+ pslld m12, 4
+ shl mask_bitsd, 4
+%if ARCH_X86_64
+ sub hd, 4
+%else
+ sub dword hm, 4
+%endif
+ XCHG_PIC_REG 0
+ jg .loop
+ RET
diff --git a/third_party/dav1d/src/x86/looprestoration.h b/third_party/dav1d/src/x86/looprestoration.h
new file mode 100644
index 0000000000..de23be8866
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#include "common/intops.h"
+
+#define decl_wiener_filter_fns(ext) \
+decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \
+decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext))
+
+#define decl_sgr_filter_fns(ext) \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_5x5, ext)); \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_3x3, ext)); \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_mix, ext))
+
+decl_wiener_filter_fns(sse2);
+decl_wiener_filter_fns(ssse3);
+decl_wiener_filter_fns(avx2);
+decl_wiener_filter_fns(avx512icl);
+decl_sgr_filter_fns(ssse3);
+decl_sgr_filter_fns(avx2);
+decl_sgr_filter_fns(avx512icl);
+
+static ALWAYS_INLINE void loop_restoration_dsp_init_x86(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+#if BITDEPTH == 8
+ c->wiener[0] = BF(dav1d_wiener_filter7, sse2);
+ c->wiener[1] = BF(dav1d_wiener_filter5, sse2);
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+ c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
+ c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, ssse3);
+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, ssse3);
+ c->sgr[2] = BF(dav1d_sgr_filter_mix, ssse3);
+ }
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
+ c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
+ c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
+ }
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->wiener[0] = BF(dav1d_wiener_filter7, avx512icl);
+#if BITDEPTH == 8
+ /* With VNNI we don't need a 5-tap version. */
+ c->wiener[1] = c->wiener[0];
+#else
+ c->wiener[1] = BF(dav1d_wiener_filter5, avx512icl);
+#endif
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx512icl);
+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx512icl);
+ c->sgr[2] = BF(dav1d_sgr_filter_mix, avx512icl);
+ }
+#endif
+}
diff --git a/third_party/dav1d/src/x86/looprestoration16_avx2.asm b/third_party/dav1d/src/x86/looprestoration16_avx2.asm
new file mode 100644
index 0000000000..4cf8b905c2
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration16_avx2.asm
@@ -0,0 +1,2540 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+wiener_lshuf5: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+wiener_lshuf7: db 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 15
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
+wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11
+wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
+wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1
+wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+
+wiener_hshift: dw 4, 4, 1, 1
+wiener_vshift: dw 1024, 1024, 4096, 4096
+wiener_round: dd 1049600, 1048832
+
+pb_m10_m9: times 2 db -10, -9
+pb_m6_m5: times 2 db -6, -5
+pb_m2_m1: times 2 db -2, -1
+pb_2_3: times 2 db 2, 3
+pb_6_7: times 2 db 6, 7
+pw_1023: times 2 dw 1023
+pd_8: dd 8
+pd_25: dd 25
+pd_4096: dd 4096
+pd_34816: dd 34816
+pd_m262128: dd -262128
+pd_0xf00800a4: dd 0xf00800a4
+pd_0xf00801c7: dd 0xf00801c7
+
+%define pw_256 sgr_lshuf5
+
+cextern pb_0to63
+cextern sgr_x_by_x_avx2
+
+SECTION .text
+
+DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
+
+INIT_YMM avx2
+cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt
+%define base t4-wiener_hshift
+ mov fltq, r6mp
+ movifnidn wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ vbroadcasti128 m6, [wiener_shufA]
+ vpbroadcastd m12, [fltq+ 0] ; x0 x1
+ lea t4, [wiener_hshift]
+ vbroadcasti128 m7, [wiener_shufB]
+ add wd, wd
+ vpbroadcastd m13, [fltq+ 4] ; x2 x3
+ shr t3d, 11
+ vpbroadcastd m14, [fltq+16] ; y0 y1
+ add lpfq, wq
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ add dstq, wq
+ vbroadcasti128 m8, [wiener_shufC]
+ lea t1, [rsp+wq+16]
+ vbroadcasti128 m9, [wiener_shufD]
+ neg wq
+ vpbroadcastd m0, [base+wiener_hshift+t3*4]
+ vpbroadcastd m10, [base+wiener_round+t3*4]
+ vpbroadcastd m11, [base+wiener_vshift+t3*4]
+ pmullw m12, m0 ; upshift filter coefs to make the
+ pmullw m13, m0 ; horizontal downshift constant
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+.v2:
+ call .v
+ jmp .v1
+.extend_right:
+ movd xm1, r10d
+ vpbroadcastd m0, [pb_6_7]
+ mova m2, [pb_0to63]
+ vpbroadcastb m1, xm1
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m3, m0
+ vpbroadcastd m0, [pb_m2_m1]
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m4, m0
+ vpbroadcastd m0, [pb_m10_m9]
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m5, m0
+ ret
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq xm3, [leftq]
+ vpblendd m3, [lpfq+r10-8], 0xfc
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vbroadcasti128 m3, [lpfq+r10] ; avoid accessing memory located
+ mova m4, [lpfq+r10] ; before the start of the buffer
+ shufpd m3, m4, 0x05
+ pshufb m3, [wiener_lshuf7]
+ jmp .h_main2
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+r10-8]
+.h_main:
+ mova m4, [lpfq+r10+0]
+.h_main2:
+ movu m5, [lpfq+r10+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -36
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m3, m6
+ pshufb m1, m4, m7
+ paddw m0, m1
+ pshufb m3, m8
+ pmaddwd m0, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ pmaddwd m3, m13
+ pshufb m2, m5, m7
+ paddw m1, m2
+ vpbroadcastd m2, [pd_m262128] ; (1 << 4) - (1 << 18)
+ pshufb m4, m8
+ pmaddwd m1, m12
+ pshufb m5, m9
+ paddw m4, m5
+ pmaddwd m4, m13
+ paddd m0, m2
+ paddd m1, m2
+ paddd m0, m3
+ paddd m1, m4
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+r10], m0
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movq xm3, [leftq]
+ vpblendd m3, [lpfq+r10-8], 0xfc
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ movu m3, [lpfq+r10-8]
+ pshufb m3, [wiener_lshuf7]
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+r10-8]
+.hv_main:
+ mova m4, [lpfq+r10+0]
+ movu m5, [lpfq+r10+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -36
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ pshufb m0, m3, m6
+ pshufb m1, m4, m7
+ paddw m0, m1
+ pshufb m3, m8
+ pmaddwd m0, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ pmaddwd m3, m13
+ pshufb m2, m5, m7
+ paddw m1, m2
+ vpbroadcastd m2, [pd_m262128]
+ pshufb m4, m8
+ pmaddwd m1, m12
+ pshufb m5, m9
+ paddw m4, m5
+ pmaddwd m4, m13
+ paddd m0, m2
+ paddd m1, m2
+ mova m2, [t4+r10]
+ paddw m2, [t2+r10]
+ mova m5, [t3+r10]
+ paddd m0, m3
+ paddd m1, m4
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova m4, [t5+r10]
+ paddw m4, [t1+r10]
+ psraw m0, 1
+ paddw m3, m0, [t6+r10]
+ mova [t0+r10], m0
+ punpcklwd m0, m2, m5
+ pmaddwd m0, m15
+ punpckhwd m2, m5
+ pmaddwd m2, m15
+ punpcklwd m1, m3, m4
+ pmaddwd m1, m14
+ punpckhwd m3, m4
+ pmaddwd m3, m14
+ paddd m0, m10
+ paddd m2, m10
+ paddd m0, m1
+ paddd m2, m3
+ psrad m0, 5
+ psrad m2, 5
+ packusdw m0, m2
+ pmulhuw m0, m11
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .hv_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m1, [t4+r10]
+ paddw m1, [t2+r10]
+ mova m2, [t3+r10]
+ mova m4, [t1+r10]
+ paddw m3, m4, [t6+r10]
+ paddw m4, [t5+r10]
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m15
+ punpckhwd m1, m2
+ pmaddwd m1, m15
+ punpcklwd m2, m3, m4
+ pmaddwd m2, m14
+ punpckhwd m3, m4
+ pmaddwd m3, m14
+ paddd m0, m10
+ paddd m1, m10
+ paddd m0, m2
+ paddd m1, m3
+ psrad m0, 5
+ psrad m1, 5
+ packusdw m0, m1
+ pmulhuw m0, m11
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .v_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+ ret
+
+cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt
+%define base t4-wiener_hshift
+ mov fltq, r6mp
+ movifnidn wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ vbroadcasti128 m5, [wiener_shufE]
+ vpbroadcastw m11, [fltq+ 2] ; x1
+ vbroadcasti128 m6, [wiener_shufB]
+ lea t4, [wiener_hshift]
+ vbroadcasti128 m7, [wiener_shufD]
+ add wd, wd
+ vpbroadcastd m12, [fltq+ 4] ; x2 x3
+ shr t3d, 11
+ vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18)
+ add lpfq, wq
+ vpbroadcastw m13, [fltq+18] ; y1
+ add dstq, wq
+ vpbroadcastd m14, [fltq+20] ; y2 y3
+ lea t1, [rsp+wq+16]
+ neg wq
+ vpbroadcastd m0, [base+wiener_hshift+t3*4]
+ vpbroadcastd m9, [base+wiener_round+t3*4]
+ vpbroadcastd m10, [base+wiener_vshift+t3*4]
+ mova m15, [wiener_lshuf5]
+ pmullw m11, m0
+ pmullw m12, m0
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t3, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call .v
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+.v1:
+ call .v
+ jmp .end
+.extend_right:
+ movd xm2, r10d
+ vpbroadcastd m0, [pb_2_3]
+ vpbroadcastd m1, [pb_m6_m5]
+ vpbroadcastb m2, xm2
+ psubb m0, m2
+ psubb m1, m2
+ mova m2, [pb_0to63]
+ pminub m0, m2
+ pminub m1, m2
+ pshufb m3, m0
+ pshufb m4, m1
+ ret
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm3, [leftq+4]
+ vpblendd m3, [lpfq+r10-4], 0xfe
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vbroadcasti128 m4, [lpfq+r10] ; avoid accessing memory located
+ mova m3, [lpfq+r10] ; before the start of the buffer
+ palignr m3, m4, 12
+ pshufb m3, m15
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+r10-4]
+.h_main:
+ movu m4, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m3, m5
+ pmaddwd m0, m11
+ pshufb m1, m4, m5
+ pmaddwd m1, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ pmaddwd m2, m12
+ pshufb m4, m7
+ paddw m3, m4
+ pmaddwd m3, m12
+ paddd m0, m8
+ paddd m1, m8
+ paddd m0, m2
+ paddd m1, m3
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+r10], m0
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm3, [leftq+4]
+ vpblendd m3, [lpfq+r10-4], 0xfe
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ movu m3, [lpfq+r10-4]
+ pshufb m3, m15
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+r10-4]
+.hv_main:
+ movu m4, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -34
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ pshufb m0, m3, m5
+ pmaddwd m0, m11
+ pshufb m1, m4, m5
+ pmaddwd m1, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ pmaddwd m2, m12
+ pshufb m4, m7
+ paddw m3, m4
+ pmaddwd m3, m12
+ paddd m0, m8
+ paddd m1, m8
+ paddd m0, m2
+ mova m2, [t3+r10]
+ paddw m2, [t1+r10]
+ paddd m1, m3
+ mova m4, [t2+r10]
+ punpckhwd m3, m2, m4
+ pmaddwd m3, m14
+ punpcklwd m2, m4
+ mova m4, [t4+r10]
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ pmaddwd m2, m14
+ psraw m0, 1
+ mova [t0+r10], m0
+ punpckhwd m1, m0, m4
+ pmaddwd m1, m13
+ punpcklwd m0, m4
+ pmaddwd m0, m13
+ paddd m3, m9
+ paddd m2, m9
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 5
+ psrad m0, 5
+ packusdw m0, m1
+ pmulhuw m0, m10
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .hv_loop
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m0, [t1+r10]
+ paddw m2, m0, [t3+r10]
+ mova m1, [t2+r10]
+ mova m4, [t4+r10]
+ punpckhwd m3, m2, m1
+ pmaddwd m3, m14
+ punpcklwd m2, m1
+ pmaddwd m2, m14
+ punpckhwd m1, m0, m4
+ pmaddwd m1, m13
+ punpcklwd m0, m4
+ pmaddwd m0, m13
+ paddd m3, m9
+ paddd m2, m9
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 5
+ psrad m0, 5
+ packusdw m0, m1
+ pmulhuw m0, m10
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .v_loop
+ ret
+
+cglobal sgr_filter_5x5_16bpc, 4, 14, 15, 400*24+16, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x_avx2+256*4]
+ movifnidn hd, hm
+ mov edged, r7m
+ add wd, wd
+ vpbroadcastw m7, [paramsq+8] ; w0
+ add lpfq, wq
+ vpbroadcastd m8, [pd_8]
+ add dstq, wq
+ vpbroadcastd m9, [pd_25]
+ lea t3, [rsp+wq*2+400*12+16]
+ vpbroadcastd m10, [paramsq+0] ; s0
+ lea t4, [rsp+wq+400*20+16]
+ vpbroadcastd m11, [pd_0xf00800a4]
+ lea t1, [rsp+wq+20]
+ mova xm12, [sgr_lshuf5]
+ neg wq
+ vpbroadcastd m13, [pd_34816] ; (1 << 11) + (1 << 15)
+ pxor m6, m6
+ vpbroadcastd m14, [pw_1023]
+ psllw m7, 4
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call .top_fixup
+ add t1, 400*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ test hd, hd
+ jz .odd_height
+ call .h
+ add lpfq, strideq
+ call .hv
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .h_top
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+400*6]
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ jmp .main
+.no_top_height1:
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.extend_right:
+ vpbroadcastw m0, [lpfq-2]
+ movu m1, [r13+r10+ 0]
+ movu m2, [r13+r10+16]
+ vpblendvb m4, m0, m1
+ vpblendvb m5, m0, m2
+ ret
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 10
+ jmp .h_main
+.h_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, xm12
+ vinserti128 m4, [lpfq+wq+10], 1
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+r10- 2]
+.h_main:
+ movu m5, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -36
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ palignr m2, m5, m4, 2
+ paddw m0, m4, m2
+ palignr m3, m5, m4, 6
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ shufpd m5, m4, m5, 0x05
+ paddw m0, m5
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m3
+ paddd m1, m3
+ punpckhwd m3, m4, m5
+ pmaddwd m3, m3
+ shufps m4, m5, q2121
+ paddw m0, m4 ; sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m2, m3
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+r10+400*0]
+ paddd m1, [t1+r10+400*2]
+ paddd m2, [t1+r10+400*4]
+.h_loop_end:
+ paddd m1, m5 ; sumsq
+ paddd m2, m4
+ mova [t1+r10+400*0], m0
+ mova [t1+r10+400*2], m1
+ mova [t1+r10+400*4], m2
+ add r10, 32
+ jl .h_loop
+ ret
+.top_fixup:
+ lea r10, [wq-4]
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+r10+400*0]
+ mova m1, [t1+r10+400*2]
+ mova m2, [t1+r10+400*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m1
+ mova [t2+r10+400*4], m2
+ add r10, 32
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 10
+ jmp .hv_main
+.hv_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, xm12
+ vinserti128 m4, [lpfq+wq+10], 1
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+r10- 2]
+.hv_main:
+ movu m5, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -36
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ palignr m3, m5, m4, 2
+ paddw m0, m4, m3
+ palignr m1, m5, m4, 6
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ shufpd m5, m4, m5, 0x05
+ paddw m0, m5
+ punpcklwd m1, m4, m5
+ pmaddwd m1, m1
+ paddd m2, m1
+ punpckhwd m1, m4, m5
+ pmaddwd m1, m1
+ shufps m4, m5, q2121
+ paddw m0, m4 ; h sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m3, m1
+ paddd m2, m5 ; h sumsq
+ paddd m3, m4
+ paddw m1, m0, [t1+r10+400*0]
+ paddd m4, m2, [t1+r10+400*2]
+ paddd m5, m3, [t1+r10+400*4]
+ test hd, hd
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+r10+400*0] ; hv sum
+ paddd m4, [t2+r10+400*2] ; hv sumsq
+ paddd m5, [t2+r10+400*4]
+ mova [t0+r10+400*0], m0
+ mova [t0+r10+400*2], m2
+ mova [t0+r10+400*4], m3
+ psrlw m3, m1, 1
+ paddd m4, m8
+ pavgw m3, m6 ; (b + 2) >> 2
+ paddd m5, m8
+ psrld m4, 4 ; (a + 8) >> 4
+ punpcklwd m2, m3, m6
+ psrld m5, 4
+ punpckhwd m3, m6
+ pmulld m4, m9 ; a * 25
+ pmulld m5, m9
+ pmaddwd m2, m2 ; b * b
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ pmaxud m5, m3
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m10 ; p * s
+ pmulld m5, m10
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ mova [t4+r10+4], m2
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+r10*2+ 8], xm0
+ vextracti128 [t3+r10*2+40], m0, 1
+ mova [t3+r10*2+24], xm1
+ vextracti128 [t3+r10*2+56], m1, 1
+ add r10, 32
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+r10+400*0], m1
+ paddw m1, m0
+ mova [t1+r10+400*2], m4
+ paddd m4, m2
+ mova [t1+r10+400*4], m5
+ paddd m5, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+ lea r10, [wq-4]
+.v_loop:
+ mova m0, [t1+r10+400*0]
+ mova m2, [t1+r10+400*2]
+ mova m3, [t1+r10+400*4]
+ paddw m1, m0, [t2+r10+400*0]
+ paddd m4, m2, [t2+r10+400*2]
+ paddd m5, m3, [t2+r10+400*4]
+ paddw m0, m0
+ paddd m2, m2
+ paddd m3, m3
+ paddw m1, m0 ; hv sum
+ paddd m4, m2 ; hv sumsq
+ paddd m5, m3
+ psrlw m3, m1, 1
+ paddd m4, m8
+ pavgw m3, m6 ; (b + 2) >> 2
+ paddd m5, m8
+ psrld m4, 4 ; (a + 8) >> 4
+ punpcklwd m2, m3, m6
+ psrld m5, 4
+ punpckhwd m3, m6
+ pmulld m4, m9 ; a * 25
+ pmulld m5, m9
+ pmaddwd m2, m2 ; b * b
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ pmaxud m5, m3
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m10 ; p * s
+ pmulld m5, m10
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ mova [t4+r10+4], m2
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+r10*2+ 8], xm0
+ vextracti128 [t3+r10*2+40], m0, 1
+ mova [t3+r10*2+24], xm1
+ vextracti128 [t3+r10*2+56], m1, 1
+ add r10, 32
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t4+r10*1+ 2]
+ movu m1, [t3+r10*2+ 4]
+ movu m2, [t3+r10*2+36]
+ paddw m3, m0, [t4+r10*1+ 0]
+ paddd m4, m1, [t3+r10*2+ 0]
+ paddd m5, m2, [t3+r10*2+32]
+ paddw m3, [t4+r10*1+ 4]
+ paddd m4, [t3+r10*2+ 8]
+ paddd m5, [t3+r10*2+40]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ mova [t4+r10*1+400*2+ 0], m0
+ mova [t3+r10*2+400*4+ 0], m1
+ mova [t3+r10*2+400*4+32], m2
+ add r10, 32
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m0, [t4+r10*1+ 2]
+ movu m1, [t3+r10*2+ 4]
+ movu m2, [t3+r10*2+36]
+ paddw m3, m0, [t4+r10*1+ 0]
+ paddd m4, m1, [t3+r10*2+ 0]
+ paddd m5, m2, [t3+r10*2+32]
+ paddw m3, [t4+r10*1+ 4]
+ paddd m4, [t3+r10*2+ 8]
+ paddd m5, [t3+r10*2+40]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ paddw m3, m0, [t4+r10*1+400*2+ 0]
+ paddd m4, m1, [t3+r10*2+400*4+ 0]
+ paddd m5, m2, [t3+r10*2+400*4+32]
+ mova [t4+r10*1+400*2+ 0], m0
+ mova [t3+r10*2+400*4+ 0], m1
+ mova [t3+r10*2+400*4+32], m2
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vinserti128 m1, m4, xm5, 1
+ vperm2i128 m4, m5, 0x31
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m4, m3
+ psrad m1, 9
+ psrad m4, 9
+ packssdw m1, m4
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m0, [dstq+r10]
+ mova m3, [t4+r10*1+400*2+ 0]
+ mova m4, [t3+r10*2+400*4+ 0]
+ mova m5, [t3+r10*2+400*4+32]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vinserti128 m1, m4, xm5, 1
+ vperm2i128 m4, m5, 0x31
+ psubd m1, m2 ; b - a * src + (1 << 7)
+ psubd m4, m3
+ psrad m1, 8
+ psrad m4, 8
+ packssdw m1, m4
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+cglobal sgr_filter_3x3_16bpc, 4, 14, 14, 400*42+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x_avx2+256*4]
+ add wd, wd
+ movifnidn hd, hm
+ mov edged, r7m
+ add lpfq, wq
+ vpbroadcastw m7, [paramsq+10] ; w1
+ add dstq, wq
+ vpbroadcastd m9, [paramsq+ 4] ; s1
+ lea t3, [rsp+wq*2+400*12+8]
+ vpbroadcastd m8, [pd_8]
+ lea t4, [rsp+wq+400*32+8]
+ vpbroadcastd m10, [pd_0xf00801c7]
+ lea t1, [rsp+wq+12]
+ vpbroadcastd m11, [pd_34816]
+ neg wq
+ mova xm12, [sgr_lshuf3]
+ pxor m6, m6
+ vpbroadcastd m13, [pw_1023]
+ psllw m7, 4
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ add t1, 400*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea r10, [wq-4]
+ lea t2, [t1+400*6]
+.top_fixup_loop:
+ mova m0, [t1+r10+400*0]
+ mova m1, [t1+r10+400*2]
+ mova m2, [t1+r10+400*4]
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m1
+ mova [t2+r10+400*4], m2
+ add r10, 32
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.extend_right:
+ vpbroadcastw m0, [lpfq-2]
+ movu m1, [r13+r10+ 2]
+ movu m2, [r13+r10+18]
+ vpblendvb m4, m0, m1
+ vpblendvb m5, m0, m2
+ ret
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 12
+ jmp .h_main
+.h_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, xm12
+ vinserti128 m4, [lpfq+wq+12], 1
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+r10+ 0]
+.h_main:
+ movu m5, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ mova [t1+r10+400*0], m1
+ mova [t1+r10+400*2], m2
+ mova [t1+r10+400*4], m3
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 12
+ jmp .hv0_main
+.hv0_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, xm12
+ vinserti128 m4, [lpfq+wq+12], 1
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu m4, [lpfq+r10+ 0]
+.hv0_main:
+ movu m5, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -34
+ jl .hv0_have_right
+ call .extend_right
+.hv0_have_right:
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ paddw m0, m1, [t1+r10+400*0]
+ paddd m4, m2, [t1+r10+400*2]
+ paddd m5, m3, [t1+r10+400*4]
+ mova [t1+r10+400*0], m1
+ mova [t1+r10+400*2], m2
+ mova [t1+r10+400*4], m3
+ paddw m1, m0, [t2+r10+400*0]
+ paddd m2, m4, [t2+r10+400*2]
+ paddd m3, m5, [t2+r10+400*4]
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m4
+ mova [t2+r10+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ psubd m4, m2 ; p
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m9 ; p * s
+ pmulld m5, m9
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*0+ 4], m2
+ mova [t3+r10*2+400*0+ 8], xm0
+ vextracti128 [t3+r10*2+400*0+40], m0, 1
+ mova [t3+r10*2+400*0+24], xm1
+ vextracti128 [t3+r10*2+400*0+56], m1, 1
+ add r10, 32
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 12
+ jmp .hv1_main
+.hv1_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, xm12
+ vinserti128 m4, [lpfq+wq+12], 1
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu m4, [lpfq+r10+ 0]
+.hv1_main:
+ movu m5, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -34
+ jl .hv1_have_right
+ call .extend_right
+.hv1_have_right:
+ palignr m1, m5, m4, 2
+ paddw m0, m4, m1
+ punpcklwd m2, m4, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m0, m5 ; h sum
+ punpcklwd m1, m5, m6
+ pmaddwd m1, m1
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m1 ; h sumsq
+ paddd m3, m5
+ paddw m1, m0, [t2+r10+400*0]
+ paddd m4, m2, [t2+r10+400*2]
+ paddd m5, m3, [t2+r10+400*4]
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m2
+ mova [t2+r10+400*4], m3
+ paddd m4, m8
+ paddd m5, m8
+ psrld m4, 4 ; (a + 8) >> 4
+ psrld m5, 4
+ pslld m2, m4, 3
+ pslld m3, m5, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ psubd m4, m2 ; p
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m9 ; p * s
+ pmulld m5, m9
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*2 +4], m2
+ mova [t3+r10*2+400*4+ 8], xm0
+ vextracti128 [t3+r10*2+400*4+40], m0, 1
+ mova [t3+r10*2+400*4+24], xm1
+ vextracti128 [t3+r10*2+400*4+56], m1, 1
+ add r10, 32
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab (even rows)
+ lea r10, [wq-4]
+.v0_loop:
+ mova m0, [t1+r10+400*0]
+ mova m4, [t1+r10+400*2]
+ mova m5, [t1+r10+400*4]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+r10+400*0]
+ paddd m2, m4, [t2+r10+400*2]
+ paddd m3, m5, [t2+r10+400*4]
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m4
+ mova [t2+r10+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ psubd m4, m2 ; p
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m9 ; p * s
+ pmulld m5, m9
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*0+ 4], m2
+ mova [t3+r10*2+400*0+ 8], xm0
+ vextracti128 [t3+r10*2+400*0+40], m0, 1
+ mova [t3+r10*2+400*0+24], xm1
+ vextracti128 [t3+r10*2+400*0+56], m1, 1
+ add r10, 32
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+.v1_loop:
+ mova m0, [t1+r10+400*0]
+ mova m4, [t1+r10+400*2]
+ mova m5, [t1+r10+400*4]
+ paddw m1, m0, [t2+r10+400*0]
+ paddd m2, m4, [t2+r10+400*2]
+ paddd m3, m5, [t2+r10+400*4]
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m4
+ mova [t2+r10+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ psubd m4, m2 ; p
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m9 ; p * s
+ pmulld m5, m9
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*2+ 4], m2
+ mova [t3+r10*2+400*4+ 8], xm0
+ vextracti128 [t3+r10*2+400*4+40], m0, 1
+ mova [t3+r10*2+400*4+24], xm1
+ vextracti128 [t3+r10*2+400*4+56], m1, 1
+ add r10, 32
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ mova xm0, [t4+r10*1+400*0+0]
+ paddw xm0, [t4+r10*1+400*0+4]
+ paddw xm2, xm0, [t4+r10*1+400*0+2]
+ mova m1, [t3+r10*2+400*0+0]
+ paddd m1, [t3+r10*2+400*0+8]
+ paddd m3, m1, [t3+r10*2+400*0+4]
+ psllw xm2, 2 ; a[-1] 444
+ pslld m3, 2 ; b[-1] 444
+ psubw xm2, xm0 ; a[-1] 343
+ psubd m3, m1 ; b[-1] 343
+ mova [t4+r10*1+400* 4], xm2
+ mova [t3+r10*2+400* 8], m3
+ mova xm0, [t4+r10*1+400*2+0]
+ paddw xm0, [t4+r10*1+400*2+4]
+ paddw xm2, xm0, [t4+r10*1+400*2+2]
+ mova m1, [t3+r10*2+400*4+0]
+ paddd m1, [t3+r10*2+400*4+8]
+ paddd m3, m1, [t3+r10*2+400*4+4]
+ psllw xm2, 2 ; a[ 0] 444
+ pslld m3, 2 ; b[ 0] 444
+ mova [t4+r10*1+400* 6], xm2
+ mova [t3+r10*2+400*12], m3
+ psubw xm2, xm0 ; a[ 0] 343
+ psubd m3, m1 ; b[ 0] 343
+ mova [t4+r10*1+400* 8], xm2
+ mova [t3+r10*2+400*16], m3
+ add r10, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ mova m3, [t4+r10*1+400*0+0]
+ paddw m3, [t4+r10*1+400*0+4]
+ paddw m1, m3, [t4+r10*1+400*0+2]
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+r10*1+400*4]
+ paddw m3, [t4+r10*1+400*6]
+ mova [t4+r10*1+400*4], m2
+ mova [t4+r10*1+400*6], m1
+ mova m4, [t3+r10*2+400*0+0]
+ paddd m4, [t3+r10*2+400*0+8]
+ paddd m1, m4, [t3+r10*2+400*0+4]
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+r10*2+400* 8+ 0]
+ paddd m4, [t3+r10*2+400*12+ 0]
+ mova [t3+r10*2+400* 8+ 0], m2
+ mova [t3+r10*2+400*12+ 0], m1
+ mova m5, [t3+r10*2+400*0+32]
+ paddd m5, [t3+r10*2+400*0+40]
+ paddd m1, m5, [t3+r10*2+400*0+36]
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+r10*2+400* 8+32]
+ paddd m5, [t3+r10*2+400*12+32]
+ mova [t3+r10*2+400* 8+32], m2
+ mova [t3+r10*2+400*12+32], m1
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vinserti128 m1, m4, xm5, 1
+ vperm2i128 m4, m5, 0x31
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m4, m3
+ psrad m1, 9
+ psrad m4, 9
+ packssdw m1, m4
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m13
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m3, [t4+r10*1+400*2+0]
+ paddw m3, [t4+r10*1+400*2+4]
+ paddw m1, m3, [t4+r10*1+400*2+2]
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+r10*1+400*6]
+ paddw m3, [t4+r10*1+400*8]
+ mova [t4+r10*1+400*6], m1
+ mova [t4+r10*1+400*8], m2
+ mova m4, [t3+r10*2+400*4+0]
+ paddd m4, [t3+r10*2+400*4+8]
+ paddd m1, m4, [t3+r10*2+400*4+4]
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+r10*2+400*12+ 0]
+ paddd m4, [t3+r10*2+400*16+ 0]
+ mova [t3+r10*2+400*12+ 0], m1
+ mova [t3+r10*2+400*16+ 0], m2
+ mova m5, [t3+r10*2+400*4+32]
+ paddd m5, [t3+r10*2+400*4+40]
+ paddd m1, m5, [t3+r10*2+400*4+36]
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+r10*2+400*12+32]
+ paddd m5, [t3+r10*2+400*16+32]
+ mova [t3+r10*2+400*12+32], m1
+ mova [t3+r10*2+400*16+32], m2
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vinserti128 m1, m4, xm5, 1
+ vperm2i128 m4, m5, 0x31
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m4, m3
+ psrad m1, 9
+ psrad m4, 9
+ packssdw m1, m4
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m13
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+cglobal sgr_filter_mix_16bpc, 4, 14, 16, 400*66+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x_avx2+256*4]
+ add wd, wd
+ movifnidn hd, hm
+ mov edged, r7m
+ add lpfq, wq
+ vpbroadcastd m15, [paramsq+8] ; w0 w1
+ add dstq, wq
+ vpbroadcastd m13, [paramsq+0] ; s0
+ lea t3, [rsp+wq*2+400*24+8]
+ vpbroadcastd m14, [paramsq+4] ; s1
+ lea t4, [rsp+wq+400*52+8]
+ vpbroadcastd m9, [pd_8]
+ lea t1, [rsp+wq+12]
+ vpbroadcastd m10, [pd_34816]
+ neg wq
+ vpbroadcastd m11, [pd_4096]
+ pxor m7, m7
+ vpbroadcastd m12, [pd_0xf00801c7]
+ psllw m15, 2
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup
+ add t1, 400*12
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea r10, [wq-4]
+ lea t2, [t1+400*12]
+.top_fixup_loop:
+ mova m0, [t1+r10+400* 0]
+ mova m1, [t1+r10+400* 2]
+ mova m2, [t1+r10+400* 4]
+ paddw m0, m0
+ mova m3, [t1+r10+400* 6]
+ paddd m1, m1
+ mova m4, [t1+r10+400* 8]
+ paddd m2, m2
+ mova m5, [t1+r10+400*10]
+ mova [t2+r10+400* 0], m0
+ mova [t2+r10+400* 2], m1
+ mova [t2+r10+400* 4], m2
+ mova [t2+r10+400* 6], m3
+ mova [t2+r10+400* 8], m4
+ mova [t2+r10+400*10], m5
+ add r10, 32
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 10
+ jmp .h_main
+.h_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, [sgr_lshuf5]
+ vinserti128 m4, [lpfq+wq+10], 1
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+r10- 2]
+.h_main:
+ movu m5, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -36
+ jl .h_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
+.h_have_right:
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; sum3
+ punpcklwd m6, m0, m7
+ pmaddwd m6, m6
+ punpckhwd m0, m7
+ pmaddwd m0, m0
+ paddd m2, m6 ; sumsq3
+ shufpd m6, m4, m5, 0x05
+ punpcklwd m5, m6, m4
+ paddw m8, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m6, m4
+ pmaddwd m6, m6
+ paddd m3, m0
+ mova [t1+r10+400* 6], m1
+ mova [t1+r10+400* 8], m2
+ mova [t1+r10+400*10], m3
+ paddw m8, m1 ; sum5
+ paddd m5, m2 ; sumsq5
+ paddd m6, m3
+ mova [t1+r10+400* 0], m8
+ mova [t1+r10+400* 2], m5
+ mova [t1+r10+400* 4], m6
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 10
+ jmp .hv0_main
+.hv0_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, [sgr_lshuf5]
+ vinserti128 m4, [lpfq+wq+10], 1
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu m4, [lpfq+r10- 2]
+.hv0_main:
+ movu m5, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -36
+ jl .hv0_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
+.hv0_have_right:
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; h sum3
+ punpcklwd m6, m0, m7
+ pmaddwd m6, m6
+ punpckhwd m0, m7
+ pmaddwd m0, m0
+ paddd m2, m6 ; h sumsq3
+ shufpd m6, m4, m5, 0x05
+ punpcklwd m5, m6, m4
+ paddw m8, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m6, m4
+ pmaddwd m6, m6
+ paddd m3, m0
+ paddw m8, m1 ; h sum5
+ paddd m5, m2 ; h sumsq5
+ paddd m6, m3
+ mova [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row TODO: t4?
+ mova [t3+r10*2+400*0+ 8], m5 ; in case height is odd
+ mova [t3+r10*2+400*0+40], m6
+ paddw m8, [t1+r10+400* 0]
+ paddd m5, [t1+r10+400* 2]
+ paddd m6, [t1+r10+400* 4]
+ mova [t1+r10+400* 0], m8
+ mova [t1+r10+400* 2], m5
+ mova [t1+r10+400* 4], m6
+ paddw m0, m1, [t1+r10+400* 6]
+ paddd m4, m2, [t1+r10+400* 8]
+ paddd m5, m3, [t1+r10+400*10]
+ mova [t1+r10+400* 6], m1
+ mova [t1+r10+400* 8], m2
+ mova [t1+r10+400*10], m3
+ paddw m1, m0, [t2+r10+400* 6]
+ paddd m2, m4, [t2+r10+400* 8]
+ paddd m3, m5, [t2+r10+400*10]
+ mova [t2+r10+400* 6], m0
+ mova [t2+r10+400* 8], m4
+ mova [t2+r10+400*10], m5
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pmaxud m4, m2
+ psubd m4, m2 ; p3
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m12 ; b3 * 455
+ pmaddwd m1, m12
+ paddusw m4, m12
+ paddusw m5, m12
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x3
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*2+ 4], m2
+ mova [t3+r10*2+400*4+ 8], xm0
+ vextracti128 [t3+r10*2+400*4+40], m0, 1
+ mova [t3+r10*2+400*4+24], xm1
+ vextracti128 [t3+r10*2+400*4+56], m1, 1
+ add r10, 32
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 10
+ jmp .hv1_main
+.hv1_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, [sgr_lshuf5]
+ vinserti128 m4, [lpfq+wq+10], 1
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu m4, [lpfq+r10- 2]
+.hv1_main:
+ movu m5, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -36
+ jl .hv1_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
+.hv1_have_right:
+ palignr m6, m5, m4, 2
+ palignr m3, m5, m4, 4
+ paddw m2, m6, m3
+ punpcklwd m0, m6, m3
+ pmaddwd m0, m0
+ punpckhwd m6, m3
+ pmaddwd m6, m6
+ palignr m3, m5, m4, 6
+ paddw m2, m3 ; h sum3
+ punpcklwd m1, m3, m7
+ pmaddwd m1, m1
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ paddd m0, m1 ; h sumsq3
+ shufpd m1, m4, m5, 0x05
+ punpckhwd m5, m4, m1
+ paddw m8, m4, m1
+ pmaddwd m5, m5
+ punpcklwd m4, m1
+ pmaddwd m4, m4
+ paddd m6, m3
+ paddw m1, m2, [t2+r10+400* 6]
+ mova [t2+r10+400* 6], m2
+ paddw m8, m2 ; h sum5
+ paddd m2, m0, [t2+r10+400* 8]
+ paddd m3, m6, [t2+r10+400*10]
+ mova [t2+r10+400* 8], m0
+ mova [t2+r10+400*10], m6
+ paddd m4, m0 ; h sumsq5
+ paddd m5, m6
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pslld m0, m2, 3
+ pslld m6, m3, 3
+ paddd m2, m0 ; ((a3 + 8) >> 4) * 9
+ paddd m3, m6
+ psrlw m6, m1, 1
+ pavgw m6, m7 ; (b3 + 2) >> 2
+ punpcklwd m0, m6, m7
+ pmaddwd m0, m0
+ punpckhwd m6, m7
+ pmaddwd m6, m6
+ pmaxud m2, m0
+ psubd m2, m0 ; p3
+ pmaxud m3, m6
+ psubd m3, m6
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pmulld m2, m14 ; p3 * s1
+ pmulld m3, m14
+ pmaddwd m0, m12 ; b3 * 455
+ pmaddwd m1, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrad m7, m2, 20 ; min(z3, 255) - 256
+ vpgatherdd m6, [r13+m7*4], m2 ; x3
+ psrad m2, m3, 20
+ vpgatherdd m7, [r13+m2*4], m3
+ pmulld m0, m6
+ packssdw m6, m7
+ pmulld m7, m1
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m7, m10
+ psrld m0, 12
+ psrld m7, 12
+ paddw m1, m8, [t2+r10+400*0]
+ paddd m2, m4, [t2+r10+400*2]
+ paddd m3, m5, [t2+r10+400*4]
+ paddw m1, [t1+r10+400*0]
+ paddd m2, [t1+r10+400*2]
+ paddd m3, [t1+r10+400*4]
+ mova [t2+r10+400*0], m8
+ mova [t2+r10+400*2], m4
+ mova [t2+r10+400*4], m5
+ mova [t4+r10*1+400*4 +4], m6
+ mova [t3+r10*2+400*8+ 8], xm0
+ vextracti128 [t3+r10*2+400*8+40], m0, 1
+ mova [t3+r10*2+400*8+24], xm7
+ vextracti128 [t3+r10*2+400*8+56], m7, 1
+ vpbroadcastd m4, [pd_25]
+ pxor m7, m7
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m4 ; ((a5 + 8) >> 4) * 25
+ pmulld m3, m4
+ psrlw m5, m1, 1
+ pavgw m5, m7 ; (b5 + 2) >> 2
+ punpcklwd m4, m5, m7
+ pmaddwd m4, m4
+ punpckhwd m5, m7
+ pmaddwd m5, m5
+ punpcklwd m0, m1, m7 ; b5
+ punpckhwd m1, m7
+ pmaxud m2, m4
+ psubd m2, m4 ; p5
+ vpbroadcastd m4, [pd_0xf00800a4]
+ pmaxud m3, m5
+ psubd m3, m5
+ pmulld m2, m13 ; p5 * s0
+ pmulld m3, m13
+ pmaddwd m0, m4 ; b5 * 164
+ pmaddwd m1, m4
+ paddusw m2, m4
+ paddusw m3, m4
+ psrad m5, m2, 20 ; min(z5, 255) - 256
+ vpgatherdd m4, [r13+m5*4], m2 ; x5
+ psrad m2, m3, 20
+ vpgatherdd m5, [r13+m2*4], m3
+ pmulld m0, m4
+ pmulld m1, m5
+ packssdw m4, m5
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*0+ 4], m4
+ mova [t3+r10*2+400*0+ 8], xm0
+ vextracti128 [t3+r10*2+400*0+40], m0, 1
+ mova [t3+r10*2+400*0+24], xm1
+ vextracti128 [t3+r10*2+400*0+56], m1, 1
+ add r10, 32
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+ lea r10, [wq-4]
+.v0_loop:
+ mova m0, [t1+r10+400* 6]
+ mova m4, [t1+r10+400* 8]
+ mova m5, [t1+r10+400*10]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+r10+400* 6]
+ paddd m2, m4, [t2+r10+400* 8]
+ paddd m3, m5, [t2+r10+400*10]
+ mova [t2+r10+400* 6], m0
+ mova [t2+r10+400* 8], m4
+ mova [t2+r10+400*10], m5
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pmaxud m4, m2
+ psubd m4, m2 ; p3
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m12 ; b3 * 455
+ pmaddwd m1, m12
+ paddusw m4, m12
+ paddusw m5, m12
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x3
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ psrld m0, 12
+ psrld m1, 12
+ mova m3, [t1+r10+400*0]
+ mova m4, [t1+r10+400*2]
+ mova m5, [t1+r10+400*4]
+ mova [t3+r10*2+400*8+ 8], m3
+ mova [t3+r10*2+400*0+ 8], m4
+ mova [t3+r10*2+400*0+40], m5
+ paddw m3, m3 ; cc5
+ paddd m4, m4
+ paddd m5, m5
+ mova [t1+r10+400*0], m3
+ mova [t1+r10+400*2], m4
+ mova [t1+r10+400*4], m5
+ mova [t4+r10*1+400*2+ 4], m2
+ mova [t3+r10*2+400*4+ 8], xm0
+ vextracti128 [t3+r10*2+400*4+40], m0, 1
+ mova [t3+r10*2+400*4+24], xm1
+ vextracti128 [t3+r10*2+400*4+56], m1, 1
+ add r10, 32
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+.v1_loop:
+ mova m4, [t1+r10+400* 6]
+ mova m5, [t1+r10+400* 8]
+ mova m6, [t1+r10+400*10]
+ paddw m1, m4, [t2+r10+400* 6]
+ paddd m2, m5, [t2+r10+400* 8]
+ paddd m3, m6, [t2+r10+400*10]
+ mova [t2+r10+400* 6], m4
+ mova [t2+r10+400* 8], m5
+ mova [t2+r10+400*10], m6
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pmaxud m4, m2
+ psubd m4, m2 ; p3
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m12 ; b3 * 455
+ pmaddwd m1, m12
+ paddusw m4, m12
+ paddusw m5, m12
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x3
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ psrld m0, 12
+ psrld m8, m1, 12
+ mova [t4+r10*1+400*4+4], m2
+ mova m4, [t3+r10*2+400*8+ 8]
+ mova m5, [t3+r10*2+400*0+ 8]
+ mova m6, [t3+r10*2+400*0+40]
+ paddw m1, m4, [t2+r10+400*0]
+ paddd m2, m5, [t2+r10+400*2]
+ paddd m3, m6, [t2+r10+400*4]
+ paddw m1, [t1+r10+400*0]
+ paddd m2, [t1+r10+400*2]
+ paddd m3, [t1+r10+400*4]
+ mova [t2+r10+400*0], m4
+ mova [t2+r10+400*2], m5
+ mova [t2+r10+400*4], m6
+ vpbroadcastd m4, [pd_25]
+ mova [t3+r10*2+400*8+ 8], xm0
+ vextracti128 [t3+r10*2+400*8+40], m0, 1
+ mova [t3+r10*2+400*8+24], xm8
+ vextracti128 [t3+r10*2+400*8+56], m8, 1
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m4 ; ((a5 + 8) >> 4) * 25
+ pmulld m3, m4
+ psrlw m5, m1, 1
+ pavgw m5, m7 ; (b5 + 2) >> 2
+ punpcklwd m4, m5, m7
+ pmaddwd m4, m4
+ punpckhwd m5, m7
+ pmaddwd m5, m5
+ punpcklwd m0, m1, m7 ; b5
+ punpckhwd m1, m7
+ pmaxud m2, m4
+ psubd m2, m4 ; p5
+ vpbroadcastd m4, [pd_0xf00800a4]
+ pmaxud m3, m5
+ psubd m3, m5
+ pmulld m2, m13 ; p5 * s0
+ pmulld m3, m13
+ pmaddwd m0, m4 ; b5 * 164
+ pmaddwd m1, m4
+ paddusw m2, m4
+ paddusw m3, m4
+ psrad m5, m2, 20 ; min(z5, 255) - 256
+ vpgatherdd m4, [r13+m5*4], m2 ; x5
+ psrad m2, m3, 20
+ vpgatherdd m5, [r13+m2*4], m3
+ pmulld m0, m4
+ pmulld m1, m5
+ packssdw m4, m5
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*0+ 4], m4
+ mova [t3+r10*2+400*0+ 8], xm0
+ vextracti128 [t3+r10*2+400*0+40], m0, 1
+ mova [t3+r10*2+400*0+24], xm1
+ vextracti128 [t3+r10*2+400*0+56], m1, 1
+ add r10, 32
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu xm0, [t4+r10*1+400*0+2]
+ paddw xm2, xm0, [t4+r10*1+400*0+0]
+ paddw xm2, [t4+r10*1+400*0+4]
+ movu m1, [t3+r10*2+400*0+4]
+ paddd m3, m1, [t3+r10*2+400*0+0]
+ paddd m3, [t3+r10*2+400*0+8]
+ paddw xm0, xm2
+ paddd m1, m3
+ psllw xm2, 2
+ pslld m3, 2
+ paddw xm0, xm2 ; a5 565
+ paddd m1, m3 ; b5 565
+ mova [t4+r10*1+400* 6], xm0
+ mova [t3+r10*2+400*12], m1
+ mova xm0, [t4+r10*1+400*2+0]
+ paddw xm0, [t4+r10*1+400*2+4]
+ paddw xm2, xm0, [t4+r10*1+400*2+2]
+ mova m1, [t3+r10*2+400*4+0]
+ paddd m1, [t3+r10*2+400*4+8]
+ paddd m3, m1, [t3+r10*2+400*4+4]
+ psllw xm2, 2 ; a3[-1] 444
+ pslld m3, 2 ; b3[-1] 444
+ psubw xm2, xm0 ; a3[-1] 343
+ psubd m3, m1 ; b3[-1] 343
+ mova [t4+r10*1+400* 8], xm2
+ mova [t3+r10*2+400*16], m3
+ mova xm0, [t4+r10*1+400*4+0]
+ paddw xm0, [t4+r10*1+400*4+4]
+ paddw xm2, xm0, [t4+r10*1+400*4+2]
+ mova m1, [t3+r10*2+400*8+0]
+ paddd m1, [t3+r10*2+400*8+8]
+ paddd m3, m1, [t3+r10*2+400*8+4]
+ psllw xm2, 2 ; a3[ 0] 444
+ pslld m3, 2 ; b3[ 0] 444
+ mova [t4+r10*1+400*10], xm2
+ mova [t3+r10*2+400*20], m3
+ psubw xm2, xm0 ; a3[ 0] 343
+ psubd m3, m1 ; b3[ 0] 343
+ mova [t4+r10*1+400*12], xm2
+ mova [t3+r10*2+400*24], m3
+ add r10, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu xm2, [t4+r10*1+2]
+ paddw xm0, xm2, [t4+r10*1+0]
+ paddw xm0, [t4+r10*1+4]
+ paddw xm2, xm0
+ psllw xm0, 2
+ paddw xm0, xm2 ; a5
+ movu m1, [t3+r10*2+4]
+ paddd m4, m1, [t3+r10*2+0]
+ paddd m4, [t3+r10*2+8]
+ paddd m1, m4
+ pslld m4, 2
+ paddd m4, m1 ; b5
+ paddw xm2, xm0, [t4+r10*1+400* 6]
+ mova [t4+r10*1+400* 6], xm0
+ paddd m0, m4, [t3+r10*2+400*12]
+ mova [t3+r10*2+400*12], m4
+ mova xm3, [t4+r10*1+400*2+0]
+ paddw xm3, [t4+r10*1+400*2+4]
+ paddw xm5, xm3, [t4+r10*1+400*2+2]
+ psllw xm5, 2 ; a3[ 1] 444
+ psubw xm4, xm5, xm3 ; a3[ 1] 343
+ paddw xm3, xm4, [t4+r10*1+400* 8]
+ paddw xm3, [t4+r10*1+400*10]
+ mova [t4+r10*1+400* 8], xm4
+ mova [t4+r10*1+400*10], xm5
+ mova m1, [t3+r10*2+400*4+0]
+ paddd m1, [t3+r10*2+400*4+8]
+ paddd m5, m1, [t3+r10*2+400*4+4]
+ pslld m5, 2 ; b3[ 1] 444
+ psubd m4, m5, m1 ; b3[ 1] 343
+ paddd m1, m4, [t3+r10*2+400*16]
+ paddd m1, [t3+r10*2+400*20]
+ mova [t3+r10*2+400*16], m4
+ mova [t3+r10*2+400*20], m5
+ pmovzxwd m4, [dstq+r10]
+ pmovzxwd m2, xm2 ; a5
+ pmovzxwd m3, xm3 ; a3
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ pslld m4, 13
+ psubd m0, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m1, m3 ; b3 - a3 * src + (1 << 8)
+ psrld m0, 9
+ pslld m1, 7
+ pblendw m0, m1, 0xaa
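+; even words now hold the 5x5 term and odd words the 3x3 term, so a single
+; pmaddwd with m15 (presumably the packed w0/w1 weights set up in the
+; prologue, which is outside this hunk) forms their weighted sum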
+ pmaddwd m0, m15
+ paddd m4, m11
+ paddd m0, m4
+ psrad m0, 7
+ vextracti128 xm1, m0, 1
+ packusdw xm0, xm1 ; clip
+ psrlw xm0, 6
+ mova [dstq+r10], xm0
+ add r10, 16
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova xm3, [t4+r10*1+400*4+0]
+ paddw xm3, [t4+r10*1+400*4+4]
+ paddw xm5, xm3, [t4+r10*1+400*4+2]
+ psllw xm5, 2 ; a3[ 1] 444
+ psubw xm4, xm5, xm3 ; a3[ 1] 343
+ paddw xm3, xm4, [t4+r10*1+400*12]
+ paddw xm3, [t4+r10*1+400*10]
+ mova [t4+r10*1+400*10], xm5
+ mova [t4+r10*1+400*12], xm4
+ mova m1, [t3+r10*2+400*8+0]
+ paddd m1, [t3+r10*2+400*8+8]
+ paddd m5, m1, [t3+r10*2+400*8+4]
+ pslld m5, 2 ; b3[ 1] 444
+ psubd m4, m5, m1 ; b3[ 1] 343
+ paddd m1, m4, [t3+r10*2+400*24]
+ paddd m1, [t3+r10*2+400*20]
+ mova [t3+r10*2+400*20], m5
+ mova [t3+r10*2+400*24], m4
+ pmovzxwd m4, [dstq+r10]
+ pmovzxwd m2, [t4+r10*1+400* 6]
+ pmovzxwd m3, xm3
+ mova m0, [t3+r10*2+400*12]
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ pslld m4, 13
+ psubd m0, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m1, m3 ; b3 - a3 * src + (1 << 8)
+ psrld m0, 8
+ pslld m1, 7
+ pblendw m0, m1, 0xaa
+ pmaddwd m0, m15
+ paddd m4, m11
+ paddd m0, m4
+ psrad m0, 7
+ vextracti128 xm1, m0, 1
+ packusdw xm0, xm1 ; clip
+ psrlw xm0, 6
+ mova [dstq+r10], xm0
+ add r10, 16
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/looprestoration16_avx512.asm b/third_party/dav1d/src/x86/looprestoration16_avx512.asm
new file mode 100644
index 0000000000..e560c54a40
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration16_avx512.asm
@@ -0,0 +1,2524 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 16
+
+wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
+wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11
+wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
+wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1
+wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+r_ext_mask: times 72 db -1
+ times 8 db 0
+wiener_hshift: dw 4, 4, 1, 1
+wiener_vshift: dw 1024, 1024, 4096, 4096
+wiener_round: dd 1049600, 1048832
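+; 1049600 = (1 << 20) + (1 << 10), 1048832 = (1 << 20) + (1 << 8)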
+
+pw_164_455: dw 164, 455
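+; 164 ~= (1 << 12) / 25, 455 ~= (1 << 12) / 9 (scaled reciprocals of the box areas)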
+pw_1023: times 2 dw 1023
+pw_61448: times 2 dw 61448
+pd_m262128: dd -262128
+pd_m34816: dd -34816
+pd_m25: dd -25
+pd_m9: dd -9
+pd_8: dd 8
+pd_2147483648: dd 2147483648
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
+
+INIT_ZMM avx512icl
+cglobal wiener_filter7_16bpc, 4, 15, 17, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt
+%define base t4-wiener_hshift
+ mov fltq, r6mp
+ movifnidn wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ vbroadcasti128 m6, [wiener_shufA]
+ vpbroadcastd m12, [fltq+ 0] ; x0 x1
+ lea t4, [wiener_hshift]
+ vbroadcasti128 m7, [wiener_shufB]
+ add wd, wd
+ vpbroadcastd m13, [fltq+ 4] ; x2 x3
+ shr t3d, 11
+ vpbroadcastd m14, [fltq+16] ; y0 y1
+ add lpfq, wq
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ add dstq, wq
+ vbroadcasti128 m8, [wiener_shufC]
+ lea t1, [rsp+wq+16]
+ vbroadcasti128 m9, [wiener_shufD]
+ neg wq
+ vpbroadcastd m0, [base+wiener_hshift+t3*4]
+ mov r10d, 0xfe
+ vpbroadcastd m10, [base+wiener_round+t3*4]
+ kmovb k1, r10d
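+; k1 = 0xfe skips the low qword in the masked row loads below, so the
+; four pixels loaded from leftq are kept at the left edge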
+ vpbroadcastd m11, [base+wiener_vshift+t3*4]
+ pmullw m12, m0 ; upshift filter coefs to make the
+ vpbroadcastd m16, [pd_m262128]
+ pmullw m13, m0 ; horizontal downshift constant
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+.v2:
+ call .v
+ jmp .v1
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq xm3, [leftq]
+ vmovdqu64 m3{k1}, [lpfq+r10-8]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ mova m4, [lpfq+r10+0]
+ vpbroadcastw xm3, xm4
+ vmovdqu64 m3{k1}, [lpfq+r10-8]
+ jmp .h_main2
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+r10-8]
+.h_main:
+ mova m4, [lpfq+r10+0]
+.h_main2:
+ movu m5, [lpfq+r10+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -68
+ jl .h_have_right
+ push r0
+ lea r0, [r_ext_mask+66]
+ vpbroadcastw m0, [lpfq-2]
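+; r_ext_mask is -1 inside the row and 0 past its end, so the ternlog
+; select (c ? a : b) keeps the loaded pixels where the mask is set and
+; substitutes the broadcast rightmost pixel beyond the edge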
+ vpternlogd m3, m0, [r0+r10+ 0], 0xe4 ; c ? a : b
+ vpternlogd m4, m0, [r0+r10+ 8], 0xe4
+ vpternlogd m5, m0, [r0+r10+16], 0xe4
+ pop r0
+.h_have_right:
+ pshufb m2, m3, m6
+ pshufb m1, m4, m7
+ paddw m2, m1
+ pshufb m3, m8
+ mova m0, m16
+ vpdpwssd m0, m2, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ vpdpwssd m0, m3, m13
+ pshufb m2, m5, m7
+ paddw m2, m1
+ mova m1, m16
+ pshufb m4, m8
+ vpdpwssd m1, m2, m12
+ pshufb m5, m9
+ paddw m4, m5
+ vpdpwssd m1, m4, m13
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+r10], m0
+ add r10, 64
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movq xm3, [leftq]
+ vmovdqu64 m3{k1}, [lpfq+r10-8]
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ mova m4, [lpfq+r10+0]
+ vpbroadcastw xm3, xm4
+ vmovdqu64 m3{k1}, [lpfq+r10-8]
+ jmp .hv_main2
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+r10-8]
+.hv_main:
+ mova m4, [lpfq+r10+0]
+.hv_main2:
+ movu m5, [lpfq+r10+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -68
+ jl .hv_have_right
+ push r0
+ lea r0, [r_ext_mask+66]
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m3, m0, [r0+r10+ 0], 0xe4
+ vpternlogd m4, m0, [r0+r10+ 8], 0xe4
+ vpternlogd m5, m0, [r0+r10+16], 0xe4
+ pop r0
+.hv_have_right:
+ pshufb m2, m3, m6
+ pshufb m1, m4, m7
+ paddw m2, m1
+ pshufb m3, m8
+ mova m0, m16
+ vpdpwssd m0, m2, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ vpdpwssd m0, m3, m13
+ pshufb m2, m5, m7
+ paddw m2, m1
+ pshufb m4, m8
+ mova m1, m16
+ vpdpwssd m1, m2, m12
+ pshufb m5, m9
+ paddw m4, m5
+ vpdpwssd m1, m4, m13
+ mova m2, [t4+r10]
+ paddw m2, [t2+r10]
+ mova m5, [t3+r10]
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova m4, [t5+r10]
+ paddw m4, [t1+r10]
+ psraw m0, 1
+ paddw m3, m0, [t6+r10]
+ mova [t0+r10], m0
+ punpcklwd m1, m2, m5
+ mova m0, m10
+ vpdpwssd m0, m1, m15
+ punpckhwd m2, m5
+ mova m1, m10
+ vpdpwssd m1, m2, m15
+ punpcklwd m2, m3, m4
+ vpdpwssd m0, m2, m14
+ punpckhwd m3, m4
+ vpdpwssd m1, m3, m14
+ psrad m0, 5
+ psrad m1, 5
+ packusdw m0, m1
+ pmulhuw m0, m11
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .hv_loop
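+; rotate the ring of row pointers down one row; t0 ends up sharing a
+; buffer with t6, which is safe because each chunk of [t6] is read
+; before [t0] is stored in the loop above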
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m2, [t4+r10]
+ paddw m2, [t2+r10]
+ mova m3, [t3+r10]
+ punpcklwd m1, m2, m3
+ mova m0, m10
+ vpdpwssd m0, m1, m15
+ punpckhwd m2, m3
+ mova m1, m10
+ vpdpwssd m1, m2, m15
+ mova m4, [t1+r10]
+ paddw m3, m4, [t6+r10]
+ paddw m4, [t5+r10]
+ punpcklwd m2, m3, m4
+ vpdpwssd m0, m2, m14
+ punpckhwd m3, m4
+ vpdpwssd m1, m3, m14
+ psrad m0, 5
+ psrad m1, 5
+ packusdw m0, m1
+ pmulhuw m0, m11
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .v_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+ ret
+
+cglobal wiener_filter5_16bpc, 4, 14, 15, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt
+%define base r13-r_ext_mask-70
+ mov fltq, r6mp
+ movifnidn wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ vbroadcasti128 m5, [wiener_shufE]
+ vpbroadcastw m11, [fltq+ 2] ; x1
+ vbroadcasti128 m6, [wiener_shufB]
+ lea r13, [r_ext_mask+70]
+ vbroadcasti128 m7, [wiener_shufD]
+ add wd, wd
+ vpbroadcastd m12, [fltq+ 4] ; x2 x3
+ shr t3d, 11
+ vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18)
+ add lpfq, wq
+ vpbroadcastw m13, [fltq+18] ; y1
+ add dstq, wq
+ vpbroadcastd m14, [fltq+20] ; y2 y3
+ lea t1, [rsp+wq+16]
+ vpbroadcastd m0, [base+wiener_hshift+t3*4]
+ neg wq
+ vpbroadcastd m9, [base+wiener_round+t3*4]
+ mov r10d, 0xfffe
+ vpbroadcastd m10, [base+wiener_vshift+t3*4]
+ kmovw k1, r10d
+ pmullw m11, m0
+ pmullw m12, m0
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t3, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call .v
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+.v1:
+ call .v
+ jmp .end
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm3, [leftq+4]
+ vmovdqu32 m3{k1}, [lpfq+r10-4]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastw xm3, [lpfq+r10]
+ vmovdqu32 m3{k1}, [lpfq+r10-4]
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+r10-4]
+.h_main:
+ movu m4, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -66
+ jl .h_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m3, m0, [r13+r10+0], 0xe4 ; c ? a : b
+ vpternlogd m4, m0, [r13+r10+8], 0xe4
+.h_have_right:
+ pshufb m1, m3, m5
+ mova m0, m8
+ vpdpwssd m0, m1, m11
+ pshufb m2, m4, m5
+ mova m1, m8
+ vpdpwssd m1, m2, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ vpdpwssd m0, m2, m12
+ pshufb m4, m7
+ paddw m3, m4
+ vpdpwssd m1, m3, m12
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+r10], m0
+ add r10, 64
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm3, [leftq+4]
+ vmovdqu32 m3{k1}, [lpfq+r10-4]
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastw xm3, [lpfq+r10]
+ vmovdqu32 m3{k1}, [lpfq+r10-4]
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+r10-4]
+.hv_main:
+ movu m4, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -66
+ jl .hv_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m3, m0, [r13+r10+0], 0xe4
+ vpternlogd m4, m0, [r13+r10+8], 0xe4
+.hv_have_right:
+ pshufb m1, m3, m5
+ mova m0, m8
+ vpdpwssd m0, m1, m11
+ pshufb m2, m4, m5
+ mova m1, m8
+ vpdpwssd m1, m2, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ vpdpwssd m0, m2, m12
+ pshufb m4, m7
+ paddw m4, m3
+ vpdpwssd m1, m4, m12
+ mova m2, [t3+r10]
+ paddw m2, [t1+r10]
+ mova m3, [t2+r10]
+ punpcklwd m4, m2, m3
+ punpckhwd m2, m3
+ mova m3, m9
+ vpdpwssd m3, m2, m14
+ mova m2, m9
+ vpdpwssd m2, m4, m14
+ mova m4, [t4+r10]
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t0+r10], m0
+ punpcklwd m1, m0, m4
+ vpdpwssd m2, m1, m13
+ punpckhwd m0, m4
+ vpdpwssd m3, m0, m13
+ psrad m2, 5
+ psrad m3, 5
+ packusdw m2, m3
+ pmulhuw m2, m10
+ mova [dstq+r10], m2
+ add r10, 64
+ jl .hv_loop
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m0, [t1+r10]
+ paddw m2, m0, [t3+r10]
+ mova m1, [t2+r10]
+ mova m4, [t4+r10]
+ punpckhwd m3, m2, m1
+ pmaddwd m3, m14
+ punpcklwd m2, m1
+ pmaddwd m2, m14
+ punpckhwd m1, m0, m4
+ pmaddwd m1, m13
+ punpcklwd m0, m4
+ pmaddwd m0, m13
+ paddd m3, m9
+ paddd m2, m9
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 5
+ psrad m0, 5
+ packusdw m0, m1
+ pmulhuw m0, m10
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .v_loop
+ ret
+
+cglobal sgr_filter_5x5_16bpc, 4, 14, 22, 416*24+8, dst, stride, left, lpf, \
+ w, h, edge, params
+%define base r13-r_ext_mask-72
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [r_ext_mask+72]
+ mov edged, r7m
+ movifnidn hd, hm
+ pxor m6, m6
+ vpbroadcastw m7, [paramsq+8] ; w0
+ add wd, wd
+ vpbroadcastd m8, [base+pd_8]
+ add lpfq, wq
+ vpbroadcastd m9, [base+pd_m25]
+ add dstq, wq
+ vpsubd m10, m6, [paramsq+0] {1to16} ; -s0
+ lea t3, [rsp+wq*2+416*12+8]
+ vpbroadcastd m11, [base+pw_164_455]
+ lea t4, [rsp+wq+416*20+8]
+ vpbroadcastd m12, [base+pw_61448] ; (15 << 12) + (1 << 3)
+ lea t1, [rsp+wq+12]
+ vpbroadcastd m13, [base+pd_m34816] ; -((1 << 11) + (1 << 15))
+ neg wq
+ vpbroadcastd m14, [base+pw_1023]
+ psllw m7, 4
+ mova m18, [sgr_x_by_x+64*0]
+ mov r10d, 0xfffffff8
+ mova m19, [sgr_x_by_x+64*1]
+ kmovd k1, r10d
+ mova m20, [sgr_x_by_x+64*2]
+ mov r10, 0x3333333333333333
+ mova m21, [sgr_x_by_x+64*3]
+ kmovq k2, r10
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call .top_fixup
+ add t1, 416*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ test hd, hd
+ jz .odd_height
+ call .h
+ add lpfq, strideq
+ call .hv
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .h_top
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+416*6]
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ jmp .main
+.no_top_height1:
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq xm16, [leftq+2]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m16, [lpfq+r10- 2]
+.h_main:
+ movu m17, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -68
+ jl .h_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4 ; c ? a : b
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.h_have_right:
+ palignr m2, m17, m16, 2
+ paddw m0, m16, m2
+ palignr m3, m17, m16, 6
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ shufpd m17, m16, m17, 0x55
+ paddw m0, m17
+ punpcklwd m3, m16, m17
+ vpdpwssd m1, m3, m3
+ punpckhwd m3, m16, m17
+ vpdpwssd m2, m3, m3
+ shufps m16, m17, q2121
+ paddw m0, m16 ; sum
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+r10+416*0]
+ paddd m1, [t1+r10+416*2]
+ paddd m2, [t1+r10+416*4]
+.h_loop_end:
+ punpcklwd m17, m16, m6
+ vpdpwssd m1, m17, m17 ; sumsq
+ punpckhwd m16, m6
+ vpdpwssd m2, m16, m16
+ mova [t1+r10+416*0], m0
+ mova [t1+r10+416*2], m1
+ mova [t1+r10+416*4], m2
+ add r10, 64
+ jl .h_loop
+ ret
+.top_fixup:
+ lea r10, [wq-4]
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+r10+416*0]
+ mova m1, [t1+r10+416*2]
+ mova m2, [t1+r10+416*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m1
+ mova [t2+r10+416*4], m2
+ add r10, 64
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movq xm16, [leftq+2]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m16, [lpfq+r10- 2]
+.hv_main:
+ movu m17, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -68
+ jl .hv_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv_have_right:
+ palignr m3, m17, m16, 2
+ paddw m0, m16, m3
+ palignr m1, m17, m16, 6
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ shufpd m17, m16, m17, 0x55
+ paddw m0, m17
+ punpcklwd m1, m16, m17
+ vpdpwssd m2, m1, m1
+ punpckhwd m1, m16, m17
+ vpdpwssd m3, m1, m1
+ shufps m16, m17, q2121
+ paddw m0, m16 ; h sum
+ punpcklwd m17, m16, m6
+ vpdpwssd m2, m17, m17 ; h sumsq
+ punpckhwd m16, m6
+ vpdpwssd m3, m16, m16
+ paddw m1, m0, [t1+r10+416*0]
+ paddd m16, m2, [t1+r10+416*2]
+ paddd m17, m3, [t1+r10+416*4]
+ test hd, hd
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+r10+416*0] ; hv sum
+ paddd m16, [t2+r10+416*2] ; hv sumsq
+ paddd m17, [t2+r10+416*4]
+ mova [t0+r10+416*0], m0
+ mova [t0+r10+416*2], m2
+ mova [t0+r10+416*4], m3
+ psrlw m3, m1, 1
+ paddd m16, m8
+ pavgw m3, m6 ; (b + 2) >> 2
+ paddd m17, m8
+ psrld m16, 4 ; (a + 8) >> 4
+ psrld m17, 4
+ pmulld m16, m9 ; -a * 25
+ pmulld m17, m9
+ punpcklwd m2, m3, m6
+ vpdpwssd m16, m2, m2 ; -p
+ punpckhwd m3, m6
+ vpdpwssd m17, m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmulld m16, m10 ; p * s
+ pmulld m17, m10
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
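+; gatherless sgr_x_by_x lookup: the masked vpalignr (k2) packs the high
+; words of both sets of 32-bit p*s products into one register, the
+; saturating add + psraw 4 then give min(z, 255) - 256 per word, and the
+; 256-byte table is split in half: vpermi2b serves indices 0..127
+; (m18/m19), vpermt2b serves 128..255 (m20/m21), with vpmovb2m on the
+; index sign bit selecting which result to keep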
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ pmaxsw m17, m6
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ packssdw m16, m17
+ psubd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ psubd m1, m13
+ mova [t4+r10+4], m16
+ psrld m16, m0, 12 ; b
+ psrld m17, m1, 12
+ mova [t3+r10*2+ 8], xm16
+ mova [t3+r10*2+ 24], xm17
+ vextracti128 [t3+r10*2+ 40], ym16, 1
+ vextracti128 [t3+r10*2+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+104], m16, 3
+ vextracti32x4 [t3+r10*2+120], m17, 3
+ add r10, 64
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+r10+416*0], m1
+ paddw m1, m0
+ mova [t1+r10+416*2], m16
+ paddd m16, m2
+ mova [t1+r10+416*4], m17
+ paddd m17, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+ lea r10, [wq-4]
+.v_loop:
+ mova m2, [t1+r10+416*2]
+ mova m3, [t1+r10+416*4]
+ mova m0, [t1+r10+416*0]
+ paddd m16, m2, [t2+r10+416*2]
+ paddd m17, m3, [t2+r10+416*4]
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m2
+ paddd m3, m3
+ paddd m16, m2 ; hv sumsq
+ paddd m17, m3
+ paddd m16, m8
+ paddd m17, m8
+ psrld m16, 4 ; (a + 8) >> 4
+ psrld m17, 4
+ pmulld m16, m9 ; -a * 25
+ pmulld m17, m9
+ paddw m0, m0
+ paddw m1, m0 ; hv sum
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ vpdpwssd m16, m2, m2 ; -p
+ punpckhwd m3, m6
+ vpdpwssd m17, m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmulld m16, m10 ; p * s
+ pmulld m17, m10
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ pmaxsw m17, m6
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ packssdw m16, m17
+ psubd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ psubd m1, m13
+ mova [t4+r10+4], m16
+ psrld m16, m0, 12 ; b
+ psrld m17, m1, 12
+ mova [t3+r10*2+ 8], xm16
+ mova [t3+r10*2+ 24], xm17
+ vextracti128 [t3+r10*2+ 40], ym16, 1
+ vextracti128 [t3+r10*2+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+104], m16, 3
+ vextracti32x4 [t3+r10*2+120], m17, 3
+ add r10, 64
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t4+r10*1+ 2]
+ movu m1, [t3+r10*2+ 4]
+ movu m2, [t3+r10*2+68]
+ paddw m3, m0, [t4+r10*1+ 0]
+ paddd m16, m1, [t3+r10*2+ 0]
+ paddd m17, m2, [t3+r10*2+64]
+ paddw m3, [t4+r10*1+ 4]
+ paddd m16, [t3+r10*2+ 8]
+ paddd m17, [t3+r10*2+72]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m16
+ pslld m16, 2
+ paddd m2, m17
+ pslld m17, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m16 ; b 565
+ paddd m2, m17
+ mova [t4+r10*1+416*2+ 0], m0
+ mova [t3+r10*2+416*4+ 0], m1
+ mova [t3+r10*2+416*4+64], m2
+ add r10, 64
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m0, [t4+r10*1+ 2]
+ movu m1, [t3+r10*2+ 4]
+ movu m2, [t3+r10*2+68]
+ paddw m3, m0, [t4+r10*1+ 0]
+ paddd m16, m1, [t3+r10*2+ 0]
+ paddd m17, m2, [t3+r10*2+64]
+ paddw m3, [t4+r10*1+ 4]
+ paddd m16, [t3+r10*2+ 8]
+ paddd m17, [t3+r10*2+72]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m16
+ pslld m16, 2
+ paddd m2, m17
+ pslld m17, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m16 ; b 565
+ paddd m2, m17
+ paddw m3, m0, [t4+r10*1+416*2+ 0]
+ paddd m16, m1, [t3+r10*2+416*4+ 0]
+ paddd m17, m2, [t3+r10*2+416*4+64]
+ mova [t4+r10*1+416*2+ 0], m0
+ mova [t3+r10*2+416*4+ 0], m1
+ mova [t3+r10*2+416*4+64], m2
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vshufi32x4 m1, m16, m17, q2020
+ vshufi32x4 m16, m17, q3131
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m16, m3
+ psrad m1, 9
+ psrad m16, 9
+ packssdw m1, m16
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m0, [dstq+r10]
+ mova m3, [t4+r10*1+416*2+ 0]
+ mova m16, [t3+r10*2+416*4+ 0]
+ mova m17, [t3+r10*2+416*4+64]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vshufi32x4 m1, m16, m17, q2020
+ vshufi32x4 m16, m17, q3131
+ psubd m1, m2 ; b - a * src + (1 << 7)
+ psubd m16, m3
+ psrad m1, 8
+ psrad m16, 8
+ packssdw m1, m16
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+cglobal sgr_filter_3x3_16bpc, 4, 14, 22, 416*42+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [r_ext_mask+72]
+ mov edged, r7m
+ movifnidn hd, hm
+ pxor m6, m6
+ vpbroadcastw m7, [paramsq+10] ; w1
+ add wd, wd
+ vpbroadcastd m8, [base+pd_8]
+ add lpfq, wq
+ vpbroadcastd m9, [base+pd_m9]
+ add dstq, wq
+ vpsubd m10, m6, [paramsq+4] {1to16} ; -s1
+ lea t3, [rsp+wq*2+416*12+8]
+ vpbroadcastd m11, [base+pw_164_455]
+ lea t4, [rsp+wq+416*32+8]
+ vpbroadcastd m12, [base+pw_61448]
+ lea t1, [rsp+wq+12]
+ vpbroadcastd m13, [base+pd_m34816]
+ neg wq
+ vpbroadcastd m14, [base+pw_1023]
+ psllw m7, 4
+ mova m18, [sgr_x_by_x+64*0]
+ mov r10d, 0xfffffffc
+ mova m19, [sgr_x_by_x+64*1]
+ kmovd k1, r10d
+ mova m20, [sgr_x_by_x+64*2]
+ mov r10, 0x3333333333333333
+ mova m21, [sgr_x_by_x+64*3]
+ kmovq k2, r10
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ add t1, 416*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea r10, [wq-4]
+ lea t2, [t1+416*6]
+.top_fixup_loop:
+ mova m0, [t1+r10+416*0]
+ mova m1, [t1+r10+416*2]
+ mova m2, [t1+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m1
+ mova [t2+r10+416*4], m2
+ add r10, 64
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm16, [leftq+4]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m16, [lpfq+r10+ 0]
+.h_main:
+ movu m17, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -66
+ jl .h_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.h_have_right:
+ palignr m0, m17, m16, 2
+ paddw m1, m16, m0
+ punpcklwd m2, m16, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m16, m0
+ pmaddwd m3, m3
+ palignr m17, m16, 4
+ paddw m1, m17 ; sum
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; sumsq
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ mova [t1+r10+416*0], m1
+ mova [t1+r10+416*2], m2
+ mova [t1+r10+416*4], m3
+ add r10, 64
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movd xm16, [leftq+4]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ add leftq, 8
+ jmp .hv0_main
+.hv0_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu m16, [lpfq+r10+ 0]
+.hv0_main:
+ movu m17, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -66
+ jl .hv0_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv0_have_right:
+ palignr m0, m17, m16, 2
+ paddw m1, m16, m0
+ punpcklwd m2, m16, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m16, m0
+ pmaddwd m3, m3
+ palignr m17, m16, 4
+ paddw m1, m17 ; sum
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; sumsq
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ paddw m0, m1, [t1+r10+416*0]
+ paddd m16, m2, [t1+r10+416*2]
+ paddd m17, m3, [t1+r10+416*4]
+ mova [t1+r10+416*0], m1
+ mova [t1+r10+416*2], m2
+ mova [t1+r10+416*4], m3
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m16, [t2+r10+416*2]
+ paddd m3, m17, [t2+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m16
+ mova [t2+r10+416*4], m17
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m17, m1, 1
+ pavgw m17, m6 ; (b + 2) >> 2
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; -p
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ punpcklwd m16, m6, m1 ; b
+ punpckhwd m17, m6, m1
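+; m6 is zero, so the two pminsd below clamp -p3 to <= 0 (forcing p3 >= 0)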
+ pminsd m2, m6
+ pminsd m3, m6
+ pmulld m2, m10 ; p * s
+ pmulld m3, m10
+ pmaddwd m16, m11 ; b * 455
+ pmaddwd m17, m11
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ paddusw m3, m12
+ psraw m3, 4 ; min(z, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x
+ pandn m2, m13, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m13
+ mova [t4+r10*1+416*0+4], m2
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*0+ 8], xm16
+ mova [t3+r10*2+416*0+ 24], xm17
+ vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*0+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*0+120], m17, 3
+ add r10, 64
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movd xm16, [leftq+4]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ add leftq, 8
+ jmp .hv1_main
+.hv1_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu m16, [lpfq+r10+ 0]
+.hv1_main:
+ movu m17, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -66
+ jl .hv1_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv1_have_right:
+ palignr m1, m17, m16, 2
+ paddw m0, m16, m1
+ punpcklwd m2, m16, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m16, m1
+ pmaddwd m3, m3
+ palignr m17, m16, 4
+ paddw m0, m17 ; h sum
+ punpcklwd m1, m17, m6
+ vpdpwssd m2, m1, m1 ; h sumsq
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m16, m2, [t2+r10+416*2]
+ paddd m17, m3, [t2+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m2
+ mova [t2+r10+416*4], m3
+ paddd m16, m8
+ paddd m17, m8
+ psrld m16, 4 ; (a + 8) >> 4
+ psrld m17, 4
+ pmulld m16, m9 ; -((a + 8) >> 4) * 9
+ pmulld m17, m9
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ vpdpwssd m16, m2, m2 ; -p
+ punpckhwd m3, m6
+ vpdpwssd m17, m3, m3
+ punpcklwd m0, m6, m1 ; b
+ punpckhwd m1, m6, m1
+ pminsd m16, m6
+ pminsd m17, m6
+ pmulld m16, m10 ; p * s
+ pmulld m17, m10
+ pmaddwd m0, m11 ; b * 455
+ pmaddwd m1, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ packssdw m16, m17
+ psubd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ psubd m1, m13
+ mova [t4+r10*1+416*2+4], m16
+ psrld m16, m0, 12
+ psrld m17, m1, 12
+ mova [t3+r10*2+416*4+ 8], xm16
+ mova [t3+r10*2+416*4+ 24], xm17
+ vextracti128 [t3+r10*2+416*4+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*4+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*4+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*4+120], m17, 3
+ add r10, 64
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab (even rows)
+ lea r10, [wq-4]
+.v0_loop:
+ mova m0, [t1+r10+416*0]
+ mova m16, [t1+r10+416*2]
+ mova m17, [t1+r10+416*4]
+ paddw m0, m0
+ paddd m16, m16
+ paddd m17, m17
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m16, [t2+r10+416*2]
+ paddd m3, m17, [t2+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m16
+ mova [t2+r10+416*4], m17
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m17, m1, 1
+ pavgw m17, m6 ; (b + 2) >> 2
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; -p
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ punpcklwd m16, m6, m1 ; b
+ punpckhwd m17, m6, m1
+ pminsd m2, m6
+ pminsd m3, m6
+ pmulld m2, m10 ; p * s
+ pmulld m3, m10
+ pmaddwd m16, m11 ; b * 455
+ pmaddwd m17, m11
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ paddusw m3, m12
+ psraw m3, 4 ; min(z, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x
+ pandn m2, m13, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m13
+ mova [t4+r10*1+416*0+4], m2
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*0+ 8], xm16
+ mova [t3+r10*2+416*0+ 24], xm17
+ vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*0+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*0+120], m17, 3
+ add r10, 64
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+.v1_loop:
+ mova m0, [t1+r10+416*0]
+ mova m16, [t1+r10+416*2]
+ mova m17, [t1+r10+416*4]
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m16, [t2+r10+416*2]
+ paddd m3, m17, [t2+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m16
+ mova [t2+r10+416*4], m17
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m17, m1, 1
+ pavgw m17, m6 ; (b + 2) >> 2
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; -p
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ punpcklwd m16, m6, m1 ; b
+ punpckhwd m17, m6, m1
+ pminsd m2, m6
+ pminsd m3, m6
+ pmulld m2, m10 ; p * s
+ pmulld m3, m10
+ pmaddwd m16, m11 ; b * 455
+ pmaddwd m17, m11
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ paddusw m3, m12
+ psraw m3, 4 ; min(z, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x
+ pandn m2, m13, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m13
+ mova [t4+r10*1+416*2+4], m2
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*4+ 8], xm16
+ mova [t3+r10*2+416*4+ 24], xm17
+ vextracti128 [t3+r10*2+416*4+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*4+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*4+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*4+120], m17, 3
+ add r10, 64
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ mova ym16, [t4+r10*1+416*0+0]
+ paddw ym16, [t4+r10*1+416*0+4]
+ paddw ym17, ym16, [t4+r10*1+416*0+2]
+ mova m0, [t3+r10*2+416*0+0]
+ paddd m0, [t3+r10*2+416*0+8]
+ paddd m1, m0, [t3+r10*2+416*0+4]
+ psllw ym17, 2 ; a[-1] 444
+ pslld m1, 2 ; b[-1] 444
+ psubw ym17, ym16 ; a[-1] 343
+ psubd m1, m0 ; b[-1] 343
+ vmovdqa32 [t4+r10*1+416* 4], ym17
+ vmovdqa32 [t3+r10*2+416* 8], m1
+ mova ym16, [t4+r10*1+416*2+0]
+ paddw ym16, [t4+r10*1+416*2+4]
+ paddw ym17, ym16, [t4+r10*1+416*2+2]
+ mova m0, [t3+r10*2+416*4+0]
+ paddd m0, [t3+r10*2+416*4+8]
+ paddd m1, m0, [t3+r10*2+416*4+4]
+ psllw ym17, 2 ; a[ 0] 444
+ pslld m1, 2 ; b[ 0] 444
+ vmovdqa32 [t4+r10*1+416* 6], ym17
+ vmovdqa32 [t3+r10*2+416*12], m1
+ psubw ym17, ym16 ; a[ 0] 343
+ psubd m1, m0 ; b[ 0] 343
+ vmovdqa32 [t4+r10*1+416* 8], ym17
+ vmovdqa32 [t3+r10*2+416*16], m1
+ add r10, 32
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ mova m3, [t4+r10*1+416*0+0]
+ paddw m3, [t4+r10*1+416*0+4]
+ paddw m1, m3, [t4+r10*1+416*0+2]
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+r10*1+416*4]
+ paddw m3, [t4+r10*1+416*6]
+ mova [t4+r10*1+416*4], m2
+ mova [t4+r10*1+416*6], m1
+ mova m16, [t3+r10*2+416*0+0]
+ paddd m16, [t3+r10*2+416*0+8]
+ paddd m1, m16, [t3+r10*2+416*0+4]
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m16 ; b[ 1] 343
+ paddd m16, m2, [t3+r10*2+416* 8+ 0]
+ paddd m16, [t3+r10*2+416*12+ 0]
+ mova [t3+r10*2+416* 8+ 0], m2
+ mova [t3+r10*2+416*12+ 0], m1
+ mova m17, [t3+r10*2+416*0+64]
+ paddd m17, [t3+r10*2+416*0+72]
+ paddd m1, m17, [t3+r10*2+416*0+68]
+ pslld m1, 2
+ psubd m2, m1, m17
+ paddd m17, m2, [t3+r10*2+416* 8+64]
+ paddd m17, [t3+r10*2+416*12+64]
+ mova [t3+r10*2+416* 8+64], m2
+ mova [t3+r10*2+416*12+64], m1
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vshufi32x4 m1, m16, m17, q2020
+ vshufi32x4 m16, m17, q3131
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m16, m3
+ psrad m1, 9
+ psrad m16, 9
+ packssdw m1, m16
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m3, [t4+r10*1+416*2+0]
+ paddw m3, [t4+r10*1+416*2+4]
+ paddw m1, m3, [t4+r10*1+416*2+2]
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+r10*1+416*6]
+ paddw m3, [t4+r10*1+416*8]
+ mova [t4+r10*1+416*6], m1
+ mova [t4+r10*1+416*8], m2
+ mova m16, [t3+r10*2+416*4+0]
+ paddd m16, [t3+r10*2+416*4+8]
+ paddd m1, m16, [t3+r10*2+416*4+4]
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m16 ; b[ 1] 343
+ paddd m16, m2, [t3+r10*2+416*12+ 0]
+ paddd m16, [t3+r10*2+416*16+ 0]
+ mova [t3+r10*2+416*12+ 0], m1
+ mova [t3+r10*2+416*16+ 0], m2
+ mova m17, [t3+r10*2+416*4+64]
+ paddd m17, [t3+r10*2+416*4+72]
+ paddd m1, m17, [t3+r10*2+416*4+68]
+ pslld m1, 2
+ psubd m2, m1, m17
+ paddd m17, m2, [t3+r10*2+416*12+64]
+ paddd m17, [t3+r10*2+416*16+64]
+ mova [t3+r10*2+416*12+64], m1
+ mova [t3+r10*2+416*16+64], m2
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vshufi32x4 m1, m16, m17, q2020
+ vshufi32x4 m16, m17, q3131
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m16, m3
+ psrad m1, 9
+ psrad m16, 9
+ packssdw m1, m16
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+cglobal sgr_filter_mix_16bpc, 4, 14, 23, 416*66+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [r_ext_mask+72]
+ mov edged, r7m
+ movifnidn hd, hm
+ vpbroadcastd m7, [paramsq+8] ; w0 w1
+ pxor m6, m6
+ vpbroadcastd m8, [base+pd_8]
+ add wd, wd
+ vpbroadcastd m9, [base+pd_m9]
+ add lpfq, wq
+ vpbroadcastd m10, [base+pd_m25]
+ add dstq, wq
+ vpsubd m11, m6, [paramsq+0] {1to16} ; -s0
+ lea t3, [rsp+wq*2+416*24+8]
+ vpsubd m12, m6, [paramsq+4] {1to16} ; -s1
+ lea t4, [rsp+wq+416*52+8]
+ vpbroadcastd m13, [base+pw_164_455]
+ lea t1, [rsp+wq+12]
+ vpbroadcastd m14, [base+pw_61448]
+ neg wq
+ vpbroadcastd m15, [base+pd_m34816]
+ psllw m7, 2
+ vpbroadcastd m22, [base+pd_2147483648]
+ mov r10d, 0xfffffff8
+ mova m18, [sgr_x_by_x+64*0]
+ kmovd k1, r10d
+ mova m19, [sgr_x_by_x+64*1]
+ mov r10, 0x3333333333333333
+ mova m20, [sgr_x_by_x+64*2]
+ kmovq k2, r10
+ mova m21, [sgr_x_by_x+64*3]
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx512icl).top_fixup
+ add t1, 416*12
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea r10, [wq-4]
+ lea t2, [t1+416*12]
+.top_fixup_loop:
+ mova m0, [t1+r10+416* 0]
+ mova m1, [t1+r10+416* 2]
+ mova m2, [t1+r10+416* 4]
+ paddw m0, m0
+ mova m3, [t1+r10+416* 6]
+ paddd m1, m1
+ mova m4, [t1+r10+416* 8]
+ paddd m2, m2
+ mova m5, [t1+r10+416*10]
+ mova [t2+r10+416* 0], m0
+ mova [t2+r10+416* 2], m1
+ mova [t2+r10+416* 4], m2
+ mova [t2+r10+416* 6], m3
+ mova [t2+r10+416* 8], m4
+ mova [t2+r10+416*10], m5
+ add r10, 64
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq xm16, [leftq+2]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m16, [lpfq+r10- 2]
+.h_main:
+ movu m17, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -68
+ jl .h_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.h_have_right:
+ palignr m3, m17, m16, 2
+ palignr m0, m17, m16, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m17, m16, 6
+ paddw m1, m0 ; sum3
+ punpcklwd m4, m0, m6
+ vpdpwssd m2, m4, m4 ; sumsq3
+ punpckhwd m0, m6
+ vpdpwssd m3, m0, m0
+ shufpd m4, m16, m17, 0x55
+ punpcklwd m17, m4, m16
+ paddw m0, m16, m4
+ punpckhwd m4, m16
+ mova [t1+r10+416* 6], m1
+ mova [t1+r10+416* 8], m2
+ mova [t1+r10+416*10], m3
+ paddw m1, m0 ; sum5
+ vpdpwssd m2, m17, m17 ; sumsq5
+ vpdpwssd m3, m4, m4
+ mova [t1+r10+416* 0], m1
+ mova [t1+r10+416* 2], m2
+ mova [t1+r10+416* 4], m3
+ add r10, 64
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movq xm16, [leftq+2]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ add leftq, 8
+ jmp .hv0_main
+.hv0_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu m16, [lpfq+r10- 2]
+.hv0_main:
+ movu m17, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -68
+ jl .hv0_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv0_have_right:
+ palignr m3, m17, m16, 2
+ palignr m0, m17, m16, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m17, m16, 6
+ paddw m1, m0 ; h sum3
+ punpcklwd m4, m0, m6
+ vpdpwssd m2, m4, m4 ; h sumsq3
+ punpckhwd m0, m6
+ vpdpwssd m3, m0, m0
+ shufpd m17, m16, m17, 0x55
+ paddw m4, m1, [t1+r10+416* 6]
+ paddd m5, m2, [t1+r10+416* 8]
+ mova [t1+r10+416* 6], m1
+ mova [t1+r10+416* 8], m2
+ paddw m1, m16
+ paddw m1, m17 ; h sum5
+ punpcklwd m0, m17, m16
+ vpdpwssd m2, m0, m0 ; h sumsq5
+ paddd m0, m3, [t1+r10+416*10]
+ mova [t1+r10+416*10], m3
+ punpckhwd m17, m16
+ vpdpwssd m3, m17, m17
+ mova [t3+r10*2+416*8+ 8], m1 ; we need a clean copy of the last row
+ mova [t3+r10*2+416*0+ 8], m2 ; in case height is odd
+ mova [t3+r10*2+416*0+72], m3
+ paddw m1, [t1+r10+416* 0]
+ paddd m2, [t1+r10+416* 2]
+ paddd m3, [t1+r10+416* 4]
+ mova [t1+r10+416* 0], m1
+ mova [t1+r10+416* 2], m2
+ mova [t1+r10+416* 4], m3
+ paddw m17, m4, [t2+r10+416* 6]
+ paddd m2, m5, [t2+r10+416* 8]
+ paddd m3, m0, [t2+r10+416*10]
+ mova [t2+r10+416* 6], m4
+ mova [t2+r10+416* 8], m5
+ mova [t2+r10+416*10], m0
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a3 + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m5, m17, 1
+ pavgw m5, m6 ; (b3 + 2) >> 2
+ punpcklwd m4, m5, m6
+ vpdpwssd m2, m4, m4 ; -p3
+ punpckhwd m5, m6
+ vpdpwssd m3, m5, m5
+ punpcklwd m16, m6, m17 ; b3
+ punpckhwd m17, m6, m17
+ pminsd m2, m6
+ pminsd m3, m6
+ pmulld m2, m12 ; p3 * s1
+ pmulld m3, m12
+ pmaddwd m16, m13 ; b3 * 455
+ pmaddwd m17, m13
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ paddusw m3, m14
+ psraw m3, 4 ; min(z3, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x3
+ pandn m2, m15, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ mova [t4+r10*1+416*2+4], m2
+ psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*4+ 8], xm16
+ mova [t3+r10*2+416*4+ 24], xm17
+ vextracti128 [t3+r10*2+416*4+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*4+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*4+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*4+120], m17, 3
+ add r10, 64
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movq xm16, [leftq+2]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ add leftq, 8
+ jmp .hv1_main
+.hv1_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu m16, [lpfq+r10- 2]
+.hv1_main:
+ movu m17, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -68
+ jl .hv1_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv1_have_right:
+ palignr m1, m17, m16, 2
+ palignr m3, m17, m16, 4
+ paddw m2, m1, m3
+ punpcklwd m0, m1, m3
+ pmaddwd m0, m0
+ punpckhwd m1, m3
+ pmaddwd m1, m1
+ palignr m3, m17, m16, 6
+ paddw m2, m3 ; h sum3
+ punpcklwd m5, m3, m6
+ vpdpwssd m0, m5, m5 ; h sumsq3
+ punpckhwd m3, m6
+ vpdpwssd m1, m3, m3
+ shufpd m3, m16, m17, 0x55
+ punpcklwd m5, m16, m3
+ paddw m4, m16, m3
+ punpckhwd m16, m3
+ paddw m17, m2, [t2+r10+416* 6]
+ mova [t2+r10+416* 6], m2
+ paddw m4, m2 ; h sum5
+ paddd m2, m0, [t2+r10+416* 8]
+ paddd m3, m1, [t2+r10+416*10]
+ mova [t2+r10+416* 8], m0
+ mova [t2+r10+416*10], m1
+ vpdpwssd m0, m5, m5 ; h sumsq5
+ vpdpwssd m1, m16, m16
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a3 + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m16, m17, 1
+ pavgw m16, m6 ; (b3 + 2) >> 2
+ punpcklwd m5, m16, m6
+ vpdpwssd m2, m5, m5 ; -p3
+ punpckhwd m16, m6
+ vpdpwssd m3, m16, m16
+ punpcklwd m16, m6, m17 ; b3
+ punpckhwd m17, m6, m17
+ pminsd m2, m6
+ pminsd m3, m6
+ pmulld m2, m12 ; p3 * s1
+ pmulld m3, m12
+ pmaddwd m16, m13 ; b3 * 455
+ pmaddwd m17, m13
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ paddusw m3, m14
+ psraw m3, 4 ; min(z3, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x3
+ pandn m2, m15, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ mova [t4+r10*1+416*4+4], m2
+ psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ paddw m5, m4, [t2+r10+416*0]
+ paddd m2, m0, [t2+r10+416*2]
+ paddd m3, m1, [t2+r10+416*4]
+ paddw m5, [t1+r10+416*0]
+ paddd m2, [t1+r10+416*2]
+ paddd m3, [t1+r10+416*4]
+ mova [t2+r10+416*0], m4
+ mova [t2+r10+416*2], m0
+ mova [t2+r10+416*4], m1
+ mova [t3+r10*2+416*8+ 8], xm16
+ mova [t3+r10*2+416*8+ 24], xm17
+ vextracti128 [t3+r10*2+416*8+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*8+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*8+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*8+120], m17, 3
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m10 ; -((a5 + 8) >> 4) * 25
+ pmulld m3, m10
+ psrlw m17, m5, 1
+ pavgw m17, m6 ; (b5 + 2) >> 2
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; -p5
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ punpcklwd m16, m5, m6 ; b5
+ punpckhwd m17, m5, m6
+ pmulld m2, m11 ; p5 * s0
+ pmulld m3, m11
+ pmaddwd m16, m13 ; b5 * 164
+ pmaddwd m17, m13
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ pmaxsw m3, m6
+ paddusw m3, m14
+ psraw m3, 4 ; min(z5, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x5
+ pandn m2, m15, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ mova [t4+r10*1+416*0+4], m2
+ psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*0+ 8], xm16
+ mova [t3+r10*2+416*0+ 24], xm17
+ vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*0+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*0+120], m17, 3
+ add r10, 64
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+ lea r10, [wq-4]
+.v0_loop:
+ mova m16, [t1+r10+416* 6]
+ mova m2, [t1+r10+416* 8]
+ mova m3, [t1+r10+416*10]
+ paddw m16, m16
+ paddd m2, m2
+ paddd m3, m3
+ paddw m17, m16, [t2+r10+416* 6]
+ paddd m4, m2, [t2+r10+416* 8]
+ paddd m5, m3, [t2+r10+416*10]
+ mova [t2+r10+416* 6], m16
+ mova [t2+r10+416* 8], m2
+ mova [t2+r10+416*10], m3
+ paddd m4, m8
+ paddd m5, m8
+ psrld m4, 4 ; (a3 + 8) >> 4
+ psrld m5, 4
+ pmulld m4, m9 ; -((a3 + 8) >> 4) * 9
+ pmulld m5, m9
+ psrlw m3, m17, 1
+ pavgw m3, m6 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m6
+ vpdpwssd m4, m2, m2 ; -p3
+ punpckhwd m3, m6
+ vpdpwssd m5, m3, m3
+ punpcklwd m16, m6, m17 ; b3
+ punpckhwd m17, m6, m17
+ pminsd m4, m6
+ pminsd m5, m6
+ pmulld m4, m12 ; p3 * s1
+ pmulld m5, m12
+ pmaddwd m16, m13 ; b3 * 455
+ pmaddwd m17, m13
+ vpalignr m5{k2}, m4, m4, 2
+ mova m4, m20
+ paddusw m5, m14
+ psraw m5, 4 ; min(z3, 255) - 256
+ vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m5
+ vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m5{k3}, m4 ; x3
+ pandn m4, m15, m5
+ psrld m5, 16
+ pmulld m16, m4
+ pmulld m17, m5
+ packssdw m4, m5
+ mova [t4+r10*1+416*2+4], m4
+ psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova m3, [t1+r10+416*0]
+ mova m4, [t1+r10+416*2]
+ mova m5, [t1+r10+416*4]
+ mova [t3+r10*2+416*8+ 8], m3
+ mova [t3+r10*2+416*0+ 8], m4
+ mova [t3+r10*2+416*0+72], m5
+ paddw m3, m3 ; cc5
+ paddd m4, m4
+ paddd m5, m5
+ mova [t1+r10+416*0], m3
+ mova [t1+r10+416*2], m4
+ mova [t1+r10+416*4], m5
+ mova [t3+r10*2+416*4+ 8], xm16
+ mova [t3+r10*2+416*4+ 24], xm17
+ vextracti128 [t3+r10*2+416*4+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*4+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*4+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*4+120], m17, 3
+ add r10, 64
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+.v1_loop:
+ mova m16, [t1+r10+416* 6]
+ mova m2, [t1+r10+416* 8]
+ mova m3, [t1+r10+416*10]
+ paddw m17, m16, [t2+r10+416* 6]
+ paddd m4, m2, [t2+r10+416* 8]
+ paddd m5, m3, [t2+r10+416*10]
+ mova [t2+r10+416* 6], m16
+ mova [t2+r10+416* 8], m2
+ mova [t2+r10+416*10], m3
+ paddd m4, m8
+ paddd m5, m8
+ psrld m4, 4 ; (a3 + 8) >> 4
+ psrld m5, 4
+ pmulld m4, m9 ; -((a3 + 8) >> 4) * 9
+ pmulld m5, m9
+ psrlw m3, m17, 1
+ pavgw m3, m6 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m6
+ vpdpwssd m4, m2, m2 ; -p3
+ punpckhwd m3, m6
+ vpdpwssd m5, m3, m3
+ punpcklwd m16, m6, m17 ; b3
+ punpckhwd m17, m6, m17
+ pminsd m4, m6
+ pminsd m5, m6
+ pmulld m4, m12 ; p3 * s1
+ pmulld m5, m12
+ pmaddwd m16, m13 ; b3 * 455
+ pmaddwd m17, m13
+ vpalignr m5{k2}, m4, m4, 2
+ mova m4, m20
+ paddusw m5, m14
+ psraw m5, 4 ; min(z3, 255) - 256
+ vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m5
+ vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m5{k3}, m4 ; x3
+ pandn m4, m15, m5
+ psrld m5, 16
+ pmulld m16, m4
+ pmulld m17, m5
+ packssdw m4, m5
+ mova [t4+r10*1+416*4+4], m4
+ psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova m0, [t3+r10*2+416*8+ 8]
+ mova m4, [t3+r10*2+416*0+ 8]
+ mova m5, [t3+r10*2+416*0+72]
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m4, [t2+r10+416*2]
+ paddd m3, m5, [t2+r10+416*4]
+ paddw m1, [t1+r10+416*0]
+ paddd m2, [t1+r10+416*2]
+ paddd m3, [t1+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m4
+ mova [t2+r10+416*4], m5
+ mova [t3+r10*2+416*8+ 8], xm16
+ mova [t3+r10*2+416*8+ 24], xm17
+ vextracti128 [t3+r10*2+416*8+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*8+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*8+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*8+120], m17, 3
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m10 ; -((a5 + 8) >> 4) * 25
+ pmulld m3, m10
+ psrlw m5, m1, 1
+ pavgw m5, m6 ; (b5 + 2) >> 2
+ punpcklwd m4, m5, m6
+ vpdpwssd m2, m4, m4 ; -p5
+ punpckhwd m5, m6
+ vpdpwssd m3, m5, m5
+ punpcklwd m16, m1, m6 ; b5
+ punpckhwd m17, m1, m6
+ pmulld m2, m11 ; p5 * s0
+ pmulld m3, m11
+ pmaddwd m16, m13 ; b5 * 164
+ pmaddwd m17, m13
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ pmaxsw m3, m6
+ paddusw m3, m14
+ psraw m3, 4 ; min(z5, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x5
+ pandn m2, m15, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ mova [t4+r10*1+416*0+4], m2
+ psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*0+ 8], xm16
+ mova [t3+r10*2+416*0+ 24], xm17
+ vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*0+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*0+120], m17, 3
+ add r10, 64
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
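+; in .prep_n/.n0/.n1 below, "565" denotes the 5/6/5 weighting of the three
+; neighboring 5x5 a/b columns, and "444"/"343" the 4/4/4 and 3/4/3
+; weightings applied to the 3x3 a/b values before the final blend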
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu ym0, [t4+r10*1+416*0+2]
+ paddw ym2, ym0, [t4+r10*1+416*0+0]
+ paddw ym2, [t4+r10*1+416*0+4]
+ movu m1, [t3+r10*2+416*0+4]
+ paddd m3, m1, [t3+r10*2+416*0+0]
+ paddd m3, [t3+r10*2+416*0+8]
+ paddw ym0, ym2
+ paddd m1, m3
+ psllw ym2, 2
+ pslld m3, 2
+ paddw ym0, ym2 ; a5 565
+ paddd m1, m3 ; b5 565
+ mova [t4+r10*1+416* 6], ym0
+ mova [t3+r10*2+416*12], m1
+ mova ym0, [t4+r10*1+416*2+0]
+ paddw ym0, [t4+r10*1+416*2+4]
+ paddw ym2, ym0, [t4+r10*1+416*2+2]
+ mova m1, [t3+r10*2+416*4+0]
+ paddd m1, [t3+r10*2+416*4+8]
+ paddd m3, m1, [t3+r10*2+416*4+4]
+ psllw ym2, 2 ; a3[-1] 444
+ pslld m3, 2 ; b3[-1] 444
+ psubw ym2, ym0 ; a3[-1] 343
+ psubd m3, m1 ; b3[-1] 343
+ mova [t4+r10*1+416* 8], ym2
+ mova [t3+r10*2+416*16], m3
+ mova ym0, [t4+r10*1+416*4+0]
+ paddw ym0, [t4+r10*1+416*4+4]
+ paddw ym2, ym0, [t4+r10*1+416*4+2]
+ mova m1, [t3+r10*2+416*8+0]
+ paddd m1, [t3+r10*2+416*8+8]
+ paddd m3, m1, [t3+r10*2+416*8+4]
+ psllw ym2, 2 ; a3[ 0] 444
+ pslld m3, 2 ; b3[ 0] 444
+ mova [t4+r10*1+416*10], ym2
+ mova [t3+r10*2+416*20], m3
+ psubw ym2, ym0 ; a3[ 0] 343
+ psubd m3, m1 ; b3[ 0] 343
+ mova [t4+r10*1+416*12], ym2
+ mova [t3+r10*2+416*24], m3
+ add r10, 32
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu ym2, [t4+r10*1+2]
+ paddw ym0, ym2, [t4+r10*1+0]
+ paddw ym0, [t4+r10*1+4]
+ paddw ym2, ym0
+ psllw ym0, 2
+ paddw ym0, ym2 ; a5
+ movu m1, [t3+r10*2+4]
+ paddd m4, m1, [t3+r10*2+0]
+ paddd m4, [t3+r10*2+8]
+ paddd m1, m4
+ pslld m4, 2
+ paddd m4, m1 ; b5
+ paddw ym2, ym0, [t4+r10*1+416* 6]
+ mova [t4+r10*1+416* 6], ym0
+ paddd m0, m4, [t3+r10*2+416*12]
+ mova [t3+r10*2+416*12], m4
+ mova ym3, [t4+r10*1+416*2+0]
+ paddw ym3, [t4+r10*1+416*2+4]
+ paddw ym5, ym3, [t4+r10*1+416*2+2]
+ psllw ym5, 2 ; a3[ 1] 444
+ psubw ym4, ym5, ym3 ; a3[ 1] 343
+ paddw ym3, ym4, [t4+r10*1+416* 8]
+ paddw ym3, [t4+r10*1+416*10]
+ mova [t4+r10*1+416* 8], ym4
+ mova [t4+r10*1+416*10], ym5
+ mova m1, [t3+r10*2+416*4+0]
+ paddd m1, [t3+r10*2+416*4+8]
+ paddd m5, m1, [t3+r10*2+416*4+4]
+ pslld m5, 2 ; b3[ 1] 444
+ psubd m4, m5, m1 ; b3[ 1] 343
+ paddd m1, m4, [t3+r10*2+416*16]
+ paddd m1, [t3+r10*2+416*20]
+ mova [t3+r10*2+416*16], m4
+ mova [t3+r10*2+416*20], m5
+ pmovzxwd m4, [dstq+r10]
+ pmovzxwd m2, ym2 ; a5
+ pmovzxwd m3, ym3 ; a3
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ vpshldd m4, m22, 13
+ psubd m0, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m1, m3 ; b3 - a3 * src + (1 << 8)
+ psrld m0, 9
+ pslld m1, 7
+ vpblendmb m0{k2}, m1, m0
+ vpdpwssd m4, m0, m7
+ psrad m4, 7
+ pmaxsd m4, m6
+ vpmovusdw ym16, m4 ; clip
+ psrlw ym16, 6
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova ym3, [t4+r10*1+416*4+0]
+ paddw ym3, [t4+r10*1+416*4+4]
+ paddw ym5, ym3, [t4+r10*1+416*4+2]
+ psllw ym5, 2 ; a3[ 1] 444
+ psubw ym4, ym5, ym3 ; a3[ 1] 343
+ paddw ym3, ym4, [t4+r10*1+416*12]
+ paddw ym3, [t4+r10*1+416*10]
+ mova [t4+r10*1+416*10], ym5
+ mova [t4+r10*1+416*12], ym4
+ mova m0, [t3+r10*2+416*8+0]
+ paddd m0, [t3+r10*2+416*8+8]
+ paddd m5, m0, [t3+r10*2+416*8+4]
+ pslld m5, 2 ; b3[ 1] 444
+ psubd m4, m5, m0 ; b3[ 1] 343
+ paddd m0, m4, [t3+r10*2+416*24]
+ paddd m0, [t3+r10*2+416*20]
+ mova [t3+r10*2+416*20], m5
+ mova [t3+r10*2+416*24], m4
+ pmovzxwd m4, [dstq+r10]
+ pmovzxwd m2, [t4+r10*1+416* 6]
+ pmovzxwd m3, ym3
+ mova m1, [t3+r10*2+416*12]
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ vpshldd m4, m22, 13
+ psubd m1, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m0, m3 ; b3 - a3 * src + (1 << 8)
+ pslld m0, 7
+ vpalignr m0{k2}, m1, m1, 1
+ vpdpwssd m4, m0, m7
+ psrad m4, 7
+ pmaxsd m4, m6
+ vpmovusdw ym16, m4 ; clip
+ psrlw ym16, 6
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/looprestoration16_sse.asm b/third_party/dav1d/src/x86/looprestoration16_sse.asm
new file mode 100644
index 0000000000..872e502982
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration16_sse.asm
@@ -0,0 +1,3723 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
+wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11
+wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
+wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1
+wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+wiener_lshuf5: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+wiener_lshuf7: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
+sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+pb_m14_m13: times 8 db -14,-13
+pb_m10_m9: times 8 db -10, -9
+pb_m6_m5: times 8 db -6, -5
+pb_m2_m1: times 8 db -2, -1
+pb_2_3: times 8 db 2, 3
+pb_6_7: times 8 db 6, 7
+pw_256: times 8 dw 256
+pw_1023: times 8 dw 1023
+pd_8: times 4 dd 8
+pd_4096: times 4 dd 4096
+pd_34816: times 4 dd 34816
+pd_m262128: times 4 dd -262128
+pd_0xffff: times 4 dd 0xffff
+pd_0xf00800a4: times 4 dd 0xf00800a4
+pd_0xf00801c7: times 4 dd 0xf00801c7
+pd_0xfffffff0: times 4 dd 0xfffffff0
+
+wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192
+wiener_round: dd 1049600, 1048832
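+; one entry pair per bitdepth: index 0 is used for 10 bpc (pixel_max 1023),
+; index 1 for 12 bpc (pixel_max 4095), selected via pixel_max >> 11 below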
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+%macro movif64 2 ; dst, src
+ %if ARCH_X86_64
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro movif32 2 ; dst, src
+ %if ARCH_X86_32
+ mov %1, %2
+ %endif
+%endmacro
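+; movif64/movif32 emit the mov only on the matching ABI, so the 64-bit and
+; 32-bit builds can share most of the code; on 32-bit the affected values
+; are kept in stack slots instead of registers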
+
+INIT_XMM ssse3
+%if ARCH_X86_32
+DECLARE_REG_TMP 5, 6
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 13*16
+ %else
+ %assign extra_stack 12*16
+ %endif
+cglobal wiener_filter7_16bpc, 4, 7, 8, -384*12-16-extra_stack, \
+ dst, stride, left, lpf, w, flt
+ %if STACK_ALIGNMENT < 16
+ %define lpfm dword [esp+calloff+16*12+ 0]
+ %define wm dword [esp+calloff+16*12+ 4]
+ %define hd dword [esp+calloff+16*12+ 8]
+ %define edgeb byte [esp+calloff+16*12+12]
+ %define edged dword [esp+calloff+16*12+12]
+ %else
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %endif
+ %define PICmem dword [esp+calloff+4*0]
+ %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers
+ %define t1m dword [esp+calloff+4*2]
+ %define t2m dword [esp+calloff+4*3]
+ %define t3m dword [esp+calloff+4*4]
+ %define t4m dword [esp+calloff+4*5]
+ %define t5m dword [esp+calloff+4*6]
+ %define t6m dword [esp+calloff+4*7]
+ %define t2 t2m
+ %define t3 t3m
+ %define t4 t4m
+ %define t5 t5m
+ %define t6 t6m
+ %define m8 [esp+calloff+16*2]
+ %define m9 [esp+calloff+16*3]
+ %define m10 [esp+calloff+16*4]
+ %define m11 [esp+calloff+16*5]
+ %define m12 [esp+calloff+16*6]
+ %define m13 [esp+calloff+16*7]
+ %define m14 [esp+calloff+16*8]
+ %define m15 [esp+calloff+16*9]
+ %define r10 r4
+ %define base t0-wiener_shifts
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov wd, [rstk+stack_offset+20]
+ mov wm, wd
+ mov r5, [rstk+stack_offset+24]
+ mov hd, r5
+ mov r5, [rstk+stack_offset+32]
+ mov edged, r5 ; edge
+ %endif
+%else
+DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
+cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt
+ %define base
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ movifnidn wd, wm
+%endif
+%if ARCH_X86_64
+ mov fltq, r6mp
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ movq m13, [fltq]
+ movq m15, [fltq+16]
+%else
+ %if STACK_ALIGNMENT < 16
+ mov t0, [rstk+stack_offset+28]
+ mov t1, [rstk+stack_offset+36] ; pixel_max
+ movq m1, [t0] ; fx
+ movq m3, [t0+16] ; fy
+ LEA t0, wiener_shifts
+ %else
+ mov fltq, r6m
+ movq m1, [fltq]
+ movq m3, [fltq+16]
+ LEA t0, wiener_shifts
+ mov t1, r8m ; pixel_max
+ %endif
+ mov PICmem, t0
+%endif
+ mova m6, [base+wiener_shufA]
+ mova m7, [base+wiener_shufB]
+%if ARCH_X86_64
+ lea t4, [wiener_shifts]
+ add wd, wd
+ pshufd m12, m13, q0000 ; x0 x1
+ pshufd m13, m13, q1111 ; x2 x3
+ pshufd m14, m15, q0000 ; y0 y1
+ pshufd m15, m15, q1111 ; y2 y3
+ mova m8, [wiener_shufC]
+ mova m9, [wiener_shufD]
+ add lpfq, wq
+ lea t1, [rsp+wq+16]
+ add dstq, wq
+ neg wq
+ shr t3d, 11
+ %define base t4-wiener_shifts
+ movd m10, [base+wiener_round+t3*4]
+ movq m11, [base+wiener_shifts+t3*8]
+ pshufd m10, m10, q0000
+ pshufd m0, m11, q0000
+ pshufd m11, m11, q1111
+ pmullw m12, m0 ; upshift filter coefs to make the
+ pmullw m13, m0 ; horizontal downshift constant
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+ %define base
+ %define wiener_lshuf7_mem [wiener_lshuf7]
+ %define pd_m262128_mem [pd_m262128]
+%else
+ add wd, wd
+ mova m4, [base+wiener_shufC]
+ mova m5, [base+wiener_shufD]
+ pshufd m0, m1, q0000
+ pshufd m1, m1, q1111
+ pshufd m2, m3, q0000
+ pshufd m3, m3, q1111
+ mova m8, m4
+ mova m9, m5
+ mova m14, m2
+ mova m15, m3
+ shr t1, 11
+ add lpfq, wq
+ mova m3, [base+pd_m262128]
+ movd m4, [base+wiener_round+t1*4]
+ movq m5, [base+wiener_shifts+t1*8]
+ lea t1, [esp+extra_stack+wq+16]
+ add dstq, wq
+ neg wq
+ pshufd m4, m4, q0000
+ pshufd m2, m5, q0000
+ pshufd m5, m5, q1111
+ mov wm, wq
+ pmullw m0, m2
+ pmullw m1, m2
+ mova m2, [base+wiener_lshuf7]
+ %define pd_m262128_mem [esp+calloff+16*10]
+ mova pd_m262128_mem, m3
+ mova m10, m4
+ mova m11, m5
+ mova m12, m0
+ mova m13, m1
+ %define wiener_lshuf7_mem [esp+calloff+16*11]
+ mova wiener_lshuf7_mem, m2
+%endif
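+; t1-t6 address per-row scratch buffers holding horizontally filtered rows;
+; the pointers are rotated by one position per output row so the 7-tap
+; vertical filter always sees the most recent seven rows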
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ add r10, strideq
+ mov lpfm, r10 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, lpfm
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+ movif32 wq, wm
+.v2:
+ call .v
+ movif32 wq, wm
+ jmp .v1
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+ movif32 t0, PICmem
+ pxor m0, m0
+ movd m1, wd
+ mova m2, [base+pb_0to15]
+ pshufb m1, m0
+ mova m0, [base+pb_6_7]
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m3, m0
+ mova m0, [base+pb_m2_m1]
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m4, m0
+ mova m0, [base+pb_m10_m9]
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m5, m0
+ movif32 t0, t0m
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
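+; horizontal pass: 7-tap filter as two pmaddwd pairs; the -(1 << 18) part of
+; the pd_m262128 bias keeps the intermediate rows within int16 range and, as
+; far as the constants suggest, is undone by the 1 << 20 term in wiener_round
+; during the vertical pass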
+.h:
+ movif64 wq, r4
+ movif32 wq, wm
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq m3, [leftq]
+ movhps m3, [lpfq+wq]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ mova m3, [lpfq+wq] ; avoid accessing memory located
+ pshufb m3, wiener_lshuf7_mem ; before the start of the buffer
+ jmp .h_main
+.h_top:
+ movif64 wq, r4
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+wq-8]
+.h_main:
+ mova m4, [lpfq+wq+0]
+ movu m5, [lpfq+wq+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -20
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m3, m6
+ pshufb m1, m4, m7
+ paddw m0, m1
+ pshufb m3, m8
+ pmaddwd m0, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ pmaddwd m3, m13
+ pshufb m2, m5, m7
+ paddw m1, m2
+ mova m2, pd_m262128_mem ; (1 << 4) - (1 << 18)
+ pshufb m4, m8
+ pmaddwd m1, m12
+ pshufb m5, m9
+ paddw m4, m5
+ pmaddwd m4, m13
+ paddd m0, m2
+ paddd m1, m2
+ paddd m0, m3
+ paddd m1, m4
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+wq], m0
+ add wq, 16
+ jl .h_loop
+ movif32 wq, wm
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ movif64 wq, r4
+ movif32 t0m, t0
+ movif32 t1m, t1
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movq m3, [leftq]
+ movhps m3, [lpfq+wq]
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ mova m3, [lpfq+wq]
+ pshufb m3, wiener_lshuf7_mem
+ jmp .hv_main
+.hv_bottom:
+ movif64 wq, r4
+ movif32 t0m, t0
+ movif32 t1m, t1
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+wq-8]
+.hv_main:
+ mova m4, [lpfq+wq+0]
+ movu m5, [lpfq+wq+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp wd, -20
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ movif32 t1, t4m
+ movif32 t0, t2m
+ pshufb m0, m3, m6
+ pshufb m1, m4, m7
+ paddw m0, m1
+ pshufb m3, m8
+ pmaddwd m0, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ pmaddwd m3, m13
+ pshufb m2, m5, m7
+ paddw m1, m2
+ mova m2, pd_m262128_mem
+ pshufb m4, m8
+ pmaddwd m1, m12
+ pshufb m5, m9
+ paddw m4, m5
+ pmaddwd m4, m13
+ paddd m0, m2
+ paddd m1, m2
+%if ARCH_X86_64
+ mova m2, [t4+wq]
+ paddw m2, [t2+wq]
+ mova m5, [t3+wq]
+%else
+ mova m2, [t1+wq]
+ paddw m2, [t0+wq]
+ mov t1, t3m
+ mov t0, t5m
+ mova m5, [t1+wq]
+ mov t1, t1m
+%endif
+ paddd m0, m3
+ paddd m1, m4
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+%if ARCH_X86_64
+ mova m4, [t5+wq]
+ paddw m4, [t1+wq]
+ psraw m0, 1
+ paddw m3, m0, [t6+wq]
+%else
+ mova m4, [t0+wq]
+ paddw m4, [t1+wq]
+ mov t0, t0m
+ mov t1, t6m
+ psraw m0, 1
+ paddw m3, m0, [t1+wq]
+%endif
+ mova [t0+wq], m0
+ punpcklwd m0, m2, m5
+ pmaddwd m0, m15
+ punpckhwd m2, m5
+ pmaddwd m2, m15
+ punpcklwd m1, m3, m4
+ pmaddwd m1, m14
+ punpckhwd m3, m4
+ pmaddwd m3, m14
+ paddd m0, m10
+ paddd m2, m10
+ paddd m0, m1
+ paddd m2, m3
+ psrad m0, 6
+ psrad m2, 6
+ packssdw m0, m2
+ pmulhw m0, m11
+ pxor m1, m1
+ pmaxsw m0, m1
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .hv_loop
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+%else
+ mov r4, t5m
+ mov t1, t4m
+ mov t6m, r4
+ mov t5m, t1
+ mov r4, t3m
+ mov t1, t2m
+ mov t4m, r4
+ mov t3m, t1
+ mov r4, t1m
+ mov t1, t0
+ mov t2m, r4
+ mov t0, t6m
+ mov wq, wm
+%endif
+ add dstq, strideq
+ ret
+.v:
+ movif64 wq, r4
+ movif32 t0m, t0
+ movif32 t1m, t1
+.v_loop:
+%if ARCH_X86_64
+ mova m1, [t4+wq]
+ paddw m1, [t2+wq]
+ mova m2, [t3+wq]
+ mova m4, [t1+wq]
+ paddw m3, m4, [t6+wq]
+ paddw m4, [t5+wq]
+%else
+ mov t0, t4m
+ mov t1, t2m
+ mova m1, [t0+wq]
+ paddw m1, [t1+wq]
+ mov t0, t3m
+ mov t1, t1m
+ mova m2, [t0+wq]
+ mova m4, [t1+wq]
+ mov t0, t6m
+ mov t1, t5m
+ paddw m3, m4, [t0+wq]
+ paddw m4, [t1+wq]
+%endif
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m15
+ punpckhwd m1, m2
+ pmaddwd m1, m15
+ punpcklwd m2, m3, m4
+ pmaddwd m2, m14
+ punpckhwd m3, m4
+ pmaddwd m3, m14
+ paddd m0, m10
+ paddd m1, m10
+ paddd m0, m2
+ paddd m1, m3
+ psrad m0, 6
+ psrad m1, 6
+ packssdw m0, m1
+ pmulhw m0, m11
+ pxor m1, m1
+ pmaxsw m0, m1
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .v_loop
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+%else
+ mov t0, t5m
+ mov t1, t4m
+ mov r4, t3m
+ mov t6m, t0
+ mov t5m, t1
+ mov t4m, r4
+ mov r4, t2m
+ mov t1, t1m
+ mov t0, t0m
+ mov t3m, r4
+ mov t2m, t1
+%endif
+ add dstq, strideq
+ ret
+
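+; wiener_filter5: the 5-tap variant (used when the outermost filter taps are
+; zero); it follows the same structure as wiener_filter7 but only needs
+; five row buffers (t0-t4)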
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign stack_size 12*16+384*8
+ %else
+ %assign stack_size 11*16+384*8
+ %endif
+cglobal wiener_filter5_16bpc, 4, 7, 8, -stack_size, dst, stride, left, \
+ lpf, w, flt
+ %if STACK_ALIGNMENT < 16
+ %define lpfm dword [esp+calloff+4*6]
+ %define wm dword [esp+calloff+4*7]
+ %define hd dword [esp+calloff+16*10+0]
+ %define edgeb byte [esp+calloff+16*10+4]
+ %define edged dword [esp+calloff+16*10+4]
+ %else
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %endif
+ %define PICmem dword [esp+calloff+4*0]
+ %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers
+ %define t1m dword [esp+calloff+4*2]
+ %define t2m dword [esp+calloff+4*3]
+ %define t3m dword [esp+calloff+4*4]
+ %define t4m dword [esp+calloff+4*5]
+ %define t2 t2m
+ %define t3 t3m
+ %define t4 t4m
+ %define m8 [esp+calloff+16*2]
+ %define m9 [esp+calloff+16*3]
+ %define m10 [esp+calloff+16*4]
+ %define m11 [esp+calloff+16*5]
+ %define m12 [esp+calloff+16*6]
+ %define m13 [esp+calloff+16*7]
+ %define m14 [esp+calloff+16*8]
+ %define m15 [esp+calloff+16*9]
+ %define base t0-wiener_shifts
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov wd, [rstk+stack_offset+20]
+ mov wm, wd
+ mov r5, [rstk+stack_offset+24]
+ mov hd, r5
+ mov r5, [rstk+stack_offset+32]
+ mov edged, r5 ; edge
+ %endif
+%else
+cglobal wiener_filter5_16bpc, 4, 14, 16, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt
+ %define base
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ movifnidn wd, wm
+%endif
+%if ARCH_X86_64
+ mov fltq, r6mp
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ movq m12, [fltq]
+ movq m14, [fltq+16]
+%else
+ %if STACK_ALIGNMENT < 16
+ mov t0, [rstk+stack_offset+28]
+ mov t1, [rstk+stack_offset+36] ; pixel_max
+ movq m1, [t0] ; fx
+ movq m3, [t0+16] ; fy
+ LEA t0, wiener_shifts
+ %else
+ mov fltq, r6m
+ movq m1, [fltq]
+ movq m3, [fltq+16]
+ LEA t0, wiener_shifts
+ mov t1, r8m ; pixel_max
+ %endif
+ mov PICmem, t0
+%endif
+ mova m5, [base+wiener_shufE]
+ mova m6, [base+wiener_shufB]
+ mova m7, [base+wiener_shufD]
+%if ARCH_X86_64
+ lea t4, [wiener_shifts]
+ add wd, wd
+ punpcklwd m11, m12, m12
+ pshufd m11, m11, q1111 ; x1
+ pshufd m12, m12, q1111 ; x2 x3
+ punpcklwd m13, m14, m14
+ pshufd m13, m13, q1111 ; y1
+ pshufd m14, m14, q1111 ; y2 y3
+ shr t3d, 11
+ mova m8, [pd_m262128] ; (1 << 4) - (1 << 18)
+ add lpfq, wq
+ lea t1, [rsp+wq+16]
+ add dstq, wq
+ neg wq
+ %define base t4-wiener_shifts
+ movd m9, [base+wiener_round+t3*4]
+ movq m10, [base+wiener_shifts+t3*8]
+ pshufd m9, m9, q0000
+ pshufd m0, m10, q0000
+ pshufd m10, m10, q1111
+ mova m15, [wiener_lshuf5]
+ pmullw m11, m0
+ pmullw m12, m0
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+ %define base
+%else
+ add wd, wd
+ punpcklwd m0, m1, m1
+ pshufd m0, m0, q1111 ; x1
+ pshufd m1, m1, q1111 ; x2 x3
+ punpcklwd m2, m3, m3
+ pshufd m2, m2, q1111 ; y1
+ pshufd m3, m3, q1111 ; y2 y3
+ mova m4, [base+pd_m262128] ; (1 << 4) - (1 << 18)
+ mova m13, m2
+ mova m14, m3
+ mova m8, m4
+ shr t1, 11
+ add lpfq, wq
+ movd m2, [base+wiener_round+t1*4]
+ movq m3, [base+wiener_shifts+t1*8]
+ %if STACK_ALIGNMENT < 16
+ lea t1, [esp+16*11+wq+16]
+ %else
+ lea t1, [esp+16*10+wq+16]
+ %endif
+ add dstq, wq
+ neg wq
+ pshufd m2, m2, q0000
+ pshufd m4, m3, q0000
+ pshufd m3, m3, q1111
+ mov wm, wq
+ pmullw m0, m4
+ pmullw m1, m4
+ mova m4, [base+wiener_lshuf5]
+ mova m9, m2
+ mova m10, m3
+ mova m11, m0
+ mova m12, m1
+ mova m15, m4
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t3, t1
+ add t1, 384*2
+ add r10, strideq
+ mov lpfm, r10 ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, lpfm
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call .v
+%if ARCH_X86_64
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+%else
+ mov t0, t3m
+ mov r4, t2m
+ mov t1, t1m
+ mov t4m, t0
+ mov t3m, r4
+ mov t2m, t1
+ mov wq, wm
+%endif
+ add dstq, strideq
+.v1:
+ call .v
+ jmp .end
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+ movif32 t0, PICmem
+ pxor m1, m1
+ movd m2, wd
+ mova m0, [base+pb_2_3]
+ pshufb m2, m1
+ mova m1, [base+pb_m6_m5]
+ psubb m0, m2
+ psubb m1, m2
+ mova m2, [base+pb_0to15]
+ pminub m0, m2
+ pminub m1, m2
+ pshufb m3, m0
+ pshufb m4, m1
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
+.h:
+ movif64 wq, r4
+ movif32 wq, wm
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ mova m4, [lpfq+wq]
+ movd m3, [leftq+4]
+ pslldq m4, 4
+ por m3, m4
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ mova m3, [lpfq+wq] ; avoid accessing memory located
+ pshufb m3, m15 ; before the start of the buffer
+ jmp .h_main
+.h_top:
+ movif64 wq, r4
+ movif32 wq, wm
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+wq-4]
+.h_main:
+ movu m4, [lpfq+wq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -18
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m3, m5
+ pmaddwd m0, m11
+ pshufb m1, m4, m5
+ pmaddwd m1, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ pmaddwd m2, m12
+ pshufb m4, m7
+ paddw m3, m4
+ pmaddwd m3, m12
+ paddd m0, m8
+ paddd m1, m8
+ paddd m0, m2
+ paddd m1, m3
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+wq], m0
+ add wq, 16
+ jl .h_loop
+ movif32 wq, wm
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ movif64 wq, r4
+ movif32 t0m, t0
+ movif32 t1m, t1
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ mova m4, [lpfq+wq]
+ movd m3, [leftq+4]
+ pslldq m4, 4
+ por m3, m4
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ mova m3, [lpfq+wq]
+ pshufb m3, m15
+ jmp .hv_main
+.hv_bottom:
+ movif64 wq, r4
+ movif32 t0m, t0
+ movif32 t1m, t1
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+wq-4]
+.hv_main:
+ movu m4, [lpfq+wq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp wd, -18
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ movif32 t1, t1m
+ movif32 t0, t3m
+ pshufb m0, m3, m5
+ pmaddwd m0, m11
+ pshufb m1, m4, m5
+ pmaddwd m1, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ pmaddwd m2, m12
+ pshufb m4, m7
+ paddw m3, m4
+ pmaddwd m3, m12
+ paddd m0, m8
+ paddd m1, m8
+ paddd m0, m2
+%if ARCH_X86_64
+ mova m2, [t3+wq]
+ paddw m2, [t1+wq]
+ paddd m1, m3
+ mova m4, [t2+wq]
+%else
+ mova m2, [t0+wq]
+ mov t0, t2m
+ paddw m2, [t1+wq]
+ mov t1, t4m
+ paddd m1, m3
+ mova m4, [t0+wq]
+ mov t0, t0m
+%endif
+ punpckhwd m3, m2, m4
+ pmaddwd m3, m14
+ punpcklwd m2, m4
+%if ARCH_X86_64
+ mova m4, [t4+wq]
+%else
+ mova m4, [t1+wq]
+%endif
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ pmaddwd m2, m14
+ psraw m0, 1
+ mova [t0+wq], m0
+ punpckhwd m1, m0, m4
+ pmaddwd m1, m13
+ punpcklwd m0, m4
+ pmaddwd m0, m13
+ paddd m3, m9
+ paddd m2, m9
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 6
+ psrad m0, 6
+ packssdw m0, m1
+ pmulhw m0, m10
+ pxor m1, m1
+ pmaxsw m0, m1
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .hv_loop
+%if ARCH_X86_64
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+%else
+ mov r4, t3m
+ mov t1, t2m
+ mov t4m, r4
+ mov t3m, t1
+ mov r4, t1m
+ mov t1, t0
+ mov t2m, r4
+ mov t0, t4m
+ mov wq, wm
+%endif
+ add dstq, strideq
+ ret
+.v:
+ movif64 wq, r4
+ movif32 t1m, t1
+.v_loop:
+%if ARCH_X86_64
+ mova m0, [t1+wq]
+ paddw m2, m0, [t3+wq]
+ mova m1, [t2+wq]
+ mova m4, [t4+wq]
+%else
+ mov t0, t3m
+ mova m0, [t1+wq]
+ mov t1, t2m
+ paddw m2, m0, [t0+wq]
+ mov t0, t4m
+ mova m1, [t1+wq]
+ mova m4, [t0+wq]
+%endif
+ punpckhwd m3, m2, m1
+ pmaddwd m3, m14
+ punpcklwd m2, m1
+ pmaddwd m2, m14
+ punpckhwd m1, m0, m4
+ pmaddwd m1, m13
+ punpcklwd m0, m4
+ pmaddwd m0, m13
+ paddd m3, m9
+ paddd m2, m9
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 6
+ psrad m0, 6
+ packssdw m0, m1
+ pmulhw m0, m10
+ pxor m1, m1
+ pmaxsw m0, m1
+ mova [dstq+wq], m0
+ add wq, 16
+%if ARCH_X86_64
+ jl .v_loop
+%else
+ jge .v_end
+ mov t1, t1m
+ jmp .v_loop
+.v_end:
+%endif
+ ret
+
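+; GATHERDD emulates a per-lane dword gather from the sgr_x_by_x table using
+; movd/pextrw/pinsrw, since SSSE3 has no gather instruction; GATHER_X_BY_X
+; keeps the top byte of each loaded dword, which (together with the -0xf03
+; bias on the table pointer) cancels the 0xf00 offset that the paddusw
+; clamping trick leaves in the index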
+%macro GATHERDD 3 ; dst, src, tmp
+ movd %3d, %2
+ %if ARCH_X86_64
+ movd %1, [r13+%3]
+ pextrw %3d, %2, 2
+ pinsrw %1, [r13+%3+2], 3
+ pextrw %3d, %2, 4
+ pinsrw %1, [r13+%3+2], 5
+ pextrw %3d, %2, 6
+ pinsrw %1, [r13+%3+2], 7
+ %else
+ movd %1, [base+sgr_x_by_x-0xf03+%3]
+ pextrw %3, %2, 2
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3
+ pextrw %3, %2, 4
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5
+ pextrw %3, %2, 6
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7
+ %endif
+%endmacro
+
+%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
+ %if ARCH_X86_64
+ %define tmp r14
+ %else
+ %define tmp %4
+ %endif
+ GATHERDD %1, %2, tmp
+ GATHERDD %2, %3, tmp
+ movif32 %4, %5
+ psrld %1, 24
+ psrld %2, 24
+ packssdw %1, %2
+%endmacro
+
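+; MAXSD emulates pmaxsd (SSE4.1) with a compare-and-blend; the optional
+; fourth argument clears the temporary register afterwards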
+%macro MAXSD 3-4 0 ; dst, src, restore_tmp
+ pcmpgtd %3, %1, %2
+ pand %1, %3
+ pandn %3, %2
+ por %1, %3
+ %if %4 == 1
+ pxor %3, %3
+ %endif
+%endmacro
+
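+; MULLD emulates pmulld (SSE4.1): it produces the low 32 bits of the product,
+; assuming src carries the 16-bit factor duplicated into both halves of each
+; dword (as produced by pshufb with pw_256 or punpck?wd x, x)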
+%macro MULLD 3 ; dst, src, tmp
+ pmulhuw %3, %1, %2
+ pmullw %1, %2
+ pslld %3, 16
+ paddd %1, %3
+%endmacro
+
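+; sgr_filter_5x5: self-guided filter with a 5x5 box. Each pass accumulates
+; the box sum (b) and box sum of squares (a), then p = max(a*25 - b*b, 0),
+; z = min((p*s0 + (1 << 19)) >> 20, 255), x = sgr_x_by_x[z], and x together
+; with (x*b*164 + (1 << 11) + (1 << 15)) >> 12 feeds the 565-weighted
+; output passes (.prep_n/.n0/.n1)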
+%if ARCH_X86_32
+DECLARE_REG_TMP 0, 1, 2, 3, 5
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 5*16
+ %else
+ %assign extra_stack 3*16
+ %endif
+cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*0+4*6]
+ %define stridemp dword [esp+calloff+16*0+4*7]
+ %define leftm dword [esp+calloff+16*3+4*0]
+ %define lpfm dword [esp+calloff+16*3+4*1]
+ %define w0m dword [esp+calloff+16*3+4*2]
+ %define hd dword [esp+calloff+16*3+4*3]
+ %define edgeb byte [esp+calloff+16*3+4*4]
+ %define edged dword [esp+calloff+16*3+4*4]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t0m dword [esp+calloff+4*2]
+ %define t2m dword [esp+calloff+4*3]
+ %define t3m dword [esp+calloff+4*4]
+ %define t4m dword [esp+calloff+4*5]
+ %define m8 [base+pd_8]
+ %define m9 [base+pd_0xfffffff0]
+ %define m10 [esp+calloff+16*2]
+ %define m11 [base+pd_0xf00800a4]
+ %define m12 [base+sgr_lshuf5]
+ %define m13 [base+pd_34816]
+ %define m14 [base+pw_1023]
+ %define r10 r4
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ movifnidn wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ add wd, wd
+ mov edged, r7m
+ movu m10, [paramsq]
+ mova m12, [sgr_lshuf5]
+ add lpfq, wq
+ mova m8, [pd_8]
+ lea t1, [rsp+wq+20]
+ mova m9, [pd_0xfffffff0]
+ add dstq, wq
+ lea t3, [rsp+wq*2+400*12+16]
+ mova m11, [pd_0xf00800a4]
+ lea t4, [rsp+wq+400*20+16]
+ pshufhw m7, m10, q0000
+ pshufb m10, [pw_256] ; s0
+ punpckhqdq m7, m7 ; w0
+ neg wq
+ mova m13, [pd_34816] ; (1 << 11) + (1 << 15)
+ pxor m6, m6
+ mova m14, [pw_1023]
+ psllw m7, 4
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ add wd, wd
+ movu m1, [r1]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq+20]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*2+400*12+16]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq+400*20+16]
+ mov t3m, t3
+ pshufhw m7, m1, q0000
+ mov t4m, t4
+ pshufb m1, [base+pw_256] ; s0
+ punpckhqdq m7, m7 ; w0
+ psllw m7, 4
+ neg wq
+ mova m10, m1
+ pxor m6, m6
+ mov w1m, wd
+ sub wd, 4
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ movif32 t2m, t1
+ mov t2, t1
+ call .top_fixup
+ add t1, 400*6
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t0m, t2
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, stridemp
+ movif32 t4, t4m
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+%if ARCH_X86_64
+ test hb, hb
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ call .h
+ add lpfq, stridemp
+ call .hv
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+ sub hd, 2
+ movif32 t0, t0m
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .h_top
+ add lpfq, stridemp
+ call .hv_bottom
+.end:
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ movif32 t4, t4m
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ movif32 dstq, dstm
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+ lea t2, [t1+400*6]
+ movif32 t2m, t2
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ movif32 t0m, t0
+ jmp .main
+.no_top_height1:
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.extend_right:
+ movd m0, wd
+ movd m1, [lpfq-2]
+ mova m2, [base+pw_256]
+ mova m3, [base+pb_m14_m13]
+ pshufb m0, m6
+ pshufb m1, m2
+ psubb m2, m0
+ psubb m3, m0
+ mova m0, [base+pb_0to15]
+ pcmpgtb m2, m0
+ pcmpgtb m3, m0
+ pand m4, m2
+ pand m5, m3
+ pandn m2, m1
+ pandn m3, m1
+ por m4, m2
+ por m5, m3
+ ret
+%assign stack_offset stack_offset+4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 10
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, m12
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m4, [lpfq+wq- 2]
+.h_main:
+ movu m5, [lpfq+wq+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -20
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ palignr m2, m5, m4, 2
+ paddw m0, m4, m2
+ palignr m3, m5, m4, 6
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ palignr m5, m4, 8
+ paddw m0, m5
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m3
+ paddd m1, m3
+ punpckhwd m3, m4, m5
+ pmaddwd m3, m3
+ shufps m4, m5, q2121
+ paddw m0, m4 ; sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m2, m3
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+wq+400*0]
+ paddd m1, [t1+wq+400*2]
+ paddd m2, [t1+wq+400*4]
+.h_loop_end:
+ paddd m1, m5 ; sumsq
+ paddd m2, m4
+ mova [t1+wq+400*0], m0
+ mova [t1+wq+400*2], m1
+ mova [t1+wq+400*4], m2
+ add wq, 16
+ jl .h_loop
+ ret
+.top_fixup:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+wq+400*0]
+ mova m1, [t1+wq+400*2]
+ mova m2, [t1+wq+400*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m1
+ mova [t2+wq+400*4], m2
+ add wq, 16
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 10
+ jmp .hv_main
+.hv_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, m12
+ jmp .hv_main
+.hv_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv_loop_start
+%endif
+.hv_loop:
+ movif32 lpfq, hvsrcm
+.hv_loop_start:
+ movu m4, [lpfq+wq- 2]
+.hv_main:
+ movu m5, [lpfq+wq+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp wd, -20
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ movif32 t3, hd
+ palignr m3, m5, m4, 2
+ paddw m0, m4, m3
+ palignr m1, m5, m4, 6
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 8
+ paddw m0, m5
+ punpcklwd m1, m4, m5
+ pmaddwd m1, m1
+ paddd m2, m1
+ punpckhwd m1, m4, m5
+ pmaddwd m1, m1
+ shufps m4, m5, q2121
+ paddw m0, m4 ; h sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m3, m1
+ paddd m2, m5 ; h sumsq
+ paddd m3, m4
+ paddw m1, m0, [t1+wq+400*0]
+ paddd m4, m2, [t1+wq+400*2]
+ paddd m5, m3, [t1+wq+400*4]
+%if ARCH_X86_64
+ test hd, hd
+%else
+ test t3, t3
+%endif
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+wq+400*0] ; hv sum
+ paddd m4, [t2+wq+400*2] ; hv sumsq
+ paddd m5, [t2+wq+400*4]
+ mova [t0+wq+400*0], m0
+ mova [t0+wq+400*2], m2
+ mova [t0+wq+400*4], m3
+ psrlw m3, m1, 1
+ paddd m4, m8
+ pavgw m3, m6 ; (b + 2) >> 2
+ paddd m5, m8
+ pand m4, m9 ; ((a + 8) >> 4) << 4
+ pand m5, m9
+ psrld m2, m4, 4
+ psrld m0, m5, 4
+ paddd m2, m4
+ psrld m4, 1
+ paddd m0, m5
+ psrld m5, 1
+ paddd m4, m2 ; a * 25
+ paddd m5, m0
+ punpcklwd m2, m3, m6
+ punpckhwd m3, m6
+ pmaddwd m2, m2 ; b * b
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m6
+ MAXSD m5, m3, m6, 1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m10, m2 ; p * s
+ MULLD m5, m10, m2
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, t2, t2m
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m2
+ MULLD m1, m5, m2
+ paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ mova [t4+wq+4], m3
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+wq*2+ 8], m0
+ mova [t3+wq*2+24], m1
+ add wq, 16
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ movif32 t2m, t2
+ movif32 t0m, t0
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+wq+400*0], m1
+ paddw m1, m0
+ mova [t1+wq+400*2], m4
+ paddd m4, m2
+ mova [t1+wq+400*4], m5
+ paddd m5, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.v_loop:
+ mova m0, [t1+wq+400*0]
+ mova m2, [t1+wq+400*2]
+ mova m3, [t1+wq+400*4]
+ paddw m1, m0, [t2+wq+400*0]
+ paddd m4, m2, [t2+wq+400*2]
+ paddd m5, m3, [t2+wq+400*4]
+ paddw m0, m0
+ paddd m2, m2
+ paddd m3, m3
+ paddw m1, m0 ; hv sum
+ paddd m4, m2 ; hv sumsq
+ paddd m5, m3
+ psrlw m3, m1, 1
+ paddd m4, m8
+ pavgw m3, m6 ; (b + 2) >> 2
+ paddd m5, m8
+ pand m4, m9 ; ((a + 8) >> 4) << 4
+ pand m5, m9
+ psrld m2, m4, 4
+ psrld m0, m5, 4
+ paddd m2, m4
+ psrld m4, 1
+ paddd m0, m5
+ psrld m5, 1
+ paddd m4, m2 ; a * 25
+ paddd m5, m0
+ punpcklwd m2, m3, m6
+ punpckhwd m3, m6
+ pmaddwd m2, m2 ; b * b
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m6
+ MAXSD m5, m3, m6, 1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m10, m2 ; p * s
+ MULLD m5, m10, m2
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, t2, t2m
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m2
+ MULLD m1, m5, m2
+ paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ mova [t4+wq+4], m3
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+wq*2+ 8], m0
+ mova [t3+wq*2+24], m1
+ add wq, 16
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*1+ 2]
+ movu m3, [t4+wq*1+ 4]
+ movu m1, [t3+wq*2+ 4]
+ movu m4, [t3+wq*2+ 8]
+ movu m2, [t3+wq*2+20]
+ movu m5, [t3+wq*2+24]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ paddw m3, [t4+wq*1+ 0]
+ paddd m4, [t3+wq*2+ 0]
+ paddd m5, [t3+wq*2+16]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ mova [t4+wq*1+400*2+ 0], m0
+ mova [t3+wq*2+400*4+ 0], m1
+ mova [t3+wq*2+400*4+16], m2
+ add wq, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m0, [t4+wq*1+ 2]
+ movu m3, [t4+wq*1+ 4]
+ movu m1, [t3+wq*2+ 4]
+ movu m4, [t3+wq*2+ 8]
+ movu m2, [t3+wq*2+20]
+ movu m5, [t3+wq*2+24]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ paddw m3, [t4+wq*1+ 0]
+ paddd m4, [t3+wq*2+ 0]
+ paddd m5, [t3+wq*2+16]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ paddw m3, m0, [t4+wq*1+400*2+ 0]
+ paddd m4, m1, [t3+wq*2+400*4+ 0]
+ paddd m5, m2, [t3+wq*2+400*4+16]
+ mova [t4+wq*1+400*2+ 0], m0
+ mova [t3+wq*2+400*4+ 0], m1
+ mova [t3+wq*2+400*4+16], m2
+ mova m0, [dstq+wq]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ mova m0, [dstq+wq]
+ mova m3, [t4+wq*1+400*2+ 0]
+ mova m4, [t3+wq*2+400*4+ 0]
+ mova m5, [t3+wq*2+400*4+16]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 7)
+ psubd m5, m3
+ psrad m4, 8
+ psrad m5, 8
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
+
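+; sgr_filter_3x3: same scheme with a 3x3 box, so p = max(a*9 - b*b, 0) and
+; the one_by_x factor is 455; even and odd rows get separate hv0/hv1 passes
+; and the output uses the 343/444 neighbor weighting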
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 4*16
+ %else
+ %assign extra_stack 2*16
+ %endif
+cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*2+4*0]
+ %define stridemp dword [esp+calloff+16*2+4*1]
+ %define leftm dword [esp+calloff+16*2+4*2]
+ %define lpfm dword [esp+calloff+16*2+4*3]
+ %define w0m dword [esp+calloff+16*2+4*4]
+ %define hd dword [esp+calloff+16*2+4*5]
+ %define edgeb byte [esp+calloff+16*2+4*6]
+ %define edged dword [esp+calloff+16*2+4*6]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t3m dword [esp+calloff+4*2]
+ %define t4m dword [esp+calloff+4*3]
+ %define m8 [base+pd_8]
+ %define m9 [esp+calloff+16*1]
+ %define m10 [base+pd_0xf00801c7]
+ %define m11 [base+pd_34816]
+ %define m12 [base+sgr_lshuf3]
+ %define m13 [base+pw_1023]
+ %define m14 m6
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_3x3_16bpc, 4, 15, 15, -400*42-8, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ movifnidn wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ add wd, wd
+ mov edged, r7m
+ movq m9, [paramsq+4]
+ add lpfq, wq
+ lea t1, [rsp+wq+12]
+ mova m8, [pd_8]
+ add dstq, wq
+ lea t3, [rsp+wq*2+400*12+8]
+ mova m10, [pd_0xf00801c7]
+ lea t4, [rsp+wq+400*32+8]
+ mova m11, [pd_34816]
+ pshuflw m7, m9, q3333
+ pshufb m9, [pw_256] ; s1
+ punpcklqdq m7, m7 ; w1
+ neg wq
+ pxor m6, m6
+ mova m13, [pw_1023]
+ psllw m7, 4
+ mova m12, [sgr_lshuf3]
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ add wd, wd
+ movq m1, [r1+4]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq+20]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*2+400*12+16]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq+400*32+16]
+ mov t3m, t3
+ pshuflw m7, m1, q3333
+ mov t4m, t4
+ pshufb m1, [base+pw_256] ; s1
+ punpcklqdq m7, m7 ; w1
+ psllw m7, 4
+ neg wq
+ mova m9, m1
+ pxor m6, m6
+ mov w1m, wd
+ sub wd, 4
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ mov t2, t1
+ add t1, 400*6
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t4, t4m
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv0
+%if ARCH_X86_64
+ test hb, hb
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .hv0_bottom
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wq, w0m
+ mov hvsrcm, lpfq
+%endif
+ lea t2, [t1+400*6]
+.top_fixup_loop:
+ mova m0, [t1+wq+400*0]
+ mova m1, [t1+wq+400*2]
+ mova m2, [t1+wq+400*4]
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m1
+ mova [t2+wq+400*4], m2
+ add wq, 16
+ jl .top_fixup_loop
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v0
+ jmp .main
+.extend_right:
+ movd m1, wd
+ movd m5, [lpfq-2]
+ mova m2, [base+pw_256]
+ mova m3, [base+pb_0to15]
+ pshufb m1, m6
+ pshufb m5, m2
+ psubb m2, m1
+ pcmpgtb m2, m3
+ pand m4, m2
+ pandn m2, m5
+ por m4, m2
+ ret
+%assign stack_offset stack_offset+4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 12
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, m12
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m4, [lpfq+wq+ 0]
+.h_main:
+ movu m5, [lpfq+wq+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -18
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ mova [t1+wq+400*0], m1
+ mova [t1+wq+400*2], m2
+ mova [t1+wq+400*4], m3
+ add wq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 12
+ jmp .hv0_main
+.hv0_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, m12
+ jmp .hv0_main
+.hv0_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv0_loop_start
+%endif
+.hv0_loop:
+ movif32 lpfq, hvsrcm
+.hv0_loop_start:
+ movu m4, [lpfq+wq+ 0]
+.hv0_main:
+ movu m5, [lpfq+wq+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp wd, -18
+ jl .hv0_have_right
+ call .extend_right
+.hv0_have_right:
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ paddw m0, m1, [t1+wq+400*0]
+ paddd m4, m2, [t1+wq+400*2]
+ paddd m5, m3, [t1+wq+400*4]
+ mova [t1+wq+400*0], m1
+ mova [t1+wq+400*2], m2
+ mova [t1+wq+400*4], m3
+ paddw m1, m0, [t2+wq+400*0]
+ paddd m2, m4, [t2+wq+400*2]
+ paddd m3, m5, [t2+wq+400*4]
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m4
+ mova [t2+wq+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m14
+ MAXSD m5, m3, m14
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m14 ; p * s
+ MULLD m5, m9, m14
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m14
+ MULLD m1, m5, m14
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+ 8], m0
+ mova [t3+wq*2+24], m1
+ add wq, 16
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 12
+ jmp .hv1_main
+.hv1_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, m12
+ jmp .hv1_main
+.hv1_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv1_loop_start
+%endif
+.hv1_loop:
+ movif32 lpfq, hvsrcm
+.hv1_loop_start:
+ movu m4, [lpfq+wq+ 0]
+.hv1_main:
+ movu m5, [lpfq+wq+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp wd, -18
+ jl .hv1_have_right
+ call .extend_right
+.hv1_have_right:
+ palignr m1, m5, m4, 2
+ paddw m0, m4, m1
+ punpcklwd m2, m4, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m0, m5 ; h sum
+ punpcklwd m1, m5, m6
+ pmaddwd m1, m1
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m1 ; h sumsq
+ paddd m3, m5
+ paddw m1, m0, [t2+wq+400*0]
+ paddd m4, m2, [t2+wq+400*2]
+ paddd m5, m3, [t2+wq+400*4]
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m2
+ mova [t2+wq+400*4], m3
+ paddd m4, m8
+ paddd m5, m8
+ psrld m4, 4 ; (a + 8) >> 4
+ psrld m5, 4
+ pslld m2, m4, 3
+ pslld m3, m5, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m14
+ MAXSD m5, m3, m14
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m14 ; p * s
+ MULLD m5, m9, m14
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m14
+ MULLD m1, m5, m14
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*1+400*2 +4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+400*4+ 8], m0
+ mova [t3+wq*2+400*4+24], m1
+ add wq, 16
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.v0_loop:
+ mova m0, [t1+wq+400*0]
+ mova m4, [t1+wq+400*2]
+ mova m5, [t1+wq+400*4]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+wq+400*0]
+ paddd m2, m4, [t2+wq+400*2]
+ paddd m3, m5, [t2+wq+400*4]
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m4
+ mova [t2+wq+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m14
+ MAXSD m5, m3, m14
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m14 ; p * s
+ MULLD m5, m9, m14
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m14
+ MULLD m1, m5, m14
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*1+400*0+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+400*0+ 8], m0
+ mova [t3+wq*2+400*0+24], m1
+ add wq, 16
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.v1_loop:
+ mova m0, [t1+wq+400*0]
+ mova m4, [t1+wq+400*2]
+ mova m5, [t1+wq+400*4]
+ paddw m1, m0, [t2+wq+400*0]
+ paddd m2, m4, [t2+wq+400*2]
+ paddd m3, m5, [t2+wq+400*4]
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m4
+ mova [t2+wq+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m14
+ MAXSD m5, m3, m14
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m14 ; p * s
+ MULLD m5, m9, m14
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m14
+ MULLD m1, m5, m14
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*1+400*2+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+400*4+ 8], m0
+ mova [t3+wq*2+400*4+24], m1
+ add wq, 16
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*1+400*0+ 4]
+ movu m1, [t3+wq*2+400*0+ 8]
+ movu m2, [t3+wq*2+400*0+24]
+ movu m3, [t4+wq*1+400*0+ 2]
+ movu m4, [t3+wq*2+400*0+ 4]
+ movu m5, [t3+wq*2+400*0+20]
+ paddw m0, [t4+wq*1+400*0+ 0]
+ paddd m1, [t3+wq*2+400*0+ 0]
+ paddd m2, [t3+wq*2+400*0+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a[-1] 444
+ pslld m4, 2 ; b[-1] 444
+ pslld m5, 2
+ psubw m3, m0 ; a[-1] 343
+ psubd m4, m1 ; b[-1] 343
+ psubd m5, m2
+ mova [t4+wq*1+400*4], m3
+ mova [t3+wq*2+400*8+ 0], m4
+ mova [t3+wq*2+400*8+16], m5
+ movu m0, [t4+wq*1+400*2+ 4]
+ movu m1, [t3+wq*2+400*4+ 8]
+ movu m2, [t3+wq*2+400*4+24]
+ movu m3, [t4+wq*1+400*2+ 2]
+ movu m4, [t3+wq*2+400*4+ 4]
+ movu m5, [t3+wq*2+400*4+20]
+ paddw m0, [t4+wq*1+400*2+ 0]
+ paddd m1, [t3+wq*2+400*4+ 0]
+ paddd m2, [t3+wq*2+400*4+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a[ 0] 444
+ pslld m4, 2 ; b[ 0] 444
+ pslld m5, 2
+ mova [t4+wq*1+400* 6], m3
+ mova [t3+wq*2+400*12+ 0], m4
+ mova [t3+wq*2+400*12+16], m5
+ psubw m3, m0 ; a[ 0] 343
+ psubd m4, m1 ; b[ 0] 343
+ psubd m5, m2
+ mova [t4+wq*1+400* 8], m3
+ mova [t3+wq*2+400*16+ 0], m4
+ mova [t3+wq*2+400*16+16], m5
+ add wq, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m3, [t4+wq*1+400*0+4]
+ movu m1, [t4+wq*1+400*0+2]
+ paddw m3, [t4+wq*1+400*0+0]
+ paddw m1, m3
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+wq*1+400*4]
+ paddw m3, [t4+wq*1+400*6]
+ mova [t4+wq*1+400*4], m2
+ mova [t4+wq*1+400*6], m1
+ movu m4, [t3+wq*2+400*0+8]
+ movu m1, [t3+wq*2+400*0+4]
+ paddd m4, [t3+wq*2+400*0+0]
+ paddd m1, m4
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+wq*2+400* 8+ 0]
+ paddd m4, [t3+wq*2+400*12+ 0]
+ mova [t3+wq*2+400* 8+ 0], m2
+ mova [t3+wq*2+400*12+ 0], m1
+ movu m5, [t3+wq*2+400*0+24]
+ movu m1, [t3+wq*2+400*0+20]
+ paddd m5, [t3+wq*2+400*0+16]
+ paddd m1, m5
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+wq*2+400* 8+16]
+ paddd m5, [t3+wq*2+400*12+16]
+ mova [t3+wq*2+400* 8+16], m2
+ mova [t3+wq*2+400*12+16], m1
+ mova m0, [dstq+wq]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m13
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movu m3, [t4+wq*1+400*2+4]
+ movu m1, [t4+wq*1+400*2+2]
+ paddw m3, [t4+wq*1+400*2+0]
+ paddw m1, m3
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+wq*1+400*6]
+ paddw m3, [t4+wq*1+400*8]
+ mova [t4+wq*1+400*6], m1
+ mova [t4+wq*1+400*8], m2
+ movu m4, [t3+wq*2+400*4+8]
+ movu m1, [t3+wq*2+400*4+4]
+ paddd m4, [t3+wq*2+400*4+0]
+ paddd m1, m4
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+wq*2+400*12+ 0]
+ paddd m4, [t3+wq*2+400*16+ 0]
+ mova [t3+wq*2+400*12+ 0], m1
+ mova [t3+wq*2+400*16+ 0], m2
+ movu m5, [t3+wq*2+400*4+24]
+ movu m1, [t3+wq*2+400*4+20]
+ paddd m5, [t3+wq*2+400*4+16]
+ paddd m1, m5
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+wq*2+400*12+16]
+ paddd m5, [t3+wq*2+400*16+16]
+ mova [t3+wq*2+400*12+16], m1
+ mova [t3+wq*2+400*16+16], m2
+ mova m0, [dstq+wq]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m13
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 10*16
+ %else
+ %assign extra_stack 8*16
+ %endif
+cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*8+4*0]
+ %define stridemp dword [esp+calloff+16*8+4*1]
+ %define leftm dword [esp+calloff+16*8+4*2]
+ %define lpfm dword [esp+calloff+16*8+4*3]
+ %define w0m dword [esp+calloff+16*8+4*4]
+ %define hd dword [esp+calloff+16*8+4*5]
+ %define edgeb byte [esp+calloff+16*8+4*6]
+ %define edged dword [esp+calloff+16*8+4*6]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t3m dword [esp+calloff+4*2]
+ %define t4m dword [esp+calloff+4*3]
+ %xdefine m8 m6
+ %define m9 [base+pd_8]
+ %define m10 [base+pd_34816]
+ %define m11 [base+pd_0xf00801c7]
+ %define m12 [base+pd_0xf00800a4]
+ %define m13 [esp+calloff+16*4]
+ %define m14 [esp+calloff+16*5]
+ %define m15 [esp+calloff+16*6]
+ %define m6 [esp+calloff+16*7]
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ movifnidn wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ add wd, wd
+ mov edged, r7m
+ mova m14, [paramsq]
+ add lpfq, wq
+ mova m9, [pd_8]
+ lea t1, [rsp+wq+44]
+ mova m10, [pd_34816]
+ add dstq, wq
+ mova m11, [pd_0xf00801c7]
+ lea t3, [rsp+wq*2+400*24+40]
+ mova m12, [pd_0xf00800a4]
+ lea t4, [rsp+wq+400*52+40]
+ neg wq
+ pshufd m15, m14, q2222 ; w0 w1
+ punpcklwd m14, m14
+ pshufd m13, m14, q0000 ; s0
+ pshufd m14, m14, q2222 ; s1
+ pxor m6, m6
+ psllw m15, 2
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ add wd, wd
+ mova m2, [r1]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq+52]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*2+400*24+48]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq+400*52+48]
+ mov t3m, t3
+ mov t4m, t4
+ neg wq
+ pshuflw m0, m2, q0000
+ pshuflw m1, m2, q2222
+ pshufhw m2, m2, q1010
+ punpcklqdq m0, m0 ; s0
+ punpcklqdq m1, m1 ; s1
+ punpckhqdq m2, m2 ; w0 w1
+ mov w1m, wd
+ pxor m3, m3
+ psllw m2, 2
+ mova m13, m0
+ mova m14, m1
+ sub wd, 4
+ mova m15, m2
+ mova m6, m3
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ mov t2, t1
+%if ARCH_X86_64
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup
+%else
+ mov wq, w0m
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop
+%endif
+ add t1, 400*12
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t4, t4m
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv0
+%if ARCH_X86_64
+ test hd, hd
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .hv0_bottom
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wq, w0m
+ mov hvsrcm, lpfq
+%endif
+ lea t2, [t1+400*12]
+.top_fixup_loop:
+ mova m0, [t1+wq+400* 0]
+ mova m1, [t1+wq+400* 2]
+ mova m2, [t1+wq+400* 4]
+ paddw m0, m0
+ mova m3, [t1+wq+400* 6]
+ paddd m1, m1
+ mova m4, [t1+wq+400* 8]
+ paddd m2, m2
+ mova m5, [t1+wq+400*10]
+ mova [t2+wq+400* 0], m0
+ mova [t2+wq+400* 2], m1
+ mova [t2+wq+400* 4], m2
+ mova [t2+wq+400* 6], m3
+ mova [t2+wq+400* 8], m4
+ mova [t2+wq+400*10], m5
+ add wq, 16
+ jl .top_fixup_loop
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v0
+ jmp .main
+.h: ; horizontal boxsum
+%assign stack_offset stack_offset+4
+%assign calloff 4
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 10
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, [base+sgr_lshuf5]
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m4, [lpfq+wq- 2]
+.h_main:
+ movu m5, [lpfq+wq+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -20
+ jl .h_have_right
+%if ARCH_X86_32
+ pxor m8, m8
+%endif
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
+.h_have_right:
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; sum3
+ punpcklwd m7, m0, m6
+ pmaddwd m7, m7
+ punpckhwd m0, m6
+ pmaddwd m0, m0
+ paddd m2, m7 ; sumsq3
+ palignr m5, m4, 8
+ punpcklwd m7, m5, m4
+ paddw m8, m4, m5
+ pmaddwd m7, m7
+ punpckhwd m5, m4
+ pmaddwd m5, m5
+ paddd m3, m0
+ mova [t1+wq+400* 6], m1
+ mova [t1+wq+400* 8], m2
+ mova [t1+wq+400*10], m3
+ paddw m8, m1 ; sum5
+ paddd m7, m2 ; sumsq5
+ paddd m5, m3
+ mova [t1+wq+400* 0], m8
+ mova [t1+wq+400* 2], m7
+ mova [t1+wq+400* 4], m5
+ add wq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 10
+ jmp .hv0_main
+.hv0_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, [base+sgr_lshuf5]
+ jmp .hv0_main
+.hv0_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv0_loop_start
+%endif
+.hv0_loop:
+ movif32 lpfq, hvsrcm
+.hv0_loop_start:
+ movu m4, [lpfq+wq- 2]
+.hv0_main:
+ movu m5, [lpfq+wq+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp wd, -20
+ jl .hv0_have_right
+%if ARCH_X86_32
+ pxor m8, m8
+%endif
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
+.hv0_have_right:
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ movif32 t3, t3m
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; h sum3
+ punpcklwd m7, m0, m6
+ pmaddwd m7, m7
+ punpckhwd m0, m6
+ pmaddwd m0, m0
+ paddd m2, m7 ; h sumsq3
+ palignr m5, m4, 8
+ punpcklwd m7, m5, m4
+ paddw m8, m4, m5
+ pmaddwd m7, m7
+ punpckhwd m5, m4
+ pmaddwd m5, m5
+ paddd m3, m0
+ paddw m8, m1 ; h sum5
+ paddd m7, m2 ; h sumsq5
+ paddd m5, m3
+ mova [t3+wq*2+400*8+ 8], m8
+ mova [t3+wq*2+400*0+ 8], m7
+ mova [t3+wq*2+400*0+24], m5
+ paddw m8, [t1+wq+400* 0]
+ paddd m7, [t1+wq+400* 2]
+ paddd m5, [t1+wq+400* 4]
+ mova [t1+wq+400* 0], m8
+ mova [t1+wq+400* 2], m7
+ mova [t1+wq+400* 4], m5
+ paddw m0, m1, [t1+wq+400* 6]
+ paddd m4, m2, [t1+wq+400* 8]
+ paddd m5, m3, [t1+wq+400*10]
+ mova [t1+wq+400* 6], m1
+ mova [t1+wq+400* 8], m2
+ mova [t1+wq+400*10], m3
+ paddw m1, m0, [t2+wq+400* 6]
+ paddd m2, m4, [t2+wq+400* 8]
+ paddd m3, m5, [t2+wq+400*10]
+ mova [t2+wq+400* 6], m0
+ mova [t2+wq+400* 8], m4
+ mova [t2+wq+400*10], m5
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MAXSD m4, m2, m7
+ MAXSD m5, m3, m7
+ psubd m4, m2 ; p3
+ psubd m5, m3
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*1+400*2+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+400*4+ 8], m0
+ mova [t3+wq*2+400*4+24], m1
+ add wq, 16
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 10
+ jmp .hv1_main
+.hv1_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, [base+sgr_lshuf5]
+ jmp .hv1_main
+.hv1_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv1_loop_start
+%endif
+.hv1_loop:
+ movif32 lpfq, hvsrcm
+.hv1_loop_start:
+ movu m4, [lpfq+wq- 2]
+.hv1_main:
+ movu m5, [lpfq+wq+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp wd, -20
+ jl .hv1_have_right
+%if ARCH_X86_32
+ pxor m8, m8
+%endif
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
+.hv1_have_right:
+ palignr m7, m5, m4, 2
+ palignr m3, m5, m4, 4
+ paddw m2, m7, m3
+ punpcklwd m0, m7, m3
+ pmaddwd m0, m0
+ punpckhwd m7, m3
+ pmaddwd m7, m7
+ palignr m3, m5, m4, 6
+ paddw m2, m3 ; h sum3
+ punpcklwd m1, m3, m6
+ pmaddwd m1, m1
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ paddd m0, m1 ; h sumsq3
+ palignr m5, m4, 8
+ punpckhwd m1, m4, m5
+ paddw m8, m4, m5
+ pmaddwd m1, m1
+ punpcklwd m4, m5
+ pmaddwd m4, m4
+ paddd m7, m3
+ paddw m5, m2, [t2+wq+400* 6]
+ mova [t2+wq+400* 6], m2
+ paddw m8, m2 ; h sum5
+ paddd m2, m0, [t2+wq+400* 8]
+ paddd m3, m7, [t2+wq+400*10]
+ mova [t2+wq+400* 8], m0
+ mova [t2+wq+400*10], m7
+ paddd m4, m0 ; h sumsq5
+ paddd m1, m7
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pslld m0, m2, 3
+ pslld m7, m3, 3
+ paddd m2, m0 ; ((a3 + 8) >> 4) * 9
+ paddd m3, m7
+ psrlw m7, m5, 1
+ pavgw m7, m6 ; (b3 + 2) >> 2
+ punpcklwd m0, m7, m6
+ pmaddwd m0, m0
+ punpckhwd m7, m6
+ pmaddwd m7, m7
+%if ARCH_X86_32
+ mova [esp+20], m8
+%else
+ SWAP m8, m6
+%endif
+ MAXSD m2, m0, m8
+ MAXSD m3, m7, m8
+ pxor m8, m8
+ psubd m2, m0 ; p3
+ psubd m3, m7
+ punpcklwd m0, m5, m8 ; b3
+ punpckhwd m5, m8
+ MULLD m2, m14, m8 ; p3 * s1
+ MULLD m3, m14, m8
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m5, m11
+ paddusw m2, m11
+ paddusw m3, m11
+ psrld m2, 20 ; min(z3, 255)
+ movif32 t3, t3m
+ psrld m3, 20
+ GATHER_X_BY_X m8, m2, m3, r0, dstm
+ punpcklwd m2, m8, m8
+ punpckhwd m3, m8, m8
+ MULLD m0, m2, m7
+ MULLD m5, m3, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m5, m10
+ psrld m0, 12
+ psrld m5, 12
+ mova [t4+wq*1+400*4+4], m8
+ mova [t3+wq*2+400*8+ 8], m0
+ mova [t3+wq*2+400*8+24], m5
+%if ARCH_X86_32
+ mova m8, [esp+20]
+%else
+ SWAP m6, m8
+ pxor m6, m6
+%endif
+ paddw m5, m8, [t2+wq+400*0]
+ paddd m2, m4, [t2+wq+400*2]
+ paddd m3, m1, [t2+wq+400*4]
+ paddw m5, [t1+wq+400*0]
+ paddd m2, [t1+wq+400*2]
+ paddd m3, [t1+wq+400*4]
+ mova [t2+wq+400*0], m8
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ mova [t2+wq+400*2], m4
+ pslld m8, m2, 4
+ mova [t2+wq+400*4], m1
+ pslld m4, m3, 4
+ paddd m8, m2
+ pslld m2, 3
+ paddd m4, m3
+ pslld m3, 3
+ paddd m2, m8 ; ((a5 + 8) >> 4) * 25
+ paddd m3, m4
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ psrlw m1, m5, 1
+ pavgw m1, m7 ; (b5 + 2) >> 2
+ punpcklwd m4, m1, m7
+ pmaddwd m4, m4
+ punpckhwd m1, m7
+ pmaddwd m1, m1
+ punpcklwd m0, m5, m7 ; b5
+ punpckhwd m5, m7
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MAXSD m2, m4, m7
+ psubd m2, m4 ; p5
+ MAXSD m3, m1, m7
+ psubd m3, m1
+ MULLD m2, m13, m7 ; p5 * s0
+ MULLD m3, m13, m7
+ pmaddwd m0, m12 ; b5 * 164
+ pmaddwd m5, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrld m2, 20 ; min(z5, 255)
+ psrld m3, 20
+ GATHER_X_BY_X m1, m2, m3, r0, dstm
+ punpcklwd m2, m1, m1
+ punpckhwd m3, m1, m1
+ MULLD m0, m2, m7
+ MULLD m5, m3, m7
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m5, m10
+ mova [t4+wq*1+400*0+ 4], m1
+ psrld m0, 12
+ psrld m5, 12
+ mova [t3+wq*2+400*0+ 8], m0
+ mova [t3+wq*2+400*0+24], m5
+ add wq, 16
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.v0_loop:
+ mova m0, [t1+wq+400* 6]
+ mova m4, [t1+wq+400* 8]
+ mova m5, [t1+wq+400*10]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+wq+400* 6]
+ paddd m2, m4, [t2+wq+400* 8]
+ paddd m3, m5, [t2+wq+400*10]
+ mova [t2+wq+400* 6], m0
+ mova [t2+wq+400* 8], m4
+ mova [t2+wq+400*10], m5
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MAXSD m4, m2, m7
+ MAXSD m5, m3, m7
+ psubd m4, m2 ; p3
+ psubd m5, m3
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*1+400*2+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova m3, [t1+wq+400*0]
+ mova m4, [t1+wq+400*2]
+ mova m5, [t1+wq+400*4]
+ mova [t3+wq*2+400*8+ 8], m3
+ mova [t3+wq*2+400*0+ 8], m4
+ mova [t3+wq*2+400*0+24], m5
+ paddw m3, m3 ; cc5
+ paddd m4, m4
+ paddd m5, m5
+ mova [t1+wq+400*0], m3
+ mova [t1+wq+400*2], m4
+ mova [t1+wq+400*4], m5
+ mova [t3+wq*2+400*4+ 8], m0
+ mova [t3+wq*2+400*4+24], m1
+ add wq, 16
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.v1_loop:
+ mova m4, [t1+wq+400* 6]
+ mova m5, [t1+wq+400* 8]
+ mova m7, [t1+wq+400*10]
+ paddw m1, m4, [t2+wq+400* 6]
+ paddd m2, m5, [t2+wq+400* 8]
+ paddd m3, m7, [t2+wq+400*10]
+ mova [t2+wq+400* 6], m4
+ mova [t2+wq+400* 8], m5
+ mova [t2+wq+400*10], m7
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MAXSD m4, m2, m7
+ MAXSD m5, m3, m7
+ psubd m4, m2 ; p3
+ psubd m5, m3
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*1+400*4+4], m3
+ psrld m0, 12
+ psrld m8, m1, 12
+ mova m4, [t3+wq*2+400*8+ 8]
+ mova m5, [t3+wq*2+400*0+ 8]
+ mova m7, [t3+wq*2+400*0+24]
+ paddw m1, m4, [t2+wq+400*0]
+ paddd m2, m5, [t2+wq+400*2]
+ paddd m3, m7, [t2+wq+400*4]
+ paddw m1, [t1+wq+400*0]
+ paddd m2, [t1+wq+400*2]
+ paddd m3, [t1+wq+400*4]
+ mova [t2+wq+400*0], m4
+ mova [t2+wq+400*2], m5
+ mova [t2+wq+400*4], m7
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ mova [t3+wq*2+400*8+ 8], m0
+ pslld m4, m2, 4
+ mova [t3+wq*2+400*8+24], m8
+ pslld m5, m3, 4
+ paddd m4, m2
+ pslld m2, 3
+ paddd m5, m3
+ pslld m3, 3
+ paddd m2, m4
+ paddd m3, m5
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ psrlw m5, m1, 1
+ pavgw m5, m7 ; (b5 + 2) >> 2
+ punpcklwd m4, m5, m7
+ pmaddwd m4, m4
+ punpckhwd m5, m7
+ pmaddwd m5, m5
+ punpcklwd m0, m1, m7 ; b5
+ punpckhwd m1, m7
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MAXSD m2, m4, m7
+ psubd m2, m4 ; p5
+ MAXSD m3, m5, m7
+ psubd m3, m5
+ MULLD m2, m13, m7 ; p5 * s0
+ MULLD m3, m13, m7
+ pmaddwd m0, m12 ; b5 * 164
+ pmaddwd m1, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrld m2, 20 ; min(z5, 255)
+ psrld m3, 20
+ GATHER_X_BY_X m4, m2, m3, r0, dstm
+ punpcklwd m2, m4, m4
+ punpckhwd m3, m4, m4
+ MULLD m0, m2, m7
+ MULLD m1, m3, m7
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*1+400*0+ 4], m4
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+400*0+ 8], m0
+ mova [t3+wq*2+400*0+24], m1
+ add wq, 16
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*1+400*0+ 2]
+ movu m1, [t3+wq*2+400*0+ 4]
+ movu m2, [t3+wq*2+400*0+20]
+ movu m7, [t4+wq*1+400*0+ 4]
+ movu m8, [t3+wq*2+400*0+ 8]
+ paddw m3, m0, [t4+wq*1+400*0+ 0]
+ paddd m4, m1, [t3+wq*2+400*0+ 0]
+ paddd m5, m2, [t3+wq*2+400*0+16]
+ paddw m3, m7
+ paddd m4, m8
+ movu m7, [t3+wq*2+400*0+24]
+ paddw m0, m3
+ paddd m1, m4
+ psllw m3, 2
+ pslld m4, 2
+ paddd m5, m7
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a5 565
+ paddd m1, m4 ; b5 565
+ paddd m2, m5
+ mova [t4+wq*1+400* 6+ 0], m0
+ mova [t3+wq*2+400*12+ 0], m1
+ mova [t3+wq*2+400*12+16], m2
+ movu m0, [t4+wq*1+400*2+ 4]
+ movu m1, [t3+wq*2+400*4+ 8]
+ movu m2, [t3+wq*2+400*4+24]
+ movu m3, [t4+wq*1+400*2+ 2]
+ movu m4, [t3+wq*2+400*4+ 4]
+ movu m5, [t3+wq*2+400*4+20]
+ paddw m0, [t4+wq*1+400*2+ 0]
+ paddd m1, [t3+wq*2+400*4+ 0]
+ paddd m2, [t3+wq*2+400*4+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a3[-1] 444
+ pslld m4, 2 ; b3[-1] 444
+ pslld m5, 2
+ psubw m3, m0 ; a3[-1] 343
+ psubd m4, m1 ; b3[-1] 343
+ psubd m5, m2
+ mova [t4+wq*1+400* 8+ 0], m3
+ mova [t3+wq*2+400*16+ 0], m4
+ mova [t3+wq*2+400*16+16], m5
+ movu m0, [t4+wq*1+400*4+ 4]
+ movu m1, [t3+wq*2+400*8+ 8]
+ movu m2, [t3+wq*2+400*8+24]
+ movu m3, [t4+wq*1+400*4+ 2]
+ movu m4, [t3+wq*2+400*8+ 4]
+ movu m5, [t3+wq*2+400*8+20]
+ paddw m0, [t4+wq*1+400*4+ 0]
+ paddd m1, [t3+wq*2+400*8+ 0]
+ paddd m2, [t3+wq*2+400*8+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a3[ 0] 444
+ pslld m4, 2 ; b3[ 0] 444
+ pslld m5, 2
+ mova [t4+wq*1+400*10+ 0], m3
+ mova [t3+wq*2+400*20+ 0], m4
+ mova [t3+wq*2+400*20+16], m5
+ psubw m3, m0 ; a3[ 0] 343
+ psubd m4, m1 ; b3[ 0] 343
+ psubd m5, m2
+ mova [t4+wq*1+400*12+ 0], m3
+ mova [t3+wq*2+400*24+ 0], m4
+ mova [t3+wq*2+400*24+16], m5
+ add wq, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m0, [t4+wq*1+ 4]
+ movu m2, [t4+wq*1+ 2]
+ paddw m0, [t4+wq*1+ 0]
+ paddw m0, m2
+ paddw m2, m0
+ psllw m0, 2
+ paddw m0, m2 ; a5
+ movu m4, [t3+wq*2+ 8]
+ movu m5, [t3+wq*2+24]
+ movu m1, [t3+wq*2+ 4]
+ movu m3, [t3+wq*2+20]
+ paddd m4, [t3+wq*2+ 0]
+ paddd m5, [t3+wq*2+16]
+ paddd m4, m1
+ paddd m5, m3
+ paddd m1, m4
+ paddd m3, m5
+ pslld m4, 2
+ pslld m5, 2
+ paddd m4, m1 ; b5
+ paddd m5, m3
+ movu m2, [t4+wq*1+400* 6]
+ paddw m2, m0
+ mova [t4+wq*1+400* 6], m0
+ paddd m0, m4, [t3+wq*2+400*12+ 0]
+ paddd m1, m5, [t3+wq*2+400*12+16]
+ mova [t3+wq*2+400*12+ 0], m4
+ mova [t3+wq*2+400*12+16], m5
+ mova [rsp+16+ARCH_X86_32*4], m1
+ movu m3, [t4+wq*1+400*2+4]
+ movu m5, [t4+wq*1+400*2+2]
+ paddw m3, [t4+wq*1+400*2+0]
+ paddw m5, m3
+ psllw m5, 2 ; a3[ 1] 444
+ psubw m4, m5, m3 ; a3[ 1] 343
+ movu m3, [t4+wq*1+400* 8]
+ paddw m3, [t4+wq*1+400*10]
+ paddw m3, m4
+ mova [t4+wq*1+400* 8], m4
+ mova [t4+wq*1+400*10], m5
+ movu m1, [t3+wq*2+400*4+ 8]
+ movu m5, [t3+wq*2+400*4+ 4]
+ movu m7, [t3+wq*2+400*4+24]
+ movu m8, [t3+wq*2+400*4+20]
+ paddd m1, [t3+wq*2+400*4+ 0]
+ paddd m7, [t3+wq*2+400*4+16]
+ paddd m5, m1
+ paddd m8, m7
+ pslld m5, 2 ; b3[ 1] 444
+ pslld m8, 2
+ psubd m4, m5, m1 ; b3[ 1] 343
+%if ARCH_X86_32
+ mova [esp+52], m8
+ psubd m8, m7
+%else
+ psubd m6, m8, m7
+ SWAP m8, m6
+%endif
+ paddd m1, m4, [t3+wq*2+400*16+ 0]
+ paddd m7, m8, [t3+wq*2+400*16+16]
+ paddd m1, [t3+wq*2+400*20+ 0]
+ paddd m7, [t3+wq*2+400*20+16]
+ mova [t3+wq*2+400*16+ 0], m4
+ mova [t3+wq*2+400*16+16], m8
+ mova [t3+wq*2+400*20+ 0], m5
+%if ARCH_X86_32
+ mova m8, [esp+52]
+%else
+ SWAP m8, m6
+ pxor m6, m6
+%endif
+ mova [t3+wq*2+400*20+16], m8
+ mova [rsp+32+ARCH_X86_32*4], m7
+ movu m5, [dstq+wq]
+ punpcklwd m4, m5, m6
+ punpcklwd m7, m2, m6
+ pmaddwd m7, m4 ; a5 * src
+ punpcklwd m8, m3, m6
+ pmaddwd m8, m4 ; a3 * src
+ punpckhwd m5, m6
+ punpckhwd m2, m6
+ pmaddwd m2, m5
+ punpckhwd m3, m6
+ pmaddwd m3, m5
+ pslld m4, 13
+ pslld m5, 13
+ psubd m0, m7 ; b5 - a5 * src + (1 << 8)
+ psubd m1, m8 ; b3 - a3 * src + (1 << 8)
+ mova m7, [base+pd_0xffff]
+ psrld m0, 9
+ pslld m1, 7
+ pand m0, m7
+ pandn m8, m7, m1
+ por m0, m8
+ mova m1, [rsp+16+ARCH_X86_32*4]
+ mova m8, [rsp+32+ARCH_X86_32*4]
+ psubd m1, m2
+ psubd m8, m3
+ mova m2, [base+pd_4096]
+ psrld m1, 9
+ pslld m8, 7
+ pand m1, m7
+ pandn m7, m8
+ por m1, m7
+ pmaddwd m0, m15
+ pmaddwd m1, m15
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ paddd m4, m2
+ paddd m5, m2
+ paddd m0, m4
+ paddd m1, m5
+ psrad m0, 8
+ psrad m1, 8
+ packssdw m0, m1 ; clip
+ pmaxsw m0, m7
+ psrlw m0, 5
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+%if ARCH_X86_64
+ SWAP m6, m7
+%endif
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movu m3, [t4+wq*1+400*4+4]
+ movu m5, [t4+wq*1+400*4+2]
+ paddw m3, [t4+wq*1+400*4+0]
+ paddw m5, m3
+ psllw m5, 2 ; a3[ 1] 444
+ psubw m4, m5, m3 ; a3[ 1] 343
+ paddw m3, m4, [t4+wq*1+400*12]
+ paddw m3, [t4+wq*1+400*10]
+ mova [t4+wq*1+400*10], m5
+ mova [t4+wq*1+400*12], m4
+ movu m1, [t3+wq*2+400*8+ 8]
+ movu m5, [t3+wq*2+400*8+ 4]
+ movu m7, [t3+wq*2+400*8+24]
+ movu m8, [t3+wq*2+400*8+20]
+ paddd m1, [t3+wq*2+400*8+ 0]
+ paddd m7, [t3+wq*2+400*8+16]
+ paddd m5, m1
+ paddd m8, m7
+ pslld m5, 2 ; b3[ 1] 444
+ pslld m8, 2
+ psubd m4, m5, m1 ; b3[ 1] 343
+ psubd m0, m8, m7
+ paddd m1, m4, [t3+wq*2+400*24+ 0]
+ paddd m7, m0, [t3+wq*2+400*24+16]
+ paddd m1, [t3+wq*2+400*20+ 0]
+ paddd m7, [t3+wq*2+400*20+16]
+ mova [t3+wq*2+400*20+ 0], m5
+ mova [t3+wq*2+400*20+16], m8
+ mova [t3+wq*2+400*24+ 0], m4
+ mova [t3+wq*2+400*24+16], m0
+ mova m5, [dstq+wq]
+ mova m2, [t4+wq*1+400* 6]
+ punpcklwd m4, m5, m6
+ punpcklwd m8, m2, m6
+ pmaddwd m8, m4 ; a5 * src
+ punpcklwd m0, m3, m6
+ pmaddwd m0, m4 ; a3 * src
+ punpckhwd m5, m6
+ punpckhwd m2, m6
+ pmaddwd m2, m5
+ punpckhwd m3, m6
+ pmaddwd m3, m5
+ psubd m1, m0 ; b3 - a3 * src + (1 << 8)
+ pslld m4, 13
+ pslld m5, 13
+ mova m0, [t3+wq*2+400*12+ 0]
+ psubd m0, m8 ; b5 - a5 * src + (1 << 8)
+ mova m8, [t3+wq*2+400*12+16]
+ psubd m8, m2
+ psubd m7, m3
+ mova m2, [base+pd_0xffff]
+ pslld m1, 7
+ psrld m0, 8
+ psrld m8, 8
+ pslld m7, 7
+ pand m0, m2
+ pandn m3, m2, m1
+ por m0, m3
+ pand m8, m2
+ pandn m2, m7
+ por m2, m8
+ mova m1, [base+pd_4096]
+ pmaddwd m0, m15
+ pmaddwd m2, m15
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ pxor m7, m7
+ paddd m4, m1
+ paddd m5, m1
+ paddd m0, m4
+ paddd m2, m5
+ psrad m0, 8
+ psrad m2, 8
+ packssdw m0, m2 ; clip
+ pmaxsw m0, m7
+ psrlw m0, 5
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
diff --git a/third_party/dav1d/src/x86/looprestoration_avx2.asm b/third_party/dav1d/src/x86/looprestoration_avx2.asm
new file mode 100644
index 0000000000..7787997425
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration_avx2.asm
@@ -0,0 +1,2238 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+wiener_l_shuf: db 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
+wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
+sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+sgr_r_ext: times 16 db 1
+ times 16 db 9
+
+; dword version of dav1d_sgr_x_by_x[] for use with gathers; wastes a bit of
+; cache but eliminates some shifts in the inner sgr loop, which is an overall win
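+; (the gathers index this table through a pointer offset by 256*4 bytes; the
+; psrad by 20 of the biased z value yields min(z, 255) - 256, so each lookup
+; lands on the right entry without a separate clamp or base adjustment)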
+const sgr_x_by_x_avx2
+ dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16
+ dd 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8
+ dd 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5
+ dd 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
+ dd 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3
+ dd 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+ dd 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+ dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+ dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+ dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+ dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
+
+ times 4 db -1 ; needed for 16-bit sgr
+pb_m5: times 4 db -5
+pb_3: times 4 db 3
+pw_5_6: dw 5, 6
+
+sgr_shuf: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1
+ db 9, -1, 10, -1, 11, -1, 12, -1
+
+pw_256: times 2 dw 256
+pw_2056: times 2 dw 2056
+pw_m16380: times 2 dw -16380
+pd_25: dd 25
+pd_34816: dd 34816
+pd_m4096: dd -4096
+pd_0xf00801c7: dd 0xf00801c7
+pd_0xf00800a4: dd 0xf00800a4
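+; the low words of the two constants above are the b multipliers (0x1c7 = 455
+; ~= 4096/9, 0xa4 = 164 ~= 4096/25); the 0xf008 high word, added to p*s with
+; unsigned word saturation (paddusw), supplies the 1<<19 rounding for the >>20,
+; clamps z at 255 and biases the gather index by -256 to match the table base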
+
+cextern pb_0to63
+
+SECTION .text
+
+DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
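+; t0-t6 point at rows of the horizontally filtered intermediate data kept on
+; the stack: .hv stores the newest row via t0, combines it with the six
+; previous rows in t1-t6 for the 7-tap vertical filter, then rotates the
+; pointers (wiener_filter5 uses t0-t4 in the same way)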
+
+INIT_YMM avx2
+cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt
+ mov fltq, r6mp
+ movifnidn hd, hm
+ mov edged, r7m
+ mov wd, wm
+ vbroadcasti128 m6, [wiener_shufA]
+ vpbroadcastb m11, [fltq+ 0] ; x0 x0
+ vbroadcasti128 m7, [wiener_shufB]
+ vpbroadcastd m12, [fltq+ 2]
+ vbroadcasti128 m8, [wiener_shufC]
+ packsswb m12, m12 ; x1 x2
+ vpbroadcastw m13, [fltq+ 6] ; x3
+ vbroadcasti128 m9, [sgr_shuf+6]
+ add lpfq, wq
+ vpbroadcastd m10, [pw_m16380]
+ vpbroadcastd m14, [fltq+16] ; y0 y1
+ add dstq, wq
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ lea t1, [rsp+wq*2+16]
+ psllw m14, 5
+ neg wq
+ psllw m15, 5
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+.v2:
+ call .v
+ jmp .v1
+.extend_right:
+ movd xm2, r10d
+ vpbroadcastd m0, [pb_3]
+ vpbroadcastd m1, [pb_m5]
+ vpbroadcastb m2, xm2
+ mova m3, [pb_0to63]
+ psubb m0, m2
+ psubb m1, m2
+ pminub m0, m3
+ pminub m1, m3
+ pshufb m4, m0
+ pshufb m5, m1
+ ret
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located
+ mova m4, [lpfq+r10] ; before the start of the buffer
+ palignr m4, m5, 12
+ pshufb m4, [wiener_l_shuf]
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+r10-4]
+.h_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m11
+ pshufb m1, m5, m6
+ pmaddubsw m1, m11
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ paddw m0, m2
+ pshufb m2, m4, m8
+ pmaddubsw m2, m12
+ paddw m1, m3
+ pshufb m3, m5, m8
+ pmaddubsw m3, m12
+ pshufb m4, m9
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m9
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m10
+ paddw m5, m10
+ paddw m0, m2
+ vpbroadcastd m2, [pw_2056]
+ paddw m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m2
+ paddw m1, m2
+ mova [t1+r10*2+ 0], m0
+ mova [t1+r10*2+32], m1
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ movu m4, [lpfq+r10-4]
+ pshufb m4, [wiener_l_shuf]
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+r10-4]
+.hv_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -34
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m11
+ pshufb m1, m5, m6
+ pmaddubsw m1, m11
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ paddw m0, m2
+ pshufb m2, m4, m8
+ pmaddubsw m2, m12
+ paddw m1, m3
+ pshufb m3, m5, m8
+ pmaddubsw m3, m12
+ pshufb m4, m9
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m9
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m10
+ paddw m5, m10
+ paddw m0, m2
+ paddw m1, m3
+ mova m2, [t4+r10*2]
+ paddw m2, [t2+r10*2]
+ mova m3, [t3+r10*2]
+ paddsw m0, m4
+ vpbroadcastd m4, [pw_2056]
+ paddsw m1, m5
+ mova m5, [t5+r10*2]
+ paddw m5, [t1+r10*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m4
+ paddw m1, m4
+ paddw m4, m0, [t6+r10*2]
+ mova [t0+r10*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m14
+ punpckhwd m4, m5
+ pmaddwd m4, m14
+ paddd m0, m3
+ paddd m4, m2
+ mova m2, [t4+r10*2+32]
+ paddw m2, [t2+r10*2+32]
+ mova m3, [t3+r10*2+32]
+ mova m5, [t5+r10*2+32]
+ paddw m5, [t1+r10*2+32]
+ packuswb m0, m4
+ paddw m4, m1, [t6+r10*2+32]
+ mova [t0+r10*2+32], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m14
+ punpckhwd m4, m5
+ pmaddwd m4, m14
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .hv_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m2, [t4+r10*2+ 0]
+ paddw m2, [t2+r10*2+ 0]
+ mova m4, [t3+r10*2+ 0]
+ mova m6, [t1+r10*2+ 0]
+ paddw m8, m6, [t6+r10*2+ 0]
+ paddw m6, [t5+r10*2+ 0]
+ mova m3, [t4+r10*2+32]
+ paddw m3, [t2+r10*2+32]
+ mova m5, [t3+r10*2+32]
+ mova m7, [t1+r10*2+32]
+ paddw m9, m7, [t6+r10*2+32]
+ paddw m7, [t5+r10*2+32]
+ punpcklwd m0, m2, m4
+ pmaddwd m0, m15
+ punpckhwd m2, m4
+ pmaddwd m2, m15
+ punpcklwd m4, m8, m6
+ pmaddwd m4, m14
+ punpckhwd m6, m8, m6
+ pmaddwd m6, m14
+ punpcklwd m1, m3, m5
+ pmaddwd m1, m15
+ punpckhwd m3, m5
+ pmaddwd m3, m15
+ punpcklwd m5, m9, m7
+ pmaddwd m5, m14
+ punpckhwd m7, m9, m7
+ pmaddwd m7, m14
+ paddd m0, m4
+ paddd m2, m6
+ paddd m1, m5
+ paddd m3, m7
+ packuswb m0, m2
+ packuswb m1, m3
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .v_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+ ret
+
+cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt
+ mov fltq, r6mp
+ movifnidn hd, hm
+ mov edged, r7m
+ mov wd, wm
+ vbroadcasti128 m6, [wiener_shufB]
+ vpbroadcastd m12, [fltq+ 2]
+ vbroadcasti128 m7, [wiener_shufC]
+ packsswb m12, m12 ; x1 x2
+ vpbroadcastw m13, [fltq+ 6] ; x3
+ vbroadcasti128 m8, [sgr_shuf+6]
+ add lpfq, wq
+ vpbroadcastd m9, [pw_m16380]
+ vpbroadcastd m10, [pw_2056]
+ mova m11, [wiener_l_shuf]
+ vpbroadcastd m14, [fltq+16] ; __ y1
+ add dstq, wq
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ lea t1, [rsp+wq*2+16]
+ psllw m14, 5
+ neg wq
+ psllw m15, 5
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t3, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call .v
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+.v1:
+ call .v
+ jmp .end
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located
+ mova m4, [lpfq+r10] ; before the start of the buffer
+ palignr m4, m5, 12
+ pshufb m4, m11
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+r10-4]
+.h_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -33
+ jl .h_have_right
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
+.h_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m12
+ pshufb m1, m5, m6
+ pmaddubsw m1, m12
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ pshufb m4, m8
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m8
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m9
+ paddw m5, m9
+ paddw m0, m2
+ paddw m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m10
+ paddw m1, m10
+ mova [t1+r10*2+ 0], m0
+ mova [t1+r10*2+32], m1
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ movu m4, [lpfq+r10-4]
+ pshufb m4, m11
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+r10-4]
+.hv_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -33
+ jl .hv_have_right
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
+.hv_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m12
+ pshufb m1, m5, m6
+ pmaddubsw m1, m12
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ pshufb m4, m8
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m8
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m9
+ paddw m5, m9
+ paddw m0, m2
+ paddw m1, m3
+ mova m2, [t3+r10*2]
+ paddw m2, [t1+r10*2]
+ mova m3, [t2+r10*2]
+ paddsw m0, m4
+ paddsw m1, m5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m10
+ paddw m1, m10
+ paddw m4, m0, [t4+r10*2]
+ mova [t0+r10*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m14
+ punpckhwd m4, m4
+ pmaddwd m4, m14
+ paddd m0, m3
+ paddd m4, m2
+ mova m2, [t3+r10*2+32]
+ paddw m2, [t1+r10*2+32]
+ mova m3, [t2+r10*2+32]
+ packuswb m0, m4
+ paddw m4, m1, [t4+r10*2+32]
+ mova [t0+r10*2+32], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m14
+ punpckhwd m4, m4
+ pmaddwd m4, m14
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .hv_loop
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+ psrld m13, m14, 16 ; y1 __
+.v_loop:
+ mova m6, [t1+r10*2+ 0]
+ paddw m2, m6, [t3+r10*2+ 0]
+ mova m4, [t2+r10*2+ 0]
+ mova m7, [t1+r10*2+32]
+ paddw m3, m7, [t3+r10*2+32]
+ mova m5, [t2+r10*2+32]
+ paddw m6, [t4+r10*2+ 0]
+ paddw m7, [t4+r10*2+32]
+ punpcklwd m0, m2, m4
+ pmaddwd m0, m15
+ punpckhwd m2, m4
+ pmaddwd m2, m15
+ punpcklwd m1, m3, m5
+ pmaddwd m1, m15
+ punpckhwd m3, m5
+ pmaddwd m3, m15
+ punpcklwd m5, m7, m6
+ pmaddwd m4, m5, m14
+ punpckhwd m7, m6
+ pmaddwd m6, m7, m14
+ pmaddwd m5, m13
+ pmaddwd m7, m13
+ paddd m0, m4
+ paddd m2, m6
+ paddd m1, m5
+ paddd m3, m7
+ packuswb m0, m2
+ packuswb m1, m3
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .v_loop
+ ret
+
+cglobal sgr_filter_5x5_8bpc, 4, 13, 16, 400*24+16, dst, stride, left, lpf, \
+ w, h, edge, params
+%define base r12-sgr_x_by_x_avx2-256*4
+ lea r12, [sgr_x_by_x_avx2+256*4]
+ mov paramsq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ vbroadcasti128 m8, [base+sgr_shuf+0]
+ vbroadcasti128 m9, [base+sgr_shuf+8]
+ add lpfq, wq
+ vbroadcasti128 m10, [base+sgr_shuf+2]
+ add dstq, wq
+ vbroadcasti128 m11, [base+sgr_shuf+6]
+ lea t3, [rsp+wq*4+16+400*12]
+ vpbroadcastd m12, [paramsq+0] ; s0
+ pxor m6, m6
+ vpbroadcastw m7, [paramsq+8] ; w0
+ lea t1, [rsp+wq*2+20]
+ vpbroadcastd m13, [base+pd_0xf00800a4]
+ neg wq
+ vpbroadcastd m14, [base+pd_34816] ; (1 << 11) + (1 << 15)
+ psllw m7, 4
+ vpbroadcastd m15, [base+pd_m4096]
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call .top_fixup
+ add t1, 400*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ test hd, hd
+ jz .odd_height
+ call .h
+ add lpfq, strideq
+ call .hv
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .h_top
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+400*6]
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ jmp .main
+.no_top_height1:
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.extend_right:
+ movd xm2, r10d
+ mova m0, [sgr_r_ext]
+ vpbroadcastb m2, xm2
+ psubb m0, m2
+ pminub m0, [pb_0to63]
+ pshufb m5, m0
+ ret
+.h: ; horizontal boxsum
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu xm5, [lpfq+r10-2]
+.h_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -18
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m3, m5, m8
+ pmullw m4, m3, m3
+ pshufb m2, m5, m9
+ paddw m0, m3, m2
+ shufps m3, m2, q2121
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ punpcklwd m3, m4, m6
+ paddd m1, m3
+ punpckhwd m4, m6
+ paddd m2, m4
+ pshufb m4, m5, m10
+ paddw m0, m4
+ pshufb m5, m11
+ paddw m0, m5 ; sum
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m3
+ punpckhwd m4, m5
+ pmaddwd m4, m4
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+r10*2+400*0]
+ paddd m1, [t1+r10*2+400*2]
+ paddd m2, [t1+r10*2+400*4]
+.h_loop_end:
+ paddd m1, m3 ; sumsq
+ paddd m2, m4
+ mova [t1+r10*2+400*0], m0
+ mova [t1+r10*2+400*2], m1
+ mova [t1+r10*2+400*4], m2
+ add r10, 16
+ jl .h_loop
+ ret
+.top_fixup:
+ lea r10, [wq-2]
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+r10*2+400*0]
+ mova m1, [t1+r10*2+400*2]
+ mova m2, [t1+r10*2+400*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+r10*2+400*0], m0
+ mova [t2+r10*2+400*2], m1
+ mova [t2+r10*2+400*4], m2
+ add r10, 16
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu xm5, [lpfq+r10-2]
+.hv_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -18
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ pshufb m1, m5, m8
+ pmullw m4, m1, m1
+ pshufb m3, m5, m9
+ paddw m0, m1, m3
+ shufps m1, m3, q2121
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ punpcklwd m1, m4, m6
+ paddd m2, m1
+ punpckhwd m4, m6
+ paddd m3, m4
+ pshufb m1, m5, m10
+ paddw m0, m1
+ pshufb m5, m11
+ paddw m0, m5 ; h sum
+ punpcklwd m4, m5, m1
+ pmaddwd m4, m4
+ punpckhwd m5, m1
+ pmaddwd m5, m5
+ paddw m1, m0, [t1+r10*2+400*0]
+ paddd m2, m4 ; h sumsq
+ paddd m3, m5
+ paddd m4, m2, [t1+r10*2+400*2]
+ paddd m5, m3, [t1+r10*2+400*4]
+ test hd, hd
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+r10*2+400*0] ; hv sum
+ paddd m4, [t2+r10*2+400*2] ; hv sumsq
+ paddd m5, [t2+r10*2+400*4]
+ mova [t0+r10*2+400*0], m0
+ mova [t0+r10*2+400*2], m2
+ mova [t0+r10*2+400*4], m3
+ vpbroadcastd m2, [pd_25]
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmulld m4, m2 ; a * 25
+ pmulld m5, m2
+ pmaddwd m2, m0, m0 ; b * b
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m12 ; p * s
+ pmulld m5, m12
+ pmaddwd m0, m13 ; b * 164
+ pmaddwd m1, m13
+ paddusw m4, m13
+ paddusw m5, m13
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r12+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ pand m0, m15
+ pand m1, m15
+ por m0, m2 ; a | (b << 12)
+ por m1, m3
+ mova [t3+r10*4+ 8], xm0 ; The neighbor calculations require
+ vextracti128 [t3+r10*4+40], m0, 1 ; 13 bits for a and 21 bits for b.
+ mova [t3+r10*4+24], xm1 ; Packing them allows for 12+20, but
+ vextracti128 [t3+r10*4+56], m1, 1 ; that gets us most of the way.
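+; pd_m4096 (0xfffff000) doubles as the pack/unpack mask: the pand above keeps
+; the rounded b term in bits 12-31 before a is or'd into the low 12 bits, and
+; the neighbor passes split the fields again with pandn and psrld 12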
+ add r10, 16
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+r10*2+400*0], m1
+ paddw m1, m0
+ mova [t1+r10*2+400*2], m4
+ paddd m4, m2
+ mova [t1+r10*2+400*4], m5
+ paddd m5, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+ lea r10, [wq-2]
+.v_loop:
+ mova m0, [t1+r10*2+400*0]
+ mova m2, [t1+r10*2+400*2]
+ mova m3, [t1+r10*2+400*4]
+ paddw m1, m0, [t2+r10*2+400*0]
+ paddd m4, m2, [t2+r10*2+400*2]
+ paddd m5, m3, [t2+r10*2+400*4]
+ paddw m0, m0
+ paddd m2, m2
+ paddd m3, m3
+ paddw m1, m0 ; hv sum
+ paddd m4, m2 ; hv sumsq
+ paddd m5, m3
+ vpbroadcastd m2, [pd_25]
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmulld m4, m2 ; a * 25
+ pmulld m5, m2
+ pmaddwd m2, m0, m0 ; b * b
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m12 ; p * s
+ pmulld m5, m12
+ pmaddwd m0, m13 ; b * 164
+ pmaddwd m1, m13
+ paddusw m4, m13
+ paddusw m5, m13
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r12+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ pand m0, m15
+ pand m1, m15
+ por m0, m2 ; a | (b << 12)
+ por m1, m3
+ mova [t3+r10*4+ 8], xm0
+ vextracti128 [t3+r10*4+40], m0, 1
+ mova [t3+r10*4+24], xm1
+ vextracti128 [t3+r10*4+56], m1, 1
+ add r10, 16
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t3+r10*4+ 4]
+ movu m1, [t3+r10*4+36]
+ paddd m2, m0, [t3+r10*4+ 0]
+ paddd m3, m1, [t3+r10*4+32]
+ paddd m2, [t3+r10*4+ 8]
+ paddd m3, [t3+r10*4+40]
+ paddd m0, m2
+ pslld m2, 2
+ paddd m1, m3
+ pslld m3, 2
+ paddd m2, m0 ; ab 565
+ paddd m3, m1
+ pandn m0, m15, m2 ; a
+ psrld m2, 12 ; b
+ pandn m1, m15, m3
+ psrld m3, 12
+ mova [t3+r10*4+400*4+ 0], m0
+ mova [t3+r10*4+400*8+ 0], m2
+ mova [t3+r10*4+400*4+32], m1
+ mova [t3+r10*4+400*8+32], m3
+ add r10, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m0, [t3+r10*4+ 4]
+ movu m1, [t3+r10*4+36]
+ paddd m2, m0, [t3+r10*4+ 0]
+ paddd m3, m1, [t3+r10*4+32]
+ paddd m2, [t3+r10*4+ 8]
+ paddd m3, [t3+r10*4+40]
+ paddd m0, m2
+ pslld m2, 2
+ paddd m1, m3
+ pslld m3, 2
+ paddd m2, m0
+ paddd m3, m1
+ pandn m0, m15, m2
+ psrld m2, 12
+ pandn m1, m15, m3
+ psrld m3, 12
+ paddd m4, m0, [t3+r10*4+400*4+ 0] ; a
+ paddd m5, m1, [t3+r10*4+400*4+32]
+ mova [t3+r10*4+400*4+ 0], m0
+ mova [t3+r10*4+400*4+32], m1
+ paddd m0, m2, [t3+r10*4+400*8+ 0] ; b
+ paddd m1, m3, [t3+r10*4+400*8+32]
+ mova [t3+r10*4+400*8+ 0], m2
+ mova [t3+r10*4+400*8+32], m3
+ pmovzxbd m2, [dstq+r10+0]
+ pmovzxbd m3, [dstq+r10+8]
+ pmaddwd m4, m2 ; a * src
+ pmaddwd m5, m3
+ packssdw m2, m3
+ psubd m0, m4 ; b - a * src + (1 << 8)
+ psubd m1, m5
+ psrad m0, 9
+ psrad m1, 9
+ packssdw m0, m1
+ pmulhrsw m0, m7
+ paddw m0, m2
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ pshufd xm0, xm0, q3120
+ mova [dstq+r10], xm0
+ add r10, 16
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ pmovzxbd m2, [dstq+r10+0]
+ pmovzxbd m3, [dstq+r10+8]
+ pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; a * src
+ pmaddwd m5, m3, [t3+r10*4+400*4+32]
+ mova m0, [t3+r10*4+400*8+ 0] ; b
+ mova m1, [t3+r10*4+400*8+32]
+ packssdw m2, m3
+ psubd m0, m4 ; b - a * src + (1 << 7)
+ psubd m1, m5
+ psrad m0, 8
+ psrad m1, 8
+ packssdw m0, m1
+ pmulhrsw m0, m7
+ paddw m0, m2
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ pshufd xm0, xm0, q3120
+ mova [dstq+r10], xm0
+ add r10, 16
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+cglobal sgr_filter_3x3_8bpc, 4, 15, 15, -400*28-16, dst, stride, left, lpf, \
+ w, h, edge, params
+%define base r14-sgr_x_by_x_avx2-256*4
+ mov paramsq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ lea r14, [sgr_x_by_x_avx2+256*4]
+ vbroadcasti128 m8, [base+sgr_shuf+2]
+ add lpfq, wq
+ vbroadcasti128 m9, [base+sgr_shuf+4]
+ add dstq, wq
+ vbroadcasti128 m10, [base+sgr_shuf+6]
+ lea t3, [rsp+wq*4+16+400*12]
+ vpbroadcastd m11, [paramsq+ 4] ; s1
+ pxor m6, m6
+ vpbroadcastw m7, [paramsq+10] ; w1
+ lea t1, [rsp+wq*2+20]
+ vpbroadcastd m12, [base+pd_0xf00801c7]
+ neg wq
+ vpbroadcastd m13, [base+pd_34816] ; (1 << 11) + (1 << 15)
+ psllw m7, 4
+ vpbroadcastd m14, [base+pd_m4096]
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ add t1, 400*6
+ call .h_top
+ lea t4, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add t4, strideq
+ mov [rsp], t4 ; below
+ mov t0, t2
+ call .hv
+.main:
+ mov t5, t3
+ add t3, 400*4
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ dec hd
+ jz .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv
+ call .n
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv_bottom
+ call .n
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n
+ RET
+.height1:
+ call .v
+ call .prep_n
+ mov t2, t1
+ call .v
+ jmp .end
+.extend_bottom:
+ call .v
+ call .n
+ mov t2, t1
+ call .v
+ jmp .end
+.no_top:
+ lea t4, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea t4, [t4+strideq*2]
+ mov [rsp], t4
+ call .h
+ lea t0, [t1+400*6]
+ mov t2, t1
+ call .v
+ jmp .main
+.h: ; horizontal boxsum
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu xm5, [lpfq+r10-2]
+.h_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -17
+ jl .h_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+.h_have_right:
+ pshufb m0, m5, m8
+ pmullw m2, m0, m0
+ pshufb m4, m5, m9
+ paddw m0, m4
+ pshufb m5, m10
+ paddw m0, m5 ; sum
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m3
+ punpckhwd m4, m5
+ pmaddwd m4, m4
+ punpcklwd m1, m2, m6
+ punpckhwd m2, m6
+ mova [t1+r10*2+400*0], m0
+ paddd m1, m3 ; sumsq
+ paddd m2, m4
+ mova [t1+r10*2+400*2], m1
+ mova [t1+r10*2+400*4], m2
+ add r10, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu xm5, [lpfq+r10-2]
+.hv_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -17
+ jl .hv_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+.hv_have_right:
+ pshufb m0, m5, m8
+ pmullw m3, m0, m0
+ pshufb m1, m5, m9
+ paddw m0, m1
+ pshufb m5, m10
+ paddw m0, m5 ; h sum
+ punpcklwd m4, m5, m1
+ pmaddwd m4, m4
+ punpckhwd m5, m1
+ pmaddwd m5, m5
+ paddw m1, m0, [t2+r10*2+400*0]
+ paddw m1, [t1+r10*2+400*0] ; hv sum
+ punpcklwd m2, m3, m6
+ punpckhwd m3, m6
+ paddd m4, m2 ; h sumsq
+ paddd m5, m3
+ paddd m2, m4, [t2+r10*2+400*2]
+ paddd m3, m5, [t2+r10*2+400*4]
+ paddd m2, [t1+r10*2+400*2] ; hv sumsq
+ paddd m3, [t1+r10*2+400*4]
+ mova [t0+r10*2+400*0], m0
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ mova [t0+r10*2+400*2], m4
+ pslld m4, m2, 3
+ mova [t0+r10*2+400*4], m5
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ pmaddwd m2, m0, m0 ; b * b
+ paddd m5, m3
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m11 ; p * s
+ pmulld m5, m11
+ pmaddwd m0, m12 ; b * 455
+ pmaddwd m1, m12
+ paddusw m4, m12
+ paddusw m5, m12
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r14+m3*4], m4
+ psrad m4, m5, 20
+ vpgatherdd m3, [r14+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ pand m0, m14
+ pand m1, m14
+ por m0, m2 ; a | (b << 12)
+ por m1, m3
+ mova [t3+r10*4+ 8], xm0
+ vextracti128 [t3+r10*4+40], m0, 1
+ mova [t3+r10*4+24], xm1
+ vextracti128 [t3+r10*4+56], m1, 1
+ add r10, 16
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.v: ; vertical boxsum + ab
+ lea r10, [wq-2]
+.v_loop:
+ mova m1, [t1+r10*2+400*0]
+ paddw m1, m1
+ paddw m1, [t2+r10*2+400*0] ; hv sum
+ mova m2, [t1+r10*2+400*2]
+ mova m3, [t1+r10*2+400*4]
+ paddd m2, m2
+ paddd m3, m3
+ paddd m2, [t2+r10*2+400*2] ; hv sumsq
+ paddd m3, [t2+r10*2+400*4]
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ pmaddwd m2, m0, m0 ; b * b
+ paddd m5, m3
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m11 ; p * s
+ pmulld m5, m11
+ pmaddwd m0, m12 ; b * 455
+ pmaddwd m1, m12
+ paddusw m4, m12
+ paddusw m5, m12
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r14+m3*4], m4
+ psrad m4, m5, 20
+ vpgatherdd m3, [r14+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ pand m0, m14
+ pand m1, m14
+ por m0, m2 ; a | (b << 12)
+ por m1, m3
+ mova [t3+r10*4+ 8], xm0
+ vextracti128 [t3+r10*4+40], m0, 1
+ mova [t3+r10*4+24], xm1
+ vextracti128 [t3+r10*4+56], m1, 1
+ add r10, 16
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+ mov t4, t3
+ add t3, 400*4
+.prep_n_loop:
+ mova m2, [t5+r10*4+0]
+ mova m3, [t4+r10*4+0]
+ paddd m2, [t5+r10*4+8]
+ paddd m3, [t4+r10*4+8]
+ paddd m0, m2, [t5+r10*4+4]
+ paddd m1, m3, [t4+r10*4+4]
+ pslld m0, 2
+ paddd m1, m1 ; ab[ 0] 222
+ psubd m0, m2 ; ab[-1] 343
+ mova [t3+r10*4+400*4], m1
+ paddd m1, m1
+ mova [t5+r10*4], m0
+ psubd m1, m3 ; ab[ 0] 343
+ mova [t4+r10*4], m1
+ add r10, 8
+ jl .prep_n_loop
+ ret
+; a and b are packed together in a single dword, but we can't do the
+; full neighbor calculations before splitting them since we don't
+; have sufficient precision. The solution is to do the calculations
+; in two equal halves and split a and b before doing the final sum.
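+; (a sketch of the split, assuming m14 holds pd_m4096 = 0xfffff000 here:
+; with the values stored as a | (b << 12), pandn x, m14 keeps the low
+; 12 bits, i.e. a, and psrld x, 12 recovers b. For example, a = 0x123
+; packed with b = 0x45678 gives 0x45678123, which splits back cleanly.)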
+ALIGN function_align
+.n: ; neighbor + output
+ mov r10, wq
+.n_loop:
+ mova m4, [t3+r10*4+ 0]
+ paddd m4, [t3+r10*4+ 8]
+ paddd m5, m4, [t3+r10*4+ 4]
+ paddd m5, m5 ; ab[+1] 222
+ mova m2, [t3+r10*4+400*4+ 0]
+ paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
+ mova m3, [t3+r10*4+400*4+32]
+ paddd m1, m3, [t5+r10*4+32]
+ mova [t3+r10*4+400*4+ 0], m5
+ paddd m5, m5
+ psubd m5, m4 ; ab[+1] 343
+ mova [t5+r10*4+ 0], m5
+ paddd m2, m5 ; ab[ 0] 222 + ab[+1] 343
+ mova m4, [t3+r10*4+32]
+ paddd m4, [t3+r10*4+40]
+ paddd m5, m4, [t3+r10*4+36]
+ paddd m5, m5
+ mova [t3+r10*4+400*4+32], m5
+ paddd m5, m5
+ psubd m5, m4
+ mova [t5+r10*4+32], m5
+ pandn m4, m14, m0
+ psrld m0, 12
+ paddd m3, m5
+ pandn m5, m14, m2
+ psrld m2, 12
+ paddd m4, m5 ; a
+ pandn m5, m14, m1
+ psrld m1, 12
+ paddd m0, m2 ; b + (1 << 8)
+ pandn m2, m14, m3
+ psrld m3, 12
+ paddd m5, m2
+ pmovzxbd m2, [dstq+r10+0]
+ paddd m1, m3
+ pmovzxbd m3, [dstq+r10+8]
+ pmaddwd m4, m2 ; a * src
+ pmaddwd m5, m3
+ packssdw m2, m3
+ psubd m0, m4 ; b - a * src + (1 << 8)
+ psubd m1, m5
+ psrad m0, 9
+ psrad m1, 9
+ packssdw m0, m1
+ pmulhrsw m0, m7
+ paddw m0, m2
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ pshufd xm0, xm0, q3120
+ mova [dstq+r10], xm0
+ add r10, 16
+ jl .n_loop
+ mov r10, t5
+ mov t5, t4
+ mov t4, r10
+ add dstq, strideq
+ ret
+
+cglobal sgr_filter_mix_8bpc, 4, 13, 16, 400*56+8, dst, stride, left, lpf, \
+ w, h, edge, params
+%define base r12-sgr_x_by_x_avx2-256*4
+ lea r12, [sgr_x_by_x_avx2+256*4]
+ mov paramsq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ vbroadcasti128 m9, [base+sgr_shuf+0]
+ vbroadcasti128 m10, [base+sgr_shuf+8]
+ add lpfq, wq
+ vbroadcasti128 m11, [base+sgr_shuf+2]
+ vbroadcasti128 m12, [base+sgr_shuf+6]
+ add dstq, wq
+ vpbroadcastd m15, [paramsq+8] ; w0 w1
+ lea t3, [rsp+wq*4+400*24+8]
+ vpbroadcastd m13, [paramsq+0] ; s0
+ pxor m7, m7
+ vpbroadcastd m14, [paramsq+4] ; s1
+ lea t1, [rsp+wq*2+12]
+ neg wq
+ psllw m15, 2 ; to reuse existing pd_m4096 register for rounding
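+; (note: pd_m4096 is -4096, so the later psubd of it in .n0/.n1 in effect
+; adds 4096 = 1 << 12, the rounding term for the psrad by 13 that follows;
+; pre-scaling w0/w1 by 4 here is presumably what makes 13 the matching
+; shift amount.)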
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup
+ add t1, 400*12
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+400*12]
+ lea r10, [wq-2]
+.top_fixup_loop:
+ mova m0, [t1+r10*2+400* 0]
+ mova m1, [t1+r10*2+400* 2]
+ mova m2, [t1+r10*2+400* 4]
+ paddw m0, m0
+ mova m3, [t1+r10*2+400* 6]
+ paddd m1, m1
+ mova m4, [t1+r10*2+400* 8]
+ paddd m2, m2
+ mova m5, [t1+r10*2+400*10]
+ mova [t2+r10*2+400* 0], m0
+ mova [t2+r10*2+400* 2], m1
+ mova [t2+r10*2+400* 4], m2
+ mova [t2+r10*2+400* 6], m3
+ mova [t2+r10*2+400* 8], m4
+ mova [t2+r10*2+400*10], m5
+ add r10, 16
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsums
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu xm5, [lpfq+r10-2]
+.h_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -18
+ jl .h_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+.h_have_right:
+ pshufb m6, m5, m9
+ pshufb m4, m5, m10
+ paddw m8, m6, m4
+ shufps m0, m6, m4, q2121
+ pmullw m3, m0, m0
+ pshufb m2, m5, m11
+ paddw m0, m2
+ pshufb m5, m12
+ paddw m0, m5 ; sum3
+ punpcklwd m1, m2, m5
+ pmaddwd m1, m1
+ punpckhwd m2, m5
+ pmaddwd m2, m2
+ punpcklwd m5, m6, m4
+ pmaddwd m5, m5
+ punpckhwd m6, m4
+ pmaddwd m6, m6
+ punpcklwd m4, m3, m7
+ paddd m1, m4 ; sumsq3
+ punpckhwd m3, m7
+ paddd m2, m3
+ mova [t1+r10*2+400* 6], m0
+ mova [t1+r10*2+400* 8], m1
+ mova [t1+r10*2+400*10], m2
+ paddw m8, m0 ; sum5
+ paddd m5, m1 ; sumsq5
+ paddd m6, m2
+ mova [t1+r10*2+400* 0], m8
+ mova [t1+r10*2+400* 2], m5
+ mova [t1+r10*2+400* 4], m6
+ add r10, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .hv0_main
+.hv0_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu xm5, [lpfq+r10-2]
+.hv0_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -18
+ jl .hv0_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+.hv0_have_right:
+ pshufb m6, m5, m9
+ pshufb m4, m5, m10
+ paddw m8, m6, m4
+ shufps m1, m6, m4, q2121
+ pmullw m0, m1, m1
+ pshufb m3, m5, m11
+ paddw m1, m3
+ pshufb m5, m12
+ paddw m1, m5 ; sum3
+ punpcklwd m2, m3, m5
+ pmaddwd m2, m2
+ punpckhwd m3, m5
+ pmaddwd m3, m3
+ punpcklwd m5, m6, m4
+ pmaddwd m5, m5
+ punpckhwd m6, m4
+ pmaddwd m6, m6
+ punpcklwd m4, m0, m7
+ paddd m2, m4 ; sumsq3
+ punpckhwd m0, m7
+ paddd m3, m0
+ paddw m8, m1 ; sum5
+ paddd m5, m2 ; sumsq5
+ paddd m6, m3
+ mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row
+ mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd
+ mova [t3+r10*4+400*0+40], m6
+ paddw m8, [t1+r10*2+400* 0]
+ paddd m5, [t1+r10*2+400* 2]
+ paddd m6, [t1+r10*2+400* 4]
+ mova [t1+r10*2+400* 0], m8
+ mova [t1+r10*2+400* 2], m5
+ mova [t1+r10*2+400* 4], m6
+ paddw m0, m1, [t1+r10*2+400* 6]
+ paddd m4, m2, [t1+r10*2+400* 8]
+ paddd m5, m3, [t1+r10*2+400*10]
+ mova [t1+r10*2+400* 6], m1
+ mova [t1+r10*2+400* 8], m2
+ mova [t1+r10*2+400*10], m3
+ paddw m1, m0, [t2+r10*2+400* 6]
+ paddd m2, m4, [t2+r10*2+400* 8]
+ paddd m3, m5, [t2+r10*2+400*10]
+ mova [t2+r10*2+400* 6], m0
+ mova [t2+r10*2+400* 8], m4
+ mova [t2+r10*2+400*10], m5
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ pmaddwd m2, m0, m0 ; b3 * b3
+ paddd m5, m3
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ vpbroadcastd m2, [base+pd_0xf00801c7]
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m2 ; b3 * 455
+ pmaddwd m1, m2
+ paddusw m4, m2
+ paddusw m5, m2
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m4
+ psrad m4, m5, 20
+ vpgatherdd m3, [r12+m4*4], m5
+ vpbroadcastd m4, [base+pd_34816]
+ pmulld m0, m2
+ vpbroadcastd m5, [base+pd_m4096]
+ pmulld m1, m3
+ paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m4
+ pand m0, m5
+ pand m1, m5
+ por m0, m2 ; a3 | (b3 << 12)
+ por m1, m3
+ mova [t3+r10*4+400*4+ 8], xm0
+ vextracti128 [t3+r10*4+400*4+40], m0, 1
+ mova [t3+r10*4+400*4+24], xm1
+ vextracti128 [t3+r10*4+400*4+56], m1, 1
+ add r10, 16
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .hv1_main
+.hv1_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu xm5, [lpfq+r10-2]
+.hv1_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -18
+ jl .hv1_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+.hv1_have_right:
+ pshufb m6, m5, m9
+ pshufb m3, m5, m10
+ paddw m8, m6, m3
+ shufps m2, m6, m3, q2121
+ pmullw m1, m2, m2
+ pshufb m0, m5, m11
+ paddw m2, m0
+ pshufb m5, m12
+ paddw m2, m5 ; sum3
+ punpcklwd m4, m5, m0
+ pmaddwd m4, m4
+ punpckhwd m5, m0
+ pmaddwd m5, m5
+ punpcklwd m0, m6, m3
+ pmaddwd m0, m0
+ punpckhwd m6, m3
+ pmaddwd m6, m6
+ punpcklwd m3, m1, m7
+ paddd m4, m3 ; sumsq3
+ punpckhwd m1, m7
+ paddd m5, m1
+ paddw m1, m2, [t2+r10*2+400* 6]
+ mova [t2+r10*2+400* 6], m2
+ paddw m8, m2 ; sum5
+ paddd m2, m4, [t2+r10*2+400* 8]
+ paddd m3, m5, [t2+r10*2+400*10]
+ mova [t2+r10*2+400* 8], m4
+ mova [t2+r10*2+400*10], m5
+ paddd m4, m0 ; sumsq5
+ paddd m5, m6
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pslld m6, m2, 3
+ pslld m7, m3, 3
+ paddd m6, m2 ; a3 * 9
+ pmaddwd m2, m0, m0 ; b3 * b3
+ paddd m7, m3
+ pmaddwd m3, m1, m1
+ psubd m6, m2 ; p3
+ vpbroadcastd m2, [base+pd_0xf00801c7]
+ psubd m7, m3
+ pmulld m6, m14 ; p3 * s1
+ pmulld m7, m14
+ pmaddwd m0, m2 ; b3 * 455
+ pmaddwd m1, m2
+ paddusw m6, m2
+ paddusw m7, m2
+ psrad m3, m6, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m6
+ psrad m6, m7, 20
+ vpgatherdd m3, [r12+m6*4], m7
+ vpbroadcastd m6, [base+pd_34816] ; x3
+ pmulld m0, m2
+ vpbroadcastd m7, [base+pd_m4096]
+ pmulld m1, m3
+ paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m6
+ pand m0, m7
+ pand m7, m1
+ por m0, m2 ; a3 | (b3 << 12)
+ por m7, m3
+ paddw m1, m8, [t2+r10*2+400*0]
+ paddd m2, m4, [t2+r10*2+400*2]
+ paddd m3, m5, [t2+r10*2+400*4]
+ paddw m1, [t1+r10*2+400*0]
+ paddd m2, [t1+r10*2+400*2]
+ paddd m3, [t1+r10*2+400*4]
+ mova [t2+r10*2+400*0], m8
+ mova [t2+r10*2+400*2], m4
+ mova [t2+r10*2+400*4], m5
+ mova [t3+r10*4+400*8+ 8], xm0
+ vextracti128 [t3+r10*4+400*8+40], m0, 1
+ mova [t3+r10*4+400*8+24], xm7
+ vextracti128 [t3+r10*4+400*8+56], m7, 1
+ vpbroadcastd m4, [base+pd_25]
+ pxor m7, m7
+ punpcklwd m0, m1, m7 ; b5
+ punpckhwd m1, m7
+ pmulld m2, m4 ; a5 * 25
+ pmulld m3, m4
+ pmaddwd m4, m0, m0 ; b5 * b5
+ pmaddwd m5, m1, m1
+ psubd m2, m4 ; p5
+ vpbroadcastd m4, [base+pd_0xf00800a4]
+ psubd m3, m5
+ pmulld m2, m13 ; p5 * s0
+ pmulld m3, m13
+ pmaddwd m0, m4 ; b5 * 164
+ pmaddwd m1, m4
+ paddusw m2, m4
+ paddusw m3, m4
+ psrad m5, m2, 20 ; min(z5, 255) - 256
+ vpgatherdd m4, [r12+m5*4], m2 ; x5
+ psrad m2, m3, 20
+ vpgatherdd m5, [r12+m2*4], m3
+ pmulld m0, m4
+ pmulld m1, m5
+ paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m6
+ vpbroadcastd m6, [base+pd_m4096]
+ pand m0, m6
+ pand m1, m6
+ por m0, m4 ; a5 | (b5 << 12)
+ por m1, m5
+ mova [t3+r10*4+400*0+ 8], xm0
+ vextracti128 [t3+r10*4+400*0+40], m0, 1
+ mova [t3+r10*4+400*0+24], xm1
+ vextracti128 [t3+r10*4+400*0+56], m1, 1
+ add r10, 16
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+ lea r10, [wq-2]
+ vpbroadcastd m6, [base+pd_34816]
+ vpbroadcastd m8, [base+pd_m4096]
+.v0_loop:
+ mova m0, [t1+r10*2+400* 6]
+ mova m4, [t1+r10*2+400* 8]
+ mova m5, [t1+r10*2+400*10]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+r10*2+400* 6]
+ paddd m2, m4, [t2+r10*2+400* 8]
+ paddd m3, m5, [t2+r10*2+400*10]
+ mova [t2+r10*2+400* 6], m0
+ mova [t2+r10*2+400* 8], m4
+ mova [t2+r10*2+400*10], m5
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ pmaddwd m2, m0, m0 ; b3 * b3
+ paddd m5, m3
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ vpbroadcastd m2, [base+pd_0xf00801c7]
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m2 ; b3 * 455
+ pmaddwd m1, m2
+ paddusw m4, m2
+ paddusw m5, m2
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m4 ; x3
+ psrad m4, m5, 20
+ vpgatherdd m3, [r12+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m6
+ pand m0, m8
+ pand m1, m8
+ por m0, m2 ; a3 | (b3 << 12)
+ por m1, m3
+ mova m2, [t1+r10*2+400*0]
+ mova m3, [t1+r10*2+400*2]
+ mova m4, [t1+r10*2+400*4]
+ mova [t3+r10*4+400*8+ 8], m2
+ mova [t3+r10*4+400*0+ 8], m3
+ mova [t3+r10*4+400*0+40], m4
+ paddw m2, m2 ; cc5
+ paddd m3, m3
+ paddd m4, m4
+ mova [t1+r10*2+400*0], m2
+ mova [t1+r10*2+400*2], m3
+ mova [t1+r10*2+400*4], m4
+ mova [t3+r10*4+400*4+ 8], xm0
+ vextracti128 [t3+r10*4+400*4+40], m0, 1
+ mova [t3+r10*4+400*4+24], xm1
+ vextracti128 [t3+r10*4+400*4+56], m1, 1
+ add r10, 16
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-2]
+.v1_loop:
+ mova m4, [t1+r10*2+400* 6]
+ mova m5, [t1+r10*2+400* 8]
+ mova m6, [t1+r10*2+400*10]
+ paddw m1, m4, [t2+r10*2+400* 6]
+ paddd m2, m5, [t2+r10*2+400* 8]
+ paddd m3, m6, [t2+r10*2+400*10]
+ mova [t2+r10*2+400* 6], m4
+ mova [t2+r10*2+400* 8], m5
+ mova [t2+r10*2+400*10], m6
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ pmaddwd m2, m0, m0 ; b3 * b3
+ paddd m5, m3
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ vpbroadcastd m2, [base+pd_0xf00801c7]
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m2 ; b3 * 455
+ pmaddwd m1, m2
+ paddusw m4, m2
+ paddusw m5, m2
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m4 ; x3
+ psrad m4, m5, 20
+ vpgatherdd m3, [r12+m4*4], m5
+ vpbroadcastd m4, [base+pd_34816]
+ pmulld m0, m2
+ vpbroadcastd m8, [base+pd_m4096]
+ pmulld m1, m3
+ paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m4
+ pand m0, m8
+ pand m8, m1
+ por m0, m2 ; a3 | (b3 << 12)
+ por m8, m3
+ mova m4, [t3+r10*4+400*8+ 8]
+ mova m5, [t3+r10*4+400*0+ 8]
+ mova m6, [t3+r10*4+400*0+40]
+ paddw m1, m4, [t2+r10*2+400*0]
+ paddd m2, m5, [t2+r10*2+400*2]
+ paddd m3, m6, [t2+r10*2+400*4]
+ paddw m1, [t1+r10*2+400*0]
+ paddd m2, [t1+r10*2+400*2]
+ paddd m3, [t1+r10*2+400*4]
+ mova [t2+r10*2+400*0], m4
+ mova [t2+r10*2+400*2], m5
+ mova [t2+r10*2+400*4], m6
+ vpbroadcastd m4, [base+pd_25]
+ mova [t3+r10*4+400*8+ 8], xm0
+ vextracti128 [t3+r10*4+400*8+40], m0, 1
+ mova [t3+r10*4+400*8+24], xm8
+ vextracti128 [t3+r10*4+400*8+56], m8, 1
+ punpcklwd m0, m1, m7 ; b5
+ punpckhwd m1, m7
+ pmulld m2, m4 ; a5 * 25
+ pmulld m3, m4
+ pmaddwd m4, m0, m0 ; b5 * b5
+ pmaddwd m5, m1, m1
+ psubd m2, m4 ; p5
+ vpbroadcastd m4, [base+pd_0xf00800a4]
+ psubd m3, m5
+ pmulld m2, m13 ; p5 * s0
+ pmulld m3, m13
+ pmaddwd m0, m4 ; b5 * 164
+ pmaddwd m1, m4
+ paddusw m2, m4
+ paddusw m3, m4
+ psrad m5, m2, 20 ; min(z5, 255) - 256
+ vpgatherdd m4, [r12+m5*4], m2 ; x5
+ psrad m2, m3, 20
+ vpgatherdd m5, [r12+m2*4], m3
+ pmulld m0, m4
+ vpbroadcastd m6, [base+pd_34816]
+ pmulld m1, m5
+ paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m6
+ vpbroadcastd m6, [base+pd_m4096]
+ pand m0, m6
+ pand m1, m6
+ por m0, m4 ; a5 | (b5 << 12)
+ por m1, m5
+ mova [t3+r10*4+400*0+ 8], xm0
+ vextracti128 [t3+r10*4+400*0+40], m0, 1
+ mova [t3+r10*4+400*0+24], xm1
+ vextracti128 [t3+r10*4+400*0+56], m1, 1
+ add r10, 16
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t3+r10*4+400*0+4]
+ paddd m1, m0, [t3+r10*4+400*0+0]
+ mova m4, [t3+r10*4+400*4+0]
+ paddd m1, [t3+r10*4+400*0+8]
+ mova m5, [t3+r10*4+400*8+0]
+ paddd m4, [t3+r10*4+400*4+8]
+ paddd m5, [t3+r10*4+400*8+8]
+ paddd m2, m4, [t3+r10*4+400*4+4]
+ paddd m3, m5, [t3+r10*4+400*8+4]
+ paddd m0, m1
+ pslld m1, 2
+ pslld m2, 2
+ paddd m1, m0 ; ab5 565
+ paddd m3, m3 ; ab3[ 0] 222
+ psubd m2, m4 ; ab3[-1] 343
+ mova [t3+r10*4+400*20], m3
+ pandn m0, m6, m1 ; a5 565
+ mova [t3+r10*4+400*24], m2
+ psrld m1, 12 ; b5 565
+ mova [t3+r10*4+400*12], m0
+ paddd m3, m3
+ mova [t3+r10*4+400*16], m1
+ psubd m3, m5 ; ab3[ 0] 343
+ mova [t3+r10*4+400*28], m3
+ add r10, 8
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m0, [t3+r10*4+4]
+ paddd m4, m0, [t3+r10*4+0]
+ paddd m4, [t3+r10*4+8]
+ paddd m0, m4
+ pslld m4, 2
+ paddd m4, m0
+ pandn m0, m6, m4
+ psrld m4, 12
+ paddd m2, m0, [t3+r10*4+400*12] ; a5
+ mova [t3+r10*4+400*12], m0
+ paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8)
+ mova [t3+r10*4+400*16], m4
+ mova m3, [t3+r10*4+400*4+0]
+ paddd m3, [t3+r10*4+400*4+8]
+ paddd m5, m3, [t3+r10*4+400*4+4]
+ paddd m5, m5 ; ab3[ 1] 222
+ mova m4, [t3+r10*4+400*20]
+ paddd m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343
+ mova [t3+r10*4+400*20], m5
+ paddd m5, m5
+ psubd m5, m3 ; ab3[ 1] 343
+ mova [t3+r10*4+400*24], m5
+ paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343
+ pandn m3, m6, m1
+ psrld m1, 12
+ pandn m5, m6, m4
+ psrld m4, 12
+ paddd m3, m5 ; a3
+ paddd m1, m4 ; b3 + (1 << 8)
+ pmovzxbd m4, [dstq+r10]
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ psubd m0, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m1, m3 ; b3 - a3 * src + (1 << 8)
+ psrld m0, 9
+ pslld m1, 7
+ pblendw m0, m1, 0xaa
+ pmaddwd m0, m15
+ psubd m0, m6
+ psrad m0, 13
+ paddd m0, m4
+ vextracti128 xm1, m0, 1
+ packssdw xm0, xm1
+ packuswb xm0, xm0
+ movq [dstq+r10], xm0
+ add r10, 8
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m3, [t3+r10*4+400*8+0]
+ paddd m3, [t3+r10*4+400*8+8]
+ paddd m5, m3, [t3+r10*4+400*8+4]
+ paddd m5, m5 ; ab3[ 1] 222
+ mova m4, [t3+r10*4+400*20]
+ paddd m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343
+ mova [t3+r10*4+400*20], m5
+ paddd m5, m5
+ psubd m5, m3 ; ab3[ 1] 343
+ mova [t3+r10*4+400*28], m5
+ paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343
+ pandn m3, m6, m1
+ psrld m1, 12
+ pandn m5, m6, m4
+ psrld m4, 12
+ paddd m3, m5 ; -a3
+ paddd m1, m4 ; b3 + (1 << 8)
+ pmovzxbd m4, [dstq+r10]
+ pmaddwd m2, m4, [t3+r10*4+400*12] ; -a5 * src
+ mova m0, [t3+r10*4+400*16] ; b5 + (1 << 7)
+ pmaddwd m3, m4 ; -a3 * src
+ psubd m0, m2 ; a5 * src + b5 + (1 << 7)
+ psubd m1, m3 ; a3 * src + b3 + (1 << 8)
+ psrld m0, 8
+ pslld m1, 7
+ pblendw m0, m1, 0xaa
+ pmaddwd m0, m15
+ psubd m0, m6
+ psrad m0, 13
+ paddd m0, m4
+ vextracti128 xm1, m0, 1
+ packssdw xm0, xm1
+ packuswb xm0, xm0
+ movq [dstq+r10], xm0
+ add r10, 8
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/looprestoration_avx512.asm b/third_party/dav1d/src/x86/looprestoration_avx512.asm
new file mode 100644
index 0000000000..1e571774ca
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration_avx512.asm
@@ -0,0 +1,2122 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+wiener_shufA: db 1, 2, 7, 6, 3, 4, 9, 8, 5, 6, 11, 10, 7, 8, 13, 12
+wiener_shufB: db 2, 3, 8, 7, 4, 5, 10, 9, 6, 7, 12, 11, 8, 9, 14, 13
+wiener_shufC: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
+wiener_shufD: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+wiener_perm32: db 1, 9, 3, 11, 5, 13, 7, 15, 33, 41, 35, 43, 37, 45, 39, 47
+ db 17, 25, 19, 27, 21, 29, 23, 31, 49, 57, 51, 59, 53, 61, 55, 63
+sgr_shuf: db 128, 1, -1, 2,132, 3, -1, 4,136, 5, -1, 6,140, 7, -1, 8
+ db 129, 9, -1, 10,133, 11, -1, 12,137, -1, -1, -1,141, -1, 0,128
+sgr_mix_perm: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
+r_ext_mask: times 68 db -1
+ times 4 db 0
+wiener_x_shuf: db 0, 2, -1, 0
+wiener_x_add: db 0, 1,127, 0
+
+pw_61448: times 2 dw 61448
+pw_164_455: dw 164, 455
+pd_m16380: dd -16380
+pd_m4096: dd -4096
+pd_m25: dd -25
+pd_m9: dd -9
+pd_34816: dd 34816
+pd_8421376: dd 8421376
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
+
+INIT_ZMM avx512icl
+cglobal wiener_filter7_8bpc, 4, 15, 20, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt
+ mov fltq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ vbroadcasti32x4 m6, [wiener_shufA]
+ vbroadcasti32x4 m7, [wiener_shufB]
+ mov r10d, 0xfffe
+ vbroadcasti32x4 m8, [wiener_shufC]
+ vbroadcasti32x4 m9, [wiener_shufD]
+ kmovw k1, r10d
+ vpbroadcastd m0, [wiener_x_shuf]
+ vpbroadcastd m1, [wiener_x_add]
+ mov r10, 0xaaaaaaaaaaaaaaaa
+ vpbroadcastd m11, [fltq+ 0]
+ vpbroadcastd m12, [fltq+ 4]
+ kmovq k2, r10
+ vpbroadcastd m10, [pd_m16380]
+ packsswb m11, m11 ; x0 x1 x0 x1
+ vpbroadcastd m14, [fltq+16]
+ pshufb m12, m0
+ vpbroadcastd m15, [fltq+20]
+ paddb m12, m1 ; x2 x3+1 x2 127
+ vpbroadcastd m13, [pd_8421376]
+ psllw m14, 5 ; y0 y1
+ psllw m15, 5 ; y2 y3
+ cmp wd, 32 ; the minimum lr unit size for chroma in 4:2:0 is 32
+ jle .w32 ; pixels, so we need a special case for small widths
+ lea t1, [rsp+wq*2+16]
+ add lpfq, wq
+ add dstq, wq
+ neg wq
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+.v2:
+ call .v
+ jmp .v1
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm16, [leftq]
+ vmovdqu32 m16{k1}, [lpfq+r10-4]
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastb xm16, [lpfq+r10] ; the masked load ensures that no exception
+ vmovdqu32 m16{k1}, [lpfq+r10-4] ; gets raised from accessing invalid memory
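+; (k1 was set from 0xfffe, so dword 0 of this load, the 4 bytes just before
+; the start of the row, is masked off; AVX-512 suppresses faults for
+; masked-off elements, and the broadcast edge pixel fills that slot.)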
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m16, [lpfq+r10-4]
+.h_main:
+ movu m17, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -66
+ jl .h_have_right
+ push r0
+ lea r0, [r_ext_mask+65]
+ vpbroadcastb m0, [lpfq-1]
+ vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
+ vpternlogd m17, m0, [r0+r10+8], 0xe4
+ pop r0
+.h_have_right:
+ pshufb m4, m16, m6
+ mova m0, m10
+ vpdpbusd m0, m4, m11
+ pshufb m4, m16, m7
+ mova m2, m10
+ vpdpbusd m2, m4, m11
+ pshufb m4, m17, m6
+ mova m1, m10
+ vpdpbusd m1, m4, m11
+ pshufb m4, m17, m7
+ mova m3, m10
+ vpdpbusd m3, m4, m11
+ pshufb m4, m16, m8
+ vpdpbusd m0, m4, m12
+ pshufb m16, m9
+ vpdpbusd m2, m16, m12
+ pshufb m4, m17, m8
+ vpdpbusd m1, m4, m12
+ pshufb m17, m9
+ vpdpbusd m3, m17, m12
+ packssdw m0, m2
+ packssdw m1, m3
+ psraw m0, 3
+ psraw m1, 3
+ mova [t1+r10*2+ 0], m0
+ mova [t1+r10*2+64], m1
+ add r10, 64
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm16, [leftq]
+ vmovdqu32 m16{k1}, [lpfq+r10-4]
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastb xm16, [lpfq+r10]
+ vmovdqu32 m16{k1}, [lpfq+r10-4]
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m16, [lpfq+r10-4]
+.hv_main:
+ movu m17, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -66
+ jl .hv_have_right
+ push r0
+ lea r0, [r_ext_mask+65]
+ vpbroadcastb m0, [lpfq-1]
+ vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
+ vpternlogd m17, m0, [r0+r10+8], 0xe4
+ pop r0
+.hv_have_right:
+ pshufb m4, m16, m6
+ mova m0, m10
+ vpdpbusd m0, m4, m11
+ pshufb m4, m16, m7
+ mova m2, m10
+ vpdpbusd m2, m4, m11
+ pshufb m4, m17, m6
+ mova m1, m10
+ vpdpbusd m1, m4, m11
+ pshufb m4, m17, m7
+ mova m3, m10
+ vpdpbusd m3, m4, m11
+ pshufb m4, m16, m8
+ vpdpbusd m0, m4, m12
+ pshufb m16, m9
+ vpdpbusd m2, m16, m12
+ pshufb m4, m17, m8
+ vpdpbusd m1, m4, m12
+ pshufb m17, m9
+ vpdpbusd m3, m17, m12
+ packssdw m0, m2
+ packssdw m1, m3
+ psraw m0, 3
+ psraw m1, 3
+ mova m16, [t4+r10*2]
+ paddw m16, [t2+r10*2]
+ mova m3, [t3+r10*2]
+ mova m17, [t4+r10*2+64]
+ paddw m17, [t2+r10*2+64]
+ mova m5, [t3+r10*2+64]
+ punpcklwd m4, m16, m3
+ mova m2, m13
+ vpdpwssd m2, m4, m15
+ punpcklwd m18, m17, m5
+ mova m4, m13
+ vpdpwssd m4, m18, m15
+ punpckhwd m16, m3
+ mova m3, m13
+ vpdpwssd m3, m16, m15
+ punpckhwd m17, m5
+ mova m5, m13
+ vpdpwssd m5, m17, m15
+ mova m17, [t5+r10*2]
+ paddw m17, [t1+r10*2]
+ paddw m16, m0, [t6+r10*2]
+ mova m19, [t5+r10*2+64]
+ paddw m19, [t1+r10*2+64]
+ paddw m18, m1, [t6+r10*2+64]
+ mova [t0+r10*2+ 0], m0
+ mova [t0+r10*2+64], m1
+ punpcklwd m0, m16, m17
+ vpdpwssd m2, m0, m14
+ punpcklwd m1, m18, m19
+ vpdpwssd m4, m1, m14
+ punpckhwd m16, m17
+ vpdpwssd m3, m16, m14
+ punpckhwd m18, m19
+ vpdpwssd m5, m18, m14
+ packuswb m2, m4
+ psrlw m2, 8
+ vpackuswb m2{k2}, m3, m5
+ movu [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap
+ add r10, 64 ; function is used for chroma as well, and in some
+ jl .hv_loop ; esoteric edge cases chroma dst pointers may only
+ mov t6, t5 ; have a 32-byte alignment despite having a width
+ mov t5, t4 ; larger than 32, so use an unaligned store here.
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m4, [t4+r10*2+ 0]
+ paddw m4, [t2+r10*2+ 0]
+ mova m1, [t3+r10*2+ 0]
+ mova m5, [t4+r10*2+64]
+ paddw m5, [t2+r10*2+64]
+ mova m3, [t3+r10*2+64]
+ punpcklwd m6, m4, m1
+ mova m0, m13
+ vpdpwssd m0, m6, m15
+ punpcklwd m6, m5, m3
+ mova m2, m13
+ vpdpwssd m2, m6, m15
+ punpckhwd m4, m1
+ mova m1, m13
+ vpdpwssd m1, m4, m15
+ punpckhwd m5, m3
+ mova m3, m13
+ vpdpwssd m3, m5, m15
+ mova m5, [t1+r10*2+ 0]
+ paddw m4, m5, [t6+r10*2+ 0]
+ paddw m5, [t5+r10*2+ 0]
+ mova m7, [t1+r10*2+64]
+ paddw m6, m7, [t6+r10*2+64]
+ paddw m7, [t5+r10*2+64]
+ punpcklwd m8, m4, m5
+ vpdpwssd m0, m8, m14
+ punpcklwd m8, m6, m7
+ vpdpwssd m2, m8, m14
+ punpckhwd m4, m5
+ vpdpwssd m1, m4, m14
+ punpckhwd m6, m7
+ vpdpwssd m3, m6, m14
+ packuswb m0, m2
+ psrlw m0, 8
+ vpackuswb m0{k2}, m1, m3
+ movu [dstq+r10], m0
+ add r10, 64
+ jl .v_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+ ret
+.w32:
+ lea r10, [r_ext_mask+73]
+ mova ym18, [wiener_perm32]
+ lea t1, [rsp+16]
+ sub r10, wq
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .w32_no_top
+ call .w32_h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 32*2
+ call .w32_h_top
+ lea r9, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 32*2
+ add r9, strideq
+ mov [rsp], r9 ; below
+ call .w32_h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .w32_v1
+ add lpfq, strideq
+ add t1, 32*2
+ call .w32_h
+ mov t2, t1
+ dec hd
+ jz .w32_v2
+ add lpfq, strideq
+ add t1, 32*2
+ call .w32_h
+ dec hd
+ jz .w32_v3
+.w32_main:
+ lea t0, [t1+32*2]
+.w32_main_loop:
+ call .w32_hv
+ dec hd
+ jnz .w32_main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .w32_v3
+ mov lpfq, [rsp]
+ call .w32_hv_bottom
+ add lpfq, strideq
+ call .w32_hv_bottom
+.w32_v1:
+ call .w32_v
+ RET
+.w32_no_top:
+ lea r9, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r9, [r9+strideq*2]
+ mov [rsp], r9
+ call .w32_h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .w32_v1
+ add lpfq, strideq
+ add t1, 32*2
+ call .w32_h
+ mov t2, t1
+ dec hd
+ jz .w32_v2
+ add lpfq, strideq
+ add t1, 32*2
+ call .w32_h
+ dec hd
+ jz .w32_v3
+ lea t0, [t1+32*2]
+ call .w32_hv
+ dec hd
+ jz .w32_v3
+ add t0, 32*8
+ call .w32_hv
+ dec hd
+ jnz .w32_main
+.w32_v3:
+ call .w32_v
+.w32_v2:
+ call .w32_v
+ jmp .w32_v1
+.w32_h:
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .w32_h_extend_left
+ movd xm16, [leftq]
+ vmovdqu32 ym16{k1}, [lpfq-4]
+ add leftq, 4
+ jmp .w32_h_main
+.w32_h_extend_left:
+ vpbroadcastb xm16, [lpfq] ; the masked load ensures that no exception
+ vmovdqu32 ym16{k1}, [lpfq-4] ; gets raised from accessing invalid memory
+ jmp .w32_h_main
+.w32_h_top:
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .w32_h_extend_left
+ movu ym16, [lpfq-4]
+.w32_h_main:
+ vinserti32x8 m16, [lpfq+4], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .w32_h_have_right
+ vpbroadcastb m0, [lpfq+wq-1]
+ movu ym17, [r10-8]
+ vinserti32x8 m17, [r10+0], 1
+ vpternlogd m16, m0, m17, 0xe4 ; c ? a : b
+.w32_h_have_right:
+ pshufb m2, m16, m6
+ mova m0, m10
+ vpdpbusd m0, m2, m11
+ pshufb m2, m16, m7
+ mova m1, m10
+ vpdpbusd m1, m2, m11
+ pshufb m2, m16, m8
+ vpdpbusd m0, m2, m12
+ pshufb m16, m9
+ vpdpbusd m1, m16, m12
+ packssdw m0, m1
+ psraw m0, 3
+ mova [t1], m0
+ ret
+.w32_hv:
+ add lpfq, strideq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .w32_hv_extend_left
+ movd xm16, [leftq]
+ vmovdqu32 ym16{k1}, [lpfq-4]
+ add leftq, 4
+ jmp .w32_hv_main
+.w32_hv_extend_left:
+ vpbroadcastb xm16, [lpfq]
+ vmovdqu32 ym16{k1}, [lpfq-4]
+ jmp .w32_hv_main
+.w32_hv_bottom:
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .w32_hv_extend_left
+ movu ym16, [lpfq-4]
+.w32_hv_main:
+ vinserti32x8 m16, [lpfq+4], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .w32_hv_have_right
+ vpbroadcastb m0, [lpfq+wq-1]
+ movu ym17, [r10-8]
+ vinserti32x8 m17, [r10+0], 1
+ vpternlogd m16, m0, m17, 0xe4
+.w32_hv_have_right:
+ mova m3, [t4]
+ paddw m3, [t2]
+ mova m2, [t3]
+ pshufb m4, m16, m6
+ mova m0, m10
+ vpdpbusd m0, m4, m11
+ pshufb m4, m16, m7
+ mova m5, m10
+ vpdpbusd m5, m4, m11
+ punpcklwd m4, m3, m2
+ mova m1, m13
+ vpdpwssd m1, m4, m15
+ punpckhwd m3, m2
+ mova m2, m13
+ vpdpwssd m2, m3, m15
+ pshufb m4, m16, m8
+ vpdpbusd m0, m4, m12
+ pshufb m16, m9
+ vpdpbusd m5, m16, m12
+ packssdw m0, m5
+ psraw m0, 3
+ mova m4, [t5]
+ paddw m4, [t1]
+ paddw m3, m0, [t6]
+ mova [t0], m0
+ punpcklwd m0, m3, m4
+ vpdpwssd m1, m0, m14
+ punpckhwd m3, m4
+ vpdpwssd m2, m3, m14
+ packuswb m1, m2
+ vpermb m16, m18, m1
+ mova [dstq], ym16
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, strideq
+ ret
+.w32_v:
+ mova m2, [t4]
+ paddw m2, [t2]
+ mova m1, [t3]
+ mova m4, [t1]
+ paddw m3, m4, [t6]
+ paddw m4, [t5]
+ punpcklwd m5, m2, m1
+ mova m0, m13
+ vpdpwssd m0, m5, m15
+ punpckhwd m2, m1
+ mova m1, m13
+ vpdpwssd m1, m2, m15
+ punpcklwd m2, m3, m4
+ vpdpwssd m0, m2, m14
+ punpckhwd m3, m4
+ vpdpwssd m1, m3, m14
+ packuswb m0, m1
+ vpermb m16, m18, m0
+ mova [dstq], ym16
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+ ret
+
+cglobal sgr_filter_5x5_8bpc, 4, 13, 23, 416*24+16, dst, stride, left, lpf, \
+ w, h, edge, params
+ mov paramsq, r6mp
+ mov wd, wm
+ mov hd, hm
+ mov edged, r7m
+ vbroadcasti32x4 m5, [sgr_shuf+1]
+ add lpfq, wq
+ vbroadcasti32x4 m6, [sgr_shuf+9]
+ add dstq, wq
+ vbroadcasti32x4 m7, [sgr_shuf+3]
+ lea t3, [rsp+wq*4+16+416*12]
+ vbroadcasti32x4 m8, [sgr_shuf+7]
+ pxor m4, m4
+ vpbroadcastd m9, [pd_m25]
+ vpsubd m11, m4, [paramsq+0] {1to16} ; -s0
+ vpbroadcastw m15, [paramsq+8] ; w0
+ lea t1, [rsp+wq*2+20]
+ vpbroadcastd m10, [pw_164_455]
+ neg wq
+ vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3)
+ mov r10d, 0xfe
+ vpbroadcastd m13, [pd_m4096]
+ kmovb k1, r10d
+ vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15)
+ mov r10, 0x3333333333333333
+ mova m18, [sgr_x_by_x+64*0]
+ kmovq k2, r10
+ mova m19, [sgr_x_by_x+64*1]
+ lea r12, [r_ext_mask+75]
+ mova m20, [sgr_x_by_x+64*2]
+ psllw m15, 4
+ mova m21, [sgr_x_by_x+64*3]
+ lea r10, [lpfq+strideq*4]
+ mova ym22, [sgr_shuf]
+ add r10, strideq
+ mov [rsp], r10 ; below
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call .top_fixup
+ add t1, 416*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ test hd, hd
+ jz .odd_height
+ call .h
+ add lpfq, strideq
+ call .hv
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .h_top
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+416*6]
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ jmp .main
+.no_top_height1:
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.h: ; horizontal boxsum
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu ym17, [lpfq+r10-2]
+.h_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r12+r10-8]
+ vinserti32x8 m16, [r12+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.h_have_right:
+ pshufb m3, m17, m5
+ pmullw m2, m3, m3
+ pshufb m1, m17, m6
+ paddw m0, m3, m1
+ shufps m3, m1, q2121
+ paddw m0, m3
+ punpcklwd m16, m3, m1
+ punpckhwd m3, m1
+ punpcklwd m1, m2, m4
+ vpdpwssd m1, m16, m16
+ punpckhwd m2, m4
+ vpdpwssd m2, m3, m3
+ pshufb m16, m17, m7
+ paddw m0, m16
+ pshufb m17, m8
+ paddw m0, m17 ; sum
+ punpcklwd m3, m16, m17
+ vpdpwssd m1, m3, m3 ; sumsq
+ punpckhwd m16, m17
+ vpdpwssd m2, m16, m16
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+r10*2+416*0]
+ paddd m1, [t1+r10*2+416*2]
+ paddd m2, [t1+r10*2+416*4]
+.h_loop_end:
+ mova [t1+r10*2+416*0], m0
+ mova [t1+r10*2+416*2], m1
+ mova [t1+r10*2+416*4], m2
+ add r10, 32
+ jl .h_loop
+ ret
+.top_fixup:
+ lea r10, [wq-2]
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+r10*2+416*0]
+ mova m1, [t1+r10*2+416*2]
+ mova m2, [t1+r10*2+416*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+r10*2+416*0], m0
+ mova [t2+r10*2+416*2], m1
+ mova [t2+r10*2+416*4], m2
+ add r10, 32
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu ym17, [lpfq+r10-2]
+.hv_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -34
+ jl .hv_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r12+r10-8]
+ vinserti32x8 m16, [r12+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.hv_have_right:
+ pshufb m1, m17, m5
+ pmullw m3, m1, m1
+ pshufb m2, m17, m6
+ paddw m0, m1, m2
+ shufps m1, m2, q2121
+ paddw m0, m1
+ punpcklwd m16, m1, m2
+ punpckhwd m1, m2
+ punpcklwd m2, m3, m4
+ vpdpwssd m2, m16, m16
+ punpckhwd m3, m4
+ vpdpwssd m3, m1, m1
+ pshufb m16, m17, m7
+ paddw m0, m16
+ pshufb m17, m8
+ paddw m0, m17 ; h sum
+ punpcklwd m1, m16, m17
+ vpdpwssd m2, m1, m1 ; h sumsq
+ punpckhwd m16, m17
+ vpdpwssd m3, m16, m16
+ paddw m1, m0, [t1+r10*2+416*0]
+ paddd m16, m2, [t1+r10*2+416*2]
+ paddd m17, m3, [t1+r10*2+416*4]
+ test hd, hd
+ jz .hv_last_row
+.hv_main2:
+ paddd m16, [t2+r10*2+416*2] ; hv sumsq
+ paddd m17, [t2+r10*2+416*4]
+ paddw m1, [t2+r10*2+416*0] ; hv sum
+ mova [t0+r10*2+416*2], m2
+ mova [t0+r10*2+416*4], m3
+ mova [t0+r10*2+416*0], m0
+ pmulld m16, m9 ; -a * 25
+ pmulld m17, m9
+ punpcklwd m0, m1, m4 ; b
+ vpdpwssd m16, m0, m0 ; -p
+ punpckhwd m1, m4
+ vpdpwssd m17, m1, m1
+ pmaddwd m0, m10 ; b * 164
+ pmaddwd m1, m10
+ pmulld m16, m11 ; p * s
+ pmulld m17, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
+ vpternlogd m17, m1, m13, 0xd8
+ mova [t3+r10*4+ 8], m16 ; The neighbor calculations require
+ mova [t3+r10*4+ 24], xm17 ; 13 bits for a and 21 bits for b.
+ vextracti32x4 [t3+r10*4+ 56], m17, 2 ; Packing them allows for 12+20, but
+ mova [t3+r10*4+ 72], m17 ; that gets us most of the way.
+ vextracti128 [t3+r10*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+104], m16, 3
+ add r10, 32
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+r10*2+416*0], m1
+ paddw m1, m0
+ mova [t1+r10*2+416*2], m16
+ paddd m16, m2
+ mova [t1+r10*2+416*4], m17
+ paddd m17, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+ lea r10, [wq-2]
+.v_loop:
+ mova m2, [t1+r10*2+416*2]
+ paddd m16, m2, [t2+r10*2+416*2]
+ mova m3, [t1+r10*2+416*4]
+ paddd m17, m3, [t2+r10*2+416*4]
+ paddd m2, m2
+ paddd m3, m3
+ paddd m16, m2 ; hv sumsq
+ paddd m17, m3
+ pmulld m16, m9 ; -a * 25
+ pmulld m17, m9
+ mova m0, [t1+r10*2+416*0]
+ paddw m1, m0, [t2+r10*2+416*0]
+ paddw m0, m0
+ paddw m1, m0 ; hv sum
+ punpcklwd m0, m1, m4 ; b
+ vpdpwssd m16, m0, m0 ; -p
+ punpckhwd m1, m4
+ vpdpwssd m17, m1, m1
+ pmaddwd m0, m10 ; b * 164
+ pmaddwd m1, m10
+ pmulld m16, m11 ; p * s
+ pmulld m17, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
+ vpternlogd m17, m1, m13, 0xd8
+ mova [t3+r10*4+ 8], m16
+ mova [t3+r10*4+ 24], xm17
+ vextracti32x4 [t3+r10*4+ 56], m17, 2
+ mova [t3+r10*4+ 72], m17
+ vextracti128 [t3+r10*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+104], m16, 3
+ add r10, 32
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t3+r10*4+ 4]
+ movu m1, [t3+r10*4+68]
+ paddd m2, m0, [t3+r10*4+ 0]
+ paddd m3, m1, [t3+r10*4+64]
+ paddd m2, [t3+r10*4+ 8]
+ paddd m3, [t3+r10*4+72]
+ paddd m0, m2
+ pslld m2, 2
+ paddd m1, m3
+ pslld m3, 2
+ paddd m2, m0 ; ab 565
+ paddd m3, m1
+ pandn m0, m13, m2 ; a
+ psrld m2, 12 ; b
+ pandn m1, m13, m3
+ psrld m3, 12
+ mova [t3+r10*4+416*4+ 0], m0
+ mova [t3+r10*4+416*8+ 0], m2
+ mova [t3+r10*4+416*4+64], m1
+ mova [t3+r10*4+416*8+64], m3
+ add r10, 32
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m16, [t3+r10*4+ 4]
+ movu m17, [t3+r10*4+68]
+ paddd m0, m16, [t3+r10*4+ 0]
+ paddd m1, m17, [t3+r10*4+64]
+ paddd m0, [t3+r10*4+ 8]
+ paddd m1, [t3+r10*4+72]
+ paddd m16, m0
+ pslld m0, 2
+ paddd m17, m1
+ pslld m1, 2
+ paddd m0, m16
+ paddd m1, m17
+ pandn m16, m13, m0
+ psrld m0, 12
+ pandn m17, m13, m1
+ psrld m1, 12
+ paddd m2, m16, [t3+r10*4+416*4+ 0] ; a
+ paddd m3, m17, [t3+r10*4+416*4+64]
+ mova [t3+r10*4+416*4+ 0], m16
+ mova [t3+r10*4+416*4+64], m17
+ paddd m16, m0, [t3+r10*4+416*8+ 0] ; b + (1 << 8)
+ paddd m17, m1, [t3+r10*4+416*8+64]
+ mova [t3+r10*4+416*8+ 0], m0
+ mova [t3+r10*4+416*8+64], m1
+ pmovzxbd m0, [dstq+r10+ 0]
+ pmovzxbd m1, [dstq+r10+16]
+ pmaddwd m2, m0 ; a * src
+ pmaddwd m3, m1
+ packssdw m0, m1
+ psubd m16, m2 ; b - a * src + (1 << 8)
+ psubd m17, m3
+ psrad m16, 9
+ psrad m17, 9
+ packssdw m16, m17
+ pmulhrsw m16, m15
+ paddw m16, m0
+ packuswb m16, m16
+ vpermd m16, m22, m16
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ pmovzxbd m0, [dstq+r10+ 0]
+ pmovzxbd m1, [dstq+r10+16]
+ pmaddwd m2, m0, [t3+r10*4+416*4+ 0] ; a * src
+ pmaddwd m3, m1, [t3+r10*4+416*4+64]
+ mova m16, [t3+r10*4+416*8+ 0] ; b + (1 << 7)
+ mova m17, [t3+r10*4+416*8+64]
+ packssdw m0, m1
+ psubd m16, m2 ; b - a * src + (1 << 7)
+ psubd m17, m3
+ psrad m16, 8
+ psrad m17, 8
+ packssdw m16, m17
+ pmulhrsw m16, m15
+ paddw m16, m0
+ packuswb m16, m16
+ vpermd m16, m22, m16
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+cglobal sgr_filter_3x3_8bpc, 4, 15, 22, -416*28-16, dst, stride, left, lpf, \
+ w, h, edge, params
+ mov paramsq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ vbroadcasti32x4 m5, [sgr_shuf+3]
+ add lpfq, wq
+ vbroadcasti32x4 m6, [sgr_shuf+5]
+ add dstq, wq
+ vbroadcasti32x4 m7, [sgr_shuf+7]
+ pxor m4, m4
+ vpbroadcastd m8, [pd_m9]
+ vpsubd m11, m4, [paramsq+4] {1to16} ; -s1
+ vpbroadcastw m15, [paramsq+10] ; w1
+ lea t1, [rsp+wq*2+20]
+ vpbroadcastd m10, [pw_164_455]
+ lea t3, [rsp+wq*4+16+416*12]
+ vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3)
+ neg wq
+ vpbroadcastd m13, [pd_m4096]
+ mov r10d, 0xfe
+ vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15)
+ kmovb k1, r10d
+ mova m18, [sgr_x_by_x+64*0]
+ mov r10, 0x3333333333333333
+ mova m19, [sgr_x_by_x+64*1]
+ kmovq k2, r10
+ mova m20, [sgr_x_by_x+64*2]
+ psllw m15, 4
+ mova m21, [sgr_x_by_x+64*3]
+ lea r14, [r_ext_mask+75]
+ mova ym9, [sgr_shuf]
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ add t1, 416*6
+ call .h_top
+ lea t4, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add t4, strideq
+ mov [rsp], t4 ; below
+ mov t0, t2
+ call .hv
+.main:
+ mov t5, t3
+ add t3, 416*4
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ dec hd
+ jz .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv
+ call .n
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv_bottom
+ call .n
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n
+ RET
+.height1:
+ call .v
+ call .prep_n
+ mov t2, t1
+ call .v
+ jmp .end
+.extend_bottom:
+ call .v
+ call .n
+ mov t2, t1
+ call .v
+ jmp .end
+.no_top:
+ lea t4, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea t4, [t4+strideq*2]
+ mov [rsp], t4
+ call .h
+ lea t0, [t1+416*6]
+ mov t2, t1
+ call .v
+ jmp .main
+.h: ; horizontal boxsum
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu ym17, [lpfq+r10-2]
+.h_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -33
+ jl .h_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r14+r10-8]
+ vinserti32x8 m16, [r14+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.h_have_right:
+ pshufb m0, m17, m5
+ pmullw m2, m0, m0
+ pshufb m16, m17, m6
+ paddw m0, m16
+ pshufb m17, m7
+ paddw m0, m17 ; sum
+ punpcklwd m3, m16, m17
+ punpcklwd m1, m2, m4
+ vpdpwssd m1, m3, m3 ; sumsq
+ punpckhwd m16, m17
+ punpckhwd m2, m4
+ vpdpwssd m2, m16, m16
+ mova [t1+r10*2+416*0], m0
+ mova [t1+r10*2+416*2], m1
+ mova [t1+r10*2+416*4], m2
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu ym17, [lpfq+r10-2]
+.hv_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -33
+ jl .hv_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r14+r10-8]
+ vinserti32x8 m16, [r14+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.hv_have_right:
+ pshufb m0, m17, m5
+ pmullw m3, m0, m0
+ pshufb m1, m17, m6
+ paddw m0, m1
+ pshufb m17, m7
+ paddw m0, m17 ; h sum
+ punpcklwd m16, m17, m1
+ punpcklwd m2, m3, m4
+ vpdpwssd m2, m16, m16 ; h sumsq
+ punpckhwd m17, m1
+ punpckhwd m3, m4
+ vpdpwssd m3, m17, m17
+ paddw m1, m0, [t2+r10*2+416*0]
+ paddw m1, [t1+r10*2+416*0] ; hv sum
+ paddd m16, m2, [t2+r10*2+416*2]
+ paddd m17, m3, [t2+r10*2+416*4]
+ paddd m16, [t1+r10*2+416*2] ; hv sumsq
+ paddd m17, [t1+r10*2+416*4]
+ mova [t0+r10*2+416*0], m0
+ mova [t0+r10*2+416*2], m2
+ mova [t0+r10*2+416*4], m3
+ pmulld m16, m8 ; -a * 9
+ pmulld m17, m8
+ punpcklwd m0, m4, m1 ; b
+ vpdpwssd m16, m0, m0 ; -p
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ pmulld m16, m11 ; p * s
+ pmulld m17, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
+ vpternlogd m17, m1, m13, 0xd8
+ mova [t3+r10*4+ 8], m16
+ mova [t3+r10*4+ 24], xm17
+ vextracti32x4 [t3+r10*4+ 56], m17, 2
+ mova [t3+r10*4+ 72], m17
+ vextracti128 [t3+r10*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+104], m16, 3
+ add r10, 32
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.v: ; vertical boxsum + ab
+ lea r10, [wq-2]
+.v_loop:
+ mova m16, [t1+r10*2+416*2]
+ mova m17, [t1+r10*2+416*4]
+ paddd m16, m16
+ paddd m17, m17
+ paddd m16, [t2+r10*2+416*2] ; hv sumsq
+ paddd m17, [t2+r10*2+416*4]
+ pmulld m16, m8 ; -a * 9
+ pmulld m17, m8
+ mova m1, [t1+r10*2+416*0]
+ paddw m1, m1
+ paddw m1, [t2+r10*2+416*0] ; hv sum
+ punpcklwd m0, m4, m1 ; b
+ vpdpwssd m16, m0, m0 ; -p
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ pmulld m16, m11 ; p * s
+ pmulld m17, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
+ vpternlogd m17, m1, m13, 0xd8
+ mova [t3+r10*4+ 8], m16
+ mova [t3+r10*4+ 24], xm17
+ vextracti32x4 [t3+r10*4+ 56], m17, 2
+ mova [t3+r10*4+ 72], m17
+ vextracti128 [t3+r10*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+104], m16, 3
+ add r10, 32
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+ mov t4, t3
+ add t3, 416*4
+.prep_n_loop:
+ mova m2, [t5+r10*4+0]
+ mova m3, [t4+r10*4+0]
+ paddd m2, [t5+r10*4+8]
+ paddd m3, [t4+r10*4+8]
+ paddd m0, m2, [t5+r10*4+4]
+ paddd m1, m3, [t4+r10*4+4]
+ pslld m0, 2
+ paddd m1, m1 ; ab[ 0] 222
+ psubd m0, m2 ; ab[-1] 343
+ mova [t3+r10*4+416*4], m1
+ paddd m1, m1
+ mova [t5+r10*4], m0
+ psubd m1, m3 ; ab[ 0] 343
+ mova [t4+r10*4], m1
+ add r10, 16
+ jl .prep_n_loop
+ ret
+; a and b are packed together in a single dword, but we can't do the
+; full neighbor calculations before splitting them since we don't
+; have sufficient precision. The solution is to do the calculations
+; in two equal halves and split a and b before doing the final sum.
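+; (in this function the splitting mask is m13, loaded from pd_m4096 above;
+; pandn with m13 extracts a from the low 12 bits and psrld by 12 recovers
+; b in the .n loop below.)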
+ALIGN function_align
+.n: ; neighbor + output
+ mov r10, wq
+.n_loop:
+ mova m16, [t3+r10*4+ 0]
+ paddd m16, [t3+r10*4+ 8]
+ paddd m17, m16, [t3+r10*4+ 4]
+ paddd m17, m17 ; ab[+1] 222
+ mova m2, [t3+r10*4+416*4+ 0]
+ paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
+ mova m3, [t3+r10*4+416*4+64]
+ paddd m1, m3, [t5+r10*4+64]
+ mova [t3+r10*4+416*4+ 0], m17
+ paddd m17, m17
+ psubd m17, m16 ; ab[+1] 343
+ mova [t5+r10*4+ 0], m17
+ paddd m2, m17 ; ab[ 0] 222 + ab[+1] 343
+ mova m16, [t3+r10*4+64]
+ paddd m16, [t3+r10*4+72]
+ paddd m17, m16, [t3+r10*4+68]
+ paddd m17, m17
+ mova [t3+r10*4+416*4+64], m17
+ paddd m17, m17
+ psubd m17, m16
+ mova [t5+r10*4+64], m17
+ pandn m16, m13, m0
+ psrld m0, 12
+ paddd m3, m17
+ pandn m17, m13, m2
+ psrld m2, 12
+ paddd m16, m17 ; a
+ pandn m17, m13, m1
+ psrld m1, 12
+ paddd m0, m2 ; b + (1 << 8)
+ pandn m2, m13, m3
+ psrld m3, 12
+ paddd m17, m2
+ pmovzxbd m2, [dstq+r10+ 0]
+ paddd m1, m3
+ pmovzxbd m3, [dstq+r10+16]
+ pmaddwd m16, m2 ; a * src
+ pmaddwd m17, m3
+ packssdw m2, m3
+ psubd m0, m16 ; b - a * src + (1 << 8)
+ psubd m1, m17
+ psrad m0, 9
+ psrad m1, 9
+ packssdw m0, m1
+ pmulhrsw m0, m15
+ paddw m0, m2
+ packuswb m0, m0
+ vpermd m16, m9, m0
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n_loop
+ mov r10, t5
+ mov t5, t4
+ mov t4, r10
+ add dstq, strideq
+ ret
+
+cglobal sgr_filter_mix_8bpc, 4, 13, 28, 416*56+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ mov paramsq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ vbroadcasti128 m5, [sgr_shuf+1]
+ add lpfq, wq
+ vbroadcasti128 m6, [sgr_shuf+9]
+ add dstq, wq
+ vbroadcasti128 m7, [sgr_shuf+3]
+ lea t3, [rsp+wq*4+416*24+8]
+ vbroadcasti128 m8, [sgr_shuf+7]
+ pxor m4, m4
+ vpbroadcastd m9, [pd_m9]
+ vpsubd m11, m4, [paramsq+0] {1to16} ; -s0
+ vpbroadcastd m14, [pw_61448]
+ vpsubd m12, m4, [paramsq+4] {1to16} ; -s1
+ vpbroadcastd m26, [paramsq+8] ; w0 w1
+ lea t1, [rsp+wq*2+12]
+ vpbroadcastd m10, [pd_m25]
+ neg wq
+ vpbroadcastd m13, [pw_164_455]
+ mov r10d, 0xfe
+ vpbroadcastd m15, [pd_34816]
+ kmovb k1, r10d
+ mova m20, [sgr_x_by_x+64*0]
+ mov r10, 0x3333333333333333
+ mova m21, [sgr_x_by_x+64*1]
+ kmovq k2, r10
+ mova m22, [sgr_x_by_x+64*2]
+ lea r12, [r_ext_mask+75]
+ mova m23, [sgr_x_by_x+64*3]
+ vpbroadcastd m24, [pd_m4096]
+ vpbroadcastd m25, [sgr_shuf+28] ; 0x8000____
+ psllw m26, 5
+ mova xm27, [sgr_mix_perm]
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx512icl).top_fixup
+ add t1, 416*12
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+416*12]
+ lea r10, [wq-2]
+.top_fixup_loop:
+ mova m0, [t1+r10*2+416* 0]
+ mova m1, [t1+r10*2+416* 2]
+ mova m2, [t1+r10*2+416* 4]
+ paddw m0, m0
+ mova m3, [t1+r10*2+416* 6]
+ paddd m1, m1
+ mova m16, [t1+r10*2+416* 8]
+ paddd m2, m2
+ mova m17, [t1+r10*2+416*10]
+ mova [t2+r10*2+416* 0], m0
+ mova [t2+r10*2+416* 2], m1
+ mova [t2+r10*2+416* 4], m2
+ mova [t2+r10*2+416* 6], m3
+ mova [t2+r10*2+416* 8], m16
+ mova [t2+r10*2+416*10], m17
+ add r10, 32
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsums
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu ym17, [lpfq+r10-2]
+.h_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r12+r10-8]
+ vinserti32x8 m16, [r12+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.h_have_right:
+ pshufb m3, m17, m5
+ pshufb m18, m17, m6
+ shufps m0, m3, m18, q2121
+ pmullw m2, m0, m0
+ pshufb m19, m17, m7
+ paddw m0, m19
+ pshufb m17, m8
+ paddw m0, m17 ; sum3
+ punpcklwd m16, m19, m17
+ punpcklwd m1, m2, m4
+ vpdpwssd m1, m16, m16 ; sumsq3
+ punpckhwd m19, m17
+ punpckhwd m2, m4
+ vpdpwssd m2, m19, m19
+ mova [t1+r10*2+416* 6], m0
+ mova [t1+r10*2+416* 8], m1
+ mova [t1+r10*2+416*10], m2
+ punpcklwd m19, m3, m18
+ paddw m0, m3
+ vpdpwssd m1, m19, m19 ; sumsq5
+ punpckhwd m3, m18
+ paddw m0, m18 ; sum5
+ vpdpwssd m2, m3, m3
+ mova [t1+r10*2+416* 0], m0
+ mova [t1+r10*2+416* 2], m1
+ mova [t1+r10*2+416* 4], m2
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .hv0_main
+.hv0_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu ym17, [lpfq+r10-2]
+.hv0_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -34
+ jl .hv0_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r12+r10-8]
+ vinserti32x8 m16, [r12+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.hv0_have_right:
+ pshufb m18, m17, m5
+ pshufb m19, m17, m6
+ shufps m1, m18, m19, q2121
+ pmullw m3, m1, m1
+ pshufb m0, m17, m7
+ paddw m1, m0
+ pshufb m17, m8
+ paddw m1, m17 ; sum3
+ punpcklwd m16, m0, m17
+ punpcklwd m2, m3, m4
+ vpdpwssd m2, m16, m16 ; sumsq3
+ punpckhwd m0, m17
+ punpckhwd m3, m4
+ vpdpwssd m3, m0, m0
+ paddw m0, m1, [t1+r10*2+416* 6]
+ paddd m16, m2, [t1+r10*2+416* 8]
+ paddd m17, m3, [t1+r10*2+416*10]
+ mova [t1+r10*2+416* 6], m1
+ mova [t1+r10*2+416* 8], m2
+ mova [t1+r10*2+416*10], m3
+ paddw m1, m18
+ paddw m1, m19 ; sum5
+ mova [t3+r10*4+416*8+ 8], m1
+ paddw m1, [t1+r10*2+416* 0]
+ mova [t1+r10*2+416* 0], m1
+ punpcklwd m1, m18, m19
+ vpdpwssd m2, m1, m1 ; sumsq5
+ punpckhwd m18, m19
+ vpdpwssd m3, m18, m18
+ mova [t3+r10*4+416*0+ 8], m2 ; we need a clean copy of the last row
+ mova [t3+r10*4+416*0+72], m3 ; in case height is odd
+ paddd m2, [t1+r10*2+416* 2]
+ paddd m3, [t1+r10*2+416* 4]
+ mova [t1+r10*2+416* 2], m2
+ mova [t1+r10*2+416* 4], m3
+ paddw m1, m0, [t2+r10*2+416* 6]
+ paddd m2, m16, [t2+r10*2+416* 8]
+ paddd m3, m17, [t2+r10*2+416*10]
+ mova [t2+r10*2+416* 6], m0
+ mova [t2+r10*2+416* 8], m16
+ mova [t2+r10*2+416*10], m17
+ pmulld m16, m2, m9 ; -a3 * 9
+ pmulld m17, m3, m9
+ punpcklwd m0, m4, m1 ; b3
+ vpdpwssd m16, m0, m0 ; -p3
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ pmulld m16, m12 ; p3 * s1
+ pmulld m17, m12
+ pmaddwd m0, m13 ; b3 * 455
+ pmaddwd m1, m13
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m22
+ paddusw m17, m14
+ psraw m17, 4 ; min(z3, 255) - 256
+ vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x3
+ pandn m16, m24, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m15
+ vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
+ vpternlogd m17, m1, m24, 0xd8
+ mova [t3+r10*4+416*4+ 8], m16
+ mova [t3+r10*4+416*4+ 24], xm17
+ vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
+ mova [t3+r10*4+416*4+ 72], m17
+ vextracti128 [t3+r10*4+416*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+416*4+104], m16, 3
+ add r10, 32
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .hv1_main
+.hv1_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu ym17, [lpfq+r10-2]
+.hv1_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -34
+ jl .hv1_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r12+r10-8]
+ vinserti32x8 m16, [r12+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.hv1_have_right:
+ pshufb m3, m17, m5
+ pshufb m19, m17, m6
+ shufps m2, m3, m19, q2121
+ pmullw m1, m2, m2
+ pshufb m18, m17, m7
+ paddw m2, m18
+ pshufb m17, m8
+ paddw m2, m17 ; sum3
+ punpcklwd m16, m17, m18
+ punpcklwd m0, m1, m4
+ vpdpwssd m0, m16, m16 ; sumsq3
+ punpckhwd m17, m18
+ punpckhwd m1, m4
+ vpdpwssd m1, m17, m17
+ paddd m16, m0, [t2+r10*2+416* 8]
+ paddd m17, m1, [t2+r10*2+416*10]
+ mova [t2+r10*2+416* 8], m0
+ mova [t2+r10*2+416*10], m1
+ punpcklwd m18, m3, m19
+ vpdpwssd m0, m18, m18 ; sumsq5
+ punpckhwd m18, m3, m19
+ vpdpwssd m1, m18, m18
+ paddw m3, m19
+ pmulld m16, m9 ; -a3 * 9
+ pmulld m17, m9
+ paddd m18, m0, [t2+r10*2+416*2]
+ paddd m19, m1, [t2+r10*2+416*4]
+ paddd m18, [t1+r10*2+416*2]
+ paddd m19, [t1+r10*2+416*4]
+ mova [t2+r10*2+416*2], m0
+ mova [t2+r10*2+416*4], m1
+ pmulld m18, m10 ; -a5 * 25
+ pmulld m19, m10
+ paddw m1, m2, [t2+r10*2+416* 6]
+ mova [t2+r10*2+416* 6], m2
+ paddw m2, m3 ; sum5
+ paddw m3, m2, [t2+r10*2+416*0]
+ paddw m3, [t1+r10*2+416*0]
+ mova [t2+r10*2+416*0], m2
+ punpcklwd m0, m4, m1 ; b3
+ vpdpwssd m16, m0, m0 ; -p3
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ punpcklwd m2, m3, m4 ; b5
+ vpdpwssd m18, m2, m2 ; -p5
+ punpckhwd m3, m4
+ vpdpwssd m19, m3, m3
+ pmulld m16, m12 ; p3 * s1
+ pmulld m17, m12
+ pmulld m18, m11 ; p5 * s0
+ pmulld m19, m11
+ pmaddwd m0, m13 ; b3 * 455
+ pmaddwd m1, m13
+ pmaddwd m2, m13 ; b5 * 164
+ pmaddwd m3, m13
+ vpalignr m17{k2}, m16, m16, 2
+ vpalignr m19{k2}, m18, m18, 2
+ paddusw m17, m14
+ mova m16, m22
+ psraw m17, 4 ; min(z3, 255) - 256
+ vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
+ paddusw m19, m14
+ mova m18, m22
+ psraw m19, 4 ; min(z5, 255) - 256
+ vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k4, m19
+ vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x3
+ vmovdqu8 m19{k4}, m18 ; x5
+ pandn m16, m24, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ pandn m18, m24, m19
+ psrld m19, 16
+ pmulld m2, m18
+ pmulld m3, m19
+ paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m15
+ vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
+ vpternlogd m17, m1, m24, 0xd8
+ mova [t3+r10*4+416*8+ 8], m16
+ mova [t3+r10*4+416*8+ 24], xm17
+ vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
+ paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m3, m15
+ mova [t3+r10*4+416*8+ 72], m17
+ vextracti128 [t3+r10*4+416*8+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+416*8+104], m16, 3
+ vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
+ vpternlogd m19, m3, m24, 0xd8
+ mova [t3+r10*4+416*0+ 8], m18
+ mova [t3+r10*4+416*0+ 24], xm19
+ vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
+ mova [t3+r10*4+416*0+ 72], m19
+ vextracti128 [t3+r10*4+416*0+ 72], ym18, 1
+ vextracti32x4 [t3+r10*4+416*0+104], m18, 3
+ add r10, 32
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+ lea r10, [wq-2]
+.v0_loop:
+ mova m2, [t1+r10*2+416* 8]
+ mova m3, [t1+r10*2+416*10]
+ paddd m2, m2
+ paddd m3, m3
+ paddd m16, m2, [t2+r10*2+416* 8]
+ paddd m17, m3, [t2+r10*2+416*10]
+ mova m0, [t1+r10*2+416* 6]
+ paddw m0, m0
+ paddw m1, m0, [t2+r10*2+416* 6]
+ pmulld m16, m9 ; -a3 * 9
+ pmulld m17, m9
+ mova [t2+r10*2+416* 6], m0
+ mova [t2+r10*2+416* 8], m2
+ mova [t2+r10*2+416*10], m3
+ mova m2, [t1+r10*2+416*0]
+ mova m3, [t1+r10*2+416*2]
+ mova m18, [t1+r10*2+416*4]
+ punpcklwd m0, m4, m1 ; b3
+ vpdpwssd m16, m0, m0 ; -p3
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ pmulld m16, m12 ; p3 * s1
+ pmulld m17, m12
+ pmaddwd m0, m13 ; b3 * 455
+ pmaddwd m1, m13
+ mova [t3+r10*4+416*8+ 8], m2
+ mova [t3+r10*4+416*0+ 8], m3
+ mova [t3+r10*4+416*0+72], m18
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m22
+ paddusw m17, m14
+ psraw m17, 4 ; min(z3, 255) - 256
+ vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x3
+ pandn m16, m24, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddw m2, m2 ; cc5
+ paddd m3, m3
+ paddd m18, m18
+ mova [t1+r10*2+416*0], m2
+ mova [t1+r10*2+416*2], m3
+ mova [t1+r10*2+416*4], m18
+ paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m15
+ vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
+ vpternlogd m17, m1, m24, 0xd8
+ mova [t3+r10*4+416*4+ 8], m16
+ mova [t3+r10*4+416*4+ 24], xm17
+ vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
+ mova [t3+r10*4+416*4+ 72], m17
+ vextracti128 [t3+r10*4+416*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+416*4+104], m16, 3
+ add r10, 32
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-2]
+.v1_loop:
+ mova m0, [t1+r10*2+416* 8]
+ paddd m16, m0, [t2+r10*2+416* 8]
+ mova m1, [t1+r10*2+416*10]
+ paddd m17, m1, [t2+r10*2+416*10]
+ mova m2, [t3+r10*4+416*0+ 8]
+ paddd m18, m2, [t2+r10*2+416* 2]
+ mova m3, [t3+r10*4+416*0+72]
+ paddd m19, m3, [t2+r10*2+416* 4]
+ paddd m18, [t1+r10*2+416* 2]
+ paddd m19, [t1+r10*2+416* 4]
+ mova [t2+r10*2+416* 8], m0
+ mova [t2+r10*2+416*10], m1
+ mova [t2+r10*2+416* 2], m2
+ mova [t2+r10*2+416* 4], m3
+ pmulld m16, m9 ; -a3 * 9
+ pmulld m17, m9
+ pmulld m18, m10 ; -a5 * 25
+ pmulld m19, m10
+ mova m0, [t1+r10*2+416* 6]
+ paddw m1, m0, [t2+r10*2+416* 6]
+ mova m2, [t3+r10*4+416*8+ 8]
+ paddw m3, m2, [t2+r10*2+416*0]
+ paddw m3, [t1+r10*2+416*0]
+ mova [t2+r10*2+416* 6], m0
+ mova [t2+r10*2+416*0], m2
+ punpcklwd m0, m4, m1 ; b3
+ vpdpwssd m16, m0, m0 ; -p3
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ punpcklwd m2, m3, m4 ; b5
+ vpdpwssd m18, m2, m2 ; -p5
+ punpckhwd m3, m4
+ vpdpwssd m19, m3, m3
+ pmulld m16, m12 ; p3 * s1
+ pmulld m17, m12
+ pmulld m18, m11 ; p5 * s0
+ pmulld m19, m11
+ pmaddwd m0, m13 ; b3 * 455
+ pmaddwd m1, m13
+ pmaddwd m2, m13 ; b5 * 164
+ pmaddwd m3, m13
+ vpalignr m17{k2}, m16, m16, 2
+ vpalignr m19{k2}, m18, m18, 2
+ paddusw m17, m14
+ mova m16, m22
+ psraw m17, 4 ; min(z3, 255) - 256
+ vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
+ paddusw m19, m14
+ mova m18, m22
+ psraw m19, 4 ; min(z5, 255) - 256
+ vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k4, m19
+ vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x3
+ vmovdqu8 m19{k4}, m18 ; x5
+ pandn m16, m24, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ pandn m18, m24, m19
+ psrld m19, 16
+ pmulld m2, m18
+ pmulld m3, m19
+ paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m15
+ vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
+ vpternlogd m17, m1, m24, 0xd8
+ mova [t3+r10*4+416*8+ 8], m16
+ mova [t3+r10*4+416*8+ 24], xm17
+ vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
+ paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m3, m15
+ mova [t3+r10*4+416*8+ 72], m17
+ vextracti128 [t3+r10*4+416*8+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+416*8+104], m16, 3
+ vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
+ vpternlogd m19, m3, m24, 0xd8
+ mova [t3+r10*4+416*0+ 8], m18
+ mova [t3+r10*4+416*0+ 24], xm19
+ vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
+ mova [t3+r10*4+416*0+ 72], m19
+ vextracti128 [t3+r10*4+416*0+ 72], ym18, 1
+ vextracti32x4 [t3+r10*4+416*0+104], m18, 3
+ add r10, 32
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t3+r10*4+416*0+4]
+ paddd m1, m0, [t3+r10*4+416*0+0]
+ mova m16, [t3+r10*4+416*4+0]
+ paddd m1, [t3+r10*4+416*0+8]
+ mova m17, [t3+r10*4+416*8+0]
+ paddd m16, [t3+r10*4+416*4+8]
+ paddd m17, [t3+r10*4+416*8+8]
+ paddd m2, m16, [t3+r10*4+416*4+4]
+ paddd m3, m17, [t3+r10*4+416*8+4]
+ paddd m0, m1
+ pslld m1, 2
+ pslld m2, 2
+ paddd m1, m0 ; ab5 565
+ paddd m3, m3 ; ab3[ 0] 222
+ psubd m2, m16 ; ab3[-1] 343
+ mova [t3+r10*4+416*20], m3
+ pandn m0, m24, m1 ; a5 565
+ mova [t3+r10*4+416*24], m2
+ psrld m1, 12 ; b5 565
+ mova [t3+r10*4+416*12], m0
+ paddd m3, m3
+ mova [t3+r10*4+416*16], m1
+ psubd m3, m17 ; ab3[ 0] 343
+ mova [t3+r10*4+416*28], m3
+ add r10, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m2, [t3+r10*4+4]
+ paddd m3, m2, [t3+r10*4+0]
+ paddd m3, [t3+r10*4+8]
+ mova m1, [t3+r10*4+416*4+0]
+ paddd m2, m3
+ pslld m3, 2
+ paddd m1, [t3+r10*4+416*4+8]
+ paddd m3, m2
+ pandn m2, m24, m3
+ psrld m3, 12
+ paddd m0, m2, [t3+r10*4+416*12] ; a5
+ paddd m16, m3, [t3+r10*4+416*16] ; b5 + (1 << 8)
+ mova [t3+r10*4+416*12], m2
+ mova [t3+r10*4+416*16], m3
+ paddd m2, m1, [t3+r10*4+416*4+4]
+ paddd m2, m2 ; ab3[ 1] 222
+ mova m3, [t3+r10*4+416*20]
+ paddd m17, m3, [t3+r10*4+416*24] ; ab3[ 0] 222 + ab3[-1] 343
+ mova [t3+r10*4+416*20], m2
+ paddd m2, m2
+ psubd m2, m1 ; ab3[ 1] 343
+ mova [t3+r10*4+416*24], m2
+ paddd m2, m3 ; ab3[ 0] 222 + ab3[ 1] 343
+ pandn m1, m24, m17
+ psrld m17, 12
+ pandn m3, m24, m2
+ psrld m2, 12
+ paddd m1, m3 ; a3
+ pmovzxbd m3, [dstq+r10]
+ paddd m17, m2 ; b3 + (1 << 8)
+ pmaddwd m0, m3 ; a5 * src
+ pmaddwd m1, m3 ; a3 * src
+ vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15)
+ psubd m16, m0 ; b5 - a5 * src + (1 << 8)
+ psubd m17, m1 ; b3 - a3 * src + (1 << 8)
+ psrld m16, 9
+ pslld m17, 7
+ vmovdqu8 m17{k2}, m16
+ vpdpwssd m3, m17, m26
+ packuswb m3, m2
+ vpermb m16, m27, m3
+ mova [dstq+r10], xm16
+ add r10, 16
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m1, [t3+r10*4+416*8+0]
+ paddd m1, [t3+r10*4+416*8+8]
+ paddd m2, m1, [t3+r10*4+416*8+4]
+ paddd m2, m2 ; ab3[ 1] 222
+ mova m0, [t3+r10*4+416*20]
+ paddd m17, m0, [t3+r10*4+416*28] ; ab3[ 0] 222 + ab3[-1] 343
+ pmovzxbd m3, [dstq+r10]
+ mova [t3+r10*4+416*20], m2
+ paddd m2, m2
+ psubd m2, m1 ; ab3[ 1] 343
+ mova [t3+r10*4+416*28], m2
+ paddd m0, m2 ; ab3[ 0] 222 + ab3[ 1] 343
+ pandn m1, m24, m17
+ psrld m17, 12
+ pandn m2, m24, m0
+ psrld m0, 12
+ paddd m1, m2 ; a3
+ paddd m17, m0 ; b3 + (1 << 8)
+ mova m16, [t3+r10*4+416*16] ; b5 + (1 << 7)
+ pmaddwd m1, m3 ; a3 * src
+ pmaddwd m0, m3, [t3+r10*4+416*12] ; a5 * src
+ vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15)
+ psubd m17, m1 ; b3 - a3 * src + (1 << 8)
+ psubd m16, m0 ; b5 - a5 * src + (1 << 7)
+ pslld m17, 7
+ palignr m17{k2}, m16, m16, 1
+ vpdpwssd m3, m17, m26
+ packuswb m3, m3
+ vpermb m16, m27, m3
+ mova [dstq+r10], xm16
+ add r10, 16
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/looprestoration_sse.asm b/third_party/dav1d/src/x86/looprestoration_sse.asm
new file mode 100644
index 0000000000..01eb6fa348
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration_sse.asm
@@ -0,0 +1,3681 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2018, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+wiener_init: db 6, 7, 6, 7, 6, 7, 6, 7, 0, 0, 0, 0, 2, 4, 2, 4
+wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
+wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
+wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1
+wiener_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+sgr_lshuf3: db 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
+sgr_lshuf5: db 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+pb_right_ext_mask: times 24 db 0xff
+ times 8 db 0
+pb_1: times 16 db 1
+pb_3: times 16 db 3
+pw_256: times 8 dw 256
+pw_2056: times 8 dw 2056
+pw_m16380: times 8 dw -16380
+pd_4096: times 4 dd 4096
+pd_34816: times 4 dd 34816
+pd_0xffff: times 4 dd 0xffff
+pd_0xf00800a4: times 4 dd 0xf00800a4
+pd_0xf00801c7: times 4 dd 0xf00801c7
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+%macro movif64 2 ; dst, src
+ %if ARCH_X86_64
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro movif32 2 ; dst, src
+ %if ARCH_X86_32
+ mov %1, %2
+ %endif
+%endmacro
+
+%if ARCH_X86_32
+ %define PIC_base_offset $$
+
+ %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
+ %assign pic_reg_stk_off 4
+ %xdefine PIC_reg %1
+ %if %2 == 1
+ mov [esp], %1
+ %endif
+ LEA PIC_reg, PIC_base_offset
+ %if %3 == 1
+ XCHG_PIC_REG
+ %endif
+ %endmacro
+
+ %macro XCHG_PIC_REG 0
+ mov [esp+pic_reg_stk_off], PIC_reg
+ %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
+ mov PIC_reg, [esp+pic_reg_stk_off]
+ %endmacro
+
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+
+%else
+ %macro XCHG_PIC_REG 0
+ %endmacro
+
+ %define PIC_sym(sym) (sym)
+%endif
+
+%macro WIENER 0
+%if ARCH_X86_64
+DECLARE_REG_TMP 9, 7, 10, 11, 12, 13, 14 ; ring buffer pointers
+cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt, x
+ %define tmpstrideq strideq
+ %define base 0
+ mov fltq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ movq m14, [fltq]
+ add lpfq, wq
+ movq m7, [fltq+16]
+ add dstq, wq
+ lea t1, [rsp+wq*2+16]
+ mova m15, [pw_2056]
+ neg wq
+%if cpuflag(ssse3)
+ pshufb m14, [wiener_init]
+ mova m8, [wiener_shufA]
+ pshufd m12, m14, q2222 ; x0 x0
+ mova m9, [wiener_shufB]
+ pshufd m13, m14, q3333 ; x1 x2
+ mova m10, [wiener_shufC]
+ punpcklqdq m14, m14 ; x3
+ mova m11, [wiener_shufD]
+%else
+ mova m10, [pw_m16380]
+ punpcklwd m14, m14
+ pshufd m11, m14, q0000 ; x0
+ pshufd m12, m14, q1111 ; x1
+ pshufd m13, m14, q2222 ; x2
+ pshufd m14, m14, q3333 ; x3
+%endif
+%else
+DECLARE_REG_TMP 4, 0, _, 5
+%if cpuflag(ssse3)
+ %define m10 [base+wiener_shufC]
+ %define m11 [base+wiener_shufD]
+ %define stk_off 96
+%else
+ %define m10 [base+pw_m16380]
+ %define m11 [stk+96]
+ %define stk_off 112
+%endif
+cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstride
+ %define base r6-pb_right_ext_mask-21
+ %define stk esp
+ %define dstq leftq
+ %define edgeb byte edged
+ %define edged [stk+ 8]
+ %define dstmp [stk+12]
+ %define hd dword [stk+16]
+ %define wq [stk+20]
+ %define strideq [stk+24]
+ %define leftmp [stk+28]
+ %define t2 [stk+32]
+ %define t4 [stk+36]
+ %define t5 [stk+40]
+ %define t6 [stk+44]
+ %define m8 [base+wiener_shufA]
+ %define m9 [base+wiener_shufB]
+ %define m12 [stk+48]
+ %define m13 [stk+64]
+ %define m14 [stk+80]
+ %define m15 [base+pw_2056]
+ mov r1, r6m ; flt
+ mov r0, r0m ; dst
+ mov r4, r4m ; w
+ mov lpfq, lpfm
+ mov r2, r7m ; edge
+ mov r5, r5m ; h
+ movq m3, [r1+ 0]
+ movq m7, [r1+16]
+ add r0, r4
+ mov r1, r1m ; stride
+ add lpfq, r4
+ mov edged, r2
+ mov r2, r2m ; left
+ mov dstmp, r0
+ lea t1, [rsp+r4*2+stk_off]
+ mov hd, r5
+ neg r4
+ LEA r6, pb_right_ext_mask+21
+ mov wq, r4
+ mov strideq, r1
+ mov leftmp, r2
+ mov r4, r1
+%if cpuflag(ssse3)
+ pshufb m3, [base+wiener_init]
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q3333
+ punpcklqdq m3, m3
+%else
+ punpcklwd m3, m3
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m11, m0
+%endif
+ mova m12, m1
+ mova m13, m2
+ mova m14, m3
+%endif
+ psllw m7, 5
+ pshufd m6, m7, q0000 ; y0 y1
+ pshufd m7, m7, q1111 ; y2 y3
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea t3, [lpfq+tmpstrideq*4]
+ mov lpfq, dstmp
+ add t3, tmpstrideq
+ mov [rsp], t3 ; below
+ mov t4, t1
+ add t1, 384*2
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+ RET
+.no_top:
+ lea t3, [lpfq+tmpstrideq*4]
+ mov lpfq, dstmp
+ lea t3, [t3+tmpstrideq*2]
+ mov [rsp], t3
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+.v2:
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+ jmp .v1
+.extend_right:
+ movd m2, [lpfq-4]
+%if ARCH_X86_64
+ push r0
+ lea r0, [pb_right_ext_mask+21]
+ movu m0, [r0+xq+0]
+ movu m1, [r0+xq+8]
+ pop r0
+%else
+ movu m0, [r6+xq+0]
+ movu m1, [r6+xq+8]
+%endif
+%if cpuflag(ssse3)
+ pshufb m2, [base+pb_3]
+%else
+ punpcklbw m2, m2
+ pshuflw m2, m2, q3333
+ punpcklqdq m2, m2
+%endif
+ pand m4, m0
+ pand m5, m1
+ pandn m0, m2
+ pandn m1, m2
+ por m4, m0
+ por m5, m1
+ ret
+.h:
+ %define stk esp+4 ; offset due to call
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .h_main
+.h_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, [base+wiener_l_shuf]
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .h_main
+.h_top:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+xq-4]
+.h_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp xd, -18
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+%macro %%h7 0
+%if cpuflag(ssse3)
+ pshufb m0, m4, m8
+ pmaddubsw m0, m12
+ pshufb m1, m5, m8
+ pmaddubsw m1, m12
+ pshufb m2, m4, m9
+ pmaddubsw m2, m13
+ pshufb m3, m5, m9
+ pmaddubsw m3, m13
+ paddw m0, m2
+ pshufb m2, m4, m10
+ pmaddubsw m2, m13
+ paddw m1, m3
+ pshufb m3, m5, m10
+ pmaddubsw m3, m13
+ pshufb m4, m11
+ paddw m0, m2
+ pmullw m2, m14, m4
+ pshufb m5, m11
+ paddw m1, m3
+ pmullw m3, m14, m5
+ psllw m4, 7
+ psllw m5, 7
+ paddw m0, m2
+ mova m2, [base+pw_m16380]
+ paddw m1, m3
+ paddw m4, m2
+ paddw m5, m2
+ paddsw m0, m4
+ paddsw m1, m5
+%else
+ psrldq m0, m4, 1
+ pslldq m1, m4, 1
+ pxor m3, m3
+ punpcklbw m0, m3
+ punpckhbw m1, m3
+ paddw m0, m1
+ pmullw m0, m11
+ psrldq m1, m4, 2
+ pslldq m2, m4, 2
+ punpcklbw m1, m3
+ punpckhbw m2, m3
+ paddw m1, m2
+ pmullw m1, m12
+ paddw m0, m1
+ pshufd m2, m4, q0321
+ punpcklbw m2, m3
+ pmullw m1, m14, m2
+ paddw m0, m1
+ psrldq m1, m4, 3
+ pslldq m4, 3
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m13
+ paddw m0, m1
+ psllw m2, 7
+ paddw m2, m10
+ paddsw m0, m2
+ psrldq m1, m5, 1
+ pslldq m2, m5, 1
+ punpcklbw m1, m3
+ punpckhbw m2, m3
+ paddw m1, m2
+ pmullw m1, m11
+ psrldq m2, m5, 2
+ pslldq m4, m5, 2
+ punpcklbw m2, m3
+ punpckhbw m4, m3
+ paddw m2, m4
+ pmullw m2, m12
+ paddw m1, m2
+ pshufd m4, m5, q0321
+ punpcklbw m4, m3
+ pmullw m2, m14, m4
+ paddw m1, m2
+ psrldq m2, m5, 3
+ pslldq m5, 3
+ punpcklbw m2, m3
+ punpckhbw m5, m3
+ paddw m2, m5
+ pmullw m2, m13
+ paddw m1, m2
+ psllw m4, 7
+ paddw m4, m10
+ paddsw m1, m4
+%endif
+%endmacro
+ %%h7
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+ mova [t1+xq*2+ 0], m0
+ mova [t1+xq*2+16], m1
+ add xq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .hv_main
+.hv_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, [base+wiener_l_shuf]
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .hv_main
+.hv_bottom:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+xq-4]
+.hv_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp xd, -18
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ %%h7
+%if ARCH_X86_64
+ mova m2, [t4+xq*2]
+ paddw m2, [t2+xq*2]
+%else
+ mov r2, t4
+ mova m2, [r2+xq*2]
+ mov r2, t2
+ paddw m2, [r2+xq*2]
+ mov r2, t5
+%endif
+ mova m3, [t3+xq*2]
+%if ARCH_X86_64
+ mova m5, [t5+xq*2]
+%else
+ mova m5, [r2+xq*2]
+ mov r2, t6
+%endif
+ paddw m5, [t1+xq*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+%if ARCH_X86_64
+ paddw m4, m0, [t6+xq*2]
+%else
+ paddw m4, m0, [r2+xq*2]
+ mov r2, t4
+%endif
+ mova [t0+xq*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m0, m3
+ mova m3, [t3+xq*2+16]
+ paddd m4, m2
+%if ARCH_X86_64
+ mova m2, [t4+xq*2+16]
+ paddw m2, [t2+xq*2+16]
+ mova m5, [t5+xq*2+16]
+%else
+ mova m2, [r2+xq*2+16]
+ mov r2, t2
+ paddw m2, [r2+xq*2+16]
+ mov r2, t5
+ mova m5, [r2+xq*2+16]
+ mov r2, t6
+%endif
+ paddw m5, [t1+xq*2+16]
+ packuswb m0, m4
+%if ARCH_X86_64
+ paddw m4, m1, [t6+xq*2+16]
+%else
+ paddw m4, m1, [r2+xq*2+16]
+ mov dstq, dstmp
+%endif
+ mova [t0+xq*2+16], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .hv_loop
+ add dstq, strideq
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+%else
+ mov dstmp, dstq
+ mov r1, t5
+ mov r2, t4
+ mov t6, r1
+ mov t5, r2
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, r1
+%endif
+ ret
+%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code
+.v:
+ mov xq, wq
+.v_loop:
+%if ARCH_X86_64
+ mova m1, [t4+xq*2]
+ paddw m1, [t2+xq*2]
+%else
+ mov r2, t4
+ mova m1, [r2+xq*2]
+ mov r2, t2
+ paddw m1, [r2+xq*2]
+ mov r2, t6
+%endif
+ mova m2, [t3+xq*2]
+ mova m4, [t1+xq*2]
+%if ARCH_X86_64
+ paddw m3, m4, [t6+xq*2]
+ paddw m4, [t5+xq*2]
+%else
+ paddw m3, m4, [r2+xq*2]
+ mov r2, t5
+ paddw m4, [r2+xq*2]
+ mov r2, t4
+%endif
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m7
+ punpckhwd m1, m2
+ pmaddwd m1, m7
+ punpcklwd m2, m3, m4
+ pmaddwd m2, m6
+ punpckhwd m3, m4
+ pmaddwd m3, m6
+ paddd m0, m2
+ paddd m1, m3
+%if ARCH_X86_64
+ mova m2, [t4+xq*2+16]
+ paddw m2, [t2+xq*2+16]
+%else
+ mova m2, [r2+xq*2+16]
+ mov r2, t2
+ paddw m2, [r2+xq*2+16]
+ mov r2, t6
+%endif
+ mova m3, [t3+xq*2+16]
+ mova m5, [t1+xq*2+16]
+%if ARCH_X86_64
+ paddw m4, m5, [t6+xq*2+16]
+ paddw m5, [t5+xq*2+16]
+%else
+ paddw m4, m5, [r2+xq*2+16]
+ mov r2, t5
+ paddw m5, [r2+xq*2+16]
+ movifnidn dstq, dstmp
+%endif
+ packuswb m0, m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .v_loop
+ add dstq, strideq
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+%else
+ mov dstmp, dstq
+ mov r1, t5
+ mov r2, t4
+ mov t6, r1
+ mov t5, r2
+%endif
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ ret
+%endif
+
+%if ARCH_X86_64
+cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt, x
+ mov fltq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ movq m14, [fltq]
+ add lpfq, wq
+ movq m7, [fltq+16]
+ add dstq, wq
+ mova m8, [pw_m16380]
+ lea t1, [rsp+wq*2+16]
+ mova m15, [pw_2056]
+ neg wq
+%if cpuflag(ssse3)
+ pshufb m14, [wiener_init]
+ mova m9, [wiener_shufB]
+ pshufd m13, m14, q3333 ; x1 x2
+ mova m10, [wiener_shufC]
+ punpcklqdq m14, m14 ; x3
+ mova m11, [wiener_shufD]
+ mova m12, [wiener_l_shuf]
+%else
+ punpcklwd m14, m14
+ pshufd m11, m14, q1111 ; x1
+ pshufd m13, m14, q2222 ; x2
+ pshufd m14, m14, q3333 ; x3
+%endif
+%else
+%if cpuflag(ssse3)
+ %define stk_off 80
+%else
+ %define m11 [stk+80]
+ %define stk_off 96
+%endif
+cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, tmpstride
+ %define stk esp
+ %define leftmp [stk+28]
+ %define m8 [base+pw_m16380]
+ %define m12 [base+wiener_l_shuf]
+ %define m14 [stk+48]
+ mov r1, r6m ; flt
+ mov r0, r0m ; dst
+ mov r4, r4m ; w
+ mov lpfq, lpfm
+ mov r2, r7m ; edge
+ mov r5, r5m ; h
+ movq m2, [r1+ 0]
+ movq m7, [r1+16]
+ add r0, r4
+ mov r1, r1m ; stride
+ add lpfq, r4
+ mov edged, r2
+ mov r2, r2m ; left
+ mov dstmp, r0
+ lea t1, [rsp+r4*2+stk_off]
+ mov hd, r5
+ neg r4
+ LEA r6, pb_right_ext_mask+21
+ mov wq, r4
+ mov strideq, r1
+ mov leftmp, r2
+ mov r4, r1
+%if cpuflag(ssse3)
+ pshufb m2, [base+wiener_init]
+ pshufd m1, m2, q3333
+ punpcklqdq m2, m2
+%else
+ punpcklwd m2, m2
+ pshufd m0, m2, q1111
+ pshufd m1, m2, q2222
+ pshufd m2, m2, q3333
+ mova m11, m0
+%endif
+ mova m13, m1
+ mova m14, m2
+%endif
+ psllw m7, 5
+ pshufd m6, m7, q0000 ; __ y1
+ pshufd m7, m7, q1111 ; y2 y3
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea xq, [lpfq+tmpstrideq*4]
+ mov lpfq, dstmp
+ mov t3, t1
+ add t1, 384*2
+ add xq, tmpstrideq
+ mov [rsp], xq ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea t3, [lpfq+tmpstrideq*4]
+ mov lpfq, dstmp
+ lea t3, [t3+tmpstrideq*2]
+ mov [rsp], t3
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
+ add dstq, strideq
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ movifnidn dstmp, dstq
+.v1:
+ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
+ jmp .end
+.h:
+ %define stk esp+4
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .h_main
+.h_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, m12
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .h_main
+.h_top:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+xq-4]
+.h_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp xd, -17
+ jl .h_have_right
+ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
+.h_have_right:
+%macro %%h5 0
+%if cpuflag(ssse3)
+ pshufb m0, m4, m9
+ pmaddubsw m0, m13
+ pshufb m1, m5, m9
+ pmaddubsw m1, m13
+ pshufb m2, m4, m10
+ pmaddubsw m2, m13
+ pshufb m3, m5, m10
+ pmaddubsw m3, m13
+ pshufb m4, m11
+ paddw m0, m2
+ pmullw m2, m14, m4
+ pshufb m5, m11
+ paddw m1, m3
+ pmullw m3, m14, m5
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m8
+ paddw m5, m8
+ paddw m0, m2
+ paddw m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+%else
+ psrldq m0, m4, 2
+ pslldq m1, m4, 2
+ pxor m3, m3
+ punpcklbw m0, m3
+ punpckhbw m1, m3
+ paddw m0, m1
+ pmullw m0, m11
+ pshufd m2, m4, q0321
+ punpcklbw m2, m3
+ pmullw m1, m14, m2
+ paddw m0, m1
+ psrldq m1, m4, 3
+ pslldq m4, 3
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m13
+ paddw m0, m1
+ psllw m2, 7
+ paddw m2, m8
+ paddsw m0, m2
+ psrldq m1, m5, 2
+ pslldq m4, m5, 2
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m11
+ pshufd m4, m5, q0321
+ punpcklbw m4, m3
+ pmullw m2, m14, m4
+ paddw m1, m2
+ psrldq m2, m5, 3
+ pslldq m5, 3
+ punpcklbw m2, m3
+ punpckhbw m5, m3
+ paddw m2, m5
+ pmullw m2, m13
+ paddw m1, m2
+ psllw m4, 7
+ paddw m4, m8
+ paddsw m1, m4
+%endif
+%endmacro
+ %%h5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+ mova [t1+xq*2+ 0], m0
+ mova [t1+xq*2+16], m1
+ add xq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .hv_main
+.hv_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, m12
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .hv_main
+.hv_bottom:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+xq-4]
+.hv_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp xd, -17
+ jl .hv_have_right
+ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
+.hv_have_right:
+ %%h5
+ mova m2, [t3+xq*2]
+ paddw m2, [t1+xq*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+%if ARCH_X86_64
+ mova m3, [t2+xq*2]
+ paddw m4, m0, [t4+xq*2]
+%else
+ mov r2, t2
+ mova m3, [r2+xq*2]
+ mov r2, t4
+ paddw m4, m0, [r2+xq*2]
+%endif
+ mova [t0+xq*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m0, m3
+ paddd m4, m2
+ mova m2, [t3+xq*2+16]
+ paddw m2, [t1+xq*2+16]
+ packuswb m0, m4
+%if ARCH_X86_64
+ mova m3, [t2+xq*2+16]
+ paddw m4, m1, [t4+xq*2+16]
+%else
+ paddw m4, m1, [r2+xq*2+16]
+ mov r2, t2
+ mova m3, [r2+xq*2+16]
+ mov dstq, dstmp
+%endif
+ mova [t0+xq*2+16], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .hv_loop
+ add dstq, strideq
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ movifnidn dstmp, dstq
+ ret
+%if cpuflag(ssse3)
+.v:
+ mov xq, wq
+.v_loop:
+ mova m3, [t1+xq*2]
+ paddw m1, m3, [t3+xq*2]
+%if ARCH_X86_64
+ mova m2, [t2+xq*2]
+ paddw m3, [t4+xq*2]
+%else
+ mov r2, t2
+ mova m2, [r2+xq*2]
+ mov r2, t4
+ paddw m3, [r2+xq*2]
+%endif
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m7
+ punpckhwd m1, m2
+ pmaddwd m1, m7
+ punpcklwd m2, m3
+ pmaddwd m2, m6
+ punpckhwd m3, m3
+ pmaddwd m3, m6
+ paddd m0, m2
+ paddd m1, m3
+ mova m4, [t1+xq*2+16]
+ paddw m2, m4, [t3+xq*2+16]
+%if ARCH_X86_64
+ mova m3, [t2+xq*2+16]
+ paddw m4, [t4+xq*2+16]
+%else
+ paddw m4, [r2+xq*2+16]
+ mov r2, t2
+ mova m3, [r2+xq*2+16]
+ mov dstq, dstmp
+%endif
+ packuswb m0, m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .v_loop
+ ret
+%endif
+%endmacro
+
+INIT_XMM sse2
+WIENER
+
+INIT_XMM ssse3
+WIENER
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; self-guided ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%macro GATHERDD 3 ; dst, src, tmp
+ movd %3d, %2
+ %if ARCH_X86_64
+ movd %1, [r13+%3]
+ pextrw %3d, %2, 2
+ pinsrw %1, [r13+%3+2], 3
+ pextrw %3d, %2, 4
+ pinsrw %1, [r13+%3+2], 5
+ pextrw %3d, %2, 6
+ pinsrw %1, [r13+%3+2], 7
+ %else
+ movd %1, [base+sgr_x_by_x-0xf03+%3]
+ pextrw %3, %2, 2
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3
+ pextrw %3, %2, 4
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5
+ pextrw %3, %2, 6
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7
+ %endif
+%endmacro
+
+%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
+ %if ARCH_X86_64
+ %define tmp r14
+ %else
+ %define tmp %4
+ %endif
+ GATHERDD %1, %2, tmp
+ GATHERDD %2, %3, tmp
+ movif32 %4, %5
+ psrld %1, 24
+ psrld %2, 24
+ packssdw %1, %2
+%endmacro
+
+%macro MULLD 3 ; dst, src, tmp
+ pmulhuw %3, %1, %2
+ pmullw %1, %2
+ pslld %3, 16
+ paddd %1, %3
+%endmacro
+
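
The two gather macros above emulate a per-lane lookup into the sgr_x_by_x table with scalar movd/pextrw/pinsrw, since SSE2/SSSE3 have no gather instruction, and MULLD builds a 32-bit multiply out of 16-bit pmullw/pmulhuw. That is exact here because the surrounding code always replicates the 16-bit factor (the s0/s1 scale, or the gathered x) into both word halves of each dword. A scalar model of MULLD (illustrative name, not a dav1d function):

    #include <stdint.h>

    /* what MULLD computes per 32-bit lane, with b the replicated 16-bit factor */
    static inline uint32_t mulld(uint32_t a, uint16_t b)
    {
        const uint32_t lo = (a & 0xffff) * b;  /* pmullw low lane + pmulhuw lane   */
        const uint32_t hi = (a >> 16) * b;     /* pmullw high lane; upper bits lost */
        return lo + (hi << 16);                /* pslld 16 + paddd                  */
    }
    /* mulld(a, b) == (uint32_t)(a * b) for any 32-bit a and 16-bit b */
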
+%if ARCH_X86_32
+DECLARE_REG_TMP 0, 1, 2, 3, 5
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 5*16
+ %else
+ %assign extra_stack 3*16
+ %endif
+cglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*0+4*6]
+ %define stridemp dword [esp+calloff+16*0+4*7]
+ %define leftm dword [esp+calloff+16*3+4*0]
+ %define lpfm dword [esp+calloff+16*3+4*1]
+ %define w0m dword [esp+calloff+16*3+4*2]
+ %define hd dword [esp+calloff+16*3+4*3]
+ %define edgeb byte [esp+calloff+16*3+4*4]
+ %define edged dword [esp+calloff+16*3+4*4]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t0m dword [esp+calloff+4*2]
+ %define t2m dword [esp+calloff+4*3]
+ %define t3m dword [esp+calloff+4*4]
+ %define t4m dword [esp+calloff+4*5]
+ %define m8 [base+pb_1]
+ %define m9 [esp+calloff+16*2]
+ %define m10 [base+pd_0xf00800a4]
+ %define m11 [base+sgr_lshuf5]
+ %define m12 [base+pd_34816]
+ %define m13 [base+pb_0to15]
+ %define r10 r4
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+DECLARE_REG_TMP 8, 7, 9, 11, 12
+cglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ mov wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ mov edged, r7m
+ movu m9, [paramsq]
+ add lpfq, wq
+ mova m8, [pb_1]
+ lea t1, [rsp+wq*2+20]
+ mova m10, [pd_0xf00800a4]
+ add dstq, wq
+ lea t3, [rsp+wq*4+400*12+16]
+ mova m12, [pd_34816] ; (1 << 11) + (1 << 15)
+ lea t4, [rsp+wq*2+400*20+16]
+ pshufhw m7, m9, q0000
+ pshufb m9, [pw_256] ; s0
+ punpckhqdq m7, m7 ; w0
+ neg wq
+ mova m13, [pb_0to15]
+ pxor m6, m6
+ mova m11, [sgr_lshuf5]
+ psllw m7, 4
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ movu m1, [r1]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq*2+20]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*4+400*12+16]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq*2+400*20+16]
+ mov t3m, t3
+ pshufhw m7, m1, q0000
+ mov t4m, t4
+ pshufb m1, [base+pw_256] ; s0
+ punpckhqdq m7, m7 ; w0
+ psllw m7, 4
+ neg wq
+ mova m9, m1
+ pxor m6, m6
+ mov w1m, wd
+ sub wd, 2
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ movif32 t2m, t1
+ mov t2, t1
+ call .top_fixup
+ add t1, 400*6
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t0m, t2
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, stridemp
+ movif32 t4, t4m
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+%if ARCH_X86_64
+ test hb, hb
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ call .h
+ add lpfq, stridemp
+ call .hv
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+ sub hd, 2
+ movif32 t0, t0m
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .h_top
+ add lpfq, stridemp
+ call .hv_bottom
+.end:
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ movif32 t4, t4m
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ movif32 dstq, dstm
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+ lea t2, [t1+400*6]
+ movif32 t2m, t2
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ movif32 t0m, t0
+ jmp .main
+.no_top_height1:
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+ movd m1, wd
+ movd m3, [lpfq-1]
+ pshufb m1, m6
+ pshufb m3, m6
+ psubb m2, m8, m1
+ pcmpgtb m2, m13
+ pand m5, m2
+ pandn m2, m3
+ por m5, m2
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m11
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m5, [lpfq+wq-1]
+.h_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -10
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m2, m5, m4, 2
+ paddw m0, m4, m2
+ palignr m3, m5, m4, 6
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ palignr m5, m4, 8
+ paddw m0, m5
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m3
+ paddd m1, m3
+ punpckhwd m3, m4, m5
+ pmaddwd m3, m3
+ shufps m4, m5, q2121
+ paddw m0, m4 ; sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m2, m3
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+wq*2+400*0]
+ paddd m1, [t1+wq*2+400*2]
+ paddd m2, [t1+wq*2+400*4]
+.h_loop_end:
+ paddd m1, m5 ; sumsq
+ paddd m2, m4
+ mova [t1+wq*2+400*0], m0
+ mova [t1+wq*2+400*2], m1
+ mova [t1+wq*2+400*4], m2
+ add wq, 8
+ jl .h_loop
+ ret
+.top_fixup:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+wq*2+400*0]
+ mova m1, [t1+wq*2+400*2]
+ mova m2, [t1+wq*2+400*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m1
+ mova [t2+wq*2+400*4], m2
+ add wq, 8
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .hv_main
+.hv_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m11
+ jmp .hv_main
+.hv_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv_loop_start
+%endif
+.hv_loop:
+ movif32 lpfq, hvsrcm
+.hv_loop_start:
+ movu m5, [lpfq+wq-1]
+.hv_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp wd, -10
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ movif32 t3, hd
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m3, m5, m4, 2
+ paddw m0, m4, m3
+ palignr m1, m5, m4, 6
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 8
+ paddw m0, m5
+ punpcklwd m1, m4, m5
+ pmaddwd m1, m1
+ paddd m2, m1
+ punpckhwd m1, m4, m5
+ pmaddwd m1, m1
+ shufps m4, m5, q2121
+ paddw m0, m4 ; h sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m3, m1
+ paddd m2, m5 ; h sumsq
+ paddd m3, m4
+ paddw m1, m0, [t1+wq*2+400*0]
+ paddd m4, m2, [t1+wq*2+400*2]
+ paddd m5, m3, [t1+wq*2+400*4]
+%if ARCH_X86_64
+ test hd, hd
+%else
+ test t3, t3
+%endif
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+wq*2+400*0] ; hv sum
+ paddd m4, [t2+wq*2+400*2] ; hv sumsq
+ paddd m5, [t2+wq*2+400*4]
+ mova [t0+wq*2+400*0], m0
+ pslld m0, m4, 4
+ mova [t0+wq*2+400*2], m2
+ mova [t0+wq*2+400*4], m3
+ pslld m2, m4, 3
+ paddd m4, m0
+ pslld m0, m5, 4
+ paddd m4, m2 ; a * 25
+ pslld m2, m5, 3
+ paddd m5, m0
+ paddd m5, m2
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaddwd m2, m0, m0 ; b * b
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m2 ; p * s
+ MULLD m5, m9, m2
+ pmaddwd m0, m10 ; b * 164
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, t2, t2m
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m2
+ MULLD m1, m5, m2
+ paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m12
+ mova [t4+wq*2+4], m3
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ movif32 t2m, t2
+ movif32 t0m, t0
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+wq*2+400*0], m1
+ paddw m1, m0
+ mova [t1+wq*2+400*2], m4
+ paddd m4, m2
+ mova [t1+wq*2+400*4], m5
+ paddd m5, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m2, [t1+wq*2+400*2]
+ mova m3, [t1+wq*2+400*4]
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m4, m2, [t2+wq*2+400*2]
+ paddd m5, m3, [t2+wq*2+400*4]
+ paddw m0, m0
+ paddd m2, m2
+ paddd m3, m3
+ paddw m1, m0 ; hv sum
+ paddd m4, m2 ; hv sumsq
+ pslld m0, m4, 4
+ paddd m5, m3
+ pslld m2, m4, 3
+ paddd m4, m0
+ pslld m0, m5, 4
+ paddd m4, m2 ; a * 25
+ pslld m2, m5, 3
+ paddd m5, m0
+ paddd m5, m2
+ punpcklwd m0, m1, m6
+ punpckhwd m1, m6
+ pmaddwd m2, m0, m0 ; b * b
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m2 ; p * s
+ MULLD m5, m9, m2
+ pmaddwd m0, m10 ; b * 164
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, t2, t2m
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m2
+ MULLD m1, m5, m2
+ paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m12
+ mova [t4+wq*2+4], m3
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .v_loop
+ ret
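
Reading the inline comments of .hv/.v above, the per-element math that fills the two surfaces stored to t4 and t3 is, in scalar form, roughly the following. Names are illustrative; the rounding (about 1 << 19) and the clamp to 255 are folded into the 0xf008xxxx saturating-add constants in the vector code, and sgr_x_by_x is the table cextern'd at the top of this file (dav1d_sgr_x_by_x on the C side):

    #include <stdint.h>

    extern const uint8_t dav1d_sgr_x_by_x[256];

    /* per-element "ab" step of the 5x5 pass: sum/sumsq are the 5x5 box sums,
       s is the s0 parameter; x goes to t4, the scaled product goes to t3 */
    static void sgr_ab_5x5(unsigned *x_out, unsigned *b_out,
                           unsigned sum, unsigned sumsq, unsigned s)
    {
        const unsigned p = 25 * sumsq - sum * sum;        /* "a * 25" - "b * b" */
        unsigned z = (p * s + (1 << 19)) >> 20;           /* "p * s"            */
        if (z > 255) z = 255;                             /* "min(z, 255)"      */
        const unsigned x = dav1d_sgr_x_by_x[z];           /* GATHER_X_BY_X      */
        *x_out = x;
        *b_out = (x * sum * 164 + (1 << 11) + (1 << 15)) >> 12; /* "x * b * 164 ..." */
    }
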
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*2+ 2]
+ movu m3, [t4+wq*2+ 4]
+ movu m1, [t3+wq*4+ 4]
+ movu m4, [t3+wq*4+ 8]
+ movu m2, [t3+wq*4+20]
+ movu m5, [t3+wq*4+24]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ paddw m3, [t4+wq*2+ 0]
+ paddd m4, [t3+wq*4+ 0]
+ paddd m5, [t3+wq*4+16]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ mova [t4+wq*2+400*2+ 0], m0
+ mova [t3+wq*4+400*4+ 0], m1
+ mova [t3+wq*4+400*4+16], m2
+ add wq, 8
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m0, [t4+wq*2+ 2]
+ movu m3, [t4+wq*2+ 4]
+ movu m1, [t3+wq*4+ 4]
+ movu m4, [t3+wq*4+ 8]
+ movu m2, [t3+wq*4+20]
+ movu m5, [t3+wq*4+24]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ paddw m3, [t4+wq*2+ 0]
+ paddd m4, [t3+wq*4+ 0]
+ paddd m5, [t3+wq*4+16]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ paddw m3, m0, [t4+wq*2+400*2+ 0]
+ paddd m4, m1, [t3+wq*4+400*4+ 0]
+ paddd m5, m2, [t3+wq*4+400*4+16]
+ mova [t4+wq*2+400*2+ 0], m0
+ mova [t3+wq*4+400*4+ 0], m1
+ mova [t3+wq*4+400*4+16], m2
+ movq m0, [dstq+wq]
+ punpcklbw m0, m6
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movq m0, [dstq+wq]
+ mova m3, [t4+wq*2+400*2+ 0]
+ mova m4, [t3+wq*4+400*4+ 0]
+ mova m5, [t3+wq*4+400*4+16]
+ punpcklbw m0, m6
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 7)
+ psubd m5, m3
+ psrad m4, 8
+ psrad m5, 8
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
+
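
The .n0/.n1 routines above then combine the neighbour-weighted surfaces with the source pixel. Per the inline comments, each output pixel is, in scalar form, roughly the following (illustrative names; w0 is the user weight, which the setup code keeps pre-shifted left by 4 in m7 so that pmulhrsw performs the (t * w0 + (1 << 10)) >> 11 step):

    #include <stdint.h>

    /* one output pixel of .n0 (even rows) / .n1 (odd rows); a and b are the
       neighbour-weighted surfaces summed in the loops above, w0 unshifted */
    static inline uint8_t sgr_5x5_pixel(int src, int a, int b, int w0, int odd_row)
    {
        const int t = odd_row ? (b - a * src + (1 << 7)) >> 8   /* .n1 */
                              : (b - a * src + (1 << 8)) >> 9;  /* .n0 */
        const int v = src + ((t * w0 + (1 << 10)) >> 11);       /* pmulhrsw by w0<<4 */
        return v < 0 ? 0 : v > 255 ? 255 : v;                   /* packuswb clamp */
    }
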
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 4*16
+ %else
+ %assign extra_stack 2*16
+ %endif
+cglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*2+4*0]
+ %define stridemp dword [esp+calloff+16*2+4*1]
+ %define leftm dword [esp+calloff+16*2+4*2]
+ %define lpfm dword [esp+calloff+16*2+4*3]
+ %define w0m dword [esp+calloff+16*2+4*4]
+ %define hd dword [esp+calloff+16*2+4*5]
+ %define edgeb byte [esp+calloff+16*2+4*6]
+ %define edged dword [esp+calloff+16*2+4*6]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t3m dword [esp+calloff+4*2]
+ %define t4m dword [esp+calloff+4*3]
+ %define m8 [base+pb_0to15]
+ %define m9 [esp+calloff+16*1]
+ %define m10 [base+pd_0xf00801c7]
+ %define m11 [base+pd_34816]
+ %define m12 m6
+ %define m13 [base+sgr_lshuf3]
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ mov wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ mov hd, hm
+ mov edged, r7m
+ movq m9, [paramsq+4]
+ add lpfq, wq
+ lea t1, [rsp+wq*2+12]
+ mova m8, [pb_0to15]
+ add dstq, wq
+ lea t3, [rsp+wq*4+400*12+8]
+ mova m10, [pd_0xf00801c7]
+ lea t4, [rsp+wq*2+400*32+8]
+ mova m11, [pd_34816]
+ pshuflw m7, m9, q3333
+ pshufb m9, [pw_256] ; s1
+ punpcklqdq m7, m7 ; w1
+ neg wq
+ pxor m6, m6
+ mova m13, [sgr_lshuf3]
+ psllw m7, 4
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ movq m1, [r1+4]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq*2+20]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*4+400*12+16]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq*2+400*32+16]
+ mov t3m, t3
+ pshuflw m7, m1, q3333
+ mov t4m, t4
+ pshufb m1, [base+pw_256] ; s1
+ punpcklqdq m7, m7 ; w1
+ psllw m7, 4
+ neg wq
+ mova m9, m1
+ pxor m6, m6
+ mov w1m, wd
+ sub wd, 2
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ mov t2, t1
+ add t1, 400*6
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t4, t4m
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv0
+%if ARCH_X86_64
+ test hb, hb
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .hv0_bottom
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wq, w0m
+ mov hvsrcm, lpfq
+%endif
+ lea t2, [t1+400*6]
+.top_fixup_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m1, [t1+wq*2+400*2]
+ mova m2, [t1+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m1
+ mova [t2+wq*2+400*4], m2
+ add wq, 8
+ jl .top_fixup_loop
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v0
+ jmp .main
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+ movd m0, [lpfq-1]
+ movd m1, wd
+ mova m3, m8
+ pshufb m0, m6
+ pshufb m1, m6
+ mova m2, m6
+ psubb m2, m1
+ pcmpgtb m2, m3
+ pand m5, m2
+ pandn m2, m0
+ por m5, m2
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 14
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m13
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m5, [lpfq+wq]
+.h_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -9
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ mova [t1+wq*2+400*0], m1
+ mova [t1+wq*2+400*2], m2
+ mova [t1+wq*2+400*4], m3
+ add wq, 8
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 14
+ jmp .hv0_main
+.hv0_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m13
+ jmp .hv0_main
+.hv0_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv0_loop_start
+%endif
+.hv0_loop:
+ movif32 lpfq, hvsrcm
+.hv0_loop_start:
+ movu m5, [lpfq+wq]
+.hv0_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp wd, -9
+ jl .hv0_have_right
+ call .extend_right
+.hv0_have_right:
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ paddw m0, m1, [t1+wq*2+400*0]
+ paddd m4, m2, [t1+wq*2+400*2]
+ paddd m5, m3, [t1+wq*2+400*4]
+ mova [t1+wq*2+400*0], m1
+ mova [t1+wq*2+400*2], m2
+ mova [t1+wq*2+400*4], m3
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m2, m4, [t2+wq*2+400*2]
+ paddd m3, m5, [t2+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m4
+ mova [t2+wq*2+400*4], m5
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m6 ; b
+ pmaddwd m2, m0, m0 ; b * b
+ punpckhwd m1, m6
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m12 ; p * s
+ MULLD m5, m9, m12
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m12
+ MULLD m1, m5, m12
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*2+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 14
+ jmp .hv1_main
+.hv1_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m13
+ jmp .hv1_main
+.hv1_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv1_loop_start
+%endif
+.hv1_loop:
+ movif32 lpfq, hvsrcm
+.hv1_loop_start:
+ movu m5, [lpfq+wq]
+.hv1_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp wd, -9
+ jl .hv1_have_right
+ call .extend_right
+.hv1_have_right:
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m1, m5, m4, 2
+ paddw m0, m4, m1
+ punpcklwd m2, m4, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m0, m5 ; h sum
+ punpcklwd m1, m5, m6
+ pmaddwd m1, m1
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m1 ; h sumsq
+ paddd m3, m5
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m4, m2, [t2+wq*2+400*2]
+ paddd m5, m3, [t2+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m2
+ mova [t2+wq*2+400*4], m3
+ pslld m2, m4, 3
+ pslld m3, m5, 3
+ paddd m4, m2 ; a * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m6 ; b
+ pmaddwd m2, m0, m0 ; b * b
+ punpckhwd m1, m6
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m12 ; p * s
+ MULLD m5, m9, m12
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m12
+ MULLD m1, m5, m12
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*2+400*2 +4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+400*4+ 8], m0
+ mova [t3+wq*4+400*4+24], m1
+ add wq, 8
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v0_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m4, [t1+wq*2+400*2]
+ mova m5, [t1+wq*2+400*4]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m2, m4, [t2+wq*2+400*2]
+ paddd m3, m5, [t2+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m4
+ mova [t2+wq*2+400*4], m5
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m6 ; b
+ pmaddwd m2, m0, m0 ; b * b
+ punpckhwd m1, m6
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m12 ; p * s
+ MULLD m5, m9, m12
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m12
+ MULLD m1, m5, m12
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*2+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v1_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m4, [t1+wq*2+400*2]
+ mova m5, [t1+wq*2+400*4]
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m2, m4, [t2+wq*2+400*2]
+ paddd m3, m5, [t2+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m4
+ mova [t2+wq*2+400*4], m5
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m6 ; b
+ pmaddwd m2, m0, m0 ; b * b
+ punpckhwd m1, m6
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m12 ; p * s
+ MULLD m5, m9, m12
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m12
+ MULLD m1, m5, m12
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*2+400*2+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+400*4+ 8], m0
+ mova [t3+wq*4+400*4+24], m1
+ add wq, 8
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*2+400*0+ 4]
+ movu m1, [t3+wq*4+400*0+ 8]
+ movu m2, [t3+wq*4+400*0+24]
+ movu m3, [t4+wq*2+400*0+ 2]
+ movu m4, [t3+wq*4+400*0+ 4]
+ movu m5, [t3+wq*4+400*0+20]
+ paddw m0, [t4+wq*2+400*0+ 0]
+ paddd m1, [t3+wq*4+400*0+ 0]
+ paddd m2, [t3+wq*4+400*0+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a[-1] 444
+ pslld m4, 2 ; b[-1] 444
+ pslld m5, 2
+ psubw m3, m0 ; a[-1] 343
+ psubd m4, m1 ; b[-1] 343
+ psubd m5, m2
+ mova [t4+wq*2+400*4], m3
+ mova [t3+wq*4+400*8+ 0], m4
+ mova [t3+wq*4+400*8+16], m5
+ movu m0, [t4+wq*2+400*2+ 4]
+ movu m1, [t3+wq*4+400*4+ 8]
+ movu m2, [t3+wq*4+400*4+24]
+ movu m3, [t4+wq*2+400*2+ 2]
+ movu m4, [t3+wq*4+400*4+ 4]
+ movu m5, [t3+wq*4+400*4+20]
+ paddw m0, [t4+wq*2+400*2+ 0]
+ paddd m1, [t3+wq*4+400*4+ 0]
+ paddd m2, [t3+wq*4+400*4+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a[ 0] 444
+ pslld m4, 2 ; b[ 0] 444
+ pslld m5, 2
+ mova [t4+wq*2+400* 6], m3
+ mova [t3+wq*4+400*12+ 0], m4
+ mova [t3+wq*4+400*12+16], m5
+ psubw m3, m0 ; a[ 0] 343
+ psubd m4, m1 ; b[ 0] 343
+ psubd m5, m2
+ mova [t4+wq*2+400* 8], m3
+ mova [t3+wq*4+400*16+ 0], m4
+ mova [t3+wq*4+400*16+16], m5
+ add wq, 8
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m3, [t4+wq*2+400*0+4]
+ movu m1, [t4+wq*2+400*0+2]
+ paddw m3, [t4+wq*2+400*0+0]
+ paddw m1, m3
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+wq*2+400*4]
+ paddw m3, [t4+wq*2+400*6]
+ mova [t4+wq*2+400*4], m2
+ mova [t4+wq*2+400*6], m1
+ movu m4, [t3+wq*4+400*0+8]
+ movu m1, [t3+wq*4+400*0+4]
+ paddd m4, [t3+wq*4+400*0+0]
+ paddd m1, m4
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+wq*4+400* 8+ 0]
+ paddd m4, [t3+wq*4+400*12+ 0]
+ mova [t3+wq*4+400* 8+ 0], m2
+ mova [t3+wq*4+400*12+ 0], m1
+ movu m5, [t3+wq*4+400*0+24]
+ movu m1, [t3+wq*4+400*0+20]
+ paddd m5, [t3+wq*4+400*0+16]
+ paddd m1, m5
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+wq*4+400* 8+16]
+ paddd m5, [t3+wq*4+400*12+16]
+ mova [t3+wq*4+400* 8+16], m2
+ mova [t3+wq*4+400*12+16], m1
+ movq m0, [dstq+wq]
+ punpcklbw m0, m6
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movu m3, [t4+wq*2+400*2+4]
+ movu m1, [t4+wq*2+400*2+2]
+ paddw m3, [t4+wq*2+400*2+0]
+ paddw m1, m3
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+wq*2+400*6]
+ paddw m3, [t4+wq*2+400*8]
+ mova [t4+wq*2+400*6], m1
+ mova [t4+wq*2+400*8], m2
+ movu m4, [t3+wq*4+400*4+8]
+ movu m1, [t3+wq*4+400*4+4]
+ paddd m4, [t3+wq*4+400*4+0]
+ paddd m1, m4
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+wq*4+400*12+ 0]
+ paddd m4, [t3+wq*4+400*16+ 0]
+ mova [t3+wq*4+400*12+ 0], m1
+ mova [t3+wq*4+400*16+ 0], m2
+ movu m5, [t3+wq*4+400*4+24]
+ movu m1, [t3+wq*4+400*4+20]
+ paddd m5, [t3+wq*4+400*4+16]
+ paddd m1, m5
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+wq*4+400*12+16]
+ paddd m5, [t3+wq*4+400*16+16]
+ mova [t3+wq*4+400*12+16], m1
+ mova [t3+wq*4+400*16+16], m2
+ movq m0, [dstq+wq]
+ punpcklbw m0, m6
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 10*16
+ %else
+ %assign extra_stack 8*16
+ %endif
+cglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*8+4*0]
+ %define stridemp dword [esp+calloff+16*8+4*1]
+ %define leftm dword [esp+calloff+16*8+4*2]
+ %define lpfm dword [esp+calloff+16*8+4*3]
+ %define w0m dword [esp+calloff+16*8+4*4]
+ %define hd dword [esp+calloff+16*8+4*5]
+ %define edgeb byte [esp+calloff+16*8+4*6]
+ %define edged dword [esp+calloff+16*8+4*6]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t3m dword [esp+calloff+4*2]
+ %define t4m dword [esp+calloff+4*3]
+ %xdefine m8 m6
+ %define m9 [base+pd_0xffff]
+ %define m10 [base+pd_34816]
+ %define m11 [base+pd_0xf00801c7]
+ %define m12 [base+pd_0xf00800a4]
+ %define m13 [esp+calloff+16*4]
+ %define m14 [esp+calloff+16*5]
+ %define m15 [esp+calloff+16*6]
+ %define m6 [esp+calloff+16*7]
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ mov wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ mov edged, r7m
+ mova m15, [paramsq]
+ add lpfq, wq
+ mova m9, [pd_0xffff]
+ lea t1, [rsp+wq*2+44]
+ mova m10, [pd_34816]
+ add dstq, wq
+ lea t3, [rsp+wq*4+400*24+40]
+ mova m11, [pd_0xf00801c7]
+ lea t4, [rsp+wq*2+400*52+40]
+ mova m12, [base+pd_0xf00800a4]
+ neg wq
+ pshuflw m13, m15, q0000
+ pshuflw m14, m15, q2222
+ pshufhw m15, m15, q1010
+ punpcklqdq m13, m13 ; s0
+ punpcklqdq m14, m14 ; s1
+ punpckhqdq m15, m15 ; w0 w1
+ pxor m6, m6
+ psllw m15, 2
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ mova m2, [r1]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq*2+52]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*4+400*24+48]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq*2+400*52+48]
+ mov t3m, t3
+ mov t4m, t4
+ neg wq
+ pshuflw m0, m2, q0000
+ pshuflw m1, m2, q2222
+ pshufhw m2, m2, q1010
+ punpcklqdq m0, m0 ; s0
+ punpcklqdq m1, m1 ; s1
+ punpckhqdq m2, m2 ; w0 w1
+ mov w1m, wd
+ pxor m3, m3
+ psllw m2, 2
+ mova m13, m0
+ mova m14, m1
+ sub wd, 2
+ mova m15, m2
+ mova m6, m3
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ mov t2, t1
+%if ARCH_X86_64
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup
+%else
+ mov wq, w0m
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup_loop
+%endif
+ add t1, 400*12
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t4, t4m
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv0
+%if ARCH_X86_64
+ test hd, hd
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .hv0_bottom
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wq, w0m
+ mov hvsrcm, lpfq
+%endif
+ lea t2, [t1+400*12]
+.top_fixup_loop:
+ mova m0, [t1+wq*2+400* 0]
+ mova m1, [t1+wq*2+400* 2]
+ mova m2, [t1+wq*2+400* 4]
+ paddw m0, m0
+ mova m3, [t1+wq*2+400* 6]
+ paddd m1, m1
+ mova m4, [t1+wq*2+400* 8]
+ paddd m2, m2
+ mova m5, [t1+wq*2+400*10]
+ mova [t2+wq*2+400* 0], m0
+ mova [t2+wq*2+400* 2], m1
+ mova [t2+wq*2+400* 4], m2
+ mova [t2+wq*2+400* 6], m3
+ mova [t2+wq*2+400* 8], m4
+ mova [t2+wq*2+400*10], m5
+ add wq, 8
+ jl .top_fixup_loop
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v0
+ jmp .main
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+%if ARCH_X86_64
+ SWAP m8, m6
+%endif
+ movd m1, wd
+ movd m3, [lpfq-1]
+ pshufb m1, m8
+ pshufb m3, m8
+ psubb m2, [base+pb_1], m1
+ pcmpgtb m2, [base+pb_0to15]
+ pand m5, m2
+ pandn m2, m3
+ por m5, m2
+%if ARCH_X86_64
+ SWAP m6, m8
+%endif
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, [base+sgr_lshuf5]
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m5, [lpfq+wq-1]
+.h_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+%if ARCH_X86_32
+ pxor m8, m8
+%else
+ SWAP m8, m6
+%endif
+ jnz .h_have_right
+ cmp wd, -10
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ punpcklbw m4, m5, m8
+ punpckhbw m5, m8
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; sum3
+ punpcklwd m7, m0, m8
+ pmaddwd m7, m7
+ punpckhwd m0, m8
+ pmaddwd m0, m0
+%if ARCH_X86_64
+ SWAP m6, m8
+%endif
+ paddd m2, m7 ; sumsq3
+ palignr m5, m4, 8
+ punpcklwd m7, m5, m4
+ paddw m8, m4, m5
+ pmaddwd m7, m7
+ punpckhwd m5, m4
+ pmaddwd m5, m5
+ paddd m3, m0
+ mova [t1+wq*2+400* 6], m1
+ mova [t1+wq*2+400* 8], m2
+ mova [t1+wq*2+400*10], m3
+ paddw m8, m1 ; sum5
+ paddd m7, m2 ; sumsq5
+ paddd m5, m3
+ mova [t1+wq*2+400* 0], m8
+ mova [t1+wq*2+400* 2], m7
+ mova [t1+wq*2+400* 4], m5
+ add wq, 8
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .hv0_main
+.hv0_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, [base+sgr_lshuf5]
+ jmp .hv0_main
+.hv0_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv0_loop_start
+%endif
+.hv0_loop:
+ movif32 lpfq, hvsrcm
+.hv0_loop_start:
+ movu m5, [lpfq+wq-1]
+.hv0_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+%if ARCH_X86_32
+ pxor m8, m8
+%else
+ SWAP m8, m6
+%endif
+ jnz .hv0_have_right
+ cmp wd, -10
+ jl .hv0_have_right
+ call .extend_right
+.hv0_have_right:
+ punpcklbw m4, m5, m8
+ punpckhbw m5, m8
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ movif32 t3, t3m
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; h sum3
+ punpcklwd m7, m0, m8
+ pmaddwd m7, m7
+ punpckhwd m0, m8
+%if ARCH_X86_64
+ SWAP m6, m8
+%endif
+ pmaddwd m0, m0
+ paddd m2, m7 ; h sumsq3
+ palignr m5, m4, 8
+ punpcklwd m7, m5, m4
+ paddw m8, m4, m5
+ pmaddwd m7, m7
+ punpckhwd m5, m4
+ pmaddwd m5, m5
+ paddd m3, m0
+ paddw m8, m1 ; h sum5
+ paddd m7, m2 ; h sumsq5
+ paddd m5, m3
+ mova [t3+wq*4+400*8+ 8], m8
+ mova [t3+wq*4+400*0+ 8], m7
+ mova [t3+wq*4+400*0+24], m5
+ paddw m8, [t1+wq*2+400* 0]
+ paddd m7, [t1+wq*2+400* 2]
+ paddd m5, [t1+wq*2+400* 4]
+ mova [t1+wq*2+400* 0], m8
+ mova [t1+wq*2+400* 2], m7
+ mova [t1+wq*2+400* 4], m5
+ paddw m0, m1, [t1+wq*2+400* 6]
+ paddd m4, m2, [t1+wq*2+400* 8]
+ paddd m5, m3, [t1+wq*2+400*10]
+ mova [t1+wq*2+400* 6], m1
+ mova [t1+wq*2+400* 8], m2
+ mova [t1+wq*2+400*10], m3
+ paddw m1, m0, [t2+wq*2+400* 6]
+ paddd m2, m4, [t2+wq*2+400* 8]
+ paddd m3, m5, [t2+wq*2+400*10]
+ mova [t2+wq*2+400* 6], m0
+ mova [t2+wq*2+400* 8], m4
+ mova [t2+wq*2+400*10], m5
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m7 ; b3
+ pmaddwd m2, m0, m0
+ punpckhwd m1, m7
+ pmaddwd m3, m1, m1
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ psubd m4, m2 ; p3
+ psubd m5, m3
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*2+400*2+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+400*4+ 8], m0
+ mova [t3+wq*4+400*4+24], m1
+ add wq, 8
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .hv1_main
+.hv1_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, [base+sgr_lshuf5]
+ jmp .hv1_main
+.hv1_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv1_loop_start
+%endif
+.hv1_loop:
+ movif32 lpfq, hvsrcm
+.hv1_loop_start:
+ movu m5, [lpfq+wq-1]
+.hv1_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+%if ARCH_X86_32
+ pxor m8, m8
+%else
+ SWAP m8, m6
+%endif
+ jnz .hv1_have_right
+ cmp wd, -10
+ jl .hv1_have_right
+ call .extend_right
+.hv1_have_right:
+ punpcklbw m4, m5, m8
+ punpckhbw m5, m8
+ palignr m7, m5, m4, 2
+ palignr m3, m5, m4, 4
+ paddw m2, m7, m3
+ punpcklwd m0, m7, m3
+ pmaddwd m0, m0
+ punpckhwd m7, m3
+ pmaddwd m7, m7
+ palignr m3, m5, m4, 6
+ paddw m2, m3 ; h sum3
+ punpcklwd m1, m3, m8
+ pmaddwd m1, m1
+ punpckhwd m3, m8
+%if ARCH_X86_64
+ SWAP m6, m8
+%endif
+ pmaddwd m3, m3
+ paddd m0, m1 ; h sumsq3
+ palignr m5, m4, 8
+ punpckhwd m1, m4, m5
+ paddw m8, m4, m5
+ pmaddwd m1, m1
+ punpcklwd m4, m5
+ pmaddwd m4, m4
+ paddd m7, m3
+ paddw m5, m2, [t2+wq*2+400* 6]
+ mova [t2+wq*2+400* 6], m2
+ paddw m8, m2 ; h sum5
+ paddd m2, m0, [t2+wq*2+400* 8]
+ paddd m3, m7, [t2+wq*2+400*10]
+ mova [t2+wq*2+400* 8], m0
+ mova [t2+wq*2+400*10], m7
+ paddd m4, m0 ; h sumsq5
+ paddd m1, m7
+ pslld m0, m2, 3
+ pslld m7, m3, 3
+ paddd m2, m0 ; a3 * 9
+ paddd m3, m7
+%if ARCH_X86_32
+ mova [esp+20], m8
+ pxor m8, m8
+%else
+ SWAP m8, m6
+%endif
+ punpcklwd m0, m5, m8 ; b3
+ pmaddwd m7, m0, m0
+ punpckhwd m5, m8
+ pmaddwd m8, m5, m5
+ psubd m2, m7 ; p3
+ psubd m3, m8
+ MULLD m2, m14, m8 ; p3 * s1
+ MULLD m3, m14, m8
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m5, m11
+ paddusw m2, m11
+ paddusw m3, m11
+ psrld m2, 20 ; min(z3, 255)
+ movif32 t3, t3m
+ psrld m3, 20
+ GATHER_X_BY_X m8, m2, m3, r0, dstm
+ punpcklwd m2, m8, m8
+ punpckhwd m3, m8, m8
+ MULLD m0, m2, m7
+ MULLD m5, m3, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m5, m10
+ psrld m0, 12
+ psrld m5, 12
+ mova [t4+wq*2+400*4+ 4], m8
+ mova [t3+wq*4+400*8+ 8], m0
+ mova [t3+wq*4+400*8+24], m5
+%if ARCH_X86_32
+ mova m8, [esp+20]
+%else
+ SWAP m6, m8
+ pxor m6, m6
+%endif
+ paddw m5, m8, [t2+wq*2+400*0]
+ paddd m2, m4, [t2+wq*2+400*2]
+ paddd m3, m1, [t2+wq*2+400*4]
+ paddw m5, [t1+wq*2+400*0]
+ paddd m2, [t1+wq*2+400*2]
+ paddd m3, [t1+wq*2+400*4]
+ mova [t2+wq*2+400*0], m8
+ pslld m0, m2, 4
+ mova [t2+wq*2+400*2], m4
+ pslld m8, m3, 4
+ mova [t2+wq*2+400*4], m1
+ pslld m4, m2, 3
+ paddd m2, m0
+ pslld m7, m3, 3
+ paddd m3, m8
+ paddd m2, m4 ; a5 * 25
+ paddd m3, m7
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ punpcklwd m0, m5, m7 ; b5
+ pmaddwd m4, m0, m0
+ punpckhwd m5, m7
+ pmaddwd m1, m5, m5
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ psubd m2, m4 ; p5
+ psubd m3, m1
+ MULLD m2, m13, m7 ; p5 * s0
+ MULLD m3, m13, m7
+ pmaddwd m0, m12 ; b5 * 164
+ pmaddwd m5, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrld m2, 20 ; min(z5, 255)
+ psrld m3, 20
+ GATHER_X_BY_X m1, m2, m3, r0, dstm
+ punpcklwd m2, m1, m1
+ punpckhwd m3, m1, m1
+ MULLD m0, m2, m7
+ MULLD m5, m3, m7
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m5, m10
+ mova [t4+wq*2+4], m1
+ psrld m0, 12
+ psrld m5, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m5
+ add wq, 8
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v0_loop:
+ mova m0, [t1+wq*2+400* 6]
+ mova m4, [t1+wq*2+400* 8]
+ mova m5, [t1+wq*2+400*10]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+wq*2+400* 6]
+ paddd m2, m4, [t2+wq*2+400* 8]
+ paddd m3, m5, [t2+wq*2+400*10]
+ mova [t2+wq*2+400* 6], m0
+ mova [t2+wq*2+400* 8], m4
+ mova [t2+wq*2+400*10], m5
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m7 ; b3
+ pmaddwd m2, m0, m0
+ punpckhwd m1, m7
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ psubd m5, m3
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*2+400*2+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova m3, [t1+wq*2+400*0]
+ mova m4, [t1+wq*2+400*2]
+ mova m5, [t1+wq*2+400*4]
+ mova [t3+wq*4+400*8+ 8], m3
+ mova [t3+wq*4+400*0+ 8], m4
+ mova [t3+wq*4+400*0+24], m5
+ paddw m3, m3 ; cc5
+ paddd m4, m4
+ paddd m5, m5
+ mova [t1+wq*2+400*0], m3
+ mova [t1+wq*2+400*2], m4
+ mova [t1+wq*2+400*4], m5
+ mova [t3+wq*4+400*4+ 8], m0
+ mova [t3+wq*4+400*4+24], m1
+ add wq, 8
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v1_loop:
+ mova m4, [t1+wq*2+400* 6]
+ mova m5, [t1+wq*2+400* 8]
+ mova m7, [t1+wq*2+400*10]
+ paddw m1, m4, [t2+wq*2+400* 6]
+ paddd m2, m5, [t2+wq*2+400* 8]
+ paddd m3, m7, [t2+wq*2+400*10]
+ mova [t2+wq*2+400* 6], m4
+ mova [t2+wq*2+400* 8], m5
+ mova [t2+wq*2+400*10], m7
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m7 ; b3
+ pmaddwd m2, m0, m0
+ punpckhwd m1, m7
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ psubd m5, m3
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*2+400*4+4], m3
+ psrld m0, 12
+ psrld m8, m1, 12
+ mova m4, [t3+wq*4+400*8+ 8]
+ mova m5, [t3+wq*4+400*0+ 8]
+ mova m7, [t3+wq*4+400*0+24]
+ paddw m1, m4, [t2+wq*2+400*0]
+ paddd m2, m5, [t2+wq*2+400*2]
+ paddd m3, m7, [t2+wq*2+400*4]
+ paddw m1, [t1+wq*2+400*0]
+ paddd m2, [t1+wq*2+400*2]
+ paddd m3, [t1+wq*2+400*4]
+ mova [t2+wq*2+400*0], m4
+ mova [t2+wq*2+400*2], m5
+ mova [t2+wq*2+400*4], m7
+ pslld m4, m2, 4
+ mova [t3+wq*4+400*8+ 8], m0
+ pslld m5, m3, 4
+ mova [t3+wq*4+400*8+24], m8
+ pslld m7, m2, 3
+ paddd m2, m4
+ pslld m8, m3, 3
+ paddd m3, m5
+ paddd m2, m7 ; a5 * 25
+ paddd m3, m8
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ punpcklwd m0, m1, m7 ; b5
+ pmaddwd m4, m0, m0
+ punpckhwd m1, m7
+ pmaddwd m5, m1, m1
+ psubd m2, m4 ; p5
+ psubd m3, m5
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MULLD m2, m13, m7 ; p5 * s0
+ MULLD m3, m13, m7
+ pmaddwd m0, m12 ; b5 * 164
+ pmaddwd m1, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrld m2, 20 ; min(z5, 255)
+ psrld m3, 20
+ GATHER_X_BY_X m4, m2, m3, r0, dstm
+ punpcklwd m2, m4, m4
+ punpckhwd m3, m4, m4
+ MULLD m0, m2, m7
+ MULLD m1, m3, m7
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*2+4], m4
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*2+400*0+ 2]
+ movu m1, [t3+wq*4+400*0+ 4]
+ movu m2, [t3+wq*4+400*0+20]
+ movu m7, [t4+wq*2+400*0+ 4]
+ movu m8, [t3+wq*4+400*0+ 8]
+ paddw m3, m0, [t4+wq*2+400*0+ 0]
+ paddd m4, m1, [t3+wq*4+400*0+ 0]
+ paddd m5, m2, [t3+wq*4+400*0+16]
+ paddw m3, m7
+ paddd m4, m8
+ movu m7, [t3+wq*4+400*0+24]
+ paddw m0, m3
+ paddd m1, m4
+ psllw m3, 2
+ pslld m4, 2
+ paddd m5, m7
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a5 565
+ paddd m1, m4 ; b5 565
+ paddd m2, m5
+ mova [t4+wq*2+400* 6+ 0], m0
+ mova [t3+wq*4+400*12+ 0], m1
+ mova [t3+wq*4+400*12+16], m2
+ movu m0, [t4+wq*2+400*2+ 4]
+ movu m1, [t3+wq*4+400*4+ 8]
+ movu m2, [t3+wq*4+400*4+24]
+ movu m3, [t4+wq*2+400*2+ 2]
+ movu m4, [t3+wq*4+400*4+ 4]
+ movu m5, [t3+wq*4+400*4+20]
+ paddw m0, [t4+wq*2+400*2+ 0]
+ paddd m1, [t3+wq*4+400*4+ 0]
+ paddd m2, [t3+wq*4+400*4+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a3[-1] 444
+ pslld m4, 2 ; b3[-1] 444
+ pslld m5, 2
+ psubw m3, m0 ; a3[-1] 343
+ psubd m4, m1 ; b3[-1] 343
+ psubd m5, m2
+ mova [t4+wq*2+400* 8+ 0], m3
+ mova [t3+wq*4+400*16+ 0], m4
+ mova [t3+wq*4+400*16+16], m5
+ movu m0, [t4+wq*2+400*4+ 4]
+ movu m1, [t3+wq*4+400*8+ 8]
+ movu m2, [t3+wq*4+400*8+24]
+ movu m3, [t4+wq*2+400*4+ 2]
+ movu m4, [t3+wq*4+400*8+ 4]
+ movu m5, [t3+wq*4+400*8+20]
+ paddw m0, [t4+wq*2+400*4+ 0]
+ paddd m1, [t3+wq*4+400*8+ 0]
+ paddd m2, [t3+wq*4+400*8+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a3[ 0] 444
+ pslld m4, 2 ; b3[ 0] 444
+ pslld m5, 2
+ mova [t4+wq*2+400*10+ 0], m3
+ mova [t3+wq*4+400*20+ 0], m4
+ mova [t3+wq*4+400*20+16], m5
+ psubw m3, m0 ; a3[ 0] 343
+ psubd m4, m1 ; b3[ 0] 343
+ psubd m5, m2
+ mova [t4+wq*2+400*12+ 0], m3
+ mova [t3+wq*4+400*24+ 0], m4
+ mova [t3+wq*4+400*24+16], m5
+ add wq, 8
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m0, [t4+wq*2+ 4]
+ movu m2, [t4+wq*2+ 2]
+ paddw m0, [t4+wq*2+ 0]
+ paddw m0, m2
+ paddw m2, m0
+ psllw m0, 2
+ paddw m0, m2 ; a5
+ movu m4, [t3+wq*4+ 8]
+ movu m5, [t3+wq*4+24]
+ movu m1, [t3+wq*4+ 4]
+ movu m3, [t3+wq*4+20]
+ paddd m4, [t3+wq*4+ 0]
+ paddd m5, [t3+wq*4+16]
+ paddd m4, m1
+ paddd m5, m3
+ paddd m1, m4
+ paddd m3, m5
+ pslld m4, 2
+ pslld m5, 2
+ paddd m4, m1 ; b5
+ paddd m5, m3
+ movu m2, [t4+wq*2+400* 6]
+ paddw m2, m0
+ mova [t4+wq*2+400* 6], m0
+ paddd m0, m4, [t3+wq*4+400*12+ 0]
+ paddd m1, m5, [t3+wq*4+400*12+16]
+ mova [t3+wq*4+400*12+ 0], m4
+ mova [t3+wq*4+400*12+16], m5
+ mova [rsp+16+ARCH_X86_32*4], m1
+ movu m3, [t4+wq*2+400*2+4]
+ movu m5, [t4+wq*2+400*2+2]
+ paddw m3, [t4+wq*2+400*2+0]
+ paddw m5, m3
+ psllw m5, 2 ; a3[ 1] 444
+ psubw m4, m5, m3 ; a3[ 1] 343
+ movu m3, [t4+wq*2+400* 8]
+ paddw m3, [t4+wq*2+400*10]
+ paddw m3, m4
+ mova [t4+wq*2+400* 8], m4
+ mova [t4+wq*2+400*10], m5
+ movu m1, [t3+wq*4+400*4+ 8]
+ movu m5, [t3+wq*4+400*4+ 4]
+ movu m7, [t3+wq*4+400*4+24]
+ movu m8, [t3+wq*4+400*4+20]
+ paddd m1, [t3+wq*4+400*4+ 0]
+ paddd m7, [t3+wq*4+400*4+16]
+ paddd m5, m1
+ paddd m8, m7
+ pslld m5, 2 ; b3[ 1] 444
+ pslld m8, 2
+ psubd m4, m5, m1 ; b3[ 1] 343
+%if ARCH_X86_32
+ mova [esp+52], m8
+ psubd m8, m7
+%else
+ psubd m6, m8, m7
+ SWAP m8, m6
+%endif
+ paddd m1, m4, [t3+wq*4+400*16+ 0]
+ paddd m7, m8, [t3+wq*4+400*16+16]
+ paddd m1, [t3+wq*4+400*20+ 0]
+ paddd m7, [t3+wq*4+400*20+16]
+ mova [t3+wq*4+400*16+ 0], m4
+ mova [t3+wq*4+400*16+16], m8
+ mova [t3+wq*4+400*20+ 0], m5
+%if ARCH_X86_32
+ mova m8, [esp+52]
+%else
+ SWAP m8, m6
+ pxor m6, m6
+%endif
+ mova [t3+wq*4+400*20+16], m8
+ mova [rsp+32+ARCH_X86_32*4], m7
+ movq m4, [dstq+wq]
+ punpcklbw m4, m6
+ punpcklwd m5, m4, m6
+ punpcklwd m7, m2, m6
+ pmaddwd m7, m5 ; a5 * src
+ punpcklwd m8, m3, m6
+ pmaddwd m8, m5 ; a3 * src
+ punpckhwd m5, m4, m6
+ punpckhwd m2, m6
+ pmaddwd m2, m5
+ punpckhwd m3, m6
+ pmaddwd m3, m5
+ psubd m0, m7 ; b5 - a5 * src + (1 << 8) - (src << 13)
+ psubd m1, m8 ; b3 - a3 * src + (1 << 8) - (src << 13)
+ psrld m0, 9
+ pslld m1, 7
+ pand m0, m9
+ pandn m8, m9, m1
+ por m0, m8
+ mova m1, [rsp+16+ARCH_X86_32*4]
+ psubd m1, m2
+ mova m2, [rsp+32+ARCH_X86_32*4]
+ psubd m2, m3
+ mova m3, [base+pd_4096]
+ psrld m1, 9
+ pslld m2, 7
+ pand m1, m9
+ pandn m5, m9, m2
+ por m1, m5
+ pmaddwd m0, m15
+ pmaddwd m1, m15
+ paddd m0, m3
+ paddd m1, m3
+ psrad m0, 13
+ psrad m1, 13
+ packssdw m0, m1
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movu m3, [t4+wq*2+400*4+4]
+ movu m5, [t4+wq*2+400*4+2]
+ paddw m3, [t4+wq*2+400*4+0]
+ paddw m5, m3
+ psllw m5, 2 ; a3[ 1] 444
+ psubw m4, m5, m3 ; a3[ 1] 343
+ paddw m3, m4, [t4+wq*2+400*12]
+ paddw m3, [t4+wq*2+400*10]
+ mova [t4+wq*2+400*10], m5
+ mova [t4+wq*2+400*12], m4
+ movu m1, [t3+wq*4+400*8+ 8]
+ movu m5, [t3+wq*4+400*8+ 4]
+ movu m7, [t3+wq*4+400*8+24]
+ movu m8, [t3+wq*4+400*8+20]
+ paddd m1, [t3+wq*4+400*8+ 0]
+ paddd m7, [t3+wq*4+400*8+16]
+ paddd m5, m1
+ paddd m8, m7
+ pslld m5, 2 ; b3[ 1] 444
+ pslld m8, 2
+ psubd m4, m5, m1 ; b3[ 1] 343
+ psubd m0, m8, m7
+ paddd m1, m4, [t3+wq*4+400*24+ 0]
+ paddd m7, m0, [t3+wq*4+400*24+16]
+ paddd m1, [t3+wq*4+400*20+ 0]
+ paddd m7, [t3+wq*4+400*20+16]
+ mova [t3+wq*4+400*20+ 0], m5
+ mova [t3+wq*4+400*20+16], m8
+ mova [t3+wq*4+400*24+ 0], m4
+ mova [t3+wq*4+400*24+16], m0
+ movq m5, [dstq+wq]
+ mova m2, [t4+wq*2+400* 6]
+ punpcklbw m5, m6
+ punpcklwd m4, m5, m6
+ punpcklwd m8, m2, m6
+ pmaddwd m8, m4 ; a5 * src
+ punpcklwd m0, m3, m6
+ pmaddwd m0, m4 ; a3 * src
+ punpckhwd m4, m5, m6
+ punpckhwd m2, m6
+ pmaddwd m2, m4
+ punpckhwd m3, m6
+ pmaddwd m3, m4
+ psubd m1, m0 ; b3 - a3 * src + (1 << 8) - (src << 13)
+ mova m0, [t3+wq*4+400*12+ 0]
+ psubd m0, m8 ; b5 - a5 * src + (1 << 8) - (src << 13)
+ mova m4, [t3+wq*4+400*12+16]
+ psubd m4, m2
+ psubd m7, m3
+ pslld m1, 7
+ psrld m0, 8
+ psrld m4, 8
+ pslld m7, 7
+ pandn m3, m9, m1
+ pand m0, m9
+ por m0, m3
+ pand m4, m9
+ pandn m2, m9, m7
+ por m2, m4
+ mova m1, [base+pd_4096]
+ pmaddwd m0, m15
+ pmaddwd m2, m15
+ paddd m0, m1
+ paddd m2, m1
+ psrad m0, 13
+ psrad m2, 13
+ packssdw m0, m2
+ paddw m0, m5
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
diff --git a/third_party/dav1d/src/x86/mc.h b/third_party/dav1d/src/x86/mc.h
new file mode 100644
index 0000000000..b142361daa
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc.h
@@ -0,0 +1,302 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018-2021, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/mc.h"
+
+#define decl_fn(type, name) \
+ decl_##type##_fn(BF(name, sse2)); \
+ decl_##type##_fn(BF(name, ssse3)); \
+ decl_##type##_fn(BF(name, avx2)); \
+ decl_##type##_fn(BF(name, avx512icl));
+#define init_mc_fn(type, name, suffix) \
+ c->mc[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_fn(type, name, suffix) \
+ c->mct[type] = BF(dav1d_prep_##name, suffix)
+#define init_mc_scaled_fn(type, name, suffix) \
+ c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_scaled_fn(type, name, suffix) \
+ c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
+
+decl_fn(mc, dav1d_put_8tap_regular);
+decl_fn(mc, dav1d_put_8tap_regular_smooth);
+decl_fn(mc, dav1d_put_8tap_regular_sharp);
+decl_fn(mc, dav1d_put_8tap_smooth);
+decl_fn(mc, dav1d_put_8tap_smooth_regular);
+decl_fn(mc, dav1d_put_8tap_smooth_sharp);
+decl_fn(mc, dav1d_put_8tap_sharp);
+decl_fn(mc, dav1d_put_8tap_sharp_regular);
+decl_fn(mc, dav1d_put_8tap_sharp_smooth);
+decl_fn(mc, dav1d_put_bilin);
+
+decl_fn(mct, dav1d_prep_8tap_regular);
+decl_fn(mct, dav1d_prep_8tap_regular_smooth);
+decl_fn(mct, dav1d_prep_8tap_regular_sharp);
+decl_fn(mct, dav1d_prep_8tap_smooth);
+decl_fn(mct, dav1d_prep_8tap_smooth_regular);
+decl_fn(mct, dav1d_prep_8tap_smooth_sharp);
+decl_fn(mct, dav1d_prep_8tap_sharp);
+decl_fn(mct, dav1d_prep_8tap_sharp_regular);
+decl_fn(mct, dav1d_prep_8tap_sharp_smooth);
+decl_fn(mct, dav1d_prep_bilin);
+
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_smooth);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_sharp);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_regular);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_sharp);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_regular);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_smooth);
+decl_fn(mc_scaled, dav1d_put_bilin_scaled);
+
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_smooth);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_sharp);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_regular);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_sharp);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_regular);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_smooth);
+decl_fn(mct_scaled, dav1d_prep_bilin_scaled);
+
+decl_fn(avg, dav1d_avg);
+decl_fn(w_avg, dav1d_w_avg);
+decl_fn(mask, dav1d_mask);
+decl_fn(w_mask, dav1d_w_mask_420);
+decl_fn(w_mask, dav1d_w_mask_422);
+decl_fn(w_mask, dav1d_w_mask_444);
+decl_fn(blend, dav1d_blend);
+decl_fn(blend_dir, dav1d_blend_v);
+decl_fn(blend_dir, dav1d_blend_h);
+
+decl_fn(warp8x8, dav1d_warp_affine_8x8);
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse4));
+decl_fn(warp8x8t, dav1d_warp_affine_8x8t);
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse4));
+
+decl_fn(emu_edge, dav1d_emu_edge);
+
+decl_fn(resize, dav1d_resize);
+
+static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if(!(flags & DAV1D_X86_CPU_FLAG_SSE2))
+ return;
+
+#if BITDEPTH == 8
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2);
+
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, sse2);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse2);
+#endif
+
+ if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
+ return;
+
+ init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
+ init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
+
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
+
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
+
+ c->avg = BF(dav1d_avg, ssse3);
+ c->w_avg = BF(dav1d_w_avg, ssse3);
+ c->mask = BF(dav1d_mask, ssse3);
+ c->w_mask[0] = BF(dav1d_w_mask_444, ssse3);
+ c->w_mask[1] = BF(dav1d_w_mask_422, ssse3);
+ c->w_mask[2] = BF(dav1d_w_mask_420, ssse3);
+ c->blend = BF(dav1d_blend, ssse3);
+ c->blend_v = BF(dav1d_blend_v, ssse3);
+ c->blend_h = BF(dav1d_blend_h, ssse3);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, ssse3);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, ssse3);
+ c->emu_edge = BF(dav1d_emu_edge, ssse3);
+ c->resize = BF(dav1d_resize, ssse3);
+
+ if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))
+ return;
+
+#if BITDEPTH == 8
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, sse4);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse4);
+#endif
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
+ return;
+
+ init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
+ init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2);
+
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
+
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
+
+ c->avg = BF(dav1d_avg, avx2);
+ c->w_avg = BF(dav1d_w_avg, avx2);
+ c->mask = BF(dav1d_mask, avx2);
+ c->w_mask[0] = BF(dav1d_w_mask_444, avx2);
+ c->w_mask[1] = BF(dav1d_w_mask_422, avx2);
+ c->w_mask[2] = BF(dav1d_w_mask_420, avx2);
+ c->blend = BF(dav1d_blend, avx2);
+ c->blend_v = BF(dav1d_blend_v, avx2);
+ c->blend_h = BF(dav1d_blend_h, avx2);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, avx2);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2);
+ c->emu_edge = BF(dav1d_emu_edge, avx2);
+ c->resize = BF(dav1d_resize, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
+ return;
+
+ init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
+ init_mc_fn (FILTER_2D_BILINEAR, bilin, avx512icl);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl);
+
+ c->avg = BF(dav1d_avg, avx512icl);
+ c->w_avg = BF(dav1d_w_avg, avx512icl);
+ c->mask = BF(dav1d_mask, avx512icl);
+ c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl);
+ c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl);
+ c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl);
+ c->blend = BF(dav1d_blend, avx512icl);
+ c->blend_v = BF(dav1d_blend_v, avx512icl);
+ c->blend_h = BF(dav1d_blend_h, avx512icl);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
+ c->resize = BF(dav1d_resize, avx512icl);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl);
+ }
+#endif
+}
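
For orientation, a minimal sketch of the token-pasting pattern the decl_fn/init_*_fn macros in mc.h rely on. It assumes dav1d's usual BF() bitdepth helper (defined in the bitdepth template headers, not in this file), which pastes an _8bpc_ or _16bpc_ infix plus the ISA suffix onto the base symbol; under that assumption, init_mct_fn(FILTER_2D_BILINEAR, bilin, sse2) resolves to c->mct[FILTER_2D_BILINEAR] = dav1d_prep_bilin_8bpc_sse2 in the 8-bit template. The toy below mirrors the same mechanism with illustrative names, not dav1d's own symbols:

    /* Self-contained toy of the init_*_fn / BF() pasting pattern.
     * Names (BF_DEMO, demo_prep_*) are illustrative only. */
    #include <stdio.h>

    #define BF_DEMO(name, suffix)          name##_8bpc_##suffix
    #define init_demo_fn(slot, name, suffix) slot = BF_DEMO(demo_prep_##name, suffix)

    static void demo_prep_bilin_8bpc_sse2(void) { puts("prep_bilin, sse2 path"); }

    int main(void) {
        void (*mct_bilin)(void);
        /* expands to: mct_bilin = demo_prep_bilin_8bpc_sse2; */
        init_demo_fn(mct_bilin, bilin, sse2);
        mct_bilin();
        return 0;
    }

In the real header the decl_fn() lines declare the sse2/ssse3/avx2/avx512icl variants of each symbol up front, and mc_dsp_init_x86() then overwrites the function-pointer tables with the fastest variant the detected CPU flags allow.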
diff --git a/third_party/dav1d/src/x86/mc16_avx2.asm b/third_party/dav1d/src/x86/mc16_avx2.asm
new file mode 100644
index 0000000000..61eeaa1007
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc16_avx2.asm
@@ -0,0 +1,5879 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+; dav1d_obmc_masks[] * -512
+const obmc_masks_avx2
+ dw 0, 0, -9728, 0, -12800, -7168, -2560, 0
+ dw -14336, -11264, -8192, -5632, -3584, -1536, 0, 0
+ dw -15360, -13824, -12288, -10752, -9216, -7680, -6144, -5120
+ dw -4096, -3072, -2048, -1536, 0, 0, 0, 0
+ dw -15872, -14848, -14336, -13312, -12288, -11776, -10752, -10240
+ dw -9728, -8704, -8192, -7168, -6656, -6144, -5632, -4608
+ dw -4096, -3584, -3072, -2560, -2048, -2048, -1536, -1024
+ dw 0, 0, 0, 0, 0, 0, 0, 0
+
+deint_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
+subpel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+subpel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+subpel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+subpel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
+subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
+rescale_mul2: dd 0, 1, 4, 5, 2, 3, 6, 7
+resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
+ db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+bdct_lb_q: times 8 db 0
+ times 8 db 4
+ times 8 db 8
+ times 8 db 12
+
+prep_mul: dw 16, 16, 4, 4
+put_bilin_h_rnd: dw 8, 8, 10, 10
+put_8tap_h_rnd: dd 34, 40
+s_8tap_h_rnd: dd 2, 8
+s_8tap_h_sh: dd 2, 4
+put_s_8tap_v_rnd: dd 512, 128
+put_s_8tap_v_sh: dd 10, 8
+prep_8tap_1d_rnd: dd 8 - (8192 << 4)
+prep_8tap_2d_rnd: dd 32 - (8192 << 5)
+warp8x8t_rnd: dd 16384 - (8192 << 15)
+warp8x8_shift: dd 5, 3
+warp8x8_rnd: dw 4096, 4096, 16384, 16384
+bidir_rnd: dw -16400, -16400, -16388, -16388
+bidir_mul: dw 2048, 2048, 8192, 8192
+
+%define pw_16 prep_mul
+%define pd_512 put_s_8tap_v_rnd
+
+pw_2: times 2 dw 2
+pw_64: times 2 dw 64
+pw_2048: times 2 dw 2048
+pw_8192: times 2 dw 8192
+pw_27615: times 2 dw 27615
+pw_32766: times 2 dw 32766
+pw_m512: times 2 dw -512
+pd_32: dd 32
+pd_63: dd 63
+pd_64: dd 64
+pd_32768: dd 32768
+pd_65538: dd 65538
+pd_m524256: dd -524256 ; (-8192 << 6) + 32
+pd_0x3ff: dd 0x3ff
+pq_0x40000000: dq 0x40000000
+ dd 0
+
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 64, 128
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put)
+%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep)
+
+BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
+
+%macro SCALED_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
+%%table:
+ %rep %0 - 2
+ dw %%base %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+ %%dy_1024:
+ %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy1_w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+ %%dy_2048:
+ %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy2_w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
+cextern mc_warp_filter
+cextern resize_filter
+
+SECTION .text
+
+INIT_XMM avx2
+cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+ mov mxyd, r6m ; mx
+ lea r7, [put_avx2]
+%if UNIX64
+ DECLARE_REG_TMP 8
+ %define org_w r8d
+ mov r8d, wd
+%else
+ DECLARE_REG_TMP 7
+ %define org_w wm
+%endif
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [r7+wq*2+table_offset(put,)]
+ add wq, r7
+ jmp wq
+.put_w2:
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6d
+ mov [dstq+dsq*1], r7d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6
+ mov [dstq+dsq*1], r7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+INIT_YMM avx2
+.put_w16:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu m0, [srcq+ssq*0+32*0]
+ movu m1, [srcq+ssq*0+32*1]
+ movu m2, [srcq+ssq*1+32*0]
+ movu m3, [srcq+ssq*1+32*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+32*0], m0
+ mova [dstq+dsq*0+32*1], m1
+ mova [dstq+dsq*1+32*0], m2
+ mova [dstq+dsq*1+32*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+ movu m2, [srcq+32*2]
+ movu m3, [srcq+32*3]
+ add srcq, ssq
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+ movu m2, [srcq+32*2]
+ movu m3, [srcq+32*3]
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ movu m0, [srcq+32*4]
+ movu m1, [srcq+32*5]
+ movu m2, [srcq+32*6]
+ movu m3, [srcq+32*7]
+ add srcq, ssq
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ mova [dstq+32*6], m2
+ mova [dstq+32*7], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h:
+ movd xm5, mxyd
+ mov mxyd, r7m ; my
+ vpbroadcastd m4, [pw_16]
+ vpbroadcastw m5, xm5
+ psubw m4, m5
+ test mxyd, mxyd
+ jnz .hv
+ ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)]
+ mov r6d, r8m ; bitdepth_max
+ add wq, r7
+ shr r6d, 11
+ vpbroadcastd m3, [r7-put_avx2+put_bilin_h_rnd+r6*4]
+ jmp wq
+.h_w2:
+ movq xm1, [srcq+ssq*0]
+ movhps xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmullw xm0, xm4, xm1
+ psrlq xm1, 16
+ pmullw xm1, xm5
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 4
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ movq xm1, [srcq+ssq*0+2]
+ movhps xm1, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw xm0, xm4
+ pmullw xm1, xm5
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 4
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ vinserti128 m0, [srcq+ssq*1], 1
+ movu xm1, [srcq+ssq*0+2]
+ vinserti128 m1, [srcq+ssq*1+2], 1
+ lea srcq, [srcq+ssq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 4
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ pmullw m0, m4, [srcq+ssq*0]
+ pmullw m1, m5, [srcq+ssq*0+2]
+ paddw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+ssq*1]
+ pmullw m2, m5, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ pmullw m0, m4, [srcq+32*0]
+ pmullw m1, m5, [srcq+32*0+2]
+ paddw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+32*1]
+ pmullw m2, m5, [srcq+32*1+2]
+ add srcq, ssq
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w32
+ RET
+.h_w64:
+.h_w128:
+ movifnidn t0d, org_w
+.h_w64_loop0:
+ mov r6d, t0d
+.h_w64_loop:
+ pmullw m0, m4, [srcq+r6*2-32*1]
+ pmullw m1, m5, [srcq+r6*2-32*1+2]
+ paddw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+r6*2-32*2]
+ pmullw m2, m5, [srcq+r6*2-32*2+2]
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+r6*2-32*1], m0
+ mova [dstq+r6*2-32*2], m1
+ sub r6d, 32
+ jg .h_w64_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w64_loop0
+ RET
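+; .v: vertical-only bilinear. my is pre-shifted left by 11 so that
+; pmulhrsw(b-a, my<<11) equals ((b-a)*my + 8) >> 4; adding back the top row
+; gives the interpolated pixel without needing a separate rounding constant.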
+.v:
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
+ shl mxyd, 11
+ movd xm5, mxyd
+ add wq, r7
+ vpbroadcastw m5, xm5
+ jmp wq
+.v_w2:
+ movd xm0, [srcq+ssq*0]
+.v_w2_loop:
+ movd xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpckldq xm2, xm0, xm1
+ movd xm0, [srcq+ssq*0]
+ punpckldq xm1, xm0
+ psubw xm1, xm2
+ pmulhrsw xm1, xm5
+ paddw xm1, xm2
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq xm0, [srcq+ssq*0]
+.v_w4_loop:
+ movq xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklqdq xm2, xm0, xm1
+ movq xm0, [srcq+ssq*0]
+ punpcklqdq xm1, xm0
+ psubw xm1, xm2
+ pmulhrsw xm1, xm5
+ paddw xm1, xm2
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movu xm0, [srcq+ssq*0]
+.v_w8_loop:
+ vbroadcasti128 m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m2, m0, m1, 0xf0
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vpblendd m1, m0, 0xf0
+ psubw m1, m2
+ pmulhrsw m1, m5
+ paddw m1, m2
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w32:
+ movu m0, [srcq+ssq*0+32*0]
+ movu m1, [srcq+ssq*0+32*1]
+.v_w32_loop:
+ movu m2, [srcq+ssq*1+32*0]
+ movu m3, [srcq+ssq*1+32*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m4, m2, m0
+ pmulhrsw m4, m5
+ paddw m4, m0
+ movu m0, [srcq+ssq*0+32*0]
+ mova [dstq+dsq*0+32*0], m4
+ psubw m4, m3, m1
+ pmulhrsw m4, m5
+ paddw m4, m1
+ movu m1, [srcq+ssq*0+32*1]
+ mova [dstq+dsq*0+32*1], m4
+ psubw m4, m0, m2
+ pmulhrsw m4, m5
+ paddw m4, m2
+ mova [dstq+dsq*1+32*0], m4
+ psubw m4, m1, m3
+ pmulhrsw m4, m5
+ paddw m4, m3
+ mova [dstq+dsq*1+32*1], m4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ RET
+.v_w16:
+.v_w64:
+.v_w128:
+ movifnidn t0d, org_w
+ add t0d, t0d
+ mov r4, srcq
+ lea r6d, [hq+t0*8-256]
+ mov r7, dstq
+.v_w16_loop0:
+ movu m0, [srcq+ssq*0]
+.v_w16_loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m1, m3, m0
+ pmulhrsw m1, m5
+ paddw m1, m0
+ movu m0, [srcq+ssq*0]
+ psubw m2, m0, m3
+ pmulhrsw m2, m5
+ paddw m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ add r4, 32
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
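+; .hv: 2-D bilinear. The horizontal pass keeps a higher-precision
+; intermediate (only >> 2), the vertical blend multiplies the doubled
+; difference by my<<11 with pmulhw, and a final pmulhrsw by pw_2048 (10-bit)
+; or pw_8192 (12-bit) performs the remaining rounding shift.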
+.hv:
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11
+ vpbroadcastd m3, [pw_2]
+ movd xm6, mxyd
+ vpbroadcastd m7, [pw_8192]
+ add wq, r7
+ vpbroadcastw m6, xm6
+ test dword r8m, 0x800
+ jnz .hv_12bpc
+ psllw m4, 2
+ psllw m5, 2
+ vpbroadcastd m7, [pw_2048]
+.hv_12bpc:
+ jmp wq
+.hv_w2:
+ vpbroadcastq xm1, [srcq+ssq*0]
+ pmullw xm0, xm4, xm1
+ psrlq xm1, 16
+ pmullw xm1, xm5
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 2
+.hv_w2_loop:
+ movq xm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm2, [srcq+ssq*0]
+ pmullw xm1, xm4, xm2
+ psrlq xm2, 16
+ pmullw xm2, xm5
+ paddw xm1, xm3
+ paddw xm1, xm2
+ psrlw xm1, 2 ; 1 _ 2 _
+ shufpd xm2, xm0, xm1, 0x01 ; 0 _ 1 _
+ mova xm0, xm1
+ psubw xm1, xm2
+ paddw xm1, xm1
+ pmulhw xm1, xm6
+ paddw xm1, xm2
+ pmulhrsw xm1, xm7
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ pmullw xm0, xm4, [srcq+ssq*0-8]
+ pmullw xm1, xm5, [srcq+ssq*0-6]
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 2
+.hv_w4_loop:
+ movq xm1, [srcq+ssq*1]
+ movq xm2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ movhps xm1, [srcq+ssq*0]
+ movhps xm2, [srcq+ssq*0+2]
+ pmullw xm1, xm4
+ pmullw xm2, xm5
+ paddw xm1, xm3
+ paddw xm1, xm2
+ psrlw xm1, 2 ; 1 2
+ shufpd xm2, xm0, xm1, 0x01 ; 0 1
+ mova xm0, xm1
+ psubw xm1, xm2
+ paddw xm1, xm1
+ pmulhw xm1, xm6
+ paddw xm1, xm2
+ pmulhrsw xm1, xm7
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ pmullw xm0, xm4, [srcq+ssq*0]
+ pmullw xm1, xm5, [srcq+ssq*0+2]
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 2
+ vinserti128 m0, xm0, 1
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1]
+ movu xm2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m1, [srcq+ssq*0], 1
+ vinserti128 m2, [srcq+ssq*0+2], 1
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2 ; 1 2
+ vperm2i128 m2, m0, m1, 0x21 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+.hv_w32:
+.hv_w64:
+.hv_w128:
+%if UNIX64
+ lea r6d, [r8*2-32]
+%else
+ mov r6d, wm
+ lea r6d, [r6*2-32]
+%endif
+ mov r4, srcq
+ lea r6d, [hq+r6*8]
+ mov r7, dstq
+.hv_w16_loop0:
+ pmullw m0, m4, [srcq+ssq*0]
+ pmullw m1, m5, [srcq+ssq*0+2]
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w16_loop:
+ pmullw m1, m4, [srcq+ssq*1]
+ pmullw m2, m5, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2
+ psubw m2, m1, m0
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m0
+ pmulhrsw m2, m7
+ mova [dstq+dsq*0], m2
+ pmullw m0, m4, [srcq+ssq*0]
+ pmullw m2, m5, [srcq+ssq*0+2]
+ paddw m0, m3
+ paddw m0, m2
+ psrlw m0, 2
+ psubw m2, m0, m1
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m1
+ pmulhrsw m2, m7
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add r4, 32
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+ RET
+
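+; prep_bilin_16bpc: 16bpc bilinear prep. Same layout as put_bilin above, but
+; the output is the signed intermediate format: the copy path multiplies by
+; prep_mul and subtracts pw_8192, while the filtered paths fold the bias in
+; as pw_32766 ahead of the final arithmetic shift.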
+cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea r6, [prep_avx2]
+%if UNIX64
+ DECLARE_REG_TMP 7
+ %define org_w r7d
+%else
+ DECLARE_REG_TMP 6
+ %define org_w r5m
+%endif
+ mov org_w, wd
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ mov r5d, r7m ; bitdepth_max
+ vpbroadcastd m5, [r6-prep_avx2+pw_8192]
+ add wq, r6
+ shr r5d, 11
+ vpbroadcastd m4, [r6-prep_avx2+prep_mul+r5*4]
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movq xm0, [srcq+strideq*0]
+ movhps xm0, [srcq+strideq*1]
+ vpbroadcastq m1, [srcq+strideq*2]
+ vpbroadcastq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m0, m1, 0x30
+ vpblendd m0, m2, 0xc0
+ pmullw m0, m4
+ psubw m0, m5
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ movu xm1, [srcq+strideq*2]
+ vinserti128 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ pmullw m1, m4
+ psubw m0, m5
+ psubw m1, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m4, [srcq+strideq*2]
+ pmullw m3, m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmullw m0, m4, [srcq+strideq*0+32*0]
+ pmullw m1, m4, [srcq+strideq*0+32*1]
+ pmullw m2, m4, [srcq+strideq*1+32*0]
+ pmullw m3, m4, [srcq+strideq*1+32*1]
+ lea srcq, [srcq+strideq*2]
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 2
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmullw m0, m4, [srcq+32*0]
+ pmullw m1, m4, [srcq+32*1]
+ pmullw m2, m4, [srcq+32*2]
+ pmullw m3, m4, [srcq+32*3]
+ add srcq, strideq
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ dec hd
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmullw m0, m4, [srcq+32*0]
+ pmullw m1, m4, [srcq+32*1]
+ pmullw m2, m4, [srcq+32*2]
+ pmullw m3, m4, [srcq+32*3]
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ pmullw m0, m4, [srcq+32*4]
+ pmullw m1, m4, [srcq+32*5]
+ pmullw m2, m4, [srcq+32*6]
+ pmullw m3, m4, [srcq+32*7]
+ add tmpq, 32*8
+ add srcq, strideq
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq-32*4], m0
+ mova [tmpq-32*3], m1
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ movd xm5, mxyd
+ mov mxyd, r6m ; my
+ vpbroadcastd m4, [pw_16]
+ vpbroadcastw m5, xm5
+ vpbroadcastd m3, [pw_32766]
+ psubw m4, m5
+ test dword r7m, 0x800
+ jnz .h_12bpc
+ psllw m4, 2
+ psllw m5, 2
+.h_12bpc:
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ movu xm1, [srcq+strideq*0]
+ vinserti128 m1, [srcq+strideq*2], 1
+ movu xm2, [srcq+strideq*1]
+ vinserti128 m2, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq m0, m1, m2
+ psrldq m1, 2
+ pslldq m2, 6
+ pmullw m0, m4
+ vpblendd m1, m2, 0xcc
+ pmullw m1, m5
+ psubw m0, m3
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4
+ RET
+.h_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ movu xm1, [srcq+strideq*0+2]
+ vinserti128 m1, [srcq+strideq*1+2], 1
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ psubw m0, m3
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m1, m5, [srcq+strideq*0+2]
+ psubw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m5, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ psubw m1, m3
+ paddw m1, m2
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+.h_w64:
+.h_w128:
+ movifnidn t0d, org_w
+.h_w32_loop0:
+ mov r3d, t0d
+.h_w32_loop:
+ pmullw m0, m4, [srcq+r3*2-32*1]
+ pmullw m1, m5, [srcq+r3*2-32*1+2]
+ psubw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+r3*2-32*2]
+ pmullw m2, m5, [srcq+r3*2-32*2+2]
+ psubw m1, m3
+ paddw m1, m2
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+r3*2-32*1], m0
+ mova [tmpq+r3*2-32*2], m1
+ sub r3d, 32
+ jg .h_w32_loop
+ add srcq, strideq
+ lea tmpq, [tmpq+t0*2]
+ dec hd
+ jg .h_w32_loop0
+ RET
+.v:
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+ movd xm5, mxyd
+ vpbroadcastd m4, [pw_16]
+ vpbroadcastw m5, xm5
+ vpbroadcastd m3, [pw_32766]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ psubw m4, m5
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m4, 2
+ psllw m5, 2
+.v_12bpc:
+ jmp wq
+.v_w4:
+ movq xm0, [srcq+strideq*0]
+.v_w4_loop:
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq xm1, [srcq+strideq*1]
+ vpblendd m2, m0, 0x03 ; 0 2 2 2
+ vpbroadcastq m0, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m0, 0xf0 ; 1 1 3 3
+ vpbroadcastq m0, [srcq+strideq*0]
+ vpblendd m1, m2, 0x33 ; 0 1 2 3
+ vpblendd m0, m2, 0x0c ; 4 2 4 4
+ punpckhqdq m2, m1, m0 ; 1 2 3 4
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m3
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movu xm0, [srcq+strideq*0]
+.v_w8_loop:
+ vbroadcasti128 m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpblendd m1, m0, m2, 0xf0 ; 0 1
+ vbroadcasti128 m0, [srcq+strideq*0]
+ vpblendd m2, m0, 0xf0 ; 1 2
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m3
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu m0, [srcq+strideq*0]
+.v_w16_loop:
+ movu m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m4
+ pmullw m1, m5, m2
+ psubw m0, m3
+ paddw m1, m0
+ movu m0, [srcq+strideq*0]
+ psraw m1, 2
+ pmullw m2, m4
+ mova [tmpq+32*0], m1
+ pmullw m1, m5, m0
+ psubw m2, m3
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+.v_w64:
+.v_w128:
+%if WIN64
+ PUSH r7
+%endif
+ movifnidn r7d, org_w
+ add r7d, r7d
+ mov r3, srcq
+ lea r6d, [hq+r7*8-256]
+ mov r5, tmpq
+.v_w32_loop0:
+ movu m0, [srcq+strideq*0]
+.v_w32_loop:
+ movu m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m4
+ pmullw m1, m5, m2
+ psubw m0, m3
+ paddw m1, m0
+ movu m0, [srcq+strideq*0]
+ psraw m1, 2
+ pmullw m2, m4
+ mova [tmpq+r7*0], m1
+ pmullw m1, m5, m0
+ psubw m2, m3
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq+r7*1], m1
+ lea tmpq, [tmpq+r7*2]
+ sub hd, 2
+ jg .v_w32_loop
+ add r3, 32
+ add r5, 32
+ movzx hd, r6b
+ mov srcq, r3
+ mov tmpq, r5
+ sub r6d, 1<<8
+ jg .v_w32_loop0
+%if WIN64
+ POP r7
+%endif
+ RET
+.hv:
+ WIN64_SPILL_XMM 7
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
+ movd xm6, mxyd
+ add wq, r6
+ lea stride3q, [strideq*3]
+ vpbroadcastw m6, xm6
+ jmp wq
+.hv_w4:
+ movu xm1, [srcq+strideq*0]
+%if WIN64
+ movaps [rsp+24], xmm7
+%endif
+ pmullw xm0, xm4, xm1
+ psrldq xm1, 2
+ pmullw xm1, xm5
+ psubw xm0, xm3
+ paddw xm0, xm1
+ psraw xm0, 2
+ vpbroadcastq m0, xm0
+.hv_w4_loop:
+ movu xm1, [srcq+strideq*1]
+ vinserti128 m1, [srcq+stride3q ], 1
+ movu xm2, [srcq+strideq*2]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m2, [srcq+strideq*0], 1
+ punpcklqdq m7, m1, m2
+ psrldq m1, 2
+ pslldq m2, 6
+ pmullw m7, m4
+ vpblendd m1, m2, 0xcc
+ pmullw m1, m5
+ psubw m7, m3
+ paddw m1, m7
+ psraw m1, 2 ; 1 2 3 4
+ vpblendd m0, m1, 0x3f
+ vpermq m2, m0, q2103 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+%if WIN64
+ movaps xmm7, [rsp+24]
+%endif
+ RET
+.hv_w8:
+ pmullw xm0, xm4, [srcq+strideq*0]
+ pmullw xm1, xm5, [srcq+strideq*0+2]
+ psubw xm0, xm3
+ paddw xm0, xm1
+ psraw xm0, 2
+ vinserti128 m0, xm0, 1
+.hv_w8_loop:
+ movu xm1, [srcq+strideq*1]
+ movu xm2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ vinserti128 m1, [srcq+strideq*0], 1
+ vinserti128 m2, [srcq+strideq*0+2], 1
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m3
+ paddw m1, m2
+ psraw m1, 2 ; 1 2
+ vperm2i128 m2, m0, m1, 0x21 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+.hv_w32:
+.hv_w64:
+.hv_w128:
+%if WIN64
+ PUSH r7
+%endif
+ movifnidn r7d, org_w
+ add r7d, r7d
+ mov r3, srcq
+ lea r6d, [hq+r7*8-256]
+ mov r5, tmpq
+.hv_w16_loop0:
+ pmullw m0, m4, [srcq]
+ pmullw m1, m5, [srcq+2]
+ psubw m0, m3
+ paddw m0, m1
+ psraw m0, 2
+.hv_w16_loop:
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m5, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ psubw m1, m3
+ paddw m1, m2
+ psraw m1, 2
+ psubw m2, m1, m0
+ pmulhrsw m2, m6
+ paddw m2, m0
+ mova [tmpq+r7*0], m2
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m2, m5, [srcq+strideq*0+2]
+ psubw m0, m3
+ paddw m0, m2
+ psraw m0, 2
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+r7*1], m2
+ lea tmpq, [tmpq+r7*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add r3, 32
+ add r5, 32
+ movzx hd, r6b
+ mov srcq, r3
+ mov tmpq, r5
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+%if WIN64
+ POP r7
+%endif
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
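+; Each FILTER_* constant packs two row offsets into subpel_filters: the high
+; word selects the full 8-tap filter set and the low word the reduced set
+; used for small block sizes. The FN stubs below place these in t0d (h) and
+; t1d (v) before jumping into the shared 8tap body.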
+
+%macro FN 4 ; prefix, type, type_h, type_v
+cglobal %1_%2_16bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX)
+%endif
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
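+; put_8tap_16bpc: 16bpc 8-tap put. mx/my are replicated into three bytes via
+; imul by 0x010101 and added to the packed type offsets from the FN stubs, so
+; small-block paths read their filter row from the low byte and wide paths
+; from the high word. The 1-D paths shift the 32-bit sums down by 6; the 2-D
+; paths keep a >>10 intermediate before the final >>10.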
+cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
+%define base r8-put_avx2
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx2]
+ movifnidn wd, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ sub srcq, 2
+ mova xm2, [subpel_h_shuf2]
+ vpbroadcastd xm3, [base+subpel_filters+mxq*8+2]
+ pmovsxbw xm3, xm3
+.h_w2_loop:
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm2
+ pshufb xm1, xm2
+ pmaddwd xm0, xm3
+ pmaddwd xm1, xm3
+ phaddd xm0, xm1
+ paddd xm0, xm4
+ psrad xm0, 6
+ packusdw xm0, xm0
+ pminsw xm0, xm5
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xm3, [base+subpel_filters+mxq*8]
+ WIN64_SPILL_XMM 8
+ vbroadcasti128 m6, [subpel_h_shufA]
+ vbroadcasti128 m7, [subpel_h_shufB]
+ pshufd xm3, xm3, q2211
+ vpbroadcastq m2, xm3
+ vpermq m3, m3, q1111
+.h_w4_loop:
+ movu xm1, [srcq+ssq*0]
+ vinserti128 m1, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4
+ pshufb m1, m7 ; 2 3 3 4 4 5 5 6
+ pmaddwd m0, m2
+ pmaddwd m1, m3
+ paddd m0, m4
+ paddd m0, m1
+ psrad m0, 6
+ vextracti128 xm1, m0, 1
+ packusdw xm0, xm1
+ pminsw xm0, xm5
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ mov r7d, r8m
+ vpbroadcastw m5, r8m
+ shr r7d, 11
+ vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4]
+ cmp wd, 4
+ je .h_w4
+ jl .h_w2
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 13
+ shr mxd, 16
+ sub srcq, 6
+ vpbroadcastq m0, [base+subpel_filters+mxq*8]
+ vbroadcasti128 m6, [subpel_h_shufA]
+ vbroadcasti128 m7, [subpel_h_shufB]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ cmp wd, 8
+ jg .h_w16
+.h_w8:
+%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+ pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m6 ; 0 1 1 2 2 3 3 4
+ pmaddwd m%5, m9, m%4 ; abcd1
+ pmaddwd m%1, m8 ; abcd0
+ pshufb m%2, m7 ; 6 7 7 8 8 9 9 a
+ shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m%5, m4
+ paddd m%1, m%5
+ pmaddwd m%5, m11, m%2 ; abcd3
+ paddd m%1, m%5
+ pmaddwd m%5, m10, m%4 ; abcd2
+ pshufb m%3, m7 ; a b b c c d d e
+ pmaddwd m%4, m8 ; efgh0
+ paddd m%1, m%5
+ pmaddwd m%5, m9, m%2 ; efgh1
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m11 ; efgh3
+ pmaddwd m%2, m10 ; efgh2
+ paddd m%4, m4
+ paddd m%4, m%5
+ paddd m%3, m%4
+ paddd m%2, m%3
+ psrad m%1, 6
+ psrad m%2, 6
+ packusdw m%1, m%2
+ pminsw m%1, m5
+%endmacro
+ movu xm0, [srcq+ssq*0+ 0]
+ vinserti128 m0, [srcq+ssq*1+ 0], 1
+ movu xm2, [srcq+ssq*0+16]
+ vinserti128 m2, [srcq+ssq*1+16], 1
+ lea srcq, [srcq+ssq*2]
+ shufpd m1, m0, m2, 0x05
+ PUT_8TAP_H 0, 1, 2, 3, 12
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ mov r6d, wd
+.h_w16_loop:
+ movu m0, [srcq+r6*2-32]
+ movu m1, [srcq+r6*2-24]
+ movu m2, [srcq+r6*2-16]
+ PUT_8TAP_H 0, 1, 2, 3, 12
+ mova [dstq+r6*2-32], m0
+ sub r6d, 16
+ jg .h_w16_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w16
+ RET
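+; .v: source rows are interleaved into word pairs (01 12, 23 34, 45 56, ...)
+; so each pmaddwd applies one pair of filter taps to two output rows at once;
+; only two new rows need to be loaded per loop iteration.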
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ vpbroadcastq m0, [base+subpel_filters+myq*8]
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 15
+ vpbroadcastd m6, [pd_32]
+ vpbroadcastw m7, r8m
+ lea r6, [ssq*3]
+ sub srcq, r6
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ cmp wd, 4
+ jg .v_w8
+ je .v_w4
+.v_w2:
+ movd xm2, [srcq+ssq*0]
+ pinsrd xm2, [srcq+ssq*1], 1
+ pinsrd xm2, [srcq+ssq*2], 2
+ pinsrd xm2, [srcq+r6 ], 3 ; 0 1 2 3
+ lea srcq, [srcq+ssq*4]
+ movd xm3, [srcq+ssq*0]
+ vpbroadcastd xm1, [srcq+ssq*1]
+ vpbroadcastd xm0, [srcq+ssq*2]
+ add srcq, r6
+ vpblendd xm3, xm1, 0x02 ; 4 5
+ vpblendd xm1, xm0, 0x02 ; 5 6
+ palignr xm4, xm3, xm2, 4 ; 1 2 3 4
+ punpcklwd xm3, xm1 ; 45 56
+ punpcklwd xm1, xm2, xm4 ; 01 12
+ punpckhwd xm2, xm4 ; 23 34
+.v_w2_loop:
+ vpbroadcastd xm4, [srcq+ssq*0]
+ pmaddwd xm5, xm8, xm1 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm9 ; a1 b1
+ paddd xm5, xm6
+ paddd xm5, xm2
+ mova xm2, xm3
+ pmaddwd xm3, xm10 ; a2 b2
+ paddd xm5, xm3
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm4, xm0, 0x02 ; 7 8
+ punpcklwd xm3, xm4 ; 67 78
+ pmaddwd xm4, xm11, xm3 ; a3 b3
+ paddd xm5, xm4
+ psrad xm5, 6
+ packusdw xm5, xm5
+ pminsw xm5, xm7
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq xm1, [srcq+ssq*0]
+ vpbroadcastq m0, [srcq+ssq*1]
+ vpbroadcastq m2, [srcq+ssq*2]
+ vpbroadcastq m4, [srcq+r6 ]
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastq m3, [srcq+ssq*0]
+ vpbroadcastq m5, [srcq+ssq*1]
+ vpblendd m1, m0, 0x30
+ vpblendd m0, m2, 0x30
+ punpcklwd m1, m0 ; 01 12
+ vpbroadcastq m0, [srcq+ssq*2]
+ add srcq, r6
+ vpblendd m2, m4, 0x30
+ vpblendd m4, m3, 0x30
+ punpcklwd m2, m4 ; 23 34
+ vpblendd m3, m5, 0x30
+ vpblendd m5, m0, 0x30
+ punpcklwd m3, m5 ; 45 56
+.v_w4_loop:
+ vpbroadcastq m4, [srcq+ssq*0]
+ pmaddwd m5, m8, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m9 ; a1 b1
+ paddd m5, m6
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m10 ; a2 b2
+ paddd m5, m3
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m4, m0, 0x30
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m11, m3 ; a3 b3
+ paddd m5, m4
+ psrad m5, 6
+ vextracti128 xm4, m5, 1
+ packusdw xm5, xm4
+ pminsw xm5, xm7
+ movq [dstq+dsq*0], xm5
+ movhps [dstq+dsq*1], xm5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ shl wd, 5
+ mov r7, srcq
+ mov r8, dstq
+ lea wd, [hq+wq-256]
+.v_w8_loop0:
+ vbroadcasti128 m4, [srcq+ssq*0]
+ vbroadcasti128 m5, [srcq+ssq*1]
+ vbroadcasti128 m0, [srcq+r6 ]
+ vbroadcasti128 m6, [srcq+ssq*2]
+ lea srcq, [srcq+ssq*4]
+ vbroadcasti128 m1, [srcq+ssq*0]
+ vbroadcasti128 m2, [srcq+ssq*1]
+ vbroadcasti128 m3, [srcq+ssq*2]
+ add srcq, r6
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ shufpd m6, m2, 0x0c
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ shufpd m0, m3, 0x0c
+ punpcklwd m3, m6, m0 ; 23
+ punpckhwd m6, m0 ; 56
+.v_w8_loop:
+ vbroadcasti128 m14, [srcq+ssq*0]
+ pmaddwd m12, m8, m1 ; a0
+ pmaddwd m13, m8, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m12, m3
+ paddd m13, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m12, m5
+ vbroadcasti128 m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ paddd m13, m6
+ shufpd m6, m0, m14, 0x0d
+ shufpd m0, m14, m5, 0x0c
+ punpcklwd m5, m6, m0 ; 67
+ punpckhwd m6, m0 ; 78
+ pmaddwd m14, m11, m5 ; a3
+ paddd m12, m14
+ pmaddwd m14, m11, m6 ; b3
+ paddd m13, m14
+ psrad m12, 5
+ psrad m13, 5
+ packusdw m12, m13
+ pxor m13, m13
+ pavgw m12, m13
+ pminsw m12, m7
+ vpermq m12, m12, q3120
+ mova [dstq+dsq*0], xm12
+ vextracti128 [dstq+dsq*1], m12, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ add r7, 16
+ add r8, 16
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+ jg .v_w8_loop0
+ RET
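+; .hv: combined 8-tap filtering. The horizontal pass produces a >>10
+; intermediate; for w >= 8 the vertical coefficients are parked in [v_mul]
+; on the stack (Win64 home space / red zone elsewhere) because all sixteen
+; vector registers are occupied.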
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ vpbroadcastw m15, r8m
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ vpbroadcastd m0, [base+subpel_filters+mxq*8+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ vpbroadcastq m1, [base+subpel_filters+myq*8]
+ vpbroadcastd m6, [pd_512]
+ lea r6, [ssq*3]
+ sub srcq, 2
+ sub srcq, r6
+ pxor m7, m7
+ punpcklbw m7, m0
+ punpcklbw m1, m1
+ psraw m1, 8 ; sign-extend
+ test dword r8m, 0x800
+ jz .hv_10bit
+ psraw m7, 2
+ psllw m1, 2
+.hv_10bit:
+ pshufd m11, m1, q0000
+ pshufd m12, m1, q1111
+ pshufd m13, m1, q2222
+ pshufd m14, m1, q3333
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 m9, [subpel_h_shuf2]
+ vbroadcasti128 m1, [srcq+r6 ] ; 3 3
+ movu xm3, [srcq+ssq*2]
+ movu xm0, [srcq+ssq*0]
+ movu xm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m3, [srcq+ssq*0], 1 ; 2 4
+ vinserti128 m0, [srcq+ssq*1], 1 ; 0 5
+ vinserti128 m2, [srcq+ssq*2], 1 ; 1 6
+ add srcq, r6
+ pshufb m1, m9
+ pshufb m3, m9
+ pshufb m0, m9
+ pshufb m2, m9
+ pmaddwd m1, m7
+ pmaddwd m3, m7
+ pmaddwd m0, m7
+ pmaddwd m2, m7
+ phaddd m1, m3
+ phaddd m0, m2
+ paddd m1, m6
+ paddd m0, m6
+ psrad m1, 10
+ psrad m0, 10
+ packssdw m1, m0 ; 3 2 0 1
+ vextracti128 xm0, m1, 1 ; 3 4 5 6
+ pshufd xm2, xm1, q1301 ; 2 3 1 2
+ pshufd xm3, xm0, q2121 ; 4 5 4 5
+ punpckhwd xm1, xm2 ; 01 12
+ punpcklwd xm2, xm0 ; 23 34
+ punpckhwd xm3, xm0 ; 45 56
+.hv_w2_loop:
+ movu xm4, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm4, xm9
+ pshufb xm5, xm9
+ pmaddwd xm4, xm7
+ pmaddwd xm5, xm7
+ phaddd xm4, xm5
+ pmaddwd xm5, xm11, xm1 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm12 ; a1 b1
+ paddd xm5, xm2
+ mova xm2, xm3
+ pmaddwd xm3, xm13 ; a2 b2
+ paddd xm5, xm3
+ paddd xm4, xm6
+ psrad xm4, 10
+ packssdw xm4, xm4
+ palignr xm3, xm4, xm0, 12
+ mova xm0, xm4
+ punpcklwd xm3, xm0 ; 67 78
+ pmaddwd xm4, xm14, xm3 ; a3 b3
+ paddd xm5, xm6
+ paddd xm5, xm4
+ psrad xm5, 10
+ packusdw xm5, xm5
+ pminsw xm5, xm15
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ vbroadcasti128 m9, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufB]
+ pshufd m8, m7, q1111
+ pshufd m7, m7, q0000
+ movu xm1, [srcq+ssq*0]
+ vinserti128 m1, [srcq+ssq*1], 1 ; 0 1
+ vbroadcasti128 m0, [srcq+r6 ]
+ vinserti128 m2, m0, [srcq+ssq*2], 0 ; 2 3
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m0, [srcq+ssq*0], 1 ; 3 4
+ movu xm3, [srcq+ssq*1]
+ vinserti128 m3, [srcq+ssq*2], 1 ; 5 6
+ add srcq, r6
+ pshufb m4, m1, m9
+ pshufb m1, m10
+ pmaddwd m4, m7
+ pmaddwd m1, m8
+ pshufb m5, m2, m9
+ pshufb m2, m10
+ pmaddwd m5, m7
+ pmaddwd m2, m8
+ paddd m4, m6
+ paddd m1, m4
+ pshufb m4, m0, m9
+ pshufb m0, m10
+ pmaddwd m4, m7
+ pmaddwd m0, m8
+ paddd m5, m6
+ paddd m2, m5
+ pshufb m5, m3, m9
+ pshufb m3, m10
+ pmaddwd m5, m7
+ pmaddwd m3, m8
+ paddd m4, m6
+ paddd m4, m0
+ paddd m5, m6
+ paddd m5, m3
+ vperm2i128 m0, m1, m2, 0x21
+ psrld m1, 10
+ psrld m2, 10
+ vperm2i128 m3, m4, m5, 0x21
+ pslld m4, 6
+ pslld m5, 6
+ pblendw m2, m4, 0xaa ; 23 34
+ pslld m0, 6
+ pblendw m1, m0, 0xaa ; 01 12
+ psrld m3, 10
+ pblendw m3, m5, 0xaa ; 45 56
+ psrad m0, m5, 16
+.hv_w4_loop:
+ movu xm4, [srcq+ssq*0]
+ vinserti128 m4, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m11, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m12 ; a1 b1
+ paddd m5, m6
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m13 ; a2 b2
+ paddd m5, m3
+ pshufb m3, m4, m9
+ pshufb m4, m10
+ pmaddwd m3, m7
+ pmaddwd m4, m8
+ paddd m3, m6
+ paddd m4, m3
+ psrad m4, 10
+ packssdw m0, m4 ; _ 7 6 8
+ vpermq m3, m0, q1122 ; _ 6 _ 7
+ punpckhwd m3, m0 ; 67 78
+ mova m0, m4
+ pmaddwd m4, m14, m3 ; a3 b3
+ paddd m4, m5
+ psrad m4, 10
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, xm15
+ movq [dstq+dsq*0], xm4
+ movhps [dstq+dsq*1], xm4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ vpbroadcastq m2, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ pmovsxbw xm1, [base+subpel_filters+myq*8]
+ shl wd, 5
+ lea r6, [ssq*3]
+ sub srcq, 6
+ sub srcq, r6
+ pxor m0, m0
+ punpcklbw m0, m2
+ mov r7, srcq
+ mov r8, dstq
+ lea wd, [hq+wq-256]
+ test dword r8m, 0x800
+ jz .hv_w8_10bit
+ psraw m0, 2
+ psllw xm1, 2
+.hv_w8_10bit:
+ pshufd m11, m0, q0000
+ pshufd m12, m0, q1111
+ pshufd m13, m0, q2222
+ pshufd m14, m0, q3333
+%if WIN64
+ %define v_mul (rsp+stack_offset+40) ; r4m
+%else
+ %define v_mul (rsp-24) ; red zone
+%endif
+ mova [v_mul], xm1
+.hv_w8_loop0:
+%macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
+ pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m8 ; 0 1 1 2 2 3 3 4
+ pmaddwd m3, m12, m2
+ pmaddwd m%1, m11
+ pshufb m%2, m9 ; 6 7 7 8 8 9 9 a
+ shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m3, m10
+ paddd m%1, m3
+ pmaddwd m3, m14, m%2
+ paddd m%1, m3
+ pmaddwd m3, m13, m2
+ pshufb m%3, m9 ; a b b c c d d e
+ pmaddwd m2, m11
+ paddd m%1, m3
+ pmaddwd m3, m12, m%2
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m14
+ pmaddwd m%2, m13
+ paddd m2, m10
+ paddd m2, m3
+ paddd m%3, m2
+ paddd m%2, m%3
+ psrad m%1, 10
+ psrad m%2, 10
+ packssdw m%1, m%2
+%endmacro
+ movu xm4, [srcq+r6 *1+ 0]
+ vbroadcasti128 m8, [subpel_h_shufA]
+ movu xm6, [srcq+r6 *1+ 8]
+ vbroadcasti128 m9, [subpel_h_shufB]
+ movu xm0, [srcq+r6 *1+16]
+ vpbroadcastd m10, [pd_512]
+ movu xm5, [srcq+ssq*0+ 0]
+ vinserti128 m5, [srcq+ssq*4+ 0], 1
+ movu xm1, [srcq+ssq*0+16]
+ vinserti128 m1, [srcq+ssq*4+16], 1
+ shufpd m7, m5, m1, 0x05
+ INIT_XMM avx2
+ PUT_8TAP_HV_H 4, 6, 0 ; 3
+ INIT_YMM avx2
+ PUT_8TAP_HV_H 5, 7, 1 ; 0 4
+ movu xm0, [srcq+ssq*2+ 0]
+ vinserti128 m0, [srcq+r6 *2+ 0], 1
+ movu xm1, [srcq+ssq*2+16]
+ vinserti128 m1, [srcq+r6 *2+16], 1
+ shufpd m7, m0, m1, 0x05
+ PUT_8TAP_HV_H 0, 7, 1 ; 2 6
+ movu xm6, [srcq+ssq*1+ 0]
+ movu xm1, [srcq+ssq*1+16]
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m6, [srcq+ssq*1+ 0], 1
+ vinserti128 m1, [srcq+ssq*1+16], 1
+ add srcq, r6
+ shufpd m7, m6, m1, 0x05
+ PUT_8TAP_HV_H 6, 7, 1 ; 1 5
+ vpermq m4, m4, q1100
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ vpermq m7, m0, q3120
+ punpcklwd m3, m7, m4 ; 23
+ punpckhwd m4, m5 ; 34
+ punpcklwd m1, m5, m6 ; 01
+ punpckhwd m5, m6 ; 45
+ punpcklwd m2, m6, m7 ; 12
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vpbroadcastd m9, [v_mul+4*0]
+ vpbroadcastd m7, [v_mul+4*1]
+ vpbroadcastd m10, [v_mul+4*2]
+ pmaddwd m8, m9, m1 ; a0
+ pmaddwd m9, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m7 ; a1
+ pmaddwd m4, m7 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ movu xm5, [srcq+ssq*0]
+ vinserti128 m5, [srcq+ssq*1], 1
+ vbroadcasti128 m7, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufB]
+ movu xm6, [srcq+ssq*0+16]
+ vinserti128 m6, [srcq+ssq*1+16], 1
+ vextracti128 [dstq], m0, 1
+ pshufb m0, m5, m7 ; 01
+ pshufb m5, m10 ; 23
+ pmaddwd m0, m11
+ pmaddwd m5, m12
+ paddd m0, m5
+ pshufb m5, m6, m7 ; 89
+ pshufb m6, m10 ; ab
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ paddd m6, m5
+ movu xm5, [srcq+ssq*0+8]
+ vinserti128 m5, [srcq+ssq*1+8], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m7, m5, m7
+ pshufb m5, m10
+ pmaddwd m10, m13, m7
+ pmaddwd m7, m11
+ paddd m0, m10
+ vpbroadcastd m10, [pd_512]
+ paddd m6, m7
+ pmaddwd m7, m14, m5
+ pmaddwd m5, m12
+ paddd m0, m7
+ paddd m5, m6
+ vbroadcasti128 m6, [dstq]
+ paddd m8, m10
+ paddd m9, m10
+ paddd m0, m10
+ paddd m5, m10
+ vpbroadcastd m10, [v_mul+4*3]
+ psrad m0, 10
+ psrad m5, 10
+ packssdw m0, m5
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m10, m5 ; a3
+ pmaddwd m10, m6 ; b3
+ paddd m7, m8
+ paddd m9, m10
+ psrad m7, 10
+ psrad m9, 10
+ packusdw m7, m9
+ pminsw m7, m15
+ vpermq m7, m7, q3120
+ mova [dstq+dsq*0], xm7
+ vextracti128 [dstq+dsq*1], m7, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ add r7, 16
+ add r8, 16
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+ jg .hv_w8_loop0
+ RET
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
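+; prep_8tap_16bpc: 16bpc 8-tap prep. Mirrors put_8tap above but writes the
+; signed intermediate format: 10-bit inputs get the coefficients pre-scaled
+; by 4, rounding uses prep_8tap_1d_rnd / prep_8tap_2d_rnd, and results are
+; shifted by 4 (1-D) or 6 (2-D) instead of being clamped to pixels.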
+cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
+%define base r7-prep_avx2
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep_avx2]
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ mov r6d, r7m ; bitdepth_max
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ vpbroadcastd m5, [r7-prep_avx2+pw_8192]
+ shr r6d, 11
+ add wq, r7
+ vpbroadcastd m4, [base+prep_mul+r6*4]
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xm0, [base+subpel_filters+mxq*8]
+ vbroadcasti128 m3, [subpel_h_shufA]
+ vbroadcasti128 m4, [subpel_h_shufB]
+ WIN64_SPILL_XMM 8
+ pshufd xm0, xm0, q2211
+ test dword r7m, 0x800
+ jnz .h_w4_12bpc
+ psllw xm0, 2
+.h_w4_12bpc:
+ vpbroadcastq m6, xm0
+ vpermq m7, m0, q1111
+.h_w4_loop:
+ movu xm1, [srcq+strideq*0]
+ vinserti128 m1, [srcq+strideq*2], 1
+ movu xm2, [srcq+strideq*1]
+ vinserti128 m2, [srcq+r6 ], 1
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4
+ pshufb m1, m4 ; 2 3 3 4 4 5 5 6
+ pmaddwd m0, m6
+ pmaddwd m1, m7
+ paddd m0, m5
+ paddd m0, m1
+ pshufb m1, m2, m3
+ pshufb m2, m4
+ pmaddwd m1, m6
+ pmaddwd m2, m7
+ paddd m1, m5
+ paddd m1, m2
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
+ lea r6, [strideq*3]
+ cmp wd, 4
+ je .h_w4
+ shr mxd, 16
+ sub srcq, 6
+ vpbroadcastq m0, [base+subpel_filters+mxq*8]
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 12
+ vbroadcasti128 m6, [subpel_h_shufA]
+ vbroadcasti128 m7, [subpel_h_shufB]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ test dword r7m, 0x800
+ jnz .h_12bpc
+ psllw m0, 2
+.h_12bpc:
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ cmp wd, 8
+ jg .h_w16
+.h_w8:
+%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+ pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m6 ; 0 1 1 2 2 3 3 4
+ pmaddwd m%5, m9, m%4 ; abcd1
+ pmaddwd m%1, m8 ; abcd0
+ pshufb m%2, m7 ; 6 7 7 8 8 9 9 a
+ shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m%5, m5
+ paddd m%1, m%5
+ pmaddwd m%5, m11, m%2 ; abcd3
+ paddd m%1, m%5
+ pmaddwd m%5, m10, m%4 ; abcd2
+ pshufb m%3, m7 ; a b b c c d d e
+ pmaddwd m%4, m8 ; efgh0
+ paddd m%1, m%5
+ pmaddwd m%5, m9, m%2 ; efgh1
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m11 ; efgh3
+ pmaddwd m%2, m10 ; efgh2
+ paddd m%4, m5
+ paddd m%4, m%5
+ paddd m%3, m%4
+ paddd m%2, m%3
+ psrad m%1, 4
+ psrad m%2, 4
+ packssdw m%1, m%2
+%endmacro
+ movu xm0, [srcq+strideq*0+ 0]
+ vinserti128 m0, [srcq+strideq*1+ 0], 1
+ movu xm2, [srcq+strideq*0+16]
+ vinserti128 m2, [srcq+strideq*1+16], 1
+ lea srcq, [srcq+strideq*2]
+ shufpd m1, m0, m2, 0x05
+ PREP_8TAP_H 0, 1, 2, 3, 4
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ add wd, wd
+.h_w16_loop0:
+ mov r6d, wd
+.h_w16_loop:
+ movu m0, [srcq+r6-32]
+ movu m1, [srcq+r6-24]
+ movu m2, [srcq+r6-16]
+ PREP_8TAP_H 0, 1, 2, 3, 4
+ mova [tmpq+r6-32], m0
+ sub r6d, 32
+ jg .h_w16_loop
+ add srcq, strideq
+ add tmpq, wq
+ dec hd
+ jg .h_w16_loop0
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ vpbroadcastq m0, [base+subpel_filters+myq*8]
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 15
+ vpbroadcastd m7, [prep_8tap_1d_rnd]
+ lea r6, [strideq*3]
+ sub srcq, r6
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m0, 2
+.v_12bpc:
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ cmp wd, 4
+ jg .v_w8
+.v_w4:
+ movq xm1, [srcq+strideq*0]
+ vpbroadcastq m0, [srcq+strideq*1]
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq m4, [srcq+r6 ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m5, [srcq+strideq*1]
+ vpblendd m1, m0, 0x30
+ vpblendd m0, m2, 0x30
+ punpcklwd m1, m0 ; 01 12
+ vpbroadcastq m0, [srcq+strideq*2]
+ add srcq, r6
+ vpblendd m2, m4, 0x30
+ vpblendd m4, m3, 0x30
+ punpcklwd m2, m4 ; 23 34
+ vpblendd m3, m5, 0x30
+ vpblendd m5, m0, 0x30
+ punpcklwd m3, m5 ; 45 56
+.v_w4_loop:
+ vpbroadcastq m4, [srcq+strideq*0]
+ pmaddwd m5, m8, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m9 ; a1 b1
+ paddd m5, m7
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m10 ; a2 b2
+ paddd m5, m3
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpblendd m4, m0, 0x30
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m11, m3 ; a3 b3
+ paddd m5, m4
+ psrad m5, 4
+ vextracti128 xm4, m5, 1
+ packssdw xm5, xm4
+ mova [tmpq], xm5
+ add tmpq, 16
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+%if WIN64
+ push r8
+%endif
+ mov r8d, wd
+ shl wd, 5
+ mov r5, srcq
+ mov r7, tmpq
+ lea wd, [hq+wq-256]
+.v_w8_loop0:
+ vbroadcasti128 m4, [srcq+strideq*0]
+ vbroadcasti128 m5, [srcq+strideq*1]
+ vbroadcasti128 m0, [srcq+r6 ]
+ vbroadcasti128 m6, [srcq+strideq*2]
+ lea srcq, [srcq+strideq*4]
+ vbroadcasti128 m1, [srcq+strideq*0]
+ vbroadcasti128 m2, [srcq+strideq*1]
+ vbroadcasti128 m3, [srcq+strideq*2]
+ add srcq, r6
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ shufpd m6, m2, 0x0c
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ shufpd m0, m3, 0x0c
+ punpcklwd m3, m6, m0 ; 23
+ punpckhwd m6, m0 ; 56
+.v_w8_loop:
+ vbroadcasti128 m14, [srcq+strideq*0]
+ pmaddwd m12, m8, m1 ; a0
+ pmaddwd m13, m8, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m12, m7
+ paddd m13, m7
+ paddd m12, m3
+ paddd m13, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m12, m5
+ vbroadcasti128 m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ paddd m13, m6
+ shufpd m6, m0, m14, 0x0d
+ shufpd m0, m14, m5, 0x0c
+ punpcklwd m5, m6, m0 ; 67
+ punpckhwd m6, m0 ; 78
+ pmaddwd m14, m11, m5 ; a3
+ paddd m12, m14
+ pmaddwd m14, m11, m6 ; b3
+ paddd m13, m14
+ psrad m12, 4
+ psrad m13, 4
+ packssdw m12, m13
+ vpermq m12, m12, q3120
+ mova [tmpq+r8*0], xm12
+ vextracti128 [tmpq+r8*2], m12, 1
+ lea tmpq, [tmpq+r8*4]
+ sub hd, 2
+ jg .v_w8_loop
+ add r5, 16
+ add r7, 16
+ movzx hd, wb
+ mov srcq, r5
+ mov tmpq, r7
+ sub wd, 1<<8
+ jg .v_w8_loop0
+%if WIN64
+ pop r8
+%endif
+ RET
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ vpbroadcastd m15, [prep_8tap_2d_rnd]
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ vpbroadcastd m0, [base+subpel_filters+mxq*8+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ vpbroadcastq m1, [base+subpel_filters+myq*8]
+ lea r6, [strideq*3]
+ sub srcq, 2
+ sub srcq, r6
+ pxor m7, m7
+ punpcklbw m7, m0
+ punpcklbw m1, m1
+ psraw m7, 4
+ psraw m1, 8
+ test dword r7m, 0x800
+ jz .hv_w4_10bit
+ psraw m7, 2
+.hv_w4_10bit:
+ pshufd m11, m1, q0000
+ pshufd m12, m1, q1111
+ pshufd m13, m1, q2222
+ pshufd m14, m1, q3333
+.hv_w4:
+ vbroadcasti128 m9, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufB]
+ pshufd m8, m7, q1111
+ pshufd m7, m7, q0000
+ movu xm1, [srcq+strideq*0]
+ vinserti128 m1, [srcq+strideq*1], 1 ; 0 1
+ vbroadcasti128 m0, [srcq+r6 ]
+ vinserti128 m2, m0, [srcq+strideq*2], 0 ; 2 3
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m0, [srcq+strideq*0], 1 ; 3 4
+ movu xm3, [srcq+strideq*1]
+ vinserti128 m3, [srcq+strideq*2], 1 ; 5 6
+ add srcq, r6
+ pshufb m4, m1, m9
+ pshufb m1, m10
+ pmaddwd m4, m7
+ pmaddwd m1, m8
+ pshufb m5, m2, m9
+ pshufb m2, m10
+ pmaddwd m5, m7
+ pmaddwd m2, m8
+ paddd m4, m15
+ paddd m1, m4
+ pshufb m4, m0, m9
+ pshufb m0, m10
+ pmaddwd m4, m7
+ pmaddwd m0, m8
+ paddd m5, m15
+ paddd m2, m5
+ pshufb m5, m3, m9
+ pshufb m3, m10
+ pmaddwd m5, m7
+ pmaddwd m3, m8
+ paddd m4, m15
+ paddd m4, m0
+ paddd m5, m15
+ paddd m5, m3
+ vperm2i128 m0, m1, m2, 0x21
+ psrld m1, 6
+ psrld m2, 6
+ vperm2i128 m3, m4, m5, 0x21
+ pslld m4, 10
+ pslld m5, 10
+ pblendw m2, m4, 0xaa ; 23 34
+ pslld m0, 10
+ pblendw m1, m0, 0xaa ; 01 12
+ psrld m3, 6
+ pblendw m3, m5, 0xaa ; 45 56
+ psrad m0, m5, 16
+.hv_w4_loop:
+ movu xm4, [srcq+strideq*0]
+ vinserti128 m4, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ pmaddwd m5, m11, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m12 ; a1 b1
+ paddd m5, m15
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m13 ; a2 b2
+ paddd m5, m3
+ pshufb m3, m4, m9
+ pshufb m4, m10
+ pmaddwd m3, m7
+ pmaddwd m4, m8
+ paddd m3, m15
+ paddd m4, m3
+ psrad m4, 6
+ packssdw m0, m4 ; _ 7 6 8
+ vpermq m3, m0, q1122 ; _ 6 _ 7
+ punpckhwd m3, m0 ; 67 78
+ mova m0, m4
+ pmaddwd m4, m14, m3 ; a3 b3
+ paddd m4, m5
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, 16
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ vpbroadcastq m2, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ pmovsxbw xm1, [base+subpel_filters+myq*8]
+%if WIN64
+ PUSH r8
+%endif
+ mov r8d, wd
+ shl wd, 5
+ lea r6, [strideq*3]
+ sub srcq, 6
+ sub srcq, r6
+ mov r5, srcq
+ mov r7, tmpq
+ lea wd, [hq+wq-256]
+ pxor m0, m0
+ punpcklbw m0, m2
+ mova [v_mul], xm1
+ psraw m0, 4
+ test dword r7m, 0x800
+ jz .hv_w8_10bit
+ psraw m0, 2
+.hv_w8_10bit:
+ pshufd m11, m0, q0000
+ pshufd m12, m0, q1111
+ pshufd m13, m0, q2222
+ pshufd m14, m0, q3333
+.hv_w8_loop0:
+%macro PREP_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
+ pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m8 ; 0 1 1 2 2 3 3 4
+ pmaddwd m3, m12, m2
+ pmaddwd m%1, m11
+ pshufb m%2, m9 ; 6 7 7 8 8 9 9 a
+ shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m3, m15
+ paddd m%1, m3
+ pmaddwd m3, m14, m%2
+ paddd m%1, m3
+ pmaddwd m3, m13, m2
+ pshufb m%3, m9 ; a b b c c d d e
+ pmaddwd m2, m11
+ paddd m%1, m3
+ pmaddwd m3, m12, m%2
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m14
+ pmaddwd m%2, m13
+ paddd m2, m15
+ paddd m2, m3
+ paddd m2, m%3
+ paddd m2, m%2
+ psrad m%1, 6
+ psrad m2, 6
+ packssdw m%1, m2
+%endmacro
+ movu xm4, [srcq+r6 + 0]
+ vbroadcasti128 m8, [subpel_h_shufA]
+ movu xm6, [srcq+r6 + 8]
+ vbroadcasti128 m9, [subpel_h_shufB]
+ movu xm0, [srcq+r6 +16]
+ movu xm5, [srcq+strideq*0+ 0]
+ vinserti128 m5, [srcq+strideq*4+ 0], 1
+ movu xm1, [srcq+strideq*0+16]
+ vinserti128 m1, [srcq+strideq*4+16], 1
+ shufpd m7, m5, m1, 0x05
+ INIT_XMM avx2
+ PREP_8TAP_HV_H 4, 6, 0 ; 3
+ INIT_YMM avx2
+ PREP_8TAP_HV_H 5, 7, 1 ; 0 4
+ movu xm0, [srcq+strideq*2+ 0]
+ vinserti128 m0, [srcq+r6 *2+ 0], 1
+ movu xm1, [srcq+strideq*2+16]
+ vinserti128 m1, [srcq+r6 *2+16], 1
+ shufpd m7, m0, m1, 0x05
+ PREP_8TAP_HV_H 0, 7, 1 ; 2 6
+ movu xm6, [srcq+strideq*1+ 0]
+ movu xm1, [srcq+strideq*1+16]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m6, [srcq+strideq*1+ 0], 1
+ vinserti128 m1, [srcq+strideq*1+16], 1
+ add srcq, r6
+ shufpd m7, m6, m1, 0x05
+ PREP_8TAP_HV_H 6, 7, 1 ; 1 5
+ vpermq m4, m4, q1100
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ vpermq m7, m0, q3120
+ punpcklwd m3, m7, m4 ; 23
+ punpckhwd m4, m5 ; 34
+ punpcklwd m1, m5, m6 ; 01
+ punpckhwd m5, m6 ; 45
+ punpcklwd m2, m6, m7 ; 12
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vpbroadcastd m9, [v_mul+4*0]
+ vpbroadcastd m7, [v_mul+4*1]
+ vpbroadcastd m10, [v_mul+4*2]
+ pmaddwd m8, m9, m1 ; a0
+ pmaddwd m9, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m7 ; a1
+ pmaddwd m4, m7 ; b1
+ paddd m8, m15
+ paddd m9, m15
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ movu xm5, [srcq+strideq*0]
+ vinserti128 m5, [srcq+strideq*1], 1
+ vbroadcasti128 m7, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufB]
+ movu xm6, [srcq+strideq*0+16]
+ vinserti128 m6, [srcq+strideq*1+16], 1
+ vextracti128 [tmpq], m0, 1
+ pshufb m0, m5, m7 ; 01
+ pshufb m5, m10 ; 23
+ pmaddwd m0, m11
+ pmaddwd m5, m12
+ paddd m0, m15
+ paddd m0, m5
+ pshufb m5, m6, m7 ; 89
+ pshufb m6, m10 ; ab
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ paddd m5, m15
+ paddd m6, m5
+ movu xm5, [srcq+strideq*0+8]
+ vinserti128 m5, [srcq+strideq*1+8], 1
+ lea srcq, [srcq+strideq*2]
+ pshufb m7, m5, m7
+ pshufb m5, m10
+ pmaddwd m10, m13, m7
+ pmaddwd m7, m11
+ paddd m0, m10
+ paddd m6, m7
+ pmaddwd m7, m14, m5
+ pmaddwd m5, m12
+ paddd m0, m7
+ paddd m5, m6
+ vbroadcasti128 m6, [tmpq]
+ vpbroadcastd m10, [v_mul+4*3]
+ psrad m0, 6
+ psrad m5, 6
+ packssdw m0, m5
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m10, m5 ; a3
+ pmaddwd m10, m6 ; b3
+ paddd m7, m8
+ paddd m9, m10
+ psrad m7, 6
+ psrad m9, 6
+ packssdw m7, m9
+ vpermq m7, m7, q3120
+ mova [tmpq+r8*0], xm7
+ vextracti128 [tmpq+r8*2], m7, 1
+ lea tmpq, [tmpq+r8*4]
+ sub hd, 2
+ jg .hv_w8_loop
+ add r5, 16
+ add r7, 16
+ movzx hd, wb
+ mov srcq, r5
+ mov tmpq, r7
+ sub wd, 1<<8
+ jg .hv_w8_loop0
+%if WIN64
+ POP r8
+%endif
+ RET
+
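+; Helper macros for the scaled (8tap_scaled) paths. prep takes one register
+; argument fewer than put, so REMAP_REG shifts every register name down by
+; one in prep mode (and back afterwards), letting both variants share a
+; single macro body written in terms of put's register numbering.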
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro REMAP_REG 2
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %xdefine r14_save r14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ %xdefine r14 r14_save
+ %undef r14_save
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_H 8-9 0 ; dst, tmp[0-6], load_hrnd
+ movu xm%1, [srcq+ r4*2]
+ movu xm%2, [srcq+ r6*2]
+ movu xm%3, [srcq+ r7*2]
+ movu xm%4, [srcq+ r9*2]
+ vinserti128 m%1, [srcq+r10*2], 1
+ vinserti128 m%2, [srcq+r11*2], 1
+ vinserti128 m%3, [srcq+r13*2], 1
+ vinserti128 m%4, [srcq+ rX*2], 1
+ add srcq, ssq
+ movu xm%5, [srcq+ r4*2]
+ movu xm%6, [srcq+ r6*2]
+ movu xm%7, [srcq+ r7*2]
+ movu xm%8, [srcq+ r9*2]
+ vinserti128 m%5, [srcq+r10*2], 1
+ vinserti128 m%6, [srcq+r11*2], 1
+ vinserti128 m%7, [srcq+r13*2], 1
+ vinserti128 m%8, [srcq+ rX*2], 1
+ add srcq, ssq
+ pmaddwd m%1, m12
+ pmaddwd m%2, m13
+ pmaddwd m%3, m14
+ pmaddwd m%4, m15
+ pmaddwd m%5, m12
+ pmaddwd m%6, m13
+ pmaddwd m%7, m14
+ pmaddwd m%8, m15
+ phaddd m%1, m%2
+ %if %9
+ mova m10, [rsp+0x00]
+ %endif
+ phaddd m%3, m%4
+ phaddd m%5, m%6
+ phaddd m%7, m%8
+ phaddd m%1, m%3
+ phaddd m%5, m%7
+ paddd m%1, m10
+ paddd m%5, m10
+ psrad m%1, xm11
+ psrad m%5, xm11
+ packssdw m%1, m%5
+%endmacro
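+; MC_8TAP_SCALED_H gathers two source rows at the eight per-column offsets
+; held in r4/r6/r7/r9/r10/r11/r13/rX, applies the per-column horizontal
+; filters in m12-m15, then rounds with m10, shifts by xm11 and packs the two
+; rows of eight results into a single register.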
+
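+; MC_8TAP_SCALED expands to put_8tap_scaled_16bpc or prep_8tap_scaled_16bpc.
+; Per-column filter phases come from mx+dx*[0..n]; dy appears to be in
+; 1/1024-pel units, so dy == 1024 and dy == 2048 take the .dy1/.dy2
+; specializations for one- and two-row vertical steps.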
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isput 1
+ %assign isprep 0
+cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %xdefine base_reg r12
+ mov r7d, pxmaxm
+%else
+ %assign isput 0
+ %assign isprep 1
+cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %define tmp_stridem qword [rsp+0xd0]
+ %xdefine base_reg r11
+%endif
+ lea base_reg, [%1_8tap_scaled_16bpc_avx2]
+%define base base_reg-%1_8tap_scaled_16bpc_avx2
+ tzcnt wd, wm
+ vpbroadcastd m8, dxm
+%if isprep && UNIX64
+ movd xm10, mxd
+ vpbroadcastd m10, xm10
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+ mov r6d, pxmaxm
+%else
+ vpbroadcastd m10, mxm
+ %if isput
+ vpbroadcastw m11, pxmaxm
+ %else
+ mov r6d, pxmaxm
+ %endif
+%endif
+ mov dyd, dym
+%if isput
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %else
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %endif
+ %define dsm [rsp+0x98]
+ %define rX r1
+ %define rXd r1d
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %else
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm [rsp+0x98]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define rX r14
+ %define rXd r14d
+%endif
+ shr r7d, 11
+ vpbroadcastd m6, [base+pd_0x3ff]
+ vpbroadcastd m12, [base+s_8tap_h_rnd+r7*4]
+ movd xm7, [base+s_8tap_h_sh+r7*4]
+%if isput
+ vpbroadcastd m13, [base+put_s_8tap_v_rnd+r7*4]
+ pinsrd xm7, [base+put_s_8tap_v_sh+r7*4], 2
+%else
+ vpbroadcastd m13, [base+pd_m524256]
+%endif
+ pxor m9, m9
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
+ add wq, base_reg
+ jmp wq
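+; Per-width entry points for the generic (non-.dy1/.dy2) case. w2 (put only)
+; and w4 are handled as single narrow columns; w8..w128 share .w_start, which
+; walks the block in 8-column tiles using the counter at [rsp+0x80] and
+; restores per-tile state in .hloop_prep.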
+%if isput
+.w2:
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m10, m8 ; mx+dx*[0,1]
+ vpbroadcastd xm14, [base+pq_0x40000000+2]
+ vpbroadcastd xm15, xm15
+ pand xm8, xm10, xm6
+ psrld xm8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_q]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd xm15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm4, [base+subpel_filters+r6*8+2]
+ pcmpeqd xm8, xm9
+ psrld m10, 10
+ paddd m10, m10
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ movu xm2, [srcq+ssq*2]
+ movu xm3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m10, m5
+ paddb m10, m6
+ vpblendd xm15, xm4, 0xa
+ pblendvb xm15, xm14, xm8
+ pmovsxbw m15, xm15
+ vinserti128 m0, [srcq+ssq*0], 1 ; 0 4
+ vinserti128 m1, [srcq+ssq*1], 1 ; 1 5
+ vinserti128 m2, [srcq+ssq*2], 1 ; 2 6
+ vinserti128 m3, [srcq+ss3q ], 1 ; 3 7
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m10}, m0, m1, m2, m3
+ REPX {pmaddwd x, m15}, m0, m1, m2, m3
+ phaddd m0, m1
+ phaddd m2, m3
+ paddd m0, m12
+ paddd m2, m12
+ psrad m0, xm7
+ psrad m2, xm7
+ packssdw m0, m2 ; 0 1 2 3 4 5 6 7
+ vextracti128 xm1, m0, 1
+ palignr xm2, xm1, xm0, 4 ; 1 2 3 4
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ pshufd xm4, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm4 ; 45 56
+ punpckhwd xm4, xm1, xm4 ; 67 __
+.w2_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm14, r6q
+ pmovsxbw xm14, xm14
+ pshufd xm8, xm14, q0000
+ pshufd xm9, xm14, q1111
+ pmaddwd xm5, xm3, xm8
+ pmaddwd xm6, xm0, xm9
+ pshufd xm8, xm14, q2222
+ pshufd xm14, xm14, q3333
+ paddd xm5, xm6
+ pmaddwd xm6, xm2, xm8
+ pmaddwd xm8, xm4, xm14
+ psrldq xm9, xm7, 8
+ paddd xm5, xm6
+ paddd xm5, xm13
+ paddd xm5, xm8
+ psrad xm5, xm9
+ packusdw xm5, xm5
+ pminsw xm5, xm11
+ movd [dstq], xm5
+ add dstq, dsq
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w2_loop
+ movu xm5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps xm3, xm0, q1032 ; 01 12
+ shufps xm0, xm2, q1032 ; 23 34
+ shufps xm2, xm4, q1032 ; 45 56
+ pshufb xm5, xm10
+ pmaddwd xm5, xm15
+ phaddd xm5, xm5
+ paddd xm5, xm12
+ psrad xm5, xm7
+ packssdw xm5, xm5
+ palignr xm1, xm5, xm1, 12
+ punpcklqdq xm1, xm1 ; 6 7 6 7
+ punpcklwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+.w2_skip_line:
+ movu xm6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova xm3, xm0 ; 01 12
+ mova xm0, xm2 ; 23 34
+ pshufb xm5, xm10
+ pshufb xm6, xm10
+ pmaddwd xm5, xm15
+ pmaddwd xm6, xm15
+ phaddd xm5, xm6
+ paddd xm5, xm12
+ psrad xm5, xm7
+ packssdw xm5, xm5 ; 6 7 6 7
+ palignr xm1, xm5, xm1, 8 ; 4 5 6 7
+ pshufd xm5, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm5 ; 45 56
+ punpckhwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+%endif
+.w4:
+ mov myd, mym
+ mova [rsp+0x00], m12
+%if isput
+ mova [rsp+0x20], xm13
+%else
+ SWAP m11, m13
+%endif
+ mova [rsp+0x30], xm7
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastq m2, [base+pq_0x40000000+1]
+ vpbroadcastd xm15, xm15
+ SWAP m13, m10
+ paddd m13, m8 ; mx+dx*[0-3]
+ pand m6, m13
+ psrld m6, 6
+ paddd xm15, xm6
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ vbroadcasti128 m5, [base+bdct_lb_q+ 0]
+ vbroadcasti128 m1, [base+bdct_lb_q+16]
+ vbroadcasti128 m0, [base+subpel_s_shuf2]
+ vpbroadcastd xm14, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm7, [base+subpel_filters+r6*8+2]
+ vpbroadcastd xm15, [base+subpel_filters+r11*8+2]
+ vpbroadcastd xm8, [base+subpel_filters+r13*8+2]
+ pcmpeqd m6, m9
+ punpckldq m10, m6, m6
+ punpckhdq m6, m6
+ psrld m13, 10
+ paddd m13, m13
+ vpblendd xm14, xm7, 0xa
+ vpblendd xm15, xm8, 0xa
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ pblendvb m14, m2, m10
+ pblendvb m15, m2, m6
+ pextrd r4, xm13, 2
+ pshufb m12, m13, m5
+ pshufb m13, m1
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu xm7, [srcq+ssq*0]
+ movu xm9, [srcq+ssq*1]
+ movu xm8, [srcq+ssq*2]
+ movu xm10, [srcq+ss3q ]
+ movu xm1, [srcq+r4 ]
+ movu xm3, [srcq+r6 ]
+ movu xm2, [srcq+r11 ]
+ movu xm4, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m7, [srcq+ssq*0], 1
+ vinserti128 m9, [srcq+ssq*1], 1
+ vinserti128 m8, [srcq+ssq*2], 1
+ vinserti128 m10, [srcq+ss3q ], 1
+ vinserti128 m1, [srcq+r4 ], 1
+ vinserti128 m3, [srcq+r6 ], 1
+ vinserti128 m2, [srcq+r11 ], 1
+ vinserti128 m4, [srcq+r13 ], 1
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastb m5, xm13
+ psubb m13, m5
+ paddb m12, m0
+ paddb m13, m0
+ REPX {pshufb x, m12}, m7, m9, m8, m10
+ REPX {pmaddwd x, m14}, m7, m9, m8, m10
+ REPX {pshufb x, m13}, m1, m2, m3, m4
+ REPX {pmaddwd x, m15}, m1, m2, m3, m4
+ mova m5, [rsp+0x00]
+ movd xm6, [rsp+0x30]
+ phaddd m7, m1
+ phaddd m9, m3
+ phaddd m8, m2
+ phaddd m10, m4
+ REPX {paddd x, m5}, m7, m9, m8, m10
+ REPX {psrad x, xm6}, m7, m9, m8, m10
+ packssdw m7, m9 ; 0 1 4 5
+ packssdw m8, m10 ; 2 3 6 7
+ vextracti128 xm9, m7, 1 ; 4 5
+ vextracti128 xm3, m8, 1 ; 6 7
+ shufps xm4, xm7, xm8, q1032 ; 1 2
+ shufps xm5, xm8, xm9, q1032 ; 3 4
+ shufps xm6, xm9, xm3, q1032 ; 5 6
+ psrldq xm10, xm3, 8 ; 7 _
+ punpcklwd xm0, xm7, xm4 ; 01
+ punpckhwd xm7, xm4 ; 12
+ punpcklwd xm1, xm8, xm5 ; 23
+ punpckhwd xm8, xm5 ; 34
+ punpcklwd xm2, xm9, xm6 ; 45
+ punpckhwd xm9, xm6 ; 56
+ punpcklwd xm3, xm10 ; 67
+ mova [rsp+0x40], xm7
+ mova [rsp+0x50], xm8
+ mova [rsp+0x60], xm9
+.w4_loop:
+ and myd, 0x3ff
+ mov r11d, 64 << 24
+ mov r13d, myd
+ shr r13d, 6
+ lea r13d, [t1+r13]
+ cmovnz r11q, [base+subpel_filters+r13*8]
+ movq xm9, r11q
+ pmovsxbw xm9, xm9
+ pshufd xm7, xm9, q0000
+ pshufd xm8, xm9, q1111
+ pmaddwd xm4, xm0, xm7
+ pmaddwd xm5, xm1, xm8
+ pshufd xm7, xm9, q2222
+ pshufd xm9, xm9, q3333
+ pmaddwd xm6, xm2, xm7
+ pmaddwd xm8, xm3, xm9
+%if isput
+ mova xm7, [rsp+0x20]
+ movd xm9, [rsp+0x38]
+%else
+ SWAP m7, m11
+%endif
+ paddd xm4, xm5
+ paddd xm6, xm8
+ paddd xm4, xm6
+ paddd xm4, xm7
+%if isput
+ psrad xm4, xm9
+ packusdw xm4, xm4
+ pminuw xm4, xm11
+ movq [dstq], xm4
+ add dstq, dsq
+%else
+ SWAP m11, m7
+ psrad xm4, 6
+ packssdw xm4, xm4
+ movq [tmpq], xm4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+ mova xm8, [rsp+0x00]
+ movd xm9, [rsp+0x30]
+ movu xm4, [srcq]
+ movu xm5, [srcq+r4]
+ test myd, 0x400
+ jz .w4_skip_line
+ mova xm0, [rsp+0x40]
+ mova [rsp+0x40], xm1
+ mova xm1, [rsp+0x50]
+ mova [rsp+0x50], xm2
+ mova xm2, [rsp+0x60]
+ mova [rsp+0x60], xm3
+ pshufb xm4, xm12
+ pshufb xm5, xm13
+ pmaddwd xm4, xm14
+ pmaddwd xm5, xm15
+ phaddd xm4, xm5
+ paddd xm4, xm8
+ psrad xm4, xm9
+ packssdw xm4, xm4
+ punpcklwd xm3, xm10, xm4
+ mova xm10, xm4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu xm6, [srcq+ssq*1]
+ movu xm7, [srcq+r6]
+ movu m0, [rsp+0x50]
+ pshufb xm4, xm12
+ pshufb xm6, xm12
+ pshufb xm5, xm13
+ pshufb xm7, xm13
+ pmaddwd xm4, xm14
+ pmaddwd xm6, xm14
+ pmaddwd xm5, xm15
+ pmaddwd xm7, xm15
+ mova [rsp+0x40], m0
+ phaddd xm4, xm5
+ phaddd xm6, xm7
+ paddd xm4, xm8
+ paddd xm6, xm8
+ psrad xm4, xm9
+ psrad xm6, xm9
+ packssdw xm4, xm6
+ punpcklwd xm9, xm10, xm4
+ mova [rsp+0x60], xm9
+ psrldq xm10, xm4, 8
+ mova xm0, xm1
+ mova xm1, xm2
+ mova xm2, xm3
+ punpcklwd xm3, xm4, xm10
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+ SWAP m10, m13
+%if isprep
+ SWAP m13, m11
+%endif
+.w8:
+ mov dword [rsp+0x80], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
+.w16:
+ mov dword [rsp+0x80], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [rsp+0x80], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [rsp+0x80], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [rsp+0x80], 16
+ movifprep tmp_stridem, 256
+.w_start:
+ SWAP m10, m12, m1
+ SWAP m11, m7
+ ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
+%if isput
+ movifnidn dsm, dsq
+ mova [rsp+0xb0], xm7
+%endif
+ mova [rsp+0x00], m10
+ mova [rsp+0x20], m13
+ shr t0d, 16
+ sub srcq, 6
+ pmaddwd m8, [base+rescale_mul2]
+ movd xm15, t0d
+ mov [rsp+0x84], t0d
+ mov [rsp+0x88], srcq
+ mov [rsp+0x90], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m1, m8 ; mx+dx*[0-7]
+ jmp .hloop
+.hloop_prep:
+ dec dword [rsp+0x80]
+ jz .ret
+ add qword [rsp+0x90], 16
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m6, [base+pd_0x3ff]
+ paddd m1, m8, [rsp+0x40]
+ vpbroadcastd m15, [rsp+0x84]
+ pxor m9, m9
+ mov srcq, [rsp+0x88]
+ mov r0q, [rsp+0x90] ; dstq / tmpq
+.hloop:
+ vpbroadcastq xm2, [base+pq_0x40000000]
+ pand m5, m1, m6
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ vextracti128 xm7, m15, 1
+ movq r6, xm15
+ pextrq r9, xm15, 1
+ movq r11, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mova [rsp+0x40], m1
+ movq xm12, [base+subpel_filters+ r4*8]
+ movq xm13, [base+subpel_filters+ r6*8]
+ movhps xm12, [base+subpel_filters+ r7*8]
+ movhps xm13, [base+subpel_filters+ r9*8]
+ movq xm14, [base+subpel_filters+r10*8]
+ movq xm15, [base+subpel_filters+r11*8]
+ movhps xm14, [base+subpel_filters+r13*8]
+ movhps xm15, [base+subpel_filters+ rX*8]
+ psrld m1, 10
+ vextracti128 xm7, m1, 1
+ vextracti128 xm6, m5, 1
+ movq [rsp+0xa0], xm1
+ movq [rsp+0xa8], xm7
+ movq r6, xm1
+ pextrq r11, xm1, 1
+ movq r9, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r13d, rXd
+ shr rX, 32
+ pshufd xm4, xm5, q2200
+ pshufd xm5, xm5, q3311
+ pshufd xm7, xm6, q2200
+ pshufd xm6, xm6, q3311
+ pblendvb xm12, xm2, xm4
+ pblendvb xm13, xm2, xm5
+ pblendvb xm14, xm2, xm7
+ pblendvb xm15, xm2, xm6
+ pmovsxbw m12, xm12
+ pmovsxbw m13, xm13
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ mova [rsp+0x60], m0
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
+ mova m0, [rsp+0x60]
+ vbroadcasti128 m9, [base+subpel_s_shuf8]
+ mov myd, mym
+ mov dyd, dym
+ pshufb m0, m9 ; 01a 01b
+ pshufb m1, m9 ; 23a 23b
+ pshufb m2, m9 ; 45a 45b
+ pshufb m3, m9 ; 67a 67b
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm9, r6q
+ punpcklqdq xm9, xm9
+ pmovsxbw m9, xm9
+ pshufd m8, m9, q0000
+ pshufd m7, m9, q1111
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m7
+ pshufd m8, m9, q2222
+ pshufd m9, m9, q3333
+ pmaddwd m6, m2, m8
+ pmaddwd m7, m3, m9
+%if isput
+ psrldq xm8, xm11, 8
+%endif
+ paddd m4, [rsp+0x20]
+ paddd m6, m7
+ paddd m4, m5
+ paddd m4, m6
+%if isput
+ psrad m4, xm8
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0xb0]
+ mova [dstq], xm4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [rsp+0x60], myd
+ mov r4d, [rsp+0xa0]
+ mov r6d, [rsp+0xa4]
+ mov r7d, [rsp+0xa8]
+ mov r9d, [rsp+0xac]
+ jz .skip_line
+ vbroadcasti128 m9, [base+wswap]
+ movu xm4, [srcq+ r4*2]
+ movu xm5, [srcq+ r6*2]
+ movu xm6, [srcq+ r7*2]
+ movu xm7, [srcq+ r9*2]
+ vinserti128 m4, [srcq+r10*2], 1
+ vinserti128 m5, [srcq+r11*2], 1
+ vinserti128 m6, [srcq+r13*2], 1
+ vinserti128 m7, [srcq+ rX*2], 1
+ add srcq, ssq
+ mov myd, [rsp+0x60]
+ mov dyd, dym
+ pshufb m0, m9
+ pshufb m1, m9
+ pshufb m2, m9
+ pshufb m3, m9
+ pmaddwd m4, m12
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ pmaddwd m7, m15
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m4, m6
+ paddd m4, m10
+ psrad m4, xm11
+ pslld m4, 16
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .vloop
+.skip_line:
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ MC_8TAP_SCALED_H 3, 10, 4, 5, 6, 7, 8, 9, 1
+ vbroadcasti128 m9, [base+subpel_s_shuf8]
+ mov myd, [rsp+0x60]
+ mov dyd, dym
+ pshufb m3, m9
+ jmp .vloop
+ SWAP m1, m12, m10
+ SWAP m7, m11
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.dy1_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m10, m8 ; mx+dx*[0-1]
+ vpbroadcastd xm14, [base+pq_0x40000000+2]
+ vpbroadcastd xm15, xm15
+ pand xm8, xm10, xm6
+ psrld xm8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_q]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m4, [base+subpel_filters+r6*8+2]
+ pcmpeqd xm8, xm9
+ psrld m10, 10
+ paddd m10, m10
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ movu xm2, [srcq+ssq*2]
+ movu xm3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m10, m5
+ paddb m10, m6
+ vpblendd xm15, xm4, 0xa
+ pblendvb xm15, xm14, xm8
+ pmovsxbw m15, xm15
+ vinserti128 m0, [srcq+ssq*0], 1
+ vinserti128 m1, [srcq+ssq*1], 1
+ vinserti128 m2, [srcq+ssq*2], 1
+ add srcq, ss3q
+ movq xm6, r4q
+ pmovsxbw xm6, xm6
+ pshufd xm8, xm6, q0000
+ pshufd xm9, xm6, q1111
+ pshufd xm14, xm6, q2222
+ pshufd xm6, xm6, q3333
+ REPX {pshufb x, m10}, m0, m1, m2
+ pshufb xm3, xm10
+ REPX {pmaddwd x, m15}, m0, m1, m2
+ pmaddwd xm3, xm15
+ phaddd m0, m1
+ phaddd m2, m3
+ paddd m0, m12
+ paddd m2, m12
+ psrad m0, xm7
+ psrad m2, xm7
+ packssdw m0, m2
+ vextracti128 xm1, m0, 1
+ palignr xm2, xm1, xm0, 4
+ pshufd xm4, xm1, q2121
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ punpcklwd xm2, xm1, xm4 ; 45 56
+.dy1_w2_loop:
+ movu xm1, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm1, xm10
+ pshufb xm5, xm10
+ pmaddwd xm1, xm15
+ pmaddwd xm5, xm15
+ phaddd xm1, xm5
+ pmaddwd xm5, xm3, xm8
+ mova xm3, xm0
+ pmaddwd xm0, xm9
+ paddd xm1, xm12
+ psrad xm1, xm7
+ packssdw xm1, xm1
+ paddd xm5, xm0
+ mova xm0, xm2
+ pmaddwd xm2, xm14
+ paddd xm5, xm2
+ palignr xm2, xm1, xm4, 12
+ punpcklwd xm2, xm1 ; 67 78
+ pmaddwd xm4, xm2, xm6
+ paddd xm5, xm13
+ paddd xm5, xm4
+ mova xm4, xm1
+ psrldq xm1, xm7, 8
+ psrad xm5, xm1
+ packusdw xm5, xm5
+ pminsw xm5, xm11
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+%endif
+.dy1_w4:
+ mov myd, mym
+%if isput
+ mova [rsp+0x50], xm11
+%endif
+ mova [rsp+0x00], m12
+ mova [rsp+0x20], m13
+ mova [rsp+0x40], xm7
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastq m2, [base+pq_0x40000000+1]
+ vpbroadcastd xm15, xm15
+ SWAP m13, m10
+ paddd m13, m8 ; mx+dx*[0-3]
+ pand m6, m13
+ psrld m6, 6
+ paddd xm15, xm6
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ vbroadcasti128 m5, [base+bdct_lb_q+ 0]
+ vbroadcasti128 m1, [base+bdct_lb_q+16]
+ vbroadcasti128 m4, [base+subpel_s_shuf2]
+ vpbroadcastd xm14, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm7, [base+subpel_filters+r6*8+2]
+ vpbroadcastd xm15, [base+subpel_filters+r11*8+2]
+ vpbroadcastd xm8, [base+subpel_filters+r13*8+2]
+ pcmpeqd m6, m9
+ punpckldq m10, m6, m6
+ punpckhdq m6, m6
+ psrld m13, 10
+ paddd m13, m13
+ vpblendd xm14, xm7, 0xa
+ vpblendd xm15, xm8, 0xa
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ pblendvb m14, m2, m10
+ pblendvb m15, m2, m6
+ pextrd r4, xm13, 2
+ pshufb m12, m13, m5
+ pshufb m13, m1
+ lea r6, [r4+ssq*2]
+ lea r11, [r4+ssq*1]
+ lea r13, [r4+ss3q ]
+ movu xm0, [srcq+ssq*0]
+ movu xm7, [srcq+r4 ]
+ movu xm1, [srcq+ssq*2]
+ movu xm8, [srcq+r6 ]
+ vinserti128 m0, [srcq+ssq*1], 1 ; 0 1
+ vinserti128 m7, [srcq+r11 ], 1
+ vinserti128 m1, [srcq+ss3q ], 1 ; 2 3
+ vinserti128 m8, [srcq+r13 ], 1
+ lea srcq, [srcq+ssq*4]
+ movu xm2, [srcq+ssq*0]
+ movu xm9, [srcq+r4 ]
+ movu xm3, [srcq+ssq*2] ; 6 _
+ movu xm10, [srcq+r6 ]
+ vinserti128 m2, [srcq+ssq*1], 1 ; 4 5
+ vinserti128 m9, [srcq+r11 ], 1
+ lea srcq, [srcq+ss3q ]
+ vpbroadcastb m5, xm13
+ psubb m13, m5
+ paddb m12, m4
+ paddb m13, m4
+ mova m5, [rsp+0x00]
+ movd xm6, [rsp+0x40]
+ pshufb m0, m12
+ pshufb m1, m12
+ pmaddwd m0, m14
+ pmaddwd m1, m14
+ pshufb m7, m13
+ pshufb m8, m13
+ pmaddwd m7, m15
+ pmaddwd m8, m15
+ pshufb m2, m12
+ pshufb xm3, xm12
+ pmaddwd m2, m14
+ pmaddwd xm3, xm14
+ pshufb m9, m13
+ pshufb xm10, xm13
+ pmaddwd m9, m15
+ pmaddwd xm10, xm15
+ phaddd m0, m7
+ phaddd m1, m8
+ phaddd m2, m9
+ phaddd xm3, xm10
+ paddd m0, m5
+ paddd m1, m5
+ paddd m2, m5
+ paddd xm3, xm5
+ psrad m0, xm6
+ psrad m1, xm6
+ psrad m2, xm6
+ psrad xm3, xm6
+ vperm2i128 m4, m0, m1, 0x21 ; 1 2
+ vperm2i128 m5, m1, m2, 0x21 ; 3 4
+ vperm2i128 m6, m2, m3, 0x21 ; 5 6
+ shr myd, 6
+ mov r13d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r13q, [base+subpel_filters+myq*8]
+ pslld m4, 16
+ pslld m5, 16
+ pslld m6, 16
+ pblendw m0, m4, 0xaa ; 01 12
+ pblendw m1, m5, 0xaa ; 23 34
+ pblendw m2, m6, 0xaa ; 45 56
+ movq xm10, r13q
+ punpcklqdq xm10, xm10
+ pmovsxbw m10, xm10
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+.dy1_w4_loop:
+ movu xm11, [srcq+ssq*0]
+ movu xm6, [srcq+r4 ]
+ vinserti128 m11, [srcq+ssq*1], 1
+ vinserti128 m6, [srcq+r11 ], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pshufb m11, m12
+ pshufb m6, m13
+ pmaddwd m11, m14
+ pmaddwd m6, m15
+ paddd m4, [rsp+0x20]
+ phaddd m11, m6
+ pmaddwd m6, m2, m9
+ paddd m11, [rsp+0x00]
+ psrad m11, [rsp+0x40]
+ mova m0, m1
+ mova m1, m2
+ paddd m5, m6
+ paddd m4, m5
+ vinserti128 m2, m3, xm11, 1
+ pslld m3, m11, 16
+ pblendw m2, m3, 0xaa ; 67 78
+ pmaddwd m5, m2, m10
+ vextracti128 xm3, m11, 1
+ paddd m4, m5
+%if isput
+ psrad m4, [rsp+0x48]
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0x50]
+ movq [dstq+dsq*0], xm4
+ movhps [dstq+dsq*1], xm4
+ lea dstq, [dstq+dsq*2]
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy1_w4_loop
+ MC_8TAP_SCALED_RET
+ SWAP m10, m13
+.dy1_w8:
+ mov dword [rsp+0xa0], 1
+ movifprep tmp_stridem, 16
+ jmp .dy1_w_start
+.dy1_w16:
+ mov dword [rsp+0xa0], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [rsp+0xa0], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [rsp+0xa0], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [rsp+0xa0], 16
+ movifprep tmp_stridem, 256
+.dy1_w_start:
+ SWAP m10, m12, m1
+ SWAP m11, m7
+ ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
+ mov myd, mym
+%if isput
+ %define dsm [rsp+0xb8]
+ movifnidn dsm, dsq
+ mova [rsp+0xc0], xm7
+%else
+ %if UNIX64
+ %define hm [rsp+0xb8]
+ %endif
+%endif
+ mova [rsp+0x00], m10
+ mova [rsp+0x20], m13
+ mova [rsp+0x40], xm11
+ shr t0d, 16
+ sub srcq, 6
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pmaddwd m8, [base+rescale_mul2]
+ movd xm15, t0d
+ mov [rsp+0xa4], t0d
+ mov [rsp+0xa8], srcq
+ mov [rsp+0xb0], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m1, m8 ; mx+dx*[0-7]
+ movq xm0, r4q
+ pmovsxbw xm0, xm0
+ mova [rsp+0x50], xm0
+ jmp .dy1_hloop
+.dy1_hloop_prep:
+ dec dword [rsp+0xa0]
+ jz .ret
+ add qword [rsp+0xb0], 16
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m6, [base+pd_0x3ff]
+ paddd m1, m8, [rsp+0x60]
+ vpbroadcastd m15, [rsp+0xa4]
+ pxor m9, m9
+ mov srcq, [rsp+0xa8]
+ mov r0q, [rsp+0xb0] ; dstq / tmpq
+ mova m10, [rsp+0x00]
+ mova xm11, [rsp+0x40]
+.dy1_hloop:
+ vpbroadcastq xm2, [base+pq_0x40000000]
+ pand m5, m1, m6
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ vextracti128 xm7, m15, 1
+ movq r6, xm15
+ pextrq r9, xm15, 1
+ movq r11, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mova [rsp+0x60], m1
+ movq xm12, [base+subpel_filters+ r4*8]
+ movq xm13, [base+subpel_filters+ r6*8]
+ movhps xm12, [base+subpel_filters+ r7*8]
+ movhps xm13, [base+subpel_filters+ r9*8]
+ movq xm14, [base+subpel_filters+r10*8]
+ movq xm15, [base+subpel_filters+r11*8]
+ movhps xm14, [base+subpel_filters+r13*8]
+ movhps xm15, [base+subpel_filters+ rX*8]
+ psrld m1, 10
+ vextracti128 xm7, m1, 1
+ vextracti128 xm6, m5, 1
+ movq r6, xm1
+ pextrq r11, xm1, 1
+ movq r9, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r13d, rXd
+ shr rX, 32
+ pshufd xm4, xm5, q2200
+ pshufd xm5, xm5, q3311
+ pshufd xm7, xm6, q2200
+ pshufd xm6, xm6, q3311
+ pblendvb xm12, xm2, xm4
+ pblendvb xm13, xm2, xm5
+ pblendvb xm14, xm2, xm7
+ pblendvb xm15, xm2, xm6
+ pmovsxbw m12, xm12
+ pmovsxbw m13, xm13
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ mova [rsp+0x80], m0
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
+ mova m0, [rsp+0x80]
+ vbroadcasti128 m7, [base+subpel_s_shuf8]
+ vpbroadcastd m8, [rsp+0x50]
+ vpbroadcastd m9, [rsp+0x54]
+ vpbroadcastd m10, [rsp+0x58]
+ vpbroadcastd m11, [rsp+0x5c]
+ pshufb m0, m7 ; 01a 01b
+ pshufb m1, m7 ; 23a 23b
+ pshufb m2, m7 ; 45a 45b
+ pshufb m3, m7 ; 67a 67b
+.dy1_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m3, m11
+ paddd m4, [rsp+0x20]
+ paddd m6, m7
+ paddd m4, m5
+ paddd m4, m6
+%if isput
+ psrad m4, [rsp+0x48]
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0xc0]
+ mova [dstq], xm4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+ vbroadcasti128 m7, [base+wswap]
+ pshufb m0, m7
+ pshufb m1, m7
+ pshufb m2, m7
+ pshufb m3, m7
+ movu xm4, [srcq+ r4*2]
+ movu xm5, [srcq+ r6*2]
+ movu xm6, [srcq+ r7*2]
+ movu xm7, [srcq+ r9*2]
+ vinserti128 m4, [srcq+r10*2], 1
+ vinserti128 m5, [srcq+r11*2], 1
+ vinserti128 m6, [srcq+r13*2], 1
+ vinserti128 m7, [srcq+ rX*2], 1
+ add srcq, ssq
+ pmaddwd m4, m12
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ pmaddwd m7, m15
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m4, m6
+ paddd m4, [rsp+0x00]
+ psrad m4, [rsp+0x40]
+ pslld m4, 16
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .dy1_vloop
+ SWAP m1, m12, m10
+ SWAP m7, m11
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.dy2_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m10, m8 ; mx+dx*[0-1]
+ vpbroadcastd xm14, [base+pq_0x40000000+2]
+ vpbroadcastd xm15, xm15
+ pand xm8, xm10, xm6
+ psrld xm8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_q]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd xm15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm4, [base+subpel_filters+r6*8+2]
+ pcmpeqd xm8, xm9
+ psrld m10, 10
+ paddd m10, m10
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*2]
+ movu xm2, [srcq+ssq*4]
+ pshufb m10, m5
+ paddb m10, m6
+ vpblendd xm15, xm4, 0xa
+ pblendvb xm15, xm14, xm8
+ pmovsxbw m15, xm15
+ vinserti128 m0, [srcq+ssq*1], 1 ; 0 1
+ vinserti128 m1, [srcq+ss3q ], 1 ; 2 3
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m2, [srcq+ssq*1], 1 ; 4 5
+ lea srcq, [srcq+ssq*2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m0, m10
+ pshufb m1, m10
+ pshufb m2, m10
+ pmaddwd m0, m15
+ pmaddwd m1, m15
+ pmaddwd m2, m15
+ movq xm6, r4q
+ pmovsxbw xm6, xm6
+ phaddd m0, m1
+ phaddd m1, m2
+ paddd m0, m12
+ paddd m1, m12
+ psrad m0, xm7
+ psrad m1, xm7
+ packssdw m0, m1 ; 0 2 2 4 1 3 3 5
+ vextracti128 xm1, m0, 1
+ pshufd xm8, xm6, q0000
+ pshufd xm9, xm6, q1111
+ pshufd xm14, xm6, q2222
+ pshufd xm6, xm6, q3333
+ punpcklwd xm2, xm0, xm1 ; 01 23
+ punpckhwd xm1, xm0, xm1 ; 23 45
+.dy2_w2_loop:
+ movu xm3, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*2]
+ vinserti128 m3, [srcq+ssq*1], 1 ; 6 7
+ vinserti128 m5, [srcq+ss3q ], 1 ; 8 9
+ lea srcq, [srcq+ssq*4]
+ pmaddwd xm4, xm2, xm8
+ pmaddwd xm1, xm9
+ pshufb m3, m10
+ pshufb m5, m10
+ pmaddwd m3, m15
+ pmaddwd m5, m15
+ phaddd m3, m5
+ paddd xm4, xm1
+ paddd m3, m12
+ psrad m3, xm7
+ packssdw m3, m3
+ pshufd m3, m3, q2100
+ palignr m0, m3, m0, 12 ; 4 6 6 8 5 7 7 9
+ vextracti128 xm1, m0, 1
+ punpcklwd xm2, xm0, xm1 ; 45 67
+ punpckhwd xm1, xm0, xm1 ; 67 89
+ pmaddwd xm3, xm2, xm14
+ pmaddwd xm5, xm1, xm6
+ paddd xm4, xm13
+ paddd xm4, xm3
+ psrldq xm3, xm7, 8
+ paddd xm4, xm5
+ psrad xm4, xm3
+ packusdw xm4, xm4
+ pminsw xm4, xm11
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+%endif
+.dy2_w4:
+ mov myd, mym
+%if isput
+ mova [rsp+0x50], xm11
+%endif
+ mova [rsp+0x00], m12
+ mova [rsp+0x20], m13
+ mova [rsp+0x40], xm7
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastq m2, [base+pq_0x40000000+1]
+ vpbroadcastd xm15, xm15
+ SWAP m13, m10
+ paddd m13, m8 ; mx+dx*[0-3]
+ pand m6, m13
+ psrld m6, 6
+ paddd xm15, xm6
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ vbroadcasti128 m5, [base+bdct_lb_q+ 0]
+ vbroadcasti128 m1, [base+bdct_lb_q+16]
+ vbroadcasti128 m4, [base+subpel_s_shuf2]
+ vpbroadcastd xm14, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm7, [base+subpel_filters+r6*8+2]
+ vpbroadcastd xm15, [base+subpel_filters+r11*8+2]
+ vpbroadcastd xm8, [base+subpel_filters+r13*8+2]
+ shr myd, 6
+ mov r13d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r13q, [base+subpel_filters+myq*8]
+ pcmpeqd m6, m9
+ punpckldq m11, m6, m6
+ punpckhdq m6, m6
+ psrld m13, 10
+ paddd m13, m13
+ vpblendd xm14, xm7, 0xa
+ vpblendd xm15, xm8, 0xa
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ movq xm10, r13q
+ pblendvb m14, m2, m11
+ pblendvb m15, m2, m6
+ pextrd r4, xm13, 2
+ pshufb m12, m13, m5
+ pshufb m13, m1
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu xm0, [srcq+ssq*0]
+ movu xm7, [srcq+r4 ]
+ movu xm1, [srcq+ssq*1]
+ movu xm8, [srcq+r6 ]
+ vinserti128 m0, [srcq+ssq*2], 1 ; 0 2
+ vinserti128 m7, [srcq+r11 ], 1
+ vinserti128 m1, [srcq+ss3q ], 1 ; 1 3
+ vinserti128 m8, [srcq+r13 ], 1
+ lea srcq, [srcq+ssq*4]
+ movu xm2, [srcq+ssq*0]
+ movu xm9, [srcq+r4 ]
+ vinserti128 m2, [srcq+ssq*1], 1 ; 4 5
+ vinserti128 m9, [srcq+r6 ], 1
+ lea srcq, [srcq+ssq*2]
+ vpbroadcastb m5, xm13
+ psubb m13, m5
+ paddb m12, m4
+ paddb m13, m4
+ mova m5, [rsp+0x00]
+ movd xm6, [rsp+0x40]
+ pshufb m0, m12
+ pshufb m1, m12
+ pshufb m2, m12
+ pmaddwd m0, m14
+ pmaddwd m1, m14
+ pmaddwd m2, m14
+ pshufb m7, m13
+ pshufb m8, m13
+ pshufb m9, m13
+ pmaddwd m7, m15
+ pmaddwd m8, m15
+ pmaddwd m9, m15
+ punpcklqdq xm10, xm10
+ pmovsxbw m10, xm10
+ phaddd m0, m7
+ phaddd m1, m8
+ phaddd m2, m9
+ paddd m0, m5
+ paddd m1, m5
+ paddd m2, m5
+ psrad m0, xm6
+ psrad m1, xm6
+ psrad m2, xm6
+ vperm2i128 m3, m0, m2, 0x21 ; 2 4
+ vperm2i128 m2, m1, 0x13 ; 3 5
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ packssdw m0, m3 ; 0 2 2 4
+ packssdw m1, m2 ; 1 3 3 5
+ punpckhwd m2, m0, m1 ; 23 45
+ punpcklwd m0, m1 ; 01 23
+.dy2_w4_loop:
+ movu xm1, [srcq+ssq*0]
+ movu xm6, [srcq+r4 ]
+ movu xm3, [srcq+ssq*1]
+ movu xm11, [srcq+r6 ]
+ vinserti128 m1, [srcq+ssq*2], 1 ; 6 8
+ vinserti128 m6, [srcq+r11 ], 1
+ vinserti128 m3, [srcq+ss3q ], 1 ; 7 9
+ vinserti128 m11, [srcq+r13 ], 1
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m2, m8
+ pshufb m1, m12
+ pshufb m3, m12
+ pmaddwd m1, m14
+ pmaddwd m3, m14
+ mova m0, [rsp+0x00]
+ pshufb m6, m13
+ pshufb m11, m13
+ pmaddwd m6, m15
+ pmaddwd m11, m15
+ paddd m4, m5
+ movd xm5, [rsp+0x40]
+ phaddd m1, m6
+ phaddd m3, m11
+ paddd m1, m0
+ paddd m3, m0
+ psrad m1, xm5
+ psrad m3, xm5
+ pslld m3, 16
+ pblendw m1, m3, 0xaa ; 67 89
+ vperm2i128 m0, m2, m1, 0x21 ; 45 67
+ paddd m4, [rsp+0x20]
+ mova m2, m1
+ pmaddwd m5, m0, m9
+ pmaddwd m6, m2, m10
+ paddd m4, m5
+ paddd m4, m6
+%if isput
+ psrad m4, [rsp+0x48]
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0x50]
+ movq [dstq+dsq*0], xm4
+ movhps [dstq+dsq*1], xm4
+ lea dstq, [dstq+dsq*2]
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET
+ SWAP m10, m13
+.dy2_w8:
+ mov dword [rsp+0xa0], 1
+ movifprep tmp_stridem, 16
+ jmp .dy2_w_start
+.dy2_w16:
+ mov dword [rsp+0xa0], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [rsp+0xa0], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [rsp+0xa0], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [rsp+0xa0], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+ SWAP m10, m12, m1
+ SWAP m11, m7
+ ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
+ mov myd, mym
+%if isput
+ movifnidn dsm, dsq
+ mova [rsp+0xc0], xm7
+%endif
+ mova [rsp+0x00], m10
+ mova [rsp+0x20], m13
+ mova [rsp+0x40], xm11
+ shr t0d, 16
+ sub srcq, 6
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pmaddwd m8, [base+rescale_mul2]
+ movd xm15, t0d
+ mov [rsp+0xa4], t0d
+ mov [rsp+0xa8], srcq
+ mov [rsp+0xb0], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m1, m8 ; mx+dx*[0-7]
+ movq xm0, r4q
+ pmovsxbw xm0, xm0
+ mova [rsp+0x50], xm0
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [rsp+0xa0]
+ jz .ret
+ add qword [rsp+0xb0], 16
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m6, [base+pd_0x3ff]
+ paddd m1, m8, [rsp+0x60]
+ vpbroadcastd m15, [rsp+0xa4]
+ pxor m9, m9
+ mov srcq, [rsp+0xa8]
+ mov r0q, [rsp+0xb0] ; dstq / tmpq
+ mova m10, [rsp+0x00]
+ mova xm11, [rsp+0x40]
+.dy2_hloop:
+ vpbroadcastq xm2, [base+pq_0x40000000]
+ pand m5, m1, m6
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ vextracti128 xm7, m15, 1
+ movq r6, xm15
+ pextrq r9, xm15, 1
+ movq r11, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mova [rsp+0x60], m1
+ movq xm12, [base+subpel_filters+ r4*8]
+ movq xm13, [base+subpel_filters+ r6*8]
+ movhps xm12, [base+subpel_filters+ r7*8]
+ movhps xm13, [base+subpel_filters+ r9*8]
+ movq xm14, [base+subpel_filters+r10*8]
+ movq xm15, [base+subpel_filters+r11*8]
+ movhps xm14, [base+subpel_filters+r13*8]
+ movhps xm15, [base+subpel_filters+ rX*8]
+ psrld m1, 10
+ vextracti128 xm7, m1, 1
+ vextracti128 xm6, m5, 1
+ movq r6, xm1
+ pextrq r11, xm1, 1
+ movq r9, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r13d, rXd
+ shr rX, 32
+ pshufd xm4, xm5, q2200
+ pshufd xm5, xm5, q3311
+ pshufd xm7, xm6, q2200
+ pshufd xm6, xm6, q3311
+ pblendvb xm12, xm2, xm4
+ pblendvb xm13, xm2, xm5
+ pblendvb xm14, xm2, xm7
+ pblendvb xm15, xm2, xm6
+ pmovsxbw m12, xm12
+ pmovsxbw m13, xm13
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ mova [rsp+0x80], m0
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
+ mova m0, [rsp+0x80]
+ vbroadcasti128 m7, [base+subpel_s_shuf8]
+ vpbroadcastd m8, [rsp+0x50]
+ vpbroadcastd m9, [rsp+0x54]
+ vpbroadcastd m10, [rsp+0x58]
+ vpbroadcastd m11, [rsp+0x5c]
+ pshufb m0, m7 ; 01a 01b
+ pshufb m1, m7 ; 23a 23b
+ pshufb m2, m7 ; 45a 45b
+ pshufb m3, m7 ; 67a 67b
+.dy2_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m3, m11
+ paddd m4, [rsp+0x20]
+ paddd m6, m7
+ paddd m4, m5
+ paddd m4, m6
+%if isput
+ psrad m4, [rsp+0x48]
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0xc0]
+ mova [dstq], xm4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ movu xm3, [srcq+ r4*2]
+ movu xm4, [srcq+ r6*2]
+ movu xm5, [srcq+ r7*2]
+ movu xm6, [srcq+ r9*2]
+ vinserti128 m3, [srcq+r10*2], 1
+ vinserti128 m4, [srcq+r11*2], 1
+ vinserti128 m5, [srcq+r13*2], 1
+ vinserti128 m6, [srcq+ rX*2], 1
+ add srcq, ssq
+ pmaddwd m3, m12
+ pmaddwd m4, m13
+ pmaddwd m5, m14
+ pmaddwd m6, m15
+ phaddd m3, m4
+ phaddd m5, m6
+ phaddd m3, m5
+ movu xm4, [srcq+ r4*2]
+ movu xm5, [srcq+ r6*2]
+ movu xm6, [srcq+ r7*2]
+ movu xm7, [srcq+ r9*2]
+ vinserti128 m4, [srcq+r10*2], 1
+ vinserti128 m5, [srcq+r11*2], 1
+ vinserti128 m6, [srcq+r13*2], 1
+ vinserti128 m7, [srcq+ rX*2], 1
+ add srcq, ssq
+ pmaddwd m4, m12
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ pmaddwd m7, m15
+ phaddd m4, m5
+ phaddd m6, m7
+ mova m5, [rsp+0x00]
+ movd xm7, [rsp+0x40]
+ phaddd m4, m6
+ paddd m3, m5
+ paddd m4, m5
+ psrad m3, xm7
+ psrad m4, xm7
+ pslld m4, 16
+ pblendw m3, m4, 0xaa
+ jmp .dy2_vloop
+.ret:
+ MC_8TAP_SCALED_RET 0
+%undef isput
+%undef isprep
+%endmacro
+
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled_16bpc
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, t0d
+ jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%else
+DECLARE_REG_TMP 6, 8
+%endif
+
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+BILIN_SCALED_FN put
+PUT_8TAP_SCALED_FN sharp, SHARP, SHARP
+PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+BILIN_SCALED_FN prep
+PREP_8TAP_SCALED_FN sharp, SHARP, SHARP
+PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+
+%macro WARP_V 5 ; dst, 01, 23, 45, 67
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq xm8, [filterq+myq *8]
+ vinserti128 m8, [filterq+tmp1q*8], 1 ; a e
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+deltaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; b f
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq xm9, [filterq+myq *8]
+ vinserti128 m9, [filterq+tmp1q*8], 1 ; c g
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+gammaq] ; my += gamma
+ punpcklwd m8, m0
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; d h
+ punpcklwd m0, m9, m0
+ punpckldq m9, m8, m0
+ punpckhdq m0, m8, m0
+ punpcklbw m8, m11, m9 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
+ punpckhbw m9, m11, m9 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
+ pmaddwd m%2, m8
+ pmaddwd m9, m%3
+ punpcklbw m8, m11, m0 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
+ punpckhbw m0, m11, m0 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
+ pmaddwd m8, m%4
+ pmaddwd m0, m%5
+ paddd m9, m%2
+ mova m%2, m%3
+ paddd m0, m8
+ mova m%3, m%4
+ mova m%4, m%5
+ paddd m%1, m0, m9
+%endmacro
+
+cglobal warp_affine_8x8t_16bpc, 4, 14, 16, tmp, ts
+ mov r6d, r7m
+ lea r9, [$$]
+ shr r6d, 11
+ vpbroadcastd m13, [r9-$$+warp8x8_shift+r6*4]
+ vpbroadcastd m14, [warp8x8t_rnd]
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main
+ jmp .start
+.loop:
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main2
+ lea tmpq, [tmpq+tsq*4]
+.start:
+ paddd m7, m14
+ paddd m0, m14
+ psrad m7, 15
+ psrad m0, 15
+ packssdw m7, m0
+ vpermq m7, m7, q3120
+ mova [tmpq+tsq*0], xm7
+ vextracti128 [tmpq+tsq*2], m7, 1
+ dec r4d
+ jg .loop
+.end:
+ RET
+
+cglobal warp_affine_8x8_16bpc, 4, 14, 16, dst, ds, src, ss, abcd, mx, tmp2, \
+ alpha, beta, filter, tmp1, delta, \
+ my, gamma
+ mov r6d, r7m
+ lea filterq, [$$]
+ shr r6d, 11
+ vpbroadcastd m13, [filterq-$$+warp8x8_shift+r6*4]
+ vpbroadcastd m14, [filterq-$$+warp8x8_rnd +r6*4]
+ vpbroadcastw m15, r7m ; pixel_max
+ call .main
+ jmp .start
+.loop:
+ call .main2
+ lea dstq, [dstq+dsq*2]
+.start:
+ psrad m7, 16
+ psrad m0, 16
+ packusdw m7, m0
+ pmulhrsw m7, m14
+ pminsw m7, m15
+ vpermq m7, m7, q3120
+ mova [dstq+dsq*0], xm7
+ vextracti128 [dstq+dsq*1], m7, 1
+ dec r4d
+ jg .loop
+.end:
+ RET
+ALIGN function_align
+.main:
+ ; Stack args offset by one (r4m -> r5m etc.) due to call
+%if WIN64
+ mov abcdq, r5m
+ mov mxd, r6m
+%endif
+ movsx alphad, word [abcdq+2*0]
+ movsx betad, word [abcdq+2*1]
+ vpbroadcastd m12, [pd_32768]
+ pxor m11, m11
+ add filterq, mc_warp_filter-$$
+ lea tmp1q, [ssq*3]
+ add mxd, 512+(64<<10)
+ lea tmp2d, [alphaq*3]
+ sub srcq, tmp1q ; src -= src_stride*3
+ sub betad, tmp2d ; beta -= alpha*3
+ mov myd, r7m
+ call .h
+ psrld m1, m0, 16
+ call .h
+ pblendw m1, m0, 0xaa ; 01
+ psrld m2, m0, 16
+ call .h
+ pblendw m2, m0, 0xaa ; 12
+ psrld m3, m0, 16
+ call .h
+ pblendw m3, m0, 0xaa ; 23
+ psrld m4, m0, 16
+ call .h
+ pblendw m4, m0, 0xaa ; 34
+ psrld m5, m0, 16
+ call .h
+ pblendw m5, m0, 0xaa ; 45
+ psrld m6, m0, 16
+ call .h
+ pblendw m6, m0, 0xaa ; 56
+ movsx deltad, word [abcdq+2*2]
+ movsx gammad, word [abcdq+2*3]
+ add myd, 512+(64<<10)
+ mov r4d, 4
+ lea tmp1d, [deltaq*3]
+ sub gammad, tmp1d ; gamma -= delta*3
+.main2:
+ call .h
+ psrld m7, m6, 16
+ pblendw m7, m0, 0xaa ; 67
+ WARP_V 7, 1, 3, 5, 7
+ call .h
+ psrld m10, m5, 16
+ pblendw m10, m0, 0xaa ; 78
+ WARP_V 0, 2, 4, 6, 10
+ ret
+ALIGN function_align
+.h:
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ movu xm10, [srcq-6]
+ vinserti128 m10, [srcq+2], 1
+ shr mxd, 10 ; 0
+ shr tmp1d, 10 ; 4
+ movq xm0, [filterq+mxq *8]
+ vinserti128 m0, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+alphaq*1]
+ movu xm8, [srcq-4]
+ vinserti128 m8, [srcq+4], 1
+ shr tmp2d, 10 ; 1
+ shr tmp1d, 10 ; 5
+ movq xm9, [filterq+tmp2q*8]
+ vinserti128 m9, [filterq+tmp1q*8], 1
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ shr mxd, 10 ; 2
+ shr tmp1d, 10 ; 6
+ punpcklbw m0, m11, m0
+ pmaddwd m0, m10
+ movu xm10, [srcq-2]
+ vinserti128 m10, [srcq+6], 1
+ punpcklbw m9, m11, m9
+ pmaddwd m9, m8
+ movq xm8, [filterq+mxq *8]
+ vinserti128 m8, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ phaddd m0, m9 ; 0 1 4 5
+ movu xm9, [srcq+0]
+ vinserti128 m9, [srcq+8], 1
+ shr tmp2d, 10 ; 3
+ shr tmp1d, 10 ; 7
+ punpcklbw m8, m11, m8
+ pmaddwd m8, m10
+ movq xm10, [filterq+tmp2q*8]
+ vinserti128 m10, [filterq+tmp1q*8], 1
+ punpcklbw m10, m11, m10
+ pmaddwd m9, m10
+ add srcq, ssq
+ phaddd m8, m9 ; 2 3 6 7
+ phaddd m0, m8 ; 0 1 2 3 4 5 6 7
+ vpsllvd m0, m13
+ paddd m0, m12 ; rounded 14-bit result in upper 16 bits of dword
+ ret
+
+%macro BIDIR_FN 0
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq ], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ cmp hd, 8
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm3
+ movhps [dstq+strideq*1], xm3
+ vextracti128 xm3, m3, 1
+ movq [dstq+strideq*2], xm3
+ movhps [dstq+stride3q ], xm3
+.ret:
+ RET
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ cmp hd, 4
+ jne .w8_loop_start
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+.w8_loop_start:
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm2
+ vextracti128 [dstq+strideq*1], m2, 1
+ mova [dstq+strideq*2], xm3
+ vextracti128 [dstq+stride3q ], m3, 1
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ call .main
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ mova [dstq+32*6], m2
+ mova [dstq+32*7], m3
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
+
+cglobal avg_16bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg_avx2_table
+ lea r6, [avg_avx2_table]
+ tzcnt wd, wm
+ mov t0d, r6m ; pixel_max
+ movsxd wq, [r6+wq*4]
+ shr t0d, 11
+ vpbroadcastd m4, [base+bidir_rnd+t0*4]
+ vpbroadcastd m5, [base+bidir_mul+t0*4]
+ movifnidn hd, hm
+ add wq, r6
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m0, [tmp1q+32*0]
+ paddsw m0, [tmp2q+32*0]
+ mova m1, [tmp1q+32*1]
+ paddsw m1, [tmp2q+32*1]
+ mova m2, [tmp1q+32*2]
+ paddsw m2, [tmp2q+32*2]
+ mova m3, [tmp1q+32*3]
+ paddsw m3, [tmp2q+32*3]
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pmaxsw m2, m4
+ pmaxsw m3, m4
+ psubsw m0, m4
+ psubsw m1, m4
+ psubsw m2, m4
+ psubsw m3, m4
+ pmulhw m0, m5
+ pmulhw m1, m5
+ pmulhw m2, m5
+ pmulhw m3, m5
+ ret
+
+cglobal w_avg_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, stride3
+ lea r6, [w_avg_avx2_table]
+ tzcnt wd, wm
+ mov t0d, r6m ; weight
+ vpbroadcastw m8, r7m ; pixel_max
+ vpbroadcastd m7, [r6-w_avg_avx2_table+pd_65538]
+ movsxd wq, [r6+wq*4]
+ paddw m7, m8
+ add wq, r6
+ lea r6d, [t0-16]
+ shl t0d, 16
+ sub t0d, r6d ; 16-weight, weight
+ pslld m7, 7
+ rorx r6d, t0d, 30 ; << 2
+ test dword r7m, 0x800
+ cmovz r6d, t0d
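+ ; bit 11 of pixel_max distinguishes 12-bit (keep the weights scaled by 4
+ ; from the rorx above) from 10-bit (use the unscaled weights)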
+ movifnidn hd, hm
+ movd xm6, r6d
+ vpbroadcastd m6, xm6
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m4, [tmp1q+32*0]
+ mova m0, [tmp2q+32*0]
+ punpckhwd m5, m0, m4
+ punpcklwd m0, m4
+ mova m4, [tmp1q+32*1]
+ mova m1, [tmp2q+32*1]
+ pmaddwd m5, m6
+ pmaddwd m0, m6
+ paddd m5, m7
+ paddd m0, m7
+ psrad m5, 8
+ psrad m0, 8
+ packusdw m0, m5
+ punpckhwd m5, m1, m4
+ punpcklwd m1, m4
+ mova m4, [tmp1q+32*2]
+ mova m2, [tmp2q+32*2]
+ pmaddwd m5, m6
+ pmaddwd m1, m6
+ paddd m5, m7
+ paddd m1, m7
+ psrad m5, 8
+ psrad m1, 8
+ packusdw m1, m5
+ punpckhwd m5, m2, m4
+ punpcklwd m2, m4
+ mova m4, [tmp1q+32*3]
+ mova m3, [tmp2q+32*3]
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ pmaddwd m5, m6
+ pmaddwd m2, m6
+ paddd m5, m7
+ paddd m2, m7
+ psrad m5, 8
+ psrad m2, 8
+ packusdw m2, m5
+ punpckhwd m5, m3, m4
+ punpcklwd m3, m4
+ pmaddwd m5, m6
+ pmaddwd m3, m6
+ paddd m5, m7
+ paddd m3, m7
+ psrad m5, 8
+ psrad m3, 8
+ packusdw m3, m5
+ pminsw m0, m8
+ pminsw m1, m8
+ pminsw m2, m8
+ pminsw m3, m8
+ ret
+
+cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask_avx2_table
+ lea r7, [mask_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m8, [base+pw_64]
+ vpbroadcastd m9, [base+bidir_rnd+r6*4]
+ vpbroadcastd m10, [base+bidir_mul+r6*4]
+ mov maskq, maskmp
+ add wq, r7
+ BIDIR_FN
+ALIGN function_align
+.main:
+%macro MASK 1
+ pmovzxbw m5, [maskq+16*%1]
+ mova m%1, [tmp1q+32*%1]
+ mova m6, [tmp2q+32*%1]
+ punpckhwd m4, m%1, m6
+ punpcklwd m%1, m6
+ psubw m7, m8, m5
+ punpckhwd m6, m5, m7 ; m, 64-m
+ punpcklwd m5, m7
+ pmaddwd m4, m6 ; tmp1 * m + tmp2 * (64-m)
+ pmaddwd m%1, m5
+ psrad m4, 5
+ psrad m%1, 5
+ packssdw m%1, m4
+ pmaxsw m%1, m9
+ psubsw m%1, m9
+ pmulhw m%1, m10
+%endmacro
+ MASK 0
+ MASK 1
+ MASK 2
+ MASK 3
+ add maskq, 16*4
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ ret
+
+cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx2_table
+ lea r7, [w_mask_420_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movd xm0, r7m ; sign
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ vpbroadcastd m11, [base+pw_64]
+ vpbroadcastd m12, [base+bidir_rnd+r6*4]
+ vpbroadcastd m13, [base+bidir_mul+r6*4]
+ movd xm14, [base+pw_2]
+ mov maskq, maskmp
+ psubw xm14, xm0
+ vpbroadcastw m14, xm14
+ add wq, r7
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ phaddd m4, m5
+ paddw m4, m14
+ psrlw m4, 2
+ packuswb m4, m4
+ vextracti128 xm5, m4, 1
+ punpcklwd xm4, xm5
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ mova [maskq], xm4
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm3
+ movhps [dstq+strideq*1], xm3
+ vextracti128 xm3, m3, 1
+ movq [dstq+strideq*2], xm3
+ movhps [dstq+stride3q ], xm3
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w8:
+ vperm2i128 m6, m4, m5, 0x21
+ vpblendd m4, m5, 0xf0
+ paddw m4, m14
+ paddw m4, m6
+ psrlw m4, 2
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ mova [maskq], xm4
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm2
+ vextracti128 [dstq+strideq*1], m2, 1
+ mova [dstq+strideq*2], xm3
+ vextracti128 [dstq+stride3q ], m3, 1
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w16:
+ punpcklqdq m6, m4, m5
+ punpckhqdq m4, m5
+ paddw m6, m14
+ paddw m4, m6
+ psrlw m4, 2
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ pshufd xm4, xm4, q3120
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ mova [maskq], xm4
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+.w32:
+ paddw m4, m14
+ paddw m4, m5
+ psrlw m15, m4, 2
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ call .main
+ mova m6, [deint_shuf]
+ paddw m4, m14
+ paddw m4, m5
+ psrlw m4, 2
+ packuswb m15, m4
+ vpermd m4, m6, m15
+ mova [dstq+strideq*2+32*0], m0
+ mova [dstq+strideq*2+32*1], m1
+ mova [dstq+stride3q +32*0], m2
+ mova [dstq+stride3q +32*1], m3
+ mova [maskq], m4
+ sub hd, 4
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+.w64:
+ paddw m4, m14
+ paddw m15, m14, m5
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*0+32*2], m2
+ mova [dstq+strideq*0+32*3], m3
+ mova [maskq], m4 ; no available registers
+ call .main
+ paddw m4, [maskq]
+ mova m6, [deint_shuf]
+ paddw m5, m15
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5 ; 0 2 4 6 1 3 5 7
+ vpermd m4, m6, m4
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m2
+ mova [dstq+strideq*1+32*3], m3
+ mova [maskq], m4
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 64
+.w128:
+ paddw m4, m14
+ paddw m5, m14
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*0+32*2], m2
+ mova [dstq+strideq*0+32*3], m3
+ mova [maskq+32*0], m4
+ mova [dstq+strideq], m5
+ call .main
+ paddw m4, m14
+ paddw m15, m14, m5
+ mova [dstq+strideq*0+32*4], m0
+ mova [dstq+strideq*0+32*5], m1
+ mova [dstq+strideq*0+32*6], m2
+ mova [dstq+strideq*0+32*7], m3
+ mova [maskq+32*1], m4
+ call .main
+ paddw m4, [maskq+32*0]
+ paddw m5, [dstq+strideq]
+ mova m6, [deint_shuf]
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m6, m4
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m2
+ mova [dstq+strideq*1+32*3], m3
+ mova [maskq+32*0], m4
+ call .main
+ paddw m4, [maskq+32*1]
+ mova m6, [deint_shuf]
+ paddw m5, m15
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m6, m4
+ mova [dstq+strideq*1+32*4], m0
+ mova [dstq+strideq*1+32*5], m1
+ mova [dstq+strideq*1+32*6], m2
+ mova [dstq+strideq*1+32*7], m3
+ mova [maskq+32*1], m4
+ sub hd, 2
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+%macro W_MASK 2-6 11, 12, 13 ; dst/src1, mask/src2, pw_64, rnd, mul
+ mova m%1, [tmp1q+32*%1]
+ mova m%2, [tmp2q+32*%1]
+ punpcklwd m8, m%2, m%1
+ punpckhwd m9, m%2, m%1
+ psubsw m%1, m%2
+ pabsw m%1, m%1
+ psubusw m7, m10, m%1
+ psrlw m7, 10 ; 64-m
+ psubw m%2, m%3, m7 ; m
+ punpcklwd m%1, m7, m%2
+ punpckhwd m7, m%2
+ pmaddwd m%1, m8
+ pmaddwd m7, m9
+ psrad m%1, 5
+ psrad m7, 5
+ packssdw m%1, m7
+ pmaxsw m%1, m%4
+ psubsw m%1, m%4
+ pmulhw m%1, m%5
+%endmacro
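+ ; Reading of W_MASK above: 64-m = saturating(27615 - |tmp1-tmp2|) >> 10, so
+ ; m runs from 38 (identical predictions) up to 64, and the blend is
+ ; (tmp1*m + tmp2*(64-m)) >> 5 followed by the usual bidir rounding/scaling.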
+ W_MASK 0, 4
+ W_MASK 1, 5
+ phaddw m4, m5
+ W_MASK 2, 5
+ W_MASK 3, 6
+ phaddw m5, m6
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ ret
+
+cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx2_table
+ lea r7, [w_mask_422_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ vpbroadcastb m14, r7m ; sign
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m10, [base+pw_27615]
+ vpbroadcastd m11, [base+pw_64]
+ vpbroadcastd m12, [base+bidir_rnd+r6*4]
+ vpbroadcastd m13, [base+bidir_mul+r6*4]
+ mova m15, [base+deint_shuf]
+ mov maskq, maskmp
+ add wq, r7
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm3
+ movhps [dstq+strideq*1], xm3
+ vextracti128 xm3, m3, 1
+ movq [dstq+strideq*2], xm3
+ movhps [dstq+stride3q ], xm3
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm2
+ vextracti128 [dstq+strideq*1], m2, 1
+ mova [dstq+strideq*2], xm3
+ vextracti128 [dstq+stride3q ], m3, 1
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ call .main
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ mova [dstq+32*6], m2
+ mova [dstq+32*7], m3
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ W_MASK 0, 4
+ W_MASK 1, 5
+ phaddw m4, m5
+ W_MASK 2, 5
+ W_MASK 3, 6
+ phaddw m5, m6
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ packuswb m4, m5
+ pxor m5, m5
+ psubb m4, m14
+ pavgb m4, m5
+ vpermd m4, m15, m4
+ mova [maskq], m4
+ add maskq, 32
+ ret
+
+cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx2_table
+ lea r7, [w_mask_444_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m10, [base+pw_27615]
+ vpbroadcastd m4, [base+pw_64]
+ vpbroadcastd m5, [base+bidir_rnd+r6*4]
+ vpbroadcastd m6, [base+bidir_mul+r6*4]
+ mov maskq, maskmp
+ add wq, r7
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ je .w4_end
+ call .main
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ sub hd, 4
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ call .main
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ call .main
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ call .main
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ call .main
+ mova [dstq+32*6], m0
+ mova [dstq+32*7], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ W_MASK 0, 2, 4, 5, 6
+ W_MASK 1, 3, 4, 5, 6
+ packuswb m2, m3
+ vpermq m2, m2, q3120
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ mova [maskq], m2
+ add maskq, 32
+ ret
+
+; (a * (64 - m) + b * m + 32) >> 6
+; = (((b - a) * m + 32) >> 6) + a
+; = (((b - a) * (m << 9) + 16384) >> 15) + a
+; except m << 9 overflows int16_t when m == 64 (which is possible),
+; but if we negate m it works out (-64 << 9 == -32768).
+; = (((a - b) * (m * -512) + 16384) >> 15) + a
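+; Quick numeric check of the identity above (illustrative values only):
+; a=100, b=200, m=17: (100*47 + 200*17 + 32) >> 6 = 127, and
+; (((100-200) * (17 * -512) + 16384) >> 15) + 100 = 27 + 100 = 127 as well.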
+cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
+%define base r6-blend_avx2_table
+ lea r6, [blend_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ movifnidn maskq, maskmp
+ vpbroadcastd m6, [base+pw_m512]
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ pmovzxbw m3, [maskq]
+ movq xm0, [dstq+dsq*0]
+ movhps xm0, [dstq+dsq*1]
+ vpbroadcastq m1, [dstq+dsq*2]
+ vpbroadcastq m2, [dstq+r6 ]
+ vpblendd m0, m1, 0x30
+ vpblendd m0, m2, 0xc0
+ psubw m1, m0, [tmpq]
+ add maskq, 16
+ add tmpq, 32
+ pmullw m3, m6
+ pmulhrsw m1, m3
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ movq [dstq+dsq*2], xm1
+ movhps [dstq+r6 ], xm1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ pmovzxbw m4, [maskq+16*0]
+ pmovzxbw m5, [maskq+16*1]
+ mova xm0, [dstq+dsq*0]
+ vinserti128 m0, [dstq+dsq*1], 1
+ mova xm1, [dstq+dsq*2]
+ vinserti128 m1, [dstq+r6 ], 1
+ psubw m2, m0, [tmpq+32*0]
+ psubw m3, m1, [tmpq+32*1]
+ add maskq, 16*2
+ add tmpq, 32*2
+ pmullw m4, m6
+ pmullw m5, m6
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ mova [dstq+dsq*2], xm1
+ vextracti128 [dstq+r6 ], m1, 1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ pmovzxbw m4, [maskq+16*0]
+ pmovzxbw m5, [maskq+16*1]
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 32*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 32*1]
+ add maskq, 16*2
+ add tmpq, 32*2
+ pmullw m4, m6
+ pmullw m5, m6
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16
+ RET
+.w32:
+ pmovzxbw m4, [maskq+16*0]
+ pmovzxbw m5, [maskq+16*1]
+ mova m0, [dstq+32*0]
+ psubw m2, m0, [tmpq+32*0]
+ mova m1, [dstq+32*1]
+ psubw m3, m1, [tmpq+32*1]
+ add maskq, 16*2
+ add tmpq, 32*2
+ pmullw m4, m6
+ pmullw m5, m6
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ add dstq, dsq
+ dec hd
+ jg .w32
+ RET
+
+INIT_XMM avx2
+cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h
+%define base r5-blend_v_avx2_table
+ lea r5, [blend_v_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp wq
+.w2:
+ vpbroadcastd m2, [base+obmc_masks_avx2+2*2]
+.w2_loop:
+ movd m0, [dstq+dsq*0]
+ pinsrd m0, [dstq+dsq*1], 1
+ movq m1, [tmpq]
+ add tmpq, 4*2
+ psubw m1, m0, m1
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movd [dstq+dsq*0], m0
+ pextrd [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_loop
+ RET
+.w4:
+ vpbroadcastq m2, [base+obmc_masks_avx2+4*2]
+.w4_loop:
+ movq m0, [dstq+dsq*0]
+ movhps m0, [dstq+dsq*1]
+ psubw m1, m0, [tmpq]
+ add tmpq, 8*2
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+INIT_YMM avx2
+.w8:
+ vbroadcasti128 m2, [base+obmc_masks_avx2+8*2]
+.w8_loop:
+ mova xm0, [dstq+dsq*0]
+ vinserti128 m0, [dstq+dsq*1], 1
+ psubw m1, m0, [tmpq]
+ add tmpq, 16*2
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ mova m4, [base+obmc_masks_avx2+16*2]
+.w16_loop:
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 32*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 32*1]
+ add tmpq, 32*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32:
+%if WIN64
+ movaps [rsp+ 8], xmm6
+ movaps [rsp+24], xmm7
+%endif
+ mova m6, [base+obmc_masks_avx2+32*2]
+ vbroadcasti128 m7, [base+obmc_masks_avx2+32*3]
+.w32_loop:
+ mova m0, [dstq+dsq*0+32*0]
+ psubw m3, m0, [tmpq +32*0]
+ mova xm2, [dstq+dsq*0+32*1]
+ mova xm5, [tmpq +32*1]
+ mova m1, [dstq+dsq*1+32*0]
+ psubw m4, m1, [tmpq +32*2]
+ vinserti128 m2, [dstq+dsq*1+32*1], 1
+ vinserti128 m5, [tmpq +32*3], 1
+ add tmpq, 32*4
+ psubw m5, m2, m5
+ pmulhrsw m3, m6
+ pmulhrsw m4, m6
+ pmulhrsw m5, m7
+ paddw m0, m3
+ paddw m1, m4
+ paddw m2, m5
+ mova [dstq+dsq*0+32*0], m0
+ mova [dstq+dsq*1+32*0], m1
+ mova [dstq+dsq*0+32*1], xm2
+ vextracti128 [dstq+dsq*1+32*1], m2, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32_loop
+%if WIN64
+ movaps xmm6, [rsp+ 8]
+ movaps xmm7, [rsp+24]
+%endif
+ RET
+
+%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp
+ mova m0, [dstq+32*(%1+0)]
+ psubw m2, m0, [tmpq+32*(%2+0)]
+ mova m1, [dstq+32*(%1+1)]
+ psubw m3, m1, [tmpq+32*(%2+1)]
+%if %3
+ add tmpq, 32*%3
+%endif
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+32*(%1+0)], m0
+ mova [dstq+32*(%1+1)], m1
+%endmacro
+
+INIT_XMM avx2
+cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_h_avx2_table
+ lea r5, [blend_h_avx2_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea maskq, [base+obmc_masks_avx2+hq*2]
+ lea hd, [hq*3]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd m0, [dstq+dsq*0]
+ pinsrd m0, [dstq+dsq*1], 1
+ movd m2, [maskq+hq*2]
+ movq m1, [tmpq]
+ add tmpq, 4*2
+ punpcklwd m2, m2
+ psubw m1, m0, m1
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movd [dstq+dsq*0], m0
+ pextrd [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+ mova m3, [blend_shuf]
+.w4_loop:
+ movq m0, [dstq+dsq*0]
+ movhps m0, [dstq+dsq*1]
+ movd m2, [maskq+hq*2]
+ psubw m1, m0, [tmpq]
+ add tmpq, 8*2
+ pshufb m2, m3
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+INIT_YMM avx2
+.w8:
+ vbroadcasti128 m3, [blend_shuf]
+ shufpd m3, m3, 0x0c
+.w8_loop:
+ mova xm0, [dstq+dsq*0]
+ vinserti128 m0, [dstq+dsq*1], 1
+ vpbroadcastd m2, [maskq+hq*2]
+ psubw m1, m0, [tmpq]
+ add tmpq, 16*2
+ pshufb m2, m3
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+ RET
+.w16:
+ vpbroadcastw m4, [maskq+hq*2]
+ vpbroadcastw m5, [maskq+hq*2+2]
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 32*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 32*1]
+ add tmpq, 32*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w16
+ RET
+.w32:
+ vpbroadcastw m4, [maskq+hq*2]
+ BLEND_H_ROW 0, 0, 2
+ add dstq, dsq
+ inc hq
+ jl .w32
+ RET
+.w64:
+ vpbroadcastw m4, [maskq+hq*2]
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2, 4
+ add dstq, dsq
+ inc hq
+ jl .w64
+ RET
+.w128:
+ vpbroadcastw m4, [maskq+hq*2]
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2, 8
+ BLEND_H_ROW 4, -4
+ BLEND_H_ROW 6, -2
+ add dstq, dsq
+ inc hq
+ jl .w128
+ RET
+
+cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
+ bottomext, rightext
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes
+
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor r12d, r12d
+ lea r10, [ihq-1]
+ cmp yq, ihq
+ cmovs r10, yq
+ test yq, yq
+ cmovs r10, r12
+ imul r10, sstrideq
+ add srcq, r10
+
+ ; ref += iclip(x, 0, iw - 1)
+ lea r10, [iwq-1]
+ cmp xq, iwq
+ cmovs r10, xq
+ test xq, xq
+ cmovs r10, r12
+ lea srcq, [srcq+r10*2]
+
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ lea bottomextq, [yq+bhq]
+ sub bottomextq, ihq
+ lea r3, [bhq-1]
+ cmovs bottomextq, r12
+
+ DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; top_ext = iclip(-y, 0, bh - 1)
+ neg topextq
+ cmovs topextq, r12
+ cmp bottomextq, bhq
+ cmovns bottomextq, r3
+ cmp topextq, bhq
+ cmovg topextq, r3
+
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ lea rightextq, [xq+bwq]
+ sub rightextq, iwq
+ lea r2, [bwq-1]
+ cmovs rightextq, r12
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; left_ext = iclip(-x, 0, bw - 1)
+ neg leftextq
+ cmovs leftextq, r12
+ cmp rightextq, bwq
+ cmovns rightextq, r2
+ cmp leftextq, bwq
+ cmovns leftextq, r2
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
+ dst, dstride, src, sstride, bottomext, rightext
+
+ ; center_h = bh - top_ext - bottom_ext
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3
+
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+ imul r2, dstrideq
+ add dstq, r2
+ mov r9m, dstq
+
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
+
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+.v_loop_%3:
+%if %1
+ ; left extension
+ xor r3, r3
+ vpbroadcastw m0, [srcq]
+.left_loop_%3:
+ mova [dstq+r3*2], m0
+ add r3, 16
+ cmp r3, leftextq
+ jl .left_loop_%3
+
+ ; body
+ lea r12, [dstq+leftextq*2]
+%endif
+ xor r3, r3
+.body_loop_%3:
+ movu m0, [srcq+r3*2]
+%if %1
+ movu [r12+r3*2], m0
+%else
+ movu [dstq+r3*2], m0
+%endif
+ add r3, 16
+ cmp r3, centerwq
+ jl .body_loop_%3
+
+%if %2
+ ; right extension
+%if %1
+ lea r12, [r12+centerwq*2]
+%else
+ lea r12, [dstq+centerwq*2]
+%endif
+ xor r3, r3
+ vpbroadcastw m0, [srcq+centerwq*2-2]
+.right_loop_%3:
+ movu [r12+r3*2], m0
+ add r3, 16
+ cmp r3, rightextq
+ jl .right_loop_%3
+
+%endif
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+%endmacro
+
+ test leftextq, leftextq
+ jnz .need_left_ext
+ test rightextq, rightextq
+ jnz .need_right_ext
+ v_loop 0, 0, 0
+ jmp .body_done
+
+.need_left_ext:
+ test rightextq, rightextq
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+ ; bottom edge extension
+ test bottomextq, bottomextq
+ jz .top
+ mov srcq, dstq
+ sub srcq, dstrideq
+ xor r1, r1
+.bottom_x_loop:
+ mova m0, [srcq+r1*2]
+ lea r3, [dstq+r1*2]
+ mov r4, bottomextq
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .bottom_y_loop
+ add r1, 16
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension
+ test topextq, topextq
+ jz .end
+ mov srcq, r9m
+ mov dstq, dstm
+ xor r1, r1
+.top_x_loop:
+ mova m0, [srcq+r1*2]
+ lea r3, [dstq+r1*2]
+ mov r4, topextq
+.top_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .top_y_loop
+ add r1, 16
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
+
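+; resize_16bpc scales horizontally only: mx0 and dx are the 14-bit fixed-point
+; start position and per-pixel step, each output pixel is an 8-tap convolution
+; with a resize_filter phase taken from bits 8-13 of mx, and resize_shuf
+; supplies the pshufb patterns that clamp loads near the picture edges.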
+cglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ vpbroadcastd m5, dxm
+ vpbroadcastd m8, mx0m
+ vpbroadcastd m6, src_wm
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
+ LEA r7, $$
+%define base r7-$$
+ vpbroadcastd m3, [base+pd_64]
+ vpbroadcastw xm7, pxmaxm
+ pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
+ pslld m5, 3 ; dx*8
+ pslld m6, 14
+ paddd m8, m2 ; mx+[0..7]*dx
+.loop_y:
+ xor xd, xd
+ mova m4, m8 ; per-line working version of mx
+.loop_x:
+ vpbroadcastd m10, [base+pd_63]
+ pxor m2, m2
+ pmaxsd m0, m4, m2
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ pand m9, m10 ; filter offset (masked)
+ ; load source pixels
+ movd r8d, xm0
+ pextrd r9d, xm0, 1
+ pextrd r10d, xm0, 2
+ pextrd r11d, xm0, 3
+ vextracti128 xm0, m0, 1
+ movu xm10, [srcq+r8*2]
+ movu xm11, [srcq+r9*2]
+ movu xm12, [srcq+r10*2]
+ movu xm13, [srcq+r11*2]
+ movd r8d, xm0
+ pextrd r9d, xm0, 1
+ pextrd r10d, xm0, 2
+ pextrd r11d, xm0, 3
+ vinserti128 m10, [srcq+r8*2], 1
+ vinserti128 m11, [srcq+r9*2], 1
+ vinserti128 m12, [srcq+r10*2], 1
+ vinserti128 m13, [srcq+r11*2], 1
+ ptest m1, m1
+ jz .filter
+ movq r9, xm1
+ pextrq r11, xm1, 1
+ movsxd r8, r9d
+ sar r9, 32
+ movsxd r10, r11d
+ sar r11, 32
+ vextracti128 xm1, m1, 1
+ movu xm14, [base+resize_shuf+8+r8*2]
+ movu xm15, [base+resize_shuf+8+r9*2]
+ movu xm0, [base+resize_shuf+8+r10*2]
+ movu xm2, [base+resize_shuf+8+r11*2]
+ movq r9, xm1
+ pextrq r11, xm1, 1
+ movsxd r8, r9d
+ sar r9, 32
+ movsxd r10, r11d
+ sar r11, 32
+ vinserti128 m14, [base+resize_shuf+8+r8*2], 1
+ vinserti128 m15, [base+resize_shuf+8+r9*2], 1
+ vinserti128 m0, [base+resize_shuf+8+r10*2], 1
+ vinserti128 m2, [base+resize_shuf+8+r11*2], 1
+ pshufb m10, m14
+ pshufb m11, m15
+ pshufb m12, m0
+ pshufb m13, m2
+.filter:
+ movd r8d, xm9
+ pextrd r9d, xm9, 1
+ pextrd r10d, xm9, 2
+ pextrd r11d, xm9, 3
+ vextracti128 xm9, m9, 1
+ movq xm14, [base+resize_filter+r8*8]
+ movq xm15, [base+resize_filter+r9*8]
+ movq xm0, [base+resize_filter+r10*8]
+ movq xm2, [base+resize_filter+r11*8]
+ movd r8d, xm9
+ pextrd r9d, xm9, 1
+ pextrd r10d, xm9, 2
+ pextrd r11d, xm9, 3
+ movhps xm14, [base+resize_filter+r8*8]
+ movhps xm15, [base+resize_filter+r9*8]
+ movhps xm0, [base+resize_filter+r10*8]
+ movhps xm2, [base+resize_filter+r11*8]
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ pmovsxbw m0, xm0
+ pmovsxbw m2, xm2
+ pmaddwd m10, m14
+ pmaddwd m11, m15
+ pmaddwd m12, m0
+ pmaddwd m13, m2
+ phaddd m10, m11
+ phaddd m12, m13
+ phaddd m10, m12
+ psubd m10, m3, m10
+ psrad m10, 7
+ vextracti128 xm0, m10, 1
+ packusdw xm10, xm0
+ pminsw xm10, xm7
+ mova [dstq+xq*2], xm10
+ paddd m4, m5
+ add xd, 8
+ cmp xd, dst_wd
+ jl .loop_x
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/mc16_avx512.asm b/third_party/dav1d/src/x86/mc16_avx512.asm
new file mode 100644
index 0000000000..585ba53e08
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc16_avx512.asm
@@ -0,0 +1,4858 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+ db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41
+spel_h_shufC: db 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17
+ db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49
+ db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25
+ db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57
+spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+ db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45
+spel_h_shufD: db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21
+ db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53
+ db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29
+ db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61
+spel_v_shuf8: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
+ db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
+ db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
+ db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
+spel_v_shuf16: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
+ db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
+ db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
+ db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
+prep_endA: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
+ db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
+ db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
+ db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
+prep_endB: db 1, 2, 5, 6, 9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46
+ db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62
+ db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110
+ db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126
+prep_endC: db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78
+ db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94
+ db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110
+ db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126
+spel_shuf4a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
+ db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46
+spel_shuf4b: db 18, 19, 33, 34, 22, 23, 37, 38, 26, 27, 41, 42, 30, 31, 45, 46
+ db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
+spel_shuf8a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
+ db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78
+ db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
+ db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110
+spel_shuf8b: db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78
+ db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
+ db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110
+ db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
+spel_shuf16: db 1, 2, 33, 34, 5, 6, 37, 38, 9, 10, 41, 42, 13, 14, 45, 46
+ db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62
+ db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110
+ db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126
+spel_shuf32: db 1, 2, 65, 66, 5, 6, 69, 70, 9, 10, 73, 74, 13, 14, 77, 78
+ db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94
+ db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110
+ db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126
+spel_h_shuf2b: db 1, 2, 17, 18, 5, 6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38
+ db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50, 9, 10, 53, 54, 13, 14
+ db 9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46
+spel_shuf2: db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30
+spel_h_shuf2a: db 0, 1, 2, 3, 2, 3, 4, 5, 16, 17, 18, 19, 18, 19, 20, 21
+ db 4, 5, 6, 7, 6, 7, 8, 9, 20, 21, 22, 23, 22, 23, 24, 25
+w_mask_end42x: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
+w_mask_end444: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+ db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
+ db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
+w_mask_shuf4: db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
+ db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
+ db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94
+ db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126
+w_mask_shuf8: db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
+ db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
+ db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94
+ db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126
+w_mask_shuf16: db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
+ db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
+ db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110
+ db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126
+warp8x8_permA: db 0, 1, 2, 3, 32, 33, 34, 35, 2, 3, 4, 5, 34, 35, 36, 37
+ db 4, 5, 6, 7, 36, 37, 38, 39, 6, 7, 8, 9, 38, 39, 40, 41
+ db 8, 9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45
+ db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
+warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
+ db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53
+ db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57
+ db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61
+warp8x8_end: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
+ db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
+ db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
+ db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
+deint_q_shuf: ;dq 0, 2, 4, 6, 1, 3, 5, 7
+pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7
+ dd 1
+pw_2048: times 2 dw 2048
+ dd 3
+pw_8192: times 2 dw 8192
+avg_shift: dw 5, 5, 3, 3
+pw_27615: times 2 dw 27615
+pw_32766: times 2 dw 32766
+warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
+warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
+warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29
+resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
+resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13
+resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15
+resize_permE: dq 0, 2, 4, 6
+resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, -1, 12, -1, 13
+resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
+ db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
+
+prep_hv_shift: dq 6, 4
+put_bilin_h_rnd: dw 8, 8, 10, 10
+prep_mul: dw 16, 16, 4, 4
+put_8tap_h_rnd: dd 34, 40
+prep_8tap_rnd: dd 128 - (8192 << 8)
+warp_8x8_rnd_h: dd 512, 2048
+warp_8x8_rnd_v: dd 262144, 65536
+warp_8x8t_rnd_v: dd 16384 - (8192 << 15)
+avg_round: dw -16400, -16400, -16388, -16388
+w_avg_round: dd 128 + (8192 << 4), 32 + (8192 << 4)
+mask_round: dd 512 + (8192 << 6), 128 + (8192 << 6)
+w_mask_round: dd 128, 64
+bidir_shift: dw 6, 6, 4, 4
+
+pb_64: times 4 db 64
+pw_m512: times 2 dw -512
+pw_2: times 2 dw 2
+pw_64: times 2 dw 64
+pd_32: dd 32
+pd_63: dd 63
+pd_128: dd 128
+pd_640: dd 640
+pd_2176: dd 2176
+pd_16384: dd 16384
+pd_0_4: dd 0, 4
+
+%define pw_16 prep_mul
+%define pd_512 warp_8x8_rnd_h
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
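+; The three macros above build per-width jump tables: BASE_JMP_TABLE and
+; HV_JMP_TABLE emit word-sized label offsets and BIDIR_JMP_TABLE dword-sized
+; ones, each relative to a base symbol, so the dispatchers below can index a
+; table with tzcnt(w) and add the loaded offset back onto a base register.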
+
+%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put)
+%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep)
+
+BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
+cextern mc_warp_filter
+cextern obmc_masks_avx2
+cextern resize_filter
+
+SECTION .text
+
+%if WIN64
+DECLARE_REG_TMP 4
+%else
+DECLARE_REG_TMP 8
+%endif
+
+INIT_ZMM avx512icl
+cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy
+ mov mxyd, r6m ; mx
+ lea r7, [put_avx512icl]
+ tzcnt t0d, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx t0d, word [r7+t0*2+table_offset(put,)]
+ add t0, r7
+ jmp t0
+.put_w2:
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6d
+ mov [dstq+dsq*1], r7d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6
+ mov [dstq+dsq*1], r7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ movu xmm0, [srcq+ssq*0]
+ movu xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], xmm0
+ mova [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu ym0, [srcq+ssq*0]
+ movu ym1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], ym0
+ mova [dstq+dsq*1], ym1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+ssq*0+64*0]
+ movu m1, [srcq+ssq*0+64*1]
+ movu m2, [srcq+ssq*1+64*0]
+ movu m3, [srcq+ssq*1+64*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+64*0], m0
+ mova [dstq+dsq*0+64*1], m1
+ mova [dstq+dsq*1+64*0], m2
+ mova [dstq+dsq*1+64*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+ movu m2, [srcq+64*2]
+ movu m3, [srcq+64*3]
+ add srcq, ssq
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ mova [dstq+64*2], m2
+ mova [dstq+64*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h:
+ vpbroadcastw m5, mxyd
+ mov mxyd, r7m ; my
+ vpbroadcastd m4, [pw_16]
+ psubw m4, m5
+ test mxyd, mxyd
+ jnz .hv
+ ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
+ movzx t0d, word [r7+t0*2+table_offset(put, _bilin_h)]
+ mov r6d, r8m ; bitdepth_max
+ add t0, r7
+ shr r6d, 11
+ vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4]
+ jmp t0
+.h_w2:
+ movq xmm1, [srcq+ssq*0]
+ movhps xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmullw xmm0, xmm1, xm4
+ psrlq xmm1, 16
+ pmullw xmm1, xm5
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 4
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ movq xmm0, [srcq+ssq*0+0]
+ movhps xmm0, [srcq+ssq*1+0]
+ movq xmm1, [srcq+ssq*0+2]
+ movhps xmm1, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw xmm0, xm4
+ pmullw xmm1, xm5
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 4
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0+0]
+ vinserti32x4 ym0, [srcq+ssq*1+0], 1
+ movu xm1, [srcq+ssq*0+2]
+ vinserti32x4 ym1, [srcq+ssq*1+2], 1
+ lea srcq, [srcq+ssq*2]
+ pmullw ym0, ym4
+ pmullw ym1, ym5
+ paddw ym0, ym6
+ paddw ym0, ym1
+ psrlw ym0, 4
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu ym0, [srcq+ssq*0+0]
+ vinserti32x8 m0, [srcq+ssq*1+0], 1
+ movu ym1, [srcq+ssq*0+2]
+ vinserti32x8 m1, [srcq+ssq*1+2], 1
+ lea srcq, [srcq+ssq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m6
+ paddw m0, m1
+ psrlw m0, 4
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ pmullw m0, m4, [srcq+ssq*0+0]
+ pmullw m2, m5, [srcq+ssq*0+2]
+ pmullw m1, m4, [srcq+ssq*1+0]
+ pmullw m3, m5, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ paddw m0, m6
+ paddw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ pmullw m0, m4, [srcq+64*0+0]
+ pmullw m2, m5, [srcq+64*0+2]
+ pmullw m1, m4, [srcq+64*1+0]
+ pmullw m3, m5, [srcq+64*1+2]
+ add srcq, ssq
+ paddw m0, m6
+ paddw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ pmullw m0, m4, [srcq+64*0+0]
+ pmullw m7, m5, [srcq+64*0+2]
+ pmullw m1, m4, [srcq+64*1+0]
+ pmullw m8, m5, [srcq+64*1+2]
+ pmullw m2, m4, [srcq+64*2+0]
+ pmullw m9, m5, [srcq+64*2+2]
+ pmullw m3, m4, [srcq+64*3+0]
+ pmullw m10, m5, [srcq+64*3+2]
+ add srcq, ssq
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ paddw m0, m7
+ paddw m1, m8
+ paddw m2, m9
+ paddw m3, m10
+ REPX {psrlw x, 4}, m0, m1, m2, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ mova [dstq+64*2], m2
+ mova [dstq+64*3], m3
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx t0d, word [r7+t0*2+table_offset(put, _bilin_v)]
+ shl mxyd, 11
+ vpbroadcastw m8, mxyd
+ add t0, r7
+ jmp t0
+.v_w2:
+ movd xmm0, [srcq+ssq*0]
+.v_w2_loop:
+ movd xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpckldq xmm2, xmm0, xmm1
+ movd xmm0, [srcq+ssq*0]
+ punpckldq xmm1, xmm0
+ psubw xmm1, xmm2
+ pmulhrsw xmm1, xm8
+ paddw xmm1, xmm2
+ movd [dstq+dsq*0], xmm1
+ pextrd [dstq+dsq*1], xmm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq xmm0, [srcq+ssq*0]
+.v_w4_loop:
+ movq xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklqdq xmm2, xmm0, xmm1
+ movq xmm0, [srcq+ssq*0]
+ punpcklqdq xmm1, xmm0
+ psubw xmm1, xmm2
+ pmulhrsw xmm1, xm8
+ paddw xmm1, xmm2
+ movq [dstq+dsq*0], xmm1
+ movhps [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movu xmm0, [srcq+ssq*0]
+.v_w8_loop:
+ vbroadcasti128 ymm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd ymm2, ymm0, ymm1, 0xf0
+ vbroadcasti128 ymm0, [srcq+ssq*0]
+ vpblendd ymm1, ymm0, 0xf0
+ psubw ymm1, ymm2
+ pmulhrsw ymm1, ym8
+ paddw ymm1, ymm2
+ mova [dstq+dsq*0], xmm1
+ vextracti128 [dstq+dsq*1], ymm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ vzeroupper
+ RET
+.v_w16:
+ movu ym0, [srcq+ssq*0]
+.v_w16_loop:
+ movu ym3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ psubw ym1, ym3, ym0
+ pmulhrsw ym1, ym8
+ paddw ym1, ym0
+ movu ym0, [srcq+ssq*0]
+ psubw ym2, ym0, ym3
+ pmulhrsw ym2, ym8
+ paddw ym2, ym3
+ mova [dstq+dsq*0], ym1
+ mova [dstq+dsq*1], ym2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+ movu m0, [srcq+ssq*0]
+.v_w32_loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m1, m3, m0
+ pmulhrsw m1, m8
+ paddw m1, m0
+ movu m0, [srcq+ssq*0]
+ psubw m2, m0, m3
+ pmulhrsw m2, m8
+ paddw m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ RET
+.v_w64:
+ movu m0, [srcq+ssq*0+64*0]
+ movu m1, [srcq+ssq*0+64*1]
+.v_w64_loop:
+ movu m2, [srcq+ssq*1+64*0]
+ movu m3, [srcq+ssq*1+64*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m4, m2, m0
+ pmulhrsw m4, m8
+ paddw m4, m0
+ movu m0, [srcq+ssq*0+64*0]
+ psubw m5, m3, m1
+ pmulhrsw m5, m8
+ paddw m5, m1
+ movu m1, [srcq+ssq*0+64*1]
+ psubw m6, m0, m2
+ pmulhrsw m6, m8
+ psubw m7, m1, m3
+ pmulhrsw m7, m8
+ mova [dstq+dsq*0+64*0], m4
+ mova [dstq+dsq*0+64*1], m5
+ paddw m6, m2
+ paddw m7, m3
+ mova [dstq+dsq*1+64*0], m6
+ mova [dstq+dsq*1+64*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ movu m0, [srcq+ssq*0+64*0]
+ movu m1, [srcq+ssq*0+64*1]
+ movu m2, [srcq+ssq*0+64*2]
+ movu m3, [srcq+ssq*0+64*3]
+.v_w128_loop:
+ movu m4, [srcq+ssq*1+64*0]
+ movu m5, [srcq+ssq*1+64*1]
+ movu m6, [srcq+ssq*1+64*2]
+ movu m7, [srcq+ssq*1+64*3]
+ lea srcq, [srcq+ssq*2]
+ psubw m9, m4, m0
+ pmulhrsw m9, m8
+ paddw m9, m0
+ movu m0, [srcq+ssq*0+64*0]
+ psubw m10, m5, m1
+ pmulhrsw m10, m8
+ paddw m10, m1
+ movu m1, [srcq+ssq*0+64*1]
+ psubw m11, m6, m2
+ pmulhrsw m11, m8
+ paddw m11, m2
+ movu m2, [srcq+ssq*0+64*2]
+ psubw m12, m7, m3
+ pmulhrsw m12, m8
+ paddw m12, m3
+ movu m3, [srcq+ssq*0+64*3]
+ mova [dstq+dsq*0+64*0], m9
+ psubw m9, m0, m4
+ pmulhrsw m9, m8
+ mova [dstq+dsq*0+64*1], m10
+ psubw m10, m1, m5
+ pmulhrsw m10, m8
+ mova [dstq+dsq*0+64*2], m11
+ psubw m11, m2, m6
+ pmulhrsw m11, m8
+ mova [dstq+dsq*0+64*3], m12
+ psubw m12, m3, m7
+ pmulhrsw m12, m8
+ paddw m9, m4
+ paddw m10, m5
+ mova [dstq+dsq*1+64*0], m9
+ mova [dstq+dsq*1+64*1], m10
+ paddw m11, m6
+ paddw m12, m7
+ mova [dstq+dsq*1+64*2], m11
+ mova [dstq+dsq*1+64*3], m12
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w128_loop
+ RET
+.hv:
+ movzx t0d, word [r7+t0*2+table_offset(put, _bilin_hv)]
+ shl mxyd, 11
+ vpbroadcastd m6, [pw_2]
+ vpbroadcastw m7, mxyd
+ vpbroadcastd m8, [pw_8192]
+ add t0, r7
+ test dword r8m, 0x800
+ jnz .hv_12bpc
+ psllw m4, 2
+ psllw m5, 2
+ vpbroadcastd m8, [pw_2048]
+.hv_12bpc:
+ jmp t0
+.hv_w2:
+ vpbroadcastq xmm1, [srcq+ssq*0]
+ pmullw xmm0, xmm1, xm4
+ psrlq xmm1, 16
+ pmullw xmm1, xm5
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 2
+.hv_w2_loop:
+ movq xmm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm2, [srcq+ssq*0]
+ pmullw xmm1, xmm2, xm4
+ psrlq xmm2, 16
+ pmullw xmm2, xm5
+ paddw xmm1, xm6
+ paddw xmm1, xmm2
+ psrlw xmm1, 2 ; 1 _ 2 _
+ shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _
+ mova xmm0, xmm1
+ psubw xmm1, xmm2
+ paddw xmm1, xmm1
+ pmulhw xmm1, xm7
+ paddw xmm1, xmm2
+ pmulhrsw xmm1, xm8
+ movd [dstq+dsq*0], xmm1
+ pextrd [dstq+dsq*1], xmm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ pmullw xmm0, xm4, [srcq+ssq*0-8]
+ pmullw xmm1, xm5, [srcq+ssq*0-6]
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 2
+.hv_w4_loop:
+ movq xmm1, [srcq+ssq*1+0]
+ movq xmm2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm1, [srcq+ssq*0+0]
+ movhps xmm2, [srcq+ssq*0+2]
+ pmullw xmm1, xm4
+ pmullw xmm2, xm5
+ paddw xmm1, xm6
+ paddw xmm1, xmm2
+ psrlw xmm1, 2 ; 1 2
+ shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1
+ mova xmm0, xmm1
+ psubw xmm1, xmm2
+ paddw xmm1, xmm1
+ pmulhw xmm1, xm7
+ paddw xmm1, xmm2
+ pmulhrsw xmm1, xm8
+ movq [dstq+dsq*0], xmm1
+ movhps [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ pmullw xmm0, xm4, [srcq+ssq*0+0]
+ pmullw xmm1, xm5, [srcq+ssq*0+2]
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 2
+ vinserti32x4 ym0, xmm0, 1
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1+0]
+ movu xm2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym1, [srcq+ssq*0+0], 1
+ vinserti32x4 ym2, [srcq+ssq*0+2], 1
+ pmullw ym1, ym4
+ pmullw ym2, ym5
+ paddw ym1, ym6
+ paddw ym1, ym2
+ psrlw ym1, 2 ; 1 2
+ vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1
+ mova ym0, ym1
+ psubw ym1, ym2
+ paddw ym1, ym1
+ pmulhw ym1, ym7
+ paddw ym1, ym2
+ pmulhrsw ym1, ym8
+ mova [dstq+dsq*0], xm1
+ vextracti32x4 [dstq+dsq*1], ym1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ pmullw ym0, ym4, [srcq+ssq*0+0]
+ pmullw ym1, ym5, [srcq+ssq*0+2]
+ paddw ym0, ym6
+ paddw ym0, ym1
+ psrlw ym0, 2
+ vinserti32x8 m0, ym0, 1
+.hv_w16_loop:
+ movu ym1, [srcq+ssq*1+0]
+ movu ym2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m1, [srcq+ssq*0+0], 1
+ vinserti32x8 m2, [srcq+ssq*0+2], 1
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m6
+ paddw m1, m2
+ psrlw m1, 2 ; 1 2
+ vshufi32x4 m2, m0, m1, q1032 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m7
+ paddw m1, m2
+ pmulhrsw m1, m8
+ mova [dstq+dsq*0], ym1
+ vextracti32x8 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+.hv_w64:
+.hv_w128:
+ movifnidn wd, wm
+ lea r6d, [hq+wq*8-256]
+ mov r4, srcq
+ mov r7, dstq
+.hv_w32_loop0:
+ pmullw m0, m4, [srcq+ssq*0+0]
+ pmullw m1, m5, [srcq+ssq*0+2]
+ paddw m0, m6
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w32_loop:
+ pmullw m3, m4, [srcq+ssq*1+0]
+ pmullw m1, m5, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ paddw m3, m6
+ paddw m3, m1
+ psrlw m3, 2
+ psubw m1, m3, m0
+ paddw m1, m1
+ pmulhw m1, m7
+ paddw m1, m0
+ pmullw m0, m4, [srcq+ssq*0+0]
+ pmullw m2, m5, [srcq+ssq*0+2]
+ paddw m0, m6
+ paddw m0, m2
+ psrlw m0, 2
+ psubw m2, m0, m3
+ paddw m2, m2
+ pmulhw m2, m7
+ paddw m2, m3
+ pmulhrsw m1, m8
+ pmulhrsw m2, m8
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w32_loop
+ add r4, 64
+ add r7, 64
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .hv_w32_loop0
+ RET
+
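+; prep_bilin writes intermediates to tmp instead of clamped pixels: the copy
+; path scales rows by the bitdepth-dependent prep_mul factor and subtracts
+; the 8192 bias (pw_8192) so results fit the signed 16-bit prep format.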
+cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea r6, [prep_avx512icl]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ mov r5d, r7m ; bitdepth_max
+ vpbroadcastd m5, [r6-prep_avx512icl+pw_8192]
+ add wq, r6
+ shr r5d, 11
+ vpbroadcastd m4, [r6-prep_avx512icl+prep_mul+r5*4]
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movq xmm0, [srcq+strideq*0]
+ movhps xmm0, [srcq+strideq*1]
+ vpbroadcastq ymm1, [srcq+strideq*2]
+ vpbroadcastq ymm2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd ymm0, ymm1, 0x30
+ vpblendd ymm0, ymm2, 0xc0
+ pmullw ymm0, ym4
+ psubw ymm0, ym5
+ mova [tmpq], ymm0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ vzeroupper
+ RET
+.prep_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti32x4 ym0, [srcq+strideq*1], 1
+ vinserti32x4 m0, [srcq+strideq*2], 2
+ vinserti32x4 m0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ psubw m0, m5
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ pmullw m1, m4
+ psubw m0, m5
+ psubw m1, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m4, [srcq+strideq*2]
+ pmullw m3, m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 4
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmullw m0, m4, [srcq+strideq*0+64*0]
+ pmullw m1, m4, [srcq+strideq*0+64*1]
+ pmullw m2, m4, [srcq+strideq*1+64*0]
+ pmullw m3, m4, [srcq+strideq*1+64*1]
+ lea srcq, [srcq+strideq*2]
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 2
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmullw m0, m4, [srcq+64*0]
+ pmullw m1, m4, [srcq+64*1]
+ pmullw m2, m4, [srcq+64*2]
+ pmullw m3, m4, [srcq+64*3]
+ add srcq, strideq
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ vpbroadcastw m5, mxyd
+ mov mxyd, r6m ; my
+ vpbroadcastd m4, [pw_16]
+ vpbroadcastd m6, [pw_32766]
+ psubw m4, m5
+ test dword r7m, 0x800
+ jnz .h_12bpc
+ psllw m4, 2
+ psllw m5, 2
+.h_12bpc:
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ movu xm1, [srcq+strideq*0]
+ vinserti32x4 ym1, [srcq+strideq*2], 1
+ movu xm2, [srcq+strideq*1]
+ vinserti32x4 ym2, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq ym0, ym1, ym2
+ psrldq ym1, 2
+ psrldq ym2, 2
+ pmullw ym0, ym4
+ punpcklqdq ym1, ym2
+ pmullw ym1, ym5
+ psubw ym0, ym6
+ paddw ym0, ym1
+ psraw ym0, 2
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4
+ RET
+.h_w8:
+ movu xm0, [srcq+strideq*0+0]
+ movu xm1, [srcq+strideq*0+2]
+ vinserti32x4 ym0, [srcq+strideq*1+0], 1
+ vinserti32x4 ym1, [srcq+strideq*1+2], 1
+ vinserti32x4 m0, [srcq+strideq*2+0], 2
+ vinserti32x4 m1, [srcq+strideq*2+2], 2
+ vinserti32x4 m0, [srcq+stride3q +0], 3
+ vinserti32x4 m1, [srcq+stride3q +2], 3
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ pmullw m1, m5
+ psubw m0, m6
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8
+ RET
+.h_w16:
+ movu ym0, [srcq+strideq*0+0]
+ vinserti32x8 m0, [srcq+strideq*1+0], 1
+ movu ym1, [srcq+strideq*0+2]
+ vinserti32x8 m1, [srcq+strideq*1+2], 1
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ psubw m0, m6
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ pmullw m0, m4, [srcq+strideq*0+0]
+ pmullw m2, m5, [srcq+strideq*0+2]
+ pmullw m1, m4, [srcq+strideq*1+0]
+ pmullw m3, m5, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ psubw m0, m6
+ psubw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m2, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+64]
+ pmullw m3, m5, [srcq+66]
+ add srcq, strideq
+ psubw m0, m6
+ psubw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m7, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+ 64]
+ pmullw m8, m5, [srcq+ 66]
+ pmullw m2, m4, [srcq+128]
+ pmullw m9, m5, [srcq+130]
+ pmullw m3, m4, [srcq+192]
+ pmullw m10, m5, [srcq+194]
+ add srcq, strideq
+ REPX {psubw x, m6}, m0, m1, m2, m3
+ paddw m0, m7
+ paddw m1, m8
+ paddw m2, m9
+ paddw m3, m10
+ REPX {psraw x, 2}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+ vpbroadcastw m9, mxyd
+ vpbroadcastd m8, [pw_16]
+ vpbroadcastd m10, [pw_32766]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ psubw m8, m9
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m8, 2
+ psllw m9, 2
+.v_12bpc:
+ jmp wq
+.v_w4:
+ movq xmm0, [srcq+strideq*0]
+.v_w4_loop:
+ vpbroadcastq xmm2, [srcq+strideq*1]
+ vpbroadcastq ymm1, [srcq+strideq*2]
+ vpbroadcastq ymm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd ymm2, ymm1, 0x30
+ vpblendd ymm2, ymm3, 0xc0
+ vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3
+ movq xmm0, [srcq+strideq*0]
+ valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4
+ pmullw ymm1, ym8
+ pmullw ymm2, ym9
+ psubw ymm1, ym10
+ paddw ymm1, ymm2
+ psraw ymm1, 2
+ mova [tmpq], ymm1
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ movu xm0, [srcq+strideq*0]
+.v_w8_loop:
+ vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
+ vinserti32x4 m1, [srcq+strideq*2], 2
+ vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3
+ lea srcq, [srcq+strideq*4]
+ movu xm0, [srcq+strideq*0]
+ valignq m2, m0, m1, 2 ; 1 2 3 4
+ pmullw m1, m8
+ pmullw m2, m9
+ psubw m1, m10
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu ym0, [srcq+strideq*0]
+.v_w16_loop:
+ vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1
+ movu ym3, [srcq+strideq*2]
+ vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3
+ lea srcq, [srcq+strideq*4]
+ movu ym0, [srcq+strideq*0]
+ vshufi32x4 m3, m1, m3, q1032 ; 1 2
+ vshufi32x4 m4, m2, m0, q1032 ; 3 4
+ pmullw m1, m8
+ pmullw m2, m8
+ pmullw m3, m9
+ pmullw m4, m9
+ psubw m1, m10
+ psubw m2, m10
+ paddw m1, m3
+ paddw m2, m4
+ psraw m1, 2
+ psraw m2, 2
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ movu m0, [srcq+strideq*0]
+.v_w32_loop:
+ movu m3, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m1, m8, m0
+ movu m0, [srcq+strideq*0]
+ pmullw m2, m8, m3
+ pmullw m3, m9
+ pmullw m4, m9, m0
+ psubw m1, m10
+ psubw m2, m10
+ paddw m1, m3
+ paddw m2, m4
+ psraw m1, 2
+ psraw m2, 2
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 2
+ jg .v_w32_loop
+ RET
+.v_w64:
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+.v_w64_loop:
+ add srcq, strideq
+ pmullw m2, m8, m0
+ movu m0, [srcq+64*0]
+ pmullw m3, m8, m1
+ movu m1, [srcq+64*1]
+ pmullw m4, m9, m0
+ pmullw m5, m9, m1
+ psubw m2, m10
+ psubw m3, m10
+ paddw m2, m4
+ paddw m3, m5
+ psraw m2, 2
+ psraw m3, 2
+ mova [tmpq+64*0], m2
+ mova [tmpq+64*1], m3
+ add tmpq, 64*2
+ dec hd
+ jg .v_w64_loop
+ RET
+.v_w128:
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+ movu m2, [srcq+64*2]
+ movu m3, [srcq+64*3]
+.v_w128_loop:
+ add srcq, strideq
+ pmullw m4, m8, m0
+ movu m0, [srcq+64*0]
+ pmullw m5, m8, m1
+ movu m1, [srcq+64*1]
+ pmullw m6, m8, m2
+ movu m2, [srcq+64*2]
+ pmullw m7, m8, m3
+ movu m3, [srcq+64*3]
+ pmullw m11, m9, m0
+ pmullw m12, m9, m1
+ pmullw m13, m9, m2
+ pmullw m14, m9, m3
+ REPX {psubw x, m10}, m4, m5, m6, m7
+ paddw m4, m11
+ paddw m5, m12
+ paddw m6, m13
+ paddw m7, m14
+ REPX {psraw x, 2}, m4, m5, m6, m7
+ mova [tmpq+64*0], m4
+ mova [tmpq+64*1], m5
+ mova [tmpq+64*2], m6
+ mova [tmpq+64*3], m7
+ add tmpq, 64*4
+ dec hd
+ jg .v_w128_loop
+ RET
+.hv:
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
+ vpbroadcastw m7, mxyd
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.hv_w4:
+ movq xmm0, [srcq+strideq*0+0]
+ movq xmm1, [srcq+strideq*0+2]
+ pmullw xmm0, xm4
+ pmullw xmm1, xm5
+ psubw xmm0, xm6
+ paddw xmm0, xmm1
+ psraw xmm0, 2
+ vpbroadcastq ym0, xmm0
+.hv_w4_loop:
+ movu xm1, [srcq+strideq*1]
+ vinserti128 ym1, [srcq+stride3q ], 1
+ movu xm2, [srcq+strideq*2]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 ym2, [srcq+strideq*0], 1
+ punpcklqdq ym3, ym1, ym2
+ psrldq ym1, 2
+ psrldq ym2, 2
+ pmullw ym3, ym4
+ punpcklqdq ym1, ym2
+ pmullw ym1, ym5
+ psubw ym3, ym6
+ paddw ym1, ym3
+ psraw ym1, 2 ; 1 2 3 4
+ valignq ym2, ym1, ym0, 3 ; 0 1 2 3
+ mova ym0, ym1
+ psubw ym1, ym2
+ pmulhrsw ym1, ym7
+ paddw ym1, ym2
+ mova [tmpq], ym1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ pmullw xm0, xm4, [srcq+strideq*0+0]
+ pmullw xm1, xm5, [srcq+strideq*0+2]
+ psubw xm0, xm6
+ paddw xm0, xm1
+ psraw xm0, 2
+ vinserti32x4 m0, xm0, 3
+.hv_w8_loop:
+ movu xm1, [srcq+strideq*1+0]
+ movu xm2, [srcq+strideq*1+2]
+ vinserti32x4 ym1, [srcq+strideq*2+0], 1
+ vinserti32x4 ym2, [srcq+strideq*2+2], 1
+ vinserti32x4 m1, [srcq+stride3q +0], 2
+ vinserti32x4 m2, [srcq+stride3q +2], 2
+ lea srcq, [srcq+strideq*4]
+ vinserti32x4 m1, [srcq+strideq*0+0], 3
+ vinserti32x4 m2, [srcq+strideq*0+2], 3
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m6
+ paddw m1, m2
+ psraw m1, 2 ; 1 2 3 4
+ valignq m2, m1, m0, 6 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ pmullw ym0, ym4, [srcq+strideq*0+0]
+ pmullw ym1, ym5, [srcq+strideq*0+2]
+ psubw ym0, ym6
+ paddw ym0, ym1
+ psraw ym0, 2
+ vinserti32x8 m0, ym0, 1
+.hv_w16_loop:
+ movu ym1, [srcq+strideq*1+0]
+ movu ym2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m1, [srcq+strideq*0+0], 1
+ vinserti32x8 m2, [srcq+strideq*0+2], 1
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m6
+ paddw m1, m2
+ psraw m1, 2 ; 1 2
+ vshufi32x4 m2, m0, m1, q1032 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ pmullw m0, m4, [srcq+strideq*0+0]
+ pmullw m1, m5, [srcq+strideq*0+2]
+ psubw m0, m6
+ paddw m0, m1
+ psraw m0, 2
+.hv_w32_loop:
+ pmullw m3, m4, [srcq+strideq*1+0]
+ pmullw m1, m5, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ psubw m3, m6
+ paddw m3, m1
+ psraw m3, 2
+ psubw m1, m3, m0
+ pmulhrsw m1, m7
+ paddw m1, m0
+ pmullw m0, m4, [srcq+strideq*0+0]
+ pmullw m2, m5, [srcq+strideq*0+2]
+ psubw m0, m6
+ paddw m0, m2
+ psraw m0, 2
+ psubw m2, m0, m3
+ pmulhrsw m2, m7
+ paddw m2, m3
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 2
+ jg .hv_w32_loop
+ RET
+.hv_w64:
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m2, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+64]
+ pmullw m3, m5, [srcq+66]
+ psubw m0, m6
+ psubw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+.hv_w64_loop:
+ add srcq, strideq
+ pmullw m2, m4, [srcq+ 0]
+ pmullw m8, m5, [srcq+ 2]
+ pmullw m3, m4, [srcq+64]
+ pmullw m9, m5, [srcq+66]
+ psubw m2, m6
+ psubw m3, m6
+ paddw m2, m8
+ paddw m3, m9
+ psraw m2, 2
+ psraw m3, 2
+ psubw m8, m2, m0
+ psubw m9, m3, m1
+ pmulhrsw m8, m7
+ pmulhrsw m9, m7
+ paddw m8, m0
+ mova m0, m2
+ paddw m9, m1
+ mova m1, m3
+ mova [tmpq+64*0], m8
+ mova [tmpq+64*1], m9
+ add tmpq, 64*2
+ dec hd
+ jg .hv_w64_loop
+ RET
+.hv_w128:
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m8, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+ 64]
+ pmullw m9, m5, [srcq+ 66]
+ pmullw m2, m4, [srcq+128]
+ pmullw m10, m5, [srcq+130]
+ pmullw m3, m4, [srcq+192]
+ pmullw m11, m5, [srcq+194]
+ REPX {psubw x, m6}, m0, m1, m2, m3
+ paddw m0, m8
+ paddw m1, m9
+ paddw m2, m10
+ paddw m3, m11
+ REPX {psraw x, 2}, m0, m1, m2, m3
+.hv_w128_loop:
+ add srcq, strideq
+ pmullw m8, m4, [srcq+ 0]
+ pmullw m12, m5, [srcq+ 2]
+ pmullw m9, m4, [srcq+ 64]
+ pmullw m13, m5, [srcq+ 66]
+ pmullw m10, m4, [srcq+128]
+ pmullw m14, m5, [srcq+130]
+ pmullw m11, m4, [srcq+192]
+ pmullw m15, m5, [srcq+194]
+ REPX {psubw x, m6}, m8, m9, m10, m11
+ paddw m8, m12
+ paddw m9, m13
+ paddw m10, m14
+ paddw m11, m15
+ REPX {psraw x, 2}, m8, m9, m10, m11
+ psubw m12, m8, m0
+ psubw m13, m9, m1
+ psubw m14, m10, m2
+ psubw m15, m11, m3
+ REPX {pmulhrsw x, m7}, m12, m13, m14, m15
+ paddw m12, m0
+ mova m0, m8
+ paddw m13, m1
+ mova m1, m9
+ mova [tmpq+64*0], m12
+ mova [tmpq+64*1], m13
+ paddw m14, m2
+ mova m2, m10
+ paddw m15, m3
+ mova m3, m11
+ mova [tmpq+64*2], m14
+ mova [tmpq+64*3], m15
+ add tmpq, 64*4
+ dec hd
+ jg .hv_w128_loop
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
+
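+; The FILTER_* constants pack two row offsets into subpel_filters: the high
+; word selects the 8-tap filter set (used for w > 4 after the shr by 16) and
+; the low word the corresponding 4-tap set (reached through the low byte for
+; w <= 4). Each MC_8TAP_FN stamp loads these into t0d/t1d and jumps into the
+; shared put_8tap_16bpc body below; regular/regular is declared last and
+; falls through without a jump.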
+%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v
+cglobal %1_8tap_%2_16bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX)
+%endif
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%define buf rsp+stack_offset+8 ; shadow space
+%else
+DECLARE_REG_TMP 7, 8
+%define buf rsp-40 ; red zone
+%endif
+
+MC_8TAP_FN put, sharp, SHARP, SHARP
+MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH
+MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP
+MC_8TAP_FN put, smooth, SMOOTH, SMOOTH
+MC_8TAP_FN put, sharp_regular, SHARP, REGULAR
+MC_8TAP_FN put, regular_sharp, REGULAR, SHARP
+MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR
+MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH
+MC_8TAP_FN put, regular, REGULAR, REGULAR
+
+cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my
+%define base r8-put_avx512icl
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx512icl]
+ movifnidn wd, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ sub srcq, 2
+ mova ym2, [spel_h_shuf2a]
+ pmovsxbw xmm4, [base+subpel_filters+mxq*8]
+ pshufd xmm3, xmm4, q1111
+ pshufd xmm4, xmm4, q2222
+.h_w2_loop:
+ movu xm1, [srcq+ssq*0]
+ vinserti32x4 ym1, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ mova xmm0, xm8
+ vpermb ym1, ym2, ym1
+ vpdpwssd xmm0, xmm3, xm1
+ vextracti32x4 xm1, ym1, 1
+ vpdpwssd xmm0, xmm4, xm1
+ psrad xmm0, 6
+ packusdw xmm0, xmm0
+ pminsw xmm0, xm9
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ vbroadcasti32x4 ym4, [spel_h_shufA]
+ vbroadcasti32x4 ym5, [spel_h_shufB]
+ pshufd xmm0, xmm0, q2211
+ vpbroadcastq ym6, xmm0
+ vpermq ym7, ymm0, q1111
+.h_w4_loop:
+ movu xm2, [srcq+ssq*0]
+ vinserti32x4 ym2, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ mova ym0, ym8
+ pshufb ym1, ym2, ym4
+ vpdpwssd ym0, ym6, ym1
+ pshufb ym2, ym5
+ vpdpwssd ym0, ym7, ym2
+ psrad ym0, 6
+ vextracti32x4 xm1, ym0, 1
+ packusdw xm0, xm1
+ pminsw xmm0, xm0, xm9
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ mov r7d, r8m
+ vpbroadcastw m9, r8m
+ shr r7d, 11
+ vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4]
+ cmp wd, 4
+ je .h_w4
+ jl .h_w2
+ shr mxd, 16
+ sub srcq, 6
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ mova [buf], xmm0
+ vpbroadcastd m10, xmm0
+ vpbroadcastd m11, [buf+ 4]
+ vpbroadcastd m12, [buf+ 8]
+ vpbroadcastd m13, [buf+12]
+ sub wd, 16
+ je .h_w16
+ jg .h_w32
+.h_w8:
+ mova m4, [spel_h_shufA]
+ movu m5, [spel_h_shufB]
+ movu m6, [spel_h_shufC]
+ mova m7, [spel_h_shufD]
+.h_w8_loop:
+ movu ym2, [srcq+ssq*0]
+ vinserti32x8 m2, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ mova m0, m8
+ vpermb m1, m4, m2
+ vpdpwssd m0, m10, m1
+ vpermb m1, m5, m2
+ vpdpwssd m0, m11, m1
+ vpermb m1, m6, m2
+ vpdpwssd m0, m12, m1
+ vpermb m1, m7, m2
+ vpdpwssd m0, m13, m1
+ psrad m0, 6
+ vextracti32x8 ym1, m0, 1
+ packusdw ym0, ym1
+ pminsw ym0, ym9
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8_loop
+ RET
+.h_w16:
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+.h_w16_loop:
+ movu ym2, [srcq+ssq*0+ 0]
+ vinserti32x8 m2, [srcq+ssq*1+ 0], 1
+ movu ym3, [srcq+ssq*0+16]
+ vinserti32x8 m3, [srcq+ssq*1+16], 1
+ lea srcq, [srcq+ssq*2]
+ mova m0, m8
+ mova m1, m8
+ pshufb m4, m2, m6
+ vpdpwssd m0, m10, m4 ; a0
+ pshufb m4, m3, m6
+ vpdpwssd m1, m12, m4 ; b2
+ pshufb m4, m2, m7
+ vpdpwssd m0, m11, m4 ; a1
+ pshufb m4, m3, m7
+ vpdpwssd m1, m13, m4 ; b3
+ shufpd m2, m3, 0x55
+ pshufb m4, m2, m6
+ vpdpwssd m0, m12, m4 ; a2
+ vpdpwssd m1, m10, m4 ; b0
+ pshufb m2, m7
+ vpdpwssd m0, m13, m2 ; a3
+ vpdpwssd m1, m11, m2 ; b1
+ psrad m0, 6
+ psrad m1, 6
+ packusdw m0, m1
+ pminsw m0, m9
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ lea srcq, [srcq+wq*2]
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ lea dstq, [dstq+wq*2]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ neg wq
+.h_w32_loop0:
+ mov r6, wq
+.h_w32_loop:
+ movu m2, [srcq+r6*2+ 0]
+ movu m3, [srcq+r6*2+ 8]
+ mova m0, m8
+ mova m1, m8
+ pshufb m4, m2, m6
+ vpdpwssd m0, m10, m4 ; a0
+ pshufb m4, m3, m6
+ vpdpwssd m1, m10, m4 ; b0
+ vpdpwssd m0, m12, m4 ; a2
+ movu m4, [srcq+r6*2+16]
+ pshufb m3, m7
+ vpdpwssd m1, m11, m3 ; b1
+ vpdpwssd m0, m13, m3 ; a3
+ pshufb m3, m4, m6
+ vpdpwssd m1, m12, m3 ; b2
+ pshufb m2, m7
+ vpdpwssd m0, m11, m2 ; a1
+ pshufb m4, m7
+ vpdpwssd m1, m13, m4 ; b3
+ psrad m0, 6
+ psrad m1, 6
+ packusdw m0, m1
+ pminsw m0, m9
+ mova [dstq+r6*2], m0
+ add r6, 32
+ jl .h_w32_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w32_loop0
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastd m10, [pd_32]
+ pmovsxbw xmm0, [base+subpel_filters+myq*8]
+ tzcnt r7d, wd
+ vpbroadcastw m11, r8m
+ lea r6, [ssq*3]
+ movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)]
+ sub srcq, r6
+ mova [rsp+stack_offset+8], xmm0
+ vpbroadcastd m12, xmm0
+ add r7, r8
+ vpbroadcastd m13, [rsp+stack_offset+12]
+ vpbroadcastd m14, [rsp+stack_offset+16]
+ vpbroadcastd m15, [rsp+stack_offset+20]
+ jmp r7
+.v_w2:
+ movd xmm2, [srcq+ssq*0]
+ pinsrd xmm2, [srcq+ssq*1], 1
+ pinsrd xmm2, [srcq+ssq*2], 2
+ add srcq, r6
+ pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3
+ movd xmm3, [srcq+ssq*1]
+ vpbroadcastd xmm1, [srcq+ssq*2]
+ add srcq, r6
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm3, xmm1, 0x02 ; 4 5
+ vpblendd xmm1, xmm0, 0x02 ; 5 6
+ palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
+ punpcklwd xmm3, xmm1 ; 45 56
+ punpcklwd xmm1, xmm2, xmm4 ; 01 12
+ punpckhwd xmm2, xmm4 ; 23 34
+.v_w2_loop:
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova xmm5, xm10
+ vpdpwssd xmm5, xm12, xmm1 ; a0 b0
+ mova xmm1, xmm2
+ vpdpwssd xmm5, xm13, xmm2 ; a1 b1
+ mova xmm2, xmm3
+ vpdpwssd xmm5, xm14, xmm3 ; a2 b2
+ vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm0, 0x02 ; 7 8
+ punpcklwd xmm3, xmm4 ; 67 78
+ vpdpwssd xmm5, xm15, xmm3 ; a3 b3
+ psrad xmm5, 6
+ packusdw xmm5, xmm5
+ pminsw xmm5, xm11
+ movd [dstq+dsq*0], xmm5
+ pextrd [dstq+dsq*1], xmm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq xmm1, [srcq+ssq*0]
+ vpbroadcastq ymm0, [srcq+ssq*1]
+ vpbroadcastq ymm2, [srcq+ssq*2]
+ add srcq, r6
+ vpbroadcastq ymm4, [srcq+ssq*0]
+ vpbroadcastq ymm3, [srcq+ssq*1]
+ vpbroadcastq ymm5, [srcq+ssq*2]
+ add srcq, r6
+ vpblendd ymm1, ymm0, 0x30
+ vpblendd ymm0, ymm2, 0x30
+ punpcklwd ymm1, ymm0 ; 01 12
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm2, ymm4, 0x30
+ vpblendd ymm4, ymm3, 0x30
+ punpcklwd ymm2, ymm4 ; 23 34
+ vpblendd ymm3, ymm5, 0x30
+ vpblendd ymm5, ymm0, 0x30
+ punpcklwd ymm3, ymm5 ; 45 56
+.v_w4_loop:
+ vpbroadcastq ymm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova ymm4, ym10
+ vpdpwssd ymm4, ym12, ymm1 ; a0 b0
+ mova ymm1, ymm2
+ vpdpwssd ymm4, ym13, ymm2 ; a1 b1
+ mova ymm2, ymm3
+ vpdpwssd ymm4, ym14, ymm3 ; a2 b2
+ vpblendd ymm3, ymm0, ymm5, 0x30
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm5, ymm0, 0x30
+ punpcklwd ymm3, ymm5 ; 67 78
+ vpdpwssd ymm4, ym15, ymm3 ; a3 b3
+ psrad ymm4, 6
+ vextracti128 xmm5, ymm4, 1
+ packusdw xmm4, xmm5
+ pminsw xmm4, xm11
+ movq [dstq+dsq*0], xmm4
+ movhps [dstq+dsq*1], xmm4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ vbroadcasti32x4 m2, [srcq+ssq*2]
+ vinserti32x4 m1, m2, [srcq+ssq*0], 0
+ vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2
+ add srcq, r6
+ vinserti32x4 ym2, [srcq+ssq*0], 1
+ vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4
+ mova m6, [spel_v_shuf8]
+ movu xm0, [srcq+ssq*1]
+ vinserti32x4 ym0, [srcq+ssq*2], 1
+ add srcq, r6
+ vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6
+ vpermb m1, m6, m1 ; 01 12
+ vpermb m2, m6, m2 ; 23 34
+ vpermb m3, m6, m0 ; 45 56
+.v_w8_loop:
+ vinserti32x4 m0, [srcq+ssq*1], 3
+ lea srcq, [srcq+ssq*2]
+ movu xm5, [srcq+ssq*0]
+ mova m4, m10
+ vpdpwssd m4, m12, m1 ; a0 b0
+ mova m1, m2
+ vshufi32x4 m0, m5, q1032 ; 6 7 8
+ vpdpwssd m4, m13, m2 ; a1 b1
+ mova m2, m3
+ vpdpwssd m4, m14, m3 ; a2 b2
+ vpermb m3, m6, m0 ; 67 78
+ vpdpwssd m4, m15, m3 ; a3 b3
+ psrad m4, 6
+ vextracti32x8 ym5, m4, 1
+ packusdw ym4, ym5
+ pminsw ym4, ym11
+ mova [dstq+dsq*0], xm4
+ vextracti32x4 [dstq+dsq*1], ym4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ vbroadcasti32x8 m1, [srcq+ssq*1]
+ vinserti32x8 m0, m1, [srcq+ssq*0], 0
+ vinserti32x8 m1, [srcq+ssq*2], 1
+ mova m8, [spel_v_shuf16]
+ add srcq, r6
+ movu ym3, [srcq+ssq*0]
+ vinserti32x8 m3, [srcq+ssq*1], 1
+ movu ym5, [srcq+ssq*2]
+ add srcq, r6
+ vinserti32x8 m5, [srcq+ssq*0], 1
+ vpermb m0, m8, m0 ; 01
+ vpermb m1, m8, m1 ; 12
+ vpermb m3, m8, m3 ; 34
+ vpermb m5, m8, m5 ; 56
+ mova m9, [deint_q_shuf]
+ vpshrdd m2, m1, m3, 16 ; 23
+ vpshrdd m4, m3, m5, 16 ; 45
+.v_w16_loop:
+ mova m6, m10
+ mova m7, m10
+ vpdpwssd m6, m12, m0 ; a0
+ mova m0, m2
+ vpdpwssd m7, m12, m1 ; b0
+ mova m1, m3
+ vpdpwssd m6, m13, m2 ; a1
+ mova m2, m4
+ vpdpwssd m7, m13, m3 ; b1
+ mova m3, m5
+ vpdpwssd m6, m14, m4 ; a2
+ mova m4, m5
+ vpdpwssd m7, m14, m5 ; b2
+ movu ym5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m5, [srcq+ssq*0], 1
+ vpermb m5, m8, m5 ; 78
+ vpshrdd m4, m5, 16 ; 67
+ vpdpwssd m6, m15, m4 ; a3
+ vpdpwssd m7, m15, m5 ; b3
+ psrad m6, 6
+ psrad m7, 6
+ packusdw m6, m7
+ pminsw m6, m11
+ vpermq m6, m9, m6
+ mova [dstq+dsq*0], ym6
+ vextracti32x8 [dstq+dsq*1], m6, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+.v_w64:
+.v_w128:
+%if WIN64
+ movaps [rsp+stack_offset+8], xmm6
+%endif
+ lea wd, [hq+wq*8-256]
+ mov r7, srcq
+ mov r8, dstq
+.v_w32_loop0:
+ movu m16, [srcq+ssq*0]
+ movu m17, [srcq+ssq*1]
+ movu m18, [srcq+ssq*2]
+ add srcq, r6
+ movu m19, [srcq+ssq*0]
+ movu m20, [srcq+ssq*1]
+ movu m21, [srcq+ssq*2]
+ add srcq, r6
+ movu m22, [srcq+ssq*0]
+ punpcklwd m0, m16, m17 ; 01l
+ punpckhwd m16, m17 ; 01h
+ punpcklwd m1, m17, m18 ; 12l
+ punpckhwd m17, m18 ; 12h
+ punpcklwd m2, m18, m19 ; 23l
+ punpckhwd m18, m19 ; 23h
+ punpcklwd m3, m19, m20 ; 34l
+ punpckhwd m19, m20 ; 34h
+ punpcklwd m4, m20, m21 ; 45l
+ punpckhwd m20, m21 ; 45h
+ punpcklwd m5, m21, m22 ; 56l
+ punpckhwd m21, m22 ; 56h
+.v_w32_loop:
+ mova m6, m10
+ vpdpwssd m6, m12, m0 ; a0l
+ mova m8, m10
+ vpdpwssd m8, m12, m16 ; a0h
+ mova m7, m10
+ vpdpwssd m7, m12, m1 ; b0l
+ mova m9, m10
+ vpdpwssd m9, m12, m17 ; b0h
+ mova m0, m2
+ vpdpwssd m6, m13, m2 ; a1l
+ mova m16, m18
+ vpdpwssd m8, m13, m18 ; a1h
+ mova m1, m3
+ vpdpwssd m7, m13, m3 ; b1l
+ mova m17, m19
+ vpdpwssd m9, m13, m19 ; b1h
+ mova m2, m4
+ vpdpwssd m6, m14, m4 ; a2l
+ mova m18, m20
+ vpdpwssd m8, m14, m20 ; a2h
+ mova m3, m5
+ vpdpwssd m7, m14, m5 ; b2l
+ mova m19, m21
+ vpdpwssd m9, m14, m21 ; b2h
+ movu m21, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m4, m22, m21 ; 67l
+ punpckhwd m20, m22, m21 ; 67h
+ movu m22, [srcq+ssq*0]
+ vpdpwssd m6, m15, m4 ; a3l
+ vpdpwssd m8, m15, m20 ; a3h
+ punpcklwd m5, m21, m22 ; 78l
+ punpckhwd m21, m22 ; 78h
+ vpdpwssd m7, m15, m5 ; b3l
+ vpdpwssd m9, m15, m21 ; b3h
+ REPX {psrad x, 6}, m6, m8, m7, m9
+ packusdw m6, m8
+ packusdw m7, m9
+ pminsw m6, m11
+ pminsw m7, m11
+ mova [dstq+dsq*0], m6
+ mova [dstq+dsq*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ add r7, 64
+ add r8, 64
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+ jg .v_w32_loop0
+%if WIN64
+ movaps xmm6, [rsp+stack_offset+8]
+%endif
+ vzeroupper
+ RET
+.hv:
+ vpbroadcastw m11, r8m
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ pmovsxbw xmm1, [base+subpel_filters+myq*8]
+ lea r6, [ssq*3]
+ sub srcq, 2
+ sub srcq, r6
+ test dword r8m, 0x800
+ jnz .hv_12bit
+ vpbroadcastd m10, [pd_2176]
+ psllw xmm0, 6
+ jmp .hv_main
+.hv_12bit:
+ vpbroadcastd m10, [pd_640]
+ psllw xmm0, 4
+ psllw xmm1, 2
+.hv_main:
+ mova [buf+ 0], xmm0
+ mova [buf+16], xmm1
+ vpbroadcastd m8, [buf+ 4]
+ vpbroadcastd m9, [buf+ 8]
+ vpbroadcastd ym12, xmm1
+ vpbroadcastd ym13, [buf+20]
+ vpbroadcastd ym14, [buf+24]
+ vpbroadcastd ym15, [buf+28]
+ movu xm4, [srcq+ssq*0]
+ vinserti32x4 ym4, [srcq+ssq*1], 1
+ vinserti32x4 m4, [srcq+ssq*2], 2
+ add srcq, r6
+ vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3
+ movu xm0, [srcq+ssq*1]
+ vinserti32x4 ym0, [srcq+ssq*2], 1
+ add srcq, r6
+ vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti32x4 m2, [spel_h_shufA]
+ mova m3, [spel_h_shuf2b]
+ mova ym6, [spel_h_shuf2a]
+ mova xm7, [spel_shuf2]
+ mova m1, m10
+ pshufb m4, m2
+ pshufb m0, m2
+ punpcklqdq m2, m4, m0
+ vpdpwssd m1, m8, m2 ; 04 15 26 3_
+ punpckhqdq m4, m0
+ vpdpwssd m1, m9, m4
+ vpermb m1, m3, m1 ; 01 12
+ vextracti32x4 xm2, ym1, 1 ; 23 34
+ vextracti32x4 xm3, m1, 2 ; 45 56
+.hv_w2_loop:
+ movu xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym5, [srcq+ssq*0], 1
+ mova xm4, xm10
+ vpermb ym5, ym6, ym5
+ pmaddwd xmm0, xm12, xm1 ; a0 b0
+ vpdpwssd xm4, xm8, xm5
+ vextracti32x4 xm5, ym5, 1
+ mova xm1, xm2
+ vpdpwssd xmm0, xm13, xm2 ; a1 b1
+ vpdpwssd xm4, xm9, xm5 ; 7 8
+ mova xm2, xm3
+ vpdpwssd xmm0, xm14, xm3 ; a2 b2
+ vpermt2b xm3, xm7, xm4 ; 67 78
+ vpdpwssd xmm0, xm15, xm3 ; a3 b3
+ psrad xmm0, 10
+ packusdw xmm0, xmm0
+ pminsw xmm0, xm11
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ vbroadcasti32x4 m19, [spel_h_shufA]
+ vbroadcasti32x4 m20, [spel_h_shufB]
+ mova ym6, [spel_shuf4a]
+ mova ym7, [spel_shuf4b]
+ mova m2, m10
+ mova m3, m10
+ pshufb m1, m4, m19
+ vpdpwssd m2, m8, m1
+ pshufb m1, m0, m19
+ vpdpwssd m3, m8, m1
+ pshufb m4, m20
+ vpdpwssd m2, m9, m4
+ pshufb m0, m20
+ vpdpwssd m3, m9, m0
+ vpermb m1, m6, m2 ; 01 12
+ vshufi32x4 m2, m3, q1032
+ vpermb m3, m6, m3 ; 45 56
+ vpermb m2, m6, m2 ; 23 34
+.hv_w4_loop:
+ movu xm18, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 ym18, [srcq+ssq*0], 1
+ mova ym4, ym10
+ pshufb ym17, ym18, ym19
+ pmaddwd ym16, ym12, ym1 ; a0 b0
+ vpdpwssd ym4, ym8, ym17
+ pshufb ym18, ym20
+ mova ym1, ym2
+ vpdpwssd ym16, ym13, ym2 ; a1 b1
+ vpdpwssd ym4, ym9, ym18 ; 7 8
+ mova ym2, ym3
+ vpdpwssd ym16, ym14, ym3 ; a2 b2
+ vpermt2b ym3, ym7, ym4 ; 67 78
+ vpdpwssd ym16, ym15, ym3 ; a3 b3
+ psrad ym16, 10
+ vextracti128 xm17, ym16, 1
+ packusdw xm16, xm17
+ pminsw xm16, xm11
+ movq [dstq+dsq*0], xm16
+ movhps [dstq+dsq*1], xm16
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ vzeroupper
+ RET
+.hv_w8:
+ shr mxd, 16
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ pmovsxbw xmm1, [base+subpel_filters+myq*8]
+ lea r6, [ssq*3]
+ sub srcq, 6
+ sub srcq, r6
+ test dword r8m, 0x800
+ jnz .hv_w8_12bit
+ vpbroadcastd m10, [pd_2176]
+ psllw xmm0, 6
+ jmp .hv_w8_main
+.hv_w8_12bit:
+ vpbroadcastd m10, [pd_640]
+ psllw xmm0, 4
+ psllw xmm1, 2
+.hv_w8_main:
+ mova [buf+ 0], xmm0
+ mova [buf+16], xmm1
+ vpbroadcastd m12, xmm0
+ vpbroadcastd m13, [buf+ 4]
+ vpbroadcastd m14, [buf+ 8]
+ vpbroadcastd m15, [buf+12]
+ vpbroadcastd m16, xmm1
+ vpbroadcastd m17, [buf+20]
+ vpbroadcastd m18, [buf+24]
+ vpbroadcastd m19, [buf+28]
+ cmp wd, 16
+ je .hv_w16
+ jg .hv_w32
+ mova m5, [spel_h_shufA]
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1
+ movu ym9, [srcq+ssq*2]
+ add srcq, r6
+ vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3
+ movu ym20, [srcq+ssq*1]
+ vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5
+ add srcq, r6
+ movu ym21, [srcq+ssq*0] ; 6
+ movu m6, [spel_h_shufB]
+ movu m7, [spel_h_shufC]
+ vpermb m8, m5, m0
+ mova m1, m10
+ vpdpwssd m1, m12, m8 ; a0 b0
+ vpermb m8, m5, m9
+ mova m2, m10
+ vpdpwssd m2, m12, m8 ; c0 d0
+ vpermb m8, m5, m20
+ mova m3, m10
+ vpdpwssd m3, m12, m8 ; e0 f0
+ vpermb m8, m5, m21
+ mova m4, m10
+ vpdpwssd m4, m12, m8 ; g0
+ vpermb m8, m6, m0
+ vpdpwssd m1, m13, m8 ; a1 b1
+ vpermb m8, m6, m9
+ vpdpwssd m2, m13, m8 ; c1 d1
+ vpermb m8, m6, m20
+ vpdpwssd m3, m13, m8 ; e1 f1
+ vpermb m8, m6, m21
+ vpdpwssd m4, m13, m8 ; g1
+ vpermb m8, m7, m0
+ vpdpwssd m1, m14, m8 ; a2 b2
+ vpermb m8, m7, m9
+ vpdpwssd m2, m14, m8 ; c2 d2
+ vpermb m8, m7, m20
+ vpdpwssd m3, m14, m8 ; e2 f2
+ vpermb m8, m7, m21
+ vpdpwssd m4, m14, m8 ; g2
+ mova m8, [spel_h_shufD]
+ vpermb m0, m8, m0
+ vpdpwssd m1, m15, m0 ; a3 b3
+ mova m0, [spel_shuf8a]
+ vpermb m9, m8, m9
+ vpdpwssd m2, m15, m9 ; c3 d3
+ mova m9, [spel_shuf8b]
+ vpermb m20, m8, m20
+ vpdpwssd m3, m15, m20 ; e3 f3
+ vpermb m21, m8, m21
+ vpdpwssd m4, m15, m21 ; g3
+ vpermt2b m1, m0, m2 ; 01 12
+ vpermt2b m2, m0, m3 ; 23 34
+ vpermt2b m3, m0, m4 ; 45 56
+.hv_w8_loop:
+ movu ym0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m0, [srcq+ssq*0], 1
+ mova m4, m10
+ vpermb m21, m5, m0
+ vpdpwssd m4, m12, m21 ; h0 i0
+ vpermb m21, m6, m0
+ pmaddwd m20, m16, m1 ; A0 B0
+ vpdpwssd m4, m13, m21 ; h1 i1
+ vpermb m21, m7, m0
+ mova m1, m2
+ vpdpwssd m20, m17, m2 ; A1 B1
+ vpdpwssd m4, m14, m21 ; h2 i2
+ vpermb m21, m8, m0
+ mova m2, m3
+ vpdpwssd m20, m18, m3 ; A2 B2
+ vpdpwssd m4, m15, m21 ; h3 i3
+ vpermt2b m3, m9, m4 ; 67 78
+ vpdpwssd m20, m19, m3 ; A3 B3
+ psrad m20, 10
+ vextracti32x8 ym21, m20, 1
+ packusdw ym20, ym21
+ pminsw ym20, ym11
+ mova [dstq+dsq*0], xm20
+ vextracti128 [dstq+dsq*1], ym20, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ vzeroupper
+ RET
+.hv_w16:
+ WIN64_SPILL_XMM 26
+ vbroadcasti32x8 m5, [srcq+ssq*0+ 8]
+ vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0
+ vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0
+ movu ym6, [srcq+ssq*1+ 0]
+ movu ym7, [srcq+ssq*1+16]
+ vinserti32x8 m6, [srcq+ssq*2+ 0], 1
+ vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2
+ add srcq, r6
+ movu ym22, [srcq+ssq*0+ 0]
+ movu ym23, [srcq+ssq*0+16]
+ vinserti32x8 m22, [srcq+ssq*1+ 0], 1
+ vinserti32x8 m23, [srcq+ssq*1+16], 1 ; 3 4
+ movu ym24, [srcq+ssq*2+ 0]
+ movu ym25, [srcq+ssq*2+16]
+ add srcq, r6
+ vinserti32x8 m24, [srcq+ssq*0+ 0], 1
+ vinserti32x8 m25, [srcq+ssq*0+16], 1 ; 5 6
+ vbroadcasti32x4 m20, [spel_h_shufA]
+ vbroadcasti32x4 m21, [spel_h_shufB]
+ mova m9, [spel_shuf16]
+ pshufb m0, m4, m20
+ mova m1, m10
+ vpdpwssd m1, m12, m0 ; a0
+ pshufb m0, m6, m20
+ mova m2, m10
+ vpdpwssd m2, m12, m0 ; b0
+ pshufb m0, m7, m20
+ mova m3, m10
+ vpdpwssd m3, m14, m0 ; c2
+ pshufb m0, m4, m21
+ vpdpwssd m1, m13, m0 ; a1
+ pshufb m0, m6, m21
+ vpdpwssd m2, m13, m0 ; b1
+ pshufb m0, m7, m21
+ vpdpwssd m3, m15, m0 ; c3
+ pshufb m0, m5, m20
+ vpdpwssd m1, m14, m0 ; a2
+ shufpd m6, m7, 0x55
+ pshufb m7, m6, m20
+ vpdpwssd m2, m14, m7 ; b2
+ vpdpwssd m3, m12, m7 ; c0
+ pshufb m5, m21
+ vpdpwssd m1, m15, m5 ; a3
+ pshufb m6, m21
+ vpdpwssd m2, m15, m6 ; b3
+ vpdpwssd m3, m13, m6 ; c1
+ pshufb m0, m22, m20
+ mova m4, m10
+ vpdpwssd m4, m12, m0 ; d0
+ pshufb m0, m23, m20
+ mova m5, m10
+ vpdpwssd m5, m14, m0 ; e2
+ pshufb m0, m24, m20
+ mova m6, m10
+ vpdpwssd m6, m12, m0 ; f0
+ pshufb m0, m25, m20
+ mova m7, m10
+ vpdpwssd m7, m14, m0 ; g2
+ pshufb m0, m22, m21
+ vpdpwssd m4, m13, m0 ; d1
+ pshufb m0, m23, m21
+ vpdpwssd m5, m15, m0 ; e3
+ pshufb m0, m24, m21
+ vpdpwssd m6, m13, m0 ; f1
+ pshufb m0, m25, m21
+ vpdpwssd m7, m15, m0 ; g3
+ shufpd m22, m23, 0x55
+ pshufb m23, m22, m20
+ vpdpwssd m4, m14, m23 ; d2
+ vpdpwssd m5, m12, m23 ; e0
+ shufpd m24, m25, 0x55
+ pshufb m25, m24, m20
+ vpdpwssd m6, m14, m25 ; f2
+ vpdpwssd m7, m12, m25 ; g0
+ pshufb m22, m21
+ vpdpwssd m4, m15, m22 ; d3
+ vpdpwssd m5, m13, m22 ; e1
+ pshufb m24, m21
+ vpdpwssd m6, m15, m24 ; f3
+ vpdpwssd m7, m13, m24 ; g1
+ pslldq m1, 1
+ vpermt2b m2, m9, m3 ; 12
+ vpermt2b m4, m9, m5 ; 34
+ vpermt2b m6, m9, m7 ; 56
+ vpshrdd m1, m2, 16 ; 01
+ vpshrdd m3, m2, m4, 16 ; 23
+ vpshrdd m5, m4, m6, 16 ; 45
+.hv_w16_loop:
+ movu ym24, [srcq+ssq*1+ 0]
+ movu ym25, [srcq+ssq*1+16]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m24, [srcq+ssq*0+ 0], 1
+ vinserti32x8 m25, [srcq+ssq*0+16], 1
+ mova m7, m10
+ mova m8, m10
+ pshufb m0, m24, m20
+ vpdpwssd m7, m12, m0 ; h0
+ pshufb m0, m25, m20
+ vpdpwssd m8, m14, m0 ; i2
+ pmaddwd m22, m16, m1 ; A0
+ mova m1, m3
+ pmaddwd m23, m16, m2 ; B0
+ mova m2, m4
+ pshufb m0, m24, m21
+ vpdpwssd m7, m13, m0 ; h1
+ pshufb m0, m25, m21
+ vpdpwssd m8, m15, m0 ; i3
+ vpdpwssd m22, m17, m3 ; A1
+ mova m3, m5
+ vpdpwssd m23, m17, m4 ; B1
+ mova m4, m6
+ shufpd m24, m25, 0x55
+ pshufb m25, m24, m20
+ vpdpwssd m7, m14, m25 ; h2
+ vpdpwssd m8, m12, m25 ; i0
+ vpdpwssd m22, m18, m5 ; A2
+ vpdpwssd m23, m18, m6 ; B2
+ pshufb m24, m21
+ vpdpwssd m7, m15, m24 ; h3
+ vpdpwssd m8, m13, m24 ; i1
+ vpermt2b m7, m9, m8 ; 78
+ vpshrdd m5, m6, m7, 16 ; 67
+ vpdpwssd m22, m19, m5 ; A3
+ vpdpwssd m23, m19, m7 ; B3
+ mova m6, m7
+ psrad m22, 10
+ psrad m23, 10
+ vshufi32x4 m0, m22, m23, q3232
+ vinserti32x8 m22, ym23, 1
+ packusdw m22, m0
+ pminsw m22, m11
+ mova [dstq+dsq*0], ym22
+ vextracti32x8 [dstq+dsq*1], m22, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 32
+ vbroadcasti32x4 m20, [spel_h_shufA]
+ vbroadcasti32x4 m21, [spel_h_shufB]
+ mova m22, [spel_shuf32]
+ lea wd, [hq+wq*8-256]
+ mov r7, srcq
+ mov r8, dstq
+.hv_w32_loop0:
+ movu m6, [srcq+ssq*0+ 0]
+ movu m7, [srcq+ssq*0+ 8]
+ movu m8, [srcq+ssq*0+16]
+ mova m0, m10
+ mova m23, m10
+ pshufb m9, m6, m20
+ vpdpwssd m0, m12, m9 ; a0l
+ pshufb m9, m7, m20
+ vpdpwssd m23, m12, m9 ; a0h
+ vpdpwssd m0, m14, m9 ; a2l
+ pshufb m7, m21
+ vpdpwssd m23, m13, m7 ; a1h
+ vpdpwssd m0, m15, m7 ; a3l
+ pshufb m7, m8, m20
+ vpdpwssd m23, m14, m7 ; a2h
+ pshufb m6, m21
+ vpdpwssd m0, m13, m6 ; a1l
+ pshufb m8, m21
+ vpdpwssd m23, m15, m8 ; a3h
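+; Horizontally filters two more source rows for the 32-pixel-wide HV path
+; (low and high 16-pixel halves kept in separate accumulators) and packs
+; them with the previous row pair via spel_shuf32.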
+%macro PUT_8TAP_HV_W32 5 ; dst_lo, dst_hi, stride_name, stride[1-2]
+ movu m6, [srcq+%3*%4+ 0]
+ movu m7, [srcq+%3*%4+ 8]
+ movu m8, [srcq+%3*%4+16]
+%if %4 == 2
+ add srcq, r6
+%endif
+ movu m29, [srcq+%3*%5+ 0]
+ movu m30, [srcq+%3*%5+ 8]
+ movu m31, [srcq+%3*%5+16]
+%if %5 == 2
+ add srcq, r6
+%endif
+ mova m%1, m10
+ mova m9, m10
+ pshufb m%2, m6, m20
+ vpdpwssd m%1, m12, m%2 ; x0l
+ pshufb m%2, m29, m20
+ vpdpwssd m9, m12, m%2 ; y0l
+ pshufb m6, m21
+ vpdpwssd m%1, m13, m6 ; x1l
+ pshufb m29, m21
+ vpdpwssd m9, m13, m29 ; y1l
+ pshufb m6, m7, m20
+ mova m%2, m10
+ vpdpwssd m%2, m12, m6 ; x0h
+ pshufb m29, m30, m20
+ vpdpwssd m%1, m14, m6 ; x2l
+ mova m6, m10
+ vpdpwssd m6, m12, m29 ; y0h
+ pshufb m7, m21
+ vpdpwssd m9, m14, m29 ; y2l
+ pshufb m30, m21
+ vpdpwssd m%2, m13, m7 ; x1h
+ vpdpwssd m%1, m15, m7 ; x3l
+ pshufb m7, m8, m20
+ vpdpwssd m6, m13, m30 ; y1h
+ vpdpwssd m9, m15, m30 ; y3l
+ pshufb m30, m31, m20
+ vpdpwssd m%2, m14, m7 ; x2h
+ pshufb m8, m21
+ vpdpwssd m6, m14, m30 ; y2h
+ pshufb m31, m21
+ vpdpwssd m%2, m15, m8 ; x3h
+ vpdpwssd m6, m15, m31 ; y3h
+%if %1 == 1
+ vpermt2b m0, m22, m%1 ; 01l
+ vpermt2b m23, m22, m%2 ; 01h
+%endif
+ vpermt2b m%1, m22, m9 ; xyl
+ vpermt2b m%2, m22, m6 ; xyh
+%endmacro
+ PUT_8TAP_HV_W32 1, 24, ssq, 1, 2 ; 12
+ PUT_8TAP_HV_W32 3, 26, ssq, 0, 1 ; 34
+ PUT_8TAP_HV_W32 5, 28, ssq, 2, 0 ; 56
+ vpshrdd m2, m1, m3, 16 ; 23l
+ vpshrdd m25, m24, m26, 16 ; 23h
+ vpshrdd m4, m3, m5, 16 ; 45l
+ vpshrdd m27, m26, m28, 16 ; 45h
+.hv_w32_loop:
+ movu m7, [srcq+ssq*1+ 0]
+ movu m9, [srcq+ssq*2+ 0]
+ movu m6, [srcq+ssq*1+ 8]
+ movu m8, [srcq+ssq*2+ 8]
+ mova m29, m10
+ mova m31, m10
+ pshufb m30, m7, m20
+ vpdpwssd m29, m12, m30 ; h0l
+ pshufb m30, m9, m20
+ vpdpwssd m31, m12, m30 ; i0l
+ pshufb m7, m21
+ vpdpwssd m29, m13, m7 ; h1l
+ pshufb m9, m21
+ vpdpwssd m31, m13, m9 ; i1l
+ pshufb m7, m6, m20
+ vpdpwssd m29, m14, m7 ; h2l
+ pshufb m9, m8, m20
+ vpdpwssd m31, m14, m9 ; i2l
+ pshufb m6, m21
+ vpdpwssd m29, m15, m6 ; h3l
+ pshufb m8, m21
+ vpdpwssd m31, m15, m8 ; i3l
+ mova m30, m10
+ vpdpwssd m30, m12, m7 ; h0h
+ movu m7, [srcq+ssq*1+16]
+ lea srcq, [srcq+ssq*2]
+ vpermt2b m29, m22, m31 ; 78l
+ mova m31, m10
+ vpdpwssd m31, m12, m9 ; i0h
+ movu m9, [srcq+ssq*0+16]
+ vpdpwssd m30, m13, m6 ; h1h
+ pshufb m6, m7, m20
+ vpdpwssd m31, m13, m8 ; i1h
+ pshufb m8, m9, m20
+ vpdpwssd m30, m14, m6 ; h2h
+ pmaddwd m6, m16, m0 ; A0l
+ pshufb m7, m21
+ vpdpwssd m31, m14, m8 ; i2h
+ pmaddwd m8, m16, m23 ; A0h
+ pshufb m9, m21
+ vpdpwssd m30, m15, m7 ; h3h
+ pmaddwd m7, m16, m1 ; B0l
+ vpdpwssd m31, m15, m9 ; i3h
+ pmaddwd m9, m16, m24 ; B0h
+ mova m0, m2
+ vpdpwssd m6, m17, m2 ; A1l
+ mova m23, m25
+ vpdpwssd m8, m17, m25 ; A1h
+ mova m1, m3
+ vpdpwssd m7, m17, m3 ; B1l
+ mova m24, m26
+ vpdpwssd m9, m17, m26 ; B1h
+ vpermt2b m30, m22, m31 ; 78h
+ vpdpwssd m6, m18, m4 ; A2l
+ mova m2, m4
+ vpdpwssd m8, m18, m27 ; A2h
+ mova m25, m27
+ vpdpwssd m7, m18, m5 ; B2l
+ mova m3, m5
+ vpdpwssd m9, m18, m28 ; B2h
+ mova m26, m28
+ vpshrdd m4, m5, m29, 16 ; 67l
+ vpdpwssd m6, m19, m4 ; A3l
+ vpshrdd m27, m28, m30, 16 ; 67h
+ vpdpwssd m8, m19, m27 ; A3h
+ mova m5, m29
+ vpdpwssd m7, m19, m29 ; B3l
+ mova m28, m30
+ vpdpwssd m9, m19, m30 ; B3h
+ REPX {psrad x, 10}, m6, m8, m7, m9
+ packusdw m6, m8
+ packusdw m7, m9
+ pminsw m6, m11
+ pminsw m7, m11
+ mova [dstq+dsq*0], m6
+ mova [dstq+dsq*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w32_loop
+ add r7, 64
+ add r8, 64
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+ jg .hv_w32_loop0
+ RET
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+MC_8TAP_FN prep, sharp, SHARP, SHARP
+MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH
+MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP
+MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH
+MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR
+MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP
+MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR
+MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH
+MC_8TAP_FN prep, regular, REGULAR, REGULAR
+
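+; prep_8tap_16bpc: 8-tap subpel filtering into the packed int16 intermediate
+; buffer used by the compound functions. mx/my select the .h, .v and .hv
+; paths below; without a subpel offset it falls through to the plain prep
+; copy path via the width jump table. All filtering is done with vpdpwssd
+; dot products on pairs of 16-bit taps.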
+cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3
+%define base r7-prep_avx512icl
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep_avx512icl]
+ mov wd, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ mov r5d, r7m ; bitdepth_max
+ vpbroadcastd m5, [pw_8192]
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ shr r5d, 11
+ vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4]
+ add wq, r7
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ mov r5d, r7m
+ vbroadcasti32x4 m4, [spel_h_shufA]
+ vbroadcasti32x4 m5, [spel_h_shufB]
+ shr r5d, 11
+ mova ym9, [prep_endA]
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ mova [tmpq], xmm0
+ vpbroadcastd m6, [tmpq+4]
+ vpbroadcastd m7, [tmpq+8]
+.h_w4_loop:
+ movu xm2, [srcq+strideq*0]
+ vinserti32x4 ym2, [srcq+strideq*1], 1
+ vinserti32x4 m2, [srcq+strideq*2], 2
+ vinserti32x4 m2, [srcq+r6 ], 3
+ lea srcq, [srcq+strideq*4]
+ mova m0, m10
+ pshufb m1, m2, m4
+ vpdpwssd m0, m6, m1
+ pshufb m2, m5
+ vpdpwssd m0, m7, m2
+ vpermb m0, m9, m0
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m10, [prep_8tap_rnd]
+ lea r6, [strideq*3]
+ cmp wd, 4
+ je .h_w4
+ shr mxd, 16
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ mov r5d, r7m
+ sub srcq, 6
+ shr r5d, 11
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ mova [tmpq], xmm0
+ vpbroadcastd m12, xmm0
+ vpbroadcastd m13, [tmpq+ 4]
+ vpbroadcastd m14, [tmpq+ 8]
+ vpbroadcastd m15, [tmpq+12]
+ cmp wd, 16
+ je .h_w16
+ jg .h_w32
+.h_w8:
+ mova m6, [spel_h_shufA]
+ movu m7, [spel_h_shufB]
+ movu m8, [spel_h_shufC]
+ mova m9, [spel_h_shufD]
+ mova m11, [prep_endB]
+.h_w8_loop:
+ movu ym4, [srcq+strideq*0]
+ vinserti32x8 m4, [srcq+strideq*1], 1
+ movu ym5, [srcq+strideq*2]
+ vinserti32x8 m5, [srcq+r6 ], 1
+ lea srcq, [srcq+strideq*4]
+ mova m0, m10
+ mova m1, m10
+ vpermb m2, m6, m4
+ vpermb m3, m6, m5
+ vpdpwssd m0, m12, m2
+ vpdpwssd m1, m12, m3
+ vpermb m2, m7, m4
+ vpermb m3, m7, m5
+ vpdpwssd m0, m13, m2
+ vpdpwssd m1, m13, m3
+ vpermb m2, m8, m4
+ vpermb m3, m8, m5
+ vpdpwssd m0, m14, m2
+ vpdpwssd m1, m14, m3
+ vpermb m2, m9, m4
+ vpermb m3, m9, m5
+ vpdpwssd m0, m15, m2
+ vpdpwssd m1, m15, m3
+ vpermt2b m0, m11, m1
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ mova m11, [prep_endC]
+.h_w16_loop:
+ movu ym2, [srcq+strideq*0+ 0]
+ vinserti32x8 m2, [srcq+strideq*1+ 0], 1
+ movu ym3, [srcq+strideq*0+16]
+ vinserti32x8 m3, [srcq+strideq*1+16], 1
+ lea srcq, [srcq+strideq*2]
+ mova m0, m10
+ mova m1, m10
+ pshufb m4, m2, m6
+ vpdpwssd m0, m12, m4 ; a0
+ pshufb m4, m3, m6
+ vpdpwssd m1, m14, m4 ; b2
+ pshufb m4, m2, m7
+ vpdpwssd m0, m13, m4 ; a1
+ pshufb m4, m3, m7
+ vpdpwssd m1, m15, m4 ; b3
+ shufpd m2, m3, 0x55
+ pshufb m4, m2, m6
+ vpdpwssd m0, m14, m4 ; a2
+ vpdpwssd m1, m12, m4 ; b0
+ pshufb m2, m7
+ vpdpwssd m0, m15, m2 ; a3
+ vpdpwssd m1, m13, m2 ; b1
+ vpermt2b m0, m11, m1
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ lea srcq, [srcq+wq*2]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ neg wq
+ mova m11, [prep_endC]
+.h_w32_loop0:
+ mov r6, wq
+.h_w32_loop:
+ movu m2, [srcq+r6*2+ 0]
+ movu m3, [srcq+r6*2+ 8]
+ mova m0, m10
+ mova m1, m10
+ pshufb m4, m2, m6
+ vpdpwssd m0, m12, m4 ; a0
+ pshufb m4, m3, m6
+ vpdpwssd m1, m12, m4 ; b0
+ vpdpwssd m0, m14, m4 ; a2
+ movu m4, [srcq+r6*2+16]
+ pshufb m3, m7
+ vpdpwssd m1, m13, m3 ; b1
+ vpdpwssd m0, m15, m3 ; a3
+ pshufb m3, m4, m6
+ vpdpwssd m1, m14, m3 ; b2
+ pshufb m2, m7
+ vpdpwssd m0, m13, m2 ; a1
+ pshufb m4, m7
+ vpdpwssd m1, m15, m4 ; b3
+ vpermt2b m0, m11, m1
+ mova [tmpq], m0
+ add tmpq, 64
+ add r6, 32
+ jl .h_w32_loop
+ add srcq, strideq
+ dec hd
+ jg .h_w32_loop0
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ mov r5d, r7m
+ vpbroadcastd m10, [prep_8tap_rnd]
+ pmovsxbw xmm0, [base+subpel_filters+myq*8]
+ tzcnt r6d, wd
+ shr r5d, 11
+ movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)]
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ add r7, r6
+ lea r6, [strideq*3]
+ sub srcq, r6
+ mova [tmpq], xmm0
+ vpbroadcastd m12, xmm0
+ vpbroadcastd m13, [tmpq+ 4]
+ vpbroadcastd m14, [tmpq+ 8]
+ vpbroadcastd m15, [tmpq+12]
+ jmp r7
+.v_w4:
+ movq xmm1, [srcq+strideq*0]
+ vpbroadcastq ymm0, [srcq+strideq*1]
+ vpbroadcastq ymm2, [srcq+strideq*2]
+ add srcq, r6
+ vpbroadcastq ymm4, [srcq+strideq*0]
+ vpbroadcastq ymm3, [srcq+strideq*1]
+ vpbroadcastq ymm5, [srcq+strideq*2]
+ mova xm11, [prep_endA]
+ add srcq, r6
+ vpblendd ymm1, ymm0, 0x30
+ vpblendd ymm0, ymm2, 0x30
+ punpcklwd ymm1, ymm0 ; 01 12
+ vpbroadcastq ymm0, [srcq+strideq*0]
+ vpblendd ymm2, ymm4, 0x30
+ vpblendd ymm4, ymm3, 0x30
+ punpcklwd ymm2, ymm4 ; 23 34
+ vpblendd ymm3, ymm5, 0x30
+ vpblendd ymm5, ymm0, 0x30
+ punpcklwd ymm3, ymm5 ; 45 56
+.v_w4_loop:
+ vpbroadcastq ymm5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ mova ymm4, ym10
+ vpdpwssd ymm4, ym12, ymm1 ; a0 b0
+ mova ymm1, ymm2
+ vpdpwssd ymm4, ym13, ymm2 ; a1 b1
+ mova ymm2, ymm3
+ vpdpwssd ymm4, ym14, ymm3 ; a2 b2
+ vpblendd ymm3, ymm0, ymm5, 0x30
+ vpbroadcastq ymm0, [srcq+strideq*0]
+ vpblendd ymm5, ymm0, 0x30
+ punpcklwd ymm3, ymm5 ; 67 78
+ vpdpwssd ymm4, ym15, ymm3 ; a3 b3
+ vpermb ymm4, ym11, ymm4
+ mova [tmpq], xmm4
+ add tmpq, 16
+ sub hd, 2
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ vbroadcasti32x4 m2, [srcq+strideq*2]
+ vinserti32x4 m1, m2, [srcq+strideq*0], 0
+ vinserti32x4 m1, [srcq+strideq*1], 1 ; 0 1 2
+ add srcq, r6
+ vinserti32x4 ym2, [srcq+strideq*0], 1
+ vinserti32x4 m2, [srcq+strideq*1], 2 ; 2 3 4
+ mova m6, [spel_v_shuf8]
+ movu xm0, [srcq+strideq*1]
+ vinserti32x4 ym0, [srcq+strideq*2], 1
+ add srcq, r6
+ vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6
+ mova ym11, [prep_endB]
+ vpermb m1, m6, m1 ; 01 12
+ vpermb m2, m6, m2 ; 23 34
+ vpermb m3, m6, m0 ; 45 56
+.v_w8_loop:
+ vinserti32x4 m0, [srcq+strideq*1], 3
+ lea srcq, [srcq+strideq*2]
+ movu xm5, [srcq+strideq*0]
+ mova m4, m10
+ vpdpwssd m4, m12, m1 ; a0 b0
+ mova m1, m2
+ vshufi32x4 m0, m5, q1032 ; 6 7 8
+ vpdpwssd m4, m13, m2 ; a1 b1
+ mova m2, m3
+ vpdpwssd m4, m14, m3 ; a2 b2
+ vpermb m3, m6, m0 ; 67 78
+ vpdpwssd m4, m15, m3 ; a3 b3
+ vpermb m4, m11, m4
+ mova [tmpq], ym4
+ add tmpq, 32
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ vbroadcasti32x8 m1, [srcq+strideq*1]
+ vinserti32x8 m0, m1, [srcq+strideq*0], 0
+ vinserti32x8 m1, [srcq+strideq*2], 1
+ mova m8, [spel_v_shuf16]
+ add srcq, r6
+ movu ym3, [srcq+strideq*0]
+ vinserti32x8 m3, [srcq+strideq*1], 1
+ movu ym5, [srcq+strideq*2]
+ add srcq, r6
+ vinserti32x8 m5, [srcq+strideq*0], 1
+ mova m11, [prep_endA]
+ vpermb m0, m8, m0 ; 01
+ vpermb m1, m8, m1 ; 12
+ vpermb m3, m8, m3 ; 34
+ vpermb m5, m8, m5 ; 56
+ vpshrdd m2, m1, m3, 16 ; 23
+ vpshrdd m4, m3, m5, 16 ; 45
+.v_w16_loop:
+ mova m6, m10
+ mova m7, m10
+ vpdpwssd m6, m12, m0 ; a0
+ mova m0, m2
+ vpdpwssd m7, m12, m1 ; b0
+ mova m1, m3
+ vpdpwssd m6, m13, m2 ; a1
+ mova m2, m4
+ vpdpwssd m7, m13, m3 ; b1
+ mova m3, m5
+ vpdpwssd m6, m14, m4 ; a2
+ mova m4, m5
+ vpdpwssd m7, m14, m5 ; b2
+ movu ym5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m5, [srcq+strideq*0], 1
+ vpermb m5, m8, m5 ; 78
+ vpshrdd m4, m5, 16 ; 67
+ vpdpwssd m6, m15, m4 ; a3
+ vpdpwssd m7, m15, m5 ; b3
+ vpermt2b m6, m11, m7
+ mova [tmpq], m6
+ add tmpq, 64
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+.v_w64:
+.v_w128:
+%if WIN64
+ PUSH r8
+ movaps [rsp+stack_offset+8], xmm6
+%endif
+ lea r5, [hq+wq*8-256]
+ mov r7, srcq
+ mov r8, tmpq
+.v_w32_loop0:
+ movu m16, [srcq+strideq*0]
+ movu m17, [srcq+strideq*1]
+ movu m18, [srcq+strideq*2]
+ add srcq, r6
+ movu m19, [srcq+strideq*0]
+ movu m20, [srcq+strideq*1]
+ movu m21, [srcq+strideq*2]
+ add srcq, r6
+ movu m22, [srcq+strideq*0]
+ mova m11, [prep_endC]
+ punpcklwd m0, m16, m17 ; 01l
+ punpckhwd m16, m17 ; 01h
+ punpcklwd m1, m17, m18 ; 12l
+ punpckhwd m17, m18 ; 12h
+ punpcklwd m2, m18, m19 ; 23l
+ punpckhwd m18, m19 ; 23h
+ punpcklwd m3, m19, m20 ; 34l
+ punpckhwd m19, m20 ; 34h
+ punpcklwd m4, m20, m21 ; 45l
+ punpckhwd m20, m21 ; 45h
+ punpcklwd m5, m21, m22 ; 56l
+ punpckhwd m21, m22 ; 56h
+.v_w32_loop:
+ mova m6, m10
+ vpdpwssd m6, m12, m0 ; a0l
+ mova m8, m10
+ vpdpwssd m8, m12, m16 ; a0h
+ mova m7, m10
+ vpdpwssd m7, m12, m1 ; b0l
+ mova m9, m10
+ vpdpwssd m9, m12, m17 ; b0h
+ mova m0, m2
+ vpdpwssd m6, m13, m2 ; a1l
+ mova m16, m18
+ vpdpwssd m8, m13, m18 ; a1h
+ mova m1, m3
+ vpdpwssd m7, m13, m3 ; b1l
+ mova m17, m19
+ vpdpwssd m9, m13, m19 ; b1h
+ mova m2, m4
+ vpdpwssd m6, m14, m4 ; a2l
+ mova m18, m20
+ vpdpwssd m8, m14, m20 ; a2h
+ mova m3, m5
+ vpdpwssd m7, m14, m5 ; b2l
+ mova m19, m21
+ vpdpwssd m9, m14, m21 ; b2h
+ movu m21, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklwd m4, m22, m21 ; 67l
+ punpckhwd m20, m22, m21 ; 67h
+ movu m22, [srcq+strideq*0]
+ vpdpwssd m6, m15, m4 ; a3l
+ vpdpwssd m8, m15, m20 ; a3h
+ punpcklwd m5, m21, m22 ; 78l
+ punpckhwd m21, m22 ; 78h
+ vpdpwssd m7, m15, m5 ; b3l
+ vpdpwssd m9, m15, m21 ; b3h
+ vpermt2b m6, m11, m8
+ vpermt2b m7, m11, m9
+ mova [tmpq+wq*0], m6
+ mova [tmpq+wq*2], m7
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w32_loop
+ add r7, 64
+ add r8, 64
+ movzx hd, r5b
+ mov srcq, r7
+ mov tmpq, r8
+ sub r5d, 1<<8
+ jg .v_w32_loop0
+%if WIN64
+ movaps xmm6, [rsp+stack_offset+8]
+ POP r8
+%endif
+ vzeroupper
+ RET
+.hv:
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ mov r5d, r7m
+ pmovsxbw xmm1, [base+subpel_filters+myq*8]
+ lea r6, [strideq*3]
+ sub srcq, 2
+ shr r5d, 11
+ sub srcq, r6
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ psllw xmm1, 2
+ vpbroadcastd m10, [prep_8tap_rnd]
+ vpbroadcastd ym11, [pd_128]
+ mova xm21, [prep_endA]
+ mova [tmpq+ 0], xmm0
+ mova [tmpq+16], xmm1
+ vpbroadcastd m8, [tmpq+ 4]
+ vpbroadcastd m9, [tmpq+ 8]
+ vpbroadcastd ym12, xmm1
+ vpbroadcastd ym13, [tmpq+20]
+ vpbroadcastd ym14, [tmpq+24]
+ vpbroadcastd ym15, [tmpq+28]
+ movu xm4, [srcq+strideq*0]
+ vinserti32x4 ym4, [srcq+strideq*1], 1
+ vinserti32x4 m4, [srcq+strideq*2], 2
+ add srcq, r6
+ vinserti32x4 m4, [srcq+strideq*0], 3 ; 0 1 2 3
+ movu xm0, [srcq+strideq*1]
+ vinserti32x4 ym0, [srcq+strideq*2], 1
+ add srcq, r6
+ vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6
+ vbroadcasti32x4 m19, [spel_h_shufA]
+ vbroadcasti32x4 m20, [spel_h_shufB]
+ mova ym6, [spel_shuf4a]
+ mova ym7, [spel_shuf4b]
+ mova m2, m10
+ mova m3, m10
+ pshufb m1, m4, m19
+ vpdpwssd m2, m8, m1
+ pshufb m1, m0, m19
+ vpdpwssd m3, m8, m1
+ pshufb m4, m20
+ vpdpwssd m2, m9, m4
+ pshufb m0, m20
+ vpdpwssd m3, m9, m0
+ vpermb m1, m6, m2 ; 01 12
+ vshufi32x4 m2, m3, q1032
+ vpermb m3, m6, m3 ; 45 56
+ vpermb m2, m6, m2 ; 23 34
+.hv_w4_loop:
+ movu xm18, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti128 ym18, [srcq+strideq*0], 1
+ mova ym16, ym11
+ mova ym4, ym10
+ pshufb ym17, ym18, ym19
+ vpdpwssd ym16, ym12, ym1 ; a0 b0
+ vpdpwssd ym4, ym8, ym17
+ pshufb ym18, ym20
+ mova ym1, ym2
+ vpdpwssd ym16, ym13, ym2 ; a1 b1
+ vpdpwssd ym4, ym9, ym18 ; 7 8
+ mova ym2, ym3
+ vpdpwssd ym16, ym14, ym3 ; a2 b2
+ vpermt2b ym3, ym7, ym4 ; 67 78
+ vpdpwssd ym16, ym15, ym3 ; a3 b3
+ vpermb ym16, ym21, ym16
+ mova [tmpq], xm16
+ add tmpq, 16
+ sub hd, 2
+ jg .hv_w4_loop
+ vzeroupper
+ RET
+.hv_w8:
+ shr mxd, 16
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ mov r5d, r7m
+ pmovsxbw xmm1, [base+subpel_filters+myq*8]
+ lea r6, [strideq*3]
+ sub srcq, 6
+ shr r5d, 11
+ sub srcq, r6
+ vpbroadcastd m10, [prep_8tap_rnd]
+ vpbroadcastd m11, [pd_128]
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ psllw xmm1, 2
+ mova [tmpq+ 0], xmm0
+ mova [tmpq+16], xmm1
+ vpbroadcastd m12, xmm0
+ vpbroadcastd m13, [tmpq+ 4]
+ vpbroadcastd m14, [tmpq+ 8]
+ vpbroadcastd m15, [tmpq+12]
+ vpbroadcastd m16, xmm1
+ vpbroadcastd m17, [tmpq+20]
+ vpbroadcastd m18, [tmpq+24]
+ vpbroadcastd m19, [tmpq+28]
+ cmp wd, 16
+ je .hv_w16
+ jg .hv_w32
+ WIN64_SPILL_XMM 23
+ mova m5, [spel_h_shufA]
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1
+ movu ym9, [srcq+strideq*2]
+ add srcq, r6
+ vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3
+ movu ym20, [srcq+strideq*1]
+ vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5
+ add srcq, r6
+ movu ym21, [srcq+strideq*0] ; 6
+ movu m6, [spel_h_shufB]
+ movu m7, [spel_h_shufC]
+ mova ym22, [prep_endB]
+ vpermb m8, m5, m0
+ mova m1, m10
+ vpdpwssd m1, m12, m8 ; a0 b0
+ vpermb m8, m5, m9
+ mova m2, m10
+ vpdpwssd m2, m12, m8 ; c0 d0
+ vpermb m8, m5, m20
+ mova m3, m10
+ vpdpwssd m3, m12, m8 ; e0 f0
+ vpermb m8, m5, m21
+ mova m4, m10
+ vpdpwssd m4, m12, m8 ; g0
+ vpermb m8, m6, m0
+ vpdpwssd m1, m13, m8 ; a1 b1
+ vpermb m8, m6, m9
+ vpdpwssd m2, m13, m8 ; c1 d1
+ vpermb m8, m6, m20
+ vpdpwssd m3, m13, m8 ; e1 f1
+ vpermb m8, m6, m21
+ vpdpwssd m4, m13, m8 ; g1
+ vpermb m8, m7, m0
+ vpdpwssd m1, m14, m8 ; a2 b2
+ vpermb m8, m7, m9
+ vpdpwssd m2, m14, m8 ; c2 d2
+ vpermb m8, m7, m20
+ vpdpwssd m3, m14, m8 ; e2 f2
+ vpermb m8, m7, m21
+ vpdpwssd m4, m14, m8 ; g2
+ mova m8, [spel_h_shufD]
+ vpermb m0, m8, m0
+ vpdpwssd m1, m15, m0 ; a3 b3
+ mova m0, [spel_shuf8a]
+ vpermb m9, m8, m9
+ vpdpwssd m2, m15, m9 ; c3 d3
+ mova m9, [spel_shuf8b]
+ vpermb m20, m8, m20
+ vpdpwssd m3, m15, m20 ; e3 f3
+ vpermb m21, m8, m21
+ vpdpwssd m4, m15, m21 ; g3
+ vpermt2b m1, m0, m2 ; 01 12
+ vpermt2b m2, m0, m3 ; 23 34
+ vpermt2b m3, m0, m4 ; 45 56
+.hv_w8_loop:
+ movu ym0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m0, [srcq+strideq*0], 1
+ mova m4, m10
+ mova m20, m11
+ vpermb m21, m5, m0
+ vpdpwssd m4, m12, m21 ; h0 i0
+ vpermb m21, m6, m0
+ vpdpwssd m20, m16, m1 ; A0 B0
+ vpdpwssd m4, m13, m21 ; h1 i1
+ vpermb m21, m7, m0
+ mova m1, m2
+ vpdpwssd m20, m17, m2 ; A1 B1
+ vpdpwssd m4, m14, m21 ; h2 i2
+ vpermb m21, m8, m0
+ mova m2, m3
+ vpdpwssd m20, m18, m3 ; A2 B2
+ vpdpwssd m4, m15, m21 ; h3 i3
+ vpermt2b m3, m9, m4 ; 67 78
+ vpdpwssd m20, m19, m3 ; A3 B3
+ vpermb m20, m22, m20
+ mova [tmpq], ym20
+ add tmpq, 32
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 27
+ vbroadcasti32x8 m5, [srcq+strideq*0+ 8]
+ vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0
+ vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0
+ movu ym6, [srcq+strideq*1+ 0]
+ movu ym7, [srcq+strideq*1+16]
+ vinserti32x8 m6, [srcq+strideq*2+ 0], 1
+ vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2
+ add srcq, r6
+ movu ym22, [srcq+strideq*0+ 0]
+ movu ym23, [srcq+strideq*0+16]
+ vinserti32x8 m22, [srcq+strideq*1+ 0], 1
+ vinserti32x8 m23, [srcq+strideq*1+16], 1 ; 3 4
+ movu ym24, [srcq+strideq*2+ 0]
+ movu ym25, [srcq+strideq*2+16]
+ add srcq, r6
+ vinserti32x8 m24, [srcq+strideq*0+ 0], 1
+ vinserti32x8 m25, [srcq+strideq*0+16], 1 ; 5 6
+ vbroadcasti32x4 m20, [spel_h_shufA]
+ vbroadcasti32x4 m21, [spel_h_shufB]
+ mova m9, [spel_shuf16]
+ mova m26, [prep_endB]
+ pshufb m0, m4, m20
+ mova m1, m10
+ vpdpwssd m1, m12, m0 ; a0
+ pshufb m0, m6, m20
+ mova m2, m10
+ vpdpwssd m2, m12, m0 ; b0
+ pshufb m0, m7, m20
+ mova m3, m10
+ vpdpwssd m3, m14, m0 ; c2
+ pshufb m0, m4, m21
+ vpdpwssd m1, m13, m0 ; a1
+ pshufb m0, m6, m21
+ vpdpwssd m2, m13, m0 ; b1
+ pshufb m0, m7, m21
+ vpdpwssd m3, m15, m0 ; c3
+ pshufb m0, m5, m20
+ vpdpwssd m1, m14, m0 ; a2
+ shufpd m6, m7, 0x55
+ pshufb m7, m6, m20
+ vpdpwssd m2, m14, m7 ; b2
+ vpdpwssd m3, m12, m7 ; c0
+ pshufb m5, m21
+ vpdpwssd m1, m15, m5 ; a3
+ pshufb m6, m21
+ vpdpwssd m2, m15, m6 ; b3
+ vpdpwssd m3, m13, m6 ; c1
+ pshufb m0, m22, m20
+ mova m4, m10
+ vpdpwssd m4, m12, m0 ; d0
+ pshufb m0, m23, m20
+ mova m5, m10
+ vpdpwssd m5, m14, m0 ; e2
+ pshufb m0, m24, m20
+ mova m6, m10
+ vpdpwssd m6, m12, m0 ; f0
+ pshufb m0, m25, m20
+ mova m7, m10
+ vpdpwssd m7, m14, m0 ; g2
+ pshufb m0, m22, m21
+ vpdpwssd m4, m13, m0 ; d1
+ pshufb m0, m23, m21
+ vpdpwssd m5, m15, m0 ; e3
+ pshufb m0, m24, m21
+ vpdpwssd m6, m13, m0 ; f1
+ pshufb m0, m25, m21
+ vpdpwssd m7, m15, m0 ; g3
+ shufpd m22, m23, 0x55
+ pshufb m23, m22, m20
+ vpdpwssd m4, m14, m23 ; d2
+ vpdpwssd m5, m12, m23 ; e0
+ shufpd m24, m25, 0x55
+ pshufb m25, m24, m20
+ vpdpwssd m6, m14, m25 ; f2
+ vpdpwssd m7, m12, m25 ; g0
+ pshufb m22, m21
+ vpdpwssd m4, m15, m22 ; d3
+ vpdpwssd m5, m13, m22 ; e1
+ pshufb m24, m21
+ vpdpwssd m6, m15, m24 ; f3
+ vpdpwssd m7, m13, m24 ; g1
+ pslldq m1, 1
+ vpermt2b m2, m9, m3 ; 12
+ vpermt2b m4, m9, m5 ; 34
+ vpermt2b m6, m9, m7 ; 56
+ vpshrdd m1, m2, 16 ; 01
+ vpshrdd m3, m2, m4, 16 ; 23
+ vpshrdd m5, m4, m6, 16 ; 45
+.hv_w16_loop:
+ movu ym24, [srcq+strideq*1+ 0]
+ movu ym25, [srcq+strideq*1+16]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m24, [srcq+strideq*0+ 0], 1
+ vinserti32x8 m25, [srcq+strideq*0+16], 1
+ mova m7, m10
+ mova m8, m10
+ pshufb m0, m24, m20
+ vpdpwssd m7, m12, m0 ; h0
+ mova m22, m11
+ pshufb m0, m25, m20
+ vpdpwssd m8, m14, m0 ; i2
+ mova m23, m11
+ vpdpwssd m22, m16, m1 ; A0
+ mova m1, m3
+ vpdpwssd m23, m16, m2 ; B0
+ mova m2, m4
+ pshufb m0, m24, m21
+ vpdpwssd m7, m13, m0 ; h1
+ pshufb m0, m25, m21
+ vpdpwssd m8, m15, m0 ; i3
+ vpdpwssd m22, m17, m3 ; A1
+ mova m3, m5
+ vpdpwssd m23, m17, m4 ; B1
+ mova m4, m6
+ shufpd m24, m25, 0x55
+ pshufb m25, m24, m20
+ vpdpwssd m7, m14, m25 ; h2
+ vpdpwssd m8, m12, m25 ; i0
+ vpdpwssd m22, m18, m5 ; A2
+ vpdpwssd m23, m18, m6 ; B2
+ pshufb m24, m21
+ vpdpwssd m7, m15, m24 ; h3
+ vpdpwssd m8, m13, m24 ; i1
+ vpermt2b m7, m9, m8 ; 78
+ vpshrdd m5, m6, m7, 16 ; 67
+ vpdpwssd m22, m19, m5 ; A3
+ vpdpwssd m23, m19, m7 ; B3
+ mova m6, m7
+ vpermt2b m22, m26, m23
+ mova [tmpq], m22
+ add tmpq, 64
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+%if WIN64
+ %assign stack_offset stack_offset - stack_size_padded
+ PUSH r8
+ %assign regs_used regs_used + 1
+ WIN64_SPILL_XMM 32
+%endif
+ vbroadcasti32x4 m20, [spel_h_shufA]
+ vbroadcasti32x4 m21, [spel_h_shufB]
+ mova m22, [spel_shuf32]
+ lea r5d, [hq+wq*8-256]
+ mov r7, srcq
+ mov r8, tmpq
+.hv_w32_loop0:
+ movu m6, [srcq+strideq*0+ 0]
+ movu m7, [srcq+strideq*0+ 8]
+ movu m8, [srcq+strideq*0+16]
+ mova m0, m10
+ mova m23, m10
+ pshufb m9, m6, m20
+ vpdpwssd m0, m12, m9 ; a0l
+ pshufb m9, m7, m20
+ vpdpwssd m23, m12, m9 ; a0h
+ vpdpwssd m0, m14, m9 ; a2l
+ pshufb m7, m21
+ vpdpwssd m23, m13, m7 ; a1h
+ vpdpwssd m0, m15, m7 ; a3l
+ pshufb m7, m8, m20
+ vpdpwssd m23, m14, m7 ; a2h
+ pshufb m6, m21
+ vpdpwssd m0, m13, m6 ; a1l
+ pshufb m8, m21
+ vpdpwssd m23, m15, m8 ; a3h
+ PUT_8TAP_HV_W32 1, 24, strideq, 1, 2 ; 12
+ PUT_8TAP_HV_W32 3, 26, strideq, 0, 1 ; 34
+ PUT_8TAP_HV_W32 5, 28, strideq, 2, 0 ; 56
+ vpshrdd m2, m1, m3, 16 ; 23l
+ vpshrdd m25, m24, m26, 16 ; 23h
+ vpshrdd m4, m3, m5, 16 ; 45l
+ vpshrdd m27, m26, m28, 16 ; 45h
+.hv_w32_loop:
+ movu m7, [srcq+strideq*1+ 0]
+ movu m9, [srcq+strideq*2+ 0]
+ movu m6, [srcq+strideq*1+ 8]
+ movu m8, [srcq+strideq*2+ 8]
+ mova m29, m10
+ mova m31, m10
+ pshufb m30, m7, m20
+ vpdpwssd m29, m12, m30 ; h0l
+ pshufb m30, m9, m20
+ vpdpwssd m31, m12, m30 ; i0l
+ pshufb m7, m21
+ vpdpwssd m29, m13, m7 ; h1l
+ pshufb m9, m21
+ vpdpwssd m31, m13, m9 ; i1l
+ pshufb m7, m6, m20
+ vpdpwssd m29, m14, m7 ; h2l
+ pshufb m9, m8, m20
+ vpdpwssd m31, m14, m9 ; i2l
+ pshufb m6, m21
+ vpdpwssd m29, m15, m6 ; h3l
+ pshufb m8, m21
+ vpdpwssd m31, m15, m8 ; i3l
+ mova m30, m10
+ vpdpwssd m30, m12, m7 ; h0h
+ movu m7, [srcq+strideq*1+16]
+ lea srcq, [srcq+strideq*2]
+ vpermt2b m29, m22, m31 ; 78l
+ mova m31, m10
+ vpdpwssd m31, m12, m9 ; i0h
+ movu m9, [srcq+strideq*0+16]
+ vpdpwssd m30, m13, m6 ; h1h
+ pshufb m6, m7, m20
+ vpdpwssd m31, m13, m8 ; i1h
+ pshufb m8, m9, m20
+ vpdpwssd m30, m14, m6 ; h2h
+ mova m6, m11
+ vpdpwssd m6, m16, m0 ; A0l
+ pshufb m7, m21
+ vpdpwssd m31, m14, m8 ; i2h
+ mova m8, m11
+ vpdpwssd m8, m16, m23 ; A0h
+ pshufb m9, m21
+ vpdpwssd m30, m15, m7 ; h3h
+ mova m7, m11
+ vpdpwssd m7, m16, m1 ; B0l
+ vpdpwssd m31, m15, m9 ; i3h
+ mova m9, m11
+ vpdpwssd m9, m16, m24 ; B0h
+ mova m0, m2
+ vpdpwssd m6, m17, m2 ; A1l
+ mova m23, m25
+ vpdpwssd m8, m17, m25 ; A1h
+ mova m1, m3
+ vpdpwssd m7, m17, m3 ; B1l
+ mova m24, m26
+ vpdpwssd m9, m17, m26 ; B1h
+ vpermt2b m30, m22, m31 ; 78h
+ mova m31, [prep_endC]
+ vpdpwssd m6, m18, m4 ; A2l
+ mova m2, m4
+ vpdpwssd m8, m18, m27 ; A2h
+ mova m25, m27
+ vpdpwssd m7, m18, m5 ; B2l
+ mova m3, m5
+ vpdpwssd m9, m18, m28 ; B2h
+ mova m26, m28
+ vpshrdd m4, m5, m29, 16 ; 67l
+ vpdpwssd m6, m19, m4 ; A3l
+ vpshrdd m27, m28, m30, 16 ; 67h
+ vpdpwssd m8, m19, m27 ; A3h
+ mova m5, m29
+ vpdpwssd m7, m19, m29 ; B3l
+ mova m28, m30
+ vpdpwssd m9, m19, m30 ; B3h
+ vpermt2b m6, m31, m8
+ vpermt2b m7, m31, m9
+ mova [tmpq+wq*0], m6
+ mova [tmpq+wq*2], m7
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .hv_w32_loop
+ add r7, 64
+ add r8, 64
+ movzx hd, r5b
+ mov srcq, r7
+ mov tmpq, r8
+ sub r5d, 1<<8
+ jg .hv_w32_loop0
+ RET
+
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
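+; warp_affine_8x8(t): 8x8 affine warp. Per-pixel 8-tap filters are gathered
+; from mc_warp_filter (alpha/beta step the horizontal filter phase across
+; columns/rows, gamma/delta the vertical one); .main/.main2 keep the
+; horizontally filtered rows in a small ring and accumulate the vertical
+; pass two output rows per call. The 8x8t variant stores signed
+; intermediates for compound prediction instead of clamped pixels.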
+
+cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts
+%define base r6-pd_0to7
+ mov t0d, r7m
+ lea r6, [pd_0to7]
+ shr t0d, 11
+ vpbroadcastd m8, [base+warp_8x8t_rnd_v]
+ vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4]
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main
+ psrad m14, m16, 15
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
+ psrad m16, 15
+ packssdw m14, m16
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
+ psrad m15, m16, 15
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
+ add tsq, tsq
+ psrad m16, 15
+ packssdw m15, m16
+ jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end
+
+cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd
+ mov t0d, r7m ; pixel_max
+ lea r6, [pd_0to7]
+ shr t0d, 11
+ vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4]
+ vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4]
+ call .main
+ psrad m14, m16, 13
+ call .main2
+ psrad m16, 13
+ packusdw m14, m16
+ call .main2
+ psrad m15, m16, 13
+ call .main2
+ vpbroadcastd m0, [base+bidir_shift+t0*4]
+ vpsrlvw m14, m0
+ psrad m16, 13
+ packusdw m15, m16
+ vpsrlvw m15, m0
+.end:
+ mova m0, [base+warp8x8_end]
+ vpermb m16, m0, m14
+ lea r2, [dsq*3]
+ mova [dstq+dsq*0], xm16
+ vextracti128 [dstq+dsq*1], ym16, 1
+ vextracti32x4 [dstq+dsq*2], m16, 2
+ vextracti32x4 [dstq+r2 ], m16, 3
+ vpermb m16, m0, m15
+ lea dstq, [dstq+dsq*4]
+ mova [dstq+dsq*0], xm16
+ vextracti128 [dstq+dsq*1], ym16, 1
+ vextracti32x4 [dstq+dsq*2], m16, 2
+ vextracti32x4 [dstq+r2 ], m16, 3
+ RET
+.main:
+ vpbroadcastd ym3, [base+pd_512]
+%if WIN64
+ mov abcdq, r5mp
+ vpaddd ym18, ym3, r6m {1to8} ; mx
+%else
+ add r5d, 512
+ vpbroadcastd ym18, r5d
+%endif
+ vpaddd ym20, ym3, r7m {1to8} ; my
+ mova ym16, [base+pd_0to7]
+ vpbroadcastd ym19, [abcdq+4*0] ; alpha
+ vpbroadcastd ym21, [abcdq+4*1] ; gamma
+ lea r4, [ssq*3+6]
+ vpdpwssd ym18, ym19, ym16 ; tmx
+ vpdpwssd ym20, ym21, ym16 ; tmy
+ sub srcq, r4
+ mova m10, [base+warp8x8_permA]
+ lea r4, [mc_warp_filter+64*8]
+ vbroadcasti32x4 m12, [base+warp8x8_permC]
+ kxnorb k1, k1, k1
+ vbroadcasti32x4 m13, [base+warp8x8_permD]
+ movu ym5, [srcq+0]
+ vinserti32x8 m5, [srcq+8], 1
+ psrad ym17, ym18, 10
+ mova m11, [base+warp8x8_permB]
+ kmovb k2, k1
+ vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0
+ psrad ym19, 16 ; beta
+ psrad ym21, 16 ; delta
+ paddd ym18, ym19
+ vpermb m4, m10, m5
+ vpbroadcastq m9, [base+warp_shift_h+t0*8]
+ pshufd m3, m3, q3120
+ paddd m7, m1, m1
+ pshufb m2, m3, m12
+ vpdpwssd m1, m4, m2
+ vpermb m5, m11, m5
+ vshufi32x4 m4, m5, q1021
+ pshufb m3, m13
+ vpdpwssd m1, m4, m3
+ call .h
+ psllq m2, m1, 32
+ paddd m1, m2
+ vpmultishiftqb m1, m9, m1
+ vpshrdq m1, m0, 48 ; 01 12
+ call .h
+ vpshrdq m2, m1, m0, 48 ; 23 34
+ call .h
+ vpshrdq m3, m2, m0, 48 ; 45 56
+.main2:
+ call .h
+ psrad ym6, ym20, 10
+ kmovb k1, k2
+ paddd ym17, ym20, ym21 ; my += delta
+ vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0
+ psrad ym16, ym17, 10
+ kmovb k2, k1
+ vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1
+ shufps m5, m20, m6, q2020
+ mova m16, m8
+ pshufb m4, m5, m12
+ vpdpwssd m16, m1, m4 ; a0 b0
+ pshufb m5, m13
+ mova m1, m2
+ vpdpwssd m16, m2, m5 ; a1 b1
+ shufps m6, m20, m6, q3131
+ paddd ym20, ym17, ym21
+ pshufb m4, m6, m12
+ mova m2, m3
+ vpdpwssd m16, m3, m4 ; a2 b2
+ vpshrdq m3, m0, 48 ; 67 78
+ pshufb m6, m13
+ vpdpwssd m16, m3, m6 ; a3 b3
+ ret
+ALIGN function_align
+.h:
+ movu ym16, [srcq+ssq*1]
+ psrad ym6, ym18, 10
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m5, m16, [srcq+ssq*0], 1
+ kmovb k1, k2
+ paddd ym17, ym18, ym19 ; mx += beta
+ vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1
+ psrad ym16, ym17, 10
+ kmovb k2, k1
+ vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2
+ vpermb m4, m10, m5
+ shufps m16, m18, m6, q2020
+ shufps m6, m18, m6, q3131
+ mova m0, m7
+ pshufb m18, m16, m12
+ vpdpwssd m0, m4, m18 ; a0 b0
+ vpermb m5, m11, m5
+ pshufb m18, m6, m13
+ vpdpwssd m0, m5, m18 ; a3 b3
+ paddd ym18, ym17, ym19
+ vshufi32x4 m17, m4, m5, q1021
+ pshufb m16, m13
+ vpdpwssd m0, m17, m16 ; a1 b1
+ vshufi32x4 m4, m5, q2132
+ pshufb m6, m12
+ vpdpwssd m0, m4, m6 ; a2 b2
+ vpmultishiftqb m0, m9, m0 ; a a b b
+ ret
+
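+; BIDIR_FN: store loops shared by avg, w_avg and mask below. Each call to
+; the caller's .main produces 64 finished pixels in m0/m1, which are written
+; out according to the block width selected through the jump table.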
+%macro BIDIR_FN 0
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq ], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ cmp hd, 8
+ jl .w4_end
+ vextracti32x4 xm2, m0, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti32x4 xm0, ym1, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ vextracti32x4 xm0, m1, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm1
+ vextracti32x4 [dstq+strideq*1], ym1, 1
+ vextracti32x4 [dstq+strideq*2], m1, 2
+ vextracti32x4 [dstq+stride3q ], m1, 3
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16:
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ call .main
+ mova [dstq+64*2], m0
+ mova [dstq+64*3], m1
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
+
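+; avg: average of the two intermediate buffers, in effect
+; clip_pixel((tmp1 + tmp2 + rnd) >> sh) with rnd/sh from avg_round/avg_shift
+; per bitdepth; the pmaxsw/psubsw pair removes the intermediate bias and
+; clamps to zero before the shift.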
+cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg_avx512icl_table
+ lea r6, [avg_avx512icl_table]
+ tzcnt wd, wm
+ mov t0d, r6m ; pixel_max
+ movsxd wq, [r6+wq*4]
+ shr t0d, 11
+ vpbroadcastd m2, [base+avg_round+t0*4]
+ vpbroadcastd m3, [base+avg_shift+t0*4]
+ movifnidn hd, hm
+ add wq, r6
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m0, [tmp1q+64*0]
+ paddsw m0, [tmp2q+64*0]
+ mova m1, [tmp1q+64*1]
+ paddsw m1, [tmp2q+64*1]
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ pmaxsw m0, m2
+ pmaxsw m1, m2
+ psubsw m0, m2
+ psubsw m1, m2
+ vpsrlvw m0, m3
+ vpsrlvw m1, m3
+ ret
+
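+; w_avg: weighted average, in effect
+; clip_pixel((tmp1*weight + tmp2*(16-weight) + rnd) >> sh); both weights are
+; packed into one dword so a single vpdpwssd per word pair does the whole
+; multiply-accumulate.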
+cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-w_avg_avx512icl_table
+ lea r6, [w_avg_avx512icl_table]
+ tzcnt wd, wm
+ mov t0d, r7m ; pixel_max
+ shr t0d, 11
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m5, [base+w_avg_round+t0*4]
+ vpbroadcastd m7, [base+bidir_shift+t0*4]
+ add wq, r6
+ mov r6d, r6m ; weight
+ lea t0d, [r6-16]
+ shl r6d, 16
+ sub r6d, t0d ; 16-weight, weight
+ movifnidn hd, hm
+ vpbroadcastd m6, r6d
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m3, [tmp1q+64*0]
+ mova m1, [tmp2q+64*0]
+ mova m0, [tmp1q+64*1]
+ mova m4, [tmp2q+64*1]
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ punpcklwd m2, m1, m3
+ punpckhwd m1, m3
+ punpcklwd m3, m4, m0
+ punpckhwd m4, m0
+ mova m0, m5
+ vpdpwssd m0, m6, m2
+ mova m2, m5
+ vpdpwssd m2, m6, m1
+ mova m1, m5
+ vpdpwssd m1, m6, m3
+ mova m3, m5
+ vpdpwssd m3, m6, m4
+ REPX {psrad x, 2}, m0, m2, m1, m3
+ packusdw m0, m2
+ packusdw m1, m3
+ vpsrlvw m0, m7
+ vpsrlvw m1, m7
+ ret
+
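+; mask: per-pixel blend of the two intermediates with a 6-bit mask:
+; clip_pixel((tmp1*m + tmp2*(64-m) + rnd) >> sh), m in [0, 64].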
+cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask_avx512icl_table
+ lea r7, [mask_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m8, [base+pw_64]
+ vpbroadcastd m9, [base+mask_round+r6*4]
+ vpbroadcastd m10, [base+bidir_shift+r6*4]
+ mov maskq, maskmp
+ add wq, r7
+ BIDIR_FN
+ALIGN function_align
+.main:
+ pmovzxbw m1, [maskq+32*0]
+ mova m4, [tmp1q+64*0]
+ mova m2, [tmp2q+64*0]
+ pmovzxbw m6, [maskq+32*1]
+ mova m5, [tmp1q+64*1]
+ mova m3, [tmp2q+64*1]
+ add maskq, 32*2
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ punpcklwd m7, m4, m2
+ punpckhwd m4, m2
+ psubw m0, m8, m1
+ punpcklwd m2, m1, m0 ; m, 64-m
+ punpckhwd m1, m0
+ mova m0, m9
+ vpdpwssd m0, m7, m2
+ mova m2, m9
+ vpdpwssd m2, m4, m1 ; tmp1 * m + tmp2 * (64-m)
+ punpcklwd m7, m5, m3
+ punpckhwd m5, m3
+ psubw m1, m8, m6
+ punpcklwd m3, m6, m1
+ punpckhwd m6, m1
+ mova m1, m9
+ vpdpwssd m1, m7, m3
+ mova m3, m9
+ vpdpwssd m3, m5, m6
+ REPX {psrad x, 4}, m0, m2, m1, m3
+ packusdw m0, m2
+ packusdw m1, m3
+ vpsrlvw m0, m10
+ vpsrlvw m1, m10
+ ret
+
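+; w_mask_420: blends like mask, but derives the mask from |tmp1 - tmp2| on
+; the fly (the saturating subtract against pw_27615 yields 64-m with the
+; clamp to 64 built in), then averages m over 2x2 blocks, using the
+; sign-selected rounding, and stores one byte per chroma position.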
+cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx512icl_table
+ lea r7, [w_mask_420_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ vpbroadcastd m11, [base+pw_64]
+ vpbroadcastd m12, [base+mask_round+r6*4]
+ vpbroadcastd m13, [base+bidir_shift+r6*4]
+ mov r6d, r7m ; sign
+ vpbroadcastd m14, [base+w_mask_round+r6*4]
+ mova ym15, [w_mask_end42x]
+ mov maskq, maskmp
+ add wq, r7
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ mova m4, [w_mask_shuf4]
+ vpermt2b m2, m4, m3
+ mova m3, m14
+ vpdpbusd m3, m2, [pb_64] {1to16}
+ vpermb m3, m15, m3
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ mova [maskq], xm3
+ cmp hd, 8
+ jl .w4_end
+ vextracti32x4 xm2, m0, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8:
+ mova m8, [w_mask_shuf8]
+ vpbroadcastd m9, [pb_64]
+ jmp .w8_start
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w8_start:
+ vpermt2b m2, m8, m3
+ mova m3, m14
+ vpdpbusd m3, m2, m9
+ vpermb m3, m15, m3
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ mova [maskq], xm3
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm1
+ vextracti32x4 [dstq+strideq*1], ym1, 1
+ vextracti32x4 [dstq+strideq*2], m1, 2
+ vextracti32x4 [dstq+stride3q ], m1, 3
+ jg .w8_loop
+.w8_end:
+ RET
+.w16:
+ mova m8, [w_mask_shuf16]
+ vpbroadcastd m9, [pb_64]
+ jmp .w16_start
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w16_start:
+ vpermt2b m2, m8, m3
+ mova m3, m14
+ vpdpbusd m3, m2, m9
+ vpermb m3, m15, m3
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ mova [maskq], xm3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+.w32:
+ paddw m2, m3
+ mova m8, m14
+ vpdpwssd m8, m11, m2
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ call .main
+ paddw m2, m3
+ mova m3, m14
+ vpdpwssd m3, m11, m2
+ vpermt2b m8, m15, m3
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ mova [maskq], ym8
+ sub hd, 4
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+.w64:
+ mova m8, m2
+ mova m9, m3
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*0+64*1], m1
+ call .main
+ paddw m8, m2
+ paddw m9, m3
+ mova m2, m14
+ vpdpwssd m2, m11, m8
+ mova m3, m14
+ vpdpwssd m3, m11, m9
+ vpermt2b m2, m15, m3
+ mova [dstq+strideq*1+64*0], m0
+ mova [dstq+strideq*1+64*1], m1
+ mova [maskq], ym2
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 64
+.w128:
+ mova m16, m2
+ mova m8, m3
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*0+64*1], m1
+ call .main
+ mova m17, m2
+ mova m9, m3
+ mova [dstq+strideq*0+64*2], m0
+ mova [dstq+strideq*0+64*3], m1
+ call .main
+ paddw m2, m16
+ paddw m3, m8
+ mova m16, m14
+ vpdpwssd m16, m11, m2
+ mova m8, m14
+ vpdpwssd m8, m11, m3
+ mova [dstq+strideq*1+64*0], m0
+ mova [dstq+strideq*1+64*1], m1
+ call .main
+ paddw m2, m17
+ paddw m3, m9
+ mova m17, m14
+ vpdpwssd m17, m11, m2
+ mova m9, m14
+ vpdpwssd m9, m11, m3
+ vpermt2b m16, m15, m8
+ vpermt2b m17, m15, m9
+ mova [dstq+strideq*1+64*2], m0
+ mova [dstq+strideq*1+64*3], m1
+ mova [maskq+32*0], ym16
+ mova [maskq+32*1], ym17
+ sub hd, 2
+ jg .w128_loop
+ vzeroupper
+ RET
+ALIGN function_align
+.main:
+ mova m1, [tmp1q+64*0]
+ mova m3, [tmp2q+64*0]
+ mova m4, [tmp1q+64*1]
+ mova m7, [tmp2q+64*1]
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ psubsw m6, m1, m3
+ punpcklwd m5, m3, m1
+ pabsw m6, m6
+ punpckhwd m3, m1
+ psubusw m6, m10, m6
+ psrlw m6, 10 ; 64-m
+ psubw m2, m11, m6 ; m
+ punpcklwd m1, m6, m2
+ punpckhwd m6, m2
+ mova m0, m12
+ vpdpwssd m0, m5, m1
+ mova m1, m12
+ vpdpwssd m1, m3, m6
+ psubsw m5, m4, m7
+ punpcklwd m6, m7, m4
+ pabsw m5, m5
+ punpckhwd m7, m4
+ psubusw m5, m10, m5
+ psrlw m5, 10
+ psubw m3, m11, m5
+ punpcklwd m4, m5, m3
+ psrad m0, 4
+ punpckhwd m5, m3
+ psrad m1, 4
+ packusdw m0, m1
+ mova m1, m12
+ vpdpwssd m1, m6, m4
+ mova m4, m12
+ vpdpwssd m4, m7, m5
+ psrad m1, 4
+ psrad m4, 4
+ packusdw m1, m4
+ vpsrlvw m0, m13
+ vpsrlvw m1, m13
+ ret
+
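+; w_mask_422: same blend as w_mask_420, but the stored mask is only averaged
+; over horizontal pairs (one byte per 2x1 luma pixels).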
+cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx512icl_table
+ lea r7, [w_mask_422_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ vpbroadcastd m9, [base+pw_64]
+ vpbroadcastd m10, [base+mask_round+r6*4]
+ vpbroadcastd m11, [base+bidir_shift+r6*4]
+ mov r6d, r7m ; sign
+ vpbroadcastd m12, [base+w_mask_round+r6*4]
+ mova ym13, [w_mask_end42x]
+ mov maskq, maskmp
+ add wq, r7
+ paddw m14, m9, m9 ; pw_128
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ cmp hd, 8
+ jl .w4_end
+ vextracti32x4 xm2, m0, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm1
+ vextracti32x4 [dstq+strideq*1], ym1, 1
+ vextracti32x4 [dstq+strideq*2], m1, 2
+ vextracti32x4 [dstq+stride3q ], m1, 3
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16:
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ call .main
+ mova [dstq+64*2], m0
+ mova [dstq+64*3], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ mova m1, [tmp1q+64*0]
+ mova m3, [tmp2q+64*0]
+ mova m4, [tmp1q+64*1]
+ mova m7, [tmp2q+64*1]
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ psubsw m6, m1, m3
+ punpcklwd m5, m3, m1
+ pabsw m6, m6
+ punpckhwd m3, m1
+ psubusw m6, m8, m6
+ psrlw m6, 10
+ psubw m2, m9, m6
+ punpcklwd m1, m6, m2
+ punpckhwd m6, m2
+ mova m0, m10
+ vpdpwssd m0, m5, m1
+ mova m1, m10
+ vpdpwssd m1, m3, m6
+ psubsw m5, m4, m7
+ punpcklwd m6, m7, m4
+ pabsw m5, m5
+ punpckhwd m7, m4
+ psubusw m5, m8, m5
+ psrlw m5, 10
+ psubw m3, m9, m5
+ punpcklwd m4, m5, m3
+ psrad m0, 4
+ punpckhwd m5, m3
+ psrad m1, 4
+ packusdw m0, m1
+ mova m1, m10
+ vpdpwssd m1, m6, m4
+ mova m4, m10
+ vpdpwssd m4, m7, m5
+ mova m5, m12
+ vpdpwssd m5, m14, m2
+ mova m2, m12
+ vpdpwssd m2, m14, m3
+ psrad m1, 4
+ psrad m4, 4
+ packusdw m1, m4
+ vpermt2b m5, m13, m2
+ vpsrlvw m0, m11
+ vpsrlvw m1, m11
+ mova [maskq], ym5
+ add maskq, 32
+ ret
+
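+; w_mask_444: same blend, with the full-resolution mask stored
+; (one byte per luma pixel).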
+cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx512icl_table
+ lea r7, [w_mask_444_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ vpbroadcastd m9, [base+pw_64]
+ vpbroadcastd m10, [base+mask_round+r6*4]
+ mova m11, [w_mask_end444]
+ vpbroadcastd m12, [base+bidir_shift+r6*4]
+ mov maskq, maskmp
+ add wq, r7
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ cmp hd, 8
+ jl .w4_end
+ vextracti32x4 xm2, m0, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm1
+ vextracti32x4 [dstq+strideq*1], ym1, 1
+ vextracti32x4 [dstq+strideq*2], m1, 2
+ vextracti32x4 [dstq+stride3q ], m1, 3
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16:
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ call .main
+ mova [dstq+64*2], m0
+ mova [dstq+64*3], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ mova m1, [tmp1q+64*0]
+ mova m3, [tmp2q+64*0]
+ mova m4, [tmp1q+64*1]
+ mova m7, [tmp2q+64*1]
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ psubsw m6, m1, m3
+ punpcklwd m5, m3, m1
+ pabsw m6, m6
+ punpckhwd m3, m1
+ psubusw m6, m8, m6
+ psrlw m6, 10
+ psubw m2, m9, m6
+ punpcklwd m1, m6, m2
+ punpckhwd m6, m2
+ mova m0, m10
+ vpdpwssd m0, m5, m1
+ mova m1, m10
+ vpdpwssd m1, m3, m6
+ psubsw m5, m4, m7
+ punpcklwd m6, m7, m4
+ pabsw m5, m5
+ punpckhwd m7, m4
+ psubusw m5, m8, m5
+ psrlw m5, 10
+ psubw m3, m9, m5
+ punpcklwd m4, m5, m3
+ psrad m0, 4
+ punpckhwd m5, m3
+ psrad m1, 4
+ packusdw m0, m1
+ mova m1, m10
+ vpdpwssd m1, m6, m4
+ mova m4, m10
+ vpdpwssd m4, m7, m5
+ vpermt2b m2, m11, m3
+ psrad m1, 4
+ psrad m4, 4
+ packusdw m1, m4
+ vpsrlvw m0, m12
+ vpsrlvw m1, m12
+ mova [maskq], m2
+ add maskq, 64
+ ret
+
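+; blend: dst += ((tmp - dst) * m + 32) >> 6 with a per-pixel 6-bit mask;
+; the mask is premultiplied by pw_m512 so one pmulhrsw performs the
+; multiply, shift and rounding.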
+cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
+%define base r6-blend_avx512icl_table
+ lea r6, [blend_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ movifnidn maskq, maskmp
+ vpbroadcastd m6, [base+pw_m512]
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ pmovzxbw ym19, [maskq]
+ movq xm16, [dstq+dsq*0]
+ movhps xm16, [dstq+dsq*1]
+ vpbroadcastq ym17, [dstq+dsq*2]
+ vpbroadcastq ym18, [dstq+r6 ]
+ pmullw ym19, ym6
+ vpblendd ym16, ym17, 0x30
+ vpblendd ym16, ym18, 0xc0
+ psubw ym17, ym16, [tmpq]
+ add maskq, 16
+ add tmpq, 32
+ pmulhrsw ym17, ym19
+ paddw ym16, ym17
+ vextracti128 xm17, ym16, 1
+ movq [dstq+dsq*0], xm16
+ movhps [dstq+dsq*1], xm16
+ movq [dstq+dsq*2], xm17
+ movhps [dstq+r6 ], xm17
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w4
+ vzeroupper
+ RET
+.w8:
+ pmovzxbw m2, [maskq]
+ mova xm0, [dstq+dsq*0]
+ vinserti32x4 ym0, [dstq+dsq*1], 1
+ vinserti32x4 m0, [dstq+dsq*2], 2
+ vinserti32x4 m0, [dstq+r6 ], 3
+ pmullw m2, m6
+ psubw m1, m0, [tmpq]
+ add maskq, 32
+ add tmpq, 64
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ vextracti32x4 [dstq+dsq*2], m0, 2
+ vextracti32x4 [dstq+r6 ], m0, 3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ pmovzxbw m4, [maskq+32*0]
+ pmovzxbw m5, [maskq+32*1]
+ mova ym0, [dstq+dsq*0]
+ vinserti32x8 m0, [dstq+dsq*1], 1
+ mova ym1, [dstq+dsq*2]
+ vinserti32x8 m1, [dstq+r6 ], 1
+ pmullw m4, m6
+ pmullw m5, m6
+ psubw m2, m0, [tmpq+64*0]
+ psubw m3, m1, [tmpq+64*1]
+ add maskq, 32*2
+ add tmpq, 64*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ mova [dstq+dsq*2], ym1
+ vextracti32x8 [dstq+r6 ], m1, 1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ pmovzxbw m4, [maskq+32*0]
+ pmovzxbw m5, [maskq+32*1]
+ mova m0, [dstq+dsq*0]
+ mova m1, [dstq+dsq*1]
+ pmullw m4, m6
+ pmullw m5, m6
+ psubw m2, m0, [tmpq+ 64*0]
+ psubw m3, m1, [tmpq+ 64*1]
+ add maskq, 32*2
+ add tmpq, 64*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32
+ RET
+
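+; blend_v: like blend, but with a fixed per-column mask taken from
+; obmc_masks_avx2 (used directly with pmulhrsw).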
+cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
+ lea r5, [blend_v_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp wq
+.w2:
+ vpbroadcastd xmm2, [obmc_masks_avx2+2*2]
+.w2_loop:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ movq xmm1, [tmpq]
+ add tmpq, 4*2
+ psubw xmm1, xmm0, xmm1
+ pmulhrsw xmm1, xmm2
+ paddw xmm0, xmm1
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_loop
+ RET
+.w4:
+ vpbroadcastq xmm2, [obmc_masks_avx2+4*2]
+.w4_loop:
+ movq xmm0, [dstq+dsq*0]
+ movhps xmm0, [dstq+dsq*1]
+ psubw xmm1, xmm0, [tmpq]
+ add tmpq, 8*2
+ pmulhrsw xmm1, xmm2
+ paddw xmm0, xmm1
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+.w8:
+ vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2]
+.w8_loop:
+ mova xm0, [dstq+dsq*0]
+ vinserti32x4 ym0, [dstq+dsq*1], 1
+ psubw ym1, ym0, [tmpq]
+ add tmpq, 16*2
+ pmulhrsw ym1, ym2
+ paddw ym0, ym1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ vbroadcasti32x8 m2, [obmc_masks_avx2+16*2]
+.w16_loop:
+ mova ym0, [dstq+dsq*0]
+ vinserti32x8 m0, [dstq+dsq*1], 1
+ psubw m1, m0, [tmpq]
+ add tmpq, 32*2
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32:
+ mova m4, [obmc_masks_avx2+32*2]
+.w32_loop:
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 64*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 64*1]
+ add tmpq, 64*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+
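+; blend_h: per-row obmc weights; only the top h*3/4 rows are blended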
+cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask
+%define base r6-$$
+ lea r6, [$$]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [base+blend_h_avx512icl_table+wq*4]
+ lea maskq, [base+obmc_masks_avx2+hq*2]
+ lea hd, [hq*3]
+ lea wq, [base+blend_h_avx512icl_table+wq]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ movd xmm2, [maskq+hq*2]
+ movq xmm1, [tmpq]
+ add tmpq, 4*2
+ punpcklwd xmm2, xmm2
+ psubw xmm1, xmm0, xmm1
+ pmulhrsw xmm1, xmm2
+ paddw xmm0, xmm1
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+ mova xmm3, [blend_shuf]
+.w4_loop:
+ movq xmm0, [dstq+dsq*0]
+ movhps xmm0, [dstq+dsq*1]
+ movd xmm2, [maskq+hq*2]
+ psubw xmm1, xmm0, [tmpq]
+ add tmpq, 8*2
+ pshufb xmm2, xmm3
+ pmulhrsw xmm1, xmm2
+ paddw xmm0, xmm1
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+ vbroadcasti32x4 ym3, [blend_shuf]
+ shufpd ym3, ym3, 0x0c
+.w8_loop:
+ mova xm0, [dstq+dsq*0]
+ vinserti32x4 ym0, [dstq+dsq*1], 1
+ vpbroadcastd ym2, [maskq+hq*2]
+ psubw ym1, ym0, [tmpq]
+ add tmpq, 16*2
+ pshufb ym2, ym3
+ pmulhrsw ym1, ym2
+ paddw ym0, ym1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+ RET
+.w16:
+ vbroadcasti32x4 m3, [blend_shuf]
+ shufpd m3, m3, 0xf0
+.w16_loop:
+ mova ym0, [dstq+dsq*0]
+ vinserti32x8 m0, [dstq+dsq*1], 1
+ vpbroadcastd m2, [maskq+hq*2]
+ psubw m1, m0, [tmpq]
+ add tmpq, 32*2
+ pshufb m2, m3
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w16_loop
+ RET
+.w32:
+ vpbroadcastw m4, [maskq+hq*2]
+ vpbroadcastw m5, [maskq+hq*2+2]
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 64*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 64*1]
+ add tmpq, 64*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w32
+ RET
+.w64:
+ vpbroadcastw m4, [maskq+hq*2]
+ mova m0, [dstq+64*0]
+ psubw m2, m0, [tmpq+64*0]
+ mova m1, [dstq+64*1]
+ psubw m3, m1, [tmpq+64*1]
+ add tmpq, 64*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ inc hq
+ jl .w64
+ RET
+.w128:
+ vpbroadcastw m8, [maskq+hq*2]
+ mova m0, [dstq+64*0]
+ psubw m4, m0, [tmpq+64*0]
+ mova m1, [dstq+64*1]
+ psubw m5, m1, [tmpq+64*1]
+ mova m2, [dstq+64*2]
+ psubw m6, m2, [tmpq+64*2]
+ mova m3, [dstq+64*3]
+ psubw m7, m3, [tmpq+64*3]
+ add tmpq, 64*4
+ REPX {pmulhrsw x, m8}, m4, m5, m6, m7
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ mova [dstq+64*2], m2
+ mova [dstq+64*3], m3
+ add dstq, dsq
+ inc hq
+ jl .w128
+ RET
+
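+; resize: horizontal-only 8-tap scaling; mx = mx0 + x*dx in .14 fixed point,
+; filter phase = (mx >> 8) & 63, pixels gathered from mx >> 14 (mx0 is pre-biased by -4<<14)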
+cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ mov r6, ~0
+ vpbroadcastd m5, dxm
+ vpbroadcastd m8, mx0m
+ vpbroadcastd m6, src_wm
+ kmovq k6, r6
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
+ LEA r7, $$
+%define base r7-$$
+ vpbroadcastd m3, [base+pd_16384]
+ vpbroadcastd m7, [base+pd_63]
+ mova m24, [base+resize_permA]
+ mova m25, [base+resize_permB]
+ mova m26, [base+resize_permC]
+ mova m27, [base+resize_permD]
+ vbroadcasti32x4 m28, [base+resize_shufA]
+ vbroadcasti32x4 m29, [base+resize_shufB]
+ mova m30, [base+resize_permE]
+ vpbroadcastw ym31, pxmaxm
+ vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
+ pslld m5, 4 ; dx*16
+ pslld m6, 14
+ pxor m2, m2
+.loop_y:
+ xor xd, xd
+ mova m4, m8 ; per-line working version of mx
+.loop_x:
+ pmaxsd m0, m4, m2
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ vptestmd k5, m1, m1
+ pand m9, m7 ; filter offset (masked)
+ ktestw k5, k5
+ jz .load
+ vpbroadcastq m14, [base+pd_0_4]
+ vpermq m10, m0, q1100
+ vpermq m11, m0, q3322
+ vpermq m20, m1, q1100
+ vpermq m21, m1, q3322
+ punpckldq m10, m10
+ punpckldq m11, m11
+ punpckldq m20, m20
+ punpckldq m21, m21
+ paddd m10, m14
+ paddd m11, m14
+ paddd m20, m14
+ paddd m21, m14
+ vextracti32x8 ym12, m10, 1
+ vextracti32x8 ym13, m11, 1
+ vextracti32x8 ym22, m20, 1
+ vextracti32x8 ym23, m21, 1
+ kmovq k1, k6
+ kmovq k2, k6
+ kmovq k3, k6
+ kmovq k4, k6
+ vpgatherdq m16{k1}, [srcq+ym10*2] ; 0 1 2 3
+ vpgatherdq m17{k2}, [srcq+ym11*2] ; 4 5 6 7
+ vpgatherdq m18{k3}, [srcq+ym12*2] ; 8 9 A B
+ vpgatherdq m19{k4}, [srcq+ym13*2] ; C D E F
+ kmovq k1, k6
+ kmovq k2, k6
+ kmovq k3, k6
+ kmovq k4, k6
+ vpgatherdq m0{k1}, [base+resize_shuf+8+ym20*2]
+ vpgatherdq m1{k2}, [base+resize_shuf+8+ym21*2]
+ vpgatherdq m14{k3}, [base+resize_shuf+8+ym22*2]
+ vpgatherdq m15{k4}, [base+resize_shuf+8+ym23*2]
+ pshufb m16, m0
+ pshufb m17, m1
+ pshufb m18, m14
+ pshufb m19, m15
+ mova m20, m24
+ mova m22, m24
+ mova m21, m25
+ mova m23, m25
+ vpermi2d m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b
+ vpermi2d m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d
+ vpermi2d m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb
+ vpermi2d m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd
+ mova m15, m26
+ mova m17, m26
+ mova m16, m27
+ mova m18, m27
+ vpermi2q m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa
+ vpermi2q m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb
+ vpermi2q m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc
+ vpermi2q m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd
+ kmovq k1, k6
+ kmovq k2, k6
+ vpgatherdd m11{k1}, [base+resize_filter+m9*8+0]
+ vpgatherdd m13{k2}, [base+resize_filter+m9*8+4]
+ pshufb m10, m11, m28
+ pshufb m11, m11, m29
+ pshufb m12, m13, m28
+ pshufb m13, m13, m29
+ jmp .filter
+.load:
+ kmovq k1, k6
+ kmovq k2, k6
+ kmovq k3, k6
+ kmovq k4, k6
+ vpgatherdd m11{k1}, [base+resize_filter+m9*8+0]
+ vpgatherdd m13{k2}, [base+resize_filter+m9*8+4]
+ pshufb m10, m11, m28
+ pshufb m11, m11, m29
+ pshufb m12, m13, m28
+ pshufb m13, m13, m29
+ vpgatherdd m15{k3}, [srcq+m0*2+ 0]
+ vpgatherdd m16{k4}, [srcq+m0*2+ 4]
+ kmovq k1, k6
+ kmovq k2, k6
+ vpgatherdd m17{k1}, [srcq+m0*2+ 8]
+ vpgatherdd m18{k2}, [srcq+m0*2+12]
+.filter:
+ mova m14, m2
+ vpdpwssd m14, m15, m10
+ vpdpwssd m14, m16, m11
+ vpdpwssd m14, m17, m12
+ vpdpwssd m14, m18, m13
+ psubd m14, m3, m14
+ psrad m14, 15
+ packusdw m14, m14
+ vpermq m14, m30, m14
+ pminsw ym14, ym31
+ mova [dstq+xq*2], ym14
+ paddd m4, m5
+ add xd, 16
+ cmp xd, dst_wd
+ jl .loop_x
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/mc16_sse.asm b/third_party/dav1d/src/x86/mc16_sse.asm
new file mode 100644
index 0000000000..fde8e372a3
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc16_sse.asm
@@ -0,0 +1,8731 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+; dav1d_obmc_masks[] << 9
+obmc_masks: dw 0, 0, 9728, 0, 12800, 7168, 2560, 0
+ dw 14336, 11264, 8192, 5632, 3584, 1536, 0, 0
+ dw 15360, 13824, 12288, 10752, 9216, 7680, 6144, 5120
+ dw 4096, 3072, 2048, 1536, 0, 0, 0, 0
+ dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240
+ dw 9728, 8704, 8192, 7168, 6656, 6144, 5632, 4608
+ dw 4096, 3584, 3072, 2560, 2048, 2048, 1536, 1024
+
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+spel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
+spel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+rescale_mul: dd 0, 1, 2, 3
+resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
+ db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
+bdct_lb_q: times 8 db 0
+ times 8 db 4
+ times 8 db 8
+ times 8 db 12
+
+pw_2: times 8 dw 2
+pw_16: times 4 dw 16
+prep_mul: times 4 dw 16
+ times 8 dw 4
+pw_64: times 8 dw 64
+pw_256: times 8 dw 256
+pw_2048: times 4 dw 2048
+bidir_mul: times 4 dw 2048
+pw_8192: times 8 dw 8192
+pw_27615: times 8 dw 27615
+pw_32766: times 8 dw 32766
+pw_m512: times 8 dw -512
+pd_63: times 4 dd 63
+pd_64: times 4 dd 64
+pd_512: times 4 dd 512
+pd_m524256: times 4 dd -524256 ; (-8192 << 6) + 32
+pd_0x3ff: times 4 dd 0x3ff
+pd_0x4000: times 4 dd 0x4000
+pq_0x400000: times 2 dq 0x400000
+pq_0x40000000: times 2 dq 0x40000000
+pd_65538: times 2 dd 65538
+
+put_bilin_h_rnd: times 4 dw 8
+ times 4 dw 10
+s_8tap_h_rnd: times 2 dd 2
+ times 2 dd 8
+put_s_8tap_v_rnd: times 2 dd 512
+ times 2 dd 128
+s_8tap_h_sh: dd 2, 4
+put_s_8tap_v_sh: dd 10, 8
+bidir_rnd: times 4 dw -16400
+ times 4 dw -16388
+put_8tap_h_rnd: dd 34, 34, 40, 40
+prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4)
+prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5)
+
+warp8x8_shift: dd 11, 13
+warp8x8_rnd1: dd 1024, 1024, 4096, 4096
+warp8x8_rnd2: times 4 dw 4096
+ times 4 dw 16384
+warp8x8t_rnd: times 2 dd 16384 - (8192 << 15)
+
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 32, 64, 128
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put)
+%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep)
+
+BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
+
+%macro SCALED_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
+%%table:
+ %rep %0 - 2
+ dw %%base %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_1024:
+ %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy1_w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_2048:
+ %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy2_w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
+cextern mc_warp_filter
+cextern resize_filter
+
+SECTION .text
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+INIT_XMM ssse3
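+; put_bilin: h pass is ((16-mx)*a + mx*b + rnd) >> 4, v pass is a + pmulhrsw(b - a, my << 11)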
+cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy
+%define base t0-put_ssse3
+ mov mxyd, r6m ; mx
+ LEA t0, put_ssse3
+ movifnidn wd, wm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ tzcnt wd, wd
+ movzx wd, word [base+put_ssse3_table+wq*2]
+ add wq, t0
+ movifnidn hd, hm
+ jmp wq
+.put_w2:
+ mov r4d, [srcq+ssq*0]
+ mov r6d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r4d
+ mov [dstq+dsq*1], r6d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ movq m0, [srcq+ssq*0]
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq [dstq+dsq*0], m0
+ movq [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu m0, [srcq+ssq*0+16*0]
+ movu m1, [srcq+ssq*0+16*1]
+ movu m2, [srcq+ssq*1+16*0]
+ movu m3, [srcq+ssq*1+16*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+16*0], m0
+ mova [dstq+dsq*0+16*1], m1
+ mova [dstq+dsq*1+16*0], m2
+ mova [dstq+dsq*1+16*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ add srcq, ssq
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ add srcq, ssq
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ mova [dstq+16*6], m2
+ mova [dstq+16*7], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w64
+ RET
+.put_w128:
+ add srcq, 16*8
+ add dstq, 16*8
+.put_w128_loop:
+ movu m0, [srcq-16*8]
+ movu m1, [srcq-16*7]
+ movu m2, [srcq-16*6]
+ movu m3, [srcq-16*5]
+ mova [dstq-16*8], m0
+ mova [dstq-16*7], m1
+ mova [dstq-16*6], m2
+ mova [dstq-16*5], m3
+ movu m0, [srcq-16*4]
+ movu m1, [srcq-16*3]
+ movu m2, [srcq-16*2]
+ movu m3, [srcq-16*1]
+ mova [dstq-16*4], m0
+ mova [dstq-16*3], m1
+ mova [dstq-16*2], m2
+ mova [dstq-16*1], m3
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ add srcq, ssq
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ mova [dstq+16*6], m2
+ mova [dstq+16*7], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w128_loop
+ RET
+.h:
+ movd m5, mxyd
+ mov mxyd, r7m ; my
+ mova m4, [base+pw_16]
+ pshufb m5, [base+pw_256]
+ psubw m4, m5
+ test mxyd, mxyd
+ jnz .hv
+ ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
+ mov r6d, r8m ; bitdepth_max
+ shr r6d, 11
+ movddup m3, [base+put_bilin_h_rnd+r6*8]
+ movifnidn hd, hm
+ sub wd, 8
+ jg .h_w16
+ je .h_w8
+ cmp wd, -4
+ je .h_w4
+.h_w2:
+ movq m1, [srcq+ssq*0]
+ movhps m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmullw m0, m4, m1
+ psrlq m1, 16
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 4
+ movd [dstq+dsq*0], m0
+ punpckhqdq m0, m0
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ movq m0, [srcq+ssq*0]
+ movhps m0, [srcq+ssq*1]
+ movq m1, [srcq+ssq*0+2]
+ movhps m1, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 4
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*0+2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ neg wq
+.h_w16_loop0:
+ mov r6, wq
+.h_w16_loop:
+ movu m0, [srcq+r6*2+ 0]
+ movu m1, [srcq+r6*2+ 2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ movu m1, [srcq+r6*2+16]
+ movu m2, [srcq+r6*2+18]
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+r6*2+16*0], m0
+ mova [dstq+r6*2+16*1], m1
+ add r6, 16
+ jl .h_w16_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w16_loop0
+ RET
+.v:
+ shl mxyd, 11
+ movd m5, mxyd
+ pshufb m5, [base+pw_256]
+ movifnidn hd, hm
+ cmp wd, 4
+ jg .v_w8
+ je .v_w4
+.v_w2:
+ movd m0, [srcq+ssq*0]
+.v_w2_loop:
+ movd m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklqdq m2, m0, m1
+ movd m0, [srcq+ssq*0]
+ punpcklqdq m1, m0
+ psubw m1, m2
+ pmulhrsw m1, m5
+ paddw m1, m2
+ movd [dstq+dsq*0], m1
+ punpckhqdq m1, m1
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq m0, [srcq+ssq*0]
+.v_w4_loop:
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklqdq m2, m0, m1
+ movq m0, [srcq+ssq*0]
+ punpcklqdq m1, m0
+ psubw m1, m2
+ pmulhrsw m1, m5
+ paddw m1, m2
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+%if ARCH_X86_64
+%if WIN64
+ push r7
+%endif
+ shl wd, 5
+ mov r7, srcq
+ lea r6d, [wq+hq-256]
+ mov r4, dstq
+%else
+ mov r6, srcq
+%endif
+.v_w8_loop0:
+ movu m0, [srcq+ssq*0]
+.v_w8_loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m1, m3, m0
+ pmulhrsw m1, m5
+ paddw m1, m0
+ movu m0, [srcq+ssq*0]
+ psubw m2, m0, m3
+ pmulhrsw m2, m5
+ paddw m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+%if ARCH_X86_64
+ add r7, 16
+ add r4, 16
+ movzx hd, r6b
+ mov srcq, r7
+ mov dstq, r4
+ sub r6d, 1<<8
+%else
+ mov dstq, dstmp
+ add r6, 16
+ mov hd, hm
+ add dstq, 16
+ mov srcq, r6
+ mov dstmp, dstq
+ sub wd, 8
+%endif
+ jg .v_w8_loop0
+%if WIN64
+ pop r7
+%endif
+ RET
+.hv:
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11
+ mova m3, [base+pw_2]
+ movd m6, mxyd
+ mova m7, [base+pw_8192]
+ pshufb m6, [base+pw_256]
+ test dword r8m, 0x800
+ jnz .hv_12bpc
+ psllw m4, 2
+ psllw m5, 2
+ mova m7, [base+pw_2048]
+.hv_12bpc:
+ movifnidn hd, hm
+ cmp wd, 4
+ jg .hv_w8
+ je .hv_w4
+.hv_w2:
+ movddup m0, [srcq+ssq*0]
+ pshufhw m1, m0, q0321
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w2_loop:
+ movq m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps m2, [srcq+ssq*0]
+ pmullw m1, m4, m2
+ psrlq m2, 16
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2 ; 1 _ 2 _
+ shufpd m2, m0, m1, 0x01 ; 0 _ 1 _
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ movd [dstq+dsq*0], m1
+ punpckhqdq m1, m1
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ movddup m0, [srcq+ssq*0]
+ movddup m1, [srcq+ssq*0+2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w4_loop:
+ movq m1, [srcq+ssq*1]
+ movq m2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*0]
+ movhps m2, [srcq+ssq*0+2]
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2 ; 1 2
+ shufpd m2, m0, m1, 0x01 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+%if ARCH_X86_64
+%if WIN64
+ push r7
+%endif
+ shl wd, 5
+ lea r6d, [wq+hq-256]
+ mov r4, srcq
+ mov r7, dstq
+%else
+ mov r6, srcq
+%endif
+.hv_w8_loop0:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*0+2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w8_loop:
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2
+ psubw m2, m1, m0
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m0
+ pmulhrsw m2, m7
+ mova [dstq+dsq*0], m2
+ movu m0, [srcq+ssq*0]
+ movu m2, [srcq+ssq*0+2]
+ pmullw m0, m4
+ pmullw m2, m5
+ paddw m0, m3
+ paddw m0, m2
+ psrlw m0, 2
+ psubw m2, m0, m1
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m1
+ pmulhrsw m2, m7
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+%if ARCH_X86_64
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+%else
+ mov dstq, dstmp
+ add r6, 16
+ mov hd, hm
+ add dstq, 16
+ mov srcq, r6
+ mov dstmp, dstq
+ sub wd, 8
+%endif
+ jg .hv_w8_loop0
+%if WIN64
+ pop r7
+%endif
+ RET
+
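+; prep_bilin: same filters as put_bilin, but results go to the intermediate (prep)
+; buffer, scaled by prep_mul (16 for 10bpc, 4 for 12bpc) with the 8192 bias subtracted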
+cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3
+%define base r6-prep_ssse3
+ movifnidn mxyd, r5m ; mx
+ LEA r6, prep_ssse3
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ tzcnt wd, wd
+ movzx wd, word [base+prep_ssse3_table+wq*2]
+ mov r5d, r7m ; bitdepth_max
+ mova m5, [base+pw_8192]
+ add wq, r6
+ shr r5d, 11
+ movddup m4, [base+prep_mul+r5*8]
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movq m0, [srcq+strideq*0]
+ movhps m0, [srcq+strideq*1]
+ movq m1, [srcq+strideq*2]
+ movhps m1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ pmullw m1, m4
+ psubw m0, m5
+ psubw m1, m5
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 16*2
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*2]
+ movu m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movu m0, [srcq+strideq*0+16*0]
+ movu m1, [srcq+strideq*0+16*1]
+ movu m2, [srcq+strideq*1+16*0]
+ movu m3, [srcq+strideq*1+16*1]
+ lea srcq, [srcq+strideq*2]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 2
+ jg .prep_w16
+ RET
+.prep_w32:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ add srcq, strideq
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ dec hd
+ jg .prep_w32
+ RET
+.prep_w64:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ add srcq, strideq
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*4], m0
+ mova [tmpq+16*5], m1
+ mova [tmpq+16*6], m2
+ mova [tmpq+16*7], m3
+ add tmpq, 16*8
+ dec hd
+ jg .prep_w64
+ RET
+.prep_w128:
+ movu m0, [srcq+16* 0]
+ movu m1, [srcq+16* 1]
+ movu m2, [srcq+16* 2]
+ movu m3, [srcq+16* 3]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ movu m0, [srcq+16* 4]
+ movu m1, [srcq+16* 5]
+ movu m2, [srcq+16* 6]
+ movu m3, [srcq+16* 7]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*4], m0
+ mova [tmpq+16*5], m1
+ mova [tmpq+16*6], m2
+ mova [tmpq+16*7], m3
+ movu m0, [srcq+16* 8]
+ movu m1, [srcq+16* 9]
+ movu m2, [srcq+16*10]
+ movu m3, [srcq+16*11]
+ add tmpq, 16*16
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq-16*8], m0
+ mova [tmpq-16*7], m1
+ mova [tmpq-16*6], m2
+ mova [tmpq-16*5], m3
+ movu m0, [srcq+16*12]
+ movu m1, [srcq+16*13]
+ movu m2, [srcq+16*14]
+ movu m3, [srcq+16*15]
+ add srcq, strideq
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq-16*4], m0
+ mova [tmpq-16*3], m1
+ mova [tmpq-16*2], m2
+ mova [tmpq-16*1], m3
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ movd m4, mxyd
+ mov mxyd, r6m ; my
+ mova m3, [base+pw_16]
+ pshufb m4, [base+pw_256]
+ mova m5, [base+pw_32766]
+ psubw m3, m4
+ test dword r7m, 0x800
+ jnz .h_12bpc
+ psllw m3, 2
+ psllw m4, 2
+.h_12bpc:
+ test mxyd, mxyd
+ jnz .hv
+ sub wd, 8
+ je .h_w8
+ jg .h_w16
+.h_w4:
+ movq m0, [srcq+strideq*0]
+ movhps m0, [srcq+strideq*1]
+ movq m1, [srcq+strideq*0+2]
+ movhps m1, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 16
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 16*2
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ lea srcq, [srcq+wq*2]
+ neg wq
+.h_w16_loop0:
+ mov r6, wq
+.h_w16_loop:
+ movu m0, [srcq+r6*2+ 0]
+ movu m1, [srcq+r6*2+ 2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ movu m1, [srcq+r6*2+16]
+ movu m2, [srcq+r6*2+18]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 16*2
+ add r6, 16
+ jl .h_w16_loop
+ add srcq, strideq
+ dec hd
+ jg .h_w16_loop0
+ RET
+.v:
+ movd m4, mxyd
+ mova m3, [base+pw_16]
+ pshufb m4, [base+pw_256]
+ mova m5, [base+pw_32766]
+ psubw m3, m4
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m3, 2
+ psllw m4, 2
+.v_12bpc:
+ cmp wd, 8
+ je .v_w8
+ jg .v_w16
+.v_w4:
+ movq m0, [srcq+strideq*0]
+.v_w4_loop:
+ movq m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklqdq m1, m0, m2 ; 0 1
+ movq m0, [srcq+strideq*0]
+ punpcklqdq m2, m0 ; 1 2
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq], m1
+ add tmpq, 16
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movu m0, [srcq+strideq*0]
+.v_w8_loop:
+ movu m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m3
+ pmullw m1, m4, m2
+ psubw m0, m5
+ paddw m1, m0
+ movu m0, [srcq+strideq*0]
+ psraw m1, 2
+ pmullw m2, m3
+ mova [tmpq+16*0], m1
+ pmullw m1, m4, m0
+ psubw m2, m5
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq+16*1], m1
+ add tmpq, 16*2
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+%if WIN64
+ push r7
+%endif
+ mov r5, srcq
+%if ARCH_X86_64
+ lea r6d, [wq*4-32]
+ mov wd, wd
+ lea r6d, [hq+r6*8]
+ mov r7, tmpq
+%else
+ mov r6d, wd
+%endif
+.v_w16_loop0:
+ movu m0, [srcq+strideq*0]
+.v_w16_loop:
+ movu m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m3
+ pmullw m1, m4, m2
+ psubw m0, m5
+ paddw m1, m0
+ movu m0, [srcq+strideq*0]
+ psraw m1, 2
+ pmullw m2, m3
+ mova [tmpq+wq*0], m1
+ pmullw m1, m4, m0
+ psubw m2, m5
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq+wq*2], m1
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w16_loop
+%if ARCH_X86_64
+ add r5, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+%else
+ mov tmpq, tmpmp
+ add r5, 16
+ mov hd, hm
+ add tmpq, 16
+ mov srcq, r5
+ mov tmpmp, tmpq
+ sub r6d, 8
+%endif
+ jg .v_w16_loop0
+%if WIN64
+ pop r7
+%endif
+ RET
+.hv:
+ WIN64_SPILL_XMM 7
+ shl mxyd, 11
+ movd m6, mxyd
+ pshufb m6, [base+pw_256]
+ cmp wd, 8
+ je .hv_w8
+ jg .hv_w16
+.hv_w4:
+ movddup m0, [srcq+strideq*0]
+ movddup m1, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ psraw m0, 2
+.hv_w4_loop:
+ movq m1, [srcq+strideq*1]
+ movq m2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ movhps m1, [srcq+strideq*0]
+ movhps m2, [srcq+strideq*0+2]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m1, 2 ; 1 2
+ shufpd m2, m0, m1, 0x01 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 16
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ psraw m0, 2
+.hv_w8_loop:
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m1, 2
+ psubw m2, m1, m0
+ pmulhrsw m2, m6
+ paddw m2, m0
+ mova [tmpq+16*0], m2
+ movu m0, [srcq+strideq*0]
+ movu m2, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m2, m4
+ psubw m0, m5
+ paddw m0, m2
+ psraw m0, 2
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+%if WIN64
+ push r7
+%endif
+ mov r5, srcq
+%if ARCH_X86_64
+ lea r6d, [wq*4-32]
+ mov wd, wd
+ lea r6d, [hq+r6*8]
+ mov r7, tmpq
+%else
+ mov r6d, wd
+%endif
+.hv_w16_loop0:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ psraw m0, 2
+.hv_w16_loop:
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m1, 2
+ psubw m2, m1, m0
+ pmulhrsw m2, m6
+ paddw m2, m0
+ mova [tmpq+wq*0], m2
+ movu m0, [srcq+strideq*0]
+ movu m2, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m2, m4
+ psubw m0, m5
+ paddw m0, m2
+ psraw m0, 2
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+wq*2], m2
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .hv_w16_loop
+%if ARCH_X86_64
+ add r5, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+%else
+ mov tmpq, tmpmp
+ add r5, 16
+ mov hd, hm
+ add tmpq, 16
+ mov srcq, r5
+ mov tmpmp, tmpq
+ sub r6d, 8
+%endif
+ jg .hv_w16_loop0
+%if WIN64
+ pop r7
+%endif
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
+
+%macro FN 4 ; prefix, type, type_h, type_v
+cglobal %1_%2_16bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX)
+%endif
+%endmacro
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 1, 2, 6
+%elif WIN64
+DECLARE_REG_TMP 4, 5, 8
+%else
+DECLARE_REG_TMP 7, 8, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
+%if ARCH_X86_32
+cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my
+%define mxb r0b
+%define mxd r0
+%define mxq r0
+%define myb r1b
+%define myd r1
+%define myq r1
+%define m8 [esp+16*0]
+%define m9 [esp+16*1]
+%define m10 [esp+16*2]
+%define m11 [esp+16*3]
+%define m12 [esp+16*4]
+%define m13 [esp+16*5]
+%define m14 [esp+16*6]
+%define m15 [esp+16*7]
+%else
+cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
+%endif
+%define base t2-put_ssse3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ LEA t2, put_ssse3
+ movifnidn wd, wm
+ movifnidn srcq, srcmp
+ movifnidn ssq, ssmp
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [base+put_ssse3_table+wq*2]
+ movifnidn dstq, dstmp
+ movifnidn dsq, dsmp
+ add wq, t2
+%if WIN64
+ pop r8
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ mov myd, r8m
+ movd m5, r8m
+ shr myd, 11
+ movddup m4, [base+put_8tap_h_rnd+myq*8]
+ movifnidn dsq, dsmp
+ pshufb m5, [base+pw_256]
+ cmp wd, 4
+ jg .h_w8
+ movzx mxd, mxb
+ lea srcq, [srcq-2]
+ movq m3, [base+subpel_filters+mxq*8]
+ movifnidn dstq, dstmp
+ punpcklbw m3, m3
+ psraw m3, 8 ; sign-extend
+ je .h_w4
+.h_w2:
+ mova m2, [base+spel_h_shuf2]
+ pshufd m3, m3, q2121
+.h_w2_loop:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m2
+ pshufb m1, m2
+ pmaddwd m0, m3
+ pmaddwd m1, m3
+ phaddd m0, m1
+ paddd m0, m4
+ psrad m0, 6
+ packssdw m0, m0
+ pxor m1, m1
+ pminsw m0, m5
+ pmaxsw m0, m1
+ movd [dstq+dsq*0], m0
+ pshuflw m0, m0, q3232
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ WIN64_SPILL_XMM 8
+ mova m6, [base+spel_h_shufA]
+ mova m7, [base+spel_h_shufB]
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q2222
+.h_w4_loop:
+ movu m1, [srcq]
+ add srcq, ssq
+ pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4
+ pshufb m1, m7 ; 2 3 3 4 4 5 5 6
+ pmaddwd m0, m2
+ pmaddwd m1, m3
+ paddd m0, m4
+ paddd m0, m1
+ psrad m0, 6
+ packssdw m0, m0
+ pxor m1, m1
+ pminsw m0, m5
+ pmaxsw m0, m1
+ movq [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w4_loop
+ RET
+.h_w8:
+%if WIN64
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 12
+%endif
+ shr mxd, 16
+ movq m3, [base+subpel_filters+mxq*8]
+ movifnidn dstq, dstmp
+ mova m6, [base+spel_h_shufA]
+ mova m7, [base+spel_h_shufB]
+%if UNIX64
+ mov wd, wd
+%endif
+ lea srcq, [srcq+wq*2]
+ punpcklbw m3, m3
+ lea dstq, [dstq+wq*2]
+ psraw m3, 8
+ neg wq
+%if ARCH_X86_32
+ ALLOC_STACK -16*4
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+%else
+ pshufd m8, m3, q0000
+ pshufd m9, m3, q1111
+ pshufd m10, m3, q2222
+ pshufd m11, m3, q3333
+%endif
+.h_w8_loop0:
+ mov r6, wq
+.h_w8_loop:
+ movu m0, [srcq+r6*2- 6]
+ movu m1, [srcq+r6*2+ 2]
+ pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4
+ pshufb m0, m7 ; 2 3 3 4 4 5 5 6
+ pmaddwd m2, m8 ; abcd0
+ pmaddwd m0, m9 ; abcd1
+ pshufb m3, m1, m6 ; 4 5 5 6 6 7 7 8
+ pshufb m1, m7 ; 6 7 7 8 8 9 9 a
+ paddd m2, m4
+ paddd m0, m2
+ pmaddwd m2, m10, m3 ; abcd2
+ pmaddwd m3, m8 ; efgh0
+ paddd m0, m2
+ pmaddwd m2, m11, m1 ; abcd3
+ pmaddwd m1, m9 ; efgh1
+ paddd m0, m2
+ movu m2, [srcq+r6*2+10]
+ paddd m3, m4
+ paddd m1, m3
+ pshufb m3, m2, m6 ; 8 9 9 a a b b c
+ pshufb m2, m7 ; a b b c c d d e
+ pmaddwd m3, m10 ; efgh2
+ pmaddwd m2, m11 ; efgh3
+ paddd m1, m3
+ paddd m1, m2
+ psrad m0, 6
+ psrad m1, 6
+ packssdw m0, m1
+ pxor m1, m1
+ pminsw m0, m5
+ pmaxsw m0, m1
+ mova [dstq+r6*2], m0
+ add r6, 8
+ jl .h_w8_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w8_loop0
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovb myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+%if WIN64
+ WIN64_SPILL_XMM 15
+%endif
+ movd m7, r8m
+ movifnidn dstq, dstmp
+ movifnidn dsq, dsmp
+ punpcklbw m3, m3
+ pshufb m7, [base+pw_256]
+ psraw m3, 8 ; sign-extend
+%if ARCH_X86_32
+ ALLOC_STACK -16*7
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+%else
+ pshufd m8, m3, q0000
+ pshufd m9, m3, q1111
+ pshufd m10, m3, q2222
+ pshufd m11, m3, q3333
+%endif
+ lea r6, [ssq*3]
+ sub srcq, r6
+ cmp wd, 2
+ jne .v_w4
+.v_w2:
+ movd m1, [srcq+ssq*0]
+ movd m4, [srcq+ssq*1]
+ movd m2, [srcq+ssq*2]
+ add srcq, r6
+ movd m5, [srcq+ssq*0]
+ movd m3, [srcq+ssq*1]
+ movd m6, [srcq+ssq*2]
+ add srcq, r6
+ movd m0, [srcq+ssq*0]
+ punpckldq m1, m4 ; 0 1
+ punpckldq m4, m2 ; 1 2
+ punpckldq m2, m5 ; 2 3
+ punpckldq m5, m3 ; 3 4
+ punpckldq m3, m6 ; 4 5
+ punpckldq m6, m0 ; 5 6
+ punpcklwd m1, m4 ; 01 12
+ punpcklwd m2, m5 ; 23 34
+ punpcklwd m3, m6 ; 45 56
+ pxor m6, m6
+.v_w2_loop:
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m8, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m9 ; a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m10 ; a2 b2
+ paddd m5, m3
+ punpckldq m3, m0, m4 ; 6 7
+ movd m0, [srcq+ssq*0]
+ punpckldq m4, m0 ; 7 8
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m11, m3 ; a3 b3
+ paddd m5, m4
+ psrad m5, 5
+ packssdw m5, m5
+ pmaxsw m5, m6
+ pavgw m5, m6
+ pminsw m5, m7
+ movd [dstq+dsq*0], m5
+ pshuflw m5, m5, q3232
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+%if ARCH_X86_32
+ shl wd, 14
+%if STACK_ALIGNMENT < 16
+ mov [esp+4*29], srcq
+ mov [esp+4*30], dstq
+%else
+ mov srcmp, srcq
+%endif
+ lea wd, [wq+hq-(1<<16)]
+%else
+ shl wd, 6
+ mov r7, srcq
+ mov r8, dstq
+ lea wd, [wq+hq-(1<<8)]
+%endif
+.v_w4_loop0:
+ movq m1, [srcq+ssq*0]
+ movq m2, [srcq+ssq*1]
+ movq m3, [srcq+ssq*2]
+ add srcq, r6
+ movq m4, [srcq+ssq*0]
+ movq m5, [srcq+ssq*1]
+ movq m6, [srcq+ssq*2]
+ add srcq, r6
+ movq m0, [srcq+ssq*0]
+ punpcklwd m1, m2 ; 01
+ punpcklwd m2, m3 ; 12
+ punpcklwd m3, m4 ; 23
+ punpcklwd m4, m5 ; 34
+ punpcklwd m5, m6 ; 45
+ punpcklwd m6, m0 ; 56
+%if ARCH_X86_32
+ jmp .v_w4_loop_start
+.v_w4_loop:
+ mova m1, m12
+ mova m2, m13
+ mova m3, m14
+.v_w4_loop_start:
+ pmaddwd m1, m8 ; a0
+ pmaddwd m2, m8 ; b0
+ mova m12, m3
+ mova m13, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m1, m3
+ paddd m2, m4
+ mova m14, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m1, m5
+ paddd m2, m6
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m5, m0, m6 ; 67
+ movq m0, [srcq+ssq*0]
+ pmaddwd m3, m11, m5 ; a3
+ punpcklwd m6, m0 ; 78
+ paddd m1, m3
+ pmaddwd m3, m11, m6 ; b3
+ paddd m2, m3
+ psrad m1, 5
+ psrad m2, 5
+ packssdw m1, m2
+ pxor m2, m2
+ pmaxsw m1, m2
+ pavgw m1, m2
+ pminsw m1, m7
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+%if STACK_ALIGNMENT < 16
+ mov srcq, [esp+4*29]
+ mov dstq, [esp+4*30]
+ movzx hd, ww
+ add srcq, 8
+ add dstq, 8
+ mov [esp+4*29], srcq
+ mov [esp+4*30], dstq
+%else
+ mov srcq, srcmp
+ mov dstq, dstmp
+ movzx hd, ww
+ add srcq, 8
+ add dstq, 8
+ mov srcmp, srcq
+ mov dstmp, dstq
+%endif
+ sub wd, 1<<16
+%else
+.v_w4_loop:
+ pmaddwd m12, m8, m1 ; a0
+ pmaddwd m13, m8, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m12, m3
+ paddd m13, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m12, m5
+ paddd m13, m6
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m5, m0, m6 ; 67
+ movq m0, [srcq+ssq*0]
+ pmaddwd m14, m11, m5 ; a3
+ punpcklwd m6, m0 ; 78
+ paddd m12, m14
+ pmaddwd m14, m11, m6 ; b3
+ paddd m13, m14
+ psrad m12, 5
+ psrad m13, 5
+ packssdw m12, m13
+ pxor m13, m13
+ pmaxsw m12, m13
+ pavgw m12, m13
+ pminsw m12, m7
+ movq [dstq+dsq*0], m12
+ movhps [dstq+dsq*1], m12
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ add r7, 8
+ add r8, 8
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+%endif
+ jg .v_w4_loop0
+ RET
+.hv:
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+%if ARCH_X86_32
+ movd m4, r8m
+ mova m6, [base+pd_512]
+ pshufb m4, [base+pw_256]
+%else
+%if WIN64
+ ALLOC_STACK 16*6, 16
+%endif
+ movd m15, r8m
+ pshufb m15, [base+pw_256]
+%endif
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ je .hv_w4
+ movq m0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovb myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if ARCH_X86_32
+ mov dstq, dstmp
+ mov dsq, dsmp
+ mova m5, [base+spel_h_shuf2]
+ ALLOC_STACK -16*8
+%else
+ mova m6, [base+pd_512]
+ mova m9, [base+spel_h_shuf2]
+%endif
+ pshuflw m0, m0, q2121
+ pxor m7, m7
+ punpcklbw m7, m0
+ punpcklbw m3, m3
+ psraw m3, 8 ; sign-extend
+ test dword r8m, 0x800
+ jz .hv_w2_10bpc
+ psraw m7, 2
+ psllw m3, 2
+.hv_w2_10bpc:
+ lea r6, [ssq*3]
+ sub srcq, 2
+ sub srcq, r6
+%if ARCH_X86_32
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m9, m5
+ mova m11, m0
+ mova m12, m1
+ mova m13, m2
+ mova m14, m3
+ mova m15, m4
+%else
+ pshufd m11, m3, q0000
+ pshufd m12, m3, q1111
+ pshufd m13, m3, q2222
+ pshufd m14, m3, q3333
+%endif
+ movu m2, [srcq+ssq*0]
+ movu m3, [srcq+ssq*1]
+ movu m1, [srcq+ssq*2]
+ add srcq, r6
+ movu m4, [srcq+ssq*0]
+%if ARCH_X86_32
+ REPX {pshufb x, m5}, m2, m3, m1, m4
+%else
+ REPX {pshufb x, m9}, m2, m3, m1, m4
+%endif
+ REPX {pmaddwd x, m7}, m2, m3, m1, m4
+ phaddd m2, m3 ; 0 1
+ phaddd m1, m4 ; 2 3
+ movu m3, [srcq+ssq*1]
+ movu m4, [srcq+ssq*2]
+ add srcq, r6
+ movu m0, [srcq+ssq*0]
+%if ARCH_X86_32
+ REPX {pshufb x, m5}, m3, m4, m0
+%else
+ REPX {pshufb x, m9}, m3, m4, m0
+%endif
+ REPX {pmaddwd x, m7}, m3, m4, m0
+ phaddd m3, m4 ; 4 5
+ phaddd m0, m0 ; 6 6
+ REPX {paddd x, m6}, m2, m1, m3, m0
+ REPX {psrad x, 10}, m2, m1, m3, m0
+ packssdw m2, m1 ; 0 1 2 3
+ packssdw m3, m0 ; 4 5 6 _
+ palignr m4, m3, m2, 4 ; 1 2 3 4
+ pshufd m5, m3, q0321 ; 5 6 _ _
+ punpcklwd m1, m2, m4 ; 01 12
+ punpckhwd m2, m4 ; 23 34
+ punpcklwd m3, m5 ; 45 56
+.hv_w2_loop:
+ movu m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movu m5, [srcq+ssq*0]
+ pshufb m4, m9
+ pshufb m5, m9
+ pmaddwd m4, m7
+ pmaddwd m5, m7
+ phaddd m4, m5
+ pmaddwd m5, m11, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m12 ; a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m13 ; a2 b2
+ paddd m5, m3
+ paddd m4, m6
+ psrad m4, 10 ; 7 8
+ packssdw m0, m4
+ pshufd m3, m0, q2103
+ punpckhwd m3, m0 ; 67 78
+ mova m0, m4
+ pmaddwd m4, m14, m3 ; a3 b3
+ paddd m5, m6
+ paddd m5, m4
+ psrad m5, 10
+ packssdw m5, m5
+ pxor m4, m4
+ pminsw m5, m15
+ pmaxsw m5, m4
+ movd [dstq+dsq*0], m5
+ pshuflw m5, m5, q3232
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+.hv_w4:
+ movq m2, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovb myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+ mov dstq, dstmp
+ mov dsq, dsmp
+ mova m0, [base+spel_h_shufA]
+ mova m1, [base+spel_h_shufB]
+ ALLOC_STACK -16*15
+ mova m8, m0
+ mova m9, m1
+ mova m14, m6
+%else
+ mova m8, [base+spel_h_shufA]
+ mova m9, [base+spel_h_shufB]
+%endif
+ pxor m0, m0
+ punpcklbw m0, m2
+ punpcklbw m3, m3
+ psraw m3, 8
+ test dword r8m, 0x800
+ jz .hv_w4_10bpc
+ psraw m0, 2
+ psllw m3, 2
+.hv_w4_10bpc:
+ lea r6, [ssq*3]
+ sub srcq, 6
+ sub srcq, r6
+%if ARCH_X86_32
+ %define tmp esp+16*8
+ shl wd, 14
+%if STACK_ALIGNMENT < 16
+ mov [esp+4*61], srcq
+ mov [esp+4*62], dstq
+%else
+ mov srcmp, srcq
+%endif
+ mova [tmp+16*5], m4
+ lea wd, [wq+hq-(1<<16)]
+ pshufd m1, m0, q0000
+ pshufd m2, m0, q1111
+ pshufd m5, m0, q2222
+ pshufd m0, m0, q3333
+ mova m10, m1
+ mova m11, m2
+ mova m12, m5
+ mova m13, m0
+%else
+%if WIN64
+ %define tmp rsp
+%else
+ %define tmp rsp-104 ; red zone
+%endif
+ shl wd, 6
+ mov r7, srcq
+ mov r8, dstq
+ lea wd, [wq+hq-(1<<8)]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+ mova [tmp+16*5], m15
+%endif
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova [tmp+16*1], m0
+ mova [tmp+16*2], m1
+ mova [tmp+16*3], m2
+ mova [tmp+16*4], m3
+%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512]
+ pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4
+ pshufb m%1, m9 ; 2 3 3 4 4 5 5 6
+ pmaddwd m%3, m10
+ pmaddwd m%1, m11
+ paddd m%3, %5
+ paddd m%1, m%3
+ pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8
+ pshufb m%2, m9 ; 6 7 7 8 8 9 9 a
+ pmaddwd m%3, m12
+ pmaddwd m%2, m13
+ paddd m%1, m%3
+ paddd m%1, m%2
+ psrad m%1, %4
+%endmacro
+.hv_w4_loop0:
+%if ARCH_X86_64
+ mova m14, [pd_512]
+%endif
+ movu m4, [srcq+ssq*0+0]
+ movu m1, [srcq+ssq*0+8]
+ movu m5, [srcq+ssq*1+0]
+ movu m2, [srcq+ssq*1+8]
+ movu m6, [srcq+ssq*2+0]
+ movu m3, [srcq+ssq*2+8]
+ add srcq, r6
+ PUT_8TAP_HV_H 4, 1, 0, 10
+ PUT_8TAP_HV_H 5, 2, 0, 10
+ PUT_8TAP_HV_H 6, 3, 0, 10
+ movu m7, [srcq+ssq*0+0]
+ movu m2, [srcq+ssq*0+8]
+ movu m1, [srcq+ssq*1+0]
+ movu m3, [srcq+ssq*1+8]
+ PUT_8TAP_HV_H 7, 2, 0, 10
+ PUT_8TAP_HV_H 1, 3, 0, 10
+ movu m2, [srcq+ssq*2+0]
+ movu m3, [srcq+ssq*2+8]
+ add srcq, r6
+ PUT_8TAP_HV_H 2, 3, 0, 10
+ packssdw m4, m7 ; 0 3
+ packssdw m5, m1 ; 1 4
+ movu m0, [srcq+ssq*0+0]
+ movu m1, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 1, 3, 10
+ packssdw m6, m2 ; 2 5
+ packssdw m7, m0 ; 3 6
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+%if ARCH_X86_32
+ jmp .hv_w4_loop_start
+.hv_w4_loop:
+ mova m1, [tmp+16*6]
+ mova m2, m15
+.hv_w4_loop_start:
+ mova m7, [tmp+16*1]
+ pmaddwd m1, m7 ; a0
+ pmaddwd m2, m7 ; b0
+ mova m7, [tmp+16*2]
+ mova [tmp+16*6], m3
+ pmaddwd m3, m7 ; a1
+ mova m15, m4
+ pmaddwd m4, m7 ; b1
+ mova m7, [tmp+16*3]
+ paddd m1, m3
+ paddd m2, m4
+ mova m3, m5
+ pmaddwd m5, m7 ; a2
+ mova m4, m6
+ pmaddwd m6, m7 ; b2
+ paddd m1, m5
+ paddd m2, m6
+ movu m7, [srcq+ssq*1+0]
+ movu m5, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 7, 5, 6, 10
+ packssdw m0, m7 ; 6 7
+ mova [tmp+16*0], m0
+ movu m0, [srcq+ssq*0+0]
+ movu m5, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 5, 6, 10
+ mova m6, [tmp+16*0]
+ packssdw m7, m0 ; 7 8
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, [tmp+16*4]
+ paddd m1, m7 ; a3
+ pmaddwd m7, m6, [tmp+16*4]
+ paddd m2, m7 ; b3
+ psrad m1, 9
+ psrad m2, 9
+ packssdw m1, m2
+ pxor m7, m7
+ pmaxsw m1, m7
+ pavgw m7, m1
+ pminsw m7, [tmp+16*5]
+ movq [dstq+dsq*0], m7
+ movhps [dstq+dsq*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+%if STACK_ALIGNMENT < 16
+ mov srcq, [esp+4*61]
+ mov dstq, [esp+4*62]
+ add srcq, 8
+ add dstq, 8
+ mov [esp+4*61], srcq
+ mov [esp+4*62], dstq
+%else
+ mov srcq, srcmp
+ mov dstq, dstmp
+ add srcq, 8
+ add dstq, 8
+ mov srcmp, srcq
+ mov dstmp, dstq
+%endif
+ movzx hd, ww
+ sub wd, 1<<16
+%else
+.hv_w4_loop:
+ mova m15, [tmp+16*1]
+ pmaddwd m14, m15, m1 ; a0
+ pmaddwd m15, m2 ; b0
+ mova m7, [tmp+16*2]
+ mova m1, m3
+ pmaddwd m3, m7 ; a1
+ mova m2, m4
+ pmaddwd m4, m7 ; b1
+ mova m7, [tmp+16*3]
+ paddd m14, m3
+ paddd m15, m4
+ mova m3, m5
+ pmaddwd m5, m7 ; a2
+ mova m4, m6
+ pmaddwd m6, m7 ; b2
+ paddd m14, m5
+ paddd m15, m6
+ movu m7, [srcq+ssq*1+0]
+ movu m5, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512]
+ packssdw m0, m7 ; 6 7
+ mova [tmp+16*0], m0
+ movu m0, [srcq+ssq*0+0]
+ movu m5, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512]
+ mova m6, [tmp+16*0]
+ packssdw m7, m0 ; 7 8
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, [tmp+16*4]
+ paddd m14, m7 ; a3
+ pmaddwd m7, m6, [tmp+16*4]
+ paddd m15, m7 ; b3
+ psrad m14, 9
+ psrad m15, 9
+ packssdw m14, m15
+ pxor m7, m7
+ pmaxsw m14, m7
+ pavgw m7, m14
+ pminsw m7, [tmp+16*5]
+ movq [dstq+dsq*0], m7
+ movhps [dstq+dsq*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ add r7, 8
+ add r8, 8
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+%endif
+ jg .hv_w4_loop0
+ RET
+%undef tmp
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 2, 1, 6, 4
+%elif WIN64
+DECLARE_REG_TMP 6, 4, 7, 4
+%else
+DECLARE_REG_TMP 6, 7, 7, 8
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
+%if ARCH_X86_32
+cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my
+%define mxb r0b
+%define mxd r0
+%define mxq r0
+%define myb r2b
+%define myd r2
+%define myq r2
+%else
+cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
+%endif
+%define base t2-prep_ssse3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ LEA t2, prep_ssse3
+ movifnidn wd, wm
+ movifnidn srcq, srcmp
+ test mxd, 0xf00
+ jnz .h
+ movifnidn hd, hm
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ mov myd, r7m ; bitdepth_max
+ movzx wd, word [base+prep_ssse3_table+wq*2]
+ mova m5, [base+pw_8192]
+ shr myd, 11
+ add wq, t2
+ movddup m4, [base+prep_mul+myq*8]
+ movifnidn ssq, ssmp
+ movifnidn tmpq, tmpmp
+ lea r6, [ssq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ movifnidn ssq, r2mp
+ movifnidn hd, r4m
+ movddup m5, [base+prep_8tap_1d_rnd]
+ cmp wd, 4
+ jne .h_w8
+ movzx mxd, mxb
+ movq m0, [base+subpel_filters+mxq*8]
+ mova m3, [base+spel_h_shufA]
+ mova m4, [base+spel_h_shufB]
+ movifnidn tmpq, tmpmp
+ sub srcq, 2
+ WIN64_SPILL_XMM 8
+ punpcklbw m0, m0
+ psraw m0, 8
+ test dword r7m, 0x800
+ jnz .h_w4_12bpc
+ psllw m0, 2
+.h_w4_12bpc:
+ pshufd m6, m0, q1111
+ pshufd m7, m0, q2222
+.h_w4_loop:
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4
+ pshufb m1, m4 ; 2 3 3 4 4 5 5 6
+ pmaddwd m0, m6
+ pmaddwd m1, m7
+ paddd m0, m5
+ paddd m0, m1
+ pshufb m1, m2, m3
+ pshufb m2, m4
+ pmaddwd m1, m6
+ pmaddwd m2, m7
+ paddd m1, m5
+ paddd m1, m2
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova [tmpq], m0
+ add tmpq, 16
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ WIN64_SPILL_XMM 11
+ shr mxd, 16
+ movq m2, [base+subpel_filters+mxq*8]
+ mova m4, [base+spel_h_shufA]
+ mova m6, [base+spel_h_shufB]
+ movifnidn tmpq, r0mp
+ add wd, wd
+ punpcklbw m2, m2
+ add srcq, wq
+ psraw m2, 8
+ add tmpq, wq
+ neg wq
+ test dword r7m, 0x800
+ jnz .h_w8_12bpc
+ psllw m2, 2
+.h_w8_12bpc:
+ pshufd m7, m2, q0000
+%if ARCH_X86_32
+ ALLOC_STACK -16*3
+ pshufd m0, m2, q1111
+ pshufd m1, m2, q2222
+ pshufd m2, m2, q3333
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+%else
+ pshufd m8, m2, q1111
+ pshufd m9, m2, q2222
+ pshufd m10, m2, q3333
+%endif
+.h_w8_loop0:
+ mov r6, wq
+.h_w8_loop:
+ movu m0, [srcq+r6- 6]
+ movu m1, [srcq+r6+ 2]
+ pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4
+ pshufb m0, m6 ; 2 3 3 4 4 5 5 6
+ pmaddwd m2, m7 ; abcd0
+ pmaddwd m0, m8 ; abcd1
+ pshufb m3, m1, m4 ; 4 5 5 6 6 7 7 8
+ pshufb m1, m6 ; 6 7 7 8 8 9 9 a
+ paddd m2, m5
+ paddd m0, m2
+ pmaddwd m2, m9, m3 ; abcd2
+ pmaddwd m3, m7 ; efgh0
+ paddd m0, m2
+ pmaddwd m2, m10, m1 ; abcd3
+ pmaddwd m1, m8 ; efgh1
+ paddd m0, m2
+ movu m2, [srcq+r6+10]
+ paddd m3, m5
+ paddd m1, m3
+ pshufb m3, m2, m4 ; 8 9 9 a a b b c
+ pshufb m2, m6 ; a b b c c d d e
+ pmaddwd m3, m9 ; efgh2
+ pmaddwd m2, m10 ; efgh3
+ paddd m1, m3
+ paddd m1, m2
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova [tmpq+r6], m0
+ add r6, 16
+ jl .h_w8_loop
+ add srcq, ssq
+ sub tmpq, wq
+ dec hd
+ jg .h_w8_loop0
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+ WIN64_SPILL_XMM 15
+ movddup m7, [base+prep_8tap_1d_rnd]
+ movifnidn ssq, r2mp
+ movifnidn tmpq, r0mp
+ punpcklbw m3, m3
+ psraw m3, 8 ; sign-extend
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m3, 2
+.v_12bpc:
+%if ARCH_X86_32
+ ALLOC_STACK -16*7
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+%else
+ pshufd m8, m3, q0000
+ pshufd m9, m3, q1111
+ pshufd m10, m3, q2222
+ pshufd m11, m3, q3333
+%endif
+ lea r6, [ssq*3]
+ sub srcq, r6
+ mov r6d, wd
+ shl wd, 6
+ mov r5, srcq
+%if ARCH_X86_64
+ mov r7, tmpq
+%elif STACK_ALIGNMENT < 16
+ mov [esp+4*29], tmpq
+%endif
+ lea wd, [wq+hq-(1<<8)]
+.v_loop0:
+ movq m1, [srcq+ssq*0]
+ movq m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq m3, [srcq+ssq*0]
+ movq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq m5, [srcq+ssq*0]
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq m0, [srcq+ssq*0]
+ punpcklwd m1, m2 ; 01
+ punpcklwd m2, m3 ; 12
+ punpcklwd m3, m4 ; 23
+ punpcklwd m4, m5 ; 34
+ punpcklwd m5, m6 ; 45
+ punpcklwd m6, m0 ; 56
+%if ARCH_X86_32
+ jmp .v_loop_start
+.v_loop:
+ mova m1, m12
+ mova m2, m13
+ mova m3, m14
+.v_loop_start:
+ pmaddwd m1, m8 ; a0
+ pmaddwd m2, m8 ; b0
+ mova m12, m3
+ mova m13, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m1, m3
+ paddd m2, m4
+ mova m14, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m1, m5
+ paddd m2, m6
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m5, m0, m6 ; 67
+ movq m0, [srcq+ssq*0]
+ pmaddwd m3, m11, m5 ; a3
+ punpcklwd m6, m0 ; 78
+ paddd m1, m7
+ paddd m1, m3
+ pmaddwd m3, m11, m6 ; b3
+ paddd m2, m7
+ paddd m2, m3
+ psrad m1, 4
+ psrad m2, 4
+ packssdw m1, m2
+ movq [tmpq+r6*0], m1
+ movhps [tmpq+r6*2], m1
+ lea tmpq, [tmpq+r6*4]
+ sub hd, 2
+ jg .v_loop
+%if STACK_ALIGNMENT < 16
+ mov tmpq, [esp+4*29]
+ add r5, 8
+ add tmpq, 8
+ mov srcq, r5
+ mov [esp+4*29], tmpq
+%else
+ mov tmpq, tmpmp
+ add r5, 8
+ add tmpq, 8
+ mov srcq, r5
+ mov tmpmp, tmpq
+%endif
+%else
+.v_loop:
+ pmaddwd m12, m8, m1 ; a0
+ pmaddwd m13, m8, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m12, m3
+ paddd m13, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m12, m5
+ paddd m13, m6
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m5, m0, m6 ; 67
+ movq m0, [srcq+ssq*0]
+ pmaddwd m14, m11, m5 ; a3
+ punpcklwd m6, m0 ; 78
+ paddd m12, m7
+ paddd m12, m14
+ pmaddwd m14, m11, m6 ; b3
+ paddd m13, m7
+ paddd m13, m14
+ psrad m12, 4
+ psrad m13, 4
+ packssdw m12, m13
+ movq [tmpq+r6*0], m12
+ movhps [tmpq+r6*2], m12
+ lea tmpq, [tmpq+r6*4]
+ sub hd, 2
+ jg .v_loop
+ add r5, 8
+ add r7, 8
+ mov srcq, r5
+ mov tmpq, r7
+%endif
+ movzx hd, wb
+ sub wd, 1<<8
+ jg .v_loop0
+ RET
+.hv:
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+ movzx t3d, mxb
+ shr mxd, 16
+ cmp wd, 4
+ cmove mxd, t3d
+ movifnidn hd, r4m
+ movq m2, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if ARCH_X86_32
+ mov ssq, r2mp
+ mov tmpq, r0mp
+ mova m0, [base+spel_h_shufA]
+ mova m1, [base+spel_h_shufB]
+ mova m4, [base+prep_8tap_2d_rnd]
+ ALLOC_STACK -16*14
+ mova m8, m0
+ mova m9, m1
+ mova m14, m4
+%else
+%if WIN64
+ ALLOC_STACK 16*6, 16
+%endif
+ mova m8, [base+spel_h_shufA]
+ mova m9, [base+spel_h_shufB]
+%endif
+ pxor m0, m0
+ punpcklbw m0, m2
+ punpcklbw m3, m3
+ psraw m0, 4
+ psraw m3, 8
+ test dword r7m, 0x800
+ jz .hv_10bpc
+ psraw m0, 2
+.hv_10bpc:
+ lea r6, [ssq*3]
+ sub srcq, 6
+ sub srcq, r6
+ mov r6d, wd
+ shl wd, 6
+ mov r5, srcq
+%if ARCH_X86_32
+ %define tmp esp+16*8
+%if STACK_ALIGNMENT < 16
+ mov [esp+4*61], tmpq
+%endif
+ pshufd m1, m0, q0000
+ pshufd m2, m0, q1111
+ pshufd m5, m0, q2222
+ pshufd m0, m0, q3333
+ mova m10, m1
+ mova m11, m2
+ mova m12, m5
+ mova m13, m0
+%else
+%if WIN64
+ %define tmp rsp
+%else
+ %define tmp rsp-88 ; red zone
+%endif
+ mov r7, tmpq
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+%endif
+ lea wd, [wq+hq-(1<<8)]
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova [tmp+16*1], m0
+ mova [tmp+16*2], m1
+ mova [tmp+16*3], m2
+ mova [tmp+16*4], m3
+.hv_loop0:
+%if ARCH_X86_64
+ mova m14, [prep_8tap_2d_rnd]
+%endif
+ movu m4, [srcq+ssq*0+0]
+ movu m1, [srcq+ssq*0+8]
+ movu m5, [srcq+ssq*1+0]
+ movu m2, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ movu m6, [srcq+ssq*0+0]
+ movu m3, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 4, 1, 0, 6
+ PUT_8TAP_HV_H 5, 2, 0, 6
+ PUT_8TAP_HV_H 6, 3, 0, 6
+ movu m7, [srcq+ssq*1+0]
+ movu m2, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ movu m1, [srcq+ssq*0+0]
+ movu m3, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 7, 2, 0, 6
+ PUT_8TAP_HV_H 1, 3, 0, 6
+ movu m2, [srcq+ssq*1+0]
+ movu m3, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 2, 3, 0, 6
+ packssdw m4, m7 ; 0 3
+ packssdw m5, m1 ; 1 4
+ movu m0, [srcq+ssq*0+0]
+ movu m1, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 1, 3, 6
+ packssdw m6, m2 ; 2 5
+ packssdw m7, m0 ; 3 6
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+%if ARCH_X86_32
+ jmp .hv_loop_start
+.hv_loop:
+ mova m1, [tmp+16*5]
+ mova m2, m15
+.hv_loop_start:
+ mova m7, [tmp+16*1]
+ pmaddwd m1, m7 ; a0
+ pmaddwd m2, m7 ; b0
+ mova m7, [tmp+16*2]
+ mova [tmp+16*5], m3
+ pmaddwd m3, m7 ; a1
+ mova m15, m4
+ pmaddwd m4, m7 ; b1
+ mova m7, [tmp+16*3]
+ paddd m1, m14
+ paddd m2, m14
+ paddd m1, m3
+ paddd m2, m4
+ mova m3, m5
+ pmaddwd m5, m7 ; a2
+ mova m4, m6
+ pmaddwd m6, m7 ; b2
+ paddd m1, m5
+ paddd m2, m6
+ movu m7, [srcq+ssq*1+0]
+ movu m5, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 7, 5, 6, 6
+ packssdw m0, m7 ; 6 7
+ mova [tmp+16*0], m0
+ movu m0, [srcq+ssq*0+0]
+ movu m5, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 5, 6, 6
+ mova m6, [tmp+16*0]
+ packssdw m7, m0 ; 7 8
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, [tmp+16*4]
+ paddd m1, m7 ; a3
+ pmaddwd m7, m6, [tmp+16*4]
+ paddd m2, m7 ; b3
+ psrad m1, 6
+ psrad m2, 6
+ packssdw m1, m2
+ movq [tmpq+r6*0], m1
+ movhps [tmpq+r6*2], m1
+ lea tmpq, [tmpq+r6*4]
+ sub hd, 2
+ jg .hv_loop
+%if STACK_ALIGNMENT < 16
+ mov tmpq, [esp+4*61]
+ add r5, 8
+ add tmpq, 8
+ mov srcq, r5
+ mov [esp+4*61], tmpq
+%else
+ mov tmpq, tmpmp
+ add r5, 8
+ add tmpq, 8
+ mov srcq, r5
+ mov tmpmp, tmpq
+%endif
+%else
+.hv_loop:
+ mova m15, [tmp+16*1]
+ mova m7, [prep_8tap_2d_rnd]
+ pmaddwd m14, m15, m1 ; a0
+ pmaddwd m15, m2 ; b0
+ paddd m14, m7
+ paddd m15, m7
+ mova m7, [tmp+16*2]
+ mova m1, m3
+ pmaddwd m3, m7 ; a1
+ mova m2, m4
+ pmaddwd m4, m7 ; b1
+ mova m7, [tmp+16*3]
+ paddd m14, m3
+ paddd m15, m4
+ mova m3, m5
+ pmaddwd m5, m7 ; a2
+ mova m4, m6
+ pmaddwd m6, m7 ; b2
+ paddd m14, m5
+ paddd m15, m6
+ movu m7, [srcq+ssq*1+0]
+ movu m5, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 7, 5, 6, 6, [prep_8tap_2d_rnd]
+ packssdw m0, m7 ; 6 7
+ mova [tmp+16*0], m0
+ movu m0, [srcq+ssq*0+0]
+ movu m5, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 5, 6, 6, [prep_8tap_2d_rnd]
+ mova m6, [tmp+16*0]
+ packssdw m7, m0 ; 7 8
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, [tmp+16*4]
+ paddd m14, m7 ; a3
+ pmaddwd m7, m6, [tmp+16*4]
+ paddd m15, m7 ; b3
+ psrad m14, 6
+ psrad m15, 6
+ packssdw m14, m15
+ movq [tmpq+r6*0], m14
+ movhps [tmpq+r6*2], m14
+ lea tmpq, [tmpq+r6*4]
+ sub hd, 2
+ jg .hv_loop
+ add r5, 8
+ add r7, 8
+ mov srcq, r5
+ mov tmpq, r7
+%endif
+ movzx hd, wb
+ sub wd, 1<<8
+ jg .hv_loop0
+ RET
+%undef tmp
+
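+; movifprep: mov that is only emitted in the prep variant (no-op for put)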
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
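+; SAVE_REG: stash the current r%1 aliases so LOAD_REG can restore them later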
+%macro SAVE_REG 1
+ %xdefine r%1_save r%1
+ %xdefine r%1q_save r%1q
+ %xdefine r%1d_save r%1d
+ %if ARCH_X86_32
+ %define r%1m_save [rstk+stack_offset+(%1+1)*4]
+ %endif
+%endmacro
+
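+; LOAD_REG: restore the r%1 aliases previously stashed by SAVE_REG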
+%macro LOAD_REG 1
+ %xdefine r%1 r%1_save
+ %xdefine r%1q r%1q_save
+ %xdefine r%1d r%1d_save
+ %if ARCH_X86_32
+ %define r%1m r%1m_save
+ %endif
+ %undef r%1d_save
+ %undef r%1q_save
+ %undef r%1_save
+%endmacro
+
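+; REMAP_REG: alias r%1 to r%2; on x86-32, %3 selects whether r%1m follows r%2m
+; or keeps pointing at argument %1's stack slot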
+%macro REMAP_REG 2-3
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+ %if ARCH_X86_32
+ %if %3 == 0
+ %xdefine r%1m r%2m
+ %else
+ %define r%1m [rstk+stack_offset+(%1+1)*4]
+ %endif
+ %endif
+%endmacro
+
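+; the prep variant takes one argument fewer than put, so its register aliases
+; are shifted down by one (and shifted back afterwards) to let both variants
+; share the same code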
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %if ARCH_X86_64
+ SAVE_REG 14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %else
+ SAVE_REG 5
+ %assign %%i 5
+ %rep 5
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j, 0
+ %assign %%i %%i-1
+ %endrep
+ %endif
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %if ARCH_X86_64
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ LOAD_REG 14
+ %else
+ %rep 4
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j, 1
+ %assign %%i %%i+1
+ %endrep
+ LOAD_REG 5
+ %endif
+ %endif
+%endmacro
+
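+; MC_8TAP_SCALED_RET: restore the default register mapping and return; with
+; %1 != 0 the prep remapping is re-applied for the code assembled after RET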
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
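+; MC_4TAP_SCALED_H: horizontally filter two rows (4 output pixels each) with
+; the 4-tap filters in m13/m15 and store the packed rows at [stk+%1]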
+%if ARCH_X86_32
+ %macro MC_4TAP_SCALED_H 1 ; dst_mem
+ movu m7, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ movu m5, [r4 +ssq*0]
+ movu m6, [r4 +ssq*1]
+ lea srcq, [srcq+ssq*2]
+ lea r4, [r4 +ssq*2]
+ REPX {pshufb x, m12}, m7, m2
+ REPX {pmaddwd x, m13}, m7, m2
+ REPX {pshufb x, m14}, m5, m6
+ REPX {pmaddwd x, m15}, m5, m6
+ phaddd m7, m5
+ phaddd m2, m6
+ mova m5, [esp+0x00]
+ movd m6, [esp+0x10]
+ paddd m7, m5
+ paddd m2, m5
+ psrad m7, m6
+ psrad m2, m6
+ packssdw m7, m2
+ mova [stk+%1], m7
+ %endmacro
+%endif
+
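+; MC_8TAP_SCALED_H: horizontally filter one row of 8 output pixels using the
+; per-pixel 8-tap filters kept on the stack (result in m%1 on x86-64; in m0,
+; optionally stored at [stk+%2], on x86-32)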
+%if ARCH_X86_64
+ %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
+ movu m%1, [srcq+ r4*2]
+ movu m%2, [srcq+ r6*2]
+ movu m%3, [srcq+ r7*2]
+ movu m%4, [srcq+ r9*2]
+ movu m%5, [srcq+r10*2]
+ movu m%6, [srcq+r11*2]
+ movu m%7, [srcq+r13*2]
+ movu m%8, [srcq+ rX*2]
+ add srcq, ssq
+ pmaddwd m%1, [stk+0x10]
+ pmaddwd m%2, [stk+0x20]
+ pmaddwd m%3, [stk+0x30]
+ pmaddwd m%4, [stk+0x40]
+ pmaddwd m%5, [stk+0x50]
+ pmaddwd m%6, [stk+0x60]
+ pmaddwd m%7, [stk+0x70]
+ pmaddwd m%8, [stk+0x80]
+ phaddd m%1, m%2
+ phaddd m%3, m%4
+ phaddd m%5, m%6
+ phaddd m%7, m%8
+ phaddd m%1, m%3
+ phaddd m%5, m%7
+ paddd m%1, hround
+ paddd m%5, hround
+ psrad m%1, m12
+ psrad m%5, m12
+ packssdw m%1, m%5
+ %endmacro
+%else
+ %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets
+ %if %3 == 1
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ %endif
+ movu m0, [srcq+r0*2]
+ movu m1, [srcq+rX*2]
+ movu m2, [srcq+r4*2]
+ movu m3, [srcq+r5*2]
+ mov r0, [stk+16]
+ mov rX, [stk+20]
+ mov r4, [stk+24]
+ mov r5, [stk+28]
+ pmaddwd m0, [stk+%1+0x00]
+ pmaddwd m1, [stk+%1+0x10]
+ pmaddwd m2, [stk+%1+0x20]
+ pmaddwd m3, [stk+%1+0x30]
+ phaddd m0, m1
+ phaddd m2, m3
+ movu m4, [srcq+r0*2]
+ movu m5, [srcq+rX*2]
+ movu m6, [srcq+r4*2]
+ movu m7, [srcq+r5*2]
+ add srcq, ssq
+ pmaddwd m4, [stk+%1+0xa0]
+ pmaddwd m5, [stk+%1+0xb0]
+ pmaddwd m6, [stk+%1+0xc0]
+ pmaddwd m7, [stk+%1+0xd0]
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m0, m2
+ phaddd m4, m6
+ paddd m0, hround
+ paddd m4, hround
+ psrad m0, m12
+ psrad m4, m12
+ packssdw m0, m4
+ %if %2 != 0
+ mova [stk+%2], m0
+ %endif
+ %endmacro
+%endif
+
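+; MC_8TAP_SCALED: instantiates put_8tap_scaled_16bpc or prep_8tap_scaled_16bpc
+; depending on %1 (put/prep)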
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isput 1
+ %assign isprep 0
+ %if ARCH_X86_64
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %else
+cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %endif
+ %else ; ARCH_X86_32
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %else
+cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %endif
+ %endif
+ %xdefine base_reg r12
+%else ; prep
+ %assign isput 0
+ %assign isprep 1
+ %if ARCH_X86_64
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %xdefine tmp_stridem r14q
+ %else
+cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %define tmp_stridem qword [stk+0x138]
+ %endif
+ %xdefine base_reg r11
+ %else ; ARCH_X86_32
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %else
+cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %endif
+ %define tmp_stridem dword [stk+0x138]
+ %endif
+%endif
+%if ARCH_X86_32
+ mov [esp+0x1f0], t0d
+ mov [esp+0x1f4], t1d
+ %if isput && required_stack_alignment > STACK_ALIGNMENT
+ mov dstd, dstm
+ mov dsd, dsm
+ mov srcd, srcm
+ mov ssd, ssm
+ mov hd, hm
+ mov r4, mxm
+ %define r0m [esp+0x200]
+ %define dsm [esp+0x204]
+ %define dsmp dsm
+ %define r1m dsm
+ %define r2m [esp+0x208]
+ %define ssm [esp+0x20c]
+ %define r3m ssm
+ %define hm [esp+0x210]
+ %define mxm [esp+0x214]
+ mov r0m, dstd
+ mov dsm, dsd
+ mov r2m, srcd
+ mov ssm, ssd
+ mov hm, hd
+ mov r0, mym
+ mov r1, dxm
+ mov r2, dym
+ %define mym [esp+0x218]
+ %define dxm [esp+0x21c]
+ %define dym [esp+0x220]
+ mov mxm, r4
+ mov mym, r0
+ mov dxm, r1
+ mov dym, r2
+ tzcnt wd, wm
+ %endif
+ %if isput
+ mov r3, pxmaxm
+ %define pxmaxm r3
+ %else
+ mov r2, pxmaxm
+ %endif
+ %if isprep && required_stack_alignment > STACK_ALIGNMENT
+ %xdefine base_reg r5
+ %else
+ %xdefine base_reg r6
+ %endif
+%endif
+ LEA base_reg, %1_8tap_scaled_16bpc_ssse3
+%xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3
+%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
+ tzcnt wd, wm
+%endif
+%if ARCH_X86_64
+ %if isput
+ mov r7d, pxmaxm
+ %endif
+%else
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+%endif
+ movd m8, dxm
+ movd m14, mxm
+%if isput
+ movd m15, pxmaxm
+%endif
+ pshufd m8, m8, q0000
+ pshufd m14, m14, q0000
+%if isput
+ pshuflw m15, m15, q0000
+ punpcklqdq m15, m15
+%endif
+%if isprep
+ %if UNIX64
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+ %endif
+ %if ARCH_X86_64
+ mov r6d, pxmaxm
+ %endif
+%endif
+%if ARCH_X86_64
+ mov dyd, dym
+%endif
+%if isput
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %elif ARCH_X86_64
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %else
+ %endif
+ %if ARCH_X86_64
+ %if required_stack_alignment > STACK_ALIGNMENT
+ %define dsm [rsp+0x138]
+ %define rX r1
+ %define rXd r1d
+ %else
+ %define dsm dsq
+ %define rX r14
+ %define rXd r14d
+ %endif
+ %else
+ %define rX r1
+ %endif
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %elif ARCH_X86_64
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %xdefine hm r7m
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %if ARCH_X86_64
+ %define rX r14
+ %define rXd r14d
+ %else
+ %define rX r3
+ %endif
+%endif
+%if ARCH_X86_64
+ shr r7d, 11
+ mova m10, [base+pd_0x3ff]
+ movddup m11, [base+s_8tap_h_rnd+r7*8]
+ movd m12, [base+s_8tap_h_sh+r7*4]
+ %if isput
+ movddup m13, [base+put_s_8tap_v_rnd+r7*8]
+ movd m7, [base+put_s_8tap_v_sh+r7*4]
+ %define pxmaxm [rsp]
+ mova pxmaxm, m15
+ punpcklqdq m12, m7
+ %endif
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m11 [esp+0x00]
+ %define m12 [esp+0x10]
+ shr r3, 11
+ movddup m1, [base+s_8tap_h_rnd+r3*8]
+ movd m2, [base+s_8tap_h_sh+r3*4]
+ %if isput
+ %define m13 [esp+0x20]
+ %define pxmaxm [esp+0x30]
+ %define stk esp+0x40
+ movddup m5, [base+put_s_8tap_v_rnd+r3*8]
+ movd m6, [base+put_s_8tap_v_sh+r3*4]
+ mova pxmaxm, m15
+ punpcklqdq m2, m6
+ mova m13, m5
+ %else
+ %define m13 [base+pd_m524256]
+ %endif
+ mov ssd, ssm
+ mova m11, m1
+ mova m12, m2
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ mov r1, [esp+0x1f4]
+ lea r0, [ssd*3]
+ movzx r2, r1b
+ shr r1, 16
+ cmp dword hm, 6
+ cmovs r1, r2
+ mov [esp+0x1f4], r1
+ %if isprep
+ mov r1, r1m
+ %endif
+ mov r2, r2m
+ sub srcq, r0
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define ss3q r0
+ %define myd r4
+ %define dyd dword dym
+ %define hd dword hm
+%endif
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+ %else
+ movzx r4, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r4
+ %endif
+ pxor m9, m9
+ punpckldq m9, m8
+ paddd m14, m9 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ pshufd m15, m15, q0321
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_q]
+ mova m6, [base+spel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m2, m2
+ pcmpeqd m8, m2
+ psrld m14, 10
+ paddd m14, m14
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [stk], m14
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m15 m6
+ %endif
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpckldq m15, m7
+ %if ARCH_X86_64
+ pshufb m14, m5
+ paddb m14, m6
+ pand m9, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m9
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ movu m7, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ %else
+ pand m7, m5, [base+pd_0x4000]
+ pandn m5, m15
+ por m5, m7
+ %define m15 m5
+ %endif
+ punpcklbw m15, m15
+ psraw m15, 8
+ REPX {pshufb x, m14}, m0, m1, m2, m3
+ REPX {pmaddwd x, m15}, m0, m1, m2, m3
+ %if ARCH_X86_64
+ REPX {pshufb x, m14}, m4, m5, m6, m7
+ REPX {pmaddwd x, m15}, m4, m5, m6, m7
+ phaddd m0, m1
+ phaddd m2, m3
+ phaddd m4, m5
+ phaddd m6, m7
+ REPX {paddd x, m11}, m0, m2, m4, m6
+ REPX {psrad x, m12}, m0, m2, m4, m6
+ packssdw m0, m2 ; 0 1 2 3
+ packssdw m4, m6 ; 4 5 6 7
+ SWAP m1, m4
+ %else
+ mova [stk+0x10], m15
+ phaddd m0, m1
+ phaddd m2, m3
+ movu m1, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m14}, m1, m7, m6, m3
+ REPX {pmaddwd x, m15}, m1, m7, m6, m3
+ phaddd m1, m7
+ phaddd m6, m3
+ REPX {paddd x, m11}, m0, m2, m1, m6
+ REPX {psrad x, m12}, m0, m2, m1, m6
+ packssdw m0, m2
+ packssdw m1, m6
+ %define m14 [stk+0x00]
+ %define m15 [stk+0x10]
+ %endif
+ palignr m2, m1, m0, 4 ; 1 2 3 4
+ punpcklwd m3, m0, m2 ; 01 12
+ punpckhwd m0, m2 ; 23 34
+ pshufd m5, m1, q0321 ; 5 6 7 _
+ punpcklwd m2, m1, m5 ; 45 56
+ punpckhwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mov myd, mym
+ mov r0, r0m
+ mova [stk+0x20], m3
+ mova [stk+0x30], m0
+ mova [stk+0x40], m2
+ mova [stk+0x50], m4
+ %endif
+.w2_loop:
+ and myd, 0x3ff
+ %if ARCH_X86_64
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m10, r6q
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pmaddwd m5, m3, m7
+ pmaddwd m6, m0, m8
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ pmaddwd m7, m2, m9
+ pmaddwd m8, m4, m10
+ paddd m5, m6
+ paddd m7, m8
+ %else
+ mov r1, [esp+0x1f4]
+ xor r3, r3
+ mov r5, myd
+ shr r5, 6
+ lea r1, [r1+r5]
+ mov r5, 64 << 24
+ cmovnz r3, [base+subpel_filters+r1*8+4]
+ cmovnz r5, [base+subpel_filters+r1*8+0]
+ movd m6, r3
+ movd m7, r5
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m5, m7, q0000
+ pshufd m6, m7, q1111
+ pmaddwd m3, m5
+ pmaddwd m0, m6
+ pshufd m5, m7, q2222
+ pshufd m7, m7, q3333
+ pmaddwd m2, m5
+ pmaddwd m4, m7
+ paddd m3, m0
+ paddd m2, m4
+ SWAP m5, m3
+ SWAP m7, m2
+ %define m8 m3
+ %endif
+ paddd m5, m13
+ pshufd m6, m12, q1032
+ pxor m8, m8
+ paddd m5, m7
+ psrad m5, m6
+ packssdw m5, m5
+ pmaxsw m5, m8
+ pminsw m5, pxmaxm
+ movd [dstq], m5
+ add dstq, dsmp
+ dec hd
+ jz .ret
+ %if ARCH_X86_64
+ add myd, dyd
+ %else
+ add myd, dym
+ %endif
+ test myd, ~0x3ff
+ %if ARCH_X86_32
+ SWAP m3, m5
+ SWAP m2, m7
+ mova m3, [stk+0x20]
+ mova m0, [stk+0x30]
+ mova m2, [stk+0x40]
+ mova m4, [stk+0x50]
+ %endif
+ jz .w2_loop
+ %if ARCH_X86_32
+ mov r3, r3m
+ %endif
+ movu m5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps m3, m0, q1032 ; 01 12
+ shufps m0, m2, q1032 ; 23 34
+ shufps m2, m4, q1032 ; 45 56
+ pshufb m5, m14
+ pmaddwd m5, m15
+ phaddd m5, m5
+ paddd m5, m11
+ psrad m5, m12
+ packssdw m5, m5
+ palignr m4, m5, m1, 12
+ punpcklqdq m1, m4, m4 ; 6 7 6 7
+ punpcklwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mova [stk+0x20], m3
+ mova [stk+0x30], m0
+ mova [stk+0x40], m2
+ mova [stk+0x50], m4
+ %endif
+ jmp .w2_loop
+.w2_skip_line:
+ movu m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m3, m0 ; 01 12
+ mova m0, m2 ; 23 34
+ pshufb m5, m14
+ pshufb m6, m14
+ pmaddwd m5, m15
+ pmaddwd m6, m15
+ phaddd m5, m6
+ paddd m5, m11
+ psrad m5, m12
+ packssdw m5, m5 ; 6 7 6 7
+ punpckhqdq m1, m5 ; 4 5 6 7
+ pshufd m5, m1, q0321 ; 5 6 7 _
+ punpcklwd m2, m1, m5 ; 45 56
+ punpckhwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mova [stk+0x20], m3
+ mova [stk+0x30], m0
+ mova [stk+0x40], m2
+ mova [stk+0x50], m4
+ %endif
+ jmp .w2_loop
+%endif
+INIT_XMM ssse3
+.w4:
+%if ARCH_X86_64
+ mov myd, mym
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %if isput
+ mova [rsp+0x30], m13
+ %endif
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+%else
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ movzx r4, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r4
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+%else
+ %define m9 [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd m15, m0
+ pshufd m7, m15, q1032
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd r6d, m15
+ movd r13d, m7
+ mova m10, [base+bdct_lb_q+ 0]
+ mova m11, [base+bdct_lb_q+16]
+ movd m13, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+ r6*8+2]
+ movd m15, [base+subpel_filters+r11*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+%else
+ movd r0, m15
+ movd r4, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd rX, m15
+ movd r5, m7
+ mova m5, [base+bdct_lb_q+ 0]
+ mova m6, [base+bdct_lb_q+16]
+ movd m1, [base+subpel_filters+r0*8+2]
+ movd m2, [base+subpel_filters+rX*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r5*8+2]
+ movifprep r3, r3m
+ SWAP m4, m7
+ %define m10 m5
+ %define m11 m6
+ %define m12 m1
+ %define m13 m1
+%endif
+ psrld m14, 10
+ paddd m14, m14
+ punpckldq m13, m2
+ punpckldq m15, m4
+ punpcklqdq m13, m15
+ pxor m2, m2
+ pcmpeqd m0, m2
+%if ARCH_X86_64
+ pand m9, m0
+%else
+ pand m2, m9, m0
+ %define m9 m2
+ SWAP m7, m4
+%endif
+ pandn m0, m13
+%if ARCH_X86_64
+ SWAP m13, m0
+%else
+ %define m13 m0
+%endif
+ por m13, m9
+ punpckhbw m15, m13, m13
+ punpcklbw m13, m13
+ psraw m15, 8
+ psraw m13, 8
+ pshufb m12, m14, m10
+ pshufb m14, m11
+ mova m10, [base+spel_s_shuf2]
+ movd r4d, m14
+ shr r4d, 24
+%if ARCH_X86_32
+ mova [stk+0x20], m13
+ mova [stk+0x30], m15
+ pxor m2, m2
+%endif
+ pshufb m7, m14, m2
+ psubb m14, m7
+ paddb m12, m10
+ paddb m14, m10
+%if ARCH_X86_64
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu m7, [srcq+ssq*0]
+ movu m9, [srcq+ssq*1]
+ movu m8, [srcq+ssq*2]
+ movu m10, [srcq+ss3q ]
+ movu m1, [srcq+r4 ]
+ movu m3, [srcq+r6 ]
+ movu m2, [srcq+r11 ]
+ movu m4, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m12}, m7, m9, m8, m10
+ REPX {pmaddwd x, m13}, m7, m9, m8, m10
+ REPX {pshufb x, m14}, m1, m2, m3, m4
+ REPX {pmaddwd x, m15}, m1, m2, m3, m4
+ mova m5, [rsp+0x10]
+ movd xm6, [rsp+0x20]
+ phaddd m7, m1
+ phaddd m9, m3
+ phaddd m8, m2
+ phaddd m10, m4
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ movu m4, [srcq+ss3q ]
+ REPX {paddd x, m5}, m7, m9, m8, m10
+ REPX {psrad x, xm6}, m7, m9, m8, m10
+ packssdw m7, m9 ; 0 1
+ packssdw m8, m10 ; 2 3
+ movu m0, [srcq+r4 ]
+ movu m9, [srcq+r6 ]
+ movu m10, [srcq+r11 ]
+ movu m11, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m12}, m1, m2, m3, m4
+ REPX {pmaddwd x, m13}, m1, m2, m3, m4
+ REPX {pshufb x, m14}, m0, m9, m10, m11
+ REPX {pmaddwd x, m15}, m0, m9, m10, m11
+ phaddd m1, m0
+ phaddd m2, m9
+ phaddd m3, m10
+ phaddd m4, m11
+ REPX {paddd x, m5}, m1, m2, m3, m4
+ REPX {psrad x, xm6}, m1, m2, m3, m4
+ packssdw m1, m2 ; 4 5
+ packssdw m3, m4 ; 6 7
+ SWAP m9, m1
+ shufps m4, m7, m8, q1032 ; 1 2
+ shufps m5, m8, m9, q1032 ; 3 4
+ shufps m6, m9, m3, q1032 ; 5 6
+ pshufd m10, m3, q1032 ; 7 _
+ punpcklwd m0, m7, m4 ; 01
+ punpckhwd m7, m4 ; 12
+ punpcklwd m1, m8, m5 ; 23
+ punpckhwd m8, m5 ; 34
+ punpcklwd m2, m9, m6 ; 45
+ punpckhwd m9, m6 ; 56
+ punpcklwd m3, m10 ; 67
+ mova [rsp+0x40], m7
+ mova [rsp+0x50], m8
+ mova [rsp+0x60], m9
+%else
+ mova [stk+0x00], m12
+ mova [stk+0x10], m14
+ add r4, srcq
+ MC_4TAP_SCALED_H 0x40 ; 0 1
+ MC_4TAP_SCALED_H 0x50 ; 2 3
+ MC_4TAP_SCALED_H 0x60 ; 4 5
+ MC_4TAP_SCALED_H 0x70 ; 6 7
+ mova m4, [stk+0x40]
+ mova m5, [stk+0x50]
+ mova m6, [stk+0x60]
+ mova m7, [stk+0x70]
+ mov [stk+0xc0], r4
+ shufps m1, m4, m5, q1032 ; 1 2
+ shufps m2, m5, m6, q1032 ; 3 4
+ shufps m3, m6, m7, q1032 ; 5 6
+ pshufd m0, m7, q1032 ; 7 _
+ mova [stk+0xb0], m0
+ punpcklwd m0, m4, m1 ; 01
+ punpckhwd m4, m1 ; 12
+ punpcklwd m1, m5, m2 ; 23
+ punpckhwd m5, m2 ; 34
+ punpcklwd m2, m6, m3 ; 45
+ punpckhwd m6, m3 ; 56
+ punpcklwd m3, m7, [stk+0xb0] ; 67
+ mov myd, mym
+ mov r0, r0m
+ mova [stk+0x40], m0 ; 01
+ mova [stk+0x50], m1 ; 23
+ mova [stk+0x60], m2 ; 45
+ mova [stk+0x70], m3 ; 67
+ mova [stk+0x80], m4 ; 12
+ mova [stk+0x90], m5 ; 34
+ mova [stk+0xa0], m6 ; 56
+ %define m12 [stk+0x00]
+ %define m14 [stk+0x10]
+ %define m13 [stk+0x20]
+ %define m15 [stk+0x30]
+ %define hrnd_mem [esp+0x00]
+ %define hsh_mem [esp+0x10]
+ %if isput
+ %define vrnd_mem [esp+0x20]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+%endif
+.w4_loop:
+ and myd, 0x3ff
+%if ARCH_X86_64
+ mov r11d, 64 << 24
+ mov r13d, myd
+ shr r13d, 6
+ lea r13d, [t1+r13]
+ cmovnz r11q, [base+subpel_filters+r13*8]
+ movq m9, r11q
+ punpcklbw m9, m9
+ psraw m9, 8
+ pshufd m7, m9, q0000
+ pshufd m8, m9, q1111
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pshufd m7, m9, q2222
+ pshufd m9, m9, q3333
+ pmaddwd m6, m2, m7
+ pmaddwd m8, m3, m9
+ %if isput
+ movd m9, [rsp+0x28]
+ %define vrnd_mem [rsp+0x30]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+ paddd m4, m5
+ paddd m6, m8
+ paddd m4, m6
+ paddd m4, vrnd_mem
+%else
+ mov mym, myd
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr r4, 6
+ lea r5, [r5+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ pmaddwd m2, m6
+ pmaddwd m3, m7
+ %if isput
+ movd m4, [esp+0x18]
+ %endif
+ paddd m0, m1
+ paddd m2, m3
+ paddd m0, vrnd_mem
+ paddd m0, m2
+ SWAP m4, m0
+ %define m9 m0
+%endif
+%if isput
+ pxor m5, m5
+ psrad m4, m9
+ packssdw m4, m4
+ pmaxsw m4, m5
+ pminsw m4, pxmaxm
+ movq [dstq], m4
+ add dstq, dsmp
+%else
+ psrad m4, 6
+ packssdw m4, m4
+ movq [tmpq], m4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+%if ARCH_X86_64
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+ mova m8, [rsp+0x10]
+ movd m9, [rsp+0x20]
+ movu m4, [srcq]
+ movu m5, [srcq+r4]
+ test myd, 0x400
+ jz .w4_skip_line
+ mova m0, [rsp+0x40]
+ mova [rsp+0x40], m1
+ mova m1, [rsp+0x50]
+ mova [rsp+0x50], m2
+ mova m2, [rsp+0x60]
+ mova [rsp+0x60], m3
+ pshufb m4, m12
+ pshufb m5, m14
+ pmaddwd m4, m13
+ pmaddwd m5, m15
+ phaddd m4, m5
+ paddd m4, m8
+ psrad m4, m9
+ packssdw m4, m4
+ punpcklwd m3, m10, m4
+ mova m10, m4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu m6, [srcq+ssq*1]
+ movu m7, [srcq+r6]
+ mova m0, [rsp+0x50]
+ mova m11, [rsp+0x60]
+ pshufb m4, m12
+ pshufb m6, m12
+ pshufb m5, m14
+ pshufb m7, m14
+ pmaddwd m4, m13
+ pmaddwd m6, m13
+ pmaddwd m5, m15
+ pmaddwd m7, m15
+ mova [rsp+0x40], m0
+ mova [rsp+0x50], m11
+ phaddd m4, m5
+ phaddd m6, m7
+ paddd m4, m8
+ paddd m6, m8
+ psrad m4, m9
+ psrad m6, m9
+ packssdw m4, m6
+ punpcklwd m9, m10, m4
+ mova [rsp+0x60], m9
+ pshufd m10, m4, q1032
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ punpcklwd m3, m4, m10
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+%else
+ SWAP m0, m4
+ mov myd, mym
+ mov r3, r3m
+ add myd, dym
+ test myd, ~0x3ff
+ jnz .w4_next_line
+ mova m0, [stk+0x40]
+ mova m1, [stk+0x50]
+ mova m2, [stk+0x60]
+ mova m3, [stk+0x70]
+ jmp .w4_loop
+.w4_next_line:
+ mov r5, [stk+0xc0]
+ movu m4, [srcq]
+ movu m5, [r5]
+ test myd, 0x400
+ jz .w4_skip_line
+ add [stk+0xc0], ssq
+ mova m0, [stk+0x80]
+ mova m3, [stk+0x50]
+ mova [stk+0x40], m0
+ mova [stk+0x80], m3
+ mova m1, [stk+0x90]
+ mova m6, [stk+0x60]
+ mova [stk+0x50], m1
+ mova [stk+0x90], m6
+ mova m2, [stk+0xa0]
+ mova m7, [stk+0x70]
+ mova [stk+0x60], m2
+ mova [stk+0xa0], m7
+ pshufb m4, m12
+ pshufb m5, m14
+ pmaddwd m4, m13
+ pmaddwd m5, m15
+ phaddd m4, m5
+ paddd m4, hrnd_mem
+ psrad m4, hsh_mem
+ packssdw m4, m4
+ punpcklwd m3, [stk+0xb0], m4
+ mova [stk+0xb0], m4
+ mova [stk+0x70], m3
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu m6, [srcq+ssq*1]
+ movu m7, [r5 +ssq*1]
+ lea r5, [r5 +ssq*2]
+ mov [stk+0xc0], r5
+ mova m0, [stk+0x50]
+ mova m1, [stk+0x60]
+ mova m2, [stk+0x70]
+ mova m3, [stk+0x90]
+ pshufb m4, m12
+ pshufb m6, m12
+ pshufb m5, m14
+ pshufb m7, m14
+ pmaddwd m4, m13
+ pmaddwd m6, m13
+ pmaddwd m5, m15
+ pmaddwd m7, m15
+ mova [stk+0x40], m0
+ mova [stk+0x50], m1
+ mova [stk+0x60], m2
+ mova [stk+0x80], m3
+ phaddd m4, m5
+ phaddd m6, m7
+ mova m5, [stk+0xa0]
+ mova m7, [stk+0xb0]
+ paddd m4, hrnd_mem
+ paddd m6, hrnd_mem
+ psrad m4, hsh_mem
+ psrad m6, hsh_mem
+ packssdw m4, m6
+ punpcklwd m7, m4
+ pshufd m6, m4, q1032
+ mova [stk+0x90], m5
+ mova [stk+0xa0], m7
+ mova [stk+0xb0], m6
+ punpcklwd m3, m4, m6
+ mova [stk+0x70], m3
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+%endif
+INIT_XMM ssse3
+%if ARCH_X86_64
+ %define stk rsp+0x20
+%endif
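+; w >= 8 is processed in 8-column blocks: [stk+0xf0] holds the block count,
+; tmp_stridem the tmp row stride in bytes (prep only)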
+.w8:
+ mov dword [stk+0xf0], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
+.w16:
+ mov dword [stk+0xf0], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [stk+0xf0], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [stk+0xf0], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [stk+0xf0], 16
+ movifprep tmp_stridem, 256
+.w_start:
+%if ARCH_X86_64
+ %ifidn %1, put
+ movifnidn dsm, dsq
+ %endif
+ mova [rsp+0x10], m11
+ %define hround m11
+ shr t0d, 16
+ movd m15, t0d
+ %if isprep
+ mova m13, [base+pd_m524256]
+ %endif
+%else
+ %define hround [esp+0x00]
+ %define m12 [esp+0x10]
+ %define m10 [base+pd_0x3ff]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq ssm
+ %endif
+ mov r4, [esp+0x1f0]
+ shr r4, 16
+ movd m15, r4
+ mov r0, r0m
+ mov myd, mym
+%endif
+ sub srcq, 6
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ mova [stk+0x100], m7
+ mova [stk+0x120], m15
+ mov [stk+0x0f8], srcq
+ mov [stk+0x130], r0q ; dstq / tmpq
+%if ARCH_X86_64 && UNIX64
+ mov hm, hd
+%elif ARCH_X86_32
+ mov r5, hm
+ mov [stk+0x0f4], myd
+ mov [stk+0x134], r5
+%endif
+ jmp .hloop
+.hloop_prep:
+ dec dword [stk+0x0f0]
+ jz .ret
+%if ARCH_X86_64
+ add qword [stk+0x130], 16
+ mov hd, hm
+%else
+ add dword [stk+0x130], 16
+ mov myd, [stk+0x0f4]
+ mov r5, [stk+0x134]
+ mov r0, [stk+0x130]
+%endif
+ mova m7, [stk+0x100]
+ mova m14, [stk+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+ mova m11, [rsp+0x10]
+%endif
+ mova m15, [stk+0x120]
+ mov srcq, [stk+0x0f8]
+%if ARCH_X86_64
+ mov r0q, [stk+0x130] ; dstq / tmpq
+%else
+ mov mym, myd
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.hloop:
+%if ARCH_X86_64
+ mova m9, [base+pq_0x40000000]
+%else
+ %define m9 [base+pq_0x40000000]
+%endif
+ pxor m1, m1
+ psrld m2, m14, 10
+ mova [stk], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m1
+ pshufd m2, m5, q1032
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pxor m2, m2
+ pcmpeqd m5, m2
+ mova [stk+0x110], m14
+ pshufd m4, m15, q1032
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ movq r11, m14
+ punpckhqdq m14, m14
+ movq rX, m14
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mov r4d, [stk+ 0]
+ mov r6d, [stk+ 4]
+ mov r7d, [stk+ 8]
+ mov r9d, [stk+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m14, m5, q1100
+ pshufd m5, m5, q3322
+ pand m7, m9, m4
+ pand m8, m9, m6
+ pand m15, m9, m14
+ pand m9, m9, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m14, m2
+ pandn m5, m3
+ por m7, m4
+ por m8, m6
+ por m15, m14
+ por m9, m5
+ punpcklbw m0, m7, m7
+ punpckhbw m7, m7
+ punpcklbw m1, m8, m8
+ punpckhbw m8, m8
+ psraw m0, 8
+ psraw m7, 8
+ psraw m1, 8
+ psraw m8, 8
+ punpcklbw m2, m15, m15
+ punpckhbw m15, m15
+ punpcklbw m3, m9, m9
+ punpckhbw m9, m9
+ psraw m2, 8
+ psraw m15, 8
+ psraw m3, 8
+ psraw m9, 8
+ mova [stk+0x10], m0
+ mova [stk+0x20], m7
+ mova [stk+0x30], m1
+ mova [stk+0x40], m8
+ mova [stk+0x50], m2
+ mova [stk+0x60], m15
+ mova [stk+0x70], m3
+ mova [stk+0x80], m9
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
+ mova [stk+0x90], m1
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
+ mova [stk+0xa0], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
+ mova [stk+0xb0], m3
+ MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
+ mova [stk+0xc0], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
+ mova [stk+0xd0], m5
+ MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
+ MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
+ MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
+ mova m5, [stk+0xd0]
+ mova m1, [stk+0x90]
+ mova m2, [stk+0xa0]
+ mova m3, [stk+0xb0]
+ mova m9, [stk+0xc0]
+ mov myd, mym
+ mov dyd, dym
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m9 ; 23a
+ punpckhwd m3, m9 ; 23b
+ mova [stk+0x90], m4
+ mova [stk+0xa0], m5
+ mova [stk+0xb0], m6
+ mova [stk+0xc0], m7
+ %define hround [rsp+0x10]
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m11, r6q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pshufd m5, m11, q0000
+ pshufd m7, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m4, m5, m0
+ pmaddwd m5, m5, m1
+ pmaddwd m6, m7, m2
+ pmaddwd m7, m7, m3
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [stk+0x90], m10
+ pmaddwd m7, [stk+0xa0], m10
+ pmaddwd m8, [stk+0xb0], m11
+ pmaddwd m9, [stk+0xc0], m11
+ paddd m4, m6
+ paddd m5, m7
+ %if isput
+ pshufd m6, m12, q1032
+ %endif
+ paddd m4, m8
+ paddd m5, m9
+%else
+ movd r0, m15
+ movd rX, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r4, m15
+ movd r5, m4
+ mova m14, [stk+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [stk+16], m14
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m9, m4
+ pand m1, m9, m6
+ pand m2, m9, m7
+ pand m3, m9, m5
+ pandn m4, [stk+0x20]
+ pandn m6, [stk+0x30]
+ pandn m7, [stk+0x40]
+ pandn m5, [stk+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ punpcklbw m4, m0, m0
+ punpckhbw m0, m0
+ punpcklbw m5, m1, m1
+ punpckhbw m1, m1
+ psraw m4, 8
+ psraw m0, 8
+ psraw m5, 8
+ psraw m1, 8
+ punpcklbw m6, m2, m2
+ punpckhbw m2, m2
+ punpcklbw m7, m3, m3
+ punpckhbw m3, m3
+ psraw m6, 8
+ psraw m2, 8
+ psraw m7, 8
+ psraw m3, 8
+ mova [stk+0x0a0], m4
+ mova [stk+0x0b0], m0
+ mova [stk+0x0c0], m5
+ mova [stk+0x0d0], m1
+ mova [stk+0x140], m6
+ mova [stk+0x150], m2
+ mova [stk+0x160], m7
+ mova [stk+0x170], m3
+ MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
+ MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
+ MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
+ MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
+ MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
+ MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
+ MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
+ MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
+ mova m5, [stk+0x60]
+ mova m6, [stk+0x70]
+ mova m7, [stk+0x80]
+ mova m0, [stk+0x90]
+ mov myd, mym
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova m1, [stk+0x20]
+ mova m2, [stk+0x30]
+ mova m3, [stk+0x40]
+ mova m4, [stk+0x50]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+.vloop:
+ mov r0, r0m
+ mov r5, [esp+0x1f4]
+ and myd, 0x3ff
+ mov mym, myd
+ xor r3, r3
+ shr r4, 6
+ lea r5, [r5+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ paddd m0, m2
+ paddd m1, m3
+ pmaddwd m2, [stk+0x60], m6
+ pmaddwd m3, [stk+0x70], m6
+ pmaddwd m4, [stk+0x80], m7
+ pmaddwd m5, [stk+0x90], m7
+ %if isput
+ movd m6, [esp+0x18]
+ %endif
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, vrnd_mem
+ paddd m1, vrnd_mem
+ paddd m4, m0
+ paddd m5, m1
+%endif
+%ifidn %1, put
+ psrad m4, m6
+ psrad m5, m6
+ packssdw m4, m5
+ pxor m7, m7
+ pmaxsw m4, m7
+ pminsw m4, pxmaxm
+ mova [dstq], m4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ psrad m5, 6
+ packssdw m4, m5
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+%if ARCH_X86_64
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [stk+0x140], myd
+ mov r4d, [stk+ 0]
+ mov r6d, [stk+ 4]
+ mov r7d, [stk+ 8]
+ mov r9d, [stk+12]
+ jz .skip_line
+ mova m14, [base+unpckw]
+ movu m8, [srcq+r10*2]
+ movu m9, [srcq+r11*2]
+ movu m10, [srcq+r13*2]
+ movu m11, [srcq+ rX*2]
+ movu m4, [srcq+ r4*2]
+ movu m5, [srcq+ r6*2]
+ movu m6, [srcq+ r7*2]
+ movu m7, [srcq+ r9*2]
+ add srcq, ssq
+ mov myd, [stk+0x140]
+ mov dyd, dym
+ pshufd m15, m14, q1032
+ pshufb m0, m14 ; 0a 1a
+ pshufb m1, m14 ; 0b 1b
+ pshufb m2, m15 ; 3a 2a
+ pshufb m3, m15 ; 3b 2b
+ pmaddwd m8, [stk+0x50]
+ pmaddwd m9, [stk+0x60]
+ pmaddwd m10, [stk+0x70]
+ pmaddwd m11, [stk+0x80]
+ pmaddwd m4, [stk+0x10]
+ pmaddwd m5, [stk+0x20]
+ pmaddwd m6, [stk+0x30]
+ pmaddwd m7, [stk+0x40]
+ phaddd m8, m9
+ phaddd m10, m11
+ mova m11, hround
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m8, m10
+ phaddd m4, m6
+ paddd m4, m11
+ paddd m8, m11
+ psrad m4, m12
+ psrad m8, m12
+ packssdw m4, m8
+ pshufb m5, [stk+0x90], m14 ; 4a 5a
+ pshufb m6, [stk+0xa0], m14 ; 4b 5b
+ pshufb m7, [stk+0xb0], m15 ; 7a 6a
+ pshufb m8, [stk+0xc0], m15 ; 7b 6b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m5 ; 34a
+ punpcklwd m3, m6 ; 34b
+ punpckhwd m5, m7 ; 56a
+ punpckhwd m6, m8 ; 56b
+ punpcklwd m7, m4 ; 78a
+ punpckhqdq m4, m4
+ punpcklwd m8, m4 ; 78b
+ mova [stk+0x90], m5
+ mova [stk+0xa0], m6
+ mova [stk+0xb0], m7
+ mova [stk+0xc0], m8
+ jmp .vloop
+.skip_line:
+ MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11
+ MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11
+ mov myd, [stk+0x140]
+ mov dyd, dym
+ mova m0, m2 ; 01a
+ mova m1, m3 ; 01b
+ mova m2, [stk+0x90] ; 23a
+ mova m3, [stk+0xa0] ; 23b
+ mova m5, [stk+0xb0] ; 45a
+ mova m6, [stk+0xc0] ; 45b
+ punpcklwd m7, m4, m8 ; 67a
+ punpckhwd m4, m8 ; 67b
+ mova [stk+0x90], m5
+ mova [stk+0xa0], m6
+ mova [stk+0xb0], m7
+ mova [stk+0xc0], m4
+%else
+ mov r0m, r0
+ mov myd, mym
+ mov r3, r3m
+ add myd, dym
+ test myd, ~0x3ff
+ mov mym, myd
+ jnz .next_line
+ mova m0, [stk+0x20]
+ mova m1, [stk+0x30]
+ mova m2, [stk+0x40]
+ mova m3, [stk+0x50]
+ jmp .vloop
+.next_line:
+ test myd, 0x400
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ jz .skip_line
+ MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
+ mova m7, [base+unpckw]
+ pshufd m4, m7, q1032
+ pshufb m0, [stk+0x20], m7 ; 0a 1a
+ pshufb m1, [stk+0x30], m7 ; 0b 1b
+ pshufb m2, [stk+0x40], m4 ; 3a 2a
+ pshufb m3, [stk+0x50], m4 ; 3b 2b
+ pshufb m5, [stk+0x60], m7 ; 4a 5a
+ pshufb m6, [stk+0x70], m7 ; 4b 5b
+ pshufb m7, [stk+0x80], m4 ; 7a 6a
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m5 ; 34a
+ punpcklwd m3, m6 ; 34b
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ punpckhwd m5, m7 ; 56a
+ mova [stk+0x60], m5
+ pshufb m5, [stk+0x90], m4 ; 7b 6b
+ punpcklwd m7, [stk+0xe0] ; 78a
+ punpckhwd m6, m5 ; 56b
+ mova [stk+0x70], m6
+ movq m6, [stk+0xe8]
+ mova [stk+0x80], m7
+ punpcklwd m5, m6
+ mov myd, mym
+ mova [stk+0x90], m5
+ jmp .vloop
+.skip_line:
+ MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
+ MC_8TAP_SCALED_H 0xa0, 0 ; 9
+ mova m7, [stk+0xe0]
+ mova m2, [stk+0x60] ; 23a
+ mova m3, [stk+0x70] ; 23b
+ mova m4, [stk+0x80] ; 45a
+ mova m5, [stk+0x90] ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova m0, [stk+0x40] ; 01a
+ mova m1, [stk+0x50] ; 01b
+ mov myd, mym
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+%endif
+ jmp .vloop
+INIT_XMM ssse3
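+; dy1: specialized path for dy == 1024, i.e. a vertical step of exactly one
+; source row per output row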
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.dy1_w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+ %else
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+ %define m11 [esp+0x00]
+ %define m12 [esp+0x10]
+ %define m13 [esp+0x20]
+ movzx r5, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r5
+ mov r1, r1m
+ %endif
+ pxor m9, m9
+ punpckldq m9, m8
+ paddd m14, m9 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ pshufd m15, m15, q0321
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_q]
+ mova m6, [base+spel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m2, m2
+ pcmpeqd m8, m2
+ psrld m14, 10
+ paddd m14, m14
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [stk], m14
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m15 m6
+ %endif
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpckldq m15, m7
+ %if ARCH_X86_64
+ pshufb m14, m5
+ paddb m14, m6
+ pand m9, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m9
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ add srcq, ss3q
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ %else
+ pand m7, m5, [base+pd_0x4000]
+ pandn m5, m15
+ por m5, m7
+ %define m15 m5
+ mov myd, mym
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr myd, 6
+ lea r5, [r5+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ mov [stk+0x20], r3
+ mov r3, r3m
+ %endif
+ punpcklbw m15, m15
+ psraw m15, 8
+ REPX {pshufb x, m14}, m0, m1, m2, m3
+ REPX {pmaddwd x, m15}, m0, m1, m2, m3
+ %if ARCH_X86_64
+ REPX {pshufb x, m14}, m4, m5, m6
+ REPX {pmaddwd x, m15}, m4, m5, m6
+ phaddd m0, m1
+ phaddd m2, m3
+ phaddd m4, m5
+ phaddd m6, m6
+ REPX {paddd x, m11}, m0, m2, m4, m6
+ REPX {psrad x, m12}, m0, m2, m4, m6
+ packssdw m0, m2 ; 0 1 2 3
+ packssdw m4, m6 ; 4 5 6
+ SWAP m1, m4
+ movq m10, r4
+ %else
+ mova [stk+0x10], m15
+ phaddd m0, m1
+ phaddd m2, m3
+ movu m1, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ add srcq, ss3q
+ REPX {pshufb x, m14}, m1, m7, m6
+ REPX {pmaddwd x, m15}, m1, m7, m6
+ %define m14 [stk+0x00]
+ %define m15 [stk+0x10]
+ phaddd m1, m7
+ phaddd m6, m6
+ REPX {paddd x, m11}, m0, m2, m1, m6
+ REPX {psrad x, m12}, m0, m2, m1, m6
+ packssdw m0, m2
+ packssdw m1, m6
+ %define m8 m6
+ %define m9 m4
+ %define m10 m5
+ movd m10, r4
+ movd m9, [stk+0x20]
+ punpckldq m10, m9
+ %endif
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ %if ARCH_X86_32
+ mova [stk+0x50], m7
+ mova [stk+0x60], m8
+ mova [stk+0x70], m9
+ mova [stk+0x80], m10
+ %define m7 [stk+0x50]
+ %define m8 [stk+0x60]
+ %define m9 [stk+0x70]
+ %define m10 [stk+0x80]
+ %endif
+ palignr m2, m1, m0, 4 ; 1 2 3 4
+ punpcklwd m3, m0, m2 ; 01 12
+ punpckhwd m0, m2 ; 23 34
+ pshufd m4, m1, q2121 ; 5 6 5 6
+ punpcklwd m2, m1, m4 ; 45 56
+ %if ARCH_X86_32
+ mov r0, r0m
+ %endif
+.dy1_w2_loop:
+ movu m1, [srcq+ssq*0]
+ movu m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m3, m7
+ mova m3, m0
+ pmaddwd m0, m8
+ pshufb m1, m14
+ pshufb m6, m14
+ pmaddwd m1, m15
+ pmaddwd m6, m15
+ phaddd m1, m6
+ paddd m1, m11
+ psrad m1, m12
+ packssdw m1, m1
+ paddd m5, m0
+ mova m0, m2
+ pmaddwd m2, m9
+ paddd m5, m2
+ palignr m2, m1, m4, 12
+ punpcklwd m2, m1 ; 67 78
+ pmaddwd m4, m2, m10
+ paddd m5, m13
+ paddd m5, m4
+ pxor m6, m6
+ mova m4, m1
+ pshufd m1, m12, q1032
+ psrad m5, m1
+ packssdw m5, m5
+ pmaxsw m5, m6
+ pminsw m5, pxmaxm
+ movd [dstq+dsq*0], m5
+ pshuflw m5, m5, q1032
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+%endif
+INIT_XMM ssse3
+.dy1_w4:
+%if ARCH_X86_64
+ mov myd, mym
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %if isput
+ mova [rsp+0x30], m13
+ %define vrnd_mem [rsp+0x30]
+ %define stk rsp+0x40
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %define stk rsp+0x30
+ %endif
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m9 [base+pd_0x4000]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq r3
+ %endif
+ movzx r5, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r5
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd m15, m0
+ pshufd m7, m15, q1032
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd r6d, m15
+ movd r13d, m7
+ mova m10, [base+bdct_lb_q+ 0]
+ mova m11, [base+bdct_lb_q+16]
+ movd m13, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+ r6*8+2]
+ movd m15, [base+subpel_filters+r11*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+%else
+ movd r0, m15
+ movd r4, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd rX, m15
+ movd r5, m7
+ mova m5, [base+bdct_lb_q+ 0]
+ mova m6, [base+bdct_lb_q+16]
+ movd m1, [base+subpel_filters+r0*8+2]
+ movd m2, [base+subpel_filters+rX*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r5*8+2]
+ SWAP m4, m7
+ %if isprep
+ mov r3, r3m
+ %endif
+ %define m10 m5
+ %define m11 m6
+ %define m12 m1
+ %define m13 m1
+%endif
+ psrld m14, 10
+ paddd m14, m14
+ punpckldq m13, m2
+ punpckldq m15, m4
+ punpcklqdq m13, m15
+ pxor m2, m2
+ pcmpeqd m0, m2
+%if ARCH_X86_64
+ pand m9, m0
+%else
+ pand m2, m9, m0
+ %define m9 m2
+ SWAP m7, m4
+%endif
+ pandn m0, m13
+%if ARCH_X86_64
+ SWAP m13, m0
+%else
+ %define m13 m0
+%endif
+ por m13, m9
+ punpckhbw m15, m13, m13
+ punpcklbw m13, m13
+ psraw m15, 8
+ psraw m13, 8
+ pshufb m12, m14, m10
+ pshufb m14, m11
+ mova m10, [base+spel_s_shuf2]
+ movd r4d, m14
+ shr r4d, 24
+%if ARCH_X86_32
+ mova [stk+0x40], m13
+ mova [stk+0x50], m15
+ pxor m2, m2
+%endif
+ pshufb m7, m14, m2
+ psubb m14, m7
+ paddb m12, m10
+ paddb m14, m10
+%if ARCH_X86_64
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu m7, [srcq+ssq*0]
+ movu m9, [srcq+ssq*1]
+ movu m8, [srcq+ssq*2]
+ movu m10, [srcq+ss3q ]
+ movu m1, [srcq+r4 ]
+ movu m3, [srcq+r6 ]
+ movu m2, [srcq+r11 ]
+ movu m4, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m12}, m7, m9, m8, m10
+ REPX {pmaddwd x, m13}, m7, m9, m8, m10
+ REPX {pshufb x, m14}, m1, m3, m2, m4
+ REPX {pmaddwd x, m15}, m1, m3, m2, m4
+ mova m5, [rsp+0x10]
+ movd xm6, [rsp+0x20]
+ phaddd m7, m1
+ phaddd m9, m3
+ phaddd m8, m2
+ phaddd m10, m4
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ REPX {paddd x, m5}, m7, m9, m8, m10
+ REPX {psrad x, xm6}, m7, m9, m8, m10
+ packssdw m7, m9 ; 0 1
+ packssdw m8, m10 ; 2 3
+ movu m0, [srcq+r4 ]
+ movu m9, [srcq+r6 ]
+ movu m10, [srcq+r11 ]
+ add srcq, ss3q
+ REPX {pshufb x, m12}, m1, m2, m3
+ REPX {pmaddwd x, m13}, m1, m2, m3
+ REPX {pshufb x, m14}, m0, m9, m10
+ REPX {pmaddwd x, m15}, m0, m9, m10
+ phaddd m1, m0
+ phaddd m2, m9
+ phaddd m3, m10
+ shr myd, 6
+ mov r13d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r13q, [base+subpel_filters+myq*8]
+ REPX {paddd x, m5}, m1, m2, m3
+ REPX {psrad x, xm6}, m1, m2, m3
+ packssdw m1, m2 ; 4 5
+ packssdw m3, m3 ; 6 6
+ SWAP m9, m1
+ shufps m4, m7, m8, q1032 ; 1 2
+ shufps m5, m8, m9, q1032 ; 3 4
+ shufps m6, m9, m3, q1032 ; 5 6
+ punpcklwd m0, m7, m4 ; 01
+ punpckhwd m7, m4 ; 12
+ punpcklwd m1, m8, m5 ; 23
+ punpckhwd m8, m5 ; 34
+ punpcklwd m2, m9, m6 ; 45
+ punpckhwd m9, m6 ; 56
+ movq m10, r13
+ mova [stk+0x00], m1
+ mova [stk+0x10], m8
+ mova [stk+0x20], m2
+ mova [stk+0x30], m9
+ mova [stk+0x40], m3
+ %define hrnd_mem [rsp+0x10]
+ %define hsh_mem [rsp+0x20]
+ %define vsh_mem [rsp+0x28]
+ %if isput
+ %define vrnd_mem [rsp+0x30]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+%else
+ mova [stk+0x20], m12
+ mova [stk+0x30], m14
+ add r4, srcq
+ MC_4TAP_SCALED_H 0x60 ; 0 1
+ MC_4TAP_SCALED_H 0x70 ; 2 3
+ MC_4TAP_SCALED_H 0x80 ; 4 5
+ movu m7, [srcq]
+ movu m2, [r4]
+ add srcq, ssq
+ add r4, ssq
+ mov [stk+0xb0], r4
+ pshufb m7, m12
+ pshufb m2, m14
+ pmaddwd m7, m13
+ pmaddwd m2, m15
+ phaddd m7, m2
+ paddd m7, [esp+0x00]
+ psrad m7, [esp+0x10]
+ packssdw m7, m7 ; 6 6
+ mova m4, [stk+0x60]
+ mova m5, [stk+0x70]
+ mova m6, [stk+0x80]
+ mov myd, mym
+ mov rX, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea rX, [rX+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+rX*8+0]
+ cmovnz r5, [base+subpel_filters+rX*8+4]
+ mov r3, r3m
+ shufps m1, m4, m5, q1032 ; 1 2
+ shufps m2, m5, m6, q1032 ; 3 4
+ shufps m3, m6, m7, q1032 ; 5 6
+ mova [stk+0xa0], m7
+ punpcklwd m0, m4, m1 ; 01
+ punpckhwd m4, m1 ; 12
+ punpcklwd m1, m5, m2 ; 23
+ punpckhwd m5, m2 ; 34
+ punpcklwd m2, m6, m3 ; 45
+ punpckhwd m6, m3 ; 56
+ movd m7, r4
+ movd m3, r5
+ mov r0, r0m
+ %if isput
+ mov r1, r1m
+ %endif
+ mov r4, [stk+0xb0]
+ mova [stk+0xc0], m4 ; 12
+ mova [stk+0x60], m1 ; 23
+ mova [stk+0x70], m2 ; 45
+ mova [stk+0x80], m5 ; 34
+ mova [stk+0x90], m6 ; 56
+ %define m12 [stk+0x20]
+ %define m14 [stk+0x30]
+ %define m13 [stk+0x40]
+ %define m15 [stk+0x50]
+ %define hrnd_mem [esp+0x00]
+ %define hsh_mem [esp+0x10]
+ %define vsh_mem [esp+0x18]
+ %if isput
+ %define vrnd_mem [esp+0x20]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+ %define m10 m7
+ punpckldq m10, m3
+%endif
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m3, m10, q0000
+ pshufd m4, m10, q1111
+ pshufd m5, m10, q2222
+ pshufd m10, m10, q3333
+%if ARCH_X86_32
+ %xdefine m8 m3
+ %xdefine m9 m6
+ %xdefine m11 m5
+ %xdefine m6 m4
+ mova [stk+0x100], m3
+ mova [stk+0x110], m4
+ mova [stk+0x120], m5
+ mova [stk+0x130], m10
+ %define m3 [stk+0x100]
+ %define m4 [stk+0x110]
+ %define m5 [stk+0x120]
+ %define m10 [stk+0x130]
+ mova m7, [stk+0xc0]
+ mova m8, [stk+0x80]
+%endif
+.dy1_w4_loop:
+ movu m11, [srcq+ssq*0]
+ movu m6, [srcq+ssq*1]
+ pmaddwd m0, m3
+ pmaddwd m7, m3
+ pmaddwd m1, m4
+ pmaddwd m8, m4
+ pmaddwd m2, m5
+ pmaddwd m9, m5
+ paddd m1, m0
+ paddd m8, m7
+%if ARCH_X86_64
+ movu m0, [srcq+r4]
+ movu m7, [srcq+r6]
+%else
+ movu m0, [r4+ssq*0]
+ movu m7, [r4+ssq*1]
+ lea r4, [r4+ssq*2]
+%endif
+ lea srcq, [srcq+ssq*2]
+ paddd m1, m2
+ paddd m8, m9
+ pshufb m11, m12
+ pshufb m6, m12
+ pmaddwd m11, m13
+ pmaddwd m6, m13
+ pshufb m0, m14
+ pshufb m7, m14
+ pmaddwd m0, m15
+ pmaddwd m7, m15
+ phaddd m11, m0
+ phaddd m6, m7
+ paddd m11, hrnd_mem
+ paddd m6, hrnd_mem
+ psrad m11, hsh_mem
+ psrad m6, hsh_mem
+ packssdw m11, m6 ; 7 8
+%if ARCH_X86_64
+ shufps m9, [stk+0x40], m11, q1032 ; 6 7
+ mova m0, [stk+0x00]
+ mova [stk+0x40], m11
+%else
+ shufps m9, [stk+0xa0], m11, q1032 ; 6 7
+ mova m0, [stk+0x60]
+ mova [stk+0xa0], m11
+%endif
+ punpcklwd m2, m9, m11 ; 67
+ punpckhwd m9, m11 ; 78
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m9, m10
+%if isput
+ movd m11, vsh_mem
+%endif
+ paddd m1, vrnd_mem
+ paddd m8, vrnd_mem
+ paddd m1, m6
+ paddd m8, m7
+%if ARCH_X86_64
+ mova m7, [stk+0x10]
+%else
+ mova m7, [stk+0x80]
+%endif
+%if isput
+ psrad m1, m11
+ psrad m8, m11
+%else
+ psrad m1, 6
+ psrad m8, 6
+%endif
+ packssdw m1, m8
+%if ARCH_X86_64
+ mova m8, [stk+0x30]
+%else
+ mova m8, [stk+0x90]
+%endif
+%if isput
+ pxor m6, m6
+ pmaxsw m1, m6
+ pminsw m1, pxmaxm
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], m1
+ add tmpq, 16
+%endif
+%if ARCH_X86_64
+ mova m1, [stk+0x20]
+ mova [stk+0x10], m8
+ mova [stk+0x00], m1
+ mova [stk+0x20], m2
+ mova [stk+0x30], m9
+%else
+ mova m1, [stk+0x70]
+ mova [stk+0x80], m8
+ mova [stk+0x60], m1
+ mova [stk+0x70], m2
+ mova [stk+0x90], m9
+%endif
+ sub hd, 2
+ jg .dy1_w4_loop
+ MC_8TAP_SCALED_RET ; why not jz .ret?
+INIT_XMM ssse3
+.dy1_w8:
+ mov dword [stk+0xf0], 1
+ movifprep tmp_stridem, 16
+ jmp .dy1_w_start
+.dy1_w16:
+ mov dword [stk+0xf0], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [stk+0xf0], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [stk+0xf0], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [stk+0xf0], 16
+ movifprep tmp_stridem, 256
+.dy1_w_start:
+ mov myd, mym
+%if ARCH_X86_64
+ %ifidn %1, put
+ movifnidn dsm, dsq
+ %endif
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %define hround m11
+ %if isput
+ mova [rsp+0x30], m13
+ %else
+ mova m13, [base+pd_m524256]
+ %endif
+ shr t0d, 16
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movd m15, t0d
+%else
+ %define hround [esp+0x00]
+ %define m12 [esp+0x10]
+ %define m10 [base+pd_0x3ff]
+ %define m8 m0
+ %xdefine m14 m4
+ %xdefine m15 m3
+ %if isprep
+ %define ssq ssm
+ %endif
+ mov r5, [esp+0x1f0]
+ mov r3, [esp+0x1f4]
+ shr r5, 16
+ movd m15, r5
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r0, r0m
+ mov r3, r3m
+%endif
+ sub srcq, 6
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+%if ARCH_X86_64
+ movq m3, r4q
+%else
+ movd m5, r4
+ movd m6, r5
+ punpckldq m5, m6
+ SWAP m3, m5
+%endif
+ punpcklbw m3, m3
+ psraw m3, 8
+ mova [stk+0x100], m7
+ mova [stk+0x120], m15
+ mov [stk+0x0f8], srcq
+ mov [stk+0x130], r0q ; dstq / tmpq
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+%if ARCH_X86_64
+ mova [stk+0x140], m0
+ mova [stk+0x150], m1
+ mova [stk+0x160], m2
+ mova [stk+0x170], m3
+ %if UNIX64
+ mov hm, hd
+ %endif
+%else
+ mova [stk+0x180], m0
+ mova [stk+0x190], m1
+ mova [stk+0x1a0], m2
+ mova [stk+0x1b0], m3
+ SWAP m5, m3
+ mov r5, hm
+ mov [stk+0x134], r5
+%endif
+ jmp .dy1_hloop
+.dy1_hloop_prep:
+ dec dword [stk+0x0f0]
+ jz .ret
+%if ARCH_X86_64
+ add qword [stk+0x130], 16
+ mov hd, hm
+%else
+ add dword [stk+0x130], 16
+ mov r5, [stk+0x134]
+ mov r0, [stk+0x130]
+%endif
+ mova m7, [stk+0x100]
+ mova m14, [stk+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+ mova m11, [rsp+0x10]
+%endif
+ mova m15, [stk+0x120]
+ mov srcq, [stk+0x0f8]
+%if ARCH_X86_64
+ mov r0q, [stk+0x130] ; dstq / tmpq
+%else
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.dy1_hloop:
+%if ARCH_X86_64
+ mova m9, [base+pq_0x40000000]
+%else
+ %define m9 [base+pq_0x40000000]
+%endif
+ pxor m1, m1
+ psrld m2, m14, 10
+ mova [stk], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m1
+ pshufd m2, m5, q1032
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pxor m2, m2
+ pcmpeqd m5, m2
+ mova [stk+0x110], m14
+ pshufd m4, m15, q1032
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ movq r11, m14
+ punpckhqdq m14, m14
+ movq rX, m14
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mov r4d, [stk+ 0]
+ mov r6d, [stk+ 4]
+ mov r7d, [stk+ 8]
+ mov r9d, [stk+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m14, m5, q1100
+ pshufd m5, m5, q3322
+ pand m7, m9, m4
+ pand m8, m9, m6
+ pand m15, m9, m14
+ pand m9, m9, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m14, m2
+ pandn m5, m3
+ por m7, m4
+ por m8, m6
+ por m15, m14
+ por m9, m5
+ punpcklbw m0, m7, m7
+ punpckhbw m7, m7
+ punpcklbw m1, m8, m8
+ punpckhbw m8, m8
+ psraw m0, 8
+ psraw m7, 8
+ psraw m1, 8
+ psraw m8, 8
+ punpcklbw m2, m15, m15
+ punpckhbw m15, m15
+ punpcklbw m3, m9, m9
+ punpckhbw m9, m9
+ psraw m2, 8
+ psraw m15, 8
+ psraw m3, 8
+ psraw m9, 8
+ mova [stk+0x10], m0
+ mova [stk+0x20], m7
+ mova [stk+0x30], m1
+ mova [stk+0x40], m8
+ mova [stk+0x50], m2
+ mova [stk+0x60], m15
+ mova [stk+0x70], m3
+ mova [stk+0x80], m9
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
+ mova [stk+0x90], m1
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
+ mova [stk+0xa0], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
+ mova [stk+0xb0], m3
+ MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
+ mova [stk+0xc0], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
+ mova [stk+0xd0], m5
+ MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
+ MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
+ MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
+ mova m5, [stk+0xd0]
+ mova m1, [stk+0x90]
+ mova m2, [stk+0xa0]
+ mova m3, [stk+0xb0]
+ mova m9, [stk+0xc0]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m9 ; 23a
+ punpckhwd m3, m9 ; 23b
+ mova m10, [stk+0x140]
+ mova m11, [stk+0x150]
+ mova m14, [stk+0x160]
+ mova m15, [stk+0x170]
+ mova [stk+0x90], m4
+ mova [stk+0xa0], m5
+ mova [stk+0xb0], m6
+ mova [stk+0xc0], m7
+ %define hround [rsp+0x10]
+ %define shift [rsp+0x20]
+ %if isput
+ %define vround [rsp+0x30]
+ %else
+ %define vround [base+pd_m524256]
+ %endif
+.dy1_vloop:
+ pmaddwd m4, m0, m10
+ pmaddwd m5, m1, m10
+ pmaddwd m6, m2, m11
+ pmaddwd m7, m3, m11
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [stk+0x90], m14
+ pmaddwd m7, [stk+0xa0], m14
+ pmaddwd m8, [stk+0xb0], m15
+ pmaddwd m9, [stk+0xc0], m15
+ paddd m4, m6
+ paddd m5, m7
+ %if isput
+ pshufd m6, m12, q1032
+ %endif
+ paddd m4, m8
+ paddd m5, m9
+%else
+ movd r0, m15
+ movd rX, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r4, m15
+ movd r5, m4
+ mova m14, [stk+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [stk+16], m14
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m9, m4
+ pand m1, m9, m6
+ pand m2, m9, m7
+ pand m3, m9, m5
+ pandn m4, [stk+0x20]
+ pandn m6, [stk+0x30]
+ pandn m7, [stk+0x40]
+ pandn m5, [stk+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ punpcklbw m4, m0, m0
+ punpckhbw m0, m0
+ punpcklbw m5, m1, m1
+ punpckhbw m1, m1
+ psraw m4, 8
+ psraw m0, 8
+ psraw m5, 8
+ psraw m1, 8
+ punpcklbw m6, m2, m2
+ punpckhbw m2, m2
+ punpcklbw m7, m3, m3
+ punpckhbw m3, m3
+ psraw m6, 8
+ psraw m2, 8
+ psraw m7, 8
+ psraw m3, 8
+ mova [stk+0x0a0], m4
+ mova [stk+0x0b0], m0
+ mova [stk+0x0c0], m5
+ mova [stk+0x0d0], m1
+ mova [stk+0x140], m6
+ mova [stk+0x150], m2
+ mova [stk+0x160], m7
+ mova [stk+0x170], m3
+ MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
+ MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
+ MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
+ MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
+ MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
+ MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
+ MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
+ MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
+ mova m5, [stk+0x60]
+ mova m6, [stk+0x70]
+ mova m7, [stk+0x80]
+ mova m0, [stk+0x90]
+ mov r0, r0m
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova m1, [stk+0x20]
+ mova m2, [stk+0x30]
+ mova m3, [stk+0x40]
+ mova m4, [stk+0x50]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova m4, [stk+0x180]
+ mova m5, [stk+0x190]
+ mova m6, [stk+0x1a0]
+ mova m7, [stk+0x1b0]
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+.dy1_vloop:
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ paddd m0, m2
+ paddd m1, m3
+ pmaddwd m2, [stk+0x60], m6
+ pmaddwd m3, [stk+0x70], m6
+ pmaddwd m4, [stk+0x80], m7
+ pmaddwd m5, [stk+0x90], m7
+ %if isput
+ movd m6, [esp+0x18]
+ %endif
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, vrnd_mem
+ paddd m1, vrnd_mem
+ paddd m4, m0
+ paddd m5, m1
+%endif
+%ifidn %1, put
+ psrad m4, m6
+ psrad m5, m6
+ packssdw m4, m5
+ pxor m7, m7
+ pmaxsw m4, m7
+ pminsw m4, pxmaxm
+ mova [dstq], m4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ psrad m5, 6
+ packssdw m4, m5
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+%if ARCH_X86_64
+ movu m8, [srcq+r10*2]
+ movu m9, [srcq+r11*2]
+ movu m12, [srcq+r13*2]
+ movu m13, [srcq+ rX*2]
+ movu m4, [srcq+ r4*2]
+ movu m5, [srcq+ r6*2]
+ movu m6, [srcq+ r7*2]
+ movu m7, [srcq+ r9*2]
+ add srcq, ssq
+ pmaddwd m8, [stk+0x50]
+ pmaddwd m9, [stk+0x60]
+ pmaddwd m12, [stk+0x70]
+ pmaddwd m13, [stk+0x80]
+ pmaddwd m4, [stk+0x10]
+ pmaddwd m5, [stk+0x20]
+ pmaddwd m6, [stk+0x30]
+ pmaddwd m7, [stk+0x40]
+ phaddd m8, m9
+ phaddd m12, m13
+ mova m9, [base+unpckw]
+ mova m13, hround
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m8, m12
+ phaddd m4, m6
+ pshufd m5, m9, q1032
+ pshufb m0, m9 ; 0a 1a
+ pshufb m1, m9 ; 0b 1b
+ pshufb m2, m5 ; 3a 2a
+ pshufb m3, m5 ; 3b 2b
+ mova m12, shift
+ paddd m4, m13
+ paddd m8, m13
+ psrad m4, m12
+ psrad m8, m12
+ packssdw m4, m8
+ pshufb m6, [stk+0x90], m9 ; 4a 5a
+ pshufb m7, [stk+0xa0], m9 ; 4b 5b
+ pshufb m8, [stk+0xb0], m5 ; 7a 6a
+ pshufb m13, [stk+0xc0], m5 ; 7b 6b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m6 ; 34a
+ punpcklwd m3, m7 ; 34b
+ punpckhwd m6, m8 ; 56a
+ punpckhwd m7, m13 ; 56b
+ punpcklwd m8, m4 ; 78a
+ punpckhqdq m4, m4
+ punpcklwd m13, m4 ; 78b
+ mova [stk+0x90], m6
+ mova [stk+0xa0], m7
+ mova [stk+0xb0], m8
+ mova [stk+0xc0], m13
+ mova m13, vround
+%else
+ mov r0m, r0
+ mov r3, r3m
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
+ mova m7, [base+unpckw]
+ pshufd m4, m7, q1032
+ pshufb m0, [stk+0x20], m7 ; 0a 1a
+ pshufb m1, [stk+0x30], m7 ; 0b 1b
+ pshufb m2, [stk+0x40], m4 ; 3a 2a
+ pshufb m3, [stk+0x50], m4 ; 3b 2b
+ pshufb m5, [stk+0x60], m7 ; 4a 5a
+ pshufb m6, [stk+0x70], m7 ; 4b 5b
+ pshufb m7, [stk+0x80], m4 ; 7a 6a
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m5 ; 34a
+ punpcklwd m3, m6 ; 34b
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ punpckhwd m5, m7 ; 56a
+ mova [stk+0x60], m5
+ pshufb m5, [stk+0x90], m4 ; 7b 6b
+ punpcklwd m7, [stk+0xe0] ; 78a
+ mova m4, [stk+0x180]
+ punpckhwd m6, m5 ; 56b
+ mova [stk+0x70], m6
+ movq m6, [stk+0xe8]
+ mova [stk+0x80], m7
+ mova m7, [stk+0x1b0]
+ punpcklwd m5, m6
+ mova m6, [stk+0x1a0]
+ mova [stk+0x90], m5
+ mova m5, [stk+0x190]
+ mov r0, r0m
+%endif
+ jmp .dy1_vloop
+INIT_XMM ssse3
+%if ARCH_X86_64
+ %define stk rsp+0x20
+%endif
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.dy2_w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ mova [rsp+0x10], m13
+ %define vrnd_mem [rsp+0x10]
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+ %else
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+ %define m11 [esp+0x00]
+ %define m12 [esp+0x10]
+ %define vrnd_mem [esp+0x20]
+ mov r1, r1m
+ movzx r5, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r5
+ %endif
+ pxor m9, m9
+ punpckldq m9, m8
+ paddd m14, m9 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ pshufd m15, m15, q0321
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_q]
+ mova m6, [base+spel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m2, m2
+ pcmpeqd m8, m2
+ psrld m14, 10
+ paddd m14, m14
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [stk], m14
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m15 m6
+ %endif
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*2]
+ movu m2, [srcq+ssq*4]
+ punpckldq m15, m7
+ %if ARCH_X86_64
+ pshufb m14, m5
+ paddb m14, m6
+ pand m9, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m9
+ movu m4, [srcq+ssq*1]
+ movu m5, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ movu m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ %else
+ pand m7, m5, [base+pd_0x4000]
+ pandn m5, m15
+ por m5, m7
+ %define m15 m5
+ mov myd, mym
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr myd, 6
+ lea r5, [r5+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ mov [stk+0x20], r3
+ mov r3, r3m
+ %endif
+ punpcklbw m15, m15
+ psraw m15, 8
+ REPX {pshufb x, m14}, m0, m1, m2
+ REPX {pmaddwd x, m15}, m0, m1, m2
+ %if ARCH_X86_64
+ REPX {pshufb x, m14}, m4, m5, m6
+ REPX {pmaddwd x, m15}, m4, m5, m6
+ phaddd m0, m1
+ phaddd m1, m2
+ phaddd m4, m5
+ phaddd m5, m6
+ REPX {paddd x, m11}, m0, m1, m4, m5
+ REPX {psrad x, m12}, m0, m1, m4, m5
+ packssdw m0, m1 ; 0 2 2 4
+ packssdw m4, m5 ; 1 3 3 5
+ SWAP m2, m4
+ movq m10, r4
+ %else
+ mova [stk+0x10], m15
+ phaddd m0, m1
+ phaddd m1, m2
+ movu m2, [srcq+ssq*1]
+ movu m7, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ movu m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ REPX {pshufb x, m14}, m2, m7, m6
+ REPX {pmaddwd x, m15}, m2, m7, m6
+ %define m14 [stk+0x00]
+ %define m15 [stk+0x10]
+ phaddd m2, m7
+ phaddd m7, m6
+ REPX {paddd x, m11}, m0, m1, m2, m7
+ REPX {psrad x, m12}, m0, m1, m2, m7
+ packssdw m0, m1
+ packssdw m2, m7
+ %define m8 m6
+ %define m9 m4
+ %define m10 m5
+ movd m10, r4
+ movd m9, [stk+0x20]
+ punpckldq m10, m9
+ %endif
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ %if ARCH_X86_32
+ mova [stk+0x50], m7
+ mova [stk+0x60], m8
+ mova [stk+0x70], m9
+ mova [stk+0x80], m10
+ %xdefine m13 m7
+ %define m7 [stk+0x50]
+ %define m8 [stk+0x60]
+ %define m9 [stk+0x70]
+ %define m10 [stk+0x80]
+ %endif
+ punpcklwd m1, m0, m2 ; 01 23
+ punpckhwd m3, m0, m2 ; 23 45
+ %if ARCH_X86_32
+ mov r4, r0m
+ %define dstq r4
+ mova [stk+0x20], m3
+ mova [stk+0x30], m0
+ %endif
+.dy2_w2_loop:
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ movu m13, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m3, m8
+ REPX {pshufb x, m14}, m4, m5, m6, m13
+ REPX {pmaddwd x, m15}, m4, m5, m6, m13
+ phaddd m4, m5
+ phaddd m6, m13
+ pmaddwd m5, m1, m7
+ paddd m4, m11
+ paddd m6, m11
+ psrad m4, m12
+ psrad m6, m12
+ packssdw m4, m6 ; 6 7 8 9
+ paddd m5, m3
+ pshufd m3, m4, q2200
+ pshufd m4, m4, q3311
+ palignr m3, m0, 12 ; 4 6 6 8
+ palignr m4, m2, 12 ; 5 7 7 9
+ mova m0, m3
+ mova m2, m4
+ punpcklwd m1, m3, m4
+ punpckhwd m3, m4
+ pmaddwd m6, m1, m9
+ pmaddwd m4, m3, m10
+ paddd m5, vrnd_mem
+ paddd m6, m4
+ paddd m5, m6
+ pshufd m4, m12, q1032
+ pxor m6, m6
+ psrad m5, m4
+ packssdw m5, m5
+ pmaxsw m5, m6
+ pminsw m5, pxmaxm
+ movd [dstq+dsq*0], m5
+ pshuflw m5, m5, q1032
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+%endif
+INIT_XMM ssse3
+.dy2_w4:
+%if ARCH_X86_64
+ mov myd, mym
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %if isput
+ mova [rsp+0x30], m13
+ %define vrnd_mem [rsp+0x30]
+ %define stk rsp+0x40
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %define stk rsp+0x30
+ %endif
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m9 [base+pd_0x4000]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq r3
+ %endif
+ movzx r5, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r5
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd m15, m0
+ pshufd m7, m15, q1032
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd r6d, m15
+ movd r13d, m7
+ mova m10, [base+bdct_lb_q+ 0]
+ mova m11, [base+bdct_lb_q+16]
+ movd m13, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+ r6*8+2]
+ movd m15, [base+subpel_filters+r11*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+%else
+ movd r1, m15
+ movd r4, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd r3, m15
+ movd r5, m7
+ mova m5, [base+bdct_lb_q+ 0]
+ mova m6, [base+bdct_lb_q+16]
+ movd m1, [base+subpel_filters+r1*8+2]
+ movd m2, [base+subpel_filters+r3*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r5*8+2]
+ SWAP m4, m7
+ mov r3, r3m
+ %if isprep
+ lea ss3q, [ssq*3]
+ %endif
+ %define m10 m5
+ %define m11 m6
+ %define m12 m1
+ %define m13 m1
+%endif
+ psrld m14, 10
+ paddd m14, m14
+ punpckldq m13, m2
+ punpckldq m15, m4
+ punpcklqdq m13, m15
+ pxor m2, m2
+ pcmpeqd m0, m2
+%if ARCH_X86_64
+ pand m9, m0
+%else
+ pand m2, m9, m0
+ %define m9 m2
+ SWAP m7, m4
+%endif
+ pandn m0, m13
+%if ARCH_X86_64
+ SWAP m13, m0
+%else
+ %define m13 m0
+%endif
+ por m13, m9
+ punpckhbw m15, m13, m13
+ punpcklbw m13, m13
+ psraw m15, 8
+ psraw m13, 8
+ pshufb m12, m14, m10
+ pshufb m14, m11
+ mova m10, [base+spel_s_shuf2]
+ movd r4d, m14
+ shr r4d, 24
+%if ARCH_X86_32
+ mova [stk+0x40], m13
+ mova [stk+0x50], m15
+ pxor m2, m2
+%endif
+ pshufb m7, m14, m2
+ psubb m14, m7
+ paddb m12, m10
+ paddb m14, m10
+%if ARCH_X86_64
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu m1, [srcq+ssq*0]
+ movu m8, [srcq+ssq*2]
+ movu m9, [srcq+ssq*1]
+ movu m10, [srcq+ss3q ]
+ movu m7, [srcq+r4 ]
+ movu m2, [srcq+r11 ]
+ movu m3, [srcq+r6 ]
+ movu m4, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m12}, m1, m9, m8, m10
+ REPX {pmaddwd x, m13}, m1, m9, m8, m10
+ REPX {pshufb x, m14}, m7, m3, m2, m4
+ REPX {pmaddwd x, m15}, m7, m3, m2, m4
+ mova m5, [rsp+0x10]
+ movd xm6, [rsp+0x20]
+ phaddd m1, m7
+ phaddd m8, m2
+ phaddd m9, m3
+ phaddd m10, m4
+ movu m2, [srcq+ssq*0]
+ movu m3, [srcq+ssq*1]
+ REPX {paddd x, m5}, m1, m9, m8, m10
+ REPX {psrad x, xm6}, m1, m9, m8, m10
+ packssdw m1, m8 ; 0 2
+ packssdw m9, m10 ; 1 3
+ movu m0, [srcq+r4 ]
+ movu m8, [srcq+r6 ]
+ lea srcq, [srcq+ssq*2]
+ REPX {pshufb x, m12}, m2, m3
+ REPX {pmaddwd x, m13}, m2, m3
+ REPX {pshufb x, m14}, m0, m8
+ REPX {pmaddwd x, m15}, m0, m8
+ phaddd m2, m0
+ phaddd m3, m8
+ shr myd, 6
+ mov r9d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r9q, [base+subpel_filters+myq*8]
+ REPX {paddd x, m5}, m2, m3
+ REPX {psrad x, xm6}, m2, m3
+ packssdw m2, m3 ; 4 5
+ pshufd m3, m2, q1032 ; 5 _
+ punpcklwd m0, m1, m9 ; 01
+ punpckhwd m1, m9 ; 23
+ punpcklwd m2, m3 ; 45
+ movq m10, r9
+ %define hrnd_mem [rsp+0x10]
+ %define hsh_mem [rsp+0x20]
+ %define vsh_mem [rsp+0x28]
+ %if isput
+ %define vrnd_mem [rsp+0x30]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+%else
+ mova [stk+0x20], m12
+ mova [stk+0x30], m14
+ add r4, srcq
+ MC_4TAP_SCALED_H 0x60 ; 0 1
+ MC_4TAP_SCALED_H 0x70 ; 2 3
+ MC_4TAP_SCALED_H 0x80 ; 4 5
+ mov [stk+0xe0], r4
+ mova m3, [base+spel_s_shuf8]
+ mova m0, [stk+0x60]
+ mova m1, [stk+0x70]
+ mova m2, [stk+0x80]
+ mov myd, mym
+ mov rX, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea rX, [rX+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+rX*8+0]
+ cmovnz r5, [base+subpel_filters+rX*8+4]
+ mov r3, r3m
+ pshufb m0, m3 ; 01
+ pshufb m1, m3 ; 23
+ pshufb m2, m3 ; 45
+ movd m7, r4
+ movd m4, r5
+ mov r5, r0m
+ %if isput
+ mov r1, r1m
+ %endif
+ mov r4, [stk+0xe0]
+ %define dstq r5
+ %define tmpq r5
+ %define m12 [stk+0x20]
+ %define m14 [stk+0x30]
+ %define m13 [stk+0x40]
+ %define m15 [stk+0x50]
+ %define hrnd_mem [esp+0x00]
+ %define hsh_mem [esp+0x10]
+ %define vsh_mem [esp+0x18]
+ %if isput
+ %define vrnd_mem [esp+0x20]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+ %define m10 m7
+ punpckldq m10, m4
+%endif
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m3, m10, q0000
+ pshufd m4, m10, q1111
+ pshufd m5, m10, q2222
+ pshufd m10, m10, q3333
+%if ARCH_X86_32
+ %xdefine m8 m3
+ %xdefine m9 m6
+ %xdefine m11 m5
+ %xdefine m6 m4
+ mova [stk+0x100], m3
+ mova [stk+0x110], m4
+ mova [stk+0x120], m5
+ mova [stk+0x130], m10
+ %define m3 [stk+0x100]
+ %define m4 [stk+0x110]
+ %define m5 [stk+0x120]
+ %define m10 [stk+0x130]
+%endif
+.dy2_w4_loop:
+ pmaddwd m8, m0, m3
+ pmaddwd m9, m1, m3
+ mova m0, m2
+ pmaddwd m1, m4
+ pmaddwd m11, m2, m4
+ paddd m8, vrnd_mem
+ paddd m9, vrnd_mem
+ pmaddwd m2, m5
+ paddd m8, m1
+ paddd m9, m11
+ paddd m8, m2
+ movu m6, [srcq+ssq*0]
+ movu m1, [srcq+ssq*2]
+%if ARCH_X86_64
+ movu m11, [srcq+r4 ]
+ movu m2, [srcq+r11]
+%else
+ movu m11, [r4+ssq*0]
+ movu m2, [r4+ssq*2]
+%endif
+ pshufb m6, m12
+ pshufb m1, m12
+ pmaddwd m6, m13
+ pmaddwd m1, m13
+ pshufb m11, m14
+ pshufb m2, m14
+ pmaddwd m11, m15
+ pmaddwd m2, m15
+ phaddd m6, m11
+ phaddd m1, m2
+ paddd m6, hrnd_mem
+ paddd m1, hrnd_mem
+ psrad m6, hsh_mem
+ psrad m1, hsh_mem
+ movu m7, [srcq+ssq*1]
+ movu m11, [srcq+ss3q ]
+ packssdw m6, m1 ; 6 8
+%if ARCH_X86_64
+ movu m2, [srcq+r6 ]
+ movu m1, [srcq+r13]
+%else
+ movu m2, [r4+ssq*1]
+ movu m1, [r4+ss3q ]
+%endif
+ pshufb m7, m12
+ pshufb m11, m12
+ pmaddwd m7, m13
+ pmaddwd m11, m13
+ pshufb m2, m14
+ pshufb m1, m14
+ pmaddwd m2, m15
+ pmaddwd m1, m15
+ phaddd m7, m2
+ phaddd m11, m1
+ paddd m7, hrnd_mem
+ paddd m11, hrnd_mem
+ psrad m7, hsh_mem
+ psrad m11, hsh_mem
+ packssdw m7, m11 ; 7 9
+%if ARCH_X86_32
+ lea r4, [r4+ssq*4]
+%endif
+ lea srcq, [srcq+ssq*4]
+ punpcklwd m1, m6, m7 ; 67
+ punpckhwd m6, m7 ; 89
+ mova m2, m6
+ pmaddwd m11, m1, m5
+ pmaddwd m7, m1, m10
+ pmaddwd m6, m10
+ paddd m9, m11
+%if isput
+ movd m11, vsh_mem
+%endif
+ paddd m8, m7
+ paddd m9, m6
+%if isput
+ psrad m8, m11
+ psrad m9, m11
+ packssdw m8, m9
+ pxor m7, m7
+ pmaxsw m8, m7
+ pminsw m8, pxmaxm
+ movq [dstq+dsq*0], m8
+ movhps [dstq+dsq*1], m8
+ lea dstq, [dstq+dsq*2]
+%else
+ psrad m8, 6
+ psrad m9, 6
+ packssdw m8, m9
+ mova [tmpq], m8
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET ; why not jz .ret?
+INIT_XMM ssse3
+.dy2_w8:
+ mov dword [stk+0xf0], 1
+ movifprep tmp_stridem, 16
+ jmp .dy2_w_start
+.dy2_w16:
+ mov dword [stk+0xf0], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [stk+0xf0], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [stk+0xf0], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [stk+0xf0], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+ mov myd, mym
+%if ARCH_X86_64
+ %ifidn %1, put
+ movifnidn dsm, dsq
+ %endif
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %define hround m11
+ %if isput
+ mova [rsp+0x30], m13
+ %else
+ mova m13, [base+pd_m524256]
+ %endif
+ shr t0d, 16
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movd m15, t0d
+%else
+ %define hround [esp+0x00]
+ %define m12 [esp+0x10]
+ %define m10 [base+pd_0x3ff]
+ %define m8 m0
+ %xdefine m14 m4
+ %xdefine m15 m3
+ %if isput
+ %define dstq r0
+ %else
+ %define tmpq r0
+ %define ssq ssm
+ %endif
+ mov r5, [esp+0x1f0]
+ mov r3, [esp+0x1f4]
+ shr r5, 16
+ movd m15, r5
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r0, r0m
+ mov r3, r3m
+%endif
+ sub srcq, 6
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+%if ARCH_X86_64
+ movq m3, r4q
+%else
+ movd m5, r4
+ movd m6, r5
+ punpckldq m5, m6
+ SWAP m3, m5
+%endif
+ punpcklbw m3, m3
+ psraw m3, 8
+ mova [stk+0x100], m7
+ mova [stk+0x120], m15
+ mov [stk+0x0f8], srcq
+ mov [stk+0x130], r0q ; dstq / tmpq
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+%if ARCH_X86_64
+ mova [stk+0x140], m0
+ mova [stk+0x150], m1
+ mova [stk+0x160], m2
+ mova [stk+0x170], m3
+ %if UNIX64
+ mov hm, hd
+ %endif
+%else
+ mova [stk+0x180], m0
+ mova [stk+0x190], m1
+ mova [stk+0x1a0], m2
+ mova [stk+0x1b0], m3
+ SWAP m5, m3
+ mov r5, hm
+ mov [stk+0x134], r5
+%endif
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [stk+0x0f0]
+ jz .ret
+%if ARCH_X86_64
+ add qword [stk+0x130], 16
+ mov hd, hm
+%else
+ add dword [stk+0x130], 16
+ mov r5, [stk+0x134]
+ mov r0, [stk+0x130]
+%endif
+ mova m7, [stk+0x100]
+ mova m14, [stk+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+ mova m11, [rsp+0x10]
+%endif
+ mova m15, [stk+0x120]
+ mov srcq, [stk+0x0f8]
+%if ARCH_X86_64
+ mov r0q, [stk+0x130] ; dstq / tmpq
+%else
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.dy2_hloop:
+%if ARCH_X86_64
+ mova m9, [base+pq_0x40000000]
+%else
+ %define m9 [base+pq_0x40000000]
+%endif
+ pxor m1, m1
+ psrld m2, m14, 10
+ mova [stk], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m1
+ pshufd m2, m5, q1032
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pxor m2, m2
+ pcmpeqd m5, m2
+ mova [stk+0x110], m14
+ pshufd m4, m15, q1032
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ movq r11, m14
+ punpckhqdq m14, m14
+ movq rX, m14
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mov r4d, [stk+ 0]
+ mov r6d, [stk+ 4]
+ mov r7d, [stk+ 8]
+ mov r9d, [stk+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m14, m5, q1100
+ pshufd m5, m5, q3322
+ pand m7, m9, m4
+ pand m8, m9, m6
+ pand m15, m9, m14
+ pand m9, m9, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m14, m2
+ pandn m5, m3
+ por m7, m4
+ por m8, m6
+ por m15, m14
+ por m9, m5
+ punpcklbw m0, m7, m7
+ punpckhbw m7, m7
+ punpcklbw m1, m8, m8
+ punpckhbw m8, m8
+ psraw m0, 8
+ psraw m7, 8
+ psraw m1, 8
+ psraw m8, 8
+ punpcklbw m2, m15, m15
+ punpckhbw m15, m15
+ punpcklbw m3, m9, m9
+ punpckhbw m9, m9
+ psraw m2, 8
+ psraw m15, 8
+ psraw m3, 8
+ psraw m9, 8
+ mova [stk+0x10], m0
+ mova [stk+0x20], m7
+ mova [stk+0x30], m1
+ mova [stk+0x40], m8
+ mova [stk+0x50], m2
+ mova [stk+0x60], m15
+ mova [stk+0x70], m3
+ mova [stk+0x80], m9
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
+ mova [stk+0x90], m1
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
+ mova [stk+0xa0], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
+ mova [stk+0xb0], m3
+ MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
+ mova [stk+0xc0], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
+ mova [stk+0xd0], m5
+ MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
+ MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
+ MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
+ mova m5, [stk+0xd0]
+ mova m1, [stk+0x90]
+ mova m2, [stk+0xa0]
+ mova m3, [stk+0xb0]
+ mova m9, [stk+0xc0]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m9 ; 23a
+ punpckhwd m3, m9 ; 23b
+ mova m10, [stk+0x140]
+ mova m11, [stk+0x150]
+ mova m14, [stk+0x160]
+ mova m15, [stk+0x170]
+ mova [stk+0x90], m4
+ mova [stk+0xa0], m5
+ mova [stk+0xb0], m6
+ mova [stk+0xc0], m7
+ %define hround [rsp+0x10]
+ %define shift [rsp+0x20]
+ %if isput
+ %define vround [rsp+0x30]
+ %else
+ %define vround [base+pd_m524256]
+ %endif
+.dy2_vloop:
+ pmaddwd m4, m0, m10
+ pmaddwd m5, m1, m10
+ pmaddwd m6, m2, m11
+ pmaddwd m7, m3, m11
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [stk+0x90], m14
+ pmaddwd m7, [stk+0xa0], m14
+ pmaddwd m8, [stk+0xb0], m15
+ pmaddwd m9, [stk+0xc0], m15
+ paddd m4, m6
+ paddd m5, m7
+ %if isput
+ pshufd m6, m12, q1032
+ %endif
+ paddd m4, m8
+ paddd m5, m9
+%else
+ movd r0, m15
+ movd rX, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r4, m15
+ movd r5, m4
+ mova m14, [stk+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [stk+16], m14
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m9, m4
+ pand m1, m9, m6
+ pand m2, m9, m7
+ pand m3, m9, m5
+ pandn m4, [stk+0x20]
+ pandn m6, [stk+0x30]
+ pandn m7, [stk+0x40]
+ pandn m5, [stk+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ punpcklbw m4, m0, m0
+ punpckhbw m0, m0
+ punpcklbw m5, m1, m1
+ punpckhbw m1, m1
+ psraw m4, 8
+ psraw m0, 8
+ psraw m5, 8
+ psraw m1, 8
+ punpcklbw m6, m2, m2
+ punpckhbw m2, m2
+ punpcklbw m7, m3, m3
+ punpckhbw m3, m3
+ psraw m6, 8
+ psraw m2, 8
+ psraw m7, 8
+ psraw m3, 8
+ mova [stk+0x0a0], m4
+ mova [stk+0x0b0], m0
+ mova [stk+0x0c0], m5
+ mova [stk+0x0d0], m1
+ mova [stk+0x140], m6
+ mova [stk+0x150], m2
+ mova [stk+0x160], m7
+ mova [stk+0x170], m3
+ MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
+ MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
+ MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
+ MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
+ MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
+ MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
+ MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
+ MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
+ mova m5, [stk+0x60]
+ mova m6, [stk+0x70]
+ mova m7, [stk+0x80]
+ mova m0, [stk+0x90]
+ mov r0, r0m
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova m1, [stk+0x20]
+ mova m2, [stk+0x30]
+ mova m3, [stk+0x40]
+ mova m4, [stk+0x50]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova m4, [stk+0x180]
+ mova m5, [stk+0x190]
+ mova m6, [stk+0x1a0]
+ mova m7, [stk+0x1b0]
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+.dy2_vloop:
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ paddd m0, m2
+ paddd m1, m3
+ pmaddwd m2, [stk+0x60], m6
+ pmaddwd m3, [stk+0x70], m6
+ pmaddwd m4, [stk+0x80], m7
+ pmaddwd m5, [stk+0x90], m7
+ %if isput
+ movd m6, [esp+0x18]
+ %endif
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, vrnd_mem
+ paddd m1, vrnd_mem
+ paddd m4, m0
+ paddd m5, m1
+%endif
+%ifidn %1, put
+ psrad m4, m6
+ psrad m5, m6
+ packssdw m4, m5
+ pxor m7, m7
+ pmaxsw m4, m7
+ pminsw m4, pxmaxm
+ mova [dstq], m4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ psrad m5, 6
+ packssdw m4, m5
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+%if ARCH_X86_64
+ MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1
+ mova [stk+0xd0], m4
+ MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1
+ mova m4, [stk+0xd0]
+ mova m0, m2 ; 01a
+ mova m1, m3 ; 01b
+ mova m2, [stk+0x90] ; 23a
+ mova m3, [stk+0xa0] ; 23b
+ mova m5, [stk+0xb0] ; 45a
+ mova m6, [stk+0xc0] ; 45b
+ punpcklwd m7, m4, m8 ; 67a
+ punpckhwd m4, m8 ; 67b
+ mova [stk+0x90], m5
+ mova [stk+0xa0], m6
+ mova [stk+0xb0], m7
+ mova [stk+0xc0], m4
+%else
+ mov r0m, r0
+ mov r3, r3m
+ MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8
+ MC_8TAP_SCALED_H 0xa0, 0 ; 9
+ mova m7, [stk+0xe0]
+ mova m2, [stk+0x60] ; 23a
+ mova m3, [stk+0x70] ; 23b
+ mova m4, [stk+0x80] ; 45a
+ mova m5, [stk+0x90] ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova m0, [stk+0x40] ; 01a
+ mova m1, [stk+0x50] ; 01b
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova m4, [stk+0x180]
+ mova m5, [stk+0x190]
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova m6, [stk+0x1a0]
+ mova m7, [stk+0x1b0]
+ mov r0, r0m
+%endif
+ jmp .dy2_vloop
+INIT_XMM ssse3
+.ret:
+ MC_8TAP_SCALED_RET 0
+%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
+ %define r0m [rstk+stack_offset+ 4]
+ %define r1m [rstk+stack_offset+ 8]
+ %define r2m [rstk+stack_offset+12]
+ %define r3m [rstk+stack_offset+16]
+%endif
+%undef isput
+%undef isprep
+%endmacro
+
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled_16bpc
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, (5*15 << 16) | 5*15
+ jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%elif ARCH_X86_64
+DECLARE_REG_TMP 6, 8
+%else
+DECLARE_REG_TMP 1, 2
+%endif
+BILIN_SCALED_FN put
+FN put_8tap_scaled, sharp, SHARP, SHARP
+FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN put_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN put_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN put_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN put_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%elif ARCH_X86_64
+DECLARE_REG_TMP 6, 7
+%else
+DECLARE_REG_TMP 1, 2
+%endif
+BILIN_SCALED_FN prep
+FN prep_8tap_scaled, sharp, SHARP, SHARP
+FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN prep_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+
+%if ARCH_X86_64
+DECLARE_REG_TMP 6
+%else
+DECLARE_REG_TMP 2
+%endif
+
+%if ARCH_X86_64
+; warp8x8t spills one less xmm register than warp8x8 on WIN64; compensate for
+; that by allocating 16 bytes more stack space so that stack offsets match up.
+%if WIN64 && STACK_ALIGNMENT == 16
+%assign stksz 16*14
+%else
+%assign stksz 16*13
+%endif
+cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \
+ mx, tmp, alpha, beta, \
+ filter, my, gamma, cnt
+%assign stack_size_padded_8x8t stack_size_padded
+%else
+cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
+ filter, mx, my
+%define m8 [esp+16*13]
+%define m9 [esp+16*14]
+%define cntd dword [esp+4*63]
+%define dstq tmpq
+%define dsq 0
+%if STACK_ALIGNMENT < 16
+%define dstm [esp+4*65]
+%define dsm [esp+4*66]
+%else
+%define dstm r0m
+%define dsm r1m
+%endif
+%endif
+%define base filterq-$$
+ mov t0d, r7m
+ LEA filterq, $$
+ shr t0d, 11
+%if ARCH_X86_64
+ movddup m8, [base+warp8x8t_rnd]
+%else
+ movddup m1, [base+warp8x8t_rnd]
+ mov r1, r1m
+ add r1, r1
+ mova m8, m1
+ mov r1m, r1 ; ds *= 2
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main
+ jmp .start
+.loop:
+%if ARCH_X86_64
+ lea dstq, [dstq+dsq*4]
+%else
+ add dstq, dsm
+ mov dstm, dstq
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2
+.start:
+%if ARCH_X86_32
+ mov dstq, dstm
+%endif
+ paddd m1, m8
+ paddd m2, m8
+ psrad m1, 15
+ psrad m2, 15
+ packssdw m1, m2
+ mova [dstq+dsq*0], m1
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3
+%if ARCH_X86_32
+ mov dstq, dstm
+ add dstq, dsm
+%endif
+ paddd m1, m8
+ paddd m2, m8
+ psrad m1, 15
+ psrad m2, 15
+ packssdw m1, m2
+ mova [dstq+dsq*2], m1
+ dec cntd
+ jg .loop
+ RET
+
+%if ARCH_X86_64
+cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \
+ mx, tmp, alpha, beta, \
+ filter, my, gamma, cnt
+ASSERT stack_size_padded == stack_size_padded_8x8t
+%else
+cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
+ filter, mx, my
+%endif
+ mov t0d, r7m
+ LEA filterq, $$
+ shr t0d, 11
+%if ARCH_X86_64
+ movddup m8, [base+warp8x8_rnd2+t0*8]
+ movd m9, r7m ; pixel_max
+ pshufb m9, [base+pw_256]
+%else
+ movddup m1, [base+warp8x8_rnd2+t0*8]
+ movd m2, r7m ; pixel_max
+ pshufb m2, [base+pw_256]
+ mova m8, m1
+ mova m9, m2
+%endif
+ call .main
+ jmp .start
+.loop:
+%if ARCH_X86_64
+ lea dstq, [dstq+dsq*2]
+%else
+ add dstq, dsm
+ mov dstm, dstq
+%endif
+ call .main2
+.start:
+%if ARCH_X86_32
+ mov dstq, dstm
+%endif
+ psrad m1, 16
+ psrad m2, 16
+ packssdw m1, m2
+ pmaxsw m1, m6
+ pmulhrsw m1, m8
+ pminsw m1, m9
+ mova [dstq+dsq*0], m1
+ call .main3
+%if ARCH_X86_32
+ mov dstq, dstm
+ add dstq, dsm
+%endif
+ psrad m1, 16
+ psrad m2, 16
+ packssdw m1, m2
+ pmaxsw m1, m6
+ pmulhrsw m1, m8
+ pminsw m1, m9
+ mova [dstq+dsq*1], m1
+ dec cntd
+ jg .loop
+ RET
+ALIGN function_align
+.main:
+ ; Stack args offset by one (r4m -> r5m etc.) due to call
+%if WIN64
+ mov deltaq, r5m
+ mov mxd, r6m
+%endif
+ movd m0, [base+warp8x8_shift+t0*4]
+ movddup m7, [base+warp8x8_rnd1+t0*8]
+ add filterq, mc_warp_filter-$$
+%if ARCH_X86_64
+ movsx alphad, word [deltaq+2*0]
+ movsx betad, word [deltaq+2*1]
+ movsx gammad, word [deltaq+2*2]
+ movsx deltad, word [deltaq+2*3]
+ lea tmpq, [ssq*3]
+ add mxd, 512+(64<<10)
+ sub srcq, tmpq ; src -= ss*3
+ imul tmpd, alphad, -7
+ mov myd, r7m
+ add betad, tmpd ; beta -= alpha*7
+ imul tmpd, gammad, -7
+ add myd, 512+(64<<10)
+ mov cntd, 4
+ add deltad, tmpd ; delta -= gamma*7
+%else
+%if STACK_ALIGNMENT < 16
+ %assign stack_offset stack_offset - gprsize
+%endif
+ mov r3d, r5m ; abcd
+%if STACK_ALIGNMENT < 16
+ mov r0, r1m ; dst
+ mov r1, r2m ; ds
+ mov [esp+gprsize+4*65], r0
+ mov [esp+gprsize+4*66], r1
+%endif
+ movsx alphad, word [r3+2*0]
+ movsx r2d, word [r3+2*1]
+ movsx gammad, word [r3+2*2]
+ movsx r3d, word [r3+2*3]
+ imul r5d, alphad, -7
+ add r2d, r5d ; beta -= alpha*7
+ imul r5d, gammad, -7
+ mov [esp+gprsize+4*60], r2d
+ add r3d, r5d ; delta -= gamma*7
+ mov [esp+gprsize+4*61], r3d
+ mov r3d, r4m ; ss
+ mov srcq, r3m
+ mov mxd, r6m
+ mov myd, r7m
+ mov dword [esp+gprsize+4*63], 4 ; cnt
+ mov [esp+gprsize+4*62], r3
+ lea r3, [r3*3]
+ add mxd, 512+(64<<10)
+ add myd, 512+(64<<10)
+ sub srcq, r3 ; src -= ss*3
+%if STACK_ALIGNMENT < 16
+ %assign stack_offset stack_offset + gprsize
+%endif
+%endif
+ mova [rsp+gprsize], m0
+ pxor m6, m6
+ call .h
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 01
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 1], m1
+ mova [rsp+gprsize+16* 4], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 12
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 7], m1
+ mova [rsp+gprsize+16*10], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 23
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 2], m1
+ mova [rsp+gprsize+16* 5], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 34
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 8], m1
+ mova [rsp+gprsize+16*11], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 45
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 3], m1
+ mova [rsp+gprsize+16* 6], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 56
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 9], m1
+ mova [rsp+gprsize+16*12], m5
+ mova m5, m0
+.main2:
+ call .h
+%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h
+ lea tmpd, [myq+gammaq]
+ shr myd, 10
+ movq m4, [filterq+myq*8] ; a
+ lea myd, [tmpq+gammaq]
+ shr tmpd, 10
+ movq m2, [filterq+tmpq*8] ; b
+ lea tmpd, [myq+gammaq]
+ shr myd, 10
+ movq m3, [filterq+myq*8] ; c
+ lea myd, [tmpq+gammaq]
+ shr tmpd, 10
+ movq m1, [filterq+tmpq*8] ; d
+ lea tmpd, [myq+gammaq]
+ shr myd, 10
+ punpcklwd m4, m2
+ punpcklwd m3, m1
+ punpckldq m2, m4, m3
+ punpckhdq m4, m3
+ punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
+ pmaddwd m1, [rsp+gprsize+16*%1]
+ punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
+ mova m2, [rsp+gprsize+16*%2]
+ pmaddwd m3, m2
+ mova [rsp+gprsize+16*%1], m2
+ paddd m1, m3
+ punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
+ mova m2, [rsp+gprsize+16*%3]
+ pmaddwd m3, m2
+ mova [rsp+gprsize+16*%2], m2
+ paddd m1, m3
+ punpcklwd m3, m5, m0 ; 67
+ punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
+ pmaddwd m2, m3
+ mova [rsp+gprsize+16*%3], m3
+ paddd m1, m2
+ movq m4, [filterq+myq*8] ; e
+ lea myd, [tmpq+gammaq]
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8] ; f
+ lea tmpd, [myq+gammaq]
+ shr myd, 10
+ movq m2, [filterq+myq*8] ; g
+%if ARCH_X86_64
+ lea myd, [tmpq+deltaq] ; my += delta
+%else
+ mov myd, [esp+gprsize+4*61]
+ add myd, tmpd
+%endif
+ shr tmpd, 10
+ punpcklwd m4, m3
+ movq m3, [filterq+tmpq*8] ; h
+ punpcklwd m2, m3
+ punpckldq m3, m4, m2
+ punpckhdq m4, m2
+ punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8
+ pmaddwd m2, [rsp+gprsize+16*%4]
+ punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8
+ mova m3, [rsp+gprsize+16*%5]
+ pmaddwd m6, m3
+ mova [rsp+gprsize+16*%4], m3
+ pxor m3, m3
+ paddd m2, m6
+ punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8
+ mova m6, [rsp+gprsize+16*%6]
+ pmaddwd m3, m6
+ mova [rsp+gprsize+16*%5], m6
+ punpckhwd m5, m0
+ pxor m6, m6
+ paddd m2, m3
+ punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8
+ pmaddwd m3, m5
+ mova [rsp+gprsize+16*%6], m5
+ mova m5, m0
+ paddd m2, m3
+%endmacro
+ WARP_V 1, 2, 3, 4, 5, 6
+ ret
+.main3:
+ call .h
+ WARP_V 7, 8, 9, 10, 11, 12
+ ret
+ALIGN function_align
+.h:
+ lea tmpd, [mxq+alphaq]
+ shr mxd, 10
+ movq m3, [filterq+mxq*8]
+ punpcklbw m0, m6, m3
+ movu m3, [srcq-6]
+ pmaddwd m0, m3 ; 0
+ lea mxd, [tmpq+alphaq]
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8]
+ punpcklbw m2, m6, m3
+ movu m3, [srcq-4]
+ pmaddwd m2, m3 ; 1
+ lea tmpd, [mxq+alphaq]
+ shr mxd, 10
+ movq m3, [filterq+mxq*8]
+ phaddd m0, m2 ; 0 1
+ punpcklbw m2, m6, m3
+ movu m3, [srcq-2]
+ pmaddwd m2, m3 ; 2
+ lea mxd, [tmpq+alphaq]
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8]
+ punpcklbw m1, m6, m3
+ movu m3, [srcq+0]
+ pmaddwd m1, m3 ; 3
+ lea tmpd, [mxq+alphaq]
+ shr mxd, 10
+ movq m3, [filterq+mxq*8]
+ phaddd m2, m1 ; 2 3
+ punpcklbw m1, m6, m3
+ movu m3, [srcq+2]
+ pmaddwd m1, m3 ; 4
+ lea mxd, [tmpq+alphaq]
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8]
+ phaddd m0, m2 ; 0 1 2 3
+ punpcklbw m2, m6, m3
+ movu m3, [srcq+4]
+ pmaddwd m2, m3 ; 5
+ lea tmpd, [mxq+alphaq]
+ shr mxd, 10
+ movq m3, [filterq+mxq*8]
+ phaddd m1, m2 ; 4 5
+ punpcklbw m2, m6, m3
+ movu m3, [srcq+6]
+ pmaddwd m2, m3 ; 6
+%if ARCH_X86_64
+ lea mxd, [tmpq+betaq] ; mx += beta
+%else
+ mov mxd, [esp+gprsize*2+4*60]
+ add mxd, tmpd
+%endif
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8]
+ punpcklbw m4, m6, m3
+ movu m3, [srcq+8]
+%if ARCH_X86_64
+ add srcq, ssq
+%else
+ add srcq, [esp+gprsize*2+4*62]
+%endif
+ pmaddwd m3, m4 ; 7
+ phaddd m2, m3 ; 6 7
+ phaddd m1, m2 ; 4 5 6 7
+ paddd m0, m7
+ paddd m1, m7
+ psrad m0, [rsp+gprsize*2]
+ psrad m1, [rsp+gprsize*2]
+ packssdw m0, m1
+ ret
+
+%macro BIDIR_FN 0
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w4:
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+.ret:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w8:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jne .w8_loop
+ RET
+.w16_loop:
+ call .main
+ add dstq, strideq
+.w16:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ dec hd
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ call .main
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ call .main
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+16* 0], m0
+ mova [dstq+16* 1], m1
+ call .main
+ mova [dstq+16* 2], m0
+ mova [dstq+16* 3], m1
+ call .main
+ mova [dstq+16* 4], m0
+ mova [dstq+16* 5], m1
+ call .main
+ mova [dstq+16* 6], m0
+ mova [dstq+16* 7], m1
+ call .main
+ mova [dstq+16* 8], m0
+ mova [dstq+16* 9], m1
+ call .main
+ mova [dstq+16*10], m0
+ mova [dstq+16*11], m1
+ call .main
+ mova [dstq+16*12], m0
+ mova [dstq+16*13], m1
+ call .main
+ mova [dstq+16*14], m0
+ mova [dstq+16*15], m1
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h
+%define base r6-avg_ssse3_table
+ LEA r6, avg_ssse3_table
+ tzcnt wd, wm
+ mov t0d, r6m ; pixel_max
+ movsxd wq, [r6+wq*4]
+ shr t0d, 11
+ movddup m2, [base+bidir_rnd+t0*8]
+ movddup m3, [base+bidir_mul+t0*8]
+ movifnidn hd, hm
+ add wq, r6
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m0, [tmp1q+16*0]
+ paddsw m0, [tmp2q+16*0]
+ mova m1, [tmp1q+16*1]
+ paddsw m1, [tmp2q+16*1]
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ pmaxsw m0, m2
+ pmaxsw m1, m2
+ psubsw m0, m2
+ psubsw m1, m2
+ pmulhw m0, m3
+ pmulhw m1, m3
+ ret
+
+cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h
+%define base r6-w_avg_ssse3_table
+ LEA r6, w_avg_ssse3_table
+ tzcnt wd, wm
+ mov t0d, r6m ; weight
+ movd m6, r7m ; pixel_max
+ movddup m5, [base+pd_65538]
+ movsxd wq, [r6+wq*4]
+ pshufb m6, [base+pw_256]
+ add wq, r6
+ lea r6d, [t0-16]
+ shl t0d, 16
+ sub t0d, r6d ; 16-weight, weight
+ paddw m5, m6
+ mov r6d, t0d
+ shl t0d, 2
+ test dword r7m, 0x800
+ cmovnz r6d, t0d
+ movifnidn hd, hm
+ movd m4, r6d
+ pslld m5, 7
+ pxor m7, m7
+ pshufd m4, m4, q0000
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m2, [tmp1q+16*0]
+ mova m0, [tmp2q+16*0]
+ punpckhwd m3, m0, m2
+ punpcklwd m0, m2
+ mova m2, [tmp1q+16*1]
+ mova m1, [tmp2q+16*1]
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ pmaddwd m3, m4
+ pmaddwd m0, m4
+ paddd m3, m5
+ paddd m0, m5
+ psrad m3, 8
+ psrad m0, 8
+ packssdw m0, m3
+ punpckhwd m3, m1, m2
+ punpcklwd m1, m2
+ pmaddwd m3, m4
+ pmaddwd m1, m4
+ paddd m3, m5
+ paddd m1, m5
+ psrad m3, 8
+ psrad m1, 8
+ packssdw m1, m3
+ pminsw m0, m6
+ pminsw m1, m6
+ pmaxsw m0, m7
+ pmaxsw m1, m7
+ ret
+
+%if ARCH_X86_64
+cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
+%else
+cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
+%define hd dword r5m
+%define m8 [base+pw_64]
+%endif
+%define base r6-mask_ssse3_table
+ LEA r6, mask_ssse3_table
+ tzcnt wd, wm
+ mov t0d, r7m ; pixel_max
+ shr t0d, 11
+ movsxd wq, [r6+wq*4]
+ movddup m6, [base+bidir_rnd+t0*8]
+ movddup m7, [base+bidir_mul+t0*8]
+%if ARCH_X86_64
+ mova m8, [base+pw_64]
+ movifnidn hd, hm
+%endif
+ add wq, r6
+ mov maskq, r6mp
+ BIDIR_FN
+ALIGN function_align
+.main:
+ movq m3, [maskq+8*0]
+ mova m0, [tmp1q+16*0]
+ mova m4, [tmp2q+16*0]
+ pxor m5, m5
+ punpcklbw m3, m5
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ psubw m1, m8, m3
+ punpckhwd m4, m3, m1 ; m, 64-m
+ punpcklwd m3, m1
+ pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m)
+ pmaddwd m0, m3
+ movq m3, [maskq+8*1]
+ mova m1, [tmp1q+16*1]
+ mova m4, [tmp2q+16*1]
+ add maskq, 8*2
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ psrad m2, 5
+ psrad m0, 5
+ packssdw m0, m2
+ punpcklbw m3, m5
+ punpckhwd m2, m1, m4
+ punpcklwd m1, m4
+ psubw m5, m8, m3
+ punpckhwd m4, m3, m5 ; m, 64-m
+ punpcklwd m3, m5
+ pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m)
+ pmaddwd m1, m3
+ psrad m2, 5
+ psrad m1, 5
+ packssdw m1, m2
+ pmaxsw m0, m6
+ pmaxsw m1, m6
+ psubsw m0, m6
+ psubsw m1, m6
+ pmulhw m0, m7
+ pmulhw m1, m7
+ ret
+
+cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_420_ssse3_table
+ LEA t0, w_mask_420_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movd m0, r7m ; sign
+ shr r6d, 11
+ movsxd wq, [t0+wq*4]
+%if ARCH_X86_64
+ mova m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ mova m9, [base+pw_64]
+ movddup m10, [base+bidir_rnd+r6*8]
+ movddup m11, [base+bidir_mul+r6*8]
+%else
+ mova m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ mova m2, [base+pw_64]
+ movddup m3, [base+bidir_rnd+r6*8]
+ movddup m4, [base+bidir_mul+r6*8]
+ ALLOC_STACK -16*4
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ mova [rsp+16*2], m3
+ mova [rsp+16*3], m4
+ %define m8 [rsp+gprsize+16*0]
+ %define m9 [rsp+gprsize+16*1]
+ %define m10 [rsp+gprsize+16*2]
+ %define m11 [rsp+gprsize+16*3]
+%endif
+ movd m7, [base+pw_2]
+ psubw m7, m0
+ pshufb m7, [base+pw_256]
+ add wq, t0
+ movifnidn hd, r5m
+ mov maskq, r6mp
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 4
+.w4:
+ movq [dstq+strideq*0], m0
+ phaddw m2, m3
+ movhps [dstq+strideq*1], m0
+ phaddd m2, m2
+ lea dstq, [dstq+strideq*2]
+ paddw m2, m7
+ movq [dstq+strideq*0], m1
+ psrlw m2, 2
+ movhps [dstq+strideq*1], m1
+ packuswb m2, m2
+ movd [maskq], m2
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 4
+.w8:
+ mova [dstq+strideq*0], m0
+ paddw m2, m3
+ phaddw m2, m2
+ mova [dstq+strideq*1], m1
+ paddw m2, m7
+ psrlw m2, 2
+ packuswb m2, m2
+ movd [maskq], m2
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 8
+.w16:
+ mova [dstq+strideq*1+16*0], m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*1+16*1], m3
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*0]
+ paddw m3, [dstq+strideq*1+16*1]
+ mova [dstq+strideq*1+16*0], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*1], m1
+ paddw m2, m7
+ psrlw m2, 2
+ packuswb m2, m2
+ movq [maskq], m2
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16
+.w32:
+ mova [dstq+strideq*1+16*0], m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*1+16*1], m3
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ mova [dstq+strideq*0+16*2], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*3], m2
+ mova [dstq+strideq*0+16*3], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*0]
+ paddw m3, [dstq+strideq*1+16*1]
+ mova [dstq+strideq*1+16*0], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*2], m2
+ mova [dstq+strideq*1+16*1], m1
+ call .main
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*2]
+ paddw m2, [dstq+strideq*1+16*3]
+ mova [dstq+strideq*1+16*2], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*3], m1
+ packuswb m3, m2
+ mova [maskq], m3
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16*2
+.w64:
+ mova [dstq+strideq*1+16*1], m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*1+16*2], m3
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ mova [dstq+strideq*1+16*3], m2
+ mova [dstq+strideq*0+16*2], m0
+ mova [dstq+strideq*1+16*4], m3
+ mova [dstq+strideq*0+16*3], m1
+ call .main
+ mova [dstq+strideq*1+16*5], m2
+ mova [dstq+strideq*0+16*4], m0
+ mova [dstq+strideq*1+16*6], m3
+ mova [dstq+strideq*0+16*5], m1
+ call .main
+ mova [dstq+strideq*0+16*6], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*7], m2
+ mova [dstq+strideq*0+16*7], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*1]
+ paddw m3, [dstq+strideq*1+16*2]
+ mova [dstq+strideq*1+16*0], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*2], m2
+ mova [dstq+strideq*1+16*1], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*3]
+ paddw m3, [dstq+strideq*1+16*4]
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*2]
+ mova [dstq+strideq*1+16*2], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*3], m1
+ packuswb m3, m2
+ mova [maskq+16*0], m3
+ call .main
+ paddw m2, [dstq+strideq*1+16*5]
+ paddw m3, [dstq+strideq*1+16*6]
+ mova [dstq+strideq*1+16*4], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*6], m2
+ mova [dstq+strideq*1+16*5], m1
+ call .main
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*6]
+ paddw m2, [dstq+strideq*1+16*7]
+ mova [dstq+strideq*1+16*6], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*7], m1
+ packuswb m3, m2
+ mova [maskq+16*1], m3
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16*4
+.w128:
+ mova [dstq+strideq*1+16* 1], m2
+ mova [dstq+strideq*0+16* 0], m0
+ mova [dstq+strideq*1+16* 2], m3
+ mova [dstq+strideq*0+16* 1], m1
+ call .main
+ mova [dstq+strideq*1+16* 3], m2
+ mova [dstq+strideq*0+16* 2], m0
+ mova [dstq+strideq*1+16* 4], m3
+ mova [dstq+strideq*0+16* 3], m1
+ call .main
+ mova [dstq+strideq*1+16* 5], m2
+ mova [dstq+strideq*0+16* 4], m0
+ mova [dstq+strideq*1+16* 6], m3
+ mova [dstq+strideq*0+16* 5], m1
+ call .main
+ mova [dstq+strideq*1+16* 7], m2
+ mova [dstq+strideq*0+16* 6], m0
+ mova [dstq+strideq*1+16* 8], m3
+ mova [dstq+strideq*0+16* 7], m1
+ call .main
+ mova [dstq+strideq*1+16* 9], m2
+ mova [dstq+strideq*0+16* 8], m0
+ mova [dstq+strideq*1+16*10], m3
+ mova [dstq+strideq*0+16* 9], m1
+ call .main
+ mova [dstq+strideq*1+16*11], m2
+ mova [dstq+strideq*0+16*10], m0
+ mova [dstq+strideq*1+16*12], m3
+ mova [dstq+strideq*0+16*11], m1
+ call .main
+ mova [dstq+strideq*1+16*13], m2
+ mova [dstq+strideq*0+16*12], m0
+ mova [dstq+strideq*1+16*14], m3
+ mova [dstq+strideq*0+16*13], m1
+ call .main
+ mova [dstq+strideq*0+16*14], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*15], m2
+ mova [dstq+strideq*0+16*15], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16* 1]
+ paddw m3, [dstq+strideq*1+16* 2]
+ mova [dstq+strideq*1+16* 0], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16* 2], m2
+ mova [dstq+strideq*1+16* 1], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16* 3]
+ paddw m3, [dstq+strideq*1+16* 4]
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16* 2]
+ mova [dstq+strideq*1+16* 2], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16* 3], m1
+ packuswb m3, m2
+ mova [maskq+16*0], m3
+ call .main
+ paddw m2, [dstq+strideq*1+16* 5]
+ paddw m3, [dstq+strideq*1+16* 6]
+ mova [dstq+strideq*1+16* 4], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16* 6], m2
+ mova [dstq+strideq*1+16* 5], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16* 7]
+ paddw m3, [dstq+strideq*1+16* 8]
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16* 6]
+ mova [dstq+strideq*1+16* 6], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16* 7], m1
+ packuswb m3, m2
+ mova [maskq+16*1], m3
+ call .main
+ paddw m2, [dstq+strideq*1+16* 9]
+ paddw m3, [dstq+strideq*1+16*10]
+ mova [dstq+strideq*1+16* 8], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*10], m2
+ mova [dstq+strideq*1+16* 9], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*11]
+ paddw m3, [dstq+strideq*1+16*12]
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*10]
+ mova [dstq+strideq*1+16*10], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*11], m1
+ packuswb m3, m2
+ mova [maskq+16*2], m3
+ call .main
+ paddw m2, [dstq+strideq*1+16*13]
+ paddw m3, [dstq+strideq*1+16*14]
+ mova [dstq+strideq*1+16*12], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*14], m2
+ mova [dstq+strideq*1+16*13], m1
+ call .main
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*14]
+ paddw m2, [dstq+strideq*1+16*15]
+ mova [dstq+strideq*1+16*14], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*15], m1
+ packuswb m3, m2
+ mova [maskq+16*3], m3
+ sub hd, 2
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+%macro W_MASK 2 ; dst/tmp_offset, mask
+ mova m%1, [tmp1q+16*%1]
+ mova m%2, [tmp2q+16*%1]
+ punpcklwd m4, m%2, m%1
+ punpckhwd m5, m%2, m%1
+ psubsw m%1, m%2
+ pabsw m%1, m%1
+ psubusw m6, m8, m%1
+ psrlw m6, 10 ; 64-m
+ psubw m%2, m9, m6 ; m
+ punpcklwd m%1, m6, m%2
+ punpckhwd m6, m%2
+ pmaddwd m%1, m4
+ pmaddwd m6, m5
+ psrad m%1, 5
+ psrad m6, 5
+ packssdw m%1, m6
+ pmaxsw m%1, m10
+ psubsw m%1, m10
+ pmulhw m%1, m11
+%endmacro
+ W_MASK 0, 2
+ W_MASK 1, 3
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ ret
+
+cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_422_ssse3_table
+ LEA t0, w_mask_422_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movd m7, r7m ; sign
+ shr r6d, 11
+ movsxd wq, [t0+wq*4]
+%if ARCH_X86_64
+ mova m8, [base+pw_27615]
+ mova m9, [base+pw_64]
+ movddup m10, [base+bidir_rnd+r6*8]
+ movddup m11, [base+bidir_mul+r6*8]
+%else
+ mova m1, [base+pw_27615]
+ mova m2, [base+pw_64]
+ movddup m3, [base+bidir_rnd+r6*8]
+ movddup m4, [base+bidir_mul+r6*8]
+ ALLOC_STACK -16*4
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ mova [rsp+16*2], m3
+ mova [rsp+16*3], m4
+%endif
+ pxor m0, m0
+ add wq, t0
+ pshufb m7, m0
+ movifnidn hd, r5m
+ mov maskq, r6mp
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w4:
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+.end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w8:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ mova [dstq+strideq*1+16*0], m0
+ mova [dstq+strideq*1+16*1], m1
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ call .main
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ call .main
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+16* 0], m0
+ mova [dstq+16* 1], m1
+ call .main
+ mova [dstq+16* 2], m0
+ mova [dstq+16* 3], m1
+ call .main
+ mova [dstq+16* 4], m0
+ mova [dstq+16* 5], m1
+ call .main
+ mova [dstq+16* 6], m0
+ mova [dstq+16* 7], m1
+ call .main
+ mova [dstq+16* 8], m0
+ mova [dstq+16* 9], m1
+ call .main
+ mova [dstq+16*10], m0
+ mova [dstq+16*11], m1
+ call .main
+ mova [dstq+16*12], m0
+ mova [dstq+16*13], m1
+ call .main
+ mova [dstq+16*14], m0
+ mova [dstq+16*15], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ W_MASK 0, 2
+ W_MASK 1, 3
+ phaddw m2, m3
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ packuswb m2, m2
+ pxor m3, m3
+ psubb m2, m7
+ pavgb m2, m3
+ movq [maskq], m2
+ add maskq, 8
+ ret
+
+cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_444_ssse3_table
+ LEA t0, w_mask_444_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ shr r6d, 11
+ movsxd wq, [t0+wq*4]
+%if ARCH_X86_64
+ mova m8, [base+pw_27615]
+ mova m9, [base+pw_64]
+ movddup m10, [base+bidir_rnd+r6*8]
+ movddup m11, [base+bidir_mul+r6*8]
+%else
+ mova m1, [base+pw_27615]
+ mova m2, [base+pw_64]
+ movddup m3, [base+bidir_rnd+r6*8]
+ movddup m7, [base+bidir_mul+r6*8]
+ ALLOC_STACK -16*3
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ mova [rsp+16*2], m3
+ %define m11 m7
+%endif
+ add wq, t0
+ movifnidn hd, r5m
+ mov maskq, r6mp
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w4:
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+.end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w8:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ mova [dstq+strideq*1+16*0], m0
+ mova [dstq+strideq*1+16*1], m1
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ call .main
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ call .main
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+16* 0], m0
+ mova [dstq+16* 1], m1
+ call .main
+ mova [dstq+16* 2], m0
+ mova [dstq+16* 3], m1
+ call .main
+ mova [dstq+16* 4], m0
+ mova [dstq+16* 5], m1
+ call .main
+ mova [dstq+16* 6], m0
+ mova [dstq+16* 7], m1
+ call .main
+ mova [dstq+16* 8], m0
+ mova [dstq+16* 9], m1
+ call .main
+ mova [dstq+16*10], m0
+ mova [dstq+16*11], m1
+ call .main
+ mova [dstq+16*12], m0
+ mova [dstq+16*13], m1
+ call .main
+ mova [dstq+16*14], m0
+ mova [dstq+16*15], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ W_MASK 0, 2
+ W_MASK 1, 3
+ packuswb m2, m3
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ mova [maskq], m2
+ add maskq, 16
+ ret
+
+; (a * (64 - m) + b * m + 32) >> 6
+; = (((b - a) * m + 32) >> 6) + a
+; = (((b - a) * (m << 9) + 16384) >> 15) + a
+; except m << 9 overflows int16_t when m == 64 (which is possible),
+; but if we negate m it works out (-64 << 9 == -32768).
+; = (((a - b) * (m * -512) + 16384) >> 15) + a
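+; Illustrative worked check of the negated form (editorial example, not part
+; of the upstream derivation): take a=100, b=200, m=64. The exact blend is
+; (100*0 + 200*64 + 32) >> 6 = 200. In the rewrite, m*-512 = -32768 still
+; fits in int16_t, (a-b)*(m*-512) = (-100)*(-32768) = 3276800, and pmulhrsw
+; gives (3276800 + 16384) >> 15 = 100; adding a back yields 200, as expected.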
+cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3
+%define base r6-blend_ssse3_table
+ LEA r6, blend_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ movifnidn maskq, maskmp
+ mova m7, [base+pw_m512]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ pxor m6, m6
+ jmp wq
+.w4:
+ mova m5, [maskq]
+ movq m0, [dstq+strideq*0]
+ movhps m0, [dstq+strideq*1]
+ movq m1, [dstq+strideq*2]
+ movhps m1, [dstq+stride3q ]
+ psubw m2, m0, [tmpq+16*0]
+ psubw m3, m1, [tmpq+16*1]
+ add maskq, 16
+ add tmpq, 32
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ mova m5, [maskq]
+ mova m0, [dstq+strideq*0]
+ mova m1, [dstq+strideq*1]
+ psubw m2, m0, [tmpq+16*0]
+ psubw m3, m1, [tmpq+16*1]
+ add maskq, 16
+ add tmpq, 32
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8
+ RET
+.w16:
+ mova m5, [maskq]
+ mova m0, [dstq+16*0]
+ mova m1, [dstq+16*1]
+ psubw m2, m0, [tmpq+16*0]
+ psubw m3, m1, [tmpq+16*1]
+ add maskq, 16
+ add tmpq, 32
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w16
+ RET
+.w32:
+ mova m5, [maskq+16*0]
+ mova m0, [dstq+16*0]
+ mova m1, [dstq+16*1]
+ psubw m2, m0, [tmpq+16*0]
+ psubw m3, m1, [tmpq+16*1]
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova m5, [maskq+16*1]
+ mova m0, [dstq+16*2]
+ mova m1, [dstq+16*3]
+ psubw m2, m0, [tmpq+16*2]
+ psubw m3, m1, [tmpq+16*3]
+ add maskq, 32
+ add tmpq, 64
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ add dstq, strideq
+ dec hd
+ jg .w32
+ RET
+
+cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
+%define base r5-blend_v_ssse3_table
+ LEA r5, blend_v_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp wq
+.w2:
+ movd m4, [base+obmc_masks+2*2]
+.w2_loop:
+ movd m0, [dstq+strideq*0]
+ movd m2, [tmpq+4*0]
+ movd m1, [dstq+strideq*1]
+ movd m3, [tmpq+4*1]
+ add tmpq, 4*2
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w2_loop
+ RET
+.w4:
+ movddup m2, [base+obmc_masks+4*2]
+.w4_loop:
+ movq m0, [dstq+strideq*0]
+ movhps m0, [dstq+strideq*1]
+ mova m1, [tmpq]
+ add tmpq, 8*2
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+.w8:
+ mova m4, [base+obmc_masks+8*2]
+.w8_loop:
+ mova m0, [dstq+strideq*0]
+ mova m2, [tmpq+16*0]
+ mova m1, [dstq+strideq*1]
+ mova m3, [tmpq+16*1]
+ add tmpq, 16*2
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ mova m4, [base+obmc_masks+16*2]
+ movq m5, [base+obmc_masks+16*3]
+.w16_loop:
+ mova m0, [dstq+16*0]
+ mova m2, [tmpq+16*0]
+ mova m1, [dstq+16*1]
+ mova m3, [tmpq+16*1]
+ add tmpq, 16*2
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w16_loop
+ RET
+.w32:
+%if WIN64
+ movaps [rsp+8], m6
+%endif
+ mova m4, [base+obmc_masks+16*4]
+ mova m5, [base+obmc_masks+16*5]
+ mova m6, [base+obmc_masks+16*6]
+.w32_loop:
+ mova m0, [dstq+16*0]
+ mova m2, [tmpq+16*0]
+ mova m1, [dstq+16*1]
+ mova m3, [tmpq+16*1]
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ mova m2, [dstq+16*2]
+ paddw m1, m3
+ mova m3, [tmpq+16*2]
+ add tmpq, 16*4
+ psubw m3, m2
+ pmulhrsw m3, m6
+ paddw m2, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+%if WIN64
+ movaps m6, [rsp+8]
+%endif
+ RET
+
+%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp
+ mova m0, [dstq+16*(%1+0)]
+ mova m2, [tmpq+16*(%2+0)]
+ mova m1, [dstq+16*(%1+1)]
+ mova m3, [tmpq+16*(%2+1)]
+%if %3
+ add tmpq, 16*%3
+%endif
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*(%1+0)], m0
+ mova [dstq+16*(%1+1)], m1
+%endmacro
+
+cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
+%define base r6-blend_h_ssse3_table
+ LEA r6, blend_h_ssse3_table
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ movddup m4, [base+blend_shuf]
+ lea maskq, [base+obmc_masks+hq*2]
+ lea hd, [hq*3]
+ add wq, r6
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd m0, [dstq+dsq*0]
+ movd m2, [dstq+dsq*1]
+ movd m3, [maskq+hq*2]
+ movq m1, [tmpq]
+ add tmpq, 4*2
+ punpckldq m0, m2
+ punpcklwd m3, m3
+ psubw m1, m0
+ pmulhrsw m1, m3
+ paddw m0, m1
+ movd [dstq+dsq*0], m0
+ psrlq m0, 32
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+ mova m3, [base+blend_shuf]
+.w4_loop:
+ movq m0, [dstq+dsq*0]
+ movhps m0, [dstq+dsq*1]
+ movd m2, [maskq+hq*2]
+ mova m1, [tmpq]
+ add tmpq, 8*2
+ psubw m1, m0
+ pshufb m2, m3
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+ movddup m5, [base+blend_shuf+8]
+%if WIN64
+ movaps [rsp+ 8], m6
+ movaps [rsp+24], m7
+%endif
+.w8_loop:
+ movd m7, [maskq+hq*2]
+ mova m0, [dstq+dsq*0]
+ mova m2, [tmpq+16*0]
+ mova m1, [dstq+dsq*1]
+ mova m3, [tmpq+16*1]
+ add tmpq, 16*2
+ pshufb m6, m7, m4
+ psubw m2, m0
+ pshufb m7, m5
+ psubw m3, m1
+ pmulhrsw m2, m6
+ pmulhrsw m3, m7
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+%if WIN64
+ movaps m6, [rsp+ 8]
+ movaps m7, [rsp+24]
+%endif
+ RET
+.w16:
+ movd m5, [maskq+hq*2]
+ pshufb m5, m4
+ BLEND_H_ROW 0, 0, 2
+ add dstq, dsq
+ inc hq
+ jl .w16
+ RET
+.w32:
+ movd m5, [maskq+hq*2]
+ pshufb m5, m4
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2, 4
+ add dstq, dsq
+ inc hq
+ jl .w32
+ RET
+.w64:
+ movd m5, [maskq+hq*2]
+ pshufb m5, m4
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2
+ BLEND_H_ROW 4, 4
+ BLEND_H_ROW 6, 6, 8
+ add dstq, dsq
+ inc hq
+ jl .w64
+ RET
+.w128:
+ movd m5, [maskq+hq*2]
+ pshufb m5, m4
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2
+ BLEND_H_ROW 4, 4
+ BLEND_H_ROW 6, 6, 16
+ BLEND_H_ROW 8, -8
+ BLEND_H_ROW 10, -6
+ BLEND_H_ROW 12, -4
+ BLEND_H_ROW 14, -2
+ add dstq, dsq
+ inc hq
+ jl .w128
+ RET
+
+; emu_edge args:
+; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
+; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
+; const pixel *ref, const ptrdiff_t ref_stride
+;
+; bw, bh: total filled size
+; iw, ih: copied block -> fill bottom, right
+; x, y:   offset in bw/bh -> fill top, left
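+;
+; e.g. (illustration) bw = bh = 8, iw = ih = 4, x = -3, y = -2:
+; left_ext = 3, top_ext = 2, right_ext = 1, bottom_ext = 2, so the 4x4
+; source block lands in the center and its edge pixels are replicated
+; outwards to fill the rest of the 8x8 area.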
+cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \
+ y, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes
+
+%if ARCH_X86_64
+ %define reg_zero r12q
+ %define reg_tmp r10
+ %define reg_src srcq
+ %define reg_bottomext bottomextq
+ %define reg_rightext rightextq
+ %define reg_blkm r9m
+%else
+ %define reg_zero r6
+ %define reg_tmp r0
+ %define reg_src r1
+ %define reg_bottomext r0
+ %define reg_rightext r1
+ %define reg_blkm r2m
+%endif
+ ;
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor reg_zero, reg_zero
+ lea reg_tmp, [ihq-1]
+ cmp yq, ihq
+ cmovs reg_tmp, yq
+ test yq, yq
+ cmovs reg_tmp, reg_zero
+%if ARCH_X86_64
+ imul reg_tmp, sstrideq
+ add srcq, reg_tmp
+%else
+ imul reg_tmp, sstridem
+ mov reg_src, srcm
+ add reg_src, reg_tmp
+%endif
+ ;
+ ; ref += iclip(x, 0, iw - 1)
+ lea reg_tmp, [iwq-1]
+ cmp xq, iwq
+ cmovs reg_tmp, xq
+ test xq, xq
+ cmovs reg_tmp, reg_zero
+ lea reg_src, [reg_src+reg_tmp*2]
+%if ARCH_X86_32
+ mov srcm, reg_src
+%endif
+ ;
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+%if ARCH_X86_32
+ mov r1, r1m ; restore bh
+%endif
+ lea reg_bottomext, [yq+bhq]
+ sub reg_bottomext, ihq
+ lea r3, [bhq-1]
+ cmovs reg_bottomext, reg_zero
+ ;
+
+ DEFINE_ARGS bw, bh, iw, ih, x, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; top_ext = iclip(-y, 0, bh - 1)
+ neg topextq
+ cmovs topextq, reg_zero
+ cmp reg_bottomext, bhq
+ cmovns reg_bottomext, r3
+ cmp topextq, bhq
+ cmovg topextq, r3
+ %if ARCH_X86_32
+ mov r4m, reg_bottomext
+ ;
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ mov r0, r0m ; restore bw
+ %endif
+ lea reg_rightext, [xq+bwq]
+ sub reg_rightext, iwq
+ lea r2, [bwq-1]
+ cmovs reg_rightext, reg_zero
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; left_ext = iclip(-x, 0, bw - 1)
+ neg leftextq
+ cmovs leftextq, reg_zero
+ cmp reg_rightext, bwq
+ cmovns reg_rightext, r2
+ %if ARCH_X86_32
+ mov r3m, r1
+ %endif
+ cmp leftextq, bwq
+ cmovns leftextq, r2
+
+%undef reg_zero
+%undef reg_tmp
+%undef reg_src
+%undef reg_bottomext
+%undef reg_rightext
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; center_h = bh - top_ext - bottom_ext
+%if ARCH_X86_64
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3
+%else
+ mov r1, centerhm ; restore r1
+ sub centerhq, topextq
+ sub centerhq, r4m
+ mov r1m, centerhq
+%endif
+ ;
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+%if ARCH_X86_64
+ imul r2, dstrideq
+%else
+ mov r6, r6m ; restore dstq
+ imul r2, dstridem
+%endif
+ add dstq, r2
+ mov reg_blkm, dstq ; save pointer for ext
+ ;
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+%if ARCH_X86_64
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
+%else
+ sub centerwq, r3m
+ sub centerwq, leftextq
+%endif
+
+; v_loop macro
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+ %if ARCH_X86_64
+ %define reg_tmp r12
+ %else
+ %define reg_tmp r0
+ %endif
+.v_loop_%3:
+ %if ARCH_X86_32
+ mov r0, r0m
+ mov r1, r1m
+ %endif
+%if %1
+ ; left extension
+ %if ARCH_X86_64
+ movd m0, [srcq]
+ %else
+ mov r3, srcm
+ movd m0, [r3]
+ %endif
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ xor r3, r3
+.left_loop_%3:
+ mova [dstq+r3*2], m0
+ add r3, mmsize/2
+ cmp r3, leftextq
+ jl .left_loop_%3
+ ; body
+ lea reg_tmp, [dstq+leftextq*2]
+%endif
+ xor r3, r3
+.body_loop_%3:
+ %if ARCH_X86_64
+ movu m0, [srcq+r3*2]
+ %else
+ mov r1, srcm
+ movu m0, [r1+r3*2]
+ %endif
+%if %1
+ movu [reg_tmp+r3*2], m0
+%else
+ movu [dstq+r3*2], m0
+%endif
+ add r3, mmsize/2
+ cmp r3, centerwq
+ jl .body_loop_%3
+%if %2
+ ; right extension
+%if %1
+ lea reg_tmp, [reg_tmp+centerwq*2]
+%else
+ lea reg_tmp, [dstq+centerwq*2]
+%endif
+ %if ARCH_X86_64
+ movd m0, [srcq+centerwq*2-2]
+ %else
+ mov r3, srcm
+ movd m0, [r3+centerwq*2-2]
+ %endif
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ xor r3, r3
+.right_loop_%3:
+ movu [reg_tmp+r3*2], m0
+ add r3, mmsize/2
+ %if ARCH_X86_64
+ cmp r3, rightextq
+ %else
+ cmp r3, r3m
+ %endif
+ jl .right_loop_%3
+%endif
+ %if ARCH_X86_64
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+ %else
+ add dstq, dstridem
+ mov r0, sstridem
+ add srcm, r0
+ sub dword centerhm, 1
+ jg .v_loop_%3
+ mov r0, r0m ; restore r0
+ %endif
+%endmacro ; v_loop macro
+
+ test leftextq, leftextq
+ jnz .need_left_ext
+ %if ARCH_X86_64
+ test rightextq, rightextq
+ jnz .need_right_ext
+ %else
+ cmp leftextq, r3m ; leftextq == 0
+ jne .need_right_ext
+ %endif
+ v_loop 0, 0, 0
+ jmp .body_done
+
+ ; left/right extensions
+.need_left_ext:
+ %if ARCH_X86_64
+ test rightextq, rightextq
+ %else
+ mov r3, r3m
+ test r3, r3
+ %endif
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+; r0: bw
+; r1: x loop counter
+; r4: y loop counter
+; r5: topextq
+; r6: dstq
+; r7: dstrideq
+; r8: srcq
+%if ARCH_X86_64
+ %define reg_dstride dstrideq
+%else
+ %define reg_dstride r2
+%endif
+ ;
+ ; bottom edge extension
+ %if ARCH_X86_64
+ test bottomextq, bottomextq
+ jz .top
+ %else
+ xor r1, r1
+ cmp r1, r4m
+ je .top
+ %endif
+ ;
+ %if ARCH_X86_64
+ mov srcq, dstq
+ sub srcq, dstrideq
+ xor r1, r1
+ %else
+ mov r3, dstq
+ mov reg_dstride, dstridem
+ sub r3, reg_dstride
+ mov srcm, r3
+ %endif
+ ;
+.bottom_x_loop:
+ %if ARCH_X86_64
+ mova m0, [srcq+r1*2]
+ lea r3, [dstq+r1*2]
+ mov r4, bottomextq
+ %else
+ mov r3, srcm
+ mova m0, [r3+r1*2]
+ lea r3, [dstq+r1*2]
+ mov r4, r4m
+ %endif
+ ;
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, reg_dstride
+ dec r4
+ jg .bottom_y_loop
+ add r1, mmsize/2
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension
+ test topextq, topextq
+ jz .end
+%if ARCH_X86_64
+ mov srcq, reg_blkm
+%else
+ mov r3, reg_blkm
+ mov reg_dstride, dstridem
+%endif
+ mov dstq, dstm
+ xor r1, r1
+ ;
+.top_x_loop:
+%if ARCH_X86_64
+ mova m0, [srcq+r1*2]
+%else
+ mov r3, reg_blkm
+ mova m0, [r3+r1*2]
+%endif
+ lea r3, [dstq+r1*2]
+ mov r4, topextq
+ ;
+.top_y_loop:
+ mova [r3], m0
+ add r3, reg_dstride
+ dec r4
+ jg .top_y_loop
+ add r1, mmsize/2
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
+
+%undef reg_dstride
+%undef reg_blkm
+%undef reg_tmp
+
+%macro SCRATCH 3
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2
+%endif
+%endmacro
+
+%if ARCH_X86_64
+cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+%elif STACK_ALIGNMENT >= 16
+cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+%else
+cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+%endif
+ movifnidn dstq, dstmp
+ movifnidn srcq, srcmp
+%if STACK_ALIGNMENT >= 16
+ movifnidn dst_wd, dst_wm
+%endif
+%if ARCH_X86_64
+ movifnidn hd, hm
+%endif
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ movd m4, pxmaxm
+ movd m7, dxm
+ movd m6, mx0m
+ movd m5, src_wm
+ punpcklwd m4, m4
+ pshufd m4, m4, q0000
+ pshufd m7, m7, q0000
+ pshufd m6, m6, q0000
+ pshufd m5, m5, q0000
+ mova [rsp+16*3*ARCH_X86_32], m4
+%if ARCH_X86_64
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
+ LEA r7, $$
+ %define base r7-$$
+%else
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
+ %define hd dword r5m
+ %if STACK_ALIGNMENT >= 16
+ LEA r6, $$
+ %define base r6-$$
+ %else
+ LEA r4, $$
+ %define base r4-$$
+ %endif
+%endif
+%if ARCH_X86_64
+ mova m12, [base+pd_64]
+ mova m11, [base+pd_63]
+%else
+ %define m12 [base+pd_64]
+ %define m11 [base+pd_63]
+%endif
+ pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
+ pslld m7, 2 ; dx*4
+ pslld m5, 14
+ paddd m6, m4 ; mx+[0..3]*dx
+ SCRATCH 7, 15, 0
+ SCRATCH 6, 14, 1
+ SCRATCH 5, 13, 2
+ pxor m1, m1
+.loop_y:
+ xor xd, xd
+ mova m0, m14 ; per-line working version of mx
+.loop_x:
+ pcmpgtd m1, m0
+ pandn m1, m0
+ psrad m2, m0, 8 ; filter offset (unmasked)
+ pcmpgtd m3, m13, m1
+ pand m1, m3
+ pandn m3, m13
+ por m1, m3
+ psubd m3, m0, m1 ; pshufb offset
+ psrad m1, 14 ; clipped src_x offset
+ psrad m3, 14 ; pshufb edge_emu offset
+ pand m2, m11 ; filter offset (masked)
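+ ; m1 now holds src_x (mx >> 14 clamped to [0, src_w-8]), m2 the 6-bit
+ ; filter phase ((mx >> 8) & 63) into resize_filter, and m3 the clamped-off
+ ; remainder used to pick an edge-replication shuffle from resize_shuf.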
+ ; load source pixels
+%if ARCH_X86_64
+ movd r8d, m1
+ pshuflw m1, m1, q3232
+ movd r9d, m1
+ punpckhqdq m1, m1
+ movd r10d, m1
+ psrlq m1, 32
+ movd r11d, m1
+ movu m4, [srcq+r8*2]
+ movu m5, [srcq+r9*2]
+ movu m6, [srcq+r10*2]
+ movu m7, [srcq+r11*2]
+ ; if no emulation is required, we don't need to shuffle or emulate edges
+ packssdw m3, m3
+ movq r11, m3
+ test r11, r11
+ jz .filter
+ movsx r8, r11w
+ sar r11, 16
+ movsx r9, r11w
+ sar r11, 16
+ movsx r10, r11w
+ sar r11, 16
+ movu m1, [base+resize_shuf+8+r8*2]
+ movu m3, [base+resize_shuf+8+r9*2]
+ movu m8, [base+resize_shuf+8+r10*2]
+ movu m9, [base+resize_shuf+8+r11*2]
+ pshufb m4, m1
+ pshufb m5, m3
+ pshufb m6, m8
+ pshufb m7, m9
+.filter:
+ movd r8d, m2
+ pshuflw m2, m2, q3232
+ movd r9d, m2
+ punpckhqdq m2, m2
+ movd r10d, m2
+ psrlq m2, 32
+ movd r11d, m2
+ movq m8, [base+resize_filter+r8*8]
+ movq m2, [base+resize_filter+r9*8]
+ pxor m9, m9
+ punpcklbw m1, m9, m8
+ punpcklbw m3, m9, m2
+ psraw m1, 8
+ psraw m3, 8
+ movq m10, [base+resize_filter+r10*8]
+ movq m2, [base+resize_filter+r11*8]
+ punpcklbw m8, m9, m10
+ punpcklbw m9, m2
+ psraw m8, 8
+ psraw m9, 8
+ pmaddwd m4, m1
+ pmaddwd m5, m3
+ pmaddwd m6, m8
+ pmaddwd m7, m9
+ phaddd m4, m5
+%else
+ movd r3, m1
+ pshuflw m1, m1, q3232
+ movd r1, m1
+ punpckhqdq m1, m1
+ movu m4, [srcq+r3*2]
+ movu m5, [srcq+r1*2]
+ movd r3, m1
+ psrlq m1, 32
+ movd r1, m1
+ movu m6, [srcq+r3*2]
+ movu m7, [srcq+r1*2]
+ ; if no emulation is required, we don't need to shuffle or emulate edges
+ pxor m1, m1
+ pcmpeqb m1, m3
+ pmovmskb r3d, m1
+ cmp r3d, 0xffff
+ je .filter
+ movd r3, m3
+ movu m1, [base+resize_shuf+8+r3*2]
+ pshuflw m3, m3, q3232
+ movd r1, m3
+ pshufb m4, m1
+ movu m1, [base+resize_shuf+8+r1*2]
+ punpckhqdq m3, m3
+ movd r3, m3
+ pshufb m5, m1
+ movu m1, [base+resize_shuf+8+r3*2]
+ psrlq m3, 32
+ movd r1, m3
+ pshufb m6, m1
+ movu m1, [base+resize_shuf+8+r1*2]
+ pshufb m7, m1
+.filter:
+ mova [esp+4*16], m6
+ mova [esp+5*16], m7
+ movd r3, m2
+ pshuflw m2, m2, q3232
+ movd r1, m2
+ movq m6, [base+resize_filter+r3*8]
+ movq m7, [base+resize_filter+r1*8]
+ pxor m3, m3
+ punpcklbw m1, m3, m6
+ punpcklbw m3, m7
+ psraw m1, 8
+ psraw m3, 8
+ pmaddwd m4, m1
+ pmaddwd m5, m3
+ punpckhqdq m2, m2
+ movd r3, m2
+ psrlq m2, 32
+ movd r1, m2
+ phaddd m4, m5
+ movq m2, [base+resize_filter+r3*8]
+ movq m5, [base+resize_filter+r1*8]
+ mova m6, [esp+4*16]
+ mova m7, [esp+5*16]
+ pxor m3, m3
+ punpcklbw m1, m3, m2
+ punpcklbw m3, m5
+ psraw m1, 8
+ psraw m3, 8
+ pmaddwd m6, m1
+ pmaddwd m7, m3
+%endif
+ phaddd m6, m7
+ phaddd m4, m6
+ pxor m1, m1
+ psubd m2, m12, m4
+ psrad m2, 7
+ packssdw m2, m2
+ pmaxsw m2, m1
+ pminsw m2, [rsp+16*3*ARCH_X86_32]
+ movq [dstq+xq*2], m2
+ paddd m0, m15
+ add xd, 4
+%if STACK_ALIGNMENT >= 16
+ cmp xd, dst_wd
+%else
+ cmp xd, dst_wm
+%endif
+ jl .loop_x
+ add dstq, dst_stridemp
+ add srcq, src_stridemp
+ dec hd
+ jg .loop_y
+ RET
diff --git a/third_party/dav1d/src/x86/mc_avx2.asm b/third_party/dav1d/src/x86/mc_avx2.asm
new file mode 100644
index 0000000000..3b208033bd
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc_avx2.asm
@@ -0,0 +1,5669 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018-2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+; dav1d_obmc_masks[] with 64-x interleaved
+obmc_masks: db 0, 0, 0, 0
+ ; 2
+ db 45, 19, 64, 0
+ ; 4
+ db 39, 25, 50, 14, 59, 5, 64, 0
+ ; 8
+ db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
+ ; 16
+ db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+ db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
+ ; 32
+ db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+ db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
+ db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
+ db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
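+; each byte pair (x, 64-x) sums to 64, so after interleaving two source rows
+; with punpcklbw a single pmaddubsw can apply both weights at once.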
+
+warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
+ db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
+warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
+ db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
+subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
+ db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
+subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
+deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8
+bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
+resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
+
+wm_420_sign: dd 0x01020102, 0x01010101
+wm_422_sign: dd 0x80808080, 0x7f7f7f7f
+
+pb_64: times 4 db 64
+pw_m256: times 2 dw -256
+pw_15: times 2 dw 15
+pw_32: times 2 dw 32
+pw_34: times 2 dw 34
+pw_258: times 2 dw 258
+pw_512: times 2 dw 512
+pw_1024: times 2 dw 1024
+pw_2048: times 2 dw 2048
+pw_6903: times 2 dw 6903
+pw_8192: times 2 dw 8192
+pd_32: dd 32
+pd_63: dd 63
+pd_512: dd 512
+pd_32768: dd 32768
+pd_0x3ff: dd 0x3ff
+pd_0x4000: dd 0x4000
+pq_0x40000000: dq 0x40000000
+
+cextern mc_subpel_filters
+cextern mc_warp_filter2
+cextern resize_filter
+
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SCALED_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
+%%table:
+ %rep %0 - 2
+ dw %%base %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_1024:
+ %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy1_w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_2048:
+ %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy2_w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_8bpc_avx2.put)
+%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_8bpc_avx2.prep)
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 32, 32
+
+SECTION .text
+
+INIT_XMM avx2
+cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+ movifnidn mxyd, r6m ; mx
+ lea r7, [put_avx2]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [r7+wq*2+table_offset(put,)]
+ add wq, r7
+ jmp wq
+.put_w2:
+ movzx r6d, word [srcq+ssq*0]
+ movzx r7d, word [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6w
+ mov [dstq+dsq*1], r7w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6d
+ mov [dstq+dsq*1], r7d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6
+ mov [dstq+dsq*1], r7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+INIT_YMM avx2
+.put_w32:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+ssq*0+32*0]
+ movu m1, [srcq+ssq*0+32*1]
+ movu m2, [srcq+ssq*1+32*0]
+ movu m3, [srcq+ssq*1+32*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+32*0], m0
+ mova [dstq+dsq*0+32*1], m1
+ mova [dstq+dsq*1+32*0], m2
+ mova [dstq+dsq*1+32*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+ movu m2, [srcq+32*2]
+ movu m3, [srcq+32*3]
+ add srcq, ssq
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h:
+ ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+ ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
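+ ; mxy*255 + 16 packs the two byte coefficients for pmaddubsw,
+ ; e.g. (illustration) mx = 6: 6*255 + 16 = 0x060a -> {16-mx, mx} = {10, 6},
+ ; and pmulhrsw with pw_2048 is ((x*2048 + 16384) >> 15) == (x + 8) >> 4.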
+ imul mxyd, 255
+ vbroadcasti128 m4, [bilin_h_shuf8]
+ add mxyd, 16
+ movd xm5, mxyd
+ mov mxyd, r7m ; my
+ vpbroadcastw m5, xm5
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)]
+ vpbroadcastd m3, [pw_2048]
+ add wq, r7
+ jmp wq
+.h_w2:
+ movd xm0, [srcq+ssq*0]
+ pinsrd xm0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+ pmulhrsw xm0, xm3
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ mova xm4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+ pmulhrsw xm0, xm3
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pshufb xm1, xm4
+ pmaddubsw xm0, xm5
+ pmaddubsw xm1, xm5
+ pmulhrsw xm0, xm3
+ pmulhrsw xm1, xm3
+ packuswb xm0, xm1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+ssq*0+8*0]
+ vinserti128 m0, [srcq+ssq*1+8*0], 1
+ movu xm1, [srcq+ssq*0+8*1]
+ vinserti128 m1, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ add srcq, ssq
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w32
+ RET
+.h_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ movu m1, [srcq+8*4]
+ movu m2, [srcq+8*5]
+ add srcq, ssq
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ mov r6, -32*3
+.h_w128_loop:
+ movu m0, [srcq+r6+32*3+8*0]
+ movu m1, [srcq+r6+32*3+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+r6+32*3], m0
+ add r6, 32
+ jle .h_w128_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
+ imul mxyd, 255
+ vpbroadcastd m5, [pw_2048]
+ add mxyd, 16
+ add wq, r7
+ movd xm4, mxyd
+ vpbroadcastw m4, xm4
+ jmp wq
+.v_w2:
+ movd xm0, [srcq+ssq*0]
+.v_w2_loop:
+ pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1
+ lea srcq, [srcq+ssq*2]
+ pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1
+ pshuflw xm1, xm1, q2301 ; 1 0
+ punpcklbw xm1, xm0
+ pmaddubsw xm1, xm4
+ pmulhrsw xm1, xm5
+ packuswb xm1, xm1
+ pextrw [dstq+dsq*0], xm1, 1
+ pextrw [dstq+dsq*1], xm1, 0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xm0, [srcq+ssq*0]
+.v_w4_loop:
+ vpbroadcastd xm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm1, xm2, xm0, 0x01 ; 0 1
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm2, xm0, 0x02 ; 1 2
+ punpcklbw xm1, xm2
+ pmaddubsw xm1, xm4
+ pmulhrsw xm1, xm5
+ packuswb xm1, xm1
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm0, [srcq+ssq*0]
+.v_w8_loop:
+ movq xm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw xm1, xm0, xm2
+ movq xm0, [srcq+ssq*0]
+ punpcklbw xm2, xm0
+ pmaddubsw xm1, xm4
+ pmaddubsw xm2, xm4
+ pmulhrsw xm1, xm5
+ pmulhrsw xm2, xm5
+ packuswb xm1, xm2
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu xm0, [srcq+ssq*0]
+.v_w16_loop:
+ vbroadcasti128 m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m2, m3, m0, 0x0f ; 0 1
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vpblendd m3, m0, 0xf0 ; 1 2
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+%macro PUT_BILIN_V_W32 0
+ movu m0, [srcq+ssq*0]
+%%loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m1, m0, m3
+ punpckhbw m2, m0, m3
+ movu m0, [srcq+ssq*0]
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ pmaddubsw m2, m4
+ pmaddubsw m3, m4
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+ packuswb m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg %%loop
+%endmacro
+ PUT_BILIN_V_W32
+ RET
+.v_w64:
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+.v_w64_loop:
+ add srcq, ssq
+ movu m3, [srcq+32*0]
+ punpcklbw m2, m0, m3
+ punpckhbw m0, m3
+ pmaddubsw m2, m4
+ pmaddubsw m0, m4
+ pmulhrsw m2, m5
+ pmulhrsw m0, m5
+ packuswb m2, m0
+ mova m0, m3
+ movu m3, [srcq+32*1]
+ mova [dstq+32*0], m2
+ punpcklbw m2, m1, m3
+ punpckhbw m1, m3
+ pmaddubsw m2, m4
+ pmaddubsw m1, m4
+ pmulhrsw m2, m5
+ pmulhrsw m1, m5
+ packuswb m2, m1
+ mova m1, m3
+ mova [dstq+32*1], m2
+ add dstq, dsq
+ dec hd
+ jg .v_w64_loop
+ RET
+.v_w128:
+ lea r6d, [hq+(3<<8)]
+ mov r4, srcq
+ mov r7, dstq
+.v_w128_loop:
+ PUT_BILIN_V_W32
+ add r4, 32
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .v_w128_loop
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+ ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
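+ ; src[] here is the 16x-scaled .h intermediate; paddb m5, m5 doubles the h
+ ; coefficients so pmulhw by (my << 11) yields exactly (my * diff) >> 4, and
+ ; pavgw with pw_15 halves the doubled value back while adding the +8 bias,
+ ; leaving only the final >> 4 (psrlw) to complete the formula above.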
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11 ; can't shift by 12 due to signed overflow
+ vpbroadcastd m7, [pw_15]
+ movd xm6, mxyd
+ add wq, r7
+ paddb m5, m5
+ vpbroadcastw m6, xm6
+ jmp wq
+.hv_w2:
+ vpbroadcastd xm0, [srcq+ssq*0]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+.hv_w2_loop:
+ movd xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pinsrd xm1, [srcq+ssq*0], 1
+ pshufb xm1, xm4
+ pmaddubsw xm1, xm5 ; 1 _ 2 _
+ shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _
+ mova xm0, xm1
+ psubw xm1, xm2
+ pmulhw xm1, xm6
+ pavgw xm2, xm7
+ paddw xm1, xm2
+ psrlw xm1, 4
+ packuswb xm1, xm1
+ pextrw [dstq+dsq*0], xm1, 0
+ pextrw [dstq+dsq*1], xm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova xm4, [bilin_h_shuf4]
+ movddup xm0, [srcq+ssq*0]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+.hv_w4_loop:
+ movq xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm1, [srcq+ssq*0]
+ pshufb xm1, xm4
+ pmaddubsw xm1, xm5 ; 1 2
+ shufps xm2, xm0, xm1, q1032 ; 0 1
+ mova xm0, xm1
+ psubw xm1, xm2
+ pmulhw xm1, xm6
+ pavgw xm2, xm7
+ paddw xm1, xm2
+ psrlw xm1, 4
+ packuswb xm1, xm1
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti128 m0, [srcq+ssq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m1, [srcq+ssq*0], 1
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2
+ vperm2i128 m2, m0, m1, 0x21 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhw m1, m6
+ pavgw m2, m7
+ paddw m1, m2
+ psrlw m1, 4
+ vextracti128 xm2, m1, 1
+ packuswb xm1, xm2
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ movu m0, [srcq+ssq*0+8*0]
+ vinserti128 m0, [srcq+ssq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu xm2, [srcq+ssq*1+8*0]
+ vinserti128 m2, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ movu xm3, [srcq+ssq*0+8*0]
+ vinserti128 m3, [srcq+ssq*0+8*1], 1
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ psubw m1, m2, m0
+ pmulhw m1, m6
+ pavgw m0, m7
+ paddw m1, m0
+ pmaddubsw m0, m3, m5
+ psubw m3, m0, m2
+ pmulhw m3, m6
+ pavgw m2, m7
+ paddw m3, m2
+ psrlw m1, 4
+ psrlw m3, 4
+ packuswb m1, m3
+ vpermq m1, m1, q3120
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w128:
+ lea r6d, [hq+(3<<16)]
+ jmp .hv_w32_start
+.hv_w64:
+ lea r6d, [hq+(1<<16)]
+.hv_w32_start:
+ mov r4, srcq
+ mov r7, dstq
+.hv_w32:
+%if WIN64
+ movaps r4m, xmm8
+%endif
+.hv_w32_loop0:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w32_loop:
+ add srcq, ssq
+ movu m2, [srcq+8*0]
+ movu m3, [srcq+8*1]
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ psubw m8, m2, m0
+ pmulhw m8, m6
+ pavgw m0, m7
+ paddw m8, m0
+ mova m0, m2
+ psubw m2, m3, m1
+ pmulhw m2, m6
+ pavgw m1, m7
+ paddw m2, m1
+ mova m1, m3
+ psrlw m8, 4
+ psrlw m2, 4
+ packuswb m8, m2
+ mova [dstq], m8
+ add dstq, dsq
+ dec hd
+ jg .hv_w32_loop
+ add r4, 32
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<16
+ jg .hv_w32_loop0
+%if WIN64
+ movaps xmm8, r4m
+%endif
+ RET
+
+cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea r6, [prep%+SUFFIX]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movd xm0, [srcq+strideq*0]
+ pinsrd xm0, [srcq+strideq*1], 1
+ pinsrd xm0, [srcq+strideq*2], 2
+ pinsrd xm0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw m0, xm0
+ psllw m0, 4
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movq xm0, [srcq+strideq*0]
+ movhps xm0, [srcq+strideq*1]
+ movq xm1, [srcq+strideq*2]
+ movhps xm1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw m0, xm0
+ pmovzxbw m1, xm1
+ psllw m0, 4
+ psllw m1, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ pmovzxbw m0, [srcq+strideq*0]
+ pmovzxbw m1, [srcq+strideq*1]
+ pmovzxbw m2, [srcq+strideq*2]
+ pmovzxbw m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmovzxbw m0, [srcq+strideq*0+16*0]
+ pmovzxbw m1, [srcq+strideq*0+16*1]
+ pmovzxbw m2, [srcq+strideq*1+16*0]
+ pmovzxbw m3, [srcq+strideq*1+16*1]
+ lea srcq, [srcq+strideq*2]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 2
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmovzxbw m0, [srcq+16*0]
+ pmovzxbw m1, [srcq+16*1]
+ pmovzxbw m2, [srcq+16*2]
+ pmovzxbw m3, [srcq+16*3]
+ add srcq, strideq
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ dec hd
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmovzxbw m0, [srcq+16*0]
+ pmovzxbw m1, [srcq+16*1]
+ pmovzxbw m2, [srcq+16*2]
+ pmovzxbw m3, [srcq+16*3]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ pmovzxbw m0, [srcq+16*4]
+ pmovzxbw m1, [srcq+16*5]
+ pmovzxbw m2, [srcq+16*6]
+ pmovzxbw m3, [srcq+16*7]
+ add tmpq, 32*8
+ add srcq, strideq
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq-32*4], m0
+ mova [tmpq-32*3], m1
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+ ; = (16 - mx) * src[x] + mx * src[x + 1]
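+ ; same {16-mx, mx} byte packing as in put_bilin above; the 16x-scaled sum
+ ; is stored unrounded, e.g. (illustration) src = {100, 104}, mx = 4:
+ ; 12*100 + 4*104 = 1616 = 16*101.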
+ imul mxyd, 255
+ vbroadcasti128 m4, [bilin_h_shuf8]
+ add mxyd, 16
+ movd xm5, mxyd
+ mov mxyd, r6m ; my
+ vpbroadcastw m5, xm5
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ vbroadcasti128 m4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xm0, [srcq+strideq*0]
+ movhps xm0, [srcq+strideq*1]
+ movq xm1, [srcq+strideq*2]
+ movhps xm1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m0, xm1, 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+.h_w8_loop:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ movu xm1, [srcq+strideq*2]
+ vinserti128 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+.h_w16_loop:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ movu xm2, [srcq+strideq*2+8*0]
+ vinserti128 m2, [srcq+strideq*2+8*1], 1
+ movu xm3, [srcq+stride3q +8*0]
+ vinserti128 m3, [srcq+stride3q +8*1], 1
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+.h_w32_loop:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ movu xm1, [srcq+strideq*0+8*2]
+ vinserti128 m1, [srcq+strideq*0+8*3], 1
+ movu xm2, [srcq+strideq*1+8*0]
+ vinserti128 m2, [srcq+strideq*1+8*1], 1
+ movu xm3, [srcq+strideq*1+8*2]
+ vinserti128 m3, [srcq+strideq*1+8*3], 1
+ lea srcq, [srcq+strideq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 2
+ jg .h_w32_loop
+ RET
+.h_w64:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ movu xm2, [srcq+8*4]
+ vinserti128 m2, [srcq+8*5], 1
+ movu xm3, [srcq+8*6]
+ vinserti128 m3, [srcq+8*7], 1
+ add srcq, strideq
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ movu xm2, [srcq+8*4]
+ vinserti128 m2, [srcq+8*5], 1
+ movu xm3, [srcq+8*6]
+ vinserti128 m3, [srcq+8*7], 1
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ movu xm0, [srcq+8* 8]
+ vinserti128 m0, [srcq+8* 9], 1
+ movu xm1, [srcq+8*10]
+ vinserti128 m1, [srcq+8*11], 1
+ movu xm2, [srcq+8*12]
+ vinserti128 m2, [srcq+8*13], 1
+ movu xm3, [srcq+8*14]
+ vinserti128 m3, [srcq+8*15], 1
+ add tmpq, 32*8
+ add srcq, strideq
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq-32*4], m0
+ mova [tmpq-32*3], m1
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ WIN64_SPILL_XMM 7
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+ imul mxyd, 255
+ add mxyd, 16
+ add wq, r6
+ lea stride3q, [strideq*3]
+ movd xm6, mxyd
+ vpbroadcastw m6, xm6
+ jmp wq
+.v_w4:
+ movd xm0, [srcq+strideq*0]
+.v_w4_loop:
+ vpbroadcastd m1, [srcq+strideq*2]
+ vpbroadcastd xm2, [srcq+strideq*1]
+ vpbroadcastd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m0, 0x05 ; 0 2 2 2
+ vpbroadcastd m0, [srcq+strideq*0]
+ vpblendd m3, m2, 0x0f ; 1 1 3 3
+ vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4
+ vpblendd m1, m3, 0xaa ; 0 1 2 3
+ vpblendd m2, m3, 0x55 ; 1 2 3 4
+ punpcklbw m1, m2
+ pmaddubsw m1, m6
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm0, [srcq+strideq*0]
+.v_w8_loop:
+ vpbroadcastq m1, [srcq+strideq*2]
+ vpbroadcastq m2, [srcq+strideq*1]
+ vpbroadcastq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m0, 0x03 ; 0 2 2 2
+ vpbroadcastq m0, [srcq+strideq*0]
+ vpblendd m2, m3, 0xcc ; 1 3 1 3
+ vpblendd m3, m2, m1, 0xf0 ; 1 3 2 2
+ vpblendd m2, m1, 0x0f ; 0 2 1 3
+ vpblendd m3, m0, 0xc0 ; 1 3 2 4
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m6
+ pmaddubsw m2, m6
+ mova [tmpq+32*0], m1
+ mova [tmpq+32*1], m2
+ add tmpq, 32*2
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ vbroadcasti128 m0, [srcq+strideq*0]
+.v_w16_loop:
+ vbroadcasti128 m1, [srcq+strideq*1]
+ vbroadcasti128 m2, [srcq+strideq*2]
+ vbroadcasti128 m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ shufpd m4, m0, m2, 0x0c ; 0 2
+ vbroadcasti128 m0, [srcq+strideq*0]
+ shufpd m1, m3, 0x0c ; 1 3
+ shufpd m2, m0, 0x0c ; 2 4
+ punpcklbw m3, m4, m1
+ punpcklbw m5, m1, m2
+ punpckhbw m4, m1
+ punpckhbw m1, m2
+ pmaddubsw m3, m6
+ pmaddubsw m5, m6
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ mova [tmpq+32*0], m3
+ mova [tmpq+32*1], m5
+ mova [tmpq+32*2], m4
+ mova [tmpq+32*3], m1
+ add tmpq, 32*4
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ vpermq m0, [srcq+strideq*0], q3120
+.v_w32_loop:
+ vpermq m1, [srcq+strideq*1], q3120
+ vpermq m2, [srcq+strideq*2], q3120
+ vpermq m3, [srcq+stride3q ], q3120
+ lea srcq, [srcq+strideq*4]
+ punpcklbw m4, m0, m1
+ punpckhbw m5, m0, m1
+ vpermq m0, [srcq+strideq*0], q3120
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*0], m4
+ mova [tmpq+32*1], m5
+ punpcklbw m4, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ punpcklbw m5, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m5, m6
+ pmaddubsw m2, m6
+ mova [tmpq+32*2], m4
+ mova [tmpq+32*3], m1
+ add tmpq, 32*8
+ punpcklbw m1, m3, m0
+ punpckhbw m3, m0
+ pmaddubsw m1, m6
+ pmaddubsw m3, m6
+ mova [tmpq-32*4], m5
+ mova [tmpq-32*3], m2
+ mova [tmpq-32*2], m1
+ mova [tmpq-32*1], m3
+ sub hd, 4
+ jg .v_w32_loop
+ RET
+.v_w64:
+ vpermq m0, [srcq+strideq*0+32*0], q3120
+ vpermq m1, [srcq+strideq*0+32*1], q3120
+.v_w64_loop:
+ vpermq m2, [srcq+strideq*1+32*0], q3120
+ vpermq m3, [srcq+strideq*1+32*1], q3120
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m0, m2
+ punpckhbw m0, m2
+ pmaddubsw m4, m6
+ pmaddubsw m0, m6
+ mova [tmpq+32*0], m4
+ mova [tmpq+32*1], m0
+ punpcklbw m4, m1, m3
+ punpckhbw m5, m1, m3
+ vpermq m0, [srcq+strideq*0+32*0], q3120
+ vpermq m1, [srcq+strideq*0+32*1], q3120
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*2], m4
+ mova [tmpq+32*3], m5
+ add tmpq, 32*8
+ punpcklbw m4, m2, m0
+ punpckhbw m2, m0
+ punpcklbw m5, m3, m1
+ punpckhbw m3, m1
+ pmaddubsw m4, m6
+ pmaddubsw m2, m6
+ pmaddubsw m5, m6
+ pmaddubsw m3, m6
+ mova [tmpq-32*4], m4
+ mova [tmpq-32*3], m2
+ mova [tmpq-32*2], m5
+ mova [tmpq-32*1], m3
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ lea r6d, [hq+(3<<8)]
+ mov r3, srcq
+ mov r5, tmpq
+.v_w128_loop0:
+ vpermq m0, [srcq+strideq*0], q3120
+.v_w128_loop:
+ vpermq m1, [srcq+strideq*1], q3120
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m2, m0, m1
+ punpckhbw m3, m0, m1
+ vpermq m0, [srcq+strideq*0], q3120
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ punpcklbw m4, m1, m0
+ punpckhbw m1, m0
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ mova [tmpq+32*0], m2
+ mova [tmpq+32*1], m3
+ mova [tmpq+32*8], m4
+ mova [tmpq+32*9], m1
+ add tmpq, 32*16
+ sub hd, 2
+ jg .v_w128_loop
+ add r3, 32
+ add r5, 64
+ movzx hd, r6b
+ mov srcq, r3
+ mov tmpq, r5
+ sub r6d, 1<<8
+ jg .v_w128_loop0
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+ ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
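+ ; src[] here is the 16x-scaled .h intermediate; pmulhrsw by (my << 11) is
+ ; exactly ((my * diff) + 8) >> 4 since (d*(my*2048) + 16384) >> 15 ==
+ ; (d*my + 8) >> 4, so a single paddw with src[x] completes the formula.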
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 7
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
+ movd xm6, mxyd
+ vpbroadcastw m6, xm6
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.hv_w4:
+ vbroadcasti128 m4, [bilin_h_shuf4]
+ vpbroadcastq m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w4_loop:
+ movq xm1, [srcq+strideq*1]
+ movhps xm1, [srcq+strideq*2]
+ movq xm2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ movhps xm2, [srcq+strideq*0]
+ vinserti128 m1, xm2, 1
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2 3 4
+ vpblendd m2, m1, m0, 0xc0
+ vpermq m2, m2, q2103 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti128 m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu xm1, [srcq+strideq*1]
+ vinserti128 m1, [srcq+strideq*2], 1
+ movu xm2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m2, [srcq+strideq*0], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5 ; 1 2
+ vperm2i128 m3, m0, m1, 0x21 ; 0 1
+ pmaddubsw m0, m2, m5 ; 3 4
+ vperm2i128 m2, m1, m0, 0x21 ; 2 3
+ psubw m1, m3
+ pmulhrsw m1, m6
+ paddw m1, m3
+ psubw m3, m0, m2
+ pmulhrsw m3, m6
+ paddw m3, m2
+ mova [tmpq+32*0], m1
+ mova [tmpq+32*1], m3
+ add tmpq, 32*2
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu xm2, [srcq+strideq*0+8*0]
+ vinserti128 m2, [srcq+strideq*0+8*1], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+32*0], m3
+ mova [tmpq+32*1], m2
+ add tmpq, 32*2
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w32_loop:
+ add srcq, strideq
+ movu xm2, [srcq+8*0]
+ vinserti128 m2, [srcq+8*1], 1
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m3, m2, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ mova m0, m2
+ movu xm2, [srcq+8*2]
+ vinserti128 m2, [srcq+8*3], 1
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ mova [tmpq+32*0], m3
+ psubw m3, m2, m1
+ pmulhrsw m3, m6
+ paddw m3, m1
+ mova m1, m2
+ mova [tmpq+32*1], m3
+ add tmpq, 32*2
+ dec hd
+ jg .hv_w32_loop
+ RET
+.hv_w128:
+ lea r3d, [hq+(7<<8)]
+ mov r6d, 256
+ jmp .hv_w64_start
+.hv_w64:
+ lea r3d, [hq+(3<<8)]
+ mov r6d, 128
+.hv_w64_start:
+%if WIN64
+ PUSH r7
+%endif
+ mov r5, srcq
+ mov r7, tmpq
+.hv_w64_loop0:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w64_loop:
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu xm2, [srcq+strideq*0+8*0]
+ vinserti128 m2, [srcq+strideq*0+8*1], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+r6*0], m3
+ mova [tmpq+r6*1], m2
+ lea tmpq, [tmpq+r6*2]
+ sub hd, 2
+ jg .hv_w64_loop
+ add r5, 16
+ add r7, 32
+ movzx hd, r3b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r3d, 1<<8
+ jg .hv_w64_loop0
+%if WIN64
+ POP r7
+%endif
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
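+; each FILTER_* constant packs two row offsets into subpel_filters: the high
+; half selects the 8-tap set (0=regular, 1=smooth, 2=sharp) and the low half
+; the 4-tap set used for w <= 4 (3=regular, 4=smooth), in units of 15 rows
+; of 8 coefficients; sharp falls back to the regular 4-tap set.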
+
+%macro FN 4 ; fn, type, type_h, type_v
+cglobal %1_%2_8bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+%endif
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx2]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+ lea r6, [ssq*3]
+ lea r7, [dsq*3]
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
+ WIN64_SPILL_XMM 11
+ cmp wd, 4
+ jl .h_w2
+ vbroadcasti128 m6, [subpel_h_shufA]
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m7, [subpel_h_shufB]
+ vbroadcasti128 m8, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
+ vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0]
+ vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4]
+ add wq, r8
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ dec srcq
+ mova xm4, [subpel_h_shuf4]
+ vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+.h_w2_loop:
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm3
+ phaddw xm0, xm0
+ paddw xm0, xm5
+ psraw xm0, 6
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+.h_w4_loop:
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm6
+ pshufb xm1, xm6
+ pmaddubsw xm0, xm3
+ pmaddubsw xm1, xm3
+ phaddw xm0, xm1
+ paddw xm0, xm5
+ psraw xm0, 6
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
+ pshufb m%2, m%1, m7
+ pshufb m%3, m%1, m8
+ pshufb m%1, m6
+ pmaddubsw m%4, m%2, m9
+ pmaddubsw m%2, m10
+ pmaddubsw m%3, m10
+ pmaddubsw m%1, m9
+ paddw m%3, m%4
+ paddw m%1, m%2
+ phaddw m%1, m%3
+ paddw m%1, m5
+ psraw m%1, 6
+%endmacro
+ movu xm0, [srcq+ssq*0]
+ vinserti128 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 1, 2, 3
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+ssq*0+8*0]
+ vinserti128 m0, [srcq+ssq*1+8*0], 1
+ movu xm1, [srcq+ssq*0+8*1]
+ vinserti128 m1, [srcq+ssq*1+8*1], 1
+ PUT_8TAP_H 0, 2, 3, 4
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 1, 2, 3, 4
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ xor r6d, r6d
+ jmp .h_start
+.h_w64:
+ mov r6, -32*1
+ jmp .h_start
+.h_w128:
+ mov r6, -32*3
+.h_start:
+ sub srcq, r6
+ sub dstq, r6
+ mov r4, r6
+.h_loop:
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 2, 3, 4
+ packuswb m0, m1
+ mova [dstq+r6], m0
+ add r6, 32
+ jle .h_loop
+ add srcq, ssq
+ add dstq, dsq
+ mov r6, r4
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ tzcnt r6d, wd
+ movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
+ vpbroadcastd m7, [pw_512]
+ lea myq, [r8+myq*8+subpel_filters-put_avx2]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ add r6, r8
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ jmp r6
+.v_w2:
+ movd xm2, [srcq+ssq*0]
+ pinsrw xm2, [srcq+ssq*1], 2
+ pinsrw xm2, [srcq+ssq*2], 4
+ add srcq, ss3q
+ pinsrw xm2, [srcq+ssq*0], 6 ; 0 1 2 3
+ movd xm3, [srcq+ssq*1]
+ vpbroadcastd xm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm3, xm1, 0x02 ; 4 5
+ vpblendd xm1, xm0, 0x02 ; 5 6
+ palignr xm4, xm3, xm2, 4 ; 1 2 3 4
+ punpcklbw xm3, xm1 ; 45 56
+ punpcklbw xm1, xm2, xm4 ; 01 12
+ punpckhbw xm2, xm4 ; 23 34
+.v_w2_loop:
+ pmaddubsw xm5, xm1, xm8 ; a0 b0
+ mova xm1, xm2
+ pmaddubsw xm2, xm9 ; a1 b1
+ paddw xm5, xm2
+ mova xm2, xm3
+ pmaddubsw xm3, xm10 ; a2 b2
+ paddw xm5, xm3
+ vpbroadcastd xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm4, xm0, 0x02 ; 7 8
+ punpcklbw xm3, xm4 ; 67 78
+ pmaddubsw xm4, xm3, xm11 ; a3 b3
+ paddw xm5, xm4
+ pmulhrsw xm5, xm7
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xm2, [srcq+ssq*0]
+ pinsrd xm2, [srcq+ssq*1], 1
+ pinsrd xm2, [srcq+ssq*2], 2
+ add srcq, ss3q
+ pinsrd xm2, [srcq+ssq*0], 3 ; 0 1 2 3
+ movd xm3, [srcq+ssq*1]
+ vpbroadcastd xm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm3, xm1, 0x02 ; 4 5
+ vpblendd xm1, xm0, 0x02 ; 5 6
+ palignr xm4, xm3, xm2, 4 ; 1 2 3 4
+ punpcklbw xm3, xm1 ; 45 56
+ punpcklbw xm1, xm2, xm4 ; 01 12
+ punpckhbw xm2, xm4 ; 23 34
+.v_w4_loop:
+ pmaddubsw xm5, xm1, xm8 ; a0 b0
+ mova xm1, xm2
+ pmaddubsw xm2, xm9 ; a1 b1
+ paddw xm5, xm2
+ mova xm2, xm3
+ pmaddubsw xm3, xm10 ; a2 b2
+ paddw xm5, xm3
+ vpbroadcastd xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm4, xm0, 0x02 ; 7 8
+ punpcklbw xm3, xm4 ; 67 78
+ pmaddubsw xm4, xm3, xm11 ; a3 b3
+ paddw xm5, xm4
+ pmulhrsw xm5, xm7
+ packuswb xm5, xm5
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm1, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m2, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m5, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpbroadcastq m6, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m0, [srcq+ssq*0]
+ vpblendd m1, m4, 0x30
+ vpblendd m4, m2, 0x30
+ punpcklbw m1, m4 ; 01 12
+ vpblendd m2, m5, 0x30
+ vpblendd m5, m3, 0x30
+ punpcklbw m2, m5 ; 23 34
+ vpblendd m3, m6, 0x30
+ vpblendd m6, m0, 0x30
+ punpcklbw m3, m6 ; 45 56
+.v_w8_loop:
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m5, m1, m8 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, m9 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, m10 ; a2 b2
+ paddw m5, m3
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+ssq*0]
+ vpblendd m4, m0, 0x30
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, m11 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7
+ vextracti128 xm4, m5, 1
+ packuswb xm5, xm4
+ movq [dstq+dsq*0], xm5
+ movhps [dstq+dsq*1], xm5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+ lea r6d, [wq*8-128]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*2]
+.v_w16_loop0:
+ vbroadcasti128 m4, [srcq+ssq*0]
+ vbroadcasti128 m5, [srcq+ssq*1]
+ vbroadcasti128 m6, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vbroadcasti128 m1, [srcq+ssq*1]
+ vbroadcasti128 m2, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti128 m3, [srcq+ssq*0]
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
+ punpcklbw m1, m4, m5 ; 01
+ punpckhbw m4, m5 ; 34
+ shufpd m6, m2, 0x0c
+ punpcklbw m2, m5, m6 ; 12
+ punpckhbw m5, m6 ; 45
+ shufpd m0, m3, 0x0c
+ punpcklbw m3, m6, m0 ; 23
+ punpckhbw m6, m0 ; 56
+.v_w16_loop:
+ vbroadcasti128 m12, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m13, [srcq+ssq*0]
+ pmaddubsw m14, m1, m8 ; a0
+ pmaddubsw m15, m2, m8 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, m9 ; a1
+ pmaddubsw m4, m9 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, m10 ; a2
+ pmaddubsw m6, m10 ; b2
+ paddw m14, m5
+ paddw m15, m6
+ shufpd m6, m0, m12, 0x0d
+ shufpd m0, m12, m13, 0x0c
+ punpcklbw m5, m6, m0 ; 67
+ punpckhbw m6, m0 ; 78
+ pmaddubsw m12, m5, m11 ; a3
+ pmaddubsw m13, m6, m11 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ packuswb m14, m15
+ vpermq m14, m14, q3120
+ mova [dstq+dsq*0], xm14
+ vextracti128 [dstq+dsq*1], m14, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
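+ ; put_8tap 2-D (h+v) path: the horizontal stage produces rounded 16-bit
+ ; intermediates (pmulhrsw with pw_8192), the vertical stage accumulates
+ ; them with pmaddwd, rounds (pd_512) and shifts right by 10 to get pixels.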
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m8, [pw_8192]
+ vpbroadcastd m9, [pd_512]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 m6, [subpel_h_shuf4]
+ movq xm2, [srcq+ssq*0]
+ movhps xm2, [srcq+ssq*1]
+ movq xm0, [srcq+ssq*2]
+ add srcq, ss3q
+ movhps xm0, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpbroadcastq m4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m1, [srcq+ssq*0]
+ vpblendd m2, m3, 0x30
+ vpblendd m0, m1, 0x30
+ vpblendd m2, m4, 0xc0
+ pshufb m2, m6
+ pshufb m0, m6
+ pmaddubsw m2, m7
+ pmaddubsw m0, m7
+ phaddw m2, m0
+ pmulhrsw m2, m8
+ vextracti128 xm3, m2, 1
+ palignr xm4, xm3, xm2, 4
+ punpcklwd xm1, xm2, xm4 ; 01 12
+ punpckhwd xm2, xm4 ; 23 34
+ pshufd xm0, xm3, q2121
+ punpcklwd xm3, xm0 ; 45 56
+.hv_w2_loop:
+ movq xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm4, [srcq+ssq*0]
+ pshufb xm4, xm6
+ pmaddubsw xm4, xm7
+ pmaddwd xm5, xm1, xm10 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm11 ; a1 b1
+ paddd xm5, xm2
+ mova xm2, xm3
+ pmaddwd xm3, xm12 ; a2 b2
+ phaddw xm4, xm4
+ pmulhrsw xm4, xm8
+ paddd xm5, xm3
+ palignr xm3, xm4, xm0, 12
+ mova xm0, xm4
+ punpcklwd xm3, xm0 ; 67 78
+ pmaddwd xm4, xm3, xm13 ; a3 b3
+ paddd xm5, xm9
+ paddd xm5, xm4
+ psrad xm5, 10
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova m6, [subpel_h_shuf4]
+ vpbroadcastq m2, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m0, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m5, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpblendd m2, m4, 0xcc ; 0 1
+ vpbroadcastq m4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m1, [srcq+ssq*0]
+ vpblendd m0, m5, 0xcc ; 2 3
+ vpblendd m3, m4, 0xcc ; 4 5
+ pshufb m2, m6
+ pshufb m0, m6
+ pshufb m3, m6
+ pshufb m1, m6
+ pmaddubsw m2, m7
+ pmaddubsw m0, m7
+ pmaddubsw m3, m7
+ pmaddubsw m1, m7
+ phaddw m2, m0
+ phaddw m3, m1
+ pmulhrsw m2, m8
+ pmulhrsw m3, m8
+ palignr m4, m3, m2, 4
+ punpcklwd m1, m2, m4 ; 01 12
+ punpckhwd m2, m4 ; 23 34
+ pshufd m0, m3, q2121
+ punpcklwd m3, m0 ; 45 56
+.hv_w4_loop:
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m1, m10 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m11 ; a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m12 ; a2 b2
+ paddd m5, m3
+ vpbroadcastq m3, [srcq+ssq*0]
+ vpblendd m4, m3, 0xcc ; 7 8
+ pshufb m4, m6
+ pmaddubsw m4, m7
+ phaddw m4, m4
+ pmulhrsw m4, m8
+ palignr m3, m4, m0, 12
+ mova m0, m4
+ punpcklwd m3, m0 ; 67 78
+ pmaddwd m4, m3, m13 ; a3 b3
+ paddd m5, m9
+ paddd m5, m4
+ psrad m5, 10
+ vextracti128 xm4, m5, 1
+ packssdw xm5, xm4
+ packuswb xm5, xm5
+ pshuflw xm5, xm5, q3120
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0]
+ vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ lea r6d, [wq*8-64]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*4]
+.hv_w8_loop0:
+ vbroadcasti128 m7, [subpel_h_shufA]
+ movu xm4, [srcq+ssq*0]
+ vbroadcasti128 m8, [subpel_h_shufB]
+ movu xm5, [srcq+ssq*1]
+ vbroadcasti128 m9, [subpel_h_shufC]
+ movu xm6, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vpblendd m4, m0, 0xf0 ; 0 3
+ vinserti128 m5, [srcq+ssq*1], 1 ; 1 4
+ vinserti128 m6, [srcq+ssq*2], 1 ; 2 5
+ add srcq, ss3q
+ vinserti128 m0, [srcq+ssq*0], 1 ; 3 6
+%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
+ pshufb %3, %1, %6
+ pshufb %4, %1, %7
+ pshufb %1, %5
+ pmaddubsw %2, %3, m10
+ pmaddubsw %4, m11
+ pmaddubsw %3, m11
+ pmaddubsw %1, m10
+ paddw %2, %4
+ paddw %1, %3
+ phaddw %1, %2
+%endmacro
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9
+ vpbroadcastd m7, [pw_8192]
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ pmulhrsw m0, m7
+ pmulhrsw m4, m7
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ vpermq m7, m0, q3120
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vextracti128 r6m, m0, 1 ; not enough registers
+ movu xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m0, [srcq+ssq*0], 1 ; 7 8
+ pmaddwd m8, m1, m12 ; a0
+ pmaddwd m9, m2, m12 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m13 ; a1
+ pmaddwd m4, m13 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m14 ; a2
+ pmaddwd m6, m14 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ HV_H_W8 m0, m5, m6, m7, m5, m6, m7
+ vpbroadcastd m5, [pw_8192]
+ vpbroadcastd m7, [pd_512]
+ vbroadcasti128 m6, r6m
+ pmulhrsw m0, m5
+ paddd m8, m7
+ paddd m9, m7
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, m15 ; a3
+ paddd m8, m7
+ pmaddwd m7, m6, m15 ; b3
+ paddd m7, m9
+ psrad m8, 10
+ psrad m7, 10
+ packssdw m8, m7
+ vextracti128 xm7, m8, 1
+ packuswb xm8, xm7
+ pshufd xm7, xm8, q3120
+ movq [dstq+dsq*0], xm7
+ movhps [dstq+dsq*1], xm7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ add r4, 8
+ add r7, 8
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .hv_w8_loop0
+ RET
+
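+ ; Horizontal 8-tap on the source bytes in m0 (16 output pixels): shuffle
+ ; masks in m5-m7, coefficient pairs in m8/m9, pw_8192 rounding constant in m4.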
+%macro PREP_8TAP_H 0
+ pshufb m1, m0, m5
+ pshufb m2, m0, m6
+ pshufb m3, m0, m7
+ pmaddubsw m1, m8
+ pmaddubsw m0, m2, m8
+ pmaddubsw m2, m9
+ pmaddubsw m3, m9
+ paddw m1, m2
+ paddw m0, m3
+ phaddw m0, m1, m0
+ pmulhrsw m0, m4
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep%+SUFFIX]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ add wq, r7
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m4, [pw_8192]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ WIN64_SPILL_XMM 10
+ cmp wd, 4
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
+ vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
+ add wq, r7
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
+ lea stride3q, [strideq*3]
+.h_w4_loop:
+ movq xm0, [srcq+strideq*0]
+ vpbroadcastq m2, [srcq+strideq*2]
+ movq xm1, [srcq+strideq*1]
+ vpblendd m0, m2, 0xf0
+ vpbroadcastq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m2, 0xf0
+ pshufb m0, m5
+ pshufb m1, m5
+ pmaddubsw m0, m6
+ pmaddubsw m1, m6
+ phaddw m0, m1
+ pmulhrsw m0, m4
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ PREP_8TAP_H
+ mova [tmpq+32*0], m0
+ movu xm0, [srcq+strideq*1+8*0]
+ vinserti128 m0, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ mova [tmpq+32*1], m0
+ add tmpq, 32*2
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ xor r6d, r6d
+ jmp .h_start
+.h_w64:
+ mov r6, -32*1
+ jmp .h_start
+.h_w128:
+ mov r6, -32*3
+.h_start:
+ sub srcq, r6
+ mov r5, r6
+.h_loop:
+ movu xm0, [srcq+r6+8*0]
+ vinserti128 m0, [srcq+r6+8*1], 1
+ PREP_8TAP_H
+ mova [tmpq+32*0], m0
+ movu xm0, [srcq+r6+8*2]
+ vinserti128 m0, [srcq+r6+8*3], 1
+ PREP_8TAP_H
+ mova [tmpq+32*1], m0
+ add tmpq, 32*2
+ add r6, 32
+ jle .h_loop
+ add srcq, strideq
+ mov r6, r5
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
+ shr myd, 16 ; Note that the code is 8-tap only, having
+ cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
+ cmove myd, mxd ; had a negligible effect on performance.
+ ; TODO: Would a 6-tap code path be worth it?
+ lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ vpbroadcastd m7, [pw_8192]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ cmp wd, 8
+ jg .v_w16
+ je .v_w8
+.v_w4:
+ movd xm0, [srcq+strideq*0]
+ vpbroadcastd m1, [srcq+strideq*2]
+ vpbroadcastd xm2, [srcq+strideq*1]
+ add srcq, stride3q
+ vpbroadcastd m3, [srcq+strideq*0]
+ vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _
+ vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _
+ vpbroadcastd m0, [srcq+strideq*1]
+ vpbroadcastd m2, [srcq+strideq*2]
+ vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _
+ vpbroadcastd m0, [srcq+stride3q ]
+ vbroadcasti128 m5, [deint_shuf4]
+ vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5
+ vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5
+ vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _
+ punpcklbw m1, m2, m3 ; 01 12 23 34
+ vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6
+ punpckhbw m2, m3 ; 23 34 45 56
+.v_w4_loop:
+ lea srcq, [srcq+strideq*4]
+ pinsrd xm0, [srcq+strideq*0], 1
+ vpbroadcastd m3, [srcq+strideq*1]
+ vpbroadcastd m4, [srcq+strideq*2]
+ vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 _ _ _
+ vpbroadcastd m0, [srcq+stride3q ]
+ vpblendd m3, m4, 0x20 ; 6 7 8 _ 8 9 _ _
+ vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _
+ pshufb m3, m5 ; 67 78 89 9a
+ pmaddubsw m4, m1, m8
+ vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78
+ pmaddubsw m2, m9
+ paddw m4, m2
+ mova m2, m3
+ pmaddubsw m3, m11
+ paddw m3, m4
+ pmaddubsw m4, m1, m10
+ paddw m3, m4
+ pmulhrsw m3, m7
+ mova [tmpq], m3
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm1, [srcq+strideq*0]
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq m5, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m6, [srcq+strideq*1]
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpblendd m1, m4, 0x30
+ vpblendd m4, m2, 0x30
+ punpcklbw m1, m4 ; 01 12
+ vpblendd m2, m5, 0x30
+ vpblendd m5, m3, 0x30
+ punpcklbw m2, m5 ; 23 34
+ vpblendd m3, m6, 0x30
+ vpblendd m6, m0, 0x30
+ punpcklbw m3, m6 ; 45 56
+.v_w8_loop:
+ vpbroadcastq m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmaddubsw m5, m2, m9 ; a1
+ pmaddubsw m6, m2, m8 ; b0
+ vpblendd m2, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+strideq*0]
+ vpblendd m4, m0, 0x30
+ punpcklbw m2, m4 ; 67 78
+ pmaddubsw m1, m8 ; a0
+ pmaddubsw m4, m3, m9 ; b1
+ paddw m5, m1
+ mova m1, m3
+ pmaddubsw m3, m10 ; a2
+ paddw m6, m4
+ paddw m5, m3
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpblendd m4, m0, 0x30
+ punpcklbw m3, m4 ; 89 9a
+ pmaddubsw m4, m2, m11 ; a3
+ paddw m5, m4
+ pmaddubsw m4, m2, m10 ; b2
+ paddw m6, m4
+ pmaddubsw m4, m3, m11 ; b3
+ paddw m6, m4
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ mova [tmpq+32*0], m5
+ mova [tmpq+32*1], m6
+ add tmpq, 32*2
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ add wd, wd
+ mov r5, srcq
+ mov r7, tmpq
+ lea r6d, [hq+wq*8-256]
+.v_w16_loop0:
+ vbroadcasti128 m4, [srcq+strideq*0]
+ vbroadcasti128 m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m0, [srcq+strideq*1]
+ vbroadcasti128 m6, [srcq+strideq*0]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m1, [srcq+strideq*0]
+ vbroadcasti128 m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m3, [srcq+strideq*0]
+ shufpd m4, m4, m0, 0x0c
+ shufpd m5, m5, m1, 0x0c
+ punpcklbw m1, m4, m5 ; 01
+ punpckhbw m4, m5 ; 34
+ shufpd m6, m6, m2, 0x0c
+ punpcklbw m2, m5, m6 ; 12
+ punpckhbw m5, m6 ; 45
+ shufpd m0, m0, m3, 0x0c
+ punpcklbw m3, m6, m0 ; 23
+ punpckhbw m6, m0 ; 56
+.v_w16_loop:
+ vbroadcasti128 m12, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m13, [srcq+strideq*0]
+ pmaddubsw m14, m1, m8 ; a0
+ pmaddubsw m15, m2, m8 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, m9 ; a1
+ pmaddubsw m4, m9 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, m10 ; a2
+ pmaddubsw m6, m10 ; b2
+ paddw m14, m5
+ paddw m15, m6
+ shufpd m6, m0, m12, 0x0d
+ shufpd m0, m12, m13, 0x0c
+ punpcklbw m5, m6, m0 ; 67
+ punpckhbw m6, m0 ; 78
+ pmaddubsw m12, m5, m11 ; a3
+ pmaddubsw m13, m6, m11 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ mova [tmpq+wq*0], m14
+ mova [tmpq+wq*1], m15
+ lea tmpq, [tmpq+wq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ add r5, 16
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign stack_size_padded 0
+ WIN64_SPILL_XMM 16
+ cmp wd, 4
+ je .hv_w4
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
+ vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ jmp .hv_w8
+.hv_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ mova m7, [subpel_h_shuf4]
+ pmovzxbd m9, [deint_shuf4]
+ vpbroadcastd m10, [pw_8192]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m11, [pd_32]
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ vpbroadcastq m2, [srcq+strideq*0]
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpbroadcastq m5, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m6, [srcq+strideq*1]
+ vpbroadcastq m1, [srcq+strideq*2]
+ vpblendd m2, m4, 0xcc ; 0 1
+ vpblendd m0, m5, 0xcc ; 2 3
+ vpblendd m3, m6, 0xcc ; 4 5
+ pshufb m2, m7 ; 00 01 10 11 02 03 12 13
+ pshufb m0, m7 ; 20 21 30 31 22 23 32 33
+ pshufb m3, m7 ; 40 41 50 51 42 43 52 53
+ pshufb m1, m7 ; 60 61 60 61 62 63 62 63
+ pmaddubsw m2, m8
+ pmaddubsw m0, m8
+ pmaddubsw m3, m8
+ pmaddubsw m1, m8
+ phaddw m2, m0 ; 0a 1a 2a 3a 0b 1b 2b 3b
+ phaddw m3, m1 ; 4a 5a 6a __ 4b 5b 6b __
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ palignr m4, m3, m2, 4 ; 1a 2a 3a 4a 1b 2b 3b 4b
+ punpcklwd m1, m2, m4 ; 01 12
+ punpckhwd m2, m4 ; 23 34
+ pshufd m0, m3, q2121
+ punpcklwd m3, m0 ; 45 56
+.hv_w4_loop:
+ pmaddwd m5, m1, m12 ; a0 b0
+ pmaddwd m6, m2, m12 ; c0 d0
+ pmaddwd m2, m13 ; a1 b1
+ pmaddwd m4, m3, m13 ; c1 d1
+ mova m1, m3
+ pmaddwd m3, m14 ; a2 b2
+ paddd m5, m2
+ vpbroadcastq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ paddd m6, m4
+ vpbroadcastq m4, [srcq+strideq*0]
+ paddd m5, m3
+ vpbroadcastq m3, [srcq+strideq*1]
+ vpblendd m2, m4, 0xcc
+ vpbroadcastq m4, [srcq+strideq*2]
+ vpblendd m3, m4, 0xcc
+ pshufb m2, m7
+ pshufb m3, m7
+ pmaddubsw m2, m8
+ pmaddubsw m3, m8
+ phaddw m2, m3
+ pmulhrsw m2, m10
+ palignr m3, m2, m0, 12
+ mova m0, m2
+ punpcklwd m2, m3, m0 ; 67 78
+ punpckhwd m3, m0 ; 89 9a
+ pmaddwd m4, m2, m14 ; c2 d2
+ paddd m6, m11
+ paddd m5, m11
+ paddd m6, m4
+ pmaddwd m4, m2, m15 ; a3 b3
+ paddd m5, m4
+ pmaddwd m4, m3, m15 ; c3 d3
+ paddd m6, m4
+ psrad m5, 6
+ psrad m6, 6
+ packssdw m5, m6
+ vpermd m5, m9, m5
+ mova [tmpq], m5
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ lea r6d, [wq*8-64]
+ mov r5, srcq
+ mov r7, tmpq
+ lea r6d, [hq+r6*4]
+.hv_w8_loop0:
+ vbroadcasti128 m7, [subpel_h_shufA]
+ movu xm4, [srcq+strideq*0]
+ vbroadcasti128 m8, [subpel_h_shufB]
+ movu xm5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m9, [subpel_h_shufC]
+ movu xm6, [srcq+strideq*0]
+ vbroadcasti128 m0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpblendd m4, m0, 0xf0 ; 0 3
+ vinserti128 m5, [srcq+strideq*0], 1 ; 1 4
+ vinserti128 m6, [srcq+strideq*1], 1 ; 2 5
+ lea srcq, [srcq+strideq*2]
+ vinserti128 m0, [srcq+strideq*0], 1 ; 3 6
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9
+ vpbroadcastd m7, [pw_8192]
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ pmulhrsw m0, m7
+ pmulhrsw m4, m7
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ vpermq m7, m0, q3120
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vextracti128 [tmpq], m0, 1 ; not enough registers
+ movu xm0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti128 m0, [srcq+strideq*0], 1 ; 7 8
+ pmaddwd m8, m1, m12 ; a0
+ pmaddwd m9, m2, m12 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m13 ; a1
+ pmaddwd m4, m13 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m14 ; a2
+ pmaddwd m6, m14 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ HV_H_W8 m0, m5, m6, m7, m5, m6, m7
+ vpbroadcastd m5, [pw_8192]
+ vpbroadcastd m7, [pd_32]
+ vbroadcasti128 m6, [tmpq]
+ pmulhrsw m0, m5
+ paddd m8, m7
+ paddd m9, m7
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, m15 ; a3
+ paddd m8, m7
+ pmaddwd m7, m6, m15 ; b3
+ paddd m7, m9
+ psrad m8, 6
+ psrad m7, 6
+ packssdw m8, m7
+ vpermq m7, m8, q3120
+ mova [tmpq+wq*0], xm7
+ vextracti128 [tmpq+wq*2], m7, 1
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .hv_w8_loop
+ add r5, 8
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .hv_w8_loop0
+ RET
+
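+ ; Scaled (fractional-step) MC helpers. movifprep only assembles its mov for
+ ; the prep variant, and the REMAP_REG/MCT_* macros shift the register names
+ ; down by one for prep so put and prep can share the MC_8TAP_SCALED body.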
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro REMAP_REG 2
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %xdefine r14_save r14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ %xdefine r14 r14_save
+ %undef r14_save
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
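+ ; Load two source rows at 8 per-column offsets (r4,r6,r7,r9,r10,r11,r13,rX),
+ ; apply the per-column 8-tap filters held in m15/m10 and round to 16-bit
+ ; intermediates with m12.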
+%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
+ movq xm%1, [srcq+ r4]
+ movq xm%2, [srcq+ r6]
+ movhps xm%1, [srcq+ r7]
+ movhps xm%2, [srcq+ r9]
+ vinserti128 m%1, [srcq+r10], 1
+ vinserti128 m%2, [srcq+r11], 1
+ vpbroadcastq m%5, [srcq+r13]
+ vpbroadcastq m%6, [srcq+ rX]
+ add srcq, ssq
+ movq xm%3, [srcq+ r4]
+ movq xm%4, [srcq+ r6]
+ movhps xm%3, [srcq+ r7]
+ movhps xm%4, [srcq+ r9]
+ vinserti128 m%3, [srcq+r10], 1
+ vinserti128 m%4, [srcq+r11], 1
+ vpbroadcastq m%7, [srcq+r13]
+ vpbroadcastq m%8, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m%1, m%5, 0xc0
+ vpblendd m%2, m%6, 0xc0
+ vpblendd m%3, m%7, 0xc0
+ vpblendd m%4, m%8, 0xc0
+ pmaddubsw m%1, m15
+ pmaddubsw m%2, m10
+ pmaddubsw m%3, m15
+ pmaddubsw m%4, m10
+ phaddw m%1, m%2
+ phaddw m%3, m%4
+ phaddw m%1, m%3
+ pmulhrsw m%1, m12
+%endmacro
+
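+ ; Emits put_8tap_scaled or prep_8tap_scaled depending on %1. The vertical
+ ; step dy has 10 fractional bits; dy == 1024 (1.0) and dy == 2048 (2.0)
+ ; take the dedicated .dy1/.dy2 paths below.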
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isprep 0
+cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %xdefine base_reg r12
+ %define rndshift 10
+%else
+ %assign isprep 1
+cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
+ %define tmp_stridem qword [rsp+120]
+ %xdefine base_reg r11
+ %define rndshift 6
+%endif
+ lea base_reg, [%1_8tap_scaled_8bpc_avx2]
+%define base base_reg-%1_8tap_scaled_8bpc_avx2
+ tzcnt wd, wm
+ vpbroadcastd m8, dxm
+%if isprep && UNIX64
+ movd xm14, mxd
+ vpbroadcastd m14, xm14
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+%else
+ vpbroadcastd m14, mxm
+%endif
+ mov dyd, dym
+%ifidn %1, put
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %else
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %endif
+ %define dsm [rsp+112]
+ %define rX r1
+ %define rXd r1d
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %else
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm [rsp+112]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define rX r14
+ %define rXd r14d
+%endif
+ vpbroadcastd m10, [base+pd_0x3ff]
+ vpbroadcastd m12, [base+pw_8192]
+%ifidn %1, put
+ vpbroadcastd m13, [base+pd_512]
+%else
+ vpbroadcastd m13, [base+pd_32]
+%endif
+ pxor m9, m9
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m14, m8 ; mx+dx*[0,1]
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*2]
+ movhps xm0, [srcq+ssq*1]
+ movhps xm1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m0, [srcq+ssq*0], 1
+ vinserti128 m1, [srcq+ssq*2], 1
+ vpbroadcastq m2, [srcq+ssq*1]
+ vpbroadcastq m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ vpblendd m15, m7, 0xaa
+ vpblendd m0, m2, 0xc0 ; 0 1 4 5
+ vpblendd m1, m3, 0xc0 ; 2 3 6 7
+ pblendvb m15, m11, m8
+ pshufb m0, m14
+ pshufb m1, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ phaddw m0, m1
+ pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7
+ vextracti128 xm1, m0, 1 ; 4 5 6 7
+ palignr xm2, xm1, xm0, 4 ; 1 2 3 4
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ pshufd xm4, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm4 ; 45 56
+ punpckhwd xm4, xm1, xm4 ; 67 __
+.w2_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm11, r6q
+ pmovsxbw xm11, xm11
+ pshufd xm8, xm11, q0000
+ pshufd xm9, xm11, q1111
+ pshufd xm10, xm11, q2222
+ pshufd xm11, xm11, q3333
+ pmaddwd xm5, xm3, xm8
+ pmaddwd xm6, xm0, xm9
+ pmaddwd xm7, xm2, xm10
+ pmaddwd xm8, xm4, xm11
+ paddd xm5, xm6
+ paddd xm7, xm8
+ paddd xm5, xm13
+ paddd xm5, xm7
+ psrad xm5, 10
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq], xm5, 0
+ add dstq, dsq
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w2_loop
+ movq xm5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps xm3, xm0, q1032 ; 01 12
+ shufps xm0, xm2, q1032 ; 23 34
+ shufps xm2, xm4, q1032 ; 45 56
+ pshufb xm5, xm14
+ pmaddubsw xm5, xm15
+ phaddw xm5, xm5
+ pmulhrsw xm5, xm12
+ palignr xm1, xm5, xm1, 12
+ punpcklqdq xm1, xm1 ; 6 7 6 7
+ punpcklwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+.w2_skip_line:
+ movhps xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova xm3, xm0 ; 01 12
+ mova xm0, xm2 ; 23 34
+ pshufb xm5, xm14
+ pmaddubsw xm5, xm15
+ phaddw xm5, xm5
+ pmulhrsw xm5, xm12 ; 6 7 6 7
+ palignr xm1, xm5, xm1, 8 ; 4 5 6 7
+ pshufd xm5, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm5 ; 45 56
+ punpckhwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+%endif
+.w4:
+ mov myd, mym
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd xm15, xm0
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ movd xm15, [base+subpel_filters+r4*8+2]
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vpbroadcastq m6, [base+subpel_s_shuf2]
+ pinsrd xm15, [base+subpel_filters+r6*8+2], 1
+ pcmpeqd m0, m9
+ psrld m14, 10
+ movu xm7, [srcq+ssq*0]
+ movu xm9, [srcq+ssq*1]
+ pinsrd xm15, [base+subpel_filters+r11*8+2], 2
+ movu xm8, [srcq+ssq*2]
+ movu xm10, [srcq+ss3q ]
+ pinsrd xm15, [base+subpel_filters+r13*8+2], 3
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m7, [srcq+ssq*0], 1
+ vinserti128 m9, [srcq+ssq*1], 1
+ vinserti128 m15, xm15, 1
+ vinserti128 m8, [srcq+ssq*2], 1
+ vinserti128 m10, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ pblendvb m15, m11, m0
+ pshufb m7, m14
+ pshufb m9, m14
+ pshufb m8, m14
+ pshufb m10, m14
+ pmaddubsw m7, m15
+ pmaddubsw m9, m15
+ pmaddubsw m8, m15
+ pmaddubsw m10, m15
+ phaddw m7, m9
+ phaddw m8, m10
+ pmulhrsw m7, m12 ; 0 1 4 5
+ pmulhrsw m8, m12 ; 2 3 6 7
+ vextracti128 xm9, m7, 1 ; 4 5
+ vextracti128 xm3, m8, 1 ; 6 7
+ shufps xm4, xm7, xm8, q1032 ; 1 2
+ shufps xm5, xm8, xm9, q1032 ; 3 4
+ shufps xm6, xm9, xm3, q1032 ; 5 6
+ psrldq xm11, xm3, 8 ; 7 _
+ punpcklwd xm0, xm7, xm4 ; 01
+ punpckhwd xm7, xm4 ; 12
+ punpcklwd xm1, xm8, xm5 ; 23
+ punpckhwd xm8, xm5 ; 34
+ punpcklwd xm2, xm9, xm6 ; 45
+ punpckhwd xm9, xm6 ; 56
+ punpcklwd xm3, xm11 ; 67
+ mova [rsp+0x00], xm7
+ mova [rsp+0x10], xm8
+ mova [rsp+0x20], xm9
+.w4_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm10, r6q
+ pmovsxbw xm10, xm10
+ pshufd xm7, xm10, q0000
+ pshufd xm8, xm10, q1111
+ pshufd xm9, xm10, q2222
+ pshufd xm10, xm10, q3333
+ pmaddwd xm4, xm0, xm7
+ pmaddwd xm5, xm1, xm8
+ pmaddwd xm6, xm2, xm9
+ pmaddwd xm7, xm3, xm10
+ paddd xm4, xm5
+ paddd xm6, xm7
+ paddd xm4, xm13
+ paddd xm4, xm6
+ psrad xm4, rndshift
+ packssdw xm4, xm4
+%ifidn %1, put
+ packuswb xm4, xm4
+ movd [dstq], xm4
+ add dstq, dsq
+%else
+ movq [tmpq], xm4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+ movu xm4, [srcq]
+ test myd, 0x400
+ jz .w4_skip_line
+ mova xm0, [rsp+0x00]
+ mova [rsp+0x00], xm1
+ mova xm1, [rsp+0x10]
+ mova [rsp+0x10], xm2
+ mova xm2, [rsp+0x20]
+ mova [rsp+0x20], xm3
+ pshufb xm4, xm14
+ pmaddubsw xm4, xm15
+ phaddw xm4, xm4
+ pmulhrsw xm4, xm12
+ punpcklwd xm3, xm11, xm4
+ mova xm11, xm4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu xm5, [srcq+ssq*1]
+ movu m6, [rsp+0x10]
+ pshufb xm4, xm14
+ pshufb xm5, xm14
+ pmaddubsw xm4, xm15
+ pmaddubsw xm5, xm15
+ movu [rsp+0x00], m6
+ phaddw xm4, xm5
+ pmulhrsw xm4, xm12
+ punpcklwd xm9, xm11, xm4
+ mova [rsp+0x20], xm9
+ psrldq xm11, xm4, 8
+ mova xm0, xm1
+ mova xm1, xm2
+ mova xm2, xm3
+ punpcklwd xm3, xm4, xm11
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+.w8:
+ mov dword [rsp+48], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
+.w16:
+ mov dword [rsp+48], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [rsp+48], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [rsp+48], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [rsp+48], 16
+ movifprep tmp_stridem, 256
+.w_start:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ pmaddwd m8, [base+rescale_mul]
+ movd xm15, t0d
+ mov [rsp+72], t0d
+ mov [rsp+56], srcq
+ mov [rsp+64], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ jmp .hloop
+.hloop_prep:
+ dec dword [rsp+48]
+ jz .ret
+ add qword [rsp+64], 8*(isprep+1)
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m10, [base+pd_0x3ff]
+ paddd m14, m8, [rsp+16]
+ vpbroadcastd m15, [rsp+72]
+ pxor m9, m9
+ mov srcq, [rsp+56]
+ mov r0q, [rsp+64] ; dstq / tmpq
+.hloop:
+ vpbroadcastq m11, [base+pq_0x40000000]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movu [rsp+16], m14
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ mova [rsp], xm14
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ mov myd, mym
+ mov dyd, dym
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ vbroadcasti128 m14, [base+wswap]
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm11, r6q
+ punpcklqdq xm11, xm11
+ pmovsxbw m11, xm11
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pshufd m8, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m6, m2, m8
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [rsp+52], myd
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ jz .skip_line
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ add srcq, ssq
+ mov myd, [rsp+52]
+ mov dyd, dym
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, m10
+ phaddw m4, m5
+ pslld m5, m4, 16
+ paddw m4, m5
+ pmulhrsw m4, m12
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .vloop
+.skip_line:
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ vpbroadcastq m7, [srcq+r13]
+ vpbroadcastq m8, [srcq+ rX]
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
+ vinserti128 m3, [srcq+r10], 1
+ vinserti128 m4, [srcq+r11], 1
+ add srcq, ssq
+ movq xm5, [srcq+ r4]
+ movq xm6, [srcq+ r6]
+ movhps xm5, [srcq+ r7]
+ movhps xm6, [srcq+ r9]
+ vinserti128 m5, [srcq+r10], 1
+ vinserti128 m6, [srcq+r11], 1
+ vpbroadcastq m9, [srcq+r13]
+ vpbroadcastq m11, [srcq+ rX]
+ add srcq, ssq
+ mov myd, [rsp+52]
+ mov dyd, dym
+ vpblendd m3, m7, 0xc0
+ vpblendd m4, m8, 0xc0
+ vpblendd m5, m9, 0xc0
+ vpblendd m6, m11, 0xc0
+ pmaddubsw m3, m15
+ pmaddubsw m4, m10
+ pmaddubsw m5, m15
+ pmaddubsw m6, m10
+ phaddw m3, m4
+ phaddw m5, m6
+ psrld m4, m3, 16
+ pslld m6, m5, 16
+ paddw m3, m4
+ paddw m5, m6
+ pblendw m3, m5, 0xaa
+ pmulhrsw m3, m12
+ jmp .vloop
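+ ; dy == 1.0: one new source row per output row and a constant vertical
+ ; subpel phase, so the vertical filter coefficients are computed once
+ ; instead of once per output row.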
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy1_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m14, m8 ; mx+dx*[0-1]
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*2]
+ movhps xm0, [srcq+ssq*1]
+ movhps xm1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m0, [srcq+ssq*0], 1
+ vinserti128 m1, [srcq+ssq*2], 1
+ vpbroadcastq m2, [srcq+ssq*1]
+ add srcq, ss3q
+ movq xm10, r4q
+ pmovsxbw xm10, xm10
+ vpblendd m15, m7, 0xaa
+ pblendvb m15, m11, m8
+ pshufd xm8, xm10, q0000
+ pshufd xm9, xm10, q1111
+ pshufd xm11, xm10, q3333
+ pshufd xm10, xm10, q2222
+ vpblendd m0, m2, 0xc0
+ pshufb m1, m14
+ pshufb m0, m14
+ pmaddubsw m1, m15
+ pmaddubsw m0, m15
+ phaddw m0, m1
+ pmulhrsw m0, m12
+ vextracti128 xm1, m0, 1
+ palignr xm2, xm1, xm0, 4
+ pshufd xm4, xm1, q2121
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ punpcklwd xm2, xm1, xm4 ; 45 56
+.dy1_w2_loop:
+ movq xm1, [srcq+ssq*0]
+ movhps xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd xm5, xm3, xm8
+ pmaddwd xm6, xm0, xm9
+ pmaddwd xm7, xm2, xm10
+ mova xm3, xm0
+ mova xm0, xm2
+ paddd xm5, xm13
+ paddd xm6, xm7
+ pshufb xm1, xm14
+ pmaddubsw xm1, xm15
+ phaddw xm1, xm1
+ pmulhrsw xm1, xm12
+ palignr xm7, xm1, xm4, 12
+ punpcklwd xm2, xm7, xm1 ; 67 78
+ pmaddwd xm7, xm2, xm11
+ mova xm4, xm1
+ paddd xm5, xm6
+ paddd xm5, xm7
+ psrad xm5, rndshift
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+%endif
+.dy1_w4:
+ mov myd, mym
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ vpermq m8, m8, q3120
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r11d, xm15, 1
+ pextrd r13d, xm15, 3
+ movd xm15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ movu xm2, [srcq+ssq*0]
+ movu xm3, [srcq+ssq*2]
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vpbroadcastq m6, [base+subpel_s_shuf2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ pinsrd xm15, [base+subpel_filters+r11*8+2], 1
+ vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20
+ vinserti128 m2, [srcq+ssq*1], 1
+ vinserti128 m3, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ movu xm4, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*2]
+ vinserti128 m4, [srcq+ssq*1], 1
+ add srcq, ss3q
+ vpblendd m15, m7, 0x30
+ punpcklqdq m15, m15
+ pblendvb m15, m11, m8
+ movq xm10, r4q
+ punpcklqdq xm10, xm10
+ pmovsxbw m10, xm10
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m4, m14
+ pshufb xm5, xm14
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q3120
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ phaddw m2, m3
+ phaddw m4, m5
+ pmulhrsw m2, m12
+ pmulhrsw m4, m12
+ palignr m5, m4, m2, 4
+ pshufd m3, m4, q2121
+ punpcklwd m0, m2, m5 ; 01 12
+ punpckhwd m1, m2, m5 ; 23 34
+ punpcklwd m2, m4, m3 ; 45 56
+.dy1_w4_loop:
+ movu xm11, [srcq+ssq*0]
+ vinserti128 m11, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ mova m0, m1
+ mova m1, m2
+ paddd m4, m13
+ paddd m5, m6
+ pshufb m11, m14
+ vpermq m11, m11, q3120
+ pmaddubsw m11, m15
+ phaddw m11, m11
+ pmulhrsw m11, m12
+ palignr m6, m11, m3, 12
+ punpcklwd m2, m6, m11 ; 67 78
+ mova m3, m11
+ pmaddwd m6, m2, m10
+ paddd m4, m5
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ pshuflw xm4, xm4, q3120
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+%else
+ pshufd xm4, xm4, q3120
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy1_w4_loop
+ MC_8TAP_SCALED_RET
+.dy1_w8:
+ mov dword [rsp+72], 1
+ movifprep tmp_stridem, 16
+ jmp .dy1_w_start
+.dy1_w16:
+ mov dword [rsp+72], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [rsp+72], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [rsp+72], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [rsp+72], 16
+ movifprep tmp_stridem, 256
+.dy1_w_start:
+ mov myd, mym
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pmaddwd m8, [base+rescale_mul]
+ movd xm15, t0d
+ mov [rsp+76], t0d
+ mov [rsp+80], srcq
+ mov [rsp+88], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ movq xm0, r4q
+ pmovsxbw xm0, xm0
+ mova [rsp+96], xm0
+ jmp .dy1_hloop
+.dy1_hloop_prep:
+ dec dword [rsp+72]
+ jz .ret
+ add qword [rsp+88], 8*(isprep+1)
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m10, [base+pd_0x3ff]
+ paddd m14, m8, [rsp+32]
+ vpbroadcastd m15, [rsp+76]
+ pxor m9, m9
+ mov srcq, [rsp+80]
+ mov r0q, [rsp+88] ; dstq / tmpq
+.dy1_hloop:
+ vpbroadcastq m11, [base+pq_0x40000000]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movu [rsp+32], m14
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ movq [rsp+64], xm14
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ movu [rsp], m10
+ vpbroadcastd m8, [rsp+0x60]
+ vpbroadcastd m9, [rsp+0x64]
+ vpbroadcastd m10, [rsp+0x68]
+ vpbroadcastd m11, [rsp+0x6c]
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ vbroadcasti128 m14, [base+wswap]
+.dy1_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ add srcq, ssq
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, [rsp]
+ phaddw m4, m5
+ pslld m5, m4, 16
+ paddw m4, m5
+ pmulhrsw m4, m12
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .dy1_vloop
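+ ; dy == 2.0: two new source rows are filtered per output row, again with
+ ; constant vertical filter coefficients.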
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy2_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m14, m8 ; mx+dx*[0-1]
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq xm0, [srcq+ssq*0]
+ vpbroadcastq m2, [srcq+ssq*1]
+ movhps xm0, [srcq+ssq*2]
+ vpbroadcastq m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vpblendd m15, m7, 0xaa
+ pblendvb m15, m11, m8
+ movhps xm1, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ vpblendd m0, m2, 0x30
+ vpblendd m1, m4, 0xc0
+ vpblendd m0, m3, 0xc0
+ pshufb m0, m14
+ pshufb m1, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ movq xm11, r4q
+ pmovsxbw xm11, xm11
+ phaddw m0, m1
+ pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5
+ pshufd xm8, xm11, q0000
+ pshufd xm9, xm11, q1111
+ pshufd xm10, xm11, q2222
+ pshufd xm11, xm11, q3333
+ pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5
+ vextracti128 xm1, m2, 1
+ punpcklwd xm3, xm2, xm1 ; 01 23
+ punpckhwd xm2, xm1 ; 23 45
+.dy2_w2_loop:
+ movq xm6, [srcq+ssq*0]
+ vpbroadcastq m7, [srcq+ssq*1]
+ movhps xm6, [srcq+ssq*2]
+ vpbroadcastq m1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pmaddwd xm4, xm3, xm8
+ pmaddwd xm5, xm2, xm9
+ vpblendd m6, m7, 0x30
+ vpblendd m6, m1, 0xc0
+ pshufb m6, m14
+ pmaddubsw m6, m15
+ phaddw m6, m6
+ pmulhrsw m6, m12
+ palignr m0, m6, m0, 8
+ pshufd m2, m0, q3221
+ vextracti128 xm1, m2, 1
+ punpcklwd xm3, xm2, xm1 ; 45 67
+ punpckhwd xm2, xm1 ; 67 89
+ pmaddwd xm6, xm3, xm10
+ pmaddwd xm7, xm2, xm11
+ paddd xm4, xm5
+ paddd xm4, xm13
+ paddd xm6, xm7
+ paddd xm4, xm6
+ psrad xm4, rndshift
+ packssdw xm4, xm4
+ packuswb xm4, xm4
+ pextrw [dstq+dsq*0], xm4, 0
+ pextrw [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+%endif
+.dy2_w4:
+ mov myd, mym
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ movd xm15, [base+subpel_filters+r4*8+2]
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vpbroadcastq m6, [base+subpel_s_shuf2]
+ pinsrd xm15, [base+subpel_filters+r6*8+2], 1
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movu xm0, [srcq+ssq*0]
+ movu xm2, [srcq+ssq*2]
+ pinsrd xm15, [base+subpel_filters+r11*8+2], 2
+ movu xm1, [srcq+ssq*1]
+ movu xm3, [srcq+ss3q ]
+ pinsrd xm15, [base+subpel_filters+r13*8+2], 3
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ vinserti128 m15, xm15, 1
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m2, [srcq+ssq*0], 1
+ vinserti128 m3, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pblendvb m15, m11, m8
+ pshufb xm0, xm14
+ pshufb m2, m14
+ pshufb xm1, xm14
+ pshufb m3, m14
+ pmaddubsw xm0, xm15
+ pmaddubsw m2, m15
+ pmaddubsw xm1, xm15
+ pmaddubsw m3, m15
+ movq xm11, r4q
+ punpcklqdq xm11, xm11
+ pmovsxbw m11, xm11
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12 ; 0 2 _ 4
+ pmulhrsw m1, m12 ; 1 3 _ 5
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ punpcklwd xm2, xm0, xm1
+ punpckhwd m1, m0, m1 ; 23 45
+ vinserti128 m0, m2, xm1, 1 ; 01 23
+.dy2_w4_loop:
+ movu xm6, [srcq+ssq*0]
+ movu xm7, [srcq+ssq*1]
+ vinserti128 m6, [srcq+ssq*2], 1
+ vinserti128 m7, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pshufb m6, m14
+ pshufb m7, m14
+ pmaddubsw m6, m15
+ pmaddubsw m7, m15
+ psrld m2, m6, 16
+ pslld m3, m7, 16
+ paddw m6, m2
+ paddw m7, m3
+ pblendw m6, m7, 0xaa ; 67 89
+ pmulhrsw m6, m12
+ paddd m4, m5
+ vperm2i128 m0, m1, m6, 0x21 ; 45 67
+ mova m1, m6
+ pmaddwd m6, m0, m10
+ pmaddwd m7, m1, m11
+ paddd m4, m13
+ paddd m6, m7
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET
+.dy2_w8:
+ mov dword [rsp+40], 1
+ movifprep tmp_stridem, 16
+ jmp .dy2_w_start
+.dy2_w16:
+ mov dword [rsp+40], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [rsp+40], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [rsp+40], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [rsp+40], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+ mov myd, mym
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pmaddwd m8, [base+rescale_mul]
+ movd xm15, t0d
+ mov [rsp+64], t0d
+ mov [rsp+48], srcq
+ mov [rsp+56], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ movq xm0, r4q
+ pmovsxbw xm0, xm0
+ mova [rsp+0x50], xm0
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [rsp+40]
+ jz .ret
+ add qword [rsp+56], 8*(isprep+1)
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m10, [base+pd_0x3ff]
+ paddd m14, m8, [rsp]
+ vpbroadcastd m15, [rsp+64]
+ pxor m9, m9
+ mov srcq, [rsp+48]
+ mov r0q, [rsp+56] ; dstq / tmpq
+.dy2_hloop:
+ vpbroadcastq m11, [base+pq_0x40000000]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movu [rsp], m14
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ vpbroadcastd m8, [rsp+0x50]
+ vpbroadcastd m9, [rsp+0x54]
+ vpbroadcastd m11, [rsp+0x58]
+ vpbroadcastd m4, [rsp+0x5c]
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ SWAP m14, m4
+.dy2_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m11
+ pmaddwd m7, m3, m14
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
+ vinserti128 m3, [srcq+r10], 1
+ vinserti128 m4, [srcq+r11], 1
+ vpbroadcastq m5, [srcq+r13]
+ vpbroadcastq m6, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m3, m5, 0xc0
+ vpblendd m4, m6, 0xc0
+ pmaddubsw m3, m15
+ pmaddubsw m4, m10
+ phaddw m3, m4
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, m10
+ phaddw m4, m5
+ psrld m5, m3, 16
+ pslld m6, m4, 16
+ paddw m3, m5
+ paddw m4, m6
+ pblendw m3, m4, 0xaa
+ pmulhrsw m3, m12
+ jmp .dy2_vloop
+.ret:
+ MC_8TAP_SCALED_RET 0
+%undef isprep
+%endmacro
+
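+ ; Scaled bilinear is layered on the 8-tap scaled kernel: t0/t1 preselect a
+ ; fixed subpel_filters offset before tail-calling the matching 8tap_scaled
+ ; entry point.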
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled_8bpc
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, t0d
+ jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%else
+DECLARE_REG_TMP 6, 8
+%endif
+
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+
+BILIN_SCALED_FN put
+PUT_8TAP_SCALED_FN sharp, SHARP, SHARP
+PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+BILIN_SCALED_FN prep
+PREP_8TAP_SCALED_FN sharp, SHARP, SHARP
+PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+
+%macro WARP_V 5 ; dst, 02, 46, 13, 57
+ ; Can be done using gathers, but that's terribly slow on many CPUs
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq xm8, [filterq+myq *8]
+ vinserti128 m8, [filterq+tmp1q*8], 1 ; a e
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+deltaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; b f
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq xm9, [filterq+myq *8]
+ vinserti128 m9, [filterq+tmp1q*8], 1 ; c g
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+gammaq] ; my += gamma
+ shr tmp2d, 10
+ shr tmp1d, 10
+ punpcklwd m8, m0
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; d h
+ punpcklwd m0, m9, m0
+ punpckldq m9, m8, m0
+ punpckhdq m0, m8, m0
+ punpcklbw m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
+ punpckhbw m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
+ pmaddwd m%2, m8
+ pmaddwd m9, m%3
+ punpcklbw m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
+ punpckhbw m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
+ pmaddwd m8, m%4
+ pmaddwd m0, m%5
+ paddd m%2, m9
+ paddd m0, m8
+ paddd m%1, m0, m%2
+%endmacro
+
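+ ; warp_affine 8x8: .h filters one source row horizontally with per-column
+ ; filters (stepped by alpha per column, mx += beta per row), WARP_V filters
+ ; eight such rows vertically (stepped by delta per column, my += gamma per
+ ; row). The 8x8t variant stores 16-bit intermediates, 8x8 stores pixels.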
+cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts
+%if WIN64
+ sub rsp, 0xa0
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main
+.loop:
+ psrad m7, 13
+ psrad m0, 13
+ packssdw m7, m0
+ pmulhrsw m7, m14 ; (x + (1 << 6)) >> 7
+ vpermq m7, m7, q3120
+ mova [tmpq+tsq*0], xm7
+ vextracti128 [tmpq+tsq*2], m7, 1
+ dec r4d
+ jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).end
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main2
+ lea tmpq, [tmpq+tsq*4]
+ jmp .loop
+
+cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
+ beta, filter, tmp1, delta, my, gamma
+%if WIN64
+ sub rsp, 0xa0
+ %assign xmm_regs_used 16
+ %assign stack_size_padded 0xa0
+ %assign stack_offset stack_offset+stack_size_padded
+%endif
+ call .main
+ jmp .start
+.loop:
+ call .main2
+ lea dstq, [dstq+dsq*2]
+.start:
+ psrad m7, 18
+ psrad m0, 18
+ packusdw m7, m0
+ pavgw m7, m11 ; (x + (1 << 10)) >> 11
+ vextracti128 xm0, m7, 1
+ packuswb xm7, xm0
+ pshufd xm7, xm7, q3120
+ movq [dstq+dsq*0], xm7
+ movhps [dstq+dsq*1], xm7
+ dec r4d
+ jg .loop
+.end:
+ RET
+ALIGN function_align
+.main:
+ ; Stack args offset by one (r4m -> r5m etc.) due to call
+%if WIN64
+ mov abcdq, r5m
+ mov mxd, r6m
+ movaps [rsp+stack_offset+0x10], xmm6
+ movaps [rsp+stack_offset+0x20], xmm7
+ movaps [rsp+0x28], xmm8
+ movaps [rsp+0x38], xmm9
+ movaps [rsp+0x48], xmm10
+ movaps [rsp+0x58], xmm11
+ movaps [rsp+0x68], xmm12
+ movaps [rsp+0x78], xmm13
+ movaps [rsp+0x88], xmm14
+ movaps [rsp+0x98], xmm15
+%endif
+ movsx alphad, word [abcdq+2*0]
+ movsx betad, word [abcdq+2*1]
+ mova m12, [warp_8x8_shufA]
+ mova m13, [warp_8x8_shufB]
+ vpbroadcastd m14, [pw_8192]
+ vpbroadcastd m15, [pd_32768]
+ pxor m11, m11
+ lea filterq, [mc_warp_filter2]
+ lea tmp1q, [ssq*3+3]
+ add mxd, 512+(64<<10)
+ lea tmp2d, [alphaq*3]
+ sub srcq, tmp1q ; src -= src_stride*3 + 3
+ sub betad, tmp2d ; beta -= alpha*3
+ mov myd, r7m
+ call .h
+ psrld m1, m0, 16
+ call .h
+ psrld m4, m0, 16
+ call .h
+ pblendw m1, m0, 0xaa ; 02
+ call .h
+ pblendw m4, m0, 0xaa ; 13
+ call .h
+ psrld m2, m1, 16
+ pblendw m2, m0, 0xaa ; 24
+ call .h
+ psrld m5, m4, 16
+ pblendw m5, m0, 0xaa ; 35
+ call .h
+ psrld m3, m2, 16
+ pblendw m3, m0, 0xaa ; 46
+ movsx deltad, word [abcdq+2*2]
+ movsx gammad, word [abcdq+2*3]
+ add myd, 512+(64<<10)
+ mov r4d, 4
+ lea tmp1d, [deltaq*3]
+ sub gammad, tmp1d ; gamma -= delta*3
+.main2:
+ call .h
+ psrld m6, m5, 16
+ pblendw m6, m0, 0xaa ; 57
+ WARP_V 7, 1, 3, 4, 6
+ call .h
+ mova m1, m2
+ mova m2, m3
+ psrld m3, 16
+ pblendw m3, m0, 0xaa ; 68
+ WARP_V 0, 4, 6, 1, 3
+ mova m4, m5
+ mova m5, m6
+ ret
+ALIGN function_align
+.h:
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ vbroadcasti128 m10, [srcq]
+ shr mxd, 10
+ shr tmp1d, 10
+ movq xm8, [filterq+mxq *8]
+ vinserti128 m8, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+alphaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ shr mxd, 10
+ shr tmp1d, 10
+ movq xm9, [filterq+mxq *8]
+ vinserti128 m9, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ shr tmp2d, 10
+ shr tmp1d, 10
+ punpcklqdq m8, m0 ; 0 1 4 5
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1
+ punpcklqdq m9, m0 ; 2 3 6 7
+ pshufb m0, m10, m12
+ pmaddubsw m0, m8
+ pshufb m10, m13
+ pmaddubsw m10, m9
+ add srcq, ssq
+ phaddw m0, m10
+ pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
+ paddd m0, m15 ; rounded 14-bit result in upper 16 bits of dword
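+ ; (pmaddwd by 1<<13 finishes the pairwise add from phaddw and upshifts;
+ ; adding 1<<15 makes the top word of each dword equal (sum + 4) >> 3,
+ ; which is what WARP_V consumes)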
+ ret
+
+%macro BIDIR_FN 1 ; op
+ %1 0
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ cmp hd, 8
+ je .ret
+ %1 2
+ lea dstq, [dstq+strideq*4]
+ vextracti128 xm1, m0, 1
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.ret:
+ RET
+.w8_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq*4]
+.w8:
+ vextracti128 xm1, m0, 1
+ movq [dstq ], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ %1_INC_PTR 4
+ %1 0
+ lea dstq, [dstq+strideq*4]
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq ], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*2], xm0
+ vextracti128 [dstq+stride3q ], m0, 1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ %1_INC_PTR 4
+ %1 0
+ lea dstq, [dstq+strideq*2]
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], m0
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ %1_INC_PTR 4
+ %1 0
+ add dstq, strideq
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq], m0
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+32], m0
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ %1 0
+ add dstq, strideq
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+0*32], m0
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+1*32], m0
+ %1_INC_PTR 8
+ %1 -4
+ vpermq m0, m0, q3120
+ mova [dstq+2*32], m0
+ %1 -2
+ vpermq m0, m0, q3120
+ mova [dstq+3*32], m0
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%macro AVG 1 ; src_offset
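+ ; dst = (tmp1 + tmp2 + 16) >> 5; the pmulhrsw with pw_1024 (1 << 10)
+ ; below performs the +16 bias and >> 5 in a single step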
+ mova m0, [tmp1q+(%1+0)*32]
+ paddw m0, [tmp2q+(%1+0)*32]
+ mova m1, [tmp1q+(%1+1)*32]
+ paddw m1, [tmp2q+(%1+1)*32]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ packuswb m0, m1
+%endmacro
+
+%macro AVG_INC_PTR 1
+ add tmp1q, %1*32
+ add tmp2q, %1*32
+%endmacro
+
+cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg %+ SUFFIX %+ _table
+ lea r6, [avg %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m2, [base+pw_1024]
+ add wq, r6
+ BIDIR_FN AVG
+
+%macro W_AVG 1 ; src_offset
+ ; (a * weight + b * (16 - weight) + 128) >> 8
+ ; = ((a - b) * weight + (b << 4) + 128) >> 8
+ ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
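+ ; worked check of the identities above, e.g. a=100, b=200, weight=10:
+ ; first form: (100*10 + 200*6 + 128) >> 8 = 2328 >> 8 = 9
+ ; third form: ((-100 * ((10-16) << 12)) >> 16) + 100 + 8 = 145, 145 >> 4 = 9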
+ mova m0, [tmp1q+(%1+0)*32]
+ psubw m2, m0, [tmp2q+(%1+0)*32]
+ mova m1, [tmp1q+(%1+1)*32]
+ psubw m3, m1, [tmp2q+(%1+1)*32]
+ pmulhw m2, m4
+ pmulhw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-w_avg %+ SUFFIX %+ _table
+ lea r6, [w_avg %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m4, r6m ; weight
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m5, [base+pw_2048]
+ psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
+ add wq, r6
+ cmp dword r6m, 7
+ jg .weight_gt7
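+ ; weight << 12 only reads back as (weight-16) << 12 for weight >= 8, so
+ ; for weight <= 7 swap tmp1/tmp2 and negate the multiplier, which uses
+ ; the last form of the expansion above instead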
+ mov r6, tmp1q
+ pxor m0, m0
+ mov tmp1q, tmp2q
+ psubw m4, m0, m4 ; -weight
+ mov tmp2q, r6
+.weight_gt7:
+ BIDIR_FN W_AVG
+
+%macro MASK 1 ; src_offset
+ ; (a * m + b * (64 - m) + 512) >> 10
+ ; = ((a - b) * m + (b << 6) + 512) >> 10
+ ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
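+ ; worked check, e.g. a=50, b=150, m=16:
+ ; first form: (50*16 + 150*48 + 512) >> 10 = 8512 >> 10 = 8
+ ; last form: ((100 * (-16 << 10)) >> 16) + 150 + 8 = 133, 133 >> 4 = 8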
+ vpermq m3, [maskq+%1*16], q3120
+ mova m0, [tmp2q+(%1+0)*32]
+ psubw m1, m0, [tmp1q+(%1+0)*32]
+ psubb m3, m4, m3
+ paddw m1, m1 ; (b - a) << 1
+ paddb m3, m3
+ punpcklbw m2, m4, m3 ; -m << 9
+ pmulhw m1, m2
+ paddw m0, m1
+ mova m1, [tmp2q+(%1+1)*32]
+ psubw m2, m1, [tmp1q+(%1+1)*32]
+ paddw m2, m2
+ punpckhbw m3, m4, m3
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%macro MASK_INC_PTR 1
+ add maskq, %1*16
+ add tmp2q, %1*32
+ add tmp1q, %1*32
+%endmacro
+
+cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask %+ SUFFIX %+ _table
+ lea r7, [mask %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m5, [base+pw_2048]
+ pxor m4, m4
+ add wq, r7
+ BIDIR_FN MASK
+
+%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
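+ ; per pixel: m = min(38 + ((|tmp1-tmp2| + 8) >> 8), 64) and
+ ; dst = tmp1 + (((tmp2 - tmp1) * (64 - m)) >> 6), scaled back to pixel
+ ; range by pmulhrsw with pw_2048; %2 returns the 64-m values, pair-summed
+ ; by phaddw, or packed to bytes and flipped back to m in the 4:4:4 case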
+ mova m%1, [tmp1q+32*%3]
+ mova m1, [tmp2q+32*%3]
+ psubw m1, m%1
+ pabsw m%2, m1
+ psubusw m%2, m6, m%2
+ psrlw m%2, 8 ; 64 - m
+ psllw m2, m%2, 10
+ pmulhw m1, m2
+ paddw m%1, m1
+ mova m1, [tmp1q+32*%4]
+ mova m2, [tmp2q+32*%4]
+ psubw m2, m1
+ pabsw m3, m2
+ psubusw m3, m6, m3
+ psrlw m3, 8
+%if %5
+ packuswb m%2, m3
+ psubb m%2, m5, m%2
+ vpermq m%2, m%2, q3120
+%else
+ phaddw m%2, m3
+%endif
+ psllw m3, 10
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m%1, m7
+ pmulhrsw m1, m7
+ packuswb m%1, m1
+%endmacro
+
+cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
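+ ; dst = (dst * (64 - m) + tmp * m + 32) >> 6, via pmaddubsw on
+ ; interleaved (64-m, m) and (dst, tmp) byte pairs and pmulhrsw with pw_512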
+%define base r6-blend_avx2_table
+ lea r6, [blend_avx2_table]
+ tzcnt wd, wm
+ movifnidn maskq, maskmp
+ movifnidn hd, hm
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m4, [base+pb_64]
+ vpbroadcastd m5, [base+pw_512]
+ sub tmpq, maskq
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ movd xm0, [dstq+dsq*0]
+ pinsrd xm0, [dstq+dsq*1], 1
+ vpbroadcastd xm1, [dstq+dsq*2]
+ pinsrd xm1, [dstq+r6 ], 3
+ mova xm6, [maskq]
+ psubb xm3, xm4, xm6
+ punpcklbw xm2, xm3, xm6
+ punpckhbw xm3, xm6
+ mova xm6, [maskq+tmpq]
+ add maskq, 4*4
+ punpcklbw xm0, xm6
+ punpckhbw xm1, xm6
+ pmaddubsw xm0, xm2
+ pmaddubsw xm1, xm3
+ pmulhrsw xm0, xm5
+ pmulhrsw xm1, xm5
+ packuswb xm0, xm1
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ pextrd [dstq+dsq*2], xm0, 2
+ pextrd [dstq+r6 ], xm0, 3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w4
+ RET
+ALIGN function_align
+.w8:
+ movq xm1, [dstq+dsq*0]
+ movhps xm1, [dstq+dsq*1]
+ vpbroadcastq m2, [dstq+dsq*2]
+ vpbroadcastq m3, [dstq+r6 ]
+ mova m0, [maskq]
+ mova m6, [maskq+tmpq]
+ add maskq, 8*4
+ vpblendd m1, m2, 0x30
+ vpblendd m1, m3, 0xc0
+ psubb m3, m4, m0
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ punpcklbw m0, m1, m6
+ punpckhbw m1, m6
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ movq [dstq+dsq*2], xm1
+ movhps [dstq+r6 ], xm1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w8
+ RET
+ALIGN function_align
+.w16:
+ mova m0, [maskq]
+ mova xm1, [dstq+dsq*0]
+ vinserti128 m1, [dstq+dsq*1], 1
+ psubb m3, m4, m0
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ mova m6, [maskq+tmpq]
+ add maskq, 16*2
+ punpcklbw m0, m1, m6
+ punpckhbw m1, m6
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16
+ RET
+ALIGN function_align
+.w32:
+ mova m0, [maskq]
+ mova m1, [dstq]
+ mova m6, [maskq+tmpq]
+ add maskq, 32
+ psubb m3, m4, m0
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ punpcklbw m0, m1, m6
+ punpckhbw m1, m6
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .w32
+ RET
+
+cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_avx2_table
+ lea r5, [blend_v_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r5+wq*4]
+ vpbroadcastd m5, [base+pw_512]
+ add wq, r5
+ add maskq, obmc_masks-blend_v_avx2_table
+ jmp wq
+.w2:
+ vpbroadcastd xm2, [maskq+2*2]
+.w2_s0_loop:
+ movd xm0, [dstq+dsq*0]
+ pinsrw xm0, [dstq+dsq*1], 1
+ movd xm1, [tmpq]
+ add tmpq, 2*2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_s0_loop
+ RET
+ALIGN function_align
+.w4:
+ vpbroadcastq xm2, [maskq+4*2]
+.w4_loop:
+ movd xm0, [dstq+dsq*0]
+ pinsrd xm0, [dstq+dsq*1], 1
+ movq xm1, [tmpq]
+ add tmpq, 4*2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ mova xm3, [maskq+8*2]
+.w8_loop:
+ movq xm0, [dstq+dsq*0]
+ vpbroadcastq xm1, [dstq+dsq*1]
+ mova xm2, [tmpq]
+ add tmpq, 8*2
+ punpcklbw xm0, xm2
+ punpckhbw xm1, xm2
+ pmaddubsw xm0, xm3
+ pmaddubsw xm1, xm3
+ pmulhrsw xm0, xm5
+ pmulhrsw xm1, xm5
+ packuswb xm0, xm1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ vbroadcasti128 m3, [maskq+16*2]
+ vbroadcasti128 m4, [maskq+16*3]
+.w16_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti128 m1, [dstq+dsq*1], 1
+ mova m2, [tmpq]
+ add tmpq, 16*2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ mova xm3, [maskq+16*4]
+ vinserti128 m3, [maskq+16*6], 1
+ mova xm4, [maskq+16*5]
+ vinserti128 m4, [maskq+16*7], 1
+.w32_loop:
+ mova m1, [dstq]
+ mova m2, [tmpq]
+ add tmpq, 32
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .w32_loop
+ RET
+
+cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_h_avx2_table
+ lea r5, [blend_h_avx2_table]
+ mov r6d, wd
+ tzcnt wd, wd
+ mov hd, hm
+ movsxd wq, dword [r5+wq*4]
+ vpbroadcastd m5, [base+pw_512]
+ add wq, r5
+ lea maskq, [base+obmc_masks+hq*2]
+ lea hd, [hq*3]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd xm0, [dstq+dsq*0]
+ pinsrw xm0, [dstq+dsq*1], 1
+ movd xm2, [maskq+hq*2]
+ movd xm1, [tmpq]
+ add tmpq, 2*2
+ punpcklwd xm2, xm2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+ALIGN function_align
+.w4:
+ mova xm3, [blend_shuf]
+.w4_loop:
+ movd xm0, [dstq+dsq*0]
+ pinsrd xm0, [dstq+dsq*1], 1
+ movd xm2, [maskq+hq*2]
+ movq xm1, [tmpq]
+ add tmpq, 4*2
+ pshufb xm2, xm3
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ vbroadcasti128 m4, [blend_shuf]
+ shufpd m4, m4, 0x03
+.w8_loop:
+ vpbroadcastq m1, [dstq+dsq*0]
+ movq xm0, [dstq+dsq*1]
+ vpblendd m0, m1, 0x30
+ vpbroadcastd m3, [maskq+hq*2]
+ movq xm1, [tmpq+8*1]
+ vinserti128 m1, [tmpq+8*0], 1
+ add tmpq, 8*2
+ pshufb m3, m4
+ punpcklbw m0, m1
+ pmaddubsw m0, m3
+ pmulhrsw m0, m5
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movhps [dstq+dsq*0], xm0
+ movq [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ vbroadcasti128 m4, [blend_shuf]
+ shufpd m4, m4, 0x0c
+.w16_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti128 m1, [dstq+dsq*1], 1
+ vpbroadcastd m3, [maskq+hq*2]
+ mova m2, [tmpq]
+ add tmpq, 16*2
+ pshufb m3, m4
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w16_loop
+ RET
+ALIGN function_align
+.w32: ; w32/w64/w128
+ sub dsq, r6
+.w32_loop0:
+ vpbroadcastw m3, [maskq+hq*2]
+ mov wd, r6d
+.w32_loop:
+ mova m1, [dstq]
+ mova m2, [tmpq]
+ add tmpq, 32
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, 32
+ sub wd, 32
+ jg .w32_loop
+ add dstq, dsq
+ inc hq
+ jl .w32_loop0
+ RET
+
+cglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
+ bottomext, rightext
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes
+
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor r12d, r12d
+ lea r10, [ihq-1]
+ cmp yq, ihq
+ cmovs r10, yq
+ test yq, yq
+ cmovs r10, r12
+ imul r10, sstrideq
+ add srcq, r10
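+ ; (branchless clamp: the first cmovs keeps y while y < ih, the second
+ ; zeroes the result when y < 0; the other iclip()s below use the same
+ ; cmov pattern)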
+
+ ; ref += iclip(x, 0, iw - 1)
+ lea r10, [iwq-1]
+ cmp xq, iwq
+ cmovs r10, xq
+ test xq, xq
+ cmovs r10, r12
+ add srcq, r10
+
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ lea bottomextq, [yq+bhq]
+ sub bottomextq, ihq
+ lea r3, [bhq-1]
+ cmovs bottomextq, r12
+
+ DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; top_ext = iclip(-y, 0, bh - 1)
+ neg topextq
+ cmovs topextq, r12
+ cmp bottomextq, bhq
+ cmovns bottomextq, r3
+ cmp topextq, bhq
+ cmovg topextq, r3
+
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ lea rightextq, [xq+bwq]
+ sub rightextq, iwq
+ lea r2, [bwq-1]
+ cmovs rightextq, r12
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; left_ext = iclip(-x, 0, bw - 1)
+ neg leftextq
+ cmovs leftextq, r12
+ cmp rightextq, bwq
+ cmovns rightextq, r2
+ cmp leftextq, bwq
+ cmovns leftextq, r2
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
+ dst, dstride, src, sstride, bottomext, rightext
+
+ ; center_h = bh - top_ext - bottom_ext
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3
+
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+ imul r2, dstrideq
+ add dstq, r2
+ mov r9m, dstq
+
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
+
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+.v_loop_%3:
+%if %1
+ ; left extension
+ xor r3, r3
+ vpbroadcastb m0, [srcq]
+.left_loop_%3:
+ mova [dstq+r3], m0
+ add r3, 32
+ cmp r3, leftextq
+ jl .left_loop_%3
+
+ ; body
+ lea r12, [dstq+leftextq]
+%endif
+ xor r3, r3
+.body_loop_%3:
+ movu m0, [srcq+r3]
+%if %1
+ movu [r12+r3], m0
+%else
+ movu [dstq+r3], m0
+%endif
+ add r3, 32
+ cmp r3, centerwq
+ jl .body_loop_%3
+
+%if %2
+ ; right extension
+%if %1
+ add r12, centerwq
+%else
+ lea r12, [dstq+centerwq]
+%endif
+ xor r3, r3
+ vpbroadcastb m0, [srcq+centerwq-1]
+.right_loop_%3:
+ movu [r12+r3], m0
+ add r3, 32
+ cmp r3, rightextq
+ jl .right_loop_%3
+
+%endif
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+%endmacro
+
+ test leftextq, leftextq
+ jnz .need_left_ext
+ test rightextq, rightextq
+ jnz .need_right_ext
+ v_loop 0, 0, 0
+ jmp .body_done
+
+.need_left_ext:
+ test rightextq, rightextq
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+ ; bottom edge extension
+ test bottomextq, bottomextq
+ jz .top
+ mov srcq, dstq
+ sub srcq, dstrideq
+ xor r1, r1
+.bottom_x_loop:
+ mova m0, [srcq+r1]
+ lea r3, [dstq+r1]
+ mov r4, bottomextq
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .bottom_y_loop
+ add r1, 32
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension
+ test topextq, topextq
+ jz .end
+ mov srcq, r9m
+ mov dstq, dstm
+ xor r1, r1
+.top_x_loop:
+ mova m0, [srcq+r1]
+ lea r3, [dstq+r1]
+ mov r4, topextq
+.top_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .top_y_loop
+ add r1, 32
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
+
+cglobal resize_8bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ vpbroadcastd m5, dxm
+ vpbroadcastd m8, mx0m
+ vpbroadcastd m6, src_wm
+
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
+ LEA r7, $$
+%define base r7-$$
+
+ vpbroadcastd xm3, [base+pw_m256]
+ vpbroadcastd m7, [base+pd_63]
+ vbroadcasti128 m15, [base+pb_8x0_8x8]
+ pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
+ pslld m5, 3 ; dx*8
+ pslld m6, 14
+ paddd m8, m2 ; mx+[0..7]*dx
+ pxor m2, m2
+
+ ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7
+ ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8
+
+.loop_y:
+ xor xd, xd
+ mova m4, m8 ; per-line working version of mx
+
+.loop_x:
+ pmaxsd m0, m4, m2
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ pand m9, m7 ; filter offset (masked)
+
+ ; load source pixels - this ugly code is vpgatherdq emulation since
+ ; directly using vpgatherdq on Haswell is quite a bit slower :(
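+ ; (movq/movhps pairs fill the low lanes of m12/m13, then vinserti128 +
+ ; vpbroadcastq + vpblendd assemble the upper lanes)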
+ movd r8d, xm0
+ pextrd r9d, xm0, 1
+ pextrd r10d, xm0, 2
+ pextrd r11d, xm0, 3
+ vextracti128 xm0, m0, 1
+ movq xm12, [srcq+r8]
+ movq xm13, [srcq+r10]
+ movhps xm12, [srcq+r9]
+ movhps xm13, [srcq+r11]
+ movd r8d, xm0
+ pextrd r9d, xm0, 1
+ pextrd r10d, xm0, 2
+ pextrd r11d, xm0, 3
+ vinserti128 m12, [srcq+r8], 1
+ vinserti128 m13, [srcq+r10], 1
+ vpbroadcastq m10, [srcq+r9]
+ vpbroadcastq m11, [srcq+r11]
+ vpblendd m12, m10, 11000000b
+ vpblendd m13, m11, 11000000b
+
+ ; if no emulation is required, we don't need to shuffle or emulate edges
+ ; this also saves 2 quasi-vpgatherdqs
+ vptest m1, m1
+ jz .filter
+
+ movq r9, xm1
+ pextrq r11, xm1, 1
+ movsxd r8, r9d
+ sar r9, 32
+ movsxd r10, r11d
+ sar r11, 32
+ vextracti128 xm1, m1, 1
+ movq xm14, [base+resize_shuf+4+r8]
+ movq xm0, [base+resize_shuf+4+r10]
+ movhps xm14, [base+resize_shuf+4+r9]
+ movhps xm0, [base+resize_shuf+4+r11]
+ movq r9, xm1
+ pextrq r11, xm1, 1
+ movsxd r8, r9d
+ sar r9, 32
+ movsxd r10, r11d
+ sar r11, 32
+ vinserti128 m14, [base+resize_shuf+4+r8], 1
+ vinserti128 m0, [base+resize_shuf+4+r10], 1
+ vpbroadcastq m10, [base+resize_shuf+4+r9]
+ vpbroadcastq m11, [base+resize_shuf+4+r11]
+ vpblendd m14, m10, 11000000b
+ vpblendd m0, m11, 11000000b
+
+ paddb m14, m15
+ paddb m0, m15
+ pshufb m12, m14
+ pshufb m13, m0
+
+.filter:
+ movd r8d, xm9
+ pextrd r9d, xm9, 1
+ pextrd r10d, xm9, 2
+ pextrd r11d, xm9, 3
+ vextracti128 xm9, m9, 1
+ movq xm10, [base+resize_filter+r8*8]
+ movq xm11, [base+resize_filter+r10*8]
+ movhps xm10, [base+resize_filter+r9*8]
+ movhps xm11, [base+resize_filter+r11*8]
+ movd r8d, xm9
+ pextrd r9d, xm9, 1
+ pextrd r10d, xm9, 2
+ pextrd r11d, xm9, 3
+ vinserti128 m10, [base+resize_filter+r8*8], 1
+ vinserti128 m11, [base+resize_filter+r10*8], 1
+ vpbroadcastq m14, [base+resize_filter+r9*8]
+ vpbroadcastq m1, [base+resize_filter+r11*8]
+ vpblendd m10, m14, 11000000b
+ vpblendd m11, m1, 11000000b
+
+ pmaddubsw m12, m10
+ pmaddubsw m13, m11
+ phaddw m12, m13
+ vextracti128 xm13, m12, 1
+ phaddsw xm12, xm13
+ pmulhrsw xm12, xm3 ; x=(x+64)>>7
+ packuswb xm12, xm12
+ movq [dstq+xq], xm12
+
+ paddd m4, m5
+ add xd, 8
+ cmp xd, dst_wd
+ jl .loop_x
+
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
+
+cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx2_table
+ lea r7, [w_mask_420_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ pmovzxbd m9, [base+deint_shuf4]
+ vpbroadcastd m8, [base+wm_420_sign+r6*4] ; 258 - sign
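+ ; W_MASK leaves pair-summed (64-m) values, so the 2x2-subsampled mask is
+ ; (258 - sign - pair_row0 - pair_row1) >> 2
+ ; = (m00 + m01 + m10 + m11 + 2 - sign) >> 2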
+ add wq, r7
+ W_MASK 0, 4, 0, 1
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ jg .w4_h16
+.w4_end:
+ vextracti128 xm0, m4, 1
+ vpblendd xm1, xm4, xm0, 0x05
+ vpblendd xm4, xm0, 0x0a
+ pshufd xm1, xm1, q2301
+ psubw xm4, xm8, xm4
+ psubw xm4, xm1
+ psrlw xm4, 2
+ packuswb xm4, xm4
+ movq [maskq], xm4
+ RET
+.w4_h16:
+ W_MASK 0, 5, 2, 3
+ lea dstq, [dstq+strideq*4]
+ phaddd m4, m5
+ vextracti128 xm1, m0, 1
+ psubw m4, m8, m4
+ psrlw m4, 2
+ vpermd m4, m9, m4
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ mova [maskq], xm4
+ RET
+.w8_loop:
+ add tmp1q, 2*32
+ add tmp2q, 2*32
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 8
+.w8:
+ vextracti128 xm2, m4, 1
+ vextracti128 xm1, m0, 1
+ psubw xm4, xm8, xm4
+ psubw xm4, xm2
+ psrlw xm4, 2
+ packuswb xm4, xm4
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ movq [maskq], xm4
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ W_MASK 0, 5, 2, 3
+ punpckhqdq m1, m4, m5
+ punpcklqdq m4, m5
+ psubw m1, m8, m1
+ psubw m1, m4
+ psrlw m1, 2
+ vpermq m0, m0, q3120
+ packuswb m1, m1
+ vpermd m1, m9, m1
+ mova [dstq+strideq*2], xm0
+ vextracti128 [dstq+stride3q ], m0, 1
+ mova [maskq], xm1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], m0
+ W_MASK 0, 5, 2, 3
+ psubw m4, m8, m4
+ psubw m4, m5
+ psrlw m4, 2
+ vpermq m0, m0, q3120
+ packuswb m4, m4
+ vpermd m4, m9, m4
+ mova [dstq+strideq*1], m0
+ mova [maskq], xm4
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop_even:
+ psubw m10, m8, m4
+ psubw m11, m8, m5
+ dec hd
+.w64_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ test hd, 1
+ jz .w64_loop_even
+ psubw m4, m10, m4
+ psubw m5, m11, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m9, m4
+ mova [maskq], m4
+ add maskq, 32
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop_even:
+ psubw m12, m8, m4
+ psubw m13, m8, m5
+ dec hd
+.w128_loop:
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ add tmp1q, 8*32
+ add tmp2q, 8*32
+ test hd, 1
+ jz .w128_even
+ psubw m4, m10, m4
+ psubw m5, m11, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m9, m4
+ mova [maskq+32*0], m4
+ jmp .w128_odd
+.w128_even:
+ psubw m10, m8, m4
+ psubw m11, m8, m5
+.w128_odd:
+ W_MASK 0, 4, -4, -3
+ vpermq m0, m0, q3120
+ mova [dstq+32*2], m0
+ W_MASK 0, 5, -2, -1
+ vpermq m0, m0, q3120
+ mova [dstq+32*3], m0
+ test hd, 1
+ jz .w128_loop_even
+ psubw m4, m12, m4
+ psubw m5, m13, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m9, m4
+ mova [maskq+32*1], m4
+ add maskq, 64
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx2_table
+ lea r7, [w_mask_422_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ pxor m9, m9
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ pmovzxbd m10, [base+deint_shuf4]
+ vpbroadcastd m8, [base+wm_422_sign+r6*4] ; 128 - sign
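+ ; each packed byte is a pair sum of (64-m), so psubb from (128 - sign)
+ ; followed by pavgb with zero yields (m0 + m1 + 1 - sign) >> 1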
+ add wq, r7
+ mov maskq, maskmp
+ W_MASK 0, 4, 0, 1
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ jg .w4_h16
+.w4_end:
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ psubb xm5, xm8, xm4
+ pavgb xm5, xm9
+ pshufd xm5, xm5, q3120
+ mova [maskq], xm5
+ RET
+.w4_h16:
+ W_MASK 0, 5, 2, 3
+ lea dstq, [dstq+strideq*4]
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermd m5, m10, m5
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ mova [maskq], m5
+ RET
+.w8_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w8:
+ vextracti128 xm5, m4, 1
+ vextracti128 xm1, m0, 1
+ packuswb xm4, xm5
+ psubb xm5, xm8, xm4
+ pavgb xm5, xm9
+ pshufd xm5, xm5, q3120
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ mova [maskq], xm5
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ W_MASK 0, 5, 2, 3
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+strideq*2], xm0
+ vextracti128 [dstq+stride3q ], m0, 1
+ mova [maskq], m5
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], m0
+ W_MASK 0, 5, 2, 3
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+strideq*1], m0
+ mova [maskq], m5
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+ add maskq, 32
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+32*1], m0
+ mova [maskq], m5
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ add tmp1q, 32*8
+ add tmp2q, 32*8
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+ add maskq, 32*2
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+32*1], m0
+ mova [maskq+32*0], m5
+ W_MASK 0, 4, 4, 5
+ vpermq m0, m0, q3120
+ mova [dstq+32*2], m0
+ W_MASK 0, 5, 6, 7
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+32*3], m0
+ mova [maskq+32*1], m5
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx2_table
+ lea r7, [w_mask_444_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m5, [base+pb_64]
+ vpbroadcastd m7, [base+pw_2048]
+ add wq, r7
+ W_MASK 0, 4, 0, 1, 1
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ mova [maskq+32*0], m4
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ je .w4_end
+ W_MASK 0, 4, 2, 3, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ mova [maskq+32*1], m4
+.w4_end:
+ RET
+.w8_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4, 0, 1, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+.w8:
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ mova [maskq], m4
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4, 0, 1, 1
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [maskq], m4
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4, 0, 1, 1
+ add dstq, strideq
+ add maskq, 32
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq], m0
+ mova [maskq], m4
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4, 0, 1, 1
+ add dstq, strideq
+ add maskq, 32*2
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ mova [maskq+32*0], m4
+ W_MASK 0, 4, 2, 3, 1
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ mova [maskq+32*1], m4
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ add tmp1q, 32*8
+ add tmp2q, 32*8
+ W_MASK 0, 4, 0, 1, 1
+ add dstq, strideq
+ add maskq, 32*4
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ mova [maskq+32*0], m4
+ W_MASK 0, 4, 2, 3, 1
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ mova [maskq+32*1], m4
+ W_MASK 0, 4, 4, 5, 1
+ vpermq m0, m0, q3120
+ mova [dstq+32*2], m0
+ mova [maskq+32*2], m4
+ W_MASK 0, 4, 6, 7, 1
+ vpermq m0, m0, q3120
+ mova [dstq+32*3], m0
+ mova [maskq+32*3], m4
+ dec hd
+ jg .w128_loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm
new file mode 100644
index 0000000000..7897f1decc
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc_avx512.asm
@@ -0,0 +1,4538 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+obmc_masks:
+pw_512: times 2 dw 512
+ ; 2
+ db 45, 19, 64, 0
+ ; 4
+ db 39, 25, 50, 14, 59, 5, 64, 0
+ ; 8
+ db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
+ ; 16
+ db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+ db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
+ ; 32
+ db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+ db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
+ db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
+ db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
+
+warp_8x8_permA: db 4, 5, 6, 7, 16, 17, 18, 19, 5, 6, 7, 8, 17, 18, 19, 20
+ db 6, 7, 8, 9, 18, 19, 20, 21, 7, 8, 9, 10, 19, 20, 21, 22
+ db 8, 9, 10, 11, 20, 21, 22, 23, 9, 10, 11, 12, 21, 22, 23, 24
+ db 10, 11, 12, 13, 22, 23, 24, 25, 11, 12, 13, 14, 23, 24, 25, 26
+warp_8x8_permB: db 0, 1, 2, 3, 20, 21, 22, 23, 1, 2, 3, 4, 21, 22, 23, 24
+ db 2, 3, 4, 5, 22, 23, 24, 25, 3, 4, 5, 6, 23, 24, 25, 26
+ db 4, 5, 6, 7, 24, 25, 26, 27, 5, 6, 7, 8, 25, 26, 27, 28
+ db 6, 7, 8, 9, 26, 27, 28, 29, 7, 8, 9, 10, 27, 28, 29, 30
+warp_8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
+warp_8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
+pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7
+warp_8x8_hpack: db 3, 11, 3, 11, 35, 43, 35, 43
+pd_16384: dd 16384
+pd_262144: dd 262144
+warp_8x8_end: db 0, 4, 16, 20, 32, 36, 48, 52, 2, 6, 18, 22, 34, 38, 50, 54
+warp_8x8t_end: db 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59
+ db 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63
+bidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+wm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31
+ db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63
+ db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
+ db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
+wm_420_perm8: db 1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31
+ db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63
+ db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
+ db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
+wm_420_perm16: db 1, 3, 33, 35, 5, 7, 37, 39, 9, 11, 41, 43, 13, 15, 45, 47
+ db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63
+ db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
+ db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
+wm_420_mask: db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
+ db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127
+ db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
+wm_422_mask: db 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62
+ db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126
+ db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
+wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+ db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
+ db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39
+ db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47
+bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+ db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
+ db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23
+ db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31
+bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
+ db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
+ db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87
+ db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39
+bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
+ db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
+ db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23
+ db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31
+bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7
+ db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15
+ db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
+ db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31
+bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7
+spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+ db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
+ db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
+spel_h_perm16b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+ db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
+ db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42
+ db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50
+spel_h_perm16c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+ db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
+ db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54
+spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+ db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+ db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
+spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+ db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
+ db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26
+ db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34
+spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+ db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
+ db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
+spel_v_perm16: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7
+ db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+ db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
+ db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
+spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39
+ db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
+ db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
+ db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
+spel_hv_perm4a: db 8, 9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23
+ db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31
+spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39
+ db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47
+spel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55
+ db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63
+spel_hv_perm4d: db 18, 19, 0, 1, 22, 23, 4, 5, 26, 27, 8, 9, 30, 31, 12, 13
+ db 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29
+spel_hv_perm8a: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
+ db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
+ db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
+ db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
+spel_hv_perm8b: db 32, 33, 48, 49, 34, 35, 50, 51, 36, 37, 52, 53, 38, 39, 54, 55
+ db 40, 41, 56, 57, 42, 43, 58, 59, 44, 45, 60, 61, 46, 47, 62, 63
+ db 48, 49, 64, 65, 50, 51, 66, 67, 52, 53, 68, 69, 54, 55, 70, 71
+ db 56, 57, 72, 73, 58, 59, 74, 75, 60, 61, 76, 77, 62, 63, 78, 79
+spel_hv_perm8c: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13
+ db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29
+ db 0, 1, 32, 33, 4, 5, 36, 37, 8, 9, 40, 41, 12, 13, 44, 45
+ db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61
+spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55
+ db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63
+spel_hv_perm16a:db 0, 1, 2, 3, 32, 33, 34, 35, 1, 2, 3, 4, 33, 34, 35, 36
+ db 2, 3, 4, 5, 34, 35, 36, 37, 3, 4, 5, 6, 35, 36, 37, 38
+spel_hv_perm16c:db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44
+ db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46
+ db 16, 17, 18, 19, 48, 49, 50, 51, 17, 18, 19, 20, 49, 50, 51, 52
+ db 18, 19, 20, 21, 50, 51, 52, 53, 19, 20, 21, 22, 51, 52, 53, 54
+spel_hv_perm16b:db 4, 5, 6, 7, 36, 37, 38, 39, 5, 6, 7, 8, 37, 38, 39, 40
+ db 6, 7, 8, 9, 38, 39, 40, 41, 7, 8, 9, 10, 39, 40, 41, 42
+ db 12, 13, 14, 15, 44, 45, 46, 47, 13, 14, 15, 16, 45, 46, 47, 48
+ db 14, 15, 16, 17, 46, 47, 48, 49, 15, 16, 17, 18, 47, 48, 49, 50
+spel_hv_perm16d:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8
+ db 2, 3, 4, 5, 3, 4, 5, 6, 6, 7, 8, 9, 7, 8, 9, 10
+ db 8, 9, 10, 11, 9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16
+ db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18
+spel_hv_perm16e:db 4, 5, 6, 7, 5, 6, 7, 8, 8, 9, 10, 11, 9, 10, 11, 12
+ db 6, 7, 8, 9, 7, 8, 9, 10, 10, 11, 12, 13, 11, 12, 13, 14
+ db 12, 13, 14, 15, 13, 14, 15, 16, 16, 17, 18, 19, 17, 18, 19, 20
+ db 14, 15, 16, 17, 15, 16, 17, 18, 18, 19, 20, 21, 19, 20, 21, 22
+spel_hv_end: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
+deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
+subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
+ db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
+bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
+resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+resize_permC: dd 0, 4, 8, 12
+pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7
+
+wm_420_perm64: dq 0xfedcba9876543210
+wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040
+
+pb_8x0_8x8: times 8 db 0
+ times 8 db 8
+pb_127: times 4 db 127
+pw_m128: times 2 dw -128
+pw_m256: times 2 dw -256
+pw_1024: times 2 dw 1024
+pw_2048: times 2 dw 2048
+pw_6903: times 2 dw 6903
+pw_8192: times 2 dw 8192
+pd_32: dd 32
+pd_34: dd 34
+pd_63: dd 63
+pd_512: dd 512
+pd_32768: dd 32768
+
+%define pb_m64 (wm_sign+4)
+%define pb_64 (wm_sign+8)
+%define pd_2 (pd_0to7+8)
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+cextern mc_warp_filter
+cextern resize_filter
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_8bpc_avx512icl.put)
+%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep)
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, avx512icl, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128
+
+SECTION .text
+
+%macro WRAP_YMM 1+
+INIT_YMM cpuname
+ %1
+INIT_ZMM cpuname
+%endmacro
+
+INIT_ZMM avx512icl
+cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+ movifnidn mxyd, r6m ; mx
+ lea r7, [put_avx512icl]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [r7+wq*2+table_offset(put,)]
+ add wq, r7
+ jmp wq
+.put_w2:
+ movzx r6d, word [srcq+ssq*0]
+ movzx r7d, word [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6w
+ mov [dstq+dsq*1], r7w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6d
+ mov [dstq+dsq*1], r7d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6
+ mov [dstq+dsq*1], r7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu xmm0, [srcq+ssq*0]
+ movu xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], xmm0
+ mova [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu ym0, [srcq+ssq*0]
+ movu ym1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], ym0
+ mova [dstq+dsq*1], ym1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+ssq*0+64*0]
+ movu m1, [srcq+ssq*0+64*1]
+ movu m2, [srcq+ssq*1+64*0]
+ movu m3, [srcq+ssq*1+64*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+64*0], m0
+ mova [dstq+dsq*0+64*1], m1
+ mova [dstq+dsq*1+64*0], m2
+ mova [dstq+dsq*1+64*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w128
+ RET
+.h:
+ ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+ ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
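+ ; mx*0xff01 + (16 << 8) packs the word ((16-mx) << 8) | mx, so pmaddubsw
+ ; on the (src[x+1], src[x]) byte pairs produced by the bilin_h shuffles
+ ; gives mx*src[x+1] + (16-mx)*src[x]; pmulhrsw with pw_2048 then does
+ ; the (x + 8) >> 4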
+ imul mxyd, 0xff01
+ vbroadcasti128 m4, [bilin_h_shuf8]
+ add mxyd, 16 << 8
+ vpbroadcastw m5, mxyd
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)]
+ vpbroadcastd m3, [pw_2048]
+ add wq, r7
+ jmp wq
+.h_w2:
+ movd xmm0, [srcq+ssq*0]
+ pinsrd xmm0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xm4
+ pmaddubsw xmm0, xm5
+ pmulhrsw xmm0, xm3
+ packuswb xmm0, xmm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ mova xmm4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xmm0, [srcq+ssq*0]
+ movhps xmm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xmm4
+ pmaddubsw xmm0, xm5
+ pmulhrsw xmm0, xm3
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ vinserti32x4 ym0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+ pmulhrsw ym0, ym3
+ vpmovuswb xm0, ym0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ mova m4, [bilin_h_perm16]
+.h_w16_loop:
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ vpermb m0, m4, m0
+ pmaddubsw m0, m5
+ pmulhrsw m0, m3
+ vpmovuswb ym0, m0
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ movu ym0, [srcq+ssq*0+8*0]
+ vinserti32x8 m0, [srcq+ssq*1+8*0], 1
+ movu ym1, [srcq+ssq*0+8*1]
+ vinserti32x8 m1, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ add srcq, ssq
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ movu m0, [srcq+8*0]
+ movu m2, [srcq+8*1]
+ movu m1, [srcq+8*8]
+ movu m6, [srcq+8*9]
+ add srcq, ssq
+ REPX {pshufb x, m4}, m0, m2, m1, m6
+ REPX {pmaddubsw x, m5}, m0, m2, m1, m6
+ REPX {pmulhrsw x, m3}, m0, m2, m1, m6
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
+ imul mxyd, 0xff01
+ vpbroadcastd m5, [pw_2048]
+ add mxyd, 16 << 8
+ add wq, r7
+ vpbroadcastw m4, mxyd
+ jmp wq
+.v_w2:
+ movd xmm0, [srcq+ssq*0]
+.v_w2_loop:
+ pinsrw xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1
+ lea srcq, [srcq+ssq*2]
+ pinsrw xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1
+ pshuflw xmm1, xmm1, q2301 ; 1 0
+ punpcklbw xmm1, xmm0, xmm1
+ pmaddubsw xmm1, xm4
+ pmulhrsw xmm1, xm5
+ packuswb xmm1, xmm1
+ pextrw [dstq+dsq*0], xmm1, 1
+ pextrw [dstq+dsq*1], xmm1, 0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xmm0, [srcq+ssq*0]
+.v_w4_loop:
+ vpbroadcastd xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xmm2, xmm1, xmm0, 0x01 ; 0 1
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm1, xmm0, 0x02 ; 1 2
+ punpcklbw xmm1, xmm2
+ pmaddubsw xmm1, xm4
+ pmulhrsw xmm1, xm5
+ packuswb xmm1, xmm1
+ movd [dstq+dsq*0], xmm1
+ pextrd [dstq+dsq*1], xmm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xmm0, [srcq+ssq*0]
+.v_w8_loop:
+ movq xmm3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw xmm1, xmm3, xmm0
+ movq xmm0, [srcq+ssq*0]
+ punpcklbw xmm2, xmm0, xmm3
+ pmaddubsw xmm1, xm4
+ pmaddubsw xmm2, xm4
+ pmulhrsw xmm1, xm5
+ pmulhrsw xmm2, xm5
+ packuswb xmm1, xmm2
+ movq [dstq+dsq*0], xmm1
+ movhps [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu xmm0, [srcq+ssq*0]
+.v_w16_loop:
+ vbroadcasti128 ymm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd ymm3, ymm2, ymm0, 0x0f ; 0 1
+ vbroadcasti128 ymm0, [srcq+ssq*0]
+ vpblendd ymm2, ymm2, ymm0, 0xf0 ; 1 2
+ punpcklbw ymm1, ymm2, ymm3
+ punpckhbw ymm2, ymm3
+ pmaddubsw ymm1, ym4
+ pmaddubsw ymm2, ym4
+ pmulhrsw ymm1, ym5
+ pmulhrsw ymm2, ym5
+ packuswb ymm1, ymm2
+ mova [dstq+dsq*0], xmm1
+ vextracti128 [dstq+dsq*1], ymm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ vzeroupper
+ RET
+.v_w32:
+ movu ym0, [srcq+ssq*0]
+ kxnorb k1, k1, k1
+.v_w32_loop:
+ vbroadcasti32x8 m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendmd m3{k1}, m2, m0 ; 0 1
+ vbroadcasti32x8 m0, [srcq+ssq*0]
+ vpblendmd m2{k1}, m0, m2 ; 1 2
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ mova [dstq+dsq*0], ym1
+ vextracti32x8 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ RET
+.v_w64:
+ movu m0, [srcq+ssq*0]
+.v_w64_loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m1, m3, m0
+ punpckhbw m6, m3, m0
+ movu m0, [srcq+ssq*0]
+ pmaddubsw m1, m4
+ pmaddubsw m6, m4
+ punpcklbw m2, m0, m3
+ punpckhbw m7, m0, m3
+ pmaddubsw m2, m4
+ pmaddubsw m7, m4
+ REPX {pmulhrsw x, m5}, m1, m6, m2, m7
+ packuswb m1, m6
+ packuswb m2, m7
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+.v_w128_loop:
+ add srcq, ssq
+ movu m2, [srcq+64*0]
+ movu m3, [srcq+64*1]
+ punpcklbw m6, m2, m0
+ pmaddubsw m6, m4
+ punpckhbw m0, m2, m0
+ pmaddubsw m0, m4
+ punpcklbw m7, m3, m1
+ pmaddubsw m7, m4
+ punpckhbw m1, m3, m1
+ pmaddubsw m1, m4
+ REPX {pmulhrsw x, m5}, m6, m0, m7, m1
+ packuswb m6, m0
+ mova m0, m2
+ packuswb m7, m1
+ mova m1, m3
+ mova [dstq+64*0], m6
+ mova [dstq+64*1], m7
+ add dstq, dsq
+ dec hd
+ jg .v_w128_loop
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+ ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
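+ ; the horizontal pass leaves 4-bit upshifted values; my << 11 combined
+ ; with the paddw doubling makes pmulhw return (diff * my) >> 4, and
+ ; pmulhrsw with pw_2048 performs the final (x + 8) >> 4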
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11 ; can't shift by 12 due to signed overflow
+ vpbroadcastd m7, [pw_2048]
+ add wq, r7
+ vpbroadcastw m6, mxyd
+ jmp wq
+.hv_w2:
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ pshufb xmm0, xm4
+ pmaddubsw xmm0, xm5
+.hv_w2_loop:
+ movd xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pinsrd xmm1, [srcq+ssq*0], 1
+ pshufb xmm1, xm4
+ pmaddubsw xmm1, xm5 ; 1 _ 2 _
+ shufps xmm2, xmm0, xmm1, q1032 ; 0 _ 1 _
+ mova xmm0, xmm1
+ psubw xmm1, xmm2
+ paddw xmm1, xmm1
+ pmulhw xmm1, xm6
+ paddw xmm1, xmm2
+ pmulhrsw xmm1, xm7
+ packuswb xmm1, xmm1
+ pextrw [dstq+dsq*0], xmm1, 0
+ pextrw [dstq+dsq*1], xmm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova xmm4, [bilin_h_shuf4]
+ movddup xmm0, [srcq+ssq*0]
+ pshufb xmm0, xmm4
+ pmaddubsw xmm0, xm5
+.hv_w4_loop:
+ movq xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm1, [srcq+ssq*0]
+ pshufb xmm1, xmm4
+ pmaddubsw xmm1, xm5 ; 1 2
+ shufps xmm2, xmm0, xmm1, q1032 ; 0 1
+ mova xmm0, xmm1
+ psubw xmm1, xmm2
+ paddw xmm1, xmm1
+ pmulhw xmm1, xm6
+ paddw xmm1, xmm2
+ pmulhrsw xmm1, xm7
+ packuswb xmm1, xmm1
+ movd [dstq+dsq*0], xmm1
+ pextrd [dstq+dsq*1], xmm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti128 ym0, [srcq+ssq*0]
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 ym1, [srcq+ssq*0], 1
+ pshufb ym1, ym4
+ pmaddubsw ym1, ym5 ; 1 2
+ valignq ym2, ym1, ym0, 2
+ mova ym0, ym1
+ psubw ym1, ym2
+ paddw ym1, ym1
+ pmulhw ym1, ym6
+ paddw ym1, ym2
+ pmulhrsw ym1, ym7
+ vpmovuswb xm1, ym1
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ vbroadcasti32x8 m0, [srcq+ssq*0]
+ mova m4, [bilin_h_perm16]
+ vpermb m0, m4, m0
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu ym1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m1, [srcq+ssq*0], 1
+ vpermb m1, m4, m1
+ pmaddubsw m1, m5 ; 1 2
+ valignq m2, m1, m0, 4 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ vpmovuswb ym1, m1
+ mova [dstq+dsq*0], xm1
+ vextracti32x4 [dstq+dsq*1], ym1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+ssq*0]
+ pmovzxbq m8, [pb_02461357]
+ pmaddubsw m0, m5
+.hv_w32_loop:
+ vpermb m2, m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpermb m3, m4, [srcq+ssq*0]
+ pmaddubsw m2, m5
+ psubw m1, m2, m0
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m0
+ pmaddubsw m0, m3, m5
+ psubw m3, m0, m2
+ paddw m3, m3
+ pmulhw m3, m6
+ paddw m3, m2
+ pmulhrsw m1, m7
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ vpermq m1, m8, m1
+ mova [dstq+dsq*0], ym1
+ vextracti32x8 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w32_loop
+ RET
+.hv_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w64_loop:
+ add srcq, ssq
+ movu m2, [srcq+8*0]
+ movu m3, [srcq+8*1]
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ psubw m8, m2, m0
+ psubw m9, m3, m1
+ paddw m8, m8
+ pmulhw m8, m6
+ paddw m9, m9
+ pmulhw m9, m6
+ paddw m8, m0
+ pmulhrsw m8, m7
+ paddw m9, m1
+ pmulhrsw m9, m7
+ mova m0, m2
+ mova m1, m3
+ packuswb m8, m9
+ mova [dstq], m8
+ add dstq, dsq
+ dec hd
+ jg .hv_w64_loop
+ RET
+.hv_w128:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ movu m2, [srcq+8*8]
+ movu m3, [srcq+8*9]
+ REPX {pshufb x, m4}, m0, m1, m2, m3
+ REPX {pmaddubsw x, m5}, m0, m1, m2, m3
+.hv_w128_loop:
+ add srcq, ssq
+ movu m8, [srcq+8*0]
+ movu m9, [srcq+8*1]
+ movu m10, [srcq+8*8]
+ movu m11, [srcq+8*9]
+ REPX {pshufb x, m4}, m8, m9, m10, m11
+ REPX {pmaddubsw x, m5}, m8, m9, m10, m11
+ psubw m12, m8, m0
+ psubw m13, m9, m1
+ psubw m14, m10, m2
+ psubw m15, m11, m3
+ paddw m12, m12
+ pmulhw m12, m6
+ paddw m13, m13
+ pmulhw m13, m6
+ paddw m14, m14
+ pmulhw m14, m6
+ paddw m15, m15
+ pmulhw m15, m6
+ paddw m12, m0
+ pmulhrsw m12, m7
+ paddw m13, m1
+ pmulhrsw m13, m7
+ paddw m14, m2
+ pmulhrsw m14, m7
+ paddw m15, m3
+ pmulhrsw m15, m7
+ mova m0, m8
+ mova m1, m9
+ mova m2, m10
+ mova m3, m11
+ packuswb m12, m13
+ packuswb m14, m15
+ mova [dstq+64*0], m12
+ mova [dstq+64*1], m14
+ add dstq, dsq
+ dec hd
+ jg .hv_w128_loop
+ RET
+
+DECLARE_REG_TMP 3, 5, 6
+
+cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea t2, [prep_avx512icl]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [t2+wq*2+table_offset(prep,)]
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movd xmm0, [srcq+strideq*0]
+ pinsrd xmm0, [srcq+strideq*1], 1
+ pinsrd xmm0, [srcq+strideq*2], 2
+ pinsrd xmm0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw ym0, xmm0
+ psllw ym0, 4
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movq xmm0, [srcq+strideq*0]
+ movq xmm1, [srcq+strideq*1]
+ vinserti128 ym0, ymm0, [srcq+strideq*2], 1
+ vinserti128 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq ym0, ym1
+ pmovzxbw m0, ym0
+ psllw m0, 4
+ mova [tmpq], m0
+ add tmpq, 32*2
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movu xmm0, [srcq+strideq*0]
+ vinserti128 ym0, ymm0, [srcq+strideq*1], 1
+ movu xmm1, [srcq+strideq*2]
+ vinserti128 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw m0, ym0
+ pmovzxbw m1, ym1
+ psllw m0, 4
+ psllw m1, 4
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 32*4
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmovzxbw m0, [srcq+strideq*0]
+ pmovzxbw m1, [srcq+strideq*1]
+ pmovzxbw m2, [srcq+strideq*2]
+ pmovzxbw m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ REPX {psllw x, 4}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 4
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmovzxbw m0, [srcq+strideq*0+32*0]
+ pmovzxbw m1, [srcq+strideq*0+32*1]
+ pmovzxbw m2, [srcq+strideq*1+32*0]
+ pmovzxbw m3, [srcq+strideq*1+32*1]
+ lea srcq, [srcq+strideq*2]
+ REPX {psllw x, 4}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 2
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmovzxbw m0, [srcq+32*0]
+ pmovzxbw m1, [srcq+32*1]
+ pmovzxbw m2, [srcq+32*2]
+ pmovzxbw m3, [srcq+32*3]
+ REPX {psllw x, 4}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ add srcq, strideq
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+ ; = (16 - mx) * src[x] + mx * src[x + 1]
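+ ; Roughly, per output sample (illustrative C): mxyd packs (16 - mx) and mx
+ ; into one word so a single pmaddubsw forms both products:
+ ;   tmp[x] = (16 - mx) * src[x] + mx * src[x + 1];   // kept as 16x-scaled int16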
+ imul mxyd, 0xff01
+ add mxyd, 16 << 8
+ vpbroadcastw m5, mxyd
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ vbroadcasti32x4 ym4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xmm0, [srcq+strideq*0]
+ movq xmm1, [srcq+strideq*1]
+ vinserti32x4 ym0, ymm0, [srcq+strideq*2], 1
+ vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq ym0, ym1
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ vbroadcasti32x4 m4, [bilin_h_shuf8]
+.h_w8_loop:
+ movu xmm0, [srcq+strideq*0]
+ vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1
+ vinserti32x4 m0, [srcq+strideq*2], 2
+ vinserti32x4 m0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ mova m4, [bilin_h_perm16]
+.h_w16_loop:
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vpermb m0, m4, m0
+ vpermb m1, m4, m1
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+ mova m4, [bilin_h_perm32]
+.h_w32_loop:
+ vpermb m0, m4, [srcq+strideq*0]
+ vpermb m1, m4, [srcq+strideq*1]
+ vpermb m2, m4, [srcq+strideq*2]
+ vpermb m3, m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 4
+ jg .h_w32_loop
+ RET
+.h_w64:
+ mova m4, [bilin_h_perm32]
+.h_w64_loop:
+ vpermb m0, m4, [srcq+strideq*0+32*0]
+ vpermb m1, m4, [srcq+strideq*0+32*1]
+ vpermb m2, m4, [srcq+strideq*1+32*0]
+ vpermb m3, m4, [srcq+strideq*1+32*1]
+ lea srcq, [srcq+strideq*2]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 2
+ jg .h_w64_loop
+ RET
+.h_w128:
+ mova m4, [bilin_h_perm32]
+.h_w128_loop:
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ vpermb m2, m4, [srcq+32*2]
+ vpermb m3, m4, [srcq+32*3]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ add srcq, strideq
+ dec hd
+ jg .h_w128_loop
+ RET
+.v:
+ WIN64_SPILL_XMM 7
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
+ imul mxyd, 0xff01
+ add mxyd, 16 << 8
+ add wq, t2
+ lea stride3q, [strideq*3]
+ vpbroadcastw m6, mxyd
+ jmp wq
+.v_w4:
+ vpbroadcastd xm0, [srcq+strideq*0]
+ mov r3d, 0x29
+ vbroadcasti32x4 ym3, [bilin_v_shuf4]
+ kmovb k1, r3d
+.v_w4_loop:
+ vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____
+ vpbroadcastd ym2, [srcq+strideq*2]
+ vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastd ym0, [srcq+strideq*0]
+ punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_
+ pshufb ym2, ym3
+ pmaddubsw ym2, ym6
+ mova [tmpq], ym2
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ mova m5, [bilin_v_perm8]
+ vbroadcasti32x4 ym0, [srcq+strideq*0]
+.v_w8_loop:
+ vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
+ vpbroadcastq ym0, [srcq+strideq*2]
+ vinserti32x4 m1, [srcq+stride3q ], 2
+ lea srcq, [srcq+strideq*4]
+ vinserti32x4 ym0, [srcq+strideq*0], 0
+ vpermt2b m1, m5, m0
+ pmaddubsw m1, m6
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ mova m5, [bilin_v_perm16]
+ movu xm0, [srcq+strideq*0]
+.v_w16_loop:
+ movu xm2, [srcq+strideq*2]
+ vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
+ vpermt2b m1, m5, m2
+ vinserti32x4 ym2, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ movu xm0, [srcq+strideq*0]
+ vpermt2b m2, m5, m0
+ pmaddubsw m1, m6
+ pmaddubsw m2, m6
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ mova m5, [bilin_v_perm32]
+ movu ym0, [srcq+strideq*0]
+.v_w32_loop:
+ movu ym2, [srcq+strideq*1]
+ movu ym3, [srcq+strideq*2]
+ movu ym4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpermt2b m0, m5, m2
+ vpermt2b m2, m5, m3
+ vpermt2b m3, m5, m4
+ pmaddubsw m1, m0, m6
+ movu ym0, [srcq+strideq*0]
+ vpermt2b m4, m5, m0
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ pmaddubsw m4, m6
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ mova [tmpq+64*2], m3
+ mova [tmpq+64*3], m4
+ add tmpq, 64*4
+ sub hd, 4
+ jg .v_w32_loop
+ RET
+.v_w64:
+ mova m5, [bilin_v_perm64]
+ vpermq m0, m5, [srcq+strideq*0]
+.v_w64_loop:
+ vpermq m1, m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m1, m0
+ punpckhbw m2, m1, m0
+ vpermq m0, m5, [srcq+strideq*0]
+ punpcklbw m3, m0, m1
+ punpckhbw m1, m0, m1
+ pmaddubsw m4, m6
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ pmaddubsw m1, m6
+ mova [tmpq+64*0], m4
+ mova [tmpq+64*1], m2
+ mova [tmpq+64*2], m3
+ mova [tmpq+64*3], m1
+ add tmpq, 64*4
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ mova m5, [bilin_v_perm64]
+ vpermq m0, m5, [srcq+strideq*0+ 0]
+ vpermq m1, m5, [srcq+strideq*0+64]
+.v_w128_loop:
+ vpermq m2, m5, [srcq+strideq*1+ 0]
+ vpermq m3, m5, [srcq+strideq*1+64]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m2, m0
+ punpckhbw m0, m2, m0
+ pmaddubsw m4, m6
+ pmaddubsw m0, m6
+ mova [tmpq+64*0], m4
+ mova [tmpq+64*1], m0
+ punpcklbw m4, m3, m1
+ punpckhbw m1, m3, m1
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ mova [tmpq+64*2], m4
+ mova [tmpq+64*3], m1
+ vpermq m0, m5, [srcq+strideq*0+ 0]
+ vpermq m1, m5, [srcq+strideq*0+64]
+ punpcklbw m4, m0, m2
+ punpckhbw m2, m0, m2
+ pmaddubsw m4, m6
+ pmaddubsw m2, m6
+ mova [tmpq+64*4], m4
+ mova [tmpq+64*5], m2
+ punpcklbw m4, m1, m3
+ punpckhbw m3, m1, m3
+ pmaddubsw m4, m6
+ pmaddubsw m3, m6
+ mova [tmpq+64*6], m4
+ mova [tmpq+64*7], m3
+ add tmpq, 64*8
+ sub hd, 2
+ jg .v_w128_loop
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+ ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
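+ ; Roughly, per output sample (illustrative C; h[] names the 16x-scaled
+ ; output of the horizontal pass):
+ ;   int d  = h[x + h_stride] - h[x];
+ ;   tmp[x] = h[x] + ((my * d + 8) >> 4);   // pmulhrsw with my << 11, stays int16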
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 7
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
+ vpbroadcastw m6, mxyd
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.hv_w4:
+ vbroadcasti32x4 ym4, [bilin_h_shuf4]
+ vpbroadcastq ym0, [srcq+strideq*0]
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+.hv_w4_loop:
+ movq xmm1, [srcq+strideq*1]
+ movq xmm2, [srcq+strideq*2]
+ vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vinserti32x4 ym2, ymm2, [srcq+strideq*0], 1
+ punpcklqdq ym1, ym2
+ pshufb ym1, ym4
+ pmaddubsw ym1, ym5 ; 1 2 3 4
+ valignq ym2, ym1, ym0, 3 ; 0 1 2 3
+ mova ym0, ym1
+ psubw ym1, ym2
+ pmulhrsw ym1, ym6
+ paddw ym1, ym2
+ mova [tmpq], ym1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti32x4 m4, [bilin_h_shuf8]
+ vbroadcasti32x4 m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu xmm1, [srcq+strideq*1]
+ vinserti128 ym1, ymm1, [srcq+strideq*2], 1
+ vinserti128 m1, [srcq+stride3q ], 2
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m1, [srcq+strideq*0], 3
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2 3 4
+ valignq m2, m1, m0, 6 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ mova m4, [bilin_h_perm16]
+ vbroadcasti32x8 m0, [srcq+strideq*0]
+ vpermb m0, m4, m0
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu ym1, [srcq+strideq*1]
+ vinserti32x8 m1, [srcq+strideq*2], 1
+ movu ym2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti32x8 m2, [srcq+strideq*0], 1
+ vpermb m1, m4, m1
+ vpermb m2, m4, m2
+ pmaddubsw m1, m5 ; 1 2
+ vshufi32x4 m3, m0, m1, q1032 ; 0 1
+ pmaddubsw m0, m2, m5 ; 3 4
+ vshufi32x4 m2, m1, m0, q1032 ; 2 3
+ psubw m1, m3
+ pmulhrsw m1, m6
+ paddw m1, m3
+ psubw m3, m0, m2
+ pmulhrsw m3, m6
+ paddw m3, m2
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m3
+ add tmpq, 64*2
+ sub hd, 4
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+strideq*0]
+ pmaddubsw m0, m5
+.hv_w32_loop:
+ vpermb m1, m4, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermb m2, m4, [srcq+strideq*0]
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+64*0], m3
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 2
+ jg .hv_w32_loop
+ RET
+.hv_w64:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w64_loop:
+ add srcq, strideq
+ vpermb m2, m4, [srcq+32*0]
+ vpermb m3, m4, [srcq+32*1]
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ psubw m7, m2, m0
+ psubw m8, m3, m1
+ pmulhrsw m7, m6
+ pmulhrsw m8, m6
+ paddw m7, m0
+ mova m0, m2
+ paddw m8, m1
+ mova m1, m3
+ mova [tmpq+64*0], m7
+ mova [tmpq+64*1], m8
+ add tmpq, 64*2
+ dec hd
+ jg .hv_w64_loop
+ RET
+.hv_w128:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ vpermb m2, m4, [srcq+32*2]
+ vpermb m3, m4, [srcq+32*3]
+ REPX {pmaddubsw x, m5}, m0, m1, m2, m3
+.hv_w128_loop:
+ add srcq, strideq
+ vpermb m7, m4, [srcq+32*0]
+ vpermb m8, m4, [srcq+32*1]
+ vpermb m9, m4, [srcq+32*2]
+ vpermb m10, m4, [srcq+32*3]
+ REPX {pmaddubsw x, m5}, m7, m8, m9, m10
+ psubw m11, m7, m0
+ psubw m12, m8, m1
+ psubw m13, m9, m2
+ psubw m14, m10, m3
+ REPX {pmulhrsw x, m6}, m11, m12, m13, m14
+ paddw m11, m0
+ mova m0, m7
+ paddw m12, m1
+ mova m1, m8
+ paddw m13, m2
+ mova m2, m9
+ paddw m14, m3
+ mova m3, m10
+ mova [tmpq+64*0], m11
+ mova [tmpq+64*1], m12
+ mova [tmpq+64*2], m13
+ mova [tmpq+64*3], m14
+ add tmpq, 64*4
+ dec hd
+ jg .hv_w128_loop
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
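+ ; Each FILTER_* value packs two row offsets into that table: the high 16 bits
+ ; select the 8-tap coefficient set, the low 16 bits the 4-tap set used for
+ ; small blocks (sharp reuses the regular 4-tap set). Both are multiples of 15
+ ; because every set stores the 15 fractional positions, 8 taps each.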
+
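+ ; FN emits one small entry point per (type_h, type_v) combination: it loads
+ ; the packed FILTER_* constants into t0d/t1d and jumps to the shared body
+ ; (the regular/regular variant is declared last, so it simply falls through).
+ ; The body then adds t0d/t1d to mx/my replicated via imul 0x010101: the
+ ; 8-tap row index ends up in the high word, the 4-tap row index in the low
+ ; byte, and the raw fractional position in bits 8-11 for the 0xf00 tests.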
+%macro FN 4 ; fn, type, type_h, type_v
+cglobal %1_%2_8bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+%endif
+%endmacro
+
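+ ; Each output pixel is an 8-tap dot product split into two vpdpbusd 4-byte
+ ; dot products over the overlapping pixel groups produced by the three
+ ; shuffle patterns. Roughly (illustrative C, full-width path):
+ ;   int sum = 34;                              // pd_34 bias: 2 + (8 << 2)
+ ;   for (int k = 0; k < 8; k++)
+ ;       sum += filter[k] * src[x + k - 3];
+ ;   pel[x] = clip_to_u8(sum >> 6);             // packusdw, psrlw 6, packed later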
+%macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb
+%if %5
+ vpermb m%2, m6, m%1
+ vpermb m%3, m7, m%1
+ vpermb m%4, m8, m%1
+%else
+%if %2 < %4 ; reuse a previous value if possible
+ pshufb m%2, m%1, m6
+%endif
+ pshufb m%3, m%1, m7
+ pshufb m%4, m%1, m8
+%endif
+ mova m%1, m5
+ vpdpbusd m%1, m%2, m9
+ mova m%2, m5
+ vpdpbusd m%2, m%3, m9
+ vpdpbusd m%1, m%3, m10
+ vpdpbusd m%2, m%4, m10
+ packusdw m%1, m%2
+ psrlw m%1, 6
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+%define base r8-put_avx512icl
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx512icl]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+ lea r6, [ssq*3]
+ lea r7, [dsq*3]
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [pd_34] ; 2 + (8 << 2)
+ WIN64_SPILL_XMM 11
+ cmp wd, 4
+ jl .h_w2
+ vbroadcasti128 m6, [subpel_h_shufA]
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m7, [subpel_h_shufB]
+ vbroadcasti128 m8, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
+ vpbroadcastd m9, [base+mxq*8+subpel_filters+0]
+ vpbroadcastd m10, [base+mxq*8+subpel_filters+4]
+ add wq, r8
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ dec srcq
+ mova xmm4, [subpel_h_shuf4]
+ vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2]
+.h_w2_loop:
+ movq xmm0, [srcq+ssq*0]
+ movhps xmm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xmm4
+ mova xmm1, xm5
+ vpdpbusd xmm1, xmm0, xmm3
+ packssdw xmm0, xmm1, xmm1
+ psraw xmm0, 6
+ packuswb xmm0, xm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2]
+.h_w4_loop:
+ movq xmm0, [srcq+ssq*0]
+ movq xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xm6
+ pshufb xmm1, xm6
+ mova xmm2, xm5
+ vpdpbusd xmm2, xmm0, xmm3
+ mova xmm0, xm5
+ vpdpbusd xmm0, xmm1, xmm3
+ packssdw xmm0, xmm2, xmm0
+ psraw xmm0, 6
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ vinserti32x4 ym0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ WRAP_YMM PUT_8TAP_H 0, 1, 2, 3
+ vpmovuswb xm0, ym0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ mova m6, [spel_h_perm16a]
+ mova m7, [spel_h_perm16b]
+ mova m8, [spel_h_perm16c]
+.h_w16_loop:
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 1, 2, 3, 1
+ vpmovuswb ym0, m0
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ movu ym0, [srcq+ssq*0+8*0]
+ vinserti32x8 m0, [srcq+ssq*1+8*0], 1
+ movu ym1, [srcq+ssq*0+8*1]
+ vinserti32x8 m1, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 4, 3, 2
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ add srcq, ssq
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 4, 3, 2
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ movu m0, [srcq+8*0]
+ movu m2, [srcq+8*1]
+ movu m1, [srcq+8*8]
+ movu m3, [srcq+8*9]
+ add srcq, ssq
+ PUT_8TAP_H 0, 4, 11, 12
+ PUT_8TAP_H 2, 12, 11, 4
+ PUT_8TAP_H 1, 4, 11, 12
+ PUT_8TAP_H 3, 12, 11, 4
+ packuswb m0, m2
+ packuswb m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
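+ ; h < 6 uses the 4-tap coefficient rows (low byte of myd), taller blocks the
+ ; 8-tap rows from the high word; the loops below run as 8-tap either way.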
+ tzcnt r6d, wd
+ movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
+ vpbroadcastd m7, [pw_512]
+ lea myq, [base+subpel_filters+myq*8]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ add r6, r8
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ jmp r6
+.v_w2:
+ movd xmm2, [srcq+ssq*0]
+ pinsrw xmm2, [srcq+ssq*1], 2
+ pinsrw xmm2, [srcq+ssq*2], 4
+ add srcq, ss3q
+ pinsrw xmm2, [srcq+ssq*0], 6 ; 0 1 2 3
+ movd xmm3, [srcq+ssq*1]
+ vpbroadcastd xmm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5
+ vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6
+ palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
+ punpcklbw xmm3, xmm1 ; 45 56
+ punpcklbw xmm1, xmm2, xmm4 ; 01 12
+ punpckhbw xmm2, xmm4 ; 23 34
+.v_w2_loop:
+ pmaddubsw xmm5, xmm1, xm8 ; a0 b0
+ mova xmm1, xmm2
+ pmaddubsw xmm2, xm9 ; a1 b1
+ paddw xmm5, xmm2
+ mova xmm2, xmm3
+ pmaddubsw xmm3, xm10 ; a2 b2
+ paddw xmm5, xmm3
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8
+ punpcklbw xmm3, xmm4 ; 67 78
+ pmaddubsw xmm4, xmm3, xm11 ; a3 b3
+ paddw xmm5, xmm4
+ pmulhrsw xmm5, xm7
+ packuswb xmm5, xmm5
+ pextrw [dstq+dsq*0], xmm5, 0
+ pextrw [dstq+dsq*1], xmm5, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xmm2, [srcq+ssq*0]
+ pinsrd xmm2, [srcq+ssq*1], 1
+ pinsrd xmm2, [srcq+ssq*2], 2
+ add srcq, ss3q
+ pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3
+ movd xmm3, [srcq+ssq*1]
+ vpbroadcastd xmm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5
+ vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6
+ palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
+ punpcklbw xmm3, xmm1 ; 45 56
+ punpcklbw xmm1, xmm2, xmm4 ; 01 12
+ punpckhbw xmm2, xmm4 ; 23 34
+.v_w4_loop:
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw xmm5, xmm1, xm8 ; a0 b0
+ mova xmm1, xmm2
+ pmaddubsw xmm2, xm9 ; a1 b1
+ paddw xmm5, xmm2
+ mova xmm2, xmm3
+ pmaddubsw xmm3, xm10 ; a2 b2
+ paddw xmm5, xmm3
+ vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8
+ punpcklbw xmm3, xmm4 ; 67 78
+ pmaddubsw xmm4, xmm3, xm11 ; a3 b3
+ paddw xmm5, xmm4
+ pmulhrsw xmm5, xm7
+ packuswb xmm5, xmm5
+ movd [dstq+dsq*0], xmm5
+ pextrd [dstq+dsq*1], xmm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xmm1, [srcq+ssq*0]
+ vpbroadcastq ymm0, [srcq+ssq*1]
+ vpbroadcastq ymm2, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq ymm5, [srcq+ssq*0]
+ vpbroadcastq ymm3, [srcq+ssq*1]
+ vpbroadcastq ymm4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpblendd ymm1, ymm0, 0x30
+ vpblendd ymm0, ymm2, 0x30
+ punpcklbw ymm1, ymm0 ; 01 12
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm2, ymm5, 0x30
+ vpblendd ymm5, ymm3, 0x30
+ punpcklbw ymm2, ymm5 ; 23 34
+ vpblendd ymm3, ymm4, 0x30
+ vpblendd ymm4, ymm0, 0x30
+ punpcklbw ymm3, ymm4 ; 45 56
+.v_w8_loop:
+ vpbroadcastq ymm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw ymm5, ymm1, ym8 ; a0 b0
+ mova ymm1, ymm2
+ pmaddubsw ymm2, ym9 ; a1 b1
+ paddw ymm5, ymm2
+ mova ymm2, ymm3
+ pmaddubsw ymm3, ym10 ; a2 b2
+ paddw ymm5, ymm3
+ vpblendd ymm3, ymm0, ymm4, 0x30
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm4, ymm4, ymm0, 0x30
+ punpcklbw ymm3, ymm4 ; 67 78
+ pmaddubsw ymm4, ymm3, ym11 ; a3 b3
+ paddw ymm5, ymm4
+ pmulhrsw ymm5, ym7
+ vextracti128 xmm4, ymm5, 1
+ packuswb xmm5, xmm4
+ movq [dstq+dsq*0], xmm5
+ movhps [dstq+dsq*1], xmm5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ vzeroupper
+ RET
+.v_w16:
+ mova m12, [spel_v_perm16]
+ vbroadcasti32x4 m1, [srcq+ssq*0]
+ vbroadcasti32x4 ym4, [srcq+ssq*1]
+ mov r6d, 0x0f
+ vbroadcasti32x4 m2, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti32x4 ym5, [srcq+ssq*0]
+ kmovb k1, r6d
+ vbroadcasti32x4 m3, [srcq+ssq*1]
+ vbroadcasti32x4 ym6, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti32x4 m0, [srcq+ssq*0]
+ vshufpd m1{k1}, m4, m2, 0xcc
+ vshufpd m2{k1}, m5, m3, 0xcc
+ vshufpd m3{k1}, m6, m0, 0xcc
+ vpermb m1, m12, m1 ; 01 12
+ vpermb m2, m12, m2 ; 23 34
+ vpermb m3, m12, m3 ; 45 56
+.v_w16_loop:
+ pmaddubsw m4, m1, m8 ; a0 b0
+ mova m1, m2
+ pmaddubsw m5, m2, m9 ; a1 b1
+ mova m2, m3
+ pmaddubsw m6, m3, m10 ; a2 b2
+ mova m3, m0
+ paddw m4, m5
+ vbroadcasti32x4 ym5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m0, [srcq+ssq*0]
+ vshufpd m3{k1}, m5, m0, 0xcc
+ vpermb m3, m12, m3 ; 67 78
+ pmaddubsw m5, m3, m11 ; a3 b3
+ paddw m4, m6
+ paddw m4, m5
+ pmulhrsw m4, m7
+ vextracti32x8 ym5, m4, 1
+ packuswb ym4, ym5
+ mova [dstq+dsq*0], xm4
+ vextracti32x4 [dstq+dsq*1], ym4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+ mova m12, [spel_v_perm32]
+ pmovzxbq m14, [pb_02461357]
+ vpshrdw m13, m12, m12, 8
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ vpermb m1, m12, m0 ; 01
+ vinserti32x8 m0, [srcq+ssq*2], 0
+ add srcq, ss3q
+ vpermb m2, m13, m0 ; 12
+ vinserti32x8 m0, [srcq+ssq*0], 1
+ vpermb m3, m12, m0 ; 23
+ vinserti32x8 m0, [srcq+ssq*1], 0
+ vpermb m4, m13, m0 ; 34
+ vinserti32x8 m0, [srcq+ssq*2], 1
+ add srcq, ss3q
+ vpermb m5, m12, m0 ; 45
+ vinserti32x8 m0, [srcq+ssq*0], 0
+ vpermb m6, m13, m0 ; 56
+.v_w32_loop:
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m15, m1, m8
+ mova m1, m3
+ pmaddubsw m16, m2, m8
+ mova m2, m4
+ pmaddubsw m17, m3, m9
+ mova m3, m5
+ pmaddubsw m18, m4, m9
+ mova m4, m6
+ pmaddubsw m19, m5, m10
+ vpermb m5, m12, m0 ; 67
+ vinserti32x8 m0, [srcq+ssq*0], 0
+ pmaddubsw m20, m6, m10
+ vpermb m6, m13, m0 ; 78
+ paddw m15, m17
+ pmaddubsw m17, m5, m11
+ paddw m16, m18
+ pmaddubsw m18, m6, m11
+ paddw m15, m19
+ paddw m16, m20
+ paddw m15, m17
+ paddw m16, m18
+ pmulhrsw m15, m7
+ pmulhrsw m16, m7
+ packuswb m15, m16
+ vpermq m15, m14, m15
+ mova [dstq+dsq*0], ym15
+ vextracti32x8 [dstq+dsq*1], m15, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ vzeroupper
+ RET
+.v_w64:
+.v_w128:
+ lea r6d, [hq+wq*4-256]
+ mov r4, srcq
+ mov r7, dstq
+.v_loop0:
+ movu m2, [srcq+ssq*0]
+ movu m4, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ add srcq, ss3q
+ movu m13, [srcq+ssq*0]
+ movu m15, [srcq+ssq*1]
+ movu m17, [srcq+ssq*2]
+ add srcq, ss3q
+ movu m0, [srcq+ssq*0]
+ punpcklbw m1, m2, m4 ; 01l
+ punpckhbw m2, m4 ; 01h
+ punpcklbw m3, m4, m6 ; 12l
+ punpckhbw m4, m6 ; 12h
+ punpcklbw m5, m6, m13 ; 23l
+ punpckhbw m6, m13 ; 23h
+ punpcklbw m12, m13, m15 ; 34l
+ punpckhbw m13, m15 ; 34h
+ punpcklbw m14, m15, m17 ; 45l
+ punpckhbw m15, m17 ; 45h
+ punpcklbw m16, m17, m0 ; 56l
+ punpckhbw m17, m0 ; 56h
+.v_loop:
+ pmaddubsw m18, m1, m8 ; a0l
+ mova m1, m5
+ pmaddubsw m19, m2, m8 ; a0h
+ mova m2, m6
+ pmaddubsw m20, m3, m8 ; b0l
+ mova m3, m12
+ pmaddubsw m21, m4, m8 ; b0h
+ mova m4, m13
+ pmaddubsw m5, m9 ; a1l
+ pmaddubsw m6, m9 ; a1h
+ pmaddubsw m12, m9 ; b1l
+ pmaddubsw m13, m9 ; b1h
+ paddw m18, m5
+ mova m5, m14
+ pmaddubsw m14, m10 ; a2l
+ paddw m19, m6
+ mova m6, m15
+ pmaddubsw m15, m10 ; a2h
+ paddw m20, m12
+ mova m12, m16
+ pmaddubsw m16, m10 ; b2l
+ paddw m21, m13
+ mova m13, m17
+ pmaddubsw m17, m10 ; b2h
+ paddw m18, m14
+ paddw m19, m15
+ paddw m20, m16
+ paddw m21, m17
+ movu m17, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m14, m0, m17 ; 67l
+ punpckhbw m15, m0, m17 ; 67h
+ pmaddubsw m16, m14, m11 ; a3l
+ pmaddubsw m0, m15, m11 ; a3h
+ paddw m18, m16
+ paddw m19, m0
+ movu m0, [srcq+ssq*0]
+ punpcklbw m16, m17, m0 ; 78l
+ punpckhbw m17, m0 ; 78h
+ pmulhrsw m18, m7
+ pmulhrsw m19, m7
+ packuswb m18, m19
+ mova [dstq+dsq*0], m18
+ pmaddubsw m18, m16, m11 ; b3l
+ pmaddubsw m19, m17, m11 ; b3h
+ paddw m18, m20
+ paddw m19, m21
+ pmulhrsw m18, m7
+ pmulhrsw m19, m7
+ packuswb m18, m19
+ mova [dstq+dsq*1], m18
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_loop
+ add r4, 64
+ add r7, 64
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 256
+ jg .v_loop0
+ vzeroupper
+ RET
+.hv:
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m7, [base+subpel_filters+mxq*8+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastd m8, [pd_2]
+ vpbroadcastq ym0, [base+subpel_filters+myq*8]
+ lea ss3q, [ssq*3]
+ vpbroadcastd ym9, [pd_32768]
+ mov r6, srcq
+ punpcklbw ym0, ym8, ym0
+ sub r6, ss3q
+ psraw ym0, 2 ; << 6
+ mova xm14, [spel_hv_end]
+ pshufd ym10, ym0, q0000
+ pshufd ym11, ym0, q1111
+ pshufd ym12, ym0, q2222
+ pshufd ym13, ym0, q3333
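+ ; The punpcklbw with pd_2 above places each signed 8-bit tap in the high
+ ; byte of a word, so psraw 2 yields exactly 64 * tap (the "<< 6"); the
+ ; vertical pass can then run as vpdpwssd word dot products.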
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 ym6, [subpel_h_shuf4]
+ movq xmm2, [r6+ssq*0]
+ movhps xmm2, [r6+ssq*1]
+ movq xmm0, [r6+ssq*2]
+ movhps xmm0, [srcq+ssq*0]
+ vpbroadcastq ymm3, [srcq+ssq*1]
+ vpbroadcastq ymm4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq ymm1, [srcq+ssq*0]
+ vpblendd ymm2, ymm3, 0x30
+ vpblendd ymm0, ymm1, 0x30 ; 2 3 6 _
+ vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5
+ pshufb ymm2, ym6
+ pshufb ymm0, ym6
+ mova ymm1, ym8
+ vpdpbusd ymm1, ymm2, ym7
+ mova ymm2, ym8
+ vpdpbusd ymm2, ymm0, ym7
+ packssdw ymm2, ymm1, ymm2
+ psraw ymm2, 2
+ vextracti128 xmm3, ymm2, 1
+ palignr xmm4, xmm3, xmm2, 4
+ punpcklwd xmm1, xmm2, xmm4 ; 01 12
+ punpckhwd xmm2, xmm4 ; 23 34
+ pshufd xmm0, xmm3, q2121
+ punpcklwd xmm3, xmm0 ; 45 56
+.hv_w2_loop:
+ movq xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm4, [srcq+ssq*0]
+ mova xmm5, xm9
+ vpdpwssd xmm5, xmm1, xm10 ; a0 b0
+ mova xmm1, xmm2
+ vpdpwssd xmm5, xmm2, xm11 ; a1 b1
+ pshufb xmm4, xm6
+ mova xmm2, xmm3
+ vpdpwssd xmm5, xmm3, xm12 ; a2 b2
+ mova xmm3, xm8
+ vpdpbusd xmm3, xmm4, xm7
+ packssdw xmm4, xmm3, xmm3
+ psraw xmm4, 2
+ palignr xmm3, xmm4, xmm0, 12
+ mova xmm0, xmm4
+ punpcklwd xmm3, xmm4 ; 67 78
+ vpdpwssd xmm5, xmm3, xm13 ; a3 b3
+ packuswb xmm5, xmm5
+ pshufb xmm5, xm14
+ pextrw [dstq+dsq*0], xmm5, 0
+ pextrw [dstq+dsq*1], xmm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ vzeroupper
+ RET
+.hv_w4:
+ movq xmm1, [r6+ssq*0]
+ vpbroadcastq ym2, [r6+ssq*1]
+ vinserti32x4 ym1, ymm1, [r6+ssq*2], 1
+ vinserti32x4 m2, [srcq+ssq*0], 2
+ vinserti32x4 m1, [srcq+ssq*1], 2
+ vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 1 3 5
+ vbroadcasti32x4 m6, [subpel_h_shufA]
+ add srcq, ss3q
+ vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6
+ pshufb m2, m6
+ pshufb m1, m6
+ mova m0, m8
+ vpdpbusd m0, m2, m7
+ mova m4, m8
+ vpdpbusd m4, m1, m7
+ mova ym1, [spel_hv_perm4a]
+ mova ym2, [spel_hv_perm4b]
+ mova ym3, [spel_hv_perm4c]
+ packssdw m0, m4
+ psraw m0, 2 ; _ 0 1 2 3 4 5 6
+ mov r6d, 0x5555
+ vpermb ym1, ym1, ym0 ; 01 12
+ vpermb m2, m2, m0 ; 23 34
+ vpermb m3, m3, m0 ; 45 56
+ kmovw k1, r6d
+ mova ym15, [spel_hv_perm4d]
+.hv_w4_loop:
+ movq xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym4, ymm4, [srcq+ssq*0], 1
+ mova ym5, ym9
+ vpdpwssd ym5, ym1, ym10 ; a0 b0
+ mova ym1, ym2
+ pshufb ym4, ym6
+ mova ym0, ym8
+ vpdpbusd ym0, ym4, ym7
+ vpdpwssd ym5, ym2, ym11 ; a1 b1
+ mova ym2, ym3
+ vpdpwssd ym5, ym3, ym12 ; a2 b2
+ vpsraw ym3{k1}, ym0, 2 ; 7 8
+ vpermb ym3, ym15, ym3 ; 67 78
+ vpdpwssd ym5, ym3, ym13 ; a3 b3
+ packuswb ym5, ym5
+ vpermb ym5, ym14, ym5
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [base+subpel_filters+mxq*8+0]
+ vpbroadcastd m11, [base+subpel_filters+mxq*8+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastd m8, [pd_2]
+ vpbroadcastq m0, [base+subpel_filters+myq*8]
+ vpbroadcastd m9, [pd_32768]
+ punpcklbw m0, m8, m0
+ lea ss3q, [ssq*3]
+ psraw m0, 2 ; << 6
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ cmp wd, 8
+ jne .hv_w16
+ mov r6, srcq
+ sub r6, ss3q
+ movu xmm1, [r6+ssq*0]
+ vinserti128 ymm1, [r6+ssq*1], 1
+ movu xmm2, [srcq+ssq*1]
+ vinserti32x4 m6, zmm1, [r6+ssq*2], 2
+ vinserti128 ymm2, [srcq+ssq*2], 1
+ vinserti32x4 m6, [srcq+ssq*0], 3 ; 0 1 2 3
+ add srcq, ss3q
+ vbroadcasti32x4 m4, [subpel_h_shufA]
+ vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _
+ vbroadcasti32x4 m7, [subpel_h_shufB]
+ vbroadcasti32x4 m17, [subpel_h_shufC]
+ pshufb m1, m6, m4 ; 0 1 2 3 0123
+ mova m2, m8
+ vpdpbusd m2, m1, m10
+ pshufb m5, m6, m7 ; 0 1 2 3 4567
+ mova m1, m8
+ vpdpbusd m1, m5, m10
+ pshufb m4, m0, m4 ; 4 5 6 _ 0123
+ mova m3, m8
+ vpdpbusd m3, m4, m10
+ pshufb m7, m0, m7 ; 4 5 6 _ 4567
+ mova m4, m8
+ vpdpbusd m4, m7, m10
+ pshufb m6, m17
+ vpdpbusd m2, m5, m11
+ vpdpbusd m1, m6, m11
+ pshufb m6, m0, m17
+ vpdpbusd m3, m7, m11
+ vpdpbusd m4, m6, m11
+ mova m5, [spel_hv_perm8a]
+ mova m0, [spel_hv_perm8b]
+ mov r6, 0x55555555ff00
+ packssdw m2, m1
+ packssdw m3, m4
+ mova m18, [spel_hv_perm8c]
+ psraw m2, 2 ; 0 1 2 3
+ psraw m3, 2 ; 4 5 6 _
+ vpermb m1, m5, m2 ; 01 12
+ vbroadcasti32x8 m6, [subpel_h_shufA]
+ kmovq k1, r6
+ vpermt2b m2, m0, m3 ; 23 34
+ vbroadcasti32x8 m7, [subpel_h_shufB]
+ kshiftrq k2, k1, 16
+ mova xm16, [spel_hv_end]
+ vpermb m3, m5, m3 ; 45 56
+.hv_w8_loop:
+ vbroadcasti32x4 ym4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m4{k1}, [srcq+ssq*0]
+ mova m0, m9
+ vpdpwssd m0, m1, m12 ; a0 b0
+ pshufb m1, m4, m6 ; 7 8 0123 4567
+ mova m5, m8
+ vpdpbusd m5, m1, m10
+ pshufb m4, m7 ; 7 8 4567 89ab
+ vpdpwssd m0, m2, m13 ; a1 b1
+ mova m1, m2
+ vpdpbusd m5, m4, m11
+ mova m2, m3
+ vpdpwssd m0, m3, m14 ; a2 b2
+ psraw m3{k2}, m5, 2 ; 75 86
+ vpermb m3, m18, m3 ; 67 78
+ vpdpwssd m0, m3, m15 ; a3 b3
+ packuswb m0, m0
+ vpermb zmm1, m16, m0
+ movq [dstq+dsq*0], xmm1
+ movhps [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ vzeroupper
+ RET
+.hv_w16:
+ movu m7, [spel_hv_perm16a]
+ sub srcq, ss3q
+ mova m20, [spel_hv_perm16b]
+ lea r6d, [wq*2-32]
+ mova m21, [spel_hv_perm16c]
+ mov r4, srcq
+ mov r7, dstq
+ mova ym16, [spel_hv_end16]
+ lea r6d, [hq+r6*8]
+.hv_w16_loop0:
+ movu ym17, [srcq+ssq*0]
+ vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1
+ movu ym18, [srcq+ssq*2]
+ add srcq, ss3q
+ vinserti32x8 m18, [srcq+ssq*0], 1 ; 2 3
+ movu ym19, [srcq+ssq*1]
+ vinserti32x8 m19, [srcq+ssq*2], 1 ; 4 5
+ add srcq, ss3q
+ vpermb m2, m7, m17 ; 0 1 0123 89ab
+ vpermb m0, m20, m17 ; 0 1 4567 cdef
+ vpermb m4, m7, m18 ; 2 3 0123 89ab
+ mova m1, m8
+ vpdpbusd m1, m2, m10
+ vpermb m5, m20, m18 ; 2 3 4567 cdef
+ mova m2, m8
+ vpdpbusd m2, m0, m10
+ vpermb m17, m21, m17 ; 0 1 89ab ghij
+ mova m3, m8
+ vpdpbusd m3, m4, m10
+ vpermb m6, m7, m19 ; 4 5 0123 89ab
+ mova m4, m8
+ vpdpbusd m4, m5, m10
+ vpermb m18, m21, m18 ; 2 3 89ab ghij
+ vpdpbusd m1, m0, m11
+ movu ym0, [srcq+ssq*0] ; 6
+ vpdpbusd m2, m17, m11
+ vpermb m17, m20, m19 ; 4 5 4567 cdef
+ vpdpbusd m3, m5, m11
+ mova m5, m8
+ vpdpbusd m5, m6, m10
+ mova m6, m8
+ vpdpbusd m6, m17, m10
+ vpdpbusd m4, m18, m11
+ mova m18, [spel_hv_perm16d]
+ vpermb m18, m18, m0 ; 6 0145 2367 89cd abef
+ vpdpbusd m5, m17, m11
+ vpermb m19, m21, m19 ; 4 5 89ab ghij
+ mova m17, m8
+ vpdpbusd m17, m18, m10
+ mova m18, [spel_hv_perm16e]
+ vpermb m0, m18, m0 ; 6 4589 67ab cdgh efij
+ packssdw m1, m2 ; 01
+ vpdpbusd m6, m19, m11
+ packssdw m3, m4 ; 23
+ vpdpbusd m17, m0, m11
+ psraw m1, 2
+ packssdw m5, m6 ; 45
+ psraw m3, 2
+ vpshrdd m2, m1, m3, 16 ; 12
+ psraw m5, 2
+ vpshrdd m4, m3, m5, 16 ; 34
+ psraw m17, 2
+ vpshrdd m6, m5, m17, 16 ; 56
+.hv_w16_loop:
+ movu ym18, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m18, [srcq+ssq*0], 1
+ mova m0, m9
+ vpdpwssd m0, m1, m12 ; a0
+ vpermb m1, m7, m18 ; 7 8 0123 89ab
+ mova m17, m9
+ vpdpwssd m17, m2, m12 ; b0
+ vpermb m2, m20, m18 ; 7 8 4567 cdef
+ mova m19, m8
+ vpdpbusd m19, m1, m10
+ vpermb m18, m21, m18
+ mova m1, m8
+ vpdpbusd m1, m2, m10
+ vpdpwssd m0, m3, m13 ; a1
+ vpdpwssd m17, m4, m13 ; b1
+ vpdpbusd m19, m2, m11
+ mova m2, m4
+ vpdpbusd m1, m18, m11
+ mova m4, m6
+ vpdpwssd m0, m5, m14 ; a2
+ vpdpwssd m17, m6, m14 ; b2
+ packssdw m19, m1
+ mova m1, m3
+ mova m3, m5
+ psraw m6, m19, 2 ; 7 8
+ vpshrdd m5, m4, m6, 16 ; 6 7
+ vpdpwssd m17, m6, m15 ; b3
+ vpdpwssd m0, m5, m15 ; a3
+ packuswb m0, m17
+ vpermb zmm1, m16, m0
+ mova [dstq+dsq*0], xmm1
+ vextracti128 [dstq+dsq*1], ymm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+ vzeroupper
+ RET
+
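+ ; Same 8-tap split as PUT_8TAP_H, but the result is kept as a 16-bit
+ ; intermediate. Roughly (illustrative C):
+ ;   int sum = 2;                               // pd_2 rounding bias
+ ;   for (int k = 0; k < 8; k++)
+ ;       sum += filter[k] * src[x + k - 3];
+ ;   tmp[x] = sum >> 2;                         // packssdw + psraw 2, stored as int16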
+%macro PREP_8TAP_H 0
+ vpermb m10, m5, m0
+ vpermb m11, m5, m1
+ vpermb m12, m6, m0
+ vpermb m13, m6, m1
+ vpermb m14, m7, m0
+ vpermb m15, m7, m1
+ mova m0, m4
+ vpdpbusd m0, m10, m8
+ mova m2, m4
+ vpdpbusd m2, m12, m8
+ mova m1, m4
+ vpdpbusd m1, m11, m8
+ mova m3, m4
+ vpdpbusd m3, m13, m8
+ vpdpbusd m0, m12, m9
+ vpdpbusd m2, m14, m9
+ vpdpbusd m1, m13, m9
+ vpdpbusd m3, m15, m9
+ packssdw m0, m2
+ packssdw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep_avx512icl]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ add wq, r7
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m4, [pd_2]
+ WIN64_SPILL_XMM 10
+ cmp wd, 4
+ je .h_w4
+ tzcnt wd, wd
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+0]
+ vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep_avx512icl+4]
+ add wq, r7
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ vbroadcasti128 ym5, [subpel_h_shufA]
+ mov r3d, 0x4
+ dec srcq
+ vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep_avx512icl+2]
+ kmovb k1, r3d
+ lea stride3q, [strideq*3]
+.h_w4_loop:
+ movq xm2, [srcq+strideq*0]
+ movq xm3, [srcq+strideq*1]
+ vpbroadcastq ym2{k1}, [srcq+strideq*2]
+ vpbroadcastq ym3{k1}, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pshufb ym2, ym5
+ pshufb ym3, ym5
+ mova ym0, ym4
+ vpdpbusd ym0, ym2, ym6
+ mova ym1, ym4
+ vpdpbusd ym1, ym3, ym6
+ packssdw ym0, ym1
+ psraw ym0, 2
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ vbroadcasti128 m5, [subpel_h_shufA]
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ lea stride3q, [strideq*3]
+.h_w8_loop:
+ movu xmm3, [srcq+strideq*0]
+ vinserti128 ym3, ymm3, [srcq+strideq*1], 1
+ vinserti128 m3, [srcq+strideq*2], 2
+ vinserti128 m3, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pshufb m1, m3, m5
+ pshufb m2, m3, m6
+ mova m0, m4
+ vpdpbusd m0, m1, m8
+ mova m1, m4
+ vpdpbusd m1, m2, m8
+ pshufb m3, m7
+ vpdpbusd m0, m2, m9
+ vpdpbusd m1, m3, m9
+ packssdw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ mova m5, [spel_h_perm16a]
+ mova m6, [spel_h_perm16b]
+ mova m7, [spel_h_perm16c]
+ lea stride3q, [strideq*3]
+.h_w16_loop:
+ movu ym0, [srcq+strideq*0]
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+ mova m5, [spel_h_perm32a]
+ mova m6, [spel_h_perm32b]
+ mova m7, [spel_h_perm32c]
+.h_w32_loop:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ sub hd, 2
+ jg .h_w32_loop
+ RET
+.h_w64:
+ xor r6d, r6d
+ jmp .h_start
+.h_w128:
+ mov r6, -64*1
+.h_start:
+ mova m5, [spel_h_perm32a]
+ mova m6, [spel_h_perm32b]
+ mova m7, [spel_h_perm32c]
+ sub srcq, r6
+ mov r5, r6
+.h_loop:
+ movu m0, [srcq+r6+32*0]
+ movu m1, [srcq+r6+32*1]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ add r6, 64
+ jle .h_loop
+ add srcq, strideq
+ mov r6, r5
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
+ shr myd, 16 ; Note that the code is 8-tap only; having
+ tzcnt wd, wd
+ cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
+ cmove myd, mxd ; had a negligible effect on performance.
+ ; TODO: Would a 6-tap code path be worth it?
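+ ; i.e. myd = (h == 4) ? my_4tap_row : my_8tap_row, with the 4-tap rows
+ ; padded with zero outer taps so the shared 8-tap loops handle both.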
+ lea myq, [r7+myq*8+subpel_filters-prep_avx512icl]
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)]
+ add wq, r7
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ vpbroadcastd m7, [pw_8192]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ jmp wq
+.v_w4:
+ movd xmm0, [srcq+strideq*0]
+ vpbroadcastd ymm1, [srcq+strideq*2]
+ vpbroadcastd xmm2, [srcq+strideq*1]
+ vpbroadcastd ymm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _
+ vpblendd ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _
+ vpbroadcastd ymm0, [srcq+strideq*0]
+ vpbroadcastd ymm2, [srcq+strideq*1]
+ vpblendd ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _
+ vpbroadcastd ymm0, [srcq+strideq*2]
+ vbroadcasti128 ymm5, [deint_shuf4]
+ vpblendd ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5
+ vpblendd ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5
+ vpblendd ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _
+ punpcklbw ymm1, ymm2, ymm3 ; 01 12 23 34
+ vpblendd ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6
+ punpckhbw ymm2, ymm3 ; 23 34 45 56
+.v_w4_loop:
+ pinsrd xmm0, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastd ymm3, [srcq+strideq*0]
+ vpbroadcastd ymm4, [srcq+strideq*1]
+ vpblendd ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _
+ vpblendd ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _
+ vpbroadcastd ymm0, [srcq+strideq*2]
+ vpblendd ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _
+ pshufb ymm3, ymm5 ; 67 78 89 9a
+ pmaddubsw ymm4, ymm1, ym8
+ vperm2i128 ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78
+ pmaddubsw ymm2, ym9
+ paddw ymm4, ymm2
+ mova ymm2, ymm3
+ pmaddubsw ymm3, ym11
+ paddw ymm3, ymm4
+ pmaddubsw ymm4, ymm1, ym10
+ paddw ymm3, ymm4
+ pmulhrsw ymm3, ym7
+ mova [tmpq], ymm3
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ mov r3d, 0xf044
+ kmovw k1, r3d
+ kshiftrw k2, k1, 8
+ movq xm0, [srcq+strideq*0]
+ vpbroadcastq ym1, [srcq+strideq*1]
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m4, [srcq+strideq*0]
+ vpbroadcastq m5, [srcq+strideq*1]
+ vpbroadcastq m6, [srcq+strideq*2]
+ vmovdqa64 ym0{k1}, ym1
+ vmovdqa64 ym1{k1}, ym2
+ vmovdqa64 m2{k1}, m3
+ vmovdqa64 m3{k1}, m4
+ vmovdqa64 m4{k1}, m5
+ vmovdqa64 m5{k1}, m6
+ punpcklbw ym0, ym1 ; 01 12 __ __
+ punpcklbw m2, m3 ; 23 34 23 34
+ punpcklbw m4, m5 ; 45 56 45 56
+ vmovdqa64 m0{k2}, m2 ; 01 12 23 34
+ vmovdqa64 m2{k2}, m4 ; 23 34 45 56
+.v_w8_loop:
+ vpbroadcastq m1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m5, [srcq+strideq*1]
+ pmaddubsw m14, m0, m8
+ pmaddubsw m15, m2, m9
+ vpblendmq m0{k1}, m6, m1
+ vpblendmq m2{k1}, m1, m3
+ vpbroadcastq m6, [srcq+strideq*2]
+ paddw m14, m15
+ punpcklbw m2, m0, m2 ; 67 78 67 78
+ vpblendmq m12{k1}, m3, m5
+ vpblendmq m13{k1}, m5, m6
+ vpblendmq m0{k2}, m4, m2 ; 45 56 67 78
+ punpcklbw m4, m12, m13 ; 89 9a 89 9a
+ vmovdqa64 m2{k2}, m4 ; 67 78 89 9a
+ pmaddubsw m12, m0, m10
+ pmaddubsw m13, m2, m11
+ paddw m14, m12
+ paddw m14, m13
+ pmulhrsw m14, m7
+ mova [tmpq], m14
+ add tmpq, 64
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ mov r3d, 0xf0
+ kmovb k1, r3d
+ vbroadcasti128 m0, [srcq+strideq*0]
+ vbroadcasti128 m1, [srcq+strideq*1]
+ vbroadcasti128 m2, [srcq+strideq*2]
+ vbroadcasti128 m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vbroadcasti128 m4, [srcq+strideq*0]
+ vbroadcasti128 m5, [srcq+strideq*1]
+ vbroadcasti128 m6, [srcq+strideq*2]
+ vmovdqa64 m0{k1}, m1
+ vmovdqa64 m1{k1}, m2
+ vmovdqa64 m2{k1}, m3
+ vmovdqa64 m3{k1}, m4
+ vmovdqa64 m4{k1}, m5
+ vmovdqa64 m5{k1}, m6
+ shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b
+ shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b
+ shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_--
+ shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_--
+ punpckhbw m2, m0, m1 ; 23a 23b 34a 34b
+ punpcklbw m0, m1 ; 01a 01b 12a 12b
+ punpcklbw m4, m5 ; 45a 45b 56a 56b
+.v_w16_loop:
+ vbroadcasti128 m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vbroadcasti128 m5, [srcq+strideq*0]
+ vpblendmq m1{k1}, m6, m3
+ vmovdqa64 m3{k1}, m5
+ pmaddubsw m12, m0, m8
+ pmaddubsw m13, m2, m8
+ pmaddubsw m14, m2, m9
+ pmaddubsw m15, m4, m9
+ pmaddubsw m0, m4, m10
+ vbroadcasti128 m2, [srcq+strideq*1]
+ vbroadcasti128 m6, [srcq+strideq*2]
+ paddw m12, m14
+ paddw m13, m15
+ paddw m12, m0
+ vmovdqa64 m5{k1}, m2
+ vmovdqa64 m2{k1}, m6
+ mova m0, m4
+ shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b
+ shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab
+ punpcklbw m2, m1, m3 ; 67a 67b 78a 78b
+ punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab
+ pmaddubsw m14, m2, m10
+ pmaddubsw m15, m2, m11
+ paddw m13, m14
+ paddw m12, m15
+ pmaddubsw m14, m4, m11
+ paddw m13, m14
+ pmulhrsw m12, m7
+ pmulhrsw m13, m7
+ mova [tmpq+ 0], m12
+ mova [tmpq+64], m13
+ add tmpq, 64*2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ mova m18, [bilin_v_perm64]
+ movu ym0, [srcq+strideq*0]
+ movu ym1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym2, [srcq+strideq*0]
+ movu ym3, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym4, [srcq+strideq*0]
+ movu ym5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym6, [srcq+strideq*0]
+ vpermq m0, m18, m0
+ vpermq m1, m18, m1
+ vpermq m2, m18, m2
+ vpermq m3, m18, m3
+ vpermq m4, m18, m4
+ vpermq m5, m18, m5
+ vpermq m6, m18, m6
+ punpcklbw m0, m1
+ punpcklbw m1, m2
+ punpcklbw m2, m3
+ punpcklbw m3, m4
+ punpcklbw m4, m5
+ punpcklbw m5, m6
+.v_w32_loop:
+ movu ym12, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym13, [srcq+strideq*0]
+ pmaddubsw m14, m0, m8
+ pmaddubsw m16, m2, m9
+ pmaddubsw m15, m1, m8
+ pmaddubsw m17, m3, m9
+ mova m0, m2
+ mova m1, m3
+ vpermq m12, m18, m12
+ vpermq m13, m18, m13
+ paddw m14, m16
+ paddw m15, m17
+ pmaddubsw m16, m4, m10
+ pmaddubsw m17, m5, m10
+ punpcklbw m6, m12
+ punpcklbw m12, m13
+ mova m2, m4
+ mova m3, m5
+ paddw m14, m16
+ paddw m15, m17
+ pmaddubsw m16, m6, m11
+ pmaddubsw m17, m12, m11
+ mova m4, m6
+ mova m5, m12
+ paddw m14, m16
+ paddw m15, m17
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ mova m6, m13
+ mova [tmpq+ 0], m14
+ mova [tmpq+64], m15
+ add tmpq, 64*2
+ sub hd, 2
+ jg .v_w32_loop
+ vzeroupper
+ RET
+.v_w64:
+ mov wd, 64
+ jmp .v_start
+.v_w128:
+ mov wd, 128
+.v_start:
+ WIN64_SPILL_XMM 27
+ mova m26, [bilin_v_perm64]
+ lea r6d, [hq+wq*2]
+ mov r5, srcq
+ mov r7, tmpq
+.v_loop0:
+ vpermq m0, m26, [srcq+strideq*0]
+ vpermq m1, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m2, m26, [srcq+strideq*0]
+ vpermq m3, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m4, m26, [srcq+strideq*0]
+ vpermq m5, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m6, m26, [srcq+strideq*0]
+ punpckhbw m12, m0, m1
+ punpcklbw m0, m1
+ punpckhbw m13, m1, m2
+ punpcklbw m1, m2
+ punpckhbw m14, m2, m3
+ punpcklbw m2, m3
+ punpckhbw m15, m3, m4
+ punpcklbw m3, m4
+ punpckhbw m16, m4, m5
+ punpcklbw m4, m5
+ punpckhbw m17, m5, m6
+ punpcklbw m5, m6
+.v_loop:
+ vpermq m18, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m19, m26, [srcq+strideq*0]
+ pmaddubsw m20, m0, m8
+ pmaddubsw m21, m12, m8
+ pmaddubsw m22, m1, m8
+ pmaddubsw m23, m13, m8
+ mova m0, m2
+ mova m12, m14
+ mova m1, m3
+ mova m13, m15
+ pmaddubsw m2, m9
+ pmaddubsw m14, m9
+ pmaddubsw m3, m9
+ pmaddubsw m15, m9
+ punpckhbw m24, m6, m18
+ punpcklbw m6, m18
+ paddw m20, m2
+ paddw m21, m14
+ paddw m22, m3
+ paddw m23, m15
+ mova m2, m4
+ mova m14, m16
+ mova m3, m5
+ mova m15, m17
+ pmaddubsw m4, m10
+ pmaddubsw m16, m10
+ pmaddubsw m5, m10
+ pmaddubsw m17, m10
+ punpckhbw m25, m18, m19
+ punpcklbw m18, m19
+ paddw m20, m4
+ paddw m21, m16
+ paddw m22, m5
+ paddw m23, m17
+ mova m4, m6
+ mova m16, m24
+ mova m5, m18
+ mova m17, m25
+ pmaddubsw m6, m11
+ pmaddubsw m24, m11
+ pmaddubsw m18, m11
+ pmaddubsw m25, m11
+ paddw m20, m6
+ paddw m21, m24
+ paddw m22, m18
+ paddw m23, m25
+ pmulhrsw m20, m7
+ pmulhrsw m21, m7
+ pmulhrsw m22, m7
+ pmulhrsw m23, m7
+ mova m6, m19
+ mova [tmpq+wq*0+ 0], m20
+ mova [tmpq+wq*0+64], m21
+ mova [tmpq+wq*2+ 0], m22
+ mova [tmpq+wq*2+64], m23
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_loop
+ add r5, 64
+ add r7, 128
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .v_loop0
+ RET
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign stack_size_padded 0
+ WIN64_SPILL_XMM 16
+ cmp wd, 4
+ je .hv_w4
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep_avx512icl+0]
+ vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep_avx512icl+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ tzcnt wd, wd
+ vpbroadcastd m8, [pd_2]
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)]
+ vpbroadcastd m9, [pd_32]
+ add wq, r7
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ jmp wq
+.hv_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ mov r3d, 0x04
+ kmovb k1, r3d
+ kshiftlb k2, k1, 2
+ kshiftlb k3, k1, 4
+ vpbroadcastd m10, [pd_2]
+ vbroadcasti128 m16, [subpel_h_shufA]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m11, [pd_32]
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ movq xm3, [srcq+strideq*0]
+ vpbroadcastq ym2, [srcq+strideq*1]
+ vpbroadcastq ym3{k1}, [srcq+strideq*2]
+ vpbroadcastq m2{k2}, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3{k2}, [srcq+strideq*0]
+ vpbroadcastq m2{k3}, [srcq+strideq*1]
+ vpbroadcastq m3{k3}, [srcq+strideq*2]
+ mova m17, [spel_hv_perm4a]
+ movu m18, [spel_hv_perm4b]
+ mova m0, m10
+ mova m1, m10
+ pshufb m2, m16
+ pshufb m3, m16
+ vpdpbusd m0, m2, m8
+ vpdpbusd m1, m3, m8
+ packssdw m0, m1 ; _ 0 1 2 3 4 5 6
+ psraw m0, 2
+ vpermb m1, m17, m0 ; 01 12 23 34
+ vpermb m2, m18, m0 ; 23 34 45 56
+.hv_w4_loop:
+ movq xm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ movq xm4, [srcq+strideq*0]
+ vpbroadcastq ym3{k1}, [srcq+strideq*1]
+ vpbroadcastq ym4{k1}, [srcq+strideq*2]
+ mova ym5, ym10
+ mova ym6, ym10
+ pshufb ym3, ym16
+ pshufb ym4, ym16
+ vpdpbusd ym5, ym3, ym8
+ vpdpbusd ym6, ym4, ym8
+ mova m7, m11
+ packssdw ym5, ym6 ; 7 8 9 a _ _ _ _
+ psraw ym5, 2
+ valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a
+ vpdpwssd m7, m1, m12
+ vpdpwssd m7, m2, m13
+ vpermb m1, m17, m0 ; 45 56 67 78
+ vpermb m2, m18, m0 ; 67 78 89 9a
+ vpdpwssd m7, m1, m14
+ vpdpwssd m7, m2, m15
+ psrad m7, 6
+ vpmovdw [tmpq], m7
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ vzeroupper
+ RET
+.hv_w8:
+ WIN64_SPILL_XMM 24
+ vbroadcasti128 m16, [subpel_h_shufA]
+ vbroadcasti128 m17, [subpel_h_shufB]
+ vbroadcasti128 m18, [subpel_h_shufC]
+ vinserti128 ym0, [srcq+strideq*0], 1
+ vinserti128 m0, [srcq+strideq*1], 2
+ vinserti128 m0, [srcq+strideq*2], 3
+ movu xm1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 ym1, [srcq+strideq*0], 1
+ vinserti128 m1, [srcq+strideq*1], 2
+ vinserti128 m1, [srcq+strideq*2], 3
+ mova m2, m8
+ mova m4, m8
+ mova m3, m8
+ mova m5, m8
+ pshufb m20, m0, m16
+ pshufb m21, m0, m17
+ pshufb m22, m0, m18
+ pshufb m23, m1, m16
+ pshufb m6, m1, m17
+ pshufb m7, m1, m18
+ vpdpbusd m2, m20, m10
+ vpdpbusd m4, m21, m10
+ vpdpbusd m2, m21, m11
+ vpdpbusd m4, m22, m11
+ vpdpbusd m3, m23, m10
+ vpdpbusd m5, m6, m10
+ vpdpbusd m3, m6, m11
+ vpdpbusd m5, m7, m11
+ packssdw m2, m4
+ packssdw m3, m5
+ psraw m2, 2 ; _ 0 1 2
+ psraw m3, 2 ; 3 4 5 6
+ valignq m0, m3, m2, 2 ; 0 1 2 3
+ valignq m1, m3, m2, 4 ; 1 2 3 4
+ valignq m2, m3, m2, 6 ; 2 3 4 5
+ punpcklwd m4, m0, m1 ; 01a 12a 23a 34a
+ punpckhwd m5, m0, m1 ; 01b 12b 23b 34b
+ punpcklwd m6, m2, m3 ; 23a 34a 45a 56a
+ punpckhwd m7, m2, m3 ; 23b 34b 45b 56b
+.hv_w8_loop:
+ movu xm19, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 ym19, [srcq+strideq*0], 1
+ vinserti128 m19, [srcq+strideq*1], 2
+ vinserti128 m19, [srcq+strideq*2], 3
+ mova m20, m9
+ mova m21, m9
+ mova m22, m8
+ mova m23, m8
+ vpdpwssd m20, m4, m12
+ vpdpwssd m21, m5, m12
+ vpdpwssd m20, m6, m13
+ vpdpwssd m21, m7, m13
+ pshufb m0, m19, m16
+ pshufb m1, m19, m17
+ pshufb m2, m19, m18
+ vpdpbusd m22, m0, m10
+ vpdpbusd m23, m1, m10
+ vpdpbusd m22, m1, m11
+ vpdpbusd m23, m2, m11
+ packssdw m22, m23
+ psraw m22, 2 ; 7 8 9 A
+ valignq m0, m22, m3, 2 ; 4 5 6 7
+ valignq m1, m22, m3, 4 ; 5 6 7 8
+ valignq m2, m22, m3, 6 ; 6 7 8 9
+ mova m3, m22
+ punpcklwd m4, m0, m1 ; 45a 56a 67a 78a
+ punpckhwd m5, m0, m1 ; 45b 56b 67b 78b
+ punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa
+ punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab
+ vpdpwssd m20, m4, m14
+ vpdpwssd m21, m5, m14
+ vpdpwssd m20, m6, m15
+ vpdpwssd m21, m7, m15
+ psrad m20, 6
+ psrad m21, 6
+ packssdw m20, m21
+ mova [tmpq], m20
+ add tmpq, 64
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ mov wd, 16*2
+ jmp .hv_start
+.hv_w32:
+ mov wd, 32*2
+ jmp .hv_start
+.hv_w64:
+ mov wd, 64*2
+ jmp .hv_start
+.hv_w128:
+ mov wd, 128*2
+.hv_start:
+ WIN64_SPILL_XMM 31
+ mova m16, [spel_h_perm16a]
+ mova m17, [spel_h_perm16b]
+ mova m18, [spel_h_perm16c]
+ lea r6d, [hq+wq*8-256]
+ mov r5, srcq
+ mov r7, tmpq
+.hv_loop0:
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu ym1, [srcq+strideq*0]
+ vinserti32x8 m1, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu ym2, [srcq+strideq*0]
+ vinserti32x8 m2, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu ym3, [srcq+strideq*0]
+ mova m4, m8
+ mova m5, m8
+ mova m6, m8
+ mova m7, m8
+ vpermb m19, m16, m0
+ vpermb m20, m17, m0
+ vpermb m21, m18, m0
+ vpermb m22, m16, m1
+ vpermb m23, m17, m1
+ vpermb m24, m18, m1
+ vpermb m25, m16, m2
+ vpermb m26, m17, m2
+ vpermb m27, m18, m2
+ vpermb ym28, ym16, ym3
+ vpermb ym29, ym17, ym3
+ vpermb ym30, ym18, ym3
+ mova m0, m8
+ mova m1, m8
+ mova ym2, ym8
+ mova ym3, ym8
+ vpdpbusd m4, m19, m10
+ vpdpbusd m5, m20, m10
+ vpdpbusd m6, m22, m10
+ vpdpbusd m7, m23, m10
+ vpdpbusd m0, m25, m10
+ vpdpbusd m1, m26, m10
+ vpdpbusd ym2, ym28, ym10
+ vpdpbusd ym3, ym29, ym10
+ vpdpbusd m4, m20, m11
+ vpdpbusd m5, m21, m11
+ vpdpbusd m6, m23, m11
+ vpdpbusd m7, m24, m11
+ vpdpbusd m0, m26, m11
+ vpdpbusd m1, m27, m11
+ vpdpbusd ym2, ym29, ym11
+ vpdpbusd ym3, ym30, ym11
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m0, m1
+ packssdw ym2, ym3
+ psraw m4, 2 ; 0a 0b 1a 1b
+ psraw m6, 2 ; 2a 2b 3a 3b
+ psraw m0, 2 ; 4a 4b 5a 5b
+ psraw ym2, 2 ; 6a 6b __ __
+ vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b
+ vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b
+ vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b
+ punpcklwd m2, m4, m5 ; 01a 01c 12a 12c
+ punpckhwd m3, m4, m5 ; 01b 01d 12b 12d
+ punpcklwd m4, m6, m7 ; 23a 23c 34a 34c
+ punpckhwd m5, m6, m7 ; 23b 23d 34b 34d
+ punpcklwd m6, m0, m1 ; 45a 45c 56a 56c
+ punpckhwd m7, m0, m1 ; 45b 45d 56b 56d
+.hv_loop:
+ movu ym19, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m19, [srcq+strideq*0], 1
+ mova m20, m9
+ mova m21, m9
+ mova m22, m8
+ mova m23, m8
+ vpdpwssd m20, m2, m12
+ vpdpwssd m21, m3, m12
+ vpdpwssd m20, m4, m13
+ vpdpwssd m21, m5, m13
+ vpermb m24, m16, m19
+ vpermb m25, m17, m19
+ vpermb m26, m18, m19
+ vpdpbusd m22, m24, m10
+ vpdpbusd m23, m25, m10
+ vpdpbusd m22, m25, m11
+ vpdpbusd m23, m26, m11
+ packssdw m22, m23
+ psraw m22, 2 ; 7a 7b 8a 8b
+ vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b
+ mova m2, m4
+ mova m3, m5
+ mova m1, m22
+ mova m4, m6
+ mova m5, m7
+ punpcklwd m6, m0, m1 ; 67a 67c 78a 78c
+ punpckhwd m7, m0, m1 ; 67b 67d 78b 78d
+ vpdpwssd m20, m4, m14
+ vpdpwssd m21, m5, m14
+ vpdpwssd m20, m6, m15
+ vpdpwssd m21, m7, m15
+ psrad m20, 6
+ psrad m21, 6
+ packssdw m20, m21
+ mova [tmpq+wq*0], ym20
+ vextracti32x8 [tmpq+wq*1], m20, 1
+ lea tmpq, [tmpq+wq*2]
+ sub hd, 2
+ jg .hv_loop
+ add r5, 16
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .hv_loop0
+ RET
+
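+ ; Both warp entry points share .main/.main2/.h below. Roughly: .h filters two
+ ; source rows horizontally, gathering a per-column 8-tap filter from
+ ; mc_warp_filter (mx advances by alpha per column and beta per row); .main
+ ; primes rows 0-6 into the 01/23/45 row-pair accumulators; .main2 adds two
+ ; more rows and applies the vertical 8-tap with filters selected by my
+ ; stepped by gamma/delta. warp_affine_8x8t stores the 16-bit intermediate,
+ ; warp_affine_8x8 rounds and packs to pixels.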
+cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts
+ vpbroadcastd m9, [pd_16384]
+ mova ym15, [warp_8x8t_end]
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main
+ jmp .start
+.loop:
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main2
+ lea tmpq, [tmpq+tsq*4]
+.start:
+ paddd m16, m16
+ vpermb m16, m15, m16
+ mova [tmpq+tsq*0], xm16
+ vextracti128 [tmpq+tsq*2], ym16, 1
+ sub r6d, 0x1800
+ jg .loop
+ RET
+
+cglobal warp_affine_8x8_8bpc, 4, 7, 22, dst, ds, src, ss, abcd, filter
+ vpbroadcastd m9, [pd_262144]
+ mova xm15, [warp_8x8_end]
+ call .main
+ jmp .start
+.loop:
+ call .main2
+ lea dstq, [dstq+dsq*2]
+.start:
+ psrad m16, 19
+ packuswb m16, m16
+ vpermb m16, m15, m16
+ movq [dstq+dsq*0], xm16
+ movhps [dstq+dsq*1], xm16
+ sub r6d, 0x1800
+ jg .loop
+ RET
+ALIGN function_align
+.main:
+ vpbroadcastd m1, [pd_512]
+%if WIN64
+ mov abcdq, r5mp
+ vpaddd ym18, ym1, r6m {1to8} ; mx
+%else
+ add r5d, 512
+ vpbroadcastd ym18, r5d
+%endif
+ vpaddd ym20, ym1, r7m {1to8} ; my
+ mova ym16, [pd_0to7]
+ vpbroadcastd ym19, [abcdq+4*0]
+ vpbroadcastd ym21, [abcdq+4*1]
+ lea r4, [ssq*3+3]
+ mova m10, [warp_8x8_permA]
+ mov r6d, 0x5555
+ mova m11, [warp_8x8_permB]
+ lea filterq, [mc_warp_filter+64*8]
+ vpbroadcastq m12, [warp_8x8_hpack]
+ sub srcq, r4 ; src -= src_stride*3 + 3
+ vbroadcasti32x4 m13, [warp_8x8_permC]
+ kxnorb k2, k2, k2
+ vbroadcasti32x4 m14, [warp_8x8_permD]
+ vpdpwssd ym18, ym19, ym16 ; alpha
+ vpdpwssd ym20, ym21, ym16 ; gamma
+ vbroadcasti32x4 m0, [srcq]
+ psrad ym19, 16 ; beta
+ psrad ym21, 16 ; delta
+ kmovw k1, r6d
+ psrad ym16, ym18, 10
+ kmovb k3, k2
+ paddd ym18, ym19
+ vpgatherdq m2{k2}, [filterq+ym16*8] ; filter_x0
+ psrld m1, 8 ; pd_2
+ pshufb m0, m11
+ paddd m8, m1, m1 ; pd_4
+ vpdpbusd m1, m0, m2
+ call .h
+ psllq m2, m1, 45
+ pslld m1, 13
+ paddd m1, m2
+ vpshrdq m1, m0, 48 ; 01 12
+ call .h
+ vpshrdq m2, m1, m0, 48 ; 23 34
+ call .h
+ vpshrdq m3, m2, m0, 48 ; 45 56
+.main2:
+ call .h
+ psrad ym17, ym20, 10
+ kmovb k2, k3
+ paddd ym20, ym21
+ vpgatherdq m7{k3}, [filterq+ym17*8] ; filter_y0
+ psrad ym16, ym20, 10
+ kmovb k3, k2
+ paddd ym20, ym21
+ vpgatherdq m17{k2}, [filterq+ym16*8] ; filter_y1
+ shufps m5, m7, m17, q2020 ; a0 a1 a2 a3 b0 b1 b2 b3 A0 A1 A2 A3 B0 B1 B2 B3
+ mova m16, m9
+ pshufb m4, m5, m13 ; a0 a1 A0 A1 b0 b1 B0 B1
+ vpdpwssd m16, m1, m4
+ pshufb m5, m14 ; a2 a3 A2 A3 b2 b3 B2 B3
+ mova m1, m2
+ vpdpwssd m16, m2, m5
+ shufps m5, m7, m17, q3131 ; a4 a5 a6 a7 b4 b5 b6 b7 A4 A5 A6 A7 B4 B5 B6 B7
+ mova m2, m3
+ pshufb m4, m5, m13 ; a4 a5 A4 A5 b4 b5 B4 B5
+ vpdpwssd m16, m3, m4
+ vpshrdq m3, m0, 48 ; 67 78
+ pshufb m5, m14 ; a6 a7 A6 A7 b6 b7 B6 B7
+ vpdpwssd m16, m3, m5
+ ret
+ALIGN function_align
+.h:
+ movu xm5, [srcq+ssq*1]
+ psrad ym16, ym18, 10
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym5, [srcq+ssq*0], 1
+ kmovb k2, k3
+ paddd ym18, ym19
+ vpgatherdq m6{k3}, [filterq+ym16*8] ; filter_x1
+ psrad ym17, ym18, 10
+ kmovb k3, k2
+ paddd ym18, ym19
+ vpgatherdq m16{k2}, [filterq+ym17*8] ; filter_x2
+ mova m0, m8
+ vpermb m4, m10, m5 ; a4 b0 a5 b1 a6 b2 a7 b3 a8 b4 a9 b5 aa b6 ab b7
+ vpshldq m17, m16, m6, 32 ; a4 a5 a6 a7 b0 b1 b2 b3
+ vpdpbusd m0, m4, m17
+ vpermb m5, m11, m5 ; a0 b4 a1 b5 a2 b6 a3 b7 a4 b8 a5 b9 a6 ba a7 bb
+ vmovdqa32 m16{k1}, m6 ; a0 a1 a2 a3 b4 b5 b6 b7
+ vpdpbusd m0, m5, m16
+ vpmultishiftqb m0, m12, m0 ; 1 1 2 2 (>> 3)
+ ret
+
+%macro BIDIR_FN 1 ; op
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM %1 0
+ vextracti32x4 xm1, ym0, 1
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ jl .w4_ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.w4_ret:
+ RET
+.w4_h16:
+ vpbroadcastd m7, strided
+ pmulld m7, [bidir_sctr_w4]
+ %1 0
+ kxnorw k1, k1, k1
+ vpscatterdd [dstq+m7]{k1}, m0
+ RET
+.w8:
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM %1 0
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq ], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ RET
+.w8_loop:
+ %1_INC_PTR 2
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ %1 0
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq ], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq ], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ %1_INC_PTR 2
+ lea dstq, [dstq+strideq*4]
+.w16:
+ %1 0
+ vpermq m0, m0, q3120
+ mova [dstq ], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m7, [pb_02461357]
+.w32_loop:
+ %1 0
+ %1_INC_PTR 2
+ vpermq m0, m7, m0
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m7, [pb_02461357]
+.w64_loop:
+ %1 0
+ %1_INC_PTR 2
+ vpermq m0, m7, m0
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m7, [pb_02461357]
+.w128_loop:
+ %1 0
+ vpermq m6, m7, m0
+ %1 2
+ mova [dstq+64*0], m6
+ %1_INC_PTR 4
+ vpermq m6, m7, m0
+ mova [dstq+64*1], m6
+ add dstq, strideq
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%macro AVG 1 ; src_offset
+ mova m0, [tmp1q+(%1+0)*mmsize]
+ paddw m0, [tmp2q+(%1+0)*mmsize]
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ paddw m1, [tmp2q+(%1+1)*mmsize]
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ packuswb m0, m1
+%endmacro
+
+%macro AVG_INC_PTR 1
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
+cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg_avx512icl_table
+ lea r6, [avg_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m4, [base+pw_1024]
+ add wq, r6
+ BIDIR_FN AVG
+
+%macro W_AVG 1 ; src_offset
+ ; (a * weight + b * (16 - weight) + 128) >> 8
+ ; = ((a - b) * weight + (b << 4) + 128) >> 8
+ ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
+ mova m0, [tmp1q+(%1+0)*mmsize]
+ psubw m2, m0, [tmp2q+(%1+0)*mmsize]
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ psubw m3, m1, [tmp2q+(%1+1)*mmsize]
+ pmulhw m2, m4
+ pmulhw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
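
The identity chain in the W_AVG comment above can be read against this scalar reference of its first line; the helper name and plain-int types are illustrative only, and the final clamp to [0, 255] is what packuswb provides in the macro.

    /* Scalar reference for W_AVG on the 16-bit intermediates:
     * (a * weight + b * (16 - weight) + 128) >> 8, weight in [0, 16]. */
    static inline int w_avg_ref(int a, int b, int weight)
    {
        return (a * weight + b * (16 - weight) + 128) >> 8;
    }
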
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-w_avg_avx512icl_table
+ lea r6, [w_avg_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m4, r6m ; weight
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m5, [base+pw_2048]
+ psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
+ add wq, r6
+ cmp dword r6m, 7
+ jg .weight_gt7
+ mov r6, tmp1q
+ pxor m0, m0
+ mov tmp1q, tmp2q
+ psubw m4, m0, m4 ; -weight
+ mov tmp2q, r6
+.weight_gt7:
+ BIDIR_FN W_AVG
+
+%macro MASK 1 ; src_offset
+ ; (a * m + b * (64 - m) + 512) >> 10
+ ; = ((a - b) * m + (b << 6) + 512) >> 10
+ ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
+%if mmsize == 64
+ vpermq m3, m8, [maskq+%1*32]
+%else
+ vpermq m3, [maskq+%1*16], q3120
+%endif
+ mova m0, [tmp2q+(%1+0)*mmsize]
+ psubw m1, m0, [tmp1q+(%1+0)*mmsize]
+ psubb m3, m4, m3
+ paddw m1, m1 ; (b - a) << 1
+ paddb m3, m3
+ punpcklbw m2, m4, m3 ; -m << 9
+ pmulhw m1, m2
+ paddw m0, m1
+ mova m1, [tmp2q+(%1+1)*mmsize]
+ psubw m2, m1, [tmp1q+(%1+1)*mmsize]
+ paddw m2, m2
+ punpckhbw m3, m4, m3
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
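
The same style of scalar reference applies to the MASK macro's comment, with m the per-pixel mask in [0, 64]; again just a sketch, since the asm evaluates the refactored pmulhw form on the last comment line and clamps via packuswb.

    /* Scalar reference for MASK: (a * m + b * (64 - m) + 512) >> 10. */
    static inline int mask_blend_ref(int a, int b, int m)
    {
        return (a * m + b * (64 - m) + 512) >> 10;
    }
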
+
+%macro MASK_INC_PTR 1
+ add maskq, %1*32
+ add tmp2q, %1*64
+ add tmp1q, %1*64
+%endmacro
+
+cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask_avx512icl_table
+ lea r7, [mask_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movsxd wq, dword [r7+wq*4]
+ pxor m4, m4
+ mova m8, [base+bilin_v_perm64]
+ vpbroadcastd m5, [base+pw_2048]
+ add wq, r7
+ BIDIR_FN MASK
+
+%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
+ mova m%1, [tmp1q+mmsize*%3]
+ mova m1, [tmp2q+mmsize*%3]
+ psubw m1, m%1
+ pabsw m%2, m1
+ psubusw m%2, m6, m%2
+ psrlw m%2, 8 ; 64 - m
+ psllw m2, m%2, 10
+ pmulhw m1, m2
+ paddw m%1, m1
+ mova m1, [tmp1q+mmsize*%4]
+ mova m2, [tmp2q+mmsize*%4]
+ psubw m2, m1
+ pabsw m3, m2
+ psubusw m3, m6, m3
+ vpshldw m%2, m3, 8
+ psllw m3, m%2, 10
+%if %5
+ psubb m%2, m5, m%2
+%endif
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m%1, m7
+ pmulhrsw m1, m7
+ packuswb m%1, m1
+%endmacro
+
+cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx512icl_table
+ lea r7, [w_mask_420_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ vpbroadcastd m9, [base+pb_m64] ; -1 << 6
+ mova ym10, [base+wm_420_mask+32]
+ vpbroadcastd m8, [base+wm_sign+r6*8] ; (258 - sign) << 6
+ add wq, r7
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ mova m5, [wm_420_perm4]
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ vinserti128 ym5, [wm_420_perm4+32], 1
+ vpermb ym4, ym5, ym4
+ vpdpbusd ym8, ym4, ym9
+ vextracti32x4 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.w4_end:
+ vpermb ym8, ym10, ym8
+ movq [maskq], xm8
+ RET
+.w4_h16:
+ vpbroadcastd m11, strided
+ pmulld m11, [bidir_sctr_w4]
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ vpdpbusd m8, m4, m9
+ kxnorw k1, k1, k1
+ vpermb m8, m10, m8
+ mova [maskq], xm8
+ vpscatterdd [dstq+m11]{k1}, m0
+ RET
+.w8:
+ mova m5, [wm_420_perm8]
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ vinserti128 ym5, [wm_420_perm8+32], 1
+ vpermb ym4, ym5, ym4
+ vpdpbusd ym8, ym4, ym9
+ vpermb m8, m10, m8
+ mova [maskq], xm8
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ RET
+.w8_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 16
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ vpermb m1, m10, m1
+ mova [maskq], xm1
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16:
+ mova m5, [wm_420_perm16]
+.w16_loop:
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m0, q3120
+ mova [maskq], xm1
+ add maskq, 16
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m5, [pb_02461357]
+.w32_loop:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m5, m0
+ mova [maskq], xm1
+ add maskq, 16
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14
+ psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15
+.w64_loop:
+ W_MASK 0, 4, 0, 2
+ W_MASK 11, 5, 1, 3
+ mova m2, m8
+ vpdpbusd m2, m4, m9
+ mova m3, m8
+ vpdpbusd m3, m5, m9
+ add tmp1q, 256
+ add tmp2q, 256
+ vpermt2b m2, m10, m3
+ mova m1, m0
+ vpermt2q m0, m12, m11
+ vpermt2q m1, m13, m11
+ mova [maskq], ym2
+ add maskq, 32
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m14, [wm_420_perm64]
+ mova m10, [wm_420_mask]
+ psrlq m15, m14, 4
+.w128_loop:
+ W_MASK 0, 12, 0, 4
+ W_MASK 11, 13, 1, 5
+ mova m4, m8
+ vpdpbusd m4, m12, m9
+ mova m5, m8
+ vpdpbusd m5, m13, m9
+ mova m1, m0
+ vpermt2q m0, m14, m11
+ vpermt2q m1, m15, m11
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*1+64*0], m1
+ W_MASK 0, 12, 2, 6
+ W_MASK 11, 13, 3, 7
+ vprold m4, 16
+ vprold m5, 16
+ vpdpbusd m4, m12, m9
+ vpdpbusd m5, m13, m9
+ add tmp1q, 512
+ add tmp2q, 512
+ vpermt2b m4, m10, m5
+ mova m1, m0
+ vpermt2q m0, m14, m11
+ vpermt2q m1, m15, m11
+ mova [maskq], m4
+ add maskq, 64
+ mova [dstq+strideq*0+64*1], m0
+ mova [dstq+strideq*1+64*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w128_loop
+ RET
+
+cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx512icl_table
+ lea r7, [w_mask_422_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ vpbroadcastd m9, [base+pw_m128]
+ mova m10, [base+wm_422_mask]
+ vpbroadcastd m11, [base+pb_127]
+ add wq, r7
+ vpbroadcastd m8, [base+wm_sign+4+r6*4]
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ movhps xm10, [wm_422_mask+16]
+ vpdpwssd ym8, ym4, ym9
+ vpermb ym8, ym10, ym8
+ vextracti32x4 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.w4_end:
+ pand xm8, xm11
+ mova [maskq], xm8
+ RET
+.w4_h16:
+ vpbroadcastd m5, strided
+ pmulld m5, [bidir_sctr_w4]
+ W_MASK 0, 4, 0, 1
+ vpdpwssd m8, m4, m9
+ kxnorw k1, k1, k1
+ vpermb m8, m10, m8
+ pand ym8, ym11
+ mova [maskq], ym8
+ vpscatterdd [dstq+m5]{k1}, m0
+ RET
+.w8:
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ movhps xm10, [wm_422_mask+16]
+ vpdpwssd ym8, ym4, ym9
+ vpermb ym8, ym10, ym8
+ pand xm8, xm11
+ mova [maskq], xm8
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ RET
+.w8_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 32
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ vpermb m1, m10, m1
+ pand ym1, ym11
+ mova [maskq], ym1
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 32
+ lea dstq, [dstq+strideq*4]
+.w16:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ vpermb m1, m10, m1
+ vpermq m0, m0, q3120
+ pand ym1, ym11
+ mova [maskq], ym1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m5, [pb_02461357]
+.w32_loop:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m5, m0
+ pand ym1, ym11
+ mova [maskq], ym1
+ add maskq, 32
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m5, [pb_02461357]
+.w64_loop:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m5, m0
+ pand ym1, ym11
+ mova [maskq], ym1
+ add maskq, 32
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m13, [pb_02461357]
+.w128_loop:
+ W_MASK 0, 4, 0, 1
+ W_MASK 12, 5, 2, 3
+ mova m2, m8
+ vpdpwssd m2, m4, m9
+ mova m3, m8
+ vpdpwssd m3, m5, m9
+ add tmp1q, 256
+ add tmp2q, 256
+ vpermt2b m2, m10, m3
+ vpermq m0, m13, m0
+ vpermq m1, m13, m12
+ pand m2, m11
+ mova [maskq], m2
+ add maskq, 64
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx512icl_table
+ lea r7, [w_mask_444_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m5, [base+pb_64]
+ vpbroadcastd m7, [base+pw_2048]
+ mova m8, [base+wm_444_mask]
+ add wq, r7
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM W_MASK 0, 4, 0, 1, 1
+ vinserti128 ym8, [wm_444_mask+32], 1
+ vpermb ym4, ym8, ym4
+ mova [maskq], ym4
+ vextracti32x4 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.w4_end:
+ RET
+.w4_h16:
+ vpbroadcastd m9, strided
+ pmulld m9, [bidir_sctr_w4]
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ kxnorw k1, k1, k1
+ mova [maskq], m4
+ vpscatterdd [dstq+m9]{k1}, m0
+ RET
+.w8:
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM W_MASK 0, 4, 0, 1, 1
+ vinserti128 ym8, [wm_444_mask+32], 1
+ vpermb ym4, ym8, ym4
+ mova [maskq], ym4
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ RET
+.w8_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 64
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ mova [maskq], m4
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 64
+ lea dstq, [dstq+strideq*4]
+.w16:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ vpermq m0, m0, q3120
+ mova [maskq], m4
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m9, [pb_02461357]
+.w32_loop:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermq m0, m9, m0
+ mova [maskq], m4
+ add maskq, 64
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m9, [pb_02461357]
+.w64_loop:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermq m0, m9, m0
+ mova [maskq], m4
+ add maskq, 64
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m11, [pb_02461357]
+.w128_loop:
+ W_MASK 0, 4, 0, 1, 1
+ W_MASK 10, 9, 2, 3, 1
+ vpermb m4, m8, m4
+ vpermb m9, m8, m9
+ add tmp1q, 256
+ add tmp2q, 256
+ vpermq m0, m11, m0
+ vpermq m10, m11, m10
+ mova [maskq+64*0], m4
+ mova [maskq+64*1], m9
+ add maskq, 128
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m10
+ add dstq, strideq
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
+%define base r6-blend_avx512icl_table
+ lea r6, [blend_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn maskq, maskmp
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m6, [base+pb_64]
+ vpbroadcastd m7, [base+pw_512]
+ sub tmpq, maskq
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ vpbroadcastd xmm1, [dstq+dsq*2]
+ pinsrd xmm1, [dstq+r6 ], 3
+ mova xmm4, [maskq]
+ mova xmm5, [maskq+tmpq]
+ add maskq, 4*4
+ psubb xmm3, xm6, xmm4
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm2, xmm3, xmm4
+ punpckhbw xmm1, xmm5
+ punpckhbw xmm3, xmm4
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm1, xmm3
+ pmulhrsw xmm0, xm7
+ pmulhrsw xmm1, xm7
+ packuswb xmm0, xmm1
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ pextrd [dstq+dsq*2], xmm0, 2
+ pextrd [dstq+r6 ], xmm0, 3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ movq xmm0, [dstq+dsq*0]
+ vpbroadcastq xmm1, [dstq+dsq*1]
+ vpbroadcastq ymm2, [dstq+dsq*2]
+ vpbroadcastq ymm3, [dstq+r6 ]
+ mova ymm4, [maskq]
+ mova ymm5, [maskq+tmpq]
+ add maskq, 8*4
+ vpblendd ymm0, ymm2, 0x30
+ vpblendd ymm1, ymm3, 0xc0
+ psubb ymm3, ym6, ymm4
+ punpcklbw ymm0, ymm5
+ punpcklbw ymm2, ymm3, ymm4
+ punpckhbw ymm1, ymm5
+ punpckhbw ymm3, ymm4
+ pmaddubsw ymm0, ymm2
+ pmaddubsw ymm1, ymm3
+ pmulhrsw ymm0, ym7
+ pmulhrsw ymm1, ym7
+ packuswb ymm0, ymm1
+ vextracti128 xmm1, ymm0, 1
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ movq [dstq+dsq*2], xmm1
+ movhps [dstq+r6 ], xmm1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w8
+ vzeroupper
+ RET
+.w16:
+ mova xm1, [dstq+dsq*0]
+ vinserti32x4 ym1, [dstq+dsq*1], 1
+ vinserti32x4 m1, [dstq+dsq*2], 2
+ mova m4, [maskq]
+ vinserti32x4 m1, [dstq+r6 ], 3
+ mova m5, [maskq+tmpq]
+ add maskq, 16*4
+ psubb m3, m6, m4
+ punpcklbw m0, m1, m5
+ punpcklbw m2, m3, m4
+ punpckhbw m1, m5
+ punpckhbw m3, m4
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ vextracti32x4 [dstq+dsq*2], m0, 2
+ vextracti32x4 [dstq+r6 ], m0, 3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ mova ym1, [dstq+dsq*0]
+ vinserti32x8 m1, [dstq+dsq*1], 1
+ mova m4, [maskq]
+ mova m5, [maskq+tmpq]
+ add maskq, 32*2
+ psubb m3, m6, m4
+ punpcklbw m0, m1, m5
+ punpcklbw m2, m3, m4
+ punpckhbw m1, m5
+ punpckhbw m3, m4
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32
+ RET
+
+cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_avx512icl_table
+ lea r5, [blend_v_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m5, [base+pw_512]
+ add wq, r5
+ add maskq, obmc_masks-blend_v_avx512icl_table
+ jmp wq
+.w2:
+ vpbroadcastd xmm2, [maskq+2*2]
+.w2_s0_loop:
+ movd xmm0, [dstq+dsq*0]
+ pinsrw xmm0, [dstq+dsq*1], 1
+ movd xmm1, [tmpq]
+ add tmpq, 2*2
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm2
+ pmulhrsw xmm0, xm5
+ packuswb xmm0, xmm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_s0_loop
+ RET
+.w4:
+ vpbroadcastq xmm2, [maskq+4*2]
+.w4_loop:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ movq xmm1, [tmpq]
+ add tmpq, 4*2
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm2
+ pmulhrsw xmm0, xm5
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+.w8:
+ mova xmm3, [maskq+8*2]
+.w8_loop:
+ movq xmm0, [dstq+dsq*0]
+ vpbroadcastq xmm1, [dstq+dsq*1]
+ mova xmm2, [tmpq]
+ add tmpq, 8*2
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddubsw xmm0, xmm3
+ pmaddubsw xmm1, xmm3
+ pmulhrsw xmm0, xm5
+ pmulhrsw xmm1, xm5
+ packuswb xmm0, xmm1
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ vbroadcasti32x4 ym3, [maskq+16*2]
+ vbroadcasti32x4 ym4, [maskq+16*3]
+.w16_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti32x4 ym1, [dstq+dsq*1], 1
+ mova ym2, [tmpq]
+ add tmpq, 16*2
+ punpcklbw ym0, ym1, ym2
+ punpckhbw ym1, ym2
+ pmaddubsw ym0, ym3
+ pmaddubsw ym1, ym4
+ pmulhrsw ym0, ym5
+ pmulhrsw ym1, ym5
+ packuswb ym0, ym1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32:
+ mova m4, [maskq+32*2]
+ vshufi32x4 m3, m4, m4, q2020
+ vshufi32x4 m4, m4, q3131
+.w32_loop:
+ mova ym1, [dstq+dsq*0]
+ vinserti32x8 m1, [dstq+dsq*1], 1
+ mova m2, [tmpq]
+ add tmpq, 32*2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+
+cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
+%define base r6-blend_h_avx512icl_table
+ lea r6, [blend_h_avx512icl_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ lea maskq, [base+obmc_masks+hq*2]
+ vpbroadcastd m5, [base+pw_512]
+ lea hd, [hq*3]
+ add wq, r6
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd xmm0, [dstq+dsq*0]
+ pinsrw xmm0, [dstq+dsq*1], 1
+ movd xmm2, [maskq+hq*2]
+ movd xmm1, [tmpq]
+ add tmpq, 2*2
+ punpcklwd xmm2, xmm2
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm2
+ pmulhrsw xmm0, xm5
+ packuswb xmm0, xmm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+ mova xmm3, [blend_shuf]
+.w4_loop:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ movd xmm2, [maskq+hq*2]
+ movq xmm1, [tmpq]
+ add tmpq, 4*2
+ pshufb xmm2, xmm3
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm2
+ pmulhrsw xmm0, xm5
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+ vbroadcasti128 ymm4, [blend_shuf]
+ shufpd ymm4, ymm4, 0x03
+.w8_loop:
+ vpbroadcastq ymm1, [dstq+dsq*0]
+ movq xmm0, [dstq+dsq*1]
+ vpblendd ymm0, ymm1, 0x30
+ vpbroadcastd ymm3, [maskq+hq*2]
+ movq xmm1, [tmpq+8*1]
+ vinserti128 ymm1, [tmpq+8*0], 1
+ add tmpq, 8*2
+ pshufb ymm3, ymm4
+ punpcklbw ymm0, ymm1
+ pmaddubsw ymm0, ymm3
+ pmulhrsw ymm0, ym5
+ vextracti128 xmm1, ymm0, 1
+ packuswb xmm0, xmm1
+ movhps [dstq+dsq*0], xmm0
+ movq [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+ vzeroupper
+ RET
+.w16:
+ vbroadcasti32x4 ym4, [blend_shuf]
+ shufpd ym4, ym4, 0x0c
+.w16_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti32x4 ym1, [dstq+dsq*1], 1
+ vpbroadcastd ym3, [maskq+hq*2]
+ mova ym2, [tmpq]
+ add tmpq, 16*2
+ pshufb ym3, ym4
+ punpcklbw ym0, ym1, ym2
+ punpckhbw ym1, ym2
+ pmaddubsw ym0, ym3
+ pmaddubsw ym1, ym3
+ pmulhrsw ym0, ym5
+ pmulhrsw ym1, ym5
+ packuswb ym0, ym1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w16_loop
+ RET
+.w32:
+ vbroadcasti32x4 m4, [blend_shuf]
+ shufpd m4, m4, 0xf0
+.w32_loop:
+ mova ym1, [dstq+dsq*0]
+ vinserti32x8 m1, [dstq+dsq*1], 1
+ vpbroadcastd m3, [maskq+hq*2]
+ mova m2, [tmpq]
+ add tmpq, 32*2
+ pshufb m3, m4
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w32_loop
+ RET
+.w64:
+ vpbroadcastw m3, [maskq+hq*2]
+ mova m1, [dstq]
+ mova m2, [tmpq]
+ add tmpq, 32*2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ inc hq
+ jl .w64
+ RET
+.w128:
+ vpbroadcastw m6, [maskq+hq*2]
+ mova m2, [dstq+64*0]
+ mova m1, [tmpq+64*0]
+ mova m3, [dstq+64*1]
+ mova m4, [tmpq+64*1]
+ add tmpq, 64*2
+ punpcklbw m0, m2, m1
+ punpckhbw m2, m1
+ pmaddubsw m0, m6
+ pmaddubsw m2, m6
+ punpcklbw m1, m3, m4
+ punpckhbw m3, m4
+ pmaddubsw m1, m6
+ pmaddubsw m3, m6
+ REPX {pmulhrsw x, m5}, m0, m2, m1, m3
+ packuswb m0, m2
+ packuswb m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ inc hq
+ jl .w128
+ RET
+
+cglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ mov r6, ~0
+ vpbroadcastd m5, dxm
+ vpbroadcastd m8, mx0m
+ vpbroadcastd m6, src_wm
+ kmovq k3, r6
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
+ LEA r7, $$
+%define base r7-$$
+ vpbroadcastd m3, [base+pw_m256]
+ vpbroadcastd m7, [base+pd_63]
+ vbroadcasti32x4 m15, [base+pb_8x0_8x8]
+ vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
+ pslld m5, 4 ; dx*16
+ pslld m6, 14
+ pxor m2, m2
+ mova m16, [base+resize_permA]
+ mova m17, [base+resize_permB]
+ mova xm18, [base+resize_permC]
+.loop_y:
+ xor xd, xd
+ mova m4, m8 ; per-line working version of mx
+.loop_x:
+ pmaxsd m0, m4, m2
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ vptestmd k4, m1, m1
+ pand m9, m7 ; filter offset (masked)
+ ktestw k4, k4
+ jz .load
+ vextracti32x8 ym12, m0, 1
+ vextracti32x8 ym13, m1, 1
+ kmovq k1, k3
+ kmovq k2, k3
+ vpgatherdq m10{k1}, [srcq+ym0]
+ vpgatherdq m11{k2}, [srcq+ym12]
+ kmovq k1, k3
+ kmovq k2, k3
+ vpgatherdq m14{k1}, [base+resize_shuf+4+ym1]
+ vpgatherdq m0{k2}, [base+resize_shuf+4+ym13]
+ mova m12, m16
+ mova m13, m17
+ paddb m14, m15
+ paddb m0, m15
+ pshufb m10, m14
+ pshufb m11, m0
+ vpermi2d m12, m10, m11
+ vpermi2d m13, m10, m11
+ jmp .filter
+.load:
+ kmovq k1, k3
+ kmovq k2, k3
+ vpgatherdd m12{k1}, [srcq+m0+0]
+ vpgatherdd m13{k2}, [srcq+m0+4]
+.filter:
+ kmovq k1, k3
+ kmovq k2, k3
+ vpgatherdd m10{k1}, [base+resize_filter+m9*8+0]
+ vpgatherdd m11{k2}, [base+resize_filter+m9*8+4]
+ mova m14, m2
+ vpdpbusd m14, m12, m10
+ vpdpbusd m14, m13, m11
+ packssdw m14, m14
+ pmulhrsw m14, m3
+ packuswb m14, m14
+ vpermd m14, m18, m14
+ mova [dstq+xq], xm14
+ paddd m4, m5
+ add xd, 16
+ cmp xd, dst_wd
+ jl .loop_x
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
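
As a reading aid for the loop above: m8 starts out as mx + dx*[0-15] in 14-bit fixed point, the filter phase is taken from bits 8-13 (psrad by 8, pand with pd_63) and the integer source column from bits 14 and up, with the pmaxsd/pminsd pair clamping the position to [0, src_w-8] for the edge-emulation path. A hedged scalar sketch of that index math (names are illustrative, not dav1d API):

    /* Per output pixel x: split the fixed-point position into a source
     * column and one of 64 filter phases, as the psrad/pand above do. */
    static inline void resize_pos(int mx0, int dx, int x,
                                  int *src_x, int *filter)
    {
        int pos = mx0 + x * dx;   /* 14-bit fixed point */
        *src_x  = pos >> 14;
        *filter = (pos >> 8) & 63;
    }
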
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/mc_sse.asm b/third_party/dav1d/src/x86/mc_sse.asm
new file mode 100644
index 0000000000..54939c647a
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc_sse.asm
@@ -0,0 +1,9599 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2018, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+; dav1d_obmc_masks[] with 64-x interleaved
+obmc_masks: db 0, 0, 0, 0
+ ; 2 @4
+ db 45, 19, 64, 0
+ ; 4 @8
+ db 39, 25, 50, 14, 59, 5, 64, 0
+ ; 8 @16
+ db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
+ ; 16 @32
+ db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+ db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
+ ; 32 @64
+ db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+ db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
+ db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
+
+warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
+warp_8x8_shufB: db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
+warp_8x8_shufC: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
+warp_8x8_shufD: db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
+ db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
+subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+rescale_mul: dd 0, 1, 2, 3
+resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
+
+wm_420_sign: times 4 dw 258
+ times 4 dw 257
+wm_422_sign: times 8 db 128
+ times 8 db 127
+
+pb_8x0_8x8: times 8 db 0
+ times 8 db 8
+bdct_lb_dw: times 4 db 0
+ times 4 db 4
+ times 4 db 8
+ times 4 db 12
+
+pb_64: times 16 db 64
+pw_m256: times 8 dw -256
+pw_1: times 8 dw 1
+pw_2: times 8 dw 2
+pw_8: times 8 dw 8
+pw_15: times 8 dw 15
+pw_26: times 8 dw 26
+pw_34: times 8 dw 34
+pw_512: times 8 dw 512
+pw_1024: times 8 dw 1024
+pw_2048: times 8 dw 2048
+pw_6903: times 8 dw 6903
+pw_8192: times 8 dw 8192
+pd_32: times 4 dd 32
+pd_63: times 4 dd 63
+pd_512: times 4 dd 512
+pd_16384: times 4 dd 16384
+pd_32768: times 4 dd 32768
+pd_262144:times 4 dd 262144
+pd_0x3ff: times 4 dd 0x3ff
+pd_0x4000:times 4 dd 0x4000
+pq_0x40000000: times 2 dq 0x40000000
+
+const mc_warp_filter2 ; dav1d_mc_warp_filter[] reordered for pmaddubsw usage
+ ; [-1, 0)
+ db 0, 127, 0, 0, 0, 1, 0, 0, 0, 127, 0, 0, -1, 2, 0, 0
+ db 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1, 0
+ db 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1, 0
+ db 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1, 0
+ db 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1, 0
+ db 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2, 0
+ db 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2, 0
+ db 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2, 0
+ db 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3, 0
+ db 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3, 0
+ db 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3, 0
+ db 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4, 0
+ db 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4, 0
+ db 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4, 0
+ db 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4, 0
+ db 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4, 0
+ db 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4, 0
+ db 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4, 0
+ db 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4, 0
+ db 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4, 0
+ db 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4, 0
+ db 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4, 0
+ db 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4, 0
+ db 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3, 0
+ db 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3, 0
+ db 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3, 0
+ db 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2, 0
+ db 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2, 0
+ db 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2, 0
+ db 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1, 0
+ db 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1, 0
+ db 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0, 0
+ ; [0, 1)
+ db 0, 0, 1, 0, 0, 127, 0, 0, 0, -1, 2, 0, 0, 127, 0, 0
+ db 0, -3, 4, 1, 1, 127, -2, 0, 0, -5, 6, 1, 1, 127, -2, 0
+ db 0, -6, 8, 1, 2, 126, -3, 0, -1, -7, 11, 2, 2, 126, -4, -1
+ db -1, -8, 13, 2, 3, 125, -5, -1, -1, -10, 16, 3, 3, 124, -6, -1
+ db -1, -11, 18, 3, 4, 123, -7, -1, -1, -12, 20, 3, 4, 122, -7, -1
+ db -1, -13, 23, 3, 4, 121, -8, -1, -2, -14, 25, 4, 5, 120, -9, -1
+ db -1, -15, 27, 4, 5, 119, -10, -1, -1, -16, 30, 4, 5, 118, -11, -1
+ db -2, -17, 33, 5, 6, 116, -12, -1, -2, -17, 35, 5, 6, 114, -12, -1
+ db -2, -18, 38, 5, 6, 113, -13, -1, -2, -19, 41, 6, 7, 111, -14, -2
+ db -2, -19, 43, 6, 7, 110, -15, -2, -2, -20, 46, 6, 7, 108, -15, -2
+ db -2, -20, 49, 6, 7, 106, -16, -2, -2, -21, 51, 7, 7, 104, -16, -2
+ db -2, -21, 54, 7, 7, 102, -17, -2, -2, -21, 56, 7, 8, 100, -18, -2
+ db -2, -22, 59, 7, 8, 98, -18, -2, -2, -22, 62, 7, 8, 96, -19, -2
+ db -2, -22, 64, 7, 8, 94, -19, -2, -2, -22, 67, 8, 8, 91, -20, -2
+ db -2, -22, 69, 8, 8, 89, -20, -2, -2, -22, 72, 8, 8, 87, -21, -2
+ db -2, -21, 74, 8, 8, 84, -21, -2, -2, -22, 77, 8, 8, 82, -21, -2
+ db -2, -21, 79, 8, 8, 79, -21, -2, -2, -21, 82, 8, 8, 77, -22, -2
+ db -2, -21, 84, 8, 8, 74, -21, -2, -2, -21, 87, 8, 8, 72, -22, -2
+ db -2, -20, 89, 8, 8, 69, -22, -2, -2, -20, 91, 8, 8, 67, -22, -2
+ db -2, -19, 94, 8, 7, 64, -22, -2, -2, -19, 96, 8, 7, 62, -22, -2
+ db -2, -18, 98, 8, 7, 59, -22, -2, -2, -18, 100, 8, 7, 56, -21, -2
+ db -2, -17, 102, 7, 7, 54, -21, -2, -2, -16, 104, 7, 7, 51, -21, -2
+ db -2, -16, 106, 7, 6, 49, -20, -2, -2, -15, 108, 7, 6, 46, -20, -2
+ db -2, -15, 110, 7, 6, 43, -19, -2, -2, -14, 111, 7, 6, 41, -19, -2
+ db -1, -13, 113, 6, 5, 38, -18, -2, -1, -12, 114, 6, 5, 35, -17, -2
+ db -1, -12, 116, 6, 5, 33, -17, -2, -1, -11, 118, 5, 4, 30, -16, -1
+ db -1, -10, 119, 5, 4, 27, -15, -1, -1, -9, 120, 5, 4, 25, -14, -2
+ db -1, -8, 121, 4, 3, 23, -13, -1, -1, -7, 122, 4, 3, 20, -12, -1
+ db -1, -7, 123, 4, 3, 18, -11, -1, -1, -6, 124, 3, 3, 16, -10, -1
+ db -1, -5, 125, 3, 2, 13, -8, -1, -1, -4, 126, 2, 2, 11, -7, -1
+ db 0, -3, 126, 2, 1, 8, -6, 0, 0, -2, 127, 1, 1, 6, -5, 0
+ db 0, -2, 127, 1, 1, 4, -3, 0, 0, 0, 127, 0, 0, 2, -1, 0
+ ; [1, 2)
+ db 0, 0, 127, 0, 0, 1, 0, 0, 0, 0, 127, 0, 0, -1, 2, 0
+ db 0, 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1
+ db 0, 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1
+ db 0, 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1
+ db 0, 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1
+ db 0, 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2
+ db 0, 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2
+ db 0, 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2
+ db 0, 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3
+ db 0, 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3
+ db 0, 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3
+ db 0, 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4
+ db 0, 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4
+ db 0, 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4
+ db 0, 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4
+ db 0, 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4
+ db 0, 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4
+ db 0, 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4
+ db 0, 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4
+ db 0, 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4
+ db 0, 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4
+ db 0, 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4
+ db 0, 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4
+ db 0, 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3
+ db 0, 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3
+ db 0, 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3
+ db 0, 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2
+ db 0, 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2
+ db 0, 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2
+ db 0, 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1
+ db 0, 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1
+ db 0, 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0
+ db 0, 0, 2, -1, 0, 0, 127, 0
+
+pw_258: times 2 dw 258
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
+%macro BIDIR_JMP_TABLE 2-*
+ ; evaluated at definition time (in loop below)
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+ ; dynamically generated label
+ %%table:
+ %rep %0 - 2 ; repeat for num args
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 16, 16, 16
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_8bpc_sse2.prep)
+%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_8bpc_ssse3.put)
+%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_8bpc_ssse3.prep)
+
+BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+HV_JMP_TABLE prep, 8tap, sse2, 1, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, sse2, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128
+
+%macro SCALED_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
+%%table:
+ %rep %0 - 2
+ dw %%base %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_1024:
+ %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy1_w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_2048:
+ %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy2_w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+SECTION .text
+
+INIT_XMM ssse3
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1
+ %define base t0-put_ssse3
+%else
+ DECLARE_REG_TMP 7
+ %define base 0
+%endif
+
+%macro RESTORE_DSQ_32 1
+ %if ARCH_X86_32
+ mov %1, dsm ; restore dsq
+ %endif
+%endmacro
+
+cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy
+ movifnidn mxyd, r6m ; mx
+ LEA t0, put_ssse3
+ movifnidn srcq, srcmp
+ movifnidn ssq, ssmp
+ tzcnt wd, wm
+ mov hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [t0+wq*2+table_offset(put,)]
+ add wq, t0
+ RESTORE_DSQ_32 t0
+ jmp wq
+.put_w2:
+ movzx r4d, word [srcq+ssq*0]
+ movzx r6d, word [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r4w
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r4d, [srcq+ssq*0]
+ mov r6d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r4d
+ mov [dstq+dsq*1], r6d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ movq m0, [srcq+ssq*0]
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq [dstq+dsq*0], m0
+ movq [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu m0, [srcq+ssq*0+16*0]
+ movu m1, [srcq+ssq*0+16*1]
+ movu m2, [srcq+ssq*1+16*0]
+ movu m3, [srcq+ssq*1+16*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+16*0], m0
+ mova [dstq+dsq*0+16*1], m1
+ mova [dstq+dsq*1+16*0], m2
+ mova [dstq+dsq*1+16*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ add srcq, ssq
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ mova [dstq+16*6], m2
+ mova [dstq+16*7], m3
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h:
+ ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+ ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
+ imul mxyd, 0x00ff00ff
+ mova m4, [base+bilin_h_shuf8]
+ mova m0, [base+bilin_h_shuf4]
+ add mxyd, 0x00100010
+ movd m5, mxyd
+ mov mxyd, r7m ; my
+ pshufd m5, m5, q0000
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)]
+ mova m3, [base+pw_2048]
+ add wq, t0
+ movifnidn dsq, dsmp
+ jmp wq
+.h_w2:
+ pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
+.h_w2_loop:
+ movd m0, [srcq+ssq*0]
+ movd m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpckldq m0, m1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ pmulhrsw m0, m3
+ packuswb m0, m0
+ movd r6d, m0
+ mov [dstq+dsq*0], r6w
+ shr r6d, 16
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movq m4, [srcq+ssq*0]
+ movhps m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m4, m0
+ pmaddubsw m4, m5
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movd [dstq+dsq*0], m4
+ psrlq m4, 32
+ movd [dstq+dsq*1], m4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ add srcq, ssq
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w16
+ RET
+.h_w32:
+ movu m0, [srcq+mmsize*0+8*0]
+ movu m1, [srcq+mmsize*0+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ movu m1, [srcq+mmsize*1+8*0]
+ movu m2, [srcq+mmsize*1+8*1]
+ add srcq, ssq
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w32
+ RET
+.h_w64:
+ mov r6, -16*3
+.h_w64_loop:
+ movu m0, [srcq+r6+16*3+8*0]
+ movu m1, [srcq+r6+16*3+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+r6+16*3], m0
+ add r6, 16
+ jle .h_w64_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ mov r6, -16*7
+.h_w128_loop:
+ movu m0, [srcq+r6+16*7+8*0]
+ movu m1, [srcq+r6+16*7+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+r6+16*7], m0
+ add r6, 16
+ jle .h_w128_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)]
+ imul mxyd, 0x00ff00ff
+ mova m5, [base+pw_2048]
+ add mxyd, 0x00100010
+ add wq, t0
+ movd m4, mxyd
+ pshufd m4, m4, q0000
+ movifnidn dsq, dsmp
+ jmp wq
+.v_w2:
+ movd m0, [srcq+ssq*0]
+.v_w2_loop:
+ pinsrw m0, [srcq+ssq*1], 1 ; 0 1
+ lea srcq, [srcq+ssq*2]
+ pshuflw m1, m0, q2301
+ pinsrw m0, [srcq+ssq*0], 0 ; 2 1
+ punpcklbw m1, m0
+ pmaddubsw m1, m4
+ pmulhrsw m1, m5
+ packuswb m1, m1
+ movd r6d, m1
+ mov [dstq+dsq*1], r6w
+ shr r6d, 16
+ mov [dstq+dsq*0], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd m0, [srcq+ssq*0]
+.v_w4_loop:
+ movd m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m1, m0
+ movd m0, [srcq+ssq*0]
+ punpckldq m1, m2 ; 0 1
+ punpckldq m2, m0 ; 1 2
+ punpcklbw m1, m2
+ pmaddubsw m1, m4
+ pmulhrsw m1, m5
+ packuswb m1, m1
+ movd [dstq+dsq*0], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ ;
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq m0, [srcq+ssq*0]
+.v_w8_loop:
+ movq m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m1, m0
+ movq m0, [srcq+ssq*0]
+ punpcklbw m1, m2
+ punpcklbw m2, m0
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+%macro PUT_BILIN_V_W16 0
+ movu m0, [srcq+ssq*0]
+%%loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m1, m0
+ mova m2, m0
+ movu m0, [srcq+ssq*0]
+ punpcklbw m1, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ pmaddubsw m2, m4
+ pmaddubsw m3, m4
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+ packuswb m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg %%loop
+%endmacro
+.v_w16:
+ PUT_BILIN_V_W16
+ RET
+.v_w128:
+ lea r6d, [hq+(7<<16)]
+ jmp .v_w16gt
+.v_w64:
+ lea r6d, [hq+(3<<16)]
+ jmp .v_w16gt
+.v_w32:
+ lea r6d, [hq+(1<<16)]
+.v_w16gt:
+ mov r4, srcq
+%if ARCH_X86_64
+ mov r7, dstq
+%endif
+.v_w16gt_loop:
+ PUT_BILIN_V_W16
+%if ARCH_X86_64
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+%else
+ mov dstq, dstmp
+ add r4, 16
+ movzx hd, r6w
+ add dstq, 16
+ mov srcq, r4
+ mov dstmp, dstq
+%endif
+ sub r6d, 1<<16
+ jg .v_w16gt
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+ ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
+ movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11 ; can't shift by 12 due to signed overflow
+ mova m7, [base+pw_15]
+ movd m6, mxyd
+ add wq, t0
+ pshuflw m6, m6, q0000
+ paddb m5, m5
+ punpcklqdq m6, m6
+ jmp wq
+.hv_w2:
+ RESTORE_DSQ_32 t0
+ movd m0, [srcq+ssq*0]
+ punpckldq m0, m0
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w2_loop:
+ movd m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movd m2, [srcq+ssq*0]
+ punpckldq m1, m2
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 _ 2 _
+ shufps m2, m0, m1, q1032 ; 0 _ 1 _
+ mova m0, m1
+ psubw m1, m2 ; 2 * (src[x + src_stride] - src[x])
+ pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x])) >> 4
+ pavgw m2, m7 ; src[x] + 8
+ paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8
+ psrlw m1, 4
+ packuswb m1, m1
+%if ARCH_X86_64
+ movq r6, m1
+%else
+ pshuflw m1, m1, q2020
+ movd r6d, m1
+%endif
+ mov [dstq+dsq*0], r6w
+ shr r6, gprsize*4
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova m4, [base+bilin_h_shuf4]
+ movddup m0, [srcq+ssq*0]
+ movifnidn dsq, dsmp
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w4_loop:
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*0]
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2
+ shufps m2, m0, m1, q1032 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhw m1, m6
+ pavgw m2, m7
+ paddw m1, m2
+ psrlw m1, 4
+ packuswb m1, m1
+ movd [dstq+dsq*0], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ movu m0, [srcq+ssq*0]
+ movifnidn dsq, dsmp
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m1, m2, m0
+ pmulhw m1, m6
+ pavgw m0, m7
+ paddw m1, m0
+ movu m0, [srcq+ssq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ psubw m3, m0, m2
+ pmulhw m3, m6
+ pavgw m2, m7
+ paddw m3, m2
+ psrlw m1, 4
+ psrlw m3, 4
+ packuswb m1, m3
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w128:
+ lea r6d, [hq+(7<<16)]
+ jmp .hv_w16_start
+.hv_w64:
+ lea r6d, [hq+(3<<16)]
+ jmp .hv_w16_start
+.hv_w32:
+ lea r6d, [hq+(1<<16)]
+.hv_w16_start:
+ mov r4, srcq
+%if ARCH_X86_32
+ %define m8 [dstq]
+%else
+ mov r7, dstq
+%endif
+.hv_w16:
+ movifnidn dsq, dsmp
+%if WIN64
+ movaps r4m, m8
+%endif
+.hv_w16_loop0:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w16_loop:
+ add srcq, ssq
+ movu m2, [srcq+8*0]
+ movu m3, [srcq+8*1]
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova m8, m2
+ psubw m2, m0
+ pmulhw m2, m6
+ pavgw m0, m7
+ paddw m2, m0
+ mova m0, m3
+ psubw m3, m1
+ pmulhw m3, m6
+ pavgw m1, m7
+ paddw m3, m1
+ mova m1, m0
+ mova m0, m8
+ psrlw m2, 4
+ psrlw m3, 4
+ packuswb m2, m3
+ mova [dstq], m2
+ add dstq, dsmp
+ dec hd
+ jg .hv_w16_loop
+%if ARCH_X86_32
+ mov dstq, dstm
+ add r4, 16
+ movzx hd, r6w
+ add dstq, 16
+ mov srcq, r4
+ mov dstm, dstq
+%else
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+%endif
+ sub r6d, 1<<16
+ jg .hv_w16_loop0
+%if WIN64
+ movaps m8, r4m
+%endif
+ RET
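
A scalar sketch of the .hv formula quoted in the comments above (both horizontal and vertical taps active, mx/my in 1..15): it follows the first comment line, whereas the asm evaluates the refactored second line, so intermediate rounding differs slightly. Names and types are illustrative.

    #include <stdint.h>

    /* One bilinear output pixel: the horizontal pass keeps 4 extra bits
     * of precision, the vertical pass blends two such rows and rounds. */
    static inline uint8_t bilin_hv_ref(const uint8_t *src, int stride,
                                       int mx, int my)
    {
        int top = (16 - mx) * src[0]      + mx * src[1];
        int bot = (16 - mx) * src[stride] + mx * src[stride + 1];
        return (uint8_t)(((16 - my) * top + my * bot + 128) >> 8);
    }
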
+
+%macro PSHUFB_BILIN_H8 2 ; dst, src
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ psrldq %2, %1, 1
+ punpcklbw %1, %2
+ %endif
+%endmacro
+
+%macro PSHUFB_BILIN_H4 3 ; dst, src, tmp
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ psrldq %2, %1, 1
+ punpckhbw %3, %1, %2
+ punpcklbw %1, %2
+ punpcklqdq %1, %3
+ %endif
+%endmacro
+
+%macro PMADDUBSW 5 ; dst/src1, src2, zero, tmp, reset_zero
+ %if cpuflag(ssse3)
+ pmaddubsw %1, %2
+ %else
+ %if %5 == 1
+ pxor %3, %3
+ %endif
+ punpckhbw %4, %1, %3
+ punpcklbw %1, %1, %3
+ pmaddwd %4, %2
+ pmaddwd %1, %2
+ packssdw %1, %4
+ %endif
+%endmacro
+
+%macro PMULHRSW 5 ; dst, src, tmp, rndval, shift
+ %if cpuflag(ssse3)
+ pmulhrsw %1, %2
+ %else
+ punpckhwd %3, %1, %4
+ punpcklwd %1, %4
+ pmaddwd %3, %2
+ pmaddwd %1, %2
+ psrad %3, %5
+ psrad %1, %5
+ packssdw %1, %3
+ %endif
+%endmacro
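
For readers less familiar with the SSSE3 instruction being emulated here: pmulhrsw performs a signed multiply with rounding into the high half, which the SSE2 branch reproduces by widening through pmaddwd with a caller-supplied rounding constant and shift. Its per-lane behaviour is sketched below (the -32768 * -32768 corner case relies on the usual two's-complement wraparound in the cast).

    #include <stdint.h>

    /* pmulhrsw per 16-bit lane: (x * y + 0x4000) >> 15, i.e. a rounded
     * fixed-point multiply by y/32768. */
    static inline int16_t pmulhrsw_lane(int16_t x, int16_t y)
    {
        return (int16_t)(((int32_t)x * y + 0x4000) >> 15);
    }
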
+
+%macro PREP_BILIN 0
+%if ARCH_X86_32
+ %define base r6-prep%+SUFFIX
+%else
+ %define base 0
+%endif
+
+cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ LEA r6, prep%+SUFFIX
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+%if notcpuflag(ssse3)
+ add r6, prep_ssse3 - prep_sse2
+ jmp prep_ssse3
+%else
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ pxor m4, m4
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movd m0, [srcq+strideq*0]
+ movd m1, [srcq+strideq*1]
+ movd m2, [srcq+strideq*2]
+ movd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ punpckldq m0, m1
+ punpckldq m2, m3
+ punpcklbw m0, m4
+ punpcklbw m2, m4
+ psllw m0, 4
+ psllw m2, 4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movq m0, [srcq+strideq*0]
+ movq m1, [srcq+strideq*1]
+ movq m2, [srcq+strideq*2]
+ movq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movu m1, [srcq+strideq*0]
+ movu m3, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 2
+ jg .prep_w16
+ RET
+.prep_w128:
+ mov r3, -128
+ jmp .prep_w32_start
+.prep_w64:
+ mov r3, -64
+ jmp .prep_w32_start
+.prep_w32:
+ mov r3, -32
+.prep_w32_start:
+ sub srcq, r3
+.prep_w32_vloop:
+ mov r6, r3
+.prep_w32_hloop:
+ movu m1, [srcq+r6+16*0]
+ movu m3, [srcq+r6+16*1]
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ add r6, 32
+ jl .prep_w32_hloop
+ add srcq, strideq
+ dec hd
+ jg .prep_w32_vloop
+ RET
+%endif
+.h:
+ ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+ ; = (16 - mx) * src[x] + mx * src[x + 1]
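+ ; scalar sketch of what gets stored: tmp[x] = (16 - mx)*src[x] + mx*src[x+1]
+ ; (kept at 16*pixel scale, no rounding here). the SSSE3 build packs
+ ; (16-mx, mx) as byte pairs for pmaddubsw, the SSE2 build as word pairs for
+ ; the pmaddwd-based PMADDUBSW fallback.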
+%if cpuflag(ssse3)
+ imul mxyd, 0x00ff00ff
+ mova m4, [base+bilin_h_shuf8]
+ add mxyd, 0x00100010
+%else
+ imul mxyd, 0xffff
+ add mxyd, 16
+%endif
+ movd m5, mxyd
+ mov mxyd, r6m ; my
+ pshufd m5, m5, q0000
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+%if notcpuflag(ssse3)
+ WIN64_SPILL_XMM 8
+ pxor m6, m6
+%endif
+ add wq, r6
+ jmp wq
+.h_w4:
+%if cpuflag(ssse3)
+ mova m4, [base+bilin_h_shuf4]
+%endif
+ lea stride3q, [strideq*3]
+.h_w4_loop:
+ movq m0, [srcq+strideq*0]
+ movhps m0, [srcq+strideq*1]
+ movq m1, [srcq+strideq*2]
+ movhps m1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ PSHUFB_BILIN_H4 m0, m4, m2
+ PMADDUBSW m0, m5, m6, m2, 0
+ PSHUFB_BILIN_H4 m1, m4, m2
+ PMADDUBSW m1, m5, m6, m2, 0
+ mova [tmpq+0 ], m0
+ mova [tmpq+16], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ lea stride3q, [strideq*3]
+.h_w8_loop:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*2]
+ movu m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ movu m0, [srcq+strideq*0+8*0]
+ movu m1, [srcq+strideq*0+8*1]
+ movu m2, [srcq+strideq*1+8*0]
+ movu m3, [srcq+strideq*1+8*1]
+ lea srcq, [srcq+strideq*2]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w128:
+ mov r3, -128
+ jmp .h_w32_start
+.h_w64:
+ mov r3, -64
+ jmp .h_w32_start
+.h_w32:
+ mov r3, -32
+.h_w32_start:
+ sub srcq, r3
+.h_w32_vloop:
+ mov r6, r3
+.h_w32_hloop:
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ movu m2, [srcq+r6+8*2]
+ movu m3, [srcq+r6+8*3]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ add r6, 32
+ jl .h_w32_hloop
+ add srcq, strideq
+ dec hd
+ jg .h_w32_vloop
+ RET
+.v:
+%if notcpuflag(ssse3)
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 8
+%endif
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+%if cpuflag(ssse3)
+ imul mxyd, 0x00ff00ff
+ add mxyd, 0x00100010
+%else
+ imul mxyd, 0xffff
+ pxor m6, m6
+ add mxyd, 16
+%endif
+ add wq, r6
+ lea stride3q, [strideq*3]
+ movd m5, mxyd
+ pshufd m5, m5, q0000
+ jmp wq
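+ ; vertical-only bilin: tmp[x] = (16 - my)*src[x] + my*src[x + stride]; rows
+ ; are byte-interleaved with punpcklbw so each pmaddubsw (or its fallback)
+ ; applies the (16-my, my) pair to a vertically adjacent pixel pair.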
+.v_w4:
+ movd m0, [srcq+strideq*0]
+.v_w4_loop:
+ movd m1, [srcq+strideq*1]
+ movd m2, [srcq+strideq*2]
+ movd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ punpckldq m0, m1
+ punpckldq m1, m2
+ punpcklbw m0, m1 ; 01 12
+ PMADDUBSW m0, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ movd m0, [srcq+strideq*0]
+ punpckldq m2, m3
+ punpckldq m3, m0
+ punpcklbw m2, m3 ; 23 34
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq m0, [srcq+strideq*0]
+.v_w8_loop:
+ movq m1, [srcq+strideq*1]
+ movq m2, [srcq+strideq*2]
+ movq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ punpcklbw m0, m1 ; 01
+ punpcklbw m1, m2 ; 12
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ movq m0, [srcq+strideq*0]
+ punpcklbw m2, m3 ; 23
+ punpcklbw m3, m0 ; 34
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq+16*1], m1
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu m0, [srcq+strideq*0]
+.v_w16_loop:
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*2]
+ movu m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ punpcklbw m4, m0, m1
+ punpckhbw m0, m1
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m0, m5, m6, m7, 0
+ mova [tmpq+16*0], m4
+ punpcklbw m4, m1, m2
+ punpckhbw m1, m2
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*1], m0
+ movu m0, [srcq+strideq*0]
+ PMADDUBSW m1, m5, m6, m7, 0
+ mova [tmpq+16*2], m4
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*3], m1
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq+16*4], m4
+ punpcklbw m4, m3, m0
+ punpckhbw m3, m0
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*5], m2
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*6], m4
+ mova [tmpq+16*7], m3
+ add tmpq, 16*8
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w128:
+ lea r3d, [hq+(3<<8)]
+ mov r6d, 256
+ jmp .v_w32_start
+.v_w64:
+ lea r3d, [hq+(1<<8)]
+ mov r6d, 128
+ jmp .v_w32_start
+.v_w32:
+ xor r3d, r3d
+ mov r6d, 64
+.v_w32_start:
+%if ARCH_X86_64
+ %if WIN64
+ PUSH r7
+ %endif
+ mov r7, tmpq
+%endif
+ mov r5, srcq
+.v_w32_hloop:
+ movu m0, [srcq+strideq*0+16*0]
+ movu m1, [srcq+strideq*0+16*1]
+.v_w32_vloop:
+ movu m2, [srcq+strideq*1+16*0]
+ movu m3, [srcq+strideq*1+16*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m0, m2
+ punpckhbw m0, m2
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m0, m5, m6, m7, 0
+ mova [tmpq+16*0], m4
+ mova [tmpq+16*1], m0
+ movu m0, [srcq+strideq*0+16*0]
+ punpcklbw m4, m1, m3
+ punpckhbw m1, m3
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ mova [tmpq+16*2], m4
+ mova [tmpq+16*3], m1
+ movu m1, [srcq+strideq*0+16*1]
+ add tmpq, r6
+ punpcklbw m4, m2, m0
+ punpckhbw m2, m0
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq+16*0], m4
+ mova [tmpq+16*1], m2
+ punpcklbw m4, m3, m1
+ punpckhbw m3, m1
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*2], m4
+ mova [tmpq+16*3], m3
+ add tmpq, r6
+ sub hd, 2
+ jg .v_w32_vloop
+ add r5, 32
+ movzx hd, r3b
+ mov srcq, r5
+%if ARCH_X86_64
+ add r7, 16*4
+ mov tmpq, r7
+%else
+ mov tmpq, tmpmp
+ add tmpq, 16*4
+ mov tmpmp, tmpq
+%endif
+ sub r3d, 1<<8
+ jg .v_w32_hloop
+%if WIN64
+ POP r7
+%endif
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+ ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
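+ ; scalar sketch with a = h-filtered row y, b = h-filtered row y+1:
+ ;   tmp[x] = a[x] + ((my * (b[x] - a[x]) + 8) >> 4)
+ ; SSSE3 gets the "* my + 8, >> 4" step from pmulhrsw against my << 11; SSE2
+ ; uses the PMULHRSW fallback (pmaddwd against (my, 1) word pairs with pw_8).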
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
+%assign stack_offset stack_offset - stack_size_padded
+%if cpuflag(ssse3)
+ imul mxyd, 0x08000800
+ WIN64_SPILL_XMM 8
+%else
+ or mxyd, 1<<16
+ WIN64_SPILL_XMM 9
+ %if ARCH_X86_64
+ mova m8, [base+pw_8]
+ %else
+ %define m8 [base+pw_8]
+ %endif
+ pxor m7, m7
+%endif
+ movd m6, mxyd
+ add wq, r6
+ pshufd m6, m6, q0000
+ jmp wq
+.hv_w4:
+%if cpuflag(ssse3)
+ mova m4, [base+bilin_h_shuf4]
+ movddup m0, [srcq+strideq*0]
+%else
+ movhps m0, [srcq+strideq*0]
+%endif
+ lea r3, [strideq*3]
+ PSHUFB_BILIN_H4 m0, m4, m3
+ PMADDUBSW m0, m5, m7, m4, 0 ; _ 0
+.hv_w4_loop:
+ movq m1, [srcq+strideq*1]
+ movhps m1, [srcq+strideq*2]
+ movq m2, [srcq+r3 ]
+ lea srcq, [srcq+strideq*4]
+ movhps m2, [srcq+strideq*0]
+ PSHUFB_BILIN_H4 m1, m4, m3
+ PSHUFB_BILIN_H4 m2, m4, m3
+ PMADDUBSW m1, m5, m7, m4, 0 ; 1 2
+ PMADDUBSW m2, m5, m7, m4, 0 ; 3 4
+ shufpd m0, m1, 0x01 ; 0 1
+ shufpd m3, m1, m2, 0x01 ; 2 3
+ psubw m1, m0
+ PMULHRSW m1, m6, m4, m8, 4
+ paddw m1, m0
+ mova m0, m2
+ psubw m2, m3
+ PMULHRSW m2, m6, m4, m8, 4
+ paddw m2, m3
+ mova [tmpq+16*0], m1
+ mova [tmpq+16*1], m2
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ movu m0, [srcq+strideq*0]
+ PSHUFB_BILIN_H8 m0, m4
+ PMADDUBSW m0, m5, m7, m4, 0 ; 0
+.hv_w8_loop:
+ movu m1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu m2, [srcq+strideq*0]
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PMADDUBSW m1, m5, m7, m4, 0 ; 1
+ PMADDUBSW m2, m5, m7, m4, 0 ; 2
+ psubw m3, m1, m0
+ PMULHRSW m3, m6, m4, m8, 4
+ paddw m3, m0
+ mova m0, m2
+ psubw m2, m1
+ PMULHRSW m2, m6, m4, m8, 4
+ paddw m2, m1
+ mova [tmpq+16*0], m3
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w128:
+ lea r3d, [hq+(7<<8)]
+ mov r5d, 256
+ jmp .hv_w16_start
+.hv_w64:
+ lea r3d, [hq+(3<<8)]
+ mov r5d, 128
+ jmp .hv_w16_start
+.hv_w32:
+ lea r3d, [hq+(1<<8)]
+ mov r5d, 64
+ jmp .hv_w16_start
+.hv_w16:
+ xor r3d, r3d
+ mov r5d, 32
+.hv_w16_start:
+%if ARCH_X86_64 || cpuflag(ssse3)
+ mov r6, srcq
+%endif
+%if ARCH_X86_64
+ %if WIN64
+ PUSH r7
+ %endif
+ mov r7, tmpq
+%endif
+.hv_w16_hloop:
+ movu m0, [srcq+strideq*0+8*0]
+ movu m1, [srcq+strideq*0+8*1]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PMADDUBSW m0, m5, m7, m4, 0 ; 0a
+ PMADDUBSW m1, m5, m7, m4, 0 ; 0b
+.hv_w16_vloop:
+ movu m2, [srcq+strideq*1+8*0]
+ PSHUFB_BILIN_H8 m2, m4
+ PMADDUBSW m2, m5, m7, m4, 0 ; 1a
+ psubw m3, m2, m0
+ PMULHRSW m3, m6, m4, m8, 4
+ paddw m3, m0
+ mova [tmpq+16*0], m3
+ movu m3, [srcq+strideq*1+8*1]
+ lea srcq, [srcq+strideq*2]
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m3, m5, m7, m4, 0 ; 1b
+ psubw m0, m3, m1
+ PMULHRSW m0, m6, m4, m8, 4
+ paddw m0, m1
+ mova [tmpq+16*1], m0
+ add tmpq, r5
+ movu m0, [srcq+strideq*0+8*0]
+ PSHUFB_BILIN_H8 m0, m4
+ PMADDUBSW m0, m5, m7, m4, 0 ; 2a
+ psubw m1, m0, m2
+ PMULHRSW m1, m6, m4, m8, 4
+ paddw m1, m2
+ mova [tmpq+16*0], m1
+ movu m1, [srcq+strideq*0+8*1]
+ PSHUFB_BILIN_H8 m1, m4
+ PMADDUBSW m1, m5, m7, m4, 0 ; 2b
+ psubw m2, m1, m3
+ PMULHRSW m2, m6, m4, m8, 4
+ paddw m2, m3
+ mova [tmpq+16*1], m2
+ add tmpq, r5
+ sub hd, 2
+ jg .hv_w16_vloop
+ movzx hd, r3b
+%if ARCH_X86_64
+ add r6, 16
+ add r7, 2*16
+ mov srcq, r6
+ mov tmpq, r7
+%elif cpuflag(ssse3)
+ mov tmpq, tmpm
+ add r6, 16
+ add tmpq, 2*16
+ mov srcq, r6
+ mov tmpm, tmpq
+%else
+ mov srcq, srcm
+ mov tmpq, tmpm
+ add srcq, 16
+ add tmpq, 2*16
+ mov srcm, srcq
+ mov tmpm, tmpq
+%endif
+ sub r3d, 1<<8
+ jg .hv_w16_hloop
+%if WIN64
+ POP r7
+%endif
+ RET
+%endmacro
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
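+ ; each FILTER_* constant packs two base offsets into the coefficient table
+ ; above: bits 16+ select the 8-tap filter set used by the wide paths, the
+ ; low bits the 4-tap set used for w <= 4 (note SHARP reuses 3*15, i.e. the
+ ; same 4-tap set as REGULAR). mx/my are multiplied by 0x010101 so a single
+ ; add of t0d/t1d yields both per-fraction row indices at once (see the
+ ; "8tap_h, mx, 4tap_h" comments in the function bodies).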
+
+%macro FN 4 ; prefix, type, type_h, type_v
+cglobal %1_%2_8bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+%endif
+%endmacro
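+ ; FN emits a tiny entry point per filter combination (e.g.
+ ; put_8tap_sharp_8bpc) that only loads the two FILTER_* constants into
+ ; t0d/t1d and then jumps to the shared %1_8bpc body; the regular/regular
+ ; variant is listed last so it can simply fall through instead of jumping.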
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 1, 2
+%elif WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+FN put_8tap, sharp, SHARP, SHARP
+FN put_8tap, sharp_smooth, SHARP, SMOOTH
+FN put_8tap, smooth_sharp, SMOOTH, SHARP
+FN put_8tap, smooth, SMOOTH, SMOOTH
+FN put_8tap, sharp_regular, SHARP, REGULAR
+FN put_8tap, regular_sharp, REGULAR, SHARP
+FN put_8tap, smooth_regular, SMOOTH, REGULAR
+FN put_8tap, regular_smooth, REGULAR, SMOOTH
+FN put_8tap, regular, REGULAR, REGULAR
+
+%if ARCH_X86_32
+ %define base_reg r1
+ %define base base_reg-put_ssse3
+%else
+ %define base_reg r8
+ %define base 0
+%endif
+
+cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+%assign org_stack_offset stack_offset
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+%if ARCH_X86_64
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+%else
+ imul ssd, mym, 0x010101
+ add ssd, t1d ; 8tap_v, my, 4tap_v
+ mov srcq, srcm
+%endif
+ mov wd, wm
+ movifnidn hd, hm
+ LEA base_reg, put_ssse3
+ test mxd, 0xf00
+ jnz .h
+%if ARCH_X86_32
+ test ssd, 0xf00
+%else
+ test myd, 0xf00
+%endif
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [base_reg+wq*2+table_offset(put,)]
+ add wq, base_reg
+; put_bilin mangling jump
+%assign stack_offset org_stack_offset
+ movifnidn dsq, dsmp
+ movifnidn ssq, ssmp
+%if WIN64
+ pop r8
+%endif
+ lea r6, [ssq*3]
+ jmp wq
+.h:
+%if ARCH_X86_32
+ test ssd, 0xf00
+%else
+ test myd, 0xf00
+%endif
+ jnz .hv
+ movifnidn ssq, ssmp
+ WIN64_SPILL_XMM 12
+ cmp wd, 4
+ jl .h_w2
+ je .h_w4
+ tzcnt wd, wd
+%if ARCH_X86_64
+ mova m10, [base+subpel_h_shufA]
+ mova m11, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+%endif
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)]
+ movq m6, [base_reg+mxq*8+subpel_filters-put_ssse3]
+ mova m7, [base+pw_34] ; 2 + (8 << 2)
+ pshufd m5, m6, q0000
+ pshufd m6, m6, q1111
+ add wq, base_reg
+ jmp wq
+.h_w2:
+%if ARCH_X86_32
+ and mxd, 0x7f
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq
+ mova m4, [base+subpel_h_shuf4]
+ movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
+ mova m5, [base+pw_34] ; 2 + (8 << 2)
+ pshufd m3, m3, q0000
+ movifnidn dsq, dsmp
+.h_w2_loop:
+ movq m0, [srcq+ssq*0]
+ movhps m0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pmaddubsw m0, m3
+ phaddw m0, m0
+ paddw m0, m5 ; pw34
+ psraw m0, 6
+ packuswb m0, m0
+ movd r6d, m0
+ mov [dstq+dsq*0], r6w
+ shr r6d, 16
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+%if ARCH_X86_32
+ and mxd, 0x7f
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq
+ movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
+ mova m6, [base+subpel_h_shufA]
+ mova m5, [base+pw_34] ; 2 + (8 << 2)
+ pshufd m3, m3, q0000
+ movifnidn dsq, dsmp
+.h_w4_loop:
+ movq m0, [srcq+ssq*0] ; 1
+ movq m1, [srcq+ssq*1] ; 2
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m6 ; subpel_h_shufA
+ pshufb m1, m6 ; subpel_h_shufA
+ pmaddubsw m0, m3 ; subpel_filters
+ pmaddubsw m1, m3 ; subpel_filters
+ phaddw m0, m1
+ paddw m0, m5 ; pw34
+ psraw m0, 6
+ packuswb m0, m0
+ movd [dstq+dsq*0], m0
+ psrlq m0, 32
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
+ %if ARCH_X86_32
+ pshufb %2, %1, [base+subpel_h_shufB]
+ pshufb %3, %1, [base+subpel_h_shufC]
+ pshufb %1, [base+subpel_h_shufA]
+ %else
+ pshufb %2, %1, m11; subpel_h_shufB
+ pshufb %3, %1, m9 ; subpel_h_shufC
+ pshufb %1, m10 ; subpel_h_shufA
+ %endif
+ pmaddubsw %4, %2, m5 ; subpel +0 B0
+ pmaddubsw %2, m6 ; subpel +4 B4
+ pmaddubsw %3, m6 ; C4
+ pmaddubsw %1, m5 ; A0
+ paddw %3, %4 ; C4+B0
+ paddw %1, %2 ; A0+B4
+ phaddw %1, %3
+ paddw %1, m7 ; pw34
+ psraw %1, 6
+%endmacro
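+ ; horizontal 8-tap for 8 pixels: the three shuffles build the overlapping
+ ; source windows at offsets 0/+4/+8, m5/m6 hold taps 0-3/4-7 as byte pairs,
+ ; and the paddw/phaddw combine folds the partial pmaddubsw sums into the
+ ; full 8-tap totals before the pw_34 bias and the >> 6 bring the result
+ ; back to pixel range.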
+.h_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H m0, m2, m3, m4
+ PUT_8TAP_H m1, m2, m3, m4
+ packuswb m0, m1
+%if ARCH_X86_32
+ movq [dstq], m0
+ add dstq, dsm
+ movhps [dstq], m0
+ add dstq, dsm
+%else
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+%endif
+ sub hd, 2
+ jg .h_w8
+ RET
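+ ; for w >= 16, r4 holds -(w - 16); srcq/dstq are advanced past it so the
+ ; inner loop can walk the row with r6 going from -(w-16) up to 0 in
+ ; 16-pixel steps, using jle instead of a separate column counter.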
+.h_w128:
+ mov r4, -16*7
+ jmp .h_w16_start
+.h_w64:
+ mov r4, -16*3
+ jmp .h_w16_start
+.h_w32:
+ mov r4, -16*1
+ jmp .h_w16_start
+.h_w16:
+ xor r4d, r4d
+.h_w16_start:
+ sub srcq, r4
+ sub dstq, r4
+.h_w16_loop_v:
+ mov r6, r4
+.h_w16_loop_h:
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ PUT_8TAP_H m0, m2, m3, m4
+ PUT_8TAP_H m1, m2, m3, m4
+ packuswb m0, m1
+ mova [dstq+r6], m0
+ add r6, 16
+ jle .h_w16_loop_h
+ add srcq, ssq
+ add dstq, dsmp
+ dec hd
+ jg .h_w16_loop_v
+ RET
+.v:
+%if ARCH_X86_32
+ movzx mxd, ssb
+ shr ssd, 16
+ cmp hd, 6
+ cmovs ssd, mxd
+ movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
+%else
+ %assign stack_offset org_stack_offset
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
+%endif
+ tzcnt r6d, wd
+ movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)]
+ punpcklwd m0, m0
+ mova m7, [base+pw_512]
+ add r6, base_reg
+%if ARCH_X86_32
+ %define subpel0 [rsp+mmsize*0]
+ %define subpel1 [rsp+mmsize*1]
+ %define subpel2 [rsp+mmsize*2]
+ %define subpel3 [rsp+mmsize*3]
+%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
+ ALLOC_STACK -16*4
+%assign regs_used 7
+ pshufd m1, m0, q0000
+ mova subpel0, m1
+ pshufd m1, m0, q1111
+ mova subpel1, m1
+ pshufd m1, m0, q2222
+ mova subpel2, m1
+ pshufd m1, m0, q3333
+ mova subpel3, m1
+ mov ssq, [rstk+stack_offset+gprsize*4]
+ lea ssq, [ssq*3]
+ sub srcq, ssq
+ mov ssq, [rstk+stack_offset+gprsize*4]
+ mov dsq, [rstk+stack_offset+gprsize*2]
+%else
+ %define subpel0 m8
+ %define subpel1 m9
+ %define subpel2 m10
+ %define subpel3 m11
+ lea ss3q, [ssq*3]
+ pshufd m8, m0, q0000
+ sub srcq, ss3q
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+%endif
+ jmp r6
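+ ; vertical 8-tap: subpel0..3 each hold one adjacent pair of filter taps
+ ; broadcast as byte pairs, and the rolling registers keep byte-interleaved
+ ; row pairs (01 12 / 23 34 / 45 56 / 67 78), so every pmaddubsw adds one
+ ; tap pair for two output rows (the "a"/"b" halves) per loop iteration.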
+.v_w2:
+ movd m1, [srcq+ssq*0]
+ movd m0, [srcq+ssq*1]
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ movd m2, [srcq+ssq*0]
+ movd m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movd m3, [srcq+ssq*0]
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+%else
+ movd m2, [srcq+ssq*2]
+ add srcq, ss3q
+ movd m5, [srcq+ssq*0]
+ movd m3, [srcq+ssq*1]
+ movd m4, [srcq+ssq*2]
+ add srcq, ss3q
+%endif
+ punpcklwd m1, m0 ; 0 1
+ punpcklwd m0, m2 ; 1 2
+ punpcklbw m1, m0 ; 01 12
+ movd m0, [srcq+ssq*0]
+ punpcklwd m2, m5 ; 2 3
+ punpcklwd m5, m3 ; 3 4
+ punpcklwd m3, m4 ; 4 5
+ punpcklwd m4, m0 ; 5 6
+ punpcklbw m2, m5 ; 23 34
+ punpcklbw m3, m4 ; 45 56
+.v_w2_loop:
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m5, m1, subpel0 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, subpel1 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, subpel2 ; a2 b2
+ paddw m5, m3
+ punpcklwd m3, m0, m4 ; 6 7
+ movd m0, [srcq+ssq*0]
+ punpcklwd m4, m0 ; 7 8
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, subpel3 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7
+ packuswb m5, m5
+ movd r6d, m5
+ mov [dstq+dsq*0], r6w
+ shr r6d, 16
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+%if ARCH_X86_32
+.v_w8:
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+ shl wd, 14
+%if STACK_ALIGNMENT < 16
+ %define dstm [rsp+mmsize*4+gprsize]
+ mov dstm, dstq
+%endif
+ lea r6d, [hq+wq-(1<<16)]
+ mov r4, srcq
+.v_w4_loop0:
+%endif
+ movd m1, [srcq+ssq*0]
+ movd m0, [srcq+ssq*1]
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ movd m2, [srcq+ssq*0]
+ movd m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movd m3, [srcq+ssq*0]
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+%else
+ movd m2, [srcq+ssq*2]
+ add srcq, ss3q
+ movd m5, [srcq+ssq*0]
+ movd m3, [srcq+ssq*1]
+ movd m4, [srcq+ssq*2]
+ add srcq, ss3q
+%endif
+ punpckldq m1, m0 ; 0 1
+ punpckldq m0, m2 ; 1 2
+ punpcklbw m1, m0 ; 01 12
+ movd m0, [srcq+ssq*0]
+ punpckldq m2, m5 ; 2 3
+ punpckldq m5, m3 ; 3 4
+ punpckldq m3, m4 ; 4 5
+ punpckldq m4, m0 ; 5 6
+ punpcklbw m2, m5 ; 23 34
+ punpcklbw m3, m4 ; 45 56
+.v_w4_loop:
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m5, m1, subpel0 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, subpel1 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, subpel2 ; a2 b2
+ paddw m5, m3
+ punpckldq m3, m0, m4 ; 6 7 _ _
+ movd m0, [srcq+ssq*0]
+ punpckldq m4, m0 ; 7 8 _ _
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, subpel3 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7
+ packuswb m5, m5
+ movd [dstq+dsq*0], m5
+ psrlq m5, 32
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+%if ARCH_X86_32
+ mov dstq, dstm
+ add r4, 4
+ movzx hd, r6w
+ add dstq, 4
+ mov srcq, r4
+ mov dstm, dstq
+ sub r6d, 1<<16
+ jg .v_w4_loop0
+%endif
+ RET
+%if ARCH_X86_64
+.v_w8:
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+ lea r6d, [wq*8-64]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*4]
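+ ; r6d now packs the remaining 8-pixel columns in bits 8+ and the block
+ ; height in the low byte: (wq*8-64)*4 is ((wq>>3)-1)<<8. the outer loop
+ ; below peels one column per iteration (add r4/r7, 8) and restores h with
+ ; movzx hd, r6b.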
+.v_w8_loop0:
+ movq m1, [srcq+ssq*0]
+ movq m2, [srcq+ssq*1]
+ movq m3, [srcq+ssq*2]
+ add srcq, ss3q
+ movq m4, [srcq+ssq*0]
+ movq m5, [srcq+ssq*1]
+ movq m6, [srcq+ssq*2]
+ add srcq, ss3q
+ movq m0, [srcq+ssq*0]
+ punpcklbw m1, m2 ; 01
+ punpcklbw m2, m3 ; 12
+ punpcklbw m3, m4 ; 23
+ punpcklbw m4, m5 ; 34
+ punpcklbw m5, m6 ; 45
+ punpcklbw m6, m0 ; 56
+.v_w8_loop:
+ movq m13, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m14, m1, subpel0 ; a0
+ mova m1, m3
+ pmaddubsw m15, m2, subpel0 ; b0
+ mova m2, m4
+ pmaddubsw m3, subpel1 ; a1
+ mova m12, m0
+ pmaddubsw m4, subpel1 ; b1
+ movq m0, [srcq+ssq*0]
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ pmaddubsw m5, subpel2 ; a2
+ mova m4, m6
+ pmaddubsw m6, subpel2 ; b2
+ punpcklbw m12, m13 ; 67
+ punpcklbw m13, m0 ; 78
+ paddw m14, m5
+ mova m5, m12
+ pmaddubsw m12, subpel3 ; a3
+ paddw m15, m6
+ mova m6, m13
+ pmaddubsw m13, subpel3 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ packuswb m14, m15
+ movq [dstq+dsq*0], m14
+ movhps [dstq+dsq*1], m14
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ add r4, 8
+ add r7, 8
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .v_w8_loop0
+ RET
+%endif ;ARCH_X86_64
+%undef subpel0
+%undef subpel1
+%undef subpel2
+%undef subpel3
+.hv:
+ %assign stack_offset org_stack_offset
+ cmp wd, 4
+ jg .hv_w8
+%if ARCH_X86_32
+ and mxd, 0x7f
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq
+ movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
+%if ARCH_X86_32
+ movzx mxd, ssb
+ shr ssd, 16
+ cmp hd, 6
+ cmovs ssd, mxd
+ movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
+ mov ssq, ssmp
+ lea r6, [ssq*3]
+ sub srcq, r6
+ %define base_reg r6
+ mov r6, r1; use as new base
+ %assign regs_used 2
+ ALLOC_STACK -mmsize*14
+ %assign regs_used 7
+ mov dsq, [rstk+stack_offset+gprsize*2]
+ %define subpelv0 [rsp+mmsize*0]
+ %define subpelv1 [rsp+mmsize*1]
+ %define subpelv2 [rsp+mmsize*2]
+ %define subpelv3 [rsp+mmsize*3]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m6, m0, q0000
+ mova subpelv0, m6
+ pshufd m6, m0, q1111
+ mova subpelv1, m6
+ pshufd m6, m0, q2222
+ mova subpelv2, m6
+ pshufd m6, m0, q3333
+ mova subpelv3, m6
+%else
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
+ ALLOC_STACK mmsize*14, 14
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ %define subpelv0 m10
+ %define subpelv1 m11
+ %define subpelv2 m12
+ %define subpelv3 m13
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ mova m8, [base+pw_8192]
+ mova m9, [base+pd_512]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+%endif
+ pshufd m7, m1, q0000
+ cmp wd, 4
+ je .hv_w4
+.hv_w2:
+ mova m6, [base+subpel_h_shuf4]
+ movq m2, [srcq+ssq*0] ; 0
+ movhps m2, [srcq+ssq*1] ; 0 _ 1
+%if ARCH_X86_32
+ %define w8192reg [base+pw_8192]
+ %define d512reg [base+pd_512]
+ lea srcq, [srcq+ssq*2]
+ movq m0, [srcq+ssq*0] ; 2
+ movhps m0, [srcq+ssq*1] ; 2 _ 3
+ lea srcq, [srcq+ssq*2]
+%else
+ %define w8192reg m8
+ %define d512reg m9
+ movq m0, [srcq+ssq*2] ; 2
+ add srcq, ss3q
+ movhps m0, [srcq+ssq*0] ; 2 _ 3
+%endif
+ pshufb m2, m6 ; 0 ~ 1 ~
+ pshufb m0, m6 ; 2 ~ 3 ~
+ pmaddubsw m2, m7 ; subpel_filters
+ pmaddubsw m0, m7 ; subpel_filters
+ phaddw m2, m0 ; 0 1 2 3
+ pmulhrsw m2, w8192reg
+%if ARCH_X86_32
+ movq m3, [srcq+ssq*0] ; 4
+ movhps m3, [srcq+ssq*1] ; 4 _ 5
+ lea srcq, [srcq+ssq*2]
+%else
+ movq m3, [srcq+ssq*1] ; 4
+ movhps m3, [srcq+ssq*2] ; 4 _ 5
+ add srcq, ss3q
+%endif
+ movq m0, [srcq+ssq*0] ; 6
+ pshufb m3, m6 ; 4 ~ 5 ~
+ pshufb m0, m6 ; 6 ~
+ pmaddubsw m3, m7 ; subpel_filters
+ pmaddubsw m0, m7 ; subpel_filters
+ phaddw m3, m0 ; 4 5 6 _
+ pmulhrsw m3, w8192reg
+ palignr m4, m3, m2, 4; V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2
+ punpckhwd m2, m4 ; V 23 34 2 3 3 4
+ pshufd m0, m3, q2121; V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56 4 5 5 6
+.hv_w2_loop:
+ movq m4, [srcq+ssq*1] ; V 7
+ lea srcq, [srcq+ssq*2] ; V
+ movhps m4, [srcq+ssq*0] ; V 7 8
+ pshufb m4, m6
+ pmaddubsw m4, m7
+ pmaddwd m5, m1, subpelv0; V a0 b0
+ mova m1, m2 ; V
+ pmaddwd m2, subpelv1 ; V a1 b1
+ paddd m5, m2 ; V
+ mova m2, m3 ; V
+ pmaddwd m3, subpelv2 ; a2 b2
+ phaddw m4, m4
+ pmulhrsw m4, w8192reg
+ paddd m5, m3 ; V
+ palignr m3, m4, m0, 12
+ mova m0, m4
+ punpcklwd m3, m0 ; V 67 78
+ pmaddwd m4, m3, subpelv3 ; V a3 b3
+ paddd m5, d512reg
+ paddd m5, m4
+ psrad m5, 10
+ packssdw m5, m5
+ packuswb m5, m5
+ movd r4d, m5
+ mov [dstq+dsq*0], r4w
+ shr r4d, 16
+ mov [dstq+dsq*1], r4w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+%undef w8192reg
+%undef d512reg
+.hv_w4:
+%define hv4_line_0_0 4
+%define hv4_line_0_1 5
+%define hv4_line_0_2 6
+%define hv4_line_0_3 7
+%define hv4_line_0_4 8
+%define hv4_line_0_5 9
+%define hv4_line_1_0 10
+%define hv4_line_1_1 11
+%define hv4_line_1_2 12
+%define hv4_line_1_3 13
+%macro SAVELINE_W4 3
+ mova [rsp+mmsize*hv4_line_%3_%2], %1
+%endmacro
+%macro RESTORELINE_W4 3
+ mova %1, [rsp+mmsize*hv4_line_%3_%2]
+%endmacro
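+ ; the 4-wide H+V path runs two interleaved filter contexts, one per half of
+ ; subpel_h_shuf4 (see the "lower/upper shuffle" comments); SAVELINE_W4 and
+ ; RESTORELINE_W4 park each context's rolling line registers in fixed stack
+ ; slots so both sets of state stay live alongside the coefficients.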
+%if ARCH_X86_32
+ %define w8192reg [base+pw_8192]
+ %define d512reg [base+pd_512]
+%else
+ %define w8192reg m8
+ %define d512reg m9
+%endif
+ ; lower shuffle 0 1 2 3 4
+ mova m6, [base+subpel_h_shuf4]
+ movq m5, [srcq+ssq*0] ; 0 _ _ _
+ movhps m5, [srcq+ssq*1] ; 0 _ 1 _
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ movq m4, [srcq+ssq*0] ; 2 _ _ _
+ movhps m4, [srcq+ssq*1] ; 2 _ 3 _
+ lea srcq, [srcq+ssq*2]
+%else
+ movq m4, [srcq+ssq*2] ; 2 _ _ _
+ movhps m4, [srcq+ss3q ] ; 2 _ 3 _
+ lea srcq, [srcq+ssq*4]
+%endif
+ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+ pmaddubsw m2, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m2, m0 ;H 0 1 2 3
+ pmulhrsw m2, w8192reg ;H pw_8192
+ SAVELINE_W4 m2, 2, 0
+ ; upper shuffle 2 3 4 5 6
+ mova m6, [base+subpel_h_shuf4+16]
+ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+ pmaddubsw m2, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m2, m0 ;H 0 1 2 3
+ pmulhrsw m2, w8192reg ;H pw_8192
+ ;
+ ; lower shuffle
+ mova m6, [base+subpel_h_shuf4]
+ movq m5, [srcq+ssq*0] ; 4 _ _ _
+ movhps m5, [srcq+ssq*1] ; 4 _ 5 _
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ movq m4, [srcq+ssq*0] ; 6 _ _ _
+ add srcq, ssq
+%else
+ movq m4, [srcq+ssq*2] ; 6 _ _ _
+ add srcq, ss3q
+%endif
+ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
+ pmaddubsw m3, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m3, m0 ;H 4 5 6 7
+ pmulhrsw m3, w8192reg ;H pw_8192
+ SAVELINE_W4 m3, 3, 0
+ ; upper shuffle
+ mova m6, [base+subpel_h_shuf4+16]
+ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
+ pmaddubsw m3, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m3, m0 ;H 4 5 6 7
+ pmulhrsw m3, w8192reg ;H pw_8192
+ ;process high
+ palignr m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ ;process low
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ palignr m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+.hv_w4_loop:
+ ;process low
+ pmaddwd m5, m1, subpelv0 ; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+ mova m6, [base+subpel_h_shuf4]
+ movq m4, [srcq+ssq*0] ; 7
+ movhps m4, [srcq+ssq*1] ; 7 _ 8 _
+ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+ pmaddubsw m4, m7 ;H subpel_filters
+ phaddw m4, m4 ;H 7 8 7 8
+ pmulhrsw m4, w8192reg ;H pw_8192
+ palignr m3, m4, m0, 12 ; 6 7 8 7
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+ paddd m5, d512reg ; pd_512
+ paddd m5, m4
+ psrad m5, 10
+ SAVELINE_W4 m0, 0, 0
+ SAVELINE_W4 m1, 1, 0
+ SAVELINE_W4 m2, 2, 0
+ SAVELINE_W4 m3, 3, 0
+ SAVELINE_W4 m5, 5, 0
+ ;process high
+ RESTORELINE_W4 m0, 0, 1
+ RESTORELINE_W4 m1, 1, 1
+ RESTORELINE_W4 m2, 2, 1
+ RESTORELINE_W4 m3, 3, 1
+ pmaddwd m5, m1, subpelv0; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+ mova m6, [base+subpel_h_shuf4+16]
+ movq m4, [srcq+ssq*0] ; 7
+ movhps m4, [srcq+ssq*1] ; 7 _ 8 _
+ lea srcq, [srcq+ssq*2]
+ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+ pmaddubsw m4, m7 ;H subpel_filters
+ phaddw m4, m4 ;H 7 8 7 8
+ pmulhrsw m4, w8192reg ;H pw_8192
+ palignr m3, m4, m0, 12 ; 6 7 8 7
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+ paddd m5, d512reg ; pd_512
+ paddd m5, m4
+ psrad m4, m5, 10
+ RESTORELINE_W4 m5, 5, 0
+ packssdw m5, m4 ; d -> w
+ packuswb m5, m5 ; w -> b
+ pshuflw m5, m5, q3120
+ movd [dstq+dsq*0], m5
+ psrlq m5, 32
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ RESTORELINE_W4 m0, 0, 0
+ RESTORELINE_W4 m1, 1, 0
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ jg .hv_w4_loop
+ RET
+%undef subpelv0
+%undef subpelv1
+%undef subpelv2
+%undef subpelv3
+.hv_w8:
+ %assign stack_offset org_stack_offset
+%define hv8_line_1 0
+%define hv8_line_2 1
+%define hv8_line_3 2
+%define hv8_line_4 3
+%define hv8_line_6 4
+%macro SAVELINE_W8 2
+ mova [rsp+hv8_line_%1*mmsize], %2
+%endmacro
+%macro RESTORELINE_W8 2
+ mova %2, [rsp+hv8_line_%1*mmsize]
+%endmacro
+ shr mxd, 16
+ sub srcq, 3
+%if ARCH_X86_32
+ %define base_reg r1
+ %define subpelh0 [rsp+mmsize*5]
+ %define subpelh1 [rsp+mmsize*6]
+ %define subpelv0 [rsp+mmsize*7]
+ %define subpelv1 [rsp+mmsize*8]
+ %define subpelv2 [rsp+mmsize*9]
+ %define subpelv3 [rsp+mmsize*10]
+ %define accuv0 [rsp+mmsize*11]
+ %define accuv1 [rsp+mmsize*12]
+ movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
+ movzx mxd, ssb
+ shr ssd, 16
+ cmp hd, 6
+ cmovs ssd, mxd
+ movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
+ mov ssq, ssmp
+ ALLOC_STACK -mmsize*13
+%if STACK_ALIGNMENT < 16
+ %define dstm [rsp+mmsize*13+gprsize*1]
+ %define dsm [rsp+mmsize*13+gprsize*2]
+ mov r6, [rstk+stack_offset+gprsize*2]
+ mov dsm, r6
+%endif
+ pshufd m0, m1, q0000
+ pshufd m1, m1, q1111
+ punpcklbw m5, m5
+ psraw m5, 8 ; sign-extend
+ pshufd m2, m5, q0000
+ pshufd m3, m5, q1111
+ pshufd m4, m5, q2222
+ pshufd m5, m5, q3333
+ mova subpelh0, m0
+ mova subpelh1, m1
+ mova subpelv0, m2
+ mova subpelv1, m3
+ mova subpelv2, m4
+ mova subpelv3, m5
+ lea r6, [ssq*3]
+ mov dstm, dstq
+ sub srcq, r6
+%else
+ ALLOC_STACK 16*5, 16
+ %define subpelh0 m10
+ %define subpelh1 m11
+ %define subpelv0 m12
+ %define subpelv1 m13
+ %define subpelv2 m14
+ %define subpelv3 m15
+ %define accuv0 m8
+ %define accuv1 m9
+ movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m1, [base_reg+myq*8+subpel_filters-put_ssse3]
+ pshufd subpelh0, m0, q0000
+ pshufd subpelh1, m0, q1111
+ punpcklbw m1, m1
+ psraw m1, 8 ; sign-extend
+ pshufd subpelv0, m1, q0000
+ pshufd subpelv1, m1, q1111
+ pshufd subpelv2, m1, q2222
+ pshufd subpelv3, m1, q3333
+ lea ss3q, [ssq*3]
+ mov r7, dstq
+ sub srcq, ss3q
+%endif
+ shl wd, 14
+ lea r6d, [hq+wq-(1<<16)]
+ mov r4, srcq
+.hv_w8_loop0:
+ movu m4, [srcq+ssq*0] ; 0 = _ _
+ movu m5, [srcq+ssq*1] ; 1 = _ _
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+%endif
+%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
+ %if ARCH_X86_32
+ pshufb %3, %1, [base+subpel_h_shufB]
+ pshufb %4, %1, [base+subpel_h_shufC]
+ pshufb %1, [base+subpel_h_shufA]
+ %else
+ pshufb %3, %1, %6 ; subpel_h_shufB
+ pshufb %4, %1, %7 ; subpel_h_shufC
+ pshufb %1, %5 ; subpel_h_shufA
+ %endif
+ pmaddubsw %2, %3, subpelh0 ; subpel +0 C0
+ pmaddubsw %4, subpelh1; subpel +4 B4
+ pmaddubsw %3, subpelh1; C4
+ pmaddubsw %1, subpelh0; A0
+ paddw %2, %4 ; C0+B4
+ paddw %1, %3 ; A0+C4
+ phaddw %1, %2
+%endmacro
+%if ARCH_X86_64
+ mova m7, [base+subpel_h_shufA]
+ mova m8, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+%endif
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
+%if ARCH_X86_32
+ movu m6, [srcq+ssq*0] ; 2 = _ _
+ movu m0, [srcq+ssq*1] ; 3 = _ _
+ lea srcq, [srcq+ssq*2]
+%else
+ movu m6, [srcq+ssq*2] ; 2 = _ _
+ add srcq, ss3q
+ movu m0, [srcq+ssq*0] ; 3 = _ _
+%endif
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
+ mova m7, [base+pw_8192]
+ pmulhrsw m4, m7 ; H pw_8192
+ pmulhrsw m5, m7 ; H pw_8192
+ pmulhrsw m6, m7 ; H pw_8192
+ pmulhrsw m0, m7 ; H pw_8192
+ punpcklwd m1, m4, m5 ; 0 1 ~
+ punpcklwd m2, m5, m6 ; 1 2 ~
+ punpcklwd m3, m6, m0 ; 2 3 ~
+ SAVELINE_W8 1, m1
+ SAVELINE_W8 2, m2
+ SAVELINE_W8 3, m3
+ mova m7, [base+subpel_h_shufA]
+%if ARCH_X86_32
+ movu m4, [srcq+ssq*0] ; 4 = _ _
+ movu m5, [srcq+ssq*1] ; 5 = _ _
+ lea srcq, [srcq+ssq*2]
+%else
+ movu m4, [srcq+ssq*1] ; 4 = _ _
+ movu m5, [srcq+ssq*2] ; 5 = _ _
+ add srcq, ss3q
+%endif
+ movu m6, [srcq+ssq*0] ; 6 = _ _
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
+ mova m7, [base+pw_8192]
+ pmulhrsw m1, m4, m7 ; H pw_8192 4 ~
+ pmulhrsw m2, m5, m7 ; H pw_8192 5 ~
+ pmulhrsw m3, m6, m7 ; H pw_8192 6 ~
+ punpcklwd m4, m0, m1 ; 3 4 ~
+ punpcklwd m5, m1, m2 ; 4 5 ~
+ punpcklwd m6, m2, m3 ; 5 6 ~
+ SAVELINE_W8 6, m3
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+.hv_w8_loop:
+ ; m8 accu for V a
+ ; m9 accu for V b
+ SAVELINE_W8 1, m3
+ SAVELINE_W8 2, m4
+ SAVELINE_W8 3, m5
+ SAVELINE_W8 4, m6
+%if ARCH_X86_32
+ pmaddwd m0, m1, subpelv0 ; a0
+ pmaddwd m7, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m0, m3
+ paddd m7, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m0, m5
+ paddd m7, m6
+ mova m5, [base+pd_512]
+ paddd m0, m5 ; pd_512
+ paddd m7, m5 ; pd_512
+ mova accuv0, m0
+ mova accuv1, m7
+%else
+ pmaddwd m8, m1, subpelv0 ; a0
+ pmaddwd m9, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ mova m7, [base+pd_512]
+ paddd m8, m7 ; pd_512
+ paddd m9, m7 ; pd_512
+ mova m7, [base+subpel_h_shufB]
+ mova m6, [base+subpel_h_shufC]
+ mova m5, [base+subpel_h_shufA]
+%endif
+ movu m0, [srcq+ssq*1] ; 7
+ movu m4, [srcq+ssq*2] ; 8
+ lea srcq, [srcq+ssq*2]
+ HV_H_W8 m0, m1, m2, m3, m5, m7, m6
+ HV_H_W8 m4, m1, m2, m3, m5, m7, m6
+ mova m5, [base+pw_8192]
+ pmulhrsw m0, m5 ; H pw_8192
+ pmulhrsw m4, m5 ; H pw_8192
+ RESTORELINE_W8 6, m6
+ punpcklwd m5, m6, m0 ; 6 7 ~
+ punpcklwd m6, m0, m4 ; 7 8 ~
+ pmaddwd m1, m5, subpelv3 ; a3
+ paddd m2, m1, accuv0
+ pmaddwd m1, m6, subpelv3 ; b3
+ paddd m1, m1, accuv1 ; H + V
+ psrad m2, 10
+ psrad m1, 10
+ packssdw m2, m1 ; d -> w
+ packuswb m2, m1 ; w -> b
+ movd [dstq+dsq*0], m2
+ psrlq m2, 32
+%if ARCH_X86_32
+ add dstq, dsm
+ movd [dstq+dsq*0], m2
+ add dstq, dsm
+%else
+ movd [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+%endif
+ sub hd, 2
+ jle .hv_w8_outer
+ SAVELINE_W8 6, m4
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+ RESTORELINE_W8 4, m4
+ jmp .hv_w8_loop
+.hv_w8_outer:
+%if ARCH_X86_32
+ mov dstq, dstm
+ add r4, 4
+ movzx hd, r6w
+ add dstq, 4
+ mov srcq, r4
+ mov dstm, dstq
+%else
+ add r4, 4
+ add r7, 4
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+%endif
+ sub r6d, 1<<16
+ jg .hv_w8_loop0
+ RET
+
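+ ; as with the bilin template, the macros below provide SSE2 replacements
+ ; for the SSSE3 operations used by the 8-tap prep template:
+ ; PSHUFB_SUBPEL_H_4* rebuild the 4-pixel shuffles with shifts and masks,
+ ; PALIGNR and PHADDW emulate their namesakes (PHADDW sums adjacent words
+ ; via pmaddwd with pw_1), and the PREP_8TAP_H_LOAD* helpers replace pshufb
+ ; with overlapping loads.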
+%macro PSHUFB_SUBPEL_H_4 5 ; dst/src1, src2/mask, tmp1, tmp2, reset_mask
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ %if %5 == 1
+ pcmpeqd %2, %2
+ psrlq %2, 32
+ %endif
+ psrldq %3, %1, 1
+ pshufd %3, %3, q2301
+ pand %1, %2
+ pandn %4, %2, %3
+ por %1, %4
+ %endif
+%endmacro
+
+%macro PSHUFB_SUBPEL_H_4a 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
+ %ifnidn %1, %2
+ mova %1, %2
+ %endif
+ PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6
+%endmacro
+
+%macro PSHUFB_SUBPEL_H_4b 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
+ %if notcpuflag(ssse3)
+ psrlq %1, %2, 16
+ %elifnidn %1, %2
+ mova %1, %2
+ %endif
+ PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6
+%endmacro
+
+%macro PALIGNR 4-5 ; dst, src1, src2, shift[, tmp]
+ %if cpuflag(ssse3)
+ palignr %1, %2, %3, %4
+ %else
+ %if %0 == 4
+ %assign %%i regnumof%+%1 + 1
+ %define %%tmp m %+ %%i
+ %else
+ %define %%tmp %5
+ %endif
+ psrldq %1, %3, %4
+ pslldq %%tmp, %2, 16-%4
+ por %1, %%tmp
+ %endif
+%endmacro
+
+%macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1
+ %if cpuflag(ssse3)
+ phaddw %1, %2
+ %elifnidn %1, %2
+ %if %4 == 1
+ mova %3, [base+pw_1]
+ %endif
+ pmaddwd %1, %3
+ pmaddwd %2, %3
+ packssdw %1, %2
+ %else
+ %if %4 == 1
+ pmaddwd %1, [base+pw_1]
+ %else
+ pmaddwd %1, %3
+ %endif
+ packssdw %1, %1
+ %endif
+%endmacro
+
+%macro PMULHRSW_POW2 4 ; dst, src1, src2, shift
+ %if cpuflag(ssse3)
+ pmulhrsw %1, %2, %3
+ %else
+ paddw %1, %2, %3
+ psraw %1, %4
+ %endif
+%endmacro
+
+%macro PMULHRSW_8192 3 ; dst, src1, src2
+ PMULHRSW_POW2 %1, %2, %3, 2
+%endmacro
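+ ; both variants of PMULHRSW_8192 compute (x + 2) >> 2: pmulhrsw against
+ ; pw_8192 is (x*8192 + 0x4000) >> 15, and the SSE2 form adds pw_2 and
+ ; shifts right by 2, i.e. the 8bpc intermediate rounding.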
+
+%macro PREP_8TAP_H_LOAD4 5 ; dst, src_memloc, tmp[1-2]
+ movd %1, [%2+0]
+ movd %3, [%2+1]
+ movd %4, [%2+2]
+ movd %5, [%2+3]
+ punpckldq %1, %3
+ punpckldq %4, %5
+ punpcklqdq %1, %4
+%endmacro
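+ ; PREP_8TAP_H_LOAD4 loads four overlapping dwords from offsets +0..+3 and
+ ; repacks them into the 4-pixel sliding windows that the SSSE3 path gets
+ ; from pshufb (subpel_h_shufA/B/C, depending on the base offset passed in).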
+
+%macro PREP_8TAP_H_LOAD 2 ; dst0, src_memloc
+ %if cpuflag(ssse3)
+ movu m%1, [%2]
+ pshufb m2, m%1, m11 ; subpel_h_shufB
+ pshufb m3, m%1, m9 ; subpel_h_shufC
+ pshufb m%1, m10 ; subpel_h_shufA
+ %else
+ %if ARCH_X86_64
+ SWAP m12, m5
+ SWAP m13, m6
+ SWAP m14, m7
+ %define %%mx0 m%+%%i
+ %define %%mx1 m%+%%j
+ %assign %%i 0
+ %rep 12
+ movd %%mx0, [%2+%%i]
+ %assign %%i %%i+1
+ %endrep
+ %assign %%i 0
+ %rep 6
+ %assign %%j %%i+1
+ punpckldq %%mx0, %%mx1
+ %assign %%i %%i+2
+ %endrep
+ %assign %%i 0
+ %rep 3
+ %assign %%j %%i+2
+ punpcklqdq %%mx0, %%mx1
+ %assign %%i %%i+4
+ %endrep
+ SWAP m%1, m0
+ SWAP m2, m4
+ SWAP m3, m8
+ SWAP m5, m12
+ SWAP m6, m13
+ SWAP m7, m14
+ %else
+ PREP_8TAP_H_LOAD4 m0, %2+0, m1, m4, m7
+ PREP_8TAP_H_LOAD4 m2, %2+4, m1, m4, m7
+ PREP_8TAP_H_LOAD4 m3, %2+8, m1, m4, m7
+ SWAP m%1, m0
+ %endif
+ %endif
+%endmacro
+
+%macro PREP_8TAP_H 2 ; dst, src_memloc
+ PREP_8TAP_H_LOAD %1, %2
+ %if ARCH_X86_64 && notcpuflag(ssse3)
+ SWAP m8, m1
+ SWAP m9, m7
+ %endif
+ %xdefine mX m%+%1
+ %assign %%i regnumof%+mX
+ %define mX m%+%%i
+ mova m4, m2
+ PMADDUBSW m4, m5, m1, m7, 1 ; subpel +0 B0
+ PMADDUBSW m2, m6, m1, m7, 0 ; subpel +4 B4
+ PMADDUBSW m3, m6, m1, m7, 0 ; subpel +4 C4
+ PMADDUBSW mX, m5, m1, m7, 0 ; subpel +0 A0
+ %undef mX
+ %if ARCH_X86_64 && notcpuflag(ssse3)
+ SWAP m1, m8
+ SWAP m7, m9
+ %endif
+ paddw m3, m4
+ paddw m%1, m2
+ PHADDW m%1, m3, m15, ARCH_X86_32
+ %if ARCH_X86_64 || cpuflag(ssse3)
+ PMULHRSW_8192 m%1, m%1, m7
+ %else
+ PMULHRSW_8192 m%1, m%1, [base+pw_2]
+ %endif
+%endmacro
+
+%macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2]
+ %if cpuflag(ssse3)
+ movu %1, [%2]
+ pshufb m2, %1, shufB
+ pshufb m3, %1, shufC
+ pshufb %1, shufA
+ %else
+ PREP_8TAP_H_LOAD4 %1, %2+0, m1, %3, %4
+ PREP_8TAP_H_LOAD4 m2, %2+4, m1, %3, %4
+ PREP_8TAP_H_LOAD4 m3, %2+8, m1, %3, %4
+ %endif
+ mova m1, m2
+ PMADDUBSW m1, subpelh0, %3, %4, 1 ; subpel +0 C0
+ PMADDUBSW m3, subpelh1, %3, %4, 0 ; subpel +4 B4
+ PMADDUBSW m2, subpelh1, %3, %4, 0 ; C4
+ PMADDUBSW %1, subpelh0, %3, %4, 0 ; A0
+ paddw m1, m3 ; C0+B4
+ paddw %1, m2 ; A0+C4
+ PHADDW %1, m1, %3, 1
+%endmacro
+
+%macro PREP_8TAP 0
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1, 2
+%elif WIN64
+ DECLARE_REG_TMP 6, 4
+%else
+ DECLARE_REG_TMP 6, 7
+%endif
+
+FN prep_8tap, sharp, SHARP, SHARP
+FN prep_8tap, sharp_smooth, SHARP, SMOOTH
+FN prep_8tap, smooth_sharp, SMOOTH, SHARP
+FN prep_8tap, smooth, SMOOTH, SMOOTH
+FN prep_8tap, sharp_regular, SHARP, REGULAR
+FN prep_8tap, regular_sharp, REGULAR, SHARP
+FN prep_8tap, smooth_regular, SMOOTH, REGULAR
+FN prep_8tap, regular_smooth, REGULAR, SMOOTH
+FN prep_8tap, regular, REGULAR, REGULAR
+
+%if ARCH_X86_32
+ %define base_reg r2
+ %define base base_reg-prep%+SUFFIX
+%else
+ %define base_reg r7
+ %define base 0
+%endif
+cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+%assign org_stack_offset stack_offset
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ mov wd, wm
+ movifnidn srcd, srcm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ LEA base_reg, prep_ssse3
+ tzcnt wd, wd
+ movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
+ pxor m4, m4
+ add wq, base_reg
+ movifnidn strided, stridem
+ lea r6, [strideq*3]
+ %assign stack_offset org_stack_offset
+%if WIN64
+ pop r8
+ pop r7
+%endif
+ jmp wq
+.h:
+ LEA base_reg, prep%+SUFFIX
+ test myd, 0xf00
+ jnz .hv
+%if cpuflag(ssse3)
+ WIN64_SPILL_XMM 12
+%else
+ WIN64_SPILL_XMM 16
+%endif
+%if ARCH_X86_32
+ %define strideq r6
+ mov strideq, stridem
+%endif
+ cmp wd, 4
+ je .h_w4
+ tzcnt wd, wd
+%if cpuflag(ssse3)
+ %if ARCH_X86_64
+ mova m10, [base+subpel_h_shufA]
+ mova m11, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+ %else
+ %define m10 [base+subpel_h_shufA]
+ %define m11 [base+subpel_h_shufB]
+ %define m9 [base+subpel_h_shufC]
+ %endif
+%endif
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
+ movq m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
+%if cpuflag(ssse3)
+ mova m7, [base+pw_8192]
+ pshufd m5, m6, q0000
+ pshufd m6, m6, q1111
+%else
+ punpcklbw m6, m6
+ psraw m6, 8
+ %if ARCH_X86_64
+ mova m7, [pw_2]
+ mova m15, [pw_1]
+ %else
+ %define m15 m4
+ %endif
+ pshufd m5, m6, q1010
+ punpckhqdq m6, m6
+%endif
+ add wq, base_reg
+ jmp wq
+.h_w4:
+%if ARCH_X86_32
+ and mxd, 0x7f
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq
+ movd m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
+%if cpuflag(ssse3)
+ mova m6, [base+pw_8192]
+ mova m5, [base+subpel_h_shufA]
+ pshufd m4, m4, q0000
+%else
+ mova m6, [base+pw_2]
+ %if ARCH_X86_64
+ mova m14, [pw_1]
+ %else
+ %define m14 m7
+ %endif
+ punpcklbw m4, m4
+ psraw m4, 8
+ punpcklqdq m4, m4
+%endif
+%if ARCH_X86_64
+ lea stride3q, [strideq*3]
+%endif
+.h_w4_loop:
+%if cpuflag(ssse3)
+ movq m0, [srcq+strideq*0] ; 0
+ movq m1, [srcq+strideq*1] ; 1
+ %if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movq m2, [srcq+strideq*0] ; 2
+ movq m3, [srcq+strideq*1] ; 3
+ lea srcq, [srcq+strideq*2]
+ %else
+ movq m2, [srcq+strideq*2] ; 2
+ movq m3, [srcq+stride3q ] ; 3
+ lea srcq, [srcq+strideq*4]
+ %endif
+ pshufb m0, m5
+ pshufb m1, m5
+ pshufb m2, m5
+ pshufb m3, m5
+%elif ARCH_X86_64
+ movd m0, [srcq+strideq*0+0]
+ movd m12, [srcq+strideq*0+1]
+ movd m1, [srcq+strideq*1+0]
+ movd m5, [srcq+strideq*1+1]
+ movd m2, [srcq+strideq*2+0]
+ movd m13, [srcq+strideq*2+1]
+ movd m3, [srcq+stride3q +0]
+ movd m7, [srcq+stride3q +1]
+ punpckldq m0, m12
+ punpckldq m1, m5
+ punpckldq m2, m13
+ punpckldq m3, m7
+ movd m12, [srcq+strideq*0+2]
+ movd m8, [srcq+strideq*0+3]
+ movd m5, [srcq+strideq*1+2]
+ movd m9, [srcq+strideq*1+3]
+ movd m13, [srcq+strideq*2+2]
+ movd m10, [srcq+strideq*2+3]
+ movd m7, [srcq+stride3q +2]
+ movd m11, [srcq+stride3q +3]
+ lea srcq, [srcq+strideq*4]
+ punpckldq m12, m8
+ punpckldq m5, m9
+ punpckldq m13, m10
+ punpckldq m7, m11
+ punpcklqdq m0, m12 ; 0
+ punpcklqdq m1, m5 ; 1
+ punpcklqdq m2, m13 ; 2
+ punpcklqdq m3, m7 ; 3
+%else
+ movd m0, [srcq+strideq*0+0]
+ movd m1, [srcq+strideq*0+1]
+ movd m2, [srcq+strideq*0+2]
+ movd m3, [srcq+strideq*0+3]
+ punpckldq m0, m1
+ punpckldq m2, m3
+ punpcklqdq m0, m2 ; 0
+ movd m1, [srcq+strideq*1+0]
+ movd m2, [srcq+strideq*1+1]
+ movd m3, [srcq+strideq*1+2]
+ movd m7, [srcq+strideq*1+3]
+ lea srcq, [srcq+strideq*2]
+ punpckldq m1, m2
+ punpckldq m3, m7
+ punpcklqdq m1, m3 ; 1
+ movd m2, [srcq+strideq*0+0]
+ movd m3, [srcq+strideq*0+1]
+ movd m7, [srcq+strideq*0+2]
+ movd m5, [srcq+strideq*0+3]
+ punpckldq m2, m3
+ punpckldq m7, m5
+ punpcklqdq m2, m7 ; 2
+ movd m3, [srcq+strideq*1+0]
+ movd m7, [srcq+strideq*1+1]
+ punpckldq m3, m7
+ movd m7, [srcq+strideq*1+2]
+ movd m5, [srcq+strideq*1+3]
+ lea srcq, [srcq+strideq*2]
+ punpckldq m7, m5
+ punpcklqdq m3, m7 ; 3
+%endif
+ PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2
+ PMADDUBSW m1, m4, m5, m7, 0
+ PMADDUBSW m2, m4, m5, m7, 0
+ PMADDUBSW m3, m4, m5, m7, 0
+ PHADDW m0, m1, m14, ARCH_X86_32
+ PHADDW m2, m3, m14, 0
+ PMULHRSW_8192 m0, m0, m6
+ PMULHRSW_8192 m2, m2, m6
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m2
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+%if cpuflag(ssse3)
+ PREP_8TAP_H 0, srcq+strideq*0
+ PREP_8TAP_H 1, srcq+strideq*1
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ lea srcq, [srcq+strideq*2]
+ add tmpq, 32
+ sub hd, 2
+%else
+ PREP_8TAP_H 0, srcq
+ mova [tmpq], m0
+ add srcq, strideq
+ add tmpq, 16
+ dec hd
+%endif
+ jg .h_w8
+ RET
+.h_w16:
+ mov r3, -16*1
+ jmp .h_start
+.h_w32:
+ mov r3, -16*2
+ jmp .h_start
+.h_w64:
+ mov r3, -16*4
+ jmp .h_start
+.h_w128:
+ mov r3, -16*8
+.h_start:
+ sub srcq, r3
+ mov r5, r3
+.h_loop:
+%if cpuflag(ssse3)
+ PREP_8TAP_H 0, srcq+r3+8*0
+ PREP_8TAP_H 1, srcq+r3+8*1
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 32
+ add r3, 16
+%else
+ PREP_8TAP_H 0, srcq+r3
+ mova [tmpq], m0
+ add tmpq, 16
+ add r3, 8
+%endif
+ jl .h_loop
+ add srcq, strideq
+ mov r3, r5
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ LEA base_reg, prep%+SUFFIX
+%if ARCH_X86_32
+ mov mxd, myd
+ and mxd, 0x7f
+%else
+ %assign stack_offset org_stack_offset
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb
+%endif
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+%if cpuflag(ssse3)
+ mova m2, [base+pw_512]
+ mova m7, [base+pw_8192]
+ punpcklwd m0, m0
+%else
+ punpcklbw m0, m0
+ psraw m0, 8
+%endif
+%if ARCH_X86_32
+ %define subpel0 [rsp+mmsize*0]
+ %define subpel1 [rsp+mmsize*1]
+ %define subpel2 [rsp+mmsize*2]
+ %define subpel3 [rsp+mmsize*3]
+%assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed
+ %if cpuflag(ssse3)
+ ALLOC_STACK -mmsize*4
+ %else
+ ALLOC_STACK -mmsize*5
+ %endif
+%assign regs_used 7
+ mov strideq, [rstk+stack_offset+gprsize*3]
+ pshufd m1, m0, q0000
+ mova subpel0, m1
+ pshufd m1, m0, q1111
+ mova subpel1, m1
+ lea r5, [strideq*3]
+ pshufd m1, m0, q2222
+ mova subpel2, m1
+ pshufd m1, m0, q3333
+ mova subpel3, m1
+ sub srcq, r5
+%else
+ %define subpel0 m8
+ %define subpel1 m9
+ %define subpel2 m10
+ %define subpel3 m11
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ lea stride3q, [strideq*3]
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ sub srcq, stride3q
+ cmp wd, 8
+ jns .v_w8
+%endif
+.v_w4:
+%if notcpuflag(ssse3)
+ pxor m6, m6
+ %if ARCH_X86_64
+ mova m7, [base+pw_2]
+ %endif
+%endif
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < mmsize
+ %define srcm [esp+stack_size+gprsize*1]
+ %define tmpm [esp+stack_size+gprsize*2]
+ %endif
+ mov tmpm, tmpq
+ mov srcm, srcq
+ lea r5d, [wq - 4] ; horizontal loop
+ shl r5d, (16 - 2) ; ((wq - 4) / 4) << 16, remaining 4-pixel columns in the high word
+ mov r5w, hw
+.v_w4_loop0:
+%endif
+ movd m1, [srcq+strideq*0]
+ movd m0, [srcq+strideq*1]
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movd m2, [srcq+strideq*0]
+ movd m4, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movd m3, [srcq+strideq*0]
+ movd m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+%else
+ movd m2, [srcq+strideq*2]
+ add srcq, stride3q
+ movd m4, [srcq+strideq*0]
+ movd m3, [srcq+strideq*1]
+ movd m5, [srcq+strideq*2]
+ add srcq, stride3q
+%endif
+ punpckldq m1, m0 ; 0 1
+ punpckldq m0, m2 ; 1 2
+ punpcklbw m1, m0 ; 01 12
+ movd m0, [srcq+strideq*0]
+ punpckldq m2, m4 ; 2 3
+ punpckldq m4, m3 ; 3 4
+ punpckldq m3, m5 ; 4 5
+ punpckldq m5, m0 ; 5 6
+ punpcklbw m2, m4 ; 23 34
+ punpcklbw m3, m5 ; 45 56
+.v_w4_loop:
+%if ARCH_X86_32 && notcpuflag(ssse3)
+ mova m7, subpel0
+ %define subpel0 m7
+%endif
+ mova m5, m1
+ PMADDUBSW m5, subpel0, m6, m4, 0 ; a0 b0
+%if ARCH_X86_32 && notcpuflag(ssse3)
+ mova m7, subpel1
+ %define subpel1 m7
+%endif
+ mova m1, m2
+ PMADDUBSW m2, subpel1, m6, m4, 0 ; a1 b1
+ paddw m5, m2
+%if ARCH_X86_32 && notcpuflag(ssse3)
+ mova m7, subpel2
+ %define subpel2 m7
+%endif
+ mova m2, m3
+ PMADDUBSW m3, subpel2, m6, m4, 0 ; a2 b2
+ movd m4, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ paddw m5, m3
+ punpckldq m3, m0, m4 ; 6 7 _ _
+ movd m0, [srcq+strideq*0]
+ punpckldq m4, m0 ; 7 8 _ _
+ punpcklbw m3, m4 ; 67 78
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m12, m0
+ %else
+ mova [esp+mmsize*4], m0
+ mova m7, subpel3
+ %define subpel3 m7
+ %endif
+%endif
+ mova m4, m3
+ PMADDUBSW m4, subpel3, m6, m0, 0 ; a3 b3
+ paddw m5, m4
+%if ARCH_X86_64 || cpuflag(ssse3)
+ %if notcpuflag(ssse3)
+ SWAP m0, m12
+ %endif
+ PMULHRSW_8192 m5, m5, m7
+%else
+ mova m0, [esp+mmsize*4]
+ PMULHRSW_8192 m5, m5, [base+pw_2]
+%endif
+ movq [tmpq+wq*0], m5
+ movhps [tmpq+wq*2], m5
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w4_loop
+%if ARCH_X86_32
+ mov srcq, srcm
+ mov tmpq, tmpm
+ movzx hd, r5w
+ add srcq, 4
+ add tmpq, 8
+ mov srcm, srcq
+ mov tmpm, tmpq
+ sub r5d, 1<<16 ; horizontal--
+ jg .v_w4_loop0
+%endif
+ RET
+%if ARCH_X86_64
+.v_w8:
+ lea r6d, [wq*8-64]
+ mov r5, srcq
+ mov r8, tmpq
+ lea r6d, [hq+r6*4]
+.v_w8_loop0:
+ movq m1, [srcq+strideq*0]
+ movq m2, [srcq+strideq*1]
+ movq m3, [srcq+strideq*2]
+ add srcq, stride3q
+ movq m4, [srcq+strideq*0]
+ movq m5, [srcq+strideq*1]
+ movq m6, [srcq+strideq*2]
+ add srcq, stride3q
+ movq m0, [srcq+strideq*0]
+ punpcklbw m1, m2 ; 01
+ punpcklbw m2, m3 ; 12
+ punpcklbw m3, m4 ; 23
+ punpcklbw m4, m5 ; 34
+ punpcklbw m5, m6 ; 45
+ punpcklbw m6, m0 ; 56
+.v_w8_loop:
+ movq m13, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+%if cpuflag(ssse3)
+ pmaddubsw m14, m1, subpel0 ; a0
+ pmaddubsw m15, m2, subpel0 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, subpel1 ; a1
+ pmaddubsw m4, subpel1 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, subpel2 ; a2
+ pmaddubsw m6, subpel2 ; b2
+ punpcklbw m12, m0, m13 ; 67
+ movq m0, [srcq+strideq*0]
+ punpcklbw m13, m0 ; 78
+ paddw m14, m5
+ mova m5, m12
+ pmaddubsw m12, subpel3 ; a3
+ paddw m15, m6
+ mova m6, m13
+ pmaddubsw m13, subpel3 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+%else
+ mova m14, m1
+ PMADDUBSW m14, subpel0, m7, m12, 1 ; a0
+ mova m15, m2
+ PMADDUBSW m15, subpel0, m7, m12, 0 ; b0
+ mova m1, m3
+ PMADDUBSW m3, subpel1, m7, m12, 0 ; a1
+ mova m2, m4
+ PMADDUBSW m4, subpel1, m7, m12, 0 ; b1
+ paddw m14, m3
+ mova m3, m5
+ PMADDUBSW m5, subpel2, m7, m12, 0 ; a2
+ paddw m15, m4
+ mova m4, m6
+ PMADDUBSW m6, subpel2, m7, m12, 0 ; b2
+ paddw m15, m6
+ punpcklbw m12, m0, m13 ; 67
+ movq m0, [srcq+strideq*0]
+ punpcklbw m13, m0 ; 78
+ paddw m14, m5
+ mova m5, m12
+ PMADDUBSW m12, subpel3, m7, m6, 0 ; a3
+ paddw m14, m12
+ mova m6, m13
+ PMADDUBSW m13, subpel3, m7, m12, 0 ; b3
+ paddw m15, m13
+ PMULHRSW_8192 m14, m14, [base+pw_2]
+ PMULHRSW_8192 m15, m15, [base+pw_2]
+%endif
+ movu [tmpq+wq*0], m14
+ movu [tmpq+wq*2], m15
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w8_loop
+ add r5, 8
+ add r8, 16
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r8
+ sub r6d, 1<<8
+ jg .v_w8_loop0
+ RET
+%endif ;ARCH_X86_64
+%undef subpel0
+%undef subpel1
+%undef subpel2
+%undef subpel3
+.hv:
+ %assign stack_offset org_stack_offset
+ cmp wd, 4
+ jg .hv_w8
+ and mxd, 0x7f
+ movd m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
+%if ARCH_X86_32
+ mov mxd, myd
+ shr myd, 16
+ and mxd, 0x7f
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ mov strideq, stridem
+ %assign regs_used 6
+ ALLOC_STACK -mmsize*14
+ %assign regs_used 7
+ lea r5, [strideq*3+1]
+ sub srcq, r5
+ %define subpelv0 [rsp+mmsize*0]
+ %define subpelv1 [rsp+mmsize*1]
+ %define subpelv2 [rsp+mmsize*2]
+ %define subpelv3 [rsp+mmsize*3]
+ punpcklbw m0, m0
+ psraw m0, 8
+ pshufd m6, m0, q0000
+ mova subpelv0, m6
+ pshufd m6, m0, q1111
+ mova subpelv1, m6
+ pshufd m6, m0, q2222
+ mova subpelv2, m6
+ pshufd m6, m0, q3333
+ mova subpelv3, m6
+%else
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ %if cpuflag(ssse3)
+ ALLOC_STACK mmsize*14, 14
+ %else
+ ALLOC_STACK mmsize*14, 16
+ %endif
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ dec srcq
+ %define subpelv0 m10
+ %define subpelv1 m11
+ %define subpelv2 m12
+ %define subpelv3 m13
+ punpcklbw m0, m0
+ psraw m0, 8
+ %if cpuflag(ssse3)
+ mova m8, [base+pw_8192]
+ %else
+ mova m8, [base+pw_2]
+ %endif
+ mova m9, [base+pd_32]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+%endif
+ pshufd m7, m1, q0000
+%if notcpuflag(ssse3)
+ punpcklbw m7, m7
+ psraw m7, 8
+%endif
+%define hv4_line_0_0 4
+%define hv4_line_0_1 5
+%define hv4_line_0_2 6
+%define hv4_line_0_3 7
+%define hv4_line_0_4 8
+%define hv4_line_0_5 9
+%define hv4_line_1_0 10
+%define hv4_line_1_1 11
+%define hv4_line_1_2 12
+%define hv4_line_1_3 13
+%if ARCH_X86_32
+ %if cpuflag(ssse3)
+ %define w8192reg [base+pw_8192]
+ %else
+ %define w8192reg [base+pw_2]
+ %endif
+ %define d32reg [base+pd_32]
+%else
+ %define w8192reg m8
+ %define d32reg m9
+%endif
+ ; lower shuffle 0 1 2 3 4
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4]
+%else
+ %if ARCH_X86_64
+ mova m15, [pw_1]
+ %else
+ %define m15 m1
+ %endif
+%endif
+ movq m5, [srcq+strideq*0] ; 0 _ _ _
+ movhps m5, [srcq+strideq*1] ; 0 _ 1 _
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movq m4, [srcq+strideq*0] ; 2 _ _ _
+ movhps m4, [srcq+strideq*1] ; 2 _ 3 _
+ lea srcq, [srcq+strideq*2]
+%else
+ movq m4, [srcq+strideq*2] ; 2 _ _ _
+ movhps m4, [srcq+stride3q ] ; 2 _ 3 _
+ lea srcq, [srcq+strideq*4]
+%endif
+ PSHUFB_SUBPEL_H_4a m2, m5, m6, m1, m3, 1 ;H subpel_h_shuf4 0~1~
+ PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~
+ PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters
+ PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters
+ PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3
+ PMULHRSW_8192 m2, m2, w8192reg
+ SAVELINE_W4 m2, 2, 0
+ ; upper shuffle 2 3 4 5 6
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4+16]
+%endif
+ PSHUFB_SUBPEL_H_4b m2, m5, m6, m1, m3, 0 ;H subpel_h_shuf4 0~1~
+ PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~
+ PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters
+ PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters
+ PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3
+ PMULHRSW_8192 m2, m2, w8192reg
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m14, m2
+ %else
+ mova [esp+mmsize*4], m2
+ %endif
+%endif
+ ; lower shuffle
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4]
+%endif
+ movq m5, [srcq+strideq*0] ; 4 _ _ _
+ movhps m5, [srcq+strideq*1] ; 4 _ 5 _
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movq m4, [srcq+strideq*0] ; 6 _ _ _
+ add srcq, strideq
+%else
+ movq m4, [srcq+strideq*2] ; 6 _ _ _
+ add srcq, stride3q
+%endif
+ PSHUFB_SUBPEL_H_4a m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~
+ PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~
+ PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters
+ PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters
+ PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7
+ PMULHRSW_8192 m3, m3, w8192reg
+ SAVELINE_W4 m3, 3, 0
+ ; upper shuffle
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4+16]
+%endif
+ PSHUFB_SUBPEL_H_4b m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~
+ PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~
+ PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters
+ PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters
+ PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7
+ PMULHRSW_8192 m3, m3, w8192reg
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m2, m14
+ %else
+ mova m2, [esp+mmsize*4]
+ %endif
+%endif
+ ;process high
+ PALIGNR m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ ;process low
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ PALIGNR m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+.hv_w4_loop:
+ ;process low
+ pmaddwd m5, m1, subpelv0 ; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m14, m5
+ %else
+ mova [esp+mmsize*4], m5
+ %define m15 m3
+ %endif
+%endif
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4]
+%endif
+ movq m4, [srcq+strideq*0] ; 7
+ movhps m4, [srcq+strideq*1] ; 7 _ 8 _
+ PSHUFB_SUBPEL_H_4a m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~
+ PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters
+ PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878
+ PMULHRSW_8192 m4, m4, w8192reg
+ PALIGNR m3, m4, m0, 12, m5 ; 6787
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m5, m14
+ %else
+ mova m5, [esp+mmsize*4]
+ %endif
+%endif
+ paddd m5, d32reg ; pd_32
+ paddd m5, m4
+ psrad m5, 6
+ SAVELINE_W4 m0, 0, 0
+ SAVELINE_W4 m1, 1, 0
+ SAVELINE_W4 m2, 2, 0
+ SAVELINE_W4 m3, 3, 0
+ SAVELINE_W4 m5, 5, 0
+ ;process high
+ RESTORELINE_W4 m0, 0, 1
+ RESTORELINE_W4 m1, 1, 1
+ RESTORELINE_W4 m2, 2, 1
+ RESTORELINE_W4 m3, 3, 1
+ pmaddwd m5, m1, subpelv0; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m14, m5
+ %else
+ mova [esp+0xA0], m5
+ %endif
+%endif
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4+16]
+%endif
+ movq m4, [srcq+strideq*0] ; 7
+ movhps m4, [srcq+strideq*1] ; 7 _ 8 _
+ PSHUFB_SUBPEL_H_4b m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~
+ PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters
+ PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878
+ PMULHRSW_8192 m4, m4, w8192reg
+ PALIGNR m3, m4, m0, 12, m5 ; 6787
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m5, m14
+ %else
+ mova m5, [esp+0xA0]
+ %endif
+%endif
+ paddd m5, d32reg ; pd_32
+ paddd m5, m4
+ psrad m4, m5, 6
+ RESTORELINE_W4 m5, 5, 0
+ packssdw m5, m4
+ pshufd m5, m5, q3120
+ movu [tmpq], m5
+ lea srcq, [srcq+strideq*2]
+ add tmpq, 16
+ sub hd, 2
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ RESTORELINE_W4 m0, 0, 0
+ RESTORELINE_W4 m1, 1, 0
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ jg .hv_w4_loop
+ RET
+%undef subpelv0
+%undef subpelv1
+%undef subpelv2
+%undef subpelv3
+.hv_w8:
+ %assign stack_offset org_stack_offset
+%define hv8_line_1 0
+%define hv8_line_2 1
+%define hv8_line_3 2
+%define hv8_line_4 3
+%define hv8_line_6 4
+ shr mxd, 16
+%if ARCH_X86_32
+ %define subpelh0 [rsp+mmsize*5]
+ %define subpelh1 [rsp+mmsize*6]
+ %define subpelv0 [rsp+mmsize*7]
+ %define subpelv1 [rsp+mmsize*8]
+ %define subpelv2 [rsp+mmsize*9]
+ %define subpelv3 [rsp+mmsize*10]
+ %define accuv0 [rsp+mmsize*11]
+ %define accuv1 [rsp+mmsize*12]
+ movq m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
+ mov mxd, myd
+ shr myd, 16
+ and mxd, 0x7f
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ mov strideq, stridem
+ %assign regs_used 6
+ ALLOC_STACK -mmsize*14
+ %assign regs_used 7
+ %if STACK_ALIGNMENT < mmsize
+ %define tmpm [rsp+mmsize*13+gprsize*1]
+ %define srcm [rsp+mmsize*13+gprsize*2]
+ %define stridem [rsp+mmsize*13+gprsize*3]
+ mov tmpm, tmpq
+ mov stridem, strideq
+ %endif
+ %if cpuflag(ssse3)
+ pshufd m0, m1, q0000
+ pshufd m1, m1, q1111
+ %else
+ punpcklbw m1, m1
+ psraw m1, 8
+ pshufd m0, m1, q1010
+ punpckhqdq m1, m1
+ %endif
+ punpcklbw m5, m5
+ psraw m5, 8
+ pshufd m2, m5, q0000
+ pshufd m3, m5, q1111
+ pshufd m4, m5, q2222
+ pshufd m5, m5, q3333
+ mova subpelh0, m0
+ mova subpelh1, m1
+ mova subpelv0, m2
+ mova subpelv1, m3
+ mova subpelv2, m4
+ mova subpelv3, m5
+ lea r5, [strideq*3+3]
+ sub srcq, r5
+ mov srcm, srcq
+%else
+ ALLOC_STACK mmsize*5, 16
+ %define subpelh0 m10
+ %define subpelh1 m11
+ %define subpelv0 m12
+ %define subpelv1 m13
+ %define subpelv2 m14
+ %define subpelv3 m15
+ %define accuv0 m8
+ %define accuv1 m9
+ movq m0, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ %if cpuflag(ssse3)
+ pshufd subpelh0, m0, q0000
+ pshufd subpelh1, m0, q1111
+ %else
+ punpcklbw m0, m0
+ psraw m0, 8
+ pshufd subpelh0, m0, q1010
+ pshufd subpelh1, m0, q3232
+ mova m7, [base+pw_2]
+ %endif
+ punpcklbw m1, m1
+ psraw m1, 8
+ pshufd subpelv0, m1, q0000
+ pshufd subpelv1, m1, q1111
+ pshufd subpelv2, m1, q2222
+ pshufd subpelv3, m1, q3333
+ lea stride3q, [strideq*3]
+ sub srcq, 3
+ sub srcq, stride3q
+ mov r6, srcq
+ mov r8, tmpq
+%endif
+ lea r5d, [wq-4]
+ shl r5d, 14
+ add r5d, hd
+.hv_w8_loop0:
+%if cpuflag(ssse3)
+ %if ARCH_X86_64
+ mova m7, [base+subpel_h_shufA]
+ mova m8, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+ %define shufA m7
+ %define shufB m8
+ %define shufC m9
+ %else
+ %define shufA [base+subpel_h_shufA]
+ %define shufB [base+subpel_h_shufB]
+ %define shufC [base+subpel_h_shufC]
+ %endif
+%endif
+ PREP_8TAP_HV m4, srcq+strideq*0, m7, m0
+ PREP_8TAP_HV m5, srcq+strideq*1, m7, m0
+%if ARCH_X86_64
+ PREP_8TAP_HV m6, srcq+strideq*2, m7, m0
+ add srcq, stride3q
+ PREP_8TAP_HV m0, srcq+strideq*0, m7, m9
+%else
+ lea srcq, [srcq+strideq*2]
+ %if notcpuflag(ssse3)
+ mova [esp], m4
+ %endif
+ PREP_8TAP_HV m6, srcq+strideq*0, m7, m4
+ PREP_8TAP_HV m0, srcq+strideq*1, m7, m4
+ lea srcq, [srcq+strideq*2]
+%endif
+%if cpuflag(ssse3)
+ mova m7, [base+pw_8192]
+%else
+ mova m7, [base+pw_2]
+ %if ARCH_X86_32
+ mova m4, [esp]
+ %endif
+%endif
+ PMULHRSW_8192 m4, m4, m7
+ PMULHRSW_8192 m5, m5, m7
+ PMULHRSW_8192 m6, m6, m7
+ PMULHRSW_8192 m0, m0, m7
+ punpcklwd m1, m4, m5 ; 01
+ punpcklwd m2, m5, m6 ; 12
+ punpcklwd m3, m6, m0 ; 23
+ SAVELINE_W8 1, m1
+ SAVELINE_W8 2, m2
+ SAVELINE_W8 3, m3
+%if cpuflag(ssse3)
+ mova m7, [base+subpel_h_shufA]
+%endif
+%if ARCH_X86_64
+ PREP_8TAP_HV m4, srcq+strideq*1, m8, m9
+ PREP_8TAP_HV m5, srcq+strideq*2, m8, m9
+ add srcq, stride3q
+ PREP_8TAP_HV m6, srcq+strideq*0, m8, m9
+%else
+ %if notcpuflag(ssse3)
+ mova [esp+0x30], m0
+ %endif
+ PREP_8TAP_HV m4, srcq+strideq*0, m7, m0
+ PREP_8TAP_HV m5, srcq+strideq*1, m7, m0
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_HV m6, srcq+strideq*0, m7, m0
+%endif
+%if cpuflag(ssse3)
+ mova m7, [base+pw_8192]
+%elif ARCH_X86_32
+ mova m0, [esp+0x30]
+ mova m7, [base+pw_2]
+%endif
+ PMULHRSW_8192 m1, m4, m7
+ PMULHRSW_8192 m2, m5, m7
+ PMULHRSW_8192 m3, m6, m7
+ punpcklwd m4, m0, m1 ; 34
+ punpcklwd m5, m1, m2 ; 45
+ punpcklwd m6, m2, m3 ; 56
+ SAVELINE_W8 6, m3
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+.hv_w8_loop:
+ SAVELINE_W8 1, m3
+ SAVELINE_W8 2, m4
+ SAVELINE_W8 3, m5
+ SAVELINE_W8 4, m6
+%if ARCH_X86_32
+ pmaddwd m0, m1, subpelv0 ; a0
+ pmaddwd m7, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m0, m3
+ paddd m7, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m0, m5
+ paddd m7, m6
+ mova m5, [base+pd_32]
+ paddd m0, m5
+ paddd m7, m5
+ mova accuv0, m0
+ mova accuv1, m7
+%else
+ pmaddwd accuv0, m1, subpelv0 ; a0
+ pmaddwd accuv1, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd accuv0, m3
+ paddd accuv1, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd accuv0, m5
+ paddd accuv1, m6
+ mova m7, [base+pd_32]
+ paddd accuv0, m7
+ paddd accuv1, m7
+ %if cpuflag(ssse3)
+ mova m7, [base+subpel_h_shufB]
+ mova m6, [base+subpel_h_shufC]
+ mova m5, [base+subpel_h_shufA]
+ %define shufA m5
+ %define shufB m7
+ %define shufC m6
+ %endif
+%endif
+ PREP_8TAP_HV m0, srcq+strideq*1, m5, m6
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_HV m4, srcq+strideq*0, m5, m6
+%if cpuflag(ssse3)
+ mova m5, [base+pw_8192]
+%else
+ mova m5, [base+pw_2]
+%endif
+ PMULHRSW_8192 m0, m0, m5
+ PMULHRSW_8192 m4, m4, m5
+ RESTORELINE_W8 6, m6
+ punpcklwd m5, m6, m0 ; 67
+ punpcklwd m6, m0, m4 ; 78
+ pmaddwd m1, m5, subpelv3 ; a3
+ paddd m2, m1, accuv0
+ pmaddwd m1, m6, subpelv3 ; b3
+ paddd m1, m1, accuv1
+ psrad m2, 6
+ psrad m1, 6
+ packssdw m2, m1
+ movq [tmpq+wq*0], m2
+ movhps [tmpq+wq*2], m2
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jle .hv_w8_outer
+ SAVELINE_W8 6, m4
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+ RESTORELINE_W8 4, m4
+ jmp .hv_w8_loop
+.hv_w8_outer:
+%if ARCH_X86_32
+ mov srcq, srcm
+ mov tmpq, tmpm
+ movzx hd, r5w
+ add srcq, 4
+ add tmpq, 8
+ mov srcm, srcq
+ mov tmpm, tmpq
+%else
+ add r6, 4
+ add r8, 8
+ movzx hd, r5b
+ mov srcq, r6
+ mov tmpq, r8
+%endif
+ sub r5d, 1<<16
+ jg .hv_w8_loop0
+ RET
+%endmacro
+
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
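+; Register-alias bookkeeping for the scaled prep path: SAVE_REG/LOAD_REG
+; stash and restore the r%1/r%1q/r%1d aliases (plus r%1m on x86-32), and
+; REMAP_REG redirects r%1 to r%2, optionally pointing r%1m at a stack slot.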
+%macro SAVE_REG 1
+ %xdefine r%1_save r%1
+ %xdefine r%1q_save r%1q
+ %xdefine r%1d_save r%1d
+ %if ARCH_X86_32
+ %define r%1m_save [rstk+stack_offset+(%1+1)*4]
+ %endif
+%endmacro
+
+%macro LOAD_REG 1
+ %xdefine r%1 r%1_save
+ %xdefine r%1q r%1q_save
+ %xdefine r%1d r%1d_save
+ %if ARCH_X86_32
+ %define r%1m r%1m_save
+ %endif
+ %undef r%1d_save
+ %undef r%1q_save
+ %undef r%1_save
+%endmacro
+
+%macro REMAP_REG 2-3
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+ %if ARCH_X86_32
+ %if %3 == 0
+ %xdefine r%1m r%2m
+ %else
+ %define r%1m [rstk+stack_offset+(%1+1)*4]
+ %endif
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %if ARCH_X86_64
+ SAVE_REG 14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %else
+ SAVE_REG 5
+ %assign %%i 5
+ %rep 5
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j, 0
+ %assign %%i %%i-1
+ %endrep
+ %endif
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %if ARCH_X86_64
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ LOAD_REG 14
+ %else
+ %rep 4
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j, 1
+ %assign %%i %%i+1
+ %endrep
+ LOAD_REG 5
+ %endif
+ %endif
+%endmacro
+
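+; Return helper: restore the default register mapping before RET, then
+; optionally re-apply the prep mapping for the code assembled after this
+; return point.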
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
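+; MC_8TAP_SCALED_H: horizontal pass over two source rows. Every output pixel
+; uses its own source offset and its own packed 8-tap filter (registers on
+; x86-64, stack slots on x86-32); pmaddubsw/phaddw reduce the taps and
+; pmulhrsw with m12 (pw_8192) applies the rounding shift.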
+%if ARCH_X86_64
+ %macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3]
+ SWAP m%2, m%5
+ movq m%1, [srcq+ r4]
+ movq m%2, [srcq+ r6]
+ movhps m%1, [srcq+ r7]
+ movhps m%2, [srcq+ r9]
+ movq m%3, [srcq+r10]
+ movq m%4, [srcq+r11]
+ movhps m%3, [srcq+r13]
+ movhps m%4, [srcq+ rX]
+ add srcq, ssq
+ movq m%5, [srcq+ r4]
+ movq m%6, [srcq+ r6]
+ movhps m%5, [srcq+ r7]
+ movhps m%6, [srcq+ r9]
+ movq m%7, [srcq+r10]
+ movq m%8, [srcq+r11]
+ movhps m%7, [srcq+r13]
+ movhps m%8, [srcq+ rX]
+ add srcq, ssq
+ pmaddubsw m%1, m%9
+ pmaddubsw m%5, m%9
+ pmaddubsw m%2, m%10
+ pmaddubsw m%6, m%10
+ pmaddubsw m%3, m%11
+ pmaddubsw m%7, m%11
+ pmaddubsw m%4, m%12
+ pmaddubsw m%8, m%12
+ phaddw m%1, m%2
+ phaddw m%5, m%6
+ phaddw m%3, m%4
+ phaddw m%7, m%8
+ phaddw m%1, m%3
+ phaddw m%5, m%7
+ pmulhrsw m%1, m12
+ pmulhrsw m%5, m12
+ SWAP m%2, m%5
+ %endmacro
+%else
+ %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem_start, load_fh_offsets
+ %if %3 == 1
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ %endif
+ movq m0, [srcq+r0]
+ movq m1, [srcq+rX]
+ movhps m0, [srcq+r4]
+ movhps m1, [srcq+r5]
+ add srcq, ssq
+ movq m4, [srcq+r0]
+ movq m5, [srcq+rX]
+ movhps m4, [srcq+r4]
+ movhps m5, [srcq+r5]
+ mov r0, [esp+16]
+ mov rX, [esp+24]
+ mov r4, [esp+20]
+ mov r5, [esp+28]
+ sub srcq, ssq
+ movq m2, [srcq+r0]
+ movq m3, [srcq+rX]
+ movhps m2, [srcq+r4]
+ movhps m3, [srcq+r5]
+ add srcq, ssq
+ movq m6, [srcq+r0]
+ movq m7, [srcq+rX]
+ movhps m6, [srcq+r4]
+ movhps m7, [srcq+r5]
+ add srcq, ssq
+ pmaddubsw m0, [esp+%1+ 0]
+ pmaddubsw m4, [esp+%1+ 0]
+ pmaddubsw m1, [esp+%1+16]
+ pmaddubsw m5, [esp+%1+16]
+ pmaddubsw m2, [esp+%1+32]
+ pmaddubsw m6, [esp+%1+32]
+ pmaddubsw m3, [esp+%1+48]
+ pmaddubsw m7, [esp+%1+48]
+ phaddw m0, m1
+ phaddw m4, m5
+ phaddw m2, m3
+ phaddw m6, m7
+ phaddw m0, m2
+ phaddw m4, m6
+ pmulhrsw m0, m12
+ pmulhrsw m4, m12
+ %if %2 != 0
+ mova [esp+%2+ 0], m0
+ mova [esp+%2+16], m4
+ %endif
+ %endmacro
+%endif
+
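+; MC_8TAP_SCALED: body of the SSSE3 put/prep_8tap_scaled_8bpc functions
+; (%1 = put or prep). mx/dx and my/dy are 10-bit fixed-point positions and
+; steps; the top bits select the subpel filter per column/row, and dedicated
+; fast paths handle dy == 1024 (.dy1) and dy == 2048 (.dy2), i.e. vertical
+; steps of exactly one or two source rows per output row.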
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isprep 0
+ %if ARCH_X86_64
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled_8bpc, 2, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %else
+cglobal put_8tap_scaled_8bpc, 2, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %else ; ARCH_X86_32
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled_8bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %else
+cglobal put_8tap_scaled_8bpc, 0, 7, 8, -0x200-0x20, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %endif
+ %xdefine base_reg r12
+ %define rndshift 10
+%else ; prep
+ %assign isprep 1
+ %if ARCH_X86_64
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled_8bpc, 2, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
+ %xdefine tmp_stridem r14q
+ %else
+cglobal prep_8tap_scaled_8bpc, 2, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
+ %define tmp_stridem qword [rsp+0x138]
+ %endif
+ %xdefine base_reg r11
+ %else ; ARCH_X86_32
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled_8bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
+ %else
+cglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %define tmp_stridem dword [esp+0x138]
+ %endif
+ %define rndshift 6
+%endif
+%if ARCH_X86_32
+ mov [esp+0x1f0], t0d
+ mov [esp+0x1f4], t1d
+ %if !isprep && required_stack_alignment > STACK_ALIGNMENT
+ mov dstd, dstm
+ mov dsd, dsm
+ mov srcd, srcm
+ mov ssd, ssm
+ mov hd, hm
+ mov r4, mxm
+ %define r0m [esp+0x200]
+ %define dsm [esp+0x204]
+ %define dsmp dsm
+ %define r1m dsm
+ %define r2m [esp+0x208]
+ %define ssm [esp+0x20c]
+ %define r3m ssm
+ %define hm [esp+0x210]
+ %define mxm [esp+0x214]
+ mov r0m, dstd
+ mov dsm, dsd
+ mov r2m, srcd
+ mov ssm, ssd
+ mov hm, hd
+ mov r0, mym
+ mov r1, dxm
+ mov r2, dym
+ %define mym [esp+0x218]
+ %define dxm [esp+0x09c]
+ %define dym [esp+0x21c]
+ mov mxm, r4
+ mov mym, r0
+ mov dxm, r1
+ mov dym, r2
+ tzcnt wd, wm
+ %endif
+ %if isprep && required_stack_alignment > STACK_ALIGNMENT
+ %xdefine base_reg r5
+ %else
+ %xdefine base_reg r6
+ %endif
+ mov ssd, ssm
+%endif
+ LEA base_reg, %1_8tap_scaled_8bpc_ssse3
+%xdefine base base_reg-%1_8tap_scaled_8bpc_ssse3
+%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
+ tzcnt wd, wm
+%endif
+%if ARCH_X86_32
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+%endif
+ movd m8, dxm
+ movd m14, mxm
+ pshufd m8, m8, q0000
+ pshufd m14, m14, q0000
+%if isprep && UNIX64
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+%endif
+%if ARCH_X86_64
+ mov dyd, dym
+%endif
+%ifidn %1, put
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %elif ARCH_X86_64
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %endif
+ %if ARCH_X86_64
+ %if required_stack_alignment > STACK_ALIGNMENT
+ %define dsm [rsp+0x138]
+ %define rX r1
+ %define rXd r1d
+ %else
+ %define dsm dsq
+ %define rX r14
+ %define rXd r14d
+ %endif
+ %else
+ %define rX r1
+ %endif
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %elif ARCH_X86_64
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm [rsp+0x94]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %if ARCH_X86_64
+ %define rX r14
+ %define rXd r14d
+ %else
+ %define rX r3
+ %endif
+%endif
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+ mova m12, [base+pw_8192]
+ %ifidn %1, put
+ mova m13, [base+pd_512]
+ %else
+ mova m13, [base+pd_32]
+ %endif
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m12 [base+pw_8192]
+ %ifidn %1, put
+ %define m13 [base+pd_512]
+ %else
+ %define m13 [base+pd_32]
+ %endif
+%endif
+ pxor m9, m9
+%if ARCH_X86_64
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+%else
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ mov r1, [esp+0x1f4]
+ lea r0, [ssq*3]
+ movzx r2, r1b
+ shr r1, 16
+ cmp dword hm, 6
+ cmovs r1, r2
+ mov [esp+0x1f4], r1
+ mov r1, r1m
+ mov r2, r2m
+ sub srcq, r0
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define ss3q r0
+ %define myd r4
+ %define dyd dword dym
+ %define hd dword hm
+%endif
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+ %else
+ movzx r4, byte [esp+0x1f0]
+ dec srcq
+ movd m15, r4
+ %endif
+ punpckldq m9, m8
+ SWAP m8, m9
+ paddd m14, m8 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+ %else
+ %define m11 [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ psrldq m15, 4
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_dw]
+ mova m6, [base+subpel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m9, m9
+ pcmpeqd m8, m9
+ psrld m14, 10
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [rsp+0x180], m14
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m8 m5
+ %define m15 m6
+ %endif
+ movq m0, [srcq+ssq*0]
+ movq m2, [srcq+ssq*2]
+ movhps m0, [srcq+ssq*1]
+ movhps m2, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ %if ARCH_X86_64
+ pshufb m14, m5
+ paddb m14, m6
+ %endif
+ movq m1, [srcq+ssq*0]
+ movq m3, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*1]
+ movhps m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpckldq m15, m7
+ punpcklqdq m15, m15
+ %if ARCH_X86_64
+ pand m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ %else
+ pand m7, m8, m11
+ pandn m8, m15
+ %define m8 m6
+ %define m15 m5
+ por m15, m7
+ mova [rsp+0x190], m15
+ %endif
+ pshufb m0, m14
+ pshufb m2, m14
+ pshufb m1, m14
+ pshufb m3, m14
+ pmaddubsw m0, m15
+ pmaddubsw m2, m15
+ pmaddubsw m1, m15
+ pmaddubsw m3, m15
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12 ; 0 1 2 3
+ pmulhrsw m1, m12 ; 4 5 6 7
+ palignr m2, m1, m0, 4 ; 1 2 3 4
+ punpcklwd m3, m0, m2 ; 01 12
+ punpckhwd m0, m2 ; 23 34
+ pshufd m5, m1, q0321 ; 5 6 7 _
+ punpcklwd m2, m1, m5 ; 45 56
+ punpckhwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mov myd, mym
+ mov r0, r0m
+ mova [rsp+0x1a0], m3
+ mova [rsp+0x1b0], m0
+ mova [rsp+0x1c0], m2
+ mova [rsp+0x1d0], m4
+ %endif
+.w2_loop:
+ and myd, 0x3ff
+ %if ARCH_X86_64
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m11, r6q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m5, m3, m8
+ pmaddwd m6, m0, m9
+ pmaddwd m7, m2, m10
+ pmaddwd m8, m4, m11
+ paddd m5, m6
+ paddd m7, m8
+ %else
+ mov mym, myd
+ mov r1, [esp+0x1f4]
+ xor r3, r3
+ shr r4, 6
+ lea r1, [r1+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r1*8+0]
+ cmovnz r3, [base+subpel_filters+r1*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m5, m7, q0000
+ pshufd m6, m7, q1111
+ pmaddwd m3, m5
+ pmaddwd m0, m6
+ pshufd m5, m7, q2222
+ pshufd m7, m7, q3333
+ pmaddwd m2, m5
+ pmaddwd m4, m7
+ paddd m3, m0
+ paddd m2, m4
+ SWAP m5, m3
+ SWAP m7, m2
+ %endif
+ paddd m5, m13
+ paddd m5, m7
+ psrad m5, 10
+ packssdw m5, m5
+ packuswb m5, m5
+ %if ARCH_X86_64
+ pextrw r6d, m5, 0
+ mov [dstq], r6w
+ add dstq, dsq
+ dec hd
+ jz .ret
+ add myd, dyd
+ %else
+ pextrw r3d, m5, 0
+ mov [dstq], r3w
+ add dstq, dsm
+ dec hd
+ jz .ret
+ mov myd, mym
+ add myd, dym
+ %endif
+ test myd, ~0x3ff
+ %if ARCH_X86_32
+ SWAP m3, m5
+ SWAP m2, m7
+ mova m3, [rsp+0x1a0]
+ mova m0, [rsp+0x1b0]
+ mova m2, [rsp+0x1c0]
+ mova m4, [rsp+0x1d0]
+ %define m14 [esp+0x180]
+ %define m15 [esp+0x190]
+ %endif
+ jz .w2_loop
+ %if ARCH_X86_32
+ mov r3, r3m
+ %endif
+ movq m5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps m3, m0, q1032 ; 01 12
+ shufps m0, m2, q1032 ; 23 34
+ shufps m2, m4, q1032 ; 45 56
+ pshufb m5, m14
+ pmaddubsw m5, m15
+ phaddw m5, m5
+ pmulhrsw m5, m12
+ palignr m4, m5, m1, 12
+ punpcklqdq m1, m4, m4 ; 6 7 6 7
+ punpcklwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mova [rsp+0x1a0], m3
+ mova [rsp+0x1b0], m0
+ mova [rsp+0x1c0], m2
+ mova [rsp+0x1d0], m4
+ %endif
+ jmp .w2_loop
+.w2_skip_line:
+ movhps m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m3, m0 ; 01 12
+ mova m0, m2 ; 23 34
+ pshufb m5, m14
+ pmaddubsw m5, m15
+ phaddw m5, m5
+ pmulhrsw m5, m12 ; 6 7 6 7
+ palignr m4, m5, m1, 8 ; 4 5 6 7
+ pshufd m5, m4, q0321 ; 5 6 7 _
+ mova m1, m4
+ punpcklwd m2, m4, m5 ; 45 56
+ punpckhwd m4, m5 ; 67 __
+ %if ARCH_X86_32
+ mova [rsp+0x1a0], m3
+ mova [rsp+0x1b0], m0
+ mova [rsp+0x1c0], m2
+ mova [rsp+0x1d0], m4
+ %endif
+ jmp .w2_loop
+%endif
+INIT_XMM ssse3
+.w4:
+%if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+%else
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ movzx r4, byte [esp+0x1f0]
+ dec srcq
+ movd m15, r4
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+%else
+ %define m11 [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd m15, m0
+ psrldq m7, m15, 8
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r6d, m15
+ movd r13d, m7
+ movd m15, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+r11*8+2]
+ movd m3, [base+subpel_filters+ r6*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+%else
+ movd r0, m15
+ movd rX, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r4, m15
+ movd r5, m7
+ movd m1, [base+subpel_filters+r0*8+2]
+ movd m2, [base+subpel_filters+rX*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r5*8+2]
+ movifprep r3, r3m
+ SWAP m4, m7
+ %define m15 m1
+%endif
+ mova m5, [base+bdct_lb_dw]
+ movq m6, [base+subpel_s_shuf2]
+ psrld m14, 10
+ punpckldq m15, m3
+ punpckldq m2, m4
+ punpcklqdq m15, m2
+ punpcklqdq m6, m6
+ pshufb m14, m5
+ paddb m14, m6
+%if ARCH_X86_64
+ pcmpeqd m0, m9
+ pand m11, m0
+%else
+ mova [esp+0x180], m14
+ SWAP m7, m4
+ pxor m3, m3
+ pcmpeqd m0, m3
+ pand m2, m11, m0
+ %define m11 m2
+%endif
+ pandn m0, m15
+%if ARCH_X86_64
+ SWAP m15, m0
+%else
+ %define m15 m0
+%endif
+ por m15, m11
+%if ARCH_X86_64
+ movu m7, [srcq+ssq*0]
+ movu m9, [srcq+ssq*1]
+ movu m8, [srcq+ssq*2]
+ movu m10, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ movu m2, [srcq+ssq*0]
+ movu m4, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ movu m5, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m7, m14
+ pshufb m9, m14
+ pshufb m8, m14
+ pshufb m10, m14
+ pshufb m2, m14
+ pshufb m4, m14
+ pshufb m3, m14
+ pshufb m5, m14
+ pmaddubsw m7, m15
+ pmaddubsw m9, m15
+ pmaddubsw m8, m15
+ pmaddubsw m10, m15
+ pmaddubsw m2, m15
+ pmaddubsw m4, m15
+ pmaddubsw m3, m15
+ pmaddubsw m5, m15
+ phaddw m7, m9
+ phaddw m8, m10
+ phaddw m9, m2, m4
+ phaddw m3, m5
+ pmulhrsw m7, m12 ; 0 1
+ pmulhrsw m8, m12 ; 2 3
+ pmulhrsw m9, m12 ; 4 5
+ pmulhrsw m3, m12 ; 6 7
+ shufps m4, m7, m8, q1032 ; 1 2
+ shufps m5, m8, m9, q1032 ; 3 4
+ shufps m6, m9, m3, q1032 ; 5 6
+ psrldq m11, m3, 8 ; 7 _
+ punpcklwd m0, m7, m4 ; 01
+ punpckhwd m7, m4 ; 12
+ punpcklwd m1, m8, m5 ; 23
+ punpckhwd m8, m5 ; 34
+ punpcklwd m2, m9, m6 ; 45
+ punpckhwd m9, m6 ; 56
+ punpcklwd m3, m11 ; 67
+ mova [rsp+0x00], m7
+ mova [rsp+0x10], m8
+ mova [rsp+0x20], m9
+%else
+ mova [esp+0x190], m15
+ lea ss3q, [ssq*3]
+ movu m2, [srcq+ssq*0]
+ movu m3, [srcq+ssq*1]
+ movu m7, [srcq+ssq*2]
+ movu m6, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m7, m14
+ pshufb m6, m14
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ pmaddubsw m7, m15
+ pmaddubsw m6, m15
+ phaddw m2, m3
+ phaddw m7, m6
+ movu m1, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ movu m6, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m1, m14
+ pshufb m5, m14
+ pshufb m3, m14
+ pshufb m6, m14
+ pmaddubsw m1, m15
+ pmaddubsw m5, m15
+ pmaddubsw m3, m15
+ pmaddubsw m6, m15
+ phaddw m1, m5
+ phaddw m3, m6
+ pmulhrsw m2, m12
+ pmulhrsw m7, m12
+ pmulhrsw m1, m12
+ pmulhrsw m3, m12
+ shufps m4, m2, m7, q1032 ; 1 2
+ shufps m5, m7, m1, q1032 ; 3 4
+ shufps m6, m1, m3, q1032 ; 5 6
+ psrldq m0, m3, 8 ; 7 _
+ mova [esp+0x1a0], m0
+ %define m11 [esp+0x1a0]
+ punpcklwd m0, m2, m4 ; 01
+ punpckhwd m2, m4 ; 12
+ punpcklwd m4, m7, m5 ; 23
+ punpckhwd m7, m5 ; 34
+ punpcklwd m5, m1, m6 ; 45
+ punpckhwd m1, m6 ; 56
+ punpcklwd m3, [esp+0x1a0] ; 67
+ mov myd, mym
+ mov r0, r0m
+ mova [esp+0x1b0], m0 ; 01
+ mova [esp+0x1c0], m4 ; 23
+ mova [esp+0x1d0], m5 ; 45
+ mova [esp+0x1e0], m3 ; 67
+ mova [rsp+0x00], m2 ; 12
+ mova [rsp+0x10], m7 ; 34
+ mova [rsp+0x20], m1 ; 56
+ SWAP m1, m4
+ SWAP m2, m5
+%endif
+.w4_loop:
+ and myd, 0x3ff
+%if ARCH_X86_64
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m10, r6q
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ pmaddwd m7, m3, m10
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+%else
+ mov mym, myd
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr r4, 6
+ lea r5, [r5+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ pmaddwd m2, m6
+ pmaddwd m3, m7
+ paddd m0, m1
+ paddd m2, m3
+ paddd m0, m13
+ paddd m0, m2
+ SWAP m4, m0
+%endif
+ psrad m4, rndshift
+ packssdw m4, m4
+%ifidn %1, put
+ packuswb m4, m4
+ movd [dstq], m4
+ add dstq, dsmp
+%else
+ movq [tmpq], m4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+%if ARCH_X86_64
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+%else
+ SWAP m0, m4
+ mov myd, mym
+ mov r3, r3m
+ add myd, dym
+ test myd, ~0x3ff
+ jnz .w4_next_line
+ mova m0, [esp+0x1b0]
+ mova m1, [esp+0x1c0]
+ mova m2, [esp+0x1d0]
+ mova m3, [esp+0x1e0]
+ jmp .w4_loop
+.w4_next_line:
+ %define m14 [esp+0x180]
+ %define m15 [esp+0x190]
+%endif
+ movu m4, [srcq]
+ test myd, 0x400
+ jz .w4_skip_line
+%if ARCH_X86_64
+ mova m0, [rsp+0x00]
+ mova [rsp+0x00], m1
+ mova m1, [rsp+0x10]
+ mova [rsp+0x10], m2
+ mova m2, [rsp+0x20]
+ mova [rsp+0x20], m3
+%else
+ mova m5, [esp+0x1c0]
+ mova m0, [rsp+0x000]
+ mova [rsp+0x00], m5
+ mova [esp+0x1b0], m0
+ mova m6, [esp+0x1d0]
+ mova m1, [rsp+0x010]
+ mova [rsp+0x10], m6
+ mova [esp+0x1c0], m1
+ mova m7, [esp+0x1e0]
+ mova m2, [rsp+0x020]
+ mova [rsp+0x20], m7
+ mova [esp+0x1d0], m2
+%endif
+ pshufb m4, m14
+ pmaddubsw m4, m15
+ phaddw m4, m4
+ pmulhrsw m4, m12
+ punpcklwd m3, m11, m4
+%if ARCH_X86_32
+ mova [esp+0x1e0], m3
+%endif
+ mova m11, m4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+%if ARCH_X86_32
+ mova m0, [esp+0x1c0]
+ mova m1, [esp+0x1d0]
+ mova m2, [esp+0x1e0]
+%endif
+ movu m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m6, [rsp+0x10]
+ mova m7, [rsp+0x20]
+ pshufb m4, m14
+ pshufb m5, m14
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ phaddw m4, m5
+ pmulhrsw m4, m12
+ punpcklwd m5, m11, m4
+ mova [rsp+0x00], m6
+ mova [rsp+0x10], m7
+ mova [rsp+0x20], m5
+%if ARCH_X86_64
+ psrldq m11, m4, 8
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ punpcklwd m3, m4, m11
+%else
+ psrldq m6, m4, 8
+ punpcklwd m3, m4, m6
+ mova [esp+0x1a0], m6
+ mova [esp+0x1b0], m0
+ mova [esp+0x1c0], m1
+ mova [esp+0x1d0], m2
+ mova [esp+0x1e0], m3
+%endif
+ jmp .w4_loop
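+; Widths of 8 and above are processed in vertical strips of 8 columns:
+; [rsp+0x90] holds the number of strips (w/8) and, for prep, tmp_stridem is
+; the output row stride in bytes.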
+INIT_XMM ssse3
+.w8:
+ mov dword [rsp+0x90], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
+.w16:
+ mov dword [rsp+0x90], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [rsp+0x90], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [rsp+0x90], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [rsp+0x90], 16
+ movifprep tmp_stridem, 256
+.w_start:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+%if ARCH_X86_64
+ shr t0d, 16
+ movd m15, t0d
+%else
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq ssm
+ %endif
+ mov r4, [esp+0x1f0]
+ shr r4, 16
+ movd m15, r4
+ mov r0, r0m
+ mov myd, mym
+%endif
+ sub srcq, 3
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ mova [rsp+0x100], m7
+ mova [rsp+0x120], m15
+ mov [rsp+0x098], srcq
+ mov [rsp+0x130], r0q ; dstq / tmpq
+%if ARCH_X86_64 && UNIX64
+ mov hm, hd
+%elif ARCH_X86_32
+ mov r5, hm
+ mov [esp+0x094], myd
+ mov [esp+0x134], r5
+%endif
+ jmp .hloop
+.hloop_prep:
+ dec dword [rsp+0x090]
+ jz .ret
+%if ARCH_X86_64
+ add qword [rsp+0x130], 8*(isprep+1)
+ mov hd, hm
+%else
+ add dword [esp+0x130], 8*(isprep+1)
+ mov myd, [esp+0x094]
+ mov r5, [esp+0x134]
+ mov r0, [esp+0x130]
+%endif
+ mova m7, [rsp+0x100]
+ mova m14, [rsp+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+%endif
+ mova m15, [rsp+0x120]
+ pxor m9, m9
+ mov srcq, [rsp+0x098]
+%if ARCH_X86_64
+ mov r0q, [rsp+0x130] ; dstq / tmpq
+%else
+ mov mym, myd
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.hloop:
+%if ARCH_X86_64
+ mova m11, [base+pq_0x40000000]
+%else
+ %define m11 [base+pq_0x40000000]
+%endif
+ psrld m2, m14, 10
+ mova [rsp], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m9
+ psrldq m2, m5, 8
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+ pxor m2, m2
+ %define m9 m2
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ mova [rsp+0x110], m14
+ psrldq m4, m15, 8
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ psrldq m4, m14, 8
+ movd r10d, m14
+ movd r11d, m4
+ psrldq m14, 4
+ psrldq m4, 4
+ movd r13d, m14
+ movd rXd, m4
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m14, m5, q1100
+ pshufd m5, m5, q3322
+ pand m7, m11, m4
+ pand m8, m11, m6
+ pand m15, m11, m14
+ pand m11, m11, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m14, m2
+ pandn m5, m3
+ por m7, m4
+ por m8, m6
+ por m15, m14
+ por m11, m5
+ mova [rsp+0x10], m7
+ mova [rsp+0x20], m8
+ mova [rsp+0x30], m15
+ mova [rsp+0x40], m11
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1
+ mova [rsp+0x50], m1
+ mova [rsp+0x60], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3
+ mova [rsp+0x70], m3
+ mova [rsp+0x80], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5
+ MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7
+ SWAP m7, m0
+ SWAP m8, m14
+ mova m1, [rsp+0x50]
+ mova m2, [rsp+0x60]
+ mova m3, [rsp+0x70]
+ mova m9, [rsp+0x80]
+ mov myd, mym
+ mov dyd, dym
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m9 ; 23a
+ punpckhwd m3, m9 ; 23b
+ mova [rsp+0x50], m4
+ mova [rsp+0x60], m5
+ mova [rsp+0x70], m6
+ mova [rsp+0x80], m7
+ SWAP m14, m8
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m11, r6q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pshufd m5, m11, q0000
+ pshufd m7, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m4, m5, m0
+ pmaddwd m5, m5, m1
+ pmaddwd m6, m7, m2
+ pmaddwd m7, m7, m3
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [rsp+0x50], m10
+ pmaddwd m7, [rsp+0x60], m10
+ pmaddwd m8, [rsp+0x70], m11
+ pmaddwd m9, [rsp+0x80], m11
+ paddd m4, m6
+ paddd m5, m7
+ paddd m4, m8
+ paddd m5, m9
+%else
+ movd r0, m15
+ movd rX, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r4, m15
+ movd r5, m4
+ mova m14, [esp+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [esp+16], m14
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m11, m4
+ pand m1, m11, m6
+ pand m2, m11, m7
+ pand m3, m11, m5
+ pandn m4, [esp+0x20]
+ pandn m6, [esp+0x30]
+ pandn m7, [esp+0x40]
+ pandn m5, [esp+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ MC_8TAP_SCALED_H 0x20, 0x140, 0 ; 0-1
+ MC_8TAP_SCALED_H 0x20, 0x160 ; 2-3
+ MC_8TAP_SCALED_H 0x20, 0x180 ; 4-5
+ MC_8TAP_SCALED_H 0x20, 0x1a0 ; 6-7
+ mova m5, [esp+0x180]
+ mova m6, [esp+0x190]
+ mova m7, [esp+0x1a0]
+ mova m0, [esp+0x1b0]
+ mov myd, mym
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [esp+0x180], m4
+ mova [esp+0x190], m5
+ mova [esp+0x1a0], m6
+ mova [esp+0x1b0], m7
+ mova m1, [esp+0x140]
+ mova m2, [esp+0x150]
+ mova m3, [esp+0x160]
+ mova m4, [esp+0x170]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova [esp+0x140], m0
+ mova [esp+0x150], m1
+ mova [esp+0x160], m2
+ mova [esp+0x170], m3
+.vloop:
+ mov r0, r0m
+ mov r5, [esp+0x1f4]
+ and myd, 0x3ff
+ mov mym, myd
+ xor r3, r3
+ shr r4, 6
+ lea r5, [r5+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ paddd m0, m2
+ paddd m1, m3
+ pmaddwd m2, [esp+0x180], m6
+ pmaddwd m3, [esp+0x190], m6
+ pmaddwd m4, [esp+0x1a0], m7
+ pmaddwd m5, [esp+0x1b0], m7
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m13
+ paddd m1, m13
+ paddd m4, m0
+ paddd m5, m1
+%endif
+ psrad m4, rndshift
+ psrad m5, rndshift
+ packssdw m4, m5
+%ifidn %1, put
+ packuswb m4, m4
+ movq [dstq], m4
+ add dstq, dsm
+%else
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+%if ARCH_X86_64
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [rsp+0x140], myd
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ jz .skip_line
+ mova m14, [base+unpckw]
+ movq m6, [srcq+r10]
+ movq m7, [srcq+r11]
+ movhps m6, [srcq+r13]
+ movhps m7, [srcq+ rX]
+ movq m4, [srcq+ r4]
+ movq m5, [srcq+ r6]
+ movhps m4, [srcq+ r7]
+ movhps m5, [srcq+ r9]
+ add srcq, ssq
+ mov myd, [rsp+0x140]
+ mov dyd, dym
+ pshufd m9, m14, q1032
+ pshufb m0, m14 ; 0a 1a
+ pshufb m1, m14 ; 0b 1b
+ pshufb m2, m9 ; 3a 2a
+ pshufb m3, m9 ; 3b 2b
+ pmaddubsw m6, [rsp+0x30]
+ pmaddubsw m7, [rsp+0x40]
+ pmaddubsw m4, [rsp+0x10]
+ pmaddubsw m5, [rsp+0x20]
+ phaddw m6, m7
+ phaddw m4, m5
+ phaddw m4, m6
+ pmulhrsw m4, m12
+ pshufb m5, [rsp+0x50], m14 ; 4a 5a
+ pshufb m6, [rsp+0x60], m14 ; 4b 5b
+ pshufb m7, [rsp+0x70], m9 ; 7a 6a
+ pshufb m8, [rsp+0x80], m9 ; 7b 6b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m5 ; 34a
+ punpcklwd m3, m6 ; 34b
+ punpckhwd m5, m7 ; 56a
+ punpckhwd m6, m8 ; 56b
+ punpcklwd m7, m4 ; 78a
+ punpckhqdq m4, m4
+ punpcklwd m8, m4 ; 78b
+ mova [rsp+0x50], m5
+ mova [rsp+0x60], m6
+ mova [rsp+0x70], m7
+ mova [rsp+0x80], m8
+ jmp .vloop
+.skip_line:
+ mova m0, [rsp+0x10]
+ mova m1, [rsp+0x20]
+ mova m14, [rsp+0x30]
+ mova m15, [rsp+0x40]
+ MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15
+ mov myd, [rsp+0x140]
+ mov dyd, dym
+ mova m0, m2 ; 01a
+ mova m1, m3 ; 01b
+ mova m2, [rsp+0x50] ; 23a
+ mova m3, [rsp+0x60] ; 23b
+ mova m5, [rsp+0x70] ; 45a
+ mova m6, [rsp+0x80] ; 45b
+ punpcklwd m7, m4, m8 ; 67a
+ punpckhwd m4, m8 ; 67b
+ mova [rsp+0x50], m5
+ mova [rsp+0x60], m6
+ mova [rsp+0x70], m7
+ mova [rsp+0x80], m4
+%else
+ mov r0m, r0
+ mov myd, mym
+ mov r3, r3m
+ add myd, dym
+ test myd, ~0x3ff
+ mov mym, myd
+ jnz .next_line
+ mova m0, [esp+0x140]
+ mova m1, [esp+0x150]
+ mova m2, [esp+0x160]
+ mova m3, [esp+0x170]
+ jmp .vloop
+.next_line:
+ test myd, 0x400
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ jz .skip_line
+ mova m6, [base+unpckw]
+ mova m0, [esp+0x140]
+ mova m1, [esp+0x150]
+ mova m7, [esp+0x180]
+ movq m4, [srcq+r0]
+ movq m5, [srcq+rX]
+ movhps m4, [srcq+r4]
+ movhps m5, [srcq+r5]
+ pshufb m0, m6 ; 0a 1a
+ pshufb m1, m6 ; 0b 1b
+ pshufb m7, m6 ; 4a 5a
+ mov r0, [esp+16]
+ mov rX, [esp+24]
+ mov r4, [esp+20]
+ mov r5, [esp+28]
+ movq m3, [srcq+r0]
+ movq m2, [srcq+rX]
+ movhps m3, [srcq+r4]
+ movhps m2, [srcq+r5]
+ add srcq, ssq
+ pmaddubsw m4, [esp+0x20]
+ pmaddubsw m5, [esp+0x30]
+ pmaddubsw m3, [esp+0x40]
+ pmaddubsw m2, [esp+0x50]
+ phaddw m4, m5
+ phaddw m3, m2
+ mova m5, [esp+0x190]
+ mova m2, [esp+0x160]
+ phaddw m4, m3
+ mova m3, [esp+0x170]
+ pmulhrsw m4, m12 ; 8a 8b
+ mov myd, mym
+ pshufb m5, m6 ; 4b 5b
+ pshufd m6, m6, q1032
+ pshufb m2, m6 ; 3a 2a
+ pshufb m3, m6 ; 3b 2b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ mova [esp+0x140], m0
+ mova [esp+0x150], m1
+ mova m0, [esp+0x1a0]
+ mova m1, [esp+0x1b0]
+ punpcklwd m2, m7 ; 34a
+ punpcklwd m3, m5 ; 34b
+ mova [esp+0x160], m2
+ mova [esp+0x170], m3
+ pshufb m0, m6 ; 7a 6a
+ pshufb m1, m6 ; 7b 6b
+ punpckhwd m7, m0 ; 56a
+ punpckhwd m5, m1 ; 56b
+ punpcklwd m0, m4
+ punpckhqdq m4, m4
+ punpcklwd m1, m4
+ mova [esp+0x180], m7
+ mova [esp+0x190], m5
+ mova [esp+0x1a0], m0
+ mova [esp+0x1b0], m1
+ mova m0, [esp+0x140]
+ mova m1, [esp+0x150]
+ jmp .vloop
+.skip_line:
+ MC_8TAP_SCALED_H 0x20, 0x1c0, 0
+ mov myd, mym
+ mova m0, [esp+0x160]
+ mova m1, [esp+0x170]
+ mova m2, [esp+0x180]
+ mova m3, [esp+0x190]
+ mova [esp+0x140], m0
+ mova [esp+0x150], m1
+ mova m4, [esp+0x1a0]
+ mova m5, [esp+0x1b0]
+ mova [esp+0x160], m2
+ mova [esp+0x170], m3
+ mova m6, [esp+0x1c0]
+ mova m7, [esp+0x1d0]
+ mova [esp+0x180], m4
+ mova [esp+0x190], m5
+ punpcklwd m4, m6, m7
+ punpckhwd m6, m7
+ mova [esp+0x1a0], m4
+ mova [esp+0x1b0], m6
+%endif
+ jmp .vloop
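+; .dy1: dy == 1024, i.e. exactly one new source row per output row, so the
+; vertical filter phase is constant and its coefficients are loaded once
+; outside the row loop.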
+INIT_XMM ssse3
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy1_w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+ %else
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+ movzx r5, byte [esp+0x1f0]
+ dec srcd
+ movd m15, r5
+ %endif
+ punpckldq m9, m8
+ SWAP m8, m9
+ paddd m14, m8 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+ %else
+ %define m11 [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ psrldq m15, 4
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_dw]
+ mova m6, [base+subpel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m9, m9
+ pcmpeqd m8, m9
+ psrld m14, 10
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [esp+0x00], m14
+ %define m14 [esp+0x00]
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m8 m5
+ %define m15 m6
+ %endif
+ movq m0, [srcq+ssq*0]
+ movq m2, [srcq+ssq*2]
+ movhps m0, [srcq+ssq*1]
+ movhps m2, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ %if ARCH_X86_64
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ movq m10, r4
+ %else
+ mov myd, mym
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr myd, 6
+ lea r5, [r5+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ %define m10 m4
+ movd m10, r4
+ movd m3, r3
+ mov r3, r3m
+ punpckldq m10, m3
+ %endif
+ movq m1, [srcq+ssq*0]
+ movq m3, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*1]
+ add srcq, ss3q
+ punpcklbw m10, m10
+ psraw m10, 8
+ punpckldq m15, m7
+ punpcklqdq m15, m15
+ %if ARCH_X86_64
+ pand m11, m8
+ %else
+ pand m7, m11, m8
+ %define m11 m7
+ %endif
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ %if ARCH_X86_64
+ pshufd m8, m10, q0000
+ pshufd m9, m10, q1111
+ pshufd m11, m10, q3333
+ pshufd m10, m10, q2222
+ %else
+ mova [esp+0x10], m15
+ %define m15 [esp+0x10]
+ mov r0, r0m
+ pshufd m5, m4, q0000
+ pshufd m6, m4, q1111
+ pshufd m7, m4, q2222
+ pshufd m4, m4, q3333
+ %define m8 [esp+0x20]
+ %define m9 [esp+0x30]
+ %define m10 [esp+0x40]
+ %define m11 [esp+0x50]
+ mova m8, m5
+ mova m9, m6
+ mova m10, m7
+ mova m11, m4
+ %endif
+ pshufb m0, m14
+ pshufb m2, m14
+ pshufb m1, m14
+ pshufb m3, m14
+ pmaddubsw m0, m15
+ pmaddubsw m2, m15
+ pmaddubsw m1, m15
+ pmaddubsw m3, m15
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ palignr m2, m1, m0, 4
+ pshufd m4, m1, q2121
+ punpcklwd m3, m0, m2 ; 01 12
+ punpckhwd m0, m2 ; 23 34
+ punpcklwd m2, m1, m4 ; 45 56
+.dy1_w2_loop:
+ movq m1, [srcq+ssq*0]
+ movhps m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m3, m8
+ pmaddwd m6, m0, m9
+ pmaddwd m7, m2, m10
+ mova m3, m0
+ mova m0, m2
+ paddd m5, m13
+ paddd m6, m7
+ pshufb m1, m14
+ pmaddubsw m1, m15
+ phaddw m1, m1
+ pmulhrsw m1, m12
+ palignr m7, m1, m4, 12
+ punpcklwd m2, m7, m1 ; 67 78
+ pmaddwd m7, m2, m11
+ mova m4, m1
+ paddd m5, m6
+ paddd m5, m7
+ psrad m5, rndshift
+ packssdw m5, m5
+ packuswb m5, m5
+ movd r4d, m5
+ mov [dstq+dsq*0], r4w
+ shr r4d, 16
+ mov [dstq+dsq*1], r4w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+%endif
+INIT_XMM ssse3
+.dy1_w4:
+%if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m11 [base+pd_0x4000]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq r3
+ %endif
+ movzx r4, byte [esp+0x1f0]
+ dec srcq
+ movd m15, r4
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ psrldq m7, m15, 8
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r6d, m15
+ movd r13d, m7
+ movd m15, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+r11*8+2]
+ movd m3, [base+subpel_filters+ r6*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+%else
+ movd r1, m15
+ movd r3, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r4, m15
+ movd r5, m7
+ %define m15 m5
+ SWAP m4, m7
+ movd m15, [base+subpel_filters+r1*8+2]
+ movd m2, [base+subpel_filters+r3*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m4, [base+subpel_filters+r5*8+2]
+ mov myd, mym
+ mov rX, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea rX, [rX+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+rX*8+0]
+ cmovnz r5, [base+subpel_filters+rX*8+4]
+ mov r3, r3m
+ %if isprep
+ lea ss3q, [ssq*3]
+ %endif
+%endif
+ punpckldq m15, m3
+ punpckldq m2, m4
+ punpcklqdq m15, m2
+ movq m6, [base+subpel_s_shuf2]
+%if ARCH_X86_64
+ pcmpeqd m8, m9
+ psrld m14, 10
+ pshufb m14, [base+bdct_lb_dw]
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpcklqdq m6, m6
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m7, [srcq+ssq*2]
+ add srcq, ss3q
+ pand m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ paddb m14, m6
+ movq m10, r4q
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m4, m14
+ pshufb m5, m14
+ pshufb m7, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ pmaddubsw m7, m15
+ phaddw m0, m1
+ phaddw m2, m3
+ phaddw m4, m5
+ phaddw m6, m7, m7
+ pmulhrsw m0, m12 ; 0 1
+ pmulhrsw m2, m12 ; 2 3
+ pmulhrsw m4, m12 ; 4 5
+ pmulhrsw m6, m12 ; 6 _
+ shufps m1, m0, m2, q1032 ; 1 2
+ shufps m3, m2, m4, q1032 ; 3 4
+ shufps m5, m4, m6, q1032 ; 5 6
+ punpcklwd m7, m0, m1 ; 01
+ punpckhwd m0, m1 ; 12
+ punpcklwd m8, m2, m3 ; 23
+ punpckhwd m2, m3 ; 34
+ punpcklwd m9, m4, m5 ; 45
+ punpckhwd m4, m5 ; 56
+%else
+ pxor m3, m3
+ pcmpeqd m8, m3
+ psrld m14, 10
+ pshufb m14, [base+bdct_lb_dw]
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ add srcq, ss3q
+ punpcklqdq m6, m6
+ SWAP m4, m7
+ pand m7, m11, m8
+ pandn m8, m15
+ SWAP m5, m0
+ por m15, m7
+ paddb m14, m6
+ movu m0, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m0, m14
+ pshufb m7, m14
+ pshufb m6, m14
+ pmaddubsw m1, m15
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ mova [esp+0x00], m14
+ mova [esp+0x10], m15
+ pmaddubsw m0, m15
+ pmaddubsw m7, m15
+ pmaddubsw m6, m15
+ phaddw m1, m2
+ movu m2, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ mov r0, r0m
+ phaddw m3, m0
+ pshufb m2, m14
+ pmaddubsw m2, m15
+ %define m14 [esp+0x00]
+ %define m15 [esp+0x10]
+ phaddw m7, m6
+ phaddw m2, m2
+ movd m6, r4
+ movd m0, r5
+ punpckldq m6, m0
+ punpcklbw m6, m6
+ psraw m6, 8
+ mova [esp+0x20], m6
+ pmulhrsw m1, m12 ; 0 1
+ pmulhrsw m3, m12 ; 2 3
+ pmulhrsw m7, m12 ; 4 5
+ pmulhrsw m2, m12 ; 6 _
+ shufps m0, m1, m3, q1032 ; 1 2
+ shufps m4, m3, m7, q1032 ; 3 4
+ shufps m5, m7, m2, q1032 ; 5 6
+ punpcklwd m6, m1, m0 ; 01
+ punpckhwd m1, m0 ; 12
+ mova [esp+0x30], m1
+ punpcklwd m1, m3, m4 ; 23
+ punpckhwd m3, m4 ; 34
+ mova [esp+0x40], m3
+ punpcklwd m3, m7, m5 ; 45
+ punpckhwd m7, m5 ; 56
+ mova [esp+0x50], m7
+ mova [esp+0x60], m2
+ mova m0, [esp+0x20]
+ %xdefine m8 m1
+ %xdefine m9 m3
+ %xdefine m10 m0
+ SWAP m7, m6
+ SWAP m1, m4
+ SWAP m3, m2
+%endif
+ pshufd m1, m10, q0000
+ pshufd m3, m10, q1111
+ pshufd m5, m10, q2222
+ pshufd m10, m10, q3333
+%if ARCH_X86_64
+ mova [rsp+0x00], m8
+ mova [rsp+0x10], m2
+ mova [rsp+0x20], m9
+ mova [rsp+0x30], m4
+%else
+ mova [esp+0x70], m8
+ mova [esp+0x80], m9
+ mova [esp+0x90], m1
+ mova [esp+0xa0], m3
+ mova [esp+0xb0], m5
+ mova [esp+0xc0], m10
+ %ifidn %1, put
+ mov dsd, dsm
+ %endif
+ %define m11 m6
+%endif
+.dy1_w4_loop:
+%if ARCH_X86_64
+ movu m11, [srcq+ssq*0]
+ pmaddwd m7, m1
+ pmaddwd m8, m3
+ pmaddwd m0, m1
+ pmaddwd m2, m3
+ pmaddwd m9, m5
+ pmaddwd m4, m5
+ paddd m7, m8
+ paddd m0, m2
+ movu m8, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m11, m14
+ pmaddubsw m11, m15
+ paddd m7, m13
+ paddd m0, m13
+ paddd m7, m9
+ paddd m0, m4
+ pshufb m8, m14
+ pmaddubsw m8, m15
+ phaddw m11, m8
+ mova m8, [rsp+0x20]
+ pmulhrsw m11, m12
+ punpcklwd m9, m6, m11 ; 67
+ psrldq m6, m11, 8
+ punpcklwd m4, m11, m6 ; 78
+ pmaddwd m2, m9, m10
+ pmaddwd m11, m4, m10
+ paddd m7, m2
+ mova m2, [rsp+0x30]
+ paddd m0, m11
+%else
+ SWAP m7, m6
+ SWAP m1, m4
+ SWAP m3, m2
+ movu m5, [srcq+ssq*0]
+ mova m0, [esp+0x30]
+ mova m2, [esp+0x40]
+ mova m4, [esp+0x50]
+ pmaddwd m6, [esp+0x90]
+ pmaddwd m1, [esp+0xa0]
+ pmaddwd m0, [esp+0x90]
+ pmaddwd m2, [esp+0xa0]
+ pmaddwd m3, [esp+0xb0]
+ pmaddwd m4, [esp+0xb0]
+ paddd m6, m1
+ paddd m0, m2
+ movu m7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m5, m14
+ pmaddubsw m5, m15
+ paddd m6, m13
+ paddd m0, m13
+ paddd m6, m3
+ paddd m0, m4
+ pshufb m7, m14
+ pmaddubsw m7, m15
+ phaddw m5, m7
+ mova m7, [rsp+0x80]
+ pmulhrsw m5, m12
+ punpcklwd m3, [esp+0x60], m5 ; 67
+ psrldq m1, m5, 8
+ punpcklwd m4, m5, m1 ; 78
+ pmaddwd m2, m3, [esp+0xc0]
+ pmaddwd m5, m4, [esp+0xc0]
+ mova [esp+0x60], m1
+ paddd m6, m2
+ mova m2, [esp+0x50]
+ paddd m0, m5
+ SWAP m7, m6
+%endif
+ psrad m7, rndshift
+ psrad m0, rndshift
+ packssdw m7, m0
+%if ARCH_X86_64
+ mova m0, [rsp+0x10]
+%else
+ mova m0, [esp+0x40]
+%define m11 m5
+%endif
+%ifidn %1, put
+ packuswb m7, m7
+ psrldq m11, m7, 4
+ movd [dstq+dsq*0], m7
+ movd [dstq+dsq*1], m11
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], m7
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jz .ret
+%if ARCH_X86_64
+ mova m7, [rsp+0x00]
+ mova [rsp+0x00], m8
+ mova [rsp+0x10], m2
+ mova [rsp+0x20], m9
+ mova [rsp+0x30], m4
+%else
+ mova m7, [esp+0x70] ; 01
+ mova m1, [esp+0x80] ; 23
+ mova m2, [esp+0x50] ; 34
+ mova [esp+0x30], m0
+ mova [esp+0x70], m1
+ mova [esp+0x40], m2
+ mova [esp+0x80], m3
+ mova [esp+0x50], m4
+%endif
+ jmp .dy1_w4_loop
+INIT_XMM ssse3
+.dy1_w8:
+ mov dword [rsp+0x90], 1
+ movifprep tmp_stridem, 16
+ jmp .dy1_w_start
+.dy1_w16:
+ mov dword [rsp+0x90], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [rsp+0x90], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [rsp+0x90], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [rsp+0x90], 16
+ movifprep tmp_stridem, 256
+.dy1_w_start:
+ mov myd, mym
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+%if ARCH_X86_64
+ shr t0d, 16
+ sub srcq, 3
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movd m15, t0d
+%else
+ %define m8 m0
+ %define m9 m1
+ %xdefine m14 m4
+ %xdefine m15 m3
+ %if isprep
+ %define ssq ssm
+ %endif
+ mov r5, [esp+0x1f0]
+ mov r3, [esp+0x1f4]
+ shr r5, 16
+ sub srcq, 3
+ movd m15, r5
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r0, r0m
+ mov r3, r3m
+%endif
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+%if ARCH_X86_64
+ movq m3, r4q
+ punpcklbw m3, m3
+ psraw m3, 8
+%else
+ movd m5, r4
+ movd m6, r5
+ punpckldq m5, m6
+ punpcklbw m5, m5
+ psraw m5, 8
+ SWAP m3, m5
+%endif
+ mova [rsp+0x100], m7
+ mova [rsp+0x120], m15
+ mov [rsp+0x098], srcq
+ mov [rsp+0x130], r0q ; dstq / tmpq
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova [rsp+0x140], m0
+ mova [rsp+0x150], m1
+ mova [rsp+0x160], m2
+ mova [rsp+0x170], m3
+%if ARCH_X86_64 && UNIX64
+ mov hm, hd
+%elif ARCH_X86_32
+ SWAP m5, m3
+ mov r5, hm
+ mov [esp+0x134], r5
+%endif
+ jmp .dy1_hloop
+.dy1_hloop_prep:
+ dec dword [rsp+0x090]
+ jz .ret
+%if ARCH_X86_64
+ add qword [rsp+0x130], 8*(isprep+1)
+ mov hd, hm
+%else
+ add dword [rsp+0x130], 8*(isprep+1)
+ mov r5, [esp+0x134]
+ mov r0, [esp+0x130]
+%endif
+ mova m7, [rsp+0x100]
+ mova m14, [rsp+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+%else
+ %define m10 [base+pd_0x3ff]
+%endif
+ mova m15, [rsp+0x120]
+ mov srcq, [rsp+0x098]
+%if ARCH_X86_64
+ mov r0q, [rsp+0x130] ; dstq / tmpq
+%else
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.dy1_hloop:
+ pxor m9, m9
+%if ARCH_X86_64
+ mova m11, [base+pq_0x40000000]
+%else
+ %define m11 [base+pq_0x40000000]
+%endif
+ psrld m2, m14, 10
+ mova [rsp], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m9
+ psrldq m2, m5, 8
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+ pxor m2, m2
+ %define m9 m2
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ mova [rsp+0x110], m14
+ psrldq m4, m15, 8
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ psrldq m4, m14, 8
+ movd r10d, m14
+ movd r11d, m4
+ psrldq m14, 4
+ psrldq m4, 4
+ movd r13d, m14
+ movd rXd, m4
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m8, m11, m4
+ pand m9, m11, m6
+ pand m15, m11, m7
+ pand m11, m11, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m7, m2
+ pandn m5, m3
+ por m8, m4
+ por m9, m6
+ por m15, m7
+ por m11, m5
+ mova [rsp+0x10], m8
+ mova [rsp+0x20], m9
+ mova [rsp+0x30], m15
+ mova [rsp+0x40], m11
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
+ mova [rsp+0x50], m1
+ mova [rsp+0x60], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
+ mova [rsp+0x70], m3
+ mova [rsp+0x80], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
+ MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
+ SWAP m7, m0
+ SWAP m8, m14
+ mova m1, [rsp+0x50]
+ mova m2, [rsp+0x60]
+ mova m3, [rsp+0x70]
+ mova m15, [rsp+0x80]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ SWAP m14, m8
+ mova m8, [rsp+0x140]
+ mova m9, [rsp+0x150]
+ mova m10, [rsp+0x160]
+ mova m11, [rsp+0x170]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m15; 23a
+ punpckhwd m3, m15 ; 23b
+ mova [rsp+0x50], m4
+ mova [rsp+0x60], m5
+ mova [rsp+0x70], m6
+ mova [rsp+0x80], m7
+ mova m14, [base+unpckw]
+%else
+ movd r0, m15
+ movd rX, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r4, m15
+ movd r5, m4
+ mova m14, [esp+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [esp+16], m14
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m11, m4
+ pand m1, m11, m6
+ pand m2, m11, m7
+ pand m3, m11, m5
+ pandn m4, [esp+0x20]
+ pandn m6, [esp+0x30]
+ pandn m7, [esp+0x40]
+ pandn m5, [esp+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1
+ MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3
+ MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5
+ MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7
+ mova m5, [esp+0x1a0]
+ mova m6, [esp+0x1b0]
+ mova m7, [esp+0x1c0]
+ mova m0, [esp+0x1d0]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [esp+0x1a0], m4
+ mova [esp+0x1b0], m5
+ mova [esp+0x1c0], m6
+ mova [esp+0x1d0], m7
+ mova m1, [esp+0x060]
+ mova m2, [esp+0x070]
+ mova m3, [esp+0x180]
+ mova m4, [esp+0x190]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova [esp+0x060], m0
+ mova [esp+0x070], m1
+ mova [esp+0x180], m2
+ mova [esp+0x190], m3
+ %define m8 [esp+0x140]
+ %define m9 [esp+0x150]
+ %define m10 [esp+0x160]
+ %define m11 [esp+0x170]
+%endif
+.dy1_vloop:
+%if ARCH_X86_32
+ mov r0, r0m
+%endif
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ pmaddwd m7, m3, m9
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+%if ARCH_X86_64
+ pmaddwd m6, [rsp+0x50], m10
+ pmaddwd m7, [rsp+0x60], m10
+%else
+ pmaddwd m6, [rsp+0x1a0], m10
+ pmaddwd m7, [rsp+0x1b0], m10
+%endif
+ paddd m4, m6
+ paddd m5, m7
+%if ARCH_X86_64
+ pmaddwd m6, [rsp+0x70], m11
+ pmaddwd m7, [rsp+0x80], m11
+%else
+ pmaddwd m6, [rsp+0x1c0], m11
+ pmaddwd m7, [rsp+0x1d0], m11
+%endif
+ paddd m4, m6
+ paddd m5, m7
+ psrad m4, rndshift
+ psrad m5, rndshift
+ packssdw m4, m5
+%ifidn %1, put
+ packuswb m4, m4
+ movq [dstq], m4
+ add dstq, dsm
+%else
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+%if ARCH_X86_32
+ mov r0m, r0
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+%if ARCH_X86_64
+ movq m4, [srcq+ r4]
+ movq m5, [srcq+ r6]
+ movhps m4, [srcq+ r7]
+ movhps m5, [srcq+ r9]
+ movq m6, [srcq+r10]
+ movq m7, [srcq+r11]
+ movhps m6, [srcq+r13]
+ movhps m7, [srcq+ rX]
+ add srcq, ssq
+ pshufd m15, m14, q1032
+ pshufb m0, m14 ; 0a 1a
+ pshufb m1, m14 ; 0b 1b
+ pshufb m2, m15 ; 3a 2a
+ pshufb m3, m15 ; 3b 2b
+ pmaddubsw m4, [rsp+0x10]
+ pmaddubsw m5, [rsp+0x20]
+ pmaddubsw m6, [rsp+0x30]
+ pmaddubsw m7, [rsp+0x40]
+ phaddw m4, m5
+ phaddw m6, m7
+ phaddw m4, m6
+ pmulhrsw m4, m12
+ pshufb m5, [rsp+0x70], m15 ; 7a 6a
+ pshufb m7, [rsp+0x80], m15 ; 7b 6b
+ pshufb m6, [rsp+0x50], m14 ; 4a 5a
+ pshufb m15, [rsp+0x60], m14 ; 4b 5b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m6 ; 34a
+ punpcklwd m3, m15 ; 34b
+ punpckhwd m6, m5 ; 56a
+ punpckhwd m15, m7 ; 56b
+ punpcklwd m5, m4 ; 78a
+ psrldq m4, 8
+ punpcklwd m7, m4 ; 78b
+ mova [rsp+0x50], m6
+ mova [rsp+0x60], m15
+ mova [rsp+0x70], m5
+ mova [rsp+0x80], m7
+%else
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ mova m6, [base+unpckw]
+ mova m0, [esp+0x060]
+ mova m1, [esp+0x070]
+ mova m7, [esp+0x1a0]
+ movq m4, [srcq+r0]
+ movq m5, [srcq+rX]
+ movhps m4, [srcq+r4]
+ movhps m5, [srcq+r5]
+ pshufb m0, m6 ; 0a 1a
+ pshufb m1, m6 ; 0b 1b
+ pshufb m7, m6 ; 4a 5a
+ mov r0, [esp+16]
+ mov rX, [esp+24]
+ mov r4, [esp+20]
+ mov r5, [esp+28]
+ movq m3, [srcq+r0]
+ movq m2, [srcq+rX]
+ movhps m3, [srcq+r4]
+ movhps m2, [srcq+r5]
+ add srcq, ssq
+ pmaddubsw m4, [esp+0x20]
+ pmaddubsw m5, [esp+0x30]
+ pmaddubsw m3, [esp+0x40]
+ pmaddubsw m2, [esp+0x50]
+ phaddw m4, m5
+ phaddw m3, m2
+ mova m5, [esp+0x1b0]
+ mova m2, [esp+0x180]
+ phaddw m4, m3
+ mova m3, [esp+0x190]
+ pmulhrsw m4, m12 ; 8a 8b
+ pshufb m5, m6 ; 4b 5b
+ pshufd m6, m6, q1032
+ pshufb m2, m6 ; 3a 2a
+ pshufb m3, m6 ; 3b 2b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ mova [esp+0x60], m0
+ mova [esp+0x70], m1
+ mova m0, [esp+0x1c0]
+ mova m1, [esp+0x1d0]
+ punpcklwd m2, m7 ; 34a
+ punpcklwd m3, m5 ; 34b
+ mova [esp+0x180], m2
+ mova [esp+0x190], m3
+ pshufb m0, m6 ; 7a 6a
+ pshufb m1, m6 ; 7b 6b
+ punpckhwd m7, m0 ; 56a
+ punpckhwd m5, m1 ; 56b
+ punpcklwd m0, m4
+ punpckhqdq m4, m4
+ punpcklwd m1, m4
+ mova [esp+0x1a0], m7
+ mova [esp+0x1b0], m5
+ mova [esp+0x1c0], m0
+ mova [esp+0x1d0], m1
+ mova m0, [esp+0x60]
+ mova m1, [esp+0x70]
+%endif
+ jmp .dy1_vloop
+INIT_XMM ssse3
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy2_w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+ %else
+ %define m10 [base+pd_0x3ff]
+ %define m11 [base+pd_0x4000]
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+ movzx r5, byte [esp+0x1f0]
+ dec srcd
+ movd m15, r5
+ %endif
+ punpckldq m9, m8
+ SWAP m8, m9
+ paddd m14, m8 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ psrldq m15, 4
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_dw]
+ mova m6, [base+subpel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m9, m9
+ pcmpeqd m8, m9
+ psrld m14, 10
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [esp+0x00], m14
+ %define m14 [esp+0x00]
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m8 m5
+ %define m15 m6
+ %endif
+ movq m0, [srcq+ssq*0]
+ movq m1, [srcq+ssq*1]
+ movhps m0, [srcq+ssq*2]
+ movhps m1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ %if ARCH_X86_64
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ movq m10, r4q
+ %else
+ mov myd, mym
+ mov r3, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r3, r3m
+ %define m10 m4
+ movd m10, r4
+ movd m3, r5
+ punpckldq m10, m3
+ %endif
+ movq m3, [srcq+ssq*0]
+ movhps m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m10, m10
+ psraw m10, 8
+ punpckldq m15, m7
+ punpcklqdq m15, m15
+ %if ARCH_X86_64
+ pand m11, m8
+ %else
+ pand m7, m11, m8
+ %define m11 m7
+ %endif
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ %if ARCH_X86_64
+ pshufd m8, m10, q0000
+ pshufd m9, m10, q1111
+ pshufd m11, m10, q3333
+ pshufd m10, m10, q2222
+ %else
+ mova [esp+0x10], m15
+ %define m15 [esp+0x10]
+ mov r5, r0m
+ %define dstq r5
+ mov dsd, dsm
+ pshufd m5, m4, q0000
+ pshufd m6, m4, q1111
+ pshufd m7, m4, q2222
+ pshufd m4, m4, q3333
+ %define m8 [esp+0x20]
+ %define m9 [esp+0x30]
+ %define m10 [esp+0x40]
+ %define m11 [esp+0x50]
+ mova m8, m5
+ mova m9, m6
+ mova m10, m7
+ mova m11, m4
+ %endif
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m3, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ pmaddubsw m3, m15
+ pslldq m2, m3, 8
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12 ; 0 2 _ 4
+ pmulhrsw m1, m12 ; 1 3 _ 5
+ pshufd m2, m0, q3110 ; 0 2 2 4
+ pshufd m1, m1, q3110 ; 1 3 3 5
+ punpcklwd m3, m2, m1 ; 01 23
+ punpckhwd m2, m1 ; 23 45
+.dy2_w2_loop:
+ movq m6, [srcq+ssq*0]
+ movq m7, [srcq+ssq*1]
+ movhps m6, [srcq+ssq*2]
+ movhps m7, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m4, m3, m8
+ pmaddwd m5, m2, m9
+ pshufb m6, m14
+ pshufb m7, m14
+ pmaddubsw m6, m15
+ pmaddubsw m7, m15
+ phaddw m6, m7
+ pmulhrsw m6, m12
+ psrldq m7, m6, 8
+ palignr m6, m0, 8
+ palignr m7, m1, 8
+ mova m0, m6
+ mova m1, m7
+ pshufd m6, m6, q3221
+ pshufd m7, m7, q3221
+ punpcklwd m3, m6, m7 ; 45 67
+ punpckhwd m2, m6, m7 ; 67 89
+ pmaddwd m6, m3, m10
+ pmaddwd m7, m2, m11
+ paddd m4, m5
+ paddd m4, m13
+ paddd m6, m7
+ paddd m4, m6
+ psrad m4, rndshift
+ packssdw m4, m4
+ packuswb m4, m4
+ movd r4d, m4
+ mov [dstq+dsq*0], r4w
+ shr r4d, 16
+ mov [dstq+dsq*1], r4w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+%endif
+INIT_XMM ssse3
+.dy2_w4:
+%if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m11 [base+pd_0x4000]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %define dstq r0
+ %if isprep
+ %define ssq r3
+ %endif
+ movzx r4, byte [esp+0x1f0]
+ dec srcq
+ movd m15, r4
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ psrldq m7, m15, 8
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r6d, m15
+ movd r13d, m7
+ movd m15, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+r11*8+2]
+ movd m3, [base+subpel_filters+ r6*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+ movq m6, [base+subpel_s_shuf2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+%else
+ movd r1, m15
+ movd r3, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r4, m15
+ movd r5, m7
+ %define m15 m5
+ SWAP m4, m7
+ movd m15, [base+subpel_filters+r1*8+2]
+ movd m2, [base+subpel_filters+r3*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m4, [base+subpel_filters+r5*8+2]
+ movq m6, [base+subpel_s_shuf2]
+ mov myd, mym
+ mov r3, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r3, r3m
+ %if isprep
+ lea ss3q, [ssq*3]
+ %endif
+%endif
+ punpckldq m15, m3
+ punpckldq m2, m4
+ punpcklqdq m15, m2
+%if ARCH_X86_64
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movu m0, [srcq+ssq*0]
+ movu m2, [srcq+ssq*2]
+ movu m1, [srcq+ssq*1]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpcklqdq m6, m6
+ pshufb m14, [base+bdct_lb_dw]
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pand m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ paddb m14, m6
+ movq m11, r4q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pshufb m0, m14
+ pshufb m2, m14
+ pshufb m1, m14
+ pshufb m3, m14
+ pshufb m4, m14
+ pshufb m5, m14
+ pmaddubsw m0, m15
+ pmaddubsw m2, m15
+ pmaddubsw m1, m15
+ pmaddubsw m3, m15
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ phaddw m0, m2
+ phaddw m1, m3
+ phaddw m4, m5
+ pmulhrsw m0, m12 ; 0 2
+ pmulhrsw m1, m12 ; 1 3
+ pmulhrsw m4, m12 ; 4 5
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+%else
+ pxor m3, m3
+ pcmpeqd m8, m3
+ psrld m14, 10
+ pshufb m14, [base+bdct_lb_dw]
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+ssq*1]
+ add srcq, ss3q
+ punpcklqdq m6, m6
+ SWAP m4, m7
+ pand m7, m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m7
+ paddb m14, m6
+ movu m0, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ add srcq, ss3q
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m0, m14
+ pshufb m7, m14
+ pshufb m6, m14
+ pmaddubsw m1, m15
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ mova [esp+0x00], m14
+ mova [esp+0x10], m15
+ pmaddubsw m0, m15
+ pmaddubsw m7, m15
+ pmaddubsw m6, m15
+ %define m14 [esp+0x00]
+ %define m15 [esp+0x10]
+ phaddw m1, m2
+ phaddw m3, m0
+ phaddw m7, m6
+ %ifidn %1, put
+ mov dsd, dsm
+ %define dstq r5
+ %else
+ %define tmpq r5
+ %endif
+ movd m6, r4
+ movd m0, r5
+ punpckldq m6, m0
+ punpcklbw m6, m6
+ psraw m6, 8
+ mov r5, r0m
+ pmulhrsw m1, m12 ; 0 2
+ pmulhrsw m3, m12 ; 1 3
+ pmulhrsw m7, m12 ; 4 5
+ SWAP m0, m1, m3
+ SWAP m4, m7
+ pshufd m2, m6, q0000
+ pshufd m3, m6, q1111
+ pshufd m7, m6, q2222
+ pshufd m6, m6, q3333
+ mova [esp+0x30], m2
+ mova [esp+0x40], m3
+ mova [esp+0x50], m7
+ mova [esp+0x60], m6
+ %define m8 [esp+0x30]
+ %define m9 [esp+0x40]
+ %define m10 [esp+0x50]
+ %define m11 [esp+0x60]
+%endif
+ psrldq m5, m4, 8 ; 5 _
+ punpckhwd m2, m0, m1 ; 23
+ punpcklwd m0, m1 ; 01
+ punpcklwd m4, m5 ; 45
+.dy2_w4_loop:
+ pmaddwd m0, m8 ; a0
+ pmaddwd m5, m2, m8 ; b0
+ pmaddwd m2, m9 ; a1
+ pmaddwd m7, m4, m9 ; b1
+ pmaddwd m3, m4, m10 ; a2
+ paddd m0, m13
+ paddd m5, m13
+ paddd m0, m2
+ paddd m5, m7
+ paddd m0, m3
+ movu m6, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ movu m1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m6, m14
+ pshufb m7, m14
+ pshufb m3, m14
+ pshufb m1, m14
+ pmaddubsw m6, m15
+ pmaddubsw m7, m15
+ pmaddubsw m3, m15
+ pmaddubsw m1, m15
+ phaddw m6, m7
+ phaddw m3, m1
+ pmulhrsw m6, m12 ; 6 7
+ pmulhrsw m3, m12 ; 8 9
+ psrldq m7, m6, 8
+ psrldq m1, m3, 8
+ punpcklwd m6, m7 ; 67
+ punpcklwd m3, m1 ; 89
+ mova m2, m6
+ pmaddwd m1, m6, m10 ; b2
+ pmaddwd m6, m11 ; a3
+ pmaddwd m7, m3, m11 ; b3
+ paddd m5, m1
+ paddd m0, m6
+ paddd m5, m7
+ psrad m0, rndshift
+ psrad m5, rndshift
+ packssdw m0, m5
+%ifidn %1, put
+ packuswb m0, m0
+ psrldq m1, m0, 4
+ movd [dstq+dsq*0], m0
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], m0
+ add tmpq, 16
+%endif
+ mova m0, m4
+ mova m4, m3
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET
+INIT_XMM ssse3
+.dy2_w8:
+ mov dword [rsp+0x90], 1
+ movifprep tmp_stridem, 16
+ jmp .dy2_w_start
+.dy2_w16:
+ mov dword [rsp+0x90], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [rsp+0x90], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [rsp+0x90], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [rsp+0x90], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+ mov myd, mym
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+%if ARCH_X86_64
+ shr t0d, 16
+ sub srcq, 3
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m11 [base+pd_0x4000]
+ %define m8 m0
+ %define m9 m1
+ %xdefine m14 m4
+ %xdefine m15 m3
+ %if isprep
+ %define tmpq r0
+ %define ssq ssm
+ %else
+ %define dstq r0
+ %endif
+ mov r5, [esp+0x1f0]
+ mov r3, [esp+0x1f4]
+ shr r5, 16
+ sub srcq, 3
+ movd m15, r5
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r0, r0m
+ mov r3, r3m
+%endif
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+%if ARCH_X86_64
+ movq m3, r4q
+ punpcklbw m3, m3
+ psraw m3, 8
+%else
+ movd m5, r4
+ movd m6, r5
+ punpckldq m5, m6
+ punpcklbw m5, m5
+ psraw m5, 8
+ SWAP m3, m5
+%endif
+ mova [rsp+0x100], m7
+ mova [rsp+0x120], m15
+ mov [rsp+0x098], srcq
+ mov [rsp+0x130], r0q ; dstq / tmpq
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova [rsp+0x140], m0
+ mova [rsp+0x150], m1
+ mova [rsp+0x160], m2
+ mova [rsp+0x170], m3
+%if ARCH_X86_64 && UNIX64
+ mov hm, hd
+%elif ARCH_X86_32
+ SWAP m5, m3
+ mov r5, hm
+ mov [esp+0x134], r5
+%endif
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [rsp+0x090]
+ jz .ret
+%if ARCH_X86_64
+ add qword [rsp+0x130], 8*(isprep+1)
+ mov hd, hm
+%else
+ add dword [rsp+0x130], 8*(isprep+1)
+ mov r5, [esp+0x134]
+ mov r0, [esp+0x130]
+%endif
+ mova m7, [rsp+0x100]
+ mova m14, [rsp+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+%else
+ %define m10 [base+pd_0x3ff]
+%endif
+ mova m15, [rsp+0x120]
+ mov srcq, [rsp+0x098]
+%if ARCH_X86_64
+ mov r0q, [rsp+0x130] ; dstq / tmpq
+%else
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.dy2_hloop:
+ pxor m9, m9
+%if ARCH_X86_64
+ mova m11, [base+pq_0x40000000]
+%else
+ %define m11 [base+pq_0x40000000]
+%endif
+ psrld m2, m14, 10
+ mova [rsp], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m9
+ psrldq m2, m5, 8
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+ pxor m2, m2
+ %define m9 m2
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ mova [rsp+0x110], m14
+ psrldq m4, m15, 8
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ psrldq m4, m14, 8
+ movd r10d, m14
+ movd r11d, m4
+ psrldq m14, 4
+ psrldq m4, 4
+ movd r13d, m14
+ movd rXd, m4
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m8, m11, m4
+ pand m9, m11, m6
+ pand m15, m11, m7
+ pand m11, m11, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m7, m2
+ pandn m5, m3
+ por m8, m4
+ por m9, m6
+ por m15, m7
+ por m11, m5
+ mova [rsp+0x10], m8
+ mova [rsp+0x20], m9
+ mova [rsp+0x30], m15
+ mova [rsp+0x40], m11
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
+ mova [rsp+0x50], m1
+ mova [rsp+0x60], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
+ mova [rsp+0x70], m3
+ mova [rsp+0x80], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
+ MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
+ SWAP m7, m0
+ SWAP m8, m14
+ mova m1, [rsp+0x50]
+ mova m2, [rsp+0x60]
+ mova m3, [rsp+0x70]
+ mova m15, [rsp+0x80]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ SWAP m14, m8
+ mova m8, [rsp+0x140]
+ mova m9, [rsp+0x150]
+ mova m10, [rsp+0x160]
+ mova m11, [rsp+0x170]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m15; 23a
+ punpckhwd m3, m15 ; 23b
+ mova [rsp+0x50], m4
+ mova [rsp+0x60], m5
+ mova [rsp+0x70], m6
+ mova [rsp+0x80], m7
+%else
+ movd r0, m15
+ movd rX, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r4, m15
+ movd r5, m4
+ mova m14, [esp+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [esp+16], m14
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m11, m4
+ pand m1, m11, m6
+ pand m2, m11, m7
+ pand m3, m11, m5
+ pandn m4, [esp+0x20]
+ pandn m6, [esp+0x30]
+ pandn m7, [esp+0x40]
+ pandn m5, [esp+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1
+ MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3
+ MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5
+ MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7
+ mova m5, [esp+0x1a0]
+ mova m6, [esp+0x1b0]
+ mova m7, [esp+0x1c0]
+ mova m0, [esp+0x1d0]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [esp+0x1a0], m4
+ mova [esp+0x1b0], m5
+ mova [esp+0x1c0], m6
+ mova [esp+0x1d0], m7
+ mova m1, [esp+0x060]
+ mova m2, [esp+0x070]
+ mova m3, [esp+0x180]
+ mova m4, [esp+0x190]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova [esp+0x180], m2
+ mova [esp+0x190], m3
+ %define m8 [esp+0x140]
+ %define m9 [esp+0x150]
+ %define m10 [esp+0x160]
+ %define m11 [esp+0x170]
+%endif
+.dy2_vloop:
+%if ARCH_X86_32
+ mov r0, r0m
+%endif
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ pmaddwd m7, m3, m9
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+%if ARCH_X86_64
+ pmaddwd m6, [rsp+0x50], m10
+ pmaddwd m7, [rsp+0x60], m10
+%else
+ pmaddwd m6, [esp+0x1a0], m10
+ pmaddwd m7, [esp+0x1b0], m10
+%endif
+ paddd m4, m6
+ paddd m5, m7
+%if ARCH_X86_64
+ pmaddwd m6, [rsp+0x70], m11
+ pmaddwd m7, [rsp+0x80], m11
+%else
+ pmaddwd m6, [esp+0x1c0], m11
+ pmaddwd m7, [esp+0x1d0], m11
+%endif
+ paddd m4, m6
+ paddd m5, m7
+ psrad m4, rndshift
+ psrad m5, rndshift
+ packssdw m4, m5
+%ifidn %1, put
+ packuswb m4, m4
+ movq [dstq], m4
+ add dstq, dsm
+%else
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+%if ARCH_X86_32
+ mov r0m, r0
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+%if ARCH_X86_64
+ mova m8, [rsp+0x10]
+ mova m9, [rsp+0x20]
+ mova m10, [rsp+0x30]
+ mova m11, [rsp+0x40]
+ mova m0, m2 ; 01a
+ mova m1, m3 ; 01b
+ MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11
+ mova m3, [rsp+0x50] ; 23a
+ mova m4, [rsp+0x60] ; 23b
+ mova m5, [rsp+0x70] ; 45a
+ mova m7, [rsp+0x80] ; 45b
+ mova m8, [rsp+0x140]
+ mova m9, [rsp+0x150]
+ mova m10, [rsp+0x160]
+ mova m11, [rsp+0x170]
+ punpcklwd m14, m2, m6 ; 67a
+ punpckhwd m2, m6 ; 67b
+ mova [rsp+0x50], m5
+ mova [rsp+0x60], m7
+ mova [rsp+0x70], m14
+ mova [rsp+0x80], m2
+ mova m2, m3
+ mova m3, m4
+%else
+ MC_8TAP_SCALED_H 0x20, 0
+ punpcklwd m6, m0, m4
+ punpckhwd m7, m0, m4
+ mova m0, [esp+0x180] ; 01a
+ mova m1, [esp+0x190] ; 01b
+ mova m2, [rsp+0x1a0] ; 23a
+ mova m3, [esp+0x1b0] ; 23b
+ mova m4, [esp+0x1c0] ; 45a
+ mova m5, [esp+0x1d0] ; 45b
+ mova [esp+0x180], m2
+ mova [esp+0x190], m3
+ mova [esp+0x1a0], m4
+ mova [esp+0x1b0], m5
+ mova [esp+0x1c0], m6 ; 67a
+ mova [esp+0x1d0], m7 ; 67b
+%endif
+ jmp .dy2_vloop
+.ret:
+ MC_8TAP_SCALED_RET 0
+%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
+ %define r0m [rstk+stack_offset+ 4]
+ %define r1m [rstk+stack_offset+ 8]
+ %define r2m [rstk+stack_offset+12]
+ %define r3m [rstk+stack_offset+16]
+%endif
+%undef isprep
+%endmacro
+
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled_8bpc
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, (5*15 << 16) | 5*15
+ jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%elif ARCH_X86_64
+DECLARE_REG_TMP 6, 8
+%else
+DECLARE_REG_TMP 1, 2
+%endif
+BILIN_SCALED_FN put
+FN put_8tap_scaled, sharp, SHARP, SHARP
+FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN put_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN put_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN put_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN put_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%elif ARCH_X86_64
+DECLARE_REG_TMP 6, 7
+%else
+DECLARE_REG_TMP 1, 2
+%endif
+BILIN_SCALED_FN prep
+FN prep_8tap_scaled, sharp, SHARP, SHARP
+FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN prep_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+
+%if ARCH_X86_32
+ %macro SAVE_ALPHA_BETA 0
+ mov alpham, alphad
+ mov betam, betad
+ %endmacro
+
+ %macro SAVE_DELTA_GAMMA 0
+ mov deltam, deltad
+ mov gammam, gammad
+ %endmacro
+
+ %macro LOAD_ALPHA_BETA_MX 0
+ mov mym, myd
+ mov alphad, alpham
+ mov betad, betam
+ mov mxd, mxm
+ %endmacro
+
+ %macro LOAD_DELTA_GAMMA_MY 0
+ mov mxm, mxd
+ mov deltad, deltam
+ mov gammad, gammam
+ mov myd, mym
+ %endmacro
+
+ %define PIC_reg r2
+ %define PIC_base_offset $$
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+%else
+ %define SAVE_ALPHA_BETA
+ %define SAVE_DELTA_GAMMA
+ %define PIC_sym(sym) sym
+%endif
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < required_stack_alignment
+ %assign copy_args 8*4
+ %else
+ %assign copy_args 0
+ %endif
+%endif
+
+%macro RELOC_ARGS 0
+ %if copy_args
+ mov r0, r0m
+ mov r1, r1m
+ mov r2, r2m
+ mov r3, r3m
+ mov r5, r5m
+ mov dstm, r0
+ mov dsm, r1
+ mov srcm, r2
+ mov ssm, r3
+ mov mxm, r5
+ mov r0, r6m
+ mov mym, r0
+ %endif
+%endmacro
+
+%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
+ %if cpuflag(sse4)
+ pblendw %1, %2, 0xAA
+ %else
+ pand %2, m10
+ por %1, %2
+ %endif
+%endmacro
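+; note: m10 is the blendmask (0xffff0000 in each dword, set up in the warp .main routine below);
+; the non-SSE4 path keeps only the high words of %2 and ORs them in, which relies on the caller
+; passing a %1 whose high words are already zero (they come from a psrld by 16)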
+
+%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
+ %if ARCH_X86_32
+ %define m8 m4
+ %define m9 m5
+ %define m14 m6
+ %define m15 m7
+ %define m11 m7
+ %endif
+ %if notcpuflag(ssse3) || ARCH_X86_32
+ pxor m11, m11
+ %endif
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq m2, [filterq+myq *8] ; a
+ movq m8, [filterq+tmp1q*8] ; e
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+deltaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq m3, [filterq+tmp2q*8] ; b
+ movq m0, [filterq+tmp1q*8] ; f
+ punpcklwd m2, m3
+ punpcklwd m8, m0
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq m0, [filterq+myq *8] ; c
+ movq m9, [filterq+tmp1q*8] ; g
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+gammaq] ; my += gamma
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq m3, [filterq+tmp2q*8] ; d
+ movq m1, [filterq+tmp1q*8] ; h
+ punpcklwd m0, m3
+ punpcklwd m9, m1
+ punpckldq m1, m2, m0
+ punpckhdq m2, m0
+ punpcklbw m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
+ punpckhbw m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
+ punpcklbw m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
+ punpckhbw m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
+ pmaddwd m0, %3
+ pmaddwd m3, %5
+ pmaddwd m1, %7
+ pmaddwd m14, %9
+ paddd m0, m3
+ paddd m1, m14
+ paddd m0, m1
+ mova %1, m0
+ %if ARCH_X86_64
+ SWAP m3, m14
+ %endif
+ punpckldq m0, m8, m9
+ punpckhdq m8, m9
+ punpcklbw m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
+ punpckhbw m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
+ punpcklbw m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
+ punpckhbw m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
+ pmaddwd m1, %4
+ pmaddwd m14, %6
+ pmaddwd m2, %8
+ pmaddwd m15, %10
+ paddd m1, m14
+ paddd m2, m15
+ paddd m1, m2
+ mova %2, m1
+ %if ARCH_X86_64
+ SWAP m14, m3
+ %endif
+%endmacro
+
+%if ARCH_X86_64
+ %define counterd r4d
+%else
+ %if copy_args == 0
+ %define counterd dword r4m
+ %else
+ %define counterd dword [esp+stack_size-4*7]
+ %endif
+%endif
+
+%macro WARP_AFFINE_8X8T 0
+%if ARCH_X86_64
+cglobal warp_affine_8x8t_8bpc, 6, 14, 16, 0x90, tmp, ts
+%else
+cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts
+ %if copy_args
+ %define tmpm [esp+stack_size-4*1]
+ %define tsm [esp+stack_size-4*2]
+ %endif
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main
+.loop:
+%if ARCH_X86_32
+ %define m12 m4
+ %define m13 m5
+ %define m14 m6
+ %define m15 m7
+ mova m12, [esp+0xC0]
+ mova m13, [esp+0xD0]
+ mova m14, [esp+0xE0]
+ mova m15, [esp+0xF0]
+%endif
+%if cpuflag(ssse3)
+ psrad m12, 13
+ psrad m13, 13
+ psrad m14, 13
+ psrad m15, 13
+ packssdw m12, m13
+ packssdw m14, m15
+ mova m13, [PIC_sym(pw_8192)]
+ pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7
+ pmulhrsw m14, m13
+%else
+ %if ARCH_X86_32
+ %define m10 m0
+ %endif
+ mova m10, [PIC_sym(pd_16384)]
+ paddd m12, m10
+ paddd m13, m10
+ paddd m14, m10
+ paddd m15, m10
+ psrad m12, 15
+ psrad m13, 15
+ psrad m14, 15
+ psrad m15, 15
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ mova [tmpq+tsq*0], m12
+ mova [tmpq+tsq*2], m14
+ dec counterd
+ jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).end
+%if ARCH_X86_32
+ mov tmpm, tmpd
+ mov r0, [esp+0x100]
+ mov r1, [esp+0x104]
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main2
+ lea tmpq, [tmpq+tsq*4]
+ jmp .loop
+%endmacro
+
+%macro WARP_AFFINE_8X8 0
+%if ARCH_X86_64
+cglobal warp_affine_8x8_8bpc, 6, 14, 16, 0x90, \
+ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
+ filter, tmp1, delta, my, gamma
+%else
+cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \
+ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
+ filter, tmp1, delta, my, gamma
+ %define alphaq r0
+ %define alphad r0
+ %define alpham [esp+gprsize+0x100]
+ %define betaq r1
+ %define betad r1
+ %define betam [esp+gprsize+0x104]
+ %define deltaq r0
+ %define deltad r0
+ %define deltam [esp+gprsize+0x108]
+ %define gammaq r1
+ %define gammad r1
+ %define gammam [esp+gprsize+0x10C]
+ %define filterq r3
+ %define tmp1q r4
+ %define tmp1d r4
+ %define tmp1m [esp+gprsize+0x110]
+ %define myq r5
+ %define myd r5
+ %define mym r6m
+ %if copy_args
+ %define dstm [esp+stack_size-4*1]
+ %define dsm [esp+stack_size-4*2]
+ %define srcm [esp+stack_size-4*3]
+ %define ssm [esp+stack_size-4*4]
+ %define mxm [esp+stack_size-4*5]
+ %define mym [esp+stack_size-4*6]
+ %endif
+%endif
+ call .main
+ jmp .start
+.loop:
+%if ARCH_X86_32
+ mov dstm, dstd
+ mov alphad, [esp+0x100]
+ mov betad, [esp+0x104]
+%endif
+ call .main2
+ lea dstq, [dstq+dsq*2]
+.start:
+%if notcpuflag(sse4)
+ %if cpuflag(ssse3)
+ %define roundval pw_8192
+ %else
+ %define roundval pd_262144
+ %endif
+ %if ARCH_X86_64
+ mova m10, [PIC_sym(roundval)]
+ %else
+ %define m10 [PIC_sym(roundval)]
+ %endif
+%endif
+%if ARCH_X86_32
+ %define m12 m5
+ %define m13 m6
+ mova m12, [esp+0xC0]
+ mova m13, [esp+0xD0]
+%endif
+%if cpuflag(sse4)
+ %if ARCH_X86_32
+ %define m11 m4
+ pxor m11, m11
+ %endif
+ psrad m12, 18
+ psrad m13, 18
+ packusdw m12, m13
+ pavgw m12, m11 ; (x + (1 << 10)) >> 11
+%else
+ %if cpuflag(ssse3)
+ psrad m12, 17
+ psrad m13, 17
+ packssdw m12, m13
+ pmulhrsw m12, m10
+ %else
+ paddd m12, m10
+ paddd m13, m10
+ psrad m12, 19
+ psrad m13, 19
+ packssdw m12, m13
+ %endif
+%endif
+%if ARCH_X86_32
+ %define m14 m6
+ %define m15 m7
+ mova m14, [esp+0xE0]
+ mova m15, [esp+0xF0]
+%endif
+%if cpuflag(sse4)
+ psrad m14, 18
+ psrad m15, 18
+ packusdw m14, m15
+ pavgw m14, m11 ; (x + (1 << 10)) >> 11
+%else
+ %if cpuflag(ssse3)
+ psrad m14, 17
+ psrad m15, 17
+ packssdw m14, m15
+ pmulhrsw m14, m10
+ %else
+ paddd m14, m10
+ paddd m15, m10
+ psrad m14, 19
+ psrad m15, 19
+ packssdw m14, m15
+ %endif
+%endif
+ packuswb m12, m14
+ movq [dstq+dsq*0], m12
+ movhps [dstq+dsq*1], m12
+ dec counterd
+ jg .loop
+.end:
+ RET
+ALIGN function_align
+.main:
+%assign stack_offset stack_offset+gprsize
+%if ARCH_X86_32
+ %assign stack_size stack_size+4
+ %if copy_args
+ %assign stack_offset stack_offset-4
+ %endif
+ RELOC_ARGS
+ LEA PIC_reg, $$
+ %define PIC_mem [esp+gprsize+0x114]
+ mov abcdd, abcdm
+ %if copy_args == 0
+ mov ssd, ssm
+ mov mxd, mxm
+ %endif
+ mov PIC_mem, PIC_reg
+ mov srcd, srcm
+%endif
+ movsx deltad, word [abcdq+2*2]
+ movsx gammad, word [abcdq+2*3]
+ lea tmp1d, [deltaq*3]
+ sub gammad, tmp1d ; gamma -= delta*3
+ SAVE_DELTA_GAMMA
+%if ARCH_X86_32
+ mov abcdd, abcdm
+%endif
+ movsx alphad, word [abcdq+2*0]
+ movsx betad, word [abcdq+2*1]
+ lea tmp1q, [ssq*3+3]
+ add mxd, 512+(64<<10)
+ lea tmp2d, [alphaq*3]
+ sub srcq, tmp1q ; src -= src_stride*3 + 3
+%if ARCH_X86_32
+ mov srcm, srcd
+ mov PIC_reg, PIC_mem
+%endif
+ sub betad, tmp2d ; beta -= alpha*3
+ lea filterq, [PIC_sym(mc_warp_filter2)]
+%if ARCH_X86_64
+ mov myd, r6m
+ %if cpuflag(ssse3)
+ pxor m11, m11
+ %endif
+%endif
+ call .h
+ psrld m2, m0, 16
+ psrld m3, m1, 16
+%if ARCH_X86_32
+ %if notcpuflag(ssse3)
+ mova [esp+gprsize+0x00], m2
+ %endif
+ mova [esp+gprsize+0x10], m3
+%endif
+ call .h
+ psrld m4, m0, 16
+ psrld m5, m1, 16
+%if ARCH_X86_32
+ mova [esp+gprsize+0x20], m4
+ mova [esp+gprsize+0x30], m5
+%endif
+ call .h
+%if ARCH_X86_64
+ %define blendmask [rsp+gprsize+0x80]
+%else
+ %if notcpuflag(ssse3)
+ mova m2, [esp+gprsize+0x00]
+ %endif
+ mova m3, [esp+gprsize+0x10]
+ %define blendmask [esp+gprsize+0x120]
+ %define m10 m7
+%endif
+ pcmpeqd m10, m10
+ pslld m10, 16
+ mova blendmask, m10
+ BLENDHWDW m2, m0 ; 0
+ BLENDHWDW m3, m1 ; 2
+ mova [rsp+gprsize+0x00], m2
+ mova [rsp+gprsize+0x10], m3
+ call .h
+%if ARCH_X86_32
+ mova m4, [esp+gprsize+0x20]
+ mova m5, [esp+gprsize+0x30]
+%endif
+ mova m10, blendmask
+ BLENDHWDW m4, m0 ; 1
+ BLENDHWDW m5, m1 ; 3
+ mova [rsp+gprsize+0x20], m4
+ mova [rsp+gprsize+0x30], m5
+ call .h
+%if ARCH_X86_32
+ %if notcpuflag(ssse3)
+ mova m2, [esp+gprsize+0x00]
+ %endif
+ mova m3, [esp+gprsize+0x10]
+ %define m10 m5
+%endif
+ psrld m6, m2, 16
+ psrld m7, m3, 16
+ mova m10, blendmask
+ BLENDHWDW m6, m0 ; 2
+ BLENDHWDW m7, m1 ; 4
+ mova [rsp+gprsize+0x40], m6
+ mova [rsp+gprsize+0x50], m7
+ call .h
+%if ARCH_X86_32
+ mova m4, [esp+gprsize+0x20]
+ mova m5, [esp+gprsize+0x30]
+%endif
+ psrld m2, m4, 16
+ psrld m3, m5, 16
+ mova m10, blendmask
+ BLENDHWDW m2, m0 ; 3
+ BLENDHWDW m3, m1 ; 5
+ mova [rsp+gprsize+0x60], m2
+ mova [rsp+gprsize+0x70], m3
+ call .h
+%if ARCH_X86_32
+ mova m6, [esp+gprsize+0x40]
+ mova m7, [esp+gprsize+0x50]
+ %define m10 m7
+%endif
+ psrld m4, m6, 16
+ psrld m5, m7, 16
+ mova m10, blendmask
+ BLENDHWDW m4, m0 ; 4
+ BLENDHWDW m5, m1 ; 6
+%if ARCH_X86_64
+ add myd, 512+(64<<10)
+ mova m6, m2
+ mova m7, m3
+%else
+ mova [esp+gprsize+0x80], m4
+ mova [esp+gprsize+0x90], m5
+ add dword mym, 512+(64<<10)
+%endif
+ mov counterd, 4
+ SAVE_ALPHA_BETA
+.main2:
+ call .h
+%if ARCH_X86_32
+ mova m6, [esp+gprsize+0x60]
+ mova m7, [esp+gprsize+0x70]
+ %define m10 m5
+%endif
+ psrld m6, 16
+ psrld m7, 16
+ mova m10, blendmask
+ BLENDHWDW m6, m0 ; 5
+ BLENDHWDW m7, m1 ; 7
+%if ARCH_X86_64
+ WARP_V m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
+ m4, m5, \
+ [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
+ m6, m7
+%else
+ mova [esp+gprsize+0xA0], m6
+ mova [esp+gprsize+0xB0], m7
+ LOAD_DELTA_GAMMA_MY
+ WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \
+ [esp+gprsize+0x00], [esp+gprsize+0x10], \
+ [esp+gprsize+0x80], [esp+gprsize+0x90], \
+ [esp+gprsize+0x20], [esp+gprsize+0x30], \
+ [esp+gprsize+0xA0], [esp+gprsize+0xB0]
+ LOAD_ALPHA_BETA_MX
+%endif
+ call .h
+ mova m2, [rsp+gprsize+0x40]
+ mova m3, [rsp+gprsize+0x50]
+%if ARCH_X86_32
+ mova m4, [rsp+gprsize+0x80]
+ mova m5, [rsp+gprsize+0x90]
+ %define m10 m7
+%endif
+ mova [rsp+gprsize+0x00], m2
+ mova [rsp+gprsize+0x10], m3
+ mova [rsp+gprsize+0x40], m4
+ mova [rsp+gprsize+0x50], m5
+ psrld m4, 16
+ psrld m5, 16
+ mova m10, blendmask
+ BLENDHWDW m4, m0 ; 6
+ BLENDHWDW m5, m1 ; 8
+%if ARCH_X86_64
+ WARP_V m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
+ m6, m7, \
+ [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
+ m4, m5
+%else
+ mova [esp+gprsize+0x80], m4
+ mova [esp+gprsize+0x90], m5
+ LOAD_DELTA_GAMMA_MY
+ WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \
+ [esp+gprsize+0x20], [esp+gprsize+0x30], \
+ [esp+gprsize+0xA0], [esp+gprsize+0xB0], \
+ [esp+gprsize+0x00], [esp+gprsize+0x10], \
+ [esp+gprsize+0x80], [esp+gprsize+0x90]
+ mov mym, myd
+ mov dstd, dstm
+ mov dsd, dsm
+ mov mxd, mxm
+%endif
+ mova m2, [rsp+gprsize+0x60]
+ mova m3, [rsp+gprsize+0x70]
+%if ARCH_X86_32
+ mova m6, [esp+gprsize+0xA0]
+ mova m7, [esp+gprsize+0xB0]
+%endif
+ mova [rsp+gprsize+0x20], m2
+ mova [rsp+gprsize+0x30], m3
+ mova [rsp+gprsize+0x60], m6
+ mova [rsp+gprsize+0x70], m7
+ ret
+ALIGN function_align
+.h:
+%if ARCH_X86_32
+ %define m8 m3
+ %define m9 m4
+ %define m10 m5
+ %define m14 m6
+ %define m15 m7
+%endif
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+%if ARCH_X86_32
+ %assign stack_offset stack_offset+4
+ %assign stack_size stack_size+4
+ %define PIC_mem [esp+gprsize*2+0x114]
+ mov PIC_mem, PIC_reg
+ mov srcd, srcm
+%endif
+ movu m10, [srcq]
+%if ARCH_X86_32
+ add srcd, ssm
+ mov srcm, srcd
+ mov PIC_reg, PIC_mem
+%else
+ add srcq, ssq
+%endif
+ shr mxd, 10
+ shr tmp1d, 10
+ movq m1, [filterq+mxq *8] ; 0 X
+ movq m8, [filterq+tmp1q*8] ; 4 X
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+alphaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movhps m1, [filterq+tmp2q*8] ; 0 1
+ movhps m8, [filterq+tmp1q*8] ; 4 5
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ shr mxd, 10
+ shr tmp1d, 10
+%if cpuflag(ssse3)
+ movq m14, [filterq+mxq *8] ; 2 X
+ movq m9, [filterq+tmp1q*8] ; 6 X
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movhps m14, [filterq+tmp2q*8] ; 2 3
+ movhps m9, [filterq+tmp1q*8] ; 6 7
+ pshufb m0, m10, [PIC_sym(warp_8x8_shufA)]
+ pmaddubsw m0, m1
+ pshufb m1, m10, [PIC_sym(warp_8x8_shufB)]
+ pmaddubsw m1, m8
+ pshufb m15, m10, [PIC_sym(warp_8x8_shufC)]
+ pmaddubsw m15, m14
+ pshufb m10, m10, [PIC_sym(warp_8x8_shufD)]
+ pmaddubsw m10, m9
+ phaddw m0, m15
+ phaddw m1, m10
+%else
+ %if ARCH_X86_32
+ %define m11 m2
+ %endif
+ pcmpeqw m0, m0
+ psrlw m14, m0, 8
+ psrlw m15, m10, 8 ; 01 03 05 07 09 11 13 15
+ pand m14, m10 ; 00 02 04 06 08 10 12 14
+ packuswb m14, m15 ; 00 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15
+ psrldq m9, m0, 4
+ pshufd m0, m14, q0220
+ pand m0, m9
+ psrldq m14, 1 ; 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __
+ pslldq m15, m14, 12
+ por m0, m15 ; shufA
+ psrlw m15, m0, 8
+ psraw m11, m1, 8
+ psllw m0, 8
+ psllw m1, 8
+ psrlw m0, 8
+ psraw m1, 8
+ pmullw m15, m11
+ pmullw m0, m1
+ paddw m0, m15 ; pmaddubsw m0, m1
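+ ; note: the psrlw/psllw+psrlw pairs above zero-extend the odd/even unsigned source bytes of m0
+ ; while psraw/psllw+psraw sign-extend the matching signed filter bytes of m1, so the two pmullw
+ ; plus the paddw reproduce pmaddubsw's pairwise u8*s8 multiply-add without SSSE3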
+ pshufd m15, m14, q0220
+ pand m15, m9
+ psrldq m14, 1 ; 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __
+ pslldq m1, m14, 12
+ por m15, m1 ; shufC
+ pshufd m1, m14, q0220
+ pand m1, m9
+ psrldq m14, 1 ; 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __
+ pslldq m11, m14, 12
+ por m1, m11 ; shufB
+ pshufd m10, m14, q0220
+ pand m10, m9
+ psrldq m14, 1 ; 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ __
+ pslldq m14, m14, 12
+ por m10, m14 ; shufD
+ psrlw m9, m1, 8
+ psraw m11, m8, 8
+ psllw m1, 8
+ psllw m8, 8
+ psrlw m1, 8
+ psraw m8, 8
+ pmullw m9, m11
+ pmullw m1, m8
+ paddw m1, m9 ; pmaddubsw m1, m8
+ movq m14, [filterq+mxq *8] ; 2 X
+ movq m9, [filterq+tmp1q*8] ; 6 X
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movhps m14, [filterq+tmp2q*8] ; 2 3
+ movhps m9, [filterq+tmp1q*8] ; 6 7
+ psrlw m8, m15, 8
+ psraw m11, m14, 8
+ psllw m15, 8
+ psllw m14, 8
+ psrlw m15, 8
+ psraw m14, 8
+ pmullw m8, m11
+ pmullw m15, m14
+ paddw m15, m8 ; pmaddubsw m15, m14
+ psrlw m8, m10, 8
+ psraw m11, m9, 8
+ psllw m10, 8
+ psllw m9, 8
+ psrlw m10, 8
+ psraw m9, 8
+ pmullw m8, m11
+ pmullw m10, m9
+ paddw m10, m8 ; pmaddubsw m10, m9
+ pslld m8, m0, 16
+ pslld m9, m1, 16
+ pslld m14, m15, 16
+ pslld m11, m10, 16
+ paddw m0, m8
+ paddw m1, m9
+ paddw m15, m14
+ paddw m10, m11
+ psrad m0, 16
+ psrad m1, 16
+ psrad m15, 16
+ psrad m10, 16
+ packssdw m0, m15 ; phaddw m0, m15
+ packssdw m1, m10 ; phaddw m1, m10
+%endif
+ mova m14, [PIC_sym(pw_8192)]
+ mova m9, [PIC_sym(pd_32768)]
+ pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
+ pmaddwd m1, m14
+ paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword
+ paddd m1, m9
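+ ; note: each dword now holds (sum << 13) + (1 << 15), so the upper 16 bits extracted by the
+ ; callers (psrld by 16 / BLENDHWDW) equal (sum + 4) >> 3, the rounded 14-bit result noted above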
+ ret
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%macro BIDIR_FN 1 ; op
+ %1 0
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq*4]
+.w4: ; tile 4x
+ movd [dstq ], m0 ; copy dw[0]
+ pshuflw m1, m0, q1032 ; swap dw[1] and dw[0]
+ movd [dstq+strideq*1], m1 ; copy dw[1]
+ punpckhqdq m0, m0 ; move dw[3:2] down into dw[1:0]
+ movd [dstq+strideq*2], m0 ; dw[2]
+ psrlq m0, 32 ; shift dw[3] down into dw[0]
+ movd [dstq+stride3q ], m0 ; copy dw[3]
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq*2]
+.w8:
+ movq [dstq ], m0
+ movhps [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq]
+.w16:
+ mova [dstq ], m0
+ dec hd
+ jg .w16_loop
+ RET
+.w32_loop:
+ %1_INC_PTR 4
+ %1 0
+ lea dstq, [dstq+strideq]
+.w32:
+ mova [dstq ], m0
+ %1 2
+ mova [dstq + 16 ], m0
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ %1_INC_PTR 8
+ %1 0
+ add dstq, strideq
+.w64:
+ %assign i 0
+ %rep 4
+ mova [dstq + i*16 ], m0
+ %assign i i+1
+ %if i < 4
+ %1 2*i
+ %endif
+ %endrep
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ %1_INC_PTR 16
+ %1 0
+ add dstq, strideq
+.w128:
+ %assign i 0
+ %rep 8
+ mova [dstq + i*16 ], m0
+ %assign i i+1
+ %if i < 8
+ %1 2*i
+ %endif
+ %endrep
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%macro AVG 1 ; src_offset
+ ; writes AVG of tmp1 tmp2 uint16 coeffs into uint8 pixel
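+ ; m2 holds pw_1024 (set up by the caller); pmulhrsw by 1024 is (x + 16) >> 5, so the stored
+ ; pixel is (tmp1 + tmp2 + 16) >> 5, e.g. tmp1 + tmp2 = 3184 -> 100 (illustrative)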
+ mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 coef(2bytes) from tmp1
+ paddw m0, [tmp2q+(%1+0)*mmsize] ; load/add 8 coef(2bytes) tmp2
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ paddw m1, [tmp2q+(%1+1)*mmsize]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ packuswb m0, m1 ; pack/trunc 16 bits from m0 & m1 to 8 bit
+%endmacro
+
+%macro AVG_INC_PTR 1
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
+cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+ LEA r6, avg_ssse3_table
+ tzcnt wd, wm ; trailing zeros (log2 of block width)
+ movifnidn hd, hm ; move h(stack) to h(register) if not already that register
+ movsxd wq, dword [r6+wq*4] ; load the sign-extended jump-table entry for this width into a wide reg
+ mova m2, [pw_1024+r6-avg_ssse3_table] ; pw_1024: rounding constant for the AVG macro's pmulhrsw
+ add wq, r6
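+ ; wq now appears to hold the absolute address of the matching .wN label (each table entry is
+ ; the label's offset from avg_ssse3_table); BIDIR_FN ends its prologue with "jmp wq"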
+ BIDIR_FN AVG
+
+%macro W_AVG 1 ; src_offset
+ ; (a * weight + b * (16 - weight) + 128) >> 8
+ ; = ((a - b) * weight + (b << 4) + 128) >> 8
+ ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
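+ ; worked example (illustrative): a=1000, b=200, weight=12
+ ;   direct:    (1000*12 + 200*(16-12) + 128) >> 8 = 12928 >> 8 = 50
+ ;   rewritten: ((1000-200) * ((12-16) << 12)) >> 16 = -200, (-200 + 1000 + 8) >> 4 = 50
+ ; weights > 7 use the third form (psllw by 12 already reads as (weight-16) << 12 when signed);
+ ; weights <= 7 swap tmp1/tmp2 and negate the factor to use the fourth form (see w_avg_8bpc below)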
+ mova m2, [tmp1q+(%1+0)*mmsize]
+ mova m0, m2
+ psubw m2, [tmp2q+(%1+0)*mmsize]
+ mova m3, [tmp1q+(%1+1)*mmsize]
+ mova m1, m3
+ psubw m3, [tmp2q+(%1+1)*mmsize]
+ pmulhw m2, m4
+ pmulhw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+ LEA r6, w_avg_ssse3_table
+ tzcnt wd, wm
+ movd m4, r6m
+ movifnidn hd, hm
+ pxor m0, m0
+ movsxd wq, dword [r6+wq*4]
+ mova m5, [pw_2048+r6-w_avg_ssse3_table]
+ pshufb m4, m0
+ psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
+ add wq, r6
+ cmp dword r6m, 7
+ jg .weight_gt7
+ mov r6, tmp1q
+ psubw m0, m4
+ mov tmp1q, tmp2q
+ mova m4, m0 ; -weight
+ mov tmp2q, r6
+.weight_gt7:
+ BIDIR_FN W_AVG
+
+%macro MASK 1 ; src_offset
+ ; (a * m + b * (64 - m) + 512) >> 10
+ ; = ((a - b) * m + (b << 6) + 512) >> 10
+ ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
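+ ; worked example (illustrative): a=1000, b=200, m=48
+ ;   direct:    (1000*48 + 200*(64-48) + 512) >> 10 = 51712 >> 10 = 50
+ ;   rewritten: ((200-1000) * (-48 << 10)) >> 16 = 600, (600 + 200 + 8) >> 4 = 50
+ ; below, (b - a) and -m are each pre-doubled, so pmulhw multiplies (b-a) << 1 by -m << 9,
+ ; which is the same << 10 product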
+ mova m3, [maskq+(%1+0)*(mmsize/2)]
+ mova m0, [tmp2q+(%1+0)*mmsize] ; b
+ psubw m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
+ mova m6, m3 ; m
+ psubb m3, m4, m6 ; -m
+ paddw m1, m1 ; (b - a) << 1
+ paddb m3, m3 ; -m << 1
+ punpcklbw m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16)
+ pmulhw m1, m2 ; (-m * (b - a)) << 10
+ paddw m0, m1 ; + b
+ mova m1, [tmp2q+(%1+1)*mmsize] ; b
+ psubw m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
+ paddw m2, m2 ; (b - a) << 1
+ mova m6, m3 ; (-m << 1)
+ punpckhbw m3, m4, m6 ; (-m << 9)
+ pmulhw m2, m3 ; (-m * (b - a)) << 10
+ paddw m1, m2 ; + b
+ pmulhrsw m0, m5 ; round
+ pmulhrsw m1, m5 ; round
+ packuswb m0, m1 ; interleave 16 -> 8
+%endmacro
+
+%macro MASK_INC_PTR 1
+ add maskq, %1*mmsize/2
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
+%if ARCH_X86_64
+cglobal mask_8bpc, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
+ movifnidn hd, hm
+%else
+cglobal mask_8bpc, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
+%define hd dword r5m
+%endif
+%define base r6-mask_ssse3_table
+ LEA r6, mask_ssse3_table
+ tzcnt wd, wm
+ movsxd wq, dword [r6+wq*4]
+ pxor m4, m4
+ mova m5, [base+pw_2048]
+ add wq, r6
+ mov maskq, r6m
+ BIDIR_FN MASK
+%undef hd
+
+%macro W_MASK_420_END 1-*
+%rep %0
+ call .main
+ paddw m2, [maskq+16*%1]
+ mova [maskq+16*%1], m2
+ mova [dstq+strideq*1+16*(2*%1+0)], m0
+ call .main
+ psubw m3, m7, m2
+ psubw m1, m7, [maskq+16*%1]
+ psubw m3, [dstq+strideq*1+16*(2*%1+1)]
+ psrlw m1, 2
+ psrlw m3, 2
+ packuswb m1, m3
+ mova [maskq+16*%1], m1
+ mova [dstq+strideq*1+16*(2*%1+1)], m0
+ %rotate 1
+%endrep
+%endmacro
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_420_ssse3_table
+ LEA t0, w_mask_420_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ sub tmp2q, tmp1q
+ movsxd wq, [t0+wq*4]
+ mova m6, [base+pw_2048]
+ movddup m7, [base+wm_420_sign+r6*8] ; 258 - sign
+ add wq, t0
+%if ARCH_X86_64
+ mova m8, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ movifnidn hd, hm
+%else
+ %define m8 [base+pw_6903]
+ %define hd dword hm
+%endif
+ mov maskq, maskmp
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ add maskq, 4
+ lea dstq, [dstq+strideq*2]
+.w4:
+ pshufd m3, m2, q2020
+ pshufd m2, m2, q3131
+ psubw m1, m7, m3
+ psubw m1, m2
+ psrlw m1, 2
+ packuswb m1, m1
+ movd [maskq], m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ call .main
+ add maskq, 4
+ lea dstq, [dstq+strideq*2]
+.w8:
+ movhlps m3, m2
+ psubw m1, m7, m2
+ psubw m1, m3
+ psrlw m1, 2
+ packuswb m1, m1
+ movd [maskq], m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16_loop:
+ call .main
+ add maskq, 8
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*1], m2
+ mova [dstq+strideq*0], m0
+ call .main
+ psubw m1, m7, [dstq+strideq*1]
+ psubw m1, m2
+ psrlw m1, 2
+ packuswb m1, m1
+ movq [maskq], m1
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add maskq, 16
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [maskq], m2
+ mova [dstq+strideq*0+16*0], m0
+ call .main
+ mova [dstq+strideq*1+16*1], m2
+ mova [dstq+strideq*0+16*1], m0
+ W_MASK_420_END 0
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add maskq, 16*2
+ lea dstq, [dstq+strideq*2]
+.w64:
+ mova [maskq+16*0], m2
+ mova [dstq+strideq*0+16*0], m0
+ call .main
+ mova [dstq+strideq*1+16*1], m2
+ mova [dstq+strideq*0+16*1], m0
+ call .main
+ mova [maskq+16*1], m2
+ mova [dstq+strideq*0+16*2], m0
+ call .main
+ mova [dstq+strideq*1+16*3], m2
+ mova [dstq+strideq*0+16*3], m0
+ W_MASK_420_END 0, 1
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add maskq, 16*4
+ lea dstq, [dstq+strideq*2]
+.w128:
+ mova [maskq+16*0], m2
+ mova [dstq+strideq*0+16*0], m0
+ call .main
+ mova [dstq+strideq*1+16*1], m2
+ mova [dstq+strideq*0+16*1], m0
+ call .main
+ mova [maskq+16*1], m2
+ mova [dstq+strideq*0+16*2], m0
+ call .main
+ mova [dstq+strideq*1+16*3], m2
+ mova [dstq+strideq*0+16*3], m0
+ call .main
+ mova [maskq+16*2], m2
+ mova [dstq+strideq*0+16*4], m0
+ call .main
+ mova [dstq+strideq*1+16*5], m2
+ mova [dstq+strideq*0+16*5], m0
+ call .main
+ mova [maskq+16*3], m2
+ mova [dstq+strideq*0+16*6], m0
+ call .main
+ mova [dstq+strideq*1+16*7], m2
+ mova [dstq+strideq*0+16*7], m0
+ W_MASK_420_END 0, 1, 2, 3
+ sub hd, 2
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ mova m0, [tmp1q +16*0]
+ mova m3, [tmp1q+tmp2q+16*0]
+ mova m1, [tmp1q +16*1]
+ mova m4, [tmp1q+tmp2q+16*1]
+ add tmp1q, 16*2
+ psubw m3, m0
+ psubw m4, m1
+ pabsw m5, m3
+ psubusw m2, m8, m5
+ psrlw m2, 8 ; 64 - m
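+ ; note: with m8 = pw_6903 = ((64-38) << 8) + 255 - 8, psubusw+psrlw above is equivalent to
+ ; max(0, 26 - ((|tmp1-tmp2| + 8) >> 8)), i.e. 64 - m for m = min(64, 38 + ((|tmp1-tmp2| + 8) >> 8))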
+ psllw m5, m2, 10
+ pmulhw m3, m5
+ pabsw m5, m4
+ paddw m0, m3
+ psubusw m3, m8, m5
+ psrlw m3, 8
+ phaddw m2, m3
+ psllw m3, 10
+ pmulhw m4, m3
+ paddw m1, m4
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ packuswb m0, m1
+ ret
+
+%macro W_MASK_422_BACKUP 1 ; mask_offset
+%if ARCH_X86_64
+ mova m10, m2
+%else
+ mova [maskq+16*%1], m2
+%endif
+%endmacro
+
+%macro W_MASK_422_END 1 ; mask_offset
+%if ARCH_X86_64
+ packuswb m10, m2
+ psubb m1, m7, m10
+ pavgb m1, m9
+%else
+ mova m3, [maskq+16*%1]
+ packuswb m3, m2
+ pxor m2, m2
+ psubb m1, m7, m3
+ pavgb m1, m2
+%endif
+ mova [maskq+16*%1], m1
+%endmacro
+
+cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_422_ssse3_table
+ LEA t0, w_mask_422_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ sub tmp2q, tmp1q
+ movsxd wq, [t0+wq*4]
+ mova m6, [base+pw_2048]
+ movddup m7, [base+wm_422_sign+r6*8] ; 128 - sign
+ add wq, t0
+%if ARCH_X86_64
+ mova m8, [base+pw_6903]
+ pxor m9, m9
+ movifnidn hd, hm
+%else
+ add t0, w_mask_420_ssse3_table-w_mask_422_ssse3_table
+ %define hd dword hm
+%endif
+ mov maskq, maskmp
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ jmp wq
+.w4_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 8
+ lea dstq, [dstq+strideq*2]
+.w4:
+ packuswb m2, m2
+ psubb m1, m7, m2
+%if ARCH_X86_64
+ pavgb m1, m9
+%else
+ pxor m2, m2
+ pavgb m1, m2
+%endif
+ movq [maskq], m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 16
+ lea dstq, [dstq+strideq*2]
+.w8:
+ W_MASK_422_BACKUP 0
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ lea dstq, [dstq+strideq*2]
+ W_MASK_422_END 0
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 16
+ lea dstq, [dstq+strideq*2]
+.w16:
+ W_MASK_422_BACKUP 0
+ mova [dstq+strideq*0], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 0
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 16
+ add dstq, strideq
+.w32:
+ W_MASK_422_BACKUP 0
+ mova [dstq+16*0], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 0
+ mova [dstq+16*1], m0
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 16*2
+ add dstq, strideq
+.w64:
+ W_MASK_422_BACKUP 0
+ mova [dstq+16*0], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 0
+ mova [dstq+16*1], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_BACKUP 1
+ mova [dstq+16*2], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 1
+ mova [dstq+16*3], m0
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 16*4
+ add dstq, strideq
+.w128:
+ W_MASK_422_BACKUP 0
+ mova [dstq+16*0], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 0
+ mova [dstq+16*1], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_BACKUP 1
+ mova [dstq+16*2], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 1
+ mova [dstq+16*3], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_BACKUP 2
+ mova [dstq+16*4], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 2
+ mova [dstq+16*5], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_BACKUP 3
+ mova [dstq+16*6], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 3
+ mova [dstq+16*7], m0
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_444_ssse3_table
+ LEA t0, w_mask_444_ssse3_table
+ tzcnt wd, wm
+ mov maskq, maskmp
+ sub tmp2q, tmp1q
+ movsxd wq, [t0+wq*4]
+ mova m6, [base+pw_6903]
+ mova m7, [base+pw_2048]
+ add wq, t0
+%if ARCH_X86_64
+ mova m8, [base+pb_64]
+ movifnidn hd, hm
+%else
+ %define m8 [base+pb_64]
+ %define hd dword hm
+%endif
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w4:
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w8:
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*0], m0
+ call .main
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+16*0], m0
+ call .main
+ mova [dstq+16*1], m0
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+16*0], m0
+ call .main
+ mova [dstq+16*1], m0
+ call .main
+ mova [dstq+16*2], m0
+ call .main
+ mova [dstq+16*3], m0
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+16*0], m0
+ call .main
+ mova [dstq+16*1], m0
+ call .main
+ mova [dstq+16*2], m0
+ call .main
+ mova [dstq+16*3], m0
+ call .main
+ mova [dstq+16*4], m0
+ call .main
+ mova [dstq+16*5], m0
+ call .main
+ mova [dstq+16*6], m0
+ call .main
+ mova [dstq+16*7], m0
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ mova m0, [tmp1q +16*0]
+ mova m3, [tmp1q+tmp2q+16*0]
+ mova m1, [tmp1q +16*1]
+ mova m4, [tmp1q+tmp2q+16*1]
+ add tmp1q, 16*2
+ psubw m3, m0
+ psubw m4, m1
+ pabsw m5, m3
+ psubusw m2, m6, m5
+ psrlw m2, 8 ; 64 - m
+ psllw m5, m2, 10
+ pmulhw m3, m5
+ pabsw m5, m4
+ paddw m0, m3
+ psubusw m3, m6, m5
+ psrlw m3, 8
+ packuswb m2, m3
+ psllw m3, 10
+ pmulhw m4, m3
+ psubb m3, m8, m2
+ paddw m1, m4
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+ mova [maskq], m3
+ add maskq, 16
+ packuswb m0, m1
+ ret
+
+%macro BLEND_64M 4; a, b, mask1, mask2
+ punpcklbw m0, %1, %2; {b;a}[7..0]
+ punpckhbw %1, %2 ; {b;a}[15..8]
+ pmaddubsw m0, %3 ; {b*m[0] + (64-m[0])*a}[7..0] u16
+ pmaddubsw %1, %4 ; {b*m[1] + (64-m[1])*a}[15..8] u16
+ pmulhrsw m0, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16
+ pmulhrsw %1, m5 ; {((b*m[1] + (64-m[1])*a) + 1) / 32}[15..8] u16
+ packuswb m0, %1 ; {blendpx}[15..0] u8
+%endmacro
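+; note: m5 is pw_512, and pmulhrsw by 512 is (x + 32) >> 6, so each blended pixel is
+; (b*m + (64-m)*a + 32) >> 6; e.g. a=100, b=200, m=48 gives (9600 + 1600 + 32) >> 6 = 175 (illustrative)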
+
+%macro BLEND 2; a, b
+ psubb m3, m4, m0 ; m3 = (64 - m)
+ punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0]
+ punpckhbw m3, m0 ; {m;(64-m)}[15..8]
+ BLEND_64M %1, %2, m2, m3
+%endmacro
+
+cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
+%define base r6-blend_ssse3_table
+ LEA r6, blend_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movifnidn maskq, maskmp
+ movsxd wq, dword [r6+wq*4]
+ mova m4, [base+pb_64]
+ mova m5, [base+pw_512]
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ movq m0, [maskq]; m
+ movd m1, [dstq+dsq*0] ; a
+ movd m6, [dstq+dsq*1]
+ punpckldq m1, m6
+ movq m6, [tmpq] ; b
+ psubb m3, m4, m0 ; m3 = (64 - m)
+ punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0]
+ punpcklbw m1, m6 ; {b;a}[7..0]
+ pmaddubsw m1, m2 ; {b*m[0] + (64-m[0])*a}[7..0] u16
+ pmulhrsw m1, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16
+ packuswb m1, m0 ; {blendpx}[15..0] u8
+ movd [dstq+dsq*0], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ add maskq, 8
+ add tmpq, 8
+ lea dstq, [dstq+dsq*2] ; dst_stride * 2
+ sub hd, 2
+ jg .w4
+ RET
+.w8:
+ mova m0, [maskq]; m
+ movq m1, [dstq+dsq*0] ; a
+ movhps m1, [dstq+dsq*1]
+ mova m6, [tmpq] ; b
+ BLEND m1, m6
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ add maskq, 16
+ add tmpq, 16
+ lea dstq, [dstq+dsq*2] ; dst_stride * 2
+ sub hd, 2
+ jg .w8
+ RET
+.w16:
+ mova m0, [maskq]; m
+ mova m1, [dstq] ; a
+ mova m6, [tmpq] ; b
+ BLEND m1, m6
+ mova [dstq], m0
+ add maskq, 16
+ add tmpq, 16
+ add dstq, dsq ; dst_stride
+ dec hd
+ jg .w16
+ RET
+.w32:
+ %assign i 0
+ %rep 2
+ mova m0, [maskq+16*i]; m
+ mova m1, [dstq+16*i] ; a
+ mova m6, [tmpq+16*i] ; b
+ BLEND m1, m6
+ mova [dstq+i*16], m0
+ %assign i i+1
+ %endrep
+ add maskq, 32
+ add tmpq, 32
+ add dstq, dsq ; dst_stride
+ dec hd
+ jg .w32
+ RET
+
+cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_ssse3_table
+ LEA r5, blend_v_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r5+wq*4]
+ mova m5, [base+pw_512]
+ add wq, r5
+ add maskq, obmc_masks-blend_v_ssse3_table
+ jmp wq
+.w2:
+ movd m3, [maskq+4]
+ punpckldq m3, m3
+ ; a 2-element mask blend is provided for 4 pixels / 2 lines
+.w2_loop:
+ movd m1, [dstq+dsq*0] ; a {..;a;a}
+ pinsrw m1, [dstq+dsq*1], 1
+ movd m2, [tmpq] ; b
+ punpcklbw m0, m1, m2; {b;a}[7..0]
+ pmaddubsw m0, m3 ; {b*m + (64-m)*a}[7..0] u16
+ pmulhrsw m0, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
+ packuswb m0, m1 ; {blendpx}[8..0] u8
+ movd r3d, m0
+ mov [dstq+dsq*0], r3w
+ shr r3d, 16
+ mov [dstq+dsq*1], r3w
+ add tmpq, 2*2
+ lea dstq, [dstq + dsq * 2]
+ sub hd, 2
+ jg .w2_loop
+ RET
+.w4:
+ movddup m3, [maskq+8]
+ ; a 4-element mask blend is provided for 8 pixels / 2 lines
+.w4_loop:
+ movd m1, [dstq+dsq*0] ; a
+ movd m2, [dstq+dsq*1] ;
+ punpckldq m1, m2
+ movq m2, [tmpq] ; b
+ punpcklbw m1, m2 ; {b;a}[7..0]
+ pmaddubsw m1, m3 ; {b*m + (64-m)*a}[7..0] u16
+ pmulhrsw m1, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
+ packuswb m1, m1 ; {blendpx}[8..0] u8
+ movd [dstq], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ add tmpq, 2*4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+.w8:
+ mova m3, [maskq+16]
+ ; an 8-element mask blend is provided for 16 pixels
+.w8_loop:
+ movq m1, [dstq+dsq*0] ; a
+ movhps m1, [dstq+dsq*1]
+ mova m2, [tmpq]; b
+ BLEND_64M m1, m2, m3, m3
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ add tmpq, 16
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ ; a 16-element mask blend is provided for 32 pixels
+ mova m3, [maskq+32] ; obmc_masks_16[0] (64-m[0])
+ mova m4, [maskq+48] ; obmc_masks_16[1] (64-m[1])
+.w16_loop:
+ mova m1, [dstq] ; a
+ mova m2, [tmpq] ; b
+ BLEND_64M m1, m2, m3, m4
+ mova [dstq], m0
+ add tmpq, 16
+ add dstq, dsq
+ dec hd
+ jg .w16_loop
+ RET
+.w32:
+%if WIN64
+ mova [rsp+8], xmm6
+%endif
+ mova m3, [maskq+64] ; obmc_masks_32[0] (64-m[0])
+ mova m4, [maskq+80] ; obmc_masks_32[1] (64-m[1])
+ mova m6, [maskq+96] ; obmc_masks_32[2] (64-m[2])
+ ; a 16-element mask blend is provided for 64 pixels
+.w32_loop:
+ mova m1, [dstq+16*0] ; a
+ mova m2, [tmpq+16*0] ; b
+ BLEND_64M m1, m2, m3, m4
+ movq m1, [dstq+16*1] ; a
+ punpcklbw m1, [tmpq+16*1] ; b
+ pmaddubsw m1, m6
+ pmulhrsw m1, m5
+ packuswb m1, m1
+ mova [dstq+16*0], m0
+ movq [dstq+16*1], m1
+ add tmpq, 32
+ add dstq, dsq
+ dec hd
+ jg .w32_loop
+%if WIN64
+ mova xmm6, [rsp+8]
+%endif
+ RET
+
+cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
+%define base t0-blend_h_ssse3_table
+%if ARCH_X86_32
+ ; We need to keep the PIC pointer for w4, reload wd from stack instead
+ DECLARE_REG_TMP 6
+%else
+ DECLARE_REG_TMP 5
+ mov r6d, wd
+%endif
+ LEA t0, blend_h_ssse3_table
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, dword [t0+wq*4]
+ mova m5, [base+pw_512]
+ add wq, t0
+ lea maskq, [base+obmc_masks+hq*2]
+ lea hd, [hq*3]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd m0, [dstq+dsq*0]
+ pinsrw m0, [dstq+dsq*1], 1
+ movd m2, [maskq+hq*2]
+ movd m1, [tmpq]
+ punpcklwd m2, m2
+ punpcklbw m0, m1
+ pmaddubsw m0, m2
+ pmulhrsw m0, m5
+ packuswb m0, m0
+ movd r3d, m0
+ mov [dstq+dsq*0], r3w
+ shr r3d, 16
+ mov [dstq+dsq*1], r3w
+ lea dstq, [dstq+dsq*2]
+ add tmpq, 2*2
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+%if ARCH_X86_32
+ mova m3, [base+blend_shuf]
+%else
+ mova m3, [blend_shuf]
+%endif
+.w4_loop:
+ movd m0, [dstq+dsq*0]
+ movd m2, [dstq+dsq*1]
+ punpckldq m0, m2 ; a
+ movq m1, [tmpq] ; b
+ movq m2, [maskq+hq*2] ; m
+ pshufb m2, m3
+ punpcklbw m0, m1
+ pmaddubsw m0, m2
+ pmulhrsw m0, m5
+ packuswb m0, m0
+ movd [dstq+dsq*0], m0
+ psrlq m0, 32
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add tmpq, 4*2
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+ movd m4, [maskq+hq*2]
+ punpcklwd m4, m4
+ pshufd m3, m4, q0000
+ pshufd m4, m4, q1111
+ movq m1, [dstq+dsq*0] ; a
+ movhps m1, [dstq+dsq*1]
+ mova m2, [tmpq]
+ BLEND_64M m1, m2, m3, m4
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add tmpq, 8*2
+ add hq, 2
+ jl .w8
+ RET
+; w16/w32/w64/w128
+.w16:
+%if ARCH_X86_32
+ mov r6d, wm
+%endif
+ sub dsq, r6
+.w16_loop0:
+ movd m3, [maskq+hq*2]
+ pshuflw m3, m3, q0000
+ punpcklqdq m3, m3
+ mov wd, r6d
+.w16_loop:
+ mova m1, [dstq] ; a
+ mova m2, [tmpq] ; b
+ BLEND_64M m1, m2, m3, m3
+ mova [dstq], m0
+ add dstq, 16
+ add tmpq, 16
+ sub wd, 16
+ jg .w16_loop
+ add dstq, dsq
+ inc hq
+ jl .w16_loop0
+ RET
+
+; emu_edge args:
+; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
+; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
+; const pixel *ref, const ptrdiff_t ref_stride
+;
+; bw, bh: total size to fill
+; iw, ih: size of the copied block -> fill bottom/right edges
+; x, y:   offset of the block within bw/bh -> fill top/left edges
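+;
+; Rough C-style sketch of the sizes computed below (the same iclip()
+; expressions appear as comments next to each code block; names follow the
+; argument list above):
+;   ref       += iclip(y, 0, ih - 1) * ref_stride + iclip(x, 0, iw - 1);
+;   bottom_ext = iclip(y + bh - ih, 0, bh - 1);
+;   top_ext    = iclip(-y,          0, bh - 1);
+;   right_ext  = iclip(x + bw - iw, 0, bw - 1);
+;   left_ext   = iclip(-x,          0, bw - 1);
+;   center_h   = bh - top_ext  - bottom_ext;
+;   center_w   = bw - left_ext - right_ext;
+; The center_h x center_w body is copied first, then the borders are
+; replicated outward.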
+cglobal emu_edge_8bpc, 10, 13, 2, bw, bh, iw, ih, x, \
+ y, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes
+ pxor m1, m1
+
+%if ARCH_X86_64
+ %define reg_zero r12q
+ %define reg_tmp r10
+ %define reg_src srcq
+ %define reg_bottomext bottomextq
+ %define reg_rightext rightextq
+ %define reg_blkm r9m
+%else
+ %define reg_zero r6
+ %define reg_tmp r0
+ %define reg_src r1
+ %define reg_bottomext r0
+ %define reg_rightext r1
+ %define reg_blkm r2m
+%endif
+ ;
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor reg_zero, reg_zero
+ lea reg_tmp, [ihq-1]
+ cmp yq, ihq
+ cmovs reg_tmp, yq
+ test yq, yq
+ cmovs reg_tmp, reg_zero
+%if ARCH_X86_64
+ imul reg_tmp, sstrideq
+ add srcq, reg_tmp
+%else
+ imul reg_tmp, sstridem
+ mov reg_src, srcm
+ add reg_src, reg_tmp
+%endif
+ ;
+ ; ref += iclip(x, 0, iw - 1)
+ lea reg_tmp, [iwq-1]
+ cmp xq, iwq
+ cmovs reg_tmp, xq
+ test xq, xq
+ cmovs reg_tmp, reg_zero
+ add reg_src, reg_tmp
+%if ARCH_X86_32
+ mov srcm, reg_src
+%endif
+ ;
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+%if ARCH_X86_32
+ mov r1, r1m ; restore bh
+%endif
+ lea reg_bottomext, [yq+bhq]
+ sub reg_bottomext, ihq
+ lea r3, [bhq-1]
+ cmovs reg_bottomext, reg_zero
+ ;
+
+ DEFINE_ARGS bw, bh, iw, ih, x, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; top_ext = iclip(-y, 0, bh - 1)
+ neg topextq
+ cmovs topextq, reg_zero
+ cmp reg_bottomext, bhq
+ cmovns reg_bottomext, r3
+ cmp topextq, bhq
+ cmovg topextq, r3
+ %if ARCH_X86_32
+ mov r4m, reg_bottomext
+ ;
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ mov r0, r0m ; restore bw
+ %endif
+ lea reg_rightext, [xq+bwq]
+ sub reg_rightext, iwq
+ lea r2, [bwq-1]
+ cmovs reg_rightext, reg_zero
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; left_ext = iclip(-x, 0, bw - 1)
+ neg leftextq
+ cmovs leftextq, reg_zero
+ cmp reg_rightext, bwq
+ cmovns reg_rightext, r2
+ %if ARCH_X86_32
+ mov r3m, r1
+ %endif
+ cmp leftextq, bwq
+ cmovns leftextq, r2
+
+%undef reg_zero
+%undef reg_tmp
+%undef reg_src
+%undef reg_bottomext
+%undef reg_rightext
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; center_h = bh - top_ext - bottom_ext
+%if ARCH_X86_64
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3
+%else
+ mov r1, centerhm ; restore r1
+ sub centerhq, topextq
+ sub centerhq, r4m
+ mov r1m, centerhq
+%endif
+ ;
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+%if ARCH_X86_64
+ imul r2, dstrideq
+%else
+ mov r6, r6m ; restore dstq
+ imul r2, dstridem
+%endif
+ add dstq, r2
+ mov reg_blkm, dstq ; save pointer for ext
+ ;
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+%if ARCH_X86_64
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
+%else
+ sub centerwq, r3m
+ sub centerwq, leftextq
+%endif
+
+; vloop Macro
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+ %if ARCH_X86_64
+ %define reg_tmp r12
+ %else
+ %define reg_tmp r0
+ %endif
+.v_loop_%3:
+ %if ARCH_X86_32
+ mov r0, r0m
+ mov r1, r1m
+ %endif
+%if %1
+ ; left extension
+ %if ARCH_X86_64
+ movd m0, [srcq]
+ %else
+ mov r3, srcm
+ movd m0, [r3]
+ %endif
+ pshufb m0, m1
+ xor r3, r3
+.left_loop_%3:
+ mova [dstq+r3], m0
+ add r3, mmsize
+ cmp r3, leftextq
+ jl .left_loop_%3
+ ; body
+ lea reg_tmp, [dstq+leftextq]
+%endif
+ xor r3, r3
+.body_loop_%3:
+ %if ARCH_X86_64
+ movu m0, [srcq+r3]
+ %else
+ mov r1, srcm
+ movu m0, [r1+r3]
+ %endif
+%if %1
+ movu [reg_tmp+r3], m0
+%else
+ movu [dstq+r3], m0
+%endif
+ add r3, mmsize
+ cmp r3, centerwq
+ jl .body_loop_%3
+%if %2
+ ; right extension
+%if %1
+ add reg_tmp, centerwq
+%else
+ lea reg_tmp, [dstq+centerwq]
+%endif
+ %if ARCH_X86_64
+ movd m0, [srcq+centerwq-1]
+ %else
+ mov r3, srcm
+ movd m0, [r3+centerwq-1]
+ %endif
+ pshufb m0, m1
+ xor r3, r3
+.right_loop_%3:
+ movu [reg_tmp+r3], m0
+ add r3, mmsize
+ %if ARCH_X86_64
+ cmp r3, rightextq
+ %else
+ cmp r3, r3m
+ %endif
+ jl .right_loop_%3
+%endif
+ %if ARCH_X86_64
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+ %else
+ add dstq, dstridem
+ mov r0, sstridem
+ add srcm, r0
+ sub dword centerhm, 1
+ jg .v_loop_%3
+ mov r0, r0m ; restore r0
+ %endif
+%endmacro ; vloop MACRO
+
+ test leftextq, leftextq
+ jnz .need_left_ext
+ %if ARCH_X86_64
+ test rightextq, rightextq
+ jnz .need_right_ext
+ %else
+ cmp leftextq, r3m ; leftextq == 0
+ jne .need_right_ext
+ %endif
+ v_loop 0, 0, 0
+ jmp .body_done
+
+ ; left/right extensions
+.need_left_ext:
+ %if ARCH_X86_64
+ test rightextq, rightextq
+ %else
+ mov r3, r3m
+ test r3, r3
+ %endif
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+; r0 ; bw
+; r1 ; x loop
+; r4 ; y loop
+; r5 ; topextq
+; r6 ; dstq
+; r7 ; dstrideq
+; r8 ; srcq
+%if ARCH_X86_64
+ %define reg_dstride dstrideq
+%else
+ %define reg_dstride r2
+%endif
+ ;
+ ; bottom edge extension
+ %if ARCH_X86_64
+ test bottomextq, bottomextq
+ jz .top
+ %else
+ xor r1, r1
+ cmp r1, r4m
+ je .top
+ %endif
+ ;
+ %if ARCH_X86_64
+ mov srcq, dstq
+ sub srcq, dstrideq
+ xor r1, r1
+ %else
+ mov r3, dstq
+ mov reg_dstride, dstridem
+ sub r3, reg_dstride
+ mov srcm, r3
+ %endif
+ ;
+.bottom_x_loop:
+ %if ARCH_X86_64
+ mova m0, [srcq+r1]
+ lea r3, [dstq+r1]
+ mov r4, bottomextq
+ %else
+ mov r3, srcm
+ mova m0, [r3+r1]
+ lea r3, [dstq+r1]
+ mov r4, r4m
+ %endif
+ ;
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, reg_dstride
+ dec r4
+ jg .bottom_y_loop
+ add r1, mmsize
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension
+ test topextq, topextq
+ jz .end
+%if ARCH_X86_64
+ mov srcq, reg_blkm
+%else
+ mov r3, reg_blkm
+ mov reg_dstride, dstridem
+%endif
+ mov dstq, dstm
+ xor r1, r1
+ ;
+.top_x_loop:
+%if ARCH_X86_64
+ mova m0, [srcq+r1]
+%else
+ mov r3, reg_blkm
+ mova m0, [r3+r1]
+%endif
+ lea r3, [dstq+r1]
+ mov r4, topextq
+ ;
+.top_y_loop:
+ mova [r3], m0
+ add r3, reg_dstride
+ dec r4
+ jg .top_y_loop
+ add r1, mmsize
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
+
+%undef reg_dstride
+%undef reg_blkm
+%undef reg_tmp
+
+cextern resize_filter
+
+%macro SCRATCH 3
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2
+%endif
+%endmacro
+
+%if ARCH_X86_64
+cglobal resize_8bpc, 0, 12, 14, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+%elif STACK_ALIGNMENT >= 16
+cglobal resize_8bpc, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+%else
+cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+%endif
+ movifnidn dstq, dstmp
+ movifnidn srcq, srcmp
+%if STACK_ALIGNMENT >= 16
+ movifnidn dst_wd, dst_wm
+%endif
+%if ARCH_X86_64
+ movifnidn hd, hm
+%endif
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ movd m7, dxm
+ movd m6, mx0m
+ movd m5, src_wm
+ pshufd m7, m7, q0000
+ pshufd m6, m6, q0000
+ pshufd m5, m5, q0000
+
+%if ARCH_X86_64
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
+ LEA r7, $$
+%define base r7-$$
+%else
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
+%define hd dword r5m
+%if STACK_ALIGNMENT >= 16
+ LEA r6, $$
+%define base r6-$$
+%else
+ LEA r4, $$
+%define base r4-$$
+%endif
+%endif
+
+%if ARCH_X86_64
+ mova m10, [base+pw_m256]
+ mova m9, [base+pd_63]
+ mova m8, [base+pb_8x0_8x8]
+%else
+%define m10 [base+pw_m256]
+%define m9 [base+pd_63]
+%define m8 [base+pb_8x0_8x8]
+%endif
+ pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
+ pslld m7, 2 ; dx*4
+ pslld m5, 14
+ paddd m6, m4 ; mx+[0..3]*dx
+ SCRATCH 7, 13, 0
+ SCRATCH 6, 12, 1
+ SCRATCH 5, 11, 2
+
+ ; m10 = pmulhrsw constant for x=(x+64)>>7
+ ; m12 = mx+[0..3]*dx, m13 = dx*4, m11 = src_w, m9 = 0x3f, m8=0,8
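+ ; Rough per-output-pixel view of the vector loop below (4 pixels per
+ ; iteration), using only relations already noted in the comments: an 8-tap
+ ; filter selected by (mx >> 8) & 63 is applied at the clipped source
+ ; position mx >> 14, the sum is rounded as (t + 64) >> 7, and mx advances
+ ; by dx per pixel (i.e. m0 += dx*4 per iteration). Loads whose 8-pixel
+ ; window would read outside the row are remapped via resize_shuf instead.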
+
+.loop_y:
+ xor xd, xd
+ mova m0, m12 ; per-line working version of mx
+
+.loop_x:
+ pxor m1, m1
+ pcmpgtd m1, m0
+ pandn m1, m0
+ psrad m2, m0, 8 ; filter offset (unmasked)
+ pcmpgtd m3, m11, m1
+ pand m1, m3
+ pandn m3, m11
+ por m1, m3
+ psubd m3, m0, m1 ; pshufb offset
+ psrad m1, 14 ; clipped src_x offset
+ psrad m3, 14 ; pshufb edge_emu offset
+ pand m2, m9 ; filter offset (masked)
+
+ ; load source pixels
+%if ARCH_X86_64
+ movd r8d, m1
+ pshuflw m1, m1, q3232
+ movd r9d, m1
+ punpckhqdq m1, m1
+ movd r10d, m1
+ psrlq m1, 32
+ movd r11d, m1
+ movq m4, [srcq+r8]
+ movq m5, [srcq+r10]
+ movhps m4, [srcq+r9]
+ movhps m5, [srcq+r11]
+%else
+ movd r3d, m1
+ pshufd m1, m1, q3312
+ movd r1d, m1
+ pshuflw m1, m1, q3232
+ movq m4, [srcq+r3]
+ movq m5, [srcq+r1]
+ movd r3d, m1
+ punpckhqdq m1, m1
+ movd r1d, m1
+ movhps m4, [srcq+r3]
+ movhps m5, [srcq+r1]
+%endif
+
+ ; if no emulation is required, we don't need to shuffle or emulate edges
+ ; this also saves 2 quasi-vpgatherdqs
+ pxor m6, m6
+ pcmpeqb m6, m3
+%if ARCH_X86_64
+ pmovmskb r8d, m6
+ cmp r8d, 0xffff
+%else
+ pmovmskb r3d, m6
+ cmp r3d, 0xffff
+%endif
+ je .filter
+
+%if ARCH_X86_64
+ movd r8d, m3
+ pshuflw m3, m3, q3232
+ movd r9d, m3
+ punpckhqdq m3, m3
+ movd r10d, m3
+ psrlq m3, 32
+ movd r11d, m3
+ movsxd r8, r8d
+ movsxd r9, r9d
+ movsxd r10, r10d
+ movsxd r11, r11d
+ movq m6, [base+resize_shuf+4+r8]
+ movq m7, [base+resize_shuf+4+r10]
+ movhps m6, [base+resize_shuf+4+r9]
+ movhps m7, [base+resize_shuf+4+r11]
+%else
+ movd r3d, m3
+ pshufd m3, m3, q3312
+ movd r1d, m3
+ pshuflw m3, m3, q3232
+ movq m6, [base+resize_shuf+4+r3]
+ movq m7, [base+resize_shuf+4+r1]
+ movd r3d, m3
+ punpckhqdq m3, m3
+ movd r1d, m3
+ movhps m6, [base+resize_shuf+4+r3]
+ movhps m7, [base+resize_shuf+4+r1]
+%endif
+
+ paddb m6, m8
+ paddb m7, m8
+ pshufb m4, m6
+ pshufb m5, m7
+
+.filter:
+%if ARCH_X86_64
+ movd r8d, m2
+ pshuflw m2, m2, q3232
+ movd r9d, m2
+ punpckhqdq m2, m2
+ movd r10d, m2
+ psrlq m2, 32
+ movd r11d, m2
+ movq m6, [base+resize_filter+r8*8]
+ movq m7, [base+resize_filter+r10*8]
+ movhps m6, [base+resize_filter+r9*8]
+ movhps m7, [base+resize_filter+r11*8]
+%else
+ movd r3d, m2
+ pshufd m2, m2, q3312
+ movd r1d, m2
+ pshuflw m2, m2, q3232
+ movq m6, [base+resize_filter+r3*8]
+ movq m7, [base+resize_filter+r1*8]
+ movd r3d, m2
+ punpckhqdq m2, m2
+ movd r1d, m2
+ movhps m6, [base+resize_filter+r3*8]
+ movhps m7, [base+resize_filter+r1*8]
+%endif
+
+ pmaddubsw m4, m6
+ pmaddubsw m5, m7
+ phaddw m4, m5
+ phaddsw m4, m4
+ pmulhrsw m4, m10 ; x=(x+64)>>7
+ packuswb m4, m4
+ movd [dstq+xq], m4
+
+ paddd m0, m13
+ add xd, 4
+%if STACK_ALIGNMENT >= 16
+ cmp xd, dst_wd
+%else
+ cmp xd, dst_wm
+%endif
+ jl .loop_x
+
+ add dstq, dst_stridemp
+ add srcq, src_stridemp
+ dec hd
+ jg .loop_y
+ RET
+
+INIT_XMM ssse3
+PREP_BILIN
+PREP_8TAP
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM sse4
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM sse2
+PREP_BILIN
+PREP_8TAP
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
diff --git a/third_party/dav1d/src/x86/msac.asm b/third_party/dav1d/src/x86/msac.asm
new file mode 100644
index 0000000000..9f05c921a6
--- /dev/null
+++ b/third_party/dav1d/src/x86/msac.asm
@@ -0,0 +1,667 @@
+; Copyright © 2019, VideoLAN and dav1d authors
+; Copyright © 2019, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 64 ; avoids cacheline splits
+
+min_prob: dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+pw_0xff00: times 8 dw 0xff00
+pw_32: times 8 dw 32
+
+%if ARCH_X86_64
+%define resp resq
+%define movp movq
+%define c_shuf q3333
+%macro DECODE_SYMBOL_ADAPT_INIT 0-1
+%endmacro
+%else
+%define resp resd
+%define movp movd
+%define c_shuf q1111
+%macro DECODE_SYMBOL_ADAPT_INIT 0-1 0 ; hi_tok
+ mov t0, r0m
+ mov t1, r1m
+%if %1 == 0
+ mov t2, r2m
+%endif
+%if STACK_ALIGNMENT >= 16
+ sub esp, 40-%1*4
+%else
+ mov eax, esp
+ and esp, ~15
+ sub esp, 40-%1*4
+ mov [esp], eax
+%endif
+%endmacro
+%endif
+
+struc msac
+ .buf: resp 1
+ .end: resp 1
+ .dif: resp 1
+ .rng: resd 1
+ .cnt: resd 1
+ .update_cdf: resd 1
+endstruc
+
+%define m(x, y) mangle(private_prefix %+ _ %+ x %+ y)
+
+SECTION .text
+
+%if WIN64
+DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3, 8
+%define buf rsp+stack_offset+8 ; shadow space
+%elif UNIX64
+DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0, 8
+%define buf rsp-40 ; red zone
+%else
+DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2, 3
+%define buf esp+8
+%endif
+
+INIT_XMM sse2
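+; Decodes one symbol from a small adaptive CDF (up to 4 probability words).
+; Roughly, the psrlw/psllw/pmulhuw/paddw sequence below computes per symbol
+;   v[i] = ((rng >> 8) * (cdf[i] >> 6) >> 1) + 4*(n_symbols - i)
+; (the last term coming from the min_prob table), and the decoded symbol is
+; the first i for which the top 16 bits of dif are >= v[i].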
+cglobal msac_decode_symbol_adapt4, 0, 6, 6
+ DECODE_SYMBOL_ADAPT_INIT
+ LEA rax, pw_0xff00
+ movd m2, [t0+msac.rng]
+ movq m1, [t1]
+ movp m3, [t0+msac.dif]
+ mov t3d, [t0+msac.update_cdf]
+ mov t4d, t2d
+ not t2 ; -(n_symbols + 1)
+ pshuflw m2, m2, q0000
+ movd [buf+12], m2
+ pand m2, [rax]
+ mova m0, m1
+ psrlw m1, 6
+ psllw m1, 7
+ pmulhuw m1, m2
+ movq m2, [rax+t2*2]
+ pshuflw m3, m3, c_shuf
+ paddw m1, m2
+ mova [buf+16], m1
+ psubusw m1, m3
+ pxor m2, m2
+ pcmpeqw m1, m2 ; c >= v
+ pmovmskb eax, m1
+ test t3d, t3d
+ jz .renorm ; !allow_update_cdf
+
+; update_cdf:
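+; Consolidated from the per-instruction comments below (a sketch of the
+; C-style adaptation, not additional work):
+;   rate = (count >> 4) + (n_symbols > 2) + 4
+;   i <  val: cdf[i] += (32768 - cdf[i]) >> rate
+;   i >= val: cdf[i] += ((-1 - cdf[i]) >> rate) + 1
+;   count += (count < 32)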
+ movzx t3d, word [t1+t4*2] ; count
+ pcmpeqw m2, m2
+ mov t2d, t3d
+ shr t3d, 4
+ cmp t4d, 3
+ sbb t3d, -5 ; (count >> 4) + (n_symbols > 2) + 4
+ cmp t2d, 32
+ adc t2d, 0 ; count + (count < 32)
+ movd m3, t3d
+ pavgw m2, m1 ; i >= val ? -1 : 32768
+ psubw m2, m0 ; for (i = 0; i < val; i++)
+ psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate;
+ psraw m2, m3 ; for (; i < n_symbols; i++)
+ paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1;
+ movq [t1], m0
+ mov [t1+t4*2], t2w
+
+.renorm:
+ tzcnt eax, eax
+ mov t4, [t0+msac.dif]
+ movzx t1d, word [buf+rax+16] ; v
+ movzx t2d, word [buf+rax+14] ; u
+ shr eax, 1
+.renorm2:
+%if ARCH_X86_64 == 0
+%if STACK_ALIGNMENT >= 16
+ add esp, 40
+%else
+ mov esp, [esp]
+%endif
+%endif
+ not t4
+ sub t2d, t1d ; rng
+ shl t1, gprsize*8-16
+ add t4, t1 ; ~dif
+.renorm3:
+ mov t1d, [t0+msac.cnt]
+ movifnidn t7, t0
+.renorm4:
+ bsr ecx, t2d
+ xor ecx, 15 ; d
+.renorm5:
+ shl t2d, cl
+ shl t4, cl
+ mov [t7+msac.rng], t2d
+ not t4
+ sub t1d, ecx
+ jae .end ; no refill required
+
+; refill:
+ mov t2, [t7+msac.buf]
+ mov rcx, [t7+msac.end]
+%if ARCH_X86_64 == 0
+ push t5
+%endif
+ lea t5, [t2+gprsize]
+ cmp t5, rcx
+ ja .refill_eob
+ mov t2, [t2]
+ lea ecx, [t1+23]
+ add t1d, 16
+ shr ecx, 3 ; shift_bytes
+ bswap t2
+ sub t5, rcx
+ shl ecx, 3 ; shift_bits
+ shr t2, cl
+ sub ecx, t1d ; shift_bits - 16 - cnt
+ mov t1d, gprsize*8-16
+ shl t2, cl
+ mov [t7+msac.buf], t5
+ sub t1d, ecx ; cnt + gprsize*8 - shift_bits
+ xor t4, t2
+%if ARCH_X86_64 == 0
+ pop t5
+%endif
+.end:
+ mov [t7+msac.cnt], t1d
+ mov [t7+msac.dif], t4
+ RET
+.refill_eob: ; avoid overreading the input buffer
+ mov t5, rcx
+ mov ecx, gprsize*8-24
+ sub ecx, t1d ; c
+.refill_eob_loop:
+ cmp t2, t5
+ jae .refill_eob_end ; eob reached
+ movzx t1d, byte [t2]
+ inc t2
+ shl t1, cl
+ xor t4, t1
+ sub ecx, 8
+ jge .refill_eob_loop
+.refill_eob_end:
+ mov t1d, gprsize*8-24
+%if ARCH_X86_64 == 0
+ pop t5
+%endif
+ sub t1d, ecx
+ mov [t7+msac.buf], t2
+ mov [t7+msac.dif], t4
+ mov [t7+msac.cnt], t1d
+ RET
+
+cglobal msac_decode_symbol_adapt8, 0, 6, 6
+ DECODE_SYMBOL_ADAPT_INIT
+ LEA rax, pw_0xff00
+ movd m2, [t0+msac.rng]
+ mova m1, [t1]
+ movp m3, [t0+msac.dif]
+ mov t3d, [t0+msac.update_cdf]
+ mov t4d, t2d
+ not t2
+ pshuflw m2, m2, q0000
+ movd [buf+12], m2
+ punpcklqdq m2, m2
+ mova m0, m1
+ psrlw m1, 6
+ pand m2, [rax]
+ psllw m1, 7
+ pmulhuw m1, m2
+ movu m2, [rax+t2*2]
+ pshuflw m3, m3, c_shuf
+ paddw m1, m2
+ punpcklqdq m3, m3
+ mova [buf+16], m1
+ psubusw m1, m3
+ pxor m2, m2
+ pcmpeqw m1, m2
+ pmovmskb eax, m1
+ test t3d, t3d
+ jz m(msac_decode_symbol_adapt4, SUFFIX).renorm
+ movzx t3d, word [t1+t4*2]
+ pcmpeqw m2, m2
+ mov t2d, t3d
+ shr t3d, 4
+ cmp t4d, 3 ; may be called with n_symbols <= 2
+ sbb t3d, -5
+ cmp t2d, 32
+ adc t2d, 0
+ movd m3, t3d
+ pavgw m2, m1
+ psubw m2, m0
+ psubw m0, m1
+ psraw m2, m3
+ paddw m0, m2
+ mova [t1], m0
+ mov [t1+t4*2], t2w
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm
+
+cglobal msac_decode_symbol_adapt16, 0, 6, 6
+ DECODE_SYMBOL_ADAPT_INIT
+ LEA rax, pw_0xff00
+ movd m4, [t0+msac.rng]
+ mova m2, [t1]
+ mova m3, [t1+16]
+ movp m5, [t0+msac.dif]
+ mov t3d, [t0+msac.update_cdf]
+ mov t4d, t2d
+ not t2
+%if WIN64
+ sub rsp, 48 ; need 36 bytes, shadow space is only 32
+%endif
+ pshuflw m4, m4, q0000
+ movd [buf-4], m4
+ punpcklqdq m4, m4
+ mova m0, m2
+ psrlw m2, 6
+ mova m1, m3
+ psrlw m3, 6
+ pand m4, [rax]
+ psllw m2, 7
+ psllw m3, 7
+ pmulhuw m2, m4
+ pmulhuw m3, m4
+ movu m4, [rax+t2*2]
+ pshuflw m5, m5, c_shuf
+ paddw m2, m4
+ psubw m4, [rax-pw_0xff00+pw_32]
+ punpcklqdq m5, m5
+ paddw m3, m4
+ mova [buf], m2
+ psubusw m2, m5
+ mova [buf+16], m3
+ psubusw m3, m5
+ pxor m4, m4
+ pcmpeqw m2, m4
+ pcmpeqw m3, m4
+ packsswb m5, m2, m3
+ pmovmskb eax, m5
+ test t3d, t3d
+ jz .renorm
+ movzx t3d, word [t1+t4*2]
+ pcmpeqw m4, m4
+ mova m5, m4
+ lea t2d, [t3+80] ; only support n_symbols > 2
+ shr t2d, 4
+ cmp t3d, 32
+ adc t3d, 0
+ pavgw m4, m2
+ pavgw m5, m3
+ psubw m4, m0
+ psubw m0, m2
+ movd m2, t2d
+ psubw m5, m1
+ psubw m1, m3
+ psraw m4, m2
+ psraw m5, m2
+ paddw m0, m4
+ paddw m1, m5
+ mova [t1], m0
+ mova [t1+16], m1
+ mov [t1+t4*2], t3w
+.renorm:
+ tzcnt eax, eax
+ mov t4, [t0+msac.dif]
+ movzx t1d, word [buf+rax*2]
+ movzx t2d, word [buf+rax*2-2]
+%if WIN64
+ add rsp, 48
+%endif
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm2
+
+cglobal msac_decode_bool_adapt, 0, 6, 0
+ movifnidn t1, r1mp
+ movifnidn t0, r0mp
+ movzx eax, word [t1]
+ movzx t3d, byte [t0+msac.rng+1]
+ mov t4, [t0+msac.dif]
+ mov t2d, [t0+msac.rng]
+%if ARCH_X86_64
+ mov t5d, eax
+%endif
+ and eax, ~63
+ imul eax, t3d
+%if UNIX64
+ mov t6, t4
+%endif
+ shr eax, 7
+ add eax, 4 ; v
+ mov t3d, eax
+ shl rax, gprsize*8-16 ; vw
+ sub t2d, t3d ; r - v
+ sub t4, rax ; dif - vw
+ setb al
+ cmovb t2d, t3d
+ mov t3d, [t0+msac.update_cdf]
+%if UNIX64
+ cmovb t4, t6
+%else
+ cmovb t4, [t0+msac.dif]
+%endif
+%if ARCH_X86_64 == 0
+ movzx eax, al
+%endif
+ not t4
+ test t3d, t3d
+ jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3
+%if UNIX64 == 0
+ push t6
+%endif
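+; update_cdf, consolidated from the trailing comments below (sketch):
+;   rate = 4 + (count >> 4)
+;   bit  ? (cdf[0] -= ((cdf[0] - 32769) >> rate) + 1)
+;        : (cdf[0] -= cdf[0] >> rate)
+;   count += (count < 32)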
+ movzx t6d, word [t1+2]
+%if ARCH_X86_64 == 0
+ push t5
+ movzx t5d, word [t1]
+%endif
+ movifnidn t7, t0
+ lea ecx, [t6+64]
+ cmp t6d, 32
+ adc t6d, 0
+ mov [t1+2], t6w
+ imul t6d, eax, -32769
+ shr ecx, 4 ; rate
+ add t6d, t5d ; if (bit)
+ sub t5d, eax ; cdf[0] -= ((cdf[0] - 32769) >> rate) + 1;
+ sar t6d, cl ; else
+ sub t5d, t6d ; cdf[0] -= cdf[0] >> rate;
+ mov [t1], t5w
+%if WIN64
+ mov t1d, [t7+msac.cnt]
+ pop t6
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm4
+%else
+%if ARCH_X86_64 == 0
+ pop t5
+ pop t6
+%endif
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
+%endif
+
+cglobal msac_decode_bool_equi, 0, 6, 0
+ movifnidn t0, r0mp
+ mov t1d, [t0+msac.rng]
+ mov t4, [t0+msac.dif]
+ mov t2d, t1d
+ mov t1b, 8
+ mov t3, t4
+ mov eax, t1d
+ shr t1d, 1 ; v
+ shl rax, gprsize*8-17 ; vw
+ sub t2d, t1d ; r - v
+ sub t4, rax ; dif - vw
+ cmovb t2d, t1d
+ mov t1d, [t0+msac.cnt]
+ cmovb t4, t3
+ movifnidn t7, t0
+ mov ecx, 0xbfff
+ setb al ; the upper 32 bits contain garbage but that's OK
+ sub ecx, t2d
+ not t4
+ ; In the case of this function, (d =) 16 - clz(v) = 2 - (v >> 14)
+ ; i.e. (0 <= d <= 2) and v < (3 << 14)
+ shr ecx, 14 ; d
+%if ARCH_X86_64 == 0
+ movzx eax, al
+%endif
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm5
+
+cglobal msac_decode_bool, 0, 6, 0
+ movifnidn t0, r0mp
+ movifnidn t1d, r1m
+ movzx eax, byte [t0+msac.rng+1] ; r >> 8
+ mov t4, [t0+msac.dif]
+ mov t2d, [t0+msac.rng]
+ and t1d, ~63
+ imul eax, t1d
+ mov t3, t4
+ shr eax, 7
+ add eax, 4 ; v
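+ ; i.e. v = ((rng >> 8) * (f & ~63) >> 7) + 4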
+ mov t1d, eax
+ shl rax, gprsize*8-16 ; vw
+ sub t2d, t1d ; r - v
+ sub t4, rax ; dif - vw
+ cmovb t2d, t1d
+ cmovb t4, t3
+ setb al
+ not t4
+%if ARCH_X86_64 == 0
+ movzx eax, al
+%endif
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
+
+%macro HI_TOK 1 ; update_cdf
+%if ARCH_X86_64 == 0
+ mov eax, -24
+%endif
+%%loop:
+%if %1
+ movzx t2d, word [t1+3*2]
+%endif
+ mova m1, m0
+ pshuflw m2, m2, q0000
+ psrlw m1, 6
+ movd [buf+12], m2
+ pand m2, m4
+ psllw m1, 7
+ pmulhuw m1, m2
+%if ARCH_X86_64 == 0
+ add eax, 5
+ mov [buf+8], eax
+%endif
+ pshuflw m3, m3, c_shuf
+ paddw m1, m5
+ movq [buf+16], m1
+ psubusw m1, m3
+ pxor m2, m2
+ pcmpeqw m1, m2
+ pmovmskb eax, m1
+%if %1
+ lea ecx, [t2+80]
+ pcmpeqw m2, m2
+ shr ecx, 4
+ cmp t2d, 32
+ adc t2d, 0
+ movd m3, ecx
+ pavgw m2, m1
+ psubw m2, m0
+ psubw m0, m1
+ psraw m2, m3
+ paddw m0, m2
+ movq [t1], m0
+ mov [t1+3*2], t2w
+%endif
+ tzcnt eax, eax
+ movzx ecx, word [buf+rax+16]
+ movzx t2d, word [buf+rax+14]
+ not t4
+%if ARCH_X86_64
+ add t6d, 5
+%endif
+ sub eax, 5 ; setup for merging the tok_br and tok branches
+ sub t2d, ecx
+ shl rcx, gprsize*8-16
+ add t4, rcx
+ bsr ecx, t2d
+ xor ecx, 15
+ shl t2d, cl
+ shl t4, cl
+ movd m2, t2d
+ mov [t7+msac.rng], t2d
+ not t4
+ sub t5d, ecx
+ jae %%end
+ mov t2, [t7+msac.buf]
+ mov rcx, [t7+msac.end]
+%if UNIX64 == 0
+ push t8
+%endif
+ lea t8, [t2+gprsize]
+ cmp t8, rcx
+ ja %%refill_eob
+ mov t2, [t2]
+ lea ecx, [t5+23]
+ add t5d, 16
+ shr ecx, 3
+ bswap t2
+ sub t8, rcx
+ shl ecx, 3
+ shr t2, cl
+ sub ecx, t5d
+ mov t5d, gprsize*8-16
+ shl t2, cl
+ mov [t7+msac.buf], t8
+%if UNIX64 == 0
+ pop t8
+%endif
+ sub t5d, ecx
+ xor t4, t2
+%%end:
+ movp m3, t4
+%if ARCH_X86_64
+ add t6d, eax ; CF = tok_br < 3 || tok == 15
+ jnc %%loop
+ lea eax, [t6+30]
+%else
+ add eax, [buf+8]
+ jnc %%loop
+ add eax, 30
+%if STACK_ALIGNMENT >= 16
+ add esp, 36
+%else
+ mov esp, [esp]
+%endif
+%endif
+ mov [t7+msac.dif], t4
+ shr eax, 1
+ mov [t7+msac.cnt], t5d
+ RET
+%%refill_eob:
+ mov t8, rcx
+ mov ecx, gprsize*8-24
+ sub ecx, t5d
+%%refill_eob_loop:
+ cmp t2, t8
+ jae %%refill_eob_end
+ movzx t5d, byte [t2]
+ inc t2
+ shl t5, cl
+ xor t4, t5
+ sub ecx, 8
+ jge %%refill_eob_loop
+%%refill_eob_end:
+%if UNIX64 == 0
+ pop t8
+%endif
+ mov t5d, gprsize*8-24
+ mov [t7+msac.buf], t2
+ sub t5d, ecx
+ jmp %%end
+%endmacro
+
+cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6
+ DECODE_SYMBOL_ADAPT_INIT 1
+%if ARCH_X86_64 == 0 && PIC
+ LEA t2, min_prob+12*2
+ %define base t2-(min_prob+12*2)
+%else
+ %define base 0
+%endif
+ movq m0, [t1]
+ movd m2, [t0+msac.rng]
+ mov eax, [t0+msac.update_cdf]
+ movq m4, [base+pw_0xff00]
+ movp m3, [t0+msac.dif]
+ movq m5, [base+min_prob+12*2]
+ mov t4, [t0+msac.dif]
+ mov t5d, [t0+msac.cnt]
+%if ARCH_X86_64
+ mov t6d, -24
+%endif
+ movifnidn t7, t0
+ test eax, eax
+ jz .no_update_cdf
+ HI_TOK 1
+.no_update_cdf:
+ HI_TOK 0
+
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal msac_decode_symbol_adapt16, 3, 6, 6
+ lea rax, [pw_0xff00]
+ vpbroadcastw m2, [t0+msac.rng]
+ mova m0, [t1]
+ vpbroadcastw m3, [t0+msac.dif+6]
+ vbroadcasti128 m4, [rax]
+ mov t3d, [t0+msac.update_cdf]
+ mov t4d, t2d
+ not t2
+ mov r5, rsp
+%if WIN64
+ and rsp, ~31
+ sub rsp, 40
+%else
+ and r5, ~31
+ %define buf r5-32
+%endif
+ psrlw m1, m0, 6
+ movd [buf-4], xm2
+ pand m2, m4
+ psllw m1, 7
+ pmulhuw m1, m2
+ paddw m1, [rax+t2*2]
+ mova [buf], m1
+ pmaxuw m1, m3
+ pcmpeqw m1, m3
+ pmovmskb eax, m1
+ test t3d, t3d
+ jz .renorm
+ movzx t3d, word [t1+t4*2]
+ pcmpeqw m2, m2
+ lea t2d, [t3+80]
+ shr t2d, 4
+ cmp t3d, 32
+ adc t3d, 0
+ movd xm3, t2d
+ pavgw m2, m1
+ psubw m2, m0
+ psubw m0, m1
+ psraw m2, xm3
+ paddw m0, m2
+ mova [t1], m0
+ mov [t1+t4*2], t3w
+.renorm:
+ tzcnt eax, eax
+ mov t4, [t0+msac.dif]
+ movzx t1d, word [buf+rax-0]
+ movzx t2d, word [buf+rax-2]
+ shr eax, 1
+%if WIN64
+ mov rsp, r5
+%endif
+ vzeroupper
+ jmp m(msac_decode_symbol_adapt4, _sse2).renorm2
+%endif
diff --git a/third_party/dav1d/src/x86/msac.h b/third_party/dav1d/src/x86/msac.h
new file mode 100644
index 0000000000..0bb632fb31
--- /dev/null
+++ b/third_party/dav1d/src/x86/msac.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_X86_MSAC_H
+#define DAV1D_SRC_X86_MSAC_H
+
+#include "src/cpu.h"
+
+unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
+unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
+unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf);
+
+#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
+#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
+#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_sse2
+#endif
+
+#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_sse2
+#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_sse2
+#define dav1d_msac_decode_bool dav1d_msac_decode_bool_sse2
+
+#if ARCH_X86_64
+#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
+
+static ALWAYS_INLINE void msac_init_x86(MsacContext *const s) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (flags & DAV1D_X86_CPU_FLAG_SSE2) {
+ s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
+ }
+
+ if (flags & DAV1D_X86_CPU_FLAG_AVX2) {
+ s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
+ }
+}
+
+#elif defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
+#endif
+
+#endif /* DAV1D_SRC_X86_MSAC_H */
diff --git a/third_party/dav1d/src/x86/pal.asm b/third_party/dav1d/src/x86/pal.asm
new file mode 100644
index 0000000000..92075b9ba8
--- /dev/null
+++ b/third_party/dav1d/src/x86/pal.asm
@@ -0,0 +1,641 @@
+; Copyright © 2023, VideoLAN and dav1d authors
+; Copyright © 2023, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 64
+
+const pb_0to63, db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+%if ARCH_X86_64
+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
+ db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63
+%endif
+pal_idx_w8_padh: db 0, 1, 2, 3, 3, 3, 3, 3, 8, 9, 10, 11, 11, 11, 11, 11
+
+pb_1_16: times 4 db 1, 16
+%if ARCH_X86_64
+pb_32: times 4 db 32
+%endif
+
+%macro JMP_TABLE 2-*
+ %xdefine %1_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1)
+ %%table:
+ %rep %0 - 1
+ dd %%base %+ .w%2 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+JMP_TABLE pal_idx_finish_ssse3, 4, 8, 16, 32, 64
+%if ARCH_X86_64
+JMP_TABLE pal_idx_finish_avx2, 4, 8, 16, 32, 64
+JMP_TABLE pal_idx_finish_avx512icl, 4, 8, 16, 32, 64
+%endif
+
+SECTION .text
+
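+; pal_idx_finish packs the w x h block of 8-bit palette indices in src two
+; per byte into dst (low nibble first) and pads it out to bw x bh (in index
+; units) by repeating the last column and row. The packing step is the
+; pmaddubsw with the {1, 16} weights from pb_1_16 followed by packuswb,
+; i.e. roughly: dst[i] = src[2*i] | (src[2*i + 1] << 4).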
+INIT_XMM ssse3
+cglobal pal_idx_finish, 2, 7, 6, dst, src, bw, bh, w, h
+%define base r6-pal_idx_finish_ssse3_table
+ LEA r6, pal_idx_finish_ssse3_table
+ tzcnt bwd, bwm
+ movifnidn bhd, bhm
+ movifnidn wd, wm
+ movifnidn hd, hm
+ movsxd bwq, [r6+bwq*4]
+ movddup m3, [base+pb_1_16]
+ add bwq, r6
+ sub bhd, hd
+ jmp bwq
+.w4:
+ mova m0, [srcq]
+ add srcq, 16
+ pmaddubsw m0, m3
+ packuswb m0, m0
+ movq [dstq], m0
+ add dstq, 8
+ sub hd, 4
+ jg .w4
+ test bhd, bhd
+ jz .w4_end
+ pshuflw m0, m0, q3333
+.w4_padv:
+ movq [dstq], m0
+ add dstq, 8
+ sub bhd, 4
+ jg .w4_padv
+.w4_end:
+ RET
+.w8_padh:
+ pshufb m0, m2
+ pshufb m1, m2
+ jmp .w8_main
+.w8:
+ mova m2, [base+pal_idx_w8_padh]
+.w8_loop:
+ mova m0, [srcq+16*0]
+ mova m1, [srcq+16*1]
+ cmp wd, 8
+ jl .w8_padh
+.w8_main:
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ add srcq, 16*2
+ packuswb m0, m1
+ movu [dstq], m0
+ add dstq, 16
+ sub hd, 4
+ jg .w8_loop
+ test bhd, bhd
+ jz .w8_end
+ pshufd m0, m0, q3333
+.w8_padv:
+ movu [dstq], m0
+ add dstq, 16
+ sub bhd, 4
+ jg .w8_padv
+.w8_end:
+ RET
+.w16_padh:
+ pshufb m0, m4
+ pshufb m1, m4
+ jmp .w16_main
+.w16:
+ cmp wd, 16
+ je .w16_loop
+ call .setup_padh
+.w16_loop:
+ mova m0, [srcq+16*0]
+ mova m1, [srcq+16*1]
+ cmp wd, 16
+ jl .w16_padh
+.w16_main:
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ add srcq, 16*2
+ packuswb m0, m1
+ movu [dstq], m0
+ add dstq, 16
+ sub hd, 2
+ jg .w16_loop
+ test bhd, bhd
+ jz .w16_end
+ punpckhqdq m0, m0
+.w16_padv:
+ movu [dstq+16*0], m0
+ movu [dstq+16*1], m0
+ add dstq, 16*2
+ sub bhd, 4
+ jg .w16_padv
+.w16_end:
+ RET
+.w32_padh:
+ cmp wd, 16
+ jg .w32_padh2
+ pshufb m1, m0, m5
+ pshufb m0, m4
+ jmp .w32_main
+.w32_padh2:
+ pshufb m1, m4
+ jmp .w32_main
+.w32:
+ cmp wd, 32
+ je .w32_loop
+ call .setup_padh
+.w32_loop:
+ mova m0, [srcq+16*0]
+ mova m1, [srcq+16*1]
+ cmp wd, 32
+ jl .w32_padh
+.w32_main:
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ add srcq, 16*2
+ packuswb m0, m1
+ movu [dstq], m0
+ add dstq, 16
+ dec hd
+ jg .w32_loop
+ test bhd, bhd
+ jz .w32_end
+.w32_padv:
+ movu [dstq+16*0], m0
+ movu [dstq+16*1], m0
+ movu [dstq+16*2], m0
+ movu [dstq+16*3], m0
+ add dstq, 16*4
+ sub bhd, 4
+ jg .w32_padv
+.w32_end:
+ RET
+.w64_padh:
+ cmp wd, 16
+ jg .w64_padh2
+ pshufb m1, m0, m5
+ pshufb m0, m4
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ packuswb m0, m1
+ packuswb m1, m1
+ jmp .w64_main
+.w64_padh2:
+ pshufb m1, m4
+ pmaddubsw m0, m3
+ pmaddubsw m2, m1, m3
+ pshufb m1, m5
+ pmaddubsw m1, m3
+ packuswb m0, m2
+ packuswb m1, m1
+ jmp .w64_main
+.w64_padh3:
+ cmp wd, 48
+ jg .w64_padh4
+ pshufb m2, m1, m5
+ pshufb m1, m4
+ jmp .w64_main2
+.w64_padh4:
+ pshufb m2, m4
+ jmp .w64_main2
+.w64:
+ cmp wd, 64
+ je .w64_loop
+ call .setup_padh
+.w64_loop:
+ mova m0, [srcq+16*0]
+ mova m1, [srcq+16*1]
+ cmp wd, 32
+ jle .w64_padh
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ packuswb m0, m1
+ mova m1, [srcq+16*2]
+ mova m2, [srcq+16*3]
+ cmp wd, 64
+ jl .w64_padh3
+.w64_main2:
+ pmaddubsw m1, m3
+ pmaddubsw m2, m3
+ packuswb m1, m2
+.w64_main:
+ add srcq, 16*4
+ movu [dstq+16*0], m0
+ movu [dstq+16*1], m1
+ add dstq, 16*2
+ dec hd
+ jg .w64_loop
+ test bhd, bhd
+ jz .w64_end
+.w64_padv:
+ movu [dstq+16*0], m0
+ movu [dstq+16*1], m1
+ movu [dstq+16*2], m0
+ movu [dstq+16*3], m1
+ add dstq, 16*4
+ sub bhd, 2
+ jg .w64_padv
+.w64_end:
+ RET
+.setup_padh:
+ mova m4, [base+pb_0to63]
+ lea r6d, [wq-1]
+ and r6d, 15
+ movd m5, r6d
+ pxor m0, m0
+ pshufb m5, m0
+ pminub m4, m5
+ ret
+
+%if ARCH_X86_64
+
+INIT_YMM avx2
+cglobal pal_idx_finish, 4, 7, 5, dst, src, bw, bh, w, h
+%define base r6-pal_idx_finish_avx2_table
+ lea r6, [pal_idx_finish_avx2_table]
+ tzcnt bwd, bwd
+ movifnidn wd, wm
+ movifnidn hd, hm
+ movsxd bwq, [r6+bwq*4]
+ vpbroadcastd m2, [base+pb_1_16]
+ dec wd
+ add bwq, r6
+ sub bhd, hd
+ jmp bwq
+.w4:
+ mova xm0, [srcq]
+ add srcq, 16
+ pmaddubsw xm0, xm2
+ packuswb xm0, xm0
+ movq [dstq], xm0
+ add dstq, 8
+ sub hd, 4
+ jg .w4
+ test bhd, bhd
+ jz .w4_end
+ pshuflw xm0, xm0, q3333
+.w4_padv:
+ movq [dstq], xm0
+ add dstq, 8
+ sub bhd, 4
+ jg .w4_padv
+.w4_end:
+ RET
+.w8_padh:
+ pshufb xm0, xm3
+ pshufb xm1, xm3
+ jmp .w8_main
+.w8:
+ mova xm3, [base+pal_idx_w8_padh]
+.w8_loop:
+ mova xm0, [srcq+16*0]
+ mova xm1, [srcq+16*1]
+ cmp wd, 7
+ jl .w8_padh
+.w8_main:
+ pmaddubsw xm0, xm2
+ pmaddubsw xm1, xm2
+ add srcq, 16*2
+ packuswb xm0, xm1
+ movu [dstq], xm0
+ add dstq, 16
+ sub hd, 4
+ jg .w8_loop
+ test bhd, bhd
+ jz .w8_end
+ pshufd xm0, xm0, q3333
+.w8_padv:
+ movu [dstq], xm0
+ add dstq, 16
+ sub bhd, 4
+ jg .w8_padv
+.w8_end:
+ RET
+.w16_padh:
+ pshufb m0, m3
+ pshufb m1, m3
+ jmp .w16_main
+.w16:
+ cmp wd, 15
+ je .w16_loop
+ vbroadcasti128 m0, [base+pb_0to63]
+ movd xm3, wd
+ vpbroadcastb m3, xm3
+ pminub m3, m0
+.w16_loop:
+ mova m0, [srcq+32*0]
+ mova m1, [srcq+32*1]
+ cmp wd, 15
+ jl .w16_padh
+.w16_main:
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ add srcq, 32*2
+ packuswb m0, m1
+ vpermq m1, m0, q3120
+ movu [dstq], m1
+ add dstq, 32
+ sub hd, 4
+ jg .w16_loop
+ test bhd, bhd
+ jz .w16_end
+ vpermq m0, m0, q3333
+.w16_padv:
+ movu [dstq], m0
+ add dstq, 32
+ sub bhd, 4
+ jg .w16_padv
+.w16_end:
+ RET
+.w32_padh:
+ cmp wd, 15
+ jg .w32_padh2
+ vinserti128 m0, xm0, 1
+ vinserti128 m1, xm1, 1
+.w32_padh2:
+ pshufb m0, m3
+ pshufb m1, m3
+ jmp .w32_main
+.w32:
+ cmp wd, 31
+ je .w32_loop
+ movd xm3, wd
+ vpbroadcastb m3, xm3
+ pminub m3, [base+pb_0to63]
+.w32_loop:
+ mova m0, [srcq+32*0]
+ mova m1, [srcq+32*1]
+ cmp wd, 31
+ jl .w32_padh
+.w32_main:
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ add srcq, 32*2
+ packuswb m0, m1
+ vpermq m1, m0, q3120
+ movu [dstq], m1
+ add dstq, 32
+ sub hd, 2
+ jg .w32_loop
+ test bhd, bhd
+ jz .w32_end
+ vpermq m0, m0, q3131
+.w32_padv:
+ movu [dstq+32*0], m0
+ movu [dstq+32*1], m0
+ add dstq, 32*2
+ sub bhd, 4
+ jg .w32_padv
+.w32_end:
+ RET
+.w64_padh:
+ cmp wd, 15
+ jg .w64_padh2
+ vinserti128 m1, m0, xm0, 1
+ pshufb m0, m1, m3
+ pshufb m1, m4
+ jmp .w64_main
+.w64_padh2:
+ cmp wd, 31
+ jg .w64_padh3
+ vperm2i128 m1, m0, m0, 0x11
+ pshufb m0, m3
+ pshufb m1, m4
+ jmp .w64_main
+.w64_padh3:
+ cmp wd, 47
+ jg .w64_padh4
+ vinserti128 m1, xm1, 1
+.w64_padh4:
+ pshufb m1, m3
+ jmp .w64_main
+.w64:
+ cmp wd, 63
+ je .w64_loop
+ mov r6d, wd
+ and r6d, 31
+ movd xm4, r6d
+ vpbroadcastb m4, xm4
+ pminub m3, m4, [pb_0to63]
+.w64_loop:
+ mova m0, [srcq+32*0]
+ mova m1, [srcq+32*1]
+ cmp wd, 63
+ jl .w64_padh
+.w64_main:
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ add srcq, 32*2
+ packuswb m0, m1
+ vpermq m0, m0, q3120
+ movu [dstq], m0
+ add dstq, 32
+ dec hd
+ jg .w64_loop
+ test bhd, bhd
+ jz .w64_end
+.w64_padv:
+ movu [dstq+32*0], m0
+ movu [dstq+32*1], m0
+ movu [dstq+32*2], m0
+ movu [dstq+32*3], m0
+ add dstq, 32*4
+ sub bhd, 4
+ jg .w64_padv
+.w64_end:
+ RET
+
+INIT_ZMM avx512icl
+cglobal pal_idx_finish, 4, 7, 7, dst, src, bw, bh, w, h
+%define base r6-pal_idx_finish_avx512icl_table
+ lea r6, [pal_idx_finish_avx512icl_table]
+ tzcnt bwd, bwd
+ movifnidn wd, wm
+ movifnidn hd, hm
+ movsxd bwq, [r6+bwq*4]
+ vpbroadcastd m4, [base+pb_1_16]
+ dec wd
+ add bwq, r6
+ sub bhd, hd
+ jmp bwq
+.w4:
+ mova xmm0, [srcq]
+ add srcq, 16
+ pmaddubsw xmm0, xm4
+ packuswb xmm0, xmm0
+ movq [dstq], xmm0
+ add dstq, 8
+ sub hd, 4
+ jg .w4
+ test bhd, bhd
+ jz .w4_end
+ pshuflw xmm0, xmm0, q3333
+.w4_padv:
+ movq [dstq], xmm0
+ add dstq, 8
+ sub bhd, 4
+ jg .w4_padv
+.w4_end:
+ RET
+.w8_padh:
+ pshufb xmm0, xmm2
+ pshufb xmm1, xmm2
+ jmp .w8_main
+.w8:
+ mova xmm2, [base+pal_idx_w8_padh]
+.w8_loop:
+ mova xmm0, [srcq+16*0]
+ mova xmm1, [srcq+16*1]
+ cmp wd, 7
+ jl .w8_padh
+.w8_main:
+ pmaddubsw xmm0, xm4
+ pmaddubsw xmm1, xm4
+ add srcq, 16*2
+ packuswb xmm0, xmm1
+ movu [dstq], xmm0
+ add dstq, 16
+ sub hd, 4
+ jg .w8_loop
+ test bhd, bhd
+ jz .w8_end
+ pshufd xmm0, xmm0, q3333
+.w8_padv:
+ movu [dstq], xmm0
+ add dstq, 16
+ sub bhd, 4
+ jg .w8_padv
+.w8_end:
+ RET
+.w16_padh:
+ pshufb m0, m2
+ jmp .w16_main
+.w16:
+ cmp wd, 15
+ je .w16_loop
+ vbroadcasti32x4 m2, [base+pb_0to63]
+ vpbroadcastb m0, wd
+ pminub m2, m0
+.w16_loop:
+ mova m0, [srcq]
+ cmp wd, 15
+ jl .w16_padh
+.w16_main:
+ pmaddubsw m0, m4
+ add srcq, 64
+ vpmovwb ym0, m0
+ movu [dstq], ym0
+ add dstq, 32
+ sub hd, 4
+ jg .w16_loop
+ test bhd, bhd
+ jz .w16_end
+ vpermq ym0, ym0, q3333
+.w16_padv:
+ movu [dstq], ym0
+ add dstq, 32
+ sub bhd, 4
+ jg .w16_padv
+.w16_end:
+ RET
+.w32_padh:
+ vpermb m0, m2, m0
+ vpermb m1, m2, m1
+ jmp .w32_main
+.w32:
+ mova m2, [base+pb_0to63]
+ paddb m3, m2, m2
+ cmp wd, 31
+ je .w32_loop
+ vpbroadcastb m0, wd
+ mov r6d, 0xff00
+ kmovw k1, r6d
+ vpaddd m0{k1}, [pb_32] {1to16}
+ pminub m2, m0
+.w32_loop:
+ mova m0, [srcq+64*0]
+ mova m1, [srcq+64*1]
+ cmp wd, 31
+ jl .w32_padh
+.w32_main:
+ pmaddubsw m0, m4
+ pmaddubsw m1, m4
+ add srcq, 64*2
+ vpermt2b m0, m3, m1
+ movu [dstq], m0
+ add dstq, 64
+ sub hd, 4
+ jg .w32_loop
+ test bhd, bhd
+ jz .w32_end
+ vshufi32x4 m0, m0, q3333
+.w32_padv:
+ movu [dstq], m0
+ add dstq, 64
+ sub bhd, 4
+ jg .w32_padv
+.w32_end:
+ RET
+.w64_padh:
+ REPX {vpermb x, m5, x}, m0, m1, m2, m3
+ jmp .w64_main
+.w64:
+ mova m5, [base+pb_0to63]
+ paddb m6, m5, m5
+ cmp wd, 63
+ je .w64_loop
+ vpbroadcastb m0, wd
+ pminub m5, m0
+.w64_loop:
+ mova m0, [srcq+64*0]
+ mova m1, [srcq+64*1]
+ mova m2, [srcq+64*2]
+ mova m3, [srcq+64*3]
+ cmp wd, 63
+ jl .w64_padh
+.w64_main:
+ REPX {pmaddubsw x, m4}, m0, m1, m2, m3
+ add srcq, 64*4
+ vpermt2b m0, m6, m1
+ vpermt2b m2, m6, m3
+ movu [dstq+64*0], m0
+ movu [dstq+64*1], m2
+ add dstq, 64*2
+ sub hd, 4
+ jg .w64_loop
+ test bhd, bhd
+ jz .w64_end
+ vshufi32x4 m2, m2, q3232
+.w64_padv:
+ movu [dstq+64*0], m2
+ movu [dstq+64*1], m2
+ add dstq, 64*2
+ sub bhd, 4
+ jg .w64_padv
+.w64_end:
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/pal.h b/third_party/dav1d/src/x86/pal.h
new file mode 100644
index 0000000000..7cd2e68d5b
--- /dev/null
+++ b/third_party/dav1d/src/x86/pal.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+
+decl_pal_idx_finish_fn(dav1d_pal_idx_finish_ssse3);
+decl_pal_idx_finish_fn(dav1d_pal_idx_finish_avx2);
+decl_pal_idx_finish_fn(dav1d_pal_idx_finish_avx512icl);
+
+static ALWAYS_INLINE void pal_dsp_init_x86(Dav1dPalDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->pal_idx_finish = dav1d_pal_idx_finish_ssse3;
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->pal_idx_finish = dav1d_pal_idx_finish_avx2;
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->pal_idx_finish = dav1d_pal_idx_finish_avx512icl;
+#endif
+}
diff --git a/third_party/dav1d/src/x86/refmvs.asm b/third_party/dav1d/src/x86/refmvs.asm
new file mode 100644
index 0000000000..d95861fa17
--- /dev/null
+++ b/third_party/dav1d/src/x86/refmvs.asm
@@ -0,0 +1,912 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 64
+
+%macro JMP_TABLE 2-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ %1_table:
+ %xdefine %%base %1_table
+ %rep %0 - 1
+ dd %%prefix %+ .w%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SAVE_TMVS_TABLE 3 ; num_entries, w, suffix
+ %rep %1
+ db %2*3
+ db mangle(private_prefix %+ _save_tmvs_%3).write%2 - \
+ mangle(private_prefix %+ _save_tmvs_%3).write1
+ %endrep
+%endmacro
+
+%if ARCH_X86_64
+mv_proj: dw 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
+ dw 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092
+ dw 1024, 963, 910, 862, 819, 780, 744, 712
+ dw 682, 655, 630, 606, 585, 564, 546, 528
+splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3
+ db 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7
+ db 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3
+%endif
+save_pack0: db 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0
+ db 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1
+save_pack1: db 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2
+ db 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3
+save_ref_shuf: db 0, -1, -1, -1, 1, -1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1
+cond_shuf512: db 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3
+save_cond0: db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
+save_cond1: db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
+pb_128: times 16 db 128
+pq_8192: dq 8192
+
+save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3
+ SAVE_TMVS_TABLE 4, 8, ssse3
+ SAVE_TMVS_TABLE 4, 4, ssse3
+ SAVE_TMVS_TABLE 5, 2, ssse3
+ SAVE_TMVS_TABLE 7, 1, ssse3
+
+%if ARCH_X86_64
+save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2
+ SAVE_TMVS_TABLE 4, 8, avx2
+ SAVE_TMVS_TABLE 4, 4, avx2
+ SAVE_TMVS_TABLE 5, 2, avx2
+ SAVE_TMVS_TABLE 7, 1, avx2
+
+save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl
+ SAVE_TMVS_TABLE 4, 8, avx512icl
+ SAVE_TMVS_TABLE 4, 4, avx512icl
+ SAVE_TMVS_TABLE 5, 2, avx512icl
+ SAVE_TMVS_TABLE 7, 1, avx512icl
+
+JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32
+JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32
+%endif
+
+JMP_TABLE splat_mv_sse2, 1, 2, 4, 8, 16, 32
+
+SECTION .text
+
+%macro movif32 2
+%if ARCH_X86_32
+ mov %1, %2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+; refmvs_temporal_block *rp, ptrdiff_t stride,
+; refmvs_block **rr, uint8_t *ref_sign,
+; int col_end8, int row_end8, int col_start8, int row_start8
+%if ARCH_X86_64
+cglobal save_tmvs, 4, 13, 11, rp, stride, rr, ref_sign, \
+ xend, yend, xstart, ystart
+%define base_reg r12
+%else
+cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \
+ xend, yend, xstart, ystart
+ movq m5, [ref_signq]
+ lea strided, [strided*5]
+ mov stridem, strided
+ mov r3, xstartm
+ mov r1, ystartm
+ DEFINE_ARGS b, ystart, rr, cand, xend, x
+%define stridemp r1m
+%define m8 [base+pb_128]
+%define m9 [base+save_pack0+ 0]
+%define m10 [base+save_pack0+16]
+%define base_reg r6
+%endif
+%define base base_reg-.write1
+ LEA base_reg, .write1
+%if ARCH_X86_64
+ movifnidn xendd, xendm
+ movifnidn yendd, yendm
+ mov xstartd, xstartm
+ mov ystartd, ystartm
+ movq m5, [ref_signq]
+%endif
+ movu m4, [base+save_ref_shuf]
+ movddup m6, [base+save_cond0]
+ movddup m7, [base+save_cond1]
+%if ARCH_X86_64
+ mova m8, [base+pb_128]
+ mova m9, [base+save_pack0+ 0]
+ mova m10, [base+save_pack0+16]
+%endif
+ psllq m5, 8
+%if ARCH_X86_64
+ lea r9d, [xendq*5]
+ lea xstartd, [xstartq*5]
+ sub yendd, ystartd
+ add ystartd, ystartd
+ lea strideq, [strideq*5]
+ sub xstartq, r9
+ add xendd, r9d
+ add rpq, r9
+ DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
+%else
+ lea r0, [xendd*5] ; xend5
+ lea r3, [r3*5] ; xstart5
+ sub r3, r0 ; -w5
+ mov r6m, r3
+%define xstartq r6m
+ add xendd, r0 ; xend6
+ add r0m, r0 ; rp+xend5
+ mov xendm, xendd
+ sub r5, r1 ; h
+ add r1, r1
+ mov r7m, r1
+ mov r5m, r5
+%define hd r5mp
+ jmp .loop_y_noload
+%endif
+.loop_y:
+ movif32 ystartd, r7m
+ movif32 xendd, xendm
+.loop_y_noload:
+ and ystartd, 30
+ mov xq, xstartq
+ mov bq, [rrq+ystartq*gprsize]
+ add ystartd, 2
+ movif32 r7m, ystartd
+ lea bq, [bq+xendq*4]
+.loop_x:
+%if ARCH_X86_32
+%define rpq r3
+%define r10 r1
+%define r10d r1
+%define r11 r4
+%define r11d r4
+%endif
+ imul candq, xq, 0x9999 ; x / 5 * 3
+ sar candq, 16
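+ ; x is a negative multiple of 5 here and 5*0x9999 = 3<<16 - 3, so the
+ ; imul/sar pair yields exactly x*3/5 for the magnitudes involved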
+ movzx r10d, byte [bq+candq*8+22] ; cand_b->bs
+ movu m0, [bq+candq*8+12] ; cand_b
+ movzx r11d, byte [base+save_tmvs_ssse3_table+r10*2+0]
+ movzx r10d, byte [base+save_tmvs_ssse3_table+r10*2+1]
+ add r10, base_reg
+ add candq, r11
+ jge .calc
+ movu m1, [bq+candq*8+12]
+ movzx r11d, byte [bq+candq*8+22]
+ movzx r11d, byte [base+save_tmvs_ssse3_table+r11*2+1]
+ add r11, base_reg
+.calc:
+ movif32 rpq, r0m
+ ; ref check
+ punpckhqdq m2, m0, m1
+ pshufb m2, m4 ; b0.ref0 b0.ref1 b1.ref0 b1.ref1 | ...
+ pshufb m3, m5, m2 ; ref > 0 && ref_sign[ref - 1]
+ ; mv check
+ punpcklqdq m2, m0, m1 ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ...
+ pabsw m2, m2
+ psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096
+ ; res
+ pcmpgtd m3, m2
+ pshufd m2, m3, q2301
+ pand m3, m6 ; b0c0 b0c1 b1c0 b1c1 | ...
+ pand m2, m7 ; b0c1 b0c0 b1c1 b1c0 | ...
+ por m3, m2 ; b0.shuf b1.shuf | ...
+ pxor m3, m8 ; if cond0|cond1 == 0 => zero out
+ pshufb m0, m3
+ pshufb m1, m3
+ call r10
+ jge .next_line
+ pshufd m0, m1, q3232
+ call r11
+ jl .loop_x
+.next_line:
+ add rpq, stridemp
+ movif32 r0m, rpq
+ dec hd
+ jg .loop_y
+ RET
+.write1:
+ movd [rpq+xq+0], m0
+ psrlq m0, 8
+ movd [rpq+xq+1], m0
+ add xq, 5*1
+ ret
+.write2:
+ movq [rpq+xq+0], m0
+ psrlq m0, 8
+ movd [rpq+xq+6], m0
+ add xq, 5*2
+ ret
+.write4:
+ pshufb m0, m9
+ movu [rpq+xq+ 0], m0
+ psrlq m0, 8
+ movd [rpq+xq+16], m0
+ add xq, 5*4
+ ret
+.write8:
+ pshufb m2, m0, m9
+ movu [rpq+xq+ 0], m2
+ pshufb m0, m10
+ movu [rpq+xq+16], m0
+ psrldq m2, 2
+ movq [rpq+xq+32], m2
+ add xq, 5*8
+ ret
+.write16:
+ pshufb m2, m0, m9
+ movu [rpq+xq+ 0], m2
+ pshufb m0, m10
+ movu [rpq+xq+16], m0
+ shufps m2, m0, q1032
+ movu [rpq+xq+48], m2
+ shufps m2, m0, q2121
+ movu [rpq+xq+32], m2
+ shufps m0, m2, q1032
+ movu [rpq+xq+64], m0
+ add xq, 5*16
+ ret
+
+INIT_XMM sse2
+; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4
+cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
+ add bx4d, bw4d
+ tzcnt bw4d, bw4d
+ mova m2, [aq]
+ LEA aq, splat_mv_sse2_table
+ lea bx4q, [bx4q*3-32]
+ movsxd bw4q, [aq+bw4q*4]
+ movifnidn bh4d, bh4m
+ pshufd m0, m2, q0210
+ pshufd m1, m2, q1021
+ pshufd m2, m2, q2102
+ add bw4q, aq
+.loop:
+ mov aq, [rrq]
+ add rrq, gprsize
+ lea aq, [aq+bx4q*4]
+ jmp bw4q
+.w32:
+ mova [aq-16*16], m0
+ mova [aq-16*15], m1
+ mova [aq-16*14], m2
+ mova [aq-16*13], m0
+ mova [aq-16*12], m1
+ mova [aq-16*11], m2
+ mova [aq-16*10], m0
+ mova [aq-16* 9], m1
+ mova [aq-16* 8], m2
+ mova [aq-16* 7], m0
+ mova [aq-16* 6], m1
+ mova [aq-16* 5], m2
+.w16:
+ mova [aq-16* 4], m0
+ mova [aq-16* 3], m1
+ mova [aq-16* 2], m2
+ mova [aq-16* 1], m0
+ mova [aq+16* 0], m1
+ mova [aq+16* 1], m2
+.w8:
+ mova [aq+16* 2], m0
+ mova [aq+16* 3], m1
+ mova [aq+16* 4], m2
+.w4:
+ mova [aq+16* 5], m0
+ mova [aq+16* 6], m1
+ mova [aq+16* 7], m2
+ dec bh4d
+ jg .loop
+ RET
+.w2:
+ movu [aq+104], m0
+ movq [aq+120], m1
+ dec bh4d
+ jg .loop
+ RET
+.w1:
+ movq [aq+116], m0
+ movd [aq+124], m2
+ dec bh4d
+ jg .loop
+ RET
+
+%if ARCH_X86_64
+INIT_XMM sse4
+; refmvs_frame *rf, int tile_row_idx,
+; int col_start8, int col_end8, int row_start8, int row_end8
+cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
+ stride, rp_proj, roff, troff, \
+ xendi, xstarti, iw8, ih8, dst
+ xor r14d, r14d
+ cmp dword [rfq+212], 1 ; n_tile_threads
+ mov ih8d, [rfq+20] ; rf->ih8
+ mov iw8d, [rfq+16] ; rf->iw8
+ mov xstartd, xstartd
+ mov xendd, xendd
+ cmove tridxd, r14d
+ lea xstartid, [xstartq-8]
+ lea xendid, [xendq+8]
+ mov strideq, [rfq+184]
+ mov rp_projq, [rfq+176]
+ cmp ih8d, yendd
+ mov [rsp+0x30], strideq
+ cmovs yendd, ih8d
+ test xstartid, xstartid
+ cmovs xstartid, r14d
+ cmp iw8d, xendid
+ cmovs xendid, iw8d
+ mov troffq, strideq
+ shl troffq, 4
+ imul troffq, tridxq
+ mov dstd, ystartd
+ and dstd, 15
+ imul dstq, strideq
+ add dstq, troffq ; (16 * tridx + (ystart & 15)) * stride
+ lea dstq, [dstq*5]
+ add dstq, rp_projq
+ lea troffq, [troffq*5] ; 16 * tridx * stride * 5
+ lea r13d, [xendq*5]
+ lea r12, [strideq*5]
+ DEFINE_ARGS rf, w5, xstart, xend, ystart, yend, h, x5, \
+ _, troff, xendi, xstarti, stride5, _, dst
+ lea w5d, [xstartq*5]
+ add r7, troffq ; rp_proj + tile_row_offset
+ mov hd, yendd
+ mov [rsp+0x28], r7
+ add dstq, r13
+ sub w5q, r13
+ sub hd, ystartd
+.init_xloop_start:
+ mov x5q, w5q
+ test w5b, 1
+ jz .init_2blk
+ mov dword [dstq+x5q], 0x80008000
+ add x5q, 5
+ jz .init_next_row
+.init_2blk:
+ mov dword [dstq+x5q+0], 0x80008000
+ mov dword [dstq+x5q+5], 0x80008000
+ add x5q, 10
+ jl .init_2blk
+.init_next_row:
+ add dstq, stride5q
+ dec hd
+ jg .init_xloop_start
+ DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \
+ _, _, xendi, xstarti, stride5, _, n
+ mov r13d, [rfq+152] ; rf->n_mfmvs
+ test r13d, r13d
+ jz .ret
+ mov [rsp+0x0c], r13d
+ mov strideq, [rsp+0x30]
+ movddup m3, [pq_8192]
+ mov r9d, ystartd
+ mov [rsp+0x38], yendd
+ mov [rsp+0x20], xstartid
+ xor nd, nd
+ xor n7d, n7d
+ imul r9, strideq ; ystart * stride
+ mov [rsp+0x48], rfq
+ mov [rsp+0x18], stride5q
+ lea r7, [r9*5]
+ mov [rsp+0x24], ystartd
+ mov [rsp+0x00], r7
+.nloop:
+ DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \
+ ref, rp_ref, xendi, xstarti, _, _, n
+ mov rfq, [rsp+0x48]
+ mov refd, [rfq+56+nq*4] ; ref2cur
+ cmp refd, 0x80000000
+ je .next_n
+ mov [rsp+0x40], refd
+ mov offq, [rsp+0x00] ; ystart * stride * 5
+ movzx refd, byte [rfq+53+nq] ; rf->mfmv_ref[n]
+ lea refsignq, [refq-4]
+ mov rp_refq, [rfq+168]
+ movq m2, refsignq
+ add offq, [rp_refq+refq*8] ; r = rp_ref[ref] + row_offset
+ mov [rsp+0x14], nd
+ mov yd, ystartd
+.yloop:
+ mov r11d, [rsp+0x24] ; ystart
+ mov r12d, [rsp+0x38] ; yend
+ mov r14d, yd
+ and r14d, ~7 ; y_sb_align
+ cmp r11d, r14d
+ cmovs r11d, r14d ; imax(y_sb_align, ystart)
+ mov [rsp+0x44], r11d ; y_proj_start
+ add r14d, 8
+ cmp r12d, r14d
+ cmovs r14d, r12d ; imin(y_sb_align + 8, yend)
+ mov [rsp+0x3c], r14d ; y_proj_end
+ DEFINE_ARGS y, src, xstart, xend, frac, rf, n7, mv, \
+ ref, x, xendi, mvx, mvy, rb, ref2ref
+ mov xd, [rsp+0x20] ; xstarti
+.xloop:
+ lea rbd, [xq*5]
+ add rbq, srcq
+ movsx refd, byte [rbq+4]
+ test refd, refd
+ jz .next_x_bad_ref
+ mov rfq, [rsp+0x48]
+ lea r14d, [16+n7q+refq]
+ mov ref2refd, [rfq+r14*4] ; rf->mfmv_ref2ref[n][b_ref-1]
+ test ref2refd, ref2refd
+ jz .next_x_bad_ref
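+ ; mv projection, consolidated from the per-step comments below (sketch):
+ ;   frac   = mv_proj[ref2ref] * ref2cur
+ ;   offset = (mv*frac + (mv*frac >> 31) + 8192) >> 14
+ ;   pos    = apply_sign(abs(offset) >> 6, offset ^ ref_sign)
+ ; pos.{x,y} is then added to (x, y) to get the projected 8x8 position.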
+ lea fracq, [mv_proj]
+ movzx fracd, word [fracq+ref2refq*2]
+ mov mvd, [rbq]
+ imul fracd, [rsp+0x40] ; ref2cur
+ pmovsxwq m0, [rbq]
+ movd m1, fracd
+ punpcklqdq m1, m1
+ pmuldq m0, m1 ; mv * frac
+ pshufd m1, m0, q3311
+ paddd m0, m3
+ paddd m0, m1
+ psrad m0, 14 ; offset = (xy + (xy >> 31) + 8192) >> 14
+ pabsd m1, m0
+ packssdw m0, m0
+ psrld m1, 6
+ packuswb m1, m1
+ pxor m0, m2 ; offset ^ ref_sign
+ psignd m1, m0 ; apply_sign(abs(offset) >> 6, offset ^ refsign)
+ movq mvxq, m1
+ lea mvyd, [mvxq+yq] ; ypos
+ sar mvxq, 32
+ DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, \
+ ref, x, xendi, mvx, ypos, rb, ref2ref
+ cmp yposd, [rsp+0x44] ; y_proj_start
+ jl .next_x_bad_pos_y
+ cmp yposd, [rsp+0x3c] ; y_proj_end
+ jge .next_x_bad_pos_y
+ and yposd, 15
+ add mvxq, xq ; xpos
+ imul yposq, [rsp+0x30] ; pos = (ypos & 15) * stride
+ DEFINE_ARGS y, src, xstart, xend, dst, _, n7, mv, \
+ ref, x, xendi, xpos, pos, rb, ref2ref
+ mov dstq, [rsp+0x28] ; dst = rp_proj + tile_row_offset
+ add posq, xposq ; pos += xpos
+ lea posq, [posq*5]
+ add dstq, posq ; dst += pos5
+ jmp .write_loop_entry
+.write_loop:
+ add rbq, 5
+ cmp refb, byte [rbq+4]
+ jne .xloop
+ cmp mvd, [rbq]
+ jne .xloop
+ add dstq, 5
+ inc xposd
+.write_loop_entry:
+ mov r12d, xd
+ and r12d, ~7
+ lea r5d, [r12-8]
+ cmp r5d, xstartd
+ cmovs r5d, xstartd ; x_proj_start
+ cmp xposd, r5d
+ jl .next_xpos
+ add r12d, 16
+ cmp xendd, r12d
+ cmovs r12d, xendd ; x_proj_end
+ cmp xposd, r12d
+ jge .next_xpos
+ mov [dstq+0], mvd
+ mov byte [dstq+4], ref2refb
+.next_xpos:
+ inc xd
+ cmp xd, xendid
+ jl .write_loop
+.next_y:
+ DEFINE_ARGS y, src, xstart, xend, ystart, _, n7, _, _, x, xendi, _, _, _, n
+ add srcq, [rsp+0x18] ; stride5
+ inc yd
+ cmp yd, [rsp+0x38] ; yend
+ jne .yloop
+ mov nd, [rsp+0x14]
+ mov ystartd, [rsp+0x24]
+.next_n:
+ add n7d, 7
+ inc nd
+ cmp nd, [rsp+0x0c] ; n_mfmvs
+ jne .nloop
+.ret:
+ RET
+.next_x:
+ DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, ref, x, xendi, _, _, rb, _
+ add rbq, 5
+ cmp refb, byte [rbq+4]
+ jne .xloop
+ cmp mvd, [rbq]
+ jne .xloop
+.next_x_bad_pos_y:
+ inc xd
+ cmp xd, xendid
+ jl .next_x
+ jmp .next_y
+.next_x_bad_ref:
+ inc xd
+ cmp xd, xendid
+ jl .xloop
+ jmp .next_y
+
+INIT_YMM avx2
+; refmvs_temporal_block *rp, ptrdiff_t stride,
+; refmvs_block **rr, uint8_t *ref_sign,
+; int col_end8, int row_end8, int col_start8, int row_start8
+cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign, \
+ xend, yend, xstart, ystart
+%define base r12-.write1
+ lea r12, [.write1]
+ movifnidn xendd, xendm
+ movifnidn yendd, yendm
+ mov xstartd, xstartm
+ mov ystartd, ystartm
+ vpbroadcastq m4, [ref_signq]
+ vpbroadcastq m3, [base+save_ref_shuf+8]
+ vpbroadcastq m5, [base+save_cond0]
+ vpbroadcastq m6, [base+save_cond1]
+ vpbroadcastd m7, [base+pb_128]
+ mova m8, [base+save_pack0]
+ mova m9, [base+save_pack1]
+ psllq m4, 8
+ lea r9d, [xendq*5]
+ lea xstartd, [xstartq*5]
+ sub yendd, ystartd
+ add ystartd, ystartd
+ lea strideq, [strideq*5]
+ sub xstartq, r9
+ add xendd, r9d
+ add rpq, r9
+ DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
+.loop_y:
+ and ystartd, 30
+ mov xq, xstartq
+ mov bq, [rrq+ystartq*8]
+ add ystartd, 2
+ lea bq, [bq+xendq*4]
+.loop_x:
+ imul candq, xq, 0x9999
+ sar candq, 16 ; x / 5 * 3
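+; 0x9999/65536 = 0.59999.. ~= 3/5, and x is a non-positive multiple of 5
+; here (it counts up from (xstart-xend)*5 to 0), so the arithmetic shift
+; (a floor) lands exactly on x/5*3, e.g. x = -40:
+;   (-40*0x9999) >> 16 = floor(-23.9996..) = -24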
+ movzx r10d, byte [bq+candq*8+22] ; cand_b->bs
+ movu xm0, [bq+candq*8+12] ; cand_b
+ movzx r11d, byte [base+save_tmvs_avx2_table+r10*2+0]
+ movzx r10d, byte [base+save_tmvs_avx2_table+r10*2+1]
+ add r10, r12
+ add candq, r11
+ jge .calc
+ vinserti128 m0, [bq+candq*8+12], 1
+ movzx r11d, byte [bq+candq*8+22]
+ movzx r11d, byte [base+save_tmvs_avx2_table+r11*2+1]
+ add r11, r12
+.calc:
+ pshufb m1, m0, m3
+ pabsw m2, m0
+ pshufb m1, m4, m1 ; ref > 0 && ref_sign[ref - 1]
+ psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096
+ pcmpgtd m1, m2
+ pshufd m2, m1, q2301
+ pand m1, m5 ; b0.cond0 b1.cond0
+ pand m2, m6 ; b0.cond1 b1.cond1
+ por m1, m2 ; b0.shuf b1.shuf
+ pxor m1, m7 ; if cond0|cond1 == 0 => zero out
+ pshufb m0, m1
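+; Per candidate block, the mask built above selects one of its two mv/ref
+; pairs, or zeroes the output when neither qualifies; a rough scalar reading
+; (the exact priority between the pairs is encoded in save_cond0/save_cond1):
+;   cond[i] = ref[i] > 0 && ref_sign[ref[i]-1] &&
+;             (abs(mv[i].x) | abs(mv[i].y)) < 4096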
+ call r10
+ jge .next_line
+ vextracti128 xm0, m0, 1
+ call r11
+ jl .loop_x
+.next_line:
+ add rpq, strideq
+ dec hd
+ jg .loop_y
+ RET
+.write1:
+ movd [rpq+xq+ 0], xm0
+ pextrb [rpq+xq+ 4], xm0, 4
+ add xq, 5*1
+ ret
+.write2:
+ movq [rpq+xq+0], xm0
+ psrlq xm1, xm0, 8
+ movd [rpq+xq+6], xm1
+ add xq, 5*2
+ ret
+.write4:
+ pshufb xm1, xm0, xm8
+ movu [rpq+xq+ 0], xm1
+ psrlq xm1, 8
+ movd [rpq+xq+16], xm1
+ add xq, 5*4
+ ret
+.write8:
+ vinserti128 m1, m0, xm0, 1
+ pshufb m1, m8
+ movu [rpq+xq+ 0], m1
+ psrldq xm1, 2
+ movq [rpq+xq+32], xm1
+ add xq, 5*8
+ ret
+.write16:
+ vinserti128 m1, m0, xm0, 1
+ pshufb m2, m1, m8
+ movu [rpq+xq+ 0], m2
+ pshufb m1, m9
+ movu [rpq+xq+32], m1
+ shufps xm2, xm1, q1021
+ movu [rpq+xq+64], xm2
+ add xq, 5*16
+ ret
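+; Each .write* helper above stores its block count times the 5-byte
+; refmvs_temporal_block (4-byte mv + 1-byte ref); the larger variants use
+; the save_pack0/save_pack1 shuffles to pack the replicated 16-byte lane
+; down to that 5-byte pitch, hence the add xq, 5*N step sizes.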
+
+cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
+ add bx4d, bw4d
+ tzcnt bw4d, bw4d
+ vbroadcasti128 m0, [aq]
+ lea aq, [splat_mv_avx2_table]
+ lea bx4q, [bx4q*3-32]
+ movsxd bw4q, [aq+bw4q*4]
+ pshufb m0, [splat_mv_shuf]
+ movifnidn bh4d, bh4m
+ pshufd m1, m0, q2102
+ pshufd m2, m0, q1021
+ add bw4q, aq
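+; Each 4x4 block entry is 12 bytes here (bx4 is scaled by 3 above and by 4
+; in the addressing below), so the three dword-permuted copies m0/m1/m2
+; tile lcm(12, 32) = 96 bytes, i.e. 8 blocks per row of stores.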
+.loop:
+ mov aq, [rrq]
+ add rrq, gprsize
+ lea aq, [aq+bx4q*4]
+ jmp bw4q
+.w32:
+ mova [aq-32*8], m0
+ mova [aq-32*7], m1
+ mova [aq-32*6], m2
+ mova [aq-32*5], m0
+ mova [aq-32*4], m1
+ mova [aq-32*3], m2
+.w16:
+ mova [aq-32*2], m0
+ mova [aq-32*1], m1
+ mova [aq+32*0], m2
+.w8:
+ mova [aq+32*1], m0
+ mova [aq+32*2], m1
+ mova [aq+32*3], m2
+ dec bh4d
+ jg .loop
+ RET
+.w4:
+ movu [aq+ 80], m0
+ mova [aq+112], xm1
+ dec bh4d
+ jg .loop
+ RET
+.w2:
+ movu [aq+104], xm0
+ movq [aq+120], xm2
+ dec bh4d
+ jg .loop
+ RET
+.w1:
+ movq [aq+116], xm0
+ movd [aq+124], xm1
+ dec bh4d
+ jg .loop
+ RET
+
+INIT_ZMM avx512icl
+; refmvs_temporal_block *rp, ptrdiff_t stride,
+; refmvs_block **rr, uint8_t *ref_sign,
+; int col_end8, int row_end8, int col_start8, int row_start8
+cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \
+ xend, yend, xstart, ystart
+%define base r14-.write1
+ lea r14, [.write1]
+ movifnidn xendd, xendm
+ movifnidn yendd, yendm
+ mov xstartd, xstartm
+ mov ystartd, ystartm
+ psllq m4, [ref_signq]{bcstq}, 8
+ vpbroadcastq m3, [base+save_ref_shuf+8]
+ vbroadcasti32x4 m5, [base+cond_shuf512]
+ vbroadcasti32x4 m6, [base+save_cond0]
+ vpbroadcastd m7, [base+pb_128]
+ mova m8, [base+save_pack0]
+ movu xm9, [base+save_pack0+4]
+ lea r9d, [xendq*5]
+ lea xstartd, [xstartq*5]
+ sub yendd, ystartd
+ add ystartd, ystartd
+ lea strideq, [strideq*5]
+ sub xstartq, r9
+ add xendd, r9d
+ add rpq, r9
+ mov r10d, 0x1f
+ kmovb k2, r10d
+ DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
+.loop_y:
+ and ystartd, 30
+ mov xq, xstartq
+ mov bq, [rrq+ystartq*8]
+ add ystartd, 2
+ lea bq, [bq+xendq*4]
+.loop_x:
+ imul candq, xq, 0x9999
+ sar candq, 16 ; x / 5 * 3
+ movzx r10d, byte [bq+candq*8+22] ; cand_b->bs
+ movu xm0, [bq+candq*8+12] ; cand_b
+ movzx r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0]
+ movzx r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1]
+ add r10, r14
+ add candq, r11
+ jge .calc
+ movzx r11d, byte [bq+candq*8+22]
+ vinserti32x4 ym0, [bq+candq*8+12], 1
+ movzx r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0]
+ movzx r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1]
+ add r11, r14
+ add candq, r12
+ jge .calc
+ movzx r12d, byte [bq+candq*8+22]
+ vinserti32x4 m0, [bq+candq*8+12], 2
+ movzx r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0]
+ movzx r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1]
+ add r12, r14
+ add candq, r13
+ jge .calc
+ vinserti32x4 m0, [bq+candq*8+12], 3
+ movzx r13d, byte [bq+candq*8+22]
+ movzx r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1]
+ add r13, r14
+.calc:
+ pshufb m1, m0, m3
+ pabsw m2, m0
+ pshufb m1, m4, m1 ; ref > 0 && ref_sign[ref - 1]
+ psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096
+ psubd m2, m1
+ pshufb m2, m5 ; c0 c1 c1 c0
+ pand m2, m6
+ punpckhqdq m1, m2, m2
+ vpternlogd m1, m2, m7, 0x56 ; (c0shuf | c1shuf) ^ 0x80
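+; imm8 0x56 is the truth table of (A | B) ^ C, so this single op merges the
+; two condition shuffles and xors with pb_128, replacing the separate
+; por/pxor pair used in the AVX2 version.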
+ pshufb m2, m0, m1
+ mova xm0, xm2
+ call r10
+ jge .next_line
+ vextracti32x4 xm0, m2, 1
+ call r11
+ jge .next_line
+ vextracti32x4 xm0, m2, 2
+ call r12
+ jge .next_line
+ vextracti32x4 xm0, m2, 3
+ call r13
+ jl .loop_x
+.next_line:
+ add rpq, strideq
+ dec hd
+ jg .loop_y
+ RET
+.write1:
+ vmovdqu8 [rpq+xq]{k2}, xm0
+ add xq, 5*1
+ ret
+.write2:
+ pshufb xm0, xm8
+ vmovdqu16 [rpq+xq]{k2}, xm0
+ add xq, 5*2
+ ret
+.write4:
+ vpermb ym0, ym8, ym0
+ vmovdqu32 [rpq+xq]{k2}, ym0
+ add xq, 5*4
+ ret
+.write8:
+ vpermb m0, m8, m0
+ vmovdqu64 [rpq+xq]{k2}, m0
+ add xq, 5*8
+ ret
+.write16:
+ vpermb m1, m8, m0
+ movu [rpq+xq+ 0], m1
+ pshufb xm0, xm9
+ movu [rpq+xq+64], xm0
+ add xq, 5*16
+ ret
+
+INIT_ZMM avx512icl
+cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
+ vbroadcasti32x4 m0, [aq]
+ lea r1, [splat_mv_avx512icl_table]
+ tzcnt bw4d, bw4d
+ lea bx4d, [bx4q*3]
+ pshufb m0, [splat_mv_shuf]
+ movsxd bw4q, [r1+bw4q*4]
+ mov r6d, bh4m
+ add bw4q, r1
+ lea rrq, [rrq+r6*8]
+ mov r1d, 0x3f
+ neg r6
+ kmovb k1, r1d
+ jmp bw4q
+.w1:
+ mov r1, [rrq+r6*8]
+ vmovdqu16 [r1+bx4q*4]{k1}, xm0
+ inc r6
+ jl .w1
+ RET
+.w2:
+ mov r1, [rrq+r6*8]
+ vmovdqu32 [r1+bx4q*4]{k1}, ym0
+ inc r6
+ jl .w2
+ RET
+.w4:
+ mov r1, [rrq+r6*8]
+ vmovdqu64 [r1+bx4q*4]{k1}, m0
+ inc r6
+ jl .w4
+ RET
+.w8:
+ pshufd ym1, ym0, q1021
+.w8_loop:
+ mov r1, [rrq+r6*8+0]
+ mov r3, [rrq+r6*8+8]
+ movu [r1+bx4q*4+ 0], m0
+ mova [r1+bx4q*4+64], ym1
+ movu [r3+bx4q*4+ 0], m0
+ mova [r3+bx4q*4+64], ym1
+ add r6, 2
+ jl .w8_loop
+ RET
+.w16:
+ pshufd m1, m0, q1021
+ pshufd m2, m0, q2102
+.w16_loop:
+ mov r1, [rrq+r6*8+0]
+ mov r3, [rrq+r6*8+8]
+ mova [r1+bx4q*4+64*0], m0
+ mova [r1+bx4q*4+64*1], m1
+ mova [r1+bx4q*4+64*2], m2
+ mova [r3+bx4q*4+64*0], m0
+ mova [r3+bx4q*4+64*1], m1
+ mova [r3+bx4q*4+64*2], m2
+ add r6, 2
+ jl .w16_loop
+ RET
+.w32:
+ pshufd m1, m0, q1021
+ pshufd m2, m0, q2102
+.w32_loop:
+ mov r1, [rrq+r6*8]
+ lea r1, [r1+bx4q*4]
+ mova [r1+64*0], m0
+ mova [r1+64*1], m1
+ mova [r1+64*2], m2
+ mova [r1+64*3], m0
+ mova [r1+64*4], m1
+ mova [r1+64*5], m2
+ inc r6
+ jl .w32_loop
+ RET
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/refmvs.h b/third_party/dav1d/src/x86/refmvs.h
new file mode 100644
index 0000000000..c9978561ec
--- /dev/null
+++ b/third_party/dav1d/src/x86/refmvs.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/refmvs.h"
+
+decl_load_tmvs_fn(dav1d_load_tmvs_sse4);
+
+decl_save_tmvs_fn(dav1d_save_tmvs_ssse3);
+decl_save_tmvs_fn(dav1d_save_tmvs_avx2);
+decl_save_tmvs_fn(dav1d_save_tmvs_avx512icl);
+
+decl_splat_mv_fn(dav1d_splat_mv_sse2);
+decl_splat_mv_fn(dav1d_splat_mv_avx2);
+decl_splat_mv_fn(dav1d_splat_mv_avx512icl);
+
+static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+ c->splat_mv = dav1d_splat_mv_sse2;
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->save_tmvs = dav1d_save_tmvs_ssse3;
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
+#if ARCH_X86_64
+ c->load_tmvs = dav1d_load_tmvs_sse4;
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->save_tmvs = dav1d_save_tmvs_avx2;
+ c->splat_mv = dav1d_splat_mv_avx2;
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->save_tmvs = dav1d_save_tmvs_avx512icl;
+ c->splat_mv = dav1d_splat_mv_avx512icl;
+#endif
+}